sentence_util.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. import re
  2. from typing import List
  3. import logging
  4. import argparse
  5. import sys
  6. logger = logging.getLogger(__name__)
  7. class SentenceUtil:
  8. """中文文本句子拆分工具类
  9. 用于将中文文本按照标点符号拆分成句子列表
  10. """
  11. def __init__(self):
  12. # 定义结束符号,包括常见的中文和英文标点
  13. self.end_symbols = ['。', '!', '?', '!', '?', '\n']
  14. # 定义引号对
  15. self.quote_pairs = [("'", "'"), ('"', '"'), ('「', '」'), ('『', '』'), ('(', ')'), ('(', ')')]
  16. @staticmethod
  17. def split_text(text: str, length: int = None) -> List[str]:
  18. """将文本拆分成句子列表
  19. Args:
  20. text: 输入的文本字符串
  21. length: 可选参数,指定拆分后句子的最大长度
  22. Returns:
  23. 拆分后的句子列表
  24. """
  25. sentences = SentenceUtil()._split(text)
  26. if length is not None:
  27. i = 0
  28. while i < len(sentences):
  29. if SentenceUtil().get_valid_length(sentences[i]) <= length and i + 1 < len(sentences):
  30. sentences[i] = sentences[i] + sentences[i+1]
  31. del sentences[i+1]
  32. else:
  33. i += 1
  34. return sentences
  35. def _split(self, text: str) -> List[str]:
  36. """内部拆分方法
  37. Args:
  38. text: 输入的文本字符串
  39. length: 可选参数,指定拆分后句子的最大长度
  40. Returns:
  41. 拆分后的句子列表
  42. """
  43. if not text or not text.strip():
  44. return []
  45. try:
  46. # 通用拆分逻辑
  47. sentences = []
  48. current_sentence = ""
  49. # 用于跟踪引号状态的栈
  50. quote_stack = []
  51. i = 0
  52. while i < len(text):
  53. char = text[i]
  54. current_sentence += char
  55. # 处理引号开始
  56. for start, end in self.quote_pairs:
  57. if char == start:
  58. if not quote_stack or quote_stack[-1][0] != end:
  59. quote_stack.append((end, i))
  60. break
  61. # 处理引号闭合
  62. if quote_stack and char == quote_stack[-1][0] and i > quote_stack[-1][1]:
  63. quote_stack.pop()
  64. # 处理结束符号,仅在非引号环境中
  65. if not quote_stack and char in self.end_symbols:
  66. if current_sentence.strip():
  67. # 保留句子末尾的换行符
  68. if char == '\n':
  69. current_sentence = current_sentence.rstrip('\n')
  70. sentences.append(current_sentence)
  71. current_sentence = '\n'
  72. else:
  73. sentences.append(current_sentence)
  74. current_sentence = ""
  75. # 处理空格 - 保留空格在下一个句子的开头
  76. if i + 1 < len(text) and text[i + 1].isspace() and text[i + 1] != '\n':
  77. i += 1
  78. current_sentence = text[i]
  79. i += 1
  80. # 处理循环结束时的剩余内容
  81. if current_sentence.strip():
  82. sentences.append(current_sentence)
  83. # 如果没有找到任何句子,返回原文本作为一个句子
  84. if not sentences:
  85. return [text]
  86. return sentences
  87. except Exception as e:
  88. logger.error(f"拆分文本时发生错误: {str(e)}")
  89. return []
  90. @staticmethod
  91. def clean_text(text: str) -> str:
  92. """去除除中英文和数字以外的所有字符
  93. Args:
  94. text: 输入的文本字符串
  95. Returns:
  96. 处理后的字符串
  97. """
  98. if not text:
  99. return text
  100. return re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text)
  101. @staticmethod
  102. def get_valid_length(text: str) -> int:
  103. """计算只包含中英文和数字的有效长度
  104. Args:
  105. text: 输入的文本字符串
  106. Returns:
  107. 有效字符的长度
  108. """
  109. if not text:
  110. return 0
  111. return len(re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', '', text))
  112. def split_by_regex(self, text: str) -> List[str]:
  113. """使用正则表达式拆分文本
  114. 这是一个备选方法,使用正则表达式进行拆分
  115. Args:
  116. text: 输入的文本字符串
  117. Returns:
  118. 拆分后的句子列表
  119. """
  120. if not text or not text.strip():
  121. return []
  122. try:
  123. # 使用正则表达式拆分,保留分隔符
  124. pattern = r'([。!?!?]|\n)'
  125. parts = re.split(pattern, text)
  126. # 组合分隔符与前面的部分
  127. sentences = []
  128. for i in range(0, len(parts), 2):
  129. if i + 1 < len(parts):
  130. sentences.append(parts[i] + parts[i+1])
  131. else:
  132. # 处理最后一个部分(如果没有对应的分隔符)
  133. if parts[i].strip():
  134. sentences.append(parts[i])
  135. return sentences
  136. except Exception as e:
  137. logger.error(f"使用正则表达式拆分文本时发生错误: {str(e)}")
  138. return [text] if text else []
  139. if __name__ == '__main__':
  140. input_text = """急性期护理:
  141. - 每4h评估腹痛程度 3-1 PDF
  142. 延续护理: 1-2 PDF
  143. 患者教育: 3-3 PDF
  144. - 识别复发症状(发热/黄疸)"""
  145. sentences = SentenceUtil.split_text(input_text,10)
  146. for sentence in sentences:
  147. print(sentence)
  148. print('-----------')