# test_parse_txt.py (1.1 KB)

# Source listing: lines 1-29.
  1. new_filename = 'D:\work\data\典型病例\典型病例合并后的数据1.txt'
  2. with open(f"{new_filename}", 'r', encoding="utf-8") as f:
  3. buf_str = ""
  4. for line in f.readlines():
  5. line = line.strip()
  6. if len(buf_str) > 0:
  7. buf_str = buf_str + "\n" + line
  8. else:
  9. buf_str = buf_str + line
  10. while len(buf_str) > 256:
  11. chunk = buf_str[0:256]
  12. buf_str = buf_str[256:]
  13. print("*" * 60)
  14. if chunk[-1] != '\n' and chunk[-1] != "。":
  15. last_end1 = chunk.rfind("。")
  16. last_end2 = chunk.rfind("\n")
  17. if last_end1 == -1 and last_end2 == -1:
  18. last_end1 = len(chunk)
  19. print("found end char: ", last_end1, last_end2)
  20. print("*" * 60)
  21. if last_end2 > last_end1:
  22. last_end1 = last_end2
  23. buf_str = chunk[last_end1 + 1:] + buf_str
  24. chunk = chunk[0:last_end1 + 1]
  25. print(len(chunk),chunk)