# standard_txt_chunk.py (1.2 KB)

  1. import os,sys
  2. current_path = os.getcwd()
  3. sys.path.append(current_path)
  4. from libs.text_processor import TextProcessor
  5. if __name__ == "__main__":
  6. if len(sys.argv) != 2:
  7. print("Usage: python standard_txt_chunk.py <path_of_job>")
  8. sys.exit(-1)
  9. job_path = sys.argv[1]
  10. if not os.path.exists(job_path):
  11. print(f"job path not exists: {job_path}")
  12. sys.exit(-1)
  13. ocr_path = os.path.join(job_path,"ocr_output")
  14. if not os.path.exists(ocr_path):
  15. print(f"OCR path not exists: {ocr_path}")
  16. sys.exit(-1)
  17. chunk_path = os.path.join(job_path,"chunks")
  18. os.makedirs(chunk_path ,exist_ok=True)
  19. processor = TextProcessor()
  20. for root,dirs,files in os.walk(ocr_path):
  21. for file in files:
  22. if file.endswith(".txt"):
  23. print(f"Processing {file}")
  24. with open(os.path.join(root,file),"r",encoding="utf-8") as f:
  25. text = f.read()
  26. chunks = processor.chunk_text(text)
  27. with open(os.path.join(chunk_path,file),"w",encoding="utf-8") as f:
  28. for chunk in chunks:
  29. f.write("```txt\n"+chunk+"\n```\n")
  30. print(f"Done {file}")
  31. print("Done")