123456789101112131415161718192021222324252627282930313233343536 |
# Split every OCR text file of a job into fenced text chunks.
#
# Usage: python standard_txt_chunk.py <path_of_job>
#
# Every ".txt" file found anywhere below <path_of_job>/ocr_output is read,
# split with TextProcessor.chunk_text and written to <path_of_job>/chunks,
# each chunk wrapped in a "txt" code fence.
import os
import sys

# The working directory is added to the import path before the project import
# below, so `libs` is resolved relative to where the script is launched from.
current_path = os.getcwd()
sys.path.append(current_path)
from libs.text_processor import TextProcessor


def _output_name(source_path, ocr_path, used_names):
    """Return an output file name that has not been handed out in this run.

    The plain base name of ``source_path`` is kept whenever it is still free,
    so the output layout stays flat. When another file already claimed that
    name, the path relative to ``ocr_path`` is flattened into the name
    (separators become ``__``) and, if even that is taken, a numeric suffix
    is appended. The returned name is recorded in ``used_names``.
    """
    name = os.path.basename(source_path)
    if name in used_names:
        relative = os.path.relpath(source_path, ocr_path)
        name = relative.replace(os.sep, "__")
        stem, extension = os.path.splitext(name)
        serial = 1
        while name in used_names:
            name = f"{stem}_{serial}{extension}"
            serial += 1
    used_names.add(name)
    return name


def main():
    """Chunk all OCR text files of the job directory given on the command line.

    Exits with status -1 when the argument count is wrong, or when the job
    directory or its ``ocr_output`` subdirectory does not exist.
    """
    if len(sys.argv) != 2:
        print("Usage: python standard_txt_chunk.py <path_of_job>")
        sys.exit(-1)

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        print(f"job path not exists: {job_path}")
        sys.exit(-1)

    ocr_path = os.path.join(job_path, "ocr_output")
    if not os.path.exists(ocr_path):
        print(f"OCR path not exists: {ocr_path}")
        sys.exit(-1)

    chunk_path = os.path.join(job_path, "chunks")
    os.makedirs(chunk_path, exist_ok=True)

    processor = TextProcessor()
    # Output names already written in this run. The walk is recursive but the
    # output directory is flat, so equal base names from different
    # subdirectories would otherwise silently overwrite each other.
    used_names = set()
    for root, dirs, files in os.walk(ocr_path):
        # Sorted traversal makes it reproducible which of several clashing
        # files keeps its plain name.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith(".txt"):
                continue
            print(f"Processing {file}")
            source_path = os.path.join(root, file)
            with open(source_path, "r", encoding="utf-8") as f:
                text = f.read()
            chunks = processor.chunk_text(text)

            target_name = _output_name(source_path, ocr_path, used_names)
            if target_name != file:
                print(f"Name clash for {file}: writing chunks to {target_name}")
            with open(os.path.join(chunk_path, target_name), "w", encoding="utf-8") as f:
                f.writelines("```txt\n" + chunk + "\n```\n" for chunk in chunks)
            print(f"Done {file}")
    print("Done")


if __name__ == "__main__":
    main()
|