python
/
self-constructing_graph


			
							123456789101112131415161718192021222324252627282930313233343536
							import os,sys

# Append the current working directory to the module search path before the
# project-local ``libs`` import below.
# NOTE(review): presumably this lets the script find ``libs`` when launched
# from the project root -- confirm the expected launch directory.
current_path = os.getcwd()
sys.path.append(current_path)

from libs.text_processor import TextProcessor


if __name__ == "__main__":
    # Command-line entry point.
    #
    # Usage: python standard_txt_chunk.py <path_of_job>
    #
    # Walks <path_of_job>/ocr_output, passes the text of every ``.txt`` file
    # to ``TextProcessor.chunk_text`` and writes the returned chunks, each
    # wrapped in a ```txt fenced block, to <path_of_job>/chunks/<file name>.
    # Calls ``sys.exit(-1)`` when the argument count is wrong or when the job
    # directory or its ``ocr_output`` subdirectory does not exist.
    if len(sys.argv) != 2:
        print("Usage: python standard_txt_chunk.py <path_of_job>")
        sys.exit(-1)

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        print(f"job path not exists: {job_path}")
        sys.exit(-1)

    ocr_path = os.path.join(job_path, "ocr_output")
    if not os.path.exists(ocr_path):
        print(f"OCR path not exists: {ocr_path}")
        sys.exit(-1)

    # The output directory is created on demand; an existing one is reused.
    chunk_path = os.path.join(job_path, "chunks")
    os.makedirs(chunk_path, exist_ok=True)

    processor = TextProcessor()
    for root, _dirs, files in os.walk(ocr_path):
        for file in files:
            if not file.endswith(".txt"):
                continue
            print(f"Processing {file}")

            # Read the whole input and close it before chunking/writing, so
            # the source handle is not held open (or shadowed) while the
            # output file is being written.
            with open(os.path.join(root, file), "r", encoding="utf-8") as src:
                text = src.read()
            chunks = processor.chunk_text(text)

            # NOTE(review): os.walk recurses into subdirectories but the
            # output is written flat under chunk_path using only the base
            # name, so two inputs with the same name in different
            # subdirectories overwrite each other -- confirm whether nested
            # OCR output can occur.
            with open(os.path.join(chunk_path, file), "w", encoding="utf-8") as dst:
                for chunk in chunks:
                    dst.write("```txt\n" + chunk + "\n```\n")
            print(f"Done {file}")
    print("Done")