"""Split the OCR text files of a job into fenced text chunks.

Usage: python standard_txt_chunk.py <job_path>

Every ``*.txt`` file found (recursively) under ``<job_path>/ocr_output`` is
read as UTF-8, passed to ``TextProcessor.chunk_text`` and written to
``<job_path>/chunks/<same file name>`` with each chunk wrapped in a
```` ```txt ```` fence.
"""
import os
import sys

# The working directory is appended to the import path before ``libs`` is
# imported -- presumably so the script can be launched from the project root.
current_path = os.getcwd()
sys.path.append(current_path)

from libs.text_processor import TextProcessor


def _fail(message):
    """Print *message* to stdout and exit with the script's error status."""
    print(message)
    sys.exit(-1)


def _write_chunks(chunks, out_file):
    """Write each chunk to *out_file*, wrapped in a ```txt fenced block."""
    with open(out_file, "w", encoding="utf-8") as dst:
        dst.writelines("```txt\n" + chunk + "\n```\n" for chunk in chunks)


def main():
    """Validate the job directory and chunk every OCR text file inside it.

    Exits with status -1 when the argument count is wrong or when the job
    directory / its ``ocr_output`` sub-directory does not exist.
    """
    if len(sys.argv) != 2:
        _fail("Usage: python standard_txt_chunk.py <job_path>")

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        _fail(f"job path not exists: {job_path}")

    ocr_path = os.path.join(job_path, "ocr_output")
    if not os.path.exists(ocr_path):
        _fail(f"OCR path not exists: {ocr_path}")

    chunk_path = os.path.join(job_path, "chunks")
    os.makedirs(chunk_path, exist_ok=True)

    processor = TextProcessor()
    written = set()  # output names produced during this run

    for root, _dirs, files in os.walk(ocr_path):
        for file in files:
            if not file.endswith(".txt"):
                continue
            print(f"Processing {file}")

            # The walk is recursive but the output directory is flat, so two
            # inputs sharing a name in different sub-directories map to the
            # same output file. The layout is kept (later file wins), but the
            # overwrite is no longer silent.
            if file in written:
                print(
                    f"Warning: {os.path.join(root, file)} overwrites an "
                    f"earlier output with the same name: {file}",
                    file=sys.stderr,
                )
            written.add(file)

            with open(os.path.join(root, file), "r", encoding="utf-8") as src:
                text = src.read()
            chunks = processor.chunk_text(text)
            _write_chunks(chunks, os.path.join(chunk_path, file))
            print(f"Done {file}")

    print("Done")


if __name__ == "__main__":
    main()