"""Convert every Word document under ``<job_path>/upload`` to plain text.

Usage: ``python standard_word_extractor.py <job_path>``

Each ``.doc`` / ``.docx`` file found (recursively) below ``<job_path>/upload``
is saved as ``<job_path>/ocr_output/<original file name>.txt``.
"""
import os
import sys

# The current working directory is appended to the module search path before
# the Spire imports below; the original ordering is preserved on purpose.
current_path = os.getcwd()
sys.path.append(current_path)

from spire.doc import *
from spire.doc.common import *

# File name suffixes (case-sensitive) that are treated as Word documents.
_WORD_EXTENSIONS = (".doc", ".docx")


def word_to_txt(word_file_path, txt_file_path):
    """Save the Word document at *word_file_path* as text at *txt_file_path*.

    Args:
        word_file_path: Path of the source ``.doc`` / ``.docx`` file.
        txt_file_path: Path of the text file to write.

    Any exception raised by the Spire calls propagates to the caller; the
    document object is closed in every case.
    """
    # Load the Word document
    doc = Document()
    try:
        doc.LoadFromFile(word_file_path)
        doc.SaveToFile(txt_file_path, FileFormat.Txt)
    finally:
        # Close even when loading or saving fails so the document object is
        # not left open for the rest of the batch run.
        doc.Close()


def main():
    """Validate the job directory argument and convert all Word files in it.

    Exits with status -1 when the argument count is wrong, or when the job
    directory or its ``upload`` sub-directory does not exist.
    """
    if len(sys.argv) != 2:
        print("Usage: python standard_word_extractor.py <job_path>")
        sys.exit(-1)

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        print(f"job path not exists: {job_path}")
        sys.exit(-1)

    upload_path = os.path.join(job_path, "upload")
    if not os.path.exists(upload_path):
        print(f"upload path not exists: {upload_path}")
        sys.exit(-1)

    ocr_path = os.path.join(job_path, "ocr_output")
    os.makedirs(ocr_path, exist_ok=True)

    print("start scan")
    for root, _dirs, files in os.walk(upload_path):
        for file in files:
            print("check file: ", file)
            if not file.endswith(_WORD_EXTENSIONS):
                continue
            print(f"Processing {file}")
            word_file_path = os.path.join(root, file)
            # NOTE: the output name uses only the file name, so two documents
            # with the same name in different sub-directories write to the
            # same .txt file (the later one wins).
            word_to_txt(word_file_path, os.path.join(ocr_path, file + ".txt"))
            print(f"Done {file}")
    print("Done")


if __name__ == "__main__":
    main()