123456789101112131415161718192021222324252627282930313233343536373839404142 |
- import os,sys
- current_path = os.getcwd()
- sys.path.append(current_path)
- from spire.doc import *
- from spire.doc.common import *
def word_to_txt(word_file_path, txt_file_path):
    """Convert a Word document to a plain-text file.

    Loads ``word_file_path`` into a ``Document`` and saves it to
    ``txt_file_path`` using ``FileFormat.Txt``.

    Args:
        word_file_path: Path of the Word file to load.
        txt_file_path: Path of the text file to write.

    Any exception raised while loading or saving propagates to the caller
    after the document has been closed.
    """
    doc = Document()
    try:
        # Load the Word document.
        doc.LoadFromFile(word_file_path)
        doc.SaveToFile(txt_file_path, FileFormat.Txt)
    finally:
        # Always close the document, even when loading or saving raises.
        doc.Close()
if __name__ == "__main__":
    # Batch-convert every Word file found under <job>/upload into a text file
    # in <job>/ocr_output. Expects exactly one argument: the job directory.
    if len(sys.argv) != 2:
        print("Usage: python standard_word_extractor.py <path_of_job>")
        sys.exit(-1)

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        print(f"job path not exists: {job_path}")
        sys.exit(-1)

    upload_path = os.path.join(job_path, "upload")
    if not os.path.exists(upload_path):
        # This check is on the upload directory, so the message names it.
        print(f"upload path not exists: {upload_path}")
        sys.exit(-1)

    ocr_path = os.path.join(job_path, "ocr_output")
    os.makedirs(ocr_path, exist_ok=True)

    print("start scan")
    for root, _dirs, files in os.walk(upload_path):
        for file in files:
            print("check file: ", file)
            # Compare on the lowercased name so ".DOC" / ".DOCX" are not
            # silently skipped.
            if not file.lower().endswith((".doc", ".docx")):
                continue
            print(f"Processing {file}")
            word_file_path = os.path.join(root, file)
            # NOTE: the output name is built from the bare file name, so
            # same-named files in different subdirectories of the upload
            # tree overwrite each other in ocr_output.
            word_to_txt(word_file_path, os.path.join(ocr_path, file + ".txt"))
            print(f"Done {file}")
    print("Done")
|