# standard_word_extractor.py

  1. import os,sys
  2. current_path = os.getcwd()
  3. sys.path.append(current_path)
  4. from spire.doc import *
  5. from spire.doc.common import *
  6. def word_to_txt(word_file_path, txt_file_path):
  7. # 加载Word文档
  8. doc = Document()
  9. doc.LoadFromFile(word_file_path)
  10. doc.SaveToFile(txt_file_path, FileFormat.Txt)
  11. doc.Close()
  12. if __name__ == "__main__":
  13. if len(sys.argv) != 2:
  14. print("Usage: python standard_word_extractor.py <path_of_job>")
  15. sys.exit(-1)
  16. job_path = sys.argv[1]
  17. if not os.path.exists(job_path):
  18. print(f"job path not exists: {job_path}")
  19. sys.exit(-1)
  20. upload_path = os.path.join(job_path,"upload")
  21. if not os.path.exists(upload_path):
  22. print(f"OCR path not exists: {upload_path}")
  23. sys.exit(-1)
  24. ocr_path = os.path.join(job_path,"ocr_output")
  25. os.makedirs(ocr_path ,exist_ok=True)
  26. print("start scan")
  27. for root,dirs,files in os.walk(upload_path):
  28. for file in files:
  29. print("check file: ", file)
  30. if file.endswith(".doc") or file.endswith(".docx"):
  31. print(f"Processing {file}")
  32. word_file_path = os.path.join(root,file)
  33. word_to_txt(word_file_path, os.path.join(ocr_path,file+".txt"))
  34. print(f"Done {file}")
  35. print("Done")