python
/
self-constructing_graph


			
							123456789101112131415161718192021222324252627282930313233343536373839404142
							
import os,sys

# Append the current working directory to the module search path so that
# imports can also be resolved from the directory the script is launched in.
current_path = os.getcwd()
sys.path.append(current_path)

from spire.doc import *
from spire.doc.common import *

def word_to_txt(word_file_path: str, txt_file_path: str) -> None:
    """Convert a Word document to a plain-text file.

    Loads ``word_file_path`` into a ``Document`` and saves it to
    ``txt_file_path`` using ``FileFormat.Txt``.

    Args:
        word_file_path: Path of the Word file to load.
        txt_file_path: Path of the text file to write.

    Any exception raised while loading or saving propagates to the caller;
    the document is closed in either case.
    """
    doc = Document()
    try:
        # Load the Word document and write it back out as plain text.
        doc.LoadFromFile(word_file_path)
        doc.SaveToFile(txt_file_path, FileFormat.Txt)
    finally:
        # Always release the document, even when loading or saving fails,
        # so a bad input file does not leave it open.
        doc.Close()


if __name__ == "__main__":
    # Command-line entry point: convert every Word file found under
    # <path_of_job>/upload into a text file in <path_of_job>/ocr_output.
    if len(sys.argv) != 2:
        print("Usage: python standard_word_extractor.py <path_of_job>")
        sys.exit(-1)

    job_path = sys.argv[1]
    if not os.path.exists(job_path):
        print(f"job path not exists: {job_path}")
        sys.exit(-1)

    upload_path = os.path.join(job_path, "upload")
    if not os.path.exists(upload_path):
        print(f"OCR path not exists: {upload_path}")
        sys.exit(-1)

    ocr_path = os.path.join(job_path, "ocr_output")
    os.makedirs(ocr_path, exist_ok=True)

    print("start scan")
    for root, _dirs, files in os.walk(upload_path):
        for file in files:
            print("check file: ", file)
            # Word creates "~$name.docx" owner/lock files next to documents
            # that are open; they carry a Word extension but are not real
            # documents, so skip them instead of failing the whole scan.
            if file.startswith("~$"):
                continue
            # Compare the extension case-insensitively so that files such
            # as "REPORT.DOCX" are converted too.
            if not file.lower().endswith((".doc", ".docx")):
                continue
            print(f"Processing {file}")
            word_file_path = os.path.join(root, file)
            # NOTE: outputs are written flat into ocr_path, so two files
            # with the same name in different sub-directories of upload
            # will overwrite each other's text output.
            word_to_txt(word_file_path, os.path.join(ocr_path, file + ".txt"))
            print(f"Done {file}")
    print("Done")