extract_doc_from_json.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. from typing import List, Dict
  2. from libs.chunc_helper import ChuncHelper
  3. import json
  4. import os
  5. from dotenv import load_dotenv
  6. import sys
# Load variables from a .env file (if one is present) into the process
# environment before the os.getenv() calls below read them.
load_dotenv()
# Filesystem locations used by this script, taken from environment variables.
# os.getenv() returns None for any variable that is not set.
TRUNC_OUTPUT_PATH = os.getenv("TRUNC_OUTPUT_PATH")  # passed to ChuncHelper as output_dir
DOC_STORAGE_PATH = os.getenv("DOC_STORAGE_PATH")  # directory the extracted .txt files are written to
DOC_PATH = os.getenv("DOC_PATH")  # JSON data file that is read as input
JIEBA_USER_DICT = os.getenv("JIEBA_USER_DICT")  # passed to ChuncHelper as user_dict
  13. def title_reverse_index():
  14. chunc_helper = ChuncHelper(data_file=DOC_PATH, output_dir=TRUNC_OUTPUT_PATH, user_dict=JIEBA_USER_DICT)
  15. chunc_helper.cut_title_vector()
  16. chunc_helper.title_reverse_index()
  17. def embed_doc():
  18. chunc_helper = ChuncHelper(data_file=DOC_PATH, output_dir=TRUNC_OUTPUT_PATH, user_dict=JIEBA_USER_DICT)
  19. chunc_helper.process_data()
  20. def extract_law_document():
  21. json_data = None
  22. with open(DOC_PATH,"r",encoding="utf-8") as f:
  23. lines = f.readlines()
  24. json_data = json.loads(''.join(lines))
  25. print(">>> finished process document ")
  26. if json_data:
  27. index = 1
  28. for item in json_data:
  29. author = item["meta_data"]["author"]
  30. description = item["meta_data"]["description"]
  31. keywords = item["meta_data"]["keywords"]
  32. SiteName = item["meta_data"]["SiteName"]
  33. SiteDomain = item["meta_data"]["SiteDomain"]
  34. SiteIDCode = item["meta_data"]["SiteIDCode"]
  35. ColumnName = item["meta_data"]["ColumnName"]
  36. ColumnType = item["meta_data"]["ColumnType"]
  37. ArticleTitle = item["meta_data"]["ArticleTitle"]
  38. PubDate = item["meta_data"]["PubDate"]
  39. ContentSource = item["meta_data"]["ContentSource"]
  40. article_text = item["article_text"]
  41. filename = ArticleTitle.replace("\\", "-")
  42. filename = filename.replace("/", "-")
  43. with open(f"{DOC_STORAGE_PATH}/{filename}.txt", "w", encoding="utf-8") as f:
  44. f.write("```meta\n")
  45. f.write(f"标题: {ArticleTitle}\n")
  46. f.write(f"作者: {author}\n")
  47. f.write(f"描述: {description}\n")
  48. f.write(f"关键字: {keywords}\n")
  49. f.write(f"类型: {ColumnType}\n")
  50. f.write(f"发布日期: {PubDate}\n")
  51. f.write("```\n")
  52. f.write("\n")
  53. f.write("```doc\n")
  54. f.write(article_text)
  55. f.write("```\n")
  56. print(item["meta_data"]['ArticleTitle'],f"processed {index}")
  57. index = index + 1
  58. def extract_law_document_single():
  59. json_data = None
  60. with open(DOC_PATH,"r",encoding="utf-8") as f:
  61. lines = f.readlines()
  62. json_data = json.loads(''.join(lines))
  63. print(">>> finished process document ")
  64. if json_data:
  65. index = 1
  66. with open(f"{DOC_STORAGE_PATH}/single.txt", "w", encoding="utf-8") as f:
  67. for item in json_data:
  68. url = item["url"]
  69. author = item["meta_data"]["author"]
  70. description = item["meta_data"]["description"]
  71. keywords = item["meta_data"]["keywords"]
  72. SiteName = item["meta_data"]["SiteName"]
  73. SiteDomain = item["meta_data"]["SiteDomain"]
  74. SiteIDCode = item["meta_data"]["SiteIDCode"]
  75. ColumnName = item["meta_data"]["ColumnName"]
  76. ColumnType = item["meta_data"]["ColumnType"]
  77. ArticleTitle = item["meta_data"]["ArticleTitle"]
  78. PubDate = item["meta_data"]["PubDate"]
  79. ContentSource = item["meta_data"]["ContentSource"]
  80. article_text = item["article_text"]
  81. filename = ArticleTitle.replace("\\", "-")
  82. filename = filename.replace("/", "-")
  83. f.write("```doc\n")
  84. f.write(f"标题: {ArticleTitle}\n")
  85. f.write(f"作者: {author}\n")
  86. f.write(f"描述: {description}\n")
  87. f.write(f"关键字: {keywords}\n")
  88. f.write(f"类型: {ColumnType}\n")
  89. f.write(f"发布日期: {PubDate}\n")
  90. f.write(f"原文链接: {url}\n")
  91. f.write("\n")
  92. f.write(article_text)
  93. f.write("```\n")
  94. print(item["meta_data"]['ArticleTitle'],f"processed {index}")
  95. index = index + 1
  96. # 使用示例
  97. if __name__ == "__main__":
  98. count_of_param = len(sys.argv)
  99. if count_of_param == 2:
  100. action = sys.argv[1]
  101. if action == "extract_single":
  102. #从json数据文件中抽取文章内容,写入数据目录
  103. extract_law_document_single()
  104. if action == "extract":
  105. #从json数据文件中抽取文章内容,写入数据目录
  106. extract_law_document()
  107. if action == "embed":
  108. #从json文件中读取文章,生成关键词向量,标题向量,chuncs和向量
  109. embed_doc()
  110. if action == "title":
  111. #从json文件中读取文章标题,切词,生成切词到文章标题的倒排索引,以及切词对应的向量
  112. title_reverse_index()
  113. #embed_doc()