123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120 |
from typing import List, Dict
from libs.chunc_helper import ChuncHelper
import json
import os
from dotenv import load_dotenv
import sys

# Pull configuration from a local .env file into the process environment.
load_dotenv()

# Path configuration.
# NOTE(review): the original comment here said "DeepSeek API config", but
# every value below is a document/index path — confirm the comment was stale.
TRUNC_OUTPUT_PATH = os.getenv("TRUNC_OUTPUT_PATH")  # output dir for chunk/vector artifacts (ChuncHelper output_dir)
DOC_STORAGE_PATH = os.getenv("DOC_STORAGE_PATH")    # dir where extracted article text files are written
DOC_PATH = os.getenv("DOC_PATH")                    # source JSON corpus file (list of articles)
JIEBA_USER_DICT = os.getenv("JIEBA_USER_DICT")      # custom jieba user dictionary for tokenization
def title_reverse_index():
    """Build the token -> article-title reverse index.

    Tokenizes article titles (with vectors) and then writes the
    reverse index, both delegated to ChuncHelper.
    """
    helper = ChuncHelper(
        data_file=DOC_PATH,
        output_dir=TRUNC_OUTPUT_PATH,
        user_dict=JIEBA_USER_DICT,
    )
    helper.cut_title_vector()
    helper.title_reverse_index()
def embed_doc():
    """Generate embeddings for the corpus via ChuncHelper.process_data()."""
    helper = ChuncHelper(
        data_file=DOC_PATH,
        output_dir=TRUNC_OUTPUT_PATH,
        user_dict=JIEBA_USER_DICT,
    )
    helper.process_data()
def extract_law_document():
    """Extract each article from the JSON corpus into its own text file.

    Reads the article list from DOC_PATH and, for every article, writes
    ``<sanitized-title>.txt`` under DOC_STORAGE_PATH containing a
    ```meta``` header block followed by the article body in a ```doc```
    block.
    """
    # json.load parses the stream directly; no need for readlines() + join.
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished process document ")

    if not json_data:
        return

    for index, item in enumerate(json_data, start=1):
        meta = item["meta_data"]
        # Replace path separators so the title is a safe file name.
        filename = meta["ArticleTitle"].replace("\\", "-").replace("/", "-")
        # BUG FIX: the original wrote every article to the same literal
        # "(unknown).txt" path (the computed `filename` was never used),
        # so each article clobbered the previous one.
        with open(f"{DOC_STORAGE_PATH}/{filename}.txt", "w", encoding="utf-8") as f:
            f.write("```meta\n")
            f.write(f"标题: {meta['ArticleTitle']}\n")
            f.write(f"作者: {meta['author']}\n")
            f.write(f"描述: {meta['description']}\n")
            f.write(f"关键字: {meta['keywords']}\n")
            f.write(f"类型: {meta['ColumnType']}\n")
            f.write(f"发布日期: {meta['PubDate']}\n")
            f.write("```\n")
            f.write("\n")
            f.write("```doc\n")
            f.write(item["article_text"])
            f.write("```\n")
        print(meta["ArticleTitle"], f"processed {index}")
-
def extract_law_document_single():
    """Concatenate every article from the JSON corpus into one file.

    Reads the article list from DOC_PATH and writes all articles into
    DOC_STORAGE_PATH/single.txt, each as a ```doc``` block with metadata
    header lines, the source URL, and the article body.
    """
    # json.load parses the stream directly; no need for readlines() + join.
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished process document ")

    if not json_data:
        return

    with open(f"{DOC_STORAGE_PATH}/single.txt", "w", encoding="utf-8") as f:
        for index, item in enumerate(json_data, start=1):
            meta = item["meta_data"]
            # (Dead code removed: the original also computed a sanitized
            # `filename` per article and a dozen unused metadata locals.)
            f.write("```doc\n")
            f.write(f"标题: {meta['ArticleTitle']}\n")
            f.write(f"作者: {meta['author']}\n")
            f.write(f"描述: {meta['description']}\n")
            f.write(f"关键字: {meta['keywords']}\n")
            f.write(f"类型: {meta['ColumnType']}\n")
            f.write(f"发布日期: {meta['PubDate']}\n")
            f.write(f"原文链接: {item['url']}\n")
            f.write("\n")
            f.write(item["article_text"])
            f.write("```\n")
            print(meta["ArticleTitle"], f"processed {index}")
# Usage example
if __name__ == "__main__":
    # CLI actions:
    #   extract_single - extract all articles from the JSON file into a single file
    #   extract        - extract articles from the JSON file into per-article files
    #   embed          - read articles and generate keyword/title/chunk vectors
    #   title          - tokenize titles and build the token -> title reverse
    #                    index plus the token vectors
    actions = {
        "extract_single": extract_law_document_single,
        "extract": extract_law_document,
        "embed": embed_doc,
        "title": title_reverse_index,
    }
    # Exactly one argument is expected; unknown actions are silently ignored,
    # matching the original behavior.
    if len(sys.argv) == 2:
        handler = actions.get(sys.argv[1])
        if handler is not None:
            handler()
|