from libs.chunc_helper import ChuncHelper
import json
import os
from dotenv import load_dotenv
import sys

load_dotenv()

# Path configuration loaded from .env
TRUNC_OUTPUT_PATH = os.getenv("TRUNC_OUTPUT_PATH")
DOC_STORAGE_PATH = os.getenv("DOC_STORAGE_PATH")
DOC_PATH = os.getenv("DOC_PATH")
JIEBA_USER_DICT = os.getenv("JIEBA_USER_DICT")


def title_reverse_index():
    # Tokenize article titles and build a reverse index from tokens to article
    # titles, plus vectors for the tokens.
    chunc_helper = ChuncHelper(data_file=DOC_PATH, output_dir=TRUNC_OUTPUT_PATH,
                               user_dict=JIEBA_USER_DICT)
    chunc_helper.cut_title_vector()
    chunc_helper.title_reverse_index()


def embed_doc():
    # Generate keyword vectors, title vectors, chunks and chunk vectors.
    chunc_helper = ChuncHelper(data_file=DOC_PATH, output_dir=TRUNC_OUTPUT_PATH,
                               user_dict=JIEBA_USER_DICT)
    chunc_helper.process_data()


def extract_law_document():
    # Extract every article from the JSON data file and write each one to its
    # own file under DOC_STORAGE_PATH.
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished loading document")
    if json_data:
        for index, item in enumerate(json_data, start=1):
            # Unpack metadata fields; SiteName, SiteDomain, SiteIDCode,
            # ColumnName and ContentSource are read but not written out below.
            author = item["meta_data"]["author"]
            description = item["meta_data"]["description"]
            keywords = item["meta_data"]["keywords"]
            SiteName = item["meta_data"]["SiteName"]
            SiteDomain = item["meta_data"]["SiteDomain"]
            SiteIDCode = item["meta_data"]["SiteIDCode"]
            ColumnName = item["meta_data"]["ColumnName"]
            ColumnType = item["meta_data"]["ColumnType"]
            ArticleTitle = item["meta_data"]["ArticleTitle"]
            PubDate = item["meta_data"]["PubDate"]
            ContentSource = item["meta_data"]["ContentSource"]
            article_text = item["article_text"]
            # Sanitize the title so it can be used as a filename.
            filename = ArticleTitle.replace("\\", "-").replace("/", "-")
            with open(f"{DOC_STORAGE_PATH}/{filename}.txt", "w", encoding="utf-8") as f:
                f.write("```meta\n")
                f.write(f"标题: {ArticleTitle}\n")
                f.write(f"作者: {author}\n")
                f.write(f"描述: {description}\n")
                f.write(f"关键字: {keywords}\n")
                f.write(f"类型: {ColumnType}\n")
                f.write(f"发布日期: {PubDate}\n")
                f.write("```\n")
                f.write("\n")
                f.write("```doc\n")
                f.write(article_text)
                f.write("```\n")
            print(ArticleTitle, f"processed {index}")


def extract_law_document_single():
    # Extract every article from the JSON data file and write them all into a
    # single file, DOC_STORAGE_PATH/single.txt.
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished loading document")
    if json_data:
        with open(f"{DOC_STORAGE_PATH}/single.txt", "w", encoding="utf-8") as f:
            for index, item in enumerate(json_data, start=1):
                # Unpack metadata fields; SiteName, SiteDomain, SiteIDCode,
                # ColumnName and ContentSource are read but not written out below.
                url = item["url"]
                author = item["meta_data"]["author"]
                description = item["meta_data"]["description"]
                keywords = item["meta_data"]["keywords"]
                SiteName = item["meta_data"]["SiteName"]
                SiteDomain = item["meta_data"]["SiteDomain"]
                SiteIDCode = item["meta_data"]["SiteIDCode"]
                ColumnName = item["meta_data"]["ColumnName"]
                ColumnType = item["meta_data"]["ColumnType"]
                ArticleTitle = item["meta_data"]["ArticleTitle"]
                PubDate = item["meta_data"]["PubDate"]
                ContentSource = item["meta_data"]["ContentSource"]
                article_text = item["article_text"]
                f.write("```doc\n")
                f.write(f"标题: {ArticleTitle}\n")
                f.write(f"作者: {author}\n")
                f.write(f"描述: {description}\n")
                f.write(f"关键字: {keywords}\n")
                f.write(f"类型: {ColumnType}\n")
                f.write(f"发布日期: {PubDate}\n")
                f.write(f"原文链接: {url}\n")
                f.write("\n")
                f.write(article_text)
                f.write("```\n")
                print(ArticleTitle, f"processed {index}")
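
# For reference, a minimal sketch of one input record as the functions above
# expect it (DOC_PATH holds a JSON array of such objects). Every value below is
# a hypothetical placeholder, not real data; the constant itself is purely
# illustrative and unused by the script.
SAMPLE_RECORD = {
    "url": "https://example.org/some-article",
    "meta_data": {
        "author": "<author>",
        "description": "<description>",
        "keywords": "<keywords>",
        "SiteName": "<site name>",
        "SiteDomain": "<site domain>",
        "SiteIDCode": "<site ID code>",
        "ColumnName": "<column name>",
        "ColumnType": "<column type>",
        "ArticleTitle": "<article title>",
        "PubDate": "<publication date>",
        "ContentSource": "<content source>",
    },
    "article_text": "<full article text>",
}
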
# Usage example
if __name__ == "__main__":
    if len(sys.argv) == 2:
        action = sys.argv[1]
        if action == "extract_single":
            # Extract article contents from the JSON data file into one combined file
            extract_law_document_single()
        elif action == "extract":
            # Extract article contents from the JSON data file, one file per article
            extract_law_document()
        elif action == "embed":
            # Read articles from the JSON file and generate keyword vectors,
            # title vectors, chunks and chunk vectors
            embed_doc()
        elif action == "title":
            # Read article titles from the JSON file, tokenize them, and build a
            # reverse index from tokens to titles, plus vectors for the tokens
            title_reverse_index()
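
# Example invocations (the script filename "extract.py" and all paths below are
# assumptions for illustration, not part of the project):
#
#   python extract.py extract_single   # all articles -> DOC_STORAGE_PATH/single.txt
#   python extract.py extract          # one .txt file per article
#   python extract.py embed            # chunks plus keyword/title vectors
#   python extract.py title            # token -> title reverse index and token vectors
#
# A matching .env sketch with placeholder paths:
#
#   DOC_PATH=./data/articles.json
#   DOC_STORAGE_PATH=./data/docs
#   TRUNC_OUTPUT_PATH=./data/chunks
#   JIEBA_USER_DICT=./data/user_dict.txt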