python
/
self-constructing_graph


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
							from typing import List, Dict
from libs.chunc_helper import ChuncHelper
import json
import os
from dotenv import load_dotenv
import sys
load_dotenv()

# Input/output path settings, read from environment variables (after load_dotenv())
TRUNC_OUTPUT_PATH = os.getenv("TRUNC_OUTPUT_PATH")
DOC_STORAGE_PATH = os.getenv("DOC_STORAGE_PATH")
DOC_PATH = os.getenv("DOC_PATH")
JIEBA_USER_DICT = os.getenv("JIEBA_USER_DICT")

def title_reverse_index():
    """Run the title pipeline of ``ChuncHelper`` on the configured data file.

    Builds a helper from the module-level settings (``DOC_PATH``,
    ``TRUNC_OUTPUT_PATH``, ``JIEBA_USER_DICT``) and calls
    ``cut_title_vector()`` followed by ``title_reverse_index()`` on it.
    Per the command-line notes in this file, this segments the article
    titles and produces a token-to-title inverted index plus token vectors;
    the details live in ``ChuncHelper``.
    """
    settings = {
        "data_file": DOC_PATH,
        "output_dir": TRUNC_OUTPUT_PATH,
        "user_dict": JIEBA_USER_DICT,
    }
    helper = ChuncHelper(**settings)
    helper.cut_title_vector()
    helper.title_reverse_index()
def embed_doc():
    """Run ``ChuncHelper.process_data()`` on the configured data file.

    The helper is built from the module-level settings (``DOC_PATH``,
    ``TRUNC_OUTPUT_PATH``, ``JIEBA_USER_DICT``). Per the command-line notes
    in this file, this step reads the articles and generates keyword
    vectors, title vectors, chunks and their vectors; the details live in
    ``ChuncHelper``.
    """
    ChuncHelper(
        data_file=DOC_PATH,
        output_dir=TRUNC_OUTPUT_PATH,
        user_dict=JIEBA_USER_DICT,
    ).process_data()
def extract_law_document():
    """Export every article of the JSON data file to its own text file.

    Loads the JSON document at ``DOC_PATH`` and, for each entry, writes
    ``<DOC_STORAGE_PATH>/<title>.txt``. Each file holds a fenced ``meta``
    section (title, author, description, keywords, type, publication date),
    a blank line, and a fenced ``doc`` section with the article text.
    Backslashes and forward slashes in the title are replaced with ``-`` to
    form the file name. Progress is printed after each article.

    Only the metadata fields that are actually written are required in an
    entry. The closing fence of the ``doc`` section always starts on its own
    line, even when the article text has no trailing newline.

    Raises:
        OSError: if the data file cannot be read or an output file cannot
            be written.
        json.JSONDecodeError: if the data file is not valid JSON.
        KeyError: if an entry lacks ``meta_data``, ``article_text`` or one
            of the written metadata fields.
    """
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished process document  ")

    if not json_data:
        return

    for index, item in enumerate(json_data, start=1):
        # Resolve every value before opening the output file so a malformed
        # entry does not leave an empty or truncated file behind.
        meta = item["meta_data"]
        title = meta["ArticleTitle"]
        author = meta["author"]
        description = meta["description"]
        keywords = meta["keywords"]
        column_type = meta["ColumnType"]
        pub_date = meta["PubDate"]
        article_text = item["article_text"]

        # Path separators in a title would otherwise be read as directories.
        filename = title.replace("\\", "-").replace("/", "-")

        with open(f"{DOC_STORAGE_PATH}/{filename}.txt", "w", encoding="utf-8") as out:
            out.write("```meta\n")
            out.write(f"标题: {title}\n")
            out.write(f"作者: {author}\n")
            out.write(f"描述: {description}\n")
            out.write(f"关键字: {keywords}\n")
            out.write(f"类型: {column_type}\n")
            out.write(f"发布日期: {pub_date}\n")
            out.write("```\n")
            out.write("\n")
            out.write("```doc\n")
            out.write(article_text)
            # Keep the closing fence on a line of its own.
            if article_text and not article_text.endswith("\n"):
                out.write("\n")
            out.write("```\n")
        print(title, f"processed {index}")
            

def extract_law_document_single():
    """Export every article of the JSON data file into one combined file.

    Loads the JSON document at ``DOC_PATH`` and writes
    ``<DOC_STORAGE_PATH>/single.txt``. Each entry becomes one fenced ``doc``
    section holding the title, author, description, keywords, type,
    publication date and source URL, then a blank line and the article
    text. Progress is printed after each article. When the data file holds
    no entries, no output file is created.

    Only the fields that are actually written are required in an entry. The
    closing fence of each section always starts on its own line, even when
    the article text has no trailing newline.

    Raises:
        OSError: if the data file cannot be read or the output file cannot
            be written.
        json.JSONDecodeError: if the data file is not valid JSON.
        KeyError: if an entry lacks ``url``, ``meta_data``,
            ``article_text`` or one of the written metadata fields.
    """
    with open(DOC_PATH, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    print(">>> finished process document  ")

    if not json_data:
        return

    with open(f"{DOC_STORAGE_PATH}/single.txt", "w", encoding="utf-8") as out:
        for index, item in enumerate(json_data, start=1):
            # Resolve every value first so a malformed entry fails before
            # a partial section is written.
            url = item["url"]
            meta = item["meta_data"]
            title = meta["ArticleTitle"]
            author = meta["author"]
            description = meta["description"]
            keywords = meta["keywords"]
            column_type = meta["ColumnType"]
            pub_date = meta["PubDate"]
            article_text = item["article_text"]

            out.write("```doc\n")
            out.write(f"标题: {title}\n")
            out.write(f"作者: {author}\n")
            out.write(f"描述: {description}\n")
            out.write(f"关键字: {keywords}\n")
            out.write(f"类型: {column_type}\n")
            out.write(f"发布日期: {pub_date}\n")
            out.write(f"原文链接: {url}\n")
            out.write("\n")
            out.write(article_text)
            # Keep the closing fence on a line of its own.
            if article_text and not article_text.endswith("\n"):
                out.write("\n")
            out.write("```\n")
            print(title, f"processed {index}")
# Command-line entry point: run as `python <script> <action>`.
if __name__ == "__main__":
    # Maps each supported action name to the function that implements it.
    actions = {
        # Extract the article contents from the JSON data file into a single
        # combined file in the data directory.
        "extract_single": extract_law_document_single,
        # Extract the article contents from the JSON data file into the data
        # directory, one file per article.
        "extract": extract_law_document,
        # Read the articles from the JSON file and generate keyword vectors,
        # title vectors, chunks and their vectors.
        "embed": embed_doc,
        # Read the article titles from the JSON file, segment them, and
        # generate the token-to-title inverted index plus the token vectors.
        "title": title_reverse_index,
    }

    requested = sys.argv[1] if len(sys.argv) == 2 else None
    handler = actions.get(requested)
    if handler is None:
        # A missing, extra or unknown argument used to be a silent no-op;
        # report the accepted actions and exit with a failure status.
        choices = "|".join(actions)
        sys.exit(f"usage: {sys.argv[0]} <{choices}>")
    handler()