# elasticsearch_clean.py
import json
from typing import List, Dict

from elasticsearch import Elasticsearch

from libs.text_processor import TextProcessor
from utils.es import ElasticsearchOperations
  6. def clean_law_document(ops):
  7. json_data = None
  8. with open(r"D:\work\03\regulations.json","r",encoding="utf-8") as f:
  9. lines = f.readlines()
  10. json_data = json.loads(''.join(lines))
  11. print(">>> finished process document ")
  12. if json_data:
  13. processor = TextProcessor()
  14. index = 1
  15. total = len(json_data)
  16. for item in json_data:
  17. es_ops.del_document(item["article_text"],item["meta_data"]['ArticleTitle'])
  18. print(item["meta_data"]['ArticleTitle'],f"processed {index}/{total}")
  19. index = index + 1
  20. # 使用示例
  21. if __name__ == "__main__":
  22. es_ops = ElasticsearchOperations()
  23. es_ops.es.indices.delete(index="text_chunks", ignore=[400, 404])
  24. print(">>> finished delete index")
  25. #clean_law_document(es_ops)