# elasticsearch_clean.py
import json
from typing import List, Dict

from elasticsearch import Elasticsearch

from libs.text_processor import TextProcessor
from utils.es import ElasticsearchOperations
  6. def clean_law_document(ops):
  7. json_data = None
  8. with open(r"D:\work\03\regulations.json","r",encoding="utf-8") as f:
  9. lines = f.readlines()
  10. json_data = json.loads(''.join(lines))
  11. print(">>> finished process document ")
  12. if json_data:
  13. processor = TextProcessor()
  14. index = 1
  15. total = len(json_data)
  16. for item in json_data:
  17. es_ops.del_document(item["article_text"],item["meta_data"]['ArticleTitle'])
  18. print(item["meta_data"]['ArticleTitle'],f"processed {index}/{total}")
  19. index = index + 1
  20. # 使用示例
  21. if __name__ == "__main__":
  22. es_ops = ElasticsearchOperations()
  23. es_ops.es.indices.delete(index="text_chunks", ignore=[400, 404])
  24. print(">>> finished delete index")
  25. #clean_law_document(es_ops)