build_es_index.py 3.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. from typing import List, Dict
  2. from libs.import_chunc import ImportChunc
  3. from libs.chunc_helper import ChuncHelper
  4. import json
  5. import sys
  6. import os
  7. from dotenv import load_dotenv
  8. from utils.es import ElasticsearchOperations
  9. from utils.factors import FactorsHelper
  # Pull settings from a .env file into the process environment before reading them.
  load_dotenv()

  # Data locations and Elasticsearch index names, all taken from the environment.
  # Each constant is None when its variable is not set.
  TRUNC_OUTPUT_PATH = os.environ.get("TRUNC_OUTPUT_PATH")
  DOC_STORAGE_PATH = os.environ.get("DOC_STORAGE_PATH")
  DOC_PATH = os.environ.get("DOC_PATH")
  JIEBA_USER_DICT = os.environ.get("JIEBA_USER_DICT")
  WORD_INDEX = os.environ.get("WORD_INDEX")
  TITLE_INDEX = os.environ.get("TITLE_INDEX")
  CHUNC_INDEX = os.environ.get("CHUNC_INDEX")
  def build_test():
      """Run the chunk reverse-index import over ``TRUNC_OUTPUT_PATH``.

      Creates an ``ImportChunc`` for the configured output directory and calls
      its ``import_chunc_reverse_index`` method.
      """
      ImportChunc(data_dir=TRUNC_OUTPUT_PATH).import_chunc_reverse_index()
  def build_index():
      """Import the word and title reverse indexes from ``TRUNC_OUTPUT_PATH``.

      The word import runs first, then the title import, both on the same
      ``ImportChunc`` instance.
      """
      importer = ImportChunc(data_dir=TRUNC_OUTPUT_PATH)
      # The chunk reverse index is intentionally not imported here; the call
      # was disabled in this function and is made by build_chunc() instead.
      for run_import in (
          importer.import_word_reverse_index,
          importer.import_title_reverse_index,
      ):
          run_import()
  def build_chunc():
      """Import only the chunk reverse index from ``TRUNC_OUTPUT_PATH``."""
      importer = ImportChunc(data_dir=TRUNC_OUTPUT_PATH)
      importer.import_chunc_reverse_index()
  def delete_index():
      """Delete every index this script manages.

      Prints a banner, then asks ``ElasticsearchOperations`` to delete, in
      order: the word index, the title index, the chunk index, and the index
      literally named ``"text_chunks"``.
      """
      es_ops = ElasticsearchOperations()
      print(">>> delete index")
      # Order matches the original sequence of individual calls.
      targets = (WORD_INDEX, TITLE_INDEX, CHUNC_INDEX, "text_chunks")
      for index_name in targets:
          es_ops.delete_index(index_name)
  37. from functions.basic_function import search_document
  def test_index():
      """Run one sample question through ``search_document`` and print the result.

      This is a manual smoke check. Any exception raised while searching
      propagates to the caller unchanged; the former ``except Exception as e:
      raise e`` wrapper did nothing but re-raise and has been removed.

      A commented-out manual walkthrough (separate word/title/chunk index
      queries whose scores were combined through ``FactorsHelper``) was dead
      code and has been removed as well.
      """
      # NOTE(review): the instance is not used below. Construction is kept in
      # case it has side effects (e.g. connection setup) that this check relied
      # on -- confirm against ElasticsearchOperations and drop if it has none.
      ElasticsearchOperations()

      question = "银行销售保险产品的规定"
      print(search_document(question))
  # Command-line entry point: run as ``python build_es_index.py <action>``.
  if __name__ == "__main__":
      # Maps the single command-line argument to the function it triggers.
      actions = {
          "test": test_index,
          "build": build_index,
          "delete": delete_index,
          "chunc": build_chunc,
      }
      # Exactly one argument is expected; anything else selects no handler.
      handler = actions.get(sys.argv[1]) if len(sys.argv) == 2 else None
      if handler is None:
          # A bad invocation used to do nothing at all, which hid typos in the
          # action name. Report the accepted actions on stderr; the exit status
          # is left unchanged so existing callers keep working.
          choices = "|".join(actions)
          print(f"usage: {sys.argv[0]} <{choices}>", file=sys.stderr)
      else:
          handler()