import_chunc.py 3.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. from utils.es import ElasticsearchOperations
  2. import asyncio
  3. import os
  4. import time
  5. from utils.file import load_file
  6. import json
  7. from dotenv import load_dotenv
  8. load_dotenv()
  9. WORD_INDEX = os.getenv("WORD_INDEX")
  10. TITLE_INDEX = os.getenv("TITLE_INDEX")
  11. CHUNC_INDEX = os.getenv("CHUNC_INDEX")
  12. class ImportChunc:
  13. def __init__(self, data_dir: str):
  14. try:
  15. self.es = ElasticsearchOperations()
  16. except Exception as e:
  17. print(e)
  18. self.data_dir = data_dir
  19. def import_chunc_reverse_index(self):
  20. print(">>> import title reverse index")
  21. sample = "f155479939c4563f036993d200a1d0d2.json"
  22. for file in os.listdir(self.data_dir):
  23. if len(file) == len(sample):
  24. print(f">>> process {file}")
  25. data = {}
  26. with open(f"{self.data_dir}/{file}", "r", encoding="utf-8") as f:
  27. data = json.load(f)
  28. if "chuncs" in data and "chuncs_vector" in data:
  29. index = 0
  30. for chunc in data["chuncs"]:
  31. print(f">>> process {file} chunc {index}")
  32. chunc_vector = data["chuncs_vector"][index]
  33. doc = { "title": data["title"],
  34. "text": chunc,
  35. "embedding": chunc_vector}
  36. self.es.add_document(CHUNC_INDEX, self.es.get_doc_id(chunc), doc)
  37. index += 1
  38. def import_title_reverse_index(self):
  39. print(">>> import title reverse index")
  40. sample = "f155479939c4563f036993d200a1d0d2.json"
  41. for file in os.listdir(self.data_dir):
  42. if len(file) == len(sample):
  43. print(f">>> process {file}")
  44. data = {}
  45. with open(f"{self.data_dir}/{file}", "r", encoding="utf-8") as f:
  46. data = json.load(f)
  47. if "title" in data and "title_vector" in data:
  48. title = data["title"]
  49. print(f">>> process {file} title: {title}")
  50. doc = { "title": data["title"],
  51. "text": data["title"],
  52. "embedding": data["title_vector"]}
  53. self.es.add_document(TITLE_INDEX, self.es.get_doc_id(data["title"]), doc)
  54. def import_word_reverse_index(self):
  55. print(">>> import title reverse index")
  56. filename = f"{self.data_dir}/title_reverse_index.json"
  57. vectors_file = f"{self.data_dir}/words_vector.json"
  58. if os.path.exists(filename) and os.path.exists(vectors_file):
  59. vectors = {}
  60. with open(vectors_file, "r", encoding="utf-8") as f:
  61. vectors = json.load(f)
  62. data = {}
  63. with open(filename, "r", encoding="utf-8") as f:
  64. data = json.load(f)
  65. for key in data.keys():
  66. vector = vectors[key]
  67. print(f">>> process {key}", end="")
  68. if vector:
  69. data[key]["word"] = key
  70. data[key]["embedding"] = vector
  71. data[key]["articles"]
  72. self.es.add_document(WORD_INDEX, self.es.get_doc_id(key), data[key])
  73. print(f" ok")
  74. else:
  75. print(" failed")
  76. else:
  77. print(f"{filename} and {vectors} was not found" )