"""Import preprocessed article JSON files (chunks, titles, word reverse index)
into Elasticsearch indexes configured via environment variables."""
from utils.es import ElasticsearchOperations
import asyncio
import os
import time
from utils.file import load_file
import json
from dotenv import load_dotenv

load_dotenv()

# Target index names, read from the environment (.env file).
WORD_INDEX = os.getenv("WORD_INDEX")
TITLE_INDEX = os.getenv("TITLE_INDEX")
CHUNC_INDEX = os.getenv("CHUNC_INDEX")

# Article files are named "<32-hex-digit hash>.json"; directory entries are
# filtered by matching this sample's length (original convention, preserved).
_ARTICLE_NAME_SAMPLE = "f155479939c4563f036993d200a1d0d2.json"


class ImportChunc:
    """Walks a data directory of per-article JSON files and indexes their
    chunks, titles, and word reverse-index entries into Elasticsearch."""

    def __init__(self, data_dir: str):
        """Connect to Elasticsearch and remember the data directory.

        Raises whatever ElasticsearchOperations() raises on connection
        failure. (Previously the exception was swallowed after printing,
        leaving ``self.es`` unset and causing an AttributeError on first
        use — now we log and re-raise.)
        """
        try:
            self.es = ElasticsearchOperations()
        except Exception as e:
            print(e)
            raise
        self.data_dir = data_dir

    def import_chunc_reverse_index(self):
        """Index every article chunk with its embedding into CHUNC_INDEX.

        Expects each article JSON to contain parallel lists "chuncs" and
        "chuncs_vector" plus a "title". Files whose names do not match the
        article-hash naming pattern are skipped.
        """
        # Fixed log message: previously said "title" due to copy-paste.
        print(">>> import chunc reverse index")
        for file in os.listdir(self.data_dir):
            if len(file) != len(_ARTICLE_NAME_SAMPLE):
                continue
            print(f">>> process {file}")
            with open(f"{self.data_dir}/{file}", "r", encoding="utf-8") as f:
                data = json.load(f)
            if "chuncs" in data and "chuncs_vector" in data:
                # zip pairs each chunk with its vector; if the lists are
                # unequal length the surplus is skipped instead of raising
                # IndexError as the old manual index did.
                for index, (chunc, chunc_vector) in enumerate(
                        zip(data["chuncs"], data["chuncs_vector"])):
                    print(f">>> process {file} chunc {index}")
                    doc = {
                        "title": data["title"],
                        "text": chunc,
                        "embedding": chunc_vector,
                    }
                    self.es.add_document(
                        CHUNC_INDEX, self.es.get_doc_id(chunc), doc)

    def import_title_reverse_index(self):
        """Index each article's title and its embedding into TITLE_INDEX.

        Expects each article JSON to contain "title" and "title_vector".
        """
        print(">>> import title reverse index")
        for file in os.listdir(self.data_dir):
            if len(file) != len(_ARTICLE_NAME_SAMPLE):
                continue
            print(f">>> process {file}")
            with open(f"{self.data_dir}/{file}", "r", encoding="utf-8") as f:
                data = json.load(f)
            if "title" in data and "title_vector" in data:
                title = data["title"]
                print(f">>> process {file} title: {title}")
                doc = {
                    "title": title,
                    "text": title,
                    "embedding": data["title_vector"],
                }
                self.es.add_document(
                    TITLE_INDEX, self.es.get_doc_id(title), doc)

    def import_word_reverse_index(self):
        """Index word entries (word, embedding, articles) into WORD_INDEX.

        Reads ``title_reverse_index.json`` (word -> entry dict) and
        ``words_vector.json`` (word -> embedding) from the data directory.
        Words with no/empty vector are reported as "failed" and skipped.
        """
        # Fixed log message: previously said "title" due to copy-paste.
        print(">>> import word reverse index")
        filename = f"{self.data_dir}/title_reverse_index.json"
        vectors_file = f"{self.data_dir}/words_vector.json"
        if not (os.path.exists(filename) and os.path.exists(vectors_file)):
            # Fixed: the old message interpolated an unbound name ``vectors``
            # (NameError on this path); report the two paths actually checked.
            print(f"{filename} and {vectors_file} was not found")
            return
        with open(vectors_file, "r", encoding="utf-8") as f:
            vectors = json.load(f)
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)
        for key, entry in data.items():
            # .get() instead of vectors[key]: a word missing from the vectors
            # file now takes the "failed" branch instead of raising KeyError.
            vector = vectors.get(key)
            print(f">>> process {key}", end="")
            if vector:
                entry["word"] = key
                entry["embedding"] = vector
                # NOTE: removed a no-op bare expression `data[key]["articles"]`
                # that the original had here; "articles" stays in the entry
                # and is indexed along with the rest of the document.
                self.es.add_document(
                    WORD_INDEX, self.es.get_doc_id(key), entry)
                print(" ok")
            else:
                print(" failed")