1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- from utils.es import ElasticsearchOperations
- import asyncio
- import os
- import time
- from utils.file import load_file
- import json
- from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the settings
# below are read (presumably from a local .env file; see load_dotenv docs).
load_dotenv()

# Index names handed to ElasticsearchOperations.add_document by the importer
# class. Each one is None when its environment variable is not set.
WORD_INDEX = os.environ.get("WORD_INDEX")
TITLE_INDEX = os.environ.get("TITLE_INDEX")
CHUNC_INDEX = os.environ.get("CHUNC_INDEX")
class ImportChunc:
    """Push pre-computed JSON data from a directory into Elasticsearch.

    The directory is expected to contain per-article JSON files (named like
    ``f155479939c4563f036993d200a1d0d2.json``) and, for the word index, the
    two files ``title_reverse_index.json`` and ``words_vector.json``.

    Attributes:
        es: The ``ElasticsearchOperations`` client, or ``None`` when the
            client could not be constructed.
        data_dir: Directory that is scanned for input files.
    """

    # Article files are recognised purely by having the same name length as
    # this sample (a 32-character id followed by ".json").
    _ARTICLE_FILE_SAMPLE = "f155479939c4563f036993d200a1d0d2.json"

    def __init__(self, data_dir: str):
        """Create the Elasticsearch client and remember the data directory.

        A failure to construct the client is printed rather than raised, so
        the object can still be created; ``self.es`` is then ``None`` and any
        later attempt to index a document fails with ``AttributeError``.

        Args:
            data_dir: Directory containing the JSON files to import.
        """
        try:
            self.es = ElasticsearchOperations()
        except Exception as e:
            print(e)
            # Keep the attribute defined so the failure mode is predictable.
            self.es = None
        self.data_dir = data_dir

    def _iter_article_files(self):
        """Yield ``(file_name, parsed_json)`` for every article file in data_dir."""
        expected_len = len(self._ARTICLE_FILE_SAMPLE)
        for file in os.listdir(self.data_dir):
            if len(file) != expected_len:
                continue
            print(f">>> process {file}")
            path = os.path.join(self.data_dir, file)
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
            yield file, data

    def import_chunc_reverse_index(self) -> None:
        """Index every text chunk of every article file into ``CHUNC_INDEX``.

        Files lacking either the ``chuncs`` or the ``chuncs_vector`` key are
        skipped. Each chunk is stored with the article title, the chunk text
        and the vector found at the same position in ``chuncs_vector``.
        """
        print(">>> import chunc reverse index")
        for file, data in self._iter_article_files():
            if "chuncs" not in data or "chuncs_vector" not in data:
                continue
            chuncs = data["chuncs"]
            chunc_vectors = data["chuncs_vector"]
            if len(chuncs) != len(chunc_vectors):
                # Only positions present in both lists can be paired up.
                print(
                    f">>> {file}: {len(chuncs)} chuncs but "
                    f"{len(chunc_vectors)} vectors, importing matching pairs only"
                )
            for index, (chunc, chunc_vector) in enumerate(zip(chuncs, chunc_vectors)):
                print(f">>> process {file} chunc {index}")
                doc = {
                    "title": data["title"],
                    "text": chunc,
                    "embedding": chunc_vector,
                }
                self.es.add_document(CHUNC_INDEX, self.es.get_doc_id(chunc), doc)

    def import_title_reverse_index(self) -> None:
        """Index the title of every article file into ``TITLE_INDEX``.

        Files lacking either the ``title`` or the ``title_vector`` key are
        skipped. The title is stored both as ``title`` and as ``text``.
        """
        print(">>> import title reverse index")
        for file, data in self._iter_article_files():
            if "title" not in data or "title_vector" not in data:
                continue
            title = data["title"]
            print(f">>> process {file} title: {title}")
            doc = {
                "title": title,
                "text": title,
                "embedding": data["title_vector"],
            }
            self.es.add_document(TITLE_INDEX, self.es.get_doc_id(title), doc)

    def import_word_reverse_index(self) -> None:
        """Index every word of ``title_reverse_index.json`` into ``WORD_INDEX``.

        Each entry of the reverse index is sent as loaded, with two fields
        added: ``word`` (the key) and ``embedding`` (the matching entry of
        ``words_vector.json``). Words with a missing or empty vector are
        reported as failed and skipped. If either input file is absent, a
        message is printed and nothing is imported.
        """
        print(">>> import word reverse index")

        filename = os.path.join(self.data_dir, "title_reverse_index.json")
        vectors_file = os.path.join(self.data_dir, "words_vector.json")
        if not (os.path.exists(filename) and os.path.exists(vectors_file)):
            print(f"{filename} or {vectors_file} was not found")
            return

        with open(vectors_file, "r", encoding="utf-8") as f:
            vectors = json.load(f)
        with open(filename, "r", encoding="utf-8") as f:
            data = json.load(f)

        for key, entry in data.items():
            # .get() so a word without a vector takes the "failed" path
            # instead of aborting the whole import with KeyError.
            vector = vectors.get(key)
            print(f">>> process {key}", end="")
            if not vector:
                print(" failed")
                continue
            entry["word"] = key
            entry["embedding"] = vector
            self.es.add_document(WORD_INDEX, self.es.get_doc_id(key), entry)
            print(" ok")
-
|