1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283 |
- import sys,os
- current_path = os.getcwd()
- sys.path.append(current_path)
- import json
- from libs.embed_helper import EmbedHelper
def embed_test():
    """Smoke-test the embedding helper.

    Embeds a fixed sample string with ``EmbedHelper.embed_text`` and prints
    the length of the returned value followed by the value itself.
    """
    vector = EmbedHelper().embed_text("你好")
    print(f"result length: {len(vector)}")
    print(vector)
def search_test():
    """Smoke-test the Elasticsearch wrapper.

    Runs one fixed query against ``graph_entity_index`` and prints every
    item of the result, one per line.
    """
    # Imported lazily so the module can be loaded without the ES utilities.
    from utils.es import ElasticsearchOperations

    client = ElasticsearchOperations()
    # NOTE(review): the trailing 10 is presumably the maximum number of
    # hits to return -- confirm against ElasticsearchOperations.search.
    hits = client.search("graph_entity_index", "上呼吸道感染", 10)
    for hit in hits:
        print(hit)
def load_entities():
    """Load the cached entity data from ``web/cached_data/entities_med.json``.

    The file is looked up below the module-level ``current_path``.

    Returns:
        The decoded JSON content of the file.

    Raises:
        FileNotFoundError: If the cache file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print("load entity data")
    # Build the path with os.path.join rather than hard-coded "\\" separators
    # so the lookup also works on non-Windows hosts.
    entities_file = os.path.join(
        current_path, "web", "cached_data", "entities_med.json"
    )
    with open(entities_file, "r", encoding="utf-8") as f:
        return json.load(f)
-
def load_relationships():
    """Load the cached relationship data from ``web/cached_data/relationship_med.json``.

    The file is looked up below the module-level ``current_path``.

    Returns:
        The decoded JSON content of the file.

    Raises:
        FileNotFoundError: If the cache file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    print("load relationship data")
    # Build the path with os.path.join rather than hard-coded "\\" separators
    # so the lookup also works on non-Windows hosts.
    relationships_file = os.path.join(
        current_path, "web", "cached_data", "relationship_med.json"
    )
    with open(relationships_file, "r", encoding="utf-8") as f:
        return json.load(f)
def write_data_file(file_name, data):
    """Write *data* to *file_name* as indented, UTF-8 encoded JSON.

    Args:
        file_name: Path of the file to create or overwrite.
        data: JSON-serializable payload. Empty payloads (e.g. ``{}``, ``[]``,
            ``None``) are skipped and no file is written.

    Raises:
        TypeError: If *data* contains values that JSON cannot serialize.
        OSError: If the file or its parent directory cannot be created.
    """
    if not data:
        return
    print("write data file", file_name)
    # Create the target directory on demand so a first run does not fail
    # with FileNotFoundError; a bare file name has no parent to create.
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output.
        json.dump(data, f, ensure_ascii=False, indent=4)
def build_index():
    """Embed the attributes of every Disease entity and cache one JSON file each.

    For each entity returned by ``load_entities()`` whose ``type`` attribute
    is ``"Disease"``, every attribute other than ``type`` and ``description``
    whose value is longer than three characters is embedded through
    ``EmbedHelper.embed_text``. The resulting document is written to
    ``web/cached_data/diseases/<n>.json`` below ``current_path``, where
    ``<n>`` is the zero-based count of diseases written so far.

    NOTE(review): assumes each entity is a ``(node_id, attrs)`` pair and that
    attribute values are strings -- confirm against the cached entity file.
    """
    print("build index")
    embed_helper = EmbedHelper()
    entities = load_entities()

    # os.path.join instead of hard-coded "\\" separators keeps the output
    # location valid on non-Windows hosts as well.
    output_dir = os.path.join(current_path, "web", "cached_data", "diseases")
    os.makedirs(output_dir, exist_ok=True)

    index = 0  # number of disease documents written so far
    for item in entities:
        node_id = item[0]
        attrs = item[1]
        print("process node: ", index, node_id)
        if attrs["type"] != "Disease":
            continue

        texts = []
        attr_embed_list = []
        for attr in attrs:
            value = attrs[attr]
            if len(value) > 3 and attr not in ["type", "description"]:
                texts.append(value)
                # The node id and attribute name are part of the embedded
                # text, not just the attribute value.
                attr_embed = embed_helper.embed_text(node_id + "-" + attr + "-" + value)
                attr_embed_list.append(
                    {
                        "title": node_id + "-" + attr,
                        "text": value,
                        "embedding": attr_embed,
                    }
                )
            else:
                print("skip", attr)

        # "embedding" holds the list of per-attribute records built above,
        # not a single vector.
        doc = {
            "title": node_id,
            "text": "\n".join(texts),
            "embedding": attr_embed_list,
        }
        write_data_file(os.path.join(output_dir, f"{index}.json"), doc)
        index += 1
# Usage example: pass exactly one argument, either "test" or "build".
if __name__ == "__main__":
    if len(sys.argv) == 2:
        command = sys.argv[1]
        if command == "test":
            # Run both smoke tests back to back.
            embed_test()
            search_test()
        elif command == "build":
            build_index()
|