# python / self-constructing_graph
							import sys,os
current_path = os.getcwd()
sys.path.append(current_path)

import json
from libs.embed_helper import EmbedHelper

def embed_test():
    """Smoke-test the embedding helper on a short, fixed sample string.

    Prints the length of the value returned by ``EmbedHelper.embed_text``
    followed by the value itself.
    """
    sample_text = "你好"
    embedding = EmbedHelper().embed_text(sample_text)
    print(f"result length: {len(embedding)}")
    print(embedding)
def search_test():
    """Run a fixed sample query against the entity index and print every hit.

    The third argument passed to ``search`` is 10 (presumably the maximum
    number of hits -- confirm against ``ElasticsearchOperations.search``).
    """
    # Function-scope import: the Elasticsearch helper is only loaded when
    # this test is actually run.
    from utils.es import ElasticsearchOperations

    client = ElasticsearchOperations()
    hits = client.search("graph_entity_index", "上呼吸道感染", 10)
    for hit in hits:
        print(hit)
def load_entities():
    """Load the cached entity data from ``web/cached_data/entities_med.json``.

    The file is looked up relative to the module-level ``current_path``.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    print("load entity data")
    # Join the path components with os.path.join so the file resolves on any
    # OS; hard-coded backslashes only form a valid path on Windows.
    file_path = os.path.join(current_path, "web", "cached_data", "entities_med.json")
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)
    
def load_relationships():
    """Load the cached relationship data from ``web/cached_data/relationship_med.json``.

    The file is looked up relative to the module-level ``current_path``.

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    print("load relationship data")
    # Join the path components with os.path.join so the file resolves on any
    # OS; hard-coded backslashes only form a valid path on Windows.
    file_path = os.path.join(current_path, "web", "cached_data", "relationship_med.json")
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

def write_data_file(file_name, data):
    """Write ``data`` to ``file_name`` as indented, UTF-8 encoded JSON.

    Nothing is written (and nothing is printed) when ``data`` is empty or
    ``None``. Missing parent directories of ``file_name`` are created first so
    that writing into a not-yet-existing output folder does not fail.

    Args:
        file_name: Destination path of the JSON file.
        data: JSON-serializable object to write.
    """
    if not data:
        return
    print("write data file", file_name)
    parent_dir = os.path.dirname(file_name)
    if parent_dir:
        # A bare file name has no parent component; makedirs("") would raise.
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_name, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        json.dump(data, f, ensure_ascii=False, indent=4)

def build_index():
    """Build one embedding document per "Disease" entity and write it to disk.

    Each entity returned by ``load_entities()`` is read as ``item[0]`` (the
    node id) and ``item[1]`` (its attribute mapping). Entities whose ``type``
    attribute is not ``"Disease"`` are ignored. For a disease, every attribute
    other than ``type`` and ``description`` whose value is longer than 3 is
    embedded as ``"<node_id>-<attr>-<value>"``; all other attributes are
    reported as skipped.

    The resulting document has the keys ``title`` (node id), ``text`` (the
    embedded attribute values joined by newlines) and ``embedding`` (a list of
    per-attribute ``title``/``text``/``embedding`` records). It is written to
    ``web/cached_data/diseases/<n>.json`` under ``current_path``, where ``n``
    counts the diseases processed so far, starting at 0.
    """
    print("build index")
    embed_helper = EmbedHelper()
    entities = load_entities()

    # Build the output location with os.path.join (hard-coded backslashes only
    # work on Windows) and make sure it exists before the first write.
    output_dir = os.path.join(current_path, "web", "cached_data", "diseases")
    os.makedirs(output_dir, exist_ok=True)

    excluded_attrs = ("type", "description")
    disease_count = 0  # Diseases written so far; also the next output file number.
    for item in entities:
        node_id = item[0]
        attrs = item[1]
        print("process node: ", disease_count, node_id)
        if attrs["type"] != "Disease":
            continue

        texts = []
        attr_embed_list = []
        for attr, value in attrs.items():
            # NOTE(review): assumes attribute values are strings (len() and
            # string concatenation are applied to them) -- confirm the schema.
            if len(value) > 3 and attr not in excluded_attrs:
                texts.append(value)
                attr_embed = embed_helper.embed_text(node_id + "-" + attr + "-" + value)
                attr_embed_list.append(
                    {
                        "title": node_id + "-" + attr,
                        "text": value,
                        "embedding": attr_embed,
                    }
                )
            else:
                print("skip", attr)

        doc = {
            "title": node_id,
            "text": "\n".join(texts),
            "embedding": attr_embed_list,
        }
        # Every disease document goes into its own numbered file.
        write_data_file(os.path.join(output_dir, f"{disease_count}.json"), doc)
        disease_count += 1
# Usage example: run the script with a single action argument, "test" or "build".
if __name__ == "__main__":
    # Exactly one command-line argument (the action name) is expected.
    action = sys.argv[1] if len(sys.argv) == 2 else None
    if action == "test":
        embed_test()
        search_test()
    elif action == "build":
        build_index()
    else:
        # A missing or unknown action used to be ignored silently; tell the
        # user what the script accepts instead.
        print(f"usage: {sys.argv[0]} test|build", file=sys.stderr)