
Merge branch 'master' of http://173.18.12.196:3000/python/knowledge

# Conflicts:
#	router/graph_router.py
SGTY committed 1 month ago · commit 21f98bcf18
3 changed files with 120 additions and 37 deletions:

  1. requirements.txt (+3 −0)
  2. router/graph_router.py (+1 −1)
  3. router/text_search.py (+116 −36)

requirements.txt (+3 −0)

@@ -12,4 +12,7 @@ starlette==0.46.1
 tabulate==0.9.0
 urllib3==2.3.0
 uvicorn==0.34.0
+dotenv==0.9.9
+cachetools==5.5.2
+pandas==2.2.3
 

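Of the three new dependencies, cachetools is the one this commit actually exercises; dotenv is presumably for .env loading (the more common distribution is python-dotenv), and pandas does not appear in the hunks below. For reviewers unfamiliar with cachetools, a minimal sketch of the TTLCache API that text_search.py relies on:

from cachetools import TTLCache

# Holds at most 1000 entries; each expires 3600 s after insertion.
cache = TTLCache(maxsize=1000, ttl=3600)

cache["xunzheng_42"] = {"props": []}   # store a computed result
hit = cache.get("xunzheng_42")         # None after expiry or eviction
cache.clear()                          # what the new /clear_cache endpoint calls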
router/graph_router.py (+1 −1)

@@ -65,7 +65,7 @@ async def neighbor_search(
         )
         # Process the CDSS logic with the capability instance imported from main.py
         output = capability.process(input=record)
-        
+
         output.diagnosis.value = [{"name":key,"old_score":value["old_score"],"count":value["count"],"score":value["score"],
             "hasInfo": 1,
             "type": 1} for key,value in output.diagnosis.value.items()]

router/text_search.py (+116 −36)

@@ -8,7 +8,7 @@ from model.response import StandardResponse
 from utils.vectorizer import Vectorizer
 # from utils.find_text_in_pdf import find_text_in_pdf
 import os
-DISTANCE_THRESHOLD = 0.8
+DISTANCE_THRESHOLD = 0.73
 import logging
 import time
 from db.session import get_db
@@ -16,9 +16,14 @@ from sqlalchemy.orm import Session
 from service.kg_node_service import KGNodeService
 from service.kg_prop_service import KGPropService
 
+from cachetools import TTLCache
+
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/text", tags=["Text Search"])
 
+# Create a global cache instance
+cache = TTLCache(maxsize=1000, ttl=3600)
+
 class TextSearchRequest(BaseModel):
     text: str
     conversation_id: Optional[str] = None
@@ -61,6 +66,16 @@ class NodePropsSearchRequest(BaseModel):
     node_id: int
     props_ids: List[int]
 
+@router.post("/clear_cache", response_model=StandardResponse)
+async def clear_cache():
+    try:
+        # Clear the global cache
+        cache.clear()
+        return StandardResponse(success=True, data={"message": "Cache cleared"})
+    except Exception as e:
+        logger.error(f"Failed to clear cache: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
 @router.post("/search", response_model=StandardResponse)
 async def search_text(request: TextSearchRequest):
     try:
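
The new /text/clear_cache endpoint can be exercised directly once the service is up; the host and port below are assumptions (uvicorn defaults):

import requests

# Assumes the service is running locally on uvicorn's default port.
resp = requests.post("http://localhost:8000/text/clear_cache")
print(resp.json())  # roughly: {"success": true, "data": {"message": "Cache cleared"}}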
@@ -318,6 +333,13 @@ async def compare_text(request: TextCompareMultiRequest):
 async def node_props_search(request: NodePropsSearchRequest, db: Session = Depends(get_db)):
     try:
         start_time = time.time()
+        
+        # Check the cache first
+        cache_key = f"xunzheng_{request.node_id}"
+        cached_result = cache.get(cache_key)
+        if cached_result:
+            logger.info(f"Returning cached result for node_id: {request.node_id}")
+            return StandardResponse(success=True, data=cached_result)
         # Initialize services
         trunks_service = TrunksService()
         node_service = KGNodeService(db)
@@ -351,8 +373,14 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
             prop_title = prop.get('prop_title', '')
             prop_value = prop.get('prop_value', '')
 
-            # Split the property value into sentences
-            sentences = TextSplitter.split_text(prop_value)
+            # First, search with the complete prop_value
+            search_text = f"{node_name}:{prop_title}:{prop_value}"
+            full_search_results = trunks_service.search_by_vector(
+                text=search_text,
+                limit=1,
+                type='trunk'
+            )
+
             prop_result = {
                 "id": prop_id,
                 "category": prop.get('category', 0),
@@ -370,37 +398,57 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
             all_references = []
             reference_index = 1
 
-            # Run a vector search for each sentence
-            i = 0
-            while i < len(sentences):
-                original_sentence = sentences[i]
-                sentence = original_sentence
-                
-                # If the sentence is shorter than 10 chars and not the last one, merge it with the next
-                if len(sentence) < 10 and i + 1 < len(sentences):
-                    next_sentence = sentences[i + 1]
-                    combined_sentence = sentence + " " + next_sentence
-                    # Add the original short sentence to the results with an empty flag
-                    result_sentences.append({
-                        "sentence": sentence,
-                        "flag": ""
-                    })
-                    # Search with the merged sentence
-                    search_text = f"{node_name}:{prop_title}:{combined_sentence}"
-                    i += 1  # Skip the next sentence; it has already been merged in
-                elif len(sentence) < 10:
-                    # Last sentence and shorter than 10 chars: add it to the results with an empty flag
-                    result_sentences.append({
-                        "sentence": sentence,
-                        "flag": ""
-                    })
+            # If the whole-value search found a match below the distance threshold, use it directly
+            if full_search_results and full_search_results[0].get("distance", DISTANCE_THRESHOLD) < DISTANCE_THRESHOLD:
+                search_result = full_search_results[0]
+                reference = {
+                    "index": str(reference_index),
+                    "id": search_result["id"],
+                    "content": search_result["content"],
+                    "file_path": search_result.get("file_path", ""),
+                    "title": search_result.get("title", ""),
+                    "distance": search_result.get("distance", DISTANCE_THRESHOLD),
+                    "page_no": search_result.get("page_no", ""),
+                    "referrence": search_result.get("referrence", "")
+                }
+                all_references.append(reference)
+                result_sentences.append({
+                    "sentence": prop_value,
+                    "flag": str(reference_index)
+                })
+            else:
+                # Otherwise fall back to splitting into sentences and searching each one
+                sentences = TextSplitter.split_text(prop_value)
+                i = 0
+                while i < len(sentences):
+                    original_sentence = sentences[i]
+                    sentence = original_sentence
+                    
+                    # If the sentence is shorter than 10 chars and not the last one, merge it with the next
+                    if len(sentence) < 10 and i + 1 < len(sentences):
+                        next_sentence = sentences[i + 1]
+                        combined_sentence = sentence + " " + next_sentence
+                        # Add the original short sentence to the results with an empty flag
+                        result_sentences.append({
+                            "sentence": sentence,
+                            "flag": ""
+                        })
+                        # Search with the merged sentence
+                        search_text = f"{node_name}:{prop_title}:{combined_sentence}"
+                        i += 1  # Skip the next sentence; it has already been merged in
+                    elif len(sentence) < 10:
+                        # Last sentence and shorter than 10 chars: add it to the results with an empty flag
+                        result_sentences.append({
+                            "sentence": sentence,
+                            "flag": ""
+                        })
+                        i += 1
+                        continue
+                    else:
+                        # Sentence is long enough; use it as-is
+                        search_text = f"{node_name}:{prop_title}:{sentence}"
+                    
                     i += 1
-                    continue
-                else:
-                    # 句子长度足够,直接使用
-                    search_text = f"{node_name}:{prop_title}:{sentence}"
-                
-                i += 1
 
                 # Run the vector search
                 search_results = trunks_service.search_by_vector(
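
The hunk above changes the matching strategy: instead of always splitting prop_value into sentences, the code first searches with the whole value and only falls back to sentence-level search (merging sentences shorter than 10 characters into their successor) when the best whole-value hit is at or above DISTANCE_THRESHOLD. A condensed sketch of that control flow, with a hypothetical search(text) helper standing in for trunks_service.search_by_vector:

def match_prop(node_name, prop_title, prop_value, search, split, threshold=0.73):
    # Hypothetical helper: search(text) returns (best_hit, distance) or (None, None).
    hit, dist = search(f"{node_name}:{prop_title}:{prop_value}")
    if hit is not None and dist < threshold:
        return [(prop_value, hit)]            # one reference covers the whole value

    matches = []
    sentences = split(prop_value)
    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        if len(sentence) < 10 and i + 1 < len(sentences):
            matches.append((sentence, None))  # keep the short sentence, unflagged
            sentence = sentence + " " + sentences[i + 1]
            i += 1                            # the next sentence is consumed by the merge
        elif len(sentence) < 10:
            matches.append((sentence, None))  # short trailing sentence: no search
            i += 1
            continue
        hit, dist = search(f"{node_name}:{prop_title}:{sentence}")
        matches.append((sentence, hit if dist is not None and dist < threshold else None))
        i += 1
    return matches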
@@ -465,6 +513,10 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
 
         # 处理所有引用中的文件信息
         all_files = set()
+        file_index_map = {}
+        file_index = 1
+        
+        # First pass: collect info for all referenced files
         for prop_result in result["props"]:
             if "references" in prop_result:
                 for ref in prop_result["references"]:
@@ -486,16 +538,44 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
                             else:
                                 file_type = "other"
                             
+                            if file_name not in file_index_map:
+                                file_index_map[file_name] = file_index
+                                file_index += 1
                             all_files.add((file_name, file_type))
         
+        # Second pass: update each reference's index
+        for prop_result in result["props"]:
+            if "references" in prop_result:
+                for ref in prop_result["references"]:
+                    referrence = ref.get("referrence", "")
+                    if referrence and "/books/" in referrence:
+                        file_name = referrence.split("/books/")[-1]
+                        if file_name in file_index_map:
+                            # Rewrite the reference index as "file index-original index"
+                            ref["index"] = f"{file_index_map[file_name]}-{ref['index']}"
+            
+            # Update the flags in the answer accordingly
+            if "answer" in prop_result:
+                for sentence in prop_result["answer"]:
+                    if sentence["flag"]:
+                        for ref in prop_result["references"]:
+                            if ref["index"].endswith(f"-{sentence['flag']}"):
+                                sentence["flag"] = ref["index"]
+                                break
+        
         # Add the file info to the result
-        result["files"] = [{
+        result["files"] = sorted([{
             "file_name": file_name,
-            "file_type": file_type
-        } for file_name, file_type in all_files]
+            "file_type": file_type,
+            "index": str(file_index_map[file_name])
+        } for file_name, file_type in all_files], key=lambda x: int(x["index"]))
         
         end_time = time.time()
         logger.info(f"node_props_search接口耗时: {(end_time - start_time) * 1000:.2f}ms")
+        
+        # Store the result in the cache
+        cache[cache_key] = result
+        
         return StandardResponse(success=True, data=result)
     except Exception as e:
         logger.error(f"Node props search failed: {str(e)}")