@@ -8,7 +8,7 @@ from model.response import StandardResponse
 from utils.vectorizer import Vectorizer
 # from utils.find_text_in_pdf import find_text_in_pdf
 import os
-DISTANCE_THRESHOLD = 0.8
+DISTANCE_THRESHOLD = 0.73
 import logging
 import time
 from db.session import get_db
@@ -16,9 +16,14 @@ from sqlalchemy.orm import Session
 from service.kg_node_service import KGNodeService
 from service.kg_prop_service import KGPropService
 
+from cachetools import TTLCache
+
 logger = logging.getLogger(__name__)
 router = APIRouter(prefix="/text", tags=["Text Search"])
 
+# Create a global cache instance
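+# (entries expire 3600s after insertion; past 1000 entries the least
+# recently used one is evicted first)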
+cache = TTLCache(maxsize=1000, ttl=3600)
+
 class TextSearchRequest(BaseModel):
     text: str
     conversation_id: Optional[str] = None
@@ -61,6 +66,16 @@ class NodePropsSearchRequest(BaseModel):
     node_id: int
     props_ids: List[int]
 
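+# Clears all cached node_props_search results, e.g. after reindexing.
+# Example call (hypothetical host/port): curl -X POST http://localhost:8000/text/clear_cache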
+@router.post("/clear_cache", response_model=StandardResponse)
|
|
|
+async def clear_cache():
|
|
|
+ try:
|
|
|
+ # 清除全局缓存
|
|
|
+ cache.clear()
|
|
|
+ return StandardResponse(success=True, data={"message": "缓存已清除"})
|
|
|
+ except Exception as e:
|
|
|
+ logger.error(f"清除缓存失败: {str(e)}")
|
|
|
+ raise HTTPException(status_code=500, detail=str(e))
|
|
|
+
|
|
|
@router.post("/search", response_model=StandardResponse)
|
|
|
async def search_text(request: TextSearchRequest):
|
|
|
try:
@@ -318,6 +333,13 @@ async def compare_text(request: TextCompareMultiRequest):
 async def node_props_search(request: NodePropsSearchRequest, db: Session = Depends(get_db)):
     try:
         start_time = time.time()
+
+        # Check the cache first
+        cache_key = f"xunzheng_{request.node_id}"
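+        # NOTE: the key covers node_id only, so requests that differ just in
+        # props_ids share one cached result until it expires or is cleared.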
+        cached_result = cache.get(cache_key)
+        if cached_result is not None:
+            logger.info(f"Returning cached result for node_id: {request.node_id}")
+            return StandardResponse(success=True, data=cached_result)
         # Initialize services
         trunks_service = TrunksService()
         node_service = KGNodeService(db)
@@ -351,8 +373,14 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
             prop_title = prop.get('prop_title', '')
             prop_value = prop.get('prop_value', '')
 
-            # Split the property value into sentences
-            sentences = TextSplitter.split_text(prop_value)
+            # Search with the full prop_value first
+            search_text = f"{node_name}:{prop_title}:{prop_value}"
+            full_search_results = trunks_service.search_by_vector(
+                text=search_text,
+                limit=1,
+                type='trunk'
+            )
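+            # Only the single closest trunk is requested; the per-sentence
+            # fallback below handles the case where it is not close enough.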
+
             prop_result = {
                 "id": prop_id,
                 "category": prop.get('category', 0),
@@ -370,37 +398,57 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
             all_references = []
             reference_index = 1
 
-            # Run a vector search for each sentence
-            i = 0
-            while i < len(sentences):
-                original_sentence = sentences[i]
-                sentence = original_sentence
-
-                # If the current sentence is shorter than 10 characters and not the last one, merge it with the next
-                if len(sentence) < 10 and i + 1 < len(sentences):
-                    next_sentence = sentences[i + 1]
-                    combined_sentence = sentence + " " + next_sentence
-                    # Add the original short sentence to the results with an empty flag
-                    result_sentences.append({
-                        "sentence": sentence,
-                        "flag": ""
-                    })
-                    # Search with the merged sentence
-                    search_text = f"{node_name}:{prop_title}:{combined_sentence}"
-                    i += 1  # Skip the next sentence since it has been merged
-                elif len(sentence) < 10:
-                    # Last sentence and shorter than 10 characters: add it with an empty flag
-                    result_sentences.append({
-                        "sentence": sentence,
-                        "flag": ""
-                    })
+            # If the whole-value search found a match within the distance threshold, use it directly
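+            # (a result that carries no "distance" defaults to the threshold
+            # and falls through to the per-sentence fallback)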
+            if full_search_results and full_search_results[0].get("distance", DISTANCE_THRESHOLD) < DISTANCE_THRESHOLD:
+                search_result = full_search_results[0]
+                reference = {
+                    "index": str(reference_index),
+                    "id": search_result["id"],
+                    "content": search_result["content"],
+                    "file_path": search_result.get("file_path", ""),
+                    "title": search_result.get("title", ""),
+                    "distance": search_result.get("distance", DISTANCE_THRESHOLD),
+                    "page_no": search_result.get("page_no", ""),
+                    "referrence": search_result.get("referrence", "")
+                }
+                all_references.append(reference)
+                result_sentences.append({
+                    "sentence": prop_value,
+                    "flag": str(reference_index)
+                })
+            else:
+                # No close whole-value match: split into sentences and search sentence by sentence
+                sentences = TextSplitter.split_text(prop_value)
+                i = 0
+                while i < len(sentences):
+                    original_sentence = sentences[i]
+                    sentence = original_sentence
+
+                    # If the current sentence is shorter than 10 characters and not the last one, merge it with the next
+                    if len(sentence) < 10 and i + 1 < len(sentences):
+                        next_sentence = sentences[i + 1]
+                        combined_sentence = sentence + " " + next_sentence
+                        # Add the original short sentence to the results with an empty flag
+                        result_sentences.append({
+                            "sentence": sentence,
+                            "flag": ""
+                        })
+                        # Search with the merged sentence
+                        search_text = f"{node_name}:{prop_title}:{combined_sentence}"
+                        i += 1  # Skip the next sentence since it has been merged
+                    elif len(sentence) < 10:
+                        # Last sentence and shorter than 10 characters: add it with an empty flag
+                        result_sentences.append({
+                            "sentence": sentence,
+                            "flag": ""
+                        })
+                        i += 1
+                        continue
+                    else:
+                        # Sentence is long enough; use it as-is
+                        search_text = f"{node_name}:{prop_title}:{sentence}"
+
                     i += 1
-                    continue
-                else:
-                    # Sentence is long enough; use it as-is
-                    search_text = f"{node_name}:{prop_title}:{sentence}"
-
-                i += 1
 
             # Run the vector search
             search_results = trunks_service.search_by_vector(
@@ -465,6 +513,10 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
 
         # Process the file info in all references
         all_files = set()
+        file_index_map = {}
+        file_index = 1
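+        # file_index_map assigns each file a stable 1-based number in
+        # first-seen order; it backs the "files" list and the "n-m" indexes.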
+
+        # First pass: collect all file info
         for prop_result in result["props"]:
             if "references" in prop_result:
                 for ref in prop_result["references"]:
@@ -486,16 +538,44 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
                 else:
                     file_type = "other"
 
+                    if file_name not in file_index_map:
+                        file_index_map[file_name] = file_index
+                        file_index += 1
                     all_files.add((file_name, file_type))
 
+        # Second pass: update the reference indexes
+        for prop_result in result["props"]:
+            if "references" in prop_result:
+                for ref in prop_result["references"]:
+                    referrence = ref.get("referrence", "")
+                    if referrence and "/books/" in referrence:
+                        file_name = referrence.split("/books/")[-1]
+                        if file_name in file_index_map:
+                            # Rewrite the reference index as "file index-original index"
+                            ref["index"] = f"{file_index_map[file_name]}-{ref['index']}"
+
+            # Update the flags in the answer
+            if "answer" in prop_result:
+                for sentence in prop_result["answer"]:
+                    if sentence["flag"]:
+                        for ref in prop_result["references"]:
+                            if ref["index"].endswith(f"-{sentence['flag']}"):
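+                                # Suffix match is safe here: the original
+                                # indexes are unique within a prop, so
+                                # "-<flag>" can only hit one reference.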
+                                sentence["flag"] = ref["index"]
+                                break
+
         # Add the file info to the result
-        result["files"] = [{
+        result["files"] = sorted([{
             "file_name": file_name,
-            "file_type": file_type
-        } for file_name, file_type in all_files]
+            "file_type": file_type,
+            "index": str(file_index_map[file_name])
+        } for file_name, file_type in all_files], key=lambda x: int(x["index"]))
 
         end_time = time.time()
         logger.info(f"node_props_search endpoint took {(end_time - start_time) * 1000:.2f}ms")
+
+        # Store the result in the cache
+        cache[cache_key] = result
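+        # Later calls for this node_id are served from the cache for up to
+        # an hour (the TTL) or until /text/clear_cache is called.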
+
         return StandardResponse(success=True, data=result)
     except Exception as e:
         logger.error(f"Node props search failed: {str(e)}")
|