text_search.py

from fastapi import APIRouter, HTTPException, Depends
from pydantic import BaseModel, Field, validator
from typing import List, Optional
from service.trunks_service import TrunksService
from utils.sentence_util import SentenceUtil
from utils.vector_distance import VectorDistance
from model.response import StandardResponse
from utils.vectorizer import Vectorizer
# from utils.find_text_in_pdf import find_text_in_pdf
import os
import logging
import time
from db.session import get_db
from sqlalchemy.orm import Session
from service.kg_node_service import KGNodeService
from service.kg_prop_service import KGPropService
from cachetools import TTLCache
# Use TextSimilarityFinder for text similarity matching
from utils.text_similarity import TextSimilarityFinder

# Vector-distance threshold below which two texts are considered a match
DISTANCE_THRESHOLD = 0.73

logger = logging.getLogger(__name__)
router = APIRouter(tags=["Text Search"])

# Create a global cache instance
cache = TTLCache(maxsize=1000, ttl=3600)


class TextSearchRequest(BaseModel):
    text: str
    conversation_id: Optional[str] = None
    need_convert: Optional[bool] = False


class TextCompareRequest(BaseModel):
    sentence: str
    text: str


class TextMatchRequest(BaseModel):
    text: str = Field(..., min_length=1, max_length=10000, description="Text content to search")

    @validator('text')
    def validate_text(cls, v):
        # Keep all printable characters, plus newlines and Chinese characters
        v = ''.join(char for char in v if char.isprintable() or char in '\n\r')
        # Escape JSON special characters
        # Handle backslashes first to avoid problems with the escapes added below
        v = v.replace('\\', '\\\\')
        # Handle quotes and other special characters
        v = v.replace('"', '\\"')
        v = v.replace('/', '\\/')
        # Handle control characters
        v = v.replace('\n', '\\n')
        v = v.replace('\r', '\\r')
        v = v.replace('\t', '\\t')
        v = v.replace('\b', '\\b')
        v = v.replace('\f', '\\f')
        # Handle Unicode escapes
        # v = v.replace('\u', '\\u')
        return v
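
# Illustrative example (invented values, not from the service): the validator above escapes
# JSON-special characters, so TextMatchRequest(text='a "quoted" line\nnext') ends up
# carrying text == 'a \\"quoted\\" line\\nnext' after validation.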

class TextCompareMultiRequest(BaseModel):
    origin: str
    similar: str


class NodePropsSearchRequest(BaseModel):
    node_id: int
    props_ids: List[int]


@router.post("/kgrt_api/text/clear_cache", response_model=StandardResponse)
async def clear_cache():
    try:
        # Clear the global cache
        cache.clear()
        return StandardResponse(success=True, data={"message": "缓存已清除"})
    except Exception as e:
        logger.error(f"Failed to clear cache: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/kgrt_api/text/search", response_model=StandardResponse)
@router.post("/knowledge/text/search", response_model=StandardResponse)
async def search_text(request: TextSearchRequest):
    try:
        # If request.text looks like JSON, convert it to plain text with JsonToTextConverter
        if request.text.startswith('{') and request.text.endswith('}'):
            from utils.json_to_text import JsonToTextConverter
            converter = JsonToTextConverter()
            request.text = converter.convert(request.text)

        # Split the text into sentences
        sentences = SentenceUtil.split_text(request.text)
        if not sentences:
            return StandardResponse(success=True, data={"answer": "", "references": []})

        # Initialize the service and result lists
        trunks_service = TrunksService()
        result_sentences = []
        all_references = []
        reference_index = 1

        # Fetch cached results for the conversation_id, if any
        cached_results = trunks_service.get_cached_result(request.conversation_id) if request.conversation_id else []

        for sentence in sentences:
            # if request.need_convert:
            sentence = sentence.replace("\n", "<br>")
            if len(sentence) < 10:
                result_sentences.append(sentence)
                continue

            if cached_results:
                # With cached results, compute vector distances against the cache
                min_distance = float('inf')
                best_result = None
                sentence_vector = Vectorizer.get_embedding(sentence)
                for cached_result in cached_results:
                    content_vector = cached_result['embedding']
                    distance = VectorDistance.calculate_distance(sentence_vector, content_vector)
                    if distance < min_distance:
                        min_distance = distance
                        best_result = {**cached_result, 'distance': distance}
                if best_result and best_result['distance'] < DISTANCE_THRESHOLD:
                    search_results = [best_result]
                else:
                    search_results = []
            else:
                # Without cached results, run a vector search
                search_results = trunks_service.search_by_vector(
                    text=sentence,
                    limit=1,
                    type='trunk'
                )

            # Process the search results
            for search_result in search_results:
                distance = search_result.get("distance", DISTANCE_THRESHOLD)
                if distance >= DISTANCE_THRESHOLD:
                    result_sentences.append(sentence)
                    continue

                # Check whether the same reference already exists
                existing_ref = next((ref for ref in all_references if ref["id"] == search_result["id"]), None)
                current_index = reference_index
                if existing_ref:
                    current_index = int(existing_ref["index"])
                else:
                    # Add a new entry to the reference list
                    # Extract the file name from the referrence field
                    file_name = ""
                    referrence = search_result.get("referrence", "")
                    if referrence and "/books/" in referrence:
                        file_name = referrence.split("/books/")[-1]
                        # Strip the file extension
                        file_name = os.path.splitext(file_name)[0]
                    reference = {
                        "index": str(reference_index),
                        "id": search_result["id"],
                        "content": search_result["content"],
                        "file_path": search_result.get("file_path", ""),
                        "title": search_result.get("title", ""),
                        "distance": distance,
                        "file_name": file_name,
                        "referrence": referrence
                    }
                    all_references.append(reference)
                    reference_index += 1

                # Add the citation marker
                if sentence.endswith('<br>'):
                    # Insert ^[current_index]^ before each <br>
                    result_sentence = sentence.replace('<br>', f'^[{current_index}]^<br>')
                else:
                    # Append ^[current_index]^ to the end of the sentence
                    result_sentence = f'{sentence}^[{current_index}]^'
                result_sentences.append(result_sentence)

        # Assemble the response payload
        response_data = {
            "answer": result_sentences,
            "references": all_references
        }
        return StandardResponse(success=True, data=response_data)
    except Exception as e:
        logger.error(f"Text search failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
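
# Illustrative request/response shape for the search endpoints above (field names come from
# TextSearchRequest and the handler; the concrete values are invented):
#
#   POST /knowledge/text/search
#   {"text": "...", "conversation_id": null, "need_convert": false}
#
#   -> data.answer: the sentences, annotated with citation markers such as "...^[1]^"
#      whenever a trunk within DISTANCE_THRESHOLD was found
#   -> data.references: one entry per distinct trunk id, with index, id, content,
#      file_path, title, distance, file_name and referrence fields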

@router.post("/kgrt_api/text/match", response_model=StandardResponse)
@router.post("/knowledge/text/match", response_model=StandardResponse)
async def match_text(request: TextCompareRequest):
    try:
        sentences = SentenceUtil.split_text(request.text)
        sentence_vector = Vectorizer.get_embedding(request.sentence)
        min_distance = float('inf')
        best_sentence = ""
        result_sentences = []
        for temp in sentences:
            result_sentences.append(temp)
            if len(temp) < 10:
                continue
            temp_vector = Vectorizer.get_embedding(temp)
            distance = VectorDistance.calculate_distance(sentence_vector, temp_vector)
            if distance < min_distance and distance < DISTANCE_THRESHOLD:
                min_distance = distance
                best_sentence = temp
        for i in range(len(result_sentences)):
            result_sentences[i] = {"sentence": result_sentences[i], "matched": False}
            if result_sentences[i]["sentence"] == best_sentence:
                result_sentences[i]["matched"] = True
        return StandardResponse(success=True, records=result_sentences)
    except Exception as e:
        logger.error(f"Text comparison failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/kgrt_api/text/mr_search", response_model=StandardResponse)
@router.post("/knowledge/text/mr_search", response_model=StandardResponse)
async def mr_search_text_content(request: TextMatchRequest):
    try:
        # Initialize the service
        trunks_service = TrunksService()

        # Vectorize the text and search for similar content
        search_results = trunks_service.search_by_vector(
            text=request.text,
            limit=10,
            type="mr"
        )

        # Process the search results
        records = []
        for result in search_results:
            distance = result.get("distance", DISTANCE_THRESHOLD)
            if distance >= DISTANCE_THRESHOLD:
                continue

            # Add the hit to the record list
            record = {
                "content": result["content"],
                "file_path": result.get("file_path", ""),
                "title": result.get("title", ""),
                "distance": distance,
            }
            records.append(record)

        # Assemble the response payload
        response_data = {
            "records": records
        }
        return StandardResponse(success=True, data=response_data)
    except Exception as e:
        logger.error(f"Mr search failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@router.post("/kgrt_api/text/mr_match", response_model=StandardResponse)
@router.post("/knowledge/text/mr_match", response_model=StandardResponse)
async def compare_text(request: TextCompareMultiRequest):
    start_time = time.time()
    try:
        # Split both texts into sentences
        origin_sentences = SentenceUtil.split_text(request.origin)
        similar_sentences = SentenceUtil.split_text(request.similar)
        end_time = time.time()
        logger.info(f"mr_match text splitting took: {(end_time - start_time) * 1000:.2f}ms")

        # Initialize the result list
        origin_results = []

        # Flag sentences that are long enough to be vectorized
        valid_origin_sentences = [(sent, len(sent) >= 10) for sent in origin_sentences]
        valid_similar_sentences = [(sent, len(sent) >= 10) for sent in similar_sentences]

        # Initialize similar_results with matched set to False
        similar_results = [{"sentence": sent, "matched": False} for sent, _ in valid_similar_sentences]

        # Fetch the embeddings in batches
        origin_vectors = {}
        similar_vectors = {}
        origin_batch = [sent for sent, is_valid in valid_origin_sentences if is_valid]
        similar_batch = [sent for sent, is_valid in valid_similar_sentences if is_valid]
        if origin_batch:
            origin_embeddings = [Vectorizer.get_embedding(sent) for sent in origin_batch]
            origin_vectors = dict(zip(origin_batch, origin_embeddings))
        if similar_batch:
            similar_embeddings = [Vectorizer.get_embedding(sent) for sent in similar_batch]
            similar_vectors = dict(zip(similar_batch, similar_embeddings))
        end_time = time.time()
        logger.info(f"mr_match vectorization took: {(end_time - start_time) * 1000:.2f}ms")

        # Process the origin text
        for origin_sent, is_valid in valid_origin_sentences:
            if not is_valid:
                origin_results.append({"sentence": origin_sent, "matched": False})
                continue
            origin_vector = origin_vectors[origin_sent]
            matched = False

            # Compare against similar sentences that have not been matched yet
            for i, similar_result in enumerate(similar_results):
                if similar_result["matched"]:
                    continue
                similar_sent = similar_result["sentence"]
                if len(similar_sent) < 10:
                    continue
                similar_vector = similar_vectors.get(similar_sent)
                if not similar_vector:
                    continue
                distance = VectorDistance.calculate_distance(origin_vector, similar_vector)
                if distance < DISTANCE_THRESHOLD:
                    matched = True
                    similar_results[i]["matched"] = True
                    break
            origin_results.append({"sentence": origin_sent, "matched": matched})

        response_data = {
            "origin": origin_results,
            "similar": similar_results
        }
        end_time = time.time()
        logger.info(f"mr_match endpoint took: {(end_time - start_time) * 1000:.2f}ms")
        return StandardResponse(success=True, data=response_data)
    except Exception as e:
        end_time = time.time()
        logger.error(f"Text comparison failed: {str(e)}")
        logger.info(f"mr_match endpoint took: {(end_time - start_time) * 1000:.2f}ms")
        raise HTTPException(status_code=500, detail=str(e))
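
# Illustrative payload for the mr_match comparison above (shape only, values invented):
#   POST /knowledge/text/mr_match
#   {"origin": "...", "similar": "..."}
# Both texts are split into sentences; an origin sentence is reported with matched=True when
# some not-yet-matched similar sentence lies within DISTANCE_THRESHOLD of it.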

def _check_cache(node_id: int) -> Optional[dict]:
    """Check the cache and return the cached result if present."""
    cache_key = f"xunzheng_{node_id}"
    cached_result = cache.get(cache_key)
    if cached_result:
        logger.info(f"Returning cached result for node_id: {node_id}")
        return cached_result
    return None


def _get_node_info(node_service: KGNodeService, node_id: int) -> dict:
    """Fetch and validate the node information."""
    node = node_service.get_node(node_id)
    if not node:
        raise ValueError(f"Node not found: {node_id}")
    return {
        "id": node_id,
        "name": node.get('name', ''),
        "category": node.get('category', ''),
        "props": [],
        "files": [],
        "distance": 0
    }


def _process_search_result(search_result: dict, reference_index: int) -> tuple[dict, str]:
    """Build a reference entry from a search result and return it together with the file name."""
    file_name = ""
    referrence = search_result.get("referrence", "")
    if referrence and "/books/" in referrence:
        file_name = referrence.split("/books/")[-1]
        file_name = os.path.splitext(file_name)[0]
    reference = {
        "index": str(reference_index),
        "id": search_result["id"],
        "content": search_result["content"],
        "file_path": search_result.get("file_path", ""),
        "title": search_result.get("title", ""),
        "distance": search_result.get("distance", DISTANCE_THRESHOLD),
        "page_no": search_result.get("page_no", ""),
        "file_name": file_name,
        "referrence": referrence
    }
    return reference, file_name


def _get_file_type(file_name: str) -> str:
    """Determine the file type from the file name."""
    file_name_lower = file_name.lower()
    if file_name_lower.endswith(".pdf"):
        return "pdf"
    elif file_name_lower.endswith((".doc", ".docx")):
        return "doc"
    elif file_name_lower.endswith((".xls", ".xlsx")):
        return "excel"
    elif file_name_lower.endswith((".ppt", ".pptx")):
        return "ppt"
    return "other"

def _process_sentence_search(node_name: str, prop_title: str, sentences: list, trunks_service: TrunksService) -> tuple[list, list]:
    """Run the sentence-level search and return the result sentences and the reference list."""
    result_sentences = []
    all_references = []
    reference_index = 1
    i = 0

    while i < len(sentences):
        sentence = sentences[i]
        search_text = f"{node_name}:{prop_title}:{sentence}"
        if len(sentence) < 10 and i + 1 < len(sentences):
            # Short sentence with a successor: record it without a flag and search the combined text
            next_sentence = sentences[i + 1]
            result_sentences.append({"sentence": sentence, "flag": ""})
            search_text = f"{node_name}:{prop_title}:{sentence} {next_sentence}"
            i += 1
        elif len(sentence) < 10:
            result_sentences.append({"sentence": sentence, "flag": ""})
            i += 1
            continue
        i += 1

        # Use vector search to retrieve similar content
        search_results = trunks_service.search_by_vector(
            text=search_text,
            limit=500,
            type='trunk'
        )
        # Query 1000 trunk slices directly from the database (disabled)
        # db_trunks = db.query(Trunks).filter(Trunks.type == 'trunk').limit(1000).all()

        # Prepare the corpus data
        trunk_texts = []
        trunk_ids = []
        # Dictionary holding the details of each trunk
        trunk_details = {}
        for trunk in search_results:
            trunk_texts.append(trunk.get('content'))
            trunk_ids.append(trunk.get('id'))
            # Cache the trunk details
            trunk_details[trunk.get('id')] = {
                'id': trunk.get('id'),
                'content': trunk.get('content'),
                'file_path': trunk.get('file_path'),
                'title': trunk.get('title'),
                'referrence': trunk.get('referrence'),
                'page_no': trunk.get('page_no')
            }

        # Initialize TextSimilarityFinder and load the corpus
        similarity_finder = TextSimilarityFinder(method='tfidf', use_jieba=True)
        similarity_finder.load_corpus(trunk_texts, trunk_ids)

        # Use TextSimilarityFinder for similarity matching
        similar_results = similarity_finder.find_most_similar(search_text, top_n=1)
        if not similar_results:  # no similar text found
            result_sentences.append({"sentence": sentence, "flag": ""})
            continue

        # Get the trunk_id of the most similar text
        trunk_id = similar_results[0]['path']
        # Look up the trunk details from the cache
        trunk_info = trunk_details.get(trunk_id)
        if trunk_info:
            search_result = {
                **trunk_info,
                'distance': similar_results[0]['similarity']  # the similarity score is used as the distance value
            }
            # Check the score against the threshold
            if search_result['distance'] >= DISTANCE_THRESHOLD:
                result_sentences.append({"sentence": sentence, "flag": ""})
                continue

            # Check whether the same reference already exists
            existing_ref = next((ref for ref in all_references if ref["id"] == search_result["id"]), None)
            current_index = int(existing_ref["index"]) if existing_ref else reference_index
            if not existing_ref:
                reference, _ = _process_search_result(search_result, reference_index)
                all_references.append(reference)
                reference_index += 1
            result_sentences.append({"sentence": sentence, "flag": str(current_index)})
    return result_sentences, all_references
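
# Each entry in the returned result_sentences is {"sentence": ..., "flag": ...}; an empty
# flag means no reference passed the threshold, otherwise the flag is the string index of
# the matching entry in the returned reference list.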

@router.post("/kgrt_api/text/eb_search", response_model=StandardResponse)
@router.post("/knowledge/text/eb_search", response_model=StandardResponse)
async def node_props_search(request: NodePropsSearchRequest, db: Session = Depends(get_db)):
    try:
        start_time = time.time()

        # Check the cache
        cached_result = _check_cache(request.node_id)
        if cached_result:
            return StandardResponse(success=True, data=cached_result)

        # Initialize services
        trunks_service = TrunksService()
        node_service = KGNodeService(db)
        prop_service = KGPropService(db)

        # Fetch the node information
        result = _get_node_info(node_service, request.node_id)
        node_name = result["name"]

        # Iterate over props_ids and query each property
        for prop_id in request.props_ids:
            prop = prop_service.get_props_by_id(prop_id)
            if not prop:
                logger.warning(f"Property not found: {prop_id}")
                continue

            prop_title = prop.get('prop_title', '')
            prop_value = prop.get('prop_value', '')

            # Build the property result object
            prop_result = {
                "id": prop_id,
                "category": prop.get('category', 0),
                "prop_name": prop.get('prop_name', ''),
                "prop_value": prop_value,
                "prop_title": prop_title,
                "type": prop.get('type', 1)
            }
            result["props"].append(prop_result)

            # If prop_value is '无' ("none"), skip the search
            if prop_value == '无':
                prop_result["answer"] = [{
                    "sentence": prop_value,
                    "flag": ""
                }]
                continue

            # First search with the full prop_value
            search_text = f"{node_name}:{prop_title}:{prop_value}"

            # Use vector search to retrieve similar content
            search_results = trunks_service.search_by_vector(
                text=search_text,
                limit=500,
                type='trunk'
            )

            # Prepare the corpus data
            trunk_texts = []
            trunk_ids = []
            # Dictionary holding the details of each trunk
            trunk_details = {}
            for trunk in search_results:
                trunk_texts.append(trunk.get('content'))
                trunk_ids.append(trunk.get('id'))
                # Cache the trunk details
                trunk_details[trunk.get('id')] = {
                    'id': trunk.get('id'),
                    'content': trunk.get('content'),
                    'file_path': trunk.get('file_path'),
                    'title': trunk.get('title'),
                    'referrence': trunk.get('referrence'),
                    'page_no': trunk.get('page_no')
                }

            # Initialize TextSimilarityFinder and load the corpus
            similarity_finder = TextSimilarityFinder(method='tfidf', use_jieba=True)
            similarity_finder.load_corpus(trunk_texts, trunk_ids)
            similar_results = similarity_finder.find_most_similar(search_text, top_n=1)

            # Process the search results: use the whole-value match when one passes,
            # otherwise fall back to sentence-level search
            sentences = None
            if similar_results:
                # Get the trunk_id of the most similar text
                trunk_id = similar_results[0]['path']
                # Look up the trunk details from the cache
                trunk_info = trunk_details.get(trunk_id)
                if trunk_info:
                    search_result = {
                        **trunk_info,
                        'distance': similar_results[0]['similarity']  # the similarity score is used as the distance value
                    }
                    reference, _ = _process_search_result(search_result, 1)
                    prop_result["references"] = [reference]
                    prop_result["answer"] = [{
                        "sentence": prop_value,
                        "flag": "1"
                    }]
                else:
                    # The whole-value search found no usable match; split into sentences
                    sentences = SentenceUtil.split_text(prop_value)
            else:
                # The whole-value search found no usable match; split into sentences
                sentences = SentenceUtil.split_text(prop_value)

            if sentences is not None:
                result_sentences, references = _process_sentence_search(
                    node_name, prop_title, sentences, trunks_service
                )
                if references:
                    prop_result["references"] = references
                if result_sentences:
                    prop_result["answer"] = result_sentences

        # Process file information
        all_files = set()
        file_index_map = {}
        file_index = 1

        # Collect file information from the references
        for prop_result in result["props"]:
            if "references" not in prop_result:
                continue
            for ref in prop_result["references"]:
                referrence = ref.get("referrence", "")
                if not (referrence and "/books/" in referrence):
                    continue
                file_name = referrence.split("/books/")[-1]
                if not file_name:
                    continue
                file_type = _get_file_type(file_name)
                if file_name not in file_index_map:
                    file_index_map[file_name] = file_index
                    file_index += 1
                all_files.add((file_name, file_type))

        # Update the reference indexes to "<file index>-<reference index>"
        for prop_result in result["props"]:
            if "references" not in prop_result:
                continue
            for ref in prop_result["references"]:
                referrence = ref.get("referrence", "")
                if referrence and "/books/" in referrence:
                    file_name = referrence.split("/books/")[-1]
                    if file_name in file_index_map:
                        ref["index"] = f"{file_index_map[file_name]}-{ref['index']}"

            # Update the flags in the answer to the combined indexes
            if "answer" in prop_result:
                for sentence in prop_result["answer"]:
                    if sentence["flag"]:
                        for ref in prop_result["references"]:
                            if ref["index"].endswith(f"-{sentence['flag']}"):
                                sentence["flag"] = ref["index"]
                                break

        # Add the file information to the result
        result["files"] = sorted([{
            "file_name": file_name,
            "file_type": file_type,
            "index": str(file_index_map[file_name])
        } for file_name, file_type in all_files], key=lambda x: int(x["index"]))

        end_time = time.time()
        logger.info(f"node_props_search took: {(end_time - start_time) * 1000:.2f}ms")

        # Cache the result
        cache_key = f"xunzheng_{request.node_id}"
        cache[cache_key] = result

        return StandardResponse(success=True, data=result)
    except Exception as e:
        logger.error(f"Node props search failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


text_search_router = router
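
# Example wiring (assumed host application, not part of this module): the exported router
# can be mounted on a FastAPI app elsewhere, for instance:
#
#   from fastapi import FastAPI
#   from text_search import text_search_router  # import path is an assumption
#
#   app = FastAPI()
#   app.include_router(text_search_router)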