|
@@ -789,4 +789,84 @@ async def node_props_search(request: NodePropsSearchRequest, db: Session = Depen
|
|
logger.error(f"Node props search failed: {str(e)}")
|
|
logger.error(f"Node props search failed: {str(e)}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
|
|
|
|
class FindSimilarTexts(BaseModel):
    """Request body for the ``/knowledge/text/find_similar_texts`` endpoint."""

    # Free text for which similar stored texts are looked up.
    search_text: str
|
|
|
|
+
|
|
|
|
@router.post("/knowledge/text/find_similar_texts", response_model=StandardResponse)
async def node_props_search(request: FindSimilarTexts, db: Session = Depends(get_db)):
    """Look up stored trunk texts similar to ``request.search_text``.

    Candidate trunks are fetched with a vector search and then re-ranked
    with a TF-IDF ``TextSimilarityFinder``. If the best candidate reaches
    the similarity threshold it is turned into a reference; otherwise the
    search text is split into sentences and each sentence is searched.

    Args:
        request: Request body carrying the text to search for.
        db: Database session injected by FastAPI (not used in this handler).

    Returns:
        A ``StandardResponse`` with ``success=True``.

    Raises:
        HTTPException: With status 500 when any step of the search fails.
    """
    try:
        trunks_service = TrunksService()
        search_text = request.search_text

        # Fetch candidate trunks through vector search.
        search_results = trunks_service.search_by_vector(
            text=search_text,
            limit=500,
            type='trunk',
            distance=0.7,
        )

        # Corpus for the similarity finder, plus a per-id cache of the
        # trunk fields needed to build a reference afterwards.
        trunk_texts = []
        trunk_ids = []
        trunk_details = {}
        for trunk in search_results:
            trunk_id = trunk.get('id')
            trunk_texts.append(trunk.get('content'))
            trunk_ids.append(trunk_id)
            trunk_details[trunk_id] = {
                'id': trunk_id,
                'content': trunk.get('content'),
                'file_path': trunk.get('file_path'),
                'title': trunk.get('title'),
                'referrence': trunk.get('referrence'),
                'page_no': trunk.get('page_no'),
            }

        if not trunk_texts:
            # Nothing to compare against. Return a well-formed response:
            # the route declares response_model=StandardResponse, so a bare
            # ``return`` (None) would not match it.
            return StandardResponse(success=True)

        similarity_finder = TextSimilarityFinder(method='tfidf', use_jieba=True)
        similarity_finder.load_corpus(trunk_texts, trunk_ids)
        similar_results = similarity_finder.find_most_similar(search_text, top_n=1)

        # Accept the top hit only when it reaches the similarity threshold.
        best_trunk = None
        best_similarity = None
        if similar_results and similar_results[0]['similarity'] >= 0.3:
            best_similarity = similar_results[0]['similarity']
            # 'path' presumably carries the id given to load_corpus -- TODO confirm.
            best_trunk = trunk_details.get(similar_results[0]['path'])

        # TODO: the reference(s) computed below are not yet attached to the
        # response; the endpoint currently only reports success.
        if best_trunk:
            search_result = {
                **best_trunk,
                # NOTE(review): the similarity score is stored under
                # 'distance' unchanged; no conversion is applied.
                'distance': best_similarity,
            }
            reference, _ = _process_search_result(search_result, 1)
        else:
            # No whole-text match: fall back to a sentence-by-sentence search.
            sentences = SentenceUtil.split_text(search_text, 10)
            result_sentences, references = _process_sentence_search(
                '', '', sentences, trunks_service
            )

        return StandardResponse(success=True)
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Find similar texts failed: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
+
|
|
|
|
+
|
|
# Expose this module's router under the name `text_search_router`.
text_search_router = router
|
|
text_search_router = router
|