# kg_node_service.py
  1. from sqlalchemy.orm import Session
  2. from typing import Optional
  3. from model.kg_node import KGNode
  4. from db.session import get_db
  5. import logging
  6. from sqlalchemy.exc import IntegrityError
  7. from utils import vectorizer
  8. from utils.vectorizer import Vectorizer
  9. from sqlalchemy import func
  10. from service.kg_prop_service import KGPropService
  11. from service.kg_edge_service import KGEdgeService
  12. logger = logging.getLogger(__name__)
  13. DISTANCE_THRESHOLD = 0.65
  14. DISTANCE_THRESHOLD2 = 0.3
  15. class KGNodeService:
  16. def __init__(self, db: Session):
  17. self.db = db
  18. _cache = {}
  19. def search_title_index(self, index: str, title: str, top_k: int = 3):
  20. cache_key = f"{index}:{title}:{top_k}"
  21. if cache_key in self._cache:
  22. return self._cache[cache_key]
  23. query_embedding = Vectorizer.get_embedding(title)
  24. db = next(get_db())
  25. # 执行向量搜索
  26. results = (
  27. db.query(
  28. KGNode.id,
  29. KGNode.name,
  30. KGNode.category,
  31. KGNode.embedding.l2_distance(query_embedding).label('distance')
  32. )
  33. .filter(KGNode.status == 0)
  34. #过滤掉version不等于'er'的节点
  35. .filter(KGNode.version != 'ER')
  36. .filter(KGNode.embedding.l2_distance(query_embedding) <= DISTANCE_THRESHOLD2)
  37. .order_by('distance').limit(top_k).all()
  38. )
  39. results = [
  40. {
  41. "id": node.id,
  42. "title": node.name,
  43. "text": node.category,
  44. "score": 2.0-node.distance
  45. }
  46. for node in results
  47. ]
  48. self._cache[cache_key] = results
  49. return results
  50. def paginated_search(self, search_params: dict) -> dict:
  51. load_props = search_params.get('load_props', False)
  52. prop_service = KGPropService(self.db)
  53. edge_service = KGEdgeService(self.db)
  54. keyword = search_params.get('keyword', '')
  55. page_no = search_params.get('pageNo', 1)
  56. limit = search_params.get('limit', 10)
  57. if page_no < 1:
  58. page_no = 1
  59. if limit < 1:
  60. limit = 10
  61. embedding = Vectorizer.get_embedding(keyword)
  62. offset = (page_no - 1) * limit
  63. try:
  64. total_count = self.db.query(func.count(KGNode.id)).filter(KGNode.version.in_(search_params['knowledge_ids']), KGNode.embedding.l2_distance(embedding) < DISTANCE_THRESHOLD).scalar() if search_params.get('knowledge_ids') else self.db.query(func.count(KGNode.id)).filter(KGNode.embedding.l2_distance(embedding) < DISTANCE_THRESHOLD).scalar()
  65. query = self.db.query(
  66. KGNode.id,
  67. KGNode.name,
  68. KGNode.category,
  69. KGNode.embedding.l2_distance(embedding).label('distance')
  70. )
  71. query = query.filter(KGNode.status == 0)
  72. if search_params.get('knowledge_ids'):
  73. query = query.filter(KGNode.version.in_(search_params['knowledge_ids']))
  74. query = query.filter(KGNode.embedding.l2_distance(embedding) < DISTANCE_THRESHOLD)
  75. results = query.order_by('distance').offset(offset).limit(limit).all()
  76. #将results相同distance的category=疾病的放在前面
  77. results = sorted(results, key=lambda x: (x.distance, not x.category == '疾病'))
  78. return {
  79. 'records': [{
  80. 'id': r.id,
  81. 'name': r.name,
  82. 'category': r.category,
  83. 'props': prop_service.get_props_by_ref_id(r.id) if load_props else [],
  84. #'edges':edge_service.get_edges_by_nodes(r.id, r.id,False) if load_props else [],
  85. 'distance': round(r.distance, 3)
  86. } for r in results],
  87. 'pagination': {
  88. 'total': total_count,
  89. 'pageNo': page_no,
  90. 'limit': limit,
  91. 'totalPages': (total_count + limit - 1) // limit
  92. }
  93. }
  94. except Exception as e:
  95. logger.error(f"分页查询失败: {str(e)}")
  96. raise e
  97. def create_node(self, node_data: dict):
  98. try:
  99. existing = self.db.query(KGNode).filter(
  100. KGNode.name == node_data['name'],
  101. KGNode.category == node_data['category'],
  102. KGNode.version == node_data.get('version')
  103. ).first()
  104. if existing:
  105. raise ValueError("Node already exists")
  106. new_node = KGNode(**node_data)
  107. self.db.add(new_node)
  108. self.db.commit()
  109. return new_node
  110. except IntegrityError as e:
  111. self.db.rollback()
  112. logger.error(f"创建节点失败: {str(e)}")
  113. raise ValueError("Database integrity error")
  114. def get_node(self, node_id: int):
  115. if node_id is None:
  116. raise ValueError("Node ID is required")
  117. node = self.db.query(KGNode).filter(KGNode.id == node_id, KGNode.status == 0).first()
  118. if not node:
  119. raise ValueError("Node not found")
  120. return {
  121. 'id': node.id,
  122. 'name': node.name,
  123. 'category': node.category,
  124. 'version': node.version
  125. }
  126. def update_node(self, node_id: int, update_data: dict):
  127. node = self.db.query(KGNode).get(node_id)
  128. if not node:
  129. raise ValueError("Node not found")
  130. try:
  131. for key, value in update_data.items():
  132. setattr(node, key, value)
  133. self.db.commit()
  134. return node
  135. except Exception as e:
  136. self.db.rollback()
  137. logger.error(f"更新节点失败: {str(e)}")
  138. raise ValueError("Update failed")
  139. def delete_node(self, node_id: int):
  140. node = self.db.query(KGNode).get(node_id)
  141. if not node:
  142. raise ValueError("Node not found")
  143. try:
  144. self.db.delete(node)
  145. self.db.commit()
  146. return None
  147. except Exception as e:
  148. self.db.rollback()
  149. logger.error(f"删除节点失败: {str(e)}")
  150. raise ValueError("Delete failed")
  151. def batch_process_er_nodes(self):
  152. batch_size = 200
  153. offset = 0
  154. while True:
  155. try:
  156. nodes = self.db.query(KGNode).filter(
  157. #KGNode.version == 'ER',
  158. KGNode.embedding == None
  159. ).offset(offset).limit(batch_size).all()
  160. if not nodes:
  161. break
  162. updated_nodes = []
  163. for node in nodes:
  164. if not node.embedding:
  165. embedding = Vectorizer.get_embedding(node.name)
  166. node.embedding = embedding
  167. updated_nodes.append(node)
  168. if updated_nodes:
  169. self.db.commit()
  170. offset += batch_size
  171. except Exception as e:
  172. self.db.rollback()
  173. print(f"批量处理ER节点失败: {str(e)}")
  174. raise ValueError("Batch process failed")