# kg_node_service.py — knowledge-graph node service (search, CRUD, embedding back-fill)
  1. from sqlalchemy.orm import Session
  2. from typing import Optional
  3. from model.kg_node import KGNode
  4. from db.session import get_db
  5. import logging
  6. from sqlalchemy.exc import IntegrityError
  7. from utils import vectorizer
  8. from utils.vectorizer import Vectorizer
  9. from sqlalchemy import func
  10. from service.kg_prop_service import KGPropService
  11. from service.kg_edge_service import KGEdgeService
  12. from cachetools import TTLCache
  13. from cachetools.keys import hashkey
logger = logging.getLogger(__name__)
# Default maximum L2 distance for a node to count as a vector-search match
# (used by paginated_search when the caller supplies no 'distance').
DISTANCE_THRESHOLD = 0.65
  16. class KGNodeService:
  17. def __init__(self, db: Session):
  18. self.db = db
  19. _cache = TTLCache(maxsize=100000, ttl=60*60*24*30)
  20. def search_title_index(self, index: str, keywrod: str,category: str, top_k: int = 3,distance: float = 0.3) -> Optional[int]:
  21. cache_key = f"search_title_index_{index}:{keywrod}:{category}:{top_k}:{distance}"
  22. if cache_key in self._cache:
  23. return self._cache[cache_key]
  24. query_embedding = Vectorizer.get_embedding(keywrod)
  25. db = next(get_db())
  26. # 执行向量搜索
  27. results = (
  28. db.query(
  29. KGNode.id,
  30. KGNode.name,
  31. KGNode.category,
  32. KGNode.embedding.l2_distance(query_embedding).label('distance')
  33. )
  34. .filter(KGNode.status == 0)
  35. .filter(KGNode.category == category)
  36. #todo 是否能提高性能 改成余弦算法
  37. .filter(KGNode.embedding.l2_distance(query_embedding) <= distance)
  38. .order_by('distance').limit(top_k).all()
  39. )
  40. results = [
  41. {
  42. "id": node.id,
  43. "title": node.name,
  44. "text": node.category,
  45. "score": 2.0-node.distance
  46. }
  47. for node in results
  48. ]
  49. self._cache[cache_key] = results
  50. return results
  51. def paginated_search(self, search_params: dict) -> dict:
  52. load_props = search_params.get('load_props', False)
  53. prop_service = KGPropService(self.db)
  54. edge_service = KGEdgeService(self.db)
  55. keyword = search_params.get('keyword', '')
  56. category = search_params.get('category', None)
  57. page_no = search_params.get('pageNo', 1)
  58. distance = search_params.get('distance',DISTANCE_THRESHOLD)
  59. limit = search_params.get('limit', 10)
  60. if page_no < 1:
  61. page_no = 1
  62. if limit < 1:
  63. limit = 10
  64. embedding = Vectorizer.get_embedding(keyword)
  65. offset = (page_no - 1) * limit
  66. try:
  67. # 构建基础查询条件
  68. base_query = self.db.query(func.count(KGNode.id)).filter(
  69. KGNode.status == 0,
  70. KGNode.embedding.l2_distance(embedding) < distance
  71. )
  72. # 如果有category,则添加额外过滤条件
  73. if category:
  74. base_query = base_query.filter(KGNode.category == category)
  75. # 如果有knowledge_ids,则添加额外过滤条件
  76. if search_params.get('knowledge_ids'):
  77. total_count = base_query.filter(
  78. KGNode.version.in_(search_params['knowledge_ids'])
  79. ).scalar()
  80. else:
  81. total_count = base_query.scalar()
  82. query = self.db.query(
  83. KGNode.id,
  84. KGNode.name,
  85. KGNode.category,
  86. KGNode.embedding.l2_distance(embedding).label('distance')
  87. )
  88. query = query.filter(KGNode.status == 0)
  89. #category有值时,过滤掉category不等于category的节点
  90. if category:
  91. query = query.filter(KGNode.category == category)
  92. if search_params.get('knowledge_ids'):
  93. query = query.filter(KGNode.version.in_(search_params['knowledge_ids']))
  94. query = query.filter(KGNode.embedding.l2_distance(embedding) < distance)
  95. results = query.order_by('distance').offset(offset).limit(limit).all()
  96. #将results相同distance的category=疾病的放在前面
  97. results = sorted(results, key=lambda x: (x.distance, not x.category == '疾病'))
  98. return {
  99. 'records': [{
  100. 'id': r.id,
  101. 'name': r.name,
  102. 'category': r.category,
  103. 'props': prop_service.get_props_by_ref_id(r.id) if load_props else [],
  104. #'edges':edge_service.get_edges_by_nodes(r.id, r.id,False) if load_props else [],
  105. 'distance': round(r.distance, 3)
  106. } for r in results],
  107. 'pagination': {
  108. 'total': total_count,
  109. 'pageNo': page_no,
  110. 'limit': limit,
  111. 'totalPages': (total_count + limit - 1) // limit
  112. }
  113. }
  114. except Exception as e:
  115. logger.error(f"分页查询失败: {str(e)}")
  116. raise e
  117. def create_node(self, node_data: dict):
  118. try:
  119. existing = self.db.query(KGNode).filter(
  120. KGNode.name == node_data['name'],
  121. KGNode.category == node_data['category'],
  122. KGNode.version == node_data.get('version')
  123. ).first()
  124. if existing:
  125. raise ValueError("Node already exists")
  126. new_node = KGNode(**node_data)
  127. self.db.add(new_node)
  128. self.db.commit()
  129. return new_node
  130. except IntegrityError as e:
  131. self.db.rollback()
  132. logger.error(f"创建节点失败: {str(e)}")
  133. raise ValueError("Database integrity error")
  134. def get_node(self, node_id: int):
  135. if node_id is None:
  136. raise ValueError("Node ID is required")
  137. cache_key = f"get_node_{node_id}"
  138. if cache_key in self._cache:
  139. return self._cache[cache_key]
  140. node = self.db.query(KGNode).filter(KGNode.id == node_id, KGNode.status == 0).first()
  141. if not node:
  142. raise ValueError("Node not found")
  143. node_data = {
  144. 'id': node.id,
  145. 'name': node.name,
  146. 'category': node.category,
  147. 'version': node.version
  148. }
  149. self._cache[cache_key] = node_data
  150. return node_data
  151. def update_node(self, node_id: int, update_data: dict):
  152. node = self.db.query(KGNode).get(node_id)
  153. if not node:
  154. raise ValueError("Node not found")
  155. try:
  156. for key, value in update_data.items():
  157. setattr(node, key, value)
  158. self.db.commit()
  159. return node
  160. except Exception as e:
  161. self.db.rollback()
  162. logger.error(f"更新节点失败: {str(e)}")
  163. raise ValueError("Update failed")
  164. def delete_node(self, node_id: int):
  165. node = self.db.query(KGNode).get(node_id)
  166. if not node:
  167. raise ValueError("Node not found")
  168. try:
  169. self.db.delete(node)
  170. self.db.commit()
  171. return None
  172. except Exception as e:
  173. self.db.rollback()
  174. logger.error(f"删除节点失败: {str(e)}")
  175. raise ValueError("Delete failed")
  176. def batch_process_er_nodes(self):
  177. batch_size = 200
  178. offset = 0
  179. while True:
  180. try:
  181. #下面的查询语句,增加根据id排序,防止并发问题
  182. nodes = self.db.query(KGNode).filter(
  183. #KGNode.version == 'ER',
  184. KGNode.embedding == None
  185. ).order_by(KGNode.id).offset(offset).limit(batch_size).all()
  186. if not nodes:
  187. break
  188. updated_nodes = []
  189. for node in nodes:
  190. if not node.embedding:
  191. embedding = Vectorizer.get_embedding(node.name)
  192. node.embedding = embedding
  193. updated_nodes.append(node)
  194. if updated_nodes:
  195. self.db.commit()
  196. offset += batch_size
  197. except Exception as e:
  198. self.db.rollback()
  199. print(f"批量处理ER节点失败: {str(e)}")
  200. raise ValueError("Batch process failed")