"""Service layer for Trunks records: CRUD, pgvector similarity search,
full-text search, paginated search, and per-conversation result caching."""
  1. from sqlalchemy import func
  2. from sqlalchemy.orm import Session
  3. from db.session import get_db
  4. from typing import List, Optional
  5. from model.trunks_model import Trunks
  6. from db.session import SessionLocal
  7. import logging
  8. from utils.vectorizer import Vectorizer
  9. logger = logging.getLogger(__name__)
  10. class TrunksService:
  11. def __init__(self):
  12. self.db = next(get_db())
  13. def create_trunk(self, trunk_data: dict) -> Trunks:
  14. # 自动生成向量和全文检索字段
  15. content = trunk_data.get('content')
  16. if 'embedding' in trunk_data and len(trunk_data['embedding']) != 1024:
  17. raise ValueError("向量维度必须为1024")
  18. trunk_data['embedding'] = Vectorizer.get_embedding(content)
  19. if 'type' not in trunk_data:
  20. trunk_data['type'] = 'default'
  21. if 'title' not in trunk_data:
  22. from pathlib import Path
  23. trunk_data['title'] = Path(trunk_data['file_path']).stem
  24. print("embedding length:", len(trunk_data['embedding']))
  25. logger.debug(f"生成的embedding长度: {len(trunk_data['embedding'])}, 内容摘要: {content[:20]}")
  26. # trunk_data['content_tsvector'] = func.to_tsvector('chinese', content)
  27. db = SessionLocal()
  28. try:
  29. trunk = Trunks(**trunk_data)
  30. db.add(trunk)
  31. db.commit()
  32. db.refresh(trunk)
  33. return trunk
  34. except Exception as e:
  35. db.rollback()
  36. logger.error(f"创建trunk失败: {str(e)}")
  37. raise
  38. finally:
  39. db.close()
  40. def get_trunk_by_id(self, trunk_id: int) -> Optional[dict]:
  41. db = SessionLocal()
  42. try:
  43. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  44. if trunk:
  45. return {
  46. 'id': trunk.id,
  47. 'file_path': trunk.file_path,
  48. 'content': trunk.content,
  49. 'embedding': trunk.embedding.tolist(),
  50. 'type': trunk.type,
  51. 'title':trunk.title
  52. }
  53. return None
  54. finally:
  55. db.close()
  56. def search_by_vector(self, text: str, limit: int = 10, metadata_condition: Optional[dict] = None, type: Optional[str] = None, conversation_id: Optional[str] = None) -> List[dict]:
  57. embedding = Vectorizer.get_embedding(text)
  58. db = SessionLocal()
  59. try:
  60. query = db.query(
  61. Trunks.id,
  62. Trunks.file_path,
  63. Trunks.content,
  64. Trunks.embedding.l2_distance(embedding).label('distance'),
  65. Trunks.title,
  66. Trunks.embedding,
  67. Trunks.page_no,
  68. Trunks.referrence
  69. )
  70. if metadata_condition:
  71. query = query.filter_by(**metadata_condition)
  72. if type:
  73. query = query.filter(Trunks.type == type)
  74. results = query.order_by('distance').limit(limit).all()
  75. result_list = [{
  76. 'id': r.id,
  77. 'file_path': r.file_path,
  78. 'content': r.content,
  79. #保留小数点后三位
  80. 'distance': round(r.distance, 3),
  81. 'title': r.title,
  82. 'embedding': r.embedding.tolist(),
  83. 'page_no': r.page_no,
  84. 'referrence': r.referrence
  85. } for r in results]
  86. if conversation_id:
  87. self.set_cache(conversation_id, result_list)
  88. return result_list
  89. finally:
  90. db.close()
  91. def fulltext_search(self, query: str) -> List[Trunks]:
  92. db = SessionLocal()
  93. try:
  94. return db.query(Trunks).filter(
  95. Trunks.content_tsvector.match(query)
  96. ).all()
  97. finally:
  98. db.close()
  99. def update_trunk(self, trunk_id: int, update_data: dict) -> Optional[Trunks]:
  100. if 'content' in update_data:
  101. content = update_data['content']
  102. update_data['embedding'] = Vectorizer.get_embedding(content)
  103. if 'type' not in update_data:
  104. update_data['type'] = 'default'
  105. logger.debug(f"更新生成的embedding长度: {len(update_data['embedding'])}, 内容摘要: {content[:20]}")
  106. # update_data['content_tsvector'] = func.to_tsvector('chinese', content)
  107. db = SessionLocal()
  108. try:
  109. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  110. if trunk:
  111. for key, value in update_data.items():
  112. setattr(trunk, key, value)
  113. db.commit()
  114. db.refresh(trunk)
  115. return trunk
  116. except Exception as e:
  117. db.rollback()
  118. logger.error(f"更新trunk失败: {str(e)}")
  119. raise
  120. finally:
  121. db.close()
  122. def delete_trunk(self, trunk_id: int) -> bool:
  123. db = SessionLocal()
  124. try:
  125. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  126. if trunk:
  127. db.delete(trunk)
  128. db.commit()
  129. return True
  130. return False
  131. except Exception as e:
  132. db.rollback()
  133. logger.error(f"删除trunk失败: {str(e)}")
  134. raise
  135. finally:
  136. db.close()
  137. _cache = {}
  138. def get_cache(self, conversation_id: str) -> List[dict]:
  139. """
  140. 根据conversation_id获取缓存结果
  141. :param conversation_id: 会话ID
  142. :return: 结果列表
  143. """
  144. return self._cache.get(conversation_id, [])
  145. def set_cache(self, conversation_id: str, result: List[dict]) -> None:
  146. """
  147. 设置缓存结果
  148. :param conversation_id: 会话ID
  149. :param result: 要缓存的结果
  150. """
  151. self._cache[conversation_id] = result
  152. def get_cached_result(self, conversation_id: str) -> List[dict]:
  153. """
  154. 根据conversation_id获取缓存结果
  155. :param conversation_id: 会话ID
  156. :return: 结果列表
  157. """
  158. return self.get_cache(conversation_id)
  159. def paginated_search(self, search_params: dict) -> dict:
  160. """
  161. 分页查询方法
  162. :param search_params: 包含keyword, pageNo, limit的字典
  163. :return: 包含结果列表和分页信息的字典
  164. """
  165. keyword = search_params.get('keyword', '')
  166. page_no = search_params.get('pageNo', 1)
  167. limit = search_params.get('limit', 10)
  168. if page_no < 1:
  169. page_no = 1
  170. if limit < 1:
  171. limit = 10
  172. embedding = Vectorizer.get_embedding(keyword)
  173. offset = (page_no - 1) * limit
  174. db = SessionLocal()
  175. try:
  176. # 获取总条数
  177. total_count = db.query(func.count(Trunks.id)).filter(Trunks.type == search_params.get('type')).scalar()
  178. # 执行向量搜索
  179. results = db.query(
  180. Trunks.id,
  181. Trunks.file_path,
  182. Trunks.content,
  183. Trunks.embedding.l2_distance(embedding).label('distance'),
  184. Trunks.title
  185. ).filter(Trunks.type == search_params.get('type')).order_by('distance').offset(offset).limit(limit).all()
  186. return {
  187. 'data': [{
  188. 'id': r.id,
  189. 'file_path': r.file_path,
  190. 'content': r.content,
  191. #保留小数点后三位
  192. 'distance': round(r.distance, 3),
  193. 'title': r.title
  194. } for r in results],
  195. 'pagination': {
  196. 'total': total_count,
  197. 'pageNo': page_no,
  198. 'limit': limit,
  199. 'totalPages': (total_count + limit - 1) // limit
  200. }
  201. }
  202. finally:
  203. db.close()