# trunks_service.py (6.3 KB)
  1. from sqlalchemy import func
  2. from sqlalchemy.orm import Session
  3. from db.session import get_db
  4. from typing import List, Optional
  5. from model.trunks_model import Trunks
  6. from db.session import SessionLocal
  7. import logging
  8. from utils.vectorizer import Vectorizer
  9. logger = logging.getLogger(__name__)
  10. class TrunksService:
  11. def __init__(self):
  12. self.db = next(get_db())
  13. def create_trunk(self, trunk_data: dict) -> Trunks:
  14. # 自动生成向量和全文检索字段
  15. content = trunk_data.get('content')
  16. if 'embedding' in trunk_data and len(trunk_data['embedding']) != 1024:
  17. raise ValueError("向量维度必须为1024")
  18. trunk_data['embedding'] = Vectorizer.get_embedding(content)
  19. if 'type' not in trunk_data:
  20. trunk_data['type'] = 'default'
  21. print("embedding length:", len(trunk_data['embedding']))
  22. logger.debug(f"生成的embedding长度: {len(trunk_data['embedding'])}, 内容摘要: {content[:20]}")
  23. # trunk_data['content_tsvector'] = func.to_tsvector('chinese', content)
  24. db = SessionLocal()
  25. try:
  26. trunk = Trunks(**trunk_data)
  27. db.add(trunk)
  28. db.commit()
  29. db.refresh(trunk)
  30. return trunk
  31. except Exception as e:
  32. db.rollback()
  33. logger.error(f"创建trunk失败: {str(e)}")
  34. raise
  35. finally:
  36. db.close()
  37. def get_trunk_by_id(self, trunk_id: int) -> Optional[dict]:
  38. db = SessionLocal()
  39. try:
  40. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  41. if trunk:
  42. return {
  43. 'id': trunk.id,
  44. 'file_path': trunk.file_path,
  45. 'content': trunk.content,
  46. 'embedding': trunk.embedding.tolist(),
  47. 'type': trunk.type
  48. }
  49. return None
  50. finally:
  51. db.close()
  52. def search_by_vector(self, text: str, limit: int = 10, metadata_condition: Optional[dict] = None, type: Optional[str] = None) -> List[dict]:
  53. embedding = Vectorizer.get_embedding(text)
  54. db = SessionLocal()
  55. try:
  56. query = db.query(
  57. Trunks.id,
  58. Trunks.file_path,
  59. Trunks.content,
  60. Trunks.embedding.l2_distance(embedding).label('distance')
  61. )
  62. if metadata_condition:
  63. query = query.filter_by(**metadata_condition)
  64. if type:
  65. query = query.filter(Trunks.type == type)
  66. results = query.order_by('distance').limit(limit).all()
  67. return [{
  68. 'id': r.id,
  69. 'file_path': r.file_path,
  70. 'content': r.content,
  71. 'distance': r.distance
  72. } for r in results]
  73. finally:
  74. db.close()
  75. def fulltext_search(self, query: str) -> List[Trunks]:
  76. db = SessionLocal()
  77. try:
  78. return db.query(Trunks).filter(
  79. Trunks.content_tsvector.match(query)
  80. ).all()
  81. finally:
  82. db.close()
  83. def update_trunk(self, trunk_id: int, update_data: dict) -> Optional[Trunks]:
  84. if 'content' in update_data:
  85. content = update_data['content']
  86. update_data['embedding'] = Vectorizer.get_embedding(content)
  87. if 'type' not in update_data:
  88. update_data['type'] = 'default'
  89. logger.debug(f"更新生成的embedding长度: {len(update_data['embedding'])}, 内容摘要: {content[:20]}")
  90. # update_data['content_tsvector'] = func.to_tsvector('chinese', content)
  91. db = SessionLocal()
  92. try:
  93. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  94. if trunk:
  95. for key, value in update_data.items():
  96. setattr(trunk, key, value)
  97. db.commit()
  98. db.refresh(trunk)
  99. return trunk
  100. except Exception as e:
  101. db.rollback()
  102. logger.error(f"更新trunk失败: {str(e)}")
  103. raise
  104. finally:
  105. db.close()
  106. def delete_trunk(self, trunk_id: int) -> bool:
  107. db = SessionLocal()
  108. try:
  109. trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
  110. if trunk:
  111. db.delete(trunk)
  112. db.commit()
  113. return True
  114. return False
  115. except Exception as e:
  116. db.rollback()
  117. logger.error(f"删除trunk失败: {str(e)}")
  118. raise
  119. finally:
  120. db.close()
  121. def paginated_search(self, search_params: dict) -> dict:
  122. """
  123. 分页查询方法
  124. :param search_params: 包含keyword, pageNo, limit的字典
  125. :return: 包含结果列表和分页信息的字典
  126. """
  127. keyword = search_params.get('keyword', '')
  128. page_no = search_params.get('pageNo', 1)
  129. limit = search_params.get('limit', 10)
  130. if page_no < 1:
  131. page_no = 1
  132. if limit < 1:
  133. limit = 10
  134. embedding = Vectorizer.get_embedding(keyword)
  135. offset = (page_no - 1) * limit
  136. db = SessionLocal()
  137. try:
  138. # 获取总条数
  139. total_count = db.query(func.count(Trunks.id)).filter(Trunks.type == search_params.get('type')).scalar()
  140. # 执行向量搜索
  141. results = db.query(
  142. Trunks.id,
  143. Trunks.file_path,
  144. Trunks.content,
  145. Trunks.embedding.l2_distance(embedding).label('distance')
  146. ).filter(Trunks.type == search_params.get('type')).order_by('distance').offset(offset).limit(limit).all()
  147. return {
  148. 'data': [{
  149. 'id': r.id,
  150. 'file_path': r.file_path,
  151. 'content': r.content,
  152. 'distance': r.distance
  153. } for r in results],
  154. 'pagination': {
  155. 'total': total_count,
  156. 'pageNo': page_no,
  157. 'limit': limit,
  158. 'totalPages': (total_count + limit - 1) // limit
  159. }
  160. }
  161. finally:
  162. db.close()