trunks_service.py

import logging
from pathlib import Path
from typing import List, Optional

from sqlalchemy import func

from db.session import SessionLocal
from model.trunks_model import Trunks
from utils.sentence_util import SentenceUtil
from utils.vectorizer import Vectorizer

logger = logging.getLogger(__name__)


class TrunksService:
    def create_trunk(self, trunk_data: dict) -> Trunks:
        # Automatically generate the embedding and full-text search fields.
        content = trunk_data.get('content')
        if 'embedding' in trunk_data:
            # Accept a pre-computed embedding only if it has the expected dimension.
            if len(trunk_data['embedding']) != 1024:
                raise ValueError("Embedding dimension must be 1024")
        else:
            trunk_data['embedding'] = Vectorizer.get_embedding(content)
        if 'type' not in trunk_data:
            trunk_data['type'] = 'default'
        if 'title' not in trunk_data:
            trunk_data['title'] = Path(trunk_data['file_path']).stem
        logger.debug(f"Generated embedding length: {len(trunk_data['embedding'])}, content preview: {content[:20]}")
        # trunk_data['content_tsvector'] = func.to_tsvector('chinese', content)
        db = SessionLocal()
        try:
            trunk = Trunks(**trunk_data)
            db.add(trunk)
            db.commit()
            db.refresh(trunk)
            return trunk
        except Exception as e:
            db.rollback()
            logger.error(f"Failed to create trunk: {str(e)}")
            raise
        finally:
            db.close()

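    # Note on inputs, as used above: 'content' is required; 'title' falls back to
    # the stem of 'file_path' when absent; 'embedding' (1024-dim) and 'type' are
    # optional and are generated or defaulted when missing.
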
    def get_trunk_by_id(self, trunk_id: int) -> Optional[dict]:
        db = SessionLocal()
        try:
            trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
            if trunk:
                return {
                    'id': trunk.id,
                    'file_path': trunk.file_path,
                    'content': trunk.content,
                    'embedding': trunk.embedding.tolist(),
                    'type': trunk.type,
                    'title': trunk.title
                }
            return None
        finally:
            db.close()

    def search_by_vector(self, text: str, limit: int = 10, file_path: Optional[str] = None,
                         distance: Optional[float] = None, type: Optional[str] = None,
                         conversation_id: Optional[str] = None) -> List[dict]:
        embedding = Vectorizer.get_embedding(text)
        db = SessionLocal()
        try:
            distance_col = Trunks.embedding.l2_distance(embedding).label('distance')
            query = db.query(
                Trunks.id,
                Trunks.file_path,
                Trunks.content,
                distance_col,
                Trunks.title,
                Trunks.embedding,
                Trunks.page_no,
                Trunks.referrence,
                Trunks.meta_header
            )
            if distance is not None:
                query = query.filter(Trunks.embedding.l2_distance(embedding) <= distance)
            if type:
                query = query.filter(Trunks.type == type)
            if file_path:
                query = query.filter(Trunks.file_path.like('%' + file_path + '%'))
            results = query.order_by(distance_col).limit(limit).all()
            result_list = [{
                'id': r.id,
                'file_path': r.file_path,
                'content': r.content,
                # Round the distance to three decimal places.
                'distance': round(r.distance, 3),
                'title': r.title,
                'embedding': r.embedding.tolist(),
                'page_no': r.page_no,
                'referrence': r.referrence,
                'meta_header': r.meta_header
            } for r in results]
            if conversation_id:
                self.set_cache(conversation_id, result_list)
            return result_list
        finally:
            db.close()

    def fulltext_search(self, query: str) -> List[Trunks]:
        db = SessionLocal()
        try:
            return db.query(Trunks).filter(
                Trunks.content_tsvector.match(query)
            ).all()
        finally:
            db.close()

    def update_trunk(self, trunk_id: int, update_data: dict) -> Optional[Trunks]:
        if 'content' in update_data:
            content = update_data['content']
            update_data['embedding'] = Vectorizer.get_embedding(content)
            if 'type' not in update_data:
                update_data['type'] = 'default'
            logger.debug(f"Updated embedding length: {len(update_data['embedding'])}, content preview: {content[:20]}")
            # update_data['content_tsvector'] = func.to_tsvector('chinese', content)
        db = SessionLocal()
        try:
            trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
            if trunk:
                for key, value in update_data.items():
                    setattr(trunk, key, value)
                db.commit()
                db.refresh(trunk)
            return trunk
        except Exception as e:
            db.rollback()
            logger.error(f"Failed to update trunk: {str(e)}")
            raise
        finally:
            db.close()

    def delete_trunk(self, trunk_id: int) -> bool:
        db = SessionLocal()
        try:
            trunk = db.query(Trunks).filter(Trunks.id == trunk_id).first()
            if trunk:
                db.delete(trunk)
                db.commit()
                return True
            return False
        except Exception as e:
            db.rollback()
            logger.error(f"Failed to delete trunk: {str(e)}")
            raise
        finally:
            db.close()

    def highlight(self, trunk_id: int, targetSentences: List[str]) -> List[int]:
        trunk = self.get_trunk_by_id(trunk_id)
        if not trunk:
            return []
        content = trunk['content']
        sentence_util = SentenceUtil()
        cleanedContent = sentence_util.clean_text(content)
        result = []
        for i, targetSentence in enumerate(targetSentences):
            cleanedTarget = sentence_util.clean_text(targetSentence)
            # Skip targets shorter than 5 characters after cleaning.
            if len(cleanedTarget) < 5:
                continue
            if cleanedTarget in cleanedContent:
                result.append(i)
        # Fill in gaps between nearly consecutive indices.
        if result:
            result.sort()
            filled_result = []
            prev = result[0]
            filled_result.append(prev)
            for current in result[1:]:
                if current - prev <= 2:
                    for i in range(prev + 1, current):
                        filled_result.append(i)
                filled_result.append(current)
                prev = current
            return filled_result
        return result

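    # Gap-filling example: if matches land at indices [0, 2, 5], the gap between
    # 0 and 2 (<= 2) is bridged to give [0, 1, 2, 5]; index 5 is too far from 2
    # to bridge, so nothing is inserted before it.
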
    # Simple in-process cache of search results, keyed by conversation_id.
    # Note: this is a class attribute, so it is shared across instances.
    _cache = {}

    def get_cache(self, conversation_id: str) -> List[dict]:
        """
        Get cached results by conversation_id.
        :param conversation_id: conversation ID
        :return: result list
        """
        return self._cache.get(conversation_id, [])

    def set_cache(self, conversation_id: str, result: List[dict]) -> None:
        """
        Cache the given results.
        :param conversation_id: conversation ID
        :param result: results to cache
        """
        self._cache[conversation_id] = result

    def get_cached_result(self, conversation_id: str) -> List[dict]:
        """
        Get cached results by conversation_id (delegates to get_cache).
        :param conversation_id: conversation ID
        :return: result list
        """
        return self.get_cache(conversation_id)

    def paginated_search_by_type_and_filepath(self, search_params: dict) -> dict:
        """
        Paginated query by type and file_path.
        :param search_params: dict containing pageNo, limit, and optional type and file_path
        :return: dict containing the result list
        """
        page_no = search_params.get('pageNo', 1)
        limit = search_params.get('limit', 10)
        file_path = search_params.get('file_path', None)
        type = search_params.get('type', None)
        if page_no < 1:
            page_no = 1
        if limit < 1:
            limit = 10
        offset = (page_no - 1) * limit
        db = SessionLocal()
        try:
            query = db.query(
                Trunks.id,
                Trunks.file_path,
                Trunks.content,
                Trunks.type,
                Trunks.title
            )
            if type:
                query = query.filter(Trunks.type == type)
            if file_path:
                query = query.filter(Trunks.file_path.like('%' + file_path + '%'))
            # Only match trunks without a page number.
            query = query.filter(Trunks.page_no.is_(None))
            results = query.offset(offset).limit(limit).all()
            return {
                'data': [{
                    'id': r.id,
                    'file_path': r.file_path,
                    'content': r.content,
                    'type': r.type,
                    'title': r.title
                } for r in results]
            }
        finally:
            db.close()

    def paginated_search(self, search_params: dict) -> dict:
        """
        Paginated vector search.
        :param search_params: dict containing keyword, pageNo, limit, and type
        :return: dict containing the result list and pagination info
        """
        keyword = search_params.get('keyword', '')
        page_no = search_params.get('pageNo', 1)
        limit = search_params.get('limit', 10)
        if page_no < 1:
            page_no = 1
        if limit < 1:
            limit = 10
        embedding = Vectorizer.get_embedding(keyword)
        offset = (page_no - 1) * limit
        db = SessionLocal()
        try:
            # Total number of rows for the requested type.
            total_count = db.query(func.count(Trunks.id)).filter(
                Trunks.type == search_params.get('type')
            ).scalar()
            # Run the vector search, ordered by L2 distance.
            distance_col = Trunks.embedding.l2_distance(embedding).label('distance')
            results = db.query(
                Trunks.id,
                Trunks.file_path,
                Trunks.content,
                distance_col,
                Trunks.title
            ).filter(
                Trunks.type == search_params.get('type')
            ).order_by(distance_col).offset(offset).limit(limit).all()
            return {
                'data': [{
                    'id': r.id,
                    'file_path': r.file_path,
                    'content': r.content,
                    # Round the distance to three decimal places.
                    'distance': round(r.distance, 3),
                    'title': r.title
                } for r in results],
                'pagination': {
                    'total': total_count,
                    'pageNo': page_no,
                    'limit': limit,
                    'totalPages': (total_count + limit - 1) // limit
                }
            }
        finally:
            db.close()
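

# A minimal usage sketch, assuming a configured database, an existing Trunks
# table, and a working Vectorizer. The file path, content, and query text below
# are hypothetical examples, not values from this codebase.
if __name__ == "__main__":
    service = TrunksService()
    created = service.create_trunk({
        'file_path': 'docs/example.txt',  # hypothetical
        'content': 'An example paragraph stored for the demo.',
    })
    hits = service.search_by_vector('example paragraph', limit=3)
    print([h['distance'] for h in hits])
    print(service.highlight(created.id, ['An example paragraph stored for the demo.']))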