# text_processor.py — text chunking and sentence-embedding utilities.
  1. import os
  2. from sentence_transformers import SentenceTransformer
  3. from typing import List, Tuple, Dict
  4. import numpy as np
  5. from libs.singleton import Singleton
  6. from config.site import SiteConfig
  7. config = SiteConfig()
  8. EMBEDDING_MODEL = config.get_config("EMBEDDING_MODEL")
  9. class TextProcessor(Singleton):
  10. model = None
  11. def __init__(self, model_name: str = EMBEDDING_MODEL):
  12. if not self.model:
  13. if os.path.exists(model_name):
  14. self.model = SentenceTransformer(model_name_or_path=model_name)
  15. def chunk_text(self, text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
  16. words = text.split()
  17. if len(words) < 200:
  18. chunk_size = 20
  19. overlap = 5
  20. if len(words) >= 200 and len(words) <= 400:
  21. chunk_size = 100
  22. overlap = 25
  23. chunks = []
  24. start = 0
  25. while start < len(words):
  26. end = start + chunk_size
  27. chunk = ' '.join(words[start:end])
  28. chunks.append(chunk)
  29. start = end - overlap
  30. return chunks
  31. def generate_embeddings(self, texts: List[str]) -> np.ndarray:
  32. if self.model is None:
  33. raise ValueError("Model not loaded. Please load the model first.")
  34. return self.model.encode(texts)
  35. def prepare_documents(self, text: str) -> List[Tuple[str, np.ndarray]]:
  36. if self.model is None:
  37. raise ValueError("Model not loaded. Please load the model first.")
  38. chunks = self.chunk_text(text)
  39. embeddings = self.generate_embeddings(chunks)
  40. return list(zip(chunks, embeddings))