"""Text chunking and sentence-embedding helpers built on SentenceTransformer."""

import os
from typing import List, Tuple, Dict

import numpy as np
from sentence_transformers import SentenceTransformer

from libs.singleton import Singleton
from config.site import SiteConfig

config = SiteConfig()
EMBEDDING_MODEL = config.get_config("EMBEDDING_MODEL")


class TextProcessor(Singleton):
    """Split text into overlapping word chunks and embed them.

    Inherits from the project's ``Singleton`` base.
    NOTE(review): presumably this means one shared instance per process --
    confirm against ``libs.singleton``.
    """

    # Loaded SentenceTransformer, or None when no model has been loaded.
    model = None

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """Load the embedding model from a local path if none is loaded yet.

        Args:
            model_name: Filesystem path of the model. When the path does not
                exist nothing is loaded and ``self.model`` stays ``None``;
                the embedding methods then raise ``ValueError``.
        """
        if self.model is None and os.path.exists(model_name):
            self.model = SentenceTransformer(model_name_or_path=model_name)

    def chunk_text(self, text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        The window is adapted to the text length: texts under 200 words use
        20-word chunks with a 5-word overlap, and texts of 200 to 400 words
        use 100-word chunks with a 25-word overlap. The ``chunk_size`` and
        ``overlap`` arguments are honored only for texts over 400 words.

        Args:
            text: Input text; it is split on whitespace.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by two consecutive chunks.

        Returns:
            The chunks in order, each a space-joined run of words. Empty or
            whitespace-only text yields an empty list.

        Raises:
            ValueError: If the effective ``chunk_size`` is not positive or
                the effective ``overlap`` is not smaller than it.
        """
        words = text.split()
        total = len(words)

        if total < 200:
            chunk_size, overlap = 20, 5
        elif total <= 400:
            chunk_size, overlap = 100, 25

        # Validate after the short-text override so that arguments which are
        # ignored anyway cannot make a previously working call fail.
        if chunk_size <= 0 or overlap >= chunk_size:
            raise ValueError(
                "chunk_size must be positive and overlap must be smaller "
                f"than chunk_size (got chunk_size={chunk_size}, overlap={overlap})."
            )

        step = chunk_size - overlap
        chunks = []
        start = 0
        while start < total:
            end = start + chunk_size
            chunks.append(' '.join(words[start:end]))
            if end >= total:
                # This window already reached the last word. Stepping back by
                # `overlap` would only re-emit a fragment of this chunk.
                break
            start += step
        return chunks

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode ``texts`` with the loaded model.

        Args:
            texts: Strings to embed.

        Returns:
            Whatever ``self.model.encode(texts)`` returns.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.model is None:
            raise ValueError("Model not loaded. Please load the model first.")
        return self.model.encode(texts)

    def prepare_documents(self, text: str) -> List[Tuple[str, np.ndarray]]:
        """Chunk ``text`` and pair every chunk with its embedding.

        Args:
            text: Input text to chunk and embed.

        Returns:
            A list of ``(chunk, embedding)`` tuples in chunk order.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Fail before chunking when there is no model to embed with.
        if self.model is None:
            raise ValueError("Model not loaded. Please load the model first.")

        chunks = self.chunk_text(text)
        embeddings = self.generate_embeddings(chunks)
        return list(zip(chunks, embeddings))