from typing import List, Tuple

import numpy as np
from sentence_transformers import SentenceTransformer

from config.site import SiteConfig
from libs.singleton import Singleton

config = SiteConfig()
EMBEDDING_MODEL = config.get_config("EMBEDDING_MODEL")

class TextProcessor(Singleton):
    """Singleton wrapper around a SentenceTransformer model for chunking
    and embedding text."""

    model = None

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        # Load the model only once per process. SentenceTransformer accepts
        # either a local directory or a Hugging Face model name, so the
        # os.path.exists() guard (which silently left `model` as None for
        # hub names, causing ValueErrors later) is not needed.
        if self.model is None:
            self.model = SentenceTransformer(model_name_or_path=model_name)

    def chunk_text(self, text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
        """Split text into overlapping word-based chunks, shrinking the
        chunk size and overlap for shorter documents."""
        words = text.split()
        if len(words) < 200:
            chunk_size = 20
            overlap = 5
        elif len(words) <= 400:
            chunk_size = 100
            overlap = 25

        chunks = []
        start = 0
        while start < len(words):
            end = start + chunk_size
            chunks.append(' '.join(words[start:end]))
            if end >= len(words):
                # The final chunk already covers the rest of the text; stop
                # here so the overlap step cannot emit a redundant tail chunk.
                break
            start = end - overlap

        return chunks

    def generate_embeddings(self, texts: List[str]) -> np.ndarray:
        """Encode a list of texts into a (n_texts, embedding_dim) array."""
        if self.model is None:
            raise ValueError("Model not loaded. Please load the model first.")
        return self.model.encode(texts)

    def prepare_documents(self, text: str) -> List[Tuple[str, np.ndarray]]:
        """Chunk the text and pair each chunk with its embedding vector.

        generate_embeddings() already validates that the model is loaded,
        so no separate check is needed here."""
        chunks = self.chunk_text(text)
        embeddings = self.generate_embeddings(chunks)
        return list(zip(chunks, embeddings))
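
# A minimal usage sketch (illustrative, not part of the original module).
# It assumes EMBEDDING_MODEL resolves to a valid SentenceTransformer model
# (e.g. "all-MiniLM-L6-v2") and that libs.singleton.Singleton permits normal
# instantiation; the sample text below is invented for demonstration.
if __name__ == "__main__":
    processor = TextProcessor()
    sample = "Sentence transformers map text to dense vectors. " * 30
    for chunk, embedding in processor.prepare_documents(sample)[:2]:
        print(f"{embedding.shape[0]}-dim embedding for chunk: {chunk[:60]}...")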