# text_processor.py — text chunking and sentence-embedding utilities.
  1. import os
  2. from sentence_transformers import SentenceTransformer
  3. from typing import List, Tuple, Dict
  4. import numpy as np
  5. from libs.singleton import Singleton
  6. from config.site import SiteConfig
  7. config = SiteConfig()
  8. EMBEDDING_MODEL = config.get_config("EMBEDDING_MODEL")
  9. class TextProcessor(Singleton):
  10. model = None
  11. def __init__(self, model_name: str = EMBEDDING_MODEL):
  12. if not self.model:
  13. if os.path.exists(model_name):
  14. self.model = SentenceTransformer(model_name_or_path=model_name)
  15. def chunk_text(self, text: str, chunk_size: int = 200, overlap: int = 50) -> List[str]:
  16. words = text.split()
  17. if len(words) < 200:
  18. chunk_size = 20
  19. overlap = 5
  20. if len(words) >= 200 and len(words) <= 400:
  21. chunk_size = 100
  22. overlap = 25
  23. chunks = []
  24. start = 0
  25. while start < len(words):
  26. end = start + chunk_size
  27. chunk = ' '.join(words[start:end])
  28. chunks.append(chunk)
  29. start = end - overlap
  30. return chunks
  31. def generate_embeddings(self, texts: List[str]) -> np.ndarray:
  32. if self.model is None:
  33. raise ValueError("Model not loaded. Please load the model first.")
  34. return self.model.encode(texts)
  35. def prepare_documents(self, text: str) -> List[Tuple[str, np.ndarray]]:
  36. if self.model is None:
  37. raise ValueError("Model not loaded. Please load the model first.")
  38. chunks = self.chunk_text(text)
  39. embeddings = self.generate_embeddings(chunks)
  40. return list(zip(chunks, embeddings))