vectorizer.py 1.0 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. import logging
  2. from typing import List
  3. import numpy as np
  4. import requests
  5. from requests.adapters import HTTPAdapter
  6. from urllib3.util.retry import Retry
  7. from utils.embed_helper import EmbedHelper
  8. from utils.vector_distance import VectorDistance
  9. logger = logging.getLogger(__name__)
  10. class Vectorizer:
  11. _instance = None
  12. def __init__(self):
  13. self.embedHelper = EmbedHelper()
  14. def get_embedding(self, text: str) -> List[float]:
  15. return self.embedHelper.embed_text(text)
  16. @classmethod
  17. def get_instance(cls):
  18. if cls._instance is None:
  19. cls._instance = cls()
  20. return cls._instance
  21. def chunk_text(self, text: str, chunk_size: int = 500) -> List[str]:
  22. tokens = self.tokenizer.tokenize(text)
  23. return [self.tokenizer.convert_tokens_to_string(tokens[i:i+chunk_size])
  24. for i in range(0, len(tokens), chunk_size)]
  25. if __name__ == '__main__':
  26. text = '你好'
  27. print(text)
  28. embedding2 = Vectorizer.get_instance().get_embedding(text)