chunc_helper.py

# from utils.es import ElasticsearchOperations
import asyncio
import os
import time
from utils.file import load_file
from libs.text_processor import TextProcessor
import json
import jieba
import hashlib
import numpy as np
import codecs
from dotenv import load_dotenv

load_dotenv()
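
# Expected structure of `data_file` (inferred from the accessors below; the file
# is assumed to be a JSON array of article records):
# [
#   {
#     "url": "...",
#     "article_text": "...",
#     "meta_data": {
#       "ArticleTitle": "...", "author": "...", "description": "...",
#       "keywords": "...", "SiteName": "...", "SiteDomain": "...",
#       "SiteIDCode": "...", "ColumnName": "...", "ColumnType": "...",
#       "PubDate": "...", "ContentSource": "..."
#     }
#   },
#   ...
# ]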

class ChuncHelper:
    def __init__(self, data_file: str, output_dir: str, user_dict: str) -> None:
        self.data_file = data_file
        # self.es = ElasticsearchOperations()
        self.processor = TextProcessor()
        self.output_dir = output_dir
        self.stop_words = set()
        if user_dict:
            jieba.load_userdict(user_dict)
        # Stop words are read from the file named by the JIEBA_STOP_DICT env var.
        with open(os.getenv("JIEBA_STOP_DICT"), "r", encoding="utf-8") as f:
            for line in f:
                self.stop_words.add(line.strip())
        self.json_data = None
        self.load_data()

    def get_doc_id(self, title: str) -> str:
        md = hashlib.md5(title.encode())
        doc_id = md.hexdigest()
        return doc_id

    def get_url(self, title: str) -> str:
        if self.json_data:
            title = title.strip()
            for item in self.json_data:
                ArticleTitle = item["meta_data"]["ArticleTitle"]
                if title == ArticleTitle:
                    return item["url"]
        return "未找到相关内容"  # "No matching content found"

    def get(self, title: str) -> str:
        if self.json_data:
            title = title.strip()
            for item in self.json_data:
                ArticleTitle = item["meta_data"]["ArticleTitle"]
                if title == ArticleTitle:
                    return item["article_text"]
        return "未找到相关内容"  # "No matching content found"

    def get_article(self, title: str):
        result = {}
        if self.json_data:
            title = title.strip()
            for item in self.json_data:
                ArticleTitle = item["meta_data"]["ArticleTitle"]
                if title == ArticleTitle:
                    result["site_name"] = item["meta_data"]["SiteName"]
                    result["site_domain"] = item["meta_data"]["SiteDomain"]
                    result["title"] = item["meta_data"]["ArticleTitle"]
                    result["author"] = item["meta_data"]["author"]
                    result["pub_date"] = item["meta_data"]["PubDate"]
                    result["article_text"] = item["article_text"]
                    result["url"] = item["url"]
                    return result
        return None

    def load_data(self):
        with open(self.data_file, "r", encoding="utf-8") as f:
            lines = f.readlines()
            self.json_data = json.loads(''.join(lines))

    def code_t(self, text):
        # Round-trip the text through UTF-8 to normalize its encoding.
        t = codecs.encode(text, "utf-8")
        return codecs.decode(t, "utf-8")

    def cut_word(self, text: str):
        # Tokenize with jieba and drop stop words.
        cut_result = jieba.cut(text)
        filtered_words = [word for word in cut_result if word not in self.stop_words]
        return filtered_words

    def title_reverse_index(self):
        '''Tokenize article titles, then build an inverted index from word to titles.'''
        if self.json_data:
            index = 1
            words_vector = {}
            total = len(self.json_data)
            for item in self.json_data:
                print(f"\r {index}/{total}", end="")
                title = item["meta_data"]["ArticleTitle"]
                title_cut = jieba.cut(title)
                for word in title_cut:
                    if word in words_vector.keys():
                        if title not in words_vector[word]["articles"]:
                            words_vector[word]["articles"].append(title)
                        continue
                    words_vector[word] = {"articles": [title]}
                index = index + 1
            with open(f"{self.output_dir}/title_reverse_index.json", "w", encoding="utf-8") as f:
                f.write(json.dumps(words_vector, ensure_ascii=False))
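
    # Output shape of title_reverse_index.json: one entry per title word,
    #   { "<word>": { "articles": ["<ArticleTitle>", ...] }, ... }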

    def cut_title_vector(self, seperate=False):
        '''Tokenize article titles and generate a vector for each word.'''
        words_vector = json.loads(load_file(f"{self.output_dir}/words_vector.json"))
        if self.json_data:
            index = 1
            total = len(self.json_data)
            for item in self.json_data:
                print(f"\r {index}/{total}", end="")
                title = item["meta_data"]["ArticleTitle"]
                title_cut = jieba.cut(title)
                for word in title_cut:
                    if word in words_vector.keys():
                        continue
                    words_vector[word] = self.processor.generate_embeddings([word])[0].tolist()
                index = index + 1
        print("\nwriting words vector files")
        if seperate == False:
            with open(f"{self.output_dir}/words_vector.json", "w", encoding="utf-8") as f:
                f.write(json.dumps(words_vector, ensure_ascii=False))
            return
        cached_json = {}
        count = 0
        size = 100
        index = 0
        for k in words_vector.keys():
            count += 1
            cached_json[k] = words_vector[k]
            if count == size:
                with open(f"{self.output_dir}/words_vector_{index}.json", "w", encoding="utf-8") as f:
                    f.write(json.dumps(cached_json, ensure_ascii=False))
                cached_json = {}
                count = 0
                index += 1
        if len(cached_json) > 0:
            with open(f"{self.output_dir}/words_vector_{index}.json", "w", encoding="utf-8") as f:
                f.write(json.dumps(cached_json, ensure_ascii=False))
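
    # cut_title_vector writes a single words_vector.json when seperate is False,
    # otherwise sharded words_vector_<i>.json files of up to 100 words each.
    # It loads an existing words_vector.json first, so that file is assumed to
    # exist before the method is called.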

    def process_data(self):
        '''Process each article: generate keyword, title, and content-chunk vectors.'''
        def default_converter(o):
            if isinstance(o, np.float32):
                return float(o)
            raise TypeError
        if self.json_data:
            total = len(self.json_data)
            print(f">>> total {total} documents ")
            index = 1
            data_collection = []
            for item in self.json_data:
                print(f">>> process {index}/{total}")
                data = {"title": self.code_t(item["meta_data"]['ArticleTitle']),
                        "keywords": self.code_t(item["meta_data"]['keywords'])}
                author = item["meta_data"]["author"]
                description = item["meta_data"]["description"]
                keywords = item["meta_data"]["keywords"]
                SiteName = item["meta_data"]["SiteName"]
                SiteDomain = item["meta_data"]["SiteDomain"]
                SiteIDCode = item["meta_data"]["SiteIDCode"]
                ColumnName = item["meta_data"]["ColumnName"]
                ColumnType = item["meta_data"]["ColumnType"]
                ArticleTitle = item["meta_data"]["ArticleTitle"]
                doc_id = self.get_doc_id(ArticleTitle)
                if os.path.exists(f"{self.output_dir}/{doc_id}.json"):
                    print(f"{doc_id} already exists, skipping")
                    index = index + 1
                    continue
                PubDate = item["meta_data"]["PubDate"]
                ContentSource = item["meta_data"]["ContentSource"]
                article_text = item["article_text"]
                text_len = len(article_text)
                print(f"{ArticleTitle}:{text_len}")
                # txt = [self.code_t(keywords), self.code_t(ArticleTitle), self.code_t(article_text)]
                txt = [self.code_t(keywords), self.code_t(ArticleTitle)]
                chuncs = self.processor.chunk_text(self.code_t(article_text))
                txt = txt + chuncs
                print(">>> start embedding...")
                embeded_text = self.processor.generate_embeddings(txt)
                # embeded_chuncs = self.processor.generate_embeddings(chuncs)
                title_cut = jieba.cut(ArticleTitle)
                keywords_cut = jieba.cut(keywords)
                data["title"] = ArticleTitle
                data["title_cut"] = list(title_cut)
                data["keywords_cut"] = list(keywords_cut)
                data["keywords_vector"] = embeded_text[0].tolist()
                data["title_vector"] = embeded_text[1].tolist()
                data['chuncs'] = chuncs
                # data["content_vector"] = embeded_text[2].tolist()
                data["chuncs_vector"] = embeded_text[2:].tolist()
                print(">>> write embedding...")
                with open(f"{self.output_dir}/{doc_id}.json", "w", encoding="utf-8") as f:
                    f.write(json.dumps(data, ensure_ascii=False))
                print(f"{doc_id} done, {index}/{total}")
                index = index + 1
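

# Hedged usage sketch, not part of the original module: shows one way the class
# might be driven end to end. The paths below are hypothetical placeholders; the
# data file must follow the record structure sketched at the top of this module,
# the output directory is assumed to exist, and JIEBA_STOP_DICT must point to a
# readable stop-word file before ChuncHelper is constructed.
if __name__ == "__main__":
    helper = ChuncHelper(
        data_file="data/articles.json",  # hypothetical input path
        output_dir="output",             # hypothetical, pre-existing directory
        user_dict="",                    # empty: skip jieba.load_userdict
    )
    helper.title_reverse_index()         # -> output/title_reverse_index.json
    helper.process_data()                # -> one <doc_id>.json per article
    article = helper.get_article("example article title")
    print(article["url"] if article else "not found")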