words_freq_in_doc.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334
import os
from collections import Counter

import jieba
  3. word_dict = []
  4. word_freq = {}
  5. word_freq_doc = {}
  6. with open("./dict/legal_terms.txt", "r", encoding="utf-8") as f:
  7. for line in f.readlines():
  8. jieba.add_word(line.strip())
  9. word_dict.append(line.strip())
  10. word_freq[line.strip()] = 0
  11. path = "./docs"
  12. for root, dirs, files in os.walk(path):
  13. for file in files:
  14. file_path = os.path.join(root, file)
  15. print(file_path)
  16. with open(file_path, "r", encoding="utf-8") as f:
  17. for line in f:
  18. words = jieba.cut(line)
  19. for w in words:
  20. if w in word_freq.keys():
  21. word_freq[w] += 1
  22. if w in word_freq_doc.keys():
  23. word_freq_doc[w] += 1
  24. else:
  25. word_freq_doc[w] = 1
  26. with open("word_feq.txt", "w", encoding="utf-8") as f:
  27. for k in word_freq.keys():
  28. f.write(f"{k} {word_freq[k]}\n")
  29. with open("word_feq_doc.txt", "w", encoding="utf-8") as f:
  30. for k in word_freq_doc.keys():
  31. f.write(f"{k} {word_freq_doc[k]}\n")