12345678910111213141516171819202122232425262728293031323334 |
"""Count legal-term and overall token frequencies across a document tree.

Reads custom terms from ./dict/legal_terms.txt (one per line), registers
them with jieba, segments every file under ./docs, and writes two
space-delimited "<token> <count>" reports to the working directory:

* word_feq.txt      -- one row per dictionary term (0 when never seen)
* word_feq_doc.txt  -- one row per token that jieba produced
"""
import os

import jieba

# Every dictionary term, in file order.
word_dict = []
# Dictionary term -> number of occurrences across all documents.
word_freq = {}
# Any token produced by jieba -> number of occurrences across all documents.
word_freq_doc = {}

with open("./dict/legal_terms.txt", "r", encoding="utf-8") as f:
    for line in f:
        term = line.strip()
        if not term:
            # A blank line would otherwise register "" with jieba and end up
            # as a stray " 0" row in word_feq.txt.
            continue
        jieba.add_word(term)
        word_dict.append(term)
        word_freq[term] = 0

path = "./docs"
for root, _dirs, filenames in os.walk(path):
    for filename in filenames:
        file_path = os.path.join(root, filename)
        print(file_path)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                for w in jieba.cut(line):
                    if not w.strip():
                        # jieba emits spaces and newlines as tokens; writing
                        # them out would break the "<token> <count>" rows.
                        continue
                    if w in word_freq:
                        word_freq[w] += 1
                    # NOTE(review): the original indentation was lost. This
                    # assumes the overall tally covers every token, not only
                    # dictionary terms -- confirm against the intended output.
                    word_freq_doc[w] = word_freq_doc.get(w, 0) + 1

with open("word_feq.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{k} {count}\n" for k, count in word_freq.items())

with open("word_feq_doc.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{k} {count}\n" for k, count in word_freq_doc.items())
|