# 运行本示例前,请先把下面这个 UTF-8 编码的文本文件放到运行脚本时的当前工作目录中:
# three.txt
from collections import Counter

import jieba
# Frequent words that should not appear in the final ranking.
excludes = {
    "将军", "却说", "二人", "后主", "上马", "不知", "天子", "大叫", "众将",
    "不可", "主公", "蜀兵", "只见", "如何", "商议", "都督", "一人", "汉中",
    "不敢", "人马", "陛下", "魏兵", "天下", "今日", "左右", "东吴", "于是",
    "荆州", "不能", "如此", "大喜", "引兵", "次日", "军士", "军马",
}

# Alternative tokens that are counted under one canonical name.
ALIASES = {
    '孔明曰': '孔明',
    '关公': '关羽',
    '云长': '关羽',
    '孟德': '曹操',
    '丞相': '曹操',
    '玄德': '刘备',
    '玄德曰': '刘备',
}


def count_words(words, excludes=(), aliases=None):
    """Count how often each word occurs, merging aliases.

    Args:
        words: Iterable of string tokens (e.g. the list returned by
            ``jieba.lcut``).
        excludes: Iterable of words to drop from the result. Words that
            never occurred are ignored rather than raising ``KeyError``.
        aliases: Optional mapping from a token to the canonical word it
            should be counted under. Tokens not in the mapping are counted
            as themselves.

    Returns:
        A ``collections.Counter`` mapping each remaining word to its count.
    """
    if aliases is None:
        aliases = {}

    counts = Counter()
    for word in words:
        # Skip single-character tokens.
        if len(word) == 1:
            continue
        # Count under the canonical name; counting under the raw token
        # would leave the aliases unmerged.
        counts[aliases.get(word, word)] += 1

    # Exclusions are applied after counting, on the canonical names.
    for word in excludes:
        counts.pop(word, None)
    return counts


if __name__ == "__main__":
    # The context manager closes the file once its content has been read.
    with open('three.txt', 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    words = jieba.lcut(content)  # segment the text into words
    counts = count_words(words, excludes, ALIASES)

    # All (word, count) pairs, most frequent first.
    items = counts.most_common()

    # Slicing prints at most ten entries and is safe with fewer than ten.
    for word, count in items[:10]:
        print(word, count)