import jieba
# Frequent multi-character tokens that are not character names; they are
# removed from the tally before ranking.
excludes = {
    "一个", "那里", "怎么", "我们", "不知", "和尚", "妖精", "两个", "甚么", "不是",
    "只见", "国王", "徒弟", "呆子", "如何", "这个", "大王", "原来", "不敢", "不曾",
    "闻言", "正是", "只是", "那怪", "出来", "一声", "真个", "小妖",
}

# Alternate forms of address folded into a single canonical name so their
# occurrences are counted together.
ALIASES = {
    "老孙": "行者",
    "大圣": "行者",
    "悟空": "行者",
    "师父": "唐僧",
    "三藏": "唐僧",
    "长老": "唐僧",
}


def count_names(words, excluded=None):
    """Tally how often each multi-character token occurs in *words*.

    Single-character tokens are skipped, tokens listed in ``ALIASES`` are
    counted under their canonical name, and every token in *excluded*
    (default: the module-level ``excludes`` set) is dropped from the result.

    Args:
        words: Iterable of string tokens (e.g. the output of ``jieba.lcut``).
        excluded: Optional iterable of tokens to remove from the tally.

    Returns:
        A dict mapping token -> occurrence count, in first-seen order.
    """
    if excluded is None:
        excluded = excludes

    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        name = ALIASES.get(word, word)
        counts[name] = counts.get(name, 0) + 1

    # pop() with a default instead of `del`: an excluded token that never
    # occurs in the text must not raise KeyError.
    for word in excluded:
        counts.pop(word, None)
    return counts


def top_names(counts, limit=10):
    """Return up to *limit* ``(token, count)`` pairs, highest count first.

    The sort is stable, so tokens with equal counts keep the order in which
    they appear in *counts*. Slicing (rather than indexing) means a tally
    with fewer than *limit* entries simply yields a shorter list.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Read the novel, segment it with jieba and print the ten top tokens."""
    # Context manager so the file handle is closed as soon as it is read.
    with open("西游记.txt", "r", encoding="gb18030") as source:
        txt = source.read()

    # Segment the text into words.
    words = jieba.lcut(txt)

    # Print in descending order of frequency.
    for word, count in top_names(count_names(words)):
        print(f"{word:<10}{count:>5}")


if __name__ == "__main__":
    main()