导入 jieba 库
点击查看代码
import jieba
读取文件,文件路径填写文件存放的位置
并且使用jieba分词的精确模式
点击查看代码
# Read the whole novel into memory; the context manager guarantees the
# file handle is closed even if reading or decoding fails.
with open('西游记.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Tokenize the text into a list of words (jieba.lcut defaults to
# precise mode, i.e. cut_all=False).
words = jieba.lcut(txt)

# Maps each word to its number of occurrences. It must be named `counts`
# because the counting loop and the sorting step both refer to `counts`.
counts = {}
将同一人物的不同说法统一,遍历所有词语,每出现一次其对应值加一
点击查看代码
# Canonical character name -> every alias that should be counted under it.
alias_groups = {
    '孙悟空': ('孙猴子', '孙行者', '孙悟空', '斗战胜佛', '齐天大圣',
            '行者', '老孙', '大圣', '孙大圣', '悟空'),
    '唐僧': ('唐僧', '唐三藏', '金蝉子', '师父'),
    '猪八戒': ('猪八戒', '猪悟能', '天蓬元帅', '悟能'),
    '沙僧': ('沙僧', '沙悟净', '悟净'),
    '如来佛祖': ('如来佛祖', '如来'),
}

# Flatten into alias -> canonical name once, outside the loop, so each word
# needs a single dictionary lookup. (Comparing the word to a list with `==`
# is always False for a string, so membership must be tested instead.)
canonical_name = {
    alias: name
    for name, aliases in alias_groups.items()
    for alias in aliases
}

for word in words:
    # Skip single-character tokens (punctuation, particles, etc.).
    if len(word) == 1:
        continue
    # Words that are not a known alias are counted under themselves.
    rword = canonical_name.get(word, word)
    counts[rword] = counts.get(rword, 0) + 1
对出现的词语次数进行排序
并打印出来
点击查看代码
# Rank the (word, count) pairs by count, most frequent first. The sort is
# stable, so words with equal counts keep their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Print one row per word: the word left-aligned in 10 columns, followed by
# its count right-aligned in 5 columns.
for word, count in items:
    print('{0:<10}{1:>5}'.format(word, count))