代码
"""Print the most frequently mentioned words/characters in 《西游记》.

The text is segmented with jieba, single-character tokens are dropped,
known aliases of the main characters are folded into one canonical name,
and the most frequent entries are printed as an aligned two-column table.
"""
from collections import Counter

# Path of the novel's text, read as UTF-8.
TEXT_PATH = "《西游记》.txt"

# Number of entries printed by the script.
TOP_N = 20

# Alias -> canonical name.  Any word that is not listed here is counted
# under its own spelling.
ALIASES = {
    "大圣": "孙悟空",
    "老孙": "孙悟空",
    "行者": "孙悟空",
    "孙大圣": "孙悟空",
    "孙行者": "孙悟空",
    "猴王": "孙悟空",
    "悟空": "孙悟空",
    "齐天大圣": "孙悟空",
    "猴子": "孙悟空",
    "师父": "唐僧",
    "三藏": "唐僧",
    "圣僧": "唐僧",
    "呆子": "猪八戒",
    "八戒": "猪八戒",
    "老猪": "猪八戒",
    "沙和尚": "沙僧",
    "妖精": "妖怪",
    "妖魔": "妖怪",
    "妖道": "妖怪",
    "佛祖": "如来",
    "三太子": "白马",
}


def canonical_name(word):
    """Return the canonical name for *word*, or *word* itself if it has no alias."""
    return ALIASES.get(word, word)


def count_words(words):
    """Count occurrences of each word in *words* after alias folding.

    Tokens whose length is exactly 1 are skipped.

    Args:
        words: iterable of string tokens (e.g. the output of ``jieba.lcut``).

    Returns:
        A ``Counter`` mapping canonical word -> number of occurrences, with
        keys in order of first appearance.
    """
    counts = Counter()
    for word in words:
        if len(word) == 1:
            continue
        counts[canonical_name(word)] += 1
    return counts


def top_items(counts, limit=TOP_N):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    The sort is stable, so words with equal counts keep the order in which
    they were first seen.  Slicing (rather than indexing a fixed range)
    means a text with fewer than *limit* distinct words simply yields a
    shorter list instead of raising ``IndexError``.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return ranked[:limit]


def main():
    """Read the novel, segment it, and print the TOP_N most frequent words."""
    # Imported here so the counting helpers above stay usable (and testable)
    # on machines where jieba is not installed.
    import jieba

    # The context manager guarantees the file handle is closed after reading.
    with open(TEXT_PATH, "r", encoding='utf-8') as source:
        txt = source.read()

    words = jieba.lcut(txt)  # segment the text using jieba's precise mode

    for word, count in top_items(count_words(words)):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
标签:jieba,rword,word,elif,20,分词 From: https://www.cnblogs.com/223746q/p/17927009.html