对《西游记》文本进行分词,统计出现次数最高的20个词语
输入:
# Word-frequency report for the novel text in "西游记1.txt": segment the text
# with jieba, fold each main character's aliases into a single name, drop a
# fixed set of unwanted words, and print the most frequent remaining words.
from collections import Counter
from typing import Iterable, List, Tuple

# Words removed from the final ranking.
EXCLUDES = frozenset({
    "一个", "我们", "怎么", "那里", "不知", "不是", "只见", "两个", "不敢",
    "这个", "如何", "原来", "甚么", "不曾", "闻言", "正是", "那怪", "一声",
})

# Names registered with jieba's dictionary before segmentation so that the
# tokenizer can emit them as single tokens.
CUSTOM_WORDS = (
    "孙悟空", "金公", "孙行者", "心猿", "齐天大圣", "斗战胜佛", "美猴王",
    "三藏法师", "玄奘", "金蝉子", "江流儿", "御弟",
    "沙僧", "沙和尚", "沙悟净", "刀圭", "黄婆",
    "悟能", "猪悟能", "猪刚鬣", "木母",
    "白龙马", "天龙马", "玉龙三太子", "八部天龙广力菩萨",
)

# Canonical name -> aliases that are counted under that name.
ALIAS_GROUPS = {
    "唐僧": ("师父", "三藏", "玄奘", "三藏法师", "金蝉子", "江流儿", "御弟"),
    "悟空": ("大圣", "老孙", "孙悟空", "美猴王", "孙行者", "齐天大圣",
             "斗战胜佛", "金公", "心猿"),
    "猪八戒": ("悟能", "八戒", "猪悟能", "呆子", "木母", "猪刚鬣"),
    "悟净": ("沙僧", "沙悟净", "沙和尚", "刀圭", "黄婆"),
    "白龙马": ("天龙马", "玉龙三太子", "八部天龙广力菩萨"),
}

# Flattened alias -> canonical lookup, built once instead of an if/elif chain.
CANONICAL_NAME = {
    alias: canonical
    for canonical, aliases in ALIAS_GROUPS.items()
    for alias in aliases
}


def normalize_word(word: str) -> str:
    """Return the canonical character name for *word*, or *word* unchanged."""
    return CANONICAL_NAME.get(word, word)


def count_words(words: Iterable[str], excludes: Iterable[str] = EXCLUDES) -> Counter:
    """Count normalized words, skipping one-character tokens and *excludes*.

    Args:
        words: Tokens produced by the segmenter.
        excludes: Words to leave out of the result.

    Returns:
        A Counter mapping each remaining (alias-folded) word to its frequency.
    """
    counts = Counter(normalize_word(word) for word in words if len(word) != 1)
    for word in excludes:
        # pop() instead of del: an excluded word may not occur in the text.
        counts.pop(word, None)
    return counts


def top_words(counts: Counter, limit: int = 20) -> List[Tuple[str, int]]:
    """Return up to *limit* (word, count) pairs, most frequent first.

    Fewer than *limit* pairs are returned when there are not enough words.
    """
    return counts.most_common(limit)


def segment(text: str) -> List[str]:
    """Split *text* into words with jieba after registering CUSTOM_WORDS."""
    # Third-party dependency; imported here so the counting helpers above can
    # be used without jieba installed.
    import jieba

    # The custom words must be registered BEFORE cutting; adding them after
    # lcut() has no effect on the token list that was already produced.
    for word in CUSTOM_WORDS:
        jieba.add_word(word)
    return jieba.lcut(text)


def main(path: str = "西游记1.txt", limit: int = 20) -> None:
    """Print the *limit* most frequent words of the UTF-8 text file at *path*."""
    with open(path, "r", encoding='UTF-8') as source:
        text = source.read()
    for word, count in top_words(count_words(segment(text)), limit):
        print("{0:<10}{1:>5}".format(word, count))


if __name__ == "__main__":
    main()
输出:
标签:jieba,rword,word,elif,add,counts,分词 From: https://www.cnblogs.com/ChenWenshi/p/17914004.html