这里用了gensim和jieba分词库,gensim原生是支持英文的,如果想使用中文word2vec,还需要自己训练一下。
中文语料库可以在这里下载:https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
stopwords.txt这里下载:https://files.cnblogs.com/files/tiandsp/stopwords.rar
训练测试代码如下:
from gensim.corpora import WikiCorpus
from gensim.models import word2vec
import jieba
from hanziconv import HanziConv
from gensim.models.keyedvectors import KeyedVectors


def set_stopword():
    """Load stop words from stopwords.txt (one word per line) into a set."""
    stopwordset = set()
    with open("stopwords.txt", "r", encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopwordset.add(stopword.strip('\n'))
    return stopwordset


def wiki_to_txt():
    """Extract plain article text from the zhwiki dump into wiki_text.txt.

    Each article becomes one space-joined line; progress is reported
    every 10000 articles.
    """
    wiki_corpus = WikiCorpus("zhwiki-latest-pages-articles.xml.bz2", dictionary={})
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                print("已处理", text_count, "文章")
    print("转换结束")


def traditional_to_simplified():
    """Convert wiki_text.txt from Traditional to Simplified Chinese, line by line.

    Writes the result to simplified.txt.
    """
    # FIX: the output file was opened without a context manager, so the
    # handle leaked if HanziConv raised; both files now use `with`.
    with open("simplified.txt", "w", encoding="utf-8") as simplified, \
         open("wiki_text.txt", "r", encoding="utf-8") as traditional:
        text_count = 0
        for s in traditional:
            simplified.write(HanziConv.toSimplified(s))
            text_count += 1
            if text_count % 10000 == 0:
                print("大小写转换", text_count, "文章")


def segmentation(stopwordset):
    """Segment simplified.txt with jieba, drop stop words, write space-separated
    tokens to segmentation.txt.

    :param stopwordset: set of stop words to exclude from the output
    """
    # FIX: the output handle used to be a local named `segmentation`,
    # shadowing this function, and was not closed on error; renamed and
    # wrapped in `with` alongside the input file.
    with open("segmentation.txt", "w", encoding="utf-8") as seg_out, \
         open("simplified.txt", "r", encoding="utf-8") as corpus:
        for sentence in corpus:
            sentence = sentence.strip("\n")
            for term in jieba.cut(sentence, cut_all=False):
                if term not in stopwordset:
                    seg_out.write(term + " ")


def train():
    """Train a 128-dimensional word2vec model on segmentation.txt and save
    the word vectors in binary word2vec format as result.model.bin."""
    sentences = word2vec.Text8Corpus("segmentation.txt")
    model = word2vec.Word2Vec(sentences, vector_size=128)
    model.wv.save_word2vec_format(u"result.model.bin", binary=True)


def test():
    """Load the trained vectors and print the 10 words most similar to 电脑,
    then the raw vector for 电脑."""
    word_vectors = KeyedVectors.load_word2vec_format("result.model.bin", binary=True)
    print("词语 电脑 最相似的词为")
    res = word_vectors.most_similar(u"电脑", topn=10)
    for item in res:
        print(item[0] + "," + str(item[1]))
    print(word_vectors["电脑"])


if __name__ == "__main__":
    isTrain = True
    if isTrain:
        stopwordset = set_stopword()
        wiki_to_txt()
        traditional_to_simplified()
        segmentation(stopwordset)
        train()  # prints nothing while training; be patient
        test()
输出:
词语 电脑 最相似的词为
个人电脑,0.7513424158096313
计算机,0.734223484992981
电脑系统,0.7252928614616394
软体,0.6842026710510254
机器,0.6824702024459839
人工智慧,0.680030107498169
笔记型电脑,0.6773096323013306
笔记本电脑,0.6697209477424622
微电脑,0.6696100234985352
手提电脑,0.660958468914032
[ 1.9014188 1.7202342 -0.23729557 -0.8388046 -0.11100324 1.0443975
-1.079673 1.3327979 2.1141734 -0.42479753 0.16044064 -2.561639
0.07217824 2.5577955 -1.0273463 0.2997158 1.6174937 -1.7803082
1.9497777 2.5927126 -0.80586976 -2.9386904 -1.2385877 -0.3056585
-0.844955 -0.01344261 0.8239617 0.7256101 0.40125108 3.638999
3.8976343 -2.7287369 -0.64771324 1.1698668 1.7715164 -0.9826302
1.2160134 1.7796464 -2.0833569 -0.7386875 -1.8985692 -1.8049567
1.4793253 1.2001754 0.3108599 -1.7661532 0.7310379 2.5774055
0.5808109 -0.4295639 2.7464058 -1.6657854 -1.5137665 -0.21789126
1.4760169 0.15219498 1.3405688 2.7709813 0.56437314 1.8010542
-1.8642671 -0.38118765 2.318528 4.0478578 -0.53604156 0.52138174
0.31089854 0.16010547 0.47881317 3.888891 0.15173812 0.74806917
-0.9127513 -0.70784163 -0.21646835 0.17479078 -0.5820462 0.60282844
1.4890023 0.64550465 1.1527337 -0.09543432 -0.44408607 0.4570686
-0.24730873 -0.6606648 1.4759941 2.4438853 -0.31588006 -1.8908714
-1.4571669 3.6256638 -0.629215 0.827116 2.0742445 -0.62628126
-1.5641035 1.7997856 -2.2292428 -2.3412764 0.52960634 -1.6119162
1.1685064 -0.6274638 0.48065302 -3.5462537 -1.3472499 0.94291407
-1.3048811 1.1482272 0.34360048 0.5445513 -1.0494298 -0.2798078
-0.9819264 1.2798338 -1.3399855 -4.8864737 -0.2593729 3.2680583
0.735407 -0.32167712 1.8851075 1.6150967 1.8446032 -2.297569
0.66746795 0.5124097 ]