这里用了gensim和jieba分词库,gensim原生是支持英文的,如果想使用中文word2vec,还需要自己训练一下。
中文语料库可以在这里下载:https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
stopwords.txt这里下载:https://files.cnblogs.com/files/tiandsp/stopwords.rar
训练测试代码如下:
from gensim.corpora import WikiCorpus
from gensim.models import word2vec
import jieba
from hanziconv import HanziConv
from gensim.models.keyedvectors import KeyedVectors


def set_stopword():
    """Load stop words from stopwords.txt (one word per line) into a set."""
    stopwordset = set()
    with open("stopwords.txt", "r", encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopwordset.add(stopword.strip('\n'))
    return stopwordset


def wiki_to_txt():
    """Extract plain article text from the zhwiki dump into wiki_text.txt.

    Each article becomes one space-joined line; progress is reported
    every 10000 articles.
    """
    wiki_corpus = WikiCorpus("zhwiki-latest-pages-articles.xml.bz2", dictionary={})
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                print("已处理", text_count, "文章")
    print("转换结束")


def traditional_to_simplified():
    """Convert wiki_text.txt from Traditional to Simplified Chinese, line by line.

    Writes the result to simplified.txt.
    """
    # FIX: the output file was opened without a context manager, so the
    # handle leaked if HanziConv raised; both files now use `with`.
    with open("simplified.txt", "w", encoding="utf-8") as simplified, \
         open("wiki_text.txt", "r", encoding="utf-8") as traditional:
        text_count = 0
        for s in traditional:
            simplified.write(HanziConv.toSimplified(s))
            text_count += 1
            if text_count % 10000 == 0:
                print("大小写转换", text_count, "文章")


def segmentation(stopwordset):
    """Segment simplified.txt with jieba, drop stop words, write space-separated
    tokens to segmentation.txt.

    :param stopwordset: set of stop words to exclude from the output
    """
    # FIX: the output handle used to be a local named `segmentation`,
    # shadowing this function, and was not closed on error; renamed and
    # wrapped in `with` alongside the input file.
    with open("segmentation.txt", "w", encoding="utf-8") as seg_out, \
         open("simplified.txt", "r", encoding="utf-8") as corpus:
        for sentence in corpus:
            sentence = sentence.strip("\n")
            for term in jieba.cut(sentence, cut_all=False):
                if term not in stopwordset:
                    seg_out.write(term + " ")


def train():
    """Train a 128-dimensional word2vec model on segmentation.txt and save
    the word vectors in binary word2vec format as result.model.bin."""
    sentences = word2vec.Text8Corpus("segmentation.txt")
    model = word2vec.Word2Vec(sentences, vector_size=128)
    model.wv.save_word2vec_format(u"result.model.bin", binary=True)


def test():
    """Load the trained vectors and print the 10 words most similar to 电脑,
    then the raw vector for 电脑."""
    word_vectors = KeyedVectors.load_word2vec_format("result.model.bin", binary=True)
    print("词语 电脑 最相似的词为")
    res = word_vectors.most_similar(u"电脑", topn=10)
    for item in res:
        print(item[0] + "," + str(item[1]))
    print(word_vectors["电脑"])


if __name__ == "__main__":
    isTrain = True
    if isTrain:
        stopwordset = set_stopword()
        wiki_to_txt()
        traditional_to_simplified()
        segmentation(stopwordset)
        train()  # prints nothing while training; be patient
        test()
输出:
词语 电脑 最相似的词为
个人电脑,0.7513424158096313
计算机,0.734223484992981
电脑系统,0.7252928614616394
软体,0.6842026710510254
机器,0.6824702024459839
人工智慧,0.680030107498169
笔记型电脑,0.6773096323013306
笔记本电脑,0.6697209477424622
微电脑,0.6696100234985352
手提电脑,0.660958468914032
[ 1.9014188 1.7202342 -0.23729557 -0.8388046 -0.11100324 1.0443975
-1.079673 1.3327979 2.1141734 -0.42479753 0.16044064 -2.561639
0.07217824 2.5577955 -1.0273463 0.2997158 1.6174937 -1.7803082
1.9497777 2.5927126 -0.80586976 -2.9386904 -1.2385877 -0.3056585
-0.844955 -0.01344261 0.8239617 0.7256101 0.40125108 3.638999
3.8976343 -2.7287369 -0.64771324 1.1698668 1.7715164 -0.9826302
1.2160134 1.7796464 -2.0833569 -0.7386875 -1.8985692 -1.8049567
1.4793253 1.2001754 0.3108599 -1.7661532 0.7310379 2.5774055
0.5808109 -0.4295639 2.7464058 -1.6657854 -1.5137665 -0.21789126
1.4760169 0.15219498 1.3405688 2.7709813 0.56437314 1.8010542
-1.8642671 -0.38118765 2.318528 4.0478578 -0.53604156 0.52138174
0.31089854 0.16010547 0.47881317 3.888891 0.15173812 0.74806917
-0.9127513 -0.70784163 -0.21646835 0.17479078 -0.5820462 0.60282844
1.4890023 0.64550465 1.1527337 -0.09543432 -0.44408607 0.4570686
-0.24730873 -0.6606648 1.4759941 2.4438853 -0.31588006 -1.8908714
-1.4571669 3.6256638 -0.629215 0.827116 2.0742445 -0.62628126
-1.5641035 1.7997856 -2.2292428 -2.3412764 0.52960634 -1.6119162
1.1685064 -0.6274638 0.48065302 -3.5462537 -1.3472499 0.94291407
-1.3048811 1.1482272 0.34360048 0.5445513 -1.0494298 -0.2798078
-0.9819264 1.2798338 -1.3399855 -4.8864737 -0.2593729 3.2680583
0.735407 -0.32167712 1.8851075 1.6150967 1.8446032 -2.297569
0.66746795 0.5124097 ]