首页 > 其他分享 > 杂物箱 | Doc2Vec代码实现

杂物箱 | Doc2Vec代码实现

时间:2022-10-07 15:24:16 浏览次数:32
标签:cut text 代码 train import model data Doc2Vec 杂物箱

1 分词
import gensim
import re
import jieba
import pandas as pd
import os
from gensim.models.doc2vec import Doc2Vec
_STOPLIST_CACHE = {}


def _load_stoplist(path):
    """Return the stop words stored in *path* as a frozenset, reading the file once.

    The file content is split on newlines, one stop word per line. Results
    are cached per path so that calling cut_txt for many rows does not
    reopen and re-parse the file for every row.
    """
    if path not in _STOPLIST_CACHE:
        # The context manager guarantees the handle is closed.
        with open(path, 'r', encoding='utf-8') as fs:
            _STOPLIST_CACHE[path] = frozenset(fs.read().split('\n'))
    return _STOPLIST_CACHE[path]


def cut_txt(text, stoplist_path="data/stoplist.txt"):
    """Segment Chinese text with jieba, dropping stop words and 1-char tokens.

    Args:
        text: Raw input string. Every character outside the range
            U+4E00..U+9FA5 is removed before segmentation.
        stoplist_path: UTF-8 text file with one stop word per line.

    Returns:
        The kept tokens, each followed by a single space, concatenated in
        order. A non-empty result therefore ends with a trailing space;
        the result is "" when no token is kept.

    Raises:
        FileNotFoundError: If the stop-word file does not exist.
    """
    text = re.sub(u'[^\u4e00-\u9fa5]', '', text)
    words = jieba.lcut(text, cut_all=False)
    stoplist = _load_stoplist(stoplist_path)
    # No special handling of newline tokens is needed: the substitution
    # above has already removed every newline from the text.
    return ''.join(
        w + ' ' for w in words if len(w) > 1 and w not in stoplist
    )

# Load the corpus from CSV into a DataFrame.
data = pd.read_csv("data/all_data.csv", encoding="utf-8")
# Coerce the "text" column to str, run cut_txt on every row, and collect
# the segmented strings in a plain list.
b = data["text"].astype(str).apply(cut_txt).tolist()
2 转换为 TaggedDocument
# Module-level alias for gensim's TaggedDocument class; X_train refers to
# the class through this name.
TaggededDocument = gensim.models.doc2vec.TaggedDocument
def X_train(cut_sentense):
    """Wrap each segmented text in a TaggedDocument tagged with its position.

    Args:
        cut_sentense: Iterable of strings whose tokens are separated by
            whitespace.

    Returns:
        A list with one TaggededDocument per input text, in input order;
        the document built from the i-th text carries tags=[i].
    """
    # str.split() with no argument discards empty tokens and surrounding
    # whitespace. Splitting on ' ' would leave an empty-string "word" in
    # every document whose text ends with a space, and that empty token
    # would then enter the model vocabulary.
    return [
        TaggededDocument(text.split(), tags=[i])
        for i, text in enumerate(cut_sentense)
    ]
# Build the tagged training documents from the segmented texts in b.
c = X_train(b)
3 训练
def train(x_train, size=768, epochs=10):
    """Train a Doc2Vec model on tagged documents in a single training run.

    Args:
        x_train: Re-iterable collection of tagged documents (it is
            traversed once to build the vocabulary and again on every
            training epoch, so a one-shot generator will not work).
        size: Dimensionality of the learned vectors.
        epochs: Number of passes over the corpus.

    Returns:
        The trained Doc2Vec model.
    """
    # Create the model WITHOUT a corpus. Handing the corpus to the
    # constructor makes gensim build the vocabulary and train immediately,
    # so a following explicit train() call would train a second time.
    model = Doc2Vec(min_count=1, window=3, vector_size=size, sample=1e-3, workers=4)
    model.build_vocab(x_train)
    model.train(x_train, total_examples=model.corpus_count, epochs=epochs)
    return model
# Train the model on the tagged documents using train()'s default settings.
model_dm = train(c)
4 结果
# Convert one document to a vector.
# The tokens are taken from b (the segmented texts) rather than from the
# raw DataFrame, and split() with no argument avoids the empty token that
# a trailing space would otherwise produce.
i = 0  # index of the document to use as the query; change as needed
test_text = b[i].split()
inferred_vector = model_dm.infer_vector(doc_words=test_text)
# Compute similarity: the 10 training documents closest to the inferred vector.
# NOTE(review): gensim 4 renamed `docvecs` to `dv` — confirm against the
# installed gensim version before switching.
sims = model_dm.docvecs.most_similar([inferred_vector], topn=10)

标签:cut,text,代码,train,import,model,data,Doc2Vec,杂物箱
From: https://www.cnblogs.com/rachel0701/p/16759795.html

相关文章