1 Tokenization
import gensim
import re
import jieba
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
# Load the stopword list once, rather than re-reading the file on every call
with open("data/stoplist.txt", 'r', encoding='utf-8') as fs:
    stoplist = set(fs.read().split('\n'))

def cut_txt(text):
    # Keep only Chinese characters, then segment with jieba
    text = re.sub(u'[^\u4e00-\u9fa5]', '', text)
    words = jieba.lcut(text, cut_all=False)
    new_text = ""
    for w in words:
        # Drop stopwords and single-character tokens
        if w not in stoplist and len(w) > 1:
            new_text += w + ' '
    return new_text
data = pd.read_csv("data/all_data.csv", encoding="utf-8")
b = data["text"].astype(str).apply(cut_txt).tolist()
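As a quick sanity check, cut_txt can be run on a single sentence. The sample text below is made up for illustration, and the exact segmentation depends on jieba's dictionary and on the stopword list:

# Hypothetical example sentence, not from the original dataset
print(cut_txt("自然语言处理是人工智能的一个重要方向"))
# Output will be something like: 自然语言 处理 人工智能 重要 方向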
2 Converting to TaggedDocument
TaggedDocument = gensim.models.doc2vec.TaggedDocument

def X_train(cut_sentense):
    x_train = []
    for i, text in enumerate(cut_sentense):
        # split() without an argument drops the trailing empty token
        # left by the ' ' that cut_txt appends after every word
        word_list = text.split()
        # Tag each document with its row index so it can be looked up later
        x_train.append(TaggedDocument(word_list, tags=[i]))
    return x_train
c = X_train(b)
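Each element of c is now a TaggedDocument pairing a token list with an integer tag; Doc2Vec learns one vector per tag, so the tag doubles as the document's ID. A quick way to inspect the result:

# Inspect the first training example; the tokens shown depend on the corpus
print(c[0])
# TaggedDocument(words=['...', ...], tags=[0])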
3 Training
def train(x_train, size=768):
    # Build the model in explicit steps; passing the corpus to the
    # constructor would already train it, so calling train() again
    # afterwards would repeat the training.
    model = Doc2Vec(vector_size=size, min_count=1, window=3, sample=1e-3, workers=4)
    model.build_vocab(x_train)
    model.train(x_train, total_examples=model.corpus_count, epochs=10)
    return model
model_dm = train(c)
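Training takes a while on a large corpus, so it is worth persisting the model; the filename below is an assumed one, not from the original post:

# Save and reload the trained model (assumed filename)
model_dm.save("model_dm.model")
model_dm = Doc2Vec.load("model_dm.model")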
4 Results
# Infer a vector for a piece of text (here: the first preprocessed document)
test_text = b[0].split()
inferred_vector = model_dm.infer_vector(doc_words=test_text)
# Compute similarity: find the 10 closest training documents
# (model.dv replaces the docvecs attribute deprecated in gensim 4.x)
sims = model_dm.dv.most_similar([inferred_vector], topn=10)
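Each entry in sims is a (tag, cosine similarity) pair, and because the tags were the row indices assigned in X_train, they map straight back to the rows of the original CSV. A minimal sketch of printing the matches:

# Map the similarity hits back to the source texts via their row index
for tag, similarity in sims:
    print(f"{similarity:.4f}  {data['text'].iloc[tag][:50]}")

Note that infer_vector is stochastic, so repeated calls on the same text can return slightly different vectors and rankings.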
From: https://www.cnblogs.com/rachel0701/p/16759795.html