Scenario:
Given an input text, find the question in a question set that is most similar to it.
import logging
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(level=logging.INFO)
# The candidate question set to match against
templates = [
    "出来钓鱼了喂",
    "王大腚爱钓鱼",
    "格小格爱爱钓鱼",
    "天元邓刚",
    "爱钓鱼的超哥",
    "王大苗钓鱼",
    "王小羽",
    "丽鱼杆",
]
# Tokenize a sentence with jieba to get its keyword list
def tokenize(text):
    return list(jieba.cut(text))
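# For reference, jieba segments on dictionary words, e.g.
# tokenize("爱钓鱼的超哥") typically yields ['爱', '钓鱼', '的', '超哥']
# (the exact segmentation depends on jieba's dictionary and version)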
# Example input question (in practice this comes from the caller)
question = "王大腚在哪钓鱼"
# Build TF-IDF vectors for the input plus all templates, then compute
# the cosine similarity between the input and each template
vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
vectors = vectorizer.fit_transform([question] + templates)
similarities = cosine_similarity(vectors[0], vectors[1:]).flatten()
# Index of the most similar question in the set
most_similar_index = similarities.argmax()
max_similarity = similarities[most_similar_index]
logging.info("原始问题{}:".format(question))
logging.info("{}: 最高匹配度: {}".format(question, max_similarity))
# The matched most-similar question
target_question = templates[most_similar_index]
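In practice the argmax can still be a weak match when nothing in the set is related to the input. Here is a minimal sketch that wraps the steps above into a reusable helper with a similarity cutoff; it reuses the tokenize function and imports defined earlier, and the function name find_most_similar and the 0.3 threshold are assumptions, not from the original post:

def find_most_similar(question, templates, threshold=0.3):
    # Vectorize the input together with the templates so they share one vocabulary
    vectorizer = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)
    vectors = vectorizer.fit_transform([question] + templates)
    similarities = cosine_similarity(vectors[0], vectors[1:]).flatten()
    idx = similarities.argmax()
    # Reject matches below the assumed cutoff; tune threshold on real data
    if similarities[idx] < threshold:
        return None
    return templates[idx]

For example, find_most_similar("出来钓鱼", templates) should typically return "出来钓鱼了喂", since the two share the tokens 出来 and 钓鱼.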
From: https://www.cnblogs.com/gatling/p/18097079