一、概述
1.1 数学公式
二、利用贝叶斯进行文本分析
2.1 文本向量化
对数据进行预处理、分词,并把文本向量化,是利用贝叶斯算法进行文本分析的关键,也是难点。
分词可以参考:https://www.cnblogs.com/qianslup/p/16907569.html
文本向量化可以参考:https://www.cnblogs.com/qianslup/p/16847967.html
2.2 数据源查看
2.2.1 核心数据源
这是一个包含评分(score)与评论(comment)两列的数据集。
2.2.2 停用词库
2.3 代码展示
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB  # multinomial naive Bayes
from sklearn.model_selection import train_test_split

# Raise jieba's log level to INFO so its debug output is not printed.
jieba.setLogLevel(jieba.logging.INFO)

df = pd.read_csv('../data/comment.csv')
# Drop rows with a missing comment or score *before* any processing and keep
# the result: a bare `df.dropna` is only an attribute reference and does
# nothing, and a missing comment cannot be segmented.
df = df.dropna(subset=['comment', 'score'])

stopwords_df = pd.read_csv("../data/stopwords.txt", index_col=False, sep="\t",
                           quoting=3, names=['stopword'], encoding='utf-8')
# `x in DataFrame` tests the column labels, not the cell values, so the
# stop words must be pulled out of the column. A set also gives O(1) lookups.
stopwords = set(stopwords_df['stopword'].dropna().astype(str))


def cut_word(sentence: str) -> str:
    """Segment a comment with jieba and return the kept words joined by spaces.

    A word is kept only if it is not a stop word, is longer than one
    character, and is not a Windows line break.
    """
    clean_word = [
        word
        for word in jieba.cut(sentence)
        if word not in stopwords and len(word) > 1 and word != '\r\n'
    ]
    return ' '.join(clean_word)


def set_grade(score) -> str:
    """Bucket a numeric score into one of four grade labels.

    score < -20 -> '-2'; -20 <= score < 0 -> '-1';
    0 <= score < 20 -> '1'; score >= 20 -> '2'.
    """
    if score < -20:
        return '-2'
    elif score < 0:
        return '-1'
    elif score < 20:
        return '1'
    else:
        return '2'


def evaluate(texts, labels, vectorizer, title: str) -> None:
    """Train a multinomial naive Bayes model and print its test-set score.

    The data is split 75/25 with a fixed random_state so that every run is
    compared on the same split. The vectorizer is fitted on the training
    texts only and then reused to transform the test texts.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=0.25, random_state=1)
    # Convert each Series to a plain list before vectorising.
    train_features = vectorizer.fit_transform(list(X_train))
    model = MultinomialNB()
    model.fit(train_features, y_train)
    test_features = vectorizer.transform(list(X_test))
    print(title, model.score(test_features, y_test))


df['clean_word'] = df['comment'].map(cut_word)
df['grade'] = df['score'].map(set_grade)
df.to_csv('../data/clean_word.csv')

# norm='l1' normalises by the Manhattan (L1) norm, norm='l2' by the
# Euclidean (L2) norm.

# 1) TF-IDF on the raw, unsegmented comments.
evaluate(df['comment'], df['grade'],
         TfidfVectorizer(use_idf=True, smooth_idf=True, norm='l2'),
         'Tf-idf未分词 socre')

# 2) TF-IDF on the segmented, stop-word-filtered comments.
evaluate(df['clean_word'], df['grade'],
         TfidfVectorizer(use_idf=True, smooth_idf=True, norm='l2'),
         'Tf-idf分词 socre')

# 3) Plain term counts on the segmented, stop-word-filtered comments.
evaluate(df['clean_word'], df['grade'], CountVectorizer(), 'cv分词 socre')
标签:word,score,df,贝叶斯,train,test,mNB From: https://www.cnblogs.com/qianslup/p/16908256.html