# Code 12-1: Deduplicating the reviews
import pandas as pd
import re
import jieba.posseg as psg
import numpy as np
# Drop rows that are exact duplicates
reviews = pd.read_csv(r"G:\data\data\reviews.csv")
reviews = reviews[['content', 'content_type']].drop_duplicates()
content = reviews['content']

# Code 12-2: Data cleaning
# Remove English letters, digits, etc.
# The reviews are all about Midea electric water heaters sold on JD.com, so these
# terms carry no information and are removed as well (the trailing "|" in the
# original pattern matched the empty string and has been dropped)
strinfo = re.compile('[0-9a-zA-Z]|京东|美的|电热水器|热水器')
content = content.apply(lambda x: strinfo.sub('', x))
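# Quick illustrative check (sample string invented for this note, not from the
# original source): the pattern strips digits, Latin letters, and the brand /
# platform terms while leaving the rest of the Chinese text intact.
print(strinfo.sub('', '京东自营美的电热水器60L很好'))  # -> '自营很好'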
# Code 12-3: Word segmentation, POS tagging, stop-word removal
# Segment each review into (word, POS flag) pairs
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple custom tokenizer
seg_word = content.apply(worker)
# Reshape into a long-format data frame: one row per word, with the id of the
# review it belongs to and (added below) its position within that review
n_word = seg_word.apply(lambda x: len(x))  # number of words per review
n_content = [[x + 1] * y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested list: review id for each word
seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]    # the word itself
nature = [x[1] for x in seg_word]  # its POS flag
content_type = [[x] * y for x, y in zip(list(reviews['content_type']), list(n_word))]
content_type = sum(content_type, [])  # review label for each word
result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_type": content_type})
# Remove punctuation (POS flag 'x' marks punctuation)
result = result[result['nature'] != 'x']
# Remove stop words
with open(r"G:\data\data\stoplist.txt", 'r', encoding='UTF-8') as stop_path:
    stop = stop_path.readlines()
stop = [x.replace('\n', '') for x in stop]
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)]
# Add a column giving each word's position within its review
n_word = list(result.groupby(by=['index_content'])['index_content'].count())
index_word = [list(np.arange(0, y)) for y in n_word]
index_word = sum(index_word, [])  # position of each word within its review
# Final columns: review id, word position, word, POS flag, review label
result['index_word'] = index_word
# Code 12-4: Keep only reviews that contain at least one noun
# ('n' in the POS flag covers n, nr, ns, nz, vn, an, ...)
ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
result = result[[x in ind for x in result['index_content']]]
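# Equivalent vectorized form (a sketch, not from the original): pandas string
# methods do the same filtering without Python-level list comprehensions, and
# re-running it here is idempotent.
ind = result.loc[result['nature'].str.contains('n'), 'index_content'].unique()
result = result[result['index_content'].isin(ind)]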
# Code 12-5: Draw the word cloud
import matplotlib.pyplot as plt
from wordcloud import WordCloud
frequencies = result.groupby(by=['word'])['word'].count()
frequencies = frequencies.sort_values(ascending=False)
backgroud_Image = plt.imread(r"G:\data\data\pl.jpg")
# the font path should be a raw string so "\..." is not read as an escape
wordcloud = WordCloud(font_path=r"C:\Windows\Fonts\STZHONGS.ttf",
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
my_wordcloud = wordcloud.fit_words(frequencies)
plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()
# Write the result out
result.to_csv(r"G:\data\data\word.csv", index=False, encoding='utf-8')
# -*- coding: utf-8 -*-
# Code 12-6: Match sentiment words
import pandas as pd
import numpy as np
word = pd.read_csv(r"G:\data\data\word.csv")
# Load the positive / negative evaluation and emotion word lists
pos_comment = pd.read_csv(r"G:\data\data\正面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_comment = pd.read_csv(r"G:\data\data\负面评价词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
pos_emotion = pd.read_csv(r"G:\data\data\正面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_emotion = pd.read_csv(r"G:\data\data\负面情感词语(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
# Merge the emotion words with the evaluation words
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])
intersection = positive & negative  # words that appear in both lists
positive = list(positive - intersection)
negative = list(negative - intersection)
positive = pd.DataFrame({"word": positive,
                         "weight": [1] * len(positive)})
negative = pd.DataFrame({"word": negative,
                         "weight": [-1] * len(negative)})
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
posneg = pd.concat([positive, negative])
# Right-join the segmented words with the sentiment lexicon to locate
# sentiment words: words not in the lexicon get a NaN weight
data_posneg = posneg.merge(word, left_on='word', right_on='word',
                           how='right')
data_posneg = data_posneg.sort_values(by=['index_content', 'index_word'])
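# A minimal sketch (toy data invented for this note) of what the right join
# produces: lexicon words carry their weight, all other words carry NaN.
toy_lexicon = pd.DataFrame({"word": ["好", "差"], "weight": [1, -1]})
toy_words = pd.DataFrame({"word": ["质量", "好", "物流", "差"],
                          "index_content": [1, 1, 2, 2],
                          "index_word": [0, 1, 0, 1]})
print(toy_lexicon.merge(toy_words, on='word', how='right'))
# 质量 and 物流 get weight NaN, so a later dropna() keeps only sentiment words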
# Code 12-7: Correct the sentiment polarity for negation
# If a single negation word precedes a sentiment word the polarity flips;
# a double negation leaves it unchanged
# Load the negation word list
notdict = pd.read_csv(r"G:\data\data\not.csv")
not_terms = set(notdict['term'])  # a set checks the values ("x in Series" would check the index)
# Apply the negation correction
data_posneg['amend_weight'] = data_posneg['weight']  # new column: negation-corrected sentiment value
data_posneg['id'] = np.arange(0, len(data_posneg))
data_posneg.index = np.arange(0, len(data_posneg))  # align row labels with 'id' so .loc below hits the right rows
only_inclination = data_posneg.dropna()  # keep only the words that carry a sentiment value
only_inclination.index = np.arange(0, len(only_inclination))
index = only_inclination['id']
for i in np.arange(0, len(only_inclination)):
    # extract the review containing the i-th sentiment word
    review = data_posneg[data_posneg['index_content'] ==
                         only_inclination['index_content'][i]]
    review.index = np.arange(0, len(review))
    affective = only_inclination['index_word'][i]  # position of the sentiment word in its review
    if affective == 1:    # exactly one preceding word: check it
        ne = sum([w in not_terms for w in review['word'][[affective - 1]]])
    elif affective > 1:   # two or more preceding words: check the two just before
        ne = sum([w in not_terms for w in review['word'][[affective - 1,
                                                          affective - 2]]])
    else:                 # first word of the review: nothing precedes it
        ne = 0
    if ne == 1:           # exactly one negation word: flip the polarity
        data_posneg.loc[index[i], 'amend_weight'] = -data_posneg.loc[index[i], 'weight']
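# Illustration of the rule (toy case, not from the source, assuming 不 and 没
# are in not.csv): in "不 满意" the sentiment word 满意 has weight +1 and the
# single preceding negation word flips it to -1; in "没 不 满意" two negation
# words precede it (ne == 2), so the double negation keeps the weight at +1.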
# Refresh the sentiment-word-only view so it picks up the corrected weights
only_inclination = data_posneg.dropna()
# Sum the corrected weights to get one sentiment value per review
emotional_value = only_inclination.groupby(['index_content'],
                                           as_index=False)['amend_weight'].sum()
# Drop reviews whose sentiment value is 0
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]
# Code 12-8: Evaluate the sentiment analysis
# Label reviews with sentiment value > 0 as 'pos' and < 0 as 'neg'
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = 'pos'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = 'neg'
# Compare the predicted type with the original content_type label
result = emotional_value.merge(word,
                               left_on='index_content',
                               right_on='index_content',
                               how='left')
result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()
confusion_matrix = pd.crosstab(result['content_type'], result['a_type'],
                               margins=True)  # cross-tabulation
# overall accuracy: correctly classified reviews over all reviews
print((confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1])
      / confusion_matrix.iat[2, 2])
# Extract the positive and negative reviews
ind_pos = list(emotional_value[emotional_value['a_type'] == 'pos']['index_content'])
ind_neg = list(emotional_value[emotional_value['a_type'] == 'neg']['index_content'])
posdata = word[[i in ind_pos for i in word['index_content']]]
negdata = word[[i in ind_neg for i in word['index_content']]]
# Draw the word clouds
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Word cloud of positive sentiment words
freq_pos = posdata.groupby(by=['word'])['word'].count()
freq_pos = freq_pos.sort_values(ascending=False)
backgroud_Image = plt.imread(r"G:\data\data\pl.jpg")
wordcloud = WordCloud(font_path=r"C:\Windows\Fonts\STZHONGS.ttf",
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
pos_wordcloud = wordcloud.fit_words(freq_pos)
plt.imshow(pos_wordcloud)
plt.axis('off')
plt.show()
# Word cloud of negative sentiment words
freq_neg = negdata.groupby(by=['word'])['word'].count()
freq_neg = freq_neg.sort_values(ascending=False)
neg_wordcloud = wordcloud.fit_words(freq_neg)
plt.imshow(neg_wordcloud)
plt.axis('off')
plt.show()
# Write the results out, one word per row
posdata.to_csv(r"G:\data\data\posdata.csv", index=False, encoding='utf-8')
negdata.to_csv(r"G:\data\data\negdata.csv", index=False, encoding='utf-8')
# -*- coding: utf-8 -*-
# Code 12-9: Build the dictionaries and corpora
import pandas as pd
import numpy as np
import re
import itertools
import matplotlib.pyplot as plt
# Load the data produced by the sentiment analysis step
posdata = pd.read_csv(r"G:\data\data\posdata.csv", encoding='utf-8')
negdata = pd.read_csv(r"G:\data\data\negdata.csv", encoding='utf-8')
from gensim import corpora, models
# Build the dictionaries (each single word is treated as one document)
pos_dict = corpora.Dictionary([[i] for i in posdata['word']])  # positive
neg_dict = corpora.Dictionary([[i] for i in negdata['word']])  # negative
# Build the bag-of-words corpora
pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in posdata['word']]]  # positive
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in negdata['word']]]  # negative
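# Quick sanity check (not in the original source): since each "document" here
# is a single word, every bag-of-words vector is one (token_id, count) pair.
print(pos_dict.doc2bow([posdata['word'].iloc[0]]))  # e.g. [(0, 1)]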
# Code 12-10: Search for the optimal number of topics
# Cosine similarity between two term-frequency vectors
def cos(vector1, vector2):
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return dot_product / ((normA * normB) ** 0.5)
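# Optional check (not in the original): cos() should agree with the numpy
# formulation on a small example.
v1, v2 = [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]
assert abs(cos(v1, v2) - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))) < 1e-12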
# Find the topic count that minimises the average inter-topic similarity
def lda_k(x_corpus, x_dict):
    # initialise the list of average cosine similarities
    mean_similarity = []
    mean_similarity.append(1)
    # train an LDA model for each candidate topic count and measure how
    # similar the resulting topics are to one another
    for i in np.arange(2, 11):
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train the LDA model
        term = lda.show_topics(num_topics=i, num_words=50)
        # extract the top words of each topic from strings like '0.030*"好" + ...'
        top_word = []
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', w))
                             for w in term[k][1].split('+')])  # all top words
        # build term-frequency vectors over the pooled vocabulary
        word = sum(top_word, [])  # all words across topics
        unique_word = set(word)   # deduplicated vocabulary
        # one row per topic, one column per vocabulary word
        mat = []
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))
        p = list(itertools.permutations(list(np.arange(i)), 2))
        l = len(p)
        top_similarity = [0]
        for w in np.arange(l):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            top_similarity.append(cos(vector1, vector2))
        # average cosine similarity over all ordered topic pairs
        mean_similarity.append(sum(top_similarity) / l)
    return mean_similarity
# Compute the average topic cosine similarity for each candidate topic count
pos_k = lda_k(pos_corpus, pos_dict)
neg_k = lda_k(neg_corpus, neg_dict)
# Plot the average cosine similarity curves
from matplotlib.font_manager import FontProperties
font = FontProperties(size=14)
# make Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(211)
ax1.plot(pos_k)
ax1.set_xlabel('正面评论LDA主题数寻优', fontproperties=font)
ax2 = fig.add_subplot(212)
ax2.plot(neg_k)
ax2.set_xlabel('负面评论LDA主题数寻优', fontproperties=font)
# Code 12-11: LDA topic analysis
# Fit the final LDA models with 3 topics each (chosen from the curves above)
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)
# print_topics returns the topics; wrap in print() so they show when run as a script
print(pos_lda.print_topics(num_words=10))
print(neg_lda.print_topics(num_words=10))
plt.show()