import logging

import jieba
import matplotlib as mp
import numpy as np
import pandas as pd
import seaborn as se
import wordcloud
from matplotlib import pyplot as plt
# NOTE(review): private pandas import that nothing in the visible cells uses;
# kept as-is, but confirm it still resolves on the installed pandas version.
from pandas.core.algorithms import SelectN, diff
from PIL import Image
# The word-cloud cells refer to these two names directly.
from wordcloud import STOPWORDS, WordCloud

## Use a font with CJK glyphs so Chinese labels render, and keep the minus sign readable
plt.rcParams["font.sans-serif"]=['Microsoft YaHei']
plt.rcParams["axes.unicode_minus"]=False
## Load the daily iPhone revenue estimates
file_path1 = './2019-10-01_2021-05-30_accrued_income_iPhone.CSV'
df1 = pd.read_csv(file_path1)

## Parse the date column so it is plotted on a real time axis
df1['date'] = pd.to_datetime(df1['date'])
print(df1)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
df1.info()

## Line chart of revenue over time
# Canvas
plt.figure(figsize=(25, 10), dpi=100)
# Show y tick values as plain numbers rather than scientific/offset notation
plt.ticklabel_format(axis='y', style='plain', useOffset=False, useLocale=False)
# Draw the series
plt.plot(df1['date'], df1['accrued_income_iPhone'], label='iphone端预计营收(美元)', color="b")
# Legend, axis labels and title
plt.legend(loc='upper right')
plt.xticks(rotation=45)
plt.xlabel('日期', fontsize=25)
plt.ylabel('收入', fontsize=25)
plt.title('明日方舟iphone端预计营收折线图', fontsize=25)
plt.show()
date accrued_income_iPhone 0 2019-10-01 519152 1 2019-10-02 494288 2 2019-10-03 561249 3 2019-10-04 687069 4 2019-10-05 248241 .. ... ... 603 2021-05-26 80903 604 2021-05-27 455368 605 2021-05-28 395334 606 2021-05-29 218846 607 2021-05-30 147614 [608 rows x 2 columns] <class 'pandas.core.frame.DataFrame'> RangeIndex: 608 entries, 0 to 607 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 608 non-null datetime64[ns] 1 accrued_income_iPhone 608 non-null int64 dtypes: datetime64[ns](1), int64(1) memory usage: 9.6 KB None
## Load the bilibili and Weibo post statistics
file_path2 = './bilibili.csv'
file_path3 = './Sina_Visitor_System.csv'
df2 = pd.read_csv(file_path2)
df3 = pd.read_csv(file_path3)

## Parse the date columns
df2['date'] = pd.to_datetime(df2['date'])
df3['date'] = pd.to_datetime(df3['date'])

## Keep only the posts dated 2019-09-30 or later
df2 = df2[df2['date'] >= '2019-09-30']
df3 = df3[df3['date'] >= '2019-09-30']
print(df2.head())
# DataFrame.info() prints its own report and returns None; print() around it
# only added a stray "None" line.
df2.info()
print(df3.head())
df3.info()

## One line chart per platform, stacked vertically
# Canvas
plt.figure(figsize=(25, 20), dpi=100)

## Panel 1: bilibili
# 2 rows x 1 column grid, first panel: plt.subplot(rows, columns, index)
plt.subplot(2, 1, 1)
# Reshare counts were left off the chart by the author (disabled call kept for reference):
# plt.plot(df2['date'],df2['reshare_bilibili'],label='转发数')
plt.plot(df2['date'], df2['comment_bilibili'], label='评论数')
plt.plot(df2['date'], df2['like_bilibili'], label='点赞数')
# Legend, axis labels and title
plt.legend(loc='upper right')
plt.xticks(rotation=45)
plt.xlabel('日期', fontsize=15)
plt.ylabel('人数', fontsize=15)
plt.title('明日方舟b站动态', fontsize=20)

## Panel 2: Weibo
plt.subplot(2, 1, 2)
# Reshare counts are left off here as well (disabled call kept for reference):
# plt.plot(df3['date'],df3['reshare_sina'],label='转发数')
plt.plot(df3['date'], df3['comment_sina'], label='评论数')
plt.plot(df3['date'], df3['like_sina'], label='点赞数')
# Legend, axis labels and title
plt.legend(loc='upper right')
plt.xticks(rotation=45)
plt.xlabel('日期', fontsize=15)
plt.ylabel('人数', fontsize=15)
plt.title('明日方舟微博动态', fontsize=20)
plt.show()
标题 标题链接 date content-ellipsis \ 293 NaN NaN 2019-09-30 互动抽奖 #明日方舟# #明日方舟bilibili账号突破两百万粉丝纪念# *在10月1日... 294 NaN NaN 2019-09-30 #明日方舟#【常驻标准寻访预告】起止时间:10月3日04:00~10月17日03:59寻访说... 295 NaN NaN 2019-10-02 #明日方舟#【新增干员】//红云“红云,猎人。给我工作吧,你不会失望的。”————————... 296 NaN NaN 2019-10-04 #明日方舟#微型故事集「战地秘闻」任务赠送干员//炎客“警醒能延续你的生命。”———————... 297 NaN NaN 2019-10-08 互动抽奖 #明日方舟# #明日方舟bilibili账号突破两百万粉丝纪念# *在10月1日... reshare_bilibili comment_bilibili like_bilibili 293 76000 3734 22000 294 207 5372 17000 295 203 3946 19000 296 382 6822 21000 297 167 589 5976 <class 'pandas.core.frame.DataFrame'> Int64Index: 664 entries, 293 to 956 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 标题 264 non-null object 1 标题链接 264 non-null object 2 date 664 non-null datetime64[ns] 3 content-ellipsis 497 non-null object 4 reshare_bilibili 664 non-null object 5 comment_bilibili 664 non-null int64 6 like_bilibili 664 non-null int64 dtypes: datetime64[ns](1), int64(2), object(4) memory usage: 41.5+ KB None weibo-text date surl-text \ 497 #明日方舟#【常驻标准寻访预告】起止时间:10月3日04:00~10月17日03:59寻访说... 2019-09-30 #明日方舟# 498 #明日方舟# 9月30日16:00闪断更新公告感谢您对《明日方舟》的关注与支持。《明日方舟》... 2019-09-30 #明日方舟# 499 #明日方舟#【新增干员】//红云“红云,猎人。给我工作吧,你不会失望的。”————————红... 2019-10-02 #明日方舟# 500 #明日方舟# 微型故事集「战地秘闻」任务赠送干员//炎客“警醒能延续你的生命。”——————... 2019-10-04 #明日方舟# 501 #明日方舟#【新增干员】//送葬人“罗德岛的博士,你好。我是与贵司签署了清理协议的拉特兰公民... 2019-10-08 #明日方舟# reshare_sina comment_sina like_sina 497 4194 8308 91000 498 134 984 12000 499 3641 5151 78000 500 11000 15000 160000 501 12000 10000 147000 <class 'pandas.core.frame.DataFrame'> Int64Index: 667 entries, 497 to 1163 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 weibo-text 667 non-null object 1 date 667 non-null datetime64[ns] 2 surl-text 631 non-null object 3 reshare_sina 667 non-null int64 4 comment_sina 667 non-null int64 5 like_sina 667 non-null int64 dtypes: datetime64[ns](1), int64(3), object(2) memory usage: 36.5+ KB None
## Revenue (left axis) against Weibo comment counts (right axis) on one date axis
# Canvas
fig, ax1 = plt.subplots(figsize=(25, 10))
plt.title('明日方舟营收和微博评论对比图', fontsize=20)
# Plain numbers on the revenue axis instead of scientific/offset notation
plt.ticklabel_format(axis='y', style='plain', useOffset=False, useLocale=False)

## Left axis: revenue, drawn in blue with matching tick labels
plot1 = ax1.plot(df1['date'], df1['accrued_income_iPhone'], color='b', label='iphone端预计营收(美元)')
ax1.set_ylabel('收入', fontsize=20)
ax1.tick_params(axis='y', labelcolor='b')

## Right axis: comment counts, drawn in red
ax2 = ax1.twinx()
plot2 = ax2.plot(df3['date'], df3['comment_sina'], color='r', label='评论数')
ax2.set_ylabel('评论数', fontsize=20)
# Tick labels use the colour of the series they belong to (they were green
# while the line is red, which made the right axis hard to attribute).
ax2.tick_params(axis='y', labelcolor='r')

# A single legend that covers the lines of both axes
lines = plot1 + plot2
ax1.legend(lines, [line.get_label() for line in lines])
plt.show()
## Revenue (left axis) against bilibili comment counts (right axis) on one date axis
# Canvas
fig, ax1 = plt.subplots(figsize=(25, 10))
plt.title('明日方舟营收和bilibili动态评论对比图', fontsize=20)
# Plain numbers on the revenue axis instead of scientific/offset notation
plt.ticklabel_format(axis='y', style='plain', useOffset=False, useLocale=False)

## Left axis: revenue, drawn in blue with matching tick labels
plot1 = ax1.plot(df1['date'], df1['accrued_income_iPhone'], color='b', label='iphone端预计营收(美元)')
ax1.set_ylabel('收入', fontsize=20)
ax1.tick_params(axis='y', labelcolor='b')

## Right axis: comment counts, drawn in red
ax2 = ax1.twinx()
plot2 = ax2.plot(df2['date'], df2['comment_bilibili'], color='r', label='评论数')
ax2.set_ylabel('评论数', fontsize=20)
# Tick labels use the colour of the series they belong to (they were green
# while the line is red, which made the right axis hard to attribute).
ax2.tick_params(axis='y', labelcolor='r')

# A single legend that covers the lines of both axes
lines = plot1 + plot2
ax1.legend(lines, [line.get_label() for line in lines])
plt.show()
## Load the 2020-08-06 bilibili comment dump
file_path4 = './20200806_bilibili.csv'

## Parse the file
# The file has no header row; the columns are named here
# (nickname, uid, level, comment text, like count, time).
read_options = dict(
    header=None,
    names=['name', 'uid', 'level', 'comment', 'like', 'time'],
    lineterminator="\n",
    low_memory=False,
)
try:
    # Malformed rows are skipped. on_bad_lines replaces error_bad_lines,
    # which newer pandas releases no longer accept.
    df4 = pd.read_csv(file_path4, on_bad_lines='skip', **read_options)
except TypeError:
    # Older pandas does not know on_bad_lines yet; use the legacy spelling.
    df4 = pd.read_csv(file_path4, error_bad_lines=False, **read_options)
# NOTE(review): with lineterminator="\n" the recorded run shows a trailing
# "\r" in `time` and a leading "\t" in `name`; neither is cleaned here.

# Snapshot of the parsed rows, written before any cleaning below
df4.to_csv('tf.csv', index=False)

# Missing/infinite values become 0. The result is assigned back instead of
# calling replace(inplace=True) on a column selection, which is not
# guaranteed to modify df4 itself.
for column in ('level', 'uid', 'like'):
    df4[column] = df4[column].replace([np.nan, np.inf], 0)

# Explicit 64-bit integers: plain 'int' produced int32 in the recorded run,
# which is too narrow to be safe for user ids.
df4['uid'] = df4['uid'].astype('int64')
df4['level'] = df4['level'].astype('int64')
print(df4)
# info() prints its own report and returns None, so it is not wrapped in print()
df4.info()
name uid level comment like \ 0 \tSuper丶过 84018702 6 啊啊啊啊啊啊啊啊啊啊啊啊蒂蒂时装!!!! 14659 1 \t要考深大的Dancy 76813331 5 找了好久,捞不动啊,头子[doge] 473 2 \t417508306 417508306 4 顶,你可是带功臣 374 3 \t月习Z24 339794357 5 捞不动啊[doge] 76 4 \t十-星尘 386262987 5 愿望实现了是吧[doge] 85 ... ... ... ... ... ... 354026 \t山巘之竹 23862202 5 近卫方舟 0 354027 \t不懂浪漫的锐萌萌 41941359 5 90发没有出黑[笑哭] 0 354028 \t香蕉丨君 12671223 6 蒂蒂 0 354029 \t西风眠 631159 5 斯卡蒂泳装!! 0 354030 \t爱次砂糖桔 364668474 5 斯卡蒂! 0 time 0 2020-08-0610:33:59\r 1 2020-08-0610:51:30\r 2 2020-08-0610:51:32\r 3 2020-08-0610:51:58\r 4 2020-08-0610:52:32\r ... ... 354026 2020-08-0610:32:44\r 354027 2020-08-0610:32:44\r 354028 2020-08-0610:32:43\r 354029 2020-08-0610:32:43\r 354030 2020-08-0610:32:43 [354031 rows x 6 columns] <class 'pandas.core.frame.DataFrame'> RangeIndex: 354031 entries, 0 to 354030 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 354030 non-null object 1 uid 354031 non-null int32 2 level 354031 non-null int32 3 comment 354030 non-null object 4 like 354031 non-null int64 5 time 354031 non-null object dtypes: int32(2), int64(1), object(3) memory usage: 13.5+ MB None
## Comment volume and number of distinct commenters (2020-08-06 dump)
# Distinct uids in order of first appearance
count_user_id = df4['uid'].drop_duplicates().to_numpy()
# Every row counts as one post
print('总发帖数为:', df4['comment'].size)
print('参与发帖的用户数为:', count_user_id.size)
总发帖数为: 354031 参与发帖的用户数为: 32259
## Bar chart of the twenty uids with the most comments (2020-08-06 dump)
# Per-uid row counts, busiest first, cut to the top twenty
data1 = (
    df4.groupby('uid')
    .count()
    .sort_values(by='comment', ascending=False)
    .head(20)
)
# String copy of the uid index so the bars are drawn as categories, not numbers
data1['uid'] = data1.index.astype('str')
print(data1)

plt.figure(figsize=(20, 10))
plt.xticks(rotation=90)
plt.xlabel('uid', fontsize=15)
plt.ylabel('评论数', fontsize=15)
plt.title('前二十发帖数展示', fontsize=20)
# Plain numbers on the y axis instead of scientific/offset notation
plt.ticklabel_format(axis='y', style='plain', useOffset=False, useLocale=False)
plt.bar(data1['uid'], data1['comment'], label='评论数')
plt.legend(loc='upper right')
plt.show()
name level comment like time uid uid 249178337 3491 3491 3491 3491 3491 249178337 38614665 3052 3052 3052 3052 3052 38614665 296745035 2814 2814 2814 2814 2814 296745035 87684563 2768 2768 2768 2768 2768 87684563 2069341 2502 2502 2502 2502 2502 2069341 196748348 2481 2481 2481 2481 2481 196748348 448103537 2477 2477 2477 2477 2477 448103537 76777403 2150 2150 2150 2150 2150 76777403 141314745 1707 1707 1707 1707 1707 141314745 33505022 1654 1654 1654 1654 1654 33505022 53122450 1364 1364 1364 1364 1364 53122450 195393716 1355 1355 1355 1355 1355 195393716 29085748 1287 1287 1287 1287 1287 29085748 223125640 1167 1167 1167 1167 1167 223125640 28573827 1166 1166 1166 1166 1166 28573827 451502255 1144 1144 1144 1144 1144 451502255 369946173 1106 1106 1106 1106 1106 369946173 35424364 1061 1061 1061 1061 1061 35424364 229127706 1022 1022 1022 1022 1022 229127706 11950192 995 995 995 995 995 11950192
## Share of comments written by the most active users (2020-08-06 dump)
def suma(i):
    """Print the number of comments in ``df4`` written by the ``i`` busiest uids.

    Args:
        i: How many uids to take from the top of the per-uid comment ranking.
    """
    comments_per_uid = df4.groupby('uid')['comment'].count()
    ranked = comments_per_uid.sort_values(ascending=False)
    print(ranked.head(i).sum())


suma(0)
suma(8074)
suma(16149)
suma(24223)
0 313274 335202 345995
## Word-cloud analysis (2020-08-06 dump)
def wordcloudp(i):
    """Show a word cloud of the ``df4`` comments, ignoring the ``i`` busiest uids.

    The number of rows left after the exclusion is printed first. The
    remaining comments are segmented with jieba and rendered in white on
    black using the ``./background01.jpg`` mask. Nothing is drawn when no
    comment text is left.

    Args:
        i: Number of top-commenting uids whose comments are left out.
    """
    # Rank uids by comment count, busiest first
    per_uid = df4.groupby(['uid']).count()
    per_uid = per_uid.sort_values(by=['comment'], ascending=[False])
    # The group index already holds the uids; no str -> int round-trip needed
    excluded_uids = per_uid.head(i).index
    remaining = df4.loc[~df4['uid'].isin(excluded_uids)]
    print(len(remaining))

    # Segment the comment text itself. str() of the whole list would add
    # quotes, brackets and "nan" entries to the text being analysed.
    comments = remaining['comment'].dropna().astype(str)
    words = jieba.lcut("\n".join(comments))
    # Tokens must stay separated: joining them with "" would glue the
    # sentences back together and throw the segmentation away.
    txt = " ".join(word for word in words if word.strip())
    if not txt:
        # Every comment was excluded, so there is nothing to draw.
        return

    # Work on a private copy so the library-wide STOPWORDS set is not
    # modified as a side effect of drawing one chart.
    stopwords = set(STOPWORDS)
    # Emoticon / sticker names that would otherwise dominate the cloud
    stopwords.update(['doge','微笑','OK','星星眼','妙啊','调皮','歪嘴','打call','呲牙','滑稽','吃瓜',
                      '辣眼睛','嗑瓜子','笑哭','脱单doge','给心心','嘟嘟','喜欢','酸了','奸笑','喜极而泣','疑惑','害羞','大哭',
                      '嫌弃','哦呼','笑','偷笑','惊讶','捂脸','阴险','囧','呆','尴尬','鼓掌','点赞','无语','惊喜','大笑','抠鼻',
                      '灵魂出窍','委屈','傲娇','疼','冷','生病','吓','哈欠','翻白眼','再见','思考','嘘声','捂眼','吐','奋斗',
                      '墨镜','难过','撇嘴','抓狂','生气','口罩','热词系列_知识增加','2233娘_大笑','支持','热词系列_吹爆','妙啊',
                      'tv_doge','热词系列_大师球','热词系列_爱了爱了'])

    mask = np.array(Image.open("./background01.jpg"))
    comment_wc = WordCloud(
        background_color='black',
        mask=mask,
        width=3000,
        height=3000,
        margin=1,
        max_words=2000,
        mode='RGBA',
        color_func=lambda *args, **kwargs: "white",
        stopwords=stopwords,
        font_path="msyh.ttc")
    comment_wc.generate(txt)
    plt.imshow(comment_wc)
    plt.axis('off')
    plt.show()


wordcloudp(0)
wordcloudp(8074)
wordcloudp(16149)
wordcloudp(24223)
354031
40757
18829
8036
## Load the 2021-04-16 bilibili comment dump
file_path5 = './20210416_bilibili.csv'

## Parse the file
# The file has no header row; the columns are named here.
read_options = dict(
    header=None,
    names=['name', 'uid', 'level', 'comment', 'like', 'time'],
    lineterminator="\n",
    low_memory=False,
)
try:
    # Malformed rows are skipped. on_bad_lines replaces error_bad_lines,
    # which newer pandas releases no longer accept.
    df5 = pd.read_csv(file_path5, on_bad_lines='skip', **read_options)
except TypeError:
    # Older pandas does not know on_bad_lines yet; use the legacy spelling.
    df5 = pd.read_csv(file_path5, error_bad_lines=False, **read_options)
# NOTE(review): with lineterminator="\n" the recorded run shows a trailing
# "\r" in `time` and a leading "\t" in `name`; neither is cleaned here.

# Snapshot of the parsed rows, written before any cleaning below
df5.to_csv('tf.csv', index=False)

# Missing/infinite values become 0. The result is assigned back instead of
# calling replace(inplace=True) on a column selection, which is not
# guaranteed to modify df5 itself.
for column in ('level', 'uid'):
    df5[column] = df5[column].replace([np.nan, np.inf], 0)

# Explicit 64-bit integers: plain 'int' produced int32 in the recorded run,
# and this data contains uids such as 2136818231, close to the int32 limit.
df5['uid'] = df5['uid'].astype('int64')
df5['level'] = df5['level'].astype('int64')
print(df5)
# info() prints its own report and returns None, so it is not wrapped in print()
df5.info()
name uid level \ 0 \t愿与六花再次相逢 8689126 6 1 \t巴别塔的菜鸡 481064550 4 2 \t愿与六花再次相逢 8689126 6 3 \t梦忆丶髅髅宫 14669819 6 4 \t巴别塔的菜鸡 481064550 4 ... ... ... ... 32747 \t缇丶背 622589918 4 32748 \tcaduceus攻略组 281766906 5 32749 \t缇丶背 622589918 4 32750 \t-宅系の男生- 487524248 4 32751 \t芳泽霞--- 18687790 5 comment like \ 0 鹰角啊。敌人描述别让它自己滚动了行不行啊。[灵魂出窍]看着太费劲了。 3401 1 嗯呢,看过内卫介绍看了好几遍没看懂,最后搜了一下发现锁输出最高的那个[喜极而泣] 193 2 而且优先攻击没在国度内的我方角色。 47 3 小火龙+赛爹 内卫没走过来就融化了 0 4 回复错人了吧 1 ... ... ... 32747 又是你啊 0 32748 哟 方舟评论区常客啊 0 32749 你不也是常客吗[吃瓜]我方舟已经快毕业了怎么就不能来了 0 32750 那可是阴阳空格小号 0 32751 地生五金池,普通池不能给限定垫刀,复刻没奖励,锁定每周可得的合成玉上限,现在开始出强度一无是... 2 time 0 2021-04-1614:51:40\r 1 2021-04-1614:54:45\r 2 2021-04-1614:56:00\r 3 2021-04-1616:54:40\r 4 2021-04-1616:56:17\r ... ... 32747 2021-04-1911:55:47\r 32748 2021-04-1911:57:15\r 32749 2021-04-1912:01:30\r 32750 2021-04-1914:40:10\r 32751 2021-04-1715:16:33\r [32752 rows x 6 columns] <class 'pandas.core.frame.DataFrame'> RangeIndex: 32752 entries, 0 to 32751 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 32752 non-null object 1 uid 32752 non-null int32 2 level 32752 non-null int32 3 comment 32752 non-null object 4 like 32752 non-null int64 5 time 32752 non-null object dtypes: int32(2), int64(1), object(3) memory usage: 1.2+ MB None
## Comment volume and number of distinct commenters (2021-04-16 dump)
# Distinct uids in order of first appearance
count_user_id = df5['uid'].drop_duplicates().to_numpy()
# Every row counts as one post
print('总发帖数为:', df5['comment'].size)
print('参与发帖的用户数为:', count_user_id.size)
总发帖数为: 32752 参与发帖的用户数为: 8045
## Bar chart of the twenty uids with the most comments (2021-04-16 dump)
# Per-uid row counts, busiest first, cut to the top twenty
data4 = (
    df5.groupby('uid')
    .count()
    .sort_values(by='comment', ascending=False)
    .head(20)
)
# String copy of the uid index so the bars are drawn as categories, not numbers
data4['uid'] = data4.index.astype('str')
print(data4)

plt.figure(figsize=(20, 10))
plt.xticks(rotation=90)
plt.xlabel('uid', fontsize=15)
plt.ylabel('评论数', fontsize=15)
plt.title('前二十发帖数展示', fontsize=20)
# Plain numbers on the y axis instead of scientific/offset notation
plt.ticklabel_format(axis='y', style='plain', useOffset=False, useLocale=False)
plt.bar(data4['uid'], data4['comment'], label='评论数')
plt.legend(loc='upper right')
plt.show()
name level comment like time uid uid 38746068 2077 2077 2077 2077 2077 38746068 382025758 1217 1217 1217 1217 1217 382025758 192039809 285 285 285 285 285 192039809 35419755 165 165 165 165 165 35419755 778856 143 143 143 143 143 778856 477495 141 141 141 141 141 477495 81092732 132 132 132 132 132 81092732 285634302 128 128 128 128 128 285634302 156688004 127 127 127 127 127 156688004 2136818231 122 122 122 122 122 2136818231 6093025 122 122 122 122 122 6093025 281766906 122 122 122 122 122 281766906 13271065 116 116 116 116 116 13271065 393483328 112 112 112 112 112 393483328 81131948 98 98 98 98 98 81131948 179381984 97 97 97 97 97 179381984 285781611 94 94 94 94 94 285781611 143752980 84 84 84 84 84 143752980 622589918 84 84 84 84 84 622589918 381651502 83 83 83 83 83 381651502
## Share of comments written by the most active users (2021-04-16 dump)
def sumb(i):
    """Print the number of comments in ``df5`` written by the ``i`` busiest uids.

    Args:
        i: How many uids to take from the top of the per-uid comment ranking.
    """
    comments_per_uid = df5.groupby('uid')['comment'].count()
    ranked = comments_per_uid.sort_values(ascending=False)
    print(ranked.head(i).sum())


sumb(2015)
sumb(4030)
sumb(6045)
24253 28682 30752
## Word-cloud analysis (2021-04-16 dump)
def wordcloudq(i):
    """Show a word cloud of the ``df5`` comments, ignoring the ``i`` busiest uids.

    The number of rows left after the exclusion is printed first. The
    remaining comments are segmented with jieba and rendered in white on
    black using the ``./background01.jpg`` mask. Nothing is drawn when no
    comment text is left.

    Args:
        i: Number of top-commenting uids whose comments are left out.
    """
    # Rank uids by comment count, busiest first
    per_uid = df5.groupby(['uid']).count()
    per_uid = per_uid.sort_values(by=['comment'], ascending=[False])
    # The group index already holds the uids; no str -> int round-trip needed
    excluded_uids = per_uid.head(i).index
    remaining = df5.loc[~df5['uid'].isin(excluded_uids)]
    print(len(remaining))

    # Segment the comment text itself. str() of the whole list would add
    # quotes, brackets and "nan" entries to the text being analysed.
    comments = remaining['comment'].dropna().astype(str)
    words = jieba.lcut("\n".join(comments))
    # Tokens must stay separated: joining them with "" would glue the
    # sentences back together and throw the segmentation away.
    txt = " ".join(word for word in words if word.strip())
    if not txt:
        # Every comment was excluded, so there is nothing to draw.
        return

    # Work on a private copy so the library-wide STOPWORDS set is not
    # modified as a side effect of drawing one chart.
    stopwords = set(STOPWORDS)
    # Emoticon / sticker names that would otherwise dominate the cloud
    stopwords.update(['doge','微笑','OK','星星眼','妙啊','调皮','歪嘴','打call','呲牙','滑稽','吃瓜',
                      '辣眼睛','嗑瓜子','笑哭','脱单doge','给心心','嘟嘟','喜欢','酸了','奸笑','喜极而泣','疑惑','害羞','大哭',
                      '嫌弃','哦呼','笑','偷笑','惊讶','捂脸','阴险','囧','呆','尴尬','鼓掌','点赞','无语','惊喜','大笑','抠鼻',
                      '灵魂出窍','委屈','傲娇','疼','冷','生病','吓','哈欠','翻白眼','再见','思考','嘘声','捂眼','吐','奋斗',
                      '墨镜','难过','撇嘴','抓狂','生气','口罩'])

    mask = np.array(Image.open("./background01.jpg"))
    # width / height / margin were disabled by the author and stay at the library defaults
    comment_wc = WordCloud(
        background_color='black',
        mask=mask,
        max_words=200,
        mode='RGBA',
        color_func=lambda *args, **kwargs: "white",
        stopwords=stopwords,
        font_path="msyh.ttc")
    comment_wc.generate(txt)
    plt.imshow(comment_wc)
    plt.axis('off')
    plt.show()


wordcloudq(0)
wordcloudq(2015)
wordcloudq(4030)
wordcloudq(6045)
32752
8499
4070
2000
标签:comment,...,non,手游,uid,##,plt,方舟,营收 From: https://www.cnblogs.com/LiMnO4/p/17347343.html