01-mysql_access.py
```python
# -*- coding: utf-8 -*-

# Code 11-1

import os
import pandas as pd

# Change the working directory to the project folder
os.chdir("D:/chapter11/demo")

# Connection method 1: SQLAlchemy engine
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # returns an iterator of chunks

# Connection method 2: pymysql connection
import pymysql as pm

con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Save the data that was read
data.to_csv('./tmp/all_gzdata.csv', index=False, encoding='utf-8')
```
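The `chunksize` argument makes `pd.read_sql` return an iterator of DataFrames rather than one frame, and that iterator can only be consumed once. A minimal, self-contained sketch of the same pattern, using an in-memory table and a hypothetical `fake_read_sql` helper instead of a live MySQL server:

```python
import pandas as pd

def fake_read_sql(df, chunksize):
    """Yield df in chunks, mimicking pd.read_sql(..., chunksize=...)."""
    for start in range(0, len(df), chunksize):
        yield df.iloc[start:start + chunksize]

# A toy stand-in for the all_gzdata table (invented values)
table = pd.DataFrame({'realIP': [1, 1, 2, 3, 3, 3],
                      'fullURL': list('abcdef')})

chunks = fake_read_sql(table, chunksize=2)   # an iterator, like read_sql with chunksize
data = pd.concat(chunks, ignore_index=True)  # consuming it once exhausts it
data.to_csv('./all_gzdata_demo.csv', index=False, encoding='utf-8')
print(len(data))  # 6
```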
02-pageviews_statistics.py
```python
# -*- coding: utf-8 -*-

# Code 11-2

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Analyse page types
counts = [i['fullURLId'].value_counts() for i in sql]  # count per chunk
counts = pd.concat(counts).groupby(level=0).sum()  # merge the per-chunk counts (group by index label and sum)
counts = counts.reset_index()  # turn the index into an ordinary column
counts.columns = ['index', 'num']  # rename the columns (the second defaults to 0)
counts['type'] = counts['index'].str.extract(r'(\d{3})', expand=False)  # first three digits = category id
counts_ = counts[['type', 'num']].groupby('type').sum()  # aggregate by category
counts_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
counts_['ratio'] = counts_.iloc[:, 0] / counts_.iloc[:, 0].sum()
print(counts_)


# Code 11-3

# Only category 107 appears here, but it can be split further into three
# sub-types: knowledge content page, knowledge list page, knowledge home page
def count107(i):  # custom per-chunk counter
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()  # URLs whose category id contains 107
    j['type'] = None  # add an empty column
    j.loc[j['fullURL'].str.contains('info/.+?/'), 'type'] = '知识首页'
    j.loc[j['fullURL'].str.contains('info/.+?/.+?'), 'type'] = '知识列表页'
    j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = '知识内容页'
    return j['type'].value_counts()

# Note: the sql iterator is exhausted after one pass, so the table must be read again (!!!)
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts2 = [count107(i) for i in sql]  # count per chunk
counts2 = pd.concat(counts2).groupby(level=0).sum()  # merge the per-chunk counts
print(counts2)

# Compute the share of each sub-type
res107 = pd.DataFrame(counts2)
res107.index.name = '107类型'
res107.rename(columns={'type': 'num'}, inplace=True)
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace=True)
print(res107)


# Code 11-4

def countquestion(i):  # custom per-chunk counter
    j = i[['fullURLId']][i['fullURL'].str.contains(r'\?')].copy()  # URLs that contain a '?'
    return j

sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# Compute each type's share and display it
df1 = pd.DataFrame(counts3)
df1['perc'] = df1['fullURLId'] / df1['fullURLId'].sum() * 100
df1.sort_values(by='fullURLId', ascending=False, inplace=True)
print(df1.round(4))
```
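The chunk-then-merge idiom used three times above (a `value_counts` per chunk, then `pd.concat` followed by `groupby(level=0).sum()` to add up counts that share an index label) can be checked on toy data; the category ids below are invented:

```python
import pandas as pd

chunks = [pd.DataFrame({'fullURLId': ['101003', '107001', '101003']}),
          pd.DataFrame({'fullURLId': ['107001', '1999001']})]

counts = [c['fullURLId'].value_counts() for c in chunks]  # one Series per chunk
merged = pd.concat(counts).groupby(level=0).sum()         # add up duplicate index labels
print(merged.sort_values(ascending=False))
# 101003 -> 2, 107001 -> 2, 1999001 -> 1
```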
```python
# Code 11-5

def page199(i):  # custom per-chunk counter
    j = i[['fullURL', 'pageTitle']][(i['fullURLId'].str.contains('199')) &
                                    (i['fullURL'].str.contains(r'\?'))].copy()
    j['pageTitle'] = j['pageTitle'].fillna('空')
    j['type'] = '其他'  # default label
    j.loc[j['pageTitle'].str.contains('法律快车-律师助手'), 'type'] = '法律快车-律师助手'
    j.loc[j['pageTitle'].str.contains('咨询发布成功'), 'type'] = '咨询发布成功'
    j.loc[j['pageTitle'].str.contains('免费发布法律咨询'), 'type'] = '免费发布法律咨询'
    j.loc[j['pageTitle'].str.contains('法律快搜'), 'type'] = '快搜'
    j.loc[j['pageTitle'].str.contains('法律快车法律经验'), 'type'] = '法律快车法律经验'
    j.loc[j['pageTitle'].str.contains('法律快车法律咨询'), 'type'] = '法律快车法律咨询'
    j.loc[(j['pageTitle'].str.contains('_法律快车')) |
          (j['pageTitle'].str.contains('-法律快车')), 'type'] = '法律快车'
    j.loc[j['pageTitle'].str.contains('空'), 'type'] = '空'
    return j

# Note: the sql iterator is exhausted after one pass, so the table must be read again
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks
# For quick tests: sql = pd.read_sql_query('select * from all_gzdata limit 10000', con=engine)

counts4 = [page199(i) for i in sql]  # process per chunk
counts4 = pd.concat(counts4)
d1 = counts4['type'].value_counts()
print(d1)
d2 = counts4[counts4['type'] == '其他']
print(d2)

# Compute each label's share
df1_ = pd.DataFrame(d1)
df1_['perc'] = df1_['type'] / df1_['type'].sum() * 100
df1_.sort_values(by='type', ascending=False, inplace=True)
print(df1_)


# Code 11-6

def xiaguang(i):  # custom per-chunk filter for URLs without .html
    j = i.loc[(i['fullURL'].str.contains(r'\.html')) == False,
              ['fullURL', 'fullURLId', 'pageTitle']]
    return j

# Note: the sql iterator is exhausted after one pass, so the table must be read again
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts5 = [xiaguang(i) for i in sql]
counts5 = pd.concat(counts5)

xg1 = counts5['fullURLId'].value_counts()
print(xg1)

# Compute each type's share
xg_ = pd.DataFrame(xg1)
xg_.reset_index(inplace=True)
xg_.columns = ['index', 'num']
xg_['perc'] = xg_['num'] / xg_['num'].sum() * 100
xg_.sort_values(by='num', ascending=False, inplace=True)

xg_['type'] = xg_['index'].str.extract(r'(\d{3})', expand=False)  # first three digits = category id

xgs_ = xg_[['type', 'num']].groupby('type').sum()  # aggregate by category
xgs_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
xgs_['percentage'] = xgs_['num'] / xgs_['num'].sum() * 100

print(xgs_.round(4))
```
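The labelling pattern in `page199()` relies on one detail worth making explicit: later masks overwrite earlier ones, so rule order matters. A small sketch with invented titles, written with `.loc` masks (chained `j['type'][mask] = ...` assignment is unreliable in recent pandas):

```python
import pandas as pd

j = pd.DataFrame({'pageTitle': ['咨询发布成功 - 法律快车', '法律快搜首页', None]})
j['pageTitle'] = j['pageTitle'].fillna('空')
j['type'] = '其他'                                          # default label
j.loc[j['pageTitle'].str.contains('咨询发布成功'), 'type'] = '咨询发布成功'
j.loc[j['pageTitle'].str.contains('法律快搜'), 'type'] = '快搜'
j.loc[j['pageTitle'].str.contains('空'), 'type'] = '空'     # runs last, so it wins on overlaps
print(j)
```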
```python
# Code 11-7

# Analyse click counts per user
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts1 = [i['realIP'].value_counts() for i in sql]  # count clicks per IP in each chunk
counts1 = pd.concat(counts1).groupby(level=0).sum()  # merge; level=0 groups by index (the IP)
print(counts1)

counts1_ = pd.DataFrame(counts1)
counts1_.columns = ['realIP']  # this column holds each IP's click count

counts1_[1] = 1  # helper column of ones
hit_count = counts1_.groupby('realIP').sum()  # users per distinct click count
# equivalently: counts1_['realIP'].value_counts()
hit_count.columns = ['用户数']
hit_count.index.name = '点击次数'

# Users with 1-7 clicks, and with more than 7 clicks
hit_count.sort_index(inplace=True)
hit_count_7 = hit_count.iloc[:7, :]
over7 = hit_count.iloc[7:, 0].sum()  # users with more than 7 clicks
hit_count_7 = pd.concat([hit_count_7, pd.DataFrame([{'用户数': over7}])],
                        ignore_index=True)  # DataFrame.append was removed in pandas 2.0
hit_count_7.index = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)


# Code 11-8

# Analyse the behaviour of users who viewed only one page

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
all_gzdata = pd.read_sql_table('all_gzdata', con=engine)  # read the whole all_gzdata table

# Count page views per realIP and keep the IPs with exactly one view
real_count = pd.DataFrame(all_gzdata.groupby("realIP")["realIP"].count())
real_count.columns = ["count"]
real_count["realIP"] = real_count.index.tolist()
user_one = real_count[real_count["count"] == 1]  # users who visited exactly once
# Join back to the raw data on realIP
real_one = pd.merge(user_one, all_gzdata, left_on="realIP", right_on="realIP")

# Page types viewed by the single-visit users
URL_count = pd.DataFrame(real_one.groupby("fullURLId")["fullURLId"].count())
URL_count.columns = ["count"]
URL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order
# Keep the top four types and lump the rest together
URL_count_4 = URL_count.iloc[:4, :]
rest = URL_count.iloc[4:, 0].sum()  # total of the remaining types
URLindex = URL_count_4.index.values
URL_count_4 = pd.concat([URL_count_4, pd.DataFrame([{'count': rest}])],
                        ignore_index=True)
URL_count_4.index = [URLindex[0], URLindex[1], URLindex[2], URLindex[3], '其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)


# Code 11-9

# Total views of each page, restricted to the single-visit users
fullURL_count = pd.DataFrame(real_one.groupby("fullURL")["fullURL"].count())
fullURL_count.columns = ["count"]
fullURL_count["fullURL"] = fullURL_count.index.tolist()
fullURL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order
```
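Code 11-7 is really two nested frequency counts: clicks per IP, then users per distinct click count. The same two steps on a toy click log (IPs invented):

```python
import pandas as pd

clicks = pd.Series([1, 1, 1, 2, 2, 3, 3, 3, 3], name='realIP')  # one entry per page view
per_ip = clicks.value_counts()                    # clicks per IP: {3: 4, 1: 3, 2: 2}
users_per_count = per_ip.value_counts().sort_index()
print(users_per_count)                            # 2 clicks -> 1 user, 3 -> 1 user, 4 -> 1 user
```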
03-web_pretreatment.py
```python
# -*- coding: utf-8 -*-

# Code 11-10

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# Change the working directory to the project folder
os.chdir("D:/chapter11/demo")

# Read the data
con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Keep the records of type 107
index107 = [re.search('107', str(i)) != None for i in data.loc[:, 'fullURLId']]
data_107 = data.loc[index107, :]

# Within type 107, keep the marriage-law ("hunyin") records
index = [re.search('hunyin', str(i)) != None for i in data_107.loc[:, 'fullURL']]
data_hunyin = data_107.loc[index, :]

# Keep only the fields we need (realIP, fullURL)
info = data_hunyin.loc[:, ['realIP', 'fullURL']].copy()

# Strip '?' and everything after it from the URLs
da = [re.sub(r'\?.*', '', str(i)) for i in info.loc[:, 'fullURL']]
info.loc[:, 'fullURL'] = da  # replace the fullURL column with the cleaned URLs
# Drop URLs without .html
index = [re.search(r'\.html', str(i)) != None for i in info.loc[:, 'fullURL']]
index.count(True)  # number of URLs kept (True counts as 1, False as 0)
info1 = info.loc[index, :]


# Code 11-11

# Separate paginated from non-paginated URLs
index = [re.search(r'/\d+_\d+\.html', i) != None for i in info1.loc[:, 'fullURL']]
index1 = [not i for i in index]
info1_1 = info1.loc[index, :].copy()  # paginated URLs
info1_2 = info1.loc[index1, :]  # non-paginated URLs
# Collapse paginated URLs back to the first page
da = [re.sub(r'_\d+\.html', '.html', str(i)) for i in info1_1.loc[:, 'fullURL']]
info1_1.loc[:, 'fullURL'] = da
# Recombine the two parts
info2 = pd.concat([info1_1, info1_2], axis=0)  # axis=0 (the default) stacks rows
# Drop duplicates (same realIP and fullURL)
info3 = info2.drop_duplicates()
# Convert both columns to strings
info3.iloc[:, 0] = [str(index) for index in info3.iloc[:, 0]]
info3.iloc[:, 1] = [str(index) for index in info3.iloc[:, 1]]
len(info3)


# Code 11-12

# Keep only IPs with more than n page views
IP_count = info3['realIP'].value_counts()
# IPs and their view counts as two lists
IP = list(IP_count.index)
count = list(IP_count.values)
# Rebuild IP_count as a data frame: first column the IP, second its view count
IP_count = pd.DataFrame({'IP': IP, 'count': count})
# Select the IPs with more than n views
n = 2
index = IP_count.loc[:, 'count'] > n
IP_index = IP_count.loc[index, 'IP']


# Code 11-13

# Split the IP set into training and test sets
index_tr = sample(range(len(IP_index)), int(len(IP_index) * 0.8))  # random 80% of positions
index_te = [i for i in range(len(IP_index)) if i not in index_tr]
IP_tr = IP_index.iloc[index_tr]  # iloc: positional, since IP_index keeps its old labels
IP_te = IP_index.iloc[index_te]
# Split the records accordingly
index_tr = [i in list(IP_tr) for i in info3.loc[:, 'realIP']]
index_te = [i in list(IP_te) for i in info3.loc[:, 'realIP']]
data_tr = info3.loc[index_tr, :]
data_te = info3.loc[index_te, :]
print(len(data_tr))
IP_tr = data_tr.iloc[:, 0]  # training-set IPs
url_tr = data_tr.iloc[:, 1]  # training-set URLs
IP_tr = list(set(IP_tr))  # deduplicate
url_tr = list(set(url_tr))  # deduplicate
len(url_tr)
```
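The de-pagination rule in Code 11-11 is the step that makes the later user-item matrix meaningful: all pages of one article must map to one item. A sketch on made-up URLs (the example.com paths are invented):

```python
import re

urls = ['http://www.example.com/info/hunyin/2324/1_2.html',   # page 2 of an article
        'http://www.example.com/info/hunyin/2324/1.html',     # page 1, already canonical
        'http://www.example.com/ask/question_98765.html']     # underscore, but not pagination

paged = [u for u in urls if re.search(r'/\d+_\d+\.html', u)]  # only true pagination matches
restored = [re.sub(r'_\d+\.html', '.html', u) for u in paged]
print(restored)  # ['http://www.example.com/info/hunyin/2324/1.html']
```

Filtering with the stricter search pattern first matters: applying the `re.sub` to every URL directly would also mangle `question_98765.html`.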
04-model_train.py
```python
# -*- coding: utf-8 -*-

# Code 11-14

import pandas as pd

# Build the model from the training data. Assumes IP_tr, url_tr, data_tr and
# data_te from 03-web_pretreatment.py are still in scope (the scripts run in one session).
UI_matrix_tr = pd.DataFrame(0, index=IP_tr, columns=url_tr)
# Fill the user-item matrix
for i in data_tr.index:
    UI_matrix_tr.loc[data_tr.loc[i, 'realIP'], data_tr.loc[i, 'fullURL']] = 1
sum(UI_matrix_tr.sum(axis=1))

# Item similarity matrix (the double loop is computationally heavy and slow)
Item_matrix_tr = pd.DataFrame(0, index=url_tr, columns=url_tr)
for i in Item_matrix_tr.index:
    for j in Item_matrix_tr.index:
        a = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) == 2)  # users who visited both
        b = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) != 0)  # users who visited either
        Item_matrix_tr.loc[i, j] = a / b

# Zero the diagonal so an item never recommends itself
for i in Item_matrix_tr.index:
    Item_matrix_tr.loc[i, i] = 0

# Evaluate the model on the test data
IP_te = data_te.iloc[:, 0]
url_te = data_te.iloc[:, 1]
IP_te = list(set(IP_te))
url_te = list(set(url_te))

# Test-set user-item matrix
UI_matrix_te = pd.DataFrame(0, index=IP_te, columns=url_te)
for i in data_te.index:
    UI_matrix_te.loc[data_te.loc[i, 'realIP'], data_te.loc[i, 'fullURL']] = 1

# Recommend for the test-set IPs; 'NaN' marks rows with no recommendation
Res = pd.DataFrame('NaN', index=data_te.index,
                   columns=['IP', '已浏览网址', '推荐网址', 'T/F'])
Res.loc[:, 'IP'] = list(data_te.iloc[:, 0])
Res.loc[:, '已浏览网址'] = list(data_te.iloc[:, 1])

# Generate the recommendations
for i in Res.index:
    if Res.loc[i, '已浏览网址'] in list(Item_matrix_tr.index):
        Res.loc[i, '推荐网址'] = Item_matrix_tr.loc[Res.loc[i, '已浏览网址'], :].idxmax()  # most similar item
        if Res.loc[i, '推荐网址'] in url_te:
            Res.loc[i, 'T/F'] = UI_matrix_te.loc[Res.loc[i, 'IP'],
                                                 Res.loc[i, '推荐网址']] == 1
        else:
            Res.loc[i, 'T/F'] = False

# Save the recommendations
Res.to_csv('./tmp/Res.csv', index=False, encoding='utf8')
```
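The nested loop computes the Jaccard similarity a/b = |visitors of i ∩ visitors of j| / |visitors of i ∪ visitors of j| with two pandas calls per item pair, which is why it is slow. A vectorized sketch of the same arithmetic (toy 0/1 matrix with invented values, not the book's code) uses the identity |A ∪ B| = |A| + |B| − |A ∩ B|:

```python
import numpy as np
import pandas as pd

# Toy user-item matrix: 3 users (rows) x 3 items (columns)
M = pd.DataFrame([[1, 1, 0],
                  [1, 0, 1],
                  [0, 1, 1]], columns=['item1', 'item2', 'item3'])

X = M.to_numpy()
inter = X.T @ X                                    # |A ∩ B| for every item pair
col_sums = X.sum(axis=0)                           # |A| for every item
union = col_sums[:, None] + col_sums[None, :] - inter
jac = pd.DataFrame(inter / union, index=M.columns, columns=M.columns)
np.fill_diagonal(jac.values, 0)                    # zero the diagonal, as above
print(jac.round(2))                                # every off-diagonal pair: 1/3
```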
05-model_evaluate.py
```python
# -*- coding: utf-8 -*-

# Code 11-15

import pandas as pd

# Load the saved recommendations; keep_default_na=False keeps the literal
# strings 'NaN', 'True' and 'False' written by 04-model_train.py
Res = pd.read_csv('./tmp/Res.csv', keep_default_na=False, encoding='utf8')

# Precision: correct recommendations / rows that got a recommendation
Pre = round(sum(Res.loc[:, 'T/F'] == 'True') /
            (len(Res.index) - sum(Res.loc[:, 'T/F'] == 'NaN')), 3)
print(Pre)

# Recall: correct recommendations / (correct + rows with no recommendation)
Rec = round(sum(Res.loc[:, 'T/F'] == 'True') /
            (sum(Res.loc[:, 'T/F'] == 'True') + sum(Res.loc[:, 'T/F'] == 'NaN')), 3)
print(Rec)

# F1 score
F1 = round(2 * Pre * Rec / (Pre + Rec), 3)
print(F1)
```
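A toy check of the three formulas, with a hand-made result table (invented outcomes). 'True'/'False' mark rows where a recommendation was made and could be verified; 'NaN' marks rows where the browsed URL was missing from the training matrix, so nothing was recommended:

```python
import pandas as pd

Res = pd.DataFrame({'T/F': ['True', 'True', 'False', 'NaN']})
tp = sum(Res['T/F'] == 'True')                     # 2 correct recommendations
no_rec = sum(Res['T/F'] == 'NaN')                  # 1 row with no recommendation
Pre = round(tp / (len(Res.index) - no_rec), 3)     # 2 / 3 = 0.667
Rec = round(tp / (tp + no_rec), 3)                 # 2 / 3 = 0.667
F1 = round(2 * Pre * Rec / (Pre + Rec), 3)         # 0.667
print(Pre, Rec, F1)
```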