
Data Analysis: E-Commerce Website User Behavior Analysis and Service Recommendation


01-mysql_access.py

# -*- coding: utf-8 -*-

# Code 11-1

import os
import pandas as pd


# Change the working directory to the target folder
os.chdir("D:/chapter11/demo")

# Connection method 1
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Connection method 2
import pymysql as pm

con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')  # pymysql >= 1.0 requires keyword arguments
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Save the data that was read
data.to_csv('./tmp/all_gzdata.csv', index=False, encoding='utf-8')
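
With `chunksize` set, `pd.read_sql` returns a one-shot iterator of DataFrames rather than a single DataFrame, which is why the scripts below re-issue `pd.read_sql` before every pass over the table. A minimal sketch of that behaviour (illustrative only, reusing the `engine` defined above):

chunks = pd.read_sql('all_gzdata', engine, chunksize=10000)
first = sum(len(chunk) for chunk in chunks)   # first pass consumes the iterator
second = sum(len(chunk) for chunk in chunks)  # second pass yields nothing
print(first, second)                          # second is 0: the reader is spent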

02-pageviews_statistics.py

# -*- coding: utf-8 -*-

# Code 11-2

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)
# Analyze page types
counts = [i['fullURLId'].value_counts() for i in sql]  # count chunk by chunk
counts = pd.concat(counts).groupby(level=0).sum()  # merge the per-chunk results (group by index and sum)
counts = counts.reset_index()  # turn the index into a regular column
counts.columns = ['index', 'num']  # rename the columns (the second one defaults to 0)
counts['type'] = counts['index'].str.extract(r'(\d{3})')  # first three digits are the category id
counts_ = counts[['type', 'num']].groupby('type').sum()  # aggregate by category
counts_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
counts_['ratio'] = counts_.iloc[:, 0] / counts_.iloc[:, 0].sum()
print(counts_)
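
# Aside: how the chunk-merge idiom above works. Each chunk contributes its own
# value_counts() Series, pd.concat stacks them (the same key may repeat), and
# groupby(level=0).sum() folds repeated keys together. A toy illustration with
# made-up per-chunk counts:
c1 = pd.Series({'101': 5, '107': 3})
c2 = pd.Series({'107': 4, '199': 2})
print(pd.concat([c1, c2]).groupby(level=0).sum())  # 101 -> 5, 107 -> 7, 199 -> 2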


# Code 11-3

# Category 107 is dominant, and it can be split further into three sub-types:
# knowledge content pages, knowledge list pages and knowledge home pages
def count107(i):  # custom per-chunk counter
    j = i[['fullURL']][i['fullURLId'].str.contains('107')].copy()  # URLs whose category id contains 107
    j['type'] = None  # add an empty column
    j.loc[j['fullURL'].str.contains('info/.+?/'), 'type'] = '知识首页'
    j.loc[j['fullURL'].str.contains('info/.+?/.+?'), 'type'] = '知识列表页'
    j.loc[j['fullURL'].str.contains(r'/\d+?_*\d+?\.html'), 'type'] = '知识内容页'
    return j['type'].value_counts()

# Note: the chunked sql object is consumed after one pass, so the query must be re-issued (!!!)
# engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts2 = [count107(i) for i in sql]  # count chunk by chunk
counts2 = pd.concat(counts2).groupby(level=0).sum()  # merge the per-chunk results
print(counts2)
# Compute each sub-type's share
res107 = pd.DataFrame(counts2)
# res107.reset_index(inplace=True)
res107.index.name = '107类型'
res107.columns = ['num']  # name the column explicitly (value_counts naming varies across pandas versions)
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace=True)
print(res107)



# Code 11-4

def countquestion(i):  # custom per-chunk filter
    j = i[['fullURLId']][i['fullURL'].str.contains(r'\?')].copy()  # URLs that contain a '?'
    return j

# engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# Compute each type's share and keep the result
df1 = pd.DataFrame(counts3)
df1.columns = ['num']
df1['perc'] = df1['num'] / df1['num'].sum() * 100
df1.sort_values(by='num', ascending=False, inplace=True)
print(df1.round(4))



# Code 11-5

def page199(i):  # custom per-chunk classifier for type-199 URLs containing '?'
    j = i[['fullURL', 'pageTitle']][(i['fullURLId'].str.contains('199')) &
                                    (i['fullURL'].str.contains(r'\?'))].copy()
    j['pageTitle'] = j['pageTitle'].fillna('空')  # '空' marks missing titles
    j['type'] = '其他'  # default category: other
    j.loc[j['pageTitle'].str.contains('法律快车-律师助手'), 'type'] = '法律快车-律师助手'
    j.loc[j['pageTitle'].str.contains('咨询发布成功'), 'type'] = '咨询发布成功'
    j.loc[j['pageTitle'].str.contains('免费发布法律咨询'), 'type'] = '免费发布法律咨询'
    j.loc[j['pageTitle'].str.contains('法律快搜'), 'type'] = '快搜'
    j.loc[j['pageTitle'].str.contains('法律快车法律经验'), 'type'] = '法律快车法律经验'
    j.loc[j['pageTitle'].str.contains('法律快车法律咨询'), 'type'] = '法律快车法律咨询'
    j.loc[(j['pageTitle'].str.contains('_法律快车')) |
          (j['pageTitle'].str.contains('-法律快车')), 'type'] = '法律快车'
    j.loc[j['pageTitle'].str.contains('空'), 'type'] = '空'

    return j

# Note: the chunked sql object is consumed after one pass, so the query must be re-issued
# engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks
# sql = pd.read_sql_query('select * from all_gzdata limit 10000', con=engine)

counts4 = [page199(i) for i in sql]  # classify chunk by chunk
counts4 = pd.concat(counts4)
d1 = counts4['type'].value_counts()
print(d1)
d2 = counts4[counts4['type'] == '其他']
print(d2)
# Compute each category's share and keep the result
df1_ = pd.DataFrame(d1)
df1_.columns = ['num']
df1_['perc'] = df1_['num'] / df1_['num'].sum() * 100
df1_.sort_values(by='num', ascending=False, inplace=True)
print(df1_)


# Code 11-6

def xiaguang(i):  # custom per-chunk filter: pages whose URL has no .html suffix
    j = i.loc[(i['fullURL'].str.contains(r'\.html')) == False,
              ['fullURL', 'fullURLId', 'pageTitle']]
    return j

# Note: the chunked sql object is consumed after one pass, so the query must be re-issued
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts5 = [xiaguang(i) for i in sql]
counts5 = pd.concat(counts5)

xg1 = counts5['fullURLId'].value_counts()
print(xg1)
# Compute each category's share
xg_ = pd.DataFrame(xg1)
xg_.reset_index(inplace=True)
xg_.columns = ['index', 'num']
xg_['perc'] = xg_['num'] / xg_['num'].sum() * 100
xg_.sort_values(by='num', ascending=False, inplace=True)

xg_['type'] = xg_['index'].str.extract(r'(\d{3})')  # first three digits are the category id

xgs_ = xg_[['type', 'num']].groupby('type').sum()  # aggregate by category
xgs_.sort_values(by='num', ascending=False, inplace=True)  # sort in descending order
xgs_['percentage'] = xgs_['num'] / xgs_['num'].sum() * 100

print(xgs_.round(4))



# Code 11-7

# Analyze page click counts
# Count clicks per IP
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read the table in chunks

counts1 = [i['realIP'].value_counts() for i in sql]  # count each IP's hits chunk by chunk
counts1 = pd.concat(counts1).groupby(level=0).sum()  # merge results; level=0 groups by index
print(counts1)

counts1_ = pd.DataFrame(counts1)
counts1_.columns = ['点击次数']  # name the column explicitly (value_counts naming varies across pandas versions)
counts1_[1] = 1  # add a helper column of ones
hit_count = counts1_.groupby('点击次数').sum()  # how many users fall on each distinct click count
# counts1_['点击次数'].value_counts() gives the same result
hit_count.columns = ['用户数']
hit_count.index.name = '点击次数'

# Count users with 1-7 clicks and with more than 7 clicks
hit_count.sort_index(inplace=True)
hit_count_7 = hit_count.iloc[:7, :]
time = hit_count.iloc[7:, 0].sum()  # number of users with more than 7 clicks
hit_count_7 = pd.concat([hit_count_7, pd.DataFrame([{'用户数': time}])],
                        ignore_index=True)  # DataFrame.append was removed in pandas 2.0
hit_count_7.index = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)
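
# Aside: Code 11-7 is a "count of counts", which is easier to see on toy data;
# chaining value_counts() twice yields users-per-click-count directly:
ips = pd.Series([1, 1, 2, 3, 3, 3])        # made-up realIP column
clicks = ips.value_counts()                # clicks per IP: {3: 3, 1: 2, 2: 1}
print(clicks.value_counts().sort_index())  # users per click count: {1: 1, 2: 1, 3: 1}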


# Code 11-8

# Analyze the behaviour of users who browsed only once

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
all_gzdata = pd.read_sql_table('all_gzdata', con=engine)  # read the whole all_gzdata table

# Aggregate by realIP
# Extract the records of users with exactly one page view
real_count = pd.DataFrame(all_gzdata.groupby("realIP")["realIP"].count())
real_count.columns = ["count"]
real_count["realIP"] = real_count.index.tolist()
user_one = real_count[(real_count["count"] == 1)]  # users who visited exactly once
# Join back to the raw data on realIP
real_one = pd.merge(user_one, all_gzdata, left_on="realIP", right_on="realIP")

# Page types viewed by one-time users
URL_count = pd.DataFrame(real_one.groupby("fullURLId")["fullURLId"].count())
URL_count.columns = ["count"]
URL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order
# Keep the top four page types and lump the rest together
URL_count_4 = URL_count.iloc[:4, :]
time = URL_count.iloc[4:, 0].sum()  # total of the remaining types
URLindex = URL_count_4.index.values
URL_count_4 = pd.concat([URL_count_4, pd.DataFrame([{'count': time}])],
                        ignore_index=True)
URL_count_4.index = [URLindex[0], URLindex[1], URLindex[2], URLindex[3],
                     '其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)



# Code 11-9

# Total views per page, still restricted to one-time users
fullURL_count = pd.DataFrame(real_one.groupby("fullURL")["fullURL"].count())
fullURL_count.columns = ["count"]
fullURL_count["fullURL"] = fullURL_count.index.tolist()
fullURL_count.sort_values(by='count', ascending=False, inplace=True)  # sort in descending order

03-web_pretreatment.py

# -*- coding: utf-8 -*-

# Code 11-10

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# Change the working directory to the target folder
os.chdir("D:/chapter11/demo")

# Read the data
con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
data = pd.read_sql('select * from all_gzdata', con=con)
con.close()  # close the connection

# Keep only category-107 records
index107 = [re.search('107', str(i)) is not None for i in data.loc[:, 'fullURLId']]
data_107 = data.loc[index107, :]

# Within category 107, keep only the marriage-law ('hunyin') records
index = [re.search('hunyin', str(i)) is not None for i in data_107.loc[:, 'fullURL']]
data_hunyin = data_107.loc[index, :]

# Keep only the fields we need (realIP, fullURL)
info = data_hunyin.loc[:, ['realIP', 'fullURL']].copy()

# Strip '?' and everything after it from the URLs
da = [re.sub(r'\?.*', '', str(i)) for i in info.loc[:, 'fullURL']]
info.loc[:, 'fullURL'] = da  # replace the fullURL column with the cleaned URLs
# Drop URLs without .html
index = [re.search(r'\.html', str(i)) is not None for i in info.loc[:, 'fullURL']]
index.count(True)  # number of True values (True counts as 1, False as 0)
info1 = info.loc[index, :]



# Code 11-11

# Separate paginated from non-paginated URLs
index = [re.search(r'/\d+_\d+\.html', i) is not None for i in info1.loc[:, 'fullURL']]
index1 = [i == False for i in index]
info1_1 = info1.loc[index, :].copy()  # paginated URLs
info1_2 = info1.loc[index1, :]  # non-paginated URLs
# Collapse paginated URLs back to their base page
da = [re.sub(r'_\d+\.html', '.html', str(i)) for i in info1_1.loc[:, 'fullURL']]
info1_1.loc[:, 'fullURL'] = da
# Recombine paginated and non-paginated URLs
frames = [info1_1, info1_2]
info2 = pd.concat(frames)
# or, equivalently
info2 = pd.concat([info1_1, info1_2], axis=0)  # axis defaults to 0, i.e. row-wise concatenation
# Deduplicate rows where both realIP and fullURL match
info3 = info2.drop_duplicates()
# Cast both columns (realIP, fullURL) to strings
info3.iloc[:, 0] = [str(index) for index in info3.iloc[:, 0]]
info3.iloc[:, 1] = [str(index) for index in info3.iloc[:, 1]]
len(info3)
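
# Aside: the pagination regex maps every page of a multi-page article back to
# one base URL. A quick check against a made-up URL:
print(re.sub(r'_\d+\.html', '.html',
             'http://www.example.com/info/hunyin/201512/12345_2.html'))
# -> http://www.example.com/info/hunyin/201512/12345.html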


# Code 11-12

# Keep only IPs with enough page views
IP_count = info3['realIP'].value_counts()
# The set of IPs
IP = list(IP_count.index)
count = list(IP_count.values)
# Store each IP and its view count in the IP_count data frame (IP first, count second)
IP_count = pd.DataFrame({'IP': IP, 'count': count})
# Keep only IPs with more than n page views
n = 2
index = IP_count.loc[:, 'count'] > n
IP_index = IP_count.loc[index, 'IP']



# Code 11-13

# Split the IP set into training and test sets
index_tr = sample(range(0, len(IP_index)), int(len(IP_index) * 0.8))  # draw 80% of the positions without replacement
index_te = [i for i in range(0, len(IP_index)) if i not in index_tr]
IP_tr = IP_index.iloc[index_tr]  # iloc: the sampled values are positions, not index labels
IP_te = IP_index.iloc[index_te]
# Split the records accordingly
index_tr = [i in list(IP_tr) for i in info3.loc[:, 'realIP']]
index_te = [i in list(IP_te) for i in info3.loc[:, 'realIP']]
data_tr = info3.loc[index_tr, :]
data_te = info3.loc[index_te, :]
print(len(data_tr))
IP_tr = data_tr.iloc[:, 0]  # training-set IPs
url_tr = data_tr.iloc[:, 1]  # training-set URLs
IP_tr = list(set(IP_tr))  # deduplicate
url_tr = list(set(url_tr))  # deduplicate
len(url_tr)
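
The two list comprehensions that split info3 scan list(IP_tr) once per row, which gets slow on large tables; Series.isin does the same membership test in one vectorised pass. A minimal sketch under illustrative names (train_ips stands for the sampled training IPs):

train_ips = set(IP_tr)                  # hypothetical name for the sampled training IPs
mask = info3['realIP'].isin(train_ips)  # vectorised membership test
data_tr_fast = info3.loc[mask, :]       # same rows as data_tr
data_te_fast = info3.loc[~mask, :]      # same rows as data_te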

04-model_train.py

# -*- coding: utf-8 -*-

# Code 11-14

import pandas as pd
# Build the model from the training set
UI_matrix_tr = pd.DataFrame(0, index=IP_tr, columns=url_tr)
# Fill in the user-item matrix
for i in data_tr.index:
    UI_matrix_tr.loc[data_tr.loc[i, 'realIP'], data_tr.loc[i, 'fullURL']] = 1
sum(UI_matrix_tr.sum(axis=1))  # sanity check: total number of 1s placed

# Item similarity matrix (computationally heavy; this can take a long time)
Item_matrix_tr = pd.DataFrame(0, index=url_tr, columns=url_tr)
for i in Item_matrix_tr.index:
    for j in Item_matrix_tr.index:
        a = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) == 2)  # users who visited both i and j
        b = sum(UI_matrix_tr.loc[:, [i, j]].sum(axis=1) != 0)  # users who visited i or j
        Item_matrix_tr.loc[i, j] = a / b  # Jaccard similarity

# Zero out the diagonal of the similarity matrix
for i in Item_matrix_tr.index:
    Item_matrix_tr.loc[i, i] = 0

# Evaluate the model on the test set
IP_te = data_te.iloc[:, 0]
url_te = data_te.iloc[:, 1]
IP_te = list(set(IP_te))
url_te = list(set(url_te))

# Test-set user-item matrix
UI_matrix_te = pd.DataFrame(0, index=IP_te, columns=url_te)
for i in data_te.index:
    UI_matrix_te.loc[data_te.loc[i, 'realIP'], data_te.loc[i, 'fullURL']] = 1

# Recommend for the test-set IPs
Res = pd.DataFrame('NaN', index=data_te.index,
                   columns=['IP', '已浏览网址', '推荐网址', 'T/F'])  # IP, visited URL, recommended URL, hit flag
Res.loc[:, 'IP'] = list(data_te.iloc[:, 0])
Res.loc[:, '已浏览网址'] = list(data_te.iloc[:, 1])

# Generate the recommendations
for i in Res.index:
    if Res.loc[i, '已浏览网址'] in list(Item_matrix_tr.index):
        Res.loc[i, '推荐网址'] = Item_matrix_tr.loc[Res.loc[i, '已浏览网址'],
                :].idxmax()  # most similar URL by label (Series.argmax is positional)
        if Res.loc[i, '推荐网址'] in url_te:
            Res.loc[i, 'T/F'] = UI_matrix_te.loc[Res.loc[i, 'IP'],
                    Res.loc[i, '推荐网址']] == 1
        else:
            Res.loc[i, 'T/F'] = False

# Save the recommendation results
Res.to_csv('./tmp/Res.csv', index=False, encoding='utf8')
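
The double loop above performs O(n²) DataFrame reductions and is the main bottleneck of this script. The same Jaccard similarities can be obtained from one matrix product on the 0/1 user-item matrix. A hedged sketch (Item_matrix_fast is an illustrative name; it should match Item_matrix_tr after the diagonal is zeroed):

import numpy as np

M = UI_matrix_tr.values.astype(float)  # users x items, entries 0/1
inter = M.T @ M                        # co-visit counts |i ∩ j|
per_item = inter.diagonal()            # visits per item |i|
union = per_item[:, None] + per_item[None, :] - inter  # |i ∪ j| = |i| + |j| - |i ∩ j|
sim = np.divide(inter, union, out=np.zeros_like(inter), where=union > 0)
np.fill_diagonal(sim, 0)               # zero the diagonal, as above
Item_matrix_fast = pd.DataFrame(sim, index=UI_matrix_tr.columns,
                                columns=UI_matrix_tr.columns)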

05-model_evaluate.py

# -*- coding: utf-8 -*-

# Code 11-15

import pandas as pd
# Load the saved recommendation results
Res = pd.read_csv('./tmp/Res.csv', keep_default_na=False, encoding='utf8')
# keep_default_na=False keeps 'True'/'False'/'NaN' as plain strings

# Precision of the recommendations
Pre = round(sum(Res.loc[:, 'T/F'] == 'True') / (len(Res.index) - sum(Res.loc[:, 'T/F'] == 'NaN')), 3)

print(Pre)

# Recall of the recommendations
Rec = round(sum(Res.loc[:, 'T/F'] == 'True') / (sum(Res.loc[:, 'T/F'] == 'True') + sum(Res.loc[:, 'T/F'] == 'NaN')), 3)

print(Rec)

# F1 score
F1 = round(2 * Pre * Rec / (Pre + Rec), 3)
print(F1)
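
To make the string bookkeeping concrete: a 'NaN' row means no recommendation was produced (the visited URL was absent from the training index), so precision drops such rows from its denominator while this recall proxy counts them as misses. A toy check with made-up values:

toy = pd.DataFrame({'T/F': ['True', 'False', 'True', 'NaN']})
tp = sum(toy['T/F'] == 'True')     # 2 correct recommendations
nan = sum(toy['T/F'] == 'NaN')     # 1 row with no recommendation
pre = tp / (len(toy.index) - nan)  # 2/3
rec = tp / (tp + nan)              # 2/3
f1 = 2 * pre * rec / (pre + rec)   # 2/3
print(round(pre, 3), round(rec, 3), round(f1, 3))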

From: https://www.cnblogs.com/zhangfurong/p/17282770.html
