首页 > 其他分享 >电子商务网站行为分析及服务推荐

电子商务网站行为分析及服务推荐

时间:2023-04-02 23:27:08浏览次数:39  
标签:count index 电子商务 loc 网站 推荐 pd sql fullURL

连接数据库

import os
import pandas as pd
# Switch the working directory to the data folder.
# Raw string: "\P" is not a valid escape sequence in a normal string literal.
os.chdir(r"D:\Python\数据处理")

# Option 1: connect through SQLAlchemy.
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:12345@192.168.31.140:3306/7law?charset=utf8')
# The connection is closed automatically when the `with` block exits.
with engine.connect() as connection:
    # With chunksize set, read_sql yields the table in 10000-row frames.
    sql = pd.read_sql('all_gzdata', connection, chunksize=10000)
    # The chunks must be consumed while the connection is still open.
    # Previously `data` was never defined before it was written to disk.
    data = pd.concat(sql, ignore_index=True)

# Option 2 (alternative, kept for reference): connect through PyMySQL directly.
# import pymysql as pm
# con = pm.connect(host='localhost', user='root', password='12345',
#                  database='test', charset='utf8')
# data = pd.read_sql('select * from all_gzdata', con=con)
# con.close()

# Persist the table that was just read.
data.to_csv(r'D:\Python\数据处理/all_gzdata.csv', index=False, encoding='utf-8')

分析网页类型

import pandas as pd
from sqlalchemy import create_engine
 
engine = create_engine('mysql+pymysql://root:@localhost:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Count page-type ids chunk by chunk, then add up the partial counts per id
# (groupby(level=0) groups on the index, i.e. on the id itself).
counts = [chunk['fullURLId'].value_counts() for chunk in sql]
counts = pd.concat(counts).groupby(level=0).sum()

# Turn the id index into a regular column and give both columns fixed names,
# so the code does not depend on how value_counts names its result.
counts = counts.reset_index()
counts.columns = ['index', 'num']

# The first three digits of the id are the page category.
counts['type'] = counts['index'].str.extract(r'(\d{3})', expand=False)

# Aggregate per category, largest first, and add each category's share.
counts_ = counts[['type', 'num']].groupby('type').sum()
counts_.sort_values(by='num', ascending=False, inplace=True)
counts_['ratio'] = counts_['num'] / counts_['num'].sum()
print(counts_)

 

 

 知识类型内部统计

# Category 107 only has the single id 107001, but it can be split further into
# three kinds of page: knowledge home page, knowledge list page and knowledge
# content page.
def count107(i):
    """Count the knowledge-page sub-types among the category-107 rows of a chunk.

    Parameters
    ----------
    i : pandas.DataFrame
        Chunk with string columns ``fullURLId`` and ``fullURL``.

    Returns
    -------
    pandas.Series
        Number of rows per sub-type label. Rows whose URL matches none of the
        patterns (or is missing) keep ``None`` and are not counted.
    """
    is_107 = i['fullURLId'].str.contains('107', na=False)
    j = i.loc[is_107, ['fullURL']].copy()
    j['type'] = None  # placeholder until a rule matches

    # Rules are applied in order and a later match overwrites an earlier one,
    # so the most specific pattern comes last.
    rules = (
        (r'info/.+?/', '知识首页'),
        (r'info/.+?/.+?', '知识列表页'),
        (r'/\d+?_*\d+?\.html', '知识内容页'),
    )
    for pattern, label in rules:
        matched = j['fullURL'].str.contains(pattern, na=False)
        # A single .loc assignment instead of j['type'][mask] = ...: the chained
        # form writes to a temporary object under pandas Copy-on-Write.
        j.loc[matched, 'type'] = label
    return j['type'].value_counts()
# NOTE: the chunk iterator returned by read_sql can be consumed only once, so
# the table has to be queried again before every new pass over the data.
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

counts2 = [count107(i) for i in sql]  # per-chunk counts
counts2 = pd.concat(counts2).groupby(level=0).sum()  # merge the partial counts
print(counts2)

# Share of each sub-type.
# Name the count column explicitly: the name value_counts gives its result
# differs between pandas versions ('type' before 2.0, 'count' from 2.0 on).
res107 = counts2.rename('num').to_frame()
res107.index.name = '107类型'
res107['比例'] = res107['num'] / res107['num'].sum()
res107.reset_index(inplace=True)
print(res107)

 

 

 统计带"?"的数据

def countquestion(i):
    """Return the ``fullURLId`` column of the rows whose URL contains a '?'.

    Parameters
    ----------
    i : pandas.DataFrame
        Chunk with columns ``fullURL`` and ``fullURLId``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``fullURLId``) restricted to URLs containing a
        literal question mark. Rows with a missing URL are left out.
    """
    # Raw string so the backslash reaches the regex engine unchanged;
    # na=False keeps a missing URL from breaking the boolean mask.
    has_question_mark = i['fullURL'].str.contains(r'\?', na=False)
    return i.loc[has_question_mark, ['fullURLId']].copy()

# Query the table again: the previous chunk iterator is already exhausted.
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)

# Per-chunk counts of page-type ids among URLs containing '?', then merged.
counts3 = [countquestion(i)['fullURLId'].value_counts() for i in sql]
counts3 = pd.concat(counts3).groupby(level=0).sum()
print(counts3)

# Percentage of each page type.
# The column name is fixed and the index name cleared, so the lookups below
# behave the same on every pandas version. From pandas 2.0 on, value_counts
# names its result 'count' and names the index after the column, which would
# make by='fullURLId' ambiguous.
df1 = counts3.rename('fullURLId').rename_axis(None).to_frame()
df1['perc'] = df1['fullURLId'] / df1['fullURLId'].sum() * 100
df1.sort_values(by='fullURLId', ascending=False, inplace=True)
print(df1.round(4))

 

 

 统计具体类型占比

def page199(i):
    """Label category-199 pages whose URL contains '?' by their page title.

    Parameters
    ----------
    i : pandas.DataFrame
        Chunk with string columns ``fullURLId``, ``fullURL`` and ``pageTitle``.

    Returns
    -------
    pandas.DataFrame
        Columns ``fullURL``, ``pageTitle`` (missing titles replaced by '空')
        and ``type`` (the assigned label, '其他' when no rule matches).
    """
    selected = (i['fullURLId'].str.contains('199', na=False)
                & i['fullURL'].str.contains(r'\?', na=False))
    # Work on an explicit copy so the edits below never touch a view of `i`.
    j = i.loc[selected, ['fullURL', 'pageTitle']].copy()
    j['pageTitle'] = j['pageTitle'].fillna('空')
    j['type'] = '其他'  # default label

    # Rules are applied in order; a later match overwrites an earlier one.
    rules = (
        ('法律快车-律师助手', '法律快车-律师助手'),
        ('咨询发布成功', '咨询发布成功'),
        ('免费发布法律咨询', '免费发布法律咨询'),
        ('法律快搜', '快搜'),
        ('法律快车法律经验', '法律快车法律经验'),
        ('法律快车法律咨询', '法律快车法律咨询'),
        # Either '_法律快车' or '-法律快车'.
        ('[_-]法律快车', '法律快车'),
        ('空', '空'),
    )
    for pattern, label in rules:
        j.loc[j['pageTitle'].str.contains(pattern), 'type'] = label

    return j
 
# NOTE: the chunk iterator can be consumed only once, so query the table again.
engine = create_engine('mysql+pymysql://root:@localhost:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read in chunks

# Label every chunk and stack the labelled rows.
counts4 = pd.concat([page199(i) for i in sql])
d1 = counts4['type'].value_counts()
print(d1)
# Rows that no rule matched, printed for manual inspection.
d2 = counts4[counts4['type'] == '其他']
print(d2)

# Percentage of each label.
# The column name is fixed and the index name cleared: from pandas 2.0 on,
# value_counts names its result 'count' and the index 'type', which would
# break (or make ambiguous) the 'type' lookups below.
df1_ = d1.rename('type').rename_axis(None).to_frame()
df1_['perc'] = df1_['type'] / df1_['type'].sum() * 100
df1_.sort_values(by='type', ascending=False, inplace=True)
print(df1_)

 

 

 统计无目的的浏览用户中各个类型占比

def xiaguang(i):
    """Select the rows whose URL does not contain '.html'.

    Parameters
    ----------
    i : pandas.DataFrame
        Chunk with columns ``fullURL``, ``fullURLId`` and ``pageTitle``.

    Returns
    -------
    pandas.DataFrame
        The three columns above for rows whose URL lacks '.html'. Rows with a
        missing URL are left out.
    """
    # na=True treats a missing URL as "contains", so those rows are dropped
    # after negation and the mask stays strictly boolean.
    is_html = i['fullURL'].str.contains(r'\.html', na=True)
    return i.loc[~is_html, ['fullURL', 'fullURLId', 'pageTitle']]

# NOTE: the chunk iterator can be consumed only once, so query the table again.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read in chunks

# Keep only the rows without '.html' in the URL, across all chunks.
counts5 = pd.concat([xiaguang(i) for i in sql])

xg1 = counts5['fullURLId'].value_counts()
print(xg1)

# Percentage per page-type id. Both columns get fixed names after reset_index,
# so the result does not depend on how value_counts names its output.
xg_ = xg1.reset_index()
xg_.columns = ['index', 'num']
xg_['perc'] = xg_['num'] / xg_['num'].sum() * 100
xg_.sort_values(by='num', ascending=False, inplace=True)

# The first three digits of the id are the page category.
xg_['type'] = xg_['index'].str.extract(r'(\d{3})', expand=False)

# Aggregate per category, largest first, and add the percentage.
xgs_ = xg_[['type', 'num']].groupby('type').sum()
xgs_.sort_values(by='num', ascending=False, inplace=True)
xgs_['percentage'] = xgs_['num'] / xgs_['num'].sum() * 100

print(xgs_.round(4))

 

 

 统计用户浏览网页次数的情况

# Analyse how many pages each user (identified by realIP) clicked.
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
sql = pd.read_sql('all_gzdata', engine, chunksize=10000)  # read in chunks

# Occurrences of every IP per chunk, then summed per IP (level=0 is the index).
counts1 = [i['realIP'].value_counts() for i in sql]
counts1 = pd.concat(counts1).groupby(level=0).sum()
print(counts1)

# Number of users for every distinct click count: the values of counts1 are
# click counts, so counting those values gives users per click count.
hit_count = (counts1.value_counts()
             .sort_index()
             .rename('用户数')
             .rename_axis('点击次数')
             .to_frame())

# Users with 1..7 clicks, plus one bucket for more than 7 clicks.
# reindex guarantees one row for each of 1..7 even if a count never occurs.
hit_count_7 = hit_count.reindex(range(1, 8), fill_value=0)
more_than_7 = hit_count.loc[hit_count.index > 7, '用户数'].sum()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
hit_count_7 = pd.concat(
    [hit_count_7, pd.DataFrame({'用户数': [more_than_7]})],
    ignore_index=True,
)
hit_count_7.index = ['1', '2', '3', '4', '5', '6', '7', '7次以上']
hit_count_7['用户比例'] = hit_count_7['用户数'] / hit_count_7['用户数'].sum()
print(hit_count_7)

 

 

 分析浏览次数为一次的用户的行为

# Analyse the behaviour of users who viewed exactly one page.

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/test?charset=utf8')
all_gzdata = pd.read_sql_table('all_gzdata', con=engine)  # whole table

# Number of records per realIP, as a plain frame with columns realIP / count.
real_count = all_gzdata.groupby('realIP').size().rename('count').reset_index()
# Users that appear exactly once.
user_one = real_count.loc[real_count['count'] == 1, ['count', 'realIP']]
user_one.to_csv(r'D:\Python\数据处理/user_one.csv', index=False, encoding='utf-8')
# Join the in-memory frame directly. The previous version read the data back
# from 'user_one1.csv', a file this script never writes.
real_one = pd.merge(user_one, all_gzdata, on='realIP')

# Page types viewed by one-visit users, most frequent first.
URL_count = (real_one['fullURLId'].value_counts()
             .rename('count')
             .rename_axis('fullURLId')
             .to_frame())

# Top four page types plus everything else lumped into one row.
URL_count_4 = URL_count.iloc[:4, :]
# The remainder must come from URL_count itself (it was taken from hit_count).
others = URL_count.iloc[4:, 0].sum()
top_labels = list(URL_count_4.index)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
URL_count_4 = pd.concat(
    [URL_count_4, pd.DataFrame({'count': [others]})],
    ignore_index=True,
)
URL_count_4.index = top_labels + ['其他']
URL_count_4['比例'] = URL_count_4['count'] / URL_count_4['count'].sum()
print(URL_count_4)

 

 统计单用户浏览次数为一次的网页

# Statistics over the records of users who viewed exactly one page.
# NOTE(review): the original referenced `merge_data`, which is not defined in
# any visible section. `real_one` (one-visit users joined with their records)
# is assumed to be the intended frame -- confirm.

# Page-type id statistics.
fullURLId_count = real_one['fullURLId'].value_counts().reset_index()
fullURLId_count.columns = ['fullURLId', 'count']
fullURLId_count['percent'] = fullURLId_count['count'] / fullURLId_count['count'].sum() * 100
print('*****' * 10)
print(fullURLId_count)

# How often each individual page was viewed by one-visit users, most viewed
# first. An earlier groupby-based computation of the same variable was dropped:
# it was overwritten here before ever being used.
fullURL_count = real_one['fullURL'].value_counts().reset_index()
fullURL_count.columns = ['fullURL', 'count']
fullURL_count['percent'] = fullURL_count['count'] / fullURL_count['count'].sum() * 100
print('*****' * 10)
print(fullURL_count)

 

 删除不符合规范的网页

import os
import re
import pandas as pd
import pymysql as pm
from random import sample

# Switch the working directory to the data folder.
# Raw string: "\P" is not a valid escape sequence in a normal string literal.
os.chdir(r"D:\Python\数据处理")

# Read the whole table. PyMySQL >= 1.0 accepts connection parameters as
# keyword arguments only.
con = pm.connect(host='localhost', user='root', password='123456',
                 database='test', charset='utf8')
try:
    data = pd.read_sql('select * from all_gzdata', con=con)
finally:
    con.close()  # always release the connection, even if the query fails

# Rows of category 107 (values are compared as strings).
is_107 = data['fullURLId'].astype(str).str.contains('107')
data_107 = data.loc[is_107, :]

# Within category 107, keep the marriage ('hunyin') pages.
is_hunyin = data_107['fullURL'].astype(str).str.contains('hunyin')
data_hunyin = data_107.loc[is_hunyin, :]

# Keep only the needed fields; copy so the edit below does not write into a
# slice of `data`.
info = data_hunyin.loc[:, ['realIP', 'fullURL']].copy()

# Strip '?' and everything after it from the URL.
info['fullURL'] = info['fullURL'].astype(str).str.replace(r'\?.*', '', regex=True)

# Drop URLs that do not contain '.html'.
has_html = info['fullURL'].str.contains(r'\.html')
info1 = info.loc[has_html, :]
print(info1.head())

 

 构建模型

import pandas as pd
# Build an item-based collaborative-filtering model from the training set.
# NOTE(review): IP_tr, url_tr, data_tr and data_te are not defined in this
# section; presumably they come from an earlier train/test split -- confirm.

# User-item matrix: 1 where the user (realIP) viewed the URL, else 0.
UI_matrix_tr = pd.DataFrame(0, index=IP_tr, columns=url_tr)
for ip, url in zip(data_tr['realIP'], data_tr['fullURL']):
    UI_matrix_tr.loc[ip, url] = 1

# Item similarity: users who viewed both URLs / users who viewed either.
# Computed with matrix operations instead of a Python loop over every URL pair;
# for a 0/1 matrix the result is the same ratio.
co_views = UI_matrix_tr.T.dot(UI_matrix_tr)  # [i, j]: users who viewed both
views = UI_matrix_tr.sum(axis=0)             # users per URL
# views[i] + views[j] - both = users who viewed at least one of the two.
either = co_views.rsub(views, axis=0).add(views, axis=1)
# A URL pair without any viewer yields NaN here rather than raising.
Item_matrix_tr = co_views / either

# An item must not recommend itself: zero the diagonal.
for url in Item_matrix_tr.index:
    Item_matrix_tr.loc[url, url] = 0

# Evaluate on the test set. The first two columns of data_te are read
# positionally as IP and URL, as before.
IP_te = list(set(data_te.iloc[:, 0]))
url_te = list(set(data_te.iloc[:, 1]))

# User-item matrix of the test set.
UI_matrix_te = pd.DataFrame(0, index=IP_te, columns=url_te)
for ip, url in zip(data_te['realIP'], data_te['fullURL']):
    UI_matrix_te.loc[ip, url] = 1

# Result table: one row per test record, 'NaN' text as placeholder.
# dtype=object lets the columns hold strings and booleans side by side.
Res = pd.DataFrame('NaN', index=data_te.index,
                   columns=['IP', '已浏览网址', '推荐网址', 'T/F'],
                   dtype=object)
Res.loc[:, 'IP'] = list(data_te.iloc[:, 0])
Res.loc[:, '已浏览网址'] = list(data_te.iloc[:, 1])

# Recommend, for each viewed URL known to the model, its most similar URL.
known_urls = Item_matrix_tr.index
test_urls = set(url_te)  # set for constant-time membership tests
for i in Res.index:
    viewed = Res.loc[i, '已浏览网址']
    if viewed not in known_urls:
        continue  # URL unseen during training: keep the placeholder
    # idxmax returns the label (the URL); argmax returns an integer position
    # in current pandas and would never match a URL.
    recommended = Item_matrix_tr.loc[viewed, :].idxmax()
    Res.loc[i, '推荐网址'] = recommended
    if recommended in test_urls:
        # Hit if the user actually viewed the recommended URL in the test set.
        Res.loc[i, 'T/F'] = UI_matrix_te.loc[Res.loc[i, 'IP'], recommended] == 1
    else:
        Res.loc[i, 'T/F'] = False

# Save the recommendations.
Res.to_csv(r'D:\Python\数据处理/Res.csv', index=False, encoding='utf8')

 

标签:count,index,电子商务,loc,网站,推荐,pd,sql,fullURL
From: https://www.cnblogs.com/qpy20020914/p/17281752.html

相关文章

  • python电子商务网站用户行为分析
    1importos2importpandasaspd345#修改工作路径到指定文件夹6os.chdir("C:\Users\86184\Desktop\文件集\data")78#第一种连接方式9#fromsqlalchemyimportcreate_engine1011#engine=create_engine('mysql+pymysql://root:123@192.168.31.......
  • 第11章 电子商务网站用户行为分析及服务推荐
    一、背景与挖掘目标  二、分析方法与过程网站智能推荐的主要步骤如下:从系统中获取用户访问网站的原始记录。分析用户访问内容,用户流失及用户分类等。对数据进行预处理,包含数据去重,数据变换,数据分类等过程。以用户访问html后缀的网页为关键条件,对数据进行处理。对比多......
  • 十一章电子商务网站用户行为分析及服务推荐
    第一部分代码一:python访问数据库importpandasaspdfromsqlalchemyimportcreate_engineengine=create_engine('mysql+pymysql://root:102011@localhost/test?charset=utf8')sql=pd.read_sql('all_gzdata',engine,chunksize=10000)''......
  • 第十一章 电子商务网站用户行为分析及服务推荐
    #代码11-1Python访问数据库importosimportpandasaspd#修改工作路径到指定文件夹#os.chdir('./data')#第一种连接方式fromsqlalchemyimportcreate_engineengine=create_engine('mysql+pymysql://root:123456@localhost:3306/test_1?charset=utf8')sql=p......
  • 给大家推荐一个.Net的混淆防反编译工具ConfuserEx
    给大家推荐一个.Net的混淆防反编译工具ConfuserEx。由于项目中要用到.Net的混淆防反编译工具。在网上找了很多.Net混淆或混淆防反编译工具,如.NETReactor、Dotfuscator、Eazfuscator.NET、ConfuserEx。由于是WEB项目,所有使用其中的某些软件混淆DLL后不能使用,或使用的局限性较......
  • 推荐系统
    importosimportpandasaspdimportpymysqlaspmos.chdir("G:\data\data")con=pm.connect(host='localhost',user='root',password='123456',database='gzdata',charset='utf8')data=pd.read_sql(......
  • 第十一章——电子商务网站用户行为分析及服务推荐
    一、python访问数据库importpandasaspdfromsqlalchemyimportcreate_engineengine=create_engine('mysql+pymysql://root:102011@localhost/test?charset=utf8')sql=pd.read_sql('all_gzdata',engine,chunksize=10000)'''用c......
  • 第十一章 电子商务网站用户行为分析及服务推荐
    前情提要:由于不可控因素,在导入数据时部分出错,导致数据可能缺失,运行结果有所偏差。#-*-coding:utf-8-*-#代码11-1Python访问数据库importosimportpandasaspd#修改工作路径到指定文件夹os.chdir("E:\\anaconda3\\jupyterFile\\数据分析")#第一种连接方式#......
  • 传统网站以及前后端分离网站的开发及访问
    传统的网站的开发及访问:   改进的网站的开发及访问:    现在的网站的开发及访问:前后端分离----微服务       ......
  • 外贸官方网站优化的核心要点与技巧
    作为一个从事外贸行业多年的业内人士,我深知优化官方网站在提升业务竞争力和吸引国际客户方面的重要性。在这篇文章中,我将分享一些关于外贸官方网站谷歌SEO优化的核心要点与技巧,帮助您更好地理解如何在Google搜索引擎中取得优势。首先,链接建设是谷歌SEO优化的关键一环。GPB外链作为......