import numpy as np
import pandas as pd
inputfile="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\GoodsOrder.csv"
data=pd.read_csv(inputfile,encoding='gbk')#读取数据
data.info()#查看数据属性
data=data['id']
description=[data.count(),data.min(),data.max()]#依次计算总数,最小值,最大值
description=pd.DataFrame(description,index=['Count','Min','Max']).T
print('描述性统计结果:\n',np.round(description))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 43367 entries, 0 to 43366 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 43367 non-null int64 1 Goods 43367 non-null object dtypes: int64(1), object(1) memory usage: 677.7+ KB 描述性统计结果: Count Min Max 0 43367 1 9835
#销量排行前十的商品销量及其占比
import pandas as pd
inputfile="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\GoodsOrder.csv"
data=pd.read_csv(inputfile,encoding='gbk')
group=data.groupby(['Goods']).count().reset_index()
sorted=group.sort_values('id',ascending=False)
print('销量排行前十商品的销量:\n',sorted[:10])
#画条形图展示销量排行前10的商品销量
import matplotlib.pyplot as plt
x=sorted[:10]['Goods']
y=sorted[:10]['id']
plt.figure(figsize=(8,4))
plt.barh(x,y)
plt.rcParams['font.sans-serif']='SimHei'
plt.xlabel('销量')
plt.ylabel('商品类别')
plt.title('3137商品的销量top10')
plt.savefig("C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\top10.png")
plt.show()
#销量排行前10的商品销量占比
data_nums=data.shape[0]
for idnex, row in sorted[:10].iterrows():
print(row['Goods'],row['id'],row['id']/data_nums)
import pandas as pd
inputfile1="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\GoodsOrder.csv"
inputfile2="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\GoodsTypes.csv"
data=pd.read_csv(inputfile1,encoding='gbk')
types=pd.read_csv(inputfile2,encoding='gbk')
group=data.groupby(['Goods']).count().reset_index()
sort=group.sort_values('id',ascending=False).reset_index()
data_nums=data.shape[0]
del sort['index']
sort_links=pd.merge(sort,types)
#根据类别求和,每个商品类别的总量,排序
sort_link=sort_links.groupby(['Types']).sum().reset_index()
sort_link=sort_link.sort_values('id',ascending=False).reset_index()
del sort_link['index']#删除index列
#求百分比,然后更换列名,最后输出文件
sort_link['count']=sort_link.apply(lambda line: line['id']/data_nums,axis=1)
sort_link.rename(columns={'count':'percent'},inplace=True)
print('各类别商品的销量及其占比:\n',sort_link)
outfile1="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\percent.csv"
sort_link.to_csv(outfile1,index=False,header=True,encoding='gbk')
#画饼图展示每类商品的销量占比
import matplotlib.pyplot as plt
data=sort_link['percent']
labels=sort_link['Types']
plt.figure(figsize=(8,6))
plt.pie(data,labels=labels,autopct='%1.2f%%')
plt.rcParams['font.sans-serif']='SimHei'
plt.title('3137每类商品销量占比')
plt.savefig("C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\persent.png")
plt.show()
#先筛选非酒精饮料的商品,然后求百分比,然后输出结果到文件
selected=sort_links.loc[sort_links['Types']=='非酒精饮料']
child_nums=selected['id'].sum()
selected['child_percent']=selected.apply(lambda line: line['id']/child_nums,axis=1)
selected.rename(columns={'id':'count'},inplace=True)
print('非酒精饮料内部商品的销量及其占比:\n',selected)
outfile2="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\percent.csv"
sort_link.to_csv(outfile2,index=False,header=True,encoding='gbk')
#画饼图展示非酒精饮品内部商品的销量占比
import matplotlib.pyplot as plt
data=selected['child_percent']
labels=selected['Goods']
plt.figure(figsize=(8,6))
explode=(0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.08,0.3,0.1,0.3)
plt.pie(data,explode=explode,labels=labels,autopct='%1.2f%%',pctdistance=1.1,labeldistance=1.2)
plt.rcParams['font.sans-serif']='SimHei'
plt.title("3137非酒精饮料内部个商品的销量占比")
plt.axis('equal')
plt.savefig("C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\child_percent.png")
plt.show()
import pandas as pd
inputfile="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\GoodsOrder.csv"
data=pd.read_csv(inputfile,encoding='gbk')
#根据id对'Goods'列合并,并使用","将各商品隔开
data['Goods']=data['Goods'].apply(lambda x:','+x)
data=data.groupby('id').sum().reset_index()
#对合并的商品列转换数据格式
data['Goods']=data['Goods'].apply(lambda x:[x[1:]])
data_list=list(data['Goods'])
#分割商品名为每个元素
data_translation=[]
for i in data_list:
p=i[0].split(',')
data_translation.append(p)
print('数据转换结果的前五个元素:\n',data_translation[0:5])
数据转换结果的前五个元素: [['柑橘类水果', '人造黄油', '即食汤', '半成品面包'], ['咖啡', '热带水果', '酸奶'], ['全脂牛奶'], ['奶油乳酪', '肉泥', '仁果类水果', '酸奶'], ['炼乳', '长面包', '其他蔬菜', '全脂牛奶']]
from numpy import *
def loadDataSet():
return [['a','c','e'],['b','d'],['b','c'],['a','b','c','d'],['a','b'],['b','c'],['a','b'],['a','b','c','e'],['a','b','c'],['a','c','e']]
def createC1(dataSet):
C1=[]
for transaction in dataSet:
for item in transaction:
if not [item] in C1:
C1.append([item])
C1.sort()
#映射为frozenset唯一性的,可使用其构造字典
return list(map(frozenset,C1))
#从候选K项集到频繁K项集(支持度计算)
def scanD(D,CK,minSupport):
ssCnt={}
for tid in D: #遍历数据集
for can in CK: #遍历候选项
if can.issubset(tid): #判断候选项中是否含数据集的各项
if not can in ssCnt:
ssCnt[can]=1 #不含设为1
else:
ssCnt[can]+=1 #有则计数加1
numItems=float(len(D)) #数据集大小
retList=[] #L1初始化
supportData={} #记录候选项中各个数据的支持度
for key in ssCnt:
support=ssCnt[key]/numItems #计算支持度
if support>=minSupport:
retList.insert(0,key) #满足条件加入L1中
supportData[key]=support
return retList, supportData
def calSupport(D,CK,min_support):
dict_sup={}
for i in D: #遍历数据集
for j in CK: #遍历候选项
if j.issubset(i): #判断候选项中是否含数据集的各项
if not j in dict_sup:
dict_sup[j]=1 #不含设为1
else:
dict_sup[j]+=1 #有则计数加1
sumCount=float(len(D))
supportData={}
relist=[]
for i in dict_sup:
temp_sup=dict_sup[i]/sumCount
if temp_sup>=min_support:
relist.append(i)
#此处可设置返回全部的支持度数据
supportData[i]=temp_sup
return relist,supportData
#改进剪枝算法
def aprioriGen(Lk,k):
retList=[]
lenLk=len(Lk)
for i in range(lenLk):
for j in range(i+1,lenLk): #两两组合遍历
L1=list(Lk[i])[:k-2]
L2=list(Lk[j])[:k-2]
L1.sort()
L2.sort()
if L1==L2: #前k-1项相等,则可相乘,这样可防止重复项出现
#进行剪枝(a1为k项集中的一个元素,b为它的所有k-1项子集)
a=Lk[i]|Lk[j] #a为frozenset()集合
a1=list(a)
b=[]
#遍历取出每一个元素,转换为set,依次从a1中剔除该元素,并加入到b中
for q in range(len(a1)):
t=[a1[q]]
tt=frozenset(set(a1)-set(t))
b.append(tt)
t=0
for w in b:
#当b(即所有k-1项子集)都是lk(频繁的)子集,则保留,否则剔除
if w in Lk:
t+=1
if t==len(b):
retList.append(b[0]|b[1])
return retList
def apriori(dataSet,minSupport=0.2):
#前三条语句是对计算查找单个元素中的频繁项集
C1=createC1(dataSet)
D=list(map(set,dataSet)) #使用List()转换为列表
L1,supportData=calSupport(D,C1,minSupport)
L=[L1] #加列表框,是的1项集为一个单独元素
k=2
while (len(L[k-2])>0): #是否还有候选集
CK=aprioriGen(L[k-2],k)
Lk,supK=scanD(D,CK,minSupport) #scan DB to get Lk
supportData.update(supK) #把supk的键值对添加到supportData里
L.append(Lk) #L最后一个值为空集
k+=1
del L[-1] #删除最后一个空集
return L,supportData #L为频繁项集,为一个列表,1,2,3项集分别为一个元素
#生成集合的所有子集
def getSubset(fromList,toList):
for i in range(len(fromList)):
t=[fromList[i]]
tt=frozenset(set(fromList)-set(t))
if not tt in toList:
toList.append(tt)
tt=list(tt)
if len(tt)>1:
getSubset(tt,toList)
def calcConf(freqSet,H,supportData,ruleList,minConf=0.7):
for conseq in H: #遍历H中的所有项集并计算他们的可信度集
conf=supportData[freqSet]/supportData[freqSet-conseq] #可信度计算,结合支持度数据
#提升都lift计算lift=p(a&b)/p(a)*p(b)
lift=supportData[freqSet]/(supportData[conseq]*supportData[freqSet-conseq])
if conf>=minConf and lift>1:
print(freqSet-conseq,'-->',conseq,'支持度',round(supportData[freqSet],6),'置信度:',round(conf,6),'lift值为:',round(lift,6))
ruleList.append((freqSet-conseq,conseq,conf))
#生成规则
def gen_rule(L,supportData,minConf=0.7):
bigRuleList=[]
for i in range(1,len(L)): #从二项集开始计算
for freqSet in L[i]: #freqSet为所有的k项集
#求该三项集的所有非空子集,1项集,2项集,直到k-1项集,用H1表示,为List类型,里面为frozenset类型
H1=list(freqSet)
all_subset=[]
getSubset(H1,all_subset) #生成所有的子集
calcConf(freqSet,all_subset,supportData,bigRuleList,minConf)
return bigRuleList
if __name__=='__main__':
dataSet=data_translation
L,supportData=apriori(dataSet,minSupport=0.02)
rule=gen_rule(L,supportData,minConf=0.35)
frozenset({'水果/蔬菜汁'}) --> frozenset({'全脂牛奶'}) 支持度 0.02664 置信度: 0.368495 lift值为: 1.44216 frozenset({'人造黄油'}) --> frozenset({'全脂牛奶'}) 支持度 0.024199 置信度: 0.413194 lift值为: 1.617098 frozenset({'牛肉'}) --> frozenset({'全脂牛奶'}) 支持度 0.021251 置信度: 0.405039 lift值为: 1.58518 frozenset({'冷冻蔬菜'}) --> frozenset({'全脂牛奶'}) 支持度 0.020437 置信度: 0.424947 lift值为: 1.663094 frozenset({'黄油'}) --> frozenset({'其他蔬菜'}) 支持度 0.020031 置信度: 0.361468 lift值为: 1.868122 frozenset({'本地蛋类'}) --> frozenset({'全脂牛奶'}) 支持度 0.029995 置信度: 0.472756 lift值为: 1.850203 frozenset({'黑面包'}) --> frozenset({'全脂牛奶'}) 支持度 0.025216 置信度: 0.388715 lift值为: 1.521293 frozenset({'糕点'}) --> frozenset({'全脂牛奶'}) 支持度 0.033249 置信度: 0.373714 lift值为: 1.462587 frozenset({'酸奶油'}) --> frozenset({'其他蔬菜'}) 支持度 0.028876 置信度: 0.402837 lift值为: 2.081924 frozenset({'猪肉'}) --> frozenset({'其他蔬菜'}) 支持度 0.021657 置信度: 0.375661 lift值为: 1.941476 frozenset({'酸奶油'}) --> frozenset({'全脂牛奶'}) 支持度 0.032232 置信度: 0.449645 lift值为: 1.759754 frozenset({'猪肉'}) --> frozenset({'全脂牛奶'}) 支持度 0.022166 置信度: 0.38448 lift值为: 1.504719 frozenset({'凝乳'}) --> frozenset({'全脂牛奶'}) 支持度 0.026131 置信度: 0.490458 lift值为: 1.919481 frozenset({'热带水果'}) --> frozenset({'全脂牛奶'}) 支持度 0.042298 置信度: 0.403101 lift值为: 1.577595 frozenset({'柑橘类水果'}) --> frozenset({'全脂牛奶'}) 支持度 0.030503 置信度: 0.36855 lift值为: 1.442377 frozenset({'黄油'}) --> frozenset({'全脂牛奶'}) 支持度 0.027555 置信度: 0.497248 lift值为: 1.946053 frozenset({'酸奶'}) --> frozenset({'全脂牛奶'}) 支持度 0.056024 置信度: 0.401603 lift值为: 1.571735 frozenset({'其他蔬菜'}) --> frozenset({'全脂牛奶'}) 支持度 0.074835 置信度: 0.386758 lift值为: 1.513634
#先筛选非酒精饮料的商品,然后求百分比,然后输出结果到文件
selected=sort_links.loc[sort_links['Types']=='西点']
child_nums=selected['id'].sum()
selected['child_percent']=selected.apply(lambda line: line['id']/child_nums,axis=1)
selected.rename(columns={'id':'count'},inplace=True)
print('西点内部商品的销量及其占比:\n',selected)
outfile2="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\percent.csv"
sort_link.to_csv(outfile2,index=False,header=True,encoding='gbk')
#画饼图展示非酒精饮品内部商品的销量占比
import matplotlib.pyplot as plt
data=selected['child_percent']
labels=selected['Goods']
plt.figure(figsize=(8,6))
explode=(0.001808,0.006952, 0.006952,0.009455,0.011680,0.012375, 0.018076,0.023359, 0.024194,0.033509,0.033509,0.038237,0.050751,0.051168,0.051724, 0.052558,0.054227,0.057564,0.088710,0.121663,0.251529)
plt.pie(data,explode=explode,labels=labels,autopct='%1.2f%%',pctdistance=1.1,labeldistance=1.2)
plt.axis('equal')
plt.rcParams['font.sans-serif']='SimHei'
plt.title("3137西点内部个商品的销量占比")
plt.axis('equal')
plt.savefig("C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\child_percent.png")
plt.show()
import pandas as p
import numpy as np
from numpy import *
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# 从硬盘读取数据进入内存
inputfile="C:\\Users\\ASUS\\Documents\\WeChat Files\\wxid_ivbyuelp335q22\\FileStorage\\File\\2023-03\\percent.csv"
percent = pd.read_csv(inputfile,encoding='gbk')
percent.head()
# 无量纲化
def dimensionlessProcessing(df):
newDataFrame = pd.DataFrame(index=df.index)
columns = df.columns.tolist()
for c in columns:
d = df[c]
MAX = d.max()
MIN = d.min()
MEAN = d.mean()
newDataFrame[c] = ((d - MEAN) / (MAX - MIN)).tolist()
return newDataFrame
def GRA_ONE(gray, m=0):
# 读取为df格式
gray = dimensionlessProcessing(gray)
# 标准化
std = gray.iloc[:, m] # 为标准要素
gray.drop(str(m),axis=1,inplace=True)
ce = gray.iloc[:, 0:] # 为比较要素
shape_n, shape_m = ce.shape[0], ce.shape[1] # 计算行列
# 与标准要素比较,相减
a = zeros([shape_m, shape_n])
for i in range(shape_m):
for j in range(shape_n):
a[i, j] = abs(ce.iloc[j, i] - std[j])
# 取出矩阵中最大值与最小值
c, d = amax(a), amin(a)
# 计算值
result = zeros([shape_m, shape_n])
for i in range(shape_m):
for j in range(shape_n):
result[i, j] = (d + 0.5 * c) / (a[i, j] + 0.5 * c)
# 求均值,得到灰色关联值,并返回
result_list = [mean(result[i, :]) for i in range(shape_m)]
result_list.insert(m,1)
return pd.DataFrame(result_list)
def GRA(DataFrame):
df = DataFrame.copy()
list_columns = [
str(s) for s in range(len(df.columns)) if s not in [None]
]
df_local = pd.DataFrame(columns=list_columns)
df.columns=list_columns
for i in range(len(df.columns)):
df_local.iloc[:, i] = GRA_ONE(df, m=i)[0]
return df_local