首页 > 编程语言 >python 数据分析

python 数据分析

时间:2023-03-13 23:14:56 | 浏览次数:45
标签:数据分析 plt center python cluster airline pd data

import matplotlib.pyplot as plt
import pandas as pd
# Input dataset and the destination of the per-column summary table.
datafile = 'air_data.csv'
resultfile = 'explore.csv'

data = pd.read_csv(datafile, encoding='utf-8')

# describe() over every column (include='all'), transposed so that each
# source column becomes one row of the summary.
column_stats = data.describe(percentiles=[], include='all').T

# Missing-value count per column: total rows minus the non-null count.
column_stats['null'] = len(data) - column_stats['count']

# Keep only the three statistics that are exported, then apply the Chinese
# headers used in the output file.
explore = column_stats[['null', 'max', 'min']]
explore.columns = [u'空数值', u'最大值', u'最小值']

explore.to_csv(resultfile)

from datetime import datetime

# Parse every FFP_DATE string (year/month/day separated by slashes) and keep
# only the year component for the histogram below.
ffp = data['FFP_DATE'].map(lambda text: datetime.strptime(text, '%Y/%m/%d'))
ffp_year = ffp.apply(lambda stamp: stamp.year)

# Select the SimHei font for the CJK labels (assumes it is installed) and
# render minus signs with the ASCII hyphen.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

# Histogram of the number of members per enrollment year.
fig = plt.figure(figsize=(8, 5))
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.title('各年份会员人数')
plt.xlabel('年份')
plt.ylabel('入会人数')

plt.show()
plt.close()

# Count members per gender. The counts are computed once with
# Series.value_counts(); the top-level pd.value_counts() helper used
# previously is deprecated in recent pandas releases.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

# Pie chart of the gender split, each wedge annotated with its percentage.
fig = plt.figure(figsize=(7, 4))  # canvas size
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',
)
plt.title('会员性别比例')
plt.show()
plt.close()

# Count members per membership tier. The counts are computed once with
# Series.value_counts(); the top-level pd.value_counts() helper used
# previously is deprecated in recent pandas releases.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

# Bar chart of the member count for tiers 4, 5 and 6.
fig = plt.figure(figsize=(8, 5))  # canvas size
# plt.bar arguments used here:
#   first positional - x positions of the bars
#   second positional - bar heights, i.e. the values being displayed
#   width - bar width
#   alpha - transparency
#   color - fill colour of the bars
plt.bar(range(3), [lv_four, lv_five, lv_six], width=0.4, alpha=0.8, color='skyblue')
plt.xticks(list(range(3)), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数')
plt.show()
plt.close()

# Member ages with missing entries removed, stored as 64-bit integers.
age = data['AGE'].dropna().astype('int64')

# Box plot of the age distribution; patch_artist=True is what allows the
# box to be filled with a colour.
fig = plt.figure(figsize=(5, 10))
box_fill = {'facecolor': 'lightblue'}
plt.boxplot(age, patch_artist=True, labels=['会员年龄'], boxprops=box_fill)
plt.title('会员年龄分布箱型图')
# Horizontal grid lines along the y axis.
plt.grid(axis='y')
plt.show()
plt.close()

# 3. Correlation analysis
# Collect the attributes of interest into one frame.
data_corr = data.loc[:, ['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END', 'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']]
# Missing ages are left as NaN instead of being replaced by 0: a filled-in
# age of 0 would enter the correlation as a real value and distort every
# AGE coefficient, whereas DataFrame.corr() skips NaN entries pairwise.
data_corr['AGE'] = data['AGE']
data_corr['ffp_year'] = ffp_year

# Pearson correlation matrix of the selected attributes.
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

import seaborn as sns
# Heat map of the correlation matrix.
plt.subplots(figsize=(10, 10))  # canvas size
# annot: write the coefficient in each cell; vmax: upper bound of the colour
# scale; square: draw square cells; cmap: colour map.
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.show()
plt.close()


# import numpy as np
# from sklearn.cluster import KMeans
# data["LOAD_TIME"] = pd.to_datetime(data["LOAD_TIME"])
# data["FFP_DATE"] = pd.to_datetime(data["FFP_DATE"])
# data["入会时间"] = data["LOAD_TIME"] - data["FFP_DATE"]
# data["平均每公里票价"] = (data["SUM_YR_1"] + data["SUM_YR_2"]) / data["SEG_KM_SUM"]
# data["时间间隔差值"] = data["MAX_INTERVAL"] - data["AVG_INTERVAL"]
#
# deal_data = data.rename(
# columns = {"FLIGHT_COUNT" : "飞行次数", "SEG_KM_SUM" : "总里程", "avg_discount" : "平均折扣率"},
# inplace = False
# )
# filter_data = deal_data[["入会时间", "飞行次数", "平均每公里票价", "总里程", "时间间隔差值", "平均折扣率"]]
# print(filter_data[0:5])
# filter_data['入会时间'] = filter_data['入会时间'].astype(np.int64)/(60*60*24*10**9)
#
# filter_zscore_data = (filter_data -filter_data.mean(axis=0))/(filter_data.std(axis=0))
# filter_zscore_data[0:5]
#
# kmodel = KMeans(n_clusters=4, n_jobs=4)
# kmodel.fit(filter_zscore_data)
#
# # 简单打印结果
# r1 = pd.Series(kmodel.labels_).value_counts() # 统计各个类别的数目
# r2 = pd.DataFrame(kmodel.cluster_centers_) # 找出聚类中心
#
# # 所有簇中心坐标值中最大值和最小值
# max = r2.values.max()
# min = r2.values.min()
# r = pd.concat([r2, r1], axis=1) # 横向连接(0是纵向),得到聚类中心对应的类别下的数目
# r.columns = list(filter_zscore_data.columns) + [u'类别数目'] # 重命名表头
#
# # 绘图
# fig = plt.figure(figsize=(10, 8))
# ax = fig.add_subplot(111, polar=True)
# center_num = r.values
# feature = ["入会时间", "飞行次数", "平均每公里票价", "总里程", "时间间隔差值", "平均折扣率"]
# N = len(feature)
#
# for i, v in enumerate(center_num):
# # 设置雷达图的角度,用于平分切开一个圆面
# angles = np.linspace(0, 2 * np.pi, N, endpoint=False)
# # 为了使雷达图一圈封闭起来,需要下面的步骤
# center = np.concatenate((v[:-1], [v[0]]))
# angles = np.concatenate((angles, [angles[0]]))
# # 绘制折线图
# ax.plot(angles, center, 'o-', linewidth=2, label="第%d簇人群,%d人" % (i + 1, v[-1]))
# # 填充颜色
# ax.fill(angles, center, alpha=0.25)
# # 添加每个特征的标签
# ax.set_thetagrids(angles * 180 / np.pi, feature, fontsize=15)
# # 设置雷达图的范围
# ax.set_ylim(min - 0.1, max + 0.1)
# # 添加标题
# plt.title('客户群特征分析图', fontsize=20)
# # 添加网格线
# ax.grid(True)
# # 设置图例
# plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0), ncol=1, fancybox=True, shadow=True)
# # 显示图形
# plt.show()
#

import numpy as np
import pandas as pd

# Raw input and the destination of the cleaned records.
datafile = 'air_data.csv'
cleanedfile = 'data_cleaned.csv'

airline_data = pd.read_csv(datafile, encoding='utf-8')
print('原始数据的形状为:', airline_data.shape)

# Keep only rows where both SUM_YR_1 and SUM_YR_2 are present.
yr1_present = airline_data['SUM_YR_1'].notnull()
yr2_present = airline_data['SUM_YR_2'].notnull()
airline_notnull = airline_data.loc[yr1_present & yr2_present, :]
print('删除缺失记录后数据的形状为:', airline_notnull.shape)

# Row filters: at least one of the two SUM_YR columns is non-zero,
# SEG_KM_SUM is positive with a non-zero avg_discount, and AGE does not
# exceed 100.
yr1_nonzero = airline_notnull['SUM_YR_1'] != 0
yr2_nonzero = airline_notnull['SUM_YR_2'] != 0
km_and_discount_ok = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
age_over_100 = airline_notnull['AGE'] > 100

keep_rows = (yr1_nonzero | yr2_nonzero) & km_and_discount_ok & ~age_over_100
airline = airline_notnull[keep_rows]
print('数据清洗后数据的形状为:', airline.shape)

airline.to_csv(cleanedfile)

import pandas as pd
import numpy as np

# Load the cleaned records and keep the six columns the features are
# built from.
cleanedfile = 'data_cleaned.csv'
airline = pd.read_csv(cleanedfile, encoding='utf-8')
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
print('筛选的属性前5行为:\n', airline_selection.head())

# L: time between FFP_DATE and LOAD_TIME, expressed in 30-day units.
# The whole-day count is read from the timedelta accessor instead of being
# parsed out of the timedelta's string form.
L = pd.to_datetime(airline_selection['LOAD_TIME']) - pd.to_datetime(airline_selection['FFP_DATE'])
L = L.dt.days / 30
# Give the series a string name: left unnamed, concat would label the
# column with the integer 0, and a frame mixing integer and string column
# names is rejected by recent scikit-learn estimators.
L = L.rename('L')

# Feature table: L followed by the remaining four selected columns.
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
print('构建的LRFMC属性前5行为:\n', airline_features.head())

# Standardise every feature and persist the scaled matrix for the
# clustering step ('arr_0' inside the .npz archive).
from sklearn.preprocessing import StandardScaler
data = StandardScaler().fit_transform(airline_features)
np.savez('airline_scale.npz', data)
print('标准化后LRFMC 5个属性为:\n', data[:5, :])

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Load the standardised feature matrix saved by the preprocessing step.
airline_scale = np.load('airline_scale.npz')['arr_0']
k = 5  # number of clusters

# Fit K-Means with a fixed seed.
kmeans_model = KMeans(n_clusters=k, random_state=123)
fit_kmeans = kmeans_model.fit(airline_scale)

kmeans_cc = kmeans_model.cluster_centers_
print('各类聚类中心为:\n', kmeans_cc)
kmeans_labels = kmeans_model.labels_
print('各样本的类别标签为:\n', kmeans_labels)
# Number of samples assigned to each cluster label.
r1 = pd.Series(kmeans_model.labels_).value_counts()
print('最终每个类别的数目为:\n', r1)

# Row i of cluster_centers_ is the centre of the cluster labelled i, so the
# default 0..k-1 index already identifies each centre correctly. Indexing by
# the order in which labels first appear in labels_ would attach the wrong
# label to each centre.
cluster_center = pd.DataFrame(kmeans_model.cluster_centers_, columns=['ZL', 'ZR', 'ZF', 'ZM', 'ZC'])
print(cluster_center)


import matplotlib.pyplot as plt
# Radar chart of the cluster centres: one closed polygon per cluster, one
# spoke per feature.
labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
legen = ['客户群' + str(i + 1) for i in cluster_center.index]
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
kinds = list(cluster_center.iloc[:, 0])
n = len(labels)

# Close each polygon by repeating the first feature value at the end. The
# closed copy is built from the first n columns and kept in `centers`, so
# `cluster_center` itself is not modified and this code can be re-run.
feature_values = np.array(cluster_center.iloc[:, :n])
centers = np.concatenate((feature_values, feature_values[:, :1]), axis=1)

# One angle per feature, with the first angle repeated to close the polygon.
angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((angle, [angle[0]]))

# Select the SimHei font for the CJK text (assumes it is installed) and
# render minus signs with the ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)

for center, style, kind in zip(centers, lstype, kinds):
    ax.plot(angle, center, linestyle=style, linewidth=2, label=kind)
# Label each spoke with its feature name; the duplicated closing angle is
# dropped so that the number of ticks matches the number of labels.
ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
plt.title('客户特征分析雷达图3116')
plt.legend(legen)
plt.show()
plt.close()

import matplotlib.pyplot as plt
# Radar chart of the cluster centres: one closed polygon per cluster, one
# spoke per feature.
labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
legen = ['客户群' + str(i + 1) for i in cluster_center.index]
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
kinds = list(cluster_center.iloc[:, 0])
n = len(labels)

# Close each polygon by repeating the first feature value at the end. Only
# the first n columns of `cluster_center` are read, so the result has n + 1
# values per row even if the frame already carries extra trailing columns;
# appending cluster_center[['ZL']] again would select every column named
# 'ZL' and leave the rows longer than `angle`.
feature_values = np.array(cluster_center.iloc[:, :n])
centers = np.concatenate((feature_values, feature_values[:, :1]), axis=1)

# One angle per feature, with the first angle repeated to close the polygon.
angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((angle, [angle[0]]))

# Select the SimHei font for the CJK text (assumes it is installed) and
# render minus signs with the ASCII hyphen.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)

for center, style, kind in zip(centers, lstype, kinds):
    ax.plot(angle, center, linestyle=style, linewidth=2, label=kind)
# Label each spoke with its feature name; the duplicated closing angle is
# dropped so that the number of ticks matches the number of labels.
ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
plt.title('客户特征分析雷达图3116')
plt.legend(legen)
plt.show()
plt.close()

 

 

 

标签:数据分析,plt,center,python,cluster,airline,pd,data
From: https://www.cnblogs.com/quanshi/p/17213302.html

相关文章