# Data exploration: per-column null count, maximum and minimum, exported to CSV.
import pandas as pd

datafile = 'air_data.csv'    # raw airline data; the first row holds the column labels
resultfile = 'explore.csv'   # output table with the exploration results

data = pd.read_csv(datafile, encoding='utf-8')

# describe() produces the basic statistics; `percentiles` selects which
# quantiles are reported (none are requested here beyond the default median).
summary = data.describe(percentiles=[], include='all')
explore = summary.T  # transposed so that every original column becomes one row

# describe() only reports the non-null count, so the null count is derived
# from the total number of rows.
explore = explore.assign(null=len(data) - explore['count'])

explore = explore.loc[:, ['null', 'max', 'min']]
explore.columns = ['null', 'max', 'min']  # header names of the exported table
explore.to_csv(resultfile)  # export the result
# Exploratory plots for the basic customer information: enrolment year,
# gender split, membership tier and age distribution.
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

datafile = 'air_data.csv'  # raw airline data; the first row holds the column labels

# Read the raw data as UTF-8 (convert the file with a text editor first if
# it is stored in another encoding).
data = pd.read_csv(datafile, encoding='utf-8')

explore = data.describe(percentiles=[], include='all').T  # transposed for readability
print(explore)

# Parse the enrolment date and keep only its year.
ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
ffp_year = ffp.map(lambda x: x.year)

# Histogram of the number of members enrolled per year.
fig = plt.figure(figsize=(8, 5))
plt.rcParams['font.sans-serif'] = 'SimHei'   # font able to render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数3145')
plt.show()
plt.close()  # the original referenced plt.close without calling it

# Number of members per gender; the counts are computed once and reused.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

# Pie chart of the gender split.
fig = plt.figure(figsize=(7, 4))
plt.pie([male, female], labels=['男', '女'],
        colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('会员性别比例3145')
plt.show()
plt.close()

# Number of members per membership tier (tiers 4, 5 and 6).
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

# Bar chart of the member count per tier.
fig = plt.figure(figsize=(8, 5))
plt.bar(x=range(3), height=[lv_four, lv_five, lv_six],
        width=0.4, alpha=0.8, color='skyblue')
plt.xticks(list(range(3)), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数3145')
plt.show()
plt.close()

# Member ages: rows without an age are dropped before the integer cast.
age = data['AGE'].dropna()
age = age.astype('int64')

# Box plot of the age distribution.
fig = plt.figure(figsize=(5, 10))
plt.boxplot(age,
            patch_artist=True,
            labels=['会员年龄'],                    # x-axis tick label
            boxprops={'facecolor': 'lightblue'})   # fill colour of the box
plt.title('会员年龄分布箱线图3145')
plt.grid(axis='y')  # horizontal grid lines
plt.show()
plt.close()
# Flight-information columns: one box plot per column.
lte = data['LAST_TO_END']
fc = data['FLIGHT_COUNT']
sks = data['SEG_KM_SUM']

# (series, x tick label, title, figure size) for each box plot, in the
# order in which the figures are shown.
_flight_boxplots = (
    (lte, '时长', '会员最后乘机至结束时长分布箱线图3145', (5, 8)),
    (fc, '飞行次数', '会员飞行次数分布箱线图3145', (5, 8)),
    (sks, '总飞行公里数', '客户总飞行公里数箱线图3145', (5, 10)),
)

for _series, _tick_label, _title, _figsize in _flight_boxplots:
    fig = plt.figure(figsize=_figsize)
    plt.boxplot(_series,
                patch_artist=True,
                labels=[_tick_label],                  # x-axis tick label
                boxprops={'facecolor': 'lightblue'})   # fill colour of the box
    plt.title(_title)
    plt.grid(axis='y')  # horizontal grid lines
    plt.show()
    plt.close()  # the original referenced plt.close without calling it
# Points-information columns.

# Number of point redemptions per member.
ec = data['EXCHANGE_COUNT']

# Histogram of the redemption counts.
fig = plt.figure(figsize=(8, 5))
plt.hist(ec, bins=5, color='#0504aa')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')
plt.title('会员兑换积分次数分布直方图3145')
plt.show()
plt.close()  # the original referenced plt.close without calling it

# Total accumulated points per member.
ps = data['Points_Sum']

# Box plot of the total accumulated points.
fig = plt.figure(figsize=(5, 8))
plt.boxplot(ps,
            patch_artist=True,
            labels=['总累计积分'],                  # x-axis tick label
            boxprops={'facecolor': 'lightblue'})   # fill colour of the box
plt.title('客户总累计积分箱线图3145')
plt.grid(axis='y')  # horizontal grid lines
plt.show()
plt.close()
# Pearson correlation between selected attributes, shown as a heat map.
import seaborn as sns

# Select the attributes of interest. The explicit copy makes data_corr an
# independent frame, so the column assignments below do not write into a
# slice of `data` (which triggers SettingWithCopyWarning).
data_corr = data[['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END',
                  'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']].copy()
age1 = data['AGE'].fillna(0)  # missing ages are replaced by 0 before the cast
data_corr['AGE'] = age1.astype('int64')
data_corr['ffp_year'] = ffp_year

# Correlation matrix.
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

# Heat map of the correlation matrix.
plt.subplots(figsize=(10, 10))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.title('相关性热力图3145')
plt.show()
plt.close()  # the original referenced plt.close without calling it
# Data cleaning: drop rows with missing fares, then filter on fare, distance,
# discount and age, and save the remaining rows.
import numpy as np
import pandas as pd

datafile = 'air_data.csv'          # path of the raw airline data
cleanedfile = 'data_cleaned.csv'   # path where the cleaned data is saved

# Load the data.
airline_data = pd.read_csv(datafile, encoding='utf-8')
print('原始数据的形状为:', airline_data.shape)

# Keep only the rows where both yearly fare columns are present.
has_both_fares = airline_data['SUM_YR_1'].notnull() & airline_data['SUM_YR_2'].notnull()
airline_notnull = airline_data.loc[has_both_fares, :]
print('删除缺失记录后数据的形状为:', airline_notnull.shape)

# A row is kept when at least one yearly fare is non-zero AND the total
# distance is positive with a non-zero average discount AND the age is not
# above 100.
index1 = airline_notnull['SUM_YR_1'].ne(0)
index2 = airline_notnull['SUM_YR_2'].ne(0)
index3 = airline_notnull['SEG_KM_SUM'].gt(0) & airline_notnull['avg_discount'].ne(0)
index4 = airline_notnull['AGE'].gt(100)  # rows with an age above 100 are removed
keep_rows = (index1 | index2) & index3 & ~index4
airline = airline_notnull[keep_rows]
print('数据清洗后数据的形状为:', airline.shape)

airline.to_csv(cleanedfile)  # save the cleaned data
# 二、电信用户分类 (Part 2: Telecom customer classification)
# Telecom churn data: load, rename the columns, inspect the values and clean
# the total-charges and tenure columns.
import pandas as pd

data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# NOTE: bare expressions such as data.head() only display something in an
# interactive session or notebook; in a script their result is discarded.
data.head()
data.info()

data.columns = ['用户ID', '性别', '老年人', '是否有配偶', '是否经济独立',
                '客户的入网时间', '是否开通电话服务业务', '是否开通了多线业务',
                '是否开通互联网服务', '是否开通网络安全服务', '是否开通在线备份业务',
                '是否开通了设备保护业务', '是否开通了技术支持服务', '是否开通网络电视',
                '是否开通网络电影', '签订合同年限', '是否开通电子账单', '付款方式',
                '月费用', '总费用', '该用户是否流失']
data.head()
data['总费用'].value_counts()

# Print the number of counted rows and the value distribution of every column.
for i in data.columns:
    test = data[i].value_counts()
    print('[{0}行数是:{1}'.format(i, test.sum()))
    print('[{0}内容是:\n{1}\n'.format(i, test))

# Convert the total charges to numbers; values that cannot be parsed become NaN.
data['总费用'] = pd.to_numeric(data['总费用'], errors='coerce')
data[data['总费用'].isnull()][['客户的入网时间', '总费用']]

# Fill the missing total charges with the monthly charge of the same row.
# The result is assigned back: calling fillna/replace with inplace=True on a
# column selection is not guaranteed to modify `data`.
data['总费用'] = data['总费用'].fillna(data['月费用'])

# Replace a tenure of 0 by 1.
data['客户的入网时间'] = data['客户的入网时间'].replace(to_replace=0, value=1)
data.info()
import matplotlib.pyplot as plt

# Use the SimHei font for the Chinese labels and ASCII hyphens for minus signs.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pie chart of the churn column; the first wedge is pulled out slightly.
churn_counts = data['该用户是否流失'].value_counts()
plt.pie(
    churn_counts,
    labels=['未流失', '流失'],
    autopct='%.2f%%',
    explode=(0.1, 0),
)
plt.title('用户流失占比3145')
import sklearn

# NOTE(review): the classifier classes, the metric functions and the
# Xtrain/Xtest/ytrain/ytest splits are not defined in this section --
# presumably they are imported/created elsewhere; confirm.

# Four candidate models: decision tree, random forest, k-nearest neighbours
# and logistic regression.
model = [
    DecisionTreeClassifier(random_state=120),
    RandomForestClassifier(random_state=120),
    KNeighborsClassifier(),
    LogisticRegression(max_iter=1000),
]

# Fit every model on the training split and report its scores on the test split.
for clf in model:
    clf.fit(Xtrain, ytrain)
    y_pre = clf.predict(Xtest)
    precision = precision_score(ytest, y_pre)
    accuracy = accuracy_score(ytest, y_pre)
    print(
        clf,
        '\n \n',
        classification_report(ytest, y_pre),
        '\n \n Precision Score:',
        precision,
        '\n Accuracy Score::',
        accuracy,
        '\n\n',
    )

# Recorded output (start):
# DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
#     max_depth=None, max_features=None, max_leaf_nodes=None,
#     min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1,
#     min_samples_split=2, min_weight_fraction_leaf=0.0, presort='deprecated',
#     random_state=120, splitter='best')
# One learning-curve panel per model: mean training score and mean
# cross-validation score against the training-set size.
figure, ax = plt.subplots(1, 4, figsize=(30, 4))
for i, panel in enumerate(ax):
    train_sizes, train_scores, valid_scores = learning_curve(
        model[i], over_X, over_y, cv=5, random_state=10)
    # Despite the "_std" names these hold the mean score over the CV folds;
    # the names are kept from the original script.
    train_std = train_scores.mean(axis=1)
    test_std = valid_scores.mean(axis=1)
    panel.plot(train_sizes, train_std, color='red', label='train_scores')
    panel.plot(train_sizes, test_std, color='blue', label='test_scores')
    # Per-axes legend: plt.legend() only targets the current axes, so the
    # other panels were left without one.
    panel.legend()
# 标签 (tags): plt,第三周,会员,airline,pd,csv,data
# From: https://www.cnblogs.com/lijieying/p/17209188.html