# Basic exploration of the raw airline data:
# for every column, report the number of missing values plus the max and min.
import pandas as pd

datafile = r'E:\sj\air_data.csv'  # raw airline data; the first row holds the column names
resultfile = r'E:\outputsj\explore.csv'  # where the exploration summary is written

# The file is read as UTF-8 (convert it with a text editor first if it is stored otherwise).
data = pd.read_csv(datafile, encoding='utf-8')

# describe() gives one summary column per input column; transposing puts the
# input columns on the rows.  `percentiles` selects which quantiles to compute.
explore = data.describe(percentiles=[], include='all').T

# describe() only reports the non-null count, so the null count is derived from it.
explore['null'] = data.shape[0] - explore['count']

# Keep only part of the summary and rename the headers one-to-one.
# Fields computed by describe(): count (non-null), unique, top (most frequent value),
# freq (frequency of top), mean, std (standard deviation), min, 50% (median), max.
explore = explore.loc[:, ['null', 'max', 'min']].rename(
    columns={'null': u'空值数', 'max': u'最大值', 'min': u'最小值'}
)
explore.to_csv(resultfile)  # export the result
import pandas as pd
import matplotlib.pyplot as plt
# Customer profile information.
# Extract the year in which each member joined.
from datetime import datetime
ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
ffp_year = ffp.map(lambda x: x.year)
# Histogram of the number of members who joined in each year.
fig = plt.figure(figsize=(8, 5))  # canvas size
plt.rcParams['font.sans-serif'] = 'SimHei'  # font used so the Chinese labels render
plt.rcParams['axes.unicode_minus'] = False
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数3151', fontsize=20)
plt.show()
# Must be *called*: a bare `plt.close` only references the function and
# leaves the figure registered with pyplot.
plt.close()
# Count members by gender.
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
# and is evaluated once instead of once per category.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']
# Pie chart of the gender split.
fig = plt.figure(figsize=(7, 4))  # canvas size
plt.pie([male, female], labels=['男', '女'], colors=['lightskyblue', 'lightcoral'],
        autopct='%1.1f%%')
plt.title('会员性别比例3151', fontsize=20)
plt.show()
# Must be *called*: a bare `plt.close` is a no-op expression.
plt.close()
# Count members per membership tier.
# Series.value_counts() replaces the deprecated top-level pd.value_counts()
# and is evaluated once instead of three times.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]
# Bar chart of the member count per tier.
fig = plt.figure(figsize=(8, 5))  # canvas size
# plt.bar arguments used here:
#   first positional  - x positions of the bars
#   second positional - bar heights, i.e. the values being displayed
#   width             - bar width
#   alpha             - transparency
#   color             - fill colour
plt.bar(range(3), [lv_four, lv_five, lv_six], width=0.4, alpha=0.8, color='skyblue')
plt.xticks(list(range(3)), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数3151', fontsize=20)
plt.show()
plt.close()
# Member ages: discard missing values, then store them as 64-bit integers.
age = data['AGE'].dropna().astype('int64')
# Box plot of the age distribution.
fig = plt.figure(figsize=(5, 10))
plt.boxplot(
    age,
    labels=['会员年龄'],
    patch_artist=True,
    boxprops=dict(facecolor='lightblue'),  # fill colour of the box
)
plt.title('会员年龄分布箱型图3151', fontsize=20)
plt.grid(axis='y')  # horizontal grid lines on the y axis
plt.show()
plt.close()
lte = data['LAST_TO_END']
fc = data['FLIGHT_COUNT']
sks = data['SEG_KM_SUM']

# The three box plots differ only in data, tick label, title and canvas size,
# so they are drawn from one table: (series, x tick label, title, figure size).
boxplot_specs = [
    (lte, '时长', '会员最后乘机至结束时长分布箱线图3151', (5, 8)),
    (fc, '飞行次数', '会员飞行次数分布箱线图3151', (5, 8)),
    (sks, '总飞行公里数', '客户总飞行公里数箱线图3151', (5, 10)),
]
for series, tick_label, title, figsize in boxplot_specs:
    fig = plt.figure(figsize=figsize)
    plt.boxplot(series,
                patch_artist=True,
                labels=[tick_label],  # x-axis tick label
                boxprops={'facecolor': 'lightblue'})  # fill colour of the box
    plt.title(title, fontsize=20)
    plt.grid(axis='y')  # horizontal grid lines on the y axis
    plt.show()
    # Must be *called*: the original bare `plt.close` never closed any figure.
    plt.close()
# Points information.
# Histogram of how many times members exchanged points.
ec = data['EXCHANGE_COUNT']
fig = plt.figure(figsize=(8, 5))  # canvas size
plt.hist(ec, bins=5, color='#0504aa')
plt.xlabel('兑换次数')
plt.ylabel('会员人数')
plt.title('会员兑换积分次数分布直方图3151', fontsize=20)
plt.show()
plt.close()  # must be called; a bare `plt.close` is a no-op expression

# Box plot of the members' total accumulated points.
ps = data['Points_Sum']
fig = plt.figure(figsize=(5, 8))
plt.boxplot(ps,
            patch_artist=True,
            labels=['总累计积分'],  # x-axis tick label
            boxprops={'facecolor': 'lightblue'})  # fill colour of the box
plt.title('客户总累计积分箱型图3151', fontsize=20)
plt.grid(axis='y')  # horizontal grid lines on the y axis
plt.show()
plt.close()  # must be called; a bare `plt.close` is a no-op expression
# Assemble the attributes whose pairwise correlation is examined.
corr_columns = ['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END', 'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']
data_corr = data.loc[:, corr_columns]
# Missing ages are replaced by 0 before the integer cast.
# NOTE(review): filling with 0 feeds artificial ages into the correlation - confirm this is intended.
age1 = data['AGE'].fillna(0)
data_corr['AGE'] = age1.astype('int64')
data_corr['ffp_year'] = ffp_year

# Pearson correlation matrix.
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n', dt_corr)

# Heat map of the correlation matrix.
import seaborn as sns
plt.subplots(figsize=(10, 10))  # canvas size
# annot writes the value into each cell, square forces square cells,
# vmax caps the colour scale at 1.
sns.heatmap(dt_corr, cmap='Blues', annot=True, square=True, vmax=1)
plt.title('热力图3151', fontsize=20)
plt.show()
plt.close()
import numpy as np
import pandas as pd
# Path of the raw data.
datafile = r'E:\sj\air_data.csv'
# Path the cleaned data is written to.
cleanedfile = r'E:\outputsj\air_cleaned.csv'
# Load the raw data.
airline_data = pd.read_csv(datafile, encoding='utf-8')
print('原始数据的形状为:', airline_data.shape)
# Drop records whose fare in either year is missing.
# The mask is applied to `airline_data` itself; the original indexed the
# unrelated global `data`, which only worked when an earlier section had
# loaded the same file into that name.
has_both_fares = airline_data['SUM_YR_1'].notnull() & airline_data['SUM_YR_2'].notnull()
airline_notnull = airline_data.loc[has_both_fares, :]
print('删除缺失记录后数据的形状为:', airline_notnull.shape)
# Keep records with a non-zero fare in at least one year, a positive flown
# distance and a non-zero average discount; drop records with AGE above 100.
index1 = airline_notnull['SUM_YR_1'] != 0
index2 = airline_notnull['SUM_YR_2'] != 0
index3 = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
index4 = airline_notnull['AGE'] > 100
airline = airline_notnull[(index1 | index2) & index3 & ~index4]
print('清洗后数据的形状为:\n', airline.shape)
airline.to_csv(cleanedfile)  # save the cleaned data
import pandas as pd
import numpy as np
# Load the cleaned data.
cleanedfile = r'E:\outputsj\air_cleaned.csv'
airline = pd.read_csv(cleanedfile, encoding='utf-8')
# Select the attributes needed for the LRFMC model.
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
print('筛选的属性前5行为:\n', airline_selection.head())
# Build attribute L: days between FFP_DATE and LOAD_TIME, divided by 30.
L = pd.to_datetime(airline_selection['LOAD_TIME']) - pd.to_datetime(airline_selection['FFP_DATE'])
# Read the day count straight from the timedelta instead of formatting it as
# text and splitting the string.  The Series is named so the combined frame has
# only string column names (an unnamed Series would become the integer column 0;
# NOTE(review): recent scikit-learn releases reject mixed int/str column names - confirm
# against the installed version).
L = (L.dt.days / 30).rename('L')
# Combine L with the remaining attributes (columns from LAST_TO_END onwards).
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
print('构建的LRFMC属性前5行为:\n', airline_features.head())
# Standardise the features and persist them.
from sklearn.preprocessing import StandardScaler
data = StandardScaler().fit_transform(airline_features)
np.savez(r'E:\outputsj\airline_scale.npz', data)
print('标准化后LRFMC 5个属性为:\n', data[:5, :])
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
# Load the standardised data.
airline_scale = np.load(r'E:/outputsj/airline_scale.npz')['arr_0']
k = 5  # number of clusters
# Build the model with a fixed random seed of 123.
kmeans_model = KMeans(n_clusters=k, random_state=123)
fit_kmeans = kmeans_model.fit(airline_scale)  # train the model
# Inspect the clustering result.
kmeans_cc = kmeans_model.cluster_centers_  # cluster centres
print('各类聚类中心为:\n', kmeans_cc)
kmeans_labels = kmeans_model.labels_  # cluster label of every sample
print('各样本的类别标签为:\n', kmeans_labels)
r1 = pd.Series(kmeans_model.labels_).value_counts()  # number of samples per cluster
print('最终每个类别的数目为:\n', r1)
# Put the cluster centres into a data frame.
cluster_center = pd.DataFrame(kmeans_model.cluster_centers_, columns=['ZL', 'ZR', 'ZF', 'ZM', 'ZC'])
# Row i of cluster_centers_ is the centre of the cluster labelled i, so the index
# is simply 0..k-1.  The original took the index from the order in which labels
# first appear in the data, which attaches the wrong label to a centre whenever
# that order is not 0, 1, 2, ...
cluster_center.index = pd.RangeIndex(k, name='cluster')
print(cluster_center)
# %matplotlib inline  -- IPython magic, not valid Python; enable only inside a Jupyter notebook
import matplotlib.pyplot as plt
# Radar chart of the customer clusters.
labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
legen = ['客户群' + str(i + 1) for i in cluster_center.index]
lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
# Each polygon must close on itself, so the first column (ZL) is appended
# again at the end before converting to an ndarray.
cluster_center = pd.concat([cluster_center, cluster_center[['ZL']]], axis=1)
centers = np.array(cluster_center.iloc[:, 0:])
# Split the circle into one angle per attribute, then repeat the first angle
# so the plotted line returns to its starting point.
n = len(labels)
angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
angle = np.concatenate((angle, [angle[0]]))
# Font settings so the Chinese labels render.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Draw on polar axes.
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, polar=True)
# One line per cluster centre, labelled with its legend text directly.  (The
# original labelled each line with its ZL value and then overrode that through
# plt.legend(legen), after padding `legen` with a sixth entry.)
for center, style, name in zip(centers, lstype, legen):
    ax.plot(angle, center, linestyle=style, linewidth=2, label=name)
# Attribute names on the angular grid: one tick per attribute, so the repeated
# closing angle is left out.
ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
plt.title('客户特征分析雷达图3151', fontsize=20)
plt.legend()
plt.show()
plt.close()
# Customer churn prediction (客户流失预测):
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
import warnings
warnings.filterwarnings('ignore')
from scipy import stats
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn import metrics
import prettytable
# Load the Telco customer-churn data set.
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
data.head().T
data.info()
data.drop("customerID", axis=1, inplace=True)
# Convert TotalCharges to a numeric column; unparseable values become NaN.
data['TotalCharges'] = pd.to_numeric(data.TotalCharges, errors='coerce')
# Inspection only: number of missing TotalCharges values and the rows holding them.
data['TotalCharges'].isnull().sum()
data.loc[data['TotalCharges'].isnull()].T
# Inspection only: number of rows with tenure == 0 (the original evaluated this twice).
data.query("tenure == 0").shape[0]
# Drop the rows with tenure == 0 and renumber the index from 0.
data = data.query("tenure != 0")
data = data.reset_index(drop=True)
# Print the value frequencies of every object-typed (categorical) column.
# The loop body is indented here; the original had lost its indentation.
for i in data.select_dtypes(include="object").columns:
    print(data[i].value_counts())
    print('-'*50)
# Encode the target: 'No' -> 0, 'Yes' -> 1.
data.Churn = data.Churn.map({'No':0,'Yes':1})
# Histograms of the three continuous features, split by churn status:
# top row = churned customers (Churn == 1), bottom row = retained (Churn == 0).
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(20, 8))
for i, feature in enumerate(['tenure', 'MonthlyCharges', 'TotalCharges']):
    data.loc[data.Churn == 1, feature].hist(ax=ax[0][i], bins=30)
    data.loc[data.Churn == 0, feature].hist(ax=ax[1][i], bins=30)
    # Labels now match the subset plotted on each axis; the original had
    # 'Churn=0' on the Churn == 1 row and vice versa.
    ax[0][i].set_xlabel(feature + ' Churn=1')
    ax[1][i].set_xlabel(feature + ' Churn=0')
plt.title('3151', fontsize=100)
# Difference between tenure * MonthlyCharges and the recorded TotalCharges.
data['TotalCharges_diff'] = data['tenure'] * data['MonthlyCharges'] - data['TotalCharges']
def func(x):
    """Bucket a TotalCharges difference by its sign.

    Args:
        x: A numeric difference value.

    Returns:
        2 when ``x`` is positive (monthly charge increased), 1 when it is
        exactly zero (unchanged), otherwise 0 (decreased).  Any value that is
        neither positive nor zero, NaN included, falls into the last case.
    """
    if x > 0:
        return 2  # monthly charge increased
    if x == 0:
        return 1  # monthly charge unchanged
    return 0  # monthly charge decreased
# Replace the raw difference by its sign bucket (see func), then discard the raw column.
data['TotalCharges_diff1'] = data['TotalCharges_diff'].apply(func)
del data['TotalCharges_diff']
# Discretise tenure and MonthlyCharges into five quantile bins labelled <name>_1 .. <name>_5.
tenure_bin_labels = [f'tenure_{i}' for i in range(1, 6)]
data['tenure'] = pd.qcut(data['tenure'], q=5, labels=tenure_bin_labels)
monthly_bin_labels = [f'MonthlyCharges_{i}' for i in range(1, 6)]
data['MonthlyCharges'] = pd.qcut(data['MonthlyCharges'], q=5, labels=monthly_bin_labels)
# Box-Cox transform of TotalCharges; the fitted lambda is discarded.
boxcox_values, _ = stats.boxcox(data['TotalCharges'])
data['TotalCharges'] = boxcox_values
# Separate the features from the target.
X = data[data.columns.drop('Churn')]
y = data.Churn
# One-hot encode the categorical features.
X = pd.get_dummies(X)
y.value_counts()
# Split BEFORE any fitted preprocessing.  The original standardised and
# SMOTE-resampled the full data set and split afterwards, so scaling statistics
# and synthetic samples derived from test rows leaked into training, and the
# test set itself contained synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=11)
# Standardise: fit on the training rows only, then apply the same transform to the test rows.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)
# Oversample the minority class in the training set only.
model_smote = SMOTE(random_state=10)  # SMOTE model object
X_smote, y_smote = model_smote.fit_resample(X_train, y_train)
y_smote.value_counts()
X_smote.shape[1]
# Models are trained on the resampled training data and evaluated on the
# untouched test split.
X_train, y_train = X_smote, y_smote
def score_cv(model, X, y):
    """Return the mean F1 score of ``model`` under 5-fold cross-validation.

    Args:
        model: Estimator passed to ``cross_val_score``.
        X: Feature matrix.
        y: Target vector.

    Returns:
        The mean of the per-fold F1 scores.  The folds come from a shuffled
        ``KFold`` with 5 splits and ``random_state=42``.
    """
    kfold = KFold(n_splits=5, random_state=42, shuffle=True)
    f1 = cross_val_score(model, X, y, scoring='f1', cv=kfold).mean()
    return f1
def gridsearch_cv(model, test_param, cv=5):
    """Grid-search ``test_param`` for ``model`` and return the best parameters.

    The search is scored by F1, uses all available cores (``n_jobs=-1``) and is
    fitted on the module-level ``X_train`` / ``y_train``.

    Args:
        model: Estimator to tune.
        test_param: Parameter grid passed to ``GridSearchCV`` as ``param_grid``.
        cv: Cross-validation setting passed to ``GridSearchCV`` (default 5).

    Returns:
        ``best_params_`` of the fitted search; the best parameters and the best
        score are also printed.
    """
    gsearch = GridSearchCV(estimator=model, param_grid=test_param, scoring='f1', n_jobs=-1, cv=cv)
    gsearch.fit(X_train, y_train)
    print('Best Params: ', gsearch.best_params_)
    print('Best Score: ', gsearch.best_score_)
    return gsearch.best_params_
def model_pred(model):
    """Fit ``model`` and print its test-set metrics.

    The model is fitted on the module-level ``X_train`` / ``y_train`` and used
    to predict ``X_test``.  Printed output: the F1 score, the classification
    report, and the confusion matrix laid out as a table with predictions on
    the rows and actual classes on the columns.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.

    Returns:
        None.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('test f1-score: ', metrics.f1_score(y_test, pred))
    print('-'*50)
    print('classification_report \n', metrics.classification_report(y_test, pred))
    print('-'*50)
    # Unpack the flattened 2x2 confusion matrix into tn, fp, fn, tp.
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, pred).ravel()
    confusion_matrix_table = prettytable.PrettyTable(['', 'actual-1', 'actual-0'])  # table with header row
    confusion_matrix_table.add_row(['prediction-1', tp, fp])  # row for predicted 1
    confusion_matrix_table.add_row(['prediction-0', fn, tn])  # row for predicted 0
    print('confusion matrix \n', confusion_matrix_table)
# Logistic-regression baseline: cross-validated F1 on the training data,
# followed by the test-set report.
lr = LogisticRegression(random_state=10)
lr_f1 = score_cv(model=lr, X=X_train, y=y_train)
lr_f1
model_pred(model=lr)
# 标签:plt,航空公司,labels,客户,airline,pd,print,价值,data From: https://www.cnblogs.com/2500435557qq/p/17209246.html