In [ ]:
import pandas as pd
data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
data.columns = ['用户ID', '性别', '是否老人', '是否有伴侣', '是否有孩子',
'合同期限', '通话服务', '多线程', '网络服务',
'在线安全', '在线备份', '设备安全', '技术支持',
'流媒体电视', '流媒体电影', '合同类型', '电子账单',
'支付方式', '月消费', '总消费', '是否流失']
data.head()
Out[ ]:
用户ID | 性别 | 是否老人 | 是否有伴侣 | 是否有孩子 | 合同期限 | 通话服务 | 多线程 | 网络服务 | 在线安全 | ... | 设备安全 | 技术支持 | 流媒体电视 | 流媒体电影 | 合同类型 | 电子账单 | 支付方式 | 月消费 | 总消费 | 是否流失 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [ ]:csm_cols = ['合同期限','合同类型', '电子账单','支付方式', '月消费', '总消费']
data[csm_cols]
Out[ ]:
合同期限 | 合同类型 | 电子账单 | 支付方式 | 月消费 | 总消费 | |
---|---|---|---|---|---|---|
0 | 1 | Month-to-month | Yes | Electronic check | 29.85 | 29.85 |
1 | 34 | One year | No | Mailed check | 56.95 | 1889.5 |
2 | 2 | Month-to-month | Yes | Mailed check | 53.85 | 108.15 |
3 | 45 | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 |
4 | 2 | Month-to-month | Yes | Electronic check | 70.70 | 151.65 |
... | ... | ... | ... | ... | ... | ... |
7038 | 24 | One year | Yes | Mailed check | 84.80 | 1990.5 |
7039 | 72 | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 |
7040 | 11 | Month-to-month | Yes | Electronic check | 29.60 | 346.45 |
7041 | 4 | Month-to-month | Yes | Mailed check | 74.40 | 306.6 |
7042 | 66 | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 |
7043 rows × 6 columns
In [ ]:# 4.6.1 数据检查:确认数据类型、可视化 -> 初步了解
# 转换数值格式
data[csm_cols].dtypes
Out[ ]:
合同期限 int64
合同类型 object
电子账单 object
支付方式 object
月消费 float64
总消费 object
dtype: object
In [ ]:
# 筛选排除是否存在含有空格的数值
data[data['总消费']==' ']
Out[ ]:
用户ID | 性别 | 是否老人 | 是否有伴侣 | 是否有孩子 | 合同期限 | 通话服务 | 多线程 | 网络服务 | 在线安全 | ... | 设备安全 | 技术支持 | 流媒体电视 | 流媒体电影 | 合同类型 | 电子账单 | 支付方式 | 月消费 | 总消费 | 是否流失 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
488 | 4472-LVYGI | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | No | |
753 | 3115-CZMZD | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | No | |
936 | 5709-LVOEQ | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | No | |
1082 | 4367-NUYAO | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | No | |
1340 | 1371-DWPAZ | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | No | |
3331 | 7644-OMVMY | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | No | |
3826 | 3213-VVOLG | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | No | |
4380 | 2520-SGTTA | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | No | |
5218 | 2923-ARZLG | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | No | |
6670 | 4075-WKNIU | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | No | |
6754 | 2775-SEFEE | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | No |
11 rows × 21 columns
In [ ]:# 发现上面的空白额数据条数是1%左右,对于最终的影响不大,这里选择直接删掉
import numpy as np
data_nonan = data.replace(' ', np.nan).dropna()
data_nonan
Out[ ]:
用户ID | 性别 | 是否老人 | 是否有伴侣 | 是否有孩子 | 合同期限 | 通话服务 | 多线程 | 网络服务 | 在线安全 | ... | 设备安全 | 技术支持 | 流媒体电视 | 流媒体电影 | 合同类型 | 电子账单 | 支付方式 | 月消费 | 总消费 | 是否流失 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | ... | Yes | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No |
7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No |
7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No |
7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes |
7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | ... | Yes | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No |
7032 rows × 21 columns
In [ ]:# 转换数据类型
data_after=data_nonan
data_after['总消费']=data_after['总消费'].astype('float64')
data_after[csm_cols].dtypes
Out[ ]:
合同期限 int64
合同类型 object
电子账单 object
支付方式 object
月消费 float64
总消费 float64
dtype: object
In [ ]:
# 可视化
data_after['合同期限'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
data_after['月消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
data_after['总消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
# 4.6.2 类别数据:调用eda_calculate函数
# 全局变量
df_churn = data_after[data_after['是否流失']=='Yes']
def eda_calculate(column,types):
"""
计算类别数据对应条数、占总数比例、对应流失率
param @column: str,列表
param @types: list,类别数据
"""
# 1.计算类别数据对应的个数
res_list = []
rate_list = []
print('\n------------------------当前列标签:',column,types)
for t in types:
res = len(data_after[data_after[column]==t])
# 3.计算类别数据对应的流失率
rate = len(df_churn[df_churn[column]==t]) / res
res_list.append({t:res})
rate_list.append({t:rate})
# 2.展示各类别数据对应的个数、占总数比例
for r in res_list:
print("=================")
print(r,"占总数比例",list(r.values())[0]/len(data_after))
# 4.预览个别数据对应的流失率、前者和后者的倍数关系
print("类别数据对应流失率")
for rate in rate_list:
before = list(rate.values())[0]
index = rate_list.index(rate) + 1
print(rate)
if index < len(rate_list):
after = list(rate_list[index].values())[0]
print(rate,'前者和后者的倍数关系',before / after)
for c in ['合同类型','电子账单','支付方式']:
eda_calculate(column=c,types=data_after[c].drop_duplicates().tolist())
------------------------当前列标签: 合同类型 ['Month-to-month', 'One year', 'Two year']
=================
{'Month-to-month': 3875} 占总数比例 0.551052332195677
=================
{'One year': 1472} 占总数比例 0.20932878270762229
=================
{'Two year': 1685} 占总数比例 0.2396188850967008
类别数据对应流失率
{'Month-to-month': 0.4270967741935484}
{'Month-to-month': 0.4270967741935484} 前者和后者的倍数关系 3.787267780800622
{'One year': 0.11277173913043478}
{'One year': 0.11277173913043478} 前者和后者的倍数关系 3.9587579257246377
{'Two year': 0.028486646884272996}
------------------------当前列标签: 电子账单 ['Yes', 'No']
=================
{'Yes': 4168} 占总数比例 0.5927189988623436
=================
{'No': 2864} 占总数比例 0.4072810011376564
类别数据对应流失率
{'Yes': 0.33589251439539347}
{'Yes': 0.33589251439539347} 前者和后者的倍数关系 2.0511645228750677
{'No': 0.16375698324022347}
------------------------当前列标签: 支付方式 ['Electronic check', 'Mailed check', 'Bank transfer (automatic)', 'Credit card (automatic)']
=================
{'Electronic check': 2365} 占总数比例 0.3363196814562002
=================
{'Mailed check': 1604} 占总数比例 0.22810011376564276
=================
{'Bank transfer (automatic)': 1542} 占总数比例 0.21928327645051193
=================
{'Credit card (automatic)': 1521} 占总数比例 0.21629692832764505
类别数据对应流失率
{'Electronic check': 0.4528541226215645}
{'Electronic check': 0.4528541226215645} 前者和后者的倍数关系 2.3583701710551606
{'Mailed check': 0.19201995012468828}
{'Mailed check': 0.19201995012468828} 前者和后者的倍数关系 1.147654120512672
{'Bank transfer (automatic)': 0.16731517509727625}
{'Bank transfer (automatic)': 0.16731517509727625} 前者和后者的倍数关系 1.0969240574265395
{'Credit card (automatic)': 0.1525312294543064}
In [ ]:
# 4.6.3 数值数据:分布图、箱行图
# '合同期限'、'月消费'、'总消费':流失用户数据集和非流失用户数据集
df1 = data_after[data_after['是否流失']=='Yes']
df0 = data_after[data_after['是否流失']=='No']
In [ ]:
# 合同期限越长越不容易流失
df1['合同期限'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
df0['合同期限'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
# 月消费
df1['月消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
df0['月消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
# 月消费
df1['总消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
df0['总消费'].hist(bins=50)
Out[ ]:
<Axes: >
In [ ]:
# 总结
# 合同期限: 期限越短越容易流失,按月的流失率达42%,分别为1年的4倍、2年的10倍+
# 电子账单:使用电子账单的流失率为30%,为后者2倍
# 支付方式:使用电子发票的流失率为45号,为后者的2-3倍
# 在网时长: 与流失率呈负相关
# 月消费: 与流失率呈负相关
# 总消费: 与流失率呈负相关
标签:...,4.6,service,No,用户,year,Yes,行为,check
From: https://www.cnblogs.com/mlzxdzl/p/17782437.html