In [ ]:
import pandas as pd
In [ ]:
# Load the raw churn export, then relabel its columns positionally with the
# Chinese names that every later cell uses. The list has 21 labels, so the
# assignment only succeeds when the CSV has exactly 21 columns.
csv_path = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
column_labels = [
    '用户ID', '性别', '是否老人', '是否有伴侣', '是否有孩子',
    '合同期限', '通话服务', '多线程', '网络服务',
    '在线安全', '在线备份', '设备安全', '技术支持',
    '流媒体电视', '流媒体电影', '合同类型', '电子账单',
    '支付方式', '月消费', '总消费', '是否流失',
]
data = pd.read_csv(csv_path)
data.columns = column_labels
data.head()
Out[ ]:
用户ID | 性别 | 是否老人 | 是否有伴侣 | 是否有孩子 | 合同期限 | 通话服务 | 多线程 | 网络服务 | 在线安全 | ... | 设备安全 | 技术支持 | 流媒体电视 | 流媒体电影 | 合同类型 | 电子账单 | 支付方式 | 月消费 | 总消费 | 是否流失 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
In [ ]:
# 4.4.1 整体流失情况:人数、比例、流失率
In [ ]:
# List the distinct labels in the churn column (first occurrence of each).
churn_labels = data['是否流失']
churn_labels.drop_duplicates()
Out[ ]:
0 No
2 Yes
Name: 是否流失, dtype: object
In [ ]:
# Head-counts of churned ('Yes') and retained ('No') customers.
churn_col = data['是否流失']
churn1 = churn_col[churn_col == 'Yes'].count()
churn0 = churn_col[churn_col == 'No'].count()
churn0, churn1
Out[ ]:
(5174, 1869)
In [ ]:
# Overall churn rate = churned customers / all customers.
total_customers = len(data)
churn_per = churn1 / total_customers
churn_per
Out[ ]:
0.2653698707936959
In [ ]:
# 4.4.2 Gender: counts, shares, churn rate
gender_col = data['性别']
gender_col.drop_duplicates()
Out[ ]:
0 Female
1 Male
Name: 性别, dtype: object
In [ ]:
# Head-counts per gender: gender0 = Female, gender1 = Male.
sex = data['性别']
gender0 = sex[sex == 'Female'].count()
gender1 = sex[sex == 'Male'].count()
gender0, gender1
Out[ ]:
(3488, 3555)
In [ ]:
# Share of all customers who are female.
gender0 / data.shape[0]
Out[ ]:
0.495243504188556
In [ ]:
# Female churn rate = churned female customers / all female customers.
# df_churn (the rows whose churn label is 'Yes') is reused by later cells.
churned_mask = data['是否流失'] == 'Yes'
df_churn = data[churned_mask]
female_churned = df_churn['性别'] == 'Female'
gender0_churn = df_churn.loc[female_churned, '性别'].count()
gender0_churn_per = gender0_churn / gender0
gender0_churn_per
Out[ ]:
0.26920871559633025
In [ ]:
# Male churn rate = churned male customers / all male customers.
# Stored under the gender1_* names (gender1 is the Male count) so the female
# results held in gender0_churn / gender0_churn_per are not overwritten.
gender1_churn = df_churn[df_churn['性别']=='Male']['性别'].count()
gender1_churn_per = gender1_churn / gender1
gender1_churn_per
Out[ ]:
0.2616033755274262
In [ ]:
# 4.4.3 Senior citizens: counts, shares, churn rate
senior_flag = data['是否老人']
senior_flag.drop_duplicates()
Out[ ]:
0 0
20 1
Name: 是否老人, dtype: int64
In [ ]:
# Row counts by senior flag: senior0 = flag 0, senior1 = flag 1.
senior_flag = data['是否老人']
senior0 = len(data[senior_flag == 0])
senior1 = len(data[senior_flag == 1])
senior0, senior1
Out[ ]:
(5901, 1142)
In [ ]:
# Share of all customers whose senior flag is 1.
senior1 / data.shape[0]
Out[ ]:
0.1621468124378816
In [ ]:
# Churn rate per senior flag = churned rows in the group / all rows in the group.
churned_senior_flag = df_churn['是否老人']
s0_churn = len(df_churn[churned_senior_flag == 0])
s1_churn = len(df_churn[churned_senior_flag == 1])
s0_churn_per, s1_churn_per = s0_churn / senior0, s1_churn / senior1
s0_churn_per, s1_churn_per
Out[ ]:
(0.23606168446026096, 0.4168126094570928)
In [ ]:
# 4.4.4 Partner: counts, shares, churn rate
partner_col = data['是否有伴侣']
partner_col.drop_duplicates()
Out[ ]:
0 Yes
1 No
Name: 是否有伴侣, dtype: object
In [ ]:
# Row counts by partner status: partner0 = 'No', partner1 = 'Yes'.
has_partner = data['是否有伴侣']
partner0 = len(data[has_partner == 'No'])
partner1 = len(data[has_partner == 'Yes'])
partner0, partner1
Out[ ]:
(3641, 3402)
In [ ]:
# Share of all customers without a partner.
partner0 / data.shape[0]
Out[ ]:
0.5169672014766434
In [ ]:
# Share of all customers with a partner.
partner1 / data.shape[0]
Out[ ]:
0.4830327985233565
In [ ]:
# Churn rate per partner status = churned rows in the group / all rows in the group.
churned_partner = df_churn['是否有伴侣']
partner0_churn = len(df_churn[churned_partner == 'No'])
partner1_churn = len(df_churn[churned_partner == 'Yes'])
partner0_churn_per, partner1_churn_per = (
    partner0_churn / partner0,
    partner1_churn / partner1,
)
partner0_churn_per, partner1_churn_per
Out[ ]:
(0.32957978577313923, 0.1966490299823633)
In [ ]:
# 4.4.5 Dependents: counts, shares, churn rate
dependents_col = data['是否有孩子']
dependents_col.drop_duplicates()
Out[ ]:
0 No
6 Yes
Name: 是否有孩子, dtype: object
In [ ]:
# Row counts by the '是否有孩子' column: dependents0 = 'No', dependents1 = 'Yes'.
has_dependents = data['是否有孩子']
dependents0 = len(data[has_dependents == 'No'])
dependents1 = len(data[has_dependents == 'Yes'])
dependents0, dependents1
Out[ ]:
(4933, 2110)
In [ ]:
# Shares of all customers in the 'No' and 'Yes' groups of the '是否有孩子' column.
total_rows = len(data)
dependents0 / total_rows, dependents1 / total_rows
Out[ ]:
(0.7004117563538265, 0.2995882436461735)
In [ ]:
# Churn rate per '是否有孩子' group = churned rows in the group / all rows in the group.
churned_dependents = df_churn['是否有孩子']
dependents0_churn = len(df_churn[churned_dependents == 'No'])
dependents1_churn = len(df_churn[churned_dependents == 'Yes'])
dependents0_churn_per, dependents1_churn_per = (
    dependents0_churn / dependents0,
    dependents1_churn / dependents1,
)
dependents0_churn_per, dependents1_churn_per
Out[ ]:
(0.3127914048246503, 0.15450236966824646)
In [ ]:
# 4.4.6 数据洞察:现象、溯源、建议
# 高流失率用户特征:
# 1.性别无特殊性
# 2.老年人相对于年轻人更容易流失
# 3.单身用户更倾向于流失
# 4.无孩子相对于有孩子更容易流失
标签:4.4,No,用户,len,维度,Yes,churn,data,Out
From: https://www.cnblogs.com/mlzxdzl/p/17782422.html