Logistic Regression Model - Case Study
A Stock Brokerage Customer Churn Early-Warning Model
# 1. Read the data
import pandas as pd
df = pd.read_excel('stock_customer_churn.xlsx')
df
| | 账户资金(元) | 最后一次交易距今时间(天) | 上月交易佣金(元) | 累计交易佣金(元) | 本券商使用时长(年) | 是否流失 |
|---|---|---|---|---|---|---|
| 0 | 22686.5 | 297 | 149.25 | 2029.85 | 0 | 0 |
| 1 | 190055.0 | 42 | 284.75 | 3889.50 | 2 | 0 |
| 2 | 29733.5 | 233 | 269.25 | 2108.15 | 0 | 1 |
| 3 | 185667.5 | 44 | 211.50 | 3840.75 | 3 | 0 |
| 4 | 33648.5 | 213 | 353.50 | 2151.65 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 7038 | 199145.0 | 40 | 424.00 | 3990.50 | 1 | 0 |
| 7039 | 682661.0 | 1 | 516.00 | 9362.90 | 5 | 0 |
| 7040 | 51180.5 | 167 | 148.00 | 2346.45 | 0 | 0 |
| 7041 | 47594.0 | 174 | 372.00 | 2306.60 | 0 | 1 |
| 7042 | 636005.0 | 2 | 528.25 | 8844.50 | 5 | 0 |

7043 rows × 6 columns
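The six columns are the account balance in yuan, days since the last trade, last month's trading commission, cumulative trading commission, years with this brokerage, and the churn flag 是否流失 (1 = churned, 0 = retained). Churn data is often imbalanced, so it can be worth checking the class ratio before modeling; a minimal sketch, assuming the df loaded above:

# Sketch: inspect how many customers churned vs. stayed (assumes df from above)
print(df['是否流失'].value_counts())                 # raw counts of class 0 and class 1
print(df['是否流失'].value_counts(normalize=True))   # proportion of each class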
# 2. Split into the feature matrix X and the target variable y
X = df.drop(columns='是否流失')  # drop removes the target column from the features
y = df['是否流失']
# 3. Split into training and test sets
# Training set size = 7043 * 80%; test set size = 7043 * 20%
from sklearn.model_selection import train_test_split
# train_test_split returns 4 objects: X split into train/test and y split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train.head()
| | 账户资金(元) | 最后一次交易距今时间(天) | 上月交易佣金(元) | 累计交易佣金(元) | 本券商使用时长(年) |
|---|---|---|---|---|---|
| 1814 | 43251.5 | 192 | 98.50 | 2258.35 | 0 |
| 5946 | 304449.5 | 22 | 369.50 | 5160.55 | 3 |
| 3881 | 441357.5 | 9 | 325.75 | 6681.75 | 5 |
| 2389 | 587076.5 | 2 | 427.25 | 8300.85 | 5 |
| 3676 | 204027.5 | 39 | 352.00 | 4044.75 | 2 |
y_train.head()
1814 0
5946 1
3881 0
2389 0
3676 0
Name: 是否流失, dtype: int64
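The split above is purely random (random_state=1 only makes it reproducible). If the churn ratio should be preserved exactly in both subsets, train_test_split also accepts a stratify argument; a hedged alternative sketch, not what the original code does:

# Sketch (alternative): a stratified split keeps the 0/1 ratio identical in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)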
# 4. Build the model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
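The features sit on very different scales (account balances in the hundreds of thousands of yuan vs. tenure in single-digit years), and scikit-learn's default solver can raise convergence warnings on unscaled data. One common remedy is to standardize the features first; a minimal sketch using a pipeline, shown only as an alternative to the plain fit above:

# Sketch: standardize features before the logistic regression (same fit/predict interface as `model`)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_model = make_pipeline(StandardScaler(), LogisticRegression())
scaled_model.fit(X_train, y_train)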
# 5. Using the model, part 1 - predict class labels
# Feed the test-set features X_test into the model to get y_pred, then compare y_pred with y_test
y_pred = model.predict(X_test)
print(y_pred[0:100])  # print the first 100 predictions
[0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1]
# Put predictions and actual values side by side in a DataFrame for comparison
a = pd.DataFrame()  # create an empty DataFrame
a['预测值'] = list(y_pred)
a['实际值'] = list(y_test)
a  # among the first 5 rows, 4 of the 5 predictions match the actual values (80%)
| | 预测值 | 实际值 |
|---|---|---|
| 0 | 0 | 0 |
| 1 | 0 | 0 |
| 2 | 0 | 0 |
| 3 | 0 | 1 |
| 4 | 0 | 0 |
| ... | ... | ... |
| 1404 | 1 | 1 |
| 1405 | 0 | 0 |
| 1406 | 0 | 0 |
| 1407 | 0 | 0 |
| 1408 | 1 | 1 |

1409 rows × 2 columns
# Check the overall prediction accuracy
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)  # accuracy_score(y_true, y_pred)
print(score)
0.7977288857345636
# Model accuracy can also be checked with .score()
model.score(X_test, y_test)
0.7977288857345636
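Accuracy alone can be misleading when the two classes are unbalanced, because always predicting "no churn" already scores fairly well. A confusion matrix and per-class precision/recall give a fuller picture; a minimal sketch, assuming y_test and y_pred from above:

# Sketch: per-class evaluation of the same predictions
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))        # rows = actual class, columns = predicted class
print(classification_report(y_test, y_pred))   # precision, recall, F1 for class 0 and class 1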
# 6. Using the model, part 2 - predict probabilities
y_pred_proba = model.predict_proba(X_test)
y_pred_proba[0:5]
array([[0.82041491, 0.17958509],
[0.84029613, 0.15970387],
[0.79819342, 0.20180658],
[0.62989192, 0.37010808],
[0.61636611, 0.38363389]])
# Another way to view the probabilities
a = pd.DataFrame(y_pred_proba, columns=['不流失概率', '流失概率'])
a.head()
| | 不流失概率 | 流失概率 |
|---|---|---|
| 0 | 0.820415 | 0.179585 |
| 1 | 0.840296 | 0.159704 |
| 2 | 0.798193 | 0.201807 |
| 3 | 0.629892 | 0.370108 |
| 4 | 0.616366 | 0.383634 |
# View only the churn probability (i.e. P(y=1), the second column of the 2-D array above)
y_pred_proba[:,1]
array([0.17958509, 0.15970387, 0.20180658, ..., 0.04220544, 0.09782449,
0.63586739])
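Because the model outputs probabilities, it can also be scored with ROC AUC, which measures ranking quality across all thresholds rather than only at the default 0.5 cut-off; a minimal sketch, assuming y_test and y_pred_proba from above:

# Sketch: threshold-independent evaluation of the churn probabilities
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test, y_pred_proba[:, 1]))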
# 7. Inspect the coefficient of each feature (extra material, for reference)
model.coef_
array([[ 2.41952469e-05, 8.16881491e-03, 1.04320950e-02,
-2.54894468e-03, -1.10120609e-04]])
model.intercept_
array([-1.43393291e-06])
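model.coef_ is returned in the same order as the columns of X, so pairing the two makes the numbers easier to read (a positive coefficient pushes the prediction towards churn, a negative one away from it); a minimal sketch:

# Sketch: pair each coefficient with its feature name
coef_table = pd.DataFrame({'feature': X.columns, 'coefficient': model.coef_[0]})
print(coef_table)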
# Reproduce the churn probabilities with the logistic formula
import numpy as np
for i in range(5):  # predicted churn probability for the first 5 test samples
print(1 / (1 + np.exp(-(np.dot(X_test.iloc[i], model.coef_.T) + model.intercept_))))
[0.17958509]
[0.15970387]
[0.20180658]
[0.37010808]
[0.38363389]
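The loop above applies the logistic (sigmoid) function P(y=1) = 1 / (1 + e^-(w·x + b)) row by row, and the results match the second column of predict_proba shown earlier. The same check can be done for the whole test set in one vectorized step; a minimal sketch:

# Sketch: vectorized version of the formula; should match model.predict_proba(X_test)[:, 1]
z = np.dot(X_test.values, model.coef_.T) + model.intercept_   # linear score w·x + b per row
p_churn = 1 / (1 + np.exp(-z))
print(p_churn[:5].ravel())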