woe分箱_iv值计算
-
基于
scorecardpy
库,乳腺癌数据集

import numpy as np
import pandas as pd
import scorecardpy as sc
from sklearn.datasets import load_breast_cancer
from tqdm import notebook

# Load the breast cancer dataset into a DataFrame. The feature names contain
# spaces, so their words are re-joined with underscores. The target is stored
# in the last column, 'y'.
cancer = load_breast_cancer()
df = pd.DataFrame(
    cancer.data,
    columns=['_'.join(name.split()) for name in cancer.feature_names],
)
df['y'] = cancer.target

# ---- WOE computation ----

# Every column except the trailing target column 'y' is treated as a feature.
need_columns = df.columns[:-1].tolist()

# Bin one feature at a time so the notebook progress bar advances per feature.
# The result of sc.woebin is indexed by the feature name to get that
# feature's binning table.
data_list = []
for feature in notebook.tqdm(need_columns):
    bins = sc.woebin(df[[feature, 'y']], y='y')
    data_list.append(bins[feature])

# ignore_index=True gives the stacked table unique row labels; without it each
# per-feature table keeps its own 0..k index and the labels repeat, which makes
# label-based lookups on the combined table ambiguous.
result_woe = pd.concat(data_list, ignore_index=True)

# Positional relabel of the binning table.
# NOTE(review): assumes sc.woebin returns exactly these 12 columns in this
# order — confirm against the installed scorecardpy version.
result_woe.columns = [
    '特征', '分箱', '分箱样本数', 'count_distr', '负样本数',
    '正样本数', '正样本占比_当前分箱', 'woe_当前分箱',
    'iv_当前分箱', 'iv值', '分割点', '是否特殊值',
]

# Class totals over the whole dataset. For a 0/1 target, the sum of 'y' is the
# number of rows with y == 1; the remaining rows are the other class.
positive_total = df['y'].sum()
negative_total = df.shape[0] - positive_total

# Share of the negative class inside each bin.
result_woe['负样本占比_当前分箱'] = result_woe['负样本数'] / result_woe['分箱样本数']
# Share of each class's dataset-wide total that falls into the bin.
result_woe['正样本%'] = result_woe['正样本数'] / positive_total
result_woe['负样本%'] = result_woe['负样本数'] / negative_total

# Final column order for the report.
result_woe = result_woe[[
    '特征', '分箱', '分割点', '是否特殊值', '分箱样本数',
    'count_distr', '负样本数', '正样本数', '正样本占比_当前分箱',
    '负样本占比_当前分箱', '正样本%', '负样本%', 'woe_当前分箱',
    'iv_当前分箱', 'iv值',
]]
-
woe
计算公式
$$woe = \log\left(\frac{good\%}{bad\%}\right)$$
-
iv
计算公式
$$iv = \sum \left[ (good\% - bad\%) \times woe \right]$$