import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
import graphviz
# --- Load the data ---
# Load the iris dataset and hold out 20% of it as a test split.
X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1024
)
# --- Build the model ---
# Fit a 3-stage discrete SAMME AdaBoost and evaluate it on the held-out split.
ada = AdaBoostClassifier(n_estimators=3, algorithm='SAMME', learning_rate=1.0)
ada.fit(X_train, y_train)  # fit the ensemble
y_ = ada.predict(X_test)
proba_ = ada.predict_proba(X_test)
print('分类准确率', ada.score(X_test, y_test))
# Bug fix: `display` is IPython-only and is a NameError in a plain script.
print(y_, proba_)
# --- Inspect the first tree ---
# Export the first weak learner to DOT and wrap it as a graphviz Source
# (renders inline in a notebook).
dot_data = tree.export_graphviz(ada[0], filled=True, rounded=True)
graphviz.Source(dot_data)
# --- Gini impurity (first round, uniform weights) ---
# Initial sample weights: uniform over the training set.
# Generalized: len(X_train) instead of the hard-coded 120.
w1 = np.full(shape=len(X_train), fill_value=1 / len(X_train))
# Weighted Gini impurity of the root node: 1 - sum_k p_k^2,
# where p_k is the total weight of class k.
gini = 1
for i in range(3):  # three classes: 0, 1, 2
    cond = y_train == i       # mask of samples in class i
    p = w1[cond].sum()        # total weight of this class (not its label)
    gini -= p ** 2
print(gini)
# output: 0.6662499999999998
# --- Split search (first tree) ---
# Exhaustive search for the split that minimizes the weighted Gini impurity
# (first tree; weights w1 are uniform, so class counts suffice on each side).
best_split = {}   # best split condition(s): {'X[col]': threshold}
lower_gini = 1    # smallest weighted Gini seen so far
for col in range(X_train.shape[1]):      # which of the 4 features to split on
    # Hoisted out of the inner loop: the sorted feature column is invariant
    # w.r.t. the split index (the original re-copied and re-sorted every pass).
    feature = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):    # which split point on that feature
        # Candidate threshold: midpoint of two adjacent sorted values.
        split = feature[i:i + 2].mean()
        cond = (X_train[:, col] <= split).ravel()
        left = y_train[cond]
        right = y_train[~cond]
        # Gini impurity of each side, accumulated class by class.
        gini_left = 0
        gini_right = 0
        for target in range(3):
            p1 = (left == target).sum() / left.size  # class share on the left
            gini_left += p1 * (1 - p1)
            if right.size != 0:  # right side can be empty at the extreme split
                p2 = (right == target).sum() / right.size
                gini_right += p2 * (1 - p2)
        left_p = w1[cond].sum()      # total weight falling to the left child
        right_p = 1 - left_p
        gini = left_p * gini_left + right_p * gini_right
        if gini < lower_gini:        # strictly better: restart the record
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:     # tie: remember this condition too
            best_split['X[%d]' % (col)] = split
print(best_split)
# --- Error rate of the first tree ---
# Training error of the first weak learner. With uniform weights the
# weighted error equals the plain misclassification rate.
y1_ = ada[0].predict(X_train)
print(y1_)
e1 = (y_train != y1_).mean()
print('第一棵树误差率', e1)
print('算法的误差率', ada.estimator_errors_)
# --- Weak-learner weight (alpha_1) ---
# SAMME estimator weight: alpha = lr * (ln((1 - e) / e) + ln(K - 1)).
learning_rate = 1.0
num = 3  # number of classes (K)
a1 = learning_rate * (np.log((1 - e1) / e1) + np.log(num - 1))
print('手动计算算法权重是', a1)
print('算法返回的分类器权重是', ada.estimator_weights_)
# --- Update sample weights ---
# Boost the weight of every misclassified sample by exp(alpha_1),
# then renormalize so the weights sum to one.
w2 = w1 * np.exp(a1 * (y_train != y1_))
w2 = w2 / w2.sum()
w2  # notebook-style echo; has no effect in a script
# --- Inspect the second tree ---
# Export the second weak learner, then show the total weight held by
# each class under the updated distribution w2.
dot_data = tree.export_graphviz(ada[1], filled=True, rounded=True)
graphviz.Source(dot_data)
for i in range(3):
    cond = y_train == i
    value = w2[cond].sum()   # aggregate weight of class i
    print(np.round(value, 3))
# --- Gini impurity (second round, weights w2) ---
# Weighted Gini impurity of the root node under the boosted weights w2.
gini = 1
for i in range(3):  # three classes: 0, 1, 2
    cond = y_train == i
    p = w2[cond].sum()  # total weight of this class
    gini -= p ** 2
print(np.round(gini, 3))
# output: 0.5
# Exhaustive weighted-Gini split search for the second tree (weights w2).
# Unlike the first round, class probabilities inside each child are taken
# from the sample-weight distribution, not from raw counts.
best_split = {}   # best split condition(s): {'X[col]': threshold}
lower_gini = 1    # smallest weighted Gini seen so far
for col in range(X_train.shape[1]):      # which of the 4 features to split on
    # Hoisted out of the inner loop: the sorted column is invariant
    # w.r.t. the split index (the original re-sorted on every pass).
    feature = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):    # which split point on that feature
        split = feature[i:i + 2].mean()  # midpoint of adjacent sorted values
        cond = (X_train[:, col] <= split).ravel()
        left = y_train[cond]
        left_w = w2[cond] / w2[cond].sum()     # weight distribution inside left node
        right = y_train[~cond]
        right_w = w2[~cond] / w2[~cond].sum()  # weight distribution inside right node
        # Gini impurity of each side, accumulated class by class.
        gini_left = 0
        gini_right = 0
        for target in range(3):
            p1 = left_w[left == target].sum()
            gini_left += p1 * (1 - p1)
            p2 = right_w[right == target].sum()
            gini_right += p2 * (1 - p2)
        left_p = w2[cond].sum()      # total weight falling to the left child
        right_p = 1 - left_p
        # Combine the two children, weighted by the mass they receive.
        gini = left_p * gini_left + right_p * gini_right
        if gini < lower_gini:        # strictly better: restart the record
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:     # tie: remember this condition too
            best_split['X[%d]' % (col)] = split
print(best_split)
# output: {'X[2]': 4.75}
# --- Error rate of the second tree ---
# Weighted training error of the second weak learner: sum of the weights
# of the samples it misclassifies.
y2_ = ada[1].predict(X_train)
print(y2_)
cond = y_train != y2_
e2 = w2[cond].sum()
# Bug fix: the label said "第一棵树" (first tree) but this is the second tree.
print('第二棵树误差率', e2)
print('算法的误差率', ada.estimator_errors_)
# --- Weak-learner weight (alpha_2) ---
# SAMME estimator weight for the second tree, same formula as alpha_1.
learning_rate = 1.0
num = 3  # number of classes (K)
a2 = learning_rate * (np.log((1 - e2) / e2) + np.log(num - 1))
print('手动计算算法权重是', a2)
print('算法返回的分类器权重是', ada.estimator_weights_)
# Export the third weak learner to DOT for visualization.
dot_data = tree.export_graphviz(ada[2], filled=True, rounded=True)
graphviz.Source(dot_data)
# --- Update sample weights ---
# Same boosting step as before: up-weight the samples the second tree
# got wrong, then renormalize.
w3 = w2 * np.exp(a2 * (y_train != y2_))
w3 = w3 / w3.sum()
w3  # notebook-style echo; has no effect in a script
# --- Third tree ---
# --- Gini impurity (third round, weights w3) ---
# Weighted Gini impurity of the root node under the weights w3.
gini = 1
for i in range(3):  # three classes: 0, 1, 2
    cond = y_train == i
    p = w3[cond].sum()  # total weight of this class
    gini -= p ** 2
print(np.round(gini, 3))
# output: 0.52
# --- Split search (third tree) ---
# Exhaustive weighted-Gini split search for the third tree (weights w3);
# identical procedure to the second round with the updated distribution.
best_split = {}   # best split condition(s): {'X[col]': threshold}
lower_gini = 1    # smallest weighted Gini seen so far
for col in range(X_train.shape[1]):      # which of the 4 features to split on
    # Hoisted out of the inner loop: the sorted column is invariant
    # w.r.t. the split index (the original re-sorted on every pass).
    feature = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):    # which split point on that feature
        split = feature[i:i + 2].mean()  # midpoint of adjacent sorted values
        cond = (X_train[:, col] <= split).ravel()
        left = y_train[cond]
        left_w = w3[cond] / w3[cond].sum()     # weight distribution inside left node
        right = y_train[~cond]
        right_w = w3[~cond] / w3[~cond].sum()  # weight distribution inside right node
        # Gini impurity of each side, accumulated class by class.
        gini_left = 0
        gini_right = 0
        for target in range(3):
            p1 = left_w[left == target].sum()
            gini_left += p1 * (1 - p1)
            p2 = right_w[right == target].sum()
            gini_right += p2 * (1 - p2)
        left_p = w3[cond].sum()      # total weight falling to the left child
        right_p = 1 - left_p
        # Combine the two children, weighted by the mass they receive.
        gini = left_p * gini_left + right_p * gini_right
        if gini < lower_gini:        # strictly better: restart the record
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:     # tie: remember this condition too
            best_split['X[%d]' % (col)] = split
print(best_split)
# output: {'X[3]': 1.65}
# Weighted training error of the third weak learner.
y3_ = ada[2].predict(X_train)
print(y3_)
cond = y_train != y3_
e3 = w3[cond].sum()
# Bug fix: the label said "第一棵树" (first tree) but this is the third tree.
print('第三棵树误差率', e3)
print('算法的误差率', ada.estimator_errors_)
# --- Weak-learner weight (alpha_3) ---
# SAMME estimator weight for the third tree, same formula as before.
learning_rate = 1.0
num = 3  # number of classes (K)
a3 = learning_rate * (np.log((1 - e3) / e3) + np.log(num - 1))
print('手动计算算法权重是', a3)
print('算法返回的分类器权重是', ada.estimator_weights_)
# --- Probability computation ---
# --- Probabilities returned by the algorithm ---
# Probabilities returned by the algorithm for the first 5 test samples.
# Removed: a premature, duplicated computation of proba3 (it is rebuilt
# identically in the manual-probability section below before it is used).
proba_ = ada.predict_proba(X_test)[:5]
print(proba_)
# --- Manually computed probabilities ---
# Manual reconstruction of predict_proba for discrete SAMME: each tree casts
# a one-hot vote for its predicted class (rows = samples, cols = classes 0..2),
# and the votes are combined with the estimator weights a1, a2, a3.
proba1 = (ada[0].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba2 = (ada[1].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba3 = (ada[2].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba = proba1 * a1 + proba2 * a2 + proba3 * a3
# Normalize: first by the total estimator weight, then by (K - 1).
proba = proba / ada.estimator_weights_.sum()
proba = proba / (num - 1)
# Softmax over the decision values gives the final class probabilities,
# matching ada.predict_proba. (Bare prose headings here were syntax errors
# in the original dump; they are now comments.)
print((np.exp(proba) / (np.exp(proba).sum(axis=1)).reshape(-1, 1))[:5])
print(proba_[:20])  # NOTE: proba_ was already sliced to 5 rows above
# Tags: gini, proba, AdaBoost, numpy
# Source: https://blog.csdn.net/yyyy2711/article/details/141108123