Definition
Input: training dataset $T=\{(x_1,y_1),(x_2,y_2),\cdots,(x_N,y_N)\}$, where $x_i \in \mathcal{X} \subseteq \mathbf{R}^n$ and $y_i \in \mathcal{Y} = \{-1,+1\}$; a weak learning algorithm.
Output: the final classifier $G(x)$.
(1) Initialize the weight distribution over the training data:
$$D_1=(\omega_{11},\cdots,\omega_{1i},\cdots,\omega_{1N}), \quad \omega_{1i}=\frac{1}{N}, \quad i=1,2,\cdots,N$$
(2) For $m=1,2,\cdots,M$:
(a) Learn from the training data weighted by $D_m$ to obtain a base classifier
$$G_m(x): \mathcal{X} \rightarrow \{-1,+1\}$$
(b) Compute the classification error rate of $G_m(x)$ on the training data:
$$e_m=\sum_{i=1}^N P(G_m(x_i) \ne y_i) = \sum_{i=1}^N \omega_{mi} I(G_m(x_i) \ne y_i)$$
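As a toy illustration (numbers invented for this example): with $N=4$ samples under the uniform initial weights $\omega_{1i}=0.25$, a base classifier that misclassifies exactly one sample has $e_1 = 0.25$.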
(c) Compute the coefficient of $G_m(x)$:
$$\alpha_m = \frac{1}{2} \log \frac{1-e_m}{e_m}$$
where the logarithm is the natural logarithm.
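Continuing the toy example: $\alpha_1 = \frac{1}{2}\ln\frac{1-0.25}{0.25} = \frac{1}{2}\ln 3 \approx 0.549$. Note that $\alpha_m > 0$ whenever $e_m < 0.5$, and it grows as the error shrinks, so more accurate base classifiers receive larger votes.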
(d) Update the weight distribution over the training data:
$$D_{m+1} = (\omega_{m+1,1},\cdots,\omega_{m+1,i},\cdots,\omega_{m+1,N})$$
$$\omega_{m+1,i} = \frac{\omega_{mi}}{Z_m} \exp(-\alpha_m y_i G_m(x_i)), \quad i=1,2,\cdots,N$$
where $Z_m$ is the normalization factor
$$Z_m = \sum_{i=1}^N \omega_{mi} \exp(-\alpha_m y_i G_m(x_i))$$
which makes $D_{m+1}$ a probability distribution.
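Continuing the toy example: each correctly classified sample is scaled by $e^{-\alpha_1} = 1/\sqrt{3} \approx 0.577$ and the misclassified one by $e^{\alpha_1} = \sqrt{3} \approx 1.732$; with $Z_1 = \sqrt{3}/2 \approx 0.866$, the new weights come out to $(1/6, 1/6, 1/6, 1/2)$. The misclassified sample now carries half the total weight, forcing the next base classifier to pay attention to it.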
(3) Construct a linear combination of the base classifiers:
$$f(x) = \sum_{m=1}^M \alpha_m G_m(x)$$
and obtain the final classifier:
$$G(x) = \mathrm{sign}(f(x)) = \mathrm{sign}\bigg( \sum_{m=1}^M \alpha_m G_m(x) \bigg)$$
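To make steps (1)-(3) concrete, here is a minimal NumPy sketch of the algorithm using 1-D threshold stumps as base classifiers; the function names (adaboost, adaboost_predict) and the toy data are invented for illustration and are not part of the reference implementation below.

import numpy as np

def adaboost(X, y, M=10):
    '''Minimal AdaBoost with 1-D threshold stumps, following steps (1)-(3).'''
    N = len(y)
    w = np.full(N, 1.0 / N)                      # step (1): uniform initial weights
    stumps = []                                  # list of (threshold, polarity, alpha)
    for m in range(M):
        best = None
        # step (2a): pick the stump with the lowest weighted error
        for thr in np.unique(X):
            for pol in (1, -1):
                pred = np.where(X < thr, pol, -pol)
                e = w[pred != y].sum()           # step (2b): weighted error rate
                if best is None or e < best[0]:
                    best = (e, thr, pol, pred)
        e, thr, pol, pred = best
        e = min(max(e, 1e-10), 1 - 1e-10)        # guard against log(0)
        alpha = 0.5 * np.log((1 - e) / e)        # step (2c): coefficient
        w = w * np.exp(-alpha * y * pred)        # step (2d): reweight ...
        w /= w.sum()                             # ... and normalize by Z_m
        stumps.append((thr, pol, alpha))
    return stumps

def adaboost_predict(X, stumps):
    # step (3): sign of the weighted vote f(x)
    f = sum(a * np.where(X < thr, pol, -pol) for thr, pol, a in stumps)
    return np.sign(f)

# toy 1-D data (invented): no single threshold separates the labels
X_toy = np.arange(10, dtype=float)
y_toy = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
stumps = adaboost(X_toy, y_toy, M=10)
print((adaboost_predict(X_toy, stumps) == y_toy).mean())  # training accuracy; reaches 1.0 here

No single stump classifies this toy set correctly, but the weighted vote of a few stumps does, which is exactly the point of boosting.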
Input Space
$$T=\left\{(x_1,y_1),(x_2,y_2),\dots,(x_N,y_N)\right\}$$
import time
import numpy as np

def loadData(fileName):
    '''
    Load the Mnist dataset (download: https://download.csdn.net/download/nanxiaotao/89720991)
    :param fileName: path of the file to load
    :return: data list and label list
    '''
    # containers for data and labels
    dataArr = []; labelArr = []
    # open the file
    fr = open(fileName)
    # iterate over every line in the file
    for line in fr.readlines():
        curLine = line.strip().split(',')
        # binarize each pixel: 1 if the gray value exceeds 128, else 0
        dataArr.append([int(int(num) > 128) for num in curLine[1:]])
        # binary task: digit 0 is the positive class (+1), every other digit is -1
        if int(curLine[0]) == 0:
            labelArr.append(1)
        else:
            labelArr.append(-1)
    # return data and labels
    return dataArr, labelArr
trainDataList, trainLabelList = loadData('../Mnist/mnist_train.csv')
np.shape(trainDataList)
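Assuming the standard 60,000-row MNIST training CSV, this should report a shape of (60000, 784): one row per image, each flattened to 784 binarized pixels.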
Feature Space
trainDataList[0][0:784]
Statistical Learning Method
Model
$$G(x) = \mathrm{sign}(f(x)) = \mathrm{sign}\bigg( \sum_{m=1}^M \alpha_m G_m(x) \bigg)$$
Strategy
$$\omega_{m+1,i} = \frac{\omega_{mi}}{Z_m} \exp(-\alpha_m y_i G_m(x_i)), \quad i=1,2,\cdots,N$$
$$(\alpha_m, G_m(x)) = \arg\min_{\alpha, G} \sum_{i=1}^N \exp\Big[ -y_i \big( f_{m-1}(x_i) + \alpha G(x_i) \big) \Big]$$
where $f(x) = \sum_{m=1}^M \alpha_m G_m(x)$.
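Fixing $G$ and setting the derivative of this exponential-loss objective with respect to $\alpha$ to zero recovers the coefficient formula of step (2c); a sketch of the standard derivation, writing $\bar{\omega}_{mi} = \exp(-y_i f_{m-1}(x_i))$:
$$\frac{\partial}{\partial \alpha} \sum_{i=1}^N \bar{\omega}_{mi} e^{-y_i \alpha G(x_i)} = -e^{-\alpha} \sum_{y_i = G(x_i)} \bar{\omega}_{mi} + e^{\alpha} \sum_{y_i \ne G(x_i)} \bar{\omega}_{mi} = 0 \;\Longrightarrow\; \alpha_m = \frac{1}{2} \log \frac{1-e_m}{e_m}$$
where $e_m$ is the weighted error rate of $G$ under the normalized weights $\bar{\omega}_{mi}$.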
Algorithm
trainDataList = trainDataList[:10000]
trainLabelList = trainLabelList[:10000]
treeNum = 40  # number of layers of the boosting tree (boosting rounds, one stump per round)
$$e_m = \sum_{i=1}^N P(G_m(x_i) \neq y_i) = \sum_{i=1}^N \omega_{mi} I(G_m(x_i) \neq y_i)$$
def calc_e_Gx(trainDataArr, trainLabelArr, n, div, rule, D):
    '''
    Compute the weighted classification error rate of a stump
    :param trainDataArr: training data array
    :param trainLabelArr: training label array
    :param n: index of the feature to operate on
    :param div: split point
    :param rule: which side of the split predicts +1
    :param D: weight distribution D
    :return: predictions, weighted classification error rate
    '''
    # initialize the classification error rate to 0
    e = 0
    x = trainDataArr[:, n]
    y = trainLabelArr
    predict = []
    # 'LisOne': values below the split predict +1; otherwise values above do
    if rule == 'LisOne': L = 1; H = -1
    else: L = -1; H = 1
    # iterate over feature n of every sample
    for i in range(trainDataArr.shape[0]):
        if x[i] < div:
            predict.append(L)
            if y[i] != L: e += D[i]
        elif x[i] >= div:
            predict.append(H)
            if y[i] != H: e += D[i]
    return np.array(predict), e
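A quick sanity check of calc_e_Gx on a tiny hand-made array (shapes and values invented for illustration):

Xs = np.array([[0], [1], [1], [0]])  # 4 samples, 1 binarized feature
ys = np.array([1, -1, -1, 1])
D0 = np.array([0.25] * 4)
# feature 0, split at 0.5, 'LisOne' predicts +1 below the split
pred, err = calc_e_Gx(Xs, ys, 0, 0.5, 'LisOne', D0)
print(pred, err)  # a perfect stump here: [ 1 -1 -1  1 ], error 0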
$$\alpha_m = \frac{1}{2} \log \frac{1-e_m}{e_m}$$
$$\omega_{m+1,i} = \frac{\omega_{mi}}{Z_m} \exp(-\alpha_m y_i G_m(x_i)), \quad i=1,2,\cdots,N$$
$$D_{m+1} = (\omega_{m+1,1},\cdots,\omega_{m+1,i},\cdots,\omega_{m+1,N})$$
def createSigleBoostingTree(trainDataArr, trainLabelArr, D):
    '''
    Create a single-layer boosting tree (decision stump)
    :param trainDataArr: training data array
    :param trainLabelArr: training label array
    :param D: weight distribution D
    :return: the single-layer boosting tree
    '''
    # get the number of samples and features
    m, n = np.shape(trainDataArr)
    sigleBoostTree = {}
    # the error rate can be at most 100%, so initialize it to 1
    sigleBoostTree['e'] = 1
    # iterate over every feature to find the best one to split on
    for i in range(n):
        # features are binarized to 0/1, so split points -0.5, 0.5 and 1.5 cover all cases
        for div in [-0.5, 0.5, 1.5]:
            for rule in ['LisOne', 'HisOne']:
                # split on feature i at value div and get the predictions and error rate
                Gx, e = calc_e_Gx(trainDataArr, trainLabelArr, i, div, rule, D)
                # if the error rate e beats the current minimum, keep this stump
                if e < sigleBoostTree['e']:
                    sigleBoostTree['e'] = e
                    # also store the split point, rule, predictions and feature index
                    # for the update of D and for later prediction
                    sigleBoostTree['div'] = div
                    sigleBoostTree['rule'] = rule
                    sigleBoostTree['Gx'] = Gx
                    sigleBoostTree['feature'] = i
    # return the single-layer boosting tree
    return sigleBoostTree
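Reusing the toy arrays from the calc_e_Gx check above, a hypothetical smoke test of the stump search (each round scans every feature, 3 split points and 2 rules, so its cost is O(features × samples) per round):

stump = createSigleBoostingTree(Xs, ys, D0)
print(stump['feature'], stump['div'], stump['rule'], stump['e'])
# expect feature 0, div 0.5, rule 'LisOne', error 0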
def createBosstingTree(trainDataList, trainLabelList, treeNum = 50):
    '''
    Create the boosting tree
    :param trainDataList: training data set
    :param trainLabelList: training label set
    :param treeNum: number of layers (boosting rounds)
    :return: the boosting tree
    '''
    # convert the data and labels to arrays
    trainDataArr = np.array(trainDataList)
    trainLabelArr = np.array(trainLabelList)
    # running combined prediction f(x), updated after each layer is added
    finallpredict = np.zeros(len(trainLabelArr))
    # get the number of samples and features
    m, n = np.shape(trainDataArr)
    # initialize the weight distribution D_1 uniformly
    D = [1 / m] * m
    # the boosting tree as a list, one layer per position
    tree = []
    # build the boosting tree layer by layer
    for i in range(treeNum):
        curTree = createSigleBoostingTree(trainDataArr, trainLabelArr, D)
        # alpha_m = (1/2) * log((1 - e_m) / e_m)
        alpha = 1 / 2 * np.log((1 - curTree['e']) / curTree['e'])
        Gx = curTree['Gx']
        # weight update: w_{m+1,i} = w_{mi} * exp(-alpha * y_i * G_m(x_i)) / Z_m,
        # where Z_m is the sum of the updated weights, making D a probability distribution
        D = np.multiply(D, np.exp(-1 * alpha * np.multiply(trainLabelArr, Gx)))
        D = D / sum(D)
        curTree['alpha'] = alpha
        tree.append(curTree)
        finallpredict += alpha * Gx
        # training error rate of the combined classifier so far
        error = sum([1 for i in range(len(trainDataList)) if np.sign(finallpredict[i]) != trainLabelArr[i]])
        finallError = error / len(trainDataList)
        # stop early once the training error reaches 0
        if finallError == 0: return tree
        print('iter:%d:%d, single error:%.4f, final error:%.4f' % (i, treeNum, curTree['e'], finallError))
    # return the whole boosting tree
    return tree
tree = createBosstingTree(trainDataList,trainLabelList,treeNum)
Hypothesis Space
$$\left\{ f \;\middle|\; f(x) = \mathrm{sign}\bigg( \sum_{m=1}^M \alpha_m G_m(x) \bigg) \right\}$$
Output Space
$$y \in \mathcal{Y} = \{-1, +1\}$$
Model Evaluation
Test accuracy
testDataList, testLabelList = loadData('../Mnist/mnist_test.csv')
testDataList = testDataList[:1000]
testLabelList = testLabelList[:1000]
def predict(x, div, rule, feature):
    '''
    Output the prediction of a single layer (stump)
    :param x: sample to predict
    :param div: split point
    :param rule: split rule
    :param feature: feature to operate on
    :return: the stump's prediction for x
    '''
    # set the labels below and above the split point according to the rule
    if rule == 'LisOne': L = 1; H = -1
    else: L = -1; H = 1
    # return the prediction
    if x[feature] < div: return L
    else: return H
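A hypothetical single-stump check, querying the first layer of the trained tree on one test sample:

t0 = tree[0]
print(predict(testDataList[0], t0['div'], t0['rule'], t0['feature']))  # prints +1 or -1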
def model_test(testDataList, testLabelList, tree):
    '''
    Test the model
    :param testDataList: test data set
    :param testLabelList: test label set
    :param tree: the boosting tree
    :return: accuracy
    '''
    # error counter
    errorCnt = 0
    # iterate over every test sample
    for i in range(len(testDataList)):
        # accumulated prediction value, initially 0
        result = 0
        # iterate over every layer of the tree
        for curTree in tree:
            # read this layer's parameters
            div = curTree['div']
            rule = curTree['rule']
            feature = curTree['feature']
            alpha = curTree['alpha']
            # add this layer's weighted vote to the prediction
            result += alpha * predict(testDataList[i], div, rule, feature)
        # the final prediction is the sign of the sum: +1 if positive, -1 otherwise
        if np.sign(result) != testLabelList[i]: errorCnt += 1
    # return the accuracy
    return 1 - errorCnt / len(testDataList)
accuracy = model_test(testDataList[:1000], testLabelList[:1000], tree)
print('the accuracy is: %.2f%%' % (accuracy * 100))
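As a cross-check, the same binary task can be run through scikit-learn's AdaBoostClassifier, which uses depth-1 trees as the same kind of decision stumps (this assumes scikit-learn is installed; parameter names vary across versions, with older releases using base_estimator instead of estimator):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# 40 stumps, matching treeNum above; SAMME with two classes reduces to classic AdaBoost
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=40,
    algorithm='SAMME',
)
clf.fit(trainDataList, trainLabelList)
print(clf.score(testDataList, testLabelList))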