Experiment 1: Decision Tree Algorithms
Name: 许珂
Student ID: 201613344
【Objectives】
- Understand the principles of decision tree algorithms and master the overall algorithm framework;
- Understand feature selection, tree generation, and tree pruning in decision tree learning;
- Be able to choose an appropriate decision tree algorithm for different data types;
- Be able to apply decision tree algorithms to solve practical problems for a given application scenario and dataset.
【Contents】
- Design and implement functions for entropy, empirical conditional entropy, and information gain.
- Implement the ID3 algorithm on the given loan dataset (see Appendix 1 for the data table).
- Get familiar with the decision tree algorithms in the sklearn library;
- Apply sklearn's decision tree algorithm to class prediction on the iris dataset.
【Report Requirements】
- Following the experiment contents, write up the procedure, the algorithms, and the test results;
- Keep the code clean: naming conventions and comments;
- Survey the literature and discuss the application scenarios of the ID3 and C4.5 algorithms;
- Survey the literature and analyze decision-tree pruning strategies.
Experiment Code
1. Design and implement entropy, empirical conditional entropy, and information gain:
```python
# Imports needed for this experiment
import numpy as np
import pandas as pd  # the original line was truncated to a bare "import"; pandas is used below
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import Counter
import math
from math import log
import pprint
```
```python
# Dataset and feature labels
def create_data():
    datasets = [['青年', '否', '否', '一般', '否'],
                ['青年', '否', '否', '好', '否'],
                ['青年', '是', '否', '好', '是'],
                ['青年', '是', '是', '一般', '是'],
                ['青年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '一般', '否'],
                ['中年', '否', '否', '好', '否'],
                ['中年', '是', '是', '好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['中年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '非常好', '是'],
                ['老年', '否', '是', '好', '是'],
                ['老年', '是', '否', '好', '是'],
                ['老年', '是', '否', '非常好', '是'],
                ['老年', '否', '否', '一般', '否']]
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    # Return the dataset and the name of each column
    return datasets, labels
```
```python
datasets, labels = create_data()
train_data = pd.DataFrame(datasets, columns=labels)
train_data
```
Output:
Computing the entropy of the dataset
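Concretely, with D the dataset and C_k the subset of samples in class k, the empirical entropy follows the standard definition

H(D) = -Σ_k (|C_k| / |D|) · log2(|C_k| / |D|)

which is exactly what calc_ent below computes over the label column.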
```python
def calc_ent(datasets):
    """Empirical entropy H(D) of the label column."""
    data_length = len(datasets)
    label_count = {}
    for i in range(data_length):
        label = datasets[i][-1]
        if label not in label_count:
            label_count[label] = 0
        label_count[label] += 1
    ent = -sum([(p / data_length) * log(p / data_length, 2)
                for p in label_count.values()])
    return ent


ent = calc_ent(datasets)
ent

# An equivalent NumPy version, kept commented out as in the original:
# def entropy(y):
#     """Entropy of a label sequence."""
#     hist = np.bincount(y)
#     ps = hist / np.sum(hist)
#     return -np.sum([p * np.log2(p) for p in ps if p > 0])
```
Output:
Computing the empirical conditional entropy
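Given a feature A that partitions D into subsets D_1, …, D_n (one per feature value), the empirical conditional entropy is

H(D|A) = Σ_i (|D_i| / |D|) · H(D_i)

i.e. the entropy of each subset weighted by its share of the data, as computed by cond_ent below.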
```python
def cond_ent(datasets, axis=0):
    """Empirical conditional entropy H(D|A) for the feature at index `axis`."""
    data_length = len(datasets)
    feature_sets = {}
    for i in range(data_length):
        feature = datasets[i][axis]
        if feature not in feature_sets:
            feature_sets[feature] = []
        feature_sets[feature].append(datasets[i])
    cond_ent = sum([(len(p) / data_length) * calc_ent(p)
                    for p in feature_sets.values()])
    return cond_ent


# Note: the original rebound the name cond_ent to the result, which shadows
# the function and breaks the later calls in info_gain_train; use a new name.
cond_ent_age = cond_ent(datasets)
cond_ent_age
```
Output:
Computing the information gain of the dataset:
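Information gain is the reduction in entropy obtained by conditioning on a feature A:

g(D, A) = H(D) − H(D|A)

At every node, ID3 selects the feature with the largest g(D, A).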
```python
def info_gain(ent, cond_ent):
    return ent - cond_ent


def info_gain_train(datasets):
    count = len(datasets[0]) - 1
    ent = calc_ent(datasets)
    # ent = entropy(datasets)
    best_feature = []
    for c in range(count):
        c_info_gain = info_gain(ent, cond_ent(datasets, axis=c))
        best_feature.append((c, c_info_gain))
        print('特征({}) - info_gain - {:.3f}'.format(labels[c], c_info_gain))
    # Compare the gains and keep the largest
    best_ = max(best_feature, key=lambda x: x[-1])
    return '特征({})的信息增益最大,选择为根节点特征'.format(labels[best_[0]])


info_gain_train(np.array(datasets))
```
Output:
2. Implementing the ID3 algorithm on the given loan dataset:
```python
# Generate a decision tree with the ID3 algorithm
class Node:
    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        self.root = root
        self.label = label
        self.feature_name = feature_name
        self.feature = feature
        self.tree = {}
        self.result = {
            'label': self.label,
            'feature': self.feature,
            'tree': self.tree}

    def __repr__(self):
        return '{}'.format(self.result)

    def add_node(self, val, node):
        self.tree[val] = node

    def predict(self, features):
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)


class DTree:
    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon
        self._tree = {}

    # Entropy
    @staticmethod
    def calc_ent(datasets):
        data_length = len(datasets)
        label_count = {}
        for i in range(data_length):
            label = datasets[i][-1]
            if label not in label_count:
                label_count[label] = 0
            label_count[label] += 1
        ent = -sum([(p / data_length) * log(p / data_length, 2)
                    for p in label_count.values()])
        return ent

    # Empirical conditional entropy
    def cond_ent(self, datasets, axis=0):
        data_length = len(datasets)
        feature_sets = {}
        for i in range(data_length):
            feature = datasets[i][axis]
            if feature not in feature_sets:
                feature_sets[feature] = []
            feature_sets[feature].append(datasets[i])
        cond_ent = sum([(len(p) / data_length) * self.calc_ent(p)
                        for p in feature_sets.values()])
        return cond_ent

    # Information gain
    @staticmethod
    def info_gain(ent, cond_ent):
        return ent - cond_ent

    def info_gain_train(self, datasets):
        count = len(datasets[0]) - 1
        ent = self.calc_ent(datasets)
        best_feature = []
        for c in range(count):
            c_info_gain = self.info_gain(ent, self.cond_ent(datasets, axis=c))
            best_feature.append((c, c_info_gain))
        # Compare the gains and keep the largest
        best_ = max(best_feature, key=lambda x: x[-1])
        return best_

    def train(self, train_data):
        """
        input: dataset D (DataFrame), feature set A, threshold eta
        output: decision tree T
        """
        _, y_train, features = train_data.iloc[:, :-1], \
            train_data.iloc[:, -1], train_data.columns[:-1]
        # 1. If all instances in D belong to one class Ck, T is a single-node
        #    tree labelled Ck; return T
        if len(y_train.value_counts()) == 1:
            return Node(root=True, label=y_train.iloc[0])
        # 2. If A is empty, T is a single-node tree labelled with the majority
        #    class Ck in D; return T
        if len(features) == 0:
            return Node(
                root=True,
                label=y_train.value_counts().sort_values(
                    ascending=False).index[0])
        # 3. Otherwise let Ag be the feature with the largest information gain
        max_feature, max_info_gain = self.info_gain_train(np.array(train_data))
        max_feature_name = features[max_feature]
        # 4. If Ag's gain is below the threshold eta, T is a single-node tree
        #    labelled with the majority class Ck in D; return T
        if max_info_gain < self.epsilon:
            return Node(
                root=True,
                label=y_train.value_counts().sort_values(
                    ascending=False).index[0])
        # 5. Split D into subsets, one per value of Ag
        node_tree = Node(
            root=False, feature_name=max_feature_name, feature=max_feature)
        feature_list = train_data[max_feature_name].value_counts().index
        for f in feature_list:
            sub_train_df = train_data.loc[train_data[max_feature_name] ==
                                          f].drop([max_feature_name], axis=1)
            # 6. Recurse on each subset
            sub_tree = self.train(sub_train_df)
            node_tree.add_node(f, sub_tree)
        # pprint.pprint(node_tree.tree)
        return node_tree

    def fit(self, train_data):
        self._tree = self.train(train_data)
        return self._tree

    def predict(self, X_test):
        return self._tree.predict(X_test)
```
```python
# Build the decision tree
datasets, labels = create_data()
data_df = pd.DataFrame(datasets, columns=labels)
dt = DTree()
tree = dt.fit(data_df)
tree
```
Output:
```python
dt.predict(['老年', '否', '否', '一般'])
```
Output:
3. Decision trees in sklearn: first on the loan dataset, then class prediction on the iris dataset:
```python
import numpy as np
import pandas as pd
from sklearn import tree
from graphviz import Source


def origalData():
    dataSet = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    labels = [u'年龄', u'有工作', u'有自己的房子', u'信贷情况', u'类别']
    return dataSet, labels  # return the dataset and the feature labels


if __name__ == '__main__':
    dataset, labels = origalData()
    datasetFrame = pd.DataFrame(dataset)
    print("datasetFrame:{}".format(datasetFrame))
    # sklearn trees require numeric input, so encode the categorical strings
    # as integer codes first (the original passed the raw strings, which
    # raises a ValueError inside fit)
    X_train = datasetFrame.iloc[:, :-1].apply(lambda col: pd.factorize(col)[0])
    Y_train = pd.factorize(datasetFrame.iloc[:, -1])[0]
    clf = tree.DecisionTreeClassifier(criterion='gini', max_depth=4)
    clf = clf.fit(X_train, Y_train)
    graph = Source(tree.export_graphviz(clf, out_file=None))
    graph.format = 'png'
    graph.render('dtYesNo', view=True)
    print('X_train:{}\nY_train:{}'.format(X_train, Y_train))
    # print("dataset:{}\nlabels:{}".format(dataset, labels))
```
Output:
```python
import numpy as np
import pandas as pd
from graphviz import Source
from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width',
                  'petal length', 'petal width', 'label']
    # Use the first 100 samples (classes 0 and 1) and the first two features
    data = np.array(df.iloc[:100, [0, 1, -1]])
    print('data:')
    print(data)
    return data[:, :2], data[:, -1]


if __name__ == '__main__':
    X, y = create_data()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
    # print(X_train, X_test, y_train, y_test)
    clf = DecisionTreeClassifier(max_depth=4)
    print(clf.fit(X_train, y_train))
    print(clf.score(X_test, y_test))
    predict_results = clf.predict(X_test)  # predict on the held-out test set
    print(predict_results)
    print(accuracy_score(predict_results, y_test))
    graph = Source(tree.export_graphviz(clf, out_file=None))
    graph.format = 'png'
    graph.render('dt', view=True)
```
Output:
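Note that create_data above deliberately restricts iris to its first 100 rows (classes 0 and 1 only) and the first two features, so the task is binary. As a small variant (not in the original report), the same classifier handles the full three-class problem with all four features:

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Full three-class iris problem with all four features
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)
clf = DecisionTreeClassifier(max_depth=4).fit(X_train, y_train)
print(clf.score(X_test, y_test))  # typically around 0.95 on this split
```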
Experiment summary:
Discussion: application scenarios of the ID3 algorithm

Through this experiment I gained a firmer grasp of decision tree learning and of the ID3 algorithm in particular. Its underlying theory is clear, the algorithm itself is simple, and it learns effectively, which makes it well suited to large-scale learning problems. It is a classic example in data mining and knowledge discovery, and it laid the theoretical groundwork for the many improved algorithms proposed later; ID3 has accordingly seen extensive development and use in machine learning, knowledge discovery, and data mining.
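The report requirements also ask about C4.5. Its main difference from ID3 is that it selects features by gain ratio rather than raw information gain, which reduces the bias toward features with many distinct values (C4.5 additionally handles continuous attributes and missing values). A minimal sketch, reusing the calc_ent and cond_ent helpers defined earlier in this report; the helper names feature_ent and gain_ratio are my own:

```python
from math import log

def feature_ent(datasets, axis=0):
    """Split information H_A(D): entropy of the feature's own value distribution."""
    data_length = len(datasets)
    feature_count = {}
    for i in range(data_length):
        value = datasets[i][axis]
        feature_count[value] = feature_count.get(value, 0) + 1
    return -sum((c / data_length) * log(c / data_length, 2)
                for c in feature_count.values())

def gain_ratio(datasets, axis=0):
    """C4.5's criterion: information gain normalised by the split information."""
    g = calc_ent(datasets) - cond_ent(datasets, axis=axis)
    split = feature_ent(datasets, axis=axis)
    return g / split if split > 0 else 0.0  # guard: a constant feature has H_A(D) = 0
```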
Analysis of decision-tree pruning strategies:
- How pruning is performed

The dataset is first split into a training set and a validation set. The training set determines which attribute each node splits on as the tree is generated; the validation set is used in pre-pruning to decide whether a node is worth expanding on that attribute at all, and in post-pruning to decide whether a node should be pruned back. The purpose of pruning is to avoid overfitting: during learning, the algorithm keeps splitting nodes so as to classify the training samples as correctly as possible, which leaves the tree with too many branches and makes it overfit. The two most basic pruning strategies are pre-pruning and post-pruning, summarized below (with a small sklearn sketch after that):
Pre-pruning
Prunes the tree by stopping its construction early: each node is estimated before it is split, and if the split would not improve the generalization performance of the tree, the node is not expanded and becomes a leaf holding the most frequent class among its subset's tuples.
Post-pruning
Grows the complete tree first and then examines its subtrees bottom-up, replacing a subtree with a leaf whenever doing so does not hurt performance on the validation set.
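As a concrete illustration (a sketch against sklearn's public API, not part of the original experiment code): pre-pruning corresponds to growth-limiting constructor parameters such as max_depth and min_samples_leaf, while sklearn's built-in post-pruning is minimal cost-complexity pruning, controlled by ccp_alpha and tuned here on a held-out validation set:

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Pre-pruning: stop growth early via constructor constraints
pre = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5).fit(X_train, y_train)

# Post-pruning: grow the full tree, then prune by cost-complexity;
# candidate alphas come from the pruning path, chosen on the validation set
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(
    X_train, y_train)
best = max(
    (DecisionTreeClassifier(random_state=0, ccp_alpha=a).fit(X_train, y_train)
     for a in path.ccp_alphas),
    key=lambda m: m.score(X_val, y_val))

print(pre.score(X_val, y_val), best.score(X_val, y_val))
```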