csv的内容为:
运行的代码为:
from sklearn.feature_extraction import DictVectorizer import csv from sklearn import tree,preprocessing from six import StringIO allElectronicsData = open(r'/Users/zhangsanfeng/Documents/jupyterNotebook/AllElectronics.csv') reader = csv.reader(allElectronicsData) headers = next(reader) # 获取表头 print(headers) featureList = [] labelList = [] for row in reader: labelList.append(row[len(row)-1]) # 取该行最后一个值 rowDict = {} for i in range(1,len(row)-1): # 从第二个值开始轮询该行,到倒数第二个值结束 rowDict[headers[i]] = row[i] # 用headers里面的值和该行的值组装成一个键值对 featureList.append(rowDict) # 将键值对拼接到列表里面 print(featureList) print(labelList) vec = DictVectorizer() dummyX = vec.fit_transform(featureList).toarray() # 将键值对中的值转换为坐标值 print("dummyX: " + str(dummyX)) # 打印转换后的结果 print(vec.get_feature_names()) # 打印每一项标识的含义 print("labelList: " + str(labelList)) # labelList尚未转换 lb = preprocessing.LabelBinarizer() dummyY = lb.fit_transform(labelList) # 转换labelList print("dummyY: " + str(dummyY)) # 使用决策树进行分类 clf = tree.DecisionTreeClassifier(criterion='entropy') clf = clf.fit(dummyX, dummyY) print("clf: " + str(clf)) # 可视化模型,将结果写入到文件 with open("/Users/zhangsanfeng/Documents/jupyterNotebook/allElectronicInformationGainOri.dot", 'w') as f: f = tree.export_graphviz(clf, feature_names=vec.get_feature_names(), out_file=f) oneRowX = dummyX[0, :] print("oneRowX: " + str(oneRowX)) newRowX = oneRowX newRowX[0] = 1 newRowX[2] = 0 newRowX = newRowX.reshape(1, -1) # 这里需要将1维数组转换为2维数组,不然后面会报错 print("newRowX: " + str(newRowX)) predictedY = clf.predict(newRowX) # 验证测试结果 print("predictedY: " + str(predictedY))
输出结果为:
['RID', 'age', 'income', 'student', 'credit_rating', 'class_buys_computer'] [{'age': 'youth', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'high', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'high', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'low', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'youth', 'income': 'medium', 'student': 'no', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'low', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'youth', 'income': 'medium', 'student': 'yes', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}, {'age': 'middle_aged', 'income': 'high', 'student': 'yes', 'credit_rating': 'fair'}, {'age': 'senior', 'income': 'medium', 'student': 'no', 'credit_rating': 'excellent'}] ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no'] dummyX: [[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.] [0. 0. 1. 1. 0. 1. 0. 0. 1. 0.] [1. 0. 0. 0. 1. 1. 0. 0. 1. 0.] [0. 1. 0. 0. 1. 0. 0. 1. 1. 0.] [0. 1. 0. 0. 1. 0. 1. 0. 0. 1.] [0. 1. 0. 1. 0. 0. 1. 0. 0. 1.] [1. 0. 0. 1. 0. 0. 1. 0. 0. 1.] [0. 0. 1. 0. 1. 0. 0. 1. 1. 0.] [0. 0. 1. 0. 1. 0. 1. 0. 0. 1.] [0. 1. 0. 0. 1. 0. 0. 1. 0. 1.] [0. 0. 1. 1. 0. 0. 0. 1. 0. 1.] [1. 0. 0. 1. 0. 0. 0. 1. 1. 0.] [1. 0. 0. 0. 1. 1. 0. 0. 0. 1.] [0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]] ['age=middle_aged', 'age=senior', 'age=youth', 'credit_rating=excellent', 'credit_rating=fair', 'income=high', 'income=low', 'income=medium', 'student=no', 'student=yes'] labelList: ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes', 'yes', 'yes', 'yes', 'no'] dummyY: [[0] [0] [1] [1] [1] [0] [1] [0] [1] [1] [1] [1] [1] [0]] clf: DecisionTreeClassifier(criterion='entropy') oneRowX: [0. 0. 1. 0. 1. 1. 0. 0. 1. 0.] newRowX: [[1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]] predictedY: [1]
标签:rating,no,--,age,笔记,credit,income,yes,决策树 From: https://www.cnblogs.com/tortoise512/p/16942008.html