0003 机器学习实战第三章分类

标签：实战, 第三章, 0003, self, father, train, print, False, clf

1 本章所有示例代码

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""
路径    : lesson03.py
标题    : 分类
创建    : 2022-10-15 17:46
更新    : 2022-10-15 17:46
编写    : 陈倚云
"""
import time
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


class Never5Clissifier(BaseEstimator):
    """
    Baseline "never 5" classifier: it answers False for every sample.

    The task is only "5 or not 5", so even an all-False answer reaches an
    accuracy above 90%; this is why accuracy alone is a weak metric here.
    """

    def fit(self, X, y=None):
        """
        No-op fit kept so the class offers the usual estimator interface.

        :param X: training samples (ignored)
        :param y: training labels (ignored)
        :return: this estimator, unchanged
        """
        return self

    def predict(self, X):
        """
        Predict "not 5" for every sample.

        :param X: samples to classify; only ``len(X)`` is used
        :return: boolean array of shape ``(len(X), 1)`` filled with False
        """
        sample_count = len(X)
        return np.zeros(shape=(sample_count, 1), dtype=bool)


class Classifies:
    """
    分类
    """

    def __init__(self):
        """
        Download MNIST and prepare the data shared by all chapter demos.

        Attributes set here:
            mnist: the raw result of ``fetch_openml``
            X, y: full feature table and labels (labels cast to ``np.uint8``)
            X_train, y_train: first 60000 rows as NumPy arrays
            X_test, y_test: remaining rows as NumPy arrays
            size: side length of one image in pixels
            random_state: seed passed to the estimators that accept one
            some_digit: pixel row of the first image
            y_train_5: boolean training labels, True where the digit is 5
        """
        # mnist_784 holds 70000 images with 784 features each.
        # as_frame=True is requested explicitly because the code below (and
        # other methods) use the pandas `.values` accessor on the result.
        self.mnist = fetch_openml('mnist_784', version=1, as_frame=True)
        self.X, self.y = self.mnist['data'], self.mnist['target']
        # pandas astype: convert the labels to small unsigned integers.
        self.y = self.y.astype(np.uint8)
        features = self.X.values
        labels = self.y.values
        self.X_train, self.X_test = features[:60000], features[60000:]
        self.y_train, self.y_test = labels[:60000], labels[60000:]
        self.size = 28  # image width and height in pixels
        self.random_state = 42  # random seed
        self.some_digit = self.X.values[0]  # first record
        self.y_train_5 = (self.y_train == 5)  # binary target: True for 5, False otherwise

    def plot_digits(self, instances, images_per_row=10, **options):
        """
        Draw digit images as one tiled grid on the current matplotlib axes.

        :param instances: sequence of flat pixel arrays, each reshapeable to
            ``(self.size, self.size)``
        :param images_per_row: maximum number of digits per grid row
        :param options: extra keyword arguments forwarded to ``plt.imshow``
        """
        if len(instances) == 0:
            # Nothing to tile. Without this guard images_per_row becomes 0 and
            # the row computation below divides by zero; leave a blank panel.
            plt.axis("off")
            return
        images_per_row = min(len(instances), images_per_row)
        images = [instance.reshape(self.size, self.size) for instance in instances]
        # Floor division used as a ceiling: number of rows needed for all digits.
        n_rows = (len(instances) - 1) // images_per_row + 1
        n_empty = n_rows * images_per_row - len(instances)
        # One blank strip pads the last row so every row has the same width.
        images.append(np.zeros((self.size, self.size * n_empty)))
        row_images = [
            np.concatenate(images[row * images_per_row: (row + 1) * images_per_row], axis=1)
            for row in range(n_rows)
        ]
        image = np.concatenate(row_images, axis=0)
        plt.imshow(image, cmap=mpl.cm.binary, **options)
        plt.axis("off")

    class Mnist:
        """
        3.1 MNIST: display digits from the dataset.
        """

        def __init__(self, father):
            """
            :param father: owning object; its ``X``, ``y``, ``y_train``,
                ``some_digit`` and ``size`` attributes are read here
            """
            self.father = father

        def display_image_0(self):
            """
            Show and save the first MNIST image, then print its label.
            """
            some_digit_image = self.father.some_digit.reshape(self.father.size, self.father.size)
            # Grayscale rendering of the 0-255 pixel values: 0 is drawn white, 255 black.
            plt.imshow(some_digit_image, cmap='binary')
            plt.axis('off')  # hide all four axes
            # Forward slash works on every platform; a backslash is only a
            # path separator on Windows.
            plt.savefig('images/3_1_0.png')
            plt.show()
            print('---------------------------------MNIST第1张图片的标签----------------------------------')
            print(self.father.y[0])  # label of the first record

        def display_image_100(self):
            """
            Show and save the first 100 MNIST images in a 10x10 grid and print their labels.
            """
            plt.figure(figsize=(20, 20))
            for i in range(0, 100):
                plt.subplot(10, 10, i + 1)
                plt.axis('off')
                X = self.father.X.values[i].reshape(self.father.size, self.father.size)
                plt.imshow(X=X, cmap='binary')
            print('-------------------------------MNIST前100张图片的标签----------------------------------')
            print(self.father.y_train[:100])
            plt.savefig('images/3_1_100.png')
            plt.show()

    class TrainBinaryClassifier:
        """
        3.2 Training a binary classifier.

        This demo uses scikit-learn's SGDClassifier (stochastic gradient
        descent). It copes well with very large datasets, partly because SGD
        handles training instances independently, one at a time, which also
        makes it suitable for online learning.
        """

        def __init__(self, father):
            """
            :param father: owning object providing the data and random seed
            """
            self.father = father

        def get_sgd_clf(self):
            """
            Train an SGDClassifier on the "is it a 5?" target.

            :return: the classifier fitted on ``X_train`` / ``y_train_5``
            """
            outer = self.father
            classifier = SGDClassifier(random_state=outer.random_state)
            # Fit a linear model by stochastic gradient descent on the 5 / not-5 labels.
            classifier.fit(outer.X_train, outer.y_train_5)
            return classifier

        def predict(self):
            """
            Train the binary classifier and print its predictions for the first 100 images.
            """
            classifier = self.get_sgd_clf()
            first_hundred = self.father.X.values[:100]
            predictions = classifier.predict(first_hundred)
            print('-------------------------二分类器SGD预测前100个数字中哪些是5----------------------------')
            print(predictions)

    class PerformanceMeasurement:
        """
        3.3 性能测量
        """

        def __init__(self, father):
            """
            Keep a reference to the owning object.

            :param father: object whose attributes the methods of this class
                read through ``self.father``
            """
            self.father = father

        def cross_validate(self):
            """
            3.3.1 Cross-validation.

            Evaluates the accuracy of an SGDClassifier with scikit-learn's
            ``cross_val_score``. K-fold cross-validation splits the training
            set into K folds; each fold is held out once for prediction while
            the others are used for training, and one score per fold is
            returned. The three accuracies are printed.
            """
            # cross_val_score fits its own clones on every fold, so an unfitted
            # estimator is sufficient; training it on the whole training set
            # first would only cost time without changing the scores.
            sgd_clf = SGDClassifier(random_state=self.father.random_state)
            # Hand-written cross-validation, kept for reference:
            #
            # skfolds = StratifiedKFold(
            #     n_splits=3,       # number of folds, at least 2
            #     shuffle=True,     # shuffle before splitting
            #     random_state=42   # only used when shuffle is True
            # )
            # for train_index, test_index in skfolds.split(self.father.X_train, self.father.y_train_5):
            #     clone_clf = clone(sgd_clf)  # fresh copy of the classifier
            #     X_train_folds = self.father.X_train[train_index]
            #     y_train_folds = self.father.y_train_5[train_index]
            #     X_test_folds = self.father.X_train[test_index]
            #     y_test_folds = self.father.y_train_5[test_index]
            #     clone_clf.fit(X_train_folds, y_train_folds)
            #     y_pred = clone_clf.predict(X_test_folds)
            #     n_correct = sum(y_test_folds == y_pred)
            #     print(n_correct / len(y_pred))
            #
            # Same evaluation through scikit-learn:
            cross_score = cross_val_score(
                estimator=sgd_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3,
                scoring='accuracy'  # report accuracy
            )
            print('-----------------------交叉验证二元分类器SGDClassifier的准确率--------------------------')
            print(cross_score)

        def confusion_marix_5(self):
            """
            3.3.2 Confusion matrix.

            Evaluates the SGDClassifier with scikit-learn's ``confusion_matrix``
            on out-of-fold predictions and prints the matrix.

            Rows are actual classes and columns are predicted classes, in the
            order (not-5, 5), so the printed 2x2 matrix reads:
                TN  top-left      predicted negative, actually negative
                FP  top-right     predicted positive, actually negative
                FN  bottom-left   predicted negative, actually positive
                TP  bottom-right  predicted positive, actually positive
            """
            # cross_val_predict refits clones per fold; no pre-training needed.
            sgd_clf = SGDClassifier(random_state=self.father.random_state)
            y_train_pred = cross_val_predict(
                estimator=sgd_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3
            )
            con_mix = confusion_matrix(y_true=self.father.y_train_5, y_pred=y_train_pred)
            print('-----------------------混淆矩阵二元分类器SGDClassifier的准确率--------------------------')
            print(con_mix)

        def precision_and_recall(self):
            """
            3.3.3 Precision and recall.

                precision: TP / (TP + FP)
                recall:    TP / (TP + FN)
                F1 score:  TP / (TP + (FP + FN) / 2)

            Which metric matters depends on the use case, for example:
                filtering videos for children needs the highest precision
                (as few false positives as possible);
                catching criminals needs the highest recall
                (as few false negatives as possible).

            Prints the three scores computed on 3-fold out-of-fold predictions.
            """
            # cross_val_predict refits clones per fold; no pre-training needed.
            sgd_clf = SGDClassifier(random_state=self.father.random_state)
            y_train_pred = cross_val_predict(
                estimator=sgd_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3
            )
            pre_score = precision_score(y_true=self.father.y_train_5, y_pred=y_train_pred)
            re_score = recall_score(y_true=self.father.y_train_5, y_pred=y_train_pred)
            fone_score = f1_score(y_true=self.father.y_train_5, y_pred=y_train_pred)
            print('----------------------------------精度 召回率 f1分数-----------------------------------')
            print('精度\t', pre_score)
            print('召回率\t', re_score)
            print('f1分数\t', fone_score)

        def trading_off_percision_and_recall(self):
            """
            3.3.4 Precision/recall trade-off.

            The point is to pick a decision threshold that suits the use case
            and then verify it by computing precision and recall at that
            threshold. The method:
                1. prints the decision score of the first image and the
                   resulting prediction at thresholds 0 and 8000;
                2. computes out-of-fold decision scores for the training set
                   and plots precision and recall against the threshold;
                3. picks the first threshold whose precision is at least 0.90
                   and prints precision and recall at that threshold;
                4. plots precision against recall.
            Both figures are saved under ``images/``.
            """
            # A fitted classifier is required here for decision_function below.
            sgd_clf = self.father.TrainBinaryClassifier(self.father).get_sgd_clf()
            y_scores = sgd_clf.decision_function([self.father.some_digit])
            print('------------------------------------精度/召回率权衡------------------------------------')
            print('决策分数\t', y_scores)
            threshold = 0  # decision threshold
            y_some_digit_pred = (y_scores > threshold)
            print('阈值为0时,预测结果\t\t', y_some_digit_pred)
            threshold = 8000
            y_some_digit_pred = (y_scores > threshold)
            print('阈值为8000时,预测结果\t', y_some_digit_pred)
            y_scores = cross_val_predict(
                estimator=sgd_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3,
                method='decision_function'
            )
            print('-----------------y_scores--------------------')
            print(y_scores)
            # The scores are passed positionally: the keyword for the second
            # argument was renamed from `probas_pred` to `y_score` in newer
            # scikit-learn releases, and positional use works with both.
            precisions, recalls, thresholds = precision_recall_curve(self.father.y_train_5, y_scores)
            # precisions/recalls have one more entry than thresholds; drop the last one.
            plt.plot(thresholds, precisions[:-1], 'b--', label='Precision')
            plt.plot(thresholds, recalls[:-1], 'g-', label='Recall')
            plt.legend()  # the labels above are only visible with a legend
            print('----------------precision--------------------')
            print(precisions)
            print('------------------recall---------------------')
            print(recalls)
            print('----------------thresholds-------------------')
            print(thresholds)
            print('---------------------------------------------')
            # Forward slashes keep the save paths portable across platforms.
            plt.savefig('images/3_3_4_1.png')
            plt.show()
            # argmax on a boolean array gives the index of the first True value.
            threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]
            y_train_pred_90 = (y_scores >= threshold_90_precision)
            pre_score = precision_score(y_true=self.father.y_train_5, y_pred=y_train_pred_90)
            re_score = recall_score(y_true=self.father.y_train_5, y_pred=y_train_pred_90)
            print('精度\t', pre_score)
            print('召回率\t', re_score)
            # x axis: recall, y axis: precision
            plt.plot(recalls[:-1], precisions[:-1], linewidth=2, label='---')
            plt.savefig('images/3_3_4_2.png')
            plt.show()

        @staticmethod
        def plot_roc_curve(fpr, tpr, label=None):
            """
            Plot one ROC curve plus the dashed diagonal on the current axes.

            :param fpr: false positive rates (x values)
            :param tpr: true positive rates (y values)
            :param label: optional legend label for the curve
            """
            diagonal = ([0, 1], [0, 1])
            plt.plot(fpr, tpr, label=label, linewidth=2)
            plt.plot(*diagonal, 'k--')

        def draw_roc_curve(self):
            """
            3.3.5 ROC curve (Receiver Operating Characteristic curve).

                X axis: false positive rate, FP / (FP + TN) -- the share of
                        negative instances wrongly classified as positive
                        (equal to 1 - specificity).
                Y axis: true positive rate, TP / (TP + FN) -- the share of
                        positive instances correctly classified as positive
                        (also called sensitivity or recall).
                A curve that hugs the top-left corner (low FPR, high TPR) is
                better; the dashed diagonal is drawn as a reference line.

            Plots the ROC curve of the SGD classifier, prints its ROC AUC, then
            plots the SGD and random-forest ROC curves together. Both figures
            are saved under ``images/``.
            """
            # cross_val_predict refits clones per fold; no pre-training needed.
            sgd_clf = SGDClassifier(random_state=self.father.random_state)
            y_scores = cross_val_predict(
                estimator=sgd_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3,
                method='decision_function'
            )
            fpr, tpr, _ = roc_curve(y_true=self.father.y_train_5, y_score=y_scores)
            self.plot_roc_curve(fpr, tpr)
            # Forward slashes keep the save paths portable across platforms.
            plt.savefig('images/3_3_5_1.png')
            plt.show()
            ra_score = roc_auc_score(y_true=self.father.y_train_5, y_score=y_scores)
            print('----------------------------------------ROC曲线---------------------------------------')
            print('roc_auc_score\t', ra_score)
            forest_clf = RandomForestClassifier(random_state=self.father.random_state)
            y_probas_forest = cross_val_predict(
                estimator=forest_clf,
                X=self.father.X_train,
                y=self.father.y_train_5,
                cv=3,
                method='predict_proba'
            )
            # Last column: probability of the positive class, used as the score.
            y_scores_forest = y_probas_forest[:, -1]
            fpr_forest, tpr_forest, _ = roc_curve(y_true=self.father.y_train_5, y_score=y_scores_forest)
            plt.plot(fpr, tpr, 'b:', label='SGD')
            self.plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
            plt.legend(loc='lower right')
            plt.savefig('images/3_3_5_2.png')
            plt.show()

    class MuticlassClassifier:
        """
        3.4 Multiclass classification.
        """

        def __init__(self, father):
            """
            :param father: owning object providing the data and random seed
            """
            self.father = father

        def svc_classifier(self):
            """
            Train an SVC on all ten digit classes and inspect it.

            Prints the prediction for the first image, its decision scores,
            the index of the highest score and the class list; then repeats
            the prediction with the SVC wrapped in OneVsRestClassifier and
            prints the fitted per-class estimators.
            """
            outer = self.father
            sample = [outer.some_digit]
            print('-----------------------------------SVC支持向量分类器-----------------------------------')
            support_vector_clf = SVC()
            support_vector_clf.fit(outer.X_train, outer.y_train)
            print('SVC多类分类器预测\t', support_vector_clf.predict(sample))
            scores = support_vector_clf.decision_function(sample)
            print('SVC 决策分数\t\t', scores)
            print('arg_max\t\t', np.argmax(scores))
            print('svc classes:\t\t', support_vector_clf.classes_)
            one_vs_rest = OneVsRestClassifier(SVC())
            one_vs_rest.fit(outer.X_train, outer.y_train)
            print('ovr_predict\t\t', one_vs_rest.predict(sample))
            print('estmators\t\t', one_vs_rest.estimators_)

        def sgd_classifier(self):
            """
            Train an SGDClassifier on all ten digit classes and evaluate it.

            Prints the prediction and decision scores for the first image,
            then the 3-fold accuracy on the raw pixels and on pixels
            standardized with StandardScaler.
            """
            outer = self.father
            sample = [outer.some_digit]
            print('---------------------------------SGD随机梯度下降分类器----------------------------------')
            sgd_clf = SGDClassifier(random_state=outer.random_state)
            sgd_clf.fit(outer.X_train, outer.y_train)
            print('sgd_predict\t', sgd_clf.predict(sample))
            print('----------SGD decision function-----------')
            print(sgd_clf.decision_function(sample))

            def accuracy_per_fold(features):
                # 3-fold accuracy of the classifier on the given feature matrix.
                return cross_val_score(
                    estimator=sgd_clf,
                    X=features,
                    y=outer.y_train,
                    cv=3,
                    scoring='accuracy'
                )

            print('-----------SGD Cross val score------------')
            print(accuracy_per_fold(outer.X_train))
            print('------简单缩放 Cross val score------------')
            pixels_as_float = outer.X_train.astype(np.float64)
            X_train_scaled = StandardScaler().fit_transform(pixels_as_float)
            print(accuracy_per_fold(X_train_scaled))

    class ErrorAnalysis:
        """
        3.5 误差分析
        """

        def __init__(self, father):
            """
            Keep a reference to the owning object.

            :param father: object whose attributes the methods of this class
                read through ``self.father``
            """
            self.father = father

        def get_y_train_pred(self):
            """
            Compute out-of-fold predictions for the training set.

            The pixels are standardized with StandardScaler and an SVC is
            evaluated with 3-fold ``cross_val_predict``.

            :return: predicted label for every training sample
            """
            # cross_val_predict fits clones of the estimator on each fold, so
            # the SVC is passed unfitted. Fitting it on the full, unscaled
            # training set beforehand is very slow and its result is never used.
            svm_clf = SVC()
            standard_scaler = StandardScaler()
            x_train_scaled = standard_scaler.fit_transform(X=self.father.X_train.astype(np.float64))
            y_train_pred = cross_val_predict(
                estimator=svm_clf,
                X=x_train_scaled,
                y=self.father.y_train,
                cv=3
            )
            return y_train_pred

        def error_analysis_01(self):
            """
            3.5 Error analysis, part 1.

            Prints the raw confusion matrix (absolute counts) of the
            out-of-fold predictions and shows/saves it as a grayscale image.
            """
            print('--------------------------------混淆矩阵(不考虑图片数量)---------------------------------')
            predictions = self.get_y_train_pred()
            counts = confusion_matrix(y_true=self.father.y_train, y_pred=predictions)
            print(counts)
            plt.matshow(counts, cmap=plt.cm.gray)
            plt.savefig('images/3_5_1.png')
            plt.show()

        def error_analysis_02(self):
            """
            3.5 Error analysis, part 2.

            Divides each confusion-matrix row by its row total, zeroes the
            diagonal so only the errors remain, and shows/saves the result as
            a grayscale image.
            """
            print('---------------------------------混淆矩阵(考虑图片数量)----------------------------------')
            predictions = self.get_y_train_pred()
            counts = confusion_matrix(y_true=self.father.y_train, y_pred=predictions)
            # Row-wise normalization: each row is divided by its own sum.
            error_rates = counts / counts.sum(axis=1, keepdims=True)
            np.fill_diagonal(error_rates, 0)
            plt.matshow(error_rates, cmap=plt.cm.gray)
            plt.savefig('images/3_5_2.png')
            plt.show()

        def error_analysis_03(self):
            """
            3.5 Error analysis, part 3.

            Draws a 2x2 figure of example digits for the classes 3 and 5:
                top-left:     3s predicted as 3
                top-right:    3s predicted as 5
                bottom-left:  5s predicted as 3
                bottom-right: 5s predicted as 5
            Up to 25 digits are shown per panel, 5 per row.
            """
            print('---------------------------------混淆矩阵(分析数字3和5)----------------------------------')
            predictions = self.get_y_train_pred()
            outer = self.father
            cl_a, cl_b = 3, 5
            # (subplot position, actual class, predicted class) for each panel.
            panels = (
                (221, cl_a, cl_a),
                (222, cl_a, cl_b),
                (223, cl_b, cl_a),
                (224, cl_b, cl_b),
            )
            selections = [
                (position, outer.X_train[(outer.y_train == actual) & (predictions == predicted)])
                for position, actual, predicted in panels
            ]
            plt.figure(figsize=(8, 8))
            for position, digits in selections:
                plt.subplot(position)
                outer.plot_digits(digits[:25], images_per_row=5)
            plt.savefig('images/3_5_3.png')
            plt.show()

    class MultiLabelClassifier:
        """
        3.6 Multilabel classification.
        """

        def __init__(self, father):
            """
            :param father: owning object providing the training data
            """
            self.father = father

        def knn_classifier(self):
            """
            Train a KNeighborsClassifier on two labels per digit.

            The two boolean targets are "digit is 7 or larger" and "digit is
            odd". Prints the prediction for the first image and the
            macro-averaged F1 score of 3-fold out-of-fold predictions.
            """
            print('--------------------------------------KNN分类器-----------------------------------------')
            outer = self.father
            is_large = outer.y_train >= 7
            is_odd = outer.y_train % 2 == 1
            # One row per sample, one column per label.
            y_multilabel = np.column_stack((is_large, is_odd))
            knn_clf = KNeighborsClassifier()
            knn_clf.fit(outer.X_train, y_multilabel)
            first_digit_labels = knn_clf.predict([outer.some_digit])
            print('----------KNN Predict---------')
            print(first_digit_labels)
            out_of_fold = cross_val_predict(
                estimator=knn_clf,
                X=outer.X_train,
                y=y_multilabel,
                cv=3
            )
            macro_f1 = f1_score(y_true=y_multilabel, y_pred=out_of_fold, average='macro')
            print('------------f1 score----------')
            print(macro_f1)

    class MultiOutputClassifier:
        """
        3.7 Multioutput classification.
        """

        def __init__(self, father):
            """
            :param father: owning object providing the data and ``plot_digits``
            """
            self.father = father

        def display(self):
            """
            Denoise digits with a KNeighborsClassifier and plot the result.

            Random integer noise in [0, 100) is added to the training and test
            images; the classifier is trained to map noisy training images to
            the clean ones. The first 50 noisy test images and their 50
            cleaned predictions are drawn together and saved.
            """
            print('-----------------------------------KNN分类器去噪声--------------------------------------')
            outer = self.father
            # Noise is drawn for the training set first, then for the test set.
            train_noise = np.random.randint(0, 100, (len(outer.X_train), 784))
            X_train_mod = outer.X_train + train_noise
            test_noise = np.random.randint(0, 100, (len(outer.X_test), 784))
            X_test_mod = outer.X_test + test_noise
            y_train_mod = outer.X_train  # target: the clean image
            knn_clf = KNeighborsClassifier()
            knn_clf.fit(X_train_mod, y_train_mod)
            noisy_digits = X_test_mod[:50]
            clean_digit = knn_clf.predict(noisy_digits)
            print(type(X_test_mod), type(clean_digit))
            # 100 rows of 784 pixels: the noisy inputs followed by the cleaned outputs.
            diagram = np.vstack((noisy_digits, clean_digit))
            plt.figure(figsize=(8, 8))
            outer.plot_digits(diagram)
            plt.savefig('images/3_7.png')
            plt.show()


if __name__ == '__main__':
    # Script entry point: load the data once, run the selected demo and report
    # the elapsed wall-clock time. Uncomment exactly the demo(s) to run.
    started_at = time.time()
    chapter = Classifies()

    # 3.1 The MNIST dataset
    # chapter.Mnist(chapter).display_image_0()  # show the first MNIST image
    chapter.Mnist(chapter).display_image_100()  # show the first 100 MNIST images

    # 3.2 Training a binary classifier
    # chapter.TrainBinaryClassifier(chapter).predict()

    # 3.3 Performance measures
    # chapter.PerformanceMeasurement(chapter).cross_validate()  # 3.3.1 accuracy via cross-validation
    # chapter.PerformanceMeasurement(chapter).confusion_marix_5()  # 3.3.2 confusion matrix
    # chapter.PerformanceMeasurement(chapter).precision_and_recall()  # 3.3.3 precision and recall
    # chapter.PerformanceMeasurement(chapter).trading_off_percision_and_recall()  # 3.3.4 precision/recall trade-off
    # chapter.PerformanceMeasurement(chapter).draw_roc_curve()  # 3.3.5 ROC curve

    # 3.4 Multiclass classification
    # chapter.MuticlassClassifier(chapter).svc_classifier()
    # chapter.MuticlassClassifier(chapter).sgd_classifier()

    # 3.5 Error analysis
    # chapter.ErrorAnalysis(chapter).error_analysis_01()
    # chapter.ErrorAnalysis(chapter).error_analysis_02()
    # chapter.ErrorAnalysis(chapter).error_analysis_03()

    # 3.6 Multilabel classification
    # chapter.MultiLabelClassifier(chapter).knn_classifier()

    # 3.7 Multioutput classification
    # chapter.MultiOutputClassifier(chapter).display()

    elapsed_seconds = time.time() - started_at
    print('耗时:', elapsed_seconds)

2 代码运行结果（按教材章节编号）

3.1 MNIST数据集

【示例1】

---------------------------------MNIST第1张图片的标签----------------------------------
5
耗时: 63.53922486305237

【示例2】

-------------------------------MNIST前100张图片的标签----------------------------------
[5 0 4 1 9 2 1 3 1 4 3 5 3 6 1 7 2 8 6 9 4 0 9 1 1 2 4 3 2 7 3 8 6 9 0 5 6
 0 7 6 1 8 7 9 3 9 8 5 9 3 3 0 7 4 9 8 0 9 4 1 4 4 6 0 4 5 6 1 0 0 1 7 1 6
 3 0 2 1 1 7 9 0 2 6 7 8 3 9 0 4 6 7 4 6 8 0 7 8 3 1]
耗时: 65.62755966186523

3.2 训练二元分类器

-------------------------二分类器SGD预测前100个数字中哪些是5----------------------------
[ True False False False False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False False False  True
 False False False False False False False False False False False  True
  True False False False False False False False False False False False
 False False False False False  True False False False False False False
 False False False False False False False False False  True False False
 False False False False False False False False False False False False
 False False False False]
耗时: 76.09012770652771

3.3 性能测量

3.3.1 使用交叉验证测量准确率

-----------------------交叉验证二元分类器SGDClassifier的准确率--------------------------
[0.95035 0.96035 0.9604 ]
耗时: 92.97741341590881

3.3.2 混淆矩阵

-----------------------混淆矩阵二元分类器SGDClassifier的准确率--------------------------
[[53892   687]
 [ 1891  3530]]
耗时: 92.94346785545349

3.3.3 精度和召回率

----------------------------------精度 召回率 f1分数-----------------------------------
精度  0.8370879772350012
召回率     0.6511713705958311
f1分数    0.7325171197343846
耗时: 93.0093207359314

3.3.4 精度/召回率权衡

------------------------------------精度/召回率权衡------------------------------------
决策分数    [2164.22030239]
阈值为0时,预测结果     [ True]
阈值为8000时,预测结果   [False]
----------------precision--------------------
[0.09035    0.09035151 0.09035301 ... 1.         1.         1.        ]
------------------recall---------------------
[1.00000000e+00 1.00000000e+00 1.00000000e+00 ... 3.68935621e-04
 1.84467810e-04 0.00000000e+00]
----------------thresholds-------------------
[-146348.56726174 -142300.00705404 -137588.97581744 ...   38871.26391927
   42216.05562787   49441.43765905]
---------------------------------------------
精度  0.9000345901072293
召回率     0.4799852425751706
耗时: 93.40939092636108

3.3.5 ROC曲线

----------------------------------------ROC曲线---------------------------------------
roc_auc_score   0.9604938554008616
耗时: 139.91025948524475

3.4 多类分类器

-----------------------------------SVC支持向量分类器-----------------------------------
SVC多类分类器预测  [5]
SVC 决策分数       [[ 1.72501977  2.72809088  7.2510018   8.3076379  -0.31087254  9.3132482
   1.70975103  2.76765202  6.23049537  4.84771048]]
arg_max        5
svc classes:       [0 1 2 3 4 5 6 7 8 9]
ovr_predict        [5]
estmators      [SVC(), SVC(), SVC(), SVC(), SVC(), SVC(), SVC(), SVC(), SVC(), SVC()]
耗时: 1065.8030738830566

---------------------------------SGD随机梯度下降分类器----------------------------------
sgd_predict     [3]
----------SGD decision function-----------
[[-31893.03095419 -34419.69069632  -9530.63950739   1823.73154031
  -22320.14822878  -1385.80478895 -26188.91070951 -16147.51323997
   -4604.35491274 -12050.767298  ]]
-----------SGD Cross val score------------
[0.87365 0.85835 0.8689 ]
---------简单缩放 Cross val score----------
[0.8983 0.891  0.9018]
耗时: 775.6501314640045

3.5 误差分析

--------------------------------混淆矩阵(不考虑图片数量)---------------------------------
[[5809    3   20    8    6   13   32   14   17    1]
 [   1 6620   37   12   11    0    7   31   13   10]
 [  20   18 5721   42   27    4   20   61   36    9]
 [   3   15   83 5778    5   64    2   87   73   21]
 [   5   11   45    2 5623    6   21   25   10   94]
 [  14   11   16   77   17 5134   58   50   28   16]
 [  20    7   22    0   13   39 5743   55   19    0]
 [   9   24   37   15   39    1    0 6070    5   65]
 [  19   44   42   50   20   52   26   40 5533   25]
 [  11   10   25   49   81   12    1  153   22 5585]]
耗时: 892.1890172958374
---------------------------------混淆矩阵(考虑图片数量)----------------------------------
耗时: 899.4664740562439
---------------------------------混淆矩阵(分析数字3和5)----------------------------------
耗时: 881.4797575473785

3.6 多标签分类

--------------------------------------KNN分类器-----------------------------------------
----------KNN Predict---------
[[False  True]]
------------f1 score----------
0.976410265560605
耗时: 76.51309132575989

3.7 多输出分类

-----------------------------------KNN分类器去噪声--------------------------------------
[[33. 34. 69. ...  7. 65. 17.]
 [41. 49. 81. ... 89.  8. 94.]
 [72. 99. 56. ... 28. 95. 28.]
 ...
 [64.  2. 44. ... 16. 79. 76.]
 [85. 49. 99. ... 57. 20. 13.]
 [75. 39. 26. ... 92. 29. 85.]]
耗时: 68.47794008255005

标签：实战,第三章,0003,self,father,train,print,False,clf
From： https://www.cnblogs.com/chenyiyun/p/16805872.html

0003 机器学习实战第三章分类

相关文章

赞助商

阅读排行

0003 机器学习实战 第三章 分类

相关文章

赞助商

阅读排行

0003 机器学习实战第三章分类