1. 通过 sklearn 调用机器学习 API 处理问题的通用流程
# -*- coding: utf-8 -*-
import os
import time
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler


def general_process_ML(data_path=r'./../data/iris.data', model_dir=r'./../models'):
    """Run a typical scikit-learn workflow end to end on the iris data set.

    Steps: load the CSV, label-encode the target, split into train/test,
    standardize the features, train a KNN classifier, print the training
    time and the train/test accuracy, and persist the fitted scaler and
    model with joblib.

    Args:
        data_path: Path of the header-less CSV file whose last column is the
            class label. Defaults to the location used originally.
        model_dir: Directory that receives ``knn_scaler.joblib`` and
            ``knn_model.joblib``. It is created when missing.

    Returns:
        None. Results are printed and the artifacts are written to disk.
    """
    # 1. Data crawling and cleaning happen upstream of this function.

    # 2. Load the data and separate features from labels.
    names = ['x1', 'x2', 'x3', 'x4', 'y']
    data = pd.read_csv(data_path, header=None, names=names)
    X = data.iloc[:, :-1]
    Y = data.iloc[:, -1]
    label_encoder = LabelEncoder()
    Y = label_encoder.fit_transform(Y)
    # Optional sanity checks while exploring the data:
    #   data.describe(), data.head(), data.isnull().any()

    # 3. Split into training and test sets.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.2, random_state=1
    )

    # 4. Feature engineering (e.g. normalization, standardization, word2vec).
    # The scaler is fitted on the training split only and then reused on the
    # test split, so no test-set statistics leak into training.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Model initialisation.
    model = KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='kd_tree')

    # Model training; perf_counter is monotonic, unlike the wall clock.
    train_start = time.perf_counter()
    model.fit(x_train, y_train)
    train_total_time = time.perf_counter() - train_start
    print(f"模型训练过程用时: {train_total_time}")

    # Model evaluation.
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    print(f"模型在训练集上的accuracy: {train_score}")
    print(f"模型在测试集上的accuracy: {test_score}")

    # Model persistence / deployment. Create the target directory first so
    # that joblib.dump does not fail on a fresh checkout.
    os.makedirs(model_dir, exist_ok=True)
    scaler_path = os.path.join(model_dir, "knn_scaler.joblib")
    knn_path = os.path.join(model_dir, "knn_model.joblib")
    joblib.dump(scaler, scaler_path)
    joblib.dump(model, knn_path)

    # To reuse the persisted artifacts later:
    #   reload_scaler = joblib.load(scaler_path)
    #   reload_knn = joblib.load(knn_path)
2. 手搓KNN
# -*- coding: utf-8 -*-
import time
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler


class KNNModel:
    """Brute-force k-nearest-neighbours model using squared Euclidean distance.

    With ``classify=True`` a prediction is the most frequent label among the
    k nearest training samples; otherwise it is the mean of their targets.

    Args:
        k: Number of neighbours to use. ``None`` means every training sample.
        classify: ``True`` for classification, ``False`` for regression.
    """

    def __init__(self, k=None, classify=False):
        self.k = k
        self.X = None
        self.Y = None
        self.classify = classify

    @staticmethod
    def _as_2d(data):
        """Return *data* as a 2-D float array (1-D input becomes one column)."""
        arr = np.asarray(data, dtype=float)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        elif arr.ndim != 2:
            raise ValueError(
                f"expected 1-D or 2-D feature data, got {arr.ndim} dimensions"
            )
        return arr

    def fit(self, X, Y):
        """Memorise the training samples.

        Args:
            X: Array-like of shape (n_samples, n_features); lists and
                DataFrames are accepted.
            Y: Array-like of n_samples labels or target values.

        Raises:
            ValueError: If the training set is empty or X and Y disagree on
                the number of samples.
        """
        # np.array copies, so later changes to the caller's data do not
        # alter the fitted model.
        features = self._as_2d(np.array(X, dtype=float))
        targets = np.array(Y).ravel()
        if features.shape[0] == 0:
            raise ValueError("cannot fit KNNModel on an empty training set")
        if features.shape[0] != targets.shape[0]:
            raise ValueError(
                f"X has {features.shape[0]} samples but Y has {targets.shape[0]}"
            )
        self.X = features
        self.Y = targets

    def get_k_nearest_neighbors(self, X):
        """Return the labels of the k nearest training samples per query row.

        Args:
            X: Array-like of shape (n_queries, n_features).

        Returns:
            Array of shape (n_queries, k) holding training labels ordered
            from nearest to farthest, in the dtype of the training labels.
            k is capped at the number of training samples.

        Raises:
            ValueError: If the model is not fitted, ``k`` is smaller than 1,
                or the feature count differs from the training data.
        """
        if self.X is None or self.Y is None:
            raise ValueError("KNNModel must be fitted before it can predict")

        n_train = self.X.shape[0]
        k = n_train if self.k is None else self.k
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        k = min(k, n_train)

        queries = self._as_2d(X)
        neighbors = np.empty((queries.shape[0], k), dtype=self.Y.dtype)
        if queries.shape[0] == 0:
            return neighbors
        if queries.shape[1] != self.X.shape[1]:
            raise ValueError(
                f"X has {queries.shape[1]} features but the model was fitted "
                f"with {self.X.shape[1]}"
            )

        for row, query in enumerate(queries):
            sq_dist = np.sum((self.X - query) ** 2, axis=1)
            # A stable sort keeps equidistant samples in training order.
            nearest = np.argsort(sq_dist, kind="stable")[:k]
            neighbors[row] = self.Y[nearest]
        return neighbors

    def predict(self, X):
        """Predict a label (classification) or value (regression) per row.

        Args:
            X: Array-like of shape (n_queries, n_features).

        Returns:
            1-D array with one prediction per query row.
        """
        neighbor_labels = self.get_k_nearest_neighbors(X)
        if self.classify:
            # Majority vote; on a tie the label seen first, i.e. the one
            # belonging to the nearer neighbour, wins.
            votes = [
                Counter(row.tolist()).most_common(1)[0][0]
                for row in neighbor_labels
            ]
            return np.array(votes, dtype=neighbor_labels.dtype)
        # Regression: average the neighbours' target values.
        return neighbor_labels.mean(axis=1)

    def score(self, X, Y):
        """Return the fraction of predictions that equal *Y* exactly.

        This is classification accuracy. In regression mode it still counts
        exact matches only, so it is rarely a meaningful metric there.
        """
        y_hat = self.predict(X)
        # Plain arrays avoid index alignment when Y is a pandas Series.
        acc = np.mean(y_hat == np.asarray(Y).ravel())
        return acc
标签:KNN,1.12,scaler,近邻,np,train,test,import,self From: https://www.cnblogs.com/zhangzhenw/p/18349035