# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_test)
# Check the classification accuracy: the fraction of predictions that match
# the true labels, i.e. (number of correct predictions) / (number of samples)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)
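# A quick sanity check of what accuracy_score computes (a minimal sketch,
# assuming y_test and y_predict are 1-D NumPy arrays, as returned by
# train_test_split and predict above):
import numpy as np
manual_accuracy = np.sum(y_predict == y_test) / len(y_test)  # correct / total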
# If you only want the accuracy and don't care about y_predict itself, call score directly:
knn_clf.score(X_test, y_test)
# Grid search over hyperparameters
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    # Minkowski distance: 'p' is its order (p=1 Manhattan, p=2 Euclidean)
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
knn_clf = KNeighborsClassifier()
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(knn_clf, param_grid)
# Search over all the parameter combinations defined above to find the best model
%%time
grid_search.fit(X_train, y_train)
# Returns the best classifier found by the grid search (with all of its parameters)
grid_search.best_estimator_
# Returns the best classifier's (cross-validated) accuracy score
grid_search.best_score_
# Returns the optimal parameters among the grid of candidates
grid_search.best_params_
# Evaluate the accuracy of the best parameters on the test set
knn_clf = grid_search.best_estimator_
knn_clf.score(X_test,y_test)
# Run the hyperparameter search in parallel (n_jobs=-1 uses all CPU cores)
# verbose makes the search print progress output as it runs
%%time
grid_search = GridSearchCV(knn_clf, param_grid, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
'''
Besides the Minkowski distance, other similarity/distance measures include:
  - cosine similarity (vector space model)
  - adjusted cosine similarity
  - Pearson correlation coefficient
  - Jaccard similarity coefficient
These can be selected via the `metric` parameter
(the default is the Minkowski distance).
'''
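# A minimal sketch of switching the metric (an illustration, not part of the
# grid search above; the default is metric="minkowski" with p=2, i.e. Euclidean):
knn_manhattan = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
knn_manhattan.fit(X_train, y_train)
knn_manhattan.score(X_test, y_test)
# Cosine distance is also accepted, but only with brute-force neighbor search:
knn_cosine = KNeighborsClassifier(n_neighbors=3, metric="cosine", algorithm="brute")
knn_cosine.fit(X_train, y_train)
knn_cosine.score(X_test, y_test)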
# How to normalize the test set (using the training set's mean and std):
# (x_test - mean_train) / std_train
# The sklearn workflow: fit a Scaler on the training set to learn its key
# statistics, then call transform to produce the scaled output
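# A minimal sketch of that fit/transform flow (scaler_demo is a throwaway name
# so it doesn't clash with the scalers used below): transform(X_test) is just
# (X_test - mean_train) / std_train, element-wise per feature.
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler_demo = StandardScaler()
scaler_demo.fit(X_train)  # learns mean_ and scale_ from the training set only
manual = (X_test - scaler_demo.mean_) / scaler_demo.scale_
assert np.allclose(manual, scaler_demo.transform(X_test))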
# Data normalization
# 1. Min-max normalization: maps all data into [0, 1] via (x - min) / (max - min);
#    suited to distributions with clear bounds, and strongly affected by outliers
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
# Scale both sets using the min/max learned from the training set
X_train = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)
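# Illustrative check: the scaled training set lies exactly in [0, 1], while
# X_test_standard can fall slightly outside that range, because the scaler's
# min/max were learned from the training set only.
print(X_train.min(), X_train.max())  # 0.0 1.0
print(X_test_standard.min(), X_test_standard.max())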
# 2. Standardization (zero mean, unit variance): rescales all data to a
#    distribution with mean 0 and variance 1; suited to data without clear
#    bounds, possibly containing extreme values
from sklearn.preprocessing import StandardScaler
standardScalar = StandardScaler()
standardScalar.fit(X_train)
# Per-feature mean learned from the training set
standardScalar.mean_
# Per-feature standard deviation (note: scale_ is the std, not the variance)
standardScalar.scale_
# Standardize both sets using the statistics from the training set
X_train = standardScalar.transform(X_train)
X_test_standard = standardScalar.transform(X_test)
# When using kNN afterwards, refit on the scaled training data, then score on
# the scaled test data (mixing scaled and unscaled data would be inconsistent):
knn_clf.fit(X_train, y_train)
knn_clf.score(X_test_standard, y_test)
'''
Drawbacks of k-nearest neighbors:
  - highly data-dependent
  - predictions are not interpretable
  - curse of dimensionality
'''
# Using kNN for regression -> KNeighborsRegressor
from sklearn.neighbors import KNeighborsRegressor
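# The original snippet doesn't show where X_train_standard / X_test_standard
# come from in this regression example. A minimal, hypothetical setup (using
# the diabetes dataset as a stand-in regression problem):
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)
scaler = StandardScaler().fit(X_train)  # fit on the training set only
X_train_standard = scaler.transform(X_train)
X_test_standard = scaler.transform(X_test)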
knn_reg = KNeighborsRegressor()
knn_reg.fit(X_train_standard, y_train)
knn_reg.score(X_test_standard, y_test)
# Hyperparameter search for the regressor
from sklearn.model_selection import GridSearchCV
param_grid = [
    {
        "weights": ["uniform"],
        "n_neighbors": [i for i in range(1, 11)]
    },
    {
        "weights": ["distance"],
        "n_neighbors": [i for i in range(1, 11)],
        "p": [i for i in range(1, 6)]
    }
]
knn_reg = KNeighborsRegressor()
grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)
grid_search.fit(X_train_standard, y_train)
grid_search.best_params_
grid_search.best_score_
grid_search.best_estimator_.score(X_test_standard, y_test)