import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import train_test_split, cross_val_score

# Read the source table
pd_data = pd.read_csv('QYD.csv')

# Extract features and label: X_origin holds the input features, y_origin the dependent variable
feature_cols = ['BTSM', 'SVF', 'GVI', 'ROAD', 'POI', 'JZMD', 'NDVI', 'DLMD', 'POPU', 'FLOOR']
X_origin = pd_data.loc[:, feature_cols]
y_origin = pd_data.loc[:, 'temp']
X, y = sklearn.utils.shuffle(X_origin, y_origin)

# Split into training and test sets (20% held out as the test set)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=532)
print('Train/test split shapes:')
print('X_train.shape={}\n y_train.shape={}\n X_test.shape={}\n y_test.shape={}'.format(
    X_train.shape, y_train.shape, X_test.shape, y_test.shape))

# Define the model
rfreg = RandomForestRegressor(n_estimators=161, max_depth=19, random_state=90)
# Fit on the training set
model = rfreg.fit(X_train, y_train)

# Print the model's hyperparameters
print('Model parameters:')
print(model)
# Print the fitted feature importances
print('Feature importances:')
print(rfreg.feature_importances_)

# Mean 10-fold cross-validated R^2 on the training set (default scoring for a regressor)
score = cross_val_score(rfreg, X_train, y_train, cv=10).mean()
print(score)

# Predict on the test set
y_pred = rfreg.predict(X_test)
# Compute the root mean squared error by hand
sum_mean = 0
for i in range(len(y_pred)):
    sum_mean += (y_pred[i] - y_test.values[i]) ** 2
sum_error = np.sqrt(sum_mean / len(y_pred))
print("RMSE by hand:", sum_error)
print("R2:", rfreg.score(X_test, y_test))

# 10-fold cross-validation returning negative MSE; negate and take the square root to get RMSE
cross_score = cross_val_score(rfreg, X, y, cv=10, scoring='neg_mean_squared_error')
print(np.sqrt(-cross_score))

# Plot observed vs. predicted values on the test set
plt.figure()
plt.plot(range(len(y_pred)), y_pred, 'r', label="predict")
plt.plot(range(len(y_pred)), y_test, 'b', label="test")
plt.legend(loc="upper right")  # show the legend
plt.xlabel("sample index")
plt.ylabel("temp")
plt.show()
plt.close()

# Plot the feature correlation heatmap
hot = X.corr()
plt.subplots(figsize=(4, 4))
sns.heatmap(hot, annot=True, vmax=1, square=True, cmap="Blues")
plt.show()
plt.close()

# Plot partial dependence for the input features
# (plot_partial_dependence was removed in scikit-learn 1.2; use PartialDependenceDisplay.from_estimator instead;
#  line_kw={"color": "black", "lw": 0.8} could be passed to style the curves)
PartialDependenceDisplay.from_estimator(rfreg, X_train, feature_cols,
                                        n_jobs=3, grid_resolution=20, method='brute')
fig = plt.gcf()
fig.subplots_adjust(hspace=0.3)
plt.show()
plt.close()

# SHAP interpretable machine learning
# Build the explainer from the fitted model and the input features
explainer = shap.TreeExplainer(rfreg)
# shap_values is a numpy.ndarray
shap_values = explainer.shap_values(pd_data[feature_cols])
# shap_values2 is a shap.Explanation object
shap_values2 = explainer(pd_data[feature_cols])
print(shap_values.shape)
# Summary plot
shap.summary_plot(shap_values, pd_data[feature_cols])
# Heatmap plot (requires the Explanation object)
shap.plots.heatmap(shap_values2)
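As a sanity check, the hand-rolled RMSE loop above can be compared against scikit-learn's built-in metrics. A minimal sketch, reusing the rfreg, X_test and y_test objects from the script above:

from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Cross-check the hand-computed error: RMSE is the square root of the mean squared error,
# and r2_score(y_test, y_pred) should match rfreg.score(X_test, y_test)
y_pred = rfreg.predict(X_test)
print("RMSE (sklearn):", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 (sklearn):", r2_score(y_test, y_pred))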
Tags: shap, plt, python, memo, train, test, import, sklearn  From: https://www.cnblogs.com/Vicrooor/p/17890866.html