test
# coding=utf-8
# Tune a LightGBM regressor with a cross-validated grid search, report
# test- and train-set metrics, draw diagnostic plots, explain the model
# (built-in importances and SHAP), and persist the best estimator.
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Load the table: column "y" is the regression target, every other column
# is used as a feature.
data = pd.read_excel(r"E:\Desktop\data.xlsx")
X = data.drop("y", axis=1)
y = data["y"]

# Hold out 10% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0
)

lgb = LGBMRegressor(random_state=0)
param_grid = {
    'n_estimators': [200, 500],
    'max_depth': range(3, 8, 2),
    'learning_rate': [0.1, 0.2],
    # NOTE(review): LightGBM documents that row subsampling is only applied
    # when `subsample_freq` > 0 (its default is 0), so this entry likely has
    # no effect as written -- confirm before relying on it.
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'num_leaves': [31, 63, 127],
}

# 5-fold grid search selecting the configuration with the lowest MSE.
grid = GridSearchCV(lgb, param_grid, cv=5, scoring="neg_mean_squared_error")
grid.fit(X_train, y_train)
print("best_params:", grid.best_params_)
best_lgb = grid.best_estimator_

# ---- Test-set evaluation ----
y_pred = best_lgb.predict(X_test)
error = y_pred - y_test  # signed residuals, used for the histogram below
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
pcc = np.corrcoef(y_test, y_pred)[0, 1]  # Pearson correlation coefficient
print("mse:", mse)
print("rmse:", rmse)
print("mae:", mae)
print("r2:", r2)
print("pcc:", pcc)

# Predicted vs. true values on the test set.
plt.scatter(y_test, y_pred, c="blue")
plt.xlabel("Truth")
plt.ylabel("predict")
plt.title("Truth vs predict")
plt.show()

# Distribution of the test-set residuals.
plt.hist(error, bins=20, color="orange")
plt.xlabel("SE")
plt.ylabel("Fruquence")
plt.title("SE distribute")
plt.show()

# Project all features onto two principal components, coloured by target.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="rainbow")
plt.xlabel("1st_PCA")
plt.ylabel("2rd_PCA")
plt.title("PCA result")
plt.colorbar()
plt.show()

# ---- Train-set evaluation ----
# Every metric here must compare y_train with y_train_pred. The test-set
# predictions (y_pred) have a different length and would raise ValueError.
y_train_pred = best_lgb.predict(X_train)
mae_train = mean_absolute_error(y_train, y_train_pred)
mse_train = mean_squared_error(y_train, y_train_pred)
# Stored under its own name so the test-set `rmse` above is not overwritten.
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)
pcc_train = np.corrcoef(y_train, y_train_pred)[0, 1]
print("mae_train:", mae_train)
print("mse_train:", mse_train)
print("rmse_train:", rmse_train)
print("r2_train:", r2_train)
print("pcc_train:", pcc_train)

# ---- Per-feature weight analysis ----
# Importance scores come from the fitted model's feature_importances_.
feature_names = X.columns  # feature names
feature_importances = best_lgb.feature_importances_  # importance scores

# Bar chart of the importance of each feature.
plt.bar(feature_names, feature_importances)
plt.xlabel("Feature")
plt.ylabel("Importance")
plt.title("Feature importance")
plt.show()

# SHAP values for every feature, computed over the full feature table.
explainer = shap.TreeExplainer(best_lgb)  # explainer object
shap_values = explainer.shap_values(X)  # SHAP values

# Summary bar plot of the SHAP values per feature.
shap.summary_plot(shap_values, X, plot_type="bar")

# Persist the tuned model so it can be reloaded for inference.
joblib.dump(best_lgb, 'best_lgb.pkl')
# Reload the persisted model and predict y for every row of data.xlsx.
model = joblib.load('best_lgb.pkl')

# Read the full table. `df_read` keeps every original column and receives
# the predictions; it was previously used without ever being defined.
df_read = pd.read_excel(r"E:\Desktop\data.xlsx")

# Feature frame: the same table with the target column "y" removed.
df = df_read.drop("y", axis=1)

# Predict all rows in one call instead of row by row. Passing the DataFrame
# itself also keeps the feature names that a per-row `row.values` array drops.
predictions = model.predict(df)

# Store the predictions in the column labelled 18, the same label the
# original per-row `.loc[i, 18]` assignment wrote to.
df_read[18] = predictions
标签:plt,lightgbm,pred,print,train,test,best From: https://www.cnblogs.com/kehan/p/17840448.html