import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import numpy as np
from docx import Document
import matplotlib.pyplot as plt
import time
# Configure matplotlib to render CJK characters and display minus signs correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Global settings, kept in one place for easy tuning and debugging
DATA_SPLIT_RATIO = 0.2   # test-set fraction used when splitting the data
DATA_SHUFFLE = True      # whether to shuffle before splitting
CROSS_VALIDATION = True  # whether cross-validation is (nominally) enabled
NGEN = 10                # number of genetic-algorithm generations
POP_SIZE = 10            # population size
MUTPB = 0.2              # mutation probability
CXPB = 0.5               # crossover probability


def load_data():
    # Read the Excel data and drop rows with missing values
    data = pd.read_excel('附件1.xlsx')  # replace with your own data file
    data = data.dropna()
    return data


def pca_analysis(X):
    # Standardize the features before PCA
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    # Principal component analysis (PCA)
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
    explained_variance = pca.explained_variance_
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance_ratio)
    # Print the total-variance-explained table
    print("Total variance explained:")
    print("Component\tEigenvalue\tVariance explained (%)\tCumulative variance explained (%)")
    for i in range(len(explained_variance)):
        print(f"{i + 1}\t{explained_variance[i]:.3f}\t{explained_variance_ratio[i] * 100:.3f}\t{cumulative_variance[i] * 100:.3f}")
    return X_scaled, pca
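

# A minimal sketch (assumption, not part of the original pipeline): choose how many
# principal components to keep by a cumulative explained-variance threshold. It can
# guide the "how many components" prompt in ask_user_for_choices(); the 0.85 threshold
# is illustrative only.
def suggest_num_components(pca, threshold=0.85):
    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # first index at which the cumulative ratio reaches the threshold, converted to a count
    return int(np.argmax(cumulative >= threshold) + 1)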


def ask_user_for_choices():
    print("\nFirst rows of the Excel data:")
    data = load_data()
    print(data.head())
    preprocess = input("Has the data already been preprocessed (cleaned and standardized)? (yes/no): ").strip().lower()
    if preprocess != 'yes':
        print("Please preprocess the data first.")
        return
    task_type = input("Please choose the task type (classification/regression): ").strip().lower()
    target_column = input("Please enter the name of the target (dependent variable) column: ").strip()
    X = data.drop(columns=[target_column])
    y = data[target_column]
    if task_type == 'classification':
        if not pd.api.types.is_numeric_dtype(y) or len(y.unique()) <= 2:
            model_class = RandomForestClassifier
            metrics_function = accuracy_score
        else:
            print("A classification task requires the target variable to hold class labels.")
            return
    elif task_type == 'regression':
        if pd.api.types.is_numeric_dtype(y):
            model_class = RandomForestRegressor
            metrics_function = mean_squared_error
        else:
            print("A regression task requires a continuous numeric target variable.")
            return
    else:
        print("Invalid task type.")
        return
    pca_replace = input("Use principal components instead of the original features? (yes/no): ").strip().lower()
    if pca_replace == 'yes':
        X_scaled, pca = pca_analysis(X)
        num_components = int(input("How many leading principal components should be used as features?: "))
        X = X_scaled[:, :num_components]
    else:
        X = X.values  # keep the original feature matrix as a NumPy array
    # Genetic algorithm setup: each individual encodes two random forest hyperparameters
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)
    toolbox = base.Toolbox()
    toolbox.register("attr_int", np.random.randint, 1, 21)      # first gene: max_depth (dp), range 1-20
    toolbox.register("attr_int_es", np.random.randint, 1, 201)  # second gene: n_estimators (es), range 1-200
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     (toolbox.attr_int, toolbox.attr_int_es), n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxBlend, alpha=0.5)
    toolbox.register("mutate", tools.mutPolynomialBounded, low=[1, 1], up=[20, 200], eta=1.0, indpb=MUTPB)
    toolbox.register("select", tools.selTournament, tournsize=3)

    def evaluate(individual):
        dp, es = individual  # each individual carries exactly two genes
        # Cast to int and clamp to the intended search ranges (crossover can push genes out of bounds)
        dp = int(max(1, min(20, round(dp))))   # max_depth in [1, 20]
        es = int(max(1, min(200, round(es))))  # n_estimators in [1, 200]
        model = model_class(n_estimators=es, max_depth=dp)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if task_type == 'classification':
            accuracy = metrics_function(y_test, y_pred)
            return (accuracy,)
        else:
            mse = metrics_function(y_test, y_pred)
            return (1 / (1 + mse),)  # turn MSE into a maximization target (smaller MSE -> higher fitness)
toolbox.register("evaluate", evaluate)
# 遗传算法运行
start_time = time.time()
population = toolbox.population(n=POP_SIZE)
algorithms.eaSimple(population, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=NGEN,
verbose=True, stats=None, halloffame=None)
end_time = time.time()
elapsed_time = end_time - start_time
# 最优模型参数
best_ind = tools.selBest(population, 1)[0]
dp_best, es_best = best_ind
print(f"最优树深(dp):{dp_best}")
print(f"最优树数目(es):{es_best}")
# 生成报告
generate_report(model_class, X, y, best_ind, dp_best, es_best, task_type, elapsed_time)


def generate_report(model_class, X, y, best_ind, dp_best, es_best, task_type, elapsed_time):
    # Create a new Word document
    doc = Document()
    doc.add_heading('GA-Optimized Random Forest Model Report', 0)
    # Analysis steps
    doc.add_heading('Analysis Steps', level=1)
    doc.add_paragraph("1. Build a random forest regression model from the training data.")
    doc.add_paragraph("2. Use the fitted random forest to compute feature importances.")
    doc.add_paragraph("3. Apply the fitted random forest regression model to the training and test data to obtain the model evaluation results.")
    doc.add_paragraph("4. Because random forests are stochastic, results differ between runs; if the model trained in this run is saved, new data can later be fed directly into it for prediction.")
    doc.add_paragraph("5. Note: unlike traditional models, a random forest does not yield an explicit equation; the model is usually judged by its prediction accuracy on the test data.")
    # Output 1: model parameters
    doc.add_heading('Output 1: Model Parameters', level=1)
    # oob_score=True is needed so the out-of-bag estimate reported below is actually computed
    model = model_class(n_estimators=int(es_best), max_depth=int(dp_best), oob_score=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
    model.fit(X_train, y_train)
    training_time = f"{elapsed_time:.2f} s"
    oob_score = model.oob_score_ if hasattr(model, "oob_score_") else 'N/A'  # out-of-bag estimate
    # 16 rows: one header row plus 15 parameter entries
    parameters_table = doc.add_table(rows=16, cols=2)
    parameters_table.style = 'Table Grid'
    parameters_table.cell(0, 0).text = "Parameter"
    parameters_table.cell(0, 1).text = "Value"
    parameters_table.cell(1, 0).text = "Train/test split ratio"
    parameters_table.cell(1, 1).text = str(DATA_SPLIT_RATIO)
    parameters_table.cell(2, 0).text = "Data shuffling"
    parameters_table.cell(2, 1).text = "Yes" if DATA_SHUFFLE else "No"
    parameters_table.cell(3, 0).text = "Cross-validation"
    parameters_table.cell(3, 1).text = "Yes" if CROSS_VALIDATION else "No"
    parameters_table.cell(4, 0).text = "Node split criterion"
    parameters_table.cell(4, 1).text = str(model.criterion)
    parameters_table.cell(5, 0).text = "Max features considered per split"
    parameters_table.cell(5, 1).text = str(model.max_features)
    parameters_table.cell(6, 0).text = "Min samples to split an internal node"
    parameters_table.cell(6, 1).text = str(model.min_samples_split)
    parameters_table.cell(7, 0).text = "Min samples per leaf node"
    parameters_table.cell(7, 1).text = str(model.min_samples_leaf)
    parameters_table.cell(8, 0).text = "Min weighted fraction of samples per leaf"
    parameters_table.cell(8, 1).text = str(model.min_weight_fraction_leaf)
    parameters_table.cell(9, 0).text = "Max tree depth"
    parameters_table.cell(9, 1).text = str(dp_best)
    parameters_table.cell(10, 0).text = "Max number of leaf nodes"
    parameters_table.cell(10, 1).text = str(model.max_leaf_nodes)
    parameters_table.cell(11, 0).text = "Min impurity decrease for a split"
    parameters_table.cell(11, 1).text = str(model.min_impurity_decrease)
    parameters_table.cell(12, 0).text = "Number of trees"
    parameters_table.cell(12, 1).text = str(es_best)
    parameters_table.cell(13, 0).text = "Bootstrap sampling (with replacement)"
    parameters_table.cell(13, 1).text = str(model.bootstrap)
    parameters_table.cell(14, 0).text = "Out-of-bag score"
    parameters_table.cell(14, 1).text = str(oob_score)
    parameters_table.cell(15, 0).text = "Training time"
    parameters_table.cell(15, 1).text = training_time
    doc.add_paragraph("Table note: the table above lists the model's parameter settings and the training time.")
    # Output 2: feature importances (available for both the classifier and the regressor)
    doc.add_heading('Output 2: Feature Importances', level=1)
    importances = model.feature_importances_
    features = X.columns if hasattr(X, 'columns') else [f'Feature {i}' for i in range(X.shape[1])]
    importance_table = doc.add_table(rows=len(importances) + 1, cols=2)
    importance_table.style = 'Table Grid'
    importance_table.cell(0, 0).text = "Feature"
    importance_table.cell(0, 1).text = "Importance"
    for i, (feature, importance) in enumerate(zip(features, importances)):
        importance_table.cell(i + 1, 0).text = str(feature)
        importance_table.cell(i + 1, 1).text = f"{importance * 100:.2f}%"
    doc.add_paragraph("Table note: the table above shows the relative importance of each feature (independent variable).")
    # Output 3: model evaluation results
    doc.add_heading('Output 3: Model Evaluation', level=1)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    if task_type == 'regression':
        mse_train = mean_squared_error(y_train, y_train_pred)
        mse_test = mean_squared_error(y_test, y_test_pred)
        rmse_train = np.sqrt(mse_train)
        rmse_test = np.sqrt(mse_test)
        mae_train = mean_absolute_error(y_train, y_train_pred)
        mae_test = mean_absolute_error(y_test, y_test_pred)
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)
        eval_table = doc.add_table(rows=3, cols=5)  # header row plus train/test rows, five columns
        eval_table.style = 'Table Grid'
        eval_table.cell(0, 0).text = ""
        eval_table.cell(0, 1).text = "MSE"
        eval_table.cell(0, 2).text = "RMSE"
        eval_table.cell(0, 3).text = "MAE"
        eval_table.cell(0, 4).text = "R²"
        eval_table.cell(1, 0).text = "Training set"
        eval_table.cell(1, 1).text = f"{mse_train:.3f}"
        eval_table.cell(1, 2).text = f"{rmse_train:.3f}"
        eval_table.cell(1, 3).text = f"{mae_train:.3f}"
        eval_table.cell(1, 4).text = f"{r2_train:.3f}"
        eval_table.cell(2, 0).text = "Test set"
        eval_table.cell(2, 1).text = f"{mse_test:.3f}"
        eval_table.cell(2, 2).text = f"{rmse_test:.3f}"
        eval_table.cell(2, 3).text = f"{mae_test:.3f}"
        eval_table.cell(2, 4).text = f"{r2_test:.3f}"
        doc.add_paragraph("Table note: the table above reports the prediction metrics on the training and test sets, quantifying how well the random forest predicts.")
    doc.add_paragraph(
        "Explanation of the evaluation metrics:\n"
        "- MSE (mean squared error): the smaller the MSE, the smaller the prediction error and the better the model.\n"
        "- RMSE (root mean squared error): the square root of the MSE, an error measure in the units of the target; smaller values indicate better predictions.\n"
        "- MAE (mean absolute error): the average absolute difference between predictions and true values; smaller values indicate better predictions.\n"
        "- R² (coefficient of determination): the share of the target's variance explained by the model; the closer to 1, the better the fit.\n\n"
    )
    # Save the document
    doc.save('遗传-随机森林模型报告.docx')
    print("The report has been generated and saved as '遗传-随机森林模型报告.docx'.")


if __name__ == "__main__":
    ask_user_for_choices()