主成分分析结合遗传算法优化的随机森林通用代码

标签：import 代码 explained 随机 variance print 遗传算法 pca data
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, algorithms
import numpy as np
from docx import Document
import matplotlib.pyplot as plt
import time

# 设置中文字体和负号显示
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 定义全局变量以便于调整和调试
DATA_SPLIT_RATIO = 0.2  # 数据划分比例（测试集占比）
DATA_SHUFFLE = True  # 是否进行数据洗牌
CROSS_VALIDATION = True  # 是否进行交叉验证
NGEN = 10  # 遗传算法迭代代数
POP_SIZE = 10  # 种群大小
MUTPB = 0.2  # 突变概率
CXPB = 0.5  # 交叉概率

def load_data():
    # 读取Excel数据
    data = pd.read_excel('附件1.xlsx')  # 替换为您的数据文件
    data = data.dropna()  # 清除缺失值
    return data

def pca_analysis(X):
    # 数据标准化
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 主成分分析（PCA）
    pca = PCA(n_components=min(X_scaled.shape))
    pca.fit(X_scaled)
    explained_variance = pca.explained_variance_
    explained_variance_ratio = pca.explained_variance_ratio_
    cumulative_variance = np.cumsum(explained_variance_ratio)

    # 总方差解释表格
    print("总方差解释表格：")
    print("成分\t特征根\t方差解释率(%)\t累积方差解释率(%)")
    for i in range(len(explained_variance)):
        print(f"{i + 1}\t{explained_variance[i]:.3f}\t{explained_variance_ratio[i] * 100:.3f}\t{cumulative_variance[i] * 100:.3f}")

    return X_scaled, pca

def ask_user_for_choices():
    print("\n请查看Excel数据的前几行：")
    data = load_data()
    print(data.head())

    preprocess = input("数据是否已完成预处理（数据清洗和标准化）？(是/否): ").strip().lower()
    if preprocess != '是':
        print("请先进行数据预处理。")
        return

    task_type = input("请选择任务类型（分类/回归）: ").strip().lower()
    target_column = input("请指定Excel中的因变量列名: ").strip()
    X = data.drop(columns=[target_column])
    y = data[target_column]

    if task_type == '分类':
        if not pd.api.types.is_numeric_dtype(y) or len(y.unique()) <= 2:
            model_class = RandomForestClassifier
            metrics_function = accuracy_score
        else:
            print("分类任务需要目标变量为类别标签。")
            return
    elif task_type == '回归':
        if pd.api.types.is_numeric_dtype(y):
            model_class = RandomForestRegressor
            metrics_function = mean_squared_error
        else:
            print("回归任务需要目标变量为连续数值。")
            return
    else:
        print("任务类型无效。")
        return

    pca_replace = input("是否使用主成分分析的特征代替数据？(是/否): ").strip().lower()
    if pca_replace == '是':
        X_scaled, pca = pca_analysis(X)
        num_components = int(input("请选择前几个主成分作为特征: "))
        X = X_scaled[:, :num_components]
    else:
        X = X.values  # 保持原始数据

    # 遗传算法和随机森林模型的定义
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_int", np.random.randint, 1, 21)  # 个体的第一个特征（dp），修正范围
    toolbox.register("attr_int_es", np.random.randint, 1, 201)  # 个体的第二个特征（es），修正范围
    toolbox.register("individual", tools.initCycle, creator.Individual,
                     (toolbox.attr_int, toolbox.attr_int_es), n=1)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("mate", tools.cxBlend, alpha=0.5)
    toolbox.register("mutate", tools.mutPolynomialBounded, low=[1, 1], up=[20, 200], eta=1.0, indpb=MUTPB)
    toolbox.register("select", tools.selTournament, tournsize=3)

    def evaluate(individual):
        dp, es = individual  # 这里确保每个个体只有两个属性
        dp = int(dp)  # 确保 dp 是一个整数
        es = int(es)  # 确保 es 是一个整数
        if dp < 1:
            dp = 1  # 确保 max_depth 至少为 1
        if es < 1:
            es = 1  # 确保 n_estimators 至少为 1
        model = model_class(n_estimators=es, max_depth=dp)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        if task_type == '分类':
            accuracy = metrics_function(y_test, y_pred)
            return (accuracy,)
        else:
            mse = metrics_function(y_test, y_pred)
            return (1 / (1 + mse),)  # 最大化准确率

    toolbox.register("evaluate", evaluate)

    # 遗传算法运行
    start_time = time.time()
    population = toolbox.population(n=POP_SIZE)
    algorithms.eaSimple(population, toolbox, cxpb=CXPB, mutpb=MUTPB, ngen=NGEN, 
                        verbose=True, stats=None, halloffame=None)
    end_time = time.time()
    elapsed_time = end_time - start_time

    # 最优模型参数
    best_ind = tools.selBest(population, 1)[0]
    dp_best, es_best = best_ind
    print(f"最优树深（dp）：{dp_best}")
    print(f"最优树数目（es）：{es_best}")

    # 生成报告
    generate_report(model_class, X, y, best_ind, dp_best, es_best, task_type, elapsed_time)

def generate_report(model_class, X, y, best_ind, dp_best, es_best, task_type, elapsed_time):
    # 创建一个新的Word文档
    doc = Document()
    doc.add_heading('遗传算法优化随机森林模型报告', 0)

    # 添加分析步骤部分
    doc.add_heading('分析步骤', level=1)
    doc.add_paragraph("1. 通过训练集数据来建立随机森林回归模型。")
    doc.add_paragraph("2. 通过建立的随机森林来计算特征重要性。")
    doc.add_paragraph("3. 将建立的随机森林回归模型应用到训练、测试数据，得到模型评估结果。")
    doc.add_paragraph("4. 由于随机森林具有随机性，每次运算的结果不一样，若保存本次训练模型，后续可以直接上传数据代入到本次训练模型进行计算预测。")
    doc.add_paragraph("5. 注：随机森林无法像传统模型一样得到确定的方程，通常通过测试数据预测精度来对模型进行评价。")

    # 输出结果1：模型参数
    doc.add_heading('输出结果1：模型参数', level=1)
    model = model_class(n_estimators=int(es_best), max_depth=int(dp_best))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=DATA_SPLIT_RATIO, shuffle=DATA_SHUFFLE, random_state=42)
    model.fit(X_train, y_train)
    
    training_time = f"{elapsed_time:.2f}秒"
    oob_score = model.oob_score_ if hasattr(model, "oob_score_") else 'N/A'  # 袋外数据测试结果

    # 注意这里将行数增加到16行，原为14行
    parameters_table = doc.add_table(rows=16, cols=2)  # 修改行数为16
    parameters_table.style = 'Table Grid'
    parameters_table.cell(0, 0).text = "参数名称"
    parameters_table.cell(0, 1).text = "参数值"
    parameters_table.cell(1, 0).text = "数据划分比例"
    parameters_table.cell(1, 1).text = str(DATA_SPLIT_RATIO)
    parameters_table.cell(2, 0).text = "数据洗牌"
    parameters_table.cell(2, 1).text = "是" if DATA_SHUFFLE else "否"
    parameters_table.cell(3, 0).text = "交叉验证"
    parameters_table.cell(3, 1).text = "是" if CROSS_VALIDATION else "否"
    parameters_table.cell(4, 0).text = "节点分裂评价准则"
    parameters_table.cell(4, 1).text = "mse" if task_type == '回归' else "gini"
    parameters_table.cell(5, 0).text = "划分时考虑的最大特征比例"
    parameters_table.cell(5, 1).text = "None"
    parameters_table.cell(6, 0).text = "内部节点分裂的最小样本数"
    parameters_table.cell(6, 1).text = str(model.min_samples_split)
    parameters_table.cell(7, 0).text = "叶子节点的最小样本数"
    parameters_table.cell(7, 1).text = str(model.min_samples_leaf)
    parameters_table.cell(8, 0).text = "叶子节点中样本的最小权重"
    parameters_table.cell(8, 1).text = str(model.min_weight_fraction_leaf)
    parameters_table.cell(9, 0).text = "树的最大深度"
    parameters_table.cell(9, 1).text = str(dp_best)
    parameters_table.cell(10, 0).text = "叶子节点的最大数量"
    parameters_table.cell(10, 1).text = str(model.max_leaf_nodes)
    parameters_table.cell(11, 0).text = "节点划分不纯度的阈值"
    parameters_table.cell(11, 1).text = "0"
    parameters_table.cell(12, 0).text = "决策树数量"
    parameters_table.cell(12, 1).text = str(es_best)
    parameters_table.cell(13, 0).text = "有放回采样"
    parameters_table.cell(13, 1).text = "true"
    parameters_table.cell(14, 0).text = "袋外数据测试"
    parameters_table.cell(14, 1).text = str(oob_score)
    parameters_table.cell(15, 0).text = "训练时间"
    parameters_table.cell(15, 1).text = training_time

    doc.add_paragraph("图表说明：上表展示了模型各项参数配置以及模型训练时长。")

    # 输出结果2：特征重要性
    doc.add_heading('输出结果2：特征重要性', level=1)
    if task_type == '回归':
        importances = model.feature_importances_
        features = X.columns if hasattr(X, 'columns') else [f'Feature {i}' for i in range(X.shape[1])]
        importance_table = doc.add_table(rows=len(importances) + 1, cols=2)
        importance_table.style = 'Table Grid'
        importance_table.cell(0, 0).text = "特征名称"
        importance_table.cell(0, 1).text = "特征重要性"
        for i, (feature, importance) in enumerate(zip(features, importances)):
            importance_table.cell(i + 1, 0).text = str(feature)
            importance_table.cell(i + 1, 1).text = f"{importance * 100:.2f}%"
    
    doc.add_paragraph("图表说明：上表展示了各特征（自变量）的重要性比例。")

    # 输出结果3：模型评估结果
    doc.add_heading('输出结果3：模型评估结果', level=1)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    if task_type == '回归':
        mse_train = mean_squared_error(y_train, y_train_pred)
        mse_test = mean_squared_error(y_test, y_test_pred)
        rmse_train = np.sqrt(mse_train)
        rmse_test = np.sqrt(mse_test)
        mae_train = mean_absolute_error(y_train, y_train_pred)
        mae_test = mean_absolute_error(y_test, y_test_pred)
        r2_train = r2_score(y_train, y_train_pred)
        r2_test = r2_score(y_test, y_test_pred)

        eval_table = doc.add_table(rows=3, cols=6)
        eval_table.style = 'Table Grid'
        eval_table.cell(0, 0).text = ""
        eval_table.cell(0, 1).text = "MSE"
        eval_table.cell(0, 2).text = "RMSE"
        eval_table.cell(0, 3).text = "MAE"
        eval_table.cell(0, 4).text = "R²"
        eval_table.cell(1, 0).text = "训练集"
        eval_table.cell(1, 1).text = f"{mse_train:.3f}"
        eval_table.cell(1, 2).text = f"{rmse_train:.3f}"
        eval_table.cell(1, 3).text = f"{mae_train:.3f}"
        eval_table.cell(1, 4).text = f"{r2_train:.3f}"
        eval_table.cell(2, 0).text = "测试集"
        eval_table.cell(2, 1).text = f"{mse_test:.3f}"
        eval_table.cell(2, 2).text = f"{rmse_test:.3f}"
        eval_table.cell(2, 3).text = f"{mae_test:.3f}"
        eval_table.cell(2, 4).text = f"{r2_test:.3f}"


    doc.add_paragraph("图表说明：上表中展示了训练集和测试集的预测评价指标，通过量化指标来衡量随机森林的预测效果。")
    doc.add_paragraph(
            "模型评估指标解释：\n"
            "- MSE（均方误差）：MSE 值越小，表示模型在预测时的误差越小，模型性能越好。\n"
            "- RMSE（均方根误差）：RMSE 是 MSE 的平方根，易于理解的误差度量单位。RMSE 值越小，表示模型的预测效果越好。\n"
            "- MAE（平均绝对误差）：MAE 是预测值与真实值之间的绝对误差的平均值。MAE 值越小，模型的预测性能越好。\n"
            "- R²（决定系数）：R² 值表示模型对数据变异性的解释程度，R² 越接近 1，表示模型对数据的拟合效果越好。\n\n"
        )

    # 保存文档
    doc.save('遗传-随机森林模型报告.docx')
    print("报告已生成并保存为 '遗传-随机森林模型报告.docx'")

if __name__ == "__main__":
    ask_user_for_choices()
标签：import,代码,explained,随机,variance,print,遗传算法,pca,data
From： https://blog.csdn.net/m0_73065928/article/details/141615335
主成分分析结合遗传算法优化的随机森林通用代码

相关文章

赞助商

阅读排行