# 清空当前环境中的所有对象
rm(list = ls())
# 设置工作目录
setwd("C:\\Users\\Administrator\\Desktop\\随机森林4")
library(randomForest)
library(tidyverse)
library(pROC)
library(caret)
#加载数据,指定第一行包含列名(变量名)
otu <- read.table("otutable.txt", header = TRUE, sep = "\t")
# 因变量分布情况
table(otu$gene)
# 使用留一法
loocv_results <- vector("list", length = nrow(otu))
# 在循环开始前初始化存储评估指标的向量
accuracy_vec <- numeric(nrow(otu))
sensitivity_vec <- numeric(nrow(otu))
specificity_vec <- numeric(nrow(otu))
for (i in 1:nrow(otu)) {
# 创建训练集和测试集
train_data <- otu[-i, ]
test_data <- otu[i, ]
# 打印测试集的gene值
print(test_data$gene)
# 因变量自变量构建公式
form_cls <- as.formula(
paste0(
"gene ~",
paste0(colnames(train_data)[2:554], collapse = "+")
)
)
# 构建模型
set.seed(1)
train_data$gene <- factor(train_data$gene)
fit_rf_cls <- randomForest(
form_cls,
data = train_data,
ntree = 500,
mtry = 23,
importance = TRUE
)
# 预测测试集概率
test_pred_prob <- predict(fit_rf_cls, newdata = test_data, type = "prob")[, 2]
# 预测测试集类别
test_pred_class <- predict(fit_rf_cls, newdata = test_data)
# 转换为因子类型,并确保具有相同的水平
levels(test_pred_class) <- levels(train_data$gene)
test_data$gene <- factor(test_data$gene, levels = levels(train_data$gene))
# 计算评估指标
cm <- confusionMatrix(test_pred_class, test_data$gene)
accuracy_vec[i] <- cm$overall['Accuracy']
sensitivity_vec[i] <- cm$byClass['Sensitivity']
specificity_vec[i] <- cm$byClass['Specificity']
# 存储结果
loocv_results[[i]] <- list(true_label = test_data$gene, predicted_prob = test_pred_prob)
}
# 计算平均评估指标
mean_accuracy <- mean(accuracy_vec)
mean_sensitivity <- mean(sensitivity_vec,na.rm = TRUE)
mean_specificity <- mean(specificity_vec,na.rm = TRUE)
# 打印评估指标
print(paste("Average Accuracy: ", mean_accuracy))
print(paste("Average Sensitivity: ", mean_sensitivity))
print(paste("Average Specificity: ", mean_specificity))
# 合并 LOOCV 结果
all_true_labels <- unlist(lapply(loocv_results, function(x) x$true_label))
all_predicted_probs <- unlist(lapply(loocv_results, function(x) x$predicted_prob))
# 计算并绘制“平均化”ROC曲线
roc_curve <- roc(response = all_true_labels, predictor = all_predicted_probs)
# 绘制测试集ROC曲线
plot(roc_curve,
print.auc = TRUE,
grid = c(0.1, 0.2),
auc.polygon = FALSE,
max.auc.polygon = TRUE,
main = "留一法ROC曲线",
grid.col = c("green", "red"))
##############################################################################
# 查看模型的输出
print(fit_rf_cls)
# 查看概率矩阵的列名
prob_matrix <- predict(fit_rf_cls, newdata = test_data, type = "prob")
print(colnames(prob_matrix))
# 假设 fit_rf_cls 是你已经训练好的随机森林模型
importance <- importance(fit_rf_cls)
# 确保importance有内容
print(head(importance))
# 对特征重要性进行排序
sorted_features <- sort(importance[, 3], decreasing = TRUE) # 假设重要性在第一列
# 检查排序后的特征重要性
print(head(sorted_features))
# 计算要选择的特征数量(前20%)
num_features_to_select <- round(0.20 * ncol(otu))
# 选择前20%的特征
selected_features <- names(sorted_features)[1:num_features_to_select]
# 再次检查selected_features
print(selected_features)
# 准备交叉验证
fitControl <- trainControl(method = "cv", number = 70) # 10折交叉验证
# 在不同的特征子集上训练和评估模型
results <- list()
for (n in seq(10, length(selected_features), by = 10)) {
# 选择前n个特征
features_subset <- selected_features[1:n]
# 使用reformulate构建公式
formula_subset <- reformulate(termlabels = features_subset, response = "gene")
# 训练模型
set.seed(1)
model <- train(formula_subset, data = otu, method = "rf", trControl = fitControl, ntree = 500)
# 存储结果
results[[paste("Top", n, "Features")]] <- model
}
# 查看不同特征子集模型的性能
for (model_name in names(results)) {
print(model_name)
print(results[[model_name]]$results)
}
##################################################################
# 假设你的结果存储在名为 results 的列表中
# 首先,创建一个用于绘图的数据框
plot_data <- data.frame(
NumFeatures = seq(10, num_features_to_select, by = 10),
Accuracy = sapply(results, function(x) max(x$results$Accuracy)) # 获取每个特征子集的最高准确度
)
# 计算交叉验证错误率
plot_data$CV_Error = 1 - plot_data$Accuracy
# 绘制特征数量与交叉验证错误率的关系
with(plot_data, plot(NumFeatures, CV_Error, type = "o", lwd = 2,
xlab = "Number of Features", ylab = "Cross-Validation Error"))
###################################################################
library(ggplot2)
library(gridExtra)
# 创建一个空列表来存储单独的图
plots_list <- list()
# 遍历每个特征数量,创建单独的图并存储在列表中
for (i in seq(10, 110, by = 10)) {
# 计算错误率
subset_results <- results[[paste("Top", i, "Features")]]
subset_results$ErrorRate <- 1 - subset_results$Accuracy
# 绘制单独的图
p <- ggplot(subset_results, aes(x = mtry, y = ErrorRate)) +
geom_line() +
geom_point() +
theme_minimal() +
ggtitle(paste("Top", i, "Features"))
plots_list[[i / 10]] <- p
}
# 合并所有图
grid.arrange(grobs = plots_list, ncol = 3)
标签:caret,library,Desktop,随机,测试版,森林 From: https://www.cnblogs.com/wzbzk/p/17912238.html