# 清空当前环境中的所有对象
rm(list = ls())
# 设置工作目录
setwd("C:\\Users\\Administrator\\Desktop\\随机森林4")
library(randomForest)
library(tidyverse)
library(pROC)
library(caret)
#加载数据,指定第一行包含列名(变量名)
otu <- read.table("otutable.txt", header = TRUE, sep = "\t")
# 因变量分布情况
table(otu$gene)
# 使用留一法
loocv_results <- vector("list", length = nrow(otu))
# 在循环开始前初始化存储评估指标的向量
accuracy_vec <- numeric(nrow(otu))
sensitivity_vec <- numeric(nrow(otu))
specificity_vec <- numeric(nrow(otu))
for (i in 1:nrow(otu)) {
# 创建训练集和测试集
train_data <- otu[-i, ]
test_data <- otu[i, ]
# 打印测试集的gene值
print(test_data$gene)
# 因变量自变量构建公式
form_cls <- as.formula(
paste0(
"gene ~",
paste0(colnames(train_data)[2:554], collapse = "+")
)
)
# 构建模型
set.seed(1)
train_data$gene <- factor(train_data$gene)
fit_rf_cls <- randomForest(
form_cls,
data = train_data,
ntree = 500,
mtry = 23,
importance = TRUE
)
# 预测测试集概率
test_pred_prob <- predict(fit_rf_cls, newdata = test_data, type = "prob")[, 2]
# 预测测试集类别
test_pred_class <- predict(fit_rf_cls, newdata = test_data)
# 转换为因子类型,并确保具有相同的水平
levels(test_pred_class) <- levels(train_data$gene)
test_data$gene <- factor(test_data$gene, levels = levels(train_data$gene))
# 计算评估指标
cm <- confusionMatrix(test_pred_class, test_data$gene)
accuracy_vec[i] <- cm$overall['Accuracy']
sensitivity_vec[i] <- cm$byClass['Sensitivity']
specificity_vec[i] <- cm$byClass['Specificity']
# 存储结果
loocv_results[[i]] <- list(true_label = test_data$gene, predicted_prob = test_pred_prob)
}
# 计算平均评估指标
mean_accuracy <- mean(accuracy_vec)
mean_sensitivity <- mean(sensitivity_vec,na.rm = TRUE)
mean_specificity <- mean(specificity_vec,na.rm = TRUE)
# 打印评估指标
print(paste("Average Accuracy: ", mean_accuracy))
print(paste("Average Sensitivity: ", mean_sensitivity))
print(paste("Average Specificity: ", mean_specificity))
# 合并 LOOCV 结果
all_true_labels <- unlist(lapply(loocv_results, function(x) x$true_label))
all_predicted_probs <- unlist(lapply(loocv_results, function(x) x$predicted_prob))
# 计算并绘制“平均化”ROC曲线
roc_curve <- roc(response = all_true_labels, predictor = all_predicted_probs)
# 绘制测试集ROC曲线
plot(roc_curve,
print.auc = TRUE,
grid = c(0.1, 0.2),
auc.polygon = FALSE,
max.auc.polygon = TRUE,
main = "留一法ROC曲线",
grid.col = c("green", "red"))
###############################################################################
# 提取已计算的特征重要性
importance <- importance(fit_rf_cls)
# 对特征重要性进行排序
ordered_indices <- order(importance[, "MeanDecreaseGini"], decreasing = TRUE)
ordered_feature_names <- rownames(importance)[ordered_indices]
# 初始化用于存储每一步性能指标的向量
performance_metrics <- data.frame(NumFeatures = integer(), Accuracy = numeric(), Sensitivity = numeric(), Specificity = numeric())
# 逐步移除特征并评估模型性能
for (i in seq_along(ordered_feature_names)) {
# 使用除了最不重要的i个特征之外的所有特征
features_to_use <- ordered_feature_names[-seq_len(i)]
formula <- as.formula(paste("gene ~", paste(features_to_use, collapse = "+")))
# 使用留一法交叉验证评估模型
loocv_results <- train(formula, data = otu, method = "rf", trControl = trainControl(method = "LOOCV"))
cm <- confusionMatrix(loocv_results$pred$pred, loocv_results$pred$obs)
# 存储性能指标
performance_metrics <- rbind(performance_metrics, data.frame(NumFeatures = length(features_to_use), Accuracy = cm$overall['Accuracy'], Sensitivity = cm$byClass['Sensitivity'], Specificity = cm$byClass['Specificity']))
}
# 绘制特征数量与性能指标的关系
ggplot(performance_metrics, aes(x = NumFeatures)) +
geom_line(aes(y = Accuracy), color = "black") +
geom_line(aes(y = Sensitivity), color = "#00BFFF") + #真正率
geom_line(aes(y = Specificity), color = "#FF4500") + #真负率
labs(title = "Model Performance vs. Number of Features", x = "Number of Features", y = "Performance Metrics") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()
# 将performance_metrics保存为TXT文件
write.table(performance_metrics,
file = "performance_metrics.txt",
sep = "\t", # 使用制表符作为分隔符
row.names = TRUE, # 保存行名
col.names = TRUE) # 保存列名
#################################################################################
# 绘制特征数量与性能指标的关系
max_accuracy_point <- performance_metrics[which.max(performance_metrics$Accuracy),]
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
geom_line(aes(y = Accuracy), color = "blue") +
geom_line(aes(y = Sensitivity), color = "red") +
geom_line(aes(y = Specificity), color = "green") +
labs(title = "Model Performance vs. Number of Features", x = "Number of Features", y = "Performance Metrics") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()
# 标注准确率最高的点
p <- p + geom_point(aes(x = max_accuracy_point$NumFeatures, y = max_accuracy_point$Accuracy), color = "blue", size = 4) +
geom_text(aes(x = max_accuracy_point$NumFeatures, y = max_accuracy_point$Accuracy, label = paste("Features:", max_accuracy_point$NumFeatures)), vjust = -1)
# 打印图表
print(p)
###############################################################################
# 绘制特征数量与性能指标的关系并添加平滑线
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
geom_line(aes(y = Accuracy), color = "black") +
geom_smooth(aes(y = Accuracy), color = "black", se = FALSE, method = "loess", span = 0.3) +
geom_line(aes(y = Sensitivity), color = "#00BFFF") +
geom_smooth(aes(y = Sensitivity), color = "#00BFFF", se = FALSE, method = "loess", span = 0.3) +
geom_line(aes(y = Specificity), color = "#FF4500") +
geom_smooth(aes(y = Specificity), color = "#FF4500", se = FALSE, method = "loess", span = 0.3) +
labs(title = "Model Performance vs. Number of Features removed", x = "Number of Features removed", y = "Performance Metrics") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal()
# 打印图表
print(p)
####################################################################################
# 绘制特征数量与性能指标的关系并添加平滑线
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
geom_line(aes(y = Accuracy, color = "Accuracy"), size = 1) +
geom_smooth(aes(y = Accuracy, color = "Accuracy"), se = FALSE, method = "loess", span = 0.3) +
geom_line(aes(y = Sensitivity, color = "Sensitivity"), size = 1) +
geom_smooth(aes(y = Sensitivity, color = "Sensitivity"), se = FALSE, method = "loess", span = 0.3) +
geom_line(aes(y = Specificity, color = "Specificity"), size = 1) +
geom_smooth(aes(y = Specificity, color = "Specificity"), se = FALSE, method = "loess", span = 0.3) +
scale_color_manual(values = c("Accuracy" = "black", "Sensitivity" = "#00BFFF", "Specificity" = "#FF4500")) +
labs(title = "Model Performance vs. Number of Features Removed",
subtitle = "Comparison of Accuracy, Sensitivity, and Specificity",
x = "Number of Features Removed", y = "Performance Metrics",
color = "Metrics") +
scale_y_continuous(labels = scales::percent_format()) +
theme_minimal(base_size = 14) +
theme(legend.position = "top",
plot.title = element_text(face = "bold", size = 16),
plot.subtitle = element_text(size = 14),
legend.title.align = 0.5)
# 打印图表
print(p)
标签:caret,library,Desktop,随机,测试版,森林 From: https://www.cnblogs.com/wzbzk/p/17912242.html