首页 > 其他分享 >R :随机森林(测试版3)

R :随机森林(测试版3)

时间:2023-12-18 21:01:43浏览次数:31  
标签:caret library Desktop 随机 测试版 森林

# 清空当前环境中的所有对象
rm(list = ls())

# 设置工作目录
setwd("C:\\Users\\Administrator\\Desktop\\随机森林4")

library(randomForest) 
library(tidyverse) 
library(pROC)
library(caret)


#加载数据,指定第一行包含列名(变量名)
otu <- read.table("otutable.txt", header = TRUE, sep = "\t")

# 因变量分布情况
table(otu$gene)

# 使用留一法
loocv_results <- vector("list", length = nrow(otu))

# 在循环开始前初始化存储评估指标的向量
accuracy_vec <- numeric(nrow(otu))
sensitivity_vec <- numeric(nrow(otu))
specificity_vec <- numeric(nrow(otu))

for (i in 1:nrow(otu)) {
  # 创建训练集和测试集
  train_data <- otu[-i, ]
  test_data <- otu[i, ]
  # 打印测试集的gene值
  print(test_data$gene)
  
  # 因变量自变量构建公式
  form_cls <- as.formula(
    paste0(
      "gene ~", 
      paste0(colnames(train_data)[2:554], collapse = "+")
    )
  )
  
  # 构建模型
  set.seed(1)
  train_data$gene <- factor(train_data$gene)
  fit_rf_cls <- randomForest(
    form_cls,
    data = train_data,
    ntree = 500,
    mtry = 23,
    importance = TRUE
  )
  
  # 预测测试集概率
  test_pred_prob <- predict(fit_rf_cls, newdata = test_data, type = "prob")[, 2]
  
  # 预测测试集类别
  test_pred_class <- predict(fit_rf_cls, newdata = test_data)
  # 转换为因子类型,并确保具有相同的水平
  levels(test_pred_class) <- levels(train_data$gene)
  test_data$gene <- factor(test_data$gene, levels = levels(train_data$gene))
  
  # 计算评估指标
  cm <- confusionMatrix(test_pred_class, test_data$gene)
  accuracy_vec[i] <- cm$overall['Accuracy']
  sensitivity_vec[i] <- cm$byClass['Sensitivity']
  specificity_vec[i] <- cm$byClass['Specificity']
  
  # 存储结果
  loocv_results[[i]] <- list(true_label = test_data$gene, predicted_prob = test_pred_prob)
}

# 计算平均评估指标
mean_accuracy <- mean(accuracy_vec)
mean_sensitivity <- mean(sensitivity_vec,na.rm = TRUE)
mean_specificity <- mean(specificity_vec,na.rm = TRUE)

# 打印评估指标
print(paste("Average Accuracy: ", mean_accuracy))
print(paste("Average Sensitivity: ", mean_sensitivity))
print(paste("Average Specificity: ", mean_specificity))

# 合并 LOOCV 结果
all_true_labels <- unlist(lapply(loocv_results, function(x) x$true_label))
all_predicted_probs <- unlist(lapply(loocv_results, function(x) x$predicted_prob))

# 计算并绘制“平均化”ROC曲线
roc_curve <- roc(response = all_true_labels, predictor = all_predicted_probs)

# 绘制测试集ROC曲线
plot(roc_curve,
     print.auc = TRUE,
     grid = c(0.1, 0.2),
     auc.polygon = FALSE,
     max.auc.polygon = TRUE,
     main = "留一法ROC曲线",
     grid.col = c("green", "red"))
###############################################################################
# 提取已计算的特征重要性
importance <- importance(fit_rf_cls)

# 对特征重要性进行排序
ordered_indices <- order(importance[, "MeanDecreaseGini"], decreasing = TRUE)
ordered_feature_names <- rownames(importance)[ordered_indices]

# 初始化用于存储每一步性能指标的向量
performance_metrics <- data.frame(NumFeatures = integer(), Accuracy = numeric(), Sensitivity = numeric(), Specificity = numeric())

# 逐步移除特征并评估模型性能
for (i in seq_along(ordered_feature_names)) {
  # 使用除了最不重要的i个特征之外的所有特征
  features_to_use <- ordered_feature_names[-seq_len(i)]
  formula <- as.formula(paste("gene ~", paste(features_to_use, collapse = "+")))
  
  # 使用留一法交叉验证评估模型
  loocv_results <- train(formula, data = otu, method = "rf", trControl = trainControl(method = "LOOCV"))
  cm <- confusionMatrix(loocv_results$pred$pred, loocv_results$pred$obs)
  
  # 存储性能指标
  performance_metrics <- rbind(performance_metrics, data.frame(NumFeatures = length(features_to_use), Accuracy = cm$overall['Accuracy'], Sensitivity = cm$byClass['Sensitivity'], Specificity = cm$byClass['Specificity']))
}


# 绘制特征数量与性能指标的关系
ggplot(performance_metrics, aes(x = NumFeatures)) +
  geom_line(aes(y = Accuracy), color = "black") +
  geom_line(aes(y = Sensitivity), color = "#00BFFF") + #真正率
  geom_line(aes(y = Specificity), color = "#FF4500") + #真负率
  labs(title = "Model Performance vs. Number of Features", x = "Number of Features", y = "Performance Metrics") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

# 将performance_metrics保存为TXT文件
write.table(performance_metrics, 
            file = "performance_metrics.txt", 
            sep = "\t",    # 使用制表符作为分隔符
            row.names = TRUE,  # 保存行名
            col.names = TRUE)   # 保存列名


#################################################################################
# 绘制特征数量与性能指标的关系
max_accuracy_point <- performance_metrics[which.max(performance_metrics$Accuracy),]
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
  geom_line(aes(y = Accuracy), color = "blue") +
  geom_line(aes(y = Sensitivity), color = "red") +
  geom_line(aes(y = Specificity), color = "green") +
  labs(title = "Model Performance vs. Number of Features", x = "Number of Features", y = "Performance Metrics") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

# 标注准确率最高的点
p <- p + geom_point(aes(x = max_accuracy_point$NumFeatures, y = max_accuracy_point$Accuracy), color = "blue", size = 4) +
  geom_text(aes(x = max_accuracy_point$NumFeatures, y = max_accuracy_point$Accuracy, label = paste("Features:", max_accuracy_point$NumFeatures)), vjust = -1)

# 打印图表
print(p)
###############################################################################
# 绘制特征数量与性能指标的关系并添加平滑线
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
  geom_line(aes(y = Accuracy), color = "black") +
  geom_smooth(aes(y = Accuracy), color = "black", se = FALSE, method = "loess", span = 0.3) +
  geom_line(aes(y = Sensitivity), color = "#00BFFF") +
  geom_smooth(aes(y = Sensitivity), color = "#00BFFF", se = FALSE, method = "loess", span = 0.3) +
  geom_line(aes(y = Specificity), color = "#FF4500") +
  geom_smooth(aes(y = Specificity), color = "#FF4500", se = FALSE, method = "loess", span = 0.3) +
  labs(title = "Model Performance vs. Number of Features removed", x = "Number of Features removed", y = "Performance Metrics") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal()

# 打印图表
print(p)
####################################################################################
# 绘制特征数量与性能指标的关系并添加平滑线
p <- ggplot(performance_metrics, aes(x = NumFeatures)) +
  geom_line(aes(y = Accuracy, color = "Accuracy"), size = 1) +
  geom_smooth(aes(y = Accuracy, color = "Accuracy"), se = FALSE, method = "loess", span = 0.3) +
  geom_line(aes(y = Sensitivity, color = "Sensitivity"), size = 1) +
  geom_smooth(aes(y = Sensitivity, color = "Sensitivity"), se = FALSE, method = "loess", span = 0.3) +
  geom_line(aes(y = Specificity, color = "Specificity"), size = 1) +
  geom_smooth(aes(y = Specificity, color = "Specificity"), se = FALSE, method = "loess", span = 0.3) +
  scale_color_manual(values = c("Accuracy" = "black", "Sensitivity" = "#00BFFF", "Specificity" = "#FF4500")) +
  labs(title = "Model Performance vs. Number of Features Removed",
       subtitle = "Comparison of Accuracy, Sensitivity, and Specificity",
       x = "Number of Features Removed", y = "Performance Metrics",
       color = "Metrics") +
  scale_y_continuous(labels = scales::percent_format()) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top",
        plot.title = element_text(face = "bold", size = 16),
        plot.subtitle = element_text(size = 14),
        legend.title.align = 0.5)

# 打印图表
print(p)

 

标签:caret,library,Desktop,随机,测试版,森林
From: https://www.cnblogs.com/wzbzk/p/17912242.html

相关文章

  • 新增“失窃设备保护”:苹果发布iOS 17.3测试版本
    12月13日消息,在昨天发布iOS17.2正式版之后,今天苹果向开发者推送了iOS17.3Beta版本更新,新增了“失窃设备保护”功能。iOS17.3首个Beta版本更新引入了“设备被盗保护”(StolenDeviceProtection)功能,可以在小偷或其他攻击者知道用户的私人密码时保护用户信息。当此功能开启后,如......
  • [LeetCode138-链表-中等] 复制带有随机指针的链表
    这道题是这样的,就是说有一个链表LindedNode,通常我们链表包含2个属性,一个是它的值val,另一个是它指向的下一个结点nextNode,但是这个题目中的链表还有一个属性,就是它还有个随机指针,这个随机指针可能指向链表中的任意结点(包括链表的结尾null结点,或者是自己)也就是说这个链表Lin......
  • 随机选择算法
    在一个无序数组中求第k大或者第k小的问题,要求时间复杂度是O(N),那么对于这两个问题只要解决一个另一个就解决了。因为求第一大可以转换成求第n小。那么对于一个有100个元素的数组来说,我们要求第57小的数,那么它就是在求这个数组排好序之后下标在56的值,因为如果数组的元素没有重复......
  • 随机快速排序
    快速排序是一个经典的算法,它是基于比较排序中最快的算法之一,时间复杂度是O(N*logN)的,时间复杂度证明可以用master公式证明。但经典的快速排序会存在最坏的情况,会使得快速排序的时间复杂度退化到O(N2),这样快速排序也就失去了意义。因此我们为了避免出现最坏的情况,来引入随机一行......
  • 可视化森林:探索自然资源的奥秘
    随着全球经济的发展和人口的增加,自然资源的利用和保护变得越来越重要。森林作为地球上最重要的生态系统之一,不仅是地球生态系统的重要组成部分,还承担着调节气候、维护生态平衡等重要的生态功能。但是,随着人类活动的增加,森林面积逐渐减少,生态系统受到了破坏,保护森林成为了全球的共......
  • shell补-特殊玩法-生成随机字符串
    shell补-特殊玩法-生成随机字符串方法1:md5sum方法2:tr+/dev/urandom方法3:内置变量RANDOM;#方法1[root@localhostser]#opensslrand-base64108/54arQpCmQ12Q==[root@localhostser]##方法2必备[root@localhostser]#date+%N|md5sum###给日期加密;可以写其......
  • 记录--js小练习(弹幕、 电梯导航、 倒计时、 随机点名、 购物放大镜)
    这里给大家分享我在网上总结出来的一些知识,希望对大家有所帮助DOM小练习弹幕电梯导航倒计时随机点名购物放大镜1.弹幕效果预览功能:输入弹幕内容,按下回车显示一条弹幕(弹幕颜色、字体随机生成)思路:设置按钮抬起事件,在事件中判断如果按下的是回车键则将输入框中替换掉......
  • 逛森林
    这是一道模板题首先,对任意时刻,\(u\)->\(v\)这条路径上的点都是不会变动的(就是说,比如,如果某时刻从\(1\)到\(4\)的路径为\(1\)->\(3\)->\(4\),那么对之后的任意时刻,这条路径都是这个,既不会改变顺序,也不会新增节点,更不会删除已有节点),所以我们可以把所有有效的操作一存起来最后再建边......
  • 随机森林代码实现(奥拓数据分类)
    importpandasaspdimportnumpyasnpimportmatplotlib.pyplotaspltdata=pd.read_csv("./data/train.csv")data.head()importseabornassnssns.countplot(data.target)plt.show()#采用随机欠采样之前需要确定数据的特征值和标签值y=data["target"]x=data......
  • 树与森林
     ......