# Start from an empty global workspace and switch to the project directory.
# NOTE(review): the directory below is an absolute, machine-specific Windows
# path; the script will stop here on any machine where it does not exist.
rm(list = ls())
setwd("C:\\Users\\Administrator\\Desktop\\machine learning\\SVM-RFE\\CAZy")
# Fix the random seed used by the rest of the script.
set.seed(seed = 127)
# Load the tab-separated input table; the first column supplies the row names
# and the first line supplies the column names.
input <- read.table(
  file = "matched_otu.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)
# 加载必要的包
library(e1071)
library(caret)
library(parallel)
library(ggplot2)
library(kernlab)
# Prepare the data: the class label is the "group" column; every other column
# is used as a predictor.
response <- as.factor(input[, "group"])
# Drop the label by name rather than by position, so it can never end up among
# the predictors if "group" is not the first column. drop = FALSE keeps a data
# frame even when only a single predictor column remains.
features <- input[, setdiff(names(input), "group"), drop = FALSE]
# Set up the worker cluster: use all cores but one, and never fewer than one
# worker (detectCores() may return 1 or NA, which would make makeCluster fail).
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# NOTE(review): nothing in this script registers `cl` as a foreach backend
# (e.g. via doParallel::registerDoParallel), which caret's allowParallel
# option relies on -- confirm whether the cluster is actually used by rfe().
clusterExport(cl, c("features", "response")) # copy the globals to every worker
clusterEvalQ(cl, library(e1071)) # load e1071 on every worker
# Recursive feature elimination (RFE) with a linear-kernel SVM via caret.
#
# Args:
#   features: predictor columns, handed to caret::rfe() as `x`.
#   response: outcome vector, handed to caret::rfe() as `y`.
#   n_folds:  number of cross-validation folds (default 10).
#
# Returns:
#   The object returned by caret::rfe().
svm_rfe <- function(features, response, n_folds = 10) {
  # Resampling scheme: n_folds-fold cross-validation using caret's generic
  # helper functions, with parallel execution allowed.
  cv_control <- rfeControl(
    functions = caretFuncs,
    method = "cv",
    number = n_folds,
    allowParallel = TRUE
  )

  # Candidate subset sizes: every size from 1 up to the full feature count.
  candidate_sizes <- seq(from = 1, to = ncol(features), by = 1)

  # `method` is forwarded to the model fit and selects the linear-kernel SVM.
  rfe(
    x = features,
    y = response,
    sizes = candidate_sizes,
    rfeControl = cv_control,
    method = "svmLinear"
  )
}
# Run SVM-RFE. The cluster is stopped in `finally`, so the worker processes
# are released even when the RFE run fails with an error.
rfe_results <- tryCatch(
  svm_rfe(features, response),
  finally = stopCluster(cl)
)
# Save the variable importance of the RFE result as a tab-separated file.
# col.names = NA writes an empty header cell above the row-name column.
importance <- varImp(rfe_results)
write.table(importance, "feature_importance.txt", sep = "\t", col.names = NA, quote = FALSE)
# Visualization: accuracy reported by rfe() against the number of features.
performance_data <- data.frame(
  Features = rfe_results$results$Variables,
  Accuracy = rfe_results$results$Accuracy
)
# Keep the plot in a variable so that displaying and saving it do not depend on
# top-level auto-printing or on ggsave()'s implicit "last plot" (this matches
# how the styled plot further down is handled, and also works under source()).
accuracy_plot <- ggplot(performance_data, aes(x = Features, y = Accuracy)) +
  geom_line(color = "blue") +
  geom_point(color = "red") +
  labs(
    title = "Feature Number vs Accuracy",
    x = "Number of Features",
    y = "Accuracy"
  ) +
  theme_minimal()
# Show the plot
print(accuracy_plot)
# Save the plot
ggsave("feature_vs_accuracy.png", plot = accuracy_plot, width = 8, height = 6)
# Feature count with the highest accuracy (which.max picks the first maximum).
best_feature_count <- with(performance_data, Features[which.max(Accuracy)])
# Build the styled figure.
p <- ggplot() +
  # Thin, slightly transparent gray accuracy curve over all feature counts.
  geom_line(
    data = performance_data,
    mapping = aes(x = Features, y = Accuracy),
    color = "gray", alpha = 0.8
  ) +
  # Thick blue curve over the same data; it is drawn on top of the gray one
  # along its whole length.
  geom_line(
    data = performance_data,
    mapping = aes(x = Features, y = Accuracy),
    color = "blue", size = 1.5
  ) +
  # Dashed red vertical line at the best feature count.
  geom_vline(xintercept = best_feature_count, color = "red", linetype = "dashed") +
  # Text label placed at the best feature count and the maximum accuracy.
  annotate(
    geom = "text",
    x = best_feature_count,
    y = max(performance_data$Accuracy),
    label = paste("Best feature count:", best_feature_count),
    color = "red", size = 5,
    hjust = -0.1, vjust = -0.3
  ) +
  # White-background base theme, then the custom overrides.
  theme_bw() +
  theme(
    # No grid lines and no panel border; only the bottom and left axis lines
    # are drawn.
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    axis.line.x = element_line(size = 1.2, color = "black"),
    axis.line.y = element_line(size = 1.2, color = "black"),
    axis.ticks = element_line(size = 1.2),
    # Text sizes; the title is centered.
    axis.text = element_text(size = 18),
    axis.title = element_text(size = 18),
    plot.title = element_text(hjust = 0.5, size = 20)
  ) +
  # Title and axis labels.
  labs(
    title = "Relationship Between Feature Count and Accuracy",
    x = "Number of Features",
    y = "Accuracy"
  )
# Show the figure
print(p)
# Save the figure
ggsave(
  filename = "feature_vs_accuracy_styled.png",
  plot = p,
  width = 8, height = 8, dpi = 1200
)
# Tags: script, SVM, performance, classification, RFE, feature selection. From: https://www.cnblogs.com/wzbzk/p/18608055