# 清理环境和加载必要包
rm(list = ls())
setwd("C:\\Users\\Administrator\\Desktop\\machine learning\\LASSO回归\\CAZy") # 设置工作目录
train <- read.table("matched_otu.txt", header = TRUE, row.names = 1, sep = "\t")
############################################## 预处理数据
# 将目标变量转换为因子
train$group <- as.factor(train$group)
# 分离特征和目标变量
x <- as.matrix(train[, -1]) # 特征数据
y <- train$group # 目标变量
############################################## Lasso 回归及交叉验证
library(glmnet)
library(reshape2)
library(ggplot2)
# 执行 Lasso 回归
set.seed(123)
lasso_cv <- cv.glmnet(x, y, alpha = 1, family = "binomial", type.measure = "class", nfolds = 10)
# 提取最佳 Lambda 值
lambda_min <- lasso_cv$lambda.min # 最优 lambda
lambda_1se <- lasso_cv$lambda.1se # 最简化模型的 lambda
cat("最佳 Lambda 值为:", lambda_min, "\n")
# 绘制交叉验证曲线
plot(lasso_cv)
# 提取交叉验证误差数据
cv_data <- data.frame(
log_lambda = log(lasso_cv$lambda),
cvm = lasso_cv$cvm,
cvup = lasso_cv$cvup,
cvlo = lasso_cv$cvlo
)
# 绘制交叉验证误差图
ggplot(cv_data, aes(x = log_lambda, y = cvm)) +
geom_point() +
geom_errorbar(aes(ymin = cvlo, ymax = cvup), width = 0.2) +
geom_vline(xintercept = log(lambda_min), linetype = "dashed", color = "red") +
geom_vline(xintercept = log(lambda_1se), linetype = "dashed", color = "blue") +
labs(title = "Cross-Validation Error Plot", x = "Log(Lambda)", y = "Mean Cross-Validation Error") +
theme_minimal()
############################################## 提取重要特征
# 使用最佳 lambda 训练 Lasso 模型
lasso_model <- glmnet(x, y, alpha = 1, family = "binomial", lambda = lambda_min)
# 提取非零系数
coefficients <- coef(lasso_model)
important_features <- data.frame(
Feature = rownames(coefficients)[which(coefficients != 0)],
Coefficient = coefficients[which(coefficients != 0)]
)
# 移除截距项(如果不需要)
important_features <- important_features[important_features$Feature != "(Intercept)", ]
# 将结果保存为 CSV 文件到当前工作目录
write.csv(important_features, "important_features.csv", row.names = FALSE)
cat("重要特征已保存到 important_features.csv 文件中\n")
############################################## 绘制 LASSO 路径图
# 获取 LASSO 路径数据
lasso_path <- glmnet(x, y, alpha = 1, family = "binomial")
# 提取系数矩阵并转换为普通矩阵
coef_matrix <- as.matrix(lasso_path$beta) # 每列对应一个 lambda 值
# 转换为长格式数据框
path_data_long <- melt(coef_matrix)
colnames(path_data_long) <- c("Feature", "Lambda_Index", "Coefficient")
# 添加 Lambda 值
path_data_long$Lambda <- lasso_path$lambda[path_data_long$Lambda_Index]
# 绘制美化后的 LASSO 路径图
p1 <- ggplot(path_data_long, aes(x = log(Lambda), y = Coefficient, color = Feature)) +
theme_minimal(base_size = 14) + # 设置基本主题和字体大小
geom_line(size = 1.2) + # 增加线条粗细
geom_vline(xintercept = log(lambda_min), linetype = "dashed", color = "#FF4500", size = 1.5) + # 标注 lambda.min
geom_vline(xintercept = log(lambda_1se), linetype = "dashed", color = "#00BFFF", size = 1.5) + # 标注 lambda.1se
annotate("text", x = max(log(path_data_long$Lambda)) - 2.3,
y = max(path_data_long$Coefficient, na.rm = TRUE) * 0.9,
label = paste0("lambda.min: ", round(lambda_min, 4)),
color = "#FF4500", hjust = 0, size = 5) + # 在右上角写 lambda.min 信息
annotate("text", x = max(log(path_data_long$Lambda)) - 2.3,
y = max(path_data_long$Coefficient, na.rm = TRUE) * 0.8,
label = paste0("lambda.1se: ", round(lambda_1se, 4)),
color = "#00BFFF", hjust = 0, size = 5) + # 在右上角写 lambda.1se 信息
labs(
title = "LASSO Coefficient Path",
x = "Log Lambda",
y = "Coefficient"
) +
theme(
legend.position = "none", # 隐藏图例
plot.title = element_text(hjust = 0.5, size = 20, face = "bold"), # 设置标题居中和字体样式
panel.grid.major = element_blank(), # 去掉主要网格线
panel.grid.minor = element_blank(), # 去掉次要网格线
axis.line = element_line(size = 1.2, colour = "black"), # 自定义坐标轴线粗细
axis.text.x = element_text(size = 16), # 自定义横坐标刻度标签大小
axis.text.y = element_text(size = 16), # 自定义纵坐标刻度标签大小
axis.title.x = element_text(size = 18), # 自定义横坐标标题大小
axis.title.y = element_text(size = 18), # 自定义纵坐标标题大小
axis.ticks = element_line(size = 1.2) # 自定义刻度线粗细
) +
scale_x_continuous(
breaks = seq(floor(log(min(path_data_long$Lambda))), ceiling(log(max(path_data_long$Lambda))), by = 2),
labels = round(seq(floor(log(min(path_data_long$Lambda))), ceiling(log(max(path_data_long$Lambda))), by = 2), 1)
) + # 设置自定义横坐标刻度
scale_color_manual(values = rainbow(length(unique(path_data_long$Feature)))) # 使用彩虹色区分特征
# 显示图
print(p1)
# 保存图像
ggsave('LASSO Path.png', width = 8, height = 8, bg = 'white', dpi = 1200)
###################################################################################
# 绘制美化后的交叉验证误差图
cv_plot <- ggplot(cv_data, aes(x = log_lambda, y = cvm)) +
theme_minimal(base_size = 14) + # 设置基本主题和字体大小
geom_point(size = 3) + # 调整点的大小
geom_errorbar(aes(ymin = cvlo, ymax = cvup), width = 0.2, size = 1) + # 调整误差线粗细
geom_vline(xintercept = log(lambda_min), linetype = "dashed", color = "#FF4500", size = 1.5) + # 标注 lambda.min
geom_vline(xintercept = log(lambda_1se), linetype = "dashed", color = "#00BFFF", size = 1.5) + # 标注 lambda.1se
annotate("text", x = min(cv_data$log_lambda) - 1,
y = max(cv_data$cvm, na.rm = TRUE) * 1.1,
label = paste0("lambda.min: ", round(lambda_min, 4)),
color = "#FF4500", hjust = 0, size = 5) + # 在左上角写 lambda.min 信息
annotate("text", x = min(cv_data$log_lambda) - 1,
y = max(cv_data$cvm, na.rm = TRUE) * 1.07,
label = paste0("lambda.1se: ", round(lambda_1se, 4)),
color = "#00BFFF", hjust = 0, size = 5) + # 在左上角写 lambda.1se 信息
labs(
title = "Cross-Validation Error Plot",
x = "Log Lambda",
y = "Mean Cross-Validation Error"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 20, face = "bold"), # 设置标题居中和字体样式
panel.grid.major = element_blank(), # 去掉主要网格线
panel.grid.minor = element_blank(), # 去掉次要网格线
axis.line = element_line(size = 1.2, colour = "black"), # 自定义坐标轴线粗细
axis.text.x = element_text(size = 16), # 自定义横坐标刻度标签大小
axis.text.y = element_text(size = 16), # 自定义纵坐标刻度标签大小
axis.title.x = element_text(size = 18), # 自定义横坐标标题大小
axis.title.y = element_text(size = 18), # 自定义纵坐标标题大小
axis.ticks = element_line(size = 1.2) # 自定义刻度线粗细
)
# 显示图
print(cv_plot)
# 保存图像
ggsave('Cross-Validation Error Plot.png', plot = cv_plot, width = 8, height = 8, bg = 'white', dpi = 1200)
###################################################################################
# 读取映射文件
label_map <- read.table("feature_labels.txt", header = FALSE, sep = "\t", stringsAsFactors = FALSE)
colnames(label_map) <- c("Internal_Name", "Readable_Name") # 设置列名
# 检查映射数据
head(label_map)
# 将重要特征的内部名称替换为可读名称
important_features$Readable_Name <- label_map$Readable_Name[match(important_features$Feature, label_map$Internal_Name)]
# 如果有未映射的特征,保持原名
important_features$Readable_Name[is.na(important_features$Readable_Name)] <- important_features$Feature[is.na(important_features$Readable_Name)]
# 过滤绝对值大于 0.1 的特征
filtered_features <- important_features[abs(important_features$Coefficient) > 0.1, ]
# 绘制系数量化图,使用过滤后的特征
coef_plot <- ggplot(filtered_features, aes(x = reorder(Readable_Name, Coefficient), y = Coefficient, fill = Coefficient > 0)) +
geom_bar(stat = "identity") + # 使用条形图
coord_flip() + # 翻转坐标轴,水平显示特征
scale_fill_manual(
values = c("TRUE" = "#8FC9E2", "FALSE" = "#ECC97F") # 正系数蓝色,负系数橙色
) +
labs(
title = "Coefficient Plot (|Coefficient| > 0.1)",
x = "Feature",
y = "Coefficient Value"
) +
theme_minimal(base_size = 14) + # 设置主题和字体大小
theme(
plot.title = element_text(hjust = 0.5, size = 20, face = "bold"), # 标题居中
axis.text.x = element_text(size = 14), # 横坐标字体大小
axis.text.y = element_text(size = 14), # 纵坐标字体大小
axis.title.x = element_text(size = 16), # 横坐标标题字体大小
axis.title.y = element_text(size = 16), # 纵坐标标题字体大小
panel.grid.major = element_blank(), # 去掉主要网格线
panel.grid.minor = element_blank(), # 去掉次要网格线
axis.line = element_line(size = 1.2, colour = "black"), # 自定义坐标轴线粗细
legend.position = "none" # 隐藏图例
)
# 显示图
print(coef_plot)
# 保存图像
ggsave('Coefficient_Plot.png', plot = coef_plot, width = 10, height = 8, bg = 'white', dpi = 1200)
标签:plot,路径分析,字体大小,text,element,axis,筛选,LASSO,size From: https://www.cnblogs.com/wzbzk/p/18608058