首页 > 其他分享 >数据分析:基因突变瀑布图统计以及可视化

数据分析:基因突变瀑布图统计以及可视化

时间:2024-11-19 09:19:46浏览次数:1  
标签:数据分析 genes driver library 可视化 freq 基因突变

介绍

基因突变瀑布图统计以及可视化

数据下载

MongolianHCC数据集:

加载R包

library(maftools)
library(TCGAmutations)
library(circlize)
library(ComplexHeatmap)
library(RColorBrewer)
library(openxlsx)
library(dplyr)

代码

PROJECT_DIR <- "./MongolianHCC/" # replace this line with your local path

source(file.path(PROJECT_DIR, "SCRIPTS", "helper_functions.oncoplot.R"))

sample_info_file <- file.path(PROJECT_DIR, "DATA", "PROCESSED", "patient_sample_metadata_w_clustering_risk.txt")
driver_res_file <- file.path(PROJECT_DIR, "DATA", "ORIGINAL", "mut_full.sig_genes.txt")

out_dir <- file.path(PROJECT_DIR, "RESULTS", "mut_oncoplot")
maf_file <- file.path(PROJECT_DIR, "DATA", "PROCESSED", "maf_after_all_filters.maf")
sample_info.exome.file <- sample_info_file

driver_results_file <- driver_res_file
driver_sig_col <- "q"
driver_sig_thresh <- 0.1
driver_sig_freq <- 0.05
cohort_freq_thresh <- 0.2
# cohort_freq_thresh=NA
gene_list_file <- NA

source(file.path(PROJECT_DIR, "SCRIPTS", "helper_functions.oncoplot.R"))

if (!dir.exists(out_dir)) {
  dir.create(out_dir, recursive = T)
}

if (grepl(".xlsx$", sample_info.exome.file)) {
  library(openxlsx)
  sample_info.exome <- read.xlsx(sample_info.exome.file)
} else {
  sample_info.exome <- read.table(sample_info.exome.file, sep = "\t", header = T, stringsAsFactors = F)
  sample_info.exome$Tumor_Sample_Barcode <- sample_info.exome$WES_T
}

ignoreGenes <- c("TTN", "MUC16", "OBSCN", "AHNAK2", "SYNE1", "FLG", "MUC5B", 
                 "DNAH17", "PLEC", "DST", "SYNE2", "NEB", "HSPG2", "LAMA5", 
                 "AHNAK", "HMCN1", "USH2A", "DNAH11", "MACF1", "MUC17", "DNAH5", 
                 "GPR98", "FAT1", "PKD1", "MDN1", "RNF213", "RYR1", "DNAH2", 
                 "DNAH3", "DNAH8", "DNAH1", "DNAH9", "ABCA13", "SRRM2", "CUBN", 
                 "SPTBN5", "PKHD1", "LRP2", "FBN3", "CDH23", "DNAH10", "FAT4", 
                 "RYR3", "PKHD1L1", "FAT2", "CSMD1", "PCNT", "COL6A3", "FRAS1", 
                 "FCGBP", "RYR2", "HYDIN", "XIRP2", "LAMA1")
# sort(ignoreGenes)

mafObj <- read.maf(maf_file, clinicalData = sample_info.exome)
maf.filter <- mafObj

frac_mut <- data.frame(
  Hugo_Symbol = [email protected]$Hugo_Symbol,
  frac_mut = ([email protected]$MutatedSamples / as.numeric(maf.filter@summary$summary[3])),
  stringsAsFactors = F
)
frac_mut[is.na(frac_mut)] <- 0

# driver_res_file="data/somatic.sig_genes.txt"
# driver_sig_col="q"
# driver_sig_thresh=0.1
# driver_sig_freq=0.05
if (file.exists(driver_res_file)) {
  driver_res <- read.table(driver_res_file, sep = "\t", header = T, quote = "", stringsAsFactors = F)
  colnames(driver_res)[colnames(driver_res) == "gene"] <- "Hugo_Symbol"
  driver_res$FLAG_gene <- driver_res$Hugo_Symbol %in% ignoreGenes
  driver_res_plus <- merge.data.frame(driver_res, frac_mut)
  driver_res_plus <- driver_res_plus[order(driver_res_plus[, driver_sig_col], decreasing = F), ]
  driver_genes <- driver_res_plus$Hugo_Symbol[driver_res_plus[, driver_sig_col] < driver_sig_thresh & driver_res_plus$frac_mut > driver_sig_freq]
} else {
  driver_res_plus <- frac_mut
  driver_genes <- c()
}

# driver_res_plus <- driver_res
cohort_freq_thresh <- 0.1
if (!is.na(cohort_freq_thresh)) {
  freq_genes <- setdiff(driver_res_plus$Hugo_Symbol[driver_res_plus$frac_mut > cohort_freq_thresh], driver_genes)
} else {
  freq_genes <- NULL
}

if (file.exists(as.character(gene_list_file))) {
  custom_gene_list <- read.table(gene_list_file, stringsAsFactors = F)[, 1]
  custom_gene_list <- setdiff(custom_gene_list, c(driver_genes, freq_genes))
} else {
  custom_gene_list <- NULL
}

gene_list <- list(driver_genes, freq_genes, custom_gene_list)
reasons <- c(
  paste0("Driver Gene, ", driver_sig_col, " < ", driver_sig_thresh),
  paste0("Cohort Freq > ", cohort_freq_thresh),
  paste0("Selected Genes")
)

genes_for_oncoplot <- data.frame(Hugo_Symbol = c(), reason = c())
for (i in 1:length(gene_list)) {
  if (is.null(gene_list[[i]][1])) {
    next
  }
  genes_for_oncoplot <- rbind(
    genes_for_oncoplot,
    data.frame(
      Hugo_Symbol = gene_list[[i]],
      reason = reasons[i]
    )
  )
}
genes_for_oncoplot <- cbind(genes_for_oncoplot,
  frac = driver_res_plus$frac_mut[match(genes_for_oncoplot$Hugo_Symbol, driver_res_plus$Hugo_Symbol)]
)

genes_for_oncoplot <- genes_for_oncoplot[!is.na(genes_for_oncoplot$frac), ]
genes_for_oncoplot <- genes_for_oncoplot[order(genes_for_oncoplot$reason, -genes_for_oncoplot$frac), ]
split_idx <- as.character(genes_for_oncoplot$reason)
split_idx <- factor(split_idx, levels = reasons[reasons %in% split_idx])
split_colors <- rainbow(length(levels(split_idx)))
names(split_colors) <- as.character(genes_for_oncoplot$reason[!duplicated(genes_for_oncoplot$reason)])
split_colors <- list(Reason = split_colors)

# source("scripts/helper_functions.R")
oncomat <- createOncoMatrix(maf.filter, g = genes_for_oncoplot$Hugo_Symbol, add_missing = F)$oncoMatrix
write.table(genes_for_oncoplot, file = paste0(out_dir, "/genes_for_oncoplot.txt"), sep = "\t", quote = F, row.names = F)

include_all <- T
if (include_all) {
  ### createOncoMatrix drops empty samples, so this adds them back in
  all_wes_samples <- as.character(sample_info.exome$Tumor_Sample_Barcode[!is.na(sample_info.exome$Tumor_Sample_Barcode)])
  extra_samples <- setdiff(all_wes_samples, colnames(oncomat))
  empty_data <- matrix(data = "", nrow = nrow(oncomat), ncol = length(extra_samples), dimnames = list(rownames(oncomat), extra_samples))
  oncomat <- cbind(oncomat, empty_data)
}

oncomat <- oncomat[match(genes_for_oncoplot$Hugo_Symbol, rownames(oncomat)), ]
onco_genes <- rownames(oncomat)

#### TCGA Comparison Heatmap (from helper_functions.oncoplot.R)
tcga_comparison_results <- make_TCGA_comparison_heatmap(onco_genes, maf.filter, split_at = split_idx)
tcga_comparison_hm <- tcga_comparison_results[[1]]


#### MONGOLIA oncoplot
## Oncoplot parameters
annotate_empty <- ""
annotation_font_size <- 9
annotation_height_frac <- 0.3
onco_width <- 9
onco_height <- NULL

oncomat.plot <- oncomat
colnames(oncomat.plot) <- sample_info.exome$Patient[match(colnames(oncomat.plot), sample_info.exome$Tumor_Sample_Barcode)]

if (is.null(onco_height)) {
  onco_height <- max(round(0.2 * nrow(oncomat.plot), 0) + 2, 5)
}

hm_anno_info <- as.data.frame(sample_info.exome[match(colnames(oncomat.plot), sample_info.exome$Patient), ])
rownames(hm_anno_info) <- hm_anno_info$Patient
hm_anno_info <- hm_anno_info[, c(10:ncol(hm_anno_info))]
hm_anno_info$sex <- ifelse(is.na(hm_anno_info$sex), "NA", ifelse(hm_anno_info$sex == 1, "M", "F"))


grouping_var <- "hdv"
group_counts <- c(0, table(hm_anno_info[, grouping_var], useNA = "ifany"))
names(group_counts) <- c(names(group_counts)[2:length(group_counts)], "last")
group_levels <- sort(unique(hm_anno_info[, grouping_var]))
plot_idx <- c()
for (currgroup_idx in 1:length(group_levels)) {
  # currgroup_idx=1
  currgroup <- group_levels[currgroup_idx]
  curr_subset <- hm_anno_info[hm_anno_info[, grouping_var] == currgroup, ]
  order_classes <- factor(ifelse(is.na(as.character(curr_subset$class)), "Unknown", as.character(curr_subset$class)))
  curr_idx <- orderByGroup(oncomat.plot[, colnames(oncomat.plot) %in% rownames(curr_subset)], order_classes)
  plot_idx <- c(
    plot_idx,
    rownames(curr_subset)[curr_idx]
  )
}

plot_order <- plot_idx

################### TOGGLE FOR TOP/BOTTOM ANNOTATION ###################
##### This is for only class/cluster on top
# top_anno_names <- c("Class")
# names(top_anno_names) <- c("class")
# bot_anno_names <- c("Sex","Age","HCV","HBV","HDV","Stage","Tumor Size","Cirrhosis","Obesity","Alcohol","Smoker","Family History","alb","bil","alt","afp","multinodular","Risk Group")
# names(bot_anno_names) <- c("sex","age_bin","hcv","hbv","hdv","stage","tumor_size","cirrhosis","obesity","alcohol","smoker","FHx_LC","alb","bil","alt","afp","multinodular","risk_bin") # column names
################### TOGGLE FOR TOP/BOTTOM ANNOTATION ###################
##### This is for cluster+HDV on top
top_anno_names <- c("Class", "HDV", "HBV", "HCV")
names(top_anno_names) <- c("class", "hdv", "hbv", "hcv")
bot_anno_names <- c(
  "Sex", "Age", "Obesity", "Smoking", "Alcohol", "Family History", "Cirrhosis",
  "alb", "bil", "alt", "afp", "Tumor Size", "multinodular", "Stage", "Risk Group"
)

names(bot_anno_names) <- c("sex", "age_bin", "obesity", "smoker", "alcohol", 
                           "FHx_LC", "cirrhosis", "alb", "bil", "alt", "afp", 
                           "tumor_size", "multinodular", "stage", "risk_bin") # column names
########################################################################


hm_anno_info <- hm_anno_info[, unique(c(names(top_anno_names), names(bot_anno_names)))]

annocolors <- my_oncoplot_colors(hm_anno_info)


my_types <- unique(unlist(apply(oncomat.plot, 2, unique)))
my_types <- my_types[!my_types %in% c(NA, "", 0)]

col <- c(
  Nonsense_Mutation = "#ad7aff", 
  Missense_Mutation = "#377EB8", 
  Frame_Shift_Del = "#4DAF4A",
  In_Frame_Ins = "#ff008c", 
  Splice_Site = "#FF7F00", 
  Multi_Hit = "#FFFF33", 
  Frame_Shift_Ins = "#A65628",
  In_Frame_Del = "#f781bf", 
  Translation_Start_Site = "#400085", 
  Nonstop_Mutation = "#b68dfc",
  no_variants = "#d6d6d6"
)

top_anno_data <- data.frame(hm_anno_info[, names(top_anno_names)], row.names = rownames(hm_anno_info))
colnames(top_anno_data) <- names(top_anno_names)
bot_anno_data <- data.frame(hm_anno_info[, names(bot_anno_names)], row.names = rownames(hm_anno_info))
colnames(bot_anno_data) <- names(bot_anno_data)

unmutated_annodata <- ifelse(colSums(nchar(oncomat.plot)) == 0, annotate_empty, "")

annocolors$empty <- c("TRUE" = "black", "FALSE" = "white")
top_height <- onco_height * annotation_height_frac * (ncol(top_anno_data) / ncol(bot_anno_data))
top_anno <- HeatmapAnnotation(
  empty = anno_text(
    unmutated_annodata, 
    gp = gpar(fontsize = 6, fontface = "bold", col = "grey10"), 
    location = unit(0.5, "npc"),
    which = "column"),
  df = top_anno_data, 
  name = "top_anno", 
  col = annocolors, 
  show_annotation_name = T, 
  na_col = "grey70", 
  show_legend = F,
  simple_anno_size_adjust = T, 
  height = unit(top_height, "inches"), 
  annotation_name_gp = gpar(fontsize = annotation_font_size)
)
names(top_anno) <- top_anno_names[names(top_anno)]

bot_anno <- HeatmapAnnotation(
  df = bot_anno_data, 
  name = "bot_anno", 
  col = annocolors, 
  show_annotation_name = T, 
  na_col = "white", 
  show_legend = F,
  simple_anno_size_adjust = T, 
  height = unit(onco_height * annotation_height_frac, "inches"),
  annotation_name_gp = gpar(fontsize = annotation_font_size)
)
names(bot_anno) <- bot_anno_names[names(bot_anno)]


# browser()
names(col) <- gsub("_", " ", names(col))
oncomat.plot <- gsub("_", " ", oncomat.plot)

col_split_idx <- hm_anno_info$hdv
onco_base_default <- oncoPrint(
  oncomat.plot,
  alter_fun = alter_fun, 
  col = col, 
  row_order = 1:nrow(oncomat.plot),
  name = "oncoplot",
  show_pct = F,
  top_annotation = top_anno,
  bottom_annotation = bot_anno,
  row_split = split_idx,
  left_annotation = rowAnnotation(Reason = split_idx, col = split_colors, annotation_width = unit(0.2, "mm")),
  row_title = NULL,
  column_title = NULL,
  column_order = plot_order,
  column_gap = unit(0.01, "npc"),
  column_split = col_split_idx,
  width = 1
)

# browser()
save_name <- paste0(out_dir, "/oncoplot.pdf")

pdf(file = save_name, height = onco_height, width = onco_width)
draw(tcga_comparison_hm + onco_base_default, main_heatmap = 2)

class_labels <- hm_anno_info$class[match(plot_order, rownames(hm_anno_info))]
class_labels[is.na(class_labels)] <- "NA"
class_labels <- factor(class_labels)

slice_labels <- hm_anno_info$hdv[match(plot_order, rownames(hm_anno_info))]
slice_labels[is.na(slice_labels)] <- "NA"
slice_labels <- factor(slice_labels)

myclasses <- split(class_labels, slice_labels)
label_idx <- lapply(myclasses, function(x) {
  # x=myclasses[[1]]
  change_idx <- c(which(as.logical(diff(as.numeric(x)))), length(x))
  change_idx <- change_idx / length(x)
  curr_label_idx <- rep(1, length(change_idx))
  for (i in 1:length(change_idx)) {
    prev_start <- ifelse(length(change_idx[i - 1]) > 0, change_idx[i - 1], 0)
    curr_label_idx[i] <- (change_idx[i] - prev_start) / 2 + prev_start
  }
  mynames <- ifelse(diff(c(0, change_idx)) < 0.1, "", levels(x))
  names(curr_label_idx) <- mynames
  return(curr_label_idx)
})

## Add cluster labels to top annotation
for (slice_num in 1:length(label_idx)) {
  for (class_num in 1:length(label_idx[[slice_num]])) {
    labeltxt <- names(label_idx[[slice_num]])[class_num]
    labelpos <- label_idx[[slice_num]][class_num]
    decorate_annotation("Class", slice = slice_num, {
      grid.text(labeltxt, labelpos,
        0.5,
        default.units = "npc", gp = gpar(fontsize = annotation_font_size * 0.9, fontface = "italic", col = "white")
      )
    })
  }
}

dev.off()


####  TABLE containing data for each variant
output_data <- maf.filter@data[maf.filter@data$Hugo_Symbol %in% onco_genes, ]
output_data$tumor_genotype <- apply(output_data[, c("Tumor_Seq_Allele1", "Tumor_Seq_Allele2")], 1, paste, collapse = "/")
output_data$normal_genotype <- apply(output_data[, c("Match_Norm_Seq_Allele1", "Match_Norm_Seq_Allele2")], 1, paste, collapse = "/")

pheno_info <- sample_info.exome[match(output_data$Tumor_Sample_Barcode, sample_info.exome$Tumor_Sample_Barcode), ]
pheno_info <- cbind(pheno_info[, "Tumor_Sample_Barcode"], pheno_info[, -c("Tumor_Sample_Barcode")])
pheno_columns <- colnames(pheno_info)
names(pheno_columns) <- make.names(pheno_columns, unique = T)

output_data <- cbind(output_data, pheno_info)
cols_for_table <- c(
  "Hugo Symbol" = "Hugo_Symbol",
  "Variant Classification" = "Variant_Classification",
  "Variant Type" = "Variant_Type",
  "Consequence" = "Consequence",
  "Chromosome" = "Chromosome", "Start Position" = "Start_Position", "End Position" = "End_Position", "Strand" = "Strand",
  "Reference Allele" = "Reference_Allele",
  "Tumor Genotype" = "tumor_genotype",
  "Normal Genotype" = "normal_genotype",
  "Transcript Change" = "HGVSc",
  "Protein Change" = "HGVSp_Short",
  "Normal Depth" = "n_depth",
  "Normal Ref Depth" = "n_ref_count",
  "Normal Alt Depth" = "n_alt_count",
  "Tumor Depth" = "t_depth",
  "Tumor Ref Depth" = "t_ref_count",
  "Tumor Alt Depth" = "t_alt_count",
  "Existing Annotation" = "Existing_variation",
  "gnomAD Frequency" = "gnomAD_AF",
  "ExAC Frequency" = "ExAC_AF",
  "1000Genomes Frequency" = "AF",
  "Current Cohort Frequency" = "tumor_freq",
  pheno_columns
)

variant_info <- as.data.frame(output_data)[, cols_for_table]
colnames(variant_info) <- names(cols_for_table)
write.xlsx(variant_info, file = paste0(out_dir, "/Table_2.xlsx"))

tcga.white <- tcga_comparison_results[[2]]
tcga.asian <- tcga_comparison_results[[3]]
tcga_lihc_mc3 <- tcga_comparison_results[[4]]
all_dfs <- list(
  driver_res_plus,
  data.frame(
    MONG_frac_mut = ([email protected]$MutatedSamples / as.numeric(maf.filter@summary$summary[3])),
    Hugo_Symbol = [email protected]$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_All_frac_mut = ([email protected]$MutatedSamples / as.numeric(tcga_lihc_mc3@summary$summary[3])),
    Hugo_Symbol = [email protected]$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_Asian_frac_mut = ([email protected]$MutatedSamples / as.numeric(tcga.asian@summary$summary[3])),
    Hugo_Symbol = [email protected]$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame(
    TCGA_White_frac_mut = ([email protected]$MutatedSamples / as.numeric(tcga.white@summary$summary[3])),
    Hugo_Symbol = [email protected]$Hugo_Symbol, stringsAsFactors = F
  ),
  data.frame([email protected])
)


all_dfs_merged <- all_dfs %>%
  Reduce(function(dtf1, dtf2) left_join(dtf1, dtf2, by = "Hugo_Symbol"), .)

write.xlsx(all_dfs_merged, file = paste0(out_dir, "/gene_mutsig_info.xlsx")) # ,asTable = T)

参考

  • The genomic landscape of Mongolian hepatocellular carcinoma

标签:数据分析,genes,driver,library,可视化,freq,基因突变
From: https://www.cnblogs.com/bioinformatics-hua/p/18554207

相关文章

  • (分享源码)计算机毕业设计必看必学 上万套实战教程手把手教学JAVA、PHP,node.js,C++、pyth
     摘 要21世纪的今天,随着社会的不断发展与进步,人们对于信息科学化的认识,已由低层次向高层次发展,由原来的感性认知向理性认知提高,管理工作的重要性已逐渐被人们所认识,科学化的管理,使信息存储达到准确、快速、完善,并能提高工作管理效率,促进其发展。论文主要是对医疗门诊管理......
  • (分享源码)计算机毕业设计必看必学 上万套实战教程手把手教学JAVA、PHP,node.js,C++、pyth
     摘 要随着我国经济迅速发展,人们对手机的需求越来越大,各种手机软件也都在被广泛应用,但是对于手机进行数据信息管理,对于手机的各种软件也是备受用户的喜爱,校园跳蚤市场管理系统被用户普遍使用,为方便用户能够可以随时进行校园跳蚤市场管理系统的数据信息管理,特开发了基于spri......
  • R语言数据分析案例45-全国汽车销售数据分析(可视化与回归分析)
    一、研究背景随着经济的发展和人们生活水平的提高,汽车已经成为人们日常生活中不可或缺的交通工具之一。汽车市场的规模不断扩大,同时竞争也日益激烈。对于汽车制造商和经销商来说,深入了解汽车销售数据背后的规律和影响因素,对于制定合理的生产计划、营销策略以及提高市场竞争力......
  • 基于python+django的旅游数据分析与推荐系统
    前言基于python+django的旅游数据分析与推荐系统,为游客提供了智能化的旅游决策支持。系统从多种途径收集旅游数据,包括旅游网站的用户评价、景点预订信息、酒店入住数据等。通过数据清洗和预处理,确保数据的准确性和可用性。在分析方面,它能挖掘出有价值的信息。例如,......
  • 基于python+django的国内运动男装小红书文章数据可视化分析系统的设计与实现
    前言基于python+django的国内运动男装小红书文章数据可视化分析系统,为深入了解运动男装市场在小红书平台的表现提供了有效途径。系统通过网络爬虫技术收集小红书上有关国内运动男装的文章数据,包括文章内容、点赞数、收藏数、评论数、发布者信息等。对这些数据进行清......
  • 基于python+django的广东省人口流动数据分析系统
    前言基于python+django的广东省人口流动数据分析系统,是深入了解广东省人口动态的有力工具。系统能够收集多渠道的人口流动数据,包括交通枢纽的客流数据、社区登记信息、手机信令数据等。通过数据清洗和预处理,去除错误和冗余信息,保证数据质量。在分析方面,可从不同维度......
  • WebGL 被视为前端开发天花板,那3D可视化和它相比呢
    一、WebGL:前端开发的强大利器WebGL是一种基于OpenGLES的JavaScriptAPI,它允许在网页浏览器中呈现交互式2D和3D图形,而无需安装额外的插件。其强大之处主要体现在以下几个方面:高性能图形渲染WebGL能够利用图形硬件加速,实现高效的图形渲染。这使得在网页上展示复杂......
  • 【大数据分析&机器学习】分布式机器学习
    【作者主页】FrancekChen【专栏介绍】⌈⌈⌈智能大数据分析⌋......
  • 从零开始学机器学习——聚类可视化
    首先给大家介绍一个很好用的学习地址:https://cloudstudio.net/columns在上一章节中,我们对聚类的相关知识进行了全面的介绍,旨在为大家打下坚实的理论基础。今天,我们的主要任务是深入探讨数据可视化的技术和方法。在之前的学习中,我们已经接触过回归分析中的可视化技术,而今天我们将......
  • 一款基于 Java 开发的微信数据分析工具!
    大家好,我是Java陈序员。今天,给大家介绍一款基于Java开发的微信数据分析工具!关注微信公众号:【Java陈序员】,获取开源项目分享、AI副业分享、超200本经典计算机电子书籍等。项目介绍wx-dump-4j——一款基于Java开发的微信数据分析工具。它准确显示好友数、群聊数和当日......