多个生信分析上游分析Snakemake的编写

标签：分析 output fastqc gz sample Snakemake input bam 生信

准备基本的包

conda install -c bioconda snakemake samtools hisat2 trim-galore subread -y

准备数据

wget https://ftp.ensembl.org/pub/release-110/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz
wget https://ftp.ensembl.org/pub/release-110/gtf/sus_scrofa/Sus_scrofa.Sscrofa11.1.110.gtf.gz

# 解压
gunzip Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz && mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa pig.fa
gunzip Sus_scrofa.Sscrofa11.1.110.gtf.gz && mv Sus_scrofa.Sscrofa11.1.110.gtf pig.gtf

# 以及相关的样本数据
$ ls
# B1-nao_1.fastq.gz   B2-1nao_2.fastq.gz  B4-1nao_1.fastq.gz  B5-nao_2.fastq.gz  B8-nao_1.fastq.gz
# B1-nao_2.fastq.gz   B3-1nao_1.fastq.gz  B4-1nao_2.fastq.gz  B7-nao_1.fastq.gz  B8-nao_2.fastq.gz
# B2-1nao_1.fastq.gz  B3-1nao_2.fastq.gz  B5-nao_1.fastq.gz   B7-nao_2.fastq.gz  md5.txt

# MD5校验
md5sum -c md5.txt

构建索引

hisat2-build -p 8 pig.fa pig

bash脚本运行

# 获取样本名字
ls 0_rawdata/*.fastq.gz > list.txt
awk -F '/' '{print $2}' list.txt | awk -F '_' '{print $1}' | sort | uniq > list1.txt && mv list1.txt list.txt
# 生成索引
hisat2-build -p 8 pig.fa pig

for i in `cat list.txt`
do
  nohup trim_galore -j 8 -o 1_trim/ --paired 0_rawdata/${i}_1.fastq.gz 0_rawdata/${i}_2.fastq.gz &
done

wait

for i in `cat list.txt`
do
  nohup hisat2 -p 8 -x pig -1 1_trim/${i}_1_val_1.fq.gz -2 1_trim/${i}_2_val_2.fq.gz | samtools view -bS - > 2_alignment/${i}.bam &
done

wait

for i in `cat list.txt`
do
  nohup featureCounts -T 8 -t exon -g gene_id -a ./genome/pig.gtf -o 3_featureCounts/${i}.txt 2_alignment/${i}.bam &
done

攥写Snakefile

config.yaml

gtf: "./genome/pig.gtf"
index: "./genome/pig"
threads: 20

Snakefile

shell.executable("/bin/bash")
configfile: "./config.yaml"

(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fastq.gz")
READS=["1","2"]

rule all: # all要包含所有没被其他rule调用的output
    input:
        expand("3_featureCounts/{sample}.txt", sample=SAMPLES),
        expand("4_coverage/{sample}.bw", sample=SAMPLES),
        "pig.ht2.index.complete"

rule print_samples:
    run:
        print("SAMPLES:", SAMPLES)

rule trim_galore:
    input:
        "0_rawdata/{sample}_1.fastq.gz",
        "0_rawdata/{sample}_2.fastq.gz"
    output:
        "1_trim/{sample}_1_val_1.fq.gz",
        "1_trim/{sample}_2_val_2.fq.gz"
    shell:
        "trim_galore --cores {config[threads]} --output_dir 1_trim/ --paired {input}"


rule hisat2build: # rule的名字不能有-，否则会报错Expected name or colon after rule keyword. 
    input:
        fa="genome/pig.fa"
    output:
        touch("pig.ht2.index.complete")
    shell:
        "hisat2-build -p {config[threads]} {input.fa} genome/pig && touch {output}"

rule hisat2:
    input:
        trimmed1="1_trim/{sample}_1_val_1.fq.gz",
        trimmed2="1_trim/{sample}_2_val_2.fq.gz",
        index="pig.ht2.index.complete"
    output:
        "2_alignment/{sample}.bam"
    shell:
        "hisat2 -p {config[threads]} -x genome/pig -1 {input.trimmed1} -2 {input.trimmed2} | samtools view -bS - > {output}"

rule featureCounts:
    input:
        "2_alignment/{sample}.bam"
    output:
        "3_featureCounts/{sample}.txt"
    shell:
        """
        featureCounts -T {config[threads]} -t exon -g gene_id -a {config[gtf]} -o {output[0]} -p {input[0]}
        """

rule bamsortindex:
    input:
        "2_alignment/{sample}.bam"
    output:
        sort="2_alignment/{sample}.sorted.bam",
	index="2_alignment/{sample}.sorted.bam.bai"
    shell:
        """
	samtools sort -@ {config[threads]} -o {output.sort} {input} && samtools index {output.sort}
        """

rule bamCoverage:
    input:
        bam="2_alignment/{sample}.sorted.bam",
        index="2_alignment/{sample}.sorted.bam.bai"
    output:
        "4_coverage/{sample}.bw"
    shell:
        "bamCoverage -b {input.bam} -o {output} --numberOfProcessors {config[threads]}"

攥写运行snakefile的SLURM递交脚本

#!/bin/bash
#SBATCH --job-name=sbatch_rna_flow
#SBATCH --cpus-per-task=40
#SBATCH -o job.out
#SBATCH --nodelist=comput3
snakemake -j 40

其他相关操作

snakemake --cluster "qsub -V -cwd -q v01" -j 10
--cluster 集群运行指令
qusb -V -cwd -q 表示输出当前环境变量(-V),在当前目录下运行(-cwd), 投递到指定的队列(-q), 如果不指定则使用任何可用队列
--local-cores N: 在每个集群中最多并行N核
--cluster-config/-u FILE: 集群配置文件

RNAseq表达定量

library(DESeq2)
counts <- read.table("counts.txt", header=TRUE, row.names=1)
countsTable <- as.matrix(counts[,(6:ncol(counts))])
# 每个样本的总reads数
totalReads <- t(as.matrix(colSums(countsTable)))

length <- as.matrix(counts[,5])

# 每个样本基因的fpkm等于每个样本基因的reads数除以每个样本的总reads数，再除以对应基因的长度
## 样本总reads数乘基因的长度获得矩阵
totalReadsLength <- length %*% totalReads
## 每个样本基因的fpkm等于countsTable矩阵除以totalReadsLength矩阵对应位置的值
fpkm <- 10^9 * countsTable / totalReadsLength

# 保存，并保留行名和列名
write.table(fpkm, "fpkm.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)

更改fpkm的列名，使gene_id转化为symbol

library(clusterProfiler) # 不包含猪的ensembl id
library(org.Ss.eg.db)
# 下载biomaRt
# if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")
# BiocManager::install("biomaRt")
library(biomaRt)

# 显示能连接的数据库
listMarts()
#选择使用哪个数据库
Mart <- useMart('ENSEMBL_MART_ENSEMBL')
#列出并选择猪的基因组注释
listDatasets(Mart)
# 188                                      Pig genes (Sscrofa11.1)
Mart <- useDataset('sscrofa_gene_ensembl', mart = Mart)

#查看可以输入的gene ID的类型
listFilters(Mart)
#查看可以输出的gene ID的类型
listAttributes(Mart)

fpkm <- read.table("fpkm.txt", header=TRUE, row.names=1)
gene_id <- rownames(fpkm)
# > head(gene_id)
# [1] "ENSSSCG00000028996"
# [2] "ENSSSCG00000005267"
# [3] "ENSSSCG00000005268"
# [4] "ENSSSCG00000005269"
# [5] "ENSSSCG00000031382"
# [6] "ENSSSCG00000005271"

# 根据输入输出类型使用getBM()函数进行gene ID转换
geneconvert <- getBM(attributes=c('ensembl_gene_id', 'external_gene_name'), 
                     filters='ensembl_gene_id', 
                     values=gene_id, 
                     mart=Mart)

# 保存
write.table(geneconvert, "geneconvert.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)

# getBM(attributes, filters = "", values = "", mart, curl = NULL, checkFilters = TRUE, verbose = FALSE, uniqueRows = TRUE, bmHeader = FALSE,quote = "\", useCache = TRUE)
# attributes定义gene ID输出类型（支持多类型输出），filters定义gene ID输入类型，values指定所输入的gene ID，mart为所用的数据库。但需要注意的是在输出时，需要带上输入的gene ID类型，否则会无法分清输出结果

# 将fpkm的行名替换为symbol，缺失的symbol用gene_id代替，重复的rowname添加后缀
# 假设 geneconvert 数据框包含两列：ensembl_gene_id 和 external_gene_name

# 匹配 fpkm 矩阵的行名和 geneconvert 数据框中的 ensembl_gene_id，并获取对应的 external_gene_name
matching_names <- geneconvert$external_gene_name[match(rownames(fpkm), geneconvert$ensembl_gene_id)]

# 创建一个空的向量用于存储新的行名
new_row_names <- character(length = length(matching_names))

# 遍历匹配到的行名
for (i in 1:length(matching_names)) {
  # 如果存在匹配的 external_gene_name，则使用它作为新的行名
  if (!is.na(matching_names[i])) {
    new_row_names[i] <- matching_names[i]
  } else {
    # 如果缺失 external_gene_name，则使用对应的 gene_id 作为新的行名
    new_row_names[i] <- geneconvert$ensembl_gene_id[match(rownames(fpkm)[i], geneconvert$ensembl_gene_id)]
  }
}

# 处理重复的行名
unique_names <- make.unique(new_row_names, sep = "_")

# 将新的行名设置为 fpkm 矩阵的行名
rownames(fpkm) <- unique_namesd

# 保存
write.table(fpkm, "fpkm_symbol.txt", sep="\t", quote=FALSE, col.names=NA, row.names=TRUE)

攥写wgbs的Snakefile

config.yaml

threads: 8
index: "./genome"

Snakefile

shell.executable("/bin/bash")
configfile: "./config.yaml"

(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fastq.gz")

rule all:
    input: 
        expand("5_bismark_methylation_extractor/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz", sample=SAMPLES),
	    "1_fastqc/multiqc_report.zip",
    	"2_fastqc/multiqc_report.zip"

rule fastqc1:
    input:
        "0_rawdata/{sample}_1.fastq.gz",
        "0_rawdata/{sample}_2.fastq.gz"
    output:
        "1_fastqc/{sample}_1_fastqc.zip",
        "1_fastqc/{sample}_1_fastqc.html",
        "1_fastqc/{sample}_2_fastqc.zip",
        "1_fastqc/{sample}_2_fastqc.html"
    shell:
        "fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc1:
    input:
        expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
        expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
    output:
        "1_fastqc/multiqc_report.zip"
    shell:
        "multiqc 1_fastqc/ -o 1_fastqc/"

rule trim_galore:
    input:
        "0_rawdata/{sample}_1.fastq.gz",
        "0_rawdata/{sample}_2.fastq.gz"
    output:
        "2_trim/{sample}_1_val_1.fq.gz",
        "2_trim/{sample}_2_val_2.fq.gz"
    shell:
        "trim_galore -j {config[threads]} -o 2_trim/ --paired {input[0]} {input[1]}"

rule fastqc2:
    input:
        "2_trim/{sample}_1_val_1.fq.gz",
        "2_trim/{sample}_2_val_2.fq.gz"
    output:
        "2_fastqc/{sample}_1_fastqc.zip",
        "2_fastqc/{sample}_1_fastqc.html",
        "2_fastqc/{sample}_2_fastqc.zip",
        "2_fastqc/{sample}_2_fastqc.html"
    shell:
        "fastqc -o 2_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc2:
    input:
        expand("2_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
        expand("2_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
    output:
        "2_fastqc/multiqc_report.zip"
    shell:
        "multiqc 2_fastqc/ -o 2_fastqc/"

# rule bismark_genome_preparation:
#     input:
#         "genome"
#     output:
#         touch("genome/Bisulfite_Genome/CT_conversion/genome_mfa.CT_conversion.fa")
#     shell:
#         "bismark_genome_preparation --bowtie2 --parallel {config[threads]} {input}"

rule bismark:
    input:
        fq1="2_trim/{sample}_1_val_1.fq.gz",
        fq2="2_trim/{sample}_2_val_2.fq.gz",
        index="genome/Bisulfite_Genome/CT_conversion/genome_mfa.CT_conversion.fa"
    output:
        "3_alignment/{sample}_1_val_1_bismark_bt2_pe.bam"
    shell:
        "bismark --bowtie2 --genome genome -1 {input.fq1} -2 {input.fq2} -o 3_alignment/"

rule deduplicate:
    input:
        "3_alignment/{sample}_1_val_1_bismark_bt2_pe.bam"
    output:
        "4_deduplicate/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bam"
    shell:
        "deduplicate_bismark -p --bam {input} --output_dir 4_deduplicate/"

rule bismark_methylation_extractor:
    input:
        "4_deduplicate/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bam"
    output:
        "5_bismark_methylation_extractor/{sample}_1_val_1_bismark_bt2_pe.deduplicated.bismark.cov.gz"
    shell:
        "bismark_methylation_extractor -p --bedGraph --multicore {config[threads]} --output 5_bismark_methylation_extractor/ {input}"

撰写Chip-seq的Snakefile

config.yaml

threads: 8
index: "./genome"

Snakefile

shell.executable("/bin/bash")
configfile: "./config.yaml"

prefix = "SRR13289"
start = 578
end = 593

SAMPLES = []

for id in range(start, end + 1):
    file_name = f"{prefix}{id}"
    SAMPLES.append(file_name)

rule all:
    input: 
        "1_fastqc/multiqc_report.html",
        "2_fastqc/multiqc_report.html",
        expand("3_bamqc/{sample}.sorted.bam.stat", sample=SAMPLES),
        expand("5_dedup/{sample}.sorted.dedup.bam.bai", sample=SAMPLES),
        expand("6_bigwig/{sample}.bw", sample=SAMPLES),
        "genome/pig.complete"

rule fastqc1:
    input:
        "0_rawdata/{sample}_1.fastq.gz",
        "0_rawdata/{sample}_2.fastq.gz"
    output:
        "1_fastqc/{sample}_1_fastqc.zip",
        "1_fastqc/{sample}_2_fastqc.zip"
    shell:
        "fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc1:
    input:
        expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
        expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
    output:
        "1_fastqc/multiqc_report.html"
    shell:
        "multiqc 1_fastqc/ -o 1_fastqc/"

rule trim_galore:
    input:
        "0_rawdata/{sample}_1.fastq.gz",
        "0_rawdata/{sample}_2.fastq.gz"
    output:
        "2_trim/{sample}_1_val_1.fq.gz",
        "2_trim/{sample}_2_val_2.fq.gz"
    shell:
        "trim_galore -j {config[threads]} -o 2_trim/ --paired {input[0]} {input[1]}"

rule fastqc2:
    input:
        "2_trim/{sample}_1_val_1.fq.gz",
        "2_trim/{sample}_2_val_2.fq.gz"
    output:
        "2_fastqc/{sample}_1_val_1_fastqc.zip",
        "2_fastqc/{sample}_2_val_2_fastqc.zip"
    shell:
        "fastqc -o 2_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc2:
    input:
        expand("2_fastqc/{sample}_1_val_1_fastqc.zip", sample=SAMPLES),
        expand("2_fastqc/{sample}_2_val_2_fastqc.zip", sample=SAMPLES)
    output:
        "2_fastqc/multiqc_report.html"
    shell:
        "multiqc 2_fastqc/ -o 2_fastqc/"

rule bowtie2build:
    input:
        fa="genome/pig.fa"
    output:
        touch("genome/pig.complete")
    shell:
        "bowtie2-build {input.fa} genome/pig && touch {output}"

rule bowtie2:
    input:
        "2_trim/{sample}_1_val_1.fq.gz",
        "2_trim/{sample}_2_val_2.fq.gz"
    output:
        "3_sort/{sample}.sam"
    shell:
        "bowtie2 -p 8 -x ./genome/pig -1 {input[0]} -2 {input[1]} -S {output}"

rule sam2bam:
    input:
        "3_sort/{sample}.sam"
    output:
        "3_sort/{sample}.bam"
    shell:
        "samtools view -@ {config[threads]} -bS {input} > {output}"

rule samtoolssort1:
    input:
        "3_sort/{sample}.bam"
    output:
        "3_sort/{sample}.sorted.bam"
    shell:
        "samtools sort -@ {config[threads]} -n -o {output} {input}"

rule bamqc:
    input:
        "3_sort/{sample}.sorted.bam"
    output:
        "3_bamqc/{sample}.sorted.bam.stat"
    shell:
        "samtools flagstat {input} > {output}"

rule fixmate:
    input:
        "3_sort/{sample}.sorted.bam"
    output:
        "4_fixmate/{sample}.sorted.fixmate.bam"
    shell:
        "samtools fixmate -m -@ {config[threads]} {input} {output}"

rule samtoolssort2:
    input:
        "4_fixmate/{sample}.sorted.fixmate.bam"
    output:
        "4_sort/{sample}.sorted.bam"
    shell:
        "samtools sort -@ {config[threads]} -o {output} {input}"

rule deduplicate:
    input:
        "4_sort/{sample}.sorted.bam"
    output:
        "5_dedup/{sample}.sorted.dedup.bam"
    shell:
        "samtools markdup -@ {config[threads]} {input} {output}"

rule bamindex:
    input:
        "5_dedup/{sample}.sorted.dedup.bam"
    output:
        "5_dedup/{sample}.sorted.dedup.bam.bai"
    shell:
        "samtools index {input}"

rule bam2bigwig:
    input:
        "5_dedup/{sample}.sorted.dedup.bam",
	"5_dedup/{sample}.sorted.dedup.bam.bai"
    output:
        "6_bigwig/{sample}.bw"
    shell:
        "bamCoverage -b {input[0]} -o {output} -bs 10 --normalizeUsing BPM"

攥写cuttag的Snakefile

config.yaml

threads: 8
index: "./genome"

Snakefile

shell.executable("/bin/bash")
configfile: "./config.yaml"

(SAMPLES,READS,) = glob_wildcards("0_rawdata/{sample}_{read}.fq.gz")
READS=["1","2"]

rule all:
    input:
        "1_fastqc/multiqc_report.zip",
        "genome/pig.complete",
        expand("6_bed/{sample}.clean.bed", sample=SAMPLES),
        expand("7_bigwig/{sample}.bw", sample=SAMPLES)

rule fastqc:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "1_fastqc/{sample}_1_fastqc.zip",
        "1_fastqc/{sample}_2_fastqc.zip"
    shell:
        "fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"

rule multiqc:
    input:
        expand("1_fastqc/{sample}_1_fastqc.zip", sample=SAMPLES),
        expand("1_fastqc/{sample}_2_fastqc.zip", sample=SAMPLES)
    output:
        "1_fastqc/multiqc_report.zip"
    shell:
        "multiqc 1_fastqc/ -o 1_fastqc/"

rule bowtie2build:
    input:
        fa="genome/pig.fa"
    output:
        touch("genome/pig.complete")
    shell:
        "bowtie2-build {input.fa} genome/pig && touch {output}"

rule bowtie2:
    input:
        "0_rawdata/{sample}_1.fq.gz",
        "0_rawdata/{sample}_2.fq.gz"
    output:
        "2_bowtie2/{sample}.sam"
    shell:
        """
        touch {output.log}
        bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 {input[0]} -2 {input[1]} -S {output}
        """

rule sam2bam:
    input:
        "2_bowtie2/{sample}.sam"
    output:
        "3_bam/{sample}.bam"
    shell:
        "samtools view -@ {config[threads]} -F 0x04 -bS {input} > {output}"

rule bam2bed:
    input:
        "3_bam/{sample}.bam"
    output:
        "4_bed/{sample}.bedpe"
    shell:
        "bedtools bamtobed -i {input} -bedpe > {output}"

rule cleanbed:
    input:
        "4_bed/{sample}.bedpe"
    output:
        "5_bed/{sample}.clean.bedpe"
    shell:
        "awk '$1==$4 && $6-$2 < 1000 {print $0}' {input} > {output}"

rule bedpe2bed:
    input:
        "5_bed/{sample}.clean.bedpe"
    output:
        "6_bed/{sample}.clean.bed"
    shell:
        "cut -f 1,2,6 {input} | sort -k1,1 -k2,2n -k3,3n  > {output}"

rule bamsort:
    input:
        "3_bam/{sample}.bam"
    output:
        "3_bam/{sample}.sorted.bam"
    shell:
        "samtools sort -@ {config[threads]} -o {output} {input}"

rule bamindex:
    input:
        "3_bam/{sample}.sorted.bam"
    output:
        "3_bam/{sample}.sorted.bam.bai"
    shell:
        "samtools index {input}"

rule bam2bigwig:
    input:
        "3_bam/{sample}.sorted.bam",
        "3_bam/{sample}.sorted.bam.bai"
    output:
        "7_bigwig/{sample}.bw"
    shell:
        "bamCoverage -b {input[0]} -o {output}"

标签：分析,output,fastqc,gz,sample,Snakemake,input,bam,生信
From： https://www.cnblogs.com/cauwj/p/18137147

多个生信分析上游分析Snakemake的编写

准备基本的包

准备数据

构建索引

bash脚本运行

攥写Snakefile

config.yaml

Snakefile

攥写运行snakefile的SLURM递交脚本

其他相关操作

RNAseq表达定量

更改fpkm的列名，使gene_id转化为symbol

攥写wgbs的Snakefile

config.yaml

Snakefile

撰写Chip-seq的Snakefile

config.yaml

Snakefile

攥写cuttag的Snakefile

config.yaml

Snakefile

相关文章

赞助商

阅读排行