参考博客
https://mp.weixin.qq.com/s/uwO9G_71h8kU3lTWsW_zPw
https://www.jianshu.com/p/1a23656a0713
https://zhuanlan.zhihu.com/p/520071927
数据质控
fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_1.fq
fastqc -o 0_fastqc/ -f fastq -t 10 --noextract ./0_rawdata/*_2.fq
bowtie2建立索引
wget ftp://ftp.ensembl.org/pub/release-102/fasta/sus_scrofa/dna/Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz
gunzip Sus_scrofa.Sscrofa11.1.dna.toplevel.fa.gz && mv Sus_scrofa.Sscrofa11.1.dna.toplevel.fa pig.fa
bowtie2-build genome/pig.fa genome/pig
slurm提交
#!/bin/bash
#SBATCH --job-name=sbatch_cuttag
#SBATCH --cpus-per-task=40
#SBATCH -o 1_bowtie2.out
#SBATCH --nodelist=comput3
bowtie2比对
比对到猪基因组
cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/${id}.sam &"
done > bowtie2.sh
SpikeIn处理
不确定在猪的cuttag中是否需要进行spikeIn,如果需要,可以参考以下代码
比对到大肠杆菌基因组
# 下载大肠杆菌基因组
wget ftp://ftp.ensemblgenomes.org/pub/bacteria/release-48/fasta/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655/dna/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz
gunzip Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz && mv Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa ecoli.fa
# 建立索引
bowtie2-build genome/e.coli/ecoli.fa genome/e.coli/ecoli
# 比对
cat sample.txt | while read id
do
echo "bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/e.coli/ecoli -1 ./0_rawdata/${id}_1.fq -2 ./0_rawdata/${id}_2.fq -S ./1_bowtie2/ecoli/${id}.sam & > ${id}_bowtie2_spikeIn.txt"
done > bowtie2_ecoli.sh
获取测序深度值
while read id
do
{
seqDepthDouble=`samtools view -F 0x04 ./1_bowtie2/ecoli/${id}.sam | wc -l`
seqDepth=$((seqDepthDouble/2))
echo $seqDepth > ./1_bowtie2/ecoli/${id}_bowtie2_spikeIn.seqDepth
} &
done < sample.txt
wait
文件格式的转换,获得bed文件
while read i
do
{
## 转换为 BAM 文件格式,并且仅保留成功映射到参考序列上的读取
samtools view -bS -F 0x04 ./1_bowtie2/pig/${i}.sam > ./1_mapped/${i}.bam
## 将 BAM 文件转换为 bedpe 文件格式
bedtools bamtobed -i ./1_mapped/${i}.bam -bedpe > ./2_bed/${i}.bedpe
## 将 BAM 文件转换为 bed 文件格式
bedtools bamtobed -i ./1_mapped/${i}.bam > ./2_bed/${i}.bed
## 保留那些在同一条染色体且片段长度小于 1000bp 的双端 reads
awk '$1==$4 && $6-$2 < 1000 {print $0}' ./2_bed/${i}.bed > ./2_bed/${i}.clean.bed
## 仅提取片段相关的列
cut -f 1,2,6 ./2_bed/${i}.clean.bed | sort -k1,1 -k2,2n -k3,3n > ./3_fragments/${i}.fragments.bed
} &
done < sample.txt
wait
评估重复性
## 评估重复性
## 为了研究重复之间和不同条件下的可重复性,基因组被分成 500 bp bin,每个 bin reads 计数的 log2 转换值的皮尔逊相关性在重复数据集之间计算。多个重复和 IgG 对照数据集显示在层次化聚类关联矩阵中。
while read i
do
{
binLen=500
awk -v w=$binLen '{print $1, int(($2 + $3)/(2*w))*w + w/2}' ./3_fragments/${i}.fragments.bed \
sort -k1,1V -k2,2n \
uniq -c \
awk -v OFS="\t" '{print $2, $3, $1}' \
sort -k1,1V -k2,2n > ./4_bin/${i}.fragmentsCount.bin$binLen.bed
} &
done < sample.txt
wait
加入测序深度对bed文件进行标准化处理
需要有大肠杆菌的测序深度值,如没有不进行标准化处理
# 标准化
samtools faidx ./genome/pig/pig.fa ##后面的bedtools -g的内容通过samtools的faidx来进行构建获得索引文件
# 这里的seqDepth是前面的数据比对处理当中获得测序深度值,可以调取txt文件,来进行后续的循环处理
while read i
do
{
# 读取测序深度值
seqDepth=`cat ./1_bowtie2/ecoli/${i}_bowtie2_spikeIn.seqDepth`
# 根据测序深度值进行标准化处理
if [[ "$seqDepth" -gt "1" ]]; then
scale_factor=`echo "10000 / $seqDepth" | bc -l`
echo "Scaling factor for $histName is: $scale_factor!"
bedtools genomecov -bg -scale $scale_factor -i ./3_fragments/${i}.fragments.bed -g ./genome/pig/pig.fa.fai > ./5_normalized/${i}.fragments.normalized.bedgraph
fi
} &
done < sample.txt
wait
call peak
# 用SEACR进行peak calling
seacr="seacr.sh"
cat sample.txt | while read i
do
bash $seacr ./5_normalized/${i}.fragments.normalized.bedgraph 0.01 non stringent ./6_peak/${i}_seacr_top0.01.peaks.bed
done
snakemake
shell.executable("/bin/bash")
configfile: "./config.yaml"
SAMPLES=['RD29C36_N501_701', 'RD29C36-N501-702', 'RD29C36-N501-702', 'RD29C36-N501-703', 'RD29C36-N501-703', 'RD29C36-N501-704', 'RD29C36-N501-704', 'RD29C36-N502-701', 'RD29C36-N502-701', 'RD29-N502-702', 'RD29-N502-702', 'RD29-N502-703', 'RD29-N502-703', 'RD29-N502-704', 'RD29-N502-704', 'M2-oocyte-501-704', 'M2-oocyte-501-704', 'M2-oocyte-501-706', 'M2-oocyte-501-706', 'no-oocyte-501-705', 'no-oocyte-501-705', 'RD29C36_N501_701']
rule all:
input:
"1_fastqc/multiqc_report.html",
"genome/pig.complete",
expand("6_bed/{sample}.clean.bed", sample=SAMPLES),
expand("7_bigwig/{sample}.bw", sample=SAMPLES)
rule fastqc1:
input:
"0_rawdata/{sample}_1.fq.gz",
"0_rawdata/{sample}_2.fq.gz"
output:
"1_fastqc/{sample}_1_fastqc.html",
"1_fastqc/{sample}_2_fastqc.html"
shell:
"fastqc -o 1_fastqc/ -f fastq -t {config[threads]} --noextract {input[0]} {input[1]}"
rule multiqc1:
input:
expand("1_fastqc/{sample}_1_fastqc.html", sample=SAMPLES),
expand("1_fastqc/{sample}_2_fastqc.html", sample=SAMPLES)
output:
"1_fastqc/multiqc_report.html"
shell:
"multiqc 1_fastqc/ -o 1_fastqc/"
rule bowtie2build:
input:
fa="genome/pig.fa"
output:
touch("genome/pig.complete")
shell:
"bowtie2-build {input.fa} genome/pig && touch {output}"
rule bowtie2:
input:
"0_rawdata/{sample}_1.fq.gz",
"0_rawdata/{sample}_2.fq.gz"
output:
"2_bowtie2/{sample}.sam"
shell:
"""
bowtie2 --end-to-end --very-sensitive --no-mixed --no-discordant --phred33 -I 10 -X 700 -p 8 -x genome/pig -1 {input[0]} -2 {input[1]} -S {output}
"""
rule sam2bam:
input:
"2_bowtie2/{sample}.sam"
output:
"3_bam/{sample}.bam"
shell:
"samtools view -@ {config[threads]} -F 0x04 -bS {input} > {output}"
rule bam2bed:
input:
"3_bam/{sample}.bam"
output:
"4_bed/{sample}.bedpe"
shell:
"bedtools bamtobed -i {input} -bedpe > {output}"
rule cleanbed:
input:
"4_bed/{sample}.bedpe"
output:
"5_bed/{sample}.clean.bedpe"
shell:
"awk '$1==$4 && $6-$2 < 1000 {{print $0}}' {input} > {output}"
rule bedpe2bed:
input:
"5_bed/{sample}.clean.bedpe"
output:
"6_bed/{sample}.clean.bed"
shell:
"cut -f 1,2,6 {input} | sort -k1,1 -k2,2n -k3,3n > {output}"
rule bamsort:
input:
"3_bam/{sample}.bam"
output:
"3_bam/{sample}.sorted.bam"
shell:
"samtools sort -@ {config[threads]} -o {output} {input}"
rule bamindex:
input:
"3_bam/{sample}.sorted.bam"
output:
"3_bam/{sample}.sorted.bam.bai"
shell:
"samtools index {input}"
rule bam2bigwig:
input:
"3_bam/{sample}.sorted.bam",
"3_bam/{sample}.sorted.bam.bai"
output:
"7_bigwig/{sample}.bw"
shell:
"bamCoverage -b {input[0]} -o {output}"
标签:--,流程,bed,sample,cuttag,output,input,存疑,bowtie2
From: https://www.cnblogs.com/cauwj/p/18137150