001、数据和脚本
[root@PC1 test2]# ls GCF_001704415.1_ARS1_genomic.gff record.sh
002、脚本
[root@PC1 test2]# cat record.sh ## 脚本内容 #!/bin/bas ## step1: eliminate the influence of pseudogene grep -v "^#" GCF_001704415.1_ARS1_genomic.gff | awk -F "\t" 'BEGIN{tag = "yes"}{if($3 == "pseudogene") {tag = "no"}; if($3 == "gene") {tag = "yes"}; if(tag == "yes") {print $0}}' > result.gff ## step2: exclude the genes without exon awk -F "\t" 'BEGIN{sum = 0}{if($3 == "gene"){sum++} else {sum = 0}; if(sum > 1) {print NR - 1}}' result.gff | while read i do sed -i "$i s/^/detele_tag\t/" result.gff done sed -i '/detele_tag\t/d' result.gff ## step3: num=0 awk -F "\t" '$3 == "gene"{print $NF}' result.gff | cut -d ";" -f 1 | while read i do ## 提取基因ID let num=$num+1 check1=$(grep "$i;" result.gff | wc -l) if [ $check1 -ne 1 ] then echo "$i layer1 check" exit fi grep "$i;" result.gff >> 001.gff ## 基因ID 写入文件 str1=$(grep "$i;" result.gff | awk -F "\t" '{print $NF}' | cut -d ";" -f 1 | cut -d "=" -f 2) check2=$(grep "Parent=$str1;" result.gff | wc -l) echo "$i: $check2" >> 002.txt ## 记录每个基因的转录本数目 if [ $check2 -lt 1 ] then echo "$i layer 2 check" exit fi grep "Parent=$str1;" result.gff > tmp for j in $(seq $check2) do ## 提取每一个转录本的ID str2=$(sed -n "$j"p tmp | awk -F "\t" '{print $NF}' | cut -d ";" -f 1 | cut -d "=" -f 2) check3=$(grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' | wc -l) if [ $check3 -eq 0 ] then echo "$i layer3 check" exit fi sed -n "$j"p tmp >> 001.gff ## 转录本ID写入文件 grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' >> 001.gff ## 外显子写入文件 grep "Parent=$str2;" result.gff | awk -F "\t" '$3 == "exon"' | awk -v a=$i -v b=$str2 'BEGIN{sum = 0} {OFS = "\t"; sum += ($5 - $4 + 1)} END {print a, b, sum}' >> 003.txt done ## 统计每一个转录本的长度并写入文件 rm -f tmp echo $num done done
。
标签:shell,grep,##,sum,awk,result,linux,gff From: https://www.cnblogs.com/liujiaxin2018/p/17521572.html