1. 第一种数据格式为protein.fa(translated.fa) 和 gene.gtf文件,序列信息如下
点击查看代码
def readfasta(filename):
fa = open(filename, 'r')
res = {}
ID = ''
for line in fa:
if line.startswith('>'):
ID = line#.strip('\n')
res[ID] = ''
else:
res[ID] += line#.strip('\n')
return res
res = readfasta('/PERSONALBIO/work/singlecell/s01/ref/Drosophila/GCF_000001215.4_Release_6_plus_ISO1_MT_translated_cds.faa')
uniq = {}
for k,v in res.items():
gene = k.split(' ')[1]
title = gene[6:-1]+'\n'
v = [v]
if title not in uniq: # 注意这种生成双层字典的方法!
uniq[title] = v
else:
uniq[title] += v
max_seq = {}
for k,v in uniq.items():
seq = max(v, key = len)
max_seq[k] = seq
w = open('longest.txt',"w")
for k,v in max_seq.items():
w.write('>' +k)
w.write(v)
w.close()
这种运行完后,protein.fa文件中蛋白名直接替换为了基因名,在下游的分析中就不需要替换基因名了
第二种是gtf文件相对注释不全的数据,但是同一基因不同长度的cds序列id名以。1 .2 .3 区分
点击查看代码
#!usr/bin/env python
# -*- coding:utf-8 -*-
"""
@FileName: get_longest
@Time: 2022/3/25,19:12
@Motto: go go go
"""
import argparse
from Bio import SeqIO
def read_file(file):
t = {} # 记录长度和序列名字
result = {} #这个字典用于储存最长转录本 、最长cds、最长protein
for seq_record in SeqIO.parse(file, "fasta"):
id = seq_record.id.rsplit(".", 1)[0]
if id not in t:
result[seq_record.id] = str(seq_record.seq)
t[id] = [len(seq_record.seq), seq_record.id]
else:
if t[id][0] >= len(seq_record.seq):
continue
else:
result.pop(t[id][1])
result[seq_record.id] = str(seq_record.seq)
t[id] = [len(seq_record.seq), seq_record.id]
return result
def write(filename, res):
with open(filename,'w') as f:
for i, j in res.items():
f.write(">" + i + "\n")
f.write(j + "\n")
def main():
parser = argparse.ArgumentParser(usage='********', description='得到最长结果')
parser.add_argument("-i", "--input", help="input filename")
parser.add_argument("-o", "--output", help="output filename")
args = parser.parse_args()
res_dict = read_file(args.input)
write(args.output, res_dict)
if __name__ == '__main__':
#res_dict = read_file(r'./cds.fa')
#write(r'out_cds.fa', res_dict)
main()