点击查看代码
#!/usr/bin/env python
# -*- coding=utf-8 -*-
'''
提取序列文件中最长的转录本ID
需要修改######位置的参数 以及 open的目录
'''
import sys
import re
Fasta=open("/Business/psn_company/sc01/Workbase_Wangcanbiao/reference/MF_new/pep.fa","r")
Sequence={}
## 1.创建ID和序列的字典
for line in Fasta.readlines():
content=line.strip()
if content.startswith(">"):
nameS=content[1:]
name=re.sub(" .*$","",nameS) ######修改一下序列的header
Sequence[name]=''
else:
Sequence[name]+=content
Fasta.close()
#print(Sequence.keys())
## 2.提取每个转录本的序列长度,形成三列
Out=open("./gene_id_len.txt","w")
for i in Sequence.keys():
#print(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n")
Out.write(i+"\t"+str(len(Sequence[i]))+"\t"+i.split(".")[0]+"\n") #第一列是转录本ID,第二列是转录本长度,第三列是gene ID
Out.close()
## 3.这一步准备对out文件进行排序,找到每个基因最长的转录本
from operator import itemgetter
gene_id_len=open("./gene_id_len.txt","r")
table = []
for line in gene_id_len:
col = line.strip().split("\t")
col[0] = str(col[0])
col[1] = int(col[1])
col[2] = str(col[2])
table.append(col) #生成嵌套式列表
#print(table)
table_sorted = sorted(table, key=itemgetter(2, 1),reverse=True) #先后按列索引2,1排序
#print(table_sorted)
output_file = open("./gene_id_len_sorted.txt","w")
for row in table_sorted: #遍历读取排序后的嵌套列表
row = [str(x) for x in row] #列表要转换为字符串才可以写入文本
output_file.write("\t".join(row) + '\n')
output_file.close()
## 4.找到最长转录本
input_file=open("./gene_id_len_sorted.txt","r")
dict2={}
for line in input_file.readlines():
col = line.strip().split("\t")
col[0] = str(col[0])
col[1] = int(col[1])
col[2] = str(col[2])
if col[2] not in dict2:
dict2[col[2]]=col[0]
else:
dict2[col[2]]+= '\t'+col[0]
#print(dict2)
#print(dict2.values())
list_values=list(dict2.values())
#print(list_values)
result_file = open("./longest_pep_id.txt","w")
for line in list_values:
col = line.strip().split("\t")
col[0] = str(col[0])
#print(col[0])
result_file.write(col[0]+"\n")
result_file.close()
zcat hairpin.fa.gz | seqkit grep -f list > new.fa
标签:提取,file,len,col,转录,str,print,line,最长 From: https://www.cnblogs.com/-bonjour/p/16739048.html