python 中根据序列ID从fasta文件中批量提取序列数据

发布时间: 2023-10-13 20:43:45  作者: 小鲨鱼2018

 

001、

[root@pc1 test1]# ls
a.fa  chr.list  test.py
[root@pc1 test1]# cat a.fa               ##  测试fasta文件
>chr1
tttcccggg
>chr2
tttggg
ccc
>chr3
cccttt
>chr4
aaaaattt
[root@pc1 test1]# cat chr.list            ## 序列ID
chr2
chr4

 

[root@pc1 test1]# cat test.py           ## 提取程序
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Extract selected records from a FASTA file by sequence ID.

Reads every record of ``a.fa``, then, for each ID listed (one per line) in
``chr.list``, prints the matching header followed by its sequence re-wrapped
to a fixed number of bases per line. IDs without a matching header are
silently skipped. Output order follows the order of the ID list.
"""

FASTA_PATH = "a.fa"        # input FASTA file
ID_LIST_PATH = "chr.list"  # one sequence ID per line, without the leading ">"
LINE_WIDTH = 5             # number of bases printed per output line


def read_fasta(path):
    """Return a dict mapping each full header line (including ">") to its sequence.

    Sequence lines belonging to one record are concatenated into a single
    string. Blank lines are ignored. If a header occurs more than once, the
    last occurrence wins.

    Raises:
        ValueError: if sequence data appears before the first header line.
    """
    parts = {}
    header = None
    with open(path, "r") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                # Blank lines carry no data; indexing line[0] here would fail.
                continue
            if line.startswith(">"):
                header = line
                parts[header] = []
            elif header is None:
                raise ValueError(
                    "sequence data found before the first FASTA header in %r" % path
                )
            else:
                parts[header].append(line)
    # Join once per record instead of growing a string line by line.
    return {name: "".join(chunks) for name, chunks in parts.items()}


def wrap_sequence(seq, width=LINE_WIDTH):
    """Split ``seq`` into consecutive pieces of at most ``width`` characters.

    An empty sequence yields an empty list.

    Raises:
        ValueError: if ``width`` is smaller than 1.
    """
    if width < 1:
        raise ValueError("width must be a positive integer")
    return [seq[start:start + width] for start in range(0, len(seq), width)]


def extract_lines(fasta_path=FASTA_PATH, list_path=ID_LIST_PATH, width=LINE_WIDTH):
    """Return the output lines for every listed ID that exists in the FASTA file."""
    records = read_fasta(fasta_path)
    output = []
    with open(list_path, "r") as handle:
        for raw in handle:
            seq_id = raw.strip()
            if not seq_id:
                # An empty line in the ID list names no sequence.
                continue
            header = ">" + seq_id
            if header in records:
                output.append(header)
                output.extend(wrap_sequence(records[header], width))
    return output


def main():
    """Print the extracted records to standard output."""
    for line in extract_lines():
        print(line)


if __name__ == "__main__":
    main()

 

[root@pc1 test1]# python3 test.py         ## 提取结果
>chr2
tttgg
gccc
>chr4
aaaaa
ttt