Python中计算DNA序列的GC含量

发布时间: 2023-08-28 21:49:54  作者: 小鲨鱼2018

 

001、对G、C计数进行统计

[root@pc1 test01]# ls
a.fa  test.py
[root@pc1 test01]# cat a.fa          ## 测试DNA序列
>Rosalind_6404
CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
TCCCACTAATAATTCTGAGG
>Rosalind_5959
CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
ATATCCATTTGTCAGCAGACACGC
>Rosalind_0808
CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
TGGGAACCTGCGGGCAGTAGGTGGAAT
[root@pc1 test01]# cat test.py      ## 统计程序
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Report the GC content of every record in a FASTA file.

For each record the script prints the header line followed by its GC
percentage with two decimals.  It then prints, for every record whose GC
fraction equals the maximum, the header with ">" removed and the
percentage with six decimals.

Usage: python3 test.py [FASTA_FILE]    (FASTA_FILE defaults to "a.fa")
"""

import sys


def read_fasta(path):
    """Parse a FASTA file into a dict mapping header line -> sequence.

    Args:
        path: path of the FASTA file to read.

    Returns:
        A dict keyed by the full header line (including the leading ">"),
        in file order, whose values are the concatenated sequence lines.
        A header that occurs twice keeps only the lines after its last
        occurrence.

    Raises:
        ValueError: if sequence data appears before the first header.
        OSError: if the file cannot be opened.
    """
    chunks = {}
    header = None
    # "with" guarantees the handle is closed even if parsing fails.
    with open(path, "r") as handle:
        for raw in handle:
            line = raw.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue
            if line.startswith(">"):
                header = line
                chunks[header] = []
            elif header is None:
                raise ValueError(
                    "sequence data found before the first FASTA header: %r" % line
                )
            else:
                chunks[header].append(line)
    # Join once per record instead of growing a string with "+=".
    return dict((name, "".join(parts)) for name, parts in chunks.items())


def gc_fraction(seq):
    """Return the fraction (0.0-1.0) of bases in seq that are G or C.

    Lowercase (soft-masked) bases are counted too.  An empty sequence
    yields 0.0 instead of raising ZeroDivisionError.
    """
    if not seq:
        return 0.0
    bases = seq.upper()
    return (bases.count("G") + bases.count("C")) / len(bases)


def main(path="a.fa"):
    """Print the per-record GC percentages and the record(s) with the highest."""
    fractions = dict(
        (name, gc_fraction(seq)) for name, seq in read_fasta(path).items()
    )

    for name, fraction in fractions.items():
        print("%s %.2f" % (name, fraction * 100))

    if not fractions:
        # max() of an empty collection raises ValueError; nothing to report.
        return

    best = max(fractions.values())
    # Every record tied for the maximum is reported, in file order.
    for name, fraction in fractions.items():
        if fraction == best:
            print(name.replace(">", ""))
            print("%.6f" % (fraction * 100))


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else "a.fa")
[root@pc1 test01]# python3 test.py       ## 输出结果
>Rosalind_6404 53.75
>Rosalind_5959 53.57
>Rosalind_0808 60.92
Rosalind_0808
60.919540