基因组坐标到转录本坐标转换——单碱基模式

发布时间 2023-04-20 15:26:08作者: bio调参侠

今天分享一个自己写的python小脚本可以实现单碱基的基因组位置转换到转录本的坐标,欢迎大家使用,并提出错误

#!/share/home/hujun/miniconda3/bin/python3

import pybedtools
from pybedtools import BedTool
import pandas as pd
import argparse


def intersect_region(genepred_file, genome_site_file):
    genepred = pd.read_csv(genepred_file, sep="\t", header=None)
    genepred = genepred.iloc[:, [1, 3, 4, 0, 7, 2, 5, 6, 8, 9]]
    genepred = BedTool.from_dataframe(genepred)

    genome_site = pybedtools.BedTool(genome_site_file)

    intersect = genepred.intersect(genome_site, wa=True, wb=True, s=True)
    return intersect


def transform_position(intersect):
    output = []
    for i in intersect:
        site = int(i[12])
        exon_start = i[8].strip(",").split(",")
        exon_end = i[9].strip(",").split(",")

        exon_number = i[4]
        id = i[3]
        strand = i[5]

        if strand == "+":
            length = 0
            for j in range(len(exon_start) - 1):
                length = length + int(exon_end[j]) - int(exon_start[j])
                if site >= int(exon_start[j]) and site <= int(exon_end[j]):
                    position = length - (int(exon_end[j]) - site)
                    tras_p = [id, position]
                    output.append(tras_p)

        else:
            length = 0
            for j in range(len(exon_start) - 1, 0, -1):
                length = length + int(exon_end[j]) - int(exon_start[j])
                if site >= int(exon_start[j]) and site <= int(exon_end[j]):
                    position = length - (site - int(exon_start[j])) + 1
                    tras_p = [id, position]
                    output.append(tras_p)

    return pd.DataFrame(output)


def id_change():
    intersect = intersect_region(genepred_file=genepred,
                                 genome_site_file=bed)
    out = transform_position(intersect)
    trascript_index = pd.read_csv(index, sep="\t", engine="python", header=None)
    new_col = trascript_index.iloc[:, 0].str.split("|", expand=True)
    trascript_index["id"] = new_col[0]
    a = pd.merge(out, trascript_index, how="left", left_on=0, right_on="id")
    outdf = a.iloc[:, [3, 2, 0]]
    outdf.to_csv(output, sep="\t", index=False)
if __name__=="__main__":
    parser = argparse.ArgumentParser(
        description='This script aim to transform a single genome position to the transcript position', add_help=True)
    parser.add_argument('-g', '--genepred', type=str,required=True,
                        help='The genepred format file which cat transform from the gtf file')
    parser.add_argument('-b', '--bed', type=str,required=True,
                        help='The bed format file for a single position of the genome ,such as SNP,'
                             'and must have six columns', )
    parser.add_argument('-d', '--index', type=str,required=True,
                        help='The transcript fasta index file which you can generate with the '
                             'samtools')
    parser.add_argument('-o', '--output', type=str, required=True,help='The output filename')
    args = parser.parse_args()
    if not any(vars(args).values()):
        parser.print_help()
    else:
        genepred = args.genepred
        bed = args.bed
        index = args.index
        output = args.output
        id_change()

最终输出的文件总共有三列,第一列为转录本fasta文件中的名字,第二列为1 base 的转录本坐标,第三列为转录本id,之所以设置成这个样子是为了方便和一些比对到转录本中的数据进行比较