view mature_mir_gff_translation.py @ 4:da1aa7de2b19 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/mircounts commit ddaf9622722487d010001cd1f255107adf0c332d
author artbio
date Mon, 04 Sep 2017 17:55:01 -0400
parents 6b8adacd4750
children 9ea96a02c416
line wrap: on
line source

#!/usr/bin/env python

import argparse


def Parser():
    the_parser = argparse.ArgumentParser()
    the_parser.add_argument(
        '--input', action="store", type=str, help="input miRBase GFF3 file")
    the_parser.add_argument(
        '--output', action="store", type=str,
        help="output GFF3 file with converted mature mir coordinates")
    args = the_parser.parse_args()
    return args


GFF3_header = '''##gff-version 3
##generated by mature_mir_gff_translation.py
#
# Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
# microRNAs:               miRBase current_version
# genome-build-id:         check http://mirbase.org/
#
# Hairpin precursor sequences have type "miRNA_primary_transcript".
# Note, these sequences do not represent the full primary transcript,
# rather a predicted stem-loop portion that includes the precursor
# miRNA. Mature sequences have type "miRNA".
#
'''


def load_gff_in_dict(gff_input_file):
    '''
    Reads the gff3 file and return a dictionary of dictionaries
    with keys equal to standard gff3 fields (9)
    Note that the key of the primary dictionary is the ID
    '''
    gff_dict = {}
    for line in open(gff_input_file, "r"):
        if line[0] == "#":
            continue
        gff_fields = line[:-1].split("\t")
        ID = gff_fields[8].split("ID=")[1].split(";")[0]
        gff_dict[ID] = {}
        gff_dict[ID]["seqid"] = gff_fields[0]
        gff_dict[ID]["source"] = gff_fields[1]
        gff_dict[ID]["type"] = gff_fields[2]
        gff_dict[ID]["start"] = gff_fields[3]
        gff_dict[ID]["end"] = gff_fields[4]
        gff_dict[ID]["score"] = gff_fields[5]
        gff_dict[ID]["strand"] = gff_fields[6]
        gff_dict[ID]["phase"] = gff_fields[7]
        gff_dict[ID]["attributes"] = gff_fields[8]
        if "Derives_from" in gff_dict[ID]["attributes"]:
            parent_primary_transcript = gff_dict[ID]["attributes"].split(
                "Derives_from=")[1]
            parent_primary_transcript = gff_dict[parent_primary_transcript][
                                            "attributes"].split("Name=")[1]
            gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
                gff_dict[ID]["attributes"], parent_primary_transcript)
    return gff_dict


def genome_to_mir_gff(gff_dict, output):
    '''
    Converts seqid field from chromosome to item Name
    Then converts coordinates relative to "miRNA_primary_transcript"
    Note that GFF files are 1-based coordinates
    '''
    for key in gff_dict:
        name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
        gff_dict[key]["seqid"] = name
        if "Derives_from=" in gff_dict[key]["attributes"]:
            parent_ID = gff_dict[key]["attributes"].split(
                                        "Derives_from=")[1].split(";")[0]
            gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int(
                                       gff_dict[parent_ID]["start"])+1)
            gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
                                     gff_dict[parent_ID]["start"])+1)
    hairpins = {}
    matures = {}
    # treats miRNA_primary_transcript coordinates
    # in a second loop to avoid errors in conversion
    for key in gff_dict:
        if gff_dict[key]["type"] == "miRNA_primary_transcript":
            gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
                                       gff_dict[key]["start"]) + 1)
            gff_dict[key]["start"] = '1'
            # now, do a dict[ID]=Name but only for miRNA_primary_transcript
            hairpins[key] = gff_dict[key]["attributes"].split(
                                                        "Name=")[1].split(
                                                        ";")[0]
        else:
            matures[key] = gff_dict[key]["attributes"].split(
                                                        "Name=")[1].split(
                                                         ";")[0]
    with open(output, "w") as output:
        output.write(GFF3_header)
        for ID in sorted(hairpins, key=hairpins.get):
            output.write("\t".join([gff_dict[ID]["seqid"],
                         gff_dict[ID]["source"], gff_dict[ID]["type"],
                         gff_dict[ID]["start"], gff_dict[ID]["end"],
                         gff_dict[ID]["score"], gff_dict[ID]["strand"],
                         gff_dict[ID]["phase"], gff_dict[ID]["attributes"]]))
            output.write("\n")
            for id in sorted(matures, key=matures.get, reverse=True):
                if ID in gff_dict[id]["attributes"]:
                    output.write("\t".join([gff_dict[id]["seqid"],
                                 gff_dict[id]["source"], gff_dict[id]["type"],
                                 gff_dict[id]["start"], gff_dict[id]["end"],
                                 gff_dict[id]["score"],
                                 gff_dict[id]["strand"],
                                 gff_dict[id]["phase"],
                                 gff_dict[id]["attributes"]]))
                    output.write("\n")


def main(infile, outfile):
    gff_dict = load_gff_in_dict(infile)
    genome_to_mir_gff(gff_dict, outfile)


if __name__ == "__main__":
    args = Parser()
    main(args.input, args.output)