Mercurial > repos > iuc > exonerate
diff exonerategff_to_gff3.py @ 3:a03dead1bede draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/exonerate commit a141c63903d1a598569985e400125d4e7de42801"
author | iuc |
---|---|
date | Sun, 01 Mar 2020 04:48:34 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/exonerategff_to_gff3.py Sun Mar 01 04:48:34 2020 -0500 @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 + +""" +Converts a GFF produced by exonerate into a more standard GFF3 (e.g. usable in JBrowse) +""" + +import argparse +import sys + +from BCBio import GFF +from Bio.SeqFeature import FeatureLocation, SeqFeature + +parser = argparse.ArgumentParser() +parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin) +parser.add_argument('outfile', nargs='?', type=argparse.FileType('a'), default=sys.stdout) +args = parser.parse_args() + + +scaffs = [] +gene_number = 0 +for scaff in GFF.parse(args.infile): + scaff.annotations = {} + scaff.seq = "" + kept_features = [] + current_gene = None + exon_number = 0 + last_utr = None + + for feature in scaff.features: + + if feature.type == "gene": + gene_number += 1 + mrna_feature = SeqFeature(FeatureLocation(feature.location.start, feature.location.end), type="mRNA", strand=feature.location.strand) + mrna_feature.sub_features = [] + mrna_feature.qualifiers['source'] = feature.qualifiers['source'] + mrna_id = "mRNA_" + str(gene_number) + mrna_feature.qualifiers['ID'] = mrna_id + feature.sub_features = [mrna_feature] + feature.qualifiers['ID'] = "gene_" + str(gene_number) + if 'gene_orientation' in feature.qualifiers: + del feature.qualifiers['gene_orientation'] + if current_gene: + kept_features.append(current_gene) + + current_gene = feature + exon_number = 0 + last_utr = None + + elif feature.type == 'utr5': + feature.type = 'five_prime_UTR' + feature.qualifiers['ID'] = '%s_five_prime_UTR' % (mrna_id) + mrna_feature.sub_features.append(feature) + last_utr = {'start': feature.location.start, 'end': feature.location.end} + + elif feature.type == 'utr3': + feature.type = 'three_prime_UTR' + feature.qualifiers['ID'] = '%s_three_prime_UTR' % (mrna_id) + mrna_feature.sub_features.append(feature) + last_utr = {'start': feature.location.start, 'end': feature.location.end} + + elif feature.type == 'exon': + exon_number += 1 + feature.qualifiers['ID'] = '%s_exon_%s' % (mrna_id, exon_number) + mrna_feature.sub_features.append(feature) + + if last_utr is None: + cds_feature = SeqFeature(FeatureLocation(feature.location.start, feature.location.end), type="CDS", strand=feature.location.strand) + cds_feature.sub_features = [] + cds_feature.qualifiers['source'] = feature.qualifiers['source'] + cds_feature.qualifiers['ID'] = mrna_id + "_CDS" + mrna_feature.sub_features.append(cds_feature) + elif feature.location.start != last_utr['start'] or feature.location.end != last_utr['end']: + if feature.location.start > last_utr['start']: + cds_feature = SeqFeature(FeatureLocation(feature.location.start, last_utr['start']), type="CDS", strand=feature.location.strand) + cds_feature.sub_features = [] + cds_feature.qualifiers['source'] = feature.qualifiers['source'] + cds_feature.qualifiers['ID'] = mrna_id + "_CDS" + mrna_feature.sub_features.append(cds_feature) + if feature.location.end < last_utr['end']: + cds_feature = SeqFeature(FeatureLocation(feature.location.end, last_utr['end']), type="CDS", strand=feature.location.strand) + cds_feature.sub_features = [] + cds_feature.qualifiers['source'] = feature.qualifiers['source'] + cds_feature.qualifiers['ID'] = mrna_id + "_CDS" + mrna_feature.sub_features.append(cds_feature) + + last_utr = None + + elif feature.type == 'similarity': + if current_gene is None: + # We haven't seen any gene, just convert similarity to match + feature.type = 'match' + kept_features.append(feature) + + last_utr = None + + elif feature.type not in ['splice3', 'splice5', 'similarity', 'intron']: + mrna_feature.sub_features.append(feature) + last_utr = None + + # For the last one + if current_gene: + kept_features.append(current_gene) + + scaff.features = kept_features + + if len(kept_features): + GFF.write([scaff], args.outfile)