diff exonerategff_to_gff3.py @ 3:a03dead1bede draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/exonerate commit a141c63903d1a598569985e400125d4e7de42801"
author iuc
date Sun, 01 Mar 2020 04:48:34 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/exonerategff_to_gff3.py	Sun Mar 01 04:48:34 2020 -0500
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+"""
+Converts a GFF produced by exonerate into a more standard GFF3 (e.g. usable in JBrowse)
+"""
+
+import argparse
+import sys
+
+from BCBio import GFF
+from Bio.SeqFeature import FeatureLocation, SeqFeature
+
+parser = argparse.ArgumentParser()
+parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
+parser.add_argument('outfile', nargs='?', type=argparse.FileType('a'), default=sys.stdout)
+args = parser.parse_args()
+
+
+scaffs = []
+gene_number = 0
+for scaff in GFF.parse(args.infile):
+    scaff.annotations = {}
+    scaff.seq = ""
+    kept_features = []
+    current_gene = None
+    exon_number = 0
+    last_utr = None
+
+    for feature in scaff.features:
+
+        if feature.type == "gene":
+            gene_number += 1
+            mrna_feature = SeqFeature(FeatureLocation(feature.location.start, feature.location.end), type="mRNA", strand=feature.location.strand)
+            mrna_feature.sub_features = []
+            mrna_feature.qualifiers['source'] = feature.qualifiers['source']
+            mrna_id = "mRNA_" + str(gene_number)
+            mrna_feature.qualifiers['ID'] = mrna_id
+            feature.sub_features = [mrna_feature]
+            feature.qualifiers['ID'] = "gene_" + str(gene_number)
+            if 'gene_orientation' in feature.qualifiers:
+                del feature.qualifiers['gene_orientation']
+            if current_gene:
+                kept_features.append(current_gene)
+
+            current_gene = feature
+            exon_number = 0
+            last_utr = None
+
+        elif feature.type == 'utr5':
+            feature.type = 'five_prime_UTR'
+            feature.qualifiers['ID'] = '%s_five_prime_UTR' % (mrna_id)
+            mrna_feature.sub_features.append(feature)
+            last_utr = {'start': feature.location.start, 'end': feature.location.end}
+
+        elif feature.type == 'utr3':
+            feature.type = 'three_prime_UTR'
+            feature.qualifiers['ID'] = '%s_three_prime_UTR' % (mrna_id)
+            mrna_feature.sub_features.append(feature)
+            last_utr = {'start': feature.location.start, 'end': feature.location.end}
+
+        elif feature.type == 'exon':
+            exon_number += 1
+            feature.qualifiers['ID'] = '%s_exon_%s' % (mrna_id, exon_number)
+            mrna_feature.sub_features.append(feature)
+
+            if last_utr is None:
+                cds_feature = SeqFeature(FeatureLocation(feature.location.start, feature.location.end), type="CDS", strand=feature.location.strand)
+                cds_feature.sub_features = []
+                cds_feature.qualifiers['source'] = feature.qualifiers['source']
+                cds_feature.qualifiers['ID'] = mrna_id + "_CDS"
+                mrna_feature.sub_features.append(cds_feature)
+            elif feature.location.start != last_utr['start'] or feature.location.end != last_utr['end']:
+                if feature.location.start > last_utr['start']:
+                    cds_feature = SeqFeature(FeatureLocation(feature.location.start, last_utr['start']), type="CDS", strand=feature.location.strand)
+                    cds_feature.sub_features = []
+                    cds_feature.qualifiers['source'] = feature.qualifiers['source']
+                    cds_feature.qualifiers['ID'] = mrna_id + "_CDS"
+                    mrna_feature.sub_features.append(cds_feature)
+                if feature.location.end < last_utr['end']:
+                    cds_feature = SeqFeature(FeatureLocation(feature.location.end, last_utr['end']), type="CDS", strand=feature.location.strand)
+                    cds_feature.sub_features = []
+                    cds_feature.qualifiers['source'] = feature.qualifiers['source']
+                    cds_feature.qualifiers['ID'] = mrna_id + "_CDS"
+                    mrna_feature.sub_features.append(cds_feature)
+
+            last_utr = None
+
+        elif feature.type == 'similarity':
+            if current_gene is None:
+                # We haven't seen any gene, just convert similarity to match
+                feature.type = 'match'
+                kept_features.append(feature)
+
+            last_utr = None
+
+        elif feature.type not in ['splice3', 'splice5', 'similarity', 'intron']:
+            mrna_feature.sub_features.append(feature)
+            last_utr = None
+
+    # For the last one
+    if current_gene:
+        kept_features.append(current_gene)
+
+    scaff.features = kept_features
+
+    if len(kept_features):
+        GFF.write([scaff], args.outfile)