diff glimmerHMM/glimmerhmm_gff_to_sequence.py @ 0:c9699375fcf6 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/glimmer_hmm commit 0dc67759bcbdf5a8a285ded9ba751340d741fe63
author bgruening
date Wed, 13 Jul 2016 10:55:48 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/glimmerHMM/glimmerhmm_gff_to_sequence.py	Wed Jul 13 10:55:48 2016 -0400
@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+"""Convert GlimmerHMM GFF3 gene predictions into protein sequences.
+
+This works with the GlimmerHMM GFF3 output format:
+
+##gff-version 3
+##sequence-region Contig5.15 1 47390
+Contig5.15      GlimmerHMM      mRNA    323     325     .       +       .       ID=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1
+Contig5.15      GlimmerHMM      CDS     323     325     .       +       0       ID=Contig5.15.cds1.1;Parent=Contig5.15.path1.gene1;Name=Contig5.15.path1.gene1;Note=final-exon
+
+http://www.cbcb.umd.edu/software/GlimmerHMM/
+
+Modified version of the converter from Brad Chapman: https://github.com/chapmanb/bcbb/blob/master/biopython/glimmergff_to_proteins.py
+
+Usage:
+    glimmer_to_proteins.py <glimmer output> <ref fasta> <output file> <convert to protein ... False|True>
+"""
+import sys
+import os
+import operator
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+from BCBio import GFF
+
+def main(glimmer_file, ref_file, out_file, to_protein):
+    with open(ref_file) as in_handle:
+        ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))
+
+    base, ext = os.path.splitext(glimmer_file)
+
+    with open(out_file, "w") as out_handle:
+        SeqIO.write(protein_recs(glimmer_file, ref_recs, to_protein), out_handle, "fasta")
+
+def protein_recs(glimmer_file, ref_recs, to_protein):
+    """Generate protein records from GlimmerHMM gene predictions.
+    """
+    with open(glimmer_file) as in_handle:
+        for rec in glimmer_predictions(in_handle, ref_recs):
+            for feature in rec.features:
+                seq_exons = []
+                for cds in feature.sub_features:
+                    seq_exons.append(rec.seq[
+                        cds.location.nofuzzy_start:
+                        cds.location.nofuzzy_end])
+                gene_seq = reduce(operator.add, seq_exons)
+                if feature.strand == -1:
+                    gene_seq = gene_seq.reverse_complement()
+
+                if to_protein:
+                    yield SeqRecord(gene_seq.translate(), feature.qualifiers["ID"][0], "", "")
+                else:
+                    yield SeqRecord(gene_seq, feature.qualifiers["ID"][0], "", "")
+
+def glimmer_predictions(in_handle, ref_recs):
+    """Parse Glimmer output, generating SeqRecord and SeqFeatures for predictions
+    """
+    for rec in GFF.parse(in_handle, target_lines=1000, base_dict=ref_recs):
+        yield rec
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print __doc__
+        sys.exit()
+    main(*sys.argv[1:])