Mercurial > repos > earlhaminst > treebest_best

--- a/fasta_header_converter.py	Wed Feb 22 05:48:02 2017 -0500
+++ b/fasta_header_converter.py	Fri Mar 03 07:22:53 2017 -0500
@@ -1,7 +1,28 @@
 from __future__ import print_function

+import collections
 import json
 import optparse
+import sys
+
+Sequence = collections.namedtuple('Sequence', ['header', 'sequence'])
+
+
+def FASTAReader_gen(fasta_filename):
+    with open(fasta_filename) as fasta_file:
+        line = fasta_file.readline()
+        while True:
+            if not line:
+                return
+            assert line.startswith('>'), "FASTA headers must start with >"
+            header = line.rstrip()
+            sequence_parts = []
+            line = fasta_file.readline()
+            while line and line[0] != '>':
+                sequence_parts.append(line.rstrip())
+                line = fasta_file.readline()
+            sequence = "\n".join(sequence_parts)
+            yield Sequence(header, sequence)


 def read_gene_info(gene_info):
@@ -17,23 +38,25 @@
                   help='Gene feature information in JSON format')
 parser.add_option('-f', '--fasta', dest="input_fasta_filename",
                   help='Sequences in FASTA format')
+parser.add_option('-o', '--output', dest="output_fasta_filename",
+                  help='Output FASTA file name')
 options, args = parser.parse_args()

 if options.input_gene_filename is None:
     raise Exception('-j option must be specified')
-
 if options.input_fasta_filename is None:
     raise Exception('-f option must be specified')
+if options.output_fasta_filename is None:
+    raise Exception('-o option must be specified')

 with open(options.input_gene_filename) as json_fh:
     gene_info = json.load(json_fh)
 transcript_species_dict = read_gene_info(gene_info)

-with open(options.input_fasta_filename) as fasta_fh:
-    for line in fasta_fh:
-        line = line.rstrip()
-        if line.startswith(">"):
-            name = line[1:].lstrip()
-            print(">" + name + "_" + transcript_species_dict[name])
-        else:
-            print(line)
+with open(options.output_fasta_filename, 'w') as output_fasta_file:
+    for entry in FASTAReader_gen(options.input_fasta_filename):
+        name = entry.header[1:].lstrip()
+        if name not in transcript_species_dict:
+            print("Transcript '%s' not found in the gene feature information" % name, file=sys.stderr)
+            continue
+        output_fasta_file.write(">%s_%s\n%s\n" % (name, transcript_species_dict[name], entry.sequence))
--- a/fasta_header_converter.xml	Wed Feb 22 05:48:02 2017 -0500
+++ b/fasta_header_converter.xml	Fri Mar 03 07:22:53 2017 -0500
@@ -1,11 +1,11 @@
 <tool id="fasta_header_converter" name="FASTA header converter" version="0.1.1">
     <description>to append species information</description>
-    <command>
+    <command detect_errors="exit_code">
 <![CDATA[
 python '$__tool_directory__/fasta_header_converter.py'
 -f '$fastaFile'
 -j '$genesFile'
-> '$outputFile'
+-o '$outputFile'
 ]]>
     </command>
     <inputs>