diff tools/protein_analysis/tmhmm2.py @ 0:bca9bc7fdaef

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:03:34 -0400
parents
children 3ff1dcbb9440
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/tmhmm2.py	Tue Jun 07 18:03:34 2011 -0400
@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+"""Wrapper for TMHMM v2.0 for use in Galaxy.
+
+This script takes exactly two command line arguments - an input protein FASTA
+filename and an output tabular filename. It then calls the standalone TMHMM
+v2.0 program (not the webservice) requesting the short output (one line per
+protein).
+
+First major feature is cleaning up the tabular output. The raw output from
+TMHMM v2.0 looks like this (six columns tab separated):
+
+ gi|2781234|pdb|1JLY|B	len=304 ExpAA=0.01	First60=0.00	PredHel=0	Topology=o
+ gi|4959044|gb|AAD34209.1|AF069992_1	len=600	ExpAA=0.00	First60=0.00	PredHel=0	Topology=o
+ gi|671626|emb|CAA85685.1|	len=473 ExpAA=0.19	First60=0.00 PredHel=0	Topology=o
+ gi|3298468|dbj|BAA31520.1|	len=107	ExpAA=59.37	First60=31.17	PredHel=3	Topology=o23-45i52-74o89-106i
+
+In order to make it easier to use in Galaxy, this wrapper script simplifies
+this to remove the redundant tags, and instead adds a comment line at the
+top with the column names:
+
+ #ID	len	ExpAA	First60	PredHel	Topology 
+ gi|2781234|pdb|1JLY|B	304	0.01	60	0.00	0	o
+ gi|4959044|gb|AAD34209.1|AF069992_1	600	0.00	0	0.00	0	o
+ gi|671626|emb|CAA85685.1|	473	0.19	0.00	0	o
+ gi|3298468|dbj|BAA31520.1|	107	59.37	31.17	3	o23-45i52-74o89-106i
+
+The second major potential feature is taking advantage of multiple cores
+(since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file
+into chunks and running multiple copies of TMHMM in parallel. I would normally
+use Python's multiprocessing library in this situation but it requires at
+least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
+"""
+import sys
+import os
+from seq_analysis_utils import stop_err, split_fasta, run_jobs
+
+FASTA_CHUNK = 500
+
+if len(sys.argv) != 4:
+   stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
+try:
+   num_threads = int(sys.argv[1])
+except:
+   num_threads = 0
+if num_threads < 1:
+   stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
+fasta_file = sys.argv[2]
+tabular_file = sys.argv[3]
+
+def clean_tabular(raw_handle, out_handle):
+    """Clean up tabular TMHMM output."""
+    for line in raw_handle:
+        if not line:
+            continue
+        parts = line.rstrip("\r\n").split("\t")
+        try:
+            identifier, length, expAA, first60, predhel, topology = parts
+        except:
+            assert len(parts)!=6
+            stop_err("Bad line: %r" % line)
+        assert length.startswith("len="), line
+        length = length[4:]
+        assert expAA.startswith("ExpAA="), line
+        expAA = expAA[6:]
+        assert first60.startswith("First60="), line
+        first60 = first60[8:]
+        assert predhel.startswith("PredHel="), line
+        predhel = predhel[8:]
+        assert topology.startswith("Topology="), line
+        topology = topology[9:]
+	out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \
+                   % (identifier, length, expAA, first60, predhel, topology))
+
+fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK)
+temp_files = [f+".out" for f in fasta_files]
+jobs = ["tmhmm %s > %s" % (fasta, temp)
+        for fasta, temp in zip(fasta_files, temp_files)]
+
+def clean_up(file_list):
+    for f in file_list:
+        if os.path.isfile(f):
+            os.remove(f)
+
+if len(jobs) > 1 and num_threads > 1:
+    #A small "info" message for Galaxy to show the user.
+    print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
+results = run_jobs(jobs, num_threads)
+for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
+    error_level = results[cmd]
+    if error_level:
+        try:
+            output = open(temp).readline()
+        except IOError:
+            output = ""
+        clean_up(fasta_files)
+        clean_up(temp_files)
+        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+                 error_level)
+del results
+del jobs
+
+out_handle = open(tabular_file, "w")
+out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n")
+for temp in temp_files:
+    data_handle = open(temp)
+    clean_tabular(data_handle, out_handle)
+    data_handle.close()
+out_handle.close()
+
+clean_up(fasta_files)
+clean_up(temp_files)