comparison tools/protein_analysis/tmhmm2.py @ 0:bca9bc7fdaef

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:03:34 -0400
parents
children 3ff1dcbb9440
comparison
equal deleted inserted replaced
-1:000000000000 0:bca9bc7fdaef
1 #!/usr/bin/env python
2 """Wrapper for TMHMM v2.0 for use in Galaxy.
3
4 This script takes exactly two command line arguments - an input protein FASTA
5 filename and an output tabular filename. It then calls the standalone TMHMM
6 v2.0 program (not the webservice) requesting the short output (one line per
7 protein).
8
9 First major feature is cleaning up the tabular output. The raw output from
10 TMHMM v2.0 looks like this (six columns tab separated):
11
12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o
13 gi|4959044|gb|AAD34209.1|AF069992_1 len=600 ExpAA=0.00 First60=0.00 PredHel=0 Topology=o
14 gi|671626|emb|CAA85685.1| len=473 ExpAA=0.19 First60=0.00 PredHel=0 Topology=o
15 gi|3298468|dbj|BAA31520.1| len=107 ExpAA=59.37 First60=31.17 PredHel=3 Topology=o23-45i52-74o89-106i
16
17 In order to make it easier to use in Galaxy, this wrapper script simplifies
18 this to remove the redundant tags, and instead adds a comment line at the
19 top with the column names:
20
21 #ID len ExpAA First60 PredHel Topology
22 gi|2781234|pdb|1JLY|B 304 0.01 60 0.00 0 o
23 gi|4959044|gb|AAD34209.1|AF069992_1 600 0.00 0 0.00 0 o
24 gi|671626|emb|CAA85685.1| 473 0.19 0.00 0 o
25 gi|3298468|dbj|BAA31520.1| 107 59.37 31.17 3 o23-45i52-74o89-106i
26
27 The second major potential feature is taking advantage of multiple cores
28 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file
29 into chunks and running multiple copies of TMHMM in parallel. I would normally
30 use Python's multiprocessing library in this situation but it requires at
31 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
32 """
33 import sys
34 import os
35 from seq_analysis_utils import stop_err, split_fasta, run_jobs
36
37 FASTA_CHUNK = 500
38
39 if len(sys.argv) != 4:
40 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
41 try:
42 num_threads = int(sys.argv[1])
43 except:
44 num_threads = 0
45 if num_threads < 1:
46 stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
47 fasta_file = sys.argv[2]
48 tabular_file = sys.argv[3]
49
50 def clean_tabular(raw_handle, out_handle):
51 """Clean up tabular TMHMM output."""
52 for line in raw_handle:
53 if not line:
54 continue
55 parts = line.rstrip("\r\n").split("\t")
56 try:
57 identifier, length, expAA, first60, predhel, topology = parts
58 except:
59 assert len(parts)!=6
60 stop_err("Bad line: %r" % line)
61 assert length.startswith("len="), line
62 length = length[4:]
63 assert expAA.startswith("ExpAA="), line
64 expAA = expAA[6:]
65 assert first60.startswith("First60="), line
66 first60 = first60[8:]
67 assert predhel.startswith("PredHel="), line
68 predhel = predhel[8:]
69 assert topology.startswith("Topology="), line
70 topology = topology[9:]
71 out_handle.write("%s\t%s\t%s\t%s\t%s\t%s\n" \
72 % (identifier, length, expAA, first60, predhel, topology))
73
74 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK)
75 temp_files = [f+".out" for f in fasta_files]
76 jobs = ["tmhmm %s > %s" % (fasta, temp)
77 for fasta, temp in zip(fasta_files, temp_files)]
78
79 def clean_up(file_list):
80 for f in file_list:
81 if os.path.isfile(f):
82 os.remove(f)
83
84 if len(jobs) > 1 and num_threads > 1:
85 #A small "info" message for Galaxy to show the user.
86 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
87 results = run_jobs(jobs, num_threads)
88 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
89 error_level = results[cmd]
90 if error_level:
91 try:
92 output = open(temp).readline()
93 except IOError:
94 output = ""
95 clean_up(fasta_files)
96 clean_up(temp_files)
97 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
98 error_level)
99 del results
100 del jobs
101
102 out_handle = open(tabular_file, "w")
103 out_handle.write("#ID\tlen\tExpAA\tFirst60\tPredHel\tTopology\n")
104 for temp in temp_files:
105 data_handle = open(temp)
106 clean_tabular(data_handle, out_handle)
107 data_handle.close()
108 out_handle.close()
109
110 clean_up(fasta_files)
111 clean_up(temp_files)