comparison tools/protein_analysis/tmhmm2.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author peterjc
date Mon, 30 Jul 2012 10:25:07 -0400
parents 6901298ac16c
children e52220a9ddad
comparison
equal deleted inserted replaced
6:a290c6d4e658 7:9b45a8743100
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Wrapper for TMHMM v2.0 for use in Galaxy. 2 """Wrapper for TMHMM v2.0 for use in Galaxy.
3 3
4 This script takes exactly two command line arguments - an input protein FASTA 4 This script takes exactly three command line arguments - number of threads,
5 filename and an output tabular filename. It then calls the standalone TMHMM 5 an input protein FASTA filename, and an output tabular filename. It then
6 v2.0 program (not the webservice) requesting the short output (one line per 6 calls the standalone TMHMM v2.0 program (not the webservice) requesting
7 protein). 7 the short output (one line per protein).
8 8
9 The first major feature is cleaning up the tabular output. The short form raw 9 The first major feature is cleaning up the tabular output. The short form raw
10 output from TMHMM v2.0 looks like this (six columns tab separated): 10 output from TMHMM v2.0 looks like this (six columns tab separated):
11 11
12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o 12 gi|2781234|pdb|1JLY|B len=304 ExpAA=0.01 First60=0.00 PredHel=0 Topology=o
31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file 31 (since TMHMM v2.0 itself is single threaded) by dividing the input FASTA file
32 into chunks and running multiple copies of TMHMM in parallel. I would normally 32 into chunks and running multiple copies of TMHMM in parallel. I would normally
33 use Python's multiprocessing library in this situation but it requires at 33 use Python's multiprocessing library in this situation but it requires at
34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. 34 least Python 2.6 and at the time of writing Galaxy still supports Python 2.4.
35 35
36 Note that this is somewhat redundant with job-splitting available in Galaxy
37 itself (see the SignalP XML file for settings).
38
36 Also tmhmm2 can fail without returning an error code, for example if run on a 39 Also tmhmm2 can fail without returning an error code, for example if run on a
37 64 bit machine with only the 32 bit binaries installed. This script will spot 40 64 bit machine with only the 32 bit binaries installed. This script will spot
38 when there is no output from tmhmm2, and raise an error. 41 when there is no output from tmhmm2, and raise an error.
39 """ 42 """
40 import sys 43 import sys
41 import os 44 import os
45 import tempfile
42 from seq_analysis_utils import stop_err, split_fasta, run_jobs 46 from seq_analysis_utils import stop_err, split_fasta, run_jobs
43 47
44 FASTA_CHUNK = 500 48 FASTA_CHUNK = 500
45 49
46 if len(sys.argv) != 4: 50 if len(sys.argv) != 4:
47 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file") 51 stop_err("Require three arguments, number of threads (int), input protein FASTA file & output tabular file")
48 try: 52 try:
49 num_threads = int(sys.argv[1]) 53 num_threads = int(sys.argv[1])
50 except: 54 except:
51 num_threads = 0 55 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
52 if num_threads < 1: 56 if num_threads < 1:
53 stop_err("Threads argument %s is not a positive integer" % sys.argv[1]) 57 stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
54 fasta_file = sys.argv[2] 58 fasta_file = sys.argv[2]
55 tabular_file = sys.argv[3] 59 tabular_file = sys.argv[3]
60
61 tmp_dir = tempfile.mkdtemp()
56 62
57 def clean_tabular(raw_handle, out_handle): 63 def clean_tabular(raw_handle, out_handle):
58 """Clean up tabular TMHMM output, returns output line count.""" 64 """Clean up tabular TMHMM output, returns output line count."""
59 count = 0 65 count = 0
60 for line in raw_handle: 66 for line in raw_handle:
82 count += 1 88 count += 1
83 return count 89 return count
84 90
85 #Note that if the input FASTA file contains no sequences, 91 #Note that if the input FASTA file contains no sequences,
86 #split_fasta returns an empty list (i.e. zero temp files). 92 #split_fasta returns an empty list (i.e. zero temp files).
87 fasta_files = split_fasta(fasta_file, tabular_file, FASTA_CHUNK) 93 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
88 temp_files = [f+".out" for f in fasta_files] 94 temp_files = [f+".out" for f in fasta_files]
89 jobs = ["tmhmm -short %s > %s" % (fasta, temp) 95 jobs = ["tmhmm -short %s > %s" % (fasta, temp)
90 for fasta, temp in zip(fasta_files, temp_files)] 96 for fasta, temp in zip(fasta_files, temp_files)]
91 97
92 def clean_up(file_list): 98 def clean_up(file_list):
93 for f in file_list: 99 for f in file_list:
94 if os.path.isfile(f): 100 if os.path.isfile(f):
95 os.remove(f) 101 os.remove(f)
102 try:
103 os.rmdir(tmp_dir)
104 except:
105 pass
96 106
97 if len(jobs) > 1 and num_threads > 1: 107 if len(jobs) > 1 and num_threads > 1:
98 #A small "info" message for Galaxy to show the user. 108 #A small "info" message for Galaxy to show the user.
99 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 109 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
100 results = run_jobs(jobs, num_threads) 110 results = run_jobs(jobs, num_threads)
103 if error_level: 113 if error_level:
104 try: 114 try:
105 output = open(temp).readline() 115 output = open(temp).readline()
106 except IOError: 116 except IOError:
107 output = "" 117 output = ""
108 clean_up(fasta_files) 118 clean_up(fasta_files + temp_files)
109 clean_up(temp_files)
110 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 119 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
111 error_level) 120 error_level)
112 del results 121 del results
113 del jobs 122 del jobs
114 123
117 for temp in temp_files: 126 for temp in temp_files:
118 data_handle = open(temp) 127 data_handle = open(temp)
119 count = clean_tabular(data_handle, out_handle) 128 count = clean_tabular(data_handle, out_handle)
120 data_handle.close() 129 data_handle.close()
121 if not count: 130 if not count:
122 clean_up(fasta_files) 131 clean_up(fasta_files + temp_files)
123 clean_up(temp_files)
124 stop_err("No output from tmhmm2") 132 stop_err("No output from tmhmm2")
125 out_handle.close() 133 out_handle.close()
126 134
127 clean_up(fasta_files) 135 clean_up(fasta_files + temp_files)
128 clean_up(temp_files)