comparison tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author peterjc
date Mon, 30 Jul 2012 10:25:07 -0400
parents 0f1c61998b22
children e52220a9ddad
comparison
equal deleted inserted replaced
6:a290c6d4e658 7:9b45a8743100
2 """Wrapper for SignalP v3.0 for use in Galaxy. 2 """Wrapper for SignalP v3.0 for use in Galaxy.
3 3
4 This script takes exactly five command line arguments: 4 This script takes exactly five command line arguments:
5 * the organism type (euk, gram+ or gram-) 5 * the organism type (euk, gram+ or gram-)
6 * length to truncate sequences to (integer) 6 * length to truncate sequences to (integer)
7 * number of threads to use (integer) 7 * number of threads to use (integer, defaults to one)
8 * an input protein FASTA filename 8 * an input protein FASTA filename
9 * output tabular filename. 9 * output tabular filename.
10
11 There are two further optional arguments
12 * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax)
13 * output GFF3 filename
10 14
11 It then calls the standalone SignalP v3.0 program (not the webservice) 15 It then calls the standalone SignalP v3.0 program (not the webservice)
12 requesting the short output (one line per protein) using both NN and HMM 16 requesting the short output (one line per protein) using both NN and HMM
13 for predictions. 17 for predictions.
14 18
39 The third major feature is taking advantage of multiple cores (since SignalP 43 The third major feature is taking advantage of multiple cores (since SignalP
40 v3.0 itself is single threaded) by using the individual FASTA input files to 44 v3.0 itself is single threaded) by using the individual FASTA input files to
41 run multiple copies of TMHMM in parallel. I would normally use Python's 45 run multiple copies of TMHMM in parallel. I would normally use Python's
42 multiprocessing library in this situation but it requires at least Python 2.6 46 multiprocessing library in this situation but it requires at least Python 2.6
43 and at the time of writing Galaxy still supports Python 2.4. 47 and at the time of writing Galaxy still supports Python 2.4.
48
49 Note that this is somewhat redundant with job-splitting available in Galaxy
50 itself (see the SignalP XML file for settings).
51
52 Finally, you can opt to have a GFF3 file produced which will describe the
53 predicted signal peptide and mature peptide for each protein (using one of
54 the predictors which gives a cleavage site). *WORK IN PROGRESS*
44 """ 55 """
45 import sys 56 import sys
46 import os 57 import os
47 from seq_analysis_utils import stop_err, split_fasta, run_jobs 58 import tempfile
59 from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator
48 60
49 FASTA_CHUNK = 500 61 FASTA_CHUNK = 500
50 MAX_LEN = 6000 #Found by trial and error 62 MAX_LEN = 6000 #Found by trial and error
51 63
52 if len(sys.argv) != 6: 64 if len(sys.argv) not in [6,8]:
53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") 65 stop_err("Require five (or 7) arguments, organism, truncate, threads, "
66 "input protein FASTA file & output tabular file (plus "
67 "optionally cut method and GFF3 output file). "
68 "Got %i arguments." % (len(sys.argv)-1))
54 69
55 organism = sys.argv[1] 70 organism = sys.argv[1]
56 if organism not in ["euk", "gram+", "gram-"]: 71 if organism not in ["euk", "gram+", "gram-"]:
57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) 72 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)
58 73
64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) 79 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
65 80
66 try: 81 try:
67 num_threads = int(sys.argv[3]) 82 num_threads = int(sys.argv[3])
68 except: 83 except:
69 num_threads = 0 84 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
70 if num_threads < 1: 85 if num_threads < 1:
71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) 86 stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
72 87
73 fasta_file = sys.argv[4] 88 fasta_file = sys.argv[4]
74 89
75 tabular_file = sys.argv[5] 90 tabular_file = sys.argv[5]
76 91
77 def clean_tabular(raw_handle, out_handle): 92 if len(sys.argv) == 8:
93 cut_method = sys.argv[6]
94 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
95 stop_err("Invalid cut method %r" % cut_method)
96 gff3_file = sys.argv[7]
97 else:
98 cut_method = None
99 gff3_file = None
100
101
102 tmp_dir = tempfile.mkdtemp()
103
104 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
78 """Clean up SignalP output to make it tabular.""" 105 """Clean up SignalP output to make it tabular."""
106 if cut_method:
107 cut_col = {"NN_Cmax" : 2,
108 "NN_Ymax" : 5,
109 "NN_Smax" : 8,
110 "HMM_Cmax" : 16}[cut_method]
111 else:
112 cut_col = None
79 for line in raw_handle: 113 for line in raw_handle:
80 if not line or line.startswith("#"): 114 if not line or line.startswith("#"):
81 continue 115 continue
82 parts = line.rstrip("\r\n").split() 116 parts = line.rstrip("\r\n").split()
83 assert len(parts)==21, repr(line) 117 assert len(parts)==21, repr(line)
85 #Remove redundant truncated name column (col 0) 119 #Remove redundant truncated name column (col 0)
86 #and put full name at start (col 14) 120 #and put full name at start (col 14)
87 parts = parts[14:15] + parts[1:14] + parts[15:] 121 parts = parts[14:15] + parts[1:14] + parts[15:]
88 out_handle.write("\t".join(parts) + "\n") 122 out_handle.write("\t".join(parts) + "\n")
89 123
90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) 124 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
125 cut_col, score_col = {"NN_Cmax" : (2,1),
126 "NN_Ymax" : (5,4),
127 "NN_Smax" : (8,7),
128 "HMM_Cmax" : (16,15),
129 }[cut_method]
130
131 source = "SignalP"
132 strand = "." #not stranded
133 phase = "." #not phased
134 tags = "Note=%s" % cut_method
135
136 tab_handle = open(tabular_file)
137 line = tab_handle.readline()
138 assert line.startswith("#ID\t"), line
139
140 gff_handle = open(gff_file, "w")
141 gff_handle.write("##gff-version 3\n")
142
143 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
144 parts = line.rstrip("\n").split("\t")
145 seqid = parts[0]
146 assert title.startswith(seqid), "%s vs %s" % (seqid, title)
147 if len(seq)==0:
148 #Is it possible to have a zero length reference in GFF3?
149 continue
150 cut = int(parts[cut_col])
151 if cut == 0:
152 assert cut_method == "HMM_Cmax", cut_method
153 #TODO - Why does it do this?
154 cut = 1
155 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
156 score = parts[score_col]
157 gff_handle.write("##sequence-region %s %i %i\n" \
158 % (seqid, 1, len(seq)))
159 #If the cut is at the very begining, there is no signal peptide!
160 if cut > 1:
161 #signal_peptide = SO:0000418
162 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
163 % (seqid, source,
164 "signal_peptide", 1, cut-1,
165 score, strand, phase, tags))
166 #mature_protein_region = SO:0000419
167 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
168 % (seqid, source,
169 "mature_protein_region", cut, len(seq),
170 score, strand, phase, tags))
171 tab_handle.close()
172 gff_handle.close()
173
174
175 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
176 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
91 temp_files = [f+".out" for f in fasta_files] 177 temp_files = [f+".out" for f in fasta_files]
92 assert len(fasta_files) == len(temp_files) 178 assert len(fasta_files) == len(temp_files)
93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) 179 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
94 for (fasta, temp) in zip(fasta_files, temp_files)] 180 for (fasta, temp) in zip(fasta_files, temp_files)]
95 assert len(fasta_files) == len(temp_files) == len(jobs) 181 assert len(fasta_files) == len(temp_files) == len(jobs)
96 182
97 def clean_up(file_list): 183 def clean_up(file_list):
98 for f in file_list: 184 for f in file_list:
99 if os.path.isfile(f): 185 if os.path.isfile(f):
100 os.remove(f) 186 os.remove(f)
187 try:
188 os.rmdir(tmp_dir)
189 except:
190 pass
101 191
102 if len(jobs) > 1 and num_threads > 1: 192 if len(jobs) > 1 and num_threads > 1:
103 #A small "info" message for Galaxy to show the user. 193 #A small "info" message for Galaxy to show the user.
104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 194 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
105 results = run_jobs(jobs, num_threads) 195 results = run_jobs(jobs, num_threads)
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 197 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
108 error_level = results[cmd] 198 error_level = results[cmd]
109 try: 199 try:
110 output = open(temp).readline() 200 output = open(temp).readline()
111 except IOError: 201 except IOError:
112 output = "" 202 output = "(no output)"
113 if error_level or output.lower().startswith("error running"): 203 if error_level or output.lower().startswith("error running"):
114 clean_up(fasta_files) 204 clean_up(fasta_files + temp_files)
115 clean_up(temp_files)
116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 205 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
117 error_level) 206 error_level)
118 del results 207 del results
119 208
120 out_handle = open(tabular_file, "w") 209 out_handle = open(tabular_file, "w")
131 data_handle = open(temp) 220 data_handle = open(temp)
132 clean_tabular(data_handle, out_handle) 221 clean_tabular(data_handle, out_handle)
133 data_handle.close() 222 data_handle.close()
134 out_handle.close() 223 out_handle.close()
135 224
136 clean_up(fasta_files) 225 #GFF3:
137 clean_up(temp_files) 226 if cut_method:
227 make_gff(fasta_file, tabular_file, gff3_file, cut_method)
228
229 clean_up(fasta_files + temp_files)