comparison tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
comparison
equal deleted inserted replaced
18:eb6ac44d4b8e 19:f3ecd80850e2
19 First major feature is cleaning up the output. The raw output from SignalP 19 First major feature is cleaning up the output. The raw output from SignalP
20 v3.0 looks like this (21 columns space separated): 20 v3.0 looks like this (21 columns space separated):
21 21
22 # SignalP-NN euk predictions # SignalP-HMM euk predictions 22 # SignalP-NN euk predictions # SignalP-HMM euk predictions
23 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? 23 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
24 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N 24 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N
25 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N 25 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N
26 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N 26 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N
27 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N 27 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N
28 28
29 In order to make it easier to use in Galaxy, this wrapper script reformats 29 In order to make it easier to use in Galaxy, this wrapper script reformats
30 this to use tab separators. Also it removes the redundant truncated name 30 this to use tab separators. Also it removes the redundant truncated name
31 column, and assigns unique column names in the header: 31 column, and assigns unique column names in the header:
54 the predictors which gives a cleavage site). *WORK IN PROGRESS* 54 the predictors which gives a cleavage site). *WORK IN PROGRESS*
55 """ 55 """
56 import sys 56 import sys
57 import os 57 import os
58 import tempfile 58 import tempfile
59 from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator 59 from seq_analysis_utils import split_fasta, fasta_iterator
60 from seq_analysis_utils import run_jobs, thread_count 60 from seq_analysis_utils import run_jobs, thread_count
61 61
62 FASTA_CHUNK = 500 62 FASTA_CHUNK = 500
63 MAX_LEN = 6000 #Found by trial and error 63 MAX_LEN = 6000 # Found by trial and error
64 64
65 if len(sys.argv) not in [6,8]: 65 if len(sys.argv) not in [6, 8]:
66 sys_exit("Require five (or 7) arguments, organism, truncate, threads, " 66 sys.exit("Require five (or 7) arguments, organism, truncate, threads, "
67 "input protein FASTA file & output tabular file (plus " 67 "input protein FASTA file & output tabular file (plus "
68 "optionally cut method and GFF3 output file). " 68 "optionally cut method and GFF3 output file). "
69 "Got %i arguments." % (len(sys.argv)-1)) 69 "Got %i arguments." % (len(sys.argv) - 1))
70 70
71 organism = sys.argv[1] 71 organism = sys.argv[1]
72 if organism not in ["euk", "gram+", "gram-"]: 72 if organism not in ["euk", "gram+", "gram-"]:
73 sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) 73 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
74 74
75 try: 75 try:
76 truncate = int(sys.argv[2]) 76 truncate = int(sys.argv[2])
77 except: 77 except ValueError:
78 truncate = 0 78 truncate = 0
79 if truncate < 0: 79 if truncate < 0:
80 sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) 80 sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
81 81
82 num_threads = thread_count(sys.argv[3], default=4) 82 num_threads = thread_count(sys.argv[3], default=4)
83 fasta_file = sys.argv[4] 83 fasta_file = sys.argv[4]
84 tabular_file = sys.argv[5] 84 tabular_file = sys.argv[5]
85 85
86 if len(sys.argv) == 8: 86 if len(sys.argv) == 8:
87 cut_method = sys.argv[6] 87 cut_method = sys.argv[6]
88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
89 sys_exit("Invalid cut method %r" % cut_method) 89 sys.exit("Invalid cut method %r" % cut_method)
90 gff3_file = sys.argv[7] 90 gff3_file = sys.argv[7]
91 else: 91 else:
92 cut_method = None 92 cut_method = None
93 gff3_file = None 93 gff3_file = None
94 94
95 95
96 tmp_dir = tempfile.mkdtemp() 96 tmp_dir = tempfile.mkdtemp()
97 97
98
98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): 99 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
99 """Clean up SignalP output to make it tabular.""" 100 """Clean up SignalP output to make it tabular."""
100 if cut_method: 101 if cut_method:
101 cut_col = {"NN_Cmax" : 2, 102 cut_col = {"NN_Cmax": 2,
102 "NN_Ymax" : 5, 103 "NN_Ymax": 5,
103 "NN_Smax" : 8, 104 "NN_Smax": 8,
104 "HMM_Cmax" : 16}[cut_method] 105 "HMM_Cmax": 16}[cut_method]
105 else: 106 else:
106 cut_col = None 107 cut_col = None
107 for line in raw_handle: 108 for line in raw_handle:
108 if not line or line.startswith("#"): 109 if not line or line.startswith("#"):
109 continue 110 continue
110 parts = line.rstrip("\r\n").split() 111 parts = line.rstrip("\r\n").split()
111 assert len(parts)==21, repr(line) 112 assert len(parts) == 21, repr(line)
112 assert parts[14].startswith(parts[0]), \ 113 assert parts[14].startswith(parts[0]), \
113 "Bad entry in SignalP output, ID miss-match:\n%r" % line 114 "Bad entry in SignalP output, ID miss-match:\n%r" % line
114 #Remove redundant truncated name column (col 0) 115 # Remove redundant truncated name column (col 0)
115 #and put full name at start (col 14) 116 # and put full name at start (col 14)
116 parts = parts[14:15] + parts[1:14] + parts[15:] 117 parts = parts[14:15] + parts[1:14] + parts[15:]
117 out_handle.write("\t".join(parts) + "\n") 118 out_handle.write("\t".join(parts) + "\n")
118 119
120
119 def make_gff(fasta_file, tabular_file, gff_file, cut_method): 121 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
120 cut_col, score_col = {"NN_Cmax" : (2,1), 122 cut_col, score_col = {"NN_Cmax": (2, 1),
121 "NN_Ymax" : (5,4), 123 "NN_Ymax": (5, 4),
122 "NN_Smax" : (8,7), 124 "NN_Smax": (8, 7),
123 "HMM_Cmax" : (16,15), 125 "HMM_Cmax": (16, 15),
124 }[cut_method] 126 }[cut_method]
125 127
126 source = "SignalP" 128 source = "SignalP"
127 strand = "." #not stranded 129 strand = "." # not stranded
128 phase = "." #not phased 130 phase = "." # not phased
129 tags = "Note=%s" % cut_method 131 tags = "Note=%s" % cut_method
130 132
131 tab_handle = open(tabular_file) 133 tab_handle = open(tabular_file)
132 line = tab_handle.readline() 134 line = tab_handle.readline()
133 assert line.startswith("#ID\t"), line 135 assert line.startswith("#ID\t"), line
134 136
135 gff_handle = open(gff_file, "w") 137 gff_handle = open(gff_file, "w")
137 139
138 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): 140 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
139 parts = line.rstrip("\n").split("\t") 141 parts = line.rstrip("\n").split("\t")
140 seqid = parts[0] 142 seqid = parts[0]
141 assert title.startswith(seqid), "%s vs %s" % (seqid, title) 143 assert title.startswith(seqid), "%s vs %s" % (seqid, title)
142 if len(seq)==0: 144 if not seq:
143 #Is it possible to have a zero length reference in GFF3? 145 # Is it possible to have a zero length reference in GFF3?
144 continue 146 continue
145 cut = int(parts[cut_col]) 147 cut = int(parts[cut_col])
146 if cut == 0: 148 if cut == 0:
147 assert cut_method == "HMM_Cmax", cut_method 149 assert cut_method == "HMM_Cmax", cut_method
148 #TODO - Why does it do this? 150 # TODO - Why does it do this?
149 cut = 1 151 cut = 1
150 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) 152 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
151 score = parts[score_col] 153 score = parts[score_col]
152 gff_handle.write("##sequence-region %s %i %i\n" \ 154 gff_handle.write("##sequence-region %s %i %i\n"
153 % (seqid, 1, len(seq))) 155 % (seqid, 1, len(seq)))
154 #If the cut is at the very begining, there is no signal peptide! 156 # If the cut is at the very begining, there is no signal peptide!
155 if cut > 1: 157 if cut > 1:
156 #signal_peptide = SO:0000418 158 # signal_peptide = SO:0000418
157 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ 159 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
158 % (seqid, source, 160 % (seqid, source,
159 "signal_peptide", 1, cut-1, 161 "signal_peptide", 1, cut - 1,
160 score, strand, phase, tags)) 162 score, strand, phase, tags))
161 #mature_protein_region = SO:0000419 163 # mature_protein_region = SO:0000419
162 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ 164 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
163 % (seqid, source, 165 % (seqid, source,
164 "mature_protein_region", cut, len(seq), 166 "mature_protein_region", cut, len(seq),
165 score, strand, phase, tags)) 167 score, strand, phase, tags))
166 tab_handle.close() 168 tab_handle.close()
167 gff_handle.close() 169 gff_handle.close()
168 170
169 171
170 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), 172 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
171 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) 173 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
172 temp_files = [f+".out" for f in fasta_files] 174 temp_files = [f + ".out" for f in fasta_files]
173 assert len(fasta_files) == len(temp_files) 175 assert len(fasta_files) == len(temp_files)
174 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) 176 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
175 for (fasta, temp) in zip(fasta_files, temp_files)] 177 for (fasta, temp) in zip(fasta_files, temp_files)]
176 assert len(fasta_files) == len(temp_files) == len(jobs) 178 assert len(fasta_files) == len(temp_files) == len(jobs)
177 179
180
178 def clean_up(file_list): 181 def clean_up(file_list):
182 """Remove temp files, and if possible the temp directory."""
179 for f in file_list: 183 for f in file_list:
180 if os.path.isfile(f): 184 if os.path.isfile(f):
181 os.remove(f) 185 os.remove(f)
182 try: 186 try:
183 os.rmdir(tmp_dir) 187 os.rmdir(tmp_dir)
184 except: 188 except Exception:
185 pass 189 pass
186 190
187 if len(jobs) > 1 and num_threads > 1: 191 if len(jobs) > 1 and num_threads > 1:
188 #A small "info" message for Galaxy to show the user. 192 # A small "info" message for Galaxy to show the user.
189 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 193 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
190 results = run_jobs(jobs, num_threads) 194 results = run_jobs(jobs, num_threads)
191 assert len(fasta_files) == len(temp_files) == len(jobs) 195 assert len(fasta_files) == len(temp_files) == len(jobs)
192 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 196 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
193 error_level = results[cmd] 197 error_level = results[cmd]
195 output = open(temp).readline() 199 output = open(temp).readline()
196 except IOError: 200 except IOError:
197 output = "(no output)" 201 output = "(no output)"
198 if error_level or output.lower().startswith("error running"): 202 if error_level or output.lower().startswith("error running"):
199 clean_up(fasta_files + temp_files) 203 clean_up(fasta_files + temp_files)
200 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 204 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
201 error_level) 205 error_level)
202 del results 206 del results
203 207
204 out_handle = open(tabular_file, "w") 208 out_handle = open(tabular_file, "w")
205 fields = ["ID"] 209 fields = ["ID"]
206 #NN results: 210 # NN results:
207 for name in ["Cmax", "Ymax", "Smax"]: 211 for name in ["Cmax", "Ymax", "Smax"]:
208 fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) 212 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])
209 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) 213 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
210 #HMM results: 214 # HMM results:
211 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", 215 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",
212 "HMM_Sprob_score", "HMM_Sprob_pred"]) 216 "HMM_Sprob_score", "HMM_Sprob_pred"])
213 out_handle.write("#" + "\t".join(fields) + "\n") 217 out_handle.write("#" + "\t".join(fields) + "\n")
214 for temp in temp_files: 218 for temp in temp_files:
215 data_handle = open(temp) 219 data_handle = open(temp)
216 clean_tabular(data_handle, out_handle) 220 clean_tabular(data_handle, out_handle)
217 data_handle.close() 221 data_handle.close()
218 out_handle.close() 222 out_handle.close()
219 223
220 #GFF3: 224 # GFF3:
221 if cut_method: 225 if cut_method:
222 make_gff(fasta_file, tabular_file, gff3_file, cut_method) 226 make_gff(fasta_file, tabular_file, gff3_file, cut_method)
223 227
224 clean_up(fasta_files + temp_files) 228 clean_up(fasta_files + temp_files)