Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
| author | peterjc |
|---|---|
| date | Wed, 01 Feb 2017 09:46:42 -0500 |
| parents | eb6ac44d4b8e |
| children | a19b3ded8f33 |
comparison
equal
deleted
inserted
replaced
| 18:eb6ac44d4b8e | 19:f3ecd80850e2 |
|---|---|
| 19 First major feature is cleaning up the output. The raw output from SignalP | 19 First major feature is cleaning up the output. The raw output from SignalP |
| 20 v3.0 looks like this (21 columns space separated): | 20 v3.0 looks like this (21 columns space separated): |
| 21 | 21 |
| 22 # SignalP-NN euk predictions # SignalP-HMM euk predictions | 22 # SignalP-NN euk predictions # SignalP-HMM euk predictions |
| 23 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? | 23 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? |
| 24 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N | 24 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N |
| 25 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N | 25 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N |
| 26 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N | 26 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N |
| 27 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N | 27 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N |
| 28 | 28 |
| 29 In order to make it easier to use in Galaxy, this wrapper script reformats | 29 In order to make it easier to use in Galaxy, this wrapper script reformats |
| 30 this to use tab separators. Also it removes the redundant truncated name | 30 this to use tab separators. Also it removes the redundant truncated name |
| 31 column, and assigns unique column names in the header: | 31 column, and assigns unique column names in the header: |
| 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* | 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* |
| 55 """ | 55 """ |
| 56 import sys | 56 import sys |
| 57 import os | 57 import os |
| 58 import tempfile | 58 import tempfile |
| 59 from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator | 59 from seq_analysis_utils import split_fasta, fasta_iterator |
| 60 from seq_analysis_utils import run_jobs, thread_count | 60 from seq_analysis_utils import run_jobs, thread_count |
| 61 | 61 |
| 62 FASTA_CHUNK = 500 | 62 FASTA_CHUNK = 500 |
| 63 MAX_LEN = 6000 #Found by trial and error | 63 MAX_LEN = 6000 # Found by trial and error |
| 64 | 64 |
| 65 if len(sys.argv) not in [6,8]: | 65 if len(sys.argv) not in [6, 8]: |
| 66 sys_exit("Require five (or 7) arguments, organism, truncate, threads, " | 66 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " |
| 67 "input protein FASTA file & output tabular file (plus " | 67 "input protein FASTA file & output tabular file (plus " |
| 68 "optionally cut method and GFF3 output file). " | 68 "optionally cut method and GFF3 output file). " |
| 69 "Got %i arguments." % (len(sys.argv)-1)) | 69 "Got %i arguments." % (len(sys.argv) - 1)) |
| 70 | 70 |
| 71 organism = sys.argv[1] | 71 organism = sys.argv[1] |
| 72 if organism not in ["euk", "gram+", "gram-"]: | 72 if organism not in ["euk", "gram+", "gram-"]: |
| 73 sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 73 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) |
| 74 | 74 |
| 75 try: | 75 try: |
| 76 truncate = int(sys.argv[2]) | 76 truncate = int(sys.argv[2]) |
| 77 except: | 77 except ValueError: |
| 78 truncate = 0 | 78 truncate = 0 |
| 79 if truncate < 0: | 79 if truncate < 0: |
| 80 sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | 80 sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) |
| 81 | 81 |
| 82 num_threads = thread_count(sys.argv[3], default=4) | 82 num_threads = thread_count(sys.argv[3], default=4) |
| 83 fasta_file = sys.argv[4] | 83 fasta_file = sys.argv[4] |
| 84 tabular_file = sys.argv[5] | 84 tabular_file = sys.argv[5] |
| 85 | 85 |
| 86 if len(sys.argv) == 8: | 86 if len(sys.argv) == 8: |
| 87 cut_method = sys.argv[6] | 87 cut_method = sys.argv[6] |
| 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: | 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: |
| 89 sys_exit("Invalid cut method %r" % cut_method) | 89 sys.exit("Invalid cut method %r" % cut_method) |
| 90 gff3_file = sys.argv[7] | 90 gff3_file = sys.argv[7] |
| 91 else: | 91 else: |
| 92 cut_method = None | 92 cut_method = None |
| 93 gff3_file = None | 93 gff3_file = None |
| 94 | 94 |
| 95 | 95 |
| 96 tmp_dir = tempfile.mkdtemp() | 96 tmp_dir = tempfile.mkdtemp() |
| 97 | 97 |
| 98 | |
| 98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | 99 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): |
| 99 """Clean up SignalP output to make it tabular.""" | 100 """Clean up SignalP output to make it tabular.""" |
| 100 if cut_method: | 101 if cut_method: |
| 101 cut_col = {"NN_Cmax" : 2, | 102 cut_col = {"NN_Cmax": 2, |
| 102 "NN_Ymax" : 5, | 103 "NN_Ymax": 5, |
| 103 "NN_Smax" : 8, | 104 "NN_Smax": 8, |
| 104 "HMM_Cmax" : 16}[cut_method] | 105 "HMM_Cmax": 16}[cut_method] |
| 105 else: | 106 else: |
| 106 cut_col = None | 107 cut_col = None |
| 107 for line in raw_handle: | 108 for line in raw_handle: |
| 108 if not line or line.startswith("#"): | 109 if not line or line.startswith("#"): |
| 109 continue | 110 continue |
| 110 parts = line.rstrip("\r\n").split() | 111 parts = line.rstrip("\r\n").split() |
| 111 assert len(parts)==21, repr(line) | 112 assert len(parts) == 21, repr(line) |
| 112 assert parts[14].startswith(parts[0]), \ | 113 assert parts[14].startswith(parts[0]), \ |
| 113 "Bad entry in SignalP output, ID miss-match:\n%r" % line | 114 "Bad entry in SignalP output, ID miss-match:\n%r" % line |
| 114 #Remove redundant truncated name column (col 0) | 115 # Remove redundant truncated name column (col 0) |
| 115 #and put full name at start (col 14) | 116 # and put full name at start (col 14) |
| 116 parts = parts[14:15] + parts[1:14] + parts[15:] | 117 parts = parts[14:15] + parts[1:14] + parts[15:] |
| 117 out_handle.write("\t".join(parts) + "\n") | 118 out_handle.write("\t".join(parts) + "\n") |
| 118 | 119 |
| 120 | |
| 119 def make_gff(fasta_file, tabular_file, gff_file, cut_method): | 121 def make_gff(fasta_file, tabular_file, gff_file, cut_method): |
| 120 cut_col, score_col = {"NN_Cmax" : (2,1), | 122 cut_col, score_col = {"NN_Cmax": (2, 1), |
| 121 "NN_Ymax" : (5,4), | 123 "NN_Ymax": (5, 4), |
| 122 "NN_Smax" : (8,7), | 124 "NN_Smax": (8, 7), |
| 123 "HMM_Cmax" : (16,15), | 125 "HMM_Cmax": (16, 15), |
| 124 }[cut_method] | 126 }[cut_method] |
| 125 | 127 |
| 126 source = "SignalP" | 128 source = "SignalP" |
| 127 strand = "." #not stranded | 129 strand = "." # not stranded |
| 128 phase = "." #not phased | 130 phase = "." # not phased |
| 129 tags = "Note=%s" % cut_method | 131 tags = "Note=%s" % cut_method |
| 130 | 132 |
| 131 tab_handle = open(tabular_file) | 133 tab_handle = open(tabular_file) |
| 132 line = tab_handle.readline() | 134 line = tab_handle.readline() |
| 133 assert line.startswith("#ID\t"), line | 135 assert line.startswith("#ID\t"), line |
| 134 | 136 |
| 135 gff_handle = open(gff_file, "w") | 137 gff_handle = open(gff_file, "w") |
| 137 | 139 |
| 138 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): | 140 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): |
| 139 parts = line.rstrip("\n").split("\t") | 141 parts = line.rstrip("\n").split("\t") |
| 140 seqid = parts[0] | 142 seqid = parts[0] |
| 141 assert title.startswith(seqid), "%s vs %s" % (seqid, title) | 143 assert title.startswith(seqid), "%s vs %s" % (seqid, title) |
| 142 if len(seq)==0: | 144 if not seq: |
| 143 #Is it possible to have a zero length reference in GFF3? | 145 # Is it possible to have a zero length reference in GFF3? |
| 144 continue | 146 continue |
| 145 cut = int(parts[cut_col]) | 147 cut = int(parts[cut_col]) |
| 146 if cut == 0: | 148 if cut == 0: |
| 147 assert cut_method == "HMM_Cmax", cut_method | 149 assert cut_method == "HMM_Cmax", cut_method |
| 148 #TODO - Why does it do this? | 150 # TODO - Why does it do this? |
| 149 cut = 1 | 151 cut = 1 |
| 150 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | 152 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) |
| 151 score = parts[score_col] | 153 score = parts[score_col] |
| 152 gff_handle.write("##sequence-region %s %i %i\n" \ | 154 gff_handle.write("##sequence-region %s %i %i\n" |
| 153 % (seqid, 1, len(seq))) | 155 % (seqid, 1, len(seq))) |
| 154 #If the cut is at the very begining, there is no signal peptide! | 156 # If the cut is at the very begining, there is no signal peptide! |
| 155 if cut > 1: | 157 if cut > 1: |
| 156 #signal_peptide = SO:0000418 | 158 # signal_peptide = SO:0000418 |
| 157 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | 159 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
| 158 % (seqid, source, | 160 % (seqid, source, |
| 159 "signal_peptide", 1, cut-1, | 161 "signal_peptide", 1, cut - 1, |
| 160 score, strand, phase, tags)) | 162 score, strand, phase, tags)) |
| 161 #mature_protein_region = SO:0000419 | 163 # mature_protein_region = SO:0000419 |
| 162 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | 164 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
| 163 % (seqid, source, | 165 % (seqid, source, |
| 164 "mature_protein_region", cut, len(seq), | 166 "mature_protein_region", cut, len(seq), |
| 165 score, strand, phase, tags)) | 167 score, strand, phase, tags)) |
| 166 tab_handle.close() | 168 tab_handle.close() |
| 167 gff_handle.close() | 169 gff_handle.close() |
| 168 | 170 |
| 169 | 171 |
| 170 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), | 172 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), |
| 171 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | 173 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) |
| 172 temp_files = [f+".out" for f in fasta_files] | 174 temp_files = [f + ".out" for f in fasta_files] |
| 173 assert len(fasta_files) == len(temp_files) | 175 assert len(fasta_files) == len(temp_files) |
| 174 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | 176 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) |
| 175 for (fasta, temp) in zip(fasta_files, temp_files)] | 177 for (fasta, temp) in zip(fasta_files, temp_files)] |
| 176 assert len(fasta_files) == len(temp_files) == len(jobs) | 178 assert len(fasta_files) == len(temp_files) == len(jobs) |
| 177 | 179 |
| 180 | |
| 178 def clean_up(file_list): | 181 def clean_up(file_list): |
| 182 """Remove temp files, and if possible the temp directory.""" | |
| 179 for f in file_list: | 183 for f in file_list: |
| 180 if os.path.isfile(f): | 184 if os.path.isfile(f): |
| 181 os.remove(f) | 185 os.remove(f) |
| 182 try: | 186 try: |
| 183 os.rmdir(tmp_dir) | 187 os.rmdir(tmp_dir) |
| 184 except: | 188 except Exception: |
| 185 pass | 189 pass |
| 186 | 190 |
| 187 if len(jobs) > 1 and num_threads > 1: | 191 if len(jobs) > 1 and num_threads > 1: |
| 188 #A small "info" message for Galaxy to show the user. | 192 # A small "info" message for Galaxy to show the user. |
| 189 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 193 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
| 190 results = run_jobs(jobs, num_threads) | 194 results = run_jobs(jobs, num_threads) |
| 191 assert len(fasta_files) == len(temp_files) == len(jobs) | 195 assert len(fasta_files) == len(temp_files) == len(jobs) |
| 192 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 196 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): |
| 193 error_level = results[cmd] | 197 error_level = results[cmd] |
| 195 output = open(temp).readline() | 199 output = open(temp).readline() |
| 196 except IOError: | 200 except IOError: |
| 197 output = "(no output)" | 201 output = "(no output)" |
| 198 if error_level or output.lower().startswith("error running"): | 202 if error_level or output.lower().startswith("error running"): |
| 199 clean_up(fasta_files + temp_files) | 203 clean_up(fasta_files + temp_files) |
| 200 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 204 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
| 201 error_level) | 205 error_level) |
| 202 del results | 206 del results |
| 203 | 207 |
| 204 out_handle = open(tabular_file, "w") | 208 out_handle = open(tabular_file, "w") |
| 205 fields = ["ID"] | 209 fields = ["ID"] |
| 206 #NN results: | 210 # NN results: |
| 207 for name in ["Cmax", "Ymax", "Smax"]: | 211 for name in ["Cmax", "Ymax", "Smax"]: |
| 208 fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) | 212 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) |
| 209 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) | 213 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) |
| 210 #HMM results: | 214 # HMM results: |
| 211 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", | 215 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", |
| 212 "HMM_Sprob_score", "HMM_Sprob_pred"]) | 216 "HMM_Sprob_score", "HMM_Sprob_pred"]) |
| 213 out_handle.write("#" + "\t".join(fields) + "\n") | 217 out_handle.write("#" + "\t".join(fields) + "\n") |
| 214 for temp in temp_files: | 218 for temp in temp_files: |
| 215 data_handle = open(temp) | 219 data_handle = open(temp) |
| 216 clean_tabular(data_handle, out_handle) | 220 clean_tabular(data_handle, out_handle) |
| 217 data_handle.close() | 221 data_handle.close() |
| 218 out_handle.close() | 222 out_handle.close() |
| 219 | 223 |
| 220 #GFF3: | 224 # GFF3: |
| 221 if cut_method: | 225 if cut_method: |
| 222 make_gff(fasta_file, tabular_file, gff3_file, cut_method) | 226 make_gff(fasta_file, tabular_file, gff3_file, cut_method) |
| 223 | 227 |
| 224 clean_up(fasta_files + temp_files) | 228 clean_up(fasta_files + temp_files) |
