comparison tools/protein_analysis/signalp3.py @ 0:bca9bc7fdaef

Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:03:34 -0400
parents
children 0f1c61998b22
comparison
equal deleted inserted replaced
-1:000000000000 0:bca9bc7fdaef
1 #!/usr/bin/env python
2 """Wrapper for SignalP v3.0 for use in Galaxy.
3
4 This script takes exactly fives command line arguments:
5 * the organism type (euk, gram+ or gram-)
6 * length to truncate sequences to (integer)
7 * number of threads to use (integer)
8 * an input protein FASTA filename
9 * output tabular filename.
10
11 It then calls the standalone SignalP v3.0 program (not the webservice)
12 requesting the short output (one line per protein) using both NN and HMM
13 for predictions.
14
15 First major feature is cleaning up the output. The raw output from SignalP
16 v3.0 looks like this (21 columns space separated):
17
18 # SignalP-NN euk predictions # SignalP-HMM euk predictions
19 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ?
20 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N
21 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N
22 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N
23 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N
24
25 In order to make it easier to use in Galaxy, this wrapper script reformats
26 this to use tab separators. Also it removes the redundant truncated name
27 column, and assigns unique column names in the header:
28
29 #ID NN_Cmax_score NN_Cmax_pos NN_Cmax_pred NN_Ymax_score NN_Ymax_pos NN_Ymax_pred NN_Smax_score NN_Smax_pos NN_Smax_pred NN_Smean_score NN_Smean_pred NN_D_score NN_D_pred HMM_bang HMM_Cmax_score HMM_Cmax_pos HMM_Cmax_pred HMM_Sprob_score HMM_Sprob_pred
30 gi|2781234|pdb|1JLY|B 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N Q 0.000 17 N 0.000 N
31 gi|4959044|gb|AAD34209.1|AF069992_1 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N Q 0.000 0 N 0.000 N
32 gi|671626|emb|CAA85685.1| 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N Q 0.000 0 N 0.000 N
33 gi|3298468|dbj|BAA31520.1| 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N Q 0.066 24 N 0.139 N
34
35 The second major feature is overcoming SignalP's built in limit of 4000
36 sequences by breaking up the input FASTA file into chunks. This also allows
37 us to pre-trim the sequences since SignalP only needs their starts.
38
39 The third major feature is taking advantage of multiple cores (since SignalP
40 v3.0 itself is single threaded) by using the individual FASTA input files to
41 run multiple copies of TMHMM in parallel. I would normally use Python's
42 multiprocessing library in this situation but it requires at least Python 2.6
43 and at the time of writing Galaxy still supports Python 2.4.
44 """
45 import sys
46 import os
47 from seq_analysis_utils import stop_err, split_fasta, run_jobs
48
49 FASTA_CHUNK = 500
50 MAX_LEN = 6000 #Found by trial and error
51
52 if len(sys.argv) != 6:
53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file")
54
55 organism = sys.argv[1]
56 if organism not in ["euk", "gram+", "gram-"]:
57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)
58
59 try:
60 truncate = int(sys.argv[2])
61 except:
62 truncate = 0
63 if truncate < 0:
64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
65
66 try:
67 num_threads = int(sys.argv[3])
68 except:
69 num_threads = 0
70 if num_threads < 1:
71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
72
73 fasta_file = sys.argv[4]
74
75 tabular_file = sys.argv[5]
76
77 def clean_tabular(raw_handle, out_handle):
78 """Clean up SignalP output to make it tabular."""
79 for line in raw_handle:
80 if not line or line.startswith("#"):
81 continue
82 parts = line.rstrip("\r\n").split()
83 assert len(parts)==21, repr(line)
84 assert parts[14].startswith(parts[0])
85 #Remove redundant truncated name column (col 0)
86 #and put full name at start (col 14)
87 parts = parts[14:15] + parts[1:14] + parts[15:]
88 out_handle.write("\t".join(parts) + "\n")
89
90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
91 temp_files = [f+".out" for f in fasta_files]
92 assert len(fasta_files) == len(temp_files)
93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
94 for (fasta, temp) in zip(fasta_files, temp_files)]
95 assert len(fasta_files) == len(temp_files) == len(jobs)
96
97 def clean_up(file_list):
98 for f in file_list:
99 if os.path.isfile(f):
100 os.remove(f)
101
102 if len(jobs) > 1 and num_threads > 1:
103 #A small "info" message for Galaxy to show the user.
104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
105 results = run_jobs(jobs, num_threads)
106 assert len(fasta_files) == len(temp_files) == len(jobs)
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
108 error_level = results[cmd]
109 try:
110 output = open(temp).readline()
111 except IOError:
112 output = ""
113 if error_level or output.lower().startswith("error running"):
114 clean_up(fasta_files)
115 clean_up(temp_files)
116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
117 error_level)
118 del results
119
120 out_handle = open(tabular_file, "w")
121 fields = ["ID"]
122 #NN results:
123 for name in ["Cmax", "Ymax", "Smax"]:
124 fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name])
125 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
126 #HMM results:
127 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred",
128 "HMM_Sprob_score", "HMM_Sprob_pred"])
129 out_handle.write("#" + "\t".join(fields) + "\n")
130 for temp in temp_files:
131 data_handle = open(temp)
132 clean_tabular(data_handle, out_handle)
133 data_handle.close()
134 out_handle.close()
135
136 clean_up(fasta_files)
137 clean_up(temp_files)