comparison tools/protein_analysis/promoter2.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author peterjc
date Mon, 30 Jul 2012 10:25:07 -0400
parents
children 976a5f2833cd
comparison
equal deleted inserted replaced
6:a290c6d4e658 7:9b45a8743100
1 #!/usr/bin/env python
2 """Wrapper for Promoter 2.0 for use in Galaxy.
3
4 This script takes exactly three command line arguments:
5 * number of threads
6 * an input DNA FASTA filename
7 * output tabular filename.
8
9 It calls the Promoter 2.0 binary (e.g. .../promoter-2.0/bin/promoter_Linux,
10 bypassing the Perl wrapper script 'promoter' which imposes a significant
11 performace overhead for no benefit here (we don't need HTML output for
12 example).
13
14 The main feature is this Python wrapper script parsers the bespoke
15 tabular output from Promoter 2.0 and reformats it into a Galaxy friendly
16 tab separated table.
17
18 Additionally, in order to take advantage of multiple cores the input FASTA
19 file is broken into chunks and multiple copies of promoter run at once.
20 This can be used in combination with the job-splitting available in Galaxy.
21
22 Note that rewriting the FASTA input file allows us to avoid a bug in
23 promoter 2 with long descriptions in the FASTA header line (over 200
24 characters) which produces stray fragements of the description in the
25 output file, making parsing non-trivial.
26
27 TODO - Automatically extract the sequence containing a promoter prediction?
28 """
29 import sys
30 import os
31 import commands
32 import tempfile
33 from seq_analysis_utils import stop_err, split_fasta, run_jobs
34
35 FASTA_CHUNK = 500
36
37 if len(sys.argv) != 4:
38 stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
39 "Got %i arguments." % (len(sys.argv)-1))
40 try:
41 num_threads = int(sys.argv[1])
42 except:
43 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
44 if num_threads < 1:
45 stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
46
47 fasta_file = os.path.abspath(sys.argv[2])
48 tabular_file = os.path.abspath(sys.argv[3])
49
50 tmp_dir = tempfile.mkdtemp()
51
52 def get_path_and_binary():
53 platform = commands.getoutput("uname") #e.g. Linux
54 shell_script = commands.getoutput("which promoter")
55 if not os.path.isfile(shell_script):
56 stop_err("ERROR: Missing promoter executable shell script")
57 path = None
58 for line in open(shell_script):
59 if line.startswith("setenv"): #could then be tab or space!
60 parts = line.rstrip().split(None, 2)
61 if parts[0] == "setenv" and parts[1] == "PROM":
62 path = parts[2]
63 if not path:
64 stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
65 if not os.path.isdir(path):
66 stop_error("ERROR: %r is not a directory" % path)
67 bin = "%s/bin/promoter_%s" % (path, platform)
68 if not os.path.isfile(bin):
69 stop_err("ERROR: Missing promoter binary %r" % bin)
70 return path, bin
71
72 def make_tabular(raw_handle, out_handle):
73 """Parse text output into tabular, return query count."""
74 identifier = None
75 queries = 0
76 #out.write("#Identifier\tDescription\tPosition\tScore\tLikelihood\n")
77 for line in raw_handle:
78 #print repr(line)
79 if not line.strip() or line == "Promoter prediction:\n":
80 pass
81 elif line[0] != " ":
82 identifier = line.strip().replace("\t", " ").split(None,1)[0]
83 queries += 1
84 elif line == " No promoter predicted\n":
85 #End of a record
86 identifier = None
87 elif line == " Position Score Likelihood\n":
88 assert identifier
89 else:
90 try:
91 position, score, likelihood = line.strip().split(None,2)
92 except ValueError:
93 print "WARNING: Problem with line: %r" % line
94 continue
95 #stop_err("ERROR: Problem with line: %r" % line)
96 if likelihood not in ["ignored",
97 "Marginal prediction",
98 "Medium likely prediction",
99 "Highly likely prediction"]:
100 stop_err("ERROR: Problem with line: %r" % line)
101 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
102 #out.close()
103 return queries
104
105 working_dir, bin = get_path_and_binary()
106
107 if not os.path.isfile(fasta_file):
108 stop_err("ERROR: Missing input FASTA file %r" % fasta_file)
109
110 #Note that if the input FASTA file contains no sequences,
111 #split_fasta returns an empty list (i.e. zero temp files).
112 #We deliberately omit the FASTA descriptions to avoid a
113 #bug in promoter2 with descriptions over 200 characters.
114 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False)
115 temp_files = [f+".out" for f in fasta_files]
116 jobs = ["%s %s > %s" % (bin, fasta, temp)
117 for fasta, temp in zip(fasta_files, temp_files)]
118
119 def clean_up(file_list):
120 for f in file_list:
121 if os.path.isfile(f):
122 os.remove(f)
123 try:
124 os.rmdir(tmp_dir)
125 except:
126 pass
127
128 if len(jobs) > 1 and num_threads > 1:
129 #A small "info" message for Galaxy to show the user.
130 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
131 cur_dir = os.path.abspath(os.curdir)
132 os.chdir(working_dir)
133 results = run_jobs(jobs, num_threads)
134 os.chdir(cur_dir)
135 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
136 error_level = results[cmd]
137 if error_level:
138 try:
139 output = open(temp).readline()
140 except IOError:
141 output = ""
142 clean_up(fasta_files + temp_files)
143 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
144 error_level)
145
146 del results
147 del jobs
148
149 out_handle = open(tabular_file, "w")
150 out_handle.write("#Identifier\tDescription\tPosition\tScore\tLikelihood\n")
151 queries = 0
152 for temp in temp_files:
153 data_handle = open(temp)
154 count = make_tabular(data_handle, out_handle)
155 data_handle.close()
156 if not count:
157 clean_up(fasta_files + temp_files)
158 stop_err("No output from promoter2")
159 queries += count
160 out_handle.close()
161
162 clean_up(fasta_files + temp_files)
163 print "Results for %i queries" % queries