comparison tools/protein_analysis/promoter2.py @ 20:a19b3ded8f33 draft

v0.2.11 Job splitting fast-fail; RXLR tools support HMMER2 from BioConda; Capture more version information; misc internal changes
author peterjc
date Thu, 21 Sep 2017 11:35:20 -0400
parents f3ecd80850e2
children 238eae32483c
comparison
equal deleted inserted replaced
19:f3ecd80850e2 20:a19b3ded8f33
16 tab separated table. 16 tab separated table.
17 17
18 Additionally, in order to take advantage of multiple cores the input FASTA 18 Additionally, in order to take advantage of multiple cores the input FASTA
19 file is broken into chunks and multiple copies of promoter run at once. 19 file is broken into chunks and multiple copies of promoter run at once.
20 This can be used in combination with the job-splitting available in Galaxy. 20 This can be used in combination with the job-splitting available in Galaxy.
21
22 Note that rewriting the FASTA input file allows us to avoid a bug in 21 Note that rewriting the FASTA input file allows us to avoid a bug in
23 promoter 2 with long descriptions in the FASTA header line (over 200 22 promoter 2 with long descriptions in the FASTA header line (over 200
24 characters) which produces stray fragments of the description in the 23 characters) which produces stray fragments of the description in the
25 output file, making parsing non-trivial. 24 output file, making parsing non-trivial.
26 25
27 TODO - Automatically extract the sequence containing a promoter prediction? 26 TODO - Automatically extract the sequence containing a promoter prediction?
28 """ 27 """
28
29 from __future__ import print_function
30
31 import commands
32 import os
29 import sys 33 import sys
30 import os
31 import commands
32 import tempfile 34 import tempfile
33 from seq_analysis_utils import split_fasta, run_jobs, thread_count 35
36 from seq_analysis_utils import run_jobs, split_fasta, thread_count
34 37
35 FASTA_CHUNK = 500 38 FASTA_CHUNK = 500
36 39
37 if "-v" in sys.argv or "--version" in sys.argv: 40 if "-v" in sys.argv or "--version" in sys.argv:
38 sys.exit(os.system("promoter -V")) 41 sys.exit(os.system("promoter -V"))
47 50
48 tmp_dir = tempfile.mkdtemp() 51 tmp_dir = tempfile.mkdtemp()
49 52
50 53
51 def get_path_and_binary(): 54 def get_path_and_binary():
55 """Determine path and binary names for promoter tool."""
52 platform = commands.getoutput("uname") # e.g. Linux 56 platform = commands.getoutput("uname") # e.g. Linux
53 shell_script = commands.getoutput("which promoter") 57 shell_script = commands.getoutput("which promoter")
54 if not os.path.isfile(shell_script): 58 if not os.path.isfile(shell_script):
55 sys.exit("ERROR: Missing promoter executable shell script") 59 sys.exit("ERROR: Missing promoter executable shell script")
56 path = None 60 path = None
72 def make_tabular(raw_handle, out_handle): 76 def make_tabular(raw_handle, out_handle):
73 """Parse text output into tabular, return query count.""" 77 """Parse text output into tabular, return query count."""
74 identifier = None 78 identifier = None
75 queries = 0 79 queries = 0
76 for line in raw_handle: 80 for line in raw_handle:
77 # print repr(line) 81 # print(repr(line))
78 if not line.strip() or line == "Promoter prediction:\n": 82 if not line.strip() or line == "Promoter prediction:\n":
79 pass 83 pass
80 elif line[0] != " ": 84 elif line[0] != " ":
81 identifier = line.strip().replace("\t", " ").split(None, 1)[0] 85 identifier = line.strip().replace("\t", " ").split(None, 1)[0]
82 queries += 1 86 queries += 1
87 assert identifier 91 assert identifier
88 else: 92 else:
89 try: 93 try:
90 position, score, likelihood = line.strip().split(None, 2) 94 position, score, likelihood = line.strip().split(None, 2)
91 except ValueError: 95 except ValueError:
92 print "WARNING: Problem with line: %r" % line 96 print("WARNING: Problem with line: %r" % line)
93 continue 97 continue
94 # sys.exit("ERROR: Problem with line: %r" % line) 98 # sys.exit("ERROR: Problem with line: %r" % line)
95 if likelihood not in ["ignored", 99 if likelihood not in ["ignored",
96 "Marginal prediction", 100 "Marginal prediction",
97 "Medium likely prediction", 101 "Medium likely prediction",
98 "Highly likely prediction"]: 102 "Highly likely prediction"]:
99 sys.exit("ERROR: Problem with line: %r" % line) 103 sys.exit("ERROR: Problem with line: %r" % line)
100 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) 104 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
101 return queries 105 return queries
106
102 107
103 working_dir, bin = get_path_and_binary() 108 working_dir, bin = get_path_and_binary()
104 109
105 if not os.path.isfile(fasta_file): 110 if not os.path.isfile(fasta_file):
106 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file) 111 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file)
122 try: 127 try:
123 os.rmdir(tmp_dir) 128 os.rmdir(tmp_dir)
124 except Exception: 129 except Exception:
125 pass 130 pass
126 131
132
127 if len(jobs) > 1 and num_threads > 1: 133 if len(jobs) > 1 and num_threads > 1:
128 # A small "info" message for Galaxy to show the user. 134 # A small "info" message for Galaxy to show the user.
129 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 135 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)))
130 cur_dir = os.path.abspath(os.curdir) 136 cur_dir = os.path.abspath(os.curdir)
131 os.chdir(working_dir) 137 os.chdir(working_dir)
132 results = run_jobs(jobs, num_threads) 138 results = run_jobs(jobs, num_threads)
133 os.chdir(cur_dir) 139 os.chdir(cur_dir)
134 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 140 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
157 sys.exit("No output from promoter2") 163 sys.exit("No output from promoter2")
158 queries += count 164 queries += count
159 out_handle.close() 165 out_handle.close()
160 166
161 clean_up(fasta_files + temp_files) 167 clean_up(fasta_files + temp_files)
162 print "Results for %i queries" % queries 168 print("Results for %i queries" % queries)