comparison scripts/modules/utils.py @ 0:e37910d2c794 draft

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Mon, 20 Jan 2020 15:11:03 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e37910d2c794
1 import pickle
2 import traceback
3 import shlex
4 import subprocess
5 from threading import Timer
6 import shutil
7 import time
8 import functools
9 import os.path
10 import sys
11 import argparse
12
13
def start_logger(workdir):
    """Route ``sys.stdout`` through a ``Logger`` that also writes into *workdir*.

    Returns a ``(logfile, time_str)`` tuple: the path of the log file and the
    ``YYYYmmdd-HHMMSS`` timestamp that is embedded in its name.
    """
    stamp = time.strftime("%Y%m%d-%H%M%S")
    tee = Logger(workdir, stamp)
    # From this point on every print reaches both the terminal and the log.
    sys.stdout = tee
    return tee.getLogFile(), stamp
19
20
class Logger(object):
    """File-like object that duplicates everything written to it.

    Each message goes to the stream that was ``sys.stdout`` when the instance
    was created and to ``run.<time_str>.log`` inside *out_directory*.
    """

    def __init__(self, out_directory, time_str):
        self.logfile = os.path.join(out_directory, str('run.' + time_str + '.log'))
        # Remember the stream being replaced so output still reaches it.
        self.terminal = sys.stdout
        self.log = open(self.logfile, "w")

    def write(self, message):
        """Write *message* to the terminal stream and to the log file."""
        self.terminal.write(message)
        self.log.write(message)
        # Flush on every write so the log is up to date even if the run dies.
        self.log.flush()

    def flush(self):
        """Flush both underlying streams.

        Code calling ``sys.stdout.flush()`` expects pending terminal output to
        be pushed out; a no-op here would leave it sitting in the buffer of
        the wrapped stream.
        """
        self.terminal.flush()
        self.log.flush()

    def getLogFile(self):
        """Return the path of the log file."""
        return self.logfile
37
38
def _parse_version_numbers(version_string):
    """Return the leading integer of every dot-separated field as a list.

    Parsing stops at the first field that does not start with a digit, so
    '1.3.1_beta' gives [1, 3, 1] and 'unknown' gives [].
    """
    numbers = []
    for field in version_string.split('.'):
        digits = ''
        for character in field:
            if not character.isdigit():
                break
            digits += character
        if not digits:
            break
        numbers.append(int(digits))
    return numbers


def checkPrograms(programs_version_dictionary):
    """Check that the required programs are in PATH with acceptable versions.

    *programs_version_dictionary* maps a program name to a list
    ``[version_flag, comparison, required_version]``. When ``version_flag`` is
    None only the presence of the program is checked. ``comparison`` equal to
    ``'>='`` requests a minimum version; any other value requests an exact
    match of the reported version string. For ``.jar`` programs the resolved
    path is appended to the program's list.

    Returns a list of human-readable problems (empty when everything is fine).

    The minimum-version check compares the numeric fields as a whole
    (``3.10`` is newer than ``3.2``, ``4.0`` satisfies ``>= 3.2``). The
    previous field-by-field test was guarded by an ``isinstance(..., int)``
    check on strings and therefore never reported an outdated program. When
    either version cannot be parsed into numbers the check is skipped, as
    before.
    """
    print('\n' + 'Checking dependencies...')
    programs = programs_version_dictionary
    which_program = ['which', '']
    listMissings = []
    for program in programs:
        which_program[1] = program
        run_successfully, stdout, stderr = runCommandPopenCommunicate(which_program, False, None, False)
        if not run_successfully:
            listMissings.append(program + ' not found in PATH.')
            continue

        program_path = stdout.splitlines()[0]
        print(program_path)
        if programs[program][0] is None:
            print(program + ' (impossible to determine programme version) found at: ' + program_path)
            continue

        if program.endswith('.jar'):
            check_version = ['java', '-jar', program_path, programs[program][0]]
            programs[program].append(program_path)
        else:
            check_version = [program_path, programs[program][0]]
        run_successfully, stdout, stderr = runCommandPopenCommunicate(check_version, False, None, False)
        # Some programs report their version on stderr.
        if stdout == '':
            stdout = stderr
        first_line = stdout.splitlines()[0]
        if program == 'wget':
            version_line = first_line.split(' ', 3)[2]
        else:
            version_line = first_line.split(' ')[-1]
        for character in ['"', 'v', 'V', '+']:
            version_line = version_line.replace(character, '')
        print(program + ' (' + version_line + ') found')

        if programs[program][1] == '>=':
            found = _parse_version_numbers(version_line)
            required = _parse_version_numbers(programs[program][2])
            if found and required:
                # Pad with zeros so that '3.2' and '3.2.0' compare as equal.
                width = max(len(found), len(required))
                found += [0] * (width - len(found))
                required += [0] * (width - len(required))
                outdated = found < required
            else:
                outdated = False
        else:
            outdated = version_line != programs[program][2]

        if outdated:
            listMissings.append('It is required ' + program + ' with version ' + programs[program][1] + ' ' + programs[program][2])
    return listMissings
86
87
def requiredPrograms():
    """Exit with an error listing when a required external program is unusable.

    The only requirement declared here is ``rematch.py`` with a version
    ``>= 3.2``, queried through its ``--version`` flag.
    """
    required = {'rematch.py': ['--version', '>=', '3.2']}
    problems = checkPrograms(required)
    if problems:
        sys.exit('\n' + 'Errors:' + '\n' + '\n'.join(problems))
94
95
def general_information(logfile, version, outdir, time_str):
    """Print the run banner and environment details, then check dependencies.

    Prints the program start time, the log file location, the command line,
    the current directory and the version information, and finally calls
    ``requiredPrograms()``. *outdir* and *time_str* are accepted but not used
    by this function.

    Returns the absolute path of the running script.
    """
    script_path = os.path.abspath(sys.argv[0])
    present_directory = os.path.abspath(os.getcwd())
    command_line = sys.executable + ' ' + script_path + ' ' + ' '.join(sys.argv[1:])

    print('\n' + '==========> patho_typing <==========')
    print('\n' + 'Program start: ' + time.ctime())

    # Log file location, command line used and directory the program was
    # launched from, each under its own heading.
    sections = (
        ('LOGFILE:', logfile),
        ('COMMAND:', command_line),
        ('PRESENT DIRECTORY:', present_directory),
    )
    for heading, value in sections:
        print('\n' + heading)
        print(value)

    # Program version (plus git details when available).
    print('\n' + 'VERSION:')
    scriptVersionGit(version, present_directory, script_path)

    # Check external programs.
    requiredPrograms()

    return script_path
124
125
def setPATHvariable(doNotUseProvidedSoftware, script_path):
    """Optionally put the bundled tools first in ``PATH`` and print ``PATH``.

    Unless *doNotUseProvidedSoftware* is true, the ``src`` sub-folders for
    Bowtie2, SAMtools and BCFtools located next to *script_path* are prepended
    to the ``PATH`` environment variable of the current process.
    """
    current_path = os.environ['PATH']
    if not doNotUseProvidedSoftware:
        src_folder = os.path.join(os.path.dirname(script_path), 'src')
        bundled = [
            os.path.join(src_folder, 'bowtie2-2.2.9'),
            os.path.join(src_folder, 'samtools-1.3.1', 'bin'),
            os.path.join(src_folder, 'bcftools-1.3.1', 'bin'),
        ]
        os.environ['PATH'] = str(':'.join(bundled + [current_path]))

    # Show the PATH that will actually be used.
    print('\n' + 'PATH variable:')
    print(os.environ['PATH'])
140
141
def scriptVersionGit(version, directory, script_path):
    """Print *version* and, when possible, git details of the script's folder.

    The git commands are run from the directory containing *script_path*.
    Whatever happens, the working directory is switched to *directory*
    afterwards (the original left the process inside the script folder when a
    git call failed). Any ordinary error while querying git is reported as a
    harmless warning.
    """
    print('Version ' + version)

    git_commands = (
        ['git', 'log', '-1', '--date=local', '--pretty=format:"%h (%H) - Commit by %cn, %cd) : %s"'],
        ['git', 'remote', 'show', 'origin'],
    )
    try:
        os.chdir(os.path.dirname(script_path))
        for command in git_commands:
            run_successfully, stdout, stderr = runCommandPopenCommunicate(command, False, 15, False)
            print(stdout)
    except Exception:
        print('HARMLESS WARNING: git command possibly not found. The GitHub repository information will not be obtained.')
    finally:
        # Always go back, otherwise relative paths used later would break.
        os.chdir(directory)
156
157
def runTime(start_time):
    """Print the time elapsed since *start_time* and return it in seconds.

    *start_time* is a ``time.time()`` value. The elapsed time is printed as
    hours, minutes and seconds and returned rounded to two decimals.
    """
    elapsed = time.time() - start_time
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    pieces = ['Runtime :', str(hours), 'h:', str(minutes), 'm:', str(round(seconds, 2)), 's']
    print(''.join(pieces))
    return round(elapsed, 2)
165
166
def timer(function, name):
    """Wrap *function* so that its run time is printed and returned.

    The wrapper prints a start and an end banner containing *name*, and
    returns a list made of the elapsed seconds followed by the items that
    *function* returned (its result is converted with ``list``).
    """
    @functools.wraps(function)
    def wrapper(*args, **kwargs):
        print('\n' + 'RUNNING {0}\n'.format(name))
        began = time.time()

        # Materialise the result before stopping the clock.
        outcome = list(function(*args, **kwargs))

        elapsed = runTime(began)
        print('END {0}'.format(name))

        return [elapsed] + outcome
    return wrapper
181
182
def removeDirectory(directory):
    """Delete *directory* and its content; do nothing if it is not a directory."""
    if not os.path.isdir(directory):
        return
    shutil.rmtree(directory)
186
187
def saveVariableToPickle(variableToStore, pickleFile):
    """Serialise *variableToStore* with pickle into the file *pickleFile*."""
    with open(pickleFile, 'wb') as handle:
        pickle.Pickler(handle).dump(variableToStore)
191
192
def extractVariableFromPickle(pickleFile):
    """Load and return the object stored in the pickle file *pickleFile*.

    NOTE: unpickling can execute arbitrary code; only use this on files
    written by this program.
    """
    with open(pickleFile, 'rb') as handle:
        return pickle.load(handle)
197
198
def trace_unhandled_exceptions(func):
    """Decorate *func* so that anything it raises is printed, not propagated.

    The wrapper returns whatever *func* returns (the original wrapper always
    returned None, silently discarding the result). When *func* raises, the
    function name and the traceback are printed and None is returned.
    """
    @functools.wraps(func)
    def wrapped_func(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        # Everything is caught on purpose, exactly as the former bare
        # ``except:`` did, so the caller never sees an exception from func.
        except BaseException:
            print('Exception in ' + func.__name__)
            traceback.print_exc()
    return wrapped_func
208
209
def kill_subprocess_Popen(subprocess_Popen, command):
    """Report that *command* ran out of time and kill its process.

    *subprocess_Popen* is the object whose ``kill()`` method is called;
    *command* is only used for the printed message.
    """
    message = 'Command run out of time: ' + str(command)
    print(message)
    subprocess_Popen.kill()
213
214
def runCommandPopenCommunicate(command, shell_True, timeout_sec_None, print_comand_True):
    """Run *command* and capture its output.

    Parameters:
        command: a string, or a sequence of strings that is joined with
            spaces. Either way the text is then split with ``shlex.split``,
            so quoting inside the elements is interpreted.
        shell_True: run the command through the shell when true.
        timeout_sec_None: seconds after which the process is killed, or None
            for no limit.
        print_comand_True: print the command before running it.

    Returns ``(run_successfully, stdout, stderr)`` where *run_successfully* is
    True only when the exit code is 0 and the two outputs are returned exactly
    as produced by ``Popen.communicate``. On failure the command (unless it
    was already printed) and the captured outputs are printed.

    Whether the timeout fired is recorded by the timer callback itself. The
    former ``timer.isAlive()`` probe after ``cancel()`` was racy and that
    method no longer exists in Python 3.9+.
    """
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str

    run_successfully = False
    if not isinstance(command, string_types):
        command = ' '.join(command)
    command = shlex.split(command)

    if print_comand_True:
        print('Running: ' + ' '.join(command))

    if shell_True:
        command = ' '.join(command)
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    else:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    killed_by_timer = []
    if timeout_sec_None is None:
        stdout, stderr = proc.communicate()
    else:
        def _kill_on_timeout():
            # Runs in the timer thread, only when the time limit is reached.
            killed_by_timer.append(True)
            kill_subprocess_Popen(proc, command)

        watchdog = Timer(timeout_sec_None, _kill_on_timeout)
        watchdog.start()
        try:
            stdout, stderr = proc.communicate()
        finally:
            # Never leave a pending timer behind, even if communicate fails.
            watchdog.cancel()
    not_killed_by_timer = not killed_by_timer

    if proc.returncode == 0:
        run_successfully = True
    else:
        # The kill callback already printed the command after a timeout.
        if not print_comand_True and not_killed_by_timer:
            print('Running: ' + str(command))
        if len(stdout) > 0:
            print('STDOUT')
            print(stdout.decode("utf-8"))
        if len(stderr) > 0:
            print('STDERR')
            print(stderr.decode("utf-8"))
    return run_successfully, stdout, stderr
252
253
def required_length(tuple_length_options, argument_name):
    """Build an argparse action that restricts how many values are accepted.

    *tuple_length_options* holds the allowed numbers of values and
    *argument_name* is the option name shown in the error message. The
    returned class is meant to be passed as ``action=`` to ``add_argument``.

    Raises ``argparse.ArgumentTypeError`` from the action when the number of
    values is not allowed. The original built the message from
    ``self.argument_name``, which does not exist on the action, so an
    ``AttributeError`` was raised instead of the intended error.
    """
    class RequiredLength(argparse.Action):
        def __call__(self, parser, args, values, option_string=None):
            if len(values) not in tuple_length_options:
                msg = 'Option {argument_name} requires one of the following number of arguments: {tuple_length_options}'.format(
                    argument_name=argument_name, tuple_length_options=tuple_length_options)
                raise argparse.ArgumentTypeError(msg)
            setattr(args, self.dest, values)
    return RequiredLength
263
264
def get_sequence_information(fasta_file, length_extra_seq):
    """Read a fasta file into numbered sequence records.

    Returns ``(sequence_dict, headers)``. *sequence_dict* maps the 1-based
    position of each kept sequence to ``{'header', 'sequence', 'length'}``
    (header lower-cased, sequence upper-cased). *headers* maps each kept
    header to that position. A sequence is kept only when its length minus
    ``2 * length_extra_seq`` is positive; otherwise a message is printed.

    The program exits (``sys.exit``) when a header is duplicated, when text
    follows a blank line, or when sequence data appears before the first
    header (that last case used to fail with a ``KeyError``).
    """
    sequence_dict = {}
    headers = {}

    def _store(counter, record):
        """Keep *record* if it is longer than the two extra flanking regions."""
        if record['length'] - 2 * length_extra_seq > 0:
            sequence_dict[counter] = record
            headers[record['header'].lower()] = counter
        else:
            print(record['header'] + ' sequence ignored due to length <= 0')

    # 'U' asks Python 2 for universal newlines; Python 3 does that by default
    # and rejects the flag from version 3.11 on.
    mode = 'rtU' if sys.version_info[0] < 3 else 'rt'
    with open(fasta_file, mode) as reader:
        blank_line_found = False
        sequence_counter = 0
        record = None  # sequence currently being read
        for line in reader:
            line = line.splitlines()[0]
            if len(line) == 0:
                blank_line_found = True
                continue
            # Only trailing blank lines are tolerated.
            if blank_line_found:
                sys.exit('It was found a blank line between the fasta file above line ' + line)

            if line.startswith('>'):
                if record is not None:
                    _store(sequence_counter, record)

                if line[1:].lower() in headers:
                    sys.exit('Found duplicated sequence headers')

                sequence_counter += 1
                record = {'header': line[1:].lower(), 'sequence': '', 'length': 0}
            else:
                if record is None:
                    sys.exit('Found sequence data before the first fasta header: ' + line)
                record['sequence'] += line.upper()
                record['length'] += len(line)

        # The last sequence has no following header to trigger its storage.
        if record is not None:
            _store(sequence_counter, record)

    return sequence_dict, headers
307
308
def simplify_sequence_dict(sequence_dict):
    """Return the sequence records keyed by header instead of by number.

    Each value of *sequence_dict* must contain a ``'header'`` entry; the
    result maps that header to a copy of the record without the ``'header'``
    entry. The input records are left untouched (the original deleted
    ``'header'`` from the caller's own dictionaries).
    """
    simple_sequence_dict = {}
    for info in sequence_dict.values():
        simple_sequence_dict[info['header']] = {
            key: value for key, value in info.items() if key != 'header'
        }
    return simple_sequence_dict
315
316
def chunkstring(string, length):
    """Return a generator over consecutive pieces of *string*.

    Every piece has *length* characters except possibly the last one.
    """
    offsets = range(0, len(string), length)
    return (string[offset:offset + length] for offset in offsets)
319
320
def clean_headers_sequences(sequence_dict):
    """Replace problematic characters in the record headers with ``_``.

    The ``'header'`` entry of each record in *sequence_dict* is edited in
    place. Returns ``(sequence_dict, new_headers)`` where *new_headers* maps
    every (possibly cleaned) lower-cased header to its key in *sequence_dict*.
    A notice is printed when at least one header was modified.
    """
    problematic_characters = ["|", " ", ",", ".", "(", ")", "'", "/", ":"]

    headers_changed = False
    new_headers = {}
    for key, record in sequence_dict.items():
        original = record['header']
        cleaned = original
        for character in problematic_characters:
            cleaned = cleaned.replace(character, '_')
        # The text only differs when a problematic character was present.
        if cleaned != original:
            record['header'] = cleaned
            headers_changed = True
        new_headers[record['header'].lower()] = key

    if headers_changed:
        print('At least one of the those characters was found. Replacing those with _' + '\n')

    return sequence_dict, new_headers