Mercurial > repos > iss > eurl_vtec_wgs_pt
comparison scripts/ReMatCh/utils/restart_rematch.py @ 0:c6bab5103a14 draft
"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"
author | iss |
---|---|
date | Mon, 21 Mar 2022 15:23:09 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c6bab5103a14 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 # -*- coding: utf-8 -*- | |
4 | |
5 """ | |
6 restart_rematch.py - Restarts a ReMatCh run abruptly terminated | |
7 <https://github.com/B-UMMI/ReMatCh/> | |
8 | |
9 Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt> | |
10 | |
11 Last modified: October 15, 2018 | |
12 | |
13 This program is free software: you can redistribute it and/or modify | |
14 it under the terms of the GNU General Public License as published by | |
15 the Free Software Foundation, either version 3 of the License, or | |
16 (at your option) any later version. | |
17 | |
18 This program is distributed in the hope that it will be useful, | |
19 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
21 GNU General Public License for more details. | |
22 | |
23 You should have received a copy of the GNU General Public License | |
24 along with this program. If not, see <http://www.gnu.org/licenses/>. | |
25 """ | |
26 | |
27 import os | |
28 import argparse | |
29 import subprocess | |
30 import time | |
31 | |
32 | |
# Version of this restart script; main() reports it through the --version option
version = '0.1'
34 | |
35 | |
def run_rematch(args):
    """
    Restart an interrupted ReMatCh run for the samples that still need analysis.

    The newest sample report and run log found in the initial working directory
    are used to recover the samples already processed and the original ReMatCh
    command. The command is then launched again from the directory recorded in
    the log, with the new working directory, the number of threads and, when
    the initial run used a list of IDs or a taxon, a file listing the samples
    still to be analysed.

    :param args: parsed command line options; ``workdir``, ``initialWorkdir``,
                 ``threads`` and ``runFailedSamples`` are read
    :raises FileNotFoundError: if no sample report or run log file is found in
                               the initial working directory
    """
    print('\n' + '==========> Restarting ReMatCh <==========' + '\n')

    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    initial_workdir = os.path.abspath(args.initialWorkdir)

    files_required = get_files_required(initial_workdir)
    # Fail with an explicit message instead of a bare KeyError further down
    for required in ('sample_report', 'run'):
        if 'file' not in files_required[required]:
            raise FileNotFoundError('No ' + required + '.*.' + files_required[required]['extension'] +
                                    ' file found in ' + initial_workdir)

    samples_run = get_samples_run(files_required['sample_report']['file'])

    command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file'])

    samples_fastq = {}
    uses_ids_file = list_ids is not None or taxon

    if list_ids is not None:
        # Paths in the logged command are relative to the directory ReMatCh was launched from;
        # os.path.join() leaves an absolute list_ids path untouched
        total_samples = get_list_ids_from_file(os.path.join(initial_present_directory, list_ids))
    elif taxon:
        total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file'])
    else:
        samples_fastq = search_fastq_files(initial_workdir)
        total_samples = list(samples_fastq.keys())

    # Samples that must not be analysed again: only those reported as 'True' when failed samples
    # are to be repeated, otherwise every sample found in the report
    if args.runFailedSamples:
        samples_done = set(samples_run.get('True', []))
    else:
        samples_done = {sample for samples in samples_run.values() for sample in samples}

    # Plain difference (not symmetric difference): only known input samples can be scheduled.
    # dict.fromkeys() removes duplicates while keeping the input order
    samples_to_run = [sample for sample in dict.fromkeys(total_samples) if sample not in samples_done]

    print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by'
                                                                                    ' ReMatCh' + '\n')

    if uses_ids_file:
        samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
    else:
        set_samples_from_folders(samples_to_run, samples_fastq, workdir)

    command.extend(['-w', workdir])
    # Threads given to this script take precedence over the ones from the initial run
    command.extend(['-j', str(threads) if args.threads is None else str(args.threads)])
    if uses_ids_file:
        command.extend(['-l', samples_to_run_file])

    print('ReMatCh will start in 5 seconds...')
    time.sleep(5)

    # Run from the original directory so relative paths in the recovered command still resolve
    os.chdir(initial_present_directory)
    subprocess.call(command)
84 | |
85 | |
def write_samples_to_run(samples_to_run, workdir):
    """
    Write the samples still to be analysed, one per line, into the workdir.

    :param samples_to_run: iterable with the sample names
    :param workdir: directory where 'restart_rematch.samples_to_run.txt' is created
    :return: path to the file written
    """
    output_path = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
    with open(output_path, 'wt') as handle:
        handle.writelines(name + '\n' for name in samples_to_run)
    return output_path
92 | |
93 | |
def get_files_required(initial_workdir):
    """
    Locate the most recent ReMatCh output files needed to restart a run.

    Non-hidden regular files of the directory are matched against
    '<prefix>.*.<extension>' for each required prefix. When several files
    match the same prefix, the one with the latest modification time is kept
    (on ties, the first one in sorted name order).

    :param initial_workdir: directory where ReMatCh was initially running
    :return: dictionary keyed by prefix; each value holds 'extension' and, when
             a file was found, 'file' (its path) and 'modification' (its mtime)
    """
    wanted = {'sample_report': {'extension': 'tab'},
              'run': {'extension': 'log'},
              'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}

    for name in sorted(os.listdir(initial_workdir)):
        candidate = os.path.join(initial_workdir, name)
        if name.startswith('.') or not os.path.isfile(candidate):
            continue
        mtime = os.path.getmtime(candidate)
        for prefix, info in wanted.items():
            matches = name.startswith(prefix + '.') and name.endswith('.' + info['extension'])
            if not matches:
                continue
            # Keep the file already registered unless this one is strictly newer
            if 'file' in info and mtime <= info['modification']:
                continue
            info['file'] = candidate
            info['modification'] = mtime
    return wanted
114 | |
115 | |
def get_samples_run(sample_report_file):
    """
    Group the samples listed in a sample report by the value of its second column.

    Empty lines and lines starting with '#' are ignored. The remaining lines are
    split on tabs: the first field is the sample name and the second field is
    used as the grouping key (run_rematch() treats the key 'True' as the group
    of samples that do not need to be repeated).

    :param sample_report_file: path to the tab-separated sample report
    :return: dictionary mapping each second-column value to the list of sample
             names reported with it, in file order
    """
    samples_run = {}
    # Plain text mode already translates every newline style; the former 'U' flag
    # was removed in Python 3.11 and makes open() raise ValueError there
    with open(sample_report_file, 'rt') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0 and not line.startswith('#'):
                sample_info = line.split('\t')
                samples_run.setdefault(sample_info[1], []).append(sample_info[0])
    return samples_run
128 | |
129 | |
def get_rematch_command(log_file):
    """
    Recover the ReMatCh command and launch directory from a run log.

    The first non-empty line after a 'COMMAND:' line is taken as the command
    (split on single spaces) and the first non-empty line after a
    'PRESENT DIRECTORY:' line as the directory. The command is then rebuilt
    without the options that the restart sets itself:

    * '-l'/'--listIDs': removed, its value is returned separately
    * '-w'/'--workdir': removed together with its value
    * '-j'/'--threads': removed, its value is returned as an integer
    * '-t'/'--taxon': removed together with every following word up to the
      next option
    * '--mlst': the following words up to the next option are joined with
      spaces into a single argument
    * any other option is kept, together with the next token when that token
      does not start with '-'

    :param log_file: path to the ReMatCh run log
    :return: tuple (command list, list IDs file or None, taxon flag,
             threads or None, launch directory); when the command or the
             directory is not found in the log the command list is empty and
             the directory may be a boolean
    """
    variables = {'command': False, 'directory': False}
    # Plain text mode already translates every newline style; the former 'U' flag
    # was removed in Python 3.11 and makes open() raise ValueError there
    with open(log_file, 'rt') as reader:
        for line in reader:
            # Both values captured (no boolean placeholder left): nothing else to read
            if not any(isinstance(value, bool) for value in variables.values()):
                break
            line = line.splitlines()[0]
            if len(line) == 0:
                continue
            if line == 'COMMAND:':
                variables['command'] = True
            elif line == 'PRESENT DIRECTORY:':
                variables['directory'] = True
            elif variables['command'] is True:
                variables['command'] = line.split(' ')
            elif variables['directory'] is True:
                variables['directory'] = line

    command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None}
    if all(not isinstance(value, bool) for value in variables.values()):
        tokens = variables['command']
        total = len(tokens)
        counter = 0
        while counter < total:
            token = tokens[counter]
            if not token.startswith('-'):
                command['command'].append(token)
            elif token in ('-t', '--taxon'):
                command['taxon'] = True
                # Drop the taxon words; stop on the next option so it is parsed normally
                counter += 1
                while counter < total and not tokens[counter].startswith('-'):
                    counter += 1
                continue
            elif token in ('-l', '--listIDs'):
                command['listIDs'] = tokens[counter + 1]
                counter += 1
            elif token in ('-w', '--workdir'):
                counter += 1
            elif token in ('-j', '--threads'):
                command['threads'] = int(tokens[counter + 1])
                counter += 1
            elif token == '--mlst':
                # The species name lost its quotes in the log: gather its words again
                species = []
                counter += 1
                while counter < total and not tokens[counter].startswith('-'):
                    if len(tokens[counter]) > 0:
                        species.append(tokens[counter])
                    counter += 1
                command['command'].extend(['--mlst', ' '.join(species)])
                # counter already points to the next option: advancing once more
                # (as done previously) silently dropped that option
                continue
            else:
                command['command'].append(token)
                if counter + 1 < total and not tokens[counter + 1].startswith('-'):
                    command['command'].append(tokens[counter + 1])
                    counter += 1
            counter += 1
    return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory']
189 | |
190 | |
def get_taxon_run_ids(ids_list_seq_from_web_taxon_file):
    """
    Read the IDs stored in the first column of an 'IDs_list.seqFromWebTaxon' file.

    Empty lines and lines starting with '#' are ignored; the remaining lines
    are split on tabs and their first field is collected.

    :param ids_list_seq_from_web_taxon_file: path to the tab-separated file
    :return: list with the first field of every data line, in file order
    """
    list_ids = []
    # Plain text mode already translates every newline style; the former 'U' flag
    # was removed in Python 3.11 and makes open() raise ValueError there
    with open(ids_list_seq_from_web_taxon_file, 'rt') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0 and not line.startswith('#'):
                list_ids.append(line.split('\t')[0])
    return list_ids
201 | |
202 | |
def get_list_ids_from_file(list_ids_file):
    """
    Read a list of IDs stored one per line.

    :param list_ids_file: path to the text file with the IDs
    :return: list with every non-empty line of the file, in file order
    """
    list_ids = []
    # Plain text mode already translates every newline style; the former 'U' flag
    # was removed in Python 3.11 and makes open() raise ValueError there
    with open(list_ids_file, 'rt') as lines:
        for line in lines:
            line = line.splitlines()[0]
            if len(line) > 0:
                list_ids.append(line)
    return list_ids
211 | |
212 | |
def search_fastq_files(initial_workdir):
    """
    Find the FASTQ files of each sample folder inside the initial workdir.

    Every non-hidden sub-directory is treated as one sample. Its non-hidden
    regular files ending with '.fastq.gz' or '.fq.gz' are considered (in sorted
    name order, so the result does not depend on the directory listing order):

    * one file: it is reported alone
    * two or more files: the first mate naming convention ('_R1_001.f'/
      '_R2_001.f', then '_1.f'/'_2.f') matched by exactly two files gives the
      pair; when no convention gives exactly two files, a single file is
      reported (the first one not containing both mate markers of a convention)

    Folders without FASTQ files are left out of the result.

    :param initial_workdir: directory containing one folder per sample
    :return: dictionary mapping each folder name to the list of paths of its
             FASTQ files
    """
    files_extensions = ('.fastq.gz', '.fq.gz')
    pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]

    list_ids = {}
    directories = [d for d in os.listdir(initial_workdir) if
                   not d.startswith('.') and
                   os.path.isdir(os.path.join(initial_workdir, d, ''))]
    for directory_found in directories:
        directory_path = os.path.join(initial_workdir, directory_found, '')

        fastq_found = sorted(f for f in os.listdir(directory_path) if
                             not f.startswith('.') and
                             f.endswith(files_extensions) and
                             os.path.isfile(os.path.join(directory_path, f)))

        file_pair = []
        if len(fastq_found) == 1:
            file_pair = fastq_found
        elif len(fastq_found) >= 2:
            # Search pairs
            for pe_separation in pair_end_files_separation:
                file_pair = [fastq for fastq in fastq_found if
                             pe_separation[0] in fastq or pe_separation[1] in fastq]
                if len(file_pair) == 2:
                    break
                file_pair = []

            # Search single
            if len(file_pair) == 0:
                candidates = [fastq for pe_separation in pair_end_files_separation for fastq in fastq_found if
                              pe_separation[0] not in fastq or pe_separation[1] not in fastq]
                # Keep a one-element list: the previous 'file_pair[0]' produced a string, whose
                # characters were then joined to the directory as if they were file names
                file_pair = candidates[:1]

        if len(file_pair) >= 1:
            list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]

    return list_ids
262 | |
263 | |
def set_samples_from_folders(samples_to_run, samples_fastq, workdir):
    """
    Create one folder per sample in the workdir and link its FASTQ files there.

    An existing symbolic link with the same name is replaced; an existing
    regular file is left untouched.

    :param samples_to_run: iterable with the names of the samples to prepare
    :param samples_fastq: dictionary mapping sample names to lists of FASTQ paths
    :param workdir: directory where the sample folders are created
    """
    for sample_name in samples_to_run:
        target_dir = os.path.join(workdir, sample_name, '')
        if not os.path.isdir(target_dir):
            os.mkdir(target_dir)

        for fastq_path in samples_fastq[sample_name]:
            destination = os.path.join(target_dir, os.path.basename(fastq_path))
            # Links left by a previous attempt are always recreated
            if os.path.islink(destination):
                os.remove(destination)
            if os.path.isfile(destination):
                continue
            os.symlink(fastq_path, destination)
275 | |
276 | |
def main():
    """Parse the command line options and restart the ReMatCh run with them."""
    cli = argparse.ArgumentParser(
        prog='restart_rematch.py',
        description='Restart a ReMatCh run abruptly terminated',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument('--version', action='version', help='Version information',
                     version='%(prog)s v' + version)

    required_group = cli.add_argument_group('Required options')
    required_group.add_argument(
        '-i', '--initialWorkdir',
        type=str,
        required=True,
        metavar='/path/to/initial/workdir/directory/',
        help='Path to the directory where ReMatCh was running')

    general_group = cli.add_argument_group('General facultative options')
    general_group.add_argument(
        '-w', '--workdir',
        type=str,
        default='.',
        metavar='/path/to/workdir/directory/',
        help='Path to the directory where ReMatCh will run again')
    general_group.add_argument(
        '-j', '--threads',
        type=int,
        metavar='N',
        help='Number of threads to use instead of the ones set in initial ReMatCh run')
    general_group.add_argument(
        '--runFailedSamples',
        action='store_true',
        help='Will run ReMatCh for those samples missing, as well as for samples that'
             ' did not run successfully in initial ReMatCh run')

    run_rematch(cli.parse_args())
300 | |
301 | |
# Run the command line interface only when this file is executed as a script
if __name__ == "__main__":
    main()