eurl_vtec_wgs_pt: scripts/ReMatCh/utils/restart

comparison scripts/ReMatCh/utils/restart_rematch.py @ 0:c6bab5103a14 draft

"planemo upload commit 6abf3e299d82d07e6c3cf8642bdea80e96df64c3-dirty"

author	iss
date	Mon, 21 Mar 2022 15:23:09 +0000
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:c6bab5103a14
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+restart_rematch.py - Restarts a ReMatCh run abruptly terminated
+<https://github.com/B-UMMI/ReMatCh/>
+Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
+Last modified: October 15, 2018
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+import os
+import argparse
+import subprocess
+import time
# Version of this restart helper; reported by the ``--version`` command-line flag.
version = '0.1'
def run_rematch(args):
    """Relaunch ReMatCh for the samples the interrupted run did not complete.

    The newest sample report and run log found in ``args.initialWorkdir`` are
    used to recover the original command line and the samples already
    processed. The remaining samples are then handed to a new ReMatCh
    invocation that works inside ``args.workdir``.

    Args:
        args: Parsed command-line namespace providing ``workdir``,
            ``initialWorkdir``, ``threads`` and ``runFailedSamples``.

    Returns:
        The exit status returned by ``subprocess.call`` for the relaunched
        command.
    """
    print('\n' + '==========> Restarting ReMatCh <==========' + '\n')

    workdir = os.path.abspath(args.workdir)
    os.makedirs(workdir, exist_ok=True)

    initial_workdir = os.path.abspath(args.initialWorkdir)

    files_required = get_files_required(initial_workdir)
    samples_run = get_samples_run(files_required['sample_report']['file'])
    command, list_ids, taxon, threads, initial_present_directory = \
        get_rematch_command(files_required['run']['file'])

    # Work out the full set of samples the initial run was asked to process.
    samples_fastq = {}
    if list_ids is not None:
        total_samples = get_list_ids_from_file(list_ids)
    elif taxon:
        total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file'])
    else:
        samples_fastq = search_fastq_files(initial_workdir)
        total_samples = list(samples_fastq.keys())

    # Samples that must NOT be repeated: only the successful ones ('True' status)
    # when failed samples are to be retried, otherwise every reported sample.
    if args.runFailedSamples:
        samples_done = set(samples_run.get('True', []))
    else:
        samples_done = {sample for samples in samples_run.values() for sample in samples}

    # Plain difference (not symmetric difference): IDs that only appear in the
    # report must never be added to the run list. dict.fromkeys() removes
    # duplicates while keeping the original sample order.
    samples_to_run = [sample for sample in dict.fromkeys(total_samples) if sample not in samples_done]

    print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by'
          ' ReMatCh' + '\n')

    uses_ids_file = list_ids is not None or taxon
    samples_to_run_file = None
    if uses_ids_file:
        samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
    else:
        set_samples_from_folders(samples_to_run, samples_fastq, workdir)

    command.extend(['-w', workdir])

    # The command-line value overrides the one recorded in the log; when neither
    # is available the option is omitted instead of passing the string 'None'.
    threads_to_use = threads if args.threads is None else args.threads
    if threads_to_use is not None:
        command.extend(['-j', str(threads_to_use)])

    if uses_ids_file:
        command.extend(['-l', samples_to_run_file])

    print('ReMatCh will start in 5 seconds...')
    time.sleep(5)

    # Run from the directory recorded in the log so that relative paths in the
    # recovered command keep pointing to the same files.
    os.chdir(initial_present_directory)
    return subprocess.call(command)
def write_samples_to_run(samples_to_run, workdir):
    """Write the pending sample IDs, one per line, into ``workdir``.

    Args:
        samples_to_run: Iterable of sample ID strings.
        workdir: Existing directory that receives the file.

    Returns:
        Path of the written file
        (``<workdir>/restart_rematch.samples_to_run.txt``).
    """
    samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
    with open(samples_to_run_file, 'wt') as writer:
        writer.writelines(sample + '\n' for sample in samples_to_run)
    return samples_to_run_file
def get_files_required(initial_workdir):
    """Locate the newest ReMatCh output files needed to restart a run.

    Regular, non-hidden files directly inside ``initial_workdir`` are matched
    against three patterns: ``sample_report.*.tab``, ``run.*.log`` and
    ``IDs_list.seqFromWebTaxon.*.tab``. When several files match the same
    pattern, the one with the most recent modification time is kept.

    Args:
        initial_workdir: Directory where ReMatCh was originally running.

    Returns:
        A dict keyed by file prefix. Every value holds ``'extension'``; the
        keys ``'file'`` (path) and ``'modification'`` (mtime) are present only
        when a matching file was found.
    """
    files_required = {'sample_report': {'extension': 'tab'},
                      'run': {'extension': 'log'},
                      'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}

    for file_found in sorted(os.listdir(initial_workdir)):
        if file_found.startswith('.'):
            continue
        file_path = os.path.join(initial_workdir, file_found)
        if not os.path.isfile(file_path):
            continue

        file_modification = os.path.getmtime(file_path)
        for prefix, values in files_required.items():
            matches = file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension'])
            if not matches:
                continue
            # Keep the first match, then replace it only with strictly newer files.
            if 'file' not in values or file_modification > values['modification']:
                values['file'] = file_path
                values['modification'] = file_modification

    return files_required
def get_samples_run(sample_report_file):
    """Group the sample IDs of a ReMatCh sample report by their status column.

    Empty lines and lines starting with ``#`` are ignored. For the remaining
    tab-separated lines, the first column is taken as the sample ID and the
    second column as the grouping key (``run_rematch`` looks up the key
    ``'True'`` for samples that ran successfully).

    Args:
        sample_report_file: Path to the tab-separated sample report.

    Returns:
        Dict mapping each second-column value to the list of sample IDs that
        carry it, in file order.
    """
    samples_run = {}
    # Plain 'rt' already gives universal-newline handling; the legacy 'U' flag
    # was removed in Python 3.11 and makes open() raise ValueError there.
    with open(sample_report_file, 'rt') as reader:
        for line in reader:
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'):
                continue
            sample_info = line.split('\t')
            if len(sample_info) < 2:
                # No status column: the sample cannot be classified, so it is
                # left out (it will then be considered as not yet run).
                continue
            samples_run.setdefault(sample_info[1], []).append(sample_info[0])
    return samples_run
def get_rematch_command(log_file):
    """Recover the original ReMatCh command line from a run log.

    The log is scanned for a ``COMMAND:`` line and a ``PRESENT DIRECTORY:``
    line; the first non-empty line after each header is taken as its value.
    The command is split on single spaces and rebuilt without the options
    that the restart replaces:

    * ``-t/--taxon`` and the words following it are dropped (taxon flag set);
    * ``-l/--listIDs`` is dropped and its value returned separately;
    * ``-w/--workdir`` is dropped together with its value;
    * ``-j/--threads`` is dropped and its value returned as an int;
    * the words following ``--mlst`` are re-joined into a single argument.

    Args:
        log_file: Path to the ReMatCh run log.

    Returns:
        A 5-tuple ``(command, list_ids, taxon, threads, directory)``.
        ``command`` is a list of arguments (empty when the log lacks either
        section), ``list_ids`` is a string or ``None``, ``taxon`` is a bool,
        ``threads`` is an int or ``None`` and ``directory`` is the recorded
        directory (``False`` when its header was never found, ``True`` when
        the header was found without a value).
    """
    # Each entry goes False -> True (header seen) -> value (next non-empty line).
    variables = {'command': False, 'directory': False}

    # 'rt' keeps universal newlines; the 'U' flag no longer exists in Python 3.11+.
    with open(log_file, 'rt') as reader:
        for line in reader:
            if not any(isinstance(value, bool) for value in variables.values()):
                # Both values were collected: nothing else is needed from the log.
                break
            line = line.rstrip('\r\n')
            if not line:
                continue
            if line == 'COMMAND:':
                variables['command'] = True
            elif line == 'PRESENT DIRECTORY:':
                variables['directory'] = True
            elif variables['command'] is True:
                variables['command'] = line.split(' ')
            elif variables['directory'] is True:
                variables['directory'] = line

    rebuilt = []
    list_ids = None
    taxon = False
    threads = None

    if not any(isinstance(value, bool) for value in variables.values()):
        tokens = variables['command']
        total = len(tokens)
        counter = 0
        while counter < total:
            token = tokens[counter]
            has_value = counter + 1 < total

            if not token.startswith('-'):
                # Executable, script path or extra values of multi-value options.
                rebuilt.append(token)
                counter += 1
            elif token in ('-t', '--taxon'):
                taxon = True
                counter += 1
                # A taxon name may span several words: skip up to the next option.
                while counter < total and not tokens[counter].startswith('-'):
                    counter += 1
            elif token in ('-l', '--listIDs'):
                if has_value:
                    list_ids = tokens[counter + 1]
                counter += 2
            elif token in ('-w', '--workdir'):
                counter += 2
            elif token in ('-j', '--threads'):
                if has_value:
                    threads = int(tokens[counter + 1])
                counter += 2
            elif token == '--mlst':
                counter += 1
                species = []
                while counter < total and not tokens[counter].startswith('-'):
                    if tokens[counter]:
                        species.append(tokens[counter])
                    counter += 1
                rebuilt.extend(['--mlst', ' '.join(species)])
                # counter already points at the next option: it must not be
                # advanced again, otherwise that option would be lost.
            else:
                rebuilt.append(token)
                counter += 1
                if counter < total and not tokens[counter].startswith('-'):
                    rebuilt.append(tokens[counter])
                    counter += 1

    return rebuilt, list_ids, taxon, threads, variables['directory']
def get_taxon_run_ids(ids_list_seq_from_web_taxon_file):
    """Read the IDs stored in the first column of a tab-separated file.

    Empty lines and lines starting with ``#`` are skipped.

    Args:
        ids_list_seq_from_web_taxon_file: Path to the tab-separated file
            (``run_rematch`` passes the ``IDs_list.seqFromWebTaxon`` file).

    Returns:
        List with the first field of every remaining line, in file order.
    """
    list_ids = []
    # 'rt' keeps universal newlines; the 'U' flag no longer exists in Python 3.11+.
    with open(ids_list_seq_from_web_taxon_file, 'rt') as reader:
        for line in reader:
            line = line.rstrip('\r\n')
            if line and not line.startswith('#'):
                list_ids.append(line.split('\t')[0])
    return list_ids
def get_list_ids_from_file(list_ids_file):
    """Read sample IDs from a plain-text file with one ID per line.

    Args:
        list_ids_file: Path to the file holding the IDs.

    Returns:
        List with every non-empty line of the file (line terminators removed),
        in file order.
    """
    # 'rt' keeps universal newlines; the 'U' flag no longer exists in Python 3.11+.
    with open(list_ids_file, 'rt') as lines:
        stripped = (line.rstrip('\r\n') for line in lines)
        return [line for line in stripped if line]
def search_fastq_files(initial_workdir):
    """Find the fastq files of every sample folder inside ``initial_workdir``.

    Each non-hidden sub-directory is treated as one sample. Within it, the
    non-hidden regular files ending in ``.fastq.gz`` or ``.fq.gz`` are
    considered:

    * one file: it is used as is;
    * two or more files: the first paired-end naming scheme
      (``_R1_001.f``/``_R2_001.f``, then ``_1.f``/``_2.f``) matched by exactly
      two files selects that pair; when no scheme yields exactly two files, a
      single file (the first in sorted order) is used.

    Directories without fastq files are left out.

    Args:
        initial_workdir: Directory holding one sub-directory per sample.

    Returns:
        Dict mapping each sample directory name to the list of selected fastq
        file paths.
    """
    files_extensions = ('.fastq.gz', '.fq.gz')
    pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]

    list_ids = {}
    # Listings are sorted so the selection does not depend on os.listdir() order.
    for directory_found in sorted(os.listdir(initial_workdir)):
        directory_path = os.path.join(initial_workdir, directory_found, '')
        if directory_found.startswith('.') or not os.path.isdir(directory_path):
            continue

        fastq_found = sorted(f for f in os.listdir(directory_path)
                             if not f.startswith('.') and
                             f.endswith(files_extensions) and
                             os.path.isfile(os.path.join(directory_path, f)))

        if len(fastq_found) <= 1:
            selected = fastq_found
        else:
            selected = []
            # Search pairs
            for first_mate, second_mate in pair_end_files_separation:
                candidates = [fastq for fastq in fastq_found if first_mate in fastq or second_mate in fastq]
                if len(candidates) == 2:
                    selected = candidates
                    break
            # Search single: fall back to one file. It is kept inside a list so
            # that the path is built from the file name and not from each of
            # its characters.
            if not selected:
                selected = fastq_found[:1]

        if selected:
            list_ids[directory_found] = [os.path.join(directory_path, f) for f in selected]

    return list_ids
def set_samples_from_folders(samples_to_run, samples_fastq, workdir):
    """Create one folder per sample in ``workdir`` with links to its fastq files.

    An existing symbolic link with the same name is replaced; anything else
    already present under that name is left untouched.

    Args:
        samples_to_run: Iterable of sample names to prepare.
        samples_fastq: Dict mapping each sample name to the list of its fastq
            file paths.
        workdir: Directory in which the sample folders are created.

    Raises:
        KeyError: If a sample in ``samples_to_run`` is missing from
            ``samples_fastq``.
    """
    for sample in samples_to_run:
        sample_dir = os.path.join(workdir, sample, '')
        # makedirs also copes with a workdir that does not exist yet.
        os.makedirs(sample_dir, exist_ok=True)

        for file_found in samples_fastq[sample]:
            link_path = os.path.join(sample_dir, os.path.basename(file_found))
            # Refresh stale links (possibly broken ones) left by a previous attempt.
            if os.path.islink(link_path):
                os.remove(link_path)
            # lexists() also detects directories, which os.symlink cannot overwrite.
            if not os.path.lexists(link_path):
                os.symlink(file_found, link_path)
def main():
    """Parse the command-line options and restart the ReMatCh run."""
    parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/',
                                 help='Path to the directory where ReMatCh was running', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
                                         help='Path to the directory where ReMatCh will run again', required=False,
                                         default='.')
    # No default on purpose: None means "reuse the thread count found in the log".
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N',
                                         help='Number of threads to use instead of the ones set in initial ReMatCh run',
                                         required=False)
    parser_optional_general.add_argument('--runFailedSamples', action='store_true',
                                         help='Will run ReMatCh for those samples missing, as well as for samples that'
                                              ' did not run successfully in initial ReMatCh run')

    args = parser.parse_args()

    run_rematch(args)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Mercurial > repos > iss > eurl_vtec_wgs_pt

comparison scripts/ReMatCh/utils/restart_rematch.py @ 0:c6bab5103a14 draft