view scripts/ReMatCh/utils/restart_rematch.py @ 3:0cbed1c0a762 draft default tip

planemo upload commit 15239f1674081ab51ab8dd75a9a40cf1bfaa93e8
author cstrittmatter
date Tue, 28 Jan 2020 10:42:31 -0500
parents 965517909457
children
line wrap: on
line source

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""
restart_rematch.py - Restarts a ReMatCh run abruptly terminated
<https://github.com/B-UMMI/ReMatCh/>

Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>

Last modified: October 15, 2018

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import os
import argparse
import subprocess
import time


version = '0.1'


def run_rematch(args):
    print('\n' + '==========> Restarting ReMatCh <==========' + '\n')

    workdir = os.path.abspath(args.workdir)
    if not os.path.isdir(workdir):
        os.makedirs(workdir)

    initial_workdir = os.path.abspath(args.initialWorkdir)

    files_required = get_files_required(initial_workdir)

    samples_run = get_samples_run(files_required['sample_report']['file'])

    command, list_ids, taxon, threads, initial_present_directory = get_rematch_command(files_required['run']['file'])

    samples_fastq = {}

    if list_ids is not None:
        total_samples = get_list_ids_from_file(list_ids)
    elif taxon:
        total_samples = get_taxon_run_ids(files_required['IDs_list.seqFromWebTaxon']['file'])
    else:
        samples_fastq = search_fastq_files(initial_workdir)
        total_samples = list(samples_fastq.keys())

    samples_to_run = list(set(total_samples).symmetric_difference(set(sum(list(samples_run.values()), []) if
                                                                      not args.runFailedSamples else
                                                                      samples_run['True'] if
                                                                      'True' in samples_run else [''])))

    print(str(len(samples_to_run)) + ' samples out of ' + str(len(total_samples)) + ' will be analysed by'
                                                                                    ' ReMatCh' + '\n')

    if list_ids is not None or taxon:
        samples_to_run_file = write_samples_to_run(samples_to_run, workdir)
    else:
        set_samples_from_folders(samples_to_run, samples_fastq, workdir)

    command.extend(['-w', workdir])
    command.extend(['-j', str(threads) if args.threads is None else str(args.threads)])
    if list_ids is not None or taxon:
        command.extend(['-l', samples_to_run_file])

    print('ReMatCh will start in 5 seconds...')
    time.sleep(5)

    os.chdir(initial_present_directory)
    subprocess.call(command)


def write_samples_to_run(samples_to_run, workdir):
    samples_to_run_file = os.path.join(workdir, 'restart_rematch.samples_to_run.txt')
    with open(samples_to_run_file, 'wt') as writer:
        for sample in samples_to_run:
            writer.write(sample + '\n')
    return samples_to_run_file


def get_files_required(initial_workdir):
    files_required = {'sample_report': {'extension': 'tab'},
                      'run': {'extension': 'log'},
                      'IDs_list.seqFromWebTaxon': {'extension': 'tab'}}
    files = sorted([f for f in os.listdir(initial_workdir) if
                    not f.startswith('.') and
                    os.path.isfile(os.path.join(initial_workdir, f))])
    for file_found in files:
        file_path = os.path.join(initial_workdir, file_found)
        file_modification = os.path.getmtime(file_path)
        for prefix, values in list(files_required.items()):
            if file_found.startswith(prefix + '.') and file_found.endswith('.' + values['extension']):
                if 'file' not in values:
                    files_required[prefix]['file'] = file_path
                    files_required[prefix]['modification'] = file_modification
                else:
                    if file_modification > files_required[prefix]['modification']:
                        files_required[prefix]['file'] = file_path
                        files_required[prefix]['modification'] = file_modification
    return files_required


def get_samples_run(sample_report_file):
    samples_run = {}
    with open(sample_report_file, 'rtU') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0:
                if not line.startswith('#'):
                    sample_info = line.split('\t')
                    if sample_info[1] not in samples_run:
                        samples_run[sample_info[1]] = []
                    samples_run[sample_info[1]].append(sample_info[0])
    return samples_run


def get_rematch_command(log_file):
    variables = {'command': False, 'directory': False}
    with open(log_file, 'rtU') as reader:
        for line in reader:
            if any([isinstance(value, bool) for value in list(variables.values())]):
                line = line.splitlines()[0]
                if len(line) > 0:
                    if line == 'COMMAND:':
                        variables['command'] = True
                    elif line == 'PRESENT DIRECTORY:':
                        variables['directory'] = True
                    else:
                        if variables['command'] is True:
                            variables['command'] = line.split(' ')
                        elif variables['directory'] is True:
                            variables['directory'] = line
            else:
                break
    command = {'command': [], 'listIDs': None, 'taxon': False, 'threads': None}
    if all([not isinstance(value, bool) for value in list(variables.values())]):
        counter = 0
        while counter < len(variables['command']):
            if variables['command'][counter].startswith('-'):
                if variables['command'][counter] not in ('-t', '--taxon'):
                    if variables['command'][counter] in ('-l', '--listIDs'):
                        command['listIDs'] = variables['command'][counter + 1]
                        counter += 1
                    elif variables['command'][counter] in ('-w', '--workdir'):
                        counter += 1
                    elif variables['command'][counter] in ('-j', '--threads'):
                        command['threads'] = int(variables['command'][counter + 1])
                        counter += 1
                    elif variables['command'][counter] == '--mlst':
                        species = []
                        counter += 1
                        while counter < len(variables['command']) and not variables['command'][counter].startswith('-'):
                            if len(variables['command'][counter]) > 0:
                                species.append(variables['command'][counter])
                            counter += 1
                        command['command'].extend(['--mlst', ' '.join(species)])
                    else:
                        command['command'].append(variables['command'][counter])
                        if counter + 1 < len(variables['command']) and \
                                not variables['command'][counter + 1].startswith('-'):
                            command['command'].append(variables['command'][counter + 1])
                            counter += 1
                else:
                    command['taxon'] = True
                    for i in range(counter, len(variables['command'])):
                        if i + 1 < len(variables['command']):
                            if variables['command'][i + 1].startswith('-'):
                                counter = i
                                break
                        else:
                            counter = i
            else:
                command['command'].append(variables['command'][counter])
            counter += 1
    return command['command'], command['listIDs'], command['taxon'], command['threads'], variables['directory']


def get_taxon_run_ids(ids_list_seq_from_web_taxon_file):
    list_ids = []
    with open(ids_list_seq_from_web_taxon_file, 'rtU') as reader:
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0:
                if not line.startswith('#'):
                    line = line.split('\t')
                    list_ids.append(line[0])
    return list_ids


def get_list_ids_from_file(list_ids_file):
    list_ids = []
    with open(list_ids_file, 'rtU') as lines:
        for line in lines:
            line = line.splitlines()[0]
            if len(line) > 0:
                list_ids.append(line)
    return list_ids


def search_fastq_files(initial_workdir):
    files_extensions = ['.fastq.gz', '.fq.gz']
    pair_end_files_separation = [['_R1_001.f', '_R2_001.f'], ['_1.f', '_2.f']]

    list_ids = {}
    directories = [d for d in os.listdir(initial_workdir) if
                   not d.startswith('.') and
                   os.path.isdir(os.path.join(initial_workdir, d, ''))]
    for directory_found in directories:
        directory_path = os.path.join(initial_workdir, directory_found, '')

        fastq_found = []
        files = [f for f in os.listdir(directory_path) if
                 not f.startswith('.') and
                 os.path.isfile(os.path.join(directory_path, f))]
        for file_found in files:
            if file_found.endswith(tuple(files_extensions)):
                fastq_found.append(file_found)

        if len(fastq_found) == 1:
            list_ids[directory_found] = [os.path.join(directory_path, f) for f in fastq_found]
        elif len(fastq_found) >= 2:
            file_pair = []

            # Search pairs
            for pe_separation in pair_end_files_separation:
                for fastq in fastq_found:
                    if pe_separation[0] in fastq or pe_separation[1] in fastq:
                        file_pair.append(fastq)

                if len(file_pair) == 2:
                    break
                else:
                    file_pair = []

            # Search single
            if len(file_pair) == 0:
                for pe_separation in pair_end_files_separation:
                    for fastq in fastq_found:
                        if pe_separation[0] not in fastq or pe_separation[1] not in fastq:
                            file_pair.append(fastq)

                if len(file_pair) >= 1:
                    file_pair = file_pair[0]

            if len(file_pair) >= 1:
                list_ids[directory_found] = [os.path.join(directory_path, f) for f in file_pair]

    return list_ids


def set_samples_from_folders(samples_to_run, samples_fastq, workdir):
    for sample in samples_to_run:
        sample_dir = os.path.join(workdir, sample, '')
        if not os.path.isdir(sample_dir):
            os.mkdir(sample_dir)
        for file_found in samples_fastq[sample]:
            link_path = os.path.join(sample_dir, os.path.basename(file_found))
            if os.path.islink(link_path):
                os.remove(link_path)
            if not os.path.isfile(link_path):
                os.symlink(file_found, link_path)


def main():
    parser = argparse.ArgumentParser(prog='restart_rematch.py', description='Restart a ReMatCh run abruptly terminated',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-i', '--initialWorkdir', type=str, metavar='/path/to/initial/workdir/directory/',
                                 help='Path to the directory where ReMatCh was running', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-w', '--workdir', type=str, metavar='/path/to/workdir/directory/',
                                         help='Path to the directory where ReMatCh will run again', required=False,
                                         default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N',
                                         help='Number of threads to use instead of the ones set in initial ReMatCh run',
                                         required=False)
    parser_optional_general.add_argument('--runFailedSamples', action='store_true',
                                         help='Will run ReMatCh for those samples missing, as well as for samples that'
                                              ' did not run successfully in initial ReMatCh run')

    args = parser.parse_args()

    run_rematch(args)


if __name__ == "__main__":
    main()