view scripts/patho_typing.py @ 6:20ff3dca457f draft default tip

planemo upload commit 6857c749c21f580c828aba3543e294b69d32b662
author iss
date Mon, 23 Oct 2023 11:45:36 +0000
parents c6bab5103a14
children
line wrap: on
line source

#!/usr/bin/env python3

# -*- coding: utf-8 -*-

"""
patho_typing.py - In silico pathogenic typing directly from raw
Illumina reads
<https://github.com/B-UMMI/patho_typing/>

Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>

Last modified: October 15, 2018

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

2020-01-13 ISS
In order to use the module within the EURL_WGS_Typer pipeline with a 
different virulence database for E coli, mapping against the 
typing_rules.tab was deactivated.
"""

import argparse
import os
import time
import sys
from pkg_resources import resource_filename

try:
    from __init__ import __version__

    import modules.utils as utils
    import modules.run_rematch as run_rematch
    import modules.typing as typing
except ImportError:
    from pathotyping.__init__ import __version__

    from pathotyping.modules import utils as utils
    from pathotyping.modules import run_rematch as run_rematch
    from pathotyping.modules import typing as typing


def set_reference(species, outdir, script_path, trueCoverage):
    reference_file = None
    trueCoverage_file = None
    trueCoverage_sequences = None
    trueCoverage_headers = None
    trueCoverage_config = None
    typing_file = None
    typing_sequences = None
    typing_headers = None
    typing_rules = None
    typing_config = None

    species_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf',
                                  '_'.join([s.lower() for s in species]), '')

    if os.path.isdir(species_folder):
        typing_rules = os.path.join(species_folder, 'typing_rules.tab')
        typing_file = os.path.join(species_folder, 'typing.fasta')
        typing_sequences, ignore = utils.get_sequence_information(typing_file, 0)
        typing_sequences, typing_headers = utils.clean_headers_sequences(typing_sequences)
        typing_sequences = utils.simplify_sequence_dict(typing_sequences)
        typing_config = os.path.join(species_folder, 'typing.config')
        if trueCoverage:
            if os.path.isfile(os.path.join(species_folder, 'trueCoverage.fasta')):
                trueCoverage_file = os.path.join(species_folder, 'trueCoverage.fasta')
                trueCoverage_sequences, ignore = utils.get_sequence_information(trueCoverage_file, 0)
                trueCoverage_sequences, trueCoverage_headers = utils.clean_headers_sequences(trueCoverage_sequences)
                trueCoverage_sequences = utils.simplify_sequence_dict(trueCoverage_sequences)
                trueCoverage_config = os.path.join(species_folder, 'trueCoverage.config')

                trueCoverage_typing_sequences = trueCoverage_sequences.copy()
                for header in typing_sequences:
                    if header not in trueCoverage_sequences:
                        trueCoverage_typing_sequences[header] = typing_sequences[header]
                    else:
                        print('Sequence {header} of typing.fasta already present in'
                              ' trueCoverage.fasta'.format(header=header))

                reference_file = os.path.join(outdir, 'trueCoverage_typing.fasta')
                write_sequeces(reference_file, trueCoverage_typing_sequences)
        else:
            reference_file = os.path.join(outdir, 'typing.fasta')
            write_sequeces(reference_file, typing_sequences)
    else:
        species_present = []
        seq_conf_folder = os.path.join(os.path.dirname(script_path), 'modules', 'seq_conf', '')
        species_folder = [d for d in os.listdir(seq_conf_folder) if
                          not d.startswith('.') and os.path.isdir(os.path.join(seq_conf_folder, d, ''))]
        for species in species_folder:
            species = species.split('_')
            species[0] = species[0][0].upper() + species[0][1:]
            species_present.append(' '.join(species))
        sys.exit('Only these species are available:' + '\n' + '\n'.join(species_present))

    return reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, \
           typing_file, typing_sequences, typing_headers, typing_rules, typing_config


def index_fasta_samtools(fasta, region_None, region_outfile_none, print_comand_True):
    command = ['samtools', 'faidx', fasta, '', '', '']
    shell_true = False
    if region_None is not None:
        command[3] = region_None
    if region_outfile_none is not None:
        command[4] = '>'
        command[5] = region_outfile_none
        shell_true = True
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, shell_true, None, print_comand_True)
    return run_successfully, stdout


def indexSequenceBowtie2(referenceFile, threads):
    if os.path.isfile(str(referenceFile + '.1.bt2')):
        run_successfully = True
    else:
        command = ['bowtie2-build', '--threads', str(threads), referenceFile, referenceFile]
        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    return run_successfully


def run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
    sam_file = os.path.join(outdir, str('alignment.sam'))

    run_successfully = indexSequenceBowtie2(referenceFile, threads)
    if run_successfully:
        command = ['bowtie2', '-k', str(numMapLoc), '-q', '', '--threads', str(threads), '-x', referenceFile, '',
                   '--no-unal', '-S', sam_file]

        if len(fastq_files) == 1:
            command[9] = '-U ' + fastq_files[0]
        elif len(fastq_files) == 2:
            command[9] = '-1 ' + fastq_files[0] + ' -2 ' + fastq_files[1]
        else:
            return False, None

        if conserved_True:
            command[4] = '--sensitive'
        else:
            command[4] = '--very-sensitive-local'

        run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)

    if not run_successfully:
        sam_file = None

    return run_successfully, sam_file


def sortAlignment(alignment_file, output_file, sortByName_True, threads):
    outFormat_string = os.path.splitext(output_file)[1][1:].lower()
    command = ['samtools', 'sort', '-o', output_file, '-O', outFormat_string, '', '-@', str(threads), alignment_file]
    if sortByName_True:
        command[6] = '-n'
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    if not run_successfully:
        output_file = None
    return run_successfully, output_file


def indexAlignment(alignment_file):
    command = ['samtools', 'index', alignment_file]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    return run_successfully


def mapping_reads(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc):
    print('\n' + 'Mapping the reads' + '\n')
    run_successfully, sam_file = run_bowtie(fastq_files, referenceFile, threads, outdir, conserved_True, numMapLoc)
    bam_file = None
    if run_successfully:
        run_successfully, bam_file = sortAlignment(sam_file, str(os.path.splitext(sam_file)[0] + '.bam'), False,
                                                   threads)
        if run_successfully:
            os.remove(sam_file)
            run_successfully = indexAlignment(bam_file)
            if run_successfully:
                index_fasta_samtools(referenceFile, None, None, True)
    return run_successfully, bam_file


def include_rematch_dependencies_path():
    original_rematch = None
    command = ['which', 'rematch.py']
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, False)
    if run_successfully:
        original_rematch = stdout.splitlines()[0]

    resource_rematch = None
    try:
        resource_rematch = resource_filename('ReMatCh', 'rematch.py')
    except ModuleNotFoundError:
        resource_rematch = original_rematch
    else:
        print('\n'
              'Using ReMatCh "{resource_rematch}" via "{original_rematch}"\n'.format(resource_rematch=resource_rematch,
                                                                                     original_rematch=original_rematch))

    if resource_rematch is not None:
        utils.setPATHvariable(False, resource_rematch)
    else:
        sys.exit('ReMatCh not found in the PATH')

    return resource_rematch


def split_bam(bam_file, list_sequences, outdir, threads):
    new_bam = os.path.join(outdir, 'partial.bam')
    command = ['samtools', 'view', '-b', '-u', '-h', '-o', new_bam, '-@', str(threads), bam_file,
               ' '.join(list_sequences)]
    run_successfully, stdout, stderr = utils.runCommandPopenCommunicate(command, False, None, True)
    return run_successfully, new_bam


def parse_config(config_file):
    config = {'reference_file': None, 'length_extra_seq': None, 'maximum_number_absent_genes': None,
              'maximum_number_genes_multiple_alleles': None, 'minimum_read_coverage': None,
              'minimum_depth_presence': None, 'minimum_depth_call': None,
              'minimum_depth_frequency_dominant_allele': None, 'minimum_gene_coverage': None,
              'minimum_gene_identity': None}

    with open(config_file, 'rt') as reader:
        field = None
        for line in reader:
            line = line.splitlines()[0]
            if len(line) > 0:
                line = line.split(' ')[0]
                if line.startswith('#'):
                    line = line[1:].split(' ')[0]
                    field = line
                else:
                    if field is not None:
                        if field in ['length_extra_seq', 'maximum_number_absent_genes',
                                     'maximum_number_genes_multiple_alleles', 'minimum_read_coverage',
                                     'minimum_depth_presence', 'minimum_depth_call', 'minimum_gene_coverage',
                                     'minimum_gene_identity']:
                            line = int(line)
                            if field in ['minimum_gene_coverage', 'minimum_gene_identity']:
                                if line < 0 or line > 100:
                                    sys.exit('minimum_gene_coverage in trueCoverage_rematch config file must be an'
                                             ' integer between 0 and 100')
                        elif field == 'minimum_depth_frequency_dominant_allele':
                            line = float(line)
                            if line < 0 or line > 1:
                                sys.exit('minimum_depth_frequency_dominant_allele in trueCoverage_rematch config file'
                                         ' must be a double between 0 and 1')
                        config[field] = line
                        field = None

    for field in config:
        if config[field] is None:
            sys.exit(field + ' in trueCoverage_rematch config file is missing')

    return config


def clean_pathotyping_folder(outdir, reference_file, debug_mode_true):
    if not debug_mode_true:
        files = [f for f in os.listdir(outdir) if not f.startswith('.') and os.path.isfile(os.path.join(outdir, f))]
        for file_found in files:
            if file_found.startswith(('alignment.', os.path.splitext(os.path.basename(reference_file))[0])):
                file_found = os.path.join(outdir, file_found)
                os.remove(file_found)


def write_sequeces(out_file, sequences_dict):
    with open(out_file, 'wt') as writer:
        for header in sequences_dict:
            writer.write('>' + header + '\n')
            writer.write('\n'.join(utils.chunkstring(sequences_dict[header]['sequence'], 80)) + '\n')


def main():
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed as being the paired fastq files', required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continue typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])', required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference', required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)', required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    args.fastq = [fastq.name for fastq in args.fastq]

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
    typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)
    original_reference_file = str(reference_file)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if run_successfully:
        rematch_dir = os.path.join(args.outdir, 'rematch', '')
        if not os.path.isdir(rematch_dir):
            os.makedirs(rematch_dir)

        if args.trueCoverage:
            if trueCoverage_file is not None:
                trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
                if not os.path.isdir(trueCoverage_dir):
                    os.makedirs(trueCoverage_dir)

                print('\n')
                run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
                                                               args.threads)
                if run_successfully:
                    run_successfully = indexAlignment(trueCoverage_bam)
                    if run_successfully:
                        reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                        write_sequeces(reference_file, trueCoverage_sequences)
                        index_fasta_samtools(reference_file, None, None, True)
                        config = parse_config(trueCoverage_config)
                        runtime, run_successfully, sample_data_general, data_by_gene = \
                            run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
                                                    args.threads, config['length_extra_seq'],
                                                    config['minimum_depth_presence'], config['minimum_depth_call'],
                                                    config['minimum_depth_frequency_dominant_allele'],
                                                    config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                                    args.debug, args.doNotRemoveConsensus)

                        if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                                sample_data_general['number_absent_genes'] is not None and \
                                sample_data_general['number_genes_multiple_alleles'] is not None:
                            if args.minGeneDepth is None:
                                args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                                    sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                                    15

                            exit_info = []
                            if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                                exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                                 ' required ({minimum})'
                                                 ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                           minimum=config['minimum_read_coverage']))
                            if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                                exit_info.append('Number of absent genes ({number}) higher than the'
                                                 ' maximum allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_absent_genes'],
                                                           maximum=config['maximum_number_absent_genes']))
                            if sample_data_general['number_genes_multiple_alleles'] > \
                                    config['maximum_number_genes_multiple_alleles']:
                                exit_info.append('Number of genes with multiple alleles'
                                                 ' ({number}) higher than the maximum'
                                                 ' allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                           maximum=config['maximum_number_genes_multiple_alleles']))

                            if len(exit_info) > 0:
                                print('\n' + '\n'.join(exit_info) + '\n')
                                e = 'TrueCoverage requirements not fulfilled'
                                print('\n' + e + '\n')
                                if not args.noCheckPoint:
                                    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                    _ = utils.runTime(start_time)
                                    sys.exit(e)
                        else:
                            e = 'TrueCoverage module did not run successfully'
                            print('\n' + e + '\n')
                            if not args.noCheckPoint:
                                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                _ = utils.runTime(start_time)
                                sys.exit(e)

                        print('\n')
                        typing_dir = os.path.join(rematch_dir, 'typing', '')
                        if not os.path.isdir(typing_dir):
                            os.makedirs(typing_dir)
                        run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                        if run_successfully:
                            run_successfully = indexAlignment(bam_file)
                            if run_successfully:
                                reference_file = os.path.join(typing_dir, 'reference.fasta')
                                write_sequeces(reference_file, typing_sequences)
                                index_fasta_samtools(reference_file, None, None, True)
                                rematch_dir = str(typing_dir)
                if not run_successfully:
                    if args.noCheckPoint:
                        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                        _ = utils.runTime(start_time)
                        sys.exit('Something in the required TrueCoverage analysis went wrong')
            else:
                print('\n'
                      'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
                      '\n')

        if run_successfully:
            config = parse_config(typing_config)
            if args.minGeneCoverage is not None:
                config['minimum_gene_coverage'] = args.minGeneCoverage
            if args.minGeneIdentity is not None:
                config['minimum_gene_identity'] = args.minGeneIdentity

            runtime, run_successfully, sample_data_general, data_by_gene = \
                run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
                                        config['length_extra_seq'], config['minimum_depth_presence'],
                                        config['minimum_depth_call'], config['minimum_depth_frequency_dominant_allele'],
                                        config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                        args.debug, args.doNotRemoveConsensus)
            if run_successfully and data_by_gene is not None:
                if args.minGeneDepth is None:
                    args.minGeneDepth = sample_data_general['mean_sample_coverage'] / 3 if \
                                        sample_data_general['mean_sample_coverage'] / 3 > 15 else \
                                        15
            else:
                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                _ = utils.runTime(start_time)
                sys.exit('ReMatCh run for pathotyping did not run successfully')
        else:
            clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
            _ = utils.runTime(start_time)
            sys.exit('Something did not run successfully')

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)


if __name__ == "__main__":
    main()