view bismark_methylation_extractor.py @ 16:a4504327c890 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bismark commit ce4520063ab4d016dd5c5680c0f03eae849b150c"
author bgruening
date Wed, 21 Aug 2019 12:59:48 -0400
parents 9bfe38410155
children ff6ee551b153
line wrap: on
line source

#!/usr/bin/env python

import argparse
import logging
import math
import os
import re
import shutil
import subprocess
import sys
import tempfile
import zipfile
from glob import glob


def stop_err(logger, msg):
    logger.critical(msg)
    sys.exit(1)


def log_subprocess_output(logger, pipe):
    for line in iter(pipe.readline, b''):
        logger.debug(line.decode().rstrip())


def zipper(dir, zip_file):
    output_files_regex = re.compile('^(Non_)?C[pH][GH]_.*')
    bedgraph_regex = re.compile('.*bedGraph.gz')
    zip = zipfile.ZipFile(zip_file, 'w', compression=zipfile.ZIP_DEFLATED)
    root_len = len(os.path.abspath(dir))
    for root, dirs, files in os.walk(dir):
        archive_root = os.path.abspath(root)[root_len:]
        for f in files:
            if re.search(output_files_regex, f) or re.search(bedgraph_regex, f):
                fullpath = os.path.join(root, f)
                archive_name = os.path.join(archive_root, f)
                zip.write(fullpath, archive_name, zipfile.ZIP_DEFLATED)
    zip.close()
    return zip_file


def build_genome_dir(genome_file):
    tmp_genome_dir = tempfile.mkdtemp(prefix='tmp')
    genome_path = os.path.join(tmp_genome_dir, '.'.join(os.path.split(genome_file)[1].split('.')[:-1]))
    try:
        # Create a hard link pointing to genome_file named 'genome_path'.fa.
        os.symlink(genome_file, genome_path + '.fa')
    except Exception as e:
        if os.path.exists(tmp_genome_dir):
            shutil.rmtree(tmp_genome_dir)
        stop_err('Error in linking the reference database!\n%s' % e)
    return tmp_genome_dir


def __main__():
    # Parse Command Line
    parser = argparse.ArgumentParser(description='Wrapper for the bismark methylation caller.')

    # input options
    parser.add_argument('--infile', help='Input file in SAM or BAM format.')
    parser.add_argument('--single-end', dest='single_end', action="store_true")
    parser.add_argument('--paired-end', dest='paired_end', action="store_true")

    parser.add_argument('--multicore', dest='multicore', type=int, default=1)
    parser.add_argument('--splitting_report', dest='splitting_report')
    parser.add_argument('--mbias_report', dest='mbias_report')
    parser.add_argument('--cytosine_report', dest="cytosine_report")
    parser.add_argument('--genome_file', dest="genome_file")
    parser.add_argument('--cx_context', action="store_true")

    parser.add_argument('--comprehensive', action="store_true")
    parser.add_argument('--merge-non-cpg', dest='merge_non_cpg', action="store_true")
    parser.add_argument('--no-overlap', dest='no_overlap', action="store_true")
    parser.add_argument('--compress', dest='compress')
    parser.add_argument('--ignore', dest='ignore', type=int)
    parser.add_argument('--ignore_r2', dest='ignore_r2', type=int)
    parser.add_argument('--ignore_3prime', dest='ignore_3prime', type=int)
    parser.add_argument('--ignore_3prime_r2', dest='ignore_3prime_r2', type=int)
    parser.add_argument('--log_report', dest='log_report', metavar='log_filename', type=str)
    args = parser.parse_args()

    logger = logging.getLogger('bismark_methylation_extractor_wrapper')
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    if args.log_report:
        ch.setLevel(logging.WARNING)
        handler = logging.FileHandler(args.log_report)
        handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)
    else:
        ch.setLevel(logging.DEBUG)
    logger.addHandler(ch)

    # Build methylation extractor command
    output_dir = tempfile.mkdtemp()
    cmd = ['bismark_methylation_extractor', '--no_header', '-o', output_dir]
    # Set up all options
    if args.multicore > 3:
        # divide multicore by 3 here since bismark will spawn ~3 jobs.
        cmd.extend(['--multicore', str(math.ceil(args.multicore / 3))])
    if args.single_end:
        cmd.append('--single-end')
    else:
        cmd.append('--paired-end')
    if args.no_overlap:
        cmd.append('--no_overlap')
    if args.ignore:
        cmd.extend(['--ignore', str(args.ignore)])
    if args.ignore_r2:
        cmd.extend(['--ignore_r2', str(args.ignore_r2)])
    if args.ignore_3prime:
        cmd.extend(['--ignore_3prime', str(args.ignore_3prime)])
    if args.ignore_3prime_r2:
        cmd.extend(['--ignore_3prime_r2', str(args.ignore_3prime_r2)])
    if args.comprehensive:
        cmd.append('--comprehensive')
    if args.merge_non_cpg:
        cmd.append('--merge_non_CpG')
    if args.splitting_report:
        cmd.append('--report')
    tmp_genome_dir = None
    if args.cytosine_report:
        tmp_genome_dir = build_genome_dir(args.genome_file)
        if args.cx_context:
            cmd.extend(
                ['--bedGraph', '--CX_context', '--cytosine_report', '--CX_context', '--genome_folder', tmp_genome_dir])
        else:
            cmd.extend(['--bedGraph', '--cytosine_report', '--genome_folder', tmp_genome_dir])

    cmd.append(args.infile)

    # Run
    logger.info("Methylation extractor run with: '%s'", " ".join(cmd))
    prev_dir = os.getcwd()
    os.chdir(output_dir) # needed due to a bug in bismark where the coverage file cannot be found
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    with process.stdout:
        log_subprocess_output(logger, process.stdout)
    exitcode = process.wait()
    if exitcode != 0:
        stop_err(logger, "Bismark methylation extractor error (also check the log file if any)!\n%s" % process.stderr)
    logger.info("Finished methylation extractor.")
    # collect and copy output files
    logger.debug("Zip output files to '%s'.", args.compress)
    os.chdir(prev_dir)
    zipper(output_dir, args.compress)

    # cytosine report
    if args.cytosine_report:
        logger.debug("Collecting cytosine report.")
        if args.cx_context:
            shutil.move(glob(os.path.join(output_dir, '*CX_report.txt'))[0], args.cytosine_report)
        else:
            shutil.move(glob(os.path.join(output_dir, '*CpG_report.txt'))[0], args.cytosine_report)
    # splitting report
    if args.splitting_report:
        logger.debug("Collecting splitting report.")
        shutil.move(glob(os.path.join(output_dir, '*_splitting_report.txt'))[0], args.splitting_report)
    if args.mbias_report:
        logger.debug("Collecting M-Bias file.")
        shutil.move(glob(os.path.join(output_dir, '*M-bias.txt'))[0], args.mbias_report)

    # Clean up temp dirs
    logger.debug("Cleanup temp dirs.")
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    if tmp_genome_dir and os.path.exists(tmp_genome_dir):
        shutil.rmtree(tmp_genome_dir)
    logger.info("Done.")

if __name__ == "__main__": __main__()