view STACKS_denovomap.py @ 1:ccfa8e539bdf

add archive toolbox to manage zip outputs
author cmonjeau
date Mon, 24 Aug 2015 10:09:14 +0000
parents d6ba40f6c824
children
line wrap: on
line source

#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import re
import os
import tempfile
import shutil
import subprocess
import glob
import argparse
from os.path import basename
import zipfile
import tarfile
import gzip
from galaxy.datatypes.checkers import *
from stacks import *


# Working directories, created relative to the directory the tool runs in.
_INPUT_DIR = 'inputs'
_JOB_DIR = 'job_outputs'
_GALAXY_DIR = 'galaxy_outputs'

# Per-sample result categories that get one archive each in "categories"
# mode.  For a label X: files end with 'X.tsv', the temporary archive is
# 'X.zip.temp' and the destination is read from the '--X_output' option.
_CATEGORIES = ('tags', 'snps', 'alleles', 'matches')

# (file name suffix, option attribute) pairs for the catalog files that are
# copied to their own outputs.
_CATALOGS = (
    ('catalog.snps.tsv', 'catalogsnps'),
    ('catalog.alleles.tsv', 'catalogalleles'),
    ('catalog.tags.tsv', 'catalogtags'),
)


def _build_parser():
    """Return the parser for the command-line options of this wrapper."""
    parser = argparse.ArgumentParser()

    # input configuration files and denovo_map.pl settings
    for flag in ('-p', '-b', '-r', '-s', '-O', '-m', '-P', '-M', '-N', '-n',
                 '-t', '-H', '--bound_low', '--bound_high', '--alpha',
                 '--logfile', '--compress_output', '--catalogsnps',
                 '--catalogalleles', '--catalogtags'):
        parser.add_argument(flag)

    # additional outputs (archive destinations)
    for flag in ('--total_output', '--tags_output', '--snps_output',
                 '--alleles_output', '--matches_output'):
        parser.add_argument(flag)

    return parser


def _stage_inputs(config_path):
    """Stage the files listed in a Galaxy config file into the input dir.

    Calls, in order, the toolbox helpers star-imported at the top of the
    file: parse the config file, extract compressed entries, check/fix the
    FASTQ extensions, then create the symlinks.  Returns the table built by
    ``galaxy_config_to_tabfiles_for_STACKS``.
    """
    tab_files = galaxy_config_to_tabfiles_for_STACKS(config_path)
    extract_compress_files_from_tabfiles(tab_files, _INPUT_DIR)
    check_fastq_extension_and_add(tab_files, _INPUT_DIR)
    create_symlinks_from_tabfiles(tab_files, _INPUT_DIR)
    return tab_files


def _input_arguments(options):
    """Stage every input set and return the matching command-line arguments."""
    arguments = []

    # genetic map: parent files
    if options.p:
        _stage_inputs(options.p)

        # only regular files: skip directories created by a decompression
        fastq_files = sorted(path for path in glob.glob(_INPUT_DIR + '/*')
                             if os.path.isfile(path))

        if options.b == 'true':
            # Paired-end data: keep every other file of the sorted list.
            # NOTE(review): presumably the two mates of a pair sort next to
            # each other, so this keeps one file per pair -- confirm.
            fastq_files = fastq_files[::2]

        for path in fastq_files:
            arguments.extend(['-p', path])

    # genetic map progeny files (-r), then population individuals (-s)
    for flag, config_path in (('-r', options.r), ('-s', options.s)):
        if not config_path:
            continue
        for key in _stage_inputs(config_path):
            path = _INPUT_DIR + '/' + key
            # only regular files: skip directories created by a decompression
            if os.path.isfile(path):
                arguments.extend([flag, path])

    return arguments


def _tuning_arguments(options):
    """Return the optional denovo_map.pl arguments selected by ``options``."""
    arguments = []

    if options.O:
        arguments.extend(['-O', options.O])

    # The value '-1' means "do not pass this option".
    for name in ('m', 'P', 'M', 'N', 'n'):
        value = getattr(options, name)
        if value and value != '-1':
            arguments.extend(['-' + name, value])

    # boolean switches arrive as the string 'true'
    for name in ('t', 'H'):
        if getattr(options, name) == 'true':
            arguments.append('-' + name)

    # SNP model
    if options.bound_low:
        arguments.extend(['--bound_low', options.bound_low])
        # A missing upper bound used to put None in the command line, which
        # crashed the ' '.join() below; only forward it when it was given.
        if options.bound_high is not None:
            arguments.extend(['--bound_high', options.bound_high])

    if options.alpha:
        arguments.extend(['--alpha', options.alpha])

    return arguments


def _write_archive(archive_path, file_names):
    """Write ``file_names`` (relative to the cwd) into a new zip archive.

    The archive is closed before returning: the zip central directory is
    only written on close, so moving an unclosed archive can yield a
    corrupt file.
    """
    with zipfile.ZipFile(archive_path, 'w', allowZip64=True) as archive:
        for name in file_names:
            archive.write(name)


def _category_of(file_name):
    """Return the category label of a per-sample file, or None.

    Files whose name contains 'batch' never belong to a category.
    """
    if 'batch' in file_name:
        return None
    for label in _CATEGORIES:
        if file_name.endswith(label + '.tsv'):
            return label
    return None


def _collect_outputs(options):
    """Dispatch the produced files; the cwd must be the job output dir."""
    galaxy_dir = os.path.join(os.pardir, _GALAXY_DIR)

    # copy the catalog files to their dedicated outputs (when requested)
    for name in glob.glob('*'):
        for suffix, option_name in _CATALOGS:
            destination = getattr(options, option_name)
            if destination and name.endswith(suffix):
                shutil.copy(name, destination)

    # listed before any temporary archive exists, so archives never
    # include themselves
    produced = glob.glob('*')

    if options.compress_output == 'total':
        # one archive holding every produced file
        _write_archive('total.zip.temp', produced)
        shutil.move('total.zip.temp', options.total_output)

    elif options.compress_output == 'categories':
        # one archive per category; everything else goes to galaxy_outputs
        by_category = dict((label, []) for label in _CATEGORIES)
        leftovers = []
        for name in produced:
            label = _category_of(name)
            if label is None:
                leftovers.append(name)
            else:
                by_category[label].append(name)

        for label in _CATEGORIES:
            temp_archive = label + '.zip.temp'
            _write_archive(temp_archive, by_category[label])
            for name in by_category[label]:
                os.remove(name)
            shutil.move(temp_archive, getattr(options, label + '_output'))

        for name in leftovers:
            shutil.move(name, galaxy_dir)

    else:
        # no compression: hand every file over as is
        for name in produced:
            shutil.move(name, galaxy_dir)


def __main__():
    """Run denovo_map.pl on the staged inputs and dispatch its outputs.

    Steps: parse the options, create the ``inputs``, ``job_outputs`` and
    ``galaxy_outputs`` directories in the current directory, stage the input
    files, run ``denovo_map.pl``, move its log to ``--logfile``, then copy
    the catalog files and archive or move the remaining results according
    to ``--compress_output`` ('total', 'categories', anything else means no
    compression).

    Exits with status 1 (message on stderr) when ``denovo_map.pl`` cannot be
    started, returns a non-zero status, or leaves no log file behind.
    Leaves the process inside ``job_outputs`` on success.
    """
    options = _build_parser().parse_args()

    # create working directories
    for directory in (_INPUT_DIR, _JOB_DIR, _GALAXY_DIR):
        os.mkdir(directory)

    # inputs first, then the fixed options, then the optional settings
    cmd_line = ['denovo_map.pl']
    cmd_line.extend(_input_arguments(options))
    cmd_line.extend(['-S', '-b', '1', '-T', '4', '-o', _JOB_DIR + '/'])
    cmd_line.extend(_tuning_arguments(options))

    # Single-argument print() behaves the same on Python 2 and 3.  Flush so
    # the line is not emitted after the child's own output.
    print("[CMD_LINE] : " + ' '.join(cmd_line))
    sys.stdout.flush()

    try:
        return_code = subprocess.call(cmd_line)
    except OSError as error:
        sys.stderr.write('Unable to run denovo_map.pl: %s\n' % error)
        sys.exit(1)

    # The log is moved first so it stays available even when the run failed.
    try:
        shutil.move(_JOB_DIR + '/denovo_map.log', options.logfile)
    except EnvironmentError:
        sys.stderr.write('Error in denovo_map execution; Please read the additional output (stdout)\n')
        sys.exit(1)

    if return_code != 0:
        sys.stderr.write('Error in denovo_map execution (exit status %s); '
                         'Please read the additional output (stdout)\n'
                         % return_code)
        sys.exit(1)

    # go inside the outputs dir
    os.chdir(_JOB_DIR)
    _collect_outputs(options)


# Run the tool only when this file is executed as a script, not on import.
if __name__ == '__main__':
    __main__()