view SMART/DiffExpAnal/gsnap_parallel_unSQL.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

import optparse, os, shutil, subprocess, sys, tempfile, fileinput, tarfile, glob 
import time
from commons.core.launcher.Launcher import Launcher
from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory
from commons.core.utils.FileUtils import FileUtils
from optparse import OptionParser

def stop_err( msg ):
    sys.stderr.write( "%s\n" % msg )
    sys.exit()

def toTar(tarFileName, accepted_hits_outputNames):
    tfile = tarfile.open(tarFileName + ".tmp.tar", "w")
    currentPath = os.getcwd()
    os.chdir(dir)
    for file in accepted_hits_outputNames:
        relativeFileName = os.path.basename(file)
        tfile.add(relativeFileName)
    os.system("mv %s %s" % (tarFileName + ".tmp.tar", tarFileName))
    tfile.close()
    os.chdir(currentPath)
    
def joinSAM(dCutOut2Out):
    for key in dCutOut2Out.keys():
        FileUtils.catFilesFromList(dCutOut2Out[key],key, False)
        
def _map(iLauncher, cmd, cmdStart, cmdFinish ):
    lCmds = []
    lCmds.extend(cmd)
    lCmdStart = []
    lCmdStart.extend(cmdStart)
    lCmdFinish = []
    lCmdFinish.extend(cmdFinish)
    return(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish))

def _createGsnapSplicingOptions(options):
    lArgs = []
    lArgs.append("-N %s" % options.novelsplicing)
    if options.useSplicing:
        lArgs.append("-s %s" % options.useSplicing)
    lArgs.append("-w %s" % options.localsplicedist)
    lArgs.append("-e %s" % options.localSplicePenality)
    lArgs.append("-E %s" % options.distantSplicePenality)
    lArgs.append("-K %s" % options.distantSpliceEndlength)
    lArgs.append("-l %s" % options.shortendSpliceEndlength)
    
    
    return lArgs

def _createGsnapPairedEndOptions(options):
    lArgs = []
    if not(options.useSplicing or options.pairedEndFile):
        lArgs.append("--pairmax-dna %s" % options.pairmaxRna)
    if options.useSplicing or options.pairedEndFile:
        lArgs.append("--pairmax-rna %s" % options.pairmaxRna)
    lArgs.append("--pairexpect=%s" % options.pairexpect)
    lArgs.append("--pairdev=%s" % options.pairedev)
    
    

def _createGsnapCommand(iLauncher, options, workingDir, inputFileNames, inputRevFilesNames, outputFileName, batchNumber, numberOfBatch):
    lArgs = []
    lArgs.append("-d %s" % options.genomeName)
    lArgs.append("-k %s" % options.kmer)
    lArgs.append("-D %s" % workingDir)
    lArgs.append("-A %s" % options.outputFormat)
    lArgs.append("-q %s/%s" % (batchNumber, numberOfBatch))
    lArgs.append("--no-sam-headers")
    lArgs.append(inputFileNames)
    print 'N option: %s, pairedEndFile option: %s' %(options.novelsplicing, options.pairedEndFile)
    if options.pairedEndFile:
        lArgs.append(inputRevFilesNames)
    if options.novelsplicing == '1':
        lArgs.extend(_createGsnapSplicingOptions(options))
    elif options.pairedEndFile:
        lArgs.extend(_createGsnapPairedEndOptions(options))
    
    lArgs.append("> %s" % outputFileName)  
    return iLauncher.getSystemCommand("gsnap", lArgs)   

def __main__():
    #Parse Command Line
    description = "GMAP/GSNAP version:2012-12-20."
    parser = OptionParser(description = description)
    parser.add_option('-o', '--outputTxtFile', dest='outputTxtFile', help='for Differential expression analysis pipeline, new output option gives a txt output containing the list of mapping results.')
    parser.add_option("-q", "--inputTxt",         dest="inputTxt",         action="store",                 type="string",     help="input, a txt file for a list of input reads files [compulsory]")
    parser.add_option('-t', '--tar', dest='outputTar', default=None, help='output all accepted hits results in a tar file.' )
    parser.add_option("-d", "--genomeName", dest="genomeName", help="Define the reference genome name.[compulsory]")
#    parser.add_option("-o", "--outputFile", dest="outputfile", help="output[compulsory]")
    parser.add_option("-k", "--kmer", dest="kmer", default=12, help="Choose kmer value (<=16), a big kmer value can take more RAM(4Go).[compulsory]")
    parser.add_option("-i", "--inputFasta", dest="inputFastaFile", help="Reference genome file, fasta format.[compulsory]")
    parser.add_option("-p", "--pairedEnd", dest="pairedEndFile", default=None, help="Input paired-end fastq file.")
    parser.add_option("-A", "--outputFormat", dest="outputFormat", default="sam", help="Choose an output format [sam, goby (need to re-compile with appropriate options)].")
    #Splicing options for RNA-Seq
    parser.add_option("-N","--novelsplicing", dest="novelsplicing", default=0, help="Look for novel splicing (0=no (default), 1=yes)")
    parser.add_option("-s","--use-splicing", dest="useSplicing",action="store", type="string", help="Look for splicing involving known sites or known introns (in <STRING>.iit), at short or long distances")
    parser.add_option("-w","--localsplicedist", dest="localsplicedist", default=200000, help="Definition of local novel splicing event (default 200000)")
    parser.add_option("-e","--local-splice-penality", dest="localSplicePenality", default=0, help="Penalty for a local splice (default 0).  Counts against mismatches allowed")
    parser.add_option("-E","--distant-splice-penality", dest="distantSplicePenality", default=1, help="Penalty for a distant splice (default 1).  A distant splice is one where the intron length exceeds the value of -w, or --localsplicedist, or is an inversion, scramble, or translocation between two different chromosomes Counts against mismatches allowed")
    parser.add_option("-K","--distant-splice-endlength", dest="distantSpliceEndlength", default=16, help="Minimum length at end required for distant spliced alignments (default 16, min allowed is the value of -k, or kmer size)")
    parser.add_option("-l","--shortend-splice-endlength", dest="shortendSpliceEndlength", default=2, help="Minimum length at end required for short-end spliced alignments (default 2, but unless known splice sites are provided with the -s flag, GSNAP may still need the end length to be the value of -k, or kmer size to find a given splice")

    #Specific paired-ends reads
    parser.add_option("--pairmax-dna", dest="pairmaxDna", default=1000, help="Max total genomic length for DNA-Seq paired reads, or other reads without splicing (default 1000).")
    parser.add_option("--pairmax-rna", dest="pairmaxRna", default=2000, help="Max total genomic length for RNA-Seq paired reads, or other reads that could have a splice (default 200000).")
    parser.add_option("--pairexpect", dest="pairexpect", default=200, help="Expected paired-end length, used for calling splices in medial part of paired-end reads (default 200)")
    parser.add_option("--pairdev", dest="pairdev", default=25, help="Allowable deviation from expected paired-end length, used for calling splices in medial part of paired-end reads (default 25)")
    
    (options, args) = parser.parse_args()    

    workingDir = os.path.dirname(options.inputFastaFile)
    
    file = open(options.inputTxt,"r")
    lines = file.readlines()
    inputFileNames = []
    gsnapOutputNames = []
    outputName = options.outputTxtFile
    resDirName = os.path.dirname(outputName) + '/'
    out = open(outputName, "w")
    for line in lines:
        timeId = time.strftime("%Y%m%d%H%M%S")
        tab = line.split()
        inputFileNames.append(tab[1])
        OutputName = resDirName + tab[0] + '_samOutput_%s.sam' % timeId
        gsnapOutputNames.append(OutputName) 
        out.write(tab[0] + '\t' + OutputName + '\n')
    file.close()
    out.close()
    
    if options.pairedEndFile:
        revFile = open(options.pairedEndFile,"r")
        lines = revFile.readlines()
        inputRevFileNames = []
        for line in lines:
            revTab = line.split()
            inputRevFileNames.append(revTab[1])
        revFile.close()

    #Create gsnap make 
    lCmdsTuples =[]
    acronym = "gsnap_make"
    jobdb = TableJobAdaptatorFactory.createJobInstance()
    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
    cmds = []
    cmd_setup = "gmap_setup -d %s -D %s -k %s %s;" % (options.genomeName, workingDir, options.kmer, options.inputFastaFile)
    cmds.append(cmd_setup)
    cmd_make_coords = "make -f Makefile.%s coords;" % options.genomeName 
    cmds.append(cmd_make_coords)
    cmd_make_gmapdb = "make -f Makefile.%s gmapdb;" % options.genomeName
    cmds.append(cmd_make_gmapdb)
    cmd_make_install = "make -f Makefile.%s install;" % options.genomeName
    cmds.append(cmd_make_install)
    cmd_index = iLauncher.getSystemCommand("", cmds)
    cmd2Launch = []
    cmdStart = []
    cmdFinish = []
    cmd2Launch.append(cmd_index)
    lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish)) 
    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)    
    
    acronym = "gsnap"
    jobdb = TableJobAdaptatorFactory.createJobInstance()
    iLauncher = Launcher(jobdb, os.getcwd(), "", "", os.getcwd(), os.getcwd(), "jobs", "", acronym, acronym, False, True)
    lCmdsTuples = []
    dCutOut2Out = {}
    lAllFile2remove = []
    numberOfBatch = 20  #usually for testing, working on to find a value for default launch on galaxy
    for i in range(len(inputFileNames)):
        lCutOutput = []
        for j in range(numberOfBatch):
            cutOutput = "%s_out_%s" % (inputFileNames[i], j)
            lCutOutput.append(cutOutput)
            lAllFile2remove.extend(lCutOutput)
            cmd2Launch = []
            if options.pairedEndFile: 
                inputRevFile = inputRevFileNames[i]
            else:
                inputRevFile = ""
            cmd2Launch.append(_createGsnapCommand(iLauncher, options, workingDir, inputFileNames[i], inputRevFile, cutOutput, j, numberOfBatch))
            cmdStart = []
            cmdFinish = []
            lCmdsTuples.append(_map(iLauncher, cmd2Launch, cmdStart, cmdFinish))    
            dCutOut2Out[gsnapOutputNames[i]] = lCutOutput
    iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, True)
    
    joinSAM(dCutOut2Out) 
    FileUtils.removeFilesFromListIfExist(lAllFile2remove)   
                 
    if options.outputTar != None:
        toTar(options.outputTar, gsnapOutputNames)


if __name__=="__main__": __main__()