view commons/launcher/launchMreps.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

from commons.core.seq.BioseqDB import BioseqDB
from commons.core.parsing.MrepsToSet import MrepsToSet
import subprocess
import os
import sys
import getopt

def help():
    """
    Give the list of the command-line options.
    """
    print
    print "usage: ",sys.argv[0],"[ options ]"
    print "options:"
    print "     -h: this help"
    print "     -i: name of the input file (format='fasta')"
    print "     -o: name of the output file (default=inFileName+'.Mreps.set')"
    print "     -f: error filter (default=1.0)"
    print "     -c: clean"
    print "     -v: verbosity level (default=0/1)"
    print

def main():
    """
    Launch Mreps.
    """
    inFileName = ""
    outFileName = ""
    errorFilter = 1.0
    clean = False
    verbose = 0

    try:
        opts=getopt.getopt(sys.argv[1:],"hi:o:f:cv:")[0]
    except getopt.GetoptError, err:
        print str(err)
        help()
        sys.exit(1)
    for o,a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-i":
            inFileName = a
        elif o == "-o":
            outFileName = a
        elif o == "-f":
            errorFilter = float(a)
        elif o == "-c":
            clean = True
        elif o == "-v":
            verbose = int(a)

    if inFileName == "":
        print "ERROR: missing compulsory options"
        help()
        sys.exit(1)

    if verbose > 0:
        print "beginning of %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()

    # Mreps 2.5 doesn't fully support IUPAC nomenclature
    if verbose > 0:
        print "* check IUPAC symbols"; sys.stdout.flush()
    tmpInFileName = "%s.tmp%i" % ( inFileName, os.getpid() )
    if os.path.exists( tmpInFileName ):
        os.system( "rm -f %s" % ( tmpInFileName ) )
    bsDB = BioseqDB( inFileName )
    for bs in bsDB.db:
        if verbose > 0:
            print bs.header; sys.stdout.flush()
        bs.partialIUPAC()
        onlyN = True
        for nt in ["A","T","G","C"]:
            if nt in bs.sequence:
                onlyN = False
        if onlyN == True:
            if verbose > 0:
                print "** Warning: only Ns"; sys.stdout.flush()
        else:
            bsDB.save( tmpInFileName )

    if not os.path.exists( tmpInFileName ):
        sys.exit(0)

    if verbose > 0:
        print "* remove N stretches"; sys.stdout.flush()
    prg = os.environ["REPET_PATH"] + "/bin/cutterDB"
    cmd = prg
    cmd += " -l 200000"
    cmd += " -o 0"
    cmd += " -w 11"
    cmd += " %s" % ( tmpInFileName )
    if verbose > 0:
        print cmd; sys.stdout.flush()
    log = os.system( cmd )
    if log != 0:
        print "ERROR: %s returned %i" % ( prg, log )
        sys.exit(1)

    # launch Mreps on the input file
    MrepsOutFileName = "%s.Mreps.xml" % ( tmpInFileName )
    prg = "mreps"
    cmd = prg
    cmd += " -res 3"
    cmd += " -exp 3.0"
    cmd += " -maxsize 50"
    cmd += " -xmloutput %s" % MrepsOutFileName
    cmd += " -fasta %s_cut" % tmpInFileName
    process = subprocess.Popen(cmd, shell = True)
    process.communicate()
    if process.returncode != 0:
        raise Exception("ERROR when launching '%s'" % cmd)

    if outFileName == "":
        outFileName = inFileName + ".Mreps.set"

    # parse Mreps results in xml format
    iMrepsToSet = MrepsToSet(inFileName, MrepsOutFileName, outFileName, errorFilter)
    iMrepsToSet.run()
    if clean:
        iMrepsToSet.clean()

    # remove temporary input filename
    os.remove(tmpInFileName)
    os.remove("%s_cut" % tmpInFileName)
    os.remove("%s.Nstretch.map" % tmpInFileName)

    if verbose > 0:
        print "%s finished successfully\n" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()

    return 0


if __name__ == '__main__':
    main()