view commons/tools/dbConsensus.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line source

#!/usr/bin/env python

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.

import os
import sys
import getopt

##@file
# usage: dbConsensus.py [ options ]
# options:
#      -h: this help
#      -i: name of the input file (format=aligned fasta)
#      -n: minimum number of nucleotides in a column to edit a consensus (default=1)
#      -p: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)
#      -o: name of the output file (default=inFileName+'.cons')
#      -v: verbose (default=0/1/2)

if not os.environ.has_key( "REPET_PATH" ):
    print "ERROR: no environment variable REPET_PATH"
    sys.exit(1)
sys.path.append( os.environ["REPET_PATH"] )

import commons.core.seq.AlignedBioseqDB


def help():
    """
    Give the list of the command-line options.
    """
    print
    print "usage:",sys.argv[0]," [ options ]"
    print "options:"
    print "     -h: this help"
    print "     -i: name of the input file (format=aligned fasta)"
    print "     -n: minimum number of nucleotides in a column to edit a consensus (default=1)"
    print "     -p: minimum proportion for the major nucleotide to be used, otherwise add 'N' (default=0.0)"
    print "     -o: name of the output file (default=inFileName+'.cons')"
    print "     -H: format the header with pyramid and piles informations (SATannot)"
    print "     -v: verbose (default=0/1/2)"
    print


def main():

    inFileName = ""
    minNbNt = 1
    minPropNt = 0.0
    outFileName = ""
    header_SATannot = False
    verbose = 0

    try:
        opts, args = getopt.getopt(sys.argv[1:],"hi:n:p:o:v:H")
    except getopt.GetoptError, err:
        print str(err); help(); sys.exit(1)
    for o,a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-i":
            inFileName = a
        elif o == "-n":
            minNbNt = int(a)
        elif o == "-p":
            minPropNt = float(a)
        elif o == "-o":
            outFileName = a
        elif o == "-H":
            header_SATannot = True
        elif o == "-v":
            verbose = int(a)

    if inFileName == "":
        print "ERROR: missing input file name"
        help()
        sys.exit(1)

    if verbose > 0:
        print "START %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()

    alnDB = commons.core.seq.AlignedBioseqDB.AlignedBioseqDB( inFileName )

    if alnDB.getSize() < minNbNt:
        print "WARNING: not enough sequences (<%i)" % ( minNbNt )

    else:
        consensus = alnDB.getConsensus( minNbNt, minPropNt, verbose,  header_SATannot)
        if consensus != None:
            consensus.upCase()
            if outFileName == "":
                outFileName = "%s.cons" % ( inFileName )
            outFile = open( outFileName, "w" )
            consensus.write( outFile )
            outFile.close()

    if verbose > 0:
        print "END %s" % (sys.argv[0].split("/")[-1])
        sys.stdout.flush()

    return 0

if __name__ == "__main__":
    main ()