view TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 15:255c852351c5 draft

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:36:44 -0400
parents feef9a0db09d
children
line wrap: on
line source

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import re
import sys
import math
import glob
import string
import shutil
from commons.core.utils.FileUtils import FileUtils
from commons.core.seq.Bioseq import Bioseq
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils


## Static methods for fasta file manipulation
#
class FastaUtils( object ):
    
    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    nbSeq += 1

        return nbSeq
    
    
    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        for line in inFile:
            if line[0] != ">":
                cumLength += len(string.rstrip(line))
    
        return cumLength
    
    
    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths(inFile):
        lLengths = []
        currentLength = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    if currentLength != 0:
                        lLengths.append(currentLength)
                    currentLength = 0
                else:
                    currentLength += len(line[:-1])
            lLengths.append(currentLength)
        
        return lLengths
        
    
    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders(inFile, verbose = 0):
        lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
        if verbose:
            for header in lHeaders:
                print header

        return lHeaders
    
    
    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
        """Cut a data bank into overlapping chunks via the external 'cutterDB' program.

        cutterDB writes '<inFileName>_cut' (the chunks) and '<inFileName>.Nstretch.map';
        this method then renames the chunk headers, records the chunk/original mapping
        in a '.map' file and writes statistics on the detected N stretches.
        If a sequence is shorter than the threshold, it is only renamed (not cut).

        @param inFileName string name of the input fasta file
        @param chkLgth string chunk length (in bp, default=200000)
        @param chkOver string chunk overlap (in bp, default=10000)
        @param wordN string N stretch word length (default=11, 0 for no detection)
        @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
        @param clean boolean remove 'cut' and 'Nstretch' files
        @param verbose integer (default = 0)
        """
        nbSeq = FastaUtils.dbSize(inFileName)
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % nbSeq
            sys.stdout.flush()

        # delegate the actual cutting to the external 'cutterDB' executable
        # (must be reachable through the PATH); abort on any non-zero exit status
        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % chkLgth
        cmd += " -o %s" % chkOver
        cmd += " -w %s" % wordN
        cmd += " %s" % inFileName
        returnStatus = os.system(cmd)
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)

        nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        with open("%s_cut" % inFileName) as inFile:
            outFasta = open(outFastaName, "w")
            outMap = open(outMapName, "w")

            for line in inFile:
                if line[0] == ">":
                    if verbose > 1:
                        print "rename '%s'" % (line[:-1]); sys.stdout.flush()
                    # NOTE(review): header fields inferred from the indices used below,
                    # i.e. '>ID ... oldHeader ... start..end' -- confirm against the
                    # actual cutterDB output format
                    data = line[:-1].split(" ")
                    seqID = data[0].split(">")[1]
                    # zero-pad the chunk number so every chunk name has the same width
                    newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
                    oldHeader = data[2]
                    seqStart = data[4].split("..")[0]
                    seqEnd = data[4].split("..")[1]
                    outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
                    outFasta.write(">%s\n" % newHeader)

                else:
                    outFasta.write(line.upper())

            outFasta.close()
            outMap.close()

        #stats on .Nstretch.map file
        genomeLength = FastaUtils.dbCumLength(open(inFileName))
        NstretchMapFile = inFileName + ".Nstretch.map"
        outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w")
        if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
                outNstrechStats.write("No N in stretch length > %s\n" % wordN)
        else:
            with open(NstretchMapFile) as f:
                # group the stretch lengths per original sequence header;
                # map columns: name, header, start, end (start/end treated as inclusive)
                dHeader2lLengths = {}
                for line in f:
                    data = line.rstrip().split()
                    header = data[1]
                    length = int(data[3]) - int(data[2]) + 1
                    if header not in dHeader2lLengths:
                        dHeader2lLengths[header] = []
                    dHeader2lLengths[header].append(length)

            for header in sorted(dHeader2lLengths):
                lLengths = dHeader2lLengths[header]
                outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))

            cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])

            NstretchPrct = float(cumulAllStretch)/genomeLength*100
            outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
        outNstrechStats.close()

        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(NstretchMapFile)
            
    
    ## Split the input fasta file in several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix prefix in output file name 
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
        if not os.path.exists(inFile):
            msg = "ERROR: file '%s' doesn't exist" % inFile
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)
            
        nbSeq = FastaUtils.dbSize(inFile)
        
        nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
        if verbose > 0:
            print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
            sys.stdout.flush()
            
        if nbSeqPerBatch != 1 and useSeqHeader:
            useSeqHeader = False
            
        if newDir == True:
            if os.path.exists("batches"):
                shutil.rmtree("batches")
            os.mkdir("batches")
            os.chdir("batches")
            os.system("ln -s ../%s ." % inFile)
            
        with open(inFile) as inFileHandler:
            countBatch = 0
            countSeq = 0
            for line in inFileHandler:
                if line[0] == ">":
                    countSeq += 1
                    if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
                        try:
                            outFile.close()
                        except: pass
                        countBatch += 1
                        if useSeqHeader:
                            outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
                        else:
                            outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
                        outFile = open(outFileName, "w")

                    if verbose > 1:
                        print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
                        sys.stdout.flush()
                outFile.write(line)
        
        if newDir:
            os.remove(os.path.basename(inFile))
            os.chdir("..")
            
    
    ## Split the input fasta file in several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        ## Split the input fasta file into 'batches/batch_<i>.fa' files whose
        ## cumulative sequence length is balanced around 'maxSize'
        #
        # @param inFileName string name of the input fasta file
        # @param maxSize integer max cumulative length for each output file
        #
        db = BioseqDB(inFileName)
        lHeadersSizeTuples = [(seq.getHeader(), seq.getLength()) for seq in db.db]

        lBatches = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")

        for batchNumber, headerGroup in enumerate(lBatches, 1):
            with open("batch_%s.fa" % batchNumber, 'w') as outHandler:
                for header in headerGroup:
                    db.fetch(header).write(outHandler)
        os.chdir("..")
        
                    
    ## Split the input fasta file in several output files according to their cluster identifier
    #
    # @param inFileName string  name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
        """Split the input fasta file in several output files according to their cluster identifier.

        Cluster and member identifiers are parsed from each header: either
        '...Cluster<ID>Mb<seqID>...' (Piler, Recon, Blastclust) or the Grouper form
        '...Mb(Q|S)<seqID>Gr<ID>Cl...'.

        @param inFileName string  name of the input fasta file
        @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
        @param simplifyHeader boolean rewrite each header as '<method>_Cluster<ID>_Seq<seqID>'
        @param createDir boolean put the sequences in different directories
        @param outPrefix string prefix of the output files (default='seqCluster')
        @param verbose integer (default = 0)
        """
        if not os.path.exists(inFileName):
            print "ERROR: %s doesn't exist" % inFileName
            sys.exit(1)
    
        inFile = open(inFileName)
    
        # handle the first header apart: it initializes the parsing state and
        # opens the first output file
        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
                # NOTE(review): if neither 'Q' nor 'S' matches, seqID stays unbound and
                # the simplifyHeader branch below raises NameError -- confirm Grouper
                # headers always contain 'MbQ' or 'MbS'
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                # one '<inFileName>_cluster_<ID>' directory per cluster; the process
                # stays chdir'ed into the current cluster's directory while writing
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID
        
            # main loop: same parsing as above, switching output file whenever the
            # cluster identifier changes
            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]
        
                    if clusterID != prevClusterID:
                        outFile.close()
        
                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]
        
                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
        
                    # re-open in append mode when returning to a cluster already seen
                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                            outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )
        
                else:
                    # sequence line: copied verbatim to the current cluster file
                    outFile.write( line )
        
                line = inFile.readline()
        
            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
                
            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
                

    ## Filter a fasta file in two fasta files using the length of each sequence as a criterion
    #
    # @param len_min integer    length sequence criterion to filter
    # @param inFileName string  name of the input fasta file
    # @param verbose integer (default = 0)      
    #
    @staticmethod
    def dbLengthFilter(len_min, inFileName, verbose = 0):
        file_db = open(inFileName,)
        file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
        file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
        seq = Bioseq()
        numseq = 0
        nbsaveInf = 0
        nbsaveSup = 0
        seq.read(file_db)
        while seq.getHeader():
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write(file_dbSup)
                if verbose > 0:
                        nbsaveSup = nbsaveSup + 1
            else:
                seq.write(file_dbInf)
                if verbose > 0:
                        nbsaveInf = nbsaveInf + 1
            seq.read(file_db)

        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))


    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
        bsDB = BioseqDB(inFileName)
        if verbose > 0:
            print "nb of input sequences: %i" % bsDB.getSize()
    
        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )
        
        if bsDB.getSize()==0:
            return 0
        
        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % num
            if minThresh > 0:
                print "with length > %i bp" % minThresh
            sys.stdout.flush()
            
        tmpLSeqLgth = bsDB.getListOfSequencesLength()
        tmpLSeqLgth.sort(reverse = True)
    
        # select the longests
        lSeqLgth = []
        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % max(lSeqLgth)
            print "selected min length: %i" % min(lSeqLgth)
            sys.stdout.flush()
    
        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()
            
        return 0
    
    
    ## Extract all the sequence headers from a fasta file and write them in a new file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders(inFileName, outFileName = ""):
        if not outFileName:
            outFileName = inFileName + ".headers"

        with open(outFileName, "w") as f:
            for header in FastaUtils.dbHeaders(inFileName):
                f.write("%s\n" % header)

        return 0

    
    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        if not outFileName:
            outFileName = inFileName + '.extracted'
        outFile = open(outFileName, 'w')

        patternTosearch = re.compile(pattern)
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search(bioseq.header)
            if m:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
            
    
    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        """Extract the sequences whose header matches at least one of the regular
        expressions listed (one per line) in a pattern file, and write them in a
        new fasta file.

        @param patternFileName string file containing regular expressions to search in headers
        @param inFileName string name of the input fasta file
        @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
        @param verbose integer verbosity level (default = 0)
        """
        if not patternFileName:
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0

        # first pass: collect every header of the input fasta file
        lHeaders = []
        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)

        # fix: keep the matching headers in a set, so the membership test in the
        # second pass is O(1) per sequence instead of O(#matches) with a list
        sHeadersToKeep = set()
        with open(patternFileName) as patternFile:
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                sHeadersToKeep.update(h for h in lHeaders if patternToSearch.search(h))

        if not outFileName:
            outFileName = inFileName + ".extracted"

        # second pass: rewrite the sequences whose header was selected
        with open(outFileName, "w") as outFile:
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if bioseq.header in sHeadersToKeep:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
            
    
    ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the kept bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        patternToSearch = re.compile(pattern)

        if outFileName == "":
            outFileName = inFileName + '.cleaned'

        with open(outFileName, 'w') as outFile:
            bioseq = Bioseq()
            bioseqNb = 0
            savedBioseqNb = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if not patternToSearch.search(bioseq.header):
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
            
    
    ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the kept bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        """Keep the sequences whose header matches none of the regular expressions
        listed (one per line) in a pattern file, and write them in a new fasta file.

        @param patternFileName string file containing regular expressions to search in headers
        @param inFileName string name of the input fasta file
        @param outFileName string name of the output file recording the kept bioseq (default = inFileName + '.cleaned')
        @param verbose integer verbosity level (default = 0)
        """
        if not patternFileName:
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        savedBioseqNb = 0

        # first pass: collect every header of the input fasta file
        lHeaders = []
        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)
        # fix: the original never incremented bioseqNb, so the verbose print below
        # always showed '/ 0'; the total is simply the number of headers
        bioseqNb = len(lHeaders)

        # fix: keep the matching headers in a set, so the membership test in the
        # second pass is O(1) per sequence instead of O(#matches) with a list
        sHeadersToRemove = set()
        with open(patternFileName) as patternFile:
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                sHeadersToRemove.update(h for h in lHeaders if patternToSearch.search(h))

        if not outFileName:
            outFileName = inFileName + '.cleaned'

        # second pass: rewrite the sequences whose header was not selected
        with open(outFileName, 'w') as outFile:
            bioseqNum = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNum += 1
                    if bioseq.header not in sHeadersToRemove:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
            
    
    ## Find sequence's ORFs from a fasta file and write them in a tab file 
    # 
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer Select orfMaxNb best ORFs
    # @param orfMinLength integer Keep ORFs with length > orfMinLength 
    # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
        """Find each sequence's ORFs and write them in a tab-delimited map file.

        Each output line is 'ORF|<frame>|<length> <tab> header <tab> start <tab> end',
        with negative frames for the reverse strand.

        @param inFileName string name of the input fasta file
        @param orfMaxNb integer select the orfMaxNb best (longest) ORFs (0 = keep all)
        @param orfMinLength integer keep ORFs with length > orfMinLength
        @param outFileName string name of the output map file (default = inFileName + '.ORF.map')
        @param verbose integer verbosity level (default = 0)

        @return 0 in all cases
        """
        if not outFileName:
            outFileName = inFileName + ".ORF.map"
        outFile = open(outFileName, "w")

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'

            # forward strand: findORF() returns a dict keyed by frame index;
            # NOTE(review): the +4/+3 offsets below assume each value lists stop-codon
            # positions, an ORF spanning two consecutive stops -- confirm against
            # Bioseq.findORF
            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j - 1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        # store length first so sort() ranks by ORF length
                        bestOrf.append((end - start, i + 1, start, end))

            # reverse strand: same scan on the reverse complement, with coordinates
            # mapped back to the forward strand (hence start > end and frame * -1)
            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j - 1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append((start - end, (i + 1) * -1, start, end))

            # keep only the orfMaxNb longest ORFs (all of them when orfMaxNb == 0)
            bestOrf.sort(reverse = True)
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
                                   "|" + str(bestOrf[i][0]), bioseq.header,
                                   bestOrf[i][2], bestOrf[i][3]))
            bioseq.read(inFile)

        inFile.close()
        outFile.close()

        return 0


    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
        """Sort sequences by increasing length (write a new file).

        Each sequence is first dumped to a temporary file named '<length>bp_<rank>nb'
        in the current directory; the temporary files are then concatenated in
        length order and removed.

        @param inFileName string name of the input fasta file
        @param outFileName string name of the output fasta file
        @param verbose integer verbosity level

        @return 0 on success (exits the process on error)
        """
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists(inFileName):
            print "ERROR: file '%s' doesn't exist" % (inFileName)
            sys.exit(1)

        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open(inFileName, "r")
        countSeq = 0
        bs = Bioseq()
        bs.read(inFileHandler)
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
            bs.appendBioseqInFile(tmpFile)
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
            # reset before the next read so an EOF leaves the loop condition false
            bs.header = ""
            bs.sequence = ""
            bs.read(inFileHandler)
        inFileHandler.close()

        # sort temporary file names
        # concatenate them into the output file
        # NOTE(review): the glob also picks up any pre-existing '*bp_*nb' file in the
        # current directory; 'cat' makes this step Unix-only
        if os.path.exists(outFileName):
            os.remove(outFileName)
        lFiles = glob.glob("*bp_*nb")
        # numeric sort on the length prefix of each temporary file name
        lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
        for fileName in lFiles:
            cmd = "cat %s >> %s" % (fileName, outFileName)
            returnValue = os.system(cmd)
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
                sys.exit(1)
            os.remove(fileName)

        return 0

    
    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file (default: '<inFileName without extension>_sortByHeaders.fa')
    #
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = ""):
        # Derive the default output name from the input file when none is given.
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        db = BioseqDB(inFileName)
        lSortedHeaders = sorted(db.getHeaderList())
        # Write the sequences back in alphabetical order of their headers.
        with open(outFileName, "w") as outFileHandler:
            for header in lSortedHeaders:
                db.fetch(header).write(outFileHandler)

    
    ## Return a dictionary which keys are the headers and values the length of the sequences
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level   
    #
    @staticmethod
    def getLengthPerHeader(inFile, verbose = 0):
        dHeader2Length = {}

        with open(inFile) as inFileHandler:
            currentSeqHeader = ""
            currentSeqLength = 0
            for line in inFileHandler:
                if line[0] == ">":
                    if currentSeqHeader != "":
                        dHeader2Length[currentSeqHeader] = currentSeqLength
                        currentSeqLength = 0
                    currentSeqHeader = line[1:-1]
                    if verbose > 0:
                        print "current header: %s" % currentSeqHeader
                        sys.stdout.flush()
                else:
                    currentSeqLength += len(line.replace("\n", ""))
            dHeader2Length[currentSeqHeader] = currentSeqLength

        return dHeader2Length
    
    
    ## Convert headers from a fasta file having chunk coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        """Convert fasta headers expressed in chunk coordinates into chromosome coordinates.

        @param inFile string name of the input fasta file
        @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
        @param outFile string name of the output file
        """
        # NOTE(review): MapUtils, ConvCoord and Range are not among the imports
        # visible at the top of this file -- confirm they are imported elsewhere
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        # map of chunk name -> Map object giving its position on the chromosome
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        for line in inFileHandler:
            if line[0] == ">":
                if "{Fragment}" in line:
                    # header presumably shaped like
                    # '>name chkN {Fragment} s1..e1,s2..e2' -- verify against producer
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    # build one Range per 's..e' pair, in chunk coordinates
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    # project each chunk-range onto the chromosome
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    # rebuild the header with the chromosome name and converted coordinates
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    # header presumably shaped like
                    # '>name_chkN (strand) [start,end]' -- verify against producer
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    # coordinates are taken between the square brackets
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    # keep the parenthesized part (e.g. the strand) unchanged
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                # sequence lines are copied through unchanged
                outFileHandler.write(line)
        inFileHandler.close()
        outFileHandler.close()
        
    
    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        # Write one line per sequence: '<first word of header>\t<length>'.
        if not outFile:
            outFile = "%s.length" % inFile

        if inFile:
            with open(inFile) as inFH, open(outFile, "w") as outFH:
                iBioseq = Bioseq()
                iBioseq.read(inFH)
                while iBioseq.sequence:
                    shortHeader = iBioseq.header.split()[0]
                    outFH.write("%s\t%d\n" % (shortHeader, iBioseq.getLength()))
                    iBioseq.read(inFH)

    
    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        # One record per line:
        # '<first header word>\t<sequence>\t<full header>\t<length>'.
        if not outFile:
            outFile = "%s.seq" % inFile

        if inFile:
            with open(inFile) as inFH, open(outFile, "w") as outFH:
                iBioseq = Bioseq()
                iBioseq.read(inFH)
                while iBioseq.sequence:
                    fields = (iBioseq.header.split()[0], iBioseq.sequence,
                              iBioseq.header, iBioseq.getLength())
                    outFH.write("%s\t%s\t%s\t%d\n" % fields)
                    iBioseq.read(inFH)


    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords(genomeFile, coordFile, obsFile):
        """Splice an input fasta file using coordinates in a Map file.

        @note the coordinates should be merged beforehand!

        @param genomeFile string name of the input fasta file
        @param coordFile string name of the Map file with the regions to remove
        @param obsFile string name of the output fasta file
        """
        genomeFileHandler = open(genomeFile)
        obsFileHandler = open(obsFile, "w")
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)

        bs = Bioseq()
        bs.read(genomeFileHandler)
        while bs.sequence:
            # 'in' replaces the deprecated dict.has_key()
            if bs.header in dChr2Maps:
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
                lKeptChunks = []
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        # keep the slice between the previous region and this one
                        lKeptChunks.append(bs.sequence[currentSite : minSplice])
                    if currentSite <= iMap.getMax():
                        currentSite = iMap.getMax()
                # keep the tail after the last spliced region
                lKeptChunks.append(bs.sequence[currentSite:])
                bs.sequence = "".join(lKeptChunks)
            bs.write(obsFileHandler)
            bs.read(genomeFileHandler)

        genomeFileHandler.close()
        obsFileHandler.close()
        
    
    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle(inData, outData, verbose = 0):
        """Shuffle input sequences (single file or files in a directory).

        Relies on the external 'esl-shuffle' program ('shuffle' as a fallback)
        found in the user PATH.

        @param inData string input fasta file or directory of '*.fa' files
        @param outData string output fasta file or directory (recreated if it exists)
        @param verbose integer verbosity level
        """
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else:
            prg = "shuffle"
        genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"

        if os.path.isfile(inData):
            if verbose > 0:
                print("shuffle input file '%s'" % inData)
            cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
            # previously printed unconditionally (debug leftover); now gated
            if verbose > 1:
                print(cmd)
            returnStatus = os.system(cmd)
            if returnStatus:
                sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                sys.exit(1)

        elif os.path.isdir(inData):
            if verbose > 0:
                print("shuffle files in input directory '%s'" % inData)
            # start from a clean output directory
            if os.path.exists(outData):
                shutil.rmtree(outData)
            os.mkdir(outData)
            lInputFiles = glob.glob("%s/*.fa" % inData)
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print("%3i / %3i" % (nbFastaFiles, len(lInputFiles)))
                fastaBaseName = os.path.basename(inputFile)
                prefix = os.path.splitext(fastaBaseName)[0]
                cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
                returnStatus = os.system(cmd)
                if returnStatus:
                    sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                    sys.exit(1)
                    
    
    ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        # Map each header to its (cluster, member) ids; headers absent from the
        # cluster file are assigned a fresh singleton cluster with member 1.
        dHeader2Ids, singletonClusterId = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        parser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as outFH:
            for iSequence in parser.getIterator():
                header = iSequence.getName()
                ids = dHeader2Ids.get(header)
                if ids:
                    clusterId, memberId = ids
                else:
                    singletonClusterId += 1
                    clusterId = singletonClusterId
                    memberId = 1
                # prefix the header with the clustering information
                iSequence.setName("%sCluster%sMb%s_%s" % (clusteringTool, clusterId, memberId, header))
                outFH.write(iSequence.printFasta())
              
    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            for line in f:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
            if verbosity > 0:
                print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId
    
    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from fasta output of clustering tool.
        Warning: only works if input fasta headers are formated like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])

        # 'with' guarantees the handles are closed (consistent with the rest of
        # this file); the former unused 'numseq' counter has been dropped
        with open(fastaFileNameFromClustering) as fileDb:
            with open(outMapFileName, "w") as fileMap:
                seq = Bioseq()
                seq.read(fileDb)
                while seq.sequence:
                    # header layout (LTRharvest-like): '<ID>_<chunk> ... [<start>,<end>]'
                    firstWord = seq.header.split(' ')[0]
                    ID = firstWord.split('_')[0]
                    chunk = firstWord.split('_')[1]
                    coords = seq.header.split(' ')[-1]
                    start = coords.split(',')[0][1:]
                    end = coords.split(',')[1][:-1]
                    fileMap.write("%s\t%s\t%s\t%s\n" % (ID, chunk, start, end))
                    seq.read(fileDb)

        print("saved in %s" % outMapFileName)
        
    @staticmethod    
    def getNstretchesRangesList(fastaFileName, nbN = 2):
        """Return the list of Range objects covering stretches of at least
        'nbN' consecutive N or X nucleotides (case-insensitive) found in the
        sequences of the input fasta file.

        @param fastaFileName string name of the input fasta file
        @param nbN integer minimum stretch length to report (0 disables the search)
        @return list of Range sorted by (seqname, start, end)
        """
        lNstretchesRanges = []
        if nbN != 0:
            iBSDB = BioseqDB(fastaFileName)
            for iBS in iBSDB.db:
                nbNFound = 0   # number of N/X seen in the current candidate stretch
                start = 1      # 1-based start position of the current candidate stretch
                pos = 1        # current 1-based position in the sequence
                lastPos = 0    # position just after the last N/X consumed
                
                while pos <= iBS.getLength():
                    # a new candidate stretch starts where no N/X is pending
                    if nbNFound == 0:
                        start = pos
                        
                    # consume a run of consecutive N/X characters
                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
                        nbNFound += 1
                        pos += 1
                        lastPos = pos
                    
                    # once we are at least nbN positions past the run, close it:
                    # report it if long enough, then reset the counter
                    if pos - lastPos >= nbN:
                        if nbNFound >= nbN:
                            lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
                        nbNFound = 0
                    pos += 1
                
                # report a stretch still open at the end of the sequence
                if nbNFound >= nbN:
                    lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
    
            lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)
            
        return lNstretchesRanges
        

    @staticmethod
    def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
        # Detect stretches of at least nbN consecutive N/X and write them out,
        # either as a map file (default) or as GFF3.
        lRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)

        # any value other than gff/gff3 (case-insensitive) falls back to map
        outFormat = "gff3" if outFormat.lower() in ["gff", "gff3"] else "map"

        if outFileName == "":
            baseName = os.path.splitext(os.path.split(fastaFileName)[1])[0]
            outFileName = "%s_Nstretches.%s" % (baseName, outFormat)

        with open(outFileName, "w") as fH:
            if outFormat == "gff3":
                fH.write("##gff-version 3\n")
            for iRange in lRanges:
                seqName = iRange.getSeqname()
                start = iRange.getStart()
                end = iRange.getEnd()
                if outFormat == "gff3":
                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seqName, start, end, start, end))
                else:
                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seqName, start, end))