Mercurial > repos > urgi-team > teiso

diff TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 16:836ce3d9d47a draft default tip
Uploaded
author: urgi-team
date: Thu, 21 Jul 2016 07:42:47 -0400
parents: 255c852351c5
--- a/TEisotools-1.1.a/commons/core/seq/FastaUtils.py	Thu Jul 21 07:36:44 2016 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1163 +0,0 @@
-# Copyright INRA (Institut National de la Recherche Agronomique)
-# http://www.inra.fr
-# http://urgi.versailles.inra.fr
-#
-# This software is governed by the CeCILL license under French law and
-# abiding by the rules of distribution of free software.  You can  use, 
-# modify and/ or redistribute the software under the terms of the CeCILL
-# license as circulated by CEA, CNRS and INRIA at the following URL
-# "http://www.cecill.info". 
-#
-# As a counterpart to the access to the source code and  rights to copy,
-# modify and redistribute granted by the license, users are provided only
-# with a limited warranty  and the software's author,  the holder of the
-# economic rights,  and the successive licensors  have only  limited
-# liability. 
-#
-# In this respect, the user's attention is drawn to the risks associated
-# with loading,  using,  modifying and/or developing or reproducing the
-# software by the user in light of its specific status of free software,
-# that may mean  that it is complicated to manipulate,  and  that  also
-# therefore means  that it is reserved for developers  and  experienced
-# professionals having in-depth computer knowledge. Users are therefore
-# encouraged to load and test the software's suitability as regards their
-# requirements in conditions enabling the security of their systems and/or 
-# data to be ensured and,  more generally, to use and operate it in the 
-# same conditions as regards security. 
-#
-# The fact that you are presently reading this means that you have had
-# knowledge of the CeCILL license and that you accept its terms.
-
-
-import os
-import re
-import sys
-import math
-import glob
-import string
-import shutil
-from commons.core.utils.FileUtils import FileUtils
-from commons.core.seq.Bioseq import Bioseq
-from commons.core.seq.BioseqDB import BioseqDB
-from commons.core.coord.MapUtils import MapUtils
-from commons.core.coord.Range import Range
-from commons.core.coord.ConvCoord import ConvCoord
-from commons.core.parsing.FastaParser import FastaParser
-from commons.core.checker.CheckerUtils import CheckerUtils
-from commons.core.launcher.LauncherUtils import LauncherUtils
-
-
-## Static methods for fasta file manipulation
-#
-class FastaUtils( object ):
-    
-    ## Count the number of sequences in the input fasta file
-    #
-    # @param inFile name of the input fasta file
-    #
-    # @return integer number of sequences in the input fasta file
-    #
-    @staticmethod
-    def dbSize( inFile ):
-        nbSeq = 0
-        with open(inFile) as fH:
-            for line in fH:
-                if line[0] == ">":
-                    nbSeq += 1
-
-        return nbSeq
-    
-    
-    ## Compute the cumulative sequence length in the input fasta file
-    #
-    # @param inFile handler of the input fasta file
-    #
-    @staticmethod
-    def dbCumLength( inFile ):
-        cumLength = 0
-        for line in inFile:
-            if line[0] != ">":
-                cumLength += len(string.rstrip(line))
-    
-        return cumLength
-    
-    
-    ## Return a list with the length of each sequence in the input fasta file
-    #
-    # @param inFile string name of the input fasta file
-    #
-    @staticmethod
-    def dbLengths(inFile):
-        lLengths = []
-        currentLength = 0
-        with open(inFile) as fH:
-            for line in fH:
-                if line[0] == ">":
-                    if currentLength != 0:
-                        lLengths.append(currentLength)
-                    currentLength = 0
-                else:
-                    currentLength += len(line[:-1])
-            lLengths.append(currentLength)
-        
-        return lLengths
-        
-    
-    ## Retrieve the sequence headers present in the input fasta file
-    #
-    # @param inFile string name of the input fasta file
-    # @param verbose integer level of verbosity
-    #
-    # @return list of sequence headers
-    #
-    @staticmethod
-    def dbHeaders(inFile, verbose = 0):
-        lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
-        if verbose:
-            for header in lHeaders:
-                print header
-
-        return lHeaders
-    
-    
-    ## Cut a data bank into chunks according to the input parameters
-    # If a sequence is shorter than the threshold, it is only renamed (not cut)
-    #
-    # @param inFileName string name of the input fasta file
-    # @param chkLgth string chunk length (in bp, default=200000)
-    # @param chkOver string chunk overlap (in bp, default=10000)
-    # @param wordN string N stretch word length (default=11, 0 for no detection)
-    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
-    # @param clean boolean remove 'cut' and 'Nstretch' files
-    # @param verbose integer (default = 0)
-    #
-    @staticmethod
-    def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
-        nbSeq = FastaUtils.dbSize(inFileName)
-        if verbose > 0:
-            print "cut the %i input sequences with cutterDB..." % nbSeq
-            sys.stdout.flush()
-            
-        prg = "cutterDB"
-        cmd = prg
-        cmd += " -l %s" % chkLgth
-        cmd += " -o %s" % chkOver
-        cmd += " -w %s" % wordN
-        cmd += " %s" % inFileName
-        returnStatus = os.system(cmd)
-        if returnStatus != 0:
-            msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
-            sys.stderr.write("%s\n" % msg)
-            sys.exit(1)
-            
-        nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
-        if verbose > 0:
-            print "done (%i chunks)" % ( nbChunks )
-            sys.stdout.flush()
-            
-        if verbose > 0:
-            print "rename the headers..."
-            sys.stdout.flush()
-            
-        if outFilePrefix == "":
-            outFastaName = inFileName + "_chunks.fa"
-            outMapName = inFileName + "_chunks.map"
-        else:
-            outFastaName = outFilePrefix + ".fa"
-            outMapName = outFilePrefix + ".map"
-            
-        with open("%s_cut" % inFileName) as inFile:
-            outFasta = open(outFastaName, "w")
-            outMap = open(outMapName, "w")
-
-            for line in inFile:
-                if line[0] == ">":
-                    if verbose > 1:
-                        print "rename '%s'" % (line[:-1]); sys.stdout.flush()
-                    data = line[:-1].split(" ")
-                    seqID = data[0].split(">")[1]
-                    newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
-                    oldHeader = data[2]
-                    seqStart = data[4].split("..")[0]
-                    seqEnd = data[4].split("..")[1]
-                    outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
-                    outFasta.write(">%s\n" % newHeader)
-
-                else:
-                    outFasta.write(line.upper())
-            
-            outFasta.close()
-            outMap.close()
-
-        #stats on .Nstretch.map file
-        genomeLength = FastaUtils.dbCumLength(open(inFileName))
-        NstretchMapFile = inFileName + ".Nstretch.map"
-        outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w")
-        if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
-                outNstrechStats.write("No N in stretch length > %s\n" % wordN)
-        else:
-            with open(NstretchMapFile) as f:
-                dHeader2lLengths = {}
-                for line in f:
-                    data = line.rstrip().split()
-                    header = data[1]
-                    length = int(data[3]) - int(data[2]) + 1
-                    if header not in dHeader2lLengths:
-                        dHeader2lLengths[header] = []
-                    dHeader2lLengths[header].append(length)
-
-            for header in sorted(dHeader2lLengths):
-                lLengths = dHeader2lLengths[header]
-                outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))
-
-            cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])
-
-            NstretchPrct = float(cumulAllStretch)/genomeLength*100
-            outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
-        outNstrechStats.close()       
-
-        if clean == True:
-            os.remove(inFileName + "_cut")
-            os.remove(NstretchMapFile)
-            
-    
-    ## Split the input fasta file in several output files
-    #
-    # @param inFile string name of the input fasta file
-    # @param nbSeqPerBatch integer number of sequences per output file
-    # @param newDir boolean put the sequences in a new directory called 'batches'
-    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
-    # @param prefix prefix in output file name 
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
-        if not os.path.exists(inFile):
-            msg = "ERROR: file '%s' doesn't exist" % inFile
-            sys.stderr.write("%s\n" % msg)
-            sys.exit(1)
-            
-        nbSeq = FastaUtils.dbSize(inFile)
-        
-        nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
-        if verbose > 0:
-            print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
-            sys.stdout.flush()
-            
-        if nbSeqPerBatch != 1 and useSeqHeader:
-            useSeqHeader = False
-            
-        if newDir == True:
-            if os.path.exists("batches"):
-                shutil.rmtree("batches")
-            os.mkdir("batches")
-            os.chdir("batches")
-            os.system("ln -s ../%s ." % inFile)
-            
-        with open(inFile) as inFileHandler:
-            countBatch = 0
-            countSeq = 0
-            for line in inFileHandler:
-                if line[0] == ">":
-                    countSeq += 1
-                    if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
-                        try:
-                            outFile.close()
-                        except: pass
-                        countBatch += 1
-                        if useSeqHeader:
-                            outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
-                        else:
-                            outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
-                        outFile = open(outFileName, "w")
-
-                    if verbose > 1:
-                        print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
-                        sys.stdout.flush()
-                outFile.write(line)
-        
-        if newDir:
-            os.remove(os.path.basename(inFile))
-            os.chdir("..")
-            
-    
-    ## Split the input fasta file in several output files
-    #
-    # @param inFileName string name of the input fasta file
-    # @param maxSize integer max cumulative length for each output file
-    #
-    @staticmethod
-    def splitFastaFileInBatches(inFileName, maxSize = 200000):
-        iBioseqDB = BioseqDB(inFileName)
-        lHeadersSizeTuples = []
-        for iBioseq in iBioseqDB.db:
-            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
-            
-        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
-        os.mkdir("batches")
-        os.chdir("batches")
-        
-        iterator = 0 
-        for lHeader in lHeadersList:
-            iterator += 1
-            with open("batch_%s.fa" % iterator, 'w') as f:
-                for header in lHeader :
-                    iBioseq = iBioseqDB.fetch(header)
-                    iBioseq.write(f)
-        os.chdir("..")
-        
-                    
-    ## Split the input fasta file in several output files according to their cluster identifier
-    #
-    # @param inFileName string  name of the input fasta file
-    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
-    # @param simplifyHeader boolean simplify the headers
-    # @param createDir boolean put the sequences in different directories
-    # @param outPrefix string prefix of the output files (default='seqCluster')
-    # @param verbose integer (default = 0)
-    #
-    @staticmethod
-    def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
-        if not os.path.exists(inFileName):
-            print "ERROR: %s doesn't exist" % inFileName
-            sys.exit(1)
-    
-        inFile = open(inFileName)
-    
-        line = inFile.readline()
-        if line:
-            name = line.split(" ")[0]
-            if "Cluster" in name:
-                clusterID = name.split("Cluster")[1].split("Mb")[0]
-                seqID = name.split("Mb")[1]
-            else:
-                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
-                if "Q" in name.split("Gr")[0]:
-                    seqID = name.split("Gr")[0].split("MbQ")[1]
-                elif "S" in name:
-                    seqID = name.split("Gr")[0].split("MbS")[1]
-            sClusterIDs = set( [ clusterID ] )
-            if simplifyHeader == True:
-                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
-            else:
-                header = line[1:-1]
-            if createDir == True:
-                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
-                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
-                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
-            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
-            outFile = open( outFileName, "w" )
-            outFile.write( ">%s\n" % ( header ) )
-            prevClusterID = clusterID
-        
-            line = inFile.readline()
-            while line:
-                if line[0] == ">":
-                    name = line.split(" ")[0]
-                    if "Cluster" in name:
-                        clusterID = name.split("Cluster")[1].split("Mb")[0]
-                        seqID = name.split("Mb")[1]
-                    else:
-                        clusterID = name.split("Cl")[0].split("Gr")[1]
-                        if "Q" in name.split("Gr")[0]:
-                            seqID = name.split("Gr")[0].split("MbQ")[1]
-                        elif "S" in name:
-                            seqID = name.split("Gr")[0].split("MbS")[1]
-        
-                    if clusterID != prevClusterID:
-                        outFile.close()
-        
-                    if simplifyHeader == True:
-                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
-                    else:
-                        header = line[1:-1]
-        
-                    if createDir == True:
-                        os.chdir( ".." )
-                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
-                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
-                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
-        
-                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
-                    if not os.path.exists( outFileName ):
-                        outFile = open( outFileName, "w" )
-                    else:
-                        if clusterID != prevClusterID:
-                            outFile.close()
-                            outFile = open( outFileName, "a" )
-                    outFile.write( ">%s\n" % ( header ) )
-                    prevClusterID = clusterID
-                    sClusterIDs.add( clusterID )
-        
-                else:
-                    outFile.write( line )
-        
-                line = inFile.readline()
-        
-            outFile.close()
-            if verbose > 0:
-                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
-                
-            if createDir == True:
-                os.chdir("..")
-        else:
-            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
-                
-
-    ## Filter a fasta file in two fasta files using the length of each sequence as a criterion
-    #
-    # @param len_min integer    length sequence criterion to filter
-    # @param inFileName string  name of the input fasta file
-    # @param verbose integer (default = 0)      
-    #
-    @staticmethod
-    def dbLengthFilter(len_min, inFileName, verbose = 0):
-        file_db = open(inFileName,)
-        file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
-        file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
-        seq = Bioseq()
-        numseq = 0
-        nbsaveInf = 0
-        nbsaveSup = 0
-        seq.read(file_db)
-        while seq.getHeader():
-            l = seq.getLength()
-            numseq = numseq + 1
-            if l >= len_min:
-                seq.write(file_dbSup)
-                if verbose > 0:
-                        nbsaveSup = nbsaveSup + 1
-            else:
-                seq.write(file_dbInf)
-                if verbose > 0:
-                        nbsaveInf = nbsaveInf + 1
-            seq.read(file_db)
-
-        file_db.close()
-        file_dbInf.close()
-        file_dbSup.close()
-        if verbose > 0:
-            print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))
-
-
-    ## Extract the longest sequences from a fasta file
-    #
-    # @param num integer maximum number of sequences in the output file
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output fasta file
-    # @param minThresh integer minimum length threshold (default=0)
-    # @param verbose integer (default = 0)
-    #
-    @staticmethod
-    def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
-        bsDB = BioseqDB(inFileName)
-        if verbose > 0:
-            print "nb of input sequences: %i" % bsDB.getSize()
-    
-        if outFileName == "":
-            outFileName = inFileName + ".best" + str(num)
-        outFile = open( outFileName, "w" )
-        
-        if bsDB.getSize()==0:
-            return 0
-        
-        num = int(num)
-        if verbose > 0:
-            print "keep the %i longest sequences" % num
-            if minThresh > 0:
-                print "with length > %i bp" % minThresh
-            sys.stdout.flush()
-            
-        tmpLSeqLgth = bsDB.getListOfSequencesLength()
-        tmpLSeqLgth.sort(reverse = True)
-    
-        # select the longests
-        lSeqLgth = []
-        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
-            if tmpLSeqLgth[i] >= minThresh:
-                lSeqLgth.append( tmpLSeqLgth[i] )
-        if verbose > 0:
-            print "selected max length: %i" % max(lSeqLgth)
-            print "selected min length: %i" % min(lSeqLgth)
-            sys.stdout.flush()
-    
-        # save the longest
-        inFile = open( inFileName )
-        seqNum = 0
-        nbSave = 0
-        for bs in bsDB.db:
-            seqNum += 1
-            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
-                bs.write( outFile )
-                if verbose > 1:
-                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
-                    sys.stdout.flush()
-                nbSave += 1
-            if nbSave == num:
-                break
-        inFile.close()
-        outFile.close()
-        if verbose > 0:
-            print nbSave, "saved sequences in ", outFileName
-            sys.stdout.flush()
-            
-        return 0
-    
-    
-    ## Extract all the sequence headers from a fasta file and write them in a new file
-    #
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
-    #
-    @staticmethod
-    def dbExtractSeqHeaders(inFileName, outFileName = ""):
-        if not outFileName:
-            outFileName = inFileName + ".headers"
-
-        with open(outFileName, "w") as f:
-            for header in FastaUtils.dbHeaders(inFileName):
-                f.write("%s\n" % header)
-
-        return 0
-
-    
-    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
-    #
-    # @param pattern regular expression to search in headers
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
-        if not pattern:
-            return
-
-        if not outFileName:
-            outFileName = inFileName + '.extracted'
-        outFile = open(outFileName, 'w')
-
-        patternTosearch = re.compile(pattern)
-        bioseq = Bioseq()
-        bioseqNb = 0
-        savedBioseqNb = 0
-        inFile = open(inFileName)
-        bioseq.read(inFile)
-        while bioseq.sequence:
-            bioseqNb = bioseqNb + 1
-            m = patternTosearch.search(bioseq.header)
-            if m:
-                bioseq.write(outFile)
-                if verbose > 1:
-                    print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
-                savedBioseqNb = savedBioseqNb + 1
-            bioseq.read(inFile)
-        inFile.close()
-
-        outFile.close()
-
-        if verbose > 0:
-            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
-            
-    
-    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
-    #
-    # @param patternFileName string file containing regular expression to search in headers
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
-        if not patternFileName:
-            print "ERROR: no file of pattern"
-            sys.exit(1)
-
-        bioseq = Bioseq()
-        bioseqNb = 0
-        savedBioseqNb = 0
-        lHeaders = []
-
-        with open(inFileName) as inFile:
-            bioseq.read(inFile)
-            while bioseq.sequence:
-                lHeaders.append(bioseq.header)
-                bioseq.read(inFile)
-
-        lHeadersToKeep = []
-        with open(patternFileName) as patternFile:
-            for pattern in patternFile:
-                pattern = pattern.rstrip()
-                if verbose > 0:
-                    print "pattern: ", pattern; sys.stdout.flush()
-
-                patternToSearch = re.compile(pattern)
-                lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])
-
-        if not outFileName:
-            outFileName = inFileName + ".extracted"
-
-        with open(outFileName, "w") as outFile:
-            with open(inFileName) as inFile:
-                bioseq.read(inFile)
-                while bioseq.sequence:
-                    bioseqNb += 1
-                    if bioseq.header in lHeadersToKeep:
-                        bioseq.write(outFile)
-                        savedBioseqNb += 1
-                        if verbose > 1:
-                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
-                    bioseq.read(inFile)
-
-        if verbose > 0:
-            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
-            
-    
-    ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
-    #
-    # @param pattern regular expression to search in headers
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
-        if not pattern:
-            return
-
-        patternToSearch = re.compile(pattern)
-
-        if outFileName == "":
-            outFileName = inFileName + '.cleaned'
-
-        with open(outFileName, 'w') as outFile:
-            bioseq = Bioseq()
-            bioseqNb = 0
-            savedBioseqNb = 0
-            with open(inFileName) as inFile:
-                bioseq.read(inFile)
-                while bioseq.sequence:
-                    bioseqNb += 1
-                    if not patternToSearch.search(bioseq.header):
-                        bioseq.write(outFile)
-                        savedBioseqNb += 1
-                        if verbose > 1:
-                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
-                    bioseq.read(inFile)
-
-        if verbose > 0:
-            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
-            
-    
-    ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
-    #
-    # @param patternFileName string file containing regular expression to search in headers
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
-        if not patternFileName:
-            print "ERROR: no file of pattern"
-            sys.exit(1)
-
-        bioseq = Bioseq()
-        bioseqNb = 0
-        savedBioseqNb = 0
-        lHeaders = []
-        with open(inFileName) as inFile:
-            bioseq.read(inFile)
-            while bioseq.sequence:
-                lHeaders.append(bioseq.header)
-                bioseq.read(inFile)
-
-        with open(patternFileName) as patternFile:
-            lHeadersToRemove = []
-            for pattern in patternFile:
-                pattern = pattern.rstrip()
-                if verbose > 0:
-                    print "pattern: ", pattern; sys.stdout.flush()
-
-                patternToSearch = re.compile(pattern)
-                lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])
-
-        if not outFileName:
-            outFileName = inFileName + '.cleaned'
-        
-        with open(outFileName, 'w') as outFile:
-            bioseqNum = 0
-            with open(inFileName) as inFile:
-                bioseq.read(inFile)
-                while bioseq.sequence:
-                    bioseqNum += 1
-                    if bioseq.header not in lHeadersToRemove:
-                        bioseq.write(outFile)
-                        savedBioseqNb += 1
-                        if verbose > 1:
-                            print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
-                    bioseq.read(inFile)
-
-        if verbose > 0:
-            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
-            
-    
-    ## Find sequence's ORFs from a fasta file and write them in a tab file 
-    # 
-    # @param inFileName string name of the input fasta file
-    # @param orfMaxNb integer Select orfMaxNb best ORFs
-    # @param orfMinLength integer Keep ORFs with length > orfMinLength 
-    # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
-    # @param verbose integer verbosity level (default = 0)
-    #
-    @staticmethod
-    def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
-        if not outFileName:
-            outFileName = inFileName + ".ORF.map"
-        outFile = open(outFileName, "w")
-
-        bioseq = Bioseq()
-        bioseqNb = 0
-
-        inFile = open(inFileName)
-        bioseq.read(inFile)
-        while bioseq.sequence:
-            bioseq.upCase()
-            bioseqNb += 1
-            if verbose > 0:
-                print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'
-
-            orf = bioseq.findORF()
-            bestOrf = []
-            for i in orf.keys():
-                orfLen = len(orf[i])
-                for j in xrange(1, orfLen):
-                    start = orf[i][j - 1] + 4
-                    end = orf[i][j] + 3
-                    if end - start >= orfMinLength:
-                        bestOrf.append((end - start, i + 1, start, end))
-
-            bioseq.reverseComplement()
-
-            orf = bioseq.findORF()
-            seqLen = bioseq.getLength()
-            for i in orf.keys():
-                orfLen = len(orf[i])
-                for j in xrange(1, orfLen):
-                    start = seqLen - orf[i][j - 1] - 3
-                    end = seqLen - orf[i][j] - 2
-                    if start - end >= orfMinLength:
-                        bestOrf.append((start - end, (i + 1) * -1, start, end))
-
-            bestOrf.sort(reverse = True)
-            bestOrfNb = len(bestOrf)
-            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
-                bestOrfNb = orfMaxNb
-            for i in xrange(0, bestOrfNb):
-                if verbose > 0:
-                    print bestOrf[i]
-                outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
-                                   "|" + str(bestOrf[i][0]), bioseq.header,
-                                   bestOrf[i][2], bestOrf[i][3]))
-            bioseq.read(inFile)
-
-        inFile.close()
-        outFile.close()
-
-        return 0
-
-
-    ## Sort sequences by increasing length (write a new file)
-    #
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output fasta file
-    # @param verbose integer verbosity level
-    #
-    @staticmethod
-    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
-        if verbose > 0:
-            print "sort sequences by increasing length"
-            sys.stdout.flush()
-        if not os.path.exists(inFileName):
-            print "ERROR: file '%s' doesn't exist" % (inFileName)
-            sys.exit(1)
-
-        # read each seq one by one
-        # save them in distinct temporary files
-        # with their length in the name
-        inFileHandler = open(inFileName, "r")
-        countSeq = 0
-        bs = Bioseq()
-        bs.read(inFileHandler)
-        while bs.header:
-            countSeq += 1
-            tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
-            bs.appendBioseqInFile(tmpFile)
-            if verbose > 1:
-                print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
-            bs.header = ""
-            bs.sequence = ""
-            bs.read(inFileHandler)
-        inFileHandler.close()
-
-        # sort temporary file names
-        # concatenate them into the output file
-        if os.path.exists(outFileName):
-            os.remove(outFileName)
-        lFiles = glob.glob("*bp_*nb")
-        lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
-        for fileName in lFiles:
-            cmd = "cat %s >> %s" % (fileName, outFileName)
-            returnValue = os.system(cmd)
-            if returnValue != 0:
-                print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
-                sys.exit(1)
-            os.remove(fileName)
-
-        return 0
-
-    
-    ## Sort sequences by header
-    #
-    # @param inFileName string name of the input fasta file
-    # @param outFileName string name of the output fasta file
-    # @param verbose integer verbosity level   
-    #
-    @staticmethod
-    def sortSequencesByHeader(inFileName, outFileName = ""):
-        if outFileName == "":
-            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
-        iBioseqDB = BioseqDB(inFileName)
-        with open(outFileName, "w") as f:
-            for header in sorted(iBioseqDB.getHeaderList()):
-                iBioseq = iBioseqDB.fetch(header)
-                iBioseq.write(f)
-
-    
-    ## Return a dictionary which keys are the headers and values the length of the sequences
-    #
-    # @param inFile string name of the input fasta file
-    # @param verbose integer verbosity level   
-    #
-    @staticmethod
-    def getLengthPerHeader(inFile, verbose = 0):
-        dHeader2Length = {}
-
-        with open(inFile) as inFileHandler:
-            currentSeqHeader = ""
-            currentSeqLength = 0
-            for line in inFileHandler:
-                if line[0] == ">":
-                    if currentSeqHeader != "":
-                        dHeader2Length[currentSeqHeader] = currentSeqLength
-                        currentSeqLength = 0
-                    currentSeqHeader = line[1:-1]
-                    if verbose > 0:
-                        print "current header: %s" % currentSeqHeader
-                        sys.stdout.flush()
-                else:
-                    currentSeqLength += len(line.replace("\n", ""))
-            dHeader2Length[currentSeqHeader] = currentSeqLength
-
-        return dHeader2Length
-    
-    
-    ## Convert headers from a fasta file having chunk coordinates
-    #
-    # @param inFile string name of the input fasta file
-    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
-    # @param outFile string name of the output file
-    #
-    @staticmethod
-    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
-        inFileHandler = open(inFile, "r")
-        outFileHandler = open(outFile, "w")
-        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
-        iConvCoord = ConvCoord()
-        for line in inFileHandler:
-            if line[0] == ">":
-                if "{Fragment}" in line:
-                    chkName = line.split(" ")[1]
-                    chrName = dChunk2Map[chkName].seqname
-                    lCoordPairs = line.split(" ")[3].split(",")
-                    lRangesOnChk = []
-                    for i in lCoordPairs:
-                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
-                        lRangesOnChk.append(iRange)
-                    lRangesOnChr = []
-                    for iRange in lRangesOnChk:
-                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
-                    newHeader = line[1:-1].split(" ")[0]
-                    newHeader += " %s" % chrName
-                    newHeader += " {Fragment}"
-                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
-                    for iRange in lRangesOnChr[1:]:
-                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
-                    outFileHandler.write(">%s\n" % newHeader)
-                else:
-                    chkName = line.split("_")[1].split(" ")[0]
-                    chrName = dChunk2Map[chkName].seqname
-                    coords = line[line.find("[")+1 : line.find("]")]
-                    start = int(coords.split(",")[0])
-                    end = int(coords.split(",")[1])
-                    iRangeOnChk = Range(chkName, start, end)
-                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
-                    newHeader = line[1:-1].split("_")[0]
-                    newHeader += " %s" % chrName
-                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
-                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
-                    outFileHandler.write(">%s\n" % newHeader)
-            else:
-                outFileHandler.write(line)
-        inFileHandler.close()
-        outFileHandler.close()
-        
-    
-    ## Convert a fasta file to a length file
-    #
-    # @param inFile string name of the input fasta file
-    # @param outFile string name of the output file
-    #
-    @staticmethod
-    def convertFastaToLength(inFile, outFile = ""):
-        if not outFile:
-            outFile = "%s.length" % inFile
-        
-        if inFile:
-            with open(inFile) as inFH:
-                with open(outFile, "w") as outFH:
-                    bioseq = Bioseq()
-                    bioseq.read(inFH)
-                    while bioseq.sequence:
-                        seqLen = bioseq.getLength()
-                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
-                        bioseq.read(inFH)
-
-    
-    ## Convert a fasta file to a seq file
-    #
-    # @param inFile string name of the input fasta file
-    # @param outFile string name of the output file
-    #
-    @staticmethod
-    def convertFastaToSeq(inFile, outFile = ""):
-        if not outFile:
-            outFile = "%s.seq" % inFile
-        
-        if inFile:
-            with open(inFile) as inFH:
-                with open(outFile, "w") as outFH:
-                    bioseq = Bioseq()
-                    bioseq.read(inFH)
-                    while bioseq.sequence:
-                        seqLen = bioseq.getLength()
-                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
-                                                bioseq.sequence, bioseq.header, seqLen))
-                        bioseq.read(inFH)
-
-
-    ## Splice an input fasta file using coordinates in a Map file
-    #
-    # @note the coordinates should be merged beforehand!
-    #
-    @staticmethod
-    def spliceFromCoords(genomeFile, coordFile, obsFile):
-        genomeFileHandler = open(genomeFile)
-        obsFileHandler = open(obsFile, "w")
-        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)
-
-        bs = Bioseq()
-        bs.read(genomeFileHandler)
-        while bs.sequence:
-
-            if dChr2Maps.has_key(bs.header):
-                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
-                splicedSeq = []
-                currentSite = 0
-                for iMap in lCoords:
-                    minSplice = iMap.getMin() - 1
-                    if minSplice > currentSite:
-                        splicedSeq += bs.sequence[currentSite : minSplice]
-                    if currentSite <= iMap.getMax():
-                        currentSite = iMap.getMax()
-                splicedSeq += bs.sequence[currentSite:]
-                bs.sequence = "".join(splicedSeq)
-            bs.write(obsFileHandler)
-            bs.read(genomeFileHandler)
-
-        genomeFileHandler.close()
-        obsFileHandler.close()
-        
-    
-    ## Shuffle input sequences (single file or files in a directory)
-    #
-    @staticmethod
-    def dbShuffle(inData, outData, verbose = 0):
-        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
-            prg = "esl-shuffle"
-        else : prg = "shuffle"
-        genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
-        if os.path.isfile(inData):
-            if verbose > 0:
-                print "shuffle input file '%s'" % inData
-            cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
-            print cmd
-            returnStatus = os.system(cmd)
-            if returnStatus:
-                sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
-                sys.exit(1)
-
-        elif os.path.isdir(inData):
-            if verbose > 0:
-                print "shuffle files in input directory '%s'" % inData
-            if os.path.exists(outData):
-                shutil.rmtree(outData)
-            os.mkdir(outData)
-            lInputFiles = glob.glob("%s/*.fa" % inData)
-            nbFastaFiles = 0
-            for inputFile in lInputFiles:
-                nbFastaFiles += 1
-                if verbose > 1:
-                    print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
-                fastaBaseName = os.path.basename(inputFile)
-                prefix = os.path.splitext(fastaBaseName)[0]
-                cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
-                returnStatus = os.system(cmd)
-                if returnStatus:
-                    sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
-                    sys.exit(1)
-                    
-    
-    ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
-    #
-    # @param inClusterFileName string input cluster file name
-    # @param inFastaFileName string input fasta file name
-    # @param outFileName string output file name
-    # @param verbosity integer verbosity
-    #
-    @staticmethod
-    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
-        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
-        iFastaParser = FastaParser(inFastaFileName)
-        with open(outFileName, "w") as f:
-            for iSequence in iFastaParser.getIterator():
-                
-                header = iSequence.getName()
-                if dHeader2ClusterClusterMember.get(header):
-                    cluster = dHeader2ClusterClusterMember[header][0]
-                    member = dHeader2ClusterClusterMember[header][1]
-                else:
-                    clusterIdForSingletonCluster +=  1
-                    cluster = clusterIdForSingletonCluster
-                    member = 1
-                
-                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
-                iSequence.setName(newHeader)
-                f.write(iSequence.printFasta())
-              
-    @staticmethod
-    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
-        dHeader2ClusterClusterMember = {}
-        clusterId = 0
-        with open(inClusterFileName) as f:
-            for line in f:
-                lineWithoutLastChar = line.rstrip()
-                lHeaders = lineWithoutLastChar.split("\t")
-                clusterId += 1
-                if verbosity > 0:
-                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
-                memberId = 0
-                for header in lHeaders:
-                    memberId += 1
-                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
-            if verbosity > 0:
-                print "%i clusters" % clusterId
-        return dHeader2ClusterClusterMember, clusterId
-    
-    @staticmethod
-    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
-        """
-        Write a map file from fasta output of clustering tool.
-        Warning: only works if input fasta headers are formated like LTRharvest fasta output.
-        """
-        if not outMapFileName:
-            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
-        
-        fileDb = open(fastaFileNameFromClustering)
-        fileMap = open(outMapFileName, "w")
-        seq = Bioseq()
-        numseq = 0
-        seq.read(fileDb)
-        while seq.sequence:
-            numseq = numseq + 1
-            ID = seq.header.split(' ')[0].split('_')[0]
-            chunk = seq.header.split(' ')[0].split('_')[1]
-            start = seq.header.split(' ')[-1].split(',')[0][1:]
-            end = seq.header.split(' ')[-1].split(',')[1][:-1]
-            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
-            fileMap.write("%s\n" % line)
-            seq.read(fileDb)
-    
-        fileDb.close()
-        fileMap.close()
-        print "saved in %s" % outMapFileName
-        
-    @staticmethod    
-    def getNstretchesRangesList(fastaFileName, nbN = 2):
-        lNstretchesRanges = []
-        if nbN != 0:
-            iBSDB = BioseqDB(fastaFileName)
-            for iBS in iBSDB.db:
-                nbNFound = 0
-                start = 1
-                pos = 1
-                lastPos = 0
-                
-                while pos <= iBS.getLength():
-                    if nbNFound == 0:
-                        start = pos
-                        
-                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
-                        nbNFound += 1
-                        pos += 1
-                        lastPos = pos
-                    
-                    if pos - lastPos >= nbN:
-                        if nbNFound >= nbN:
-                            lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
-                        nbNFound = 0
-                    pos += 1
-                
-                if nbNFound >= nbN:
-                    lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
-    
-            lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)
-            
-        return lNstretchesRanges
-        
-
-    @staticmethod
-    def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
-        lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)
-        
-        outFormat = outFormat.lower()
-        if outFormat in ["gff", "gff3"]:
-            outFormat = "gff3"
-        else:
-            outFormat = "map"
-            
-        if outFileName == "":
-            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
-        
-        with open(outFileName, "w") as fH:
-            if outFormat == "gff3":
-                fH.write("##gff-version 3\n")
-            for iRange in lNstretchesRanges:
-                seq = iRange.getSeqname()
-                start = iRange.getStart()
-                end = iRange.getEnd()
-                if outFormat == "gff3":
-                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
-                else:
-                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
-                
\ No newline at end of file
author	urgi-team
date	Thu, 21 Jul 2016 07:42:47 -0400
parents	255c852351c5
children