Mercurial > repos > urgi-team > teiso
diff TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 16:836ce3d9d47a draft default tip
Uploaded
author | urgi-team |
---|---|
date | Thu, 21 Jul 2016 07:42:47 -0400 |
parents | 255c852351c5 |
children |
line wrap: on
line diff
--- a/TEisotools-1.1.a/commons/core/seq/FastaUtils.py Thu Jul 21 07:36:44 2016 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1163 +0,0 @@ -# Copyright INRA (Institut National de la Recherche Agronomique) -# http://www.inra.fr -# http://urgi.versailles.inra.fr -# -# This software is governed by the CeCILL license under French law and -# abiding by the rules of distribution of free software. You can use, -# modify and/ or redistribute the software under the terms of the CeCILL -# license as circulated by CEA, CNRS and INRIA at the following URL -# "http://www.cecill.info". -# -# As a counterpart to the access to the source code and rights to copy, -# modify and redistribute granted by the license, users are provided only -# with a limited warranty and the software's author, the holder of the -# economic rights, and the successive licensors have only limited -# liability. -# -# In this respect, the user's attention is drawn to the risks associated -# with loading, using, modifying and/or developing or reproducing the -# software by the user in light of its specific status of free software, -# that may mean that it is complicated to manipulate, and that also -# therefore means that it is reserved for developers and experienced -# professionals having in-depth computer knowledge. Users are therefore -# encouraged to load and test the software's suitability as regards their -# requirements in conditions enabling the security of their systems and/or -# data to be ensured and, more generally, to use and operate it in the -# same conditions as regards security. -# -# The fact that you are presently reading this means that you have had -# knowledge of the CeCILL license and that you accept its terms. - - -import os -import re -import sys -import math -import glob -import string -import shutil -from commons.core.utils.FileUtils import FileUtils -from commons.core.seq.Bioseq import Bioseq -from commons.core.seq.BioseqDB import BioseqDB -from commons.core.coord.MapUtils import MapUtils -from commons.core.coord.Range import Range -from commons.core.coord.ConvCoord import ConvCoord -from commons.core.parsing.FastaParser import FastaParser -from commons.core.checker.CheckerUtils import CheckerUtils -from commons.core.launcher.LauncherUtils import LauncherUtils - - -## Static methods for fasta file manipulation -# -class FastaUtils( object ): - - ## Count the number of sequences in the input fasta file - # - # @param inFile name of the input fasta file - # - # @return integer number of sequences in the input fasta file - # - @staticmethod - def dbSize( inFile ): - nbSeq = 0 - with open(inFile) as fH: - for line in fH: - if line[0] == ">": - nbSeq += 1 - - return nbSeq - - - ## Compute the cumulative sequence length in the input fasta file - # - # @param inFile handler of the input fasta file - # - @staticmethod - def dbCumLength( inFile ): - cumLength = 0 - for line in inFile: - if line[0] != ">": - cumLength += len(string.rstrip(line)) - - return cumLength - - - ## Return a list with the length of each sequence in the input fasta file - # - # @param inFile string name of the input fasta file - # - @staticmethod - def dbLengths(inFile): - lLengths = [] - currentLength = 0 - with open(inFile) as fH: - for line in fH: - if line[0] == ">": - if currentLength != 0: - lLengths.append(currentLength) - currentLength = 0 - else: - currentLength += len(line[:-1]) - lLengths.append(currentLength) - - return lLengths - - - ## Retrieve the sequence headers present in the input fasta file - # - # @param inFile string name of the input fasta file - # @param verbose integer level of verbosity - # - # @return list of sequence headers - # - @staticmethod - def dbHeaders(inFile, verbose = 0): - lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"] - if verbose: - for header in lHeaders: - print header - - return lHeaders - - - ## Cut a data bank into chunks according to the input parameters - # If a sequence is shorter than the threshold, it is only renamed (not cut) - # - # @param inFileName string name of the input fasta file - # @param chkLgth string chunk length (in bp, default=200000) - # @param chkOver string chunk overlap (in bp, default=10000) - # @param wordN string N stretch word length (default=11, 0 for no detection) - # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') - # @param clean boolean remove 'cut' and 'Nstretch' files - # @param verbose integer (default = 0) - # - @staticmethod - def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0): - nbSeq = FastaUtils.dbSize(inFileName) - if verbose > 0: - print "cut the %i input sequences with cutterDB..." % nbSeq - sys.stdout.flush() - - prg = "cutterDB" - cmd = prg - cmd += " -l %s" % chkLgth - cmd += " -o %s" % chkOver - cmd += " -w %s" % wordN - cmd += " %s" % inFileName - returnStatus = os.system(cmd) - if returnStatus != 0: - msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus) - sys.stderr.write("%s\n" % msg) - sys.exit(1) - - nbChunks = FastaUtils.dbSize("%s_cut" % inFileName) - if verbose > 0: - print "done (%i chunks)" % ( nbChunks ) - sys.stdout.flush() - - if verbose > 0: - print "rename the headers..." - sys.stdout.flush() - - if outFilePrefix == "": - outFastaName = inFileName + "_chunks.fa" - outMapName = inFileName + "_chunks.map" - else: - outFastaName = outFilePrefix + ".fa" - outMapName = outFilePrefix + ".map" - - with open("%s_cut" % inFileName) as inFile: - outFasta = open(outFastaName, "w") - outMap = open(outMapName, "w") - - for line in inFile: - if line[0] == ">": - if verbose > 1: - print "rename '%s'" % (line[:-1]); sys.stdout.flush() - data = line[:-1].split(" ") - seqID = data[0].split(">")[1] - newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks)))) - oldHeader = data[2] - seqStart = data[4].split("..")[0] - seqEnd = data[4].split("..")[1] - outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd)) - outFasta.write(">%s\n" % newHeader) - - else: - outFasta.write(line.upper()) - - outFasta.close() - outMap.close() - - #stats on .Nstretch.map file - genomeLength = FastaUtils.dbCumLength(open(inFileName)) - NstretchMapFile = inFileName + ".Nstretch.map" - outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w") - if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile): - outNstrechStats.write("No N in stretch length > %s\n" % wordN) - else: - with open(NstretchMapFile) as f: - dHeader2lLengths = {} - for line in f: - data = line.rstrip().split() - header = data[1] - length = int(data[3]) - int(data[2]) + 1 - if header not in dHeader2lLengths: - dHeader2lLengths[header] = [] - dHeader2lLengths[header].append(length) - - for header in sorted(dHeader2lLengths): - lLengths = dHeader2lLengths[header] - outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths))) - - cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()]) - - NstretchPrct = float(cumulAllStretch)/genomeLength*100 - outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct)) - outNstrechStats.close() - - if clean == True: - os.remove(inFileName + "_cut") - os.remove(NstretchMapFile) - - - ## Split the input fasta file in several output files - # - # @param inFile string name of the input fasta file - # @param nbSeqPerBatch integer number of sequences per output file - # @param newDir boolean put the sequences in a new directory called 'batches' - # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') - # @param prefix prefix in output file name - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0): - if not os.path.exists(inFile): - msg = "ERROR: file '%s' doesn't exist" % inFile - sys.stderr.write("%s\n" % msg) - sys.exit(1) - - nbSeq = FastaUtils.dbSize(inFile) - - nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch))) - if verbose > 0: - print "save the %i input sequences into %i batches" % (nbSeq, nbBatches) - sys.stdout.flush() - - if nbSeqPerBatch != 1 and useSeqHeader: - useSeqHeader = False - - if newDir == True: - if os.path.exists("batches"): - shutil.rmtree("batches") - os.mkdir("batches") - os.chdir("batches") - os.system("ln -s ../%s ." % inFile) - - with open(inFile) as inFileHandler: - countBatch = 0 - countSeq = 0 - for line in inFileHandler: - if line[0] == ">": - countSeq += 1 - if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: - try: - outFile.close() - except: pass - countBatch += 1 - if useSeqHeader: - outFileName = "%s.fa" % (line[1:-1].replace(" ", "_")) - else: - outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches)))) - outFile = open(outFileName, "w") - - if verbose > 1: - print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName) - sys.stdout.flush() - outFile.write(line) - - if newDir: - os.remove(os.path.basename(inFile)) - os.chdir("..") - - - ## Split the input fasta file in several output files - # - # @param inFileName string name of the input fasta file - # @param maxSize integer max cumulative length for each output file - # - @staticmethod - def splitFastaFileInBatches(inFileName, maxSize = 200000): - iBioseqDB = BioseqDB(inFileName) - lHeadersSizeTuples = [] - for iBioseq in iBioseqDB.db: - lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) - - lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) - os.mkdir("batches") - os.chdir("batches") - - iterator = 0 - for lHeader in lHeadersList: - iterator += 1 - with open("batch_%s.fa" % iterator, 'w') as f: - for header in lHeader : - iBioseq = iBioseqDB.fetch(header) - iBioseq.write(f) - os.chdir("..") - - - ## Split the input fasta file in several output files according to their cluster identifier - # - # @param inFileName string name of the input fasta file - # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) - # @param simplifyHeader boolean simplify the headers - # @param createDir boolean put the sequences in different directories - # @param outPrefix string prefix of the output files (default='seqCluster') - # @param verbose integer (default = 0) - # - @staticmethod - def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0): - if not os.path.exists(inFileName): - print "ERROR: %s doesn't exist" % inFileName - sys.exit(1) - - inFile = open(inFileName) - - line = inFile.readline() - if line: - name = line.split(" ")[0] - if "Cluster" in name: - clusterID = name.split("Cluster")[1].split("Mb")[0] - seqID = name.split("Mb")[1] - else: - clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust - if "Q" in name.split("Gr")[0]: - seqID = name.split("Gr")[0].split("MbQ")[1] - elif "S" in name: - seqID = name.split("Gr")[0].split("MbS")[1] - sClusterIDs = set( [ clusterID ] ) - if simplifyHeader == True: - header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) - else: - header = line[1:-1] - if createDir == True: - if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): - os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) - os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) - outFileName = "%s%s.fa" % ( outPrefix, clusterID ) - outFile = open( outFileName, "w" ) - outFile.write( ">%s\n" % ( header ) ) - prevClusterID = clusterID - - line = inFile.readline() - while line: - if line[0] == ">": - name = line.split(" ")[0] - if "Cluster" in name: - clusterID = name.split("Cluster")[1].split("Mb")[0] - seqID = name.split("Mb")[1] - else: - clusterID = name.split("Cl")[0].split("Gr")[1] - if "Q" in name.split("Gr")[0]: - seqID = name.split("Gr")[0].split("MbQ")[1] - elif "S" in name: - seqID = name.split("Gr")[0].split("MbS")[1] - - if clusterID != prevClusterID: - outFile.close() - - if simplifyHeader == True: - header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) - else: - header = line[1:-1] - - if createDir == True: - os.chdir( ".." ) - if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): - os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) - os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) - - outFileName = "%s%s.fa" % ( outPrefix, clusterID ) - if not os.path.exists( outFileName ): - outFile = open( outFileName, "w" ) - else: - if clusterID != prevClusterID: - outFile.close() - outFile = open( outFileName, "a" ) - outFile.write( ">%s\n" % ( header ) ) - prevClusterID = clusterID - sClusterIDs.add( clusterID ) - - else: - outFile.write( line ) - - line = inFile.readline() - - outFile.close() - if verbose > 0: - print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() - - if createDir == True: - os.chdir("..") - else: - print "WARNING: empty input file - no cluster found"; sys.stdout.flush() - - - ## Filter a fasta file in two fasta files using the length of each sequence as a criterion - # - # @param len_min integer length sequence criterion to filter - # @param inFileName string name of the input fasta file - # @param verbose integer (default = 0) - # - @staticmethod - def dbLengthFilter(len_min, inFileName, verbose = 0): - file_db = open(inFileName,) - file_dbInf = open(inFileName + ".Inf" + str(len_min), "w") - file_dbSup = open(inFileName + ".Sup" + str(len_min), "w") - seq = Bioseq() - numseq = 0 - nbsaveInf = 0 - nbsaveSup = 0 - seq.read(file_db) - while seq.getHeader(): - l = seq.getLength() - numseq = numseq + 1 - if l >= len_min: - seq.write(file_dbSup) - if verbose > 0: - nbsaveSup = nbsaveSup + 1 - else: - seq.write(file_dbInf) - if verbose > 0: - nbsaveInf = nbsaveInf + 1 - seq.read(file_db) - - file_db.close() - file_dbInf.close() - file_dbSup.close() - if verbose > 0: - print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min)) - - - ## Extract the longest sequences from a fasta file - # - # @param num integer maximum number of sequences in the output file - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output fasta file - # @param minThresh integer minimum length threshold (default=0) - # @param verbose integer (default = 0) - # - @staticmethod - def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0): - bsDB = BioseqDB(inFileName) - if verbose > 0: - print "nb of input sequences: %i" % bsDB.getSize() - - if outFileName == "": - outFileName = inFileName + ".best" + str(num) - outFile = open( outFileName, "w" ) - - if bsDB.getSize()==0: - return 0 - - num = int(num) - if verbose > 0: - print "keep the %i longest sequences" % num - if minThresh > 0: - print "with length > %i bp" % minThresh - sys.stdout.flush() - - tmpLSeqLgth = bsDB.getListOfSequencesLength() - tmpLSeqLgth.sort(reverse = True) - - # select the longests - lSeqLgth = [] - for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): - if tmpLSeqLgth[i] >= minThresh: - lSeqLgth.append( tmpLSeqLgth[i] ) - if verbose > 0: - print "selected max length: %i" % max(lSeqLgth) - print "selected min length: %i" % min(lSeqLgth) - sys.stdout.flush() - - # save the longest - inFile = open( inFileName ) - seqNum = 0 - nbSave = 0 - for bs in bsDB.db: - seqNum += 1 - if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: - bs.write( outFile ) - if verbose > 1: - print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) - sys.stdout.flush() - nbSave += 1 - if nbSave == num: - break - inFile.close() - outFile.close() - if verbose > 0: - print nbSave, "saved sequences in ", outFileName - sys.stdout.flush() - - return 0 - - - ## Extract all the sequence headers from a fasta file and write them in a new file - # - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') - # - @staticmethod - def dbExtractSeqHeaders(inFileName, outFileName = ""): - if not outFileName: - outFileName = inFileName + ".headers" - - with open(outFileName, "w") as f: - for header in FastaUtils.dbHeaders(inFileName): - f.write("%s\n" % header) - - return 0 - - - ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file - # - # @param pattern regular expression to search in headers - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0): - if not pattern: - return - - if not outFileName: - outFileName = inFileName + '.extracted' - outFile = open(outFileName, 'w') - - patternTosearch = re.compile(pattern) - bioseq = Bioseq() - bioseqNb = 0 - savedBioseqNb = 0 - inFile = open(inFileName) - bioseq.read(inFile) - while bioseq.sequence: - bioseqNb = bioseqNb + 1 - m = patternTosearch.search(bioseq.header) - if m: - bioseq.write(outFile) - if verbose > 1: - print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!' - savedBioseqNb = savedBioseqNb + 1 - bioseq.read(inFile) - inFile.close() - - outFile.close() - - if verbose > 0: - print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) - - - ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file - # - # @param patternFileName string file containing regular expression to search in headers - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): - if not patternFileName: - print "ERROR: no file of pattern" - sys.exit(1) - - bioseq = Bioseq() - bioseqNb = 0 - savedBioseqNb = 0 - lHeaders = [] - - with open(inFileName) as inFile: - bioseq.read(inFile) - while bioseq.sequence: - lHeaders.append(bioseq.header) - bioseq.read(inFile) - - lHeadersToKeep = [] - with open(patternFileName) as patternFile: - for pattern in patternFile: - pattern = pattern.rstrip() - if verbose > 0: - print "pattern: ", pattern; sys.stdout.flush() - - patternToSearch = re.compile(pattern) - lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)]) - - if not outFileName: - outFileName = inFileName + ".extracted" - - with open(outFileName, "w") as outFile: - with open(inFileName) as inFile: - bioseq.read(inFile) - while bioseq.sequence: - bioseqNb += 1 - if bioseq.header in lHeadersToKeep: - bioseq.write(outFile) - savedBioseqNb += 1 - if verbose > 1: - print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() - bioseq.read(inFile) - - if verbose > 0: - print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) - - - ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file - # - # @param pattern regular expression to search in headers - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0): - if not pattern: - return - - patternToSearch = re.compile(pattern) - - if outFileName == "": - outFileName = inFileName + '.cleaned' - - with open(outFileName, 'w') as outFile: - bioseq = Bioseq() - bioseqNb = 0 - savedBioseqNb = 0 - with open(inFileName) as inFile: - bioseq.read(inFile) - while bioseq.sequence: - bioseqNb += 1 - if not patternToSearch.search(bioseq.header): - bioseq.write(outFile) - savedBioseqNb += 1 - if verbose > 1: - print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!' - bioseq.read(inFile) - - if verbose > 0: - print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) - - - ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file - # - # @param patternFileName string file containing regular expression to search in headers - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): - if not patternFileName: - print "ERROR: no file of pattern" - sys.exit(1) - - bioseq = Bioseq() - bioseqNb = 0 - savedBioseqNb = 0 - lHeaders = [] - with open(inFileName) as inFile: - bioseq.read(inFile) - while bioseq.sequence: - lHeaders.append(bioseq.header) - bioseq.read(inFile) - - with open(patternFileName) as patternFile: - lHeadersToRemove = [] - for pattern in patternFile: - pattern = pattern.rstrip() - if verbose > 0: - print "pattern: ", pattern; sys.stdout.flush() - - patternToSearch = re.compile(pattern) - lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)]) - - if not outFileName: - outFileName = inFileName + '.cleaned' - - with open(outFileName, 'w') as outFile: - bioseqNum = 0 - with open(inFileName) as inFile: - bioseq.read(inFile) - while bioseq.sequence: - bioseqNum += 1 - if bioseq.header not in lHeadersToRemove: - bioseq.write(outFile) - savedBioseqNb += 1 - if verbose > 1: - print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() - bioseq.read(inFile) - - if verbose > 0: - print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) - - - ## Find sequence's ORFs from a fasta file and write them in a tab file - # - # @param inFileName string name of the input fasta file - # @param orfMaxNb integer Select orfMaxNb best ORFs - # @param orfMinLength integer Keep ORFs with length > orfMinLength - # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') - # @param verbose integer verbosity level (default = 0) - # - @staticmethod - def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0): - if not outFileName: - outFileName = inFileName + ".ORF.map" - outFile = open(outFileName, "w") - - bioseq = Bioseq() - bioseqNb = 0 - - inFile = open(inFileName) - bioseq.read(inFile) - while bioseq.sequence: - bioseq.upCase() - bioseqNb += 1 - if verbose > 0: - print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]' - - orf = bioseq.findORF() - bestOrf = [] - for i in orf.keys(): - orfLen = len(orf[i]) - for j in xrange(1, orfLen): - start = orf[i][j - 1] + 4 - end = orf[i][j] + 3 - if end - start >= orfMinLength: - bestOrf.append((end - start, i + 1, start, end)) - - bioseq.reverseComplement() - - orf = bioseq.findORF() - seqLen = bioseq.getLength() - for i in orf.keys(): - orfLen = len(orf[i]) - for j in xrange(1, orfLen): - start = seqLen - orf[i][j - 1] - 3 - end = seqLen - orf[i][j] - 2 - if start - end >= orfMinLength: - bestOrf.append((start - end, (i + 1) * -1, start, end)) - - bestOrf.sort(reverse = True) - bestOrfNb = len(bestOrf) - if orfMaxNb != 0 and orfMaxNb < bestOrfNb: - bestOrfNb = orfMaxNb - for i in xrange(0, bestOrfNb): - if verbose > 0: - print bestOrf[i] - outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \ - "|" + str(bestOrf[i][0]), bioseq.header, - bestOrf[i][2], bestOrf[i][3])) - bioseq.read(inFile) - - inFile.close() - outFile.close() - - return 0 - - - ## Sort sequences by increasing length (write a new file) - # - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output fasta file - # @param verbose integer verbosity level - # - @staticmethod - def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0): - if verbose > 0: - print "sort sequences by increasing length" - sys.stdout.flush() - if not os.path.exists(inFileName): - print "ERROR: file '%s' doesn't exist" % (inFileName) - sys.exit(1) - - # read each seq one by one - # save them in distinct temporary files - # with their length in the name - inFileHandler = open(inFileName, "r") - countSeq = 0 - bs = Bioseq() - bs.read(inFileHandler) - while bs.header: - countSeq += 1 - tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq) - bs.appendBioseqInFile(tmpFile) - if verbose > 1: - print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile) - bs.header = "" - bs.sequence = "" - bs.read(inFileHandler) - inFileHandler.close() - - # sort temporary file names - # concatenate them into the output file - if os.path.exists(outFileName): - os.remove(outFileName) - lFiles = glob.glob("*bp_*nb") - lFiles.sort(key = lambda s:int(s.split("bp_")[0])) - for fileName in lFiles: - cmd = "cat %s >> %s" % (fileName, outFileName) - returnValue = os.system(cmd) - if returnValue != 0: - print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName) - sys.exit(1) - os.remove(fileName) - - return 0 - - - ## Sort sequences by header - # - # @param inFileName string name of the input fasta file - # @param outFileName string name of the output fasta file - # @param verbose integer verbosity level - # - @staticmethod - def sortSequencesByHeader(inFileName, outFileName = ""): - if outFileName == "": - outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] - iBioseqDB = BioseqDB(inFileName) - with open(outFileName, "w") as f: - for header in sorted(iBioseqDB.getHeaderList()): - iBioseq = iBioseqDB.fetch(header) - iBioseq.write(f) - - - ## Return a dictionary which keys are the headers and values the length of the sequences - # - # @param inFile string name of the input fasta file - # @param verbose integer verbosity level - # - @staticmethod - def getLengthPerHeader(inFile, verbose = 0): - dHeader2Length = {} - - with open(inFile) as inFileHandler: - currentSeqHeader = "" - currentSeqLength = 0 - for line in inFileHandler: - if line[0] == ">": - if currentSeqHeader != "": - dHeader2Length[currentSeqHeader] = currentSeqLength - currentSeqLength = 0 - currentSeqHeader = line[1:-1] - if verbose > 0: - print "current header: %s" % currentSeqHeader - sys.stdout.flush() - else: - currentSeqLength += len(line.replace("\n", "")) - dHeader2Length[currentSeqHeader] = currentSeqLength - - return dHeader2Length - - - ## Convert headers from a fasta file having chunk coordinates - # - # @param inFile string name of the input fasta file - # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes - # @param outFile string name of the output file - # - @staticmethod - def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): - inFileHandler = open(inFile, "r") - outFileHandler = open(outFile, "w") - dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) - iConvCoord = ConvCoord() - for line in inFileHandler: - if line[0] == ">": - if "{Fragment}" in line: - chkName = line.split(" ")[1] - chrName = dChunk2Map[chkName].seqname - lCoordPairs = line.split(" ")[3].split(",") - lRangesOnChk = [] - for i in lCoordPairs: - iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) - lRangesOnChk.append(iRange) - lRangesOnChr = [] - for iRange in lRangesOnChk: - lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) - newHeader = line[1:-1].split(" ")[0] - newHeader += " %s" % chrName - newHeader += " {Fragment}" - newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) - for iRange in lRangesOnChr[1:]: - newHeader += ",%i..%i" % (iRange.start, iRange.end) - outFileHandler.write(">%s\n" % newHeader) - else: - chkName = line.split("_")[1].split(" ")[0] - chrName = dChunk2Map[chkName].seqname - coords = line[line.find("[")+1 : line.find("]")] - start = int(coords.split(",")[0]) - end = int(coords.split(",")[1]) - iRangeOnChk = Range(chkName, start, end) - iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) - newHeader = line[1:-1].split("_")[0] - newHeader += " %s" % chrName - newHeader += " %s" % line[line.find("(") : line.find(")")+1] - newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) - outFileHandler.write(">%s\n" % newHeader) - else: - outFileHandler.write(line) - inFileHandler.close() - outFileHandler.close() - - - ## Convert a fasta file to a length file - # - # @param inFile string name of the input fasta file - # @param outFile string name of the output file - # - @staticmethod - def convertFastaToLength(inFile, outFile = ""): - if not outFile: - outFile = "%s.length" % inFile - - if inFile: - with open(inFile) as inFH: - with open(outFile, "w") as outFH: - bioseq = Bioseq() - bioseq.read(inFH) - while bioseq.sequence: - seqLen = bioseq.getLength() - outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) - bioseq.read(inFH) - - - ## Convert a fasta file to a seq file - # - # @param inFile string name of the input fasta file - # @param outFile string name of the output file - # - @staticmethod - def convertFastaToSeq(inFile, outFile = ""): - if not outFile: - outFile = "%s.seq" % inFile - - if inFile: - with open(inFile) as inFH: - with open(outFile, "w") as outFH: - bioseq = Bioseq() - bioseq.read(inFH) - while bioseq.sequence: - seqLen = bioseq.getLength() - outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ - bioseq.sequence, bioseq.header, seqLen)) - bioseq.read(inFH) - - - ## Splice an input fasta file using coordinates in a Map file - # - # @note the coordinates should be merged beforehand! - # - @staticmethod - def spliceFromCoords(genomeFile, coordFile, obsFile): - genomeFileHandler = open(genomeFile) - obsFileHandler = open(obsFile, "w") - dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile) - - bs = Bioseq() - bs.read(genomeFileHandler) - while bs.sequence: - - if dChr2Maps.has_key(bs.header): - lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header]) - splicedSeq = [] - currentSite = 0 - for iMap in lCoords: - minSplice = iMap.getMin() - 1 - if minSplice > currentSite: - splicedSeq += bs.sequence[currentSite : minSplice] - if currentSite <= iMap.getMax(): - currentSite = iMap.getMax() - splicedSeq += bs.sequence[currentSite:] - bs.sequence = "".join(splicedSeq) - bs.write(obsFileHandler) - bs.read(genomeFileHandler) - - genomeFileHandler.close() - obsFileHandler.close() - - - ## Shuffle input sequences (single file or files in a directory) - # - @staticmethod - def dbShuffle(inData, outData, verbose = 0): - if CheckerUtils.isExecutableInUserPath("esl-shuffle"): - prg = "esl-shuffle" - else : prg = "shuffle" - genericCmd = prg + " --seed 1 -d INPUT > OUTPUT" - if os.path.isfile(inData): - if verbose > 0: - print "shuffle input file '%s'" % inData - cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData) - print cmd - returnStatus = os.system(cmd) - if returnStatus: - sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) - sys.exit(1) - - elif os.path.isdir(inData): - if verbose > 0: - print "shuffle files in input directory '%s'" % inData - if os.path.exists(outData): - shutil.rmtree(outData) - os.mkdir(outData) - lInputFiles = glob.glob("%s/*.fa" % inData) - nbFastaFiles = 0 - for inputFile in lInputFiles: - nbFastaFiles += 1 - if verbose > 1: - print "%3i / %3i" % (nbFastaFiles, len(lInputFiles)) - fastaBaseName = os.path.basename(inputFile) - prefix = os.path.splitext(fastaBaseName)[0] - cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix)) - returnStatus = os.system(cmd) - if returnStatus: - sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) - sys.exit(1) - - - ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers - # - # @param inClusterFileName string input cluster file name - # @param inFastaFileName string input fasta file name - # @param outFileName string output file name - # @param verbosity integer verbosity - # - @staticmethod - def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): - dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) - iFastaParser = FastaParser(inFastaFileName) - with open(outFileName, "w") as f: - for iSequence in iFastaParser.getIterator(): - - header = iSequence.getName() - if dHeader2ClusterClusterMember.get(header): - cluster = dHeader2ClusterClusterMember[header][0] - member = dHeader2ClusterClusterMember[header][1] - else: - clusterIdForSingletonCluster += 1 - cluster = clusterIdForSingletonCluster - member = 1 - - newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) - iSequence.setName(newHeader) - f.write(iSequence.printFasta()) - - @staticmethod - def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): - dHeader2ClusterClusterMember = {} - clusterId = 0 - with open(inClusterFileName) as f: - for line in f: - lineWithoutLastChar = line.rstrip() - lHeaders = lineWithoutLastChar.split("\t") - clusterId += 1 - if verbosity > 0: - print "%i sequences in cluster %i" % (len(lHeaders), clusterId) - memberId = 0 - for header in lHeaders: - memberId += 1 - dHeader2ClusterClusterMember[header] = (clusterId, memberId) - if verbosity > 0: - print "%i clusters" % clusterId - return dHeader2ClusterClusterMember, clusterId - - @staticmethod - def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): - """ - Write a map file from fasta output of clustering tool. - Warning: only works if input fasta headers are formated like LTRharvest fasta output. - """ - if not outMapFileName: - outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) - - fileDb = open(fastaFileNameFromClustering) - fileMap = open(outMapFileName, "w") - seq = Bioseq() - numseq = 0 - seq.read(fileDb) - while seq.sequence: - numseq = numseq + 1 - ID = seq.header.split(' ')[0].split('_')[0] - chunk = seq.header.split(' ')[0].split('_')[1] - start = seq.header.split(' ')[-1].split(',')[0][1:] - end = seq.header.split(' ')[-1].split(',')[1][:-1] - line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) - fileMap.write("%s\n" % line) - seq.read(fileDb) - - fileDb.close() - fileMap.close() - print "saved in %s" % outMapFileName - - @staticmethod - def getNstretchesRangesList(fastaFileName, nbN = 2): - lNstretchesRanges = [] - if nbN != 0: - iBSDB = BioseqDB(fastaFileName) - for iBS in iBSDB.db: - nbNFound = 0 - start = 1 - pos = 1 - lastPos = 0 - - while pos <= iBS.getLength(): - if nbNFound == 0: - start = pos - - while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: - nbNFound += 1 - pos += 1 - lastPos = pos - - if pos - lastPos >= nbN: - if nbNFound >= nbN: - lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) - nbNFound = 0 - pos += 1 - - if nbNFound >= nbN: - lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) - - lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False) - - return lNstretchesRanges - - - @staticmethod - def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): - lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN) - - outFormat = outFormat.lower() - if outFormat in ["gff", "gff3"]: - outFormat = "gff3" - else: - outFormat = "map" - - if outFileName == "": - outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) - - with open(outFileName, "w") as fH: - if outFormat == "gff3": - fH.write("##gff-version 3\n") - for iRange in lNstretchesRanges: - seq = iRange.getSeqname() - start = iRange.getStart() - end = iRange.getEnd() - if outFormat == "gff3": - fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) - else: - fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end)) - \ No newline at end of file