teiso: TEisotools-1.1.a/commons/core/seq/FastaUtils.py comparison

comparison TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 13:feef9a0db09d draft

Uploaded

author	urgi-team
date	Wed, 20 Jul 2016 09:04:42 -0400
parents
children

comparison

equal deleted inserted replaced

-:22b0494ec883
+:feef9a0db09d
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability.
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and,  more generally, to use and operate it in the
+# same conditions as regards security.
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+import os
+import re
+import sys
+import math
+import glob
+import string
+import shutil
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.seq.Bioseq import Bioseq
+from commons.core.seq.BioseqDB import BioseqDB
+from commons.core.coord.MapUtils import MapUtils
+from commons.core.coord.Range import Range
+from commons.core.coord.ConvCoord import ConvCoord
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.checker.CheckerUtils import CheckerUtils
+from commons.core.launcher.LauncherUtils import LauncherUtils
+## Static methods for fasta file manipulation
+#
+class FastaUtils( object ):
+## Count the number of sequences in the input fasta file
+#
+# @param inFile name of the input fasta file
+#
+# @return integer number of sequences in the input fasta file
+#
+@staticmethod
+def dbSize( inFile ):
+nbSeq = 0
+with open(inFile) as fH:
+for line in fH:
+if line[0] == ">":
+nbSeq += 1
+return nbSeq
+## Compute the cumulative sequence length in the input fasta file
+#
+# @param inFile handler of the input fasta file
+#
+@staticmethod
+def dbCumLength( inFile ):
+cumLength = 0
+for line in inFile:
+if line[0] != ">":
+cumLength += len(string.rstrip(line))
+return cumLength
+## Return a list with the length of each sequence in the input fasta file
+#
+# @param inFile string name of the input fasta file
+#
+@staticmethod
+def dbLengths(inFile):
+lLengths = []
+currentLength = 0
+with open(inFile) as fH:
+for line in fH:
+if line[0] == ">":
+if currentLength != 0:
+lLengths.append(currentLength)
+currentLength = 0
+else:
+currentLength += len(line[:-1])
+lLengths.append(currentLength)
+return lLengths
+## Retrieve the sequence headers present in the input fasta file
+#
+# @param inFile string name of the input fasta file
+# @param verbose integer level of verbosity
+#
+# @return list of sequence headers
+#
+@staticmethod
+def dbHeaders(inFile, verbose = 0):
+lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
+if verbose:
+for header in lHeaders:
+print header
+return lHeaders
+## Cut a data bank into chunks according to the input parameters
+# If a sequence is shorter than the threshold, it is only renamed (not cut)
+#
+# @param inFileName string name of the input fasta file
+# @param chkLgth string chunk length (in bp, default=200000)
+# @param chkOver string chunk overlap (in bp, default=10000)
+# @param wordN string N stretch word length (default=11, 0 for no detection)
+# @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
+# @param clean boolean remove 'cut' and 'Nstretch' files
+# @param verbose integer (default = 0)
+#
+@staticmethod
+def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
+nbSeq = FastaUtils.dbSize(inFileName)
+if verbose > 0:
+print "cut the %i input sequences with cutterDB..." % nbSeq
+sys.stdout.flush()
+prg = "cutterDB"
+cmd = prg
+cmd += " -l %s" % chkLgth
+cmd += " -o %s" % chkOver
+cmd += " -w %s" % wordN
+cmd += " %s" % inFileName
+returnStatus = os.system(cmd)
+if returnStatus != 0:
+msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
+sys.stderr.write("%s\n" % msg)
+sys.exit(1)
+nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
+if verbose > 0:
+print "done (%i chunks)" % ( nbChunks )
+sys.stdout.flush()
+if verbose > 0:
+print "rename the headers..."
+sys.stdout.flush()
+if outFilePrefix == "":
+outFastaName = inFileName + "_chunks.fa"
+outMapName = inFileName + "_chunks.map"
+else:
+outFastaName = outFilePrefix + ".fa"
+outMapName = outFilePrefix + ".map"
+with open("%s_cut" % inFileName) as inFile:
+outFasta = open(outFastaName, "w")
+outMap = open(outMapName, "w")
+for line in inFile:
+if line[0] == ">":
+if verbose > 1:
+print "rename '%s'" % (line[:-1]); sys.stdout.flush()
+data = line[:-1].split(" ")
+seqID = data[0].split(">")[1]
+newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
+oldHeader = data[2]
+seqStart = data[4].split("..")[0]
+seqEnd = data[4].split("..")[1]
+outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
+outFasta.write(">%s\n" % newHeader)
+else:
+outFasta.write(line.upper())
+outFasta.close()
+outMap.close()
+#stats on .Nstretch.map file
+genomeLength = FastaUtils.dbCumLength(open(inFileName))
+NstretchMapFile = inFileName + ".Nstretch.map"
+outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w")
+if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
+outNstrechStats.write("No N in stretch length > %s\n" % wordN)
+else:
+with open(NstretchMapFile) as f:
+dHeader2lLengths = {}
+for line in f:
+data = line.rstrip().split()
+header = data[1]
+length = int(data[3]) - int(data[2]) + 1
+if header not in dHeader2lLengths:
+dHeader2lLengths[header] = []
+dHeader2lLengths[header].append(length)
+for header in sorted(dHeader2lLengths):
+lLengths = dHeader2lLengths[header]
+outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))
+cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])
+NstretchPrct = float(cumulAllStretch)/genomeLength*100
+outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
+outNstrechStats.close()
+if clean == True:
+os.remove(inFileName + "_cut")
+os.remove(NstretchMapFile)
+## Split the input fasta file in several output files
+#
+# @param inFile string name of the input fasta file
+# @param nbSeqPerBatch integer number of sequences per output file
+# @param newDir boolean put the sequences in a new directory called 'batches'
+# @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
+# @param prefix prefix in output file name
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
+if not os.path.exists(inFile):
+msg = "ERROR: file '%s' doesn't exist" % inFile
+sys.stderr.write("%s\n" % msg)
+sys.exit(1)
+nbSeq = FastaUtils.dbSize(inFile)
+nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
+if verbose > 0:
+print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
+sys.stdout.flush()
+if nbSeqPerBatch != 1 and useSeqHeader:
+useSeqHeader = False
+if newDir == True:
+if os.path.exists("batches"):
+shutil.rmtree("batches")
+os.mkdir("batches")
+os.chdir("batches")
+os.system("ln -s ../%s ." % inFile)
+with open(inFile) as inFileHandler:
+countBatch = 0
+countSeq = 0
+for line in inFileHandler:
+if line[0] == ">":
+countSeq += 1
+if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
+try:
+outFile.close()
+except: pass
+countBatch += 1
+if useSeqHeader:
+outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
+else:
+outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
+outFile = open(outFileName, "w")
+if verbose > 1:
+print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
+sys.stdout.flush()
+outFile.write(line)
+if newDir:
+os.remove(os.path.basename(inFile))
+os.chdir("..")
+## Split the input fasta file in several output files
+#
+# @param inFileName string name of the input fasta file
+# @param maxSize integer max cumulative length for each output file
+#
+@staticmethod
+def splitFastaFileInBatches(inFileName, maxSize = 200000):
+iBioseqDB = BioseqDB(inFileName)
+lHeadersSizeTuples = []
+for iBioseq in iBioseqDB.db:
+lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
+lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
+os.mkdir("batches")
+os.chdir("batches")
+iterator = 0
+for lHeader in lHeadersList:
+iterator += 1
+with open("batch_%s.fa" % iterator, 'w') as f:
+for header in lHeader :
+iBioseq = iBioseqDB.fetch(header)
+iBioseq.write(f)
+os.chdir("..")
+## Split the input fasta file in several output files according to their cluster identifier
+#
+# @param inFileName string  name of the input fasta file
+# @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
+# @param simplifyHeader boolean simplify the headers
+# @param createDir boolean put the sequences in different directories
+# @param outPrefix string prefix of the output files (default='seqCluster')
+# @param verbose integer (default = 0)
+#
+@staticmethod
+def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
+if not os.path.exists(inFileName):
+print "ERROR: %s doesn't exist" % inFileName
+sys.exit(1)
+inFile = open(inFileName)
+line = inFile.readline()
+if line:
+name = line.split(" ")[0]
+if "Cluster" in name:
+clusterID = name.split("Cluster")[1].split("Mb")[0]
+seqID = name.split("Mb")[1]
+else:
+clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
+if "Q" in name.split("Gr")[0]:
+seqID = name.split("Gr")[0].split("MbQ")[1]
+elif "S" in name:
+seqID = name.split("Gr")[0].split("MbS")[1]
+sClusterIDs = set( [ clusterID ] )
+if simplifyHeader == True:
+header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
+else:
+header = line[1:-1]
+if createDir == True:
+if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
+os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
+os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
+outFileName = "%s%s.fa" % ( outPrefix, clusterID )
+outFile = open( outFileName, "w" )
+outFile.write( ">%s\n" % ( header ) )
+prevClusterID = clusterID
+line = inFile.readline()
+while line:
+if line[0] == ">":
+name = line.split(" ")[0]
+if "Cluster" in name:
+clusterID = name.split("Cluster")[1].split("Mb")[0]
+seqID = name.split("Mb")[1]
+else:
+clusterID = name.split("Cl")[0].split("Gr")[1]
+if "Q" in name.split("Gr")[0]:
+seqID = name.split("Gr")[0].split("MbQ")[1]
+elif "S" in name:
+seqID = name.split("Gr")[0].split("MbS")[1]
+if clusterID != prevClusterID:
+outFile.close()
+if simplifyHeader == True:
+header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
+else:
+header = line[1:-1]
+if createDir == True:
+os.chdir( ".." )
+if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
+os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
+os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
+outFileName = "%s%s.fa" % ( outPrefix, clusterID )
+if not os.path.exists( outFileName ):
+outFile = open( outFileName, "w" )
+else:
+if clusterID != prevClusterID:
+outFile.close()
+outFile = open( outFileName, "a" )
+outFile.write( ">%s\n" % ( header ) )
+prevClusterID = clusterID
+sClusterIDs.add( clusterID )
+else:
+outFile.write( line )
+line = inFile.readline()
+outFile.close()
+if verbose > 0:
+print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
+if createDir == True:
+os.chdir("..")
+else:
+print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
+## Filter a fasta file in two fasta files using the length of each sequence as a criterion
+#
+# @param len_min integer    length sequence criterion to filter
+# @param inFileName string  name of the input fasta file
+# @param verbose integer (default = 0)
+#
+@staticmethod
+def dbLengthFilter(len_min, inFileName, verbose = 0):
+file_db = open(inFileName,)
+file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
+file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
+seq = Bioseq()
+numseq = 0
+nbsaveInf = 0
+nbsaveSup = 0
+seq.read(file_db)
+while seq.getHeader():
+l = seq.getLength()
+numseq = numseq + 1
+if l >= len_min:
+seq.write(file_dbSup)
+if verbose > 0:
+nbsaveSup = nbsaveSup + 1
+else:
+seq.write(file_dbInf)
+if verbose > 0:
+nbsaveInf = nbsaveInf + 1
+seq.read(file_db)
+file_db.close()
+file_dbInf.close()
+file_dbSup.close()
+if verbose > 0:
+print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))
+## Extract the longest sequences from a fasta file
+#
+# @param num integer maximum number of sequences in the output file
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output fasta file
+# @param minThresh integer minimum length threshold (default=0)
+# @param verbose integer (default = 0)
+#
+@staticmethod
+def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
+bsDB = BioseqDB(inFileName)
+if verbose > 0:
+print "nb of input sequences: %i" % bsDB.getSize()
+if outFileName == "":
+outFileName = inFileName + ".best" + str(num)
+outFile = open( outFileName, "w" )
+if bsDB.getSize()==0:
+return 0
+num = int(num)
+if verbose > 0:
+print "keep the %i longest sequences" % num
+if minThresh > 0:
+print "with length > %i bp" % minThresh
+sys.stdout.flush()
+tmpLSeqLgth = bsDB.getListOfSequencesLength()
+tmpLSeqLgth.sort(reverse = True)
+# select the longests
+lSeqLgth = []
+for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
+if tmpLSeqLgth[i] >= minThresh:
+lSeqLgth.append( tmpLSeqLgth[i] )
+if verbose > 0:
+print "selected max length: %i" % max(lSeqLgth)
+print "selected min length: %i" % min(lSeqLgth)
+sys.stdout.flush()
+# save the longest
+inFile = open( inFileName )
+seqNum = 0
+nbSave = 0
+for bs in bsDB.db:
+seqNum += 1
+if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
+bs.write( outFile )
+if verbose > 1:
+print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
+sys.stdout.flush()
+nbSave += 1
+if nbSave == num:
+break
+inFile.close()
+outFile.close()
+if verbose > 0:
+print nbSave, "saved sequences in ", outFileName
+sys.stdout.flush()
+return 0
+## Extract all the sequence headers from a fasta file and write them in a new file
+#
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
+#
+@staticmethod
+def dbExtractSeqHeaders(inFileName, outFileName = ""):
+if not outFileName:
+outFileName = inFileName + ".headers"
+with open(outFileName, "w") as f:
+for header in FastaUtils.dbHeaders(inFileName):
+f.write("%s\n" % header)
+return 0
+## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
+#
+# @param pattern regular expression to search in headers
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
+if not pattern:
+return
+if not outFileName:
+outFileName = inFileName + '.extracted'
+outFile = open(outFileName, 'w')
+patternTosearch = re.compile(pattern)
+bioseq = Bioseq()
+bioseqNb = 0
+savedBioseqNb = 0
+inFile = open(inFileName)
+bioseq.read(inFile)
+while bioseq.sequence:
+bioseqNb = bioseqNb + 1
+m = patternTosearch.search(bioseq.header)
+if m:
+bioseq.write(outFile)
+if verbose > 1:
+print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
+savedBioseqNb = savedBioseqNb + 1
+bioseq.read(inFile)
+inFile.close()
+outFile.close()
+if verbose > 0:
+print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
+## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
+#
+# @param patternFileName string file containing regular expression to search in headers
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
+if not patternFileName:
+print "ERROR: no file of pattern"
+sys.exit(1)
+bioseq = Bioseq()
+bioseqNb = 0
+savedBioseqNb = 0
+lHeaders = []
+with open(inFileName) as inFile:
+bioseq.read(inFile)
+while bioseq.sequence:
+lHeaders.append(bioseq.header)
+bioseq.read(inFile)
+lHeadersToKeep = []
+with open(patternFileName) as patternFile:
+for pattern in patternFile:
+pattern = pattern.rstrip()
+if verbose > 0:
+print "pattern: ", pattern; sys.stdout.flush()
+patternToSearch = re.compile(pattern)
+lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])
+if not outFileName:
+outFileName = inFileName + ".extracted"
+with open(outFileName, "w") as outFile:
+with open(inFileName) as inFile:
+bioseq.read(inFile)
+while bioseq.sequence:
+bioseqNb += 1
+if bioseq.header in lHeadersToKeep:
+bioseq.write(outFile)
+savedBioseqNb += 1
+if verbose > 1:
+print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
+bioseq.read(inFile)
+if verbose > 0:
+print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
+## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
+#
+# @param pattern regular expression to search in headers
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
+if not pattern:
+return
+patternToSearch = re.compile(pattern)
+if outFileName == "":
+outFileName = inFileName + '.cleaned'
+with open(outFileName, 'w') as outFile:
+bioseq = Bioseq()
+bioseqNb = 0
+savedBioseqNb = 0
+with open(inFileName) as inFile:
+bioseq.read(inFile)
+while bioseq.sequence:
+bioseqNb += 1
+if not patternToSearch.search(bioseq.header):
+bioseq.write(outFile)
+savedBioseqNb += 1
+if verbose > 1:
+print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
+bioseq.read(inFile)
+if verbose > 0:
+print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
+## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
+#
+# @param patternFileName string file containing regular expression to search in headers
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
+if not patternFileName:
+print "ERROR: no file of pattern"
+sys.exit(1)
+bioseq = Bioseq()
+bioseqNb = 0
+savedBioseqNb = 0
+lHeaders = []
+with open(inFileName) as inFile:
+bioseq.read(inFile)
+while bioseq.sequence:
+lHeaders.append(bioseq.header)
+bioseq.read(inFile)
+with open(patternFileName) as patternFile:
+lHeadersToRemove = []
+for pattern in patternFile:
+pattern = pattern.rstrip()
+if verbose > 0:
+print "pattern: ", pattern; sys.stdout.flush()
+patternToSearch = re.compile(pattern)
+lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])
+if not outFileName:
+outFileName = inFileName + '.cleaned'
+with open(outFileName, 'w') as outFile:
+bioseqNum = 0
+with open(inFileName) as inFile:
+bioseq.read(inFile)
+while bioseq.sequence:
+bioseqNum += 1
+if bioseq.header not in lHeadersToRemove:
+bioseq.write(outFile)
+savedBioseqNb += 1
+if verbose > 1:
+print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
+bioseq.read(inFile)
+if verbose > 0:
+print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
+## Find sequence's ORFs from a fasta file and write them in a tab file
+#
+# @param inFileName string name of the input fasta file
+# @param orfMaxNb integer Select orfMaxNb best ORFs
+# @param orfMinLength integer Keep ORFs with length > orfMinLength
+# @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
+# @param verbose integer verbosity level (default = 0)
+#
+@staticmethod
+def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
+if not outFileName:
+outFileName = inFileName + ".ORF.map"
+outFile = open(outFileName, "w")
+bioseq = Bioseq()
+bioseqNb = 0
+inFile = open(inFileName)
+bioseq.read(inFile)
+while bioseq.sequence:
+bioseq.upCase()
+bioseqNb += 1
+if verbose > 0:
+print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'
+orf = bioseq.findORF()
+bestOrf = []
+for i in orf.keys():
+orfLen = len(orf[i])
+for j in xrange(1, orfLen):
+start = orf[i][j - 1] + 4
+end = orf[i][j] + 3
+if end - start >= orfMinLength:
+bestOrf.append((end - start, i + 1, start, end))
+bioseq.reverseComplement()
+orf = bioseq.findORF()
+seqLen = bioseq.getLength()
+for i in orf.keys():
+orfLen = len(orf[i])
+for j in xrange(1, orfLen):
+start = seqLen - orf[i][j - 1] - 3
+end = seqLen - orf[i][j] - 2
+if start - end >= orfMinLength:
+bestOrf.append((start - end, (i + 1) * -1, start, end))
+bestOrf.sort(reverse = True)
+bestOrfNb = len(bestOrf)
+if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
+bestOrfNb = orfMaxNb
+for i in xrange(0, bestOrfNb):
+if verbose > 0:
+print bestOrf[i]
+outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
+"|" + str(bestOrf[i][0]), bioseq.header,
+bestOrf[i][2], bestOrf[i][3]))
+bioseq.read(inFile)
+inFile.close()
+outFile.close()
+return 0
+## Sort sequences by increasing length (write a new file)
+#
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output fasta file
+# @param verbose integer verbosity level
+#
+@staticmethod
+def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
+if verbose > 0:
+print "sort sequences by increasing length"
+sys.stdout.flush()
+if not os.path.exists(inFileName):
+print "ERROR: file '%s' doesn't exist" % (inFileName)
+sys.exit(1)
+# read each seq one by one
+# save them in distinct temporary files
+# with their length in the name
+inFileHandler = open(inFileName, "r")
+countSeq = 0
+bs = Bioseq()
+bs.read(inFileHandler)
+while bs.header:
+countSeq += 1
+tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
+bs.appendBioseqInFile(tmpFile)
+if verbose > 1:
+print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
+bs.header = ""
+bs.sequence = ""
+bs.read(inFileHandler)
+inFileHandler.close()
+# sort temporary file names
+# concatenate them into the output file
+if os.path.exists(outFileName):
+os.remove(outFileName)
+lFiles = glob.glob("*bp_*nb")
+lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
+for fileName in lFiles:
+cmd = "cat %s >> %s" % (fileName, outFileName)
+returnValue = os.system(cmd)
+if returnValue != 0:
+print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
+sys.exit(1)
+os.remove(fileName)
+return 0
+## Sort sequences by header
+#
+# @param inFileName string name of the input fasta file
+# @param outFileName string name of the output fasta file
+# @param verbose integer verbosity level
+#
+@staticmethod
+def sortSequencesByHeader(inFileName, outFileName = ""):
+if outFileName == "":
+outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
+iBioseqDB = BioseqDB(inFileName)
+with open(outFileName, "w") as f:
+for header in sorted(iBioseqDB.getHeaderList()):
+iBioseq = iBioseqDB.fetch(header)
+iBioseq.write(f)
+## Return a dictionary which keys are the headers and values the length of the sequences
+#
+# @param inFile string name of the input fasta file
+# @param verbose integer verbosity level
+#
+@staticmethod
+def getLengthPerHeader(inFile, verbose = 0):
+dHeader2Length = {}
+with open(inFile) as inFileHandler:
+currentSeqHeader = ""
+currentSeqLength = 0
+for line in inFileHandler:
+if line[0] == ">":
+if currentSeqHeader != "":
+dHeader2Length[currentSeqHeader] = currentSeqLength
+currentSeqLength = 0
+currentSeqHeader = line[1:-1]
+if verbose > 0:
+print "current header: %s" % currentSeqHeader
+sys.stdout.flush()
+else:
+currentSeqLength += len(line.replace("\n", ""))
+dHeader2Length[currentSeqHeader] = currentSeqLength
+return dHeader2Length
+## Convert headers from a fasta file having chunk coordinates
+#
+# @param inFile string name of the input fasta file
+# @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
+# @param outFile string name of the output file
+#
+@staticmethod
+def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
+inFileHandler = open(inFile, "r")
+outFileHandler = open(outFile, "w")
+dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
+iConvCoord = ConvCoord()
+for line in inFileHandler:
+if line[0] == ">":
+if "{Fragment}" in line:
+chkName = line.split(" ")[1]
+chrName = dChunk2Map[chkName].seqname
+lCoordPairs = line.split(" ")[3].split(",")
+lRangesOnChk = []
+for i in lCoordPairs:
+iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
+lRangesOnChk.append(iRange)
+lRangesOnChr = []
+for iRange in lRangesOnChk:
+lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
+newHeader = line[1:-1].split(" ")[0]
+newHeader += " %s" % chrName
+newHeader += " {Fragment}"
+newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
+for iRange in lRangesOnChr[1:]:
+newHeader += ",%i..%i" % (iRange.start, iRange.end)
+outFileHandler.write(">%s\n" % newHeader)
+else:
+chkName = line.split("_")[1].split(" ")[0]
+chrName = dChunk2Map[chkName].seqname
+coords = line[line.find("[")+1 : line.find("]")]
+start = int(coords.split(",")[0])
+end = int(coords.split(",")[1])
+iRangeOnChk = Range(chkName, start, end)
+iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
+newHeader = line[1:-1].split("_")[0]
+newHeader += " %s" % chrName
+newHeader += " %s" % line[line.find("(") : line.find(")")+1]
+newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
+outFileHandler.write(">%s\n" % newHeader)
+else:
+outFileHandler.write(line)
+inFileHandler.close()
+outFileHandler.close()
+## Convert a fasta file to a length file
+#
+# @param inFile string name of the input fasta file
+# @param outFile string name of the output file
+#
+@staticmethod
+def convertFastaToLength(inFile, outFile = ""):
+if not outFile:
+outFile = "%s.length" % inFile
+if inFile:
+with open(inFile) as inFH:
+with open(outFile, "w") as outFH:
+bioseq = Bioseq()
+bioseq.read(inFH)
+while bioseq.sequence:
+seqLen = bioseq.getLength()
+outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
+bioseq.read(inFH)
+## Convert a fasta file to a seq file
+#
+# @param inFile string name of the input fasta file
+# @param outFile string name of the output file
+#
+@staticmethod
+def convertFastaToSeq(inFile, outFile = ""):
+if not outFile:
+outFile = "%s.seq" % inFile
+if inFile:
+with open(inFile) as inFH:
+with open(outFile, "w") as outFH:
+bioseq = Bioseq()
+bioseq.read(inFH)
+while bioseq.sequence:
+seqLen = bioseq.getLength()
+outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
+bioseq.sequence, bioseq.header, seqLen))
+bioseq.read(inFH)
+## Splice an input fasta file using coordinates in a Map file
+#
+# @note the coordinates should be merged beforehand!
+#
+@staticmethod
+def spliceFromCoords(genomeFile, coordFile, obsFile):
+genomeFileHandler = open(genomeFile)
+obsFileHandler = open(obsFile, "w")
+dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)
+bs = Bioseq()
+bs.read(genomeFileHandler)
+while bs.sequence:
+if dChr2Maps.has_key(bs.header):
+lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
+splicedSeq = []
+currentSite = 0
+for iMap in lCoords:
+minSplice = iMap.getMin() - 1
+if minSplice > currentSite:
+splicedSeq += bs.sequence[currentSite : minSplice]
+if currentSite <= iMap.getMax():
+currentSite = iMap.getMax()
+splicedSeq += bs.sequence[currentSite:]
+bs.sequence = "".join(splicedSeq)
+bs.write(obsFileHandler)
+bs.read(genomeFileHandler)
+genomeFileHandler.close()
+obsFileHandler.close()
+## Shuffle input sequences (single file or files in a directory)
+#
+@staticmethod
+def dbShuffle(inData, outData, verbose = 0):
+if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
+prg = "esl-shuffle"
+else : prg = "shuffle"
+genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
+if os.path.isfile(inData):
+if verbose > 0:
+print "shuffle input file '%s'" % inData
+cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
+print cmd
+returnStatus = os.system(cmd)
+if returnStatus:
+sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
+sys.exit(1)
+elif os.path.isdir(inData):
+if verbose > 0:
+print "shuffle files in input directory '%s'" % inData
+if os.path.exists(outData):
+shutil.rmtree(outData)
+os.mkdir(outData)
+lInputFiles = glob.glob("%s/*.fa" % inData)
+nbFastaFiles = 0
+for inputFile in lInputFiles:
+nbFastaFiles += 1
+if verbose > 1:
+print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
+fastaBaseName = os.path.basename(inputFile)
+prefix = os.path.splitext(fastaBaseName)[0]
+cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
+returnStatus = os.system(cmd)
+if returnStatus:
+sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
+sys.exit(1)
+## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
+#
+# @param inClusterFileName string input cluster file name
+# @param inFastaFileName string input fasta file name
+# @param outFileName string output file name
+# @param verbosity integer verbosity
+#
+@staticmethod
+def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
+dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
+iFastaParser = FastaParser(inFastaFileName)
+with open(outFileName, "w") as f:
+for iSequence in iFastaParser.getIterator():
+header = iSequence.getName()
+if dHeader2ClusterClusterMember.get(header):
+cluster = dHeader2ClusterClusterMember[header][0]
+member = dHeader2ClusterClusterMember[header][1]
+else:
+clusterIdForSingletonCluster +=  1
+cluster = clusterIdForSingletonCluster
+member = 1
+newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
+iSequence.setName(newHeader)
+f.write(iSequence.printFasta())
+@staticmethod
+def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
+dHeader2ClusterClusterMember = {}
+clusterId = 0
+with open(inClusterFileName) as f:
+for line in f:
+lineWithoutLastChar = line.rstrip()
+lHeaders = lineWithoutLastChar.split("\t")
+clusterId += 1
+if verbosity > 0:
+print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
+memberId = 0
+for header in lHeaders:
+memberId += 1
+dHeader2ClusterClusterMember[header] = (clusterId, memberId)
+if verbosity > 0:
+print "%i clusters" % clusterId
+return dHeader2ClusterClusterMember, clusterId
+@staticmethod
+def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
+"""
+Write a map file from fasta output of clustering tool.
+Warning: only works if input fasta headers are formated like LTRharvest fasta output.
+"""
+if not outMapFileName:
+outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
+fileDb = open(fastaFileNameFromClustering)
+fileMap = open(outMapFileName, "w")
+seq = Bioseq()
+numseq = 0
+seq.read(fileDb)
+while seq.sequence:
+numseq = numseq + 1
+ID = seq.header.split(' ')[0].split('_')[0]
+chunk = seq.header.split(' ')[0].split('_')[1]
+start = seq.header.split(' ')[-1].split(',')[0][1:]
+end = seq.header.split(' ')[-1].split(',')[1][:-1]
+line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
+fileMap.write("%s\n" % line)
+seq.read(fileDb)
+fileDb.close()
+fileMap.close()
+print "saved in %s" % outMapFileName
+@staticmethod
+def getNstretchesRangesList(fastaFileName, nbN = 2):
+lNstretchesRanges = []
+if nbN != 0:
+iBSDB = BioseqDB(fastaFileName)
+for iBS in iBSDB.db:
+nbNFound = 0
+start = 1
+pos = 1
+lastPos = 0
+while pos <= iBS.getLength():
+if nbNFound == 0:
+start = pos
+while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
+nbNFound += 1
+pos += 1
+lastPos = pos
+if pos - lastPos >= nbN:
+if nbNFound >= nbN:
+lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
+nbNFound = 0
+pos += 1
+if nbNFound >= nbN:
+lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
+lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)
+return lNstretchesRanges
+@staticmethod
+def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
+lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)
+outFormat = outFormat.lower()
+if outFormat in ["gff", "gff3"]:
+outFormat = "gff3"
+else:
+outFormat = "map"
+if outFileName == "":
+outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
+with open(outFileName, "w") as fH:
+if outFormat == "gff3":
+fH.write("##gff-version 3\n")
+for iRange in lNstretchesRanges:
+seq = iRange.getSeqname()
+start = iRange.getStart()
+end = iRange.getEnd()
+if outFormat == "gff3":
+fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
+else:
+fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))

Mercurial > repos > urgi-team > teiso

comparison TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 13:feef9a0db09d draft