Mercurial > repos > yufei-luo > s_mart
view commons/core/seq/FastaUtils.py @ 65:4ab9c86890ba draft
Minor change
author | m-zytnicki |
---|---|
date | Wed, 04 Nov 2015 03:41:26 -0500 (2015-11-04) |
parents | 44d5973c188c |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import os import sys import string import math import shutil import re import glob from operator import itemgetter from commons.core.seq.BioseqDB import BioseqDB from commons.core.seq.Bioseq import Bioseq from commons.core.coord.MapUtils import MapUtils from commons.core.coord.Range import Range from commons.core.checker.CheckerUtils import CheckerUtils from commons.core.launcher.LauncherUtils import LauncherUtils from commons.core.coord.ConvCoord import ConvCoord from commons.core.parsing.FastaParser import FastaParser ## Static methods for fasta file manipulation # class FastaUtils( object ): ## Count the number of sequences in the input fasta file # # @param inFile name of the input fasta file # # @return integer number of sequences in the input fasta file # @staticmethod def dbSize( inFile ): nbSeq = 0 inFileHandler = open( inFile, "r" ) line = inFileHandler.readline() while line: if line[0] == ">": nbSeq = nbSeq + 1 line = inFileHandler.readline() inFileHandler.close() return nbSeq ## Compute the cumulative sequence length in the input fasta file # # @param inFile handler of the input fasta file # @staticmethod def dbCumLength( inFile ): cumLength = 0 line = inFile.readline() while line: if line[0] != ">": cumLength += len(string.rstrip(line)) line = inFile.readline() return cumLength ## Return a list with the length of each sequence in the input fasta file # # @param inFile string name of the input fasta file # @staticmethod def dbLengths( inFile ): lLengths = [] inFileHandler = open( inFile, "r" ) currentLength = 0 line = inFileHandler.readline() while line: if line[0] == ">": if currentLength != 0: lLengths.append( currentLength ) currentLength = 0 else: currentLength += len(line[:-1]) line = inFileHandler.readline() lLengths.append( currentLength ) inFileHandler.close() return lLengths ## Retrieve the sequence headers present in the input fasta file # # @param inFile string name of the input fasta file # @param verbose integer level of verbosity # # @return list of sequence headers # @staticmethod def dbHeaders( inFile, verbose=0 ): lHeaders = [] inFileHandler = open( inFile, "r" ) line = inFileHandler.readline() while line: if line[0] == ">": lHeaders.append( string.rstrip(line[1:]) ) if verbose > 0: print string.rstrip(line[1:]) line = inFileHandler.readline() inFileHandler.close() return lHeaders ## Cut a data bank into chunks according to the input parameters # If a sequence is shorter than the threshold, it is only renamed (not cut) # # @param inFileName string name of the input fasta file # @param chkLgth string chunk length (in bp, default=200000) # @param chkOver string chunk overlap (in bp, default=10000) # @param wordN string N stretch word length (default=11, 0 for no detection) # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') # @param clean boolean remove 'cut' and 'Nstretch' files # @param verbose integer (default = 0) # @staticmethod def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ): nbSeq = FastaUtils.dbSize( inFileName ) if verbose > 0: print "cut the %i input sequences with cutterDB..." % ( nbSeq ) sys.stdout.flush() prg = "cutterDB" cmd = prg cmd += " -l %s" % ( chkLgth ) cmd += " -o %s" %( chkOver ) cmd += " -w %s" % ( wordN ) cmd += " %s" % ( inFileName ) returnStatus = os.system( cmd ) if returnStatus != 0: msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus ) sys.stderr.write( "%s\n" % ( msg ) ) sys.exit(1) nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) ) if verbose > 0: print "done (%i chunks)" % ( nbChunks ) sys.stdout.flush() if verbose > 0: print "rename the headers..." sys.stdout.flush() if outFilePrefix == "": outFastaName = inFileName + "_chunks.fa" outMapName = inFileName + "_chunks.map" else: outFastaName = outFilePrefix + ".fa" outMapName = outFilePrefix + ".map" inFile = open( "%s_cut" % ( inFileName ), "r" ) line = inFile.readline() outFasta = open( outFastaName, "w" ) outMap = open( outMapName, "w" ) # read line after line (no need for big RAM) and change the sequence headers while line: if line[0] == ">": if verbose > 1: print "rename '%s'" % ( line[:-1] ); sys.stdout.flush() data = line[:-1].split(" ") seqID = data[0].split(">")[1] newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) ) oldHeader = data[2] seqStart = data[4].split("..")[0] seqEnd = data[4].split("..")[1] outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) ) outFasta.write( ">%s\n" % ( newHeader ) ) else: outFasta.write( line.upper() ) line = inFile.readline() inFile.close() outFasta.close() outMap.close() if clean == True: os.remove(inFileName + "_cut") os.remove(inFileName + ".Nstretch.map") ## Split the input fasta file in several output files # # @param inFile string name of the input fasta file # @param nbSeqPerBatch integer number of sequences per output file # @param newDir boolean put the sequences in a new directory called 'batches' # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') # @param prefix prefix in output file name # @param verbose integer verbosity level (default = 0) # @staticmethod def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ): if not os.path.exists( inFile ): msg = "ERROR: file '%s' doesn't exist" % ( inFile ) sys.stderr.write( "%s\n" % ( msg ) ) sys.exit(1) nbSeq = FastaUtils.dbSize( inFile ) nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) ) if verbose > 0: print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches ) sys.stdout.flush() if nbSeqPerBatch > 1 and useSeqHeader: useSeqHeader = False if newDir == True: if os.path.exists( "batches" ): shutil.rmtree( "batches" ) os.mkdir( "batches" ) os.chdir( "batches" ) os.system( "ln -s ../%s ." % ( inFile ) ) inFileHandler = open( inFile, "r" ) inFileHandler.seek( 0, 0 ) countBatch = 0 countSeq = 0 line = inFileHandler.readline() while line: if line == "": break if line[0] == ">": countSeq += 1 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: if "outFile" in locals(): outFile.close() countBatch += 1 if nbSeqPerBatch == 1 and useSeqHeader: outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") ) else: outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) ) outFile = open( outFileName, "w" ) if verbose > 1: print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName ) sys.stdout.flush() outFile.write( line ) line = inFileHandler.readline() inFileHandler.close() if newDir == True: os.remove( os.path.basename( inFile ) ) os.chdir( ".." ) ## Split the input fasta file in several output files # # @param inFileName string name of the input fasta file # @param maxSize integer max cumulative length for each output file # @staticmethod def splitFastaFileInBatches(inFileName, maxSize = 200000): iBioseqDB = BioseqDB(inFileName) lHeadersSizeTuples = [] for iBioseq in iBioseqDB.db: lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) os.mkdir("batches") os.chdir("batches") iterator = 0 for lHeader in lHeadersList : iterator += 1 with open("batch_%s.fa" % iterator, 'w') as f : for header in lHeader : iBioseq = iBioseqDB.fetch(header) iBioseq.write(f) os.chdir("..") ## Split the input fasta file in several output files according to their cluster identifier # # @param inFileName string name of the input fasta file # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) # @param simplifyHeader boolean simplify the headers # @param createDir boolean put the sequences in different directories # @param outPrefix string prefix of the output files (default='seqCluster') # @param verbose integer (default = 0) # @staticmethod def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ): if not os.path.exists( inFileName ): print "ERROR: %s doesn't exist" % ( inFileName ) sys.exit(1) inFile = open( inFileName, "r" ) line = inFile.readline() if line: name = line.split(" ")[0] if "Cluster" in name: clusterID = name.split("Cluster")[1].split("Mb")[0] seqID = name.split("Mb")[1] else: clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust if "Q" in name.split("Gr")[0]: seqID = name.split("Gr")[0].split("MbQ")[1] elif "S" in name: seqID = name.split("Gr")[0].split("MbS")[1] sClusterIDs = set( [ clusterID ] ) if simplifyHeader == True: header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) else: header = line[1:-1] if createDir == True: if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) outFileName = "%s%s.fa" % ( outPrefix, clusterID ) outFile = open( outFileName, "w" ) outFile.write( ">%s\n" % ( header ) ) prevClusterID = clusterID line = inFile.readline() while line: if line[0] == ">": name = line.split(" ")[0] if "Cluster" in name: clusterID = name.split("Cluster")[1].split("Mb")[0] seqID = name.split("Mb")[1] else: clusterID = name.split("Cl")[0].split("Gr")[1] if "Q" in name.split("Gr")[0]: seqID = name.split("Gr")[0].split("MbQ")[1] elif "S" in name: seqID = name.split("Gr")[0].split("MbS")[1] if clusterID != prevClusterID: outFile.close() if simplifyHeader == True: header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) else: header = line[1:-1] if createDir == True: os.chdir( ".." ) if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) outFileName = "%s%s.fa" % ( outPrefix, clusterID ) if not os.path.exists( outFileName ): outFile = open( outFileName, "w" ) else: if clusterID != prevClusterID: outFile.close() outFile = open( outFileName, "a" ) outFile.write( ">%s\n" % ( header ) ) prevClusterID = clusterID sClusterIDs.add( clusterID ) else: outFile.write( line ) line = inFile.readline() outFile.close() if verbose > 0: print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() if createDir == True: os.chdir("..") else: print "WARNING: empty input file - no cluster found"; sys.stdout.flush() ## Filter a fasta file in two fasta files using the length of each sequence as a criteron # # @param len_min integer length sequence criterion to filter # @param inFileName string name of the input fasta file # @param verbose integer (default = 0) # @staticmethod def dbLengthFilter( len_min, inFileName, verbose=0 ): file_db = open( inFileName, "r" ) file_dbInf = open( inFileName+".Inf"+str(len_min), "w" ) file_dbSup = open( inFileName+".Sup"+str(len_min), "w" ) seq = Bioseq() numseq = 0 nbsave = 0 seq.read( file_db ) while seq.sequence: l = seq.getLength() numseq = numseq + 1 if l >= len_min: seq.write( file_dbSup ) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' nbsave=nbsave+1 else: seq.write( file_dbInf ) if verbose > 0: print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' nbsave=nbsave+1 seq.read( file_db ) file_db.close() file_dbInf.close() file_dbSup.close() if verbose > 0: print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min) ## Extract the longest sequences from a fasta file # # @param num integer maximum number of sequences in the output file # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param minThresh integer minimum length threshold (default=0) # @param verbose integer (default = 0) # @staticmethod def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ): bsDB = BioseqDB( inFileName ) if verbose > 0: print "nb of input sequences: %i" % ( bsDB.getSize() ) if outFileName == "": outFileName = inFileName + ".best" + str(num) outFile = open( outFileName, "w" ) if bsDB.getSize()==0: return 0 num = int(num) if verbose > 0: print "keep the %i longest sequences" % ( num ) if minThresh > 0: print "with length > %i bp" % ( minThresh ) sys.stdout.flush() # retrieve the length of each input sequence tmpLSeqLgth = [] seqNum = 0 for bs in bsDB.db: seqNum += 1 tmpLSeqLgth.append( bs.getLength() ) if verbose > 1: print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() ) sys.stdout.flush() # sort the lengths tmpLSeqLgth.sort() tmpLSeqLgth.reverse() # select the longest lSeqLgth = [] for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): if tmpLSeqLgth[i] >= minThresh: lSeqLgth.append( tmpLSeqLgth[i] ) if verbose > 0: print "selected max length: %i" % ( max(lSeqLgth) ) print "selected min length: %i" % ( min(lSeqLgth) ) sys.stdout.flush() # save the longest inFile = open( inFileName ) seqNum = 0 nbSave = 0 for bs in bsDB.db: seqNum += 1 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: bs.write( outFile ) if verbose > 1: print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) sys.stdout.flush() nbSave += 1 if nbSave == num: break inFile.close() outFile.close() if verbose > 0: print nbSave, "saved sequences in ", outFileName sys.stdout.flush() return 0 ## Extract all the sequence headers from a fasta file and write them in a new fasta file # # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') # @staticmethod def dbExtractSeqHeaders( inFileName, outFileName="" ): lHeaders = FastaUtils.dbHeaders( inFileName ) if outFileName == "": outFileName = inFileName + ".headers" outFile = open( outFileName, "w" ) for i in lHeaders: outFile.write( i + "\n" ) outFile.close() return 0 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file # # @param pattern regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ): if pattern == "": return if outFileName == "": outFileName = inFileName + '.extracted' outFile = open( outFileName, 'w' ) patternTosearch = re.compile( pattern ) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 inFile = open( inFileName, "r" ) bioseq.read( inFile ) while bioseq.sequence: bioseqNb = bioseqNb + 1 m = patternTosearch.search( bioseq.header ) if m: bioseq.write( outFile ) if verbose > 1: print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!' savedBioseqNb = savedBioseqNb + 1 bioseq.read( inFile ) inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file # # @param patternFileName string file containing regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): if patternFileName == "": print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] inFile = open( inFileName, "r" ) bioseq.read( inFile ) while bioseq.sequence != None: lHeaders.append( bioseq.header ) bioseq.read( inFile ) inFile.close() lHeadersToKeep = [] patternFile = open( patternFileName, "r" ) for pattern in patternFile: if verbose > 0: print "pattern: ",pattern[:-1]; sys.stdout.flush() patternToSearch = re.compile(pattern[:-1]) for h in lHeaders: if patternToSearch.search(h): lHeadersToKeep.append(h) patternFile.close() if outFileName == "": outFileName = inFileName + ".extracted" outFile=open( outFileName, "w" ) inFile = open( inFileName, "r" ) bioseq.read(inFile) while bioseq.sequence: bioseqNb += 1 if bioseq.header in lHeadersToKeep: bioseq.write(outFile) if verbose > 1: print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() savedBioseqNb += 1 bioseq.read(inFile) inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file # # @param pattern regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ): if pattern == "": return patternToSearch = re.compile(pattern) if outFileName == "": outFileName = inFileName + '.cleaned' outFile = open(outFileName,'w') bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 inFile = open(inFileName) bioseq.read(inFile) while bioseq.sequence != None: bioseqNb += 1 if not patternToSearch.search(bioseq.header): bioseq.write(outFile) if verbose > 1: print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!' savedBioseqNb += 1 bioseq.read(inFile) inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file # # @param patternFileName string file containing regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): if patternFileName == "": print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] inFile = open( inFileName, "r" ) bioseq.read( inFile ) while bioseq.sequence != None: bioseqNb += 1 lHeaders.append( bioseq.header ) bioseq.read( inFile ) inFile.close() patternFile = open( patternFileName, "r") lHeadersToRemove = [] for pattern in patternFile: if verbose > 0: print "pattern: ",pattern[:-1]; sys.stdout.flush() patternToSearch = re.compile( pattern[:-1] ) for h in lHeaders: if patternToSearch.search(h): lHeadersToRemove.append(h) patternFile.close() if outFileName == "": outFileName = inFileName + '.cleaned' outFile = open( outFileName, 'w' ) bioseqNum = 0 inFile=open( inFileName ) bioseq.read( inFile ) while bioseq.sequence != None: bioseqNum += 1 if bioseq.header not in lHeadersToRemove: bioseq.write( outFile ) if verbose > 1: print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() savedBioseqNb += 1 bioseq.read( inFile ) inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) ## Find sequence's ORFs from a fasta file and write them in a tab file # # @param inFileName string name of the input fasta file # @param orfMaxNb integer Select orfMaxNb best ORFs # @param orfMinLength integer Keep ORFs with length > orfMinLength # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ): if outFileName == "": outFileName = inFileName + ".ORF.map" outFile = open( outFileName, "w" ) bioseq = Bioseq() bioseqNb = 0 inFile = open( inFileName ) bioseq.read( inFile ) while bioseq.sequence != None: bioseq.upCase() bioseqNb += 1 if verbose > 0: print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]' orf = bioseq.findORF() bestOrf = [] for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = orf[i][j-1] + 4 end = orf[i][j] + 3 if end - start >= orfMinLength: bestOrf.append( ( end-start, i+1, start, end ) ) bioseq.reverseComplement() orf = bioseq.findORF() seqLen = bioseq.getLength() for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = seqLen - orf[i][j-1] - 3 end = seqLen - orf[i][j] - 2 if start - end >= orfMinLength: bestOrf.append( ( start-end, (i+1)*-1, start, end ) ) bestOrf.sort() bestOrf.reverse() bestOrfNb = len(bestOrf) if orfMaxNb != 0 and orfMaxNb < bestOrfNb: bestOrfNb = orfMaxNb for i in xrange(0, bestOrfNb): if verbose > 0: print bestOrf[i] outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\ "|"+str(bestOrf[i][0]),bioseq.header, bestOrf[i][2],bestOrf[i][3])) bioseq.read( inFile ) inFile.close() outFile.close() return 0 ## Sort sequences by increasing length (write a new file) # # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param verbose integer verbosity level # @staticmethod def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0): if verbose > 0: print "sort sequences by increasing length" sys.stdout.flush() if not os.path.exists( inFileName ): print "ERROR: file '%s' doesn't exist" % ( inFileName ) sys.exit(1) # read each seq one by one # save them in distinct temporary files # with their length in the name inFileHandler = open( inFileName, "r" ) countSeq = 0 bs = Bioseq() bs.read( inFileHandler ) while bs.header: countSeq += 1 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq ) bs.appendBioseqInFile( tmpFile ) if verbose > 1: print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile ) bs.header = "" bs.sequence = "" bs.read( inFileHandler ) inFileHandler.close() # sort temporary file names # concatenate them into the output file if os.path.exists( outFileName ): os.remove( outFileName ) lFiles = glob.glob( "*bp_*nb" ) lFiles.sort( key=lambda s:int(s.split("bp_")[0]) ) for fileName in lFiles: cmd = "cat %s >> %s" % ( fileName, outFileName ) returnValue = os.system( cmd ) if returnValue != 0: print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) sys.exit(1) os.remove( fileName ) return 0 ## Sort sequences by header # # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param verbose integer verbosity level # @staticmethod def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0): if outFileName == "": outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] iBioseqDB = BioseqDB(inFileName) f = open(outFileName, "w") lHeaders = sorted(iBioseqDB.getHeaderList()) for header in lHeaders: iBioseq = iBioseqDB.fetch(header) iBioseq.write(f) f.close() ## Return a dictionary which keys are the headers and values the length of the sequences # # @param inFile string name of the input fasta file # @param verbose integer verbosity level # @staticmethod def getLengthPerHeader( inFile, verbose=0 ): dHeader2Length = {} inFileHandler = open( inFile, "r" ) currentSeqHeader = "" currentSeqLength = 0 line = inFileHandler.readline() while line: if line[0] == ">": if currentSeqHeader != "": dHeader2Length[ currentSeqHeader ] = currentSeqLength currentSeqLength = 0 currentSeqHeader = line[1:-1] if verbose > 0: print "current header: %s" % ( currentSeqHeader ) sys.stdout.flush() else: currentSeqLength += len( line.replace("\n","") ) line = inFileHandler.readline() dHeader2Length[ currentSeqHeader ] = currentSeqLength inFileHandler.close() return dHeader2Length ## Convert headers from a fasta file having chunk coordinates # # @param inFile string name of the input fasta file # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes # @param outFile string name of the output file # @staticmethod def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): inFileHandler = open(inFile, "r") outFileHandler = open(outFile, "w") dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) iConvCoord = ConvCoord() line = inFileHandler.readline() while line: if line[0] == ">": if "{Fragment}" in line: chkName = line.split(" ")[1] chrName = dChunk2Map[chkName].seqname lCoordPairs = line.split(" ")[3].split(",") lRangesOnChk = [] for i in lCoordPairs: iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) lRangesOnChk.append(iRange) lRangesOnChr = [] for iRange in lRangesOnChk: lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) newHeader = line[1:-1].split(" ")[0] newHeader += " %s" % chrName newHeader += " {Fragment}" newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) for iRange in lRangesOnChr[1:]: newHeader += ",%i..%i" % (iRange.start, iRange.end) outFileHandler.write(">%s\n" % newHeader) else: chkName = line.split("_")[1].split(" ")[0] chrName = dChunk2Map[chkName].seqname coords = line[line.find("[")+1 : line.find("]")] start = int(coords.split(",")[0]) end = int(coords.split(",")[1]) iRangeOnChk = Range(chkName, start, end) iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) newHeader = line[1:-1].split("_")[0] newHeader += " %s" % chrName newHeader += " %s" % line[line.find("(") : line.find(")")+1] newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) outFileHandler.write(">%s\n" % newHeader) else: outFileHandler.write(line) line = inFileHandler.readline() inFileHandler.close() outFileHandler.close() ## Convert a fasta file to a length file # # @param inFile string name of the input fasta file # @param outFile string name of the output file # @staticmethod def convertFastaToLength(inFile, outFile = ""): if outFile == "": outFile = "%s.length" % inFile if inFile != "": with open(inFile, "r") as inFH: with open(outFile, "w") as outFH: bioseq = Bioseq() bioseq.read(inFH) while bioseq.sequence != None: seqLen = bioseq.getLength() outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) bioseq.read(inFH) ## Convert a fasta file to a seq file # # @param inFile string name of the input fasta file # @param outFile string name of the output file # @staticmethod def convertFastaToSeq(inFile, outFile = ""): if outFile == "": outFile = "%s.seq" % inFile if inFile != "": with open(inFile, "r") as inFH: with open(outFile, "w") as outFH: bioseq = Bioseq() bioseq.read(inFH) while bioseq.sequence != None: seqLen = bioseq.getLength() outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ bioseq.sequence, bioseq.header, seqLen)) bioseq.read(inFH) ## Splice an input fasta file using coordinates in a Map file # # @note the coordinates should be merged beforehand! # @staticmethod def spliceFromCoords( genomeFile, coordFile, obsFile ): genomeFileHandler = open( genomeFile, "r" ) obsFileHandler = open( obsFile, "w" ) dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile ) bs = Bioseq() bs.read( genomeFileHandler ) while bs.sequence: if dChr2Maps.has_key( bs.header ): lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] ) splicedSeq = "" currentSite = 0 for iMap in lCoords: minSplice = iMap.getMin() - 1 if minSplice > currentSite: splicedSeq += bs.sequence[ currentSite : minSplice ] currentSite = iMap.getMax() splicedSeq += bs.sequence[ currentSite : ] bs.sequence = splicedSeq bs.write( obsFileHandler ) bs.read( genomeFileHandler ) genomeFileHandler.close() obsFileHandler.close() ## Shuffle input sequences (single file or files in a directory) # @staticmethod def dbShuffle( inData, outData, verbose=0 ): if CheckerUtils.isExecutableInUserPath("esl-shuffle"): prg = "esl-shuffle" else : prg = "shuffle" genericCmd = prg + " -d INPUT > OUTPUT" if os.path.isfile( inData ): if verbose > 0: print "shuffle input file '%s'" % inData cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData) print cmd returnStatus = os.system( cmd ) if returnStatus != 0: sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) sys.exit(1) elif os.path.isdir( inData ): if verbose > 0: print "shuffle files in input directory '%s'" % inData if os.path.exists( outData ): shutil.rmtree( outData ) os.mkdir( outData ) lInputFiles = glob.glob( "%s/*.fa" %( inData ) ) nbFastaFiles = 0 for inputFile in lInputFiles: nbFastaFiles += 1 if verbose > 1: print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) ) fastaBaseName = os.path.basename( inputFile ) prefix, extension = os.path.splitext( fastaBaseName ) cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix)) returnStatus = os.system( cmd ) if returnStatus != 0: sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) sys.exit(1) ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers # # @param inClusterFileName string input cluster file name # @param inFastaFileName string input fasta file name # @param outFileName string output file name # @param verbosity integer verbosity # @staticmethod def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) iFastaParser = FastaParser(inFastaFileName) with open(outFileName, "w") as f: for iSequence in iFastaParser.getIterator(): header = iSequence.getName() if dHeader2ClusterClusterMember.get(header): cluster = dHeader2ClusterClusterMember[header][0] member = dHeader2ClusterClusterMember[header][1] else: clusterIdForSingletonCluster += 1 cluster = clusterIdForSingletonCluster member = 1 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) iSequence.setName(newHeader) f.write(iSequence.printFasta()) @staticmethod def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): dHeader2ClusterClusterMember = {} clusterId = 0 with open(inClusterFileName) as f: line = f.readline() while line: lineWithoutLastChar = line.rstrip() lHeaders = lineWithoutLastChar.split("\t") clusterId += 1 if verbosity > 0: print "%i sequences in cluster %i" % (len(lHeaders), clusterId) memberId = 0 for header in lHeaders: memberId += 1 dHeader2ClusterClusterMember[header] = (clusterId, memberId) line = f.readline() if verbosity > 0: print "%i clusters" % clusterId return dHeader2ClusterClusterMember, clusterId @staticmethod def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): """ Write a map file from fasta output of clustering tool. Warning: only works if input fasta headers are formated like LTRharvest fasta output. """ if not outMapFileName: outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) fileDb = open(fastaFileNameFromClustering , "r") fileMap = open(outMapFileName, "w") seq = Bioseq() numseq = 0 while 1: seq.read(fileDb) if seq.sequence == None: break numseq = numseq + 1 ID = seq.header.split(' ')[0].split('_')[0] chunk = seq.header.split(' ')[0].split('_')[1] start = seq.header.split(' ')[-1].split(',')[0][1:] end = seq.header.split(' ')[-1].split(',')[1][:-1] line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) fileMap.write(line + "\n") fileDb.close() fileMap.close() print "saved in %s" % outMapFileName @staticmethod def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): outFormat = outFormat.lower() if outFormat in ["gff", "gff3"]: outFormat = "gff3" else: outFormat = "map" lTNstretches = [] if nbN != 0: iBSDB = BioseqDB(fastaFileName) for iBS in iBSDB.db: nbNFound = 0 start = 1 pos = 1 lastPos = 0 while pos <= iBS.getLength(): if nbNFound == 0: start = pos while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: nbNFound += 1 pos += 1 lastPos = pos if pos - lastPos >= nbN: if nbNFound >= nbN: lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) nbNFound = 0 pos += 1 if nbNFound >= nbN: lTNstretches.append((iBS.getHeader(), start, lastPos - 1)) lTNstretches.sort(key = itemgetter(0, 1, 2)) if outFileName == "": outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) with open(outFileName, "w") as fH: if outFormat == "gff3": fH.write("##gff-version 3\n") for item in lTNstretches: seq = item[0] start = item[1] end = item[2] if outFormat == "gff3": fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) else: fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))