Mercurial > repos > urgi-team > teiso
view TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 13:feef9a0db09d draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 09:04:42 -0400 |
parents | |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. import os import re import sys import math import glob import string import shutil from commons.core.utils.FileUtils import FileUtils from commons.core.seq.Bioseq import Bioseq from commons.core.seq.BioseqDB import BioseqDB from commons.core.coord.MapUtils import MapUtils from commons.core.coord.Range import Range from commons.core.coord.ConvCoord import ConvCoord from commons.core.parsing.FastaParser import FastaParser from commons.core.checker.CheckerUtils import CheckerUtils from commons.core.launcher.LauncherUtils import LauncherUtils ## Static methods for fasta file manipulation # class FastaUtils( object ): ## Count the number of sequences in the input fasta file # # @param inFile name of the input fasta file # # @return integer number of sequences in the input fasta file # @staticmethod def dbSize( inFile ): nbSeq = 0 with open(inFile) as fH: for line in fH: if line[0] == ">": nbSeq += 1 return nbSeq ## Compute the cumulative sequence length in the input fasta file # # @param inFile handler of the input fasta file # @staticmethod def dbCumLength( inFile ): cumLength = 0 for line in inFile: if line[0] != ">": cumLength += len(string.rstrip(line)) return cumLength ## Return a list with the length of each sequence in the input fasta file # # @param inFile string name of the input fasta file # @staticmethod def dbLengths(inFile): lLengths = [] currentLength = 0 with open(inFile) as fH: for line in fH: if line[0] == ">": if currentLength != 0: lLengths.append(currentLength) currentLength = 0 else: currentLength += len(line[:-1]) lLengths.append(currentLength) return lLengths ## Retrieve the sequence headers present in the input fasta file # # @param inFile string name of the input fasta file # @param verbose integer level of verbosity # # @return list of sequence headers # @staticmethod def dbHeaders(inFile, verbose = 0): lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"] if verbose: for header in lHeaders: print header return lHeaders ## Cut a data bank into chunks according to the input parameters # If a sequence is shorter than the threshold, it is only renamed (not cut) # # @param inFileName string name of the input fasta file # @param chkLgth string chunk length (in bp, default=200000) # @param chkOver string chunk overlap (in bp, default=10000) # @param wordN string N stretch word length (default=11, 0 for no detection) # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map') # @param clean boolean remove 'cut' and 'Nstretch' files # @param verbose integer (default = 0) # @staticmethod def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0): nbSeq = FastaUtils.dbSize(inFileName) if verbose > 0: print "cut the %i input sequences with cutterDB..." % nbSeq sys.stdout.flush() prg = "cutterDB" cmd = prg cmd += " -l %s" % chkLgth cmd += " -o %s" % chkOver cmd += " -w %s" % wordN cmd += " %s" % inFileName returnStatus = os.system(cmd) if returnStatus != 0: msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus) sys.stderr.write("%s\n" % msg) sys.exit(1) nbChunks = FastaUtils.dbSize("%s_cut" % inFileName) if verbose > 0: print "done (%i chunks)" % ( nbChunks ) sys.stdout.flush() if verbose > 0: print "rename the headers..." sys.stdout.flush() if outFilePrefix == "": outFastaName = inFileName + "_chunks.fa" outMapName = inFileName + "_chunks.map" else: outFastaName = outFilePrefix + ".fa" outMapName = outFilePrefix + ".map" with open("%s_cut" % inFileName) as inFile: outFasta = open(outFastaName, "w") outMap = open(outMapName, "w") for line in inFile: if line[0] == ">": if verbose > 1: print "rename '%s'" % (line[:-1]); sys.stdout.flush() data = line[:-1].split(" ") seqID = data[0].split(">")[1] newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks)))) oldHeader = data[2] seqStart = data[4].split("..")[0] seqEnd = data[4].split("..")[1] outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd)) outFasta.write(">%s\n" % newHeader) else: outFasta.write(line.upper()) outFasta.close() outMap.close() #stats on .Nstretch.map file genomeLength = FastaUtils.dbCumLength(open(inFileName)) NstretchMapFile = inFileName + ".Nstretch.map" outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w") if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile): outNstrechStats.write("No N in stretch length > %s\n" % wordN) else: with open(NstretchMapFile) as f: dHeader2lLengths = {} for line in f: data = line.rstrip().split() header = data[1] length = int(data[3]) - int(data[2]) + 1 if header not in dHeader2lLengths: dHeader2lLengths[header] = [] dHeader2lLengths[header].append(length) for header in sorted(dHeader2lLengths): lLengths = dHeader2lLengths[header] outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths))) cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()]) NstretchPrct = float(cumulAllStretch)/genomeLength*100 outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct)) outNstrechStats.close() if clean == True: os.remove(inFileName + "_cut") os.remove(NstretchMapFile) ## Split the input fasta file in several output files # # @param inFile string name of the input fasta file # @param nbSeqPerBatch integer number of sequences per output file # @param newDir boolean put the sequences in a new directory called 'batches' # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') # @param prefix prefix in output file name # @param verbose integer verbosity level (default = 0) # @staticmethod def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0): if not os.path.exists(inFile): msg = "ERROR: file '%s' doesn't exist" % inFile sys.stderr.write("%s\n" % msg) sys.exit(1) nbSeq = FastaUtils.dbSize(inFile) nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch))) if verbose > 0: print "save the %i input sequences into %i batches" % (nbSeq, nbBatches) sys.stdout.flush() if nbSeqPerBatch != 1 and useSeqHeader: useSeqHeader = False if newDir == True: if os.path.exists("batches"): shutil.rmtree("batches") os.mkdir("batches") os.chdir("batches") os.system("ln -s ../%s ." % inFile) with open(inFile) as inFileHandler: countBatch = 0 countSeq = 0 for line in inFileHandler: if line[0] == ">": countSeq += 1 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: try: outFile.close() except: pass countBatch += 1 if useSeqHeader: outFileName = "%s.fa" % (line[1:-1].replace(" ", "_")) else: outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches)))) outFile = open(outFileName, "w") if verbose > 1: print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName) sys.stdout.flush() outFile.write(line) if newDir: os.remove(os.path.basename(inFile)) os.chdir("..") ## Split the input fasta file in several output files # # @param inFileName string name of the input fasta file # @param maxSize integer max cumulative length for each output file # @staticmethod def splitFastaFileInBatches(inFileName, maxSize = 200000): iBioseqDB = BioseqDB(inFileName) lHeadersSizeTuples = [] for iBioseq in iBioseqDB.db: lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) os.mkdir("batches") os.chdir("batches") iterator = 0 for lHeader in lHeadersList: iterator += 1 with open("batch_%s.fa" % iterator, 'w') as f: for header in lHeader : iBioseq = iBioseqDB.fetch(header) iBioseq.write(f) os.chdir("..") ## Split the input fasta file in several output files according to their cluster identifier # # @param inFileName string name of the input fasta file # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust) # @param simplifyHeader boolean simplify the headers # @param createDir boolean put the sequences in different directories # @param outPrefix string prefix of the output files (default='seqCluster') # @param verbose integer (default = 0) # @staticmethod def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0): if not os.path.exists(inFileName): print "ERROR: %s doesn't exist" % inFileName sys.exit(1) inFile = open(inFileName) line = inFile.readline() if line: name = line.split(" ")[0] if "Cluster" in name: clusterID = name.split("Cluster")[1].split("Mb")[0] seqID = name.split("Mb")[1] else: clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust if "Q" in name.split("Gr")[0]: seqID = name.split("Gr")[0].split("MbQ")[1] elif "S" in name: seqID = name.split("Gr")[0].split("MbS")[1] sClusterIDs = set( [ clusterID ] ) if simplifyHeader == True: header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) else: header = line[1:-1] if createDir == True: if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) outFileName = "%s%s.fa" % ( outPrefix, clusterID ) outFile = open( outFileName, "w" ) outFile.write( ">%s\n" % ( header ) ) prevClusterID = clusterID line = inFile.readline() while line: if line[0] == ">": name = line.split(" ")[0] if "Cluster" in name: clusterID = name.split("Cluster")[1].split("Mb")[0] seqID = name.split("Mb")[1] else: clusterID = name.split("Cl")[0].split("Gr")[1] if "Q" in name.split("Gr")[0]: seqID = name.split("Gr")[0].split("MbQ")[1] elif "S" in name: seqID = name.split("Gr")[0].split("MbS")[1] if clusterID != prevClusterID: outFile.close() if simplifyHeader == True: header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID ) else: header = line[1:-1] if createDir == True: os.chdir( ".." ) if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ): os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) ) outFileName = "%s%s.fa" % ( outPrefix, clusterID ) if not os.path.exists( outFileName ): outFile = open( outFileName, "w" ) else: if clusterID != prevClusterID: outFile.close() outFile = open( outFileName, "a" ) outFile.write( ">%s\n" % ( header ) ) prevClusterID = clusterID sClusterIDs.add( clusterID ) else: outFile.write( line ) line = inFile.readline() outFile.close() if verbose > 0: print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush() if createDir == True: os.chdir("..") else: print "WARNING: empty input file - no cluster found"; sys.stdout.flush() ## Filter a fasta file in two fasta files using the length of each sequence as a criterion # # @param len_min integer length sequence criterion to filter # @param inFileName string name of the input fasta file # @param verbose integer (default = 0) # @staticmethod def dbLengthFilter(len_min, inFileName, verbose = 0): file_db = open(inFileName,) file_dbInf = open(inFileName + ".Inf" + str(len_min), "w") file_dbSup = open(inFileName + ".Sup" + str(len_min), "w") seq = Bioseq() numseq = 0 nbsaveInf = 0 nbsaveSup = 0 seq.read(file_db) while seq.getHeader(): l = seq.getLength() numseq = numseq + 1 if l >= len_min: seq.write(file_dbSup) if verbose > 0: nbsaveSup = nbsaveSup + 1 else: seq.write(file_dbInf) if verbose > 0: nbsaveInf = nbsaveInf + 1 seq.read(file_db) file_db.close() file_dbInf.close() file_dbSup.close() if verbose > 0: print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min)) ## Extract the longest sequences from a fasta file # # @param num integer maximum number of sequences in the output file # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param minThresh integer minimum length threshold (default=0) # @param verbose integer (default = 0) # @staticmethod def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0): bsDB = BioseqDB(inFileName) if verbose > 0: print "nb of input sequences: %i" % bsDB.getSize() if outFileName == "": outFileName = inFileName + ".best" + str(num) outFile = open( outFileName, "w" ) if bsDB.getSize()==0: return 0 num = int(num) if verbose > 0: print "keep the %i longest sequences" % num if minThresh > 0: print "with length > %i bp" % minThresh sys.stdout.flush() tmpLSeqLgth = bsDB.getListOfSequencesLength() tmpLSeqLgth.sort(reverse = True) # select the longests lSeqLgth = [] for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): if tmpLSeqLgth[i] >= minThresh: lSeqLgth.append( tmpLSeqLgth[i] ) if verbose > 0: print "selected max length: %i" % max(lSeqLgth) print "selected min length: %i" % min(lSeqLgth) sys.stdout.flush() # save the longest inFile = open( inFileName ) seqNum = 0 nbSave = 0 for bs in bsDB.db: seqNum += 1 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: bs.write( outFile ) if verbose > 1: print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) sys.stdout.flush() nbSave += 1 if nbSave == num: break inFile.close() outFile.close() if verbose > 0: print nbSave, "saved sequences in ", outFileName sys.stdout.flush() return 0 ## Extract all the sequence headers from a fasta file and write them in a new file # # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') # @staticmethod def dbExtractSeqHeaders(inFileName, outFileName = ""): if not outFileName: outFileName = inFileName + ".headers" with open(outFileName, "w") as f: for header in FastaUtils.dbHeaders(inFileName): f.write("%s\n" % header) return 0 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file # # @param pattern regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0): if not pattern: return if not outFileName: outFileName = inFileName + '.extracted' outFile = open(outFileName, 'w') patternTosearch = re.compile(pattern) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 inFile = open(inFileName) bioseq.read(inFile) while bioseq.sequence: bioseqNb = bioseqNb + 1 m = patternTosearch.search(bioseq.header) if m: bioseq.write(outFile) if verbose > 1: print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!' savedBioseqNb = savedBioseqNb + 1 bioseq.read(inFile) inFile.close() outFile.close() if verbose > 0: print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file # # @param patternFileName string file containing regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): if not patternFileName: print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] with open(inFileName) as inFile: bioseq.read(inFile) while bioseq.sequence: lHeaders.append(bioseq.header) bioseq.read(inFile) lHeadersToKeep = [] with open(patternFileName) as patternFile: for pattern in patternFile: pattern = pattern.rstrip() if verbose > 0: print "pattern: ", pattern; sys.stdout.flush() patternToSearch = re.compile(pattern) lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)]) if not outFileName: outFileName = inFileName + ".extracted" with open(outFileName, "w") as outFile: with open(inFileName) as inFile: bioseq.read(inFile) while bioseq.sequence: bioseqNb += 1 if bioseq.header in lHeadersToKeep: bioseq.write(outFile) savedBioseqNb += 1 if verbose > 1: print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() bioseq.read(inFile) if verbose > 0: print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file # # @param pattern regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0): if not pattern: return patternToSearch = re.compile(pattern) if outFileName == "": outFileName = inFileName + '.cleaned' with open(outFileName, 'w') as outFile: bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 with open(inFileName) as inFile: bioseq.read(inFile) while bioseq.sequence: bioseqNb += 1 if not patternToSearch.search(bioseq.header): bioseq.write(outFile) savedBioseqNb += 1 if verbose > 1: print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!' bioseq.read(inFile) if verbose > 0: print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file # # @param patternFileName string file containing regular expression to search in headers # @param inFileName string name of the input fasta file # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0): if not patternFileName: print "ERROR: no file of pattern" sys.exit(1) bioseq = Bioseq() bioseqNb = 0 savedBioseqNb = 0 lHeaders = [] with open(inFileName) as inFile: bioseq.read(inFile) while bioseq.sequence: lHeaders.append(bioseq.header) bioseq.read(inFile) with open(patternFileName) as patternFile: lHeadersToRemove = [] for pattern in patternFile: pattern = pattern.rstrip() if verbose > 0: print "pattern: ", pattern; sys.stdout.flush() patternToSearch = re.compile(pattern) lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)]) if not outFileName: outFileName = inFileName + '.cleaned' with open(outFileName, 'w') as outFile: bioseqNum = 0 with open(inFileName) as inFile: bioseq.read(inFile) while bioseq.sequence: bioseqNum += 1 if bioseq.header not in lHeadersToRemove: bioseq.write(outFile) savedBioseqNb += 1 if verbose > 1: print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush() bioseq.read(inFile) if verbose > 0: print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName) ## Find sequence's ORFs from a fasta file and write them in a tab file # # @param inFileName string name of the input fasta file # @param orfMaxNb integer Select orfMaxNb best ORFs # @param orfMinLength integer Keep ORFs with length > orfMinLength # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map') # @param verbose integer verbosity level (default = 0) # @staticmethod def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0): if not outFileName: outFileName = inFileName + ".ORF.map" outFile = open(outFileName, "w") bioseq = Bioseq() bioseqNb = 0 inFile = open(inFileName) bioseq.read(inFile) while bioseq.sequence: bioseq.upCase() bioseqNb += 1 if verbose > 0: print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]' orf = bioseq.findORF() bestOrf = [] for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = orf[i][j - 1] + 4 end = orf[i][j] + 3 if end - start >= orfMinLength: bestOrf.append((end - start, i + 1, start, end)) bioseq.reverseComplement() orf = bioseq.findORF() seqLen = bioseq.getLength() for i in orf.keys(): orfLen = len(orf[i]) for j in xrange(1, orfLen): start = seqLen - orf[i][j - 1] - 3 end = seqLen - orf[i][j] - 2 if start - end >= orfMinLength: bestOrf.append((start - end, (i + 1) * -1, start, end)) bestOrf.sort(reverse = True) bestOrfNb = len(bestOrf) if orfMaxNb != 0 and orfMaxNb < bestOrfNb: bestOrfNb = orfMaxNb for i in xrange(0, bestOrfNb): if verbose > 0: print bestOrf[i] outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \ "|" + str(bestOrf[i][0]), bioseq.header, bestOrf[i][2], bestOrf[i][3])) bioseq.read(inFile) inFile.close() outFile.close() return 0 ## Sort sequences by increasing length (write a new file) # # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param verbose integer verbosity level # @staticmethod def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0): if verbose > 0: print "sort sequences by increasing length" sys.stdout.flush() if not os.path.exists(inFileName): print "ERROR: file '%s' doesn't exist" % (inFileName) sys.exit(1) # read each seq one by one # save them in distinct temporary files # with their length in the name inFileHandler = open(inFileName, "r") countSeq = 0 bs = Bioseq() bs.read(inFileHandler) while bs.header: countSeq += 1 tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq) bs.appendBioseqInFile(tmpFile) if verbose > 1: print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile) bs.header = "" bs.sequence = "" bs.read(inFileHandler) inFileHandler.close() # sort temporary file names # concatenate them into the output file if os.path.exists(outFileName): os.remove(outFileName) lFiles = glob.glob("*bp_*nb") lFiles.sort(key = lambda s:int(s.split("bp_")[0])) for fileName in lFiles: cmd = "cat %s >> %s" % (fileName, outFileName) returnValue = os.system(cmd) if returnValue != 0: print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName) sys.exit(1) os.remove(fileName) return 0 ## Sort sequences by header # # @param inFileName string name of the input fasta file # @param outFileName string name of the output fasta file # @param verbose integer verbosity level # @staticmethod def sortSequencesByHeader(inFileName, outFileName = ""): if outFileName == "": outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] iBioseqDB = BioseqDB(inFileName) with open(outFileName, "w") as f: for header in sorted(iBioseqDB.getHeaderList()): iBioseq = iBioseqDB.fetch(header) iBioseq.write(f) ## Return a dictionary which keys are the headers and values the length of the sequences # # @param inFile string name of the input fasta file # @param verbose integer verbosity level # @staticmethod def getLengthPerHeader(inFile, verbose = 0): dHeader2Length = {} with open(inFile) as inFileHandler: currentSeqHeader = "" currentSeqLength = 0 for line in inFileHandler: if line[0] == ">": if currentSeqHeader != "": dHeader2Length[currentSeqHeader] = currentSeqLength currentSeqLength = 0 currentSeqHeader = line[1:-1] if verbose > 0: print "current header: %s" % currentSeqHeader sys.stdout.flush() else: currentSeqLength += len(line.replace("\n", "")) dHeader2Length[currentSeqHeader] = currentSeqLength return dHeader2Length ## Convert headers from a fasta file having chunk coordinates # # @param inFile string name of the input fasta file # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes # @param outFile string name of the output file # @staticmethod def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile): inFileHandler = open(inFile, "r") outFileHandler = open(outFile, "w") dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile) iConvCoord = ConvCoord() for line in inFileHandler: if line[0] == ">": if "{Fragment}" in line: chkName = line.split(" ")[1] chrName = dChunk2Map[chkName].seqname lCoordPairs = line.split(" ")[3].split(",") lRangesOnChk = [] for i in lCoordPairs: iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1])) lRangesOnChk.append(iRange) lRangesOnChr = [] for iRange in lRangesOnChk: lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map)) newHeader = line[1:-1].split(" ")[0] newHeader += " %s" % chrName newHeader += " {Fragment}" newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end) for iRange in lRangesOnChr[1:]: newHeader += ",%i..%i" % (iRange.start, iRange.end) outFileHandler.write(">%s\n" % newHeader) else: chkName = line.split("_")[1].split(" ")[0] chrName = dChunk2Map[chkName].seqname coords = line[line.find("[")+1 : line.find("]")] start = int(coords.split(",")[0]) end = int(coords.split(",")[1]) iRangeOnChk = Range(chkName, start, end) iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map) newHeader = line[1:-1].split("_")[0] newHeader += " %s" % chrName newHeader += " %s" % line[line.find("(") : line.find(")")+1] newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd()) outFileHandler.write(">%s\n" % newHeader) else: outFileHandler.write(line) inFileHandler.close() outFileHandler.close() ## Convert a fasta file to a length file # # @param inFile string name of the input fasta file # @param outFile string name of the output file # @staticmethod def convertFastaToLength(inFile, outFile = ""): if not outFile: outFile = "%s.length" % inFile if inFile: with open(inFile) as inFH: with open(outFile, "w") as outFH: bioseq = Bioseq() bioseq.read(inFH) while bioseq.sequence: seqLen = bioseq.getLength() outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) bioseq.read(inFH) ## Convert a fasta file to a seq file # # @param inFile string name of the input fasta file # @param outFile string name of the output file # @staticmethod def convertFastaToSeq(inFile, outFile = ""): if not outFile: outFile = "%s.seq" % inFile if inFile: with open(inFile) as inFH: with open(outFile, "w") as outFH: bioseq = Bioseq() bioseq.read(inFH) while bioseq.sequence: seqLen = bioseq.getLength() outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ bioseq.sequence, bioseq.header, seqLen)) bioseq.read(inFH) ## Splice an input fasta file using coordinates in a Map file # # @note the coordinates should be merged beforehand! # @staticmethod def spliceFromCoords(genomeFile, coordFile, obsFile): genomeFileHandler = open(genomeFile) obsFileHandler = open(obsFile, "w") dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile) bs = Bioseq() bs.read(genomeFileHandler) while bs.sequence: if dChr2Maps.has_key(bs.header): lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header]) splicedSeq = [] currentSite = 0 for iMap in lCoords: minSplice = iMap.getMin() - 1 if minSplice > currentSite: splicedSeq += bs.sequence[currentSite : minSplice] if currentSite <= iMap.getMax(): currentSite = iMap.getMax() splicedSeq += bs.sequence[currentSite:] bs.sequence = "".join(splicedSeq) bs.write(obsFileHandler) bs.read(genomeFileHandler) genomeFileHandler.close() obsFileHandler.close() ## Shuffle input sequences (single file or files in a directory) # @staticmethod def dbShuffle(inData, outData, verbose = 0): if CheckerUtils.isExecutableInUserPath("esl-shuffle"): prg = "esl-shuffle" else : prg = "shuffle" genericCmd = prg + " --seed 1 -d INPUT > OUTPUT" if os.path.isfile(inData): if verbose > 0: print "shuffle input file '%s'" % inData cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData) print cmd returnStatus = os.system(cmd) if returnStatus: sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) sys.exit(1) elif os.path.isdir(inData): if verbose > 0: print "shuffle files in input directory '%s'" % inData if os.path.exists(outData): shutil.rmtree(outData) os.mkdir(outData) lInputFiles = glob.glob("%s/*.fa" % inData) nbFastaFiles = 0 for inputFile in lInputFiles: nbFastaFiles += 1 if verbose > 1: print "%3i / %3i" % (nbFastaFiles, len(lInputFiles)) fastaBaseName = os.path.basename(inputFile) prefix = os.path.splitext(fastaBaseName)[0] cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix)) returnStatus = os.system(cmd) if returnStatus: sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus) sys.exit(1) ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers # # @param inClusterFileName string input cluster file name # @param inFastaFileName string input fasta file name # @param outFileName string output file name # @param verbosity integer verbosity # @staticmethod def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) iFastaParser = FastaParser(inFastaFileName) with open(outFileName, "w") as f: for iSequence in iFastaParser.getIterator(): header = iSequence.getName() if dHeader2ClusterClusterMember.get(header): cluster = dHeader2ClusterClusterMember[header][0] member = dHeader2ClusterClusterMember[header][1] else: clusterIdForSingletonCluster += 1 cluster = clusterIdForSingletonCluster member = 1 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) iSequence.setName(newHeader) f.write(iSequence.printFasta()) @staticmethod def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): dHeader2ClusterClusterMember = {} clusterId = 0 with open(inClusterFileName) as f: for line in f: lineWithoutLastChar = line.rstrip() lHeaders = lineWithoutLastChar.split("\t") clusterId += 1 if verbosity > 0: print "%i sequences in cluster %i" % (len(lHeaders), clusterId) memberId = 0 for header in lHeaders: memberId += 1 dHeader2ClusterClusterMember[header] = (clusterId, memberId) if verbosity > 0: print "%i clusters" % clusterId return dHeader2ClusterClusterMember, clusterId @staticmethod def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): """ Write a map file from fasta output of clustering tool. Warning: only works if input fasta headers are formated like LTRharvest fasta output. """ if not outMapFileName: outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) fileDb = open(fastaFileNameFromClustering) fileMap = open(outMapFileName, "w") seq = Bioseq() numseq = 0 seq.read(fileDb) while seq.sequence: numseq = numseq + 1 ID = seq.header.split(' ')[0].split('_')[0] chunk = seq.header.split(' ')[0].split('_')[1] start = seq.header.split(' ')[-1].split(',')[0][1:] end = seq.header.split(' ')[-1].split(',')[1][:-1] line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) fileMap.write("%s\n" % line) seq.read(fileDb) fileDb.close() fileMap.close() print "saved in %s" % outMapFileName @staticmethod def getNstretchesRangesList(fastaFileName, nbN = 2): lNstretchesRanges = [] if nbN != 0: iBSDB = BioseqDB(fastaFileName) for iBS in iBSDB.db: nbNFound = 0 start = 1 pos = 1 lastPos = 0 while pos <= iBS.getLength(): if nbNFound == 0: start = pos while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']: nbNFound += 1 pos += 1 lastPos = pos if pos - lastPos >= nbN: if nbNFound >= nbN: lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) nbNFound = 0 pos += 1 if nbNFound >= nbN: lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1)) lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False) return lNstretchesRanges @staticmethod def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"): lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN) outFormat = outFormat.lower() if outFormat in ["gff", "gff3"]: outFormat = "gff3" else: outFormat = "map" if outFileName == "": outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat) with open(outFileName, "w") as fH: if outFormat == "gff3": fH.write("##gff-version 3\n") for iRange in lNstretchesRanges: seq = iRange.getSeqname() start = iRange.getStart() end = iRange.getEnd() if outFormat == "gff3": fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end)) else: fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))