| 
6
 | 
     1 # Copyright INRA (Institut National de la Recherche Agronomique)
 | 
| 
 | 
     2 # http://www.inra.fr
 | 
| 
 | 
     3 # http://urgi.versailles.inra.fr
 | 
| 
 | 
     4 #
 | 
| 
 | 
     5 # This software is governed by the CeCILL license under French law and
 | 
| 
 | 
     6 # abiding by the rules of distribution of free software.  You can  use, 
 | 
| 
 | 
     7 # modify and/ or redistribute the software under the terms of the CeCILL
 | 
| 
 | 
     8 # license as circulated by CEA, CNRS and INRIA at the following URL
 | 
| 
 | 
     9 # "http://www.cecill.info". 
 | 
| 
 | 
    10 #
 | 
| 
 | 
    11 # As a counterpart to the access to the source code and  rights to copy,
 | 
| 
 | 
    12 # modify and redistribute granted by the license, users are provided only
 | 
| 
 | 
    13 # with a limited warranty  and the software's author,  the holder of the
 | 
| 
 | 
    14 # economic rights,  and the successive licensors  have only  limited
 | 
| 
 | 
    15 # liability. 
 | 
| 
 | 
    16 #
 | 
| 
 | 
    17 # In this respect, the user's attention is drawn to the risks associated
 | 
| 
 | 
    18 # with loading,  using,  modifying and/or developing or reproducing the
 | 
| 
 | 
    19 # software by the user in light of its specific status of free software,
 | 
| 
 | 
    20 # that may mean  that it is complicated to manipulate,  and  that  also
 | 
| 
 | 
    21 # therefore means  that it is reserved for developers  and  experienced
 | 
| 
 | 
    22 # professionals having in-depth computer knowledge. Users are therefore
 | 
| 
 | 
    23 # encouraged to load and test the software's suitability as regards their
 | 
| 
 | 
    24 # requirements in conditions enabling the security of their systems and/or 
 | 
| 
 | 
    25 # data to be ensured and,  more generally, to use and operate it in the 
 | 
| 
 | 
    26 # same conditions as regards security. 
 | 
| 
 | 
    27 #
 | 
| 
 | 
    28 # The fact that you are presently reading this means that you have had
 | 
| 
 | 
    29 # knowledge of the CeCILL license and that you accept its terms.
 | 
| 
 | 
    30 
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 import os
 | 
| 
 | 
    33 import sys
 | 
| 
 | 
    34 import string
 | 
| 
 | 
    35 import math
 | 
| 
 | 
    36 import shutil
 | 
| 
 | 
    37 import re
 | 
| 
 | 
    38 import glob
 | 
| 
18
 | 
    39 from operator import itemgetter
 | 
| 
6
 | 
    40 from commons.core.seq.BioseqDB import BioseqDB
 | 
| 
 | 
    41 from commons.core.seq.Bioseq import Bioseq
 | 
| 
 | 
    42 from commons.core.coord.MapUtils import MapUtils
 | 
| 
 | 
    43 from commons.core.coord.Range import Range
 | 
| 
 | 
    44 from commons.core.checker.CheckerUtils import CheckerUtils
 | 
| 
 | 
    45 from commons.core.launcher.LauncherUtils import LauncherUtils
 | 
| 
 | 
    46 from commons.core.coord.ConvCoord import ConvCoord
 | 
| 
 | 
    47 from commons.core.parsing.FastaParser import FastaParser
 | 
| 
 | 
    48 
 | 
| 
 | 
    49 
 | 
| 
 | 
    50 ## Static methods for fasta file manipulation
 | 
| 
 | 
    51 #
 | 
| 
 | 
    52 class FastaUtils( object ):
 | 
| 
 | 
    53     
 | 
| 
 | 
    54     ## Count the number of sequences in the input fasta file
 | 
| 
 | 
    55     #
 | 
| 
 | 
    56     # @param inFile name of the input fasta file
 | 
| 
 | 
    57     #
 | 
| 
 | 
    58     # @return integer number of sequences in the input fasta file
 | 
| 
 | 
    59     #
 | 
| 
 | 
    60     @staticmethod
 | 
| 
 | 
    61     def dbSize( inFile ):
 | 
| 
 | 
    62         nbSeq = 0
 | 
| 
 | 
    63         inFileHandler = open( inFile, "r" )
 | 
| 
 | 
    64         line = inFileHandler.readline()
 | 
| 
 | 
    65         while line:
 | 
| 
 | 
    66             if line[0] == ">":
 | 
| 
 | 
    67                 nbSeq = nbSeq + 1
 | 
| 
 | 
    68             line = inFileHandler.readline()
 | 
| 
 | 
    69         inFileHandler.close()
 | 
| 
 | 
    70         
 | 
| 
 | 
    71         return nbSeq
 | 
| 
 | 
    72     
 | 
| 
 | 
    73     
 | 
| 
 | 
    74     ## Compute the cumulative sequence length in the input fasta file
 | 
| 
 | 
    75     #
 | 
| 
 | 
    76     # @param inFile handler of the input fasta file
 | 
| 
 | 
    77     #
 | 
| 
 | 
    78     @staticmethod
 | 
| 
 | 
    79     def dbCumLength( inFile ):
 | 
| 
 | 
    80         cumLength = 0
 | 
| 
 | 
    81         line = inFile.readline()
 | 
| 
 | 
    82         while line:
 | 
| 
 | 
    83             if line[0] != ">":
 | 
| 
 | 
    84                 cumLength += len(string.rstrip(line))
 | 
| 
 | 
    85             line = inFile.readline()
 | 
| 
 | 
    86     
 | 
| 
 | 
    87         return cumLength
 | 
| 
 | 
    88     
 | 
| 
 | 
    89     
 | 
| 
 | 
    90     ## Return a list with the length of each sequence in the input fasta file
 | 
| 
 | 
    91     #
 | 
| 
 | 
    92     # @param inFile string name of the input fasta file
 | 
| 
 | 
    93     #
 | 
| 
 | 
    94     @staticmethod
 | 
| 
 | 
    95     def dbLengths( inFile ):
 | 
| 
 | 
    96         lLengths = []
 | 
| 
 | 
    97         inFileHandler = open( inFile, "r" )
 | 
| 
 | 
    98         currentLength = 0
 | 
| 
 | 
    99         line = inFileHandler.readline()
 | 
| 
 | 
   100         while line:
 | 
| 
 | 
   101             if line[0] == ">":
 | 
| 
 | 
   102                 if currentLength != 0:
 | 
| 
 | 
   103                     lLengths.append( currentLength )
 | 
| 
 | 
   104                 currentLength = 0
 | 
| 
 | 
   105             else:
 | 
| 
 | 
   106                 currentLength += len(line[:-1])
 | 
| 
 | 
   107             line = inFileHandler.readline()
 | 
| 
 | 
   108         lLengths.append( currentLength )
 | 
| 
 | 
   109         inFileHandler.close()
 | 
| 
 | 
   110         return lLengths
 | 
| 
 | 
   111     
 | 
| 
 | 
   112     
 | 
| 
 | 
   113     ## Retrieve the sequence headers present in the input fasta file
 | 
| 
 | 
   114     #
 | 
| 
 | 
   115     # @param inFile string name of the input fasta file
 | 
| 
 | 
   116     # @param verbose integer level of verbosity
 | 
| 
 | 
   117     #
 | 
| 
 | 
   118     # @return list of sequence headers
 | 
| 
 | 
   119     #
 | 
| 
 | 
   120     @staticmethod
 | 
| 
 | 
   121     def dbHeaders( inFile, verbose=0 ):
 | 
| 
 | 
   122         lHeaders = []
 | 
| 
 | 
   123         
 | 
| 
 | 
   124         inFileHandler = open( inFile, "r" )
 | 
| 
 | 
   125         line = inFileHandler.readline()
 | 
| 
 | 
   126         while line:
 | 
| 
 | 
   127             if line[0] == ">":
 | 
| 
 | 
   128                 lHeaders.append( string.rstrip(line[1:]) )
 | 
| 
 | 
   129                 if verbose > 0:
 | 
| 
 | 
   130                     print string.rstrip(line[1:])
 | 
| 
 | 
   131             line = inFileHandler.readline()
 | 
| 
 | 
   132         inFileHandler.close()
 | 
| 
 | 
   133         
 | 
| 
 | 
   134         return lHeaders
 | 
| 
 | 
   135     
 | 
| 
 | 
   136     
 | 
| 
 | 
   137     ## Cut a data bank into chunks according to the input parameters
 | 
| 
 | 
   138     # If a sequence is shorter than the threshold, it is only renamed (not cut)
 | 
| 
 | 
   139     #
 | 
| 
 | 
   140     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   141     # @param chkLgth string chunk length (in bp, default=200000)
 | 
| 
 | 
   142     # @param chkOver string chunk overlap (in bp, default=10000)
 | 
| 
 | 
   143     # @param wordN string N stretch word length (default=11, 0 for no detection)
 | 
| 
 | 
   144     # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
 | 
| 
 | 
   145     # @param clean boolean remove 'cut' and 'Nstretch' files
 | 
| 
 | 
   146     # @param verbose integer (default = 0)
 | 
| 
 | 
   147     #
 | 
| 
 | 
   148     @staticmethod
 | 
| 
 | 
   149     def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
 | 
| 
 | 
   150         nbSeq = FastaUtils.dbSize( inFileName )
 | 
| 
 | 
   151         if verbose > 0:
 | 
| 
 | 
   152             print "cut the %i input sequences with cutterDB..." % ( nbSeq )
 | 
| 
 | 
   153             sys.stdout.flush()
 | 
| 
 | 
   154             
 | 
| 
 | 
   155         prg = "cutterDB"
 | 
| 
 | 
   156         cmd = prg
 | 
| 
 | 
   157         cmd += " -l %s" % ( chkLgth )
 | 
| 
 | 
   158         cmd += " -o %s"  %( chkOver )
 | 
| 
 | 
   159         cmd += " -w %s" % ( wordN )
 | 
| 
 | 
   160         cmd += " %s" % ( inFileName )
 | 
| 
 | 
   161         returnStatus = os.system( cmd )
 | 
| 
 | 
   162         if returnStatus != 0:
 | 
| 
 | 
   163             msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
 | 
| 
 | 
   164             sys.stderr.write( "%s\n" % ( msg ) )
 | 
| 
 | 
   165             sys.exit(1)
 | 
| 
 | 
   166             
 | 
| 
 | 
   167         nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
 | 
| 
 | 
   168         if verbose > 0:
 | 
| 
 | 
   169             print "done (%i chunks)" % ( nbChunks )
 | 
| 
 | 
   170             sys.stdout.flush()
 | 
| 
 | 
   171             
 | 
| 
 | 
   172         if verbose > 0:
 | 
| 
 | 
   173             print "rename the headers..."
 | 
| 
 | 
   174             sys.stdout.flush()
 | 
| 
 | 
   175             
 | 
| 
 | 
   176         if outFilePrefix == "":
 | 
| 
 | 
   177             outFastaName = inFileName + "_chunks.fa"
 | 
| 
 | 
   178             outMapName = inFileName + "_chunks.map"
 | 
| 
 | 
   179         else:
 | 
| 
 | 
   180             outFastaName = outFilePrefix + ".fa"
 | 
| 
 | 
   181             outMapName = outFilePrefix + ".map"
 | 
| 
 | 
   182             
 | 
| 
 | 
   183         inFile = open( "%s_cut" % ( inFileName ), "r" )
 | 
| 
 | 
   184         line = inFile.readline()
 | 
| 
 | 
   185         
 | 
| 
 | 
   186         outFasta = open( outFastaName, "w" )
 | 
| 
 | 
   187         outMap = open( outMapName, "w" )
 | 
| 
 | 
   188         
 | 
| 
 | 
   189         # read line after line (no need for big RAM) and change the sequence headers
 | 
| 
 | 
   190         while line:
 | 
| 
 | 
   191             
 | 
| 
 | 
   192             if line[0] == ">":
 | 
| 
 | 
   193                 if verbose > 1:
 | 
| 
 | 
   194                     print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
 | 
| 
 | 
   195                 data = line[:-1].split(" ")
 | 
| 
 | 
   196                 seqID = data[0].split(">")[1]
 | 
| 
 | 
   197                 newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
 | 
| 
 | 
   198                 oldHeader = data[2]
 | 
| 
 | 
   199                 seqStart = data[4].split("..")[0]
 | 
| 
 | 
   200                 seqEnd = data[4].split("..")[1]
 | 
| 
 | 
   201                 outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
 | 
| 
 | 
   202                 outFasta.write( ">%s\n" % ( newHeader ) )
 | 
| 
 | 
   203                 
 | 
| 
 | 
   204             else:
 | 
| 
 | 
   205                 outFasta.write( line.upper() )
 | 
| 
 | 
   206                 
 | 
| 
 | 
   207             line = inFile.readline()
 | 
| 
 | 
   208             
 | 
| 
 | 
   209         inFile.close()
 | 
| 
 | 
   210         outFasta.close()
 | 
| 
 | 
   211         outMap.close()
 | 
| 
 | 
   212         
 | 
| 
 | 
   213         if clean == True:
 | 
| 
 | 
   214             os.remove(inFileName + "_cut")
 | 
| 
 | 
   215             os.remove(inFileName + ".Nstretch.map")
 | 
| 
 | 
   216             
 | 
| 
 | 
   217     
 | 
| 
 | 
   218     ## Split the input fasta file in several output files
 | 
| 
 | 
   219     #
 | 
| 
 | 
   220     # @param inFile string name of the input fasta file
 | 
| 
 | 
   221     # @param nbSeqPerBatch integer number of sequences per output file
 | 
| 
 | 
   222     # @param newDir boolean put the sequences in a new directory called 'batches'
 | 
| 
 | 
   223     # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
 | 
| 
 | 
   224     # @param prefix prefix in output file name 
 | 
| 
 | 
   225     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   226     #
 | 
| 
 | 
   227     @staticmethod
 | 
| 
 | 
   228     def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ):
 | 
| 
 | 
   229         if not os.path.exists( inFile ):
 | 
| 
 | 
   230             msg = "ERROR: file '%s' doesn't exist" % ( inFile )
 | 
| 
 | 
   231             sys.stderr.write( "%s\n" % ( msg ) )
 | 
| 
 | 
   232             sys.exit(1)
 | 
| 
 | 
   233             
 | 
| 
 | 
   234         nbSeq = FastaUtils.dbSize( inFile )
 | 
| 
 | 
   235         
 | 
| 
 | 
   236         nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) )
 | 
| 
 | 
   237         if verbose > 0:
 | 
| 
 | 
   238             print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches )
 | 
| 
 | 
   239             sys.stdout.flush()
 | 
| 
 | 
   240             
 | 
| 
 | 
   241         if nbSeqPerBatch > 1 and useSeqHeader:
 | 
| 
 | 
   242             useSeqHeader = False
 | 
| 
 | 
   243             
 | 
| 
 | 
   244         if newDir == True:
 | 
| 
 | 
   245             if os.path.exists( "batches" ):
 | 
| 
 | 
   246                 shutil.rmtree( "batches" )
 | 
| 
 | 
   247             os.mkdir( "batches" )
 | 
| 
 | 
   248             os.chdir( "batches" )
 | 
| 
 | 
   249             os.system( "ln -s ../%s ." % ( inFile ) )
 | 
| 
 | 
   250             
 | 
| 
 | 
   251         inFileHandler = open( inFile, "r" )
 | 
| 
 | 
   252         inFileHandler.seek( 0, 0 )
 | 
| 
 | 
   253         countBatch = 0
 | 
| 
 | 
   254         countSeq = 0
 | 
| 
 | 
   255         line = inFileHandler.readline()
 | 
| 
 | 
   256         while line:
 | 
| 
 | 
   257             if line == "":
 | 
| 
 | 
   258                 break
 | 
| 
 | 
   259             if line[0] == ">":
 | 
| 
 | 
   260                 countSeq += 1
 | 
| 
 | 
   261                 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
 | 
| 
 | 
   262                     if "outFile" in locals():
 | 
| 
 | 
   263                         outFile.close()
 | 
| 
 | 
   264                     countBatch += 1
 | 
| 
 | 
   265                     if nbSeqPerBatch == 1 and useSeqHeader:
 | 
| 
 | 
   266                         outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") )
 | 
| 
 | 
   267                     else:
 | 
| 
 | 
   268                         outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) )
 | 
| 
 | 
   269                     outFile = open( outFileName, "w" )
 | 
| 
 | 
   270                 if verbose > 1:
 | 
| 
 | 
   271                     print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName )
 | 
| 
 | 
   272                     sys.stdout.flush()
 | 
| 
 | 
   273             outFile.write( line )
 | 
| 
 | 
   274             line = inFileHandler.readline()
 | 
| 
 | 
   275         inFileHandler.close()
 | 
| 
 | 
   276         
 | 
| 
 | 
   277         if newDir == True:
 | 
| 
 | 
   278             os.remove( os.path.basename( inFile ) )
 | 
| 
 | 
   279             os.chdir( ".." )
 | 
| 
 | 
   280             
 | 
| 
 | 
   281     
 | 
| 
 | 
   282     ## Split the input fasta file in several output files
 | 
| 
 | 
   283     #
 | 
| 
 | 
   284     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   285     # @param maxSize integer max cumulative length for each output file
 | 
| 
 | 
   286     #
 | 
| 
 | 
   287     @staticmethod
 | 
| 
 | 
   288     def splitFastaFileInBatches(inFileName, maxSize = 200000):
 | 
| 
 | 
   289         iBioseqDB = BioseqDB(inFileName)
 | 
| 
 | 
   290         lHeadersSizeTuples = []
 | 
| 
 | 
   291         for iBioseq in iBioseqDB.db:
 | 
| 
 | 
   292             lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
 | 
| 
 | 
   293             
 | 
| 
 | 
   294         lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
 | 
| 
 | 
   295         os.mkdir("batches")
 | 
| 
 | 
   296         os.chdir("batches")
 | 
| 
 | 
   297         
 | 
| 
 | 
   298         iterator = 0 
 | 
| 
 | 
   299         for lHeader in lHeadersList :
 | 
| 
 | 
   300             iterator += 1
 | 
| 
 | 
   301             with open("batch_%s.fa" % iterator, 'w') as f :
 | 
| 
 | 
   302                 for header in lHeader :
 | 
| 
 | 
   303                     iBioseq = iBioseqDB.fetch(header)
 | 
| 
 | 
   304                     iBioseq.write(f)
 | 
| 
 | 
   305         os.chdir("..")
 | 
| 
 | 
   306         
 | 
| 
 | 
   307                     
 | 
| 
 | 
   308     ## Split the input fasta file in several output files according to their cluster identifier
 | 
| 
 | 
   309     #
 | 
| 
 | 
   310     # @param inFileName string  name of the input fasta file
 | 
| 
 | 
   311     # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
 | 
| 
 | 
   312     # @param simplifyHeader boolean simplify the headers
 | 
| 
 | 
   313     # @param createDir boolean put the sequences in different directories
 | 
| 
 | 
   314     # @param outPrefix string prefix of the output files (default='seqCluster')
 | 
| 
 | 
   315     # @param verbose integer (default = 0)
 | 
| 
 | 
   316     #
 | 
| 
 | 
   317     @staticmethod
 | 
| 
 | 
   318     def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
 | 
| 
 | 
   319         if not os.path.exists( inFileName ):
 | 
| 
 | 
   320             print "ERROR: %s doesn't exist" % ( inFileName )
 | 
| 
 | 
   321             sys.exit(1)
 | 
| 
 | 
   322     
 | 
| 
 | 
   323         inFile = open( inFileName, "r" )
 | 
| 
 | 
   324     
 | 
| 
 | 
   325         line = inFile.readline()
 | 
| 
 | 
   326         if line:
 | 
| 
 | 
   327             name = line.split(" ")[0]
 | 
| 
 | 
   328             if "Cluster" in name:
 | 
| 
 | 
   329                 clusterID = name.split("Cluster")[1].split("Mb")[0]
 | 
| 
 | 
   330                 seqID = name.split("Mb")[1]
 | 
| 
 | 
   331             else:
 | 
| 
 | 
   332                 clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
 | 
| 
 | 
   333                 if "Q" in name.split("Gr")[0]:
 | 
| 
 | 
   334                     seqID = name.split("Gr")[0].split("MbQ")[1]
 | 
| 
 | 
   335                 elif "S" in name:
 | 
| 
 | 
   336                     seqID = name.split("Gr")[0].split("MbS")[1]
 | 
| 
 | 
   337             sClusterIDs = set( [ clusterID ] )
 | 
| 
 | 
   338             if simplifyHeader == True:
 | 
| 
 | 
   339                 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
 | 
| 
 | 
   340             else:
 | 
| 
 | 
   341                 header = line[1:-1]
 | 
| 
 | 
   342             if createDir == True:
 | 
| 
 | 
   343                 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
 | 
| 
 | 
   344                     os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
 | 
| 
 | 
   345                 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
 | 
| 
 | 
   346             outFileName = "%s%s.fa" % ( outPrefix, clusterID )
 | 
| 
 | 
   347             outFile = open( outFileName, "w" )
 | 
| 
 | 
   348             outFile.write( ">%s\n" % ( header ) )
 | 
| 
 | 
   349             prevClusterID = clusterID
 | 
| 
 | 
   350         
 | 
| 
 | 
   351             line = inFile.readline()
 | 
| 
 | 
   352             while line:
 | 
| 
 | 
   353                 if line[0] == ">":
 | 
| 
 | 
   354                     name = line.split(" ")[0]
 | 
| 
 | 
   355                     if "Cluster" in name:
 | 
| 
 | 
   356                         clusterID = name.split("Cluster")[1].split("Mb")[0]
 | 
| 
 | 
   357                         seqID = name.split("Mb")[1]
 | 
| 
 | 
   358                     else:
 | 
| 
 | 
   359                         clusterID = name.split("Cl")[0].split("Gr")[1]
 | 
| 
 | 
   360                         if "Q" in name.split("Gr")[0]:
 | 
| 
 | 
   361                             seqID = name.split("Gr")[0].split("MbQ")[1]
 | 
| 
 | 
   362                         elif "S" in name:
 | 
| 
 | 
   363                             seqID = name.split("Gr")[0].split("MbS")[1]
 | 
| 
 | 
   364         
 | 
| 
 | 
   365                     if clusterID != prevClusterID:
 | 
| 
 | 
   366                         outFile.close()
 | 
| 
 | 
   367         
 | 
| 
 | 
   368                     if simplifyHeader == True:
 | 
| 
 | 
   369                         header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
 | 
| 
 | 
   370                     else:
 | 
| 
 | 
   371                         header = line[1:-1]
 | 
| 
 | 
   372         
 | 
| 
 | 
   373                     if createDir == True:
 | 
| 
 | 
   374                         os.chdir( ".." )
 | 
| 
 | 
   375                         if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID )  ):
 | 
| 
 | 
   376                             os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
 | 
| 
 | 
   377                         os.chdir( "%s_cluster_%s" % ( inFileName, clusterID )  )
 | 
| 
 | 
   378         
 | 
| 
 | 
   379                     outFileName = "%s%s.fa" % ( outPrefix, clusterID )
 | 
| 
 | 
   380                     if not os.path.exists( outFileName ):
 | 
| 
 | 
   381                         outFile = open( outFileName, "w" )
 | 
| 
 | 
   382                     else:
 | 
| 
 | 
   383                         if clusterID != prevClusterID:
 | 
| 
 | 
   384                             outFile.close()
 | 
| 
 | 
   385                             outFile = open( outFileName, "a" )
 | 
| 
 | 
   386                     outFile.write( ">%s\n" % ( header ) )
 | 
| 
 | 
   387                     prevClusterID = clusterID
 | 
| 
 | 
   388                     sClusterIDs.add( clusterID )
 | 
| 
 | 
   389         
 | 
| 
 | 
   390                 else:
 | 
| 
 | 
   391                     outFile.write( line )
 | 
| 
 | 
   392         
 | 
| 
 | 
   393                 line = inFile.readline()
 | 
| 
 | 
   394         
 | 
| 
 | 
   395             outFile.close()
 | 
| 
 | 
   396             if verbose > 0:
 | 
| 
 | 
   397                 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
 | 
| 
 | 
   398                 
 | 
| 
 | 
   399             if createDir == True:
 | 
| 
 | 
   400                 os.chdir("..")
 | 
| 
 | 
   401         else:
 | 
| 
 | 
   402             print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
 | 
| 
 | 
   403                 
 | 
| 
 | 
   404 
 | 
| 
 | 
   405     ## Filter a fasta file in two fasta files using the length of each sequence as a criteron
 | 
| 
 | 
   406     #
 | 
| 
 | 
   407     # @param len_min integer    length sequence criterion to filter
 | 
| 
 | 
   408     # @param inFileName string  name of the input fasta file
 | 
| 
 | 
   409     # @param verbose integer (default = 0)      
 | 
| 
 | 
   410     #
 | 
| 
 | 
   411     @staticmethod
 | 
| 
 | 
   412     def dbLengthFilter( len_min, inFileName, verbose=0 ):
 | 
| 
 | 
   413         file_db = open( inFileName, "r" )
 | 
| 
 | 
   414         file_dbInf = open( inFileName+".Inf"+str(len_min), "w" )
 | 
| 
 | 
   415         file_dbSup = open( inFileName+".Sup"+str(len_min), "w" )
 | 
| 
 | 
   416         seq = Bioseq()
 | 
| 
 | 
   417         numseq = 0
 | 
| 
 | 
   418         nbsave = 0
 | 
| 
 | 
   419         
 | 
| 
 | 
   420         seq.read( file_db )
 | 
| 
 | 
   421         while seq.sequence:
 | 
| 
 | 
   422             l = seq.getLength()
 | 
| 
 | 
   423             numseq = numseq + 1
 | 
| 
 | 
   424             if l >= len_min:
 | 
| 
 | 
   425                 seq.write( file_dbSup )
 | 
| 
 | 
   426                 if verbose > 0:
 | 
| 
 | 
   427                         print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!'
 | 
| 
 | 
   428                         nbsave=nbsave+1
 | 
| 
 | 
   429             else:
 | 
| 
 | 
   430                 seq.write( file_dbInf )
 | 
| 
 | 
   431                 if verbose > 0:
 | 
| 
 | 
   432                         print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!'
 | 
| 
 | 
   433                         nbsave=nbsave+1
 | 
| 
 | 
   434             seq.read( file_db )
 | 
| 
 | 
   435                         
 | 
| 
 | 
   436         file_db.close()
 | 
| 
 | 
   437         file_dbInf.close()
 | 
| 
 | 
   438         file_dbSup.close()
 | 
| 
 | 
   439         if verbose > 0:
 | 
| 
 | 
   440             print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min)
 | 
| 
 | 
   441             
 | 
| 
 | 
   442     
 | 
| 
 | 
   443     ## Extract the longest sequences from a fasta file
 | 
| 
 | 
   444     #
 | 
| 
 | 
   445     # @param num integer maximum number of sequences in the output file
 | 
| 
 | 
   446     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   447     # @param outFileName string name of the output fasta file
 | 
| 
 | 
   448     # @param minThresh integer minimum length threshold (default=0)
 | 
| 
 | 
   449     # @param verbose integer (default = 0)
 | 
| 
 | 
   450     #
 | 
| 
 | 
   451     @staticmethod
 | 
| 
 | 
   452     def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ):
 | 
| 
 | 
   453         bsDB = BioseqDB( inFileName )
 | 
| 
 | 
   454         if verbose > 0:
 | 
| 
 | 
   455             print "nb of input sequences: %i" % ( bsDB.getSize() )
 | 
| 
 | 
   456     
 | 
| 
 | 
   457         if outFileName == "":
 | 
| 
 | 
   458             outFileName = inFileName + ".best" + str(num)
 | 
| 
 | 
   459         outFile = open( outFileName, "w" )
 | 
| 
 | 
   460         
 | 
| 
 | 
   461         if bsDB.getSize()==0:
 | 
| 
 | 
   462             return 0
 | 
| 
 | 
   463         
 | 
| 
 | 
   464         num = int(num)
 | 
| 
 | 
   465         if verbose > 0:
 | 
| 
 | 
   466             print "keep the %i longest sequences" % ( num )
 | 
| 
 | 
   467             if minThresh > 0:
 | 
| 
 | 
   468                 print "with length > %i bp" % ( minThresh )
 | 
| 
 | 
   469             sys.stdout.flush()
 | 
| 
 | 
   470             
 | 
| 
 | 
   471         # retrieve the length of each input sequence
 | 
| 
 | 
   472         tmpLSeqLgth = []
 | 
| 
 | 
   473         seqNum = 0
 | 
| 
 | 
   474         for bs in bsDB.db:
 | 
| 
 | 
   475             seqNum += 1
 | 
| 
 | 
   476             tmpLSeqLgth.append( bs.getLength() )
 | 
| 
 | 
   477             if verbose > 1:
 | 
| 
 | 
   478                 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() )
 | 
| 
 | 
   479             sys.stdout.flush()
 | 
| 
 | 
   480     
 | 
| 
 | 
   481         # sort the lengths
 | 
| 
 | 
   482         tmpLSeqLgth.sort()
 | 
| 
 | 
   483         tmpLSeqLgth.reverse()
 | 
| 
 | 
   484     
 | 
| 
 | 
   485         # select the longest
 | 
| 
 | 
   486         lSeqLgth = []
 | 
| 
 | 
   487         for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
 | 
| 
 | 
   488             if tmpLSeqLgth[i] >= minThresh:
 | 
| 
 | 
   489                 lSeqLgth.append( tmpLSeqLgth[i] )
 | 
| 
 | 
   490         if verbose > 0:
 | 
| 
 | 
   491             print "selected max length: %i" % ( max(lSeqLgth) )
 | 
| 
 | 
   492             print "selected min length: %i" % ( min(lSeqLgth) )
 | 
| 
 | 
   493             sys.stdout.flush()
 | 
| 
 | 
   494     
 | 
| 
 | 
   495         # save the longest
 | 
| 
 | 
   496         inFile = open( inFileName )
 | 
| 
 | 
   497         seqNum = 0
 | 
| 
 | 
   498         nbSave = 0
 | 
| 
 | 
   499         for bs in bsDB.db:
 | 
| 
 | 
   500             seqNum += 1
 | 
| 
 | 
   501             if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
 | 
| 
 | 
   502                 bs.write( outFile )
 | 
| 
 | 
   503                 if verbose > 1:
 | 
| 
 | 
   504                     print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
 | 
| 
 | 
   505                     sys.stdout.flush()
 | 
| 
 | 
   506                 nbSave += 1
 | 
| 
 | 
   507             if nbSave == num:
 | 
| 
 | 
   508                 break
 | 
| 
 | 
   509         inFile.close()
 | 
| 
 | 
   510         outFile.close()
 | 
| 
 | 
   511         if verbose > 0:
 | 
| 
 | 
   512             print nbSave, "saved sequences in ", outFileName
 | 
| 
 | 
   513             sys.stdout.flush()
 | 
| 
 | 
   514             
 | 
| 
 | 
   515         return 0
 | 
| 
 | 
   516     
 | 
| 
 | 
   517     
 | 
| 
 | 
   518     ## Extract all the sequence headers from a fasta file and write them in a new fasta file
 | 
| 
 | 
   519     #
 | 
| 
 | 
   520     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   521     # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
 | 
| 
 | 
   522     #
 | 
| 
 | 
   523     @staticmethod
 | 
| 
 | 
   524     def dbExtractSeqHeaders( inFileName, outFileName="" ):
 | 
| 
 | 
   525         lHeaders = FastaUtils.dbHeaders( inFileName )
 | 
| 
 | 
   526         
 | 
| 
 | 
   527         if outFileName == "":
 | 
| 
 | 
   528             outFileName = inFileName + ".headers"
 | 
| 
 | 
   529             
 | 
| 
 | 
   530         outFile = open( outFileName, "w" )
 | 
| 
 | 
   531         for i in lHeaders:
 | 
| 
 | 
   532             outFile.write( i + "\n" )
 | 
| 
 | 
   533         outFile.close()
 | 
| 
 | 
   534     
 | 
| 
 | 
   535         return 0
 | 
| 
 | 
   536     
 | 
| 
 | 
   537     
 | 
| 
 | 
   538     ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
 | 
| 
 | 
   539     #
 | 
| 
 | 
   540     # @param pattern regular expression to search in headers
 | 
| 
 | 
   541     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   542     # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
 | 
| 
 | 
   543     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   544     #
 | 
| 
 | 
   545     @staticmethod
 | 
| 
 | 
   546     def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ):
 | 
| 
 | 
   547         if pattern == "":
 | 
| 
 | 
   548             return
 | 
| 
 | 
   549         
 | 
| 
 | 
   550         if outFileName == "":
 | 
| 
 | 
   551             outFileName = inFileName + '.extracted'
 | 
| 
 | 
   552         outFile = open( outFileName, 'w' )
 | 
| 
 | 
   553         
 | 
| 
 | 
   554         patternTosearch = re.compile( pattern )
 | 
| 
 | 
   555         bioseq = Bioseq()
 | 
| 
 | 
   556         bioseqNb = 0
 | 
| 
 | 
   557         savedBioseqNb = 0
 | 
| 
 | 
   558         inFile = open( inFileName, "r" )
 | 
| 
 | 
   559         bioseq.read( inFile )
 | 
| 
 | 
   560         while bioseq.sequence:
 | 
| 
 | 
   561             bioseqNb = bioseqNb + 1
 | 
| 
 | 
   562             m = patternTosearch.search( bioseq.header )
 | 
| 
 | 
   563             if m:
 | 
| 
 | 
   564                 bioseq.write( outFile )
 | 
| 
 | 
   565                 if verbose > 1:
 | 
| 
 | 
   566                     print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!'
 | 
| 
 | 
   567                 savedBioseqNb = savedBioseqNb + 1
 | 
| 
 | 
   568             bioseq.read( inFile )
 | 
| 
 | 
   569         inFile.close()
 | 
| 
 | 
   570         
 | 
| 
 | 
   571         outFile.close()
 | 
| 
 | 
   572         
 | 
| 
 | 
   573         if verbose > 0:
 | 
| 
 | 
   574             print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
 | 
| 
 | 
   575             
 | 
| 
 | 
   576     
 | 
| 
 | 
   577     ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
 | 
| 
 | 
   578     #
 | 
| 
 | 
   579     # @param patternFileName string file containing regular expression to search in headers
 | 
| 
 | 
   580     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   581     # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
 | 
| 
 | 
   582     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   583     #
 | 
| 
 | 
   584     @staticmethod
 | 
| 
 | 
   585     def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
 | 
| 
 | 
   586     
 | 
| 
 | 
   587         if patternFileName == "":
 | 
| 
 | 
   588             print "ERROR: no file of pattern"
 | 
| 
 | 
   589             sys.exit(1)
 | 
| 
 | 
   590     
 | 
| 
 | 
   591         bioseq = Bioseq()
 | 
| 
 | 
   592         bioseqNb = 0
 | 
| 
 | 
   593         savedBioseqNb = 0
 | 
| 
 | 
   594         lHeaders = []
 | 
| 
 | 
   595 
 | 
| 
 | 
   596         inFile = open( inFileName, "r" )
 | 
| 
 | 
   597         bioseq.read( inFile )
 | 
| 
 | 
   598         while bioseq.sequence != None:
 | 
| 
 | 
   599             lHeaders.append( bioseq.header )
 | 
| 
 | 
   600             bioseq.read( inFile )
 | 
| 
 | 
   601         inFile.close()
 | 
| 
 | 
   602     
 | 
| 
 | 
   603         lHeadersToKeep = []
 | 
| 
 | 
   604         patternFile = open( patternFileName, "r" )
 | 
| 
 | 
   605         for pattern in patternFile:
 | 
| 
 | 
   606             if verbose > 0:
 | 
| 
 | 
   607                 print "pattern: ",pattern[:-1]; sys.stdout.flush()
 | 
| 
 | 
   608                 
 | 
| 
 | 
   609             patternToSearch = re.compile(pattern[:-1])
 | 
| 
 | 
   610             for h in lHeaders:
 | 
| 
 | 
   611                 if patternToSearch.search(h):
 | 
| 
 | 
   612                     lHeadersToKeep.append(h)
 | 
| 
 | 
   613         patternFile.close()
 | 
| 
 | 
   614     
 | 
| 
 | 
   615         if outFileName == "":
 | 
| 
 | 
   616             outFileName = inFileName + ".extracted"
 | 
| 
 | 
   617         outFile=open( outFileName, "w" )
 | 
| 
 | 
   618     
 | 
| 
 | 
   619         inFile = open( inFileName, "r" )
 | 
| 
 | 
   620         bioseq.read(inFile)
 | 
| 
 | 
   621         while bioseq.sequence:
 | 
| 
 | 
   622             bioseqNb += 1
 | 
| 
 | 
   623             if bioseq.header in lHeadersToKeep:
 | 
| 
 | 
   624                 bioseq.write(outFile)
 | 
| 
 | 
   625                 if verbose > 1:
 | 
| 
 | 
   626                     print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
 | 
| 
 | 
   627                 savedBioseqNb += 1
 | 
| 
 | 
   628             bioseq.read(inFile)
 | 
| 
 | 
   629         inFile.close()
 | 
| 
 | 
   630     
 | 
| 
 | 
   631         outFile.close()
 | 
| 
 | 
   632         
 | 
| 
 | 
   633         if verbose > 0:
 | 
| 
 | 
   634             print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
 | 
| 
 | 
   635             
 | 
| 
 | 
   636     
 | 
| 
 | 
   637     ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
 | 
| 
 | 
   638     #
 | 
| 
 | 
   639     # @param pattern regular expression to search in headers
 | 
| 
 | 
   640     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   641     # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
 | 
| 
 | 
   642     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   643     #
 | 
| 
 | 
   644     @staticmethod
 | 
| 
 | 
   645     def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ):
 | 
| 
 | 
   646         if pattern == "":
 | 
| 
 | 
   647             return
 | 
| 
 | 
   648         
 | 
| 
 | 
   649         patternToSearch = re.compile(pattern)
 | 
| 
 | 
   650         
 | 
| 
 | 
   651         if outFileName == "":
 | 
| 
 | 
   652             outFileName = inFileName + '.cleaned'
 | 
| 
 | 
   653         outFile = open(outFileName,'w')
 | 
| 
 | 
   654         
 | 
| 
 | 
   655         bioseq = Bioseq()
 | 
| 
 | 
   656         bioseqNb = 0
 | 
| 
 | 
   657         savedBioseqNb = 0
 | 
| 
 | 
   658         inFile = open(inFileName)
 | 
| 
 | 
   659         bioseq.read(inFile)
 | 
| 
 | 
   660         while bioseq.sequence != None:
 | 
| 
 | 
   661             bioseqNb += 1
 | 
| 
 | 
   662             if not patternToSearch.search(bioseq.header):
 | 
| 
 | 
   663                 bioseq.write(outFile)
 | 
| 
 | 
   664                 if verbose > 1:
 | 
| 
 | 
   665                     print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'
 | 
| 
 | 
   666                 savedBioseqNb += 1
 | 
| 
 | 
   667             bioseq.read(inFile)
 | 
| 
 | 
   668         inFile.close()
 | 
| 
 | 
   669         
 | 
| 
 | 
   670         outFile.close()
 | 
| 
 | 
   671         
 | 
| 
 | 
   672         if verbose > 0:
 | 
| 
 | 
   673             print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
 | 
| 
 | 
   674             
 | 
| 
 | 
   675     
 | 
| 
 | 
   676     ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
 | 
| 
 | 
   677     #
 | 
| 
 | 
   678     # @param patternFileName string file containing regular expression to search in headers
 | 
| 
 | 
   679     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   680     # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
 | 
| 
 | 
   681     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   682     #
 | 
| 
 | 
   683     @staticmethod
 | 
| 
 | 
   684     def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ):
 | 
| 
 | 
   685         if patternFileName == "":
 | 
| 
 | 
   686             print "ERROR: no file of pattern"
 | 
| 
 | 
   687             sys.exit(1)
 | 
| 
 | 
   688             
 | 
| 
 | 
   689         bioseq = Bioseq()
 | 
| 
 | 
   690         bioseqNb = 0
 | 
| 
 | 
   691         savedBioseqNb = 0
 | 
| 
 | 
   692         lHeaders = []
 | 
| 
 | 
   693         inFile = open( inFileName, "r" )
 | 
| 
 | 
   694         bioseq.read( inFile )
 | 
| 
 | 
   695         while bioseq.sequence != None:
 | 
| 
 | 
   696             bioseqNb += 1
 | 
| 
 | 
   697             lHeaders.append( bioseq.header )
 | 
| 
 | 
   698             bioseq.read( inFile )
 | 
| 
 | 
   699         inFile.close()
 | 
| 
 | 
   700         
 | 
| 
 | 
   701         patternFile = open( patternFileName, "r")
 | 
| 
 | 
   702         lHeadersToRemove = []
 | 
| 
 | 
   703         for pattern in patternFile:
 | 
| 
 | 
   704             if verbose > 0:
 | 
| 
 | 
   705                 print "pattern: ",pattern[:-1]; sys.stdout.flush()
 | 
| 
 | 
   706                 
 | 
| 
 | 
   707             patternToSearch = re.compile( pattern[:-1] )
 | 
| 
 | 
   708             for h in lHeaders:
 | 
| 
 | 
   709                 if patternToSearch.search(h):
 | 
| 
 | 
   710                     lHeadersToRemove.append(h)
 | 
| 
 | 
   711         patternFile.close()
 | 
| 
 | 
   712         
 | 
| 
 | 
   713         if outFileName == "":
 | 
| 
 | 
   714             outFileName = inFileName + '.cleaned'
 | 
| 
 | 
   715         outFile = open( outFileName, 'w' )
 | 
| 
 | 
   716     
 | 
| 
 | 
   717         bioseqNum = 0
 | 
| 
 | 
   718         inFile=open( inFileName )
 | 
| 
 | 
   719         bioseq.read( inFile )
 | 
| 
 | 
   720         while bioseq.sequence != None:
 | 
| 
 | 
   721             bioseqNum += 1
 | 
| 
 | 
   722             if bioseq.header not in lHeadersToRemove:
 | 
| 
 | 
   723                 bioseq.write( outFile )
 | 
| 
 | 
   724                 if verbose > 1:
 | 
| 
 | 
   725                     print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush()
 | 
| 
 | 
   726                 savedBioseqNb += 1
 | 
| 
 | 
   727             bioseq.read( inFile )
 | 
| 
 | 
   728         inFile.close()
 | 
| 
 | 
   729         
 | 
| 
 | 
   730         outFile.close()
 | 
| 
 | 
   731         
 | 
| 
 | 
   732         if verbose > 0:
 | 
| 
 | 
   733             print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName )
 | 
| 
 | 
   734             
 | 
| 
 | 
   735     
 | 
| 
 | 
   736     ## Find sequence's ORFs from a fasta file and write them in a tab file 
 | 
| 
 | 
   737     # 
 | 
| 
 | 
   738     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   739     # @param orfMaxNb integer Select orfMaxNb best ORFs
 | 
| 
 | 
   740     # @param orfMinLength integer Keep ORFs with length > orfMinLength 
 | 
| 
 | 
   741     # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
 | 
| 
 | 
   742     # @param verbose integer verbosity level (default = 0)
 | 
| 
 | 
   743     #
 | 
| 
 | 
   744     @staticmethod
 | 
| 
 | 
   745     def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
 | 
| 
 | 
   746         if outFileName == "":
 | 
| 
 | 
   747             outFileName = inFileName + ".ORF.map"
 | 
| 
 | 
   748         outFile = open( outFileName, "w" )
 | 
| 
 | 
   749     
 | 
| 
 | 
   750         bioseq = Bioseq()
 | 
| 
 | 
   751         bioseqNb = 0
 | 
| 
 | 
   752     
 | 
| 
 | 
   753         inFile = open( inFileName )
 | 
| 
 | 
   754         bioseq.read( inFile )
 | 
| 
 | 
   755         while bioseq.sequence != None:
 | 
| 
 | 
   756             bioseq.upCase() 
 | 
| 
 | 
   757             bioseqNb += 1
 | 
| 
 | 
   758             if verbose > 0:
 | 
| 
 | 
   759                 print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'
 | 
| 
 | 
   760                 
 | 
| 
 | 
   761             orf = bioseq.findORF()
 | 
| 
 | 
   762             bestOrf = []
 | 
| 
 | 
   763             for i in orf.keys():
 | 
| 
 | 
   764                 orfLen = len(orf[i])
 | 
| 
 | 
   765                 for j in xrange(1, orfLen):
 | 
| 
 | 
   766                     start = orf[i][j-1] + 4
 | 
| 
 | 
   767                     end = orf[i][j] + 3
 | 
| 
 | 
   768                     if end - start >= orfMinLength:
 | 
| 
 | 
   769                         bestOrf.append( ( end-start, i+1, start, end ) )
 | 
| 
 | 
   770     
 | 
| 
 | 
   771             bioseq.reverseComplement()
 | 
| 
 | 
   772             
 | 
| 
 | 
   773             orf = bioseq.findORF()
 | 
| 
 | 
   774             seqLen = bioseq.getLength()
 | 
| 
 | 
   775             for i in orf.keys():
 | 
| 
 | 
   776                 orfLen = len(orf[i])
 | 
| 
 | 
   777                 for j in xrange(1, orfLen):
 | 
| 
 | 
   778                     start = seqLen - orf[i][j-1] - 3
 | 
| 
 | 
   779                     end = seqLen - orf[i][j] - 2
 | 
| 
 | 
   780                     if start - end >= orfMinLength:
 | 
| 
 | 
   781                         bestOrf.append( ( start-end, (i+1)*-1, start, end ) )
 | 
| 
 | 
   782     
 | 
| 
 | 
   783             bestOrf.sort()
 | 
| 
 | 
   784             bestOrf.reverse()
 | 
| 
 | 
   785             bestOrfNb = len(bestOrf)
 | 
| 
 | 
   786             if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
 | 
| 
 | 
   787                 bestOrfNb = orfMaxNb
 | 
| 
 | 
   788             for i in xrange(0, bestOrfNb):
 | 
| 
 | 
   789                 if verbose > 0:
 | 
| 
 | 
   790                     print bestOrf[i]
 | 
| 
 | 
   791                 outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
 | 
| 
 | 
   792                                    "|"+str(bestOrf[i][0]),bioseq.header,
 | 
| 
 | 
   793                                    bestOrf[i][2],bestOrf[i][3]))
 | 
| 
 | 
   794             bioseq.read( inFile )
 | 
| 
 | 
   795     
 | 
| 
 | 
   796         inFile.close()
 | 
| 
 | 
   797         outFile.close()
 | 
| 
 | 
   798     
 | 
| 
 | 
   799         return 0
 | 
| 
 | 
   800 
 | 
| 
 | 
   801 
 | 
| 
 | 
   802     ## Sort sequences by increasing length (write a new file)
 | 
| 
 | 
   803     #
 | 
| 
 | 
   804     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   805     # @param outFileName string name of the output fasta file
 | 
| 
 | 
   806     # @param verbose integer verbosity level   
 | 
| 
 | 
   807     #
 | 
| 
 | 
   808     @staticmethod
 | 
| 
 | 
   809     def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0):
 | 
| 
 | 
   810         if verbose > 0:
 | 
| 
 | 
   811             print "sort sequences by increasing length"
 | 
| 
 | 
   812             sys.stdout.flush()
 | 
| 
 | 
   813         if not os.path.exists( inFileName ):
 | 
| 
 | 
   814             print "ERROR: file '%s' doesn't exist" % ( inFileName )
 | 
| 
 | 
   815             sys.exit(1)
 | 
| 
 | 
   816             
 | 
| 
 | 
   817         # read each seq one by one
 | 
| 
 | 
   818         # save them in distinct temporary files
 | 
| 
 | 
   819         # with their length in the name
 | 
| 
 | 
   820         inFileHandler = open( inFileName, "r" )
 | 
| 
 | 
   821         countSeq = 0
 | 
| 
 | 
   822         bs = Bioseq()
 | 
| 
 | 
   823         bs.read( inFileHandler )
 | 
| 
 | 
   824         while bs.header:
 | 
| 
 | 
   825             countSeq += 1
 | 
| 
 | 
   826             tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq )
 | 
| 
 | 
   827             bs.appendBioseqInFile( tmpFile )
 | 
| 
 | 
   828             if verbose > 1:
 | 
| 
 | 
   829                 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile )
 | 
| 
 | 
   830             bs.header = ""
 | 
| 
 | 
   831             bs.sequence = ""
 | 
| 
 | 
   832             bs.read( inFileHandler )
 | 
| 
 | 
   833         inFileHandler.close()
 | 
| 
 | 
   834         
 | 
| 
 | 
   835         # sort temporary file names
 | 
| 
 | 
   836         # concatenate them into the output file
 | 
| 
 | 
   837         if os.path.exists( outFileName ):
 | 
| 
 | 
   838             os.remove( outFileName )
 | 
| 
 | 
   839         lFiles = glob.glob( "*bp_*nb" )
 | 
| 
 | 
   840         lFiles.sort( key=lambda s:int(s.split("bp_")[0]) )
 | 
| 
 | 
   841         for fileName in lFiles:
 | 
| 
 | 
   842             cmd = "cat %s >> %s" % ( fileName, outFileName )
 | 
| 
 | 
   843             returnValue = os.system( cmd )
 | 
| 
 | 
   844             if returnValue != 0:
 | 
| 
 | 
   845                 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName )
 | 
| 
 | 
   846                 sys.exit(1)
 | 
| 
 | 
   847             os.remove( fileName )
 | 
| 
 | 
   848             
 | 
| 
 | 
   849         return 0
 | 
| 
 | 
   850 
 | 
| 
 | 
   851     
 | 
| 
 | 
   852     ## Sort sequences by header
 | 
| 
 | 
   853     #
 | 
| 
 | 
   854     # @param inFileName string name of the input fasta file
 | 
| 
 | 
   855     # @param outFileName string name of the output fasta file
 | 
| 
 | 
   856     # @param verbose integer verbosity level   
 | 
| 
 | 
   857     #
 | 
| 
 | 
   858     @staticmethod
 | 
| 
 | 
   859     def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0):
 | 
| 
 | 
   860         if outFileName == "":
 | 
| 
 | 
   861             outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
 | 
| 
 | 
   862         iBioseqDB = BioseqDB(inFileName)
 | 
| 
 | 
   863         f = open(outFileName, "w")
 | 
| 
 | 
   864         lHeaders = sorted(iBioseqDB.getHeaderList())
 | 
| 
 | 
   865         for header in lHeaders:
 | 
| 
 | 
   866             iBioseq = iBioseqDB.fetch(header)
 | 
| 
 | 
   867             iBioseq.write(f)
 | 
| 
 | 
   868         f.close()
 | 
| 
 | 
   869 
 | 
| 
 | 
   870     
 | 
| 
 | 
   871     ## Return a dictionary which keys are the headers and values the length of the sequences
 | 
| 
 | 
   872     #
 | 
| 
 | 
   873     # @param inFile string name of the input fasta file
 | 
| 
 | 
   874     # @param verbose integer verbosity level   
 | 
| 
 | 
   875     #
 | 
| 
 | 
   876     @staticmethod
 | 
| 
 | 
   877     def getLengthPerHeader( inFile, verbose=0 ):
 | 
| 
 | 
   878         dHeader2Length = {}
 | 
| 
 | 
   879         
 | 
| 
 | 
   880         inFileHandler = open( inFile, "r" )
 | 
| 
 | 
   881         currentSeqHeader = ""
 | 
| 
 | 
   882         currentSeqLength = 0
 | 
| 
 | 
   883         line = inFileHandler.readline()
 | 
| 
 | 
   884         while line:
 | 
| 
 | 
   885             if line[0] == ">":
 | 
| 
 | 
   886                 if currentSeqHeader != "":
 | 
| 
 | 
   887                     dHeader2Length[ currentSeqHeader ] = currentSeqLength
 | 
| 
 | 
   888                     currentSeqLength = 0
 | 
| 
 | 
   889                 currentSeqHeader = line[1:-1]
 | 
| 
 | 
   890                 if verbose > 0:
 | 
| 
 | 
   891                     print "current header: %s" % ( currentSeqHeader )
 | 
| 
 | 
   892                     sys.stdout.flush()
 | 
| 
 | 
   893             else:
 | 
| 
 | 
   894                 currentSeqLength += len( line.replace("\n","") )
 | 
| 
 | 
   895             line = inFileHandler.readline()
 | 
| 
 | 
   896         dHeader2Length[ currentSeqHeader ] = currentSeqLength
 | 
| 
 | 
   897         inFileHandler.close()
 | 
| 
 | 
   898         
 | 
| 
 | 
   899         return dHeader2Length
 | 
| 
 | 
   900     
 | 
| 
 | 
   901     
 | 
| 
 | 
   902     ## Convert headers from a fasta file having chunk coordinates
 | 
| 
 | 
   903     #
 | 
| 
 | 
   904     # @param inFile string name of the input fasta file
 | 
| 
 | 
   905     # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
 | 
| 
 | 
   906     # @param outFile string name of the output file
 | 
| 
 | 
   907     #
 | 
| 
 | 
   908     @staticmethod
 | 
| 
 | 
   909     def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
 | 
| 
 | 
   910         inFileHandler = open(inFile, "r")
 | 
| 
 | 
   911         outFileHandler = open(outFile, "w")
 | 
| 
 | 
   912         dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
 | 
| 
 | 
   913         iConvCoord = ConvCoord()
 | 
| 
 | 
   914         line = inFileHandler.readline()
 | 
| 
 | 
   915         while line:
 | 
| 
 | 
   916             if line[0] == ">":
 | 
| 
 | 
   917                 if "{Fragment}" in line:
 | 
| 
 | 
   918                     chkName = line.split(" ")[1]
 | 
| 
 | 
   919                     chrName = dChunk2Map[chkName].seqname
 | 
| 
 | 
   920                     lCoordPairs = line.split(" ")[3].split(",")
 | 
| 
 | 
   921                     lRangesOnChk = []
 | 
| 
 | 
   922                     for i in lCoordPairs:
 | 
| 
 | 
   923                         iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
 | 
| 
 | 
   924                         lRangesOnChk.append(iRange)
 | 
| 
 | 
   925                     lRangesOnChr = []
 | 
| 
 | 
   926                     for iRange in lRangesOnChk:
 | 
| 
 | 
   927                         lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
 | 
| 
 | 
   928                     newHeader = line[1:-1].split(" ")[0]
 | 
| 
 | 
   929                     newHeader += " %s" % chrName
 | 
| 
 | 
   930                     newHeader += " {Fragment}"
 | 
| 
 | 
   931                     newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
 | 
| 
 | 
   932                     for iRange in lRangesOnChr[1:]:
 | 
| 
 | 
   933                         newHeader += ",%i..%i" % (iRange.start, iRange.end)
 | 
| 
 | 
   934                     outFileHandler.write(">%s\n" % newHeader)
 | 
| 
 | 
   935                 else:
 | 
| 
 | 
   936                     chkName = line.split("_")[1].split(" ")[0]
 | 
| 
 | 
   937                     chrName = dChunk2Map[chkName].seqname
 | 
| 
 | 
   938                     coords = line[line.find("[")+1 : line.find("]")]
 | 
| 
 | 
   939                     start = int(coords.split(",")[0])
 | 
| 
 | 
   940                     end = int(coords.split(",")[1])
 | 
| 
 | 
   941                     iRangeOnChk = Range(chkName, start, end)
 | 
| 
 | 
   942                     iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
 | 
| 
 | 
   943                     newHeader = line[1:-1].split("_")[0]
 | 
| 
 | 
   944                     newHeader += " %s" % chrName
 | 
| 
 | 
   945                     newHeader += " %s" % line[line.find("(") : line.find(")")+1]
 | 
| 
 | 
   946                     newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
 | 
| 
 | 
   947                     outFileHandler.write(">%s\n" % newHeader)
 | 
| 
 | 
   948             else:
 | 
| 
 | 
   949                 outFileHandler.write(line)
 | 
| 
 | 
   950             line = inFileHandler.readline()
 | 
| 
 | 
   951         inFileHandler.close()
 | 
| 
 | 
   952         outFileHandler.close()
 | 
| 
 | 
   953         
 | 
| 
 | 
   954     
 | 
| 
 | 
   955     ## Convert a fasta file to a length file
 | 
| 
 | 
   956     #
 | 
| 
 | 
   957     # @param inFile string name of the input fasta file
 | 
| 
 | 
   958     # @param outFile string name of the output file
 | 
| 
 | 
   959     #
 | 
| 
 | 
   960     @staticmethod
 | 
| 
 | 
   961     def convertFastaToLength(inFile, outFile = ""):
 | 
| 
 | 
   962         if outFile == "":
 | 
| 
 | 
   963             outFile = "%s.length" % inFile
 | 
| 
 | 
   964         
 | 
| 
 | 
   965         if inFile != "":
 | 
| 
 | 
   966             with open(inFile, "r") as inFH:
 | 
| 
 | 
   967                 with open(outFile, "w") as outFH:
 | 
| 
 | 
   968                     bioseq = Bioseq()
 | 
| 
 | 
   969                     bioseq.read(inFH)
 | 
| 
 | 
   970                     while bioseq.sequence != None:
 | 
| 
 | 
   971                         seqLen = bioseq.getLength()
 | 
| 
 | 
   972                         outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
 | 
| 
 | 
   973                         bioseq.read(inFH)
 | 
| 
 | 
   974 
 | 
| 
 | 
   975     
 | 
| 
 | 
   976     ## Convert a fasta file to a seq file
 | 
| 
 | 
   977     #
 | 
| 
 | 
   978     # @param inFile string name of the input fasta file
 | 
| 
 | 
   979     # @param outFile string name of the output file
 | 
| 
 | 
   980     #
 | 
| 
 | 
   981     @staticmethod
 | 
| 
 | 
   982     def convertFastaToSeq(inFile, outFile = ""):
 | 
| 
 | 
   983         if outFile == "":
 | 
| 
 | 
   984             outFile = "%s.seq" % inFile
 | 
| 
 | 
   985         
 | 
| 
 | 
   986         if inFile != "":
 | 
| 
 | 
   987             with open(inFile, "r") as inFH:
 | 
| 
 | 
   988                 with open(outFile, "w") as outFH:
 | 
| 
 | 
   989                     bioseq = Bioseq()
 | 
| 
 | 
   990                     bioseq.read(inFH)
 | 
| 
 | 
   991                     while bioseq.sequence != None:
 | 
| 
 | 
   992                         seqLen = bioseq.getLength()
 | 
| 
 | 
   993                         outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
 | 
| 
 | 
   994                                                 bioseq.sequence, bioseq.header, seqLen))
 | 
| 
 | 
   995                         bioseq.read(inFH)
 | 
| 
 | 
   996 
 | 
| 
 | 
   997 
 | 
| 
 | 
   998     ## Splice an input fasta file using coordinates in a Map file
 | 
| 
 | 
   999     #
 | 
| 
 | 
  1000     # @note the coordinates should be merged beforehand!
 | 
| 
 | 
  1001     #
 | 
| 
 | 
  1002     @staticmethod
 | 
| 
 | 
  1003     def spliceFromCoords( genomeFile, coordFile, obsFile ):
 | 
| 
 | 
  1004         genomeFileHandler = open( genomeFile, "r" )
 | 
| 
 | 
  1005         obsFileHandler = open( obsFile, "w" )
 | 
| 
 | 
  1006         dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile )
 | 
| 
 | 
  1007             
 | 
| 
 | 
  1008         bs = Bioseq()
 | 
| 
 | 
  1009         bs.read( genomeFileHandler )
 | 
| 
 | 
  1010         while bs.sequence:
 | 
| 
 | 
  1011             if dChr2Maps.has_key( bs.header ):
 | 
| 
 | 
  1012                 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] )
 | 
| 
 | 
  1013                 splicedSeq = ""
 | 
| 
 | 
  1014                 currentSite = 0
 | 
| 
 | 
  1015                 for iMap in lCoords:
 | 
| 
 | 
  1016                     minSplice = iMap.getMin() - 1
 | 
| 
 | 
  1017                     if minSplice > currentSite:
 | 
| 
 | 
  1018                         splicedSeq += bs.sequence[ currentSite : minSplice ]
 | 
| 
 | 
  1019                     currentSite = iMap.getMax()
 | 
| 
 | 
  1020                 splicedSeq += bs.sequence[ currentSite : ]
 | 
| 
 | 
  1021                 bs.sequence = splicedSeq
 | 
| 
 | 
  1022             bs.write( obsFileHandler )
 | 
| 
 | 
  1023             bs.read( genomeFileHandler )
 | 
| 
 | 
  1024             
 | 
| 
 | 
  1025         genomeFileHandler.close()
 | 
| 
 | 
  1026         obsFileHandler.close()
 | 
| 
 | 
  1027         
 | 
| 
 | 
  1028     
 | 
| 
 | 
  1029     ## Shuffle input sequences (single file or files in a directory)
 | 
| 
 | 
  1030     #
 | 
| 
 | 
  1031     @staticmethod
 | 
| 
 | 
  1032     def dbShuffle( inData, outData, verbose=0 ):
 | 
| 
 | 
  1033         if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
 | 
| 
 | 
  1034             prg = "esl-shuffle"
 | 
| 
 | 
  1035         else : prg = "shuffle"
 | 
| 
 | 
  1036         genericCmd = prg + " -d INPUT > OUTPUT"
 | 
| 
 | 
  1037         if os.path.isfile( inData ):
 | 
| 
 | 
  1038             if verbose > 0:
 | 
| 
 | 
  1039                 print "shuffle input file '%s'" % inData
 | 
| 
 | 
  1040             cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData)
 | 
| 
 | 
  1041             print cmd
 | 
| 
 | 
  1042             returnStatus = os.system( cmd )
 | 
| 
 | 
  1043             if returnStatus != 0:
 | 
| 
 | 
  1044                 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
 | 
| 
 | 
  1045                 sys.exit(1)
 | 
| 
 | 
  1046                 
 | 
| 
 | 
  1047         elif os.path.isdir( inData ):
 | 
| 
 | 
  1048             if verbose > 0:
 | 
| 
 | 
  1049                 print "shuffle files in input directory '%s'" % inData
 | 
| 
 | 
  1050             if os.path.exists( outData ):
 | 
| 
 | 
  1051                 shutil.rmtree( outData )
 | 
| 
 | 
  1052             os.mkdir( outData )
 | 
| 
 | 
  1053             lInputFiles = glob.glob( "%s/*.fa" %( inData ) )
 | 
| 
 | 
  1054             nbFastaFiles = 0
 | 
| 
 | 
  1055             for inputFile in lInputFiles:
 | 
| 
 | 
  1056                 nbFastaFiles += 1
 | 
| 
 | 
  1057                 if verbose > 1:
 | 
| 
 | 
  1058                     print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) )
 | 
| 
 | 
  1059                 fastaBaseName = os.path.basename( inputFile )
 | 
| 
 | 
  1060                 prefix, extension = os.path.splitext( fastaBaseName )
 | 
| 
 | 
  1061                 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix))
 | 
| 
 | 
  1062                 returnStatus = os.system( cmd )
 | 
| 
 | 
  1063                 if returnStatus != 0:
 | 
| 
 | 
  1064                     sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus )
 | 
| 
 | 
  1065                     sys.exit(1)
 | 
| 
 | 
  1066                     
 | 
| 
 | 
  1067     
 | 
| 
 | 
  1068     ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
 | 
| 
 | 
  1069     #
 | 
| 
 | 
  1070     # @param inClusterFileName string input cluster file name
 | 
| 
 | 
  1071     # @param inFastaFileName string input fasta file name
 | 
| 
 | 
  1072     # @param outFileName string output file name
 | 
| 
 | 
  1073     # @param verbosity integer verbosity
 | 
| 
 | 
  1074     #
 | 
| 
 | 
  1075     @staticmethod
 | 
| 
 | 
  1076     def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
 | 
| 
 | 
  1077         dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
 | 
| 
 | 
  1078         iFastaParser = FastaParser(inFastaFileName)
 | 
| 
 | 
  1079         with open(outFileName, "w") as f:
 | 
| 
 | 
  1080             for iSequence in iFastaParser.getIterator():
 | 
| 
 | 
  1081                 
 | 
| 
 | 
  1082                 header = iSequence.getName()
 | 
| 
 | 
  1083                 if dHeader2ClusterClusterMember.get(header):
 | 
| 
 | 
  1084                     cluster = dHeader2ClusterClusterMember[header][0]
 | 
| 
 | 
  1085                     member = dHeader2ClusterClusterMember[header][1]
 | 
| 
 | 
  1086                 else:
 | 
| 
 | 
  1087                     clusterIdForSingletonCluster +=  1
 | 
| 
 | 
  1088                     cluster = clusterIdForSingletonCluster
 | 
| 
 | 
  1089                     member = 1
 | 
| 
 | 
  1090                 
 | 
| 
 | 
  1091                 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
 | 
| 
 | 
  1092                 iSequence.setName(newHeader)
 | 
| 
 | 
  1093                 f.write(iSequence.printFasta())
 | 
| 
 | 
  1094               
 | 
| 
18
 | 
  1095     @staticmethod
 | 
| 
6
 | 
  1096     def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
 | 
| 
 | 
  1097         dHeader2ClusterClusterMember = {}
 | 
| 
 | 
  1098         clusterId = 0
 | 
| 
 | 
  1099         with open(inClusterFileName) as f:
 | 
| 
 | 
  1100             line = f.readline()
 | 
| 
 | 
  1101             while line:
 | 
| 
 | 
  1102                 lineWithoutLastChar = line.rstrip()
 | 
| 
 | 
  1103                 lHeaders = lineWithoutLastChar.split("\t")
 | 
| 
 | 
  1104                 clusterId += 1
 | 
| 
 | 
  1105                 if verbosity > 0:
 | 
| 
 | 
  1106                     print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
 | 
| 
 | 
  1107                 memberId = 0
 | 
| 
 | 
  1108                 for header in lHeaders:
 | 
| 
 | 
  1109                     memberId += 1
 | 
| 
 | 
  1110                     dHeader2ClusterClusterMember[header] = (clusterId, memberId)
 | 
| 
 | 
  1111                 line = f.readline()
 | 
| 
 | 
  1112             if verbosity > 0:
 | 
| 
 | 
  1113                 print "%i clusters" % clusterId
 | 
| 
 | 
  1114         return dHeader2ClusterClusterMember, clusterId
 | 
| 
 | 
  1115     
 | 
| 
18
 | 
  1116     @staticmethod
 | 
| 
6
 | 
  1117     def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
 | 
| 
 | 
  1118         """
 | 
| 
 | 
  1119         Write a map file from fasta output of clustering tool.
 | 
| 
 | 
  1120         Warning: only works if input fasta headers are formated like LTRharvest fasta output.
 | 
| 
 | 
  1121         """
 | 
| 
 | 
  1122         if not outMapFileName:
 | 
| 
 | 
  1123             outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
 | 
| 
 | 
  1124         
 | 
| 
 | 
  1125         fileDb = open(fastaFileNameFromClustering , "r")
 | 
| 
 | 
  1126         fileMap = open(outMapFileName, "w")
 | 
| 
 | 
  1127         seq = Bioseq()
 | 
| 
 | 
  1128         numseq = 0
 | 
| 
 | 
  1129         while 1:
 | 
| 
 | 
  1130             seq.read(fileDb)
 | 
| 
 | 
  1131             if seq.sequence == None:
 | 
| 
 | 
  1132                 break
 | 
| 
 | 
  1133             numseq = numseq + 1
 | 
| 
 | 
  1134             ID = seq.header.split(' ')[0].split('_')[0]
 | 
| 
 | 
  1135             chunk = seq.header.split(' ')[0].split('_')[1]
 | 
| 
 | 
  1136             start = seq.header.split(' ')[-1].split(',')[0][1:]
 | 
| 
 | 
  1137             end = seq.header.split(' ')[-1].split(',')[1][:-1]
 | 
| 
 | 
  1138             line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
 | 
| 
 | 
  1139             fileMap.write(line + "\n")
 | 
| 
 | 
  1140     
 | 
| 
 | 
  1141         fileDb.close()
 | 
| 
 | 
  1142         fileMap.close()
 | 
| 
 | 
  1143         print "saved in %s" % outMapFileName
 | 
| 
18
 | 
  1144 
 | 
| 
 | 
  1145     @staticmethod
 | 
| 
 | 
  1146     def writeNstreches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
 | 
| 
 | 
  1147         outFormat = outFormat.lower()
 | 
| 
 | 
  1148         if outFormat in ["gff", "gff3"]:
 | 
| 
 | 
  1149             outFormat = "gff3"
 | 
| 
 | 
  1150         else:
 | 
| 
 | 
  1151             outFormat = "map"
 | 
| 
 | 
  1152             
 | 
| 
 | 
  1153         lTNstretches = []
 | 
| 
 | 
  1154         if nbN != 0:
 | 
| 
 | 
  1155             iBSDB = BioseqDB(fastaFileName)
 | 
| 
 | 
  1156             for iBS in iBSDB.db:
 | 
| 
 | 
  1157                 nbNFound = 0
 | 
| 
 | 
  1158                 start = 1
 | 
| 
 | 
  1159                 pos = 1
 | 
| 
 | 
  1160                 lastPos = 0
 | 
| 
 | 
  1161                 
 | 
| 
 | 
  1162                 while pos <= iBS.getLength():
 | 
| 
 | 
  1163                     if nbNFound == 0:
 | 
| 
 | 
  1164                         start = pos
 | 
| 
 | 
  1165                         
 | 
| 
 | 
  1166                     while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
 | 
| 
 | 
  1167                         nbNFound += 1
 | 
| 
 | 
  1168                         pos += 1
 | 
| 
 | 
  1169                         lastPos = pos
 | 
| 
 | 
  1170                     
 | 
| 
 | 
  1171                     if pos - lastPos >= nbN:
 | 
| 
 | 
  1172                         if nbNFound >= nbN:
 | 
| 
 | 
  1173                             lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
 | 
| 
 | 
  1174                         nbNFound = 0
 | 
| 
 | 
  1175                     pos += 1
 | 
| 
 | 
  1176                 
 | 
| 
 | 
  1177                 if nbNFound >= nbN:
 | 
| 
 | 
  1178                     lTNstretches.append((iBS.getHeader(), start, lastPos - 1))
 | 
| 
 | 
  1179     
 | 
| 
 | 
  1180             lTNstretches.sort(key = itemgetter(0, 1, 2))
 | 
| 
 | 
  1181         
 | 
| 
 | 
  1182         if outFileName == "":
 | 
| 
 | 
  1183             outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
 | 
| 
 | 
  1184         
 | 
| 
 | 
  1185         with open(outFileName, "w") as fH:
 | 
| 
 | 
  1186             if outFormat == "gff3":
 | 
| 
 | 
  1187                 fH.write("##gff-version 3\n")
 | 
| 
 | 
  1188             for item in lTNstretches:
 | 
| 
 | 
  1189                 seq = item[0]
 | 
| 
 | 
  1190                 start = item[1]
 | 
| 
 | 
  1191                 end = item[2]
 | 
| 
 | 
  1192                 if outFormat == "gff3":
 | 
| 
 | 
  1193                     fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
 | 
| 
 | 
  1194                 else:
 | 
| 
 | 
  1195                     fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
 | 
| 
 | 
  1196                 
 | 
| 
 | 
  1197                  |