# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import re
import sys
import math
import glob
import string
import shutil
from commons.core.utils.FileUtils import FileUtils
from commons.core.seq.Bioseq import Bioseq
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils


## Static methods for fasta file manipulation
#
class FastaUtils( object ):

    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    nbSeq += 1

        return nbSeq


    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        for line in inFile:
            if line[0] != ">":
                cumLength += len(string.rstrip(line))

        return cumLength


    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths(inFile):
        lLengths = []
        currentLength = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    if currentLength != 0:
                        lLengths.append(currentLength)
                    currentLength = 0
                else:
                    currentLength += len(line[:-1])
            lLengths.append(currentLength)

        return lLengths


    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders(inFile, verbose = 0):
        lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
        if verbose:
            for header in lHeaders:
                print header

        return lHeaders


    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
        nbSeq = FastaUtils.dbSize(inFileName)
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % nbSeq
            sys.stdout.flush()

        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % chkLgth
        cmd += " -o %s" % chkOver
        cmd += " -w %s" % wordN
        cmd += " %s" % inFileName
        returnStatus = os.system(cmd)
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)

        nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        with open("%s_cut" % inFileName) as inFile:
            outFasta = open(outFastaName, "w")
            outMap = open(outMapName, "w")

            for line in inFile:
                if line[0] == ">":
                    if verbose > 1:
                        print "rename '%s'" % (line[:-1]); sys.stdout.flush()
                    data = line[:-1].split(" ")
                    seqID = data[0].split(">")[1]
                    newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
                    oldHeader = data[2]
                    seqStart = data[4].split("..")[0]
                    seqEnd = data[4].split("..")[1]
                    outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
                    outFasta.write(">%s\n" % newHeader)

                else:
                    outFasta.write(line.upper())

        outFasta.close()
        outMap.close()

        # stats on the .Nstretch.map file
        genomeLength = FastaUtils.dbCumLength(open(inFileName))
        NstretchMapFile = inFileName + ".Nstretch.map"
        outNstrechStats = open('%s.NstretchStats.txt' % inFileName, "w")
        # check existence before emptiness to avoid opening a missing file
        if not FileUtils.isRessourceExists(NstretchMapFile) or FileUtils.isEmpty(NstretchMapFile):
            outNstrechStats.write("No N in stretch length > %s\n" % wordN)
        else:
            with open(NstretchMapFile) as f:
                dHeader2lLengths = {}
                for line in f:
                    data = line.rstrip().split()
                    header = data[1]
                    length = int(data[3]) - int(data[2]) + 1
                    if header not in dHeader2lLengths:
                        dHeader2lLengths[header] = []
                    dHeader2lLengths[header].append(length)

                for header in sorted(dHeader2lLengths):
                    lLengths = dHeader2lLengths[header]
                    outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))

                cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])

                NstretchPrct = float(cumulAllStretch) / genomeLength * 100
                outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
        outNstrechStats.close()

        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(NstretchMapFile)


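    # Usage sketch (hedged; 'genome.fa' is a hypothetical input file): cutting a
    # genome into 200-kb chunks overlapping by 10 kb, then removing the
    # intermediate 'cut' and 'Nstretch' files. The external 'cutterDB' program
    # must be available in the PATH.
    #
    #   FastaUtils.dbChunks("genome.fa", chkLgth="200000", chkOver="10000",
    #                       wordN="11", clean=True)
    #   # -> genome.fa_chunks.fa, genome.fa_chunks.map, genome.fa.NstretchStats.txt
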
    ## Split the input fasta file in several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix prefix in output file name
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
        if not os.path.exists(inFile):
            msg = "ERROR: file '%s' doesn't exist" % inFile
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)

        nbSeq = FastaUtils.dbSize(inFile)

        nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
        if verbose > 0:
            print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
            sys.stdout.flush()

        if nbSeqPerBatch != 1 and useSeqHeader:
            useSeqHeader = False

        if newDir == True:
            if os.path.exists("batches"):
                shutil.rmtree("batches")
            os.mkdir("batches")
            os.chdir("batches")
            os.system("ln -s ../%s ." % inFile)

        with open(inFile) as inFileHandler:
            countBatch = 0
            countSeq = 0
            for line in inFileHandler:
                if line[0] == ">":
                    countSeq += 1
                    if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
                        try:
                            outFile.close()
                        except: pass
                        countBatch += 1
                        if useSeqHeader:
                            outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
                        else:
                            outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
                        outFile = open(outFileName, "w")

                    if verbose > 1:
                        print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
                        sys.stdout.flush()
                outFile.write(line)

        if newDir:
            os.remove(os.path.basename(inFile))
            os.chdir("..")


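    # Usage sketch (hedged; 'reads.fa' is a hypothetical input file): writing one
    # fasta file per 100 sequences into a fresh 'batches/' directory.
    #
    #   FastaUtils.dbSplit("reads.fa", nbSeqPerBatch=100, newDir=True)
    #   # -> batches/batch_<i>.fa files, numbers zero-padded to the batch count
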
    ## Split the input fasta file in several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        iBioseqDB = BioseqDB(inFileName)
        lHeadersSizeTuples = []
        for iBioseq in iBioseqDB.db:
            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))

        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")

        iterator = 0
        for lHeader in lHeadersList:
            iterator += 1
            with open("batch_%s.fa" % iterator, 'w') as f:
                for header in lHeader:
                    iBioseq = iBioseqDB.fetch(header)
                    iBioseq.write(f)
        os.chdir("..")


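    # Usage sketch (hedged; 'contigs.fa' is a hypothetical input file): grouping
    # sequences into 'batches/batch_<i>.fa' files whose cumulative length stays
    # close to 500 kb each.
    #
    #   FastaUtils.splitFastaFileInBatches("contigs.fa", maxSize=500000)
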
    ## Split the input fasta file in several output files according to their cluster identifier
    #
    # @param inFileName string name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
        if not os.path.exists(inFileName):
            print "ERROR: %s doesn't exist" % inFileName
            sys.exit(1)

        inFile = open(inFileName)

        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID

            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]

                    if clusterID != prevClusterID:
                        outFile.close()

                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]

                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )

                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                        outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )

                else:
                    outFile.write( line )

                line = inFile.readline()

            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()

            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()


    ## Filter a fasta file in two fasta files using the length of each sequence as a criterion
    #
    # @param len_min integer length sequence criterion to filter
    # @param inFileName string name of the input fasta file
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLengthFilter(len_min, inFileName, verbose = 0):
        file_db = open(inFileName)
        file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
        file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
        seq = Bioseq()
        numseq = 0
        nbsaveInf = 0
        nbsaveSup = 0
        seq.read(file_db)
        while seq.getHeader():
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write(file_dbSup)
                if verbose > 0:
                    nbsaveSup = nbsaveSup + 1
            else:
                seq.write(file_dbInf)
                if verbose > 0:
                    nbsaveInf = nbsaveInf + 1
            seq.read(file_db)

        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))


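    # Usage sketch (hedged; 'seqs.fa' is a hypothetical input file): splitting a
    # bank at 500 bp; sequences of length >= 500 go to 'seqs.fa.Sup500', the
    # others to 'seqs.fa.Inf500'.
    #
    #   FastaUtils.dbLengthFilter(500, "seqs.fa", verbose=1)
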
    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
        bsDB = BioseqDB(inFileName)
        if verbose > 0:
            print "nb of input sequences: %i" % bsDB.getSize()

        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )

        if bsDB.getSize() == 0:
            return 0

        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % num
            if minThresh > 0:
                print "with length > %i bp" % minThresh
            sys.stdout.flush()

        tmpLSeqLgth = bsDB.getListOfSequencesLength()
        tmpLSeqLgth.sort(reverse = True)

        # select the longest ones
        lSeqLgth = []
        for i in xrange( 0, min(num, len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % max(lSeqLgth)
            print "selected min length: %i" % min(lSeqLgth)
            sys.stdout.flush()

        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()

        return 0


    ## Extract all the sequence headers from a fasta file and write them in a new file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders(inFileName, outFileName = ""):
        if not outFileName:
            outFileName = inFileName + ".headers"

        with open(outFileName, "w") as f:
            for header in FastaUtils.dbHeaders(inFileName):
                f.write("%s\n" % header)

        return 0


    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        if not outFileName:
            outFileName = inFileName + '.extracted'
        outFile = open(outFileName, 'w')

        patternTosearch = re.compile(pattern)
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search(bioseq.header)
            if m:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)


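    # Usage sketch (hedged; file names are hypothetical): keeping only the
    # sequences whose header matches a regular expression.
    #
    #   FastaUtils.dbExtractByPattern("^chr[0-9]+", "genome.fa", "chromosomes.fa")
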
    ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expressions to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        if not patternFileName:
            print "ERROR: no pattern file provided"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []

        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)

        lHeadersToKeep = []
        with open(patternFileName) as patternFile:
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])

        if not outFileName:
            outFileName = inFileName + ".extracted"

        with open(outFileName, "w") as outFile:
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if bioseq.header in lHeadersToKeep:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)


    ## Extract the sequences whose headers are not matched by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        patternToSearch = re.compile(pattern)

        if outFileName == "":
            outFileName = inFileName + '.cleaned'

        with open(outFileName, 'w') as outFile:
            bioseq = Bioseq()
            bioseqNb = 0
            savedBioseqNb = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if not patternToSearch.search(bioseq.header):
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)


    ## Extract the sequences whose headers are not matched by patterns contained in a file, from a fasta file and write them in a new fasta file
    #
    # @param patternFileName string file containing regular expressions to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        if not patternFileName:
            print "ERROR: no pattern file provided"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)
        bioseqNb = len(lHeaders)    # total number of input sequences

        with open(patternFileName) as patternFile:
            lHeadersToRemove = []
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])

        if not outFileName:
            outFileName = inFileName + '.cleaned'

        with open(outFileName, 'w') as outFile:
            bioseqNum = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNum += 1
                    if bioseq.header not in lHeadersToRemove:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)


    ## Find the ORFs of each sequence from a fasta file and write them in a tab file
    #
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer select the orfMaxNb best ORFs
    # @param orfMinLength integer keep ORFs with length >= orfMinLength
    # @param outFileName string name of the output map file (default = inFileName + '.ORF.map')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
        if not outFileName:
            outFileName = inFileName + ".ORF.map"
        outFile = open(outFileName, "w")

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'

            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j - 1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append((end - start, i + 1, start, end))

            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j - 1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append((start - end, (i + 1) * -1, start, end))

            bestOrf.sort(reverse = True)
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
                                                    "|" + str(bestOrf[i][0]), bioseq.header,
                                                    bestOrf[i][2], bestOrf[i][3]))
            bioseq.read(inFile)

        inFile.close()
        outFile.close()

        return 0


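    # Usage sketch (hedged; 'consensus.fa' is a hypothetical input file): keeping,
    # for each sequence, the 5 longest ORFs of at least 300 bp on both strands;
    # their coordinates are written to 'consensus.fa.ORF.map'.
    #
    #   FastaUtils.dbORF("consensus.fa", orfMaxNb=5, orfMinLength=300)
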
    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists(inFileName):
            print "ERROR: file '%s' doesn't exist" % (inFileName)
            sys.exit(1)

        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open(inFileName, "r")
        countSeq = 0
        bs = Bioseq()
        bs.read(inFileHandler)
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
            bs.appendBioseqInFile(tmpFile)
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
            bs.header = ""
            bs.sequence = ""
            bs.read(inFileHandler)
        inFileHandler.close()

        # sort temporary file names
        # concatenate them into the output file
        if os.path.exists(outFileName):
            os.remove(outFileName)
        lFiles = glob.glob("*bp_*nb")
        lFiles.sort(key = lambda s: int(s.split("bp_")[0]))
        for fileName in lFiles:
            cmd = "cat %s >> %s" % (fileName, outFileName)
            returnValue = os.system(cmd)
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
                sys.exit(1)
            os.remove(fileName)

        return 0


    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = ""):
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        iBioseqDB = BioseqDB(inFileName)
        with open(outFileName, "w") as f:
            for header in sorted(iBioseqDB.getHeaderList()):
                iBioseq = iBioseqDB.fetch(header)
                iBioseq.write(f)


    ## Return a dictionary whose keys are the headers and whose values are the lengths of the sequences
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def getLengthPerHeader(inFile, verbose = 0):
        dHeader2Length = {}

        with open(inFile) as inFileHandler:
            currentSeqHeader = ""
            currentSeqLength = 0
            for line in inFileHandler:
                if line[0] == ">":
                    if currentSeqHeader != "":
                        dHeader2Length[currentSeqHeader] = currentSeqLength
                        currentSeqLength = 0
                    currentSeqHeader = line[1:-1]
                    if verbose > 0:
                        print "current header: %s" % currentSeqHeader
                        sys.stdout.flush()
                else:
                    currentSeqLength += len(line.replace("\n", ""))
            dHeader2Length[currentSeqHeader] = currentSeqLength

        return dHeader2Length


    ## Convert the headers of a fasta file from chunk coordinates to chromosome coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        for line in inFileHandler:
            if line[0] == ">":
                if "{Fragment}" in line:
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
        inFileHandler.close()
        outFileHandler.close()


    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        if not outFile:
            outFile = "%s.length" % inFile

        if inFile:
            with open(inFile) as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
                        bioseq.read(inFH)


    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        if not outFile:
            outFile = "%s.seq" % inFile

        if inFile:
            with open(inFile) as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
                                                          bioseq.sequence, bioseq.header, seqLen))
                        bioseq.read(inFH)


    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords(genomeFile, coordFile, obsFile):
        genomeFileHandler = open(genomeFile)
        obsFileHandler = open(obsFile, "w")
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)

        bs = Bioseq()
        bs.read(genomeFileHandler)
        while bs.sequence:

            if dChr2Maps.has_key(bs.header):
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
                splicedSeq = []
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        splicedSeq += bs.sequence[currentSite : minSplice]
                    if currentSite <= iMap.getMax():
                        currentSite = iMap.getMax()
                splicedSeq += bs.sequence[currentSite:]
                bs.sequence = "".join(splicedSeq)
            bs.write(obsFileHandler)
            bs.read(genomeFileHandler)

        genomeFileHandler.close()
        obsFileHandler.close()


    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle(inData, outData, verbose = 0):
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else:
            prg = "shuffle"
        genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
        if os.path.isfile(inData):
            if verbose > 0:
                print "shuffle input file '%s'" % inData
            cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
            print cmd
            returnStatus = os.system(cmd)
            if returnStatus:
                sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                sys.exit(1)

        elif os.path.isdir(inData):
            if verbose > 0:
                print "shuffle files in input directory '%s'" % inData
            if os.path.exists(outData):
                shutil.rmtree(outData)
            os.mkdir(outData)
            lInputFiles = glob.glob("%s/*.fa" % inData)
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
                fastaBaseName = os.path.basename(inputFile)
                prefix = os.path.splitext(fastaBaseName)[0]
                cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
                returnStatus = os.system(cmd)
                if returnStatus:
                    sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                    sys.exit(1)


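    # Usage sketch (hedged; file names are hypothetical): shuffling a single fasta
    # file, or every '*.fa' file of a directory, with the external 'esl-shuffle'
    # (or 'shuffle') executable found in the PATH.
    #
    #   FastaUtils.dbShuffle("consensus.fa", "consensus_shuffled.fa")
    #   FastaUtils.dbShuffle("inputDir", "outputDir")
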
    ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param clusteringTool string prefix added to the new headers (default = "")
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        iFastaParser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as f:
            for iSequence in iFastaParser.getIterator():

                header = iSequence.getName()
                if dHeader2ClusterClusterMember.get(header):
                    cluster = dHeader2ClusterClusterMember[header][0]
                    member = dHeader2ClusterClusterMember[header][1]
                else:
                    clusterIdForSingletonCluster += 1
                    cluster = clusterIdForSingletonCluster
                    member = 1

                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
                iSequence.setName(newHeader)
                f.write(iSequence.printFasta())

    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            for line in f:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
        if verbosity > 0:
            print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId

    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from the fasta output of a clustering tool.

        Warning: only works if the input fasta headers are formatted like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])

        fileDb = open(fastaFileNameFromClustering)
        fileMap = open(outMapFileName, "w")
        seq = Bioseq()
        numseq = 0
        seq.read(fileDb)
        while seq.sequence:
            numseq = numseq + 1
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write("%s\n" % line)
            seq.read(fileDb)

        fileDb.close()
        fileMap.close()
        print "saved in %s" % outMapFileName

    @staticmethod
    def getNstretchesRangesList(fastaFileName, nbN = 2):
        lNstretchesRanges = []
        if nbN != 0:
            iBSDB = BioseqDB(fastaFileName)
            for iBS in iBSDB.db:
                nbNFound = 0
                start = 1
                pos = 1
                lastPos = 0

                while pos <= iBS.getLength():
                    if nbNFound == 0:
                        start = pos

                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
                        nbNFound += 1
                        pos += 1
                        lastPos = pos

                    if pos - lastPos >= nbN:
                        if nbNFound >= nbN:
                            lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
                        nbNFound = 0
                    pos += 1

                if nbNFound >= nbN:
                    lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))

        lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)

        return lNstretchesRanges


    @staticmethod
    def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
        lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)

        outFormat = outFormat.lower()
        if outFormat in ["gff", "gff3"]:
            outFormat = "gff3"
        else:
            outFormat = "map"

        if outFileName == "":
            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)

        with open(outFileName, "w") as fH:
            if outFormat == "gff3":
                fH.write("##gff-version 3\n")
            for iRange in lNstretchesRanges:
                seq = iRange.getSeqname()
                start = iRange.getStart()
                end = iRange.getEnd()
                if outFormat == "gff3":
                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
                else:
                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
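
    # Usage sketch (hedged; 'assembly.fa' is a hypothetical input file): reporting
    # every stretch of at least 10 N (or X) as a GFF3 feature.
    #
    #   FastaUtils.writeNstretches("assembly.fa", nbN=10, outFormat="gff3")
    #   # -> assembly_Nstretches.gff3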