TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ changeset 13:feef9a0db09d (draft)
author: urgi-team
date: Wed, 20 Jul 2016 09:04:42 -0400
parent: 12:22b0494ec883
# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
#
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
#
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import re
import sys
import math
import glob
import string
import shutil
from commons.core.utils.FileUtils import FileUtils
from commons.core.seq.Bioseq import Bioseq
from commons.core.seq.BioseqDB import BioseqDB
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.Range import Range
from commons.core.coord.ConvCoord import ConvCoord
from commons.core.parsing.FastaParser import FastaParser
from commons.core.checker.CheckerUtils import CheckerUtils
from commons.core.launcher.LauncherUtils import LauncherUtils


## Static methods for fasta file manipulation
#
class FastaUtils( object ):

    ## Count the number of sequences in the input fasta file
    #
    # @param inFile name of the input fasta file
    #
    # @return integer number of sequences in the input fasta file
    #
    @staticmethod
    def dbSize( inFile ):
        nbSeq = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    nbSeq += 1

        return nbSeq

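    # Illustrative usage (not part of the original file; the file name below is
    # an assumption): counting the sequences of a fasta file.
    #   nbSeq = FastaUtils.dbSize("genome.fa")
    #   print "found %i sequences" % nbSeq
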
    ## Compute the cumulative sequence length in the input fasta file
    #
    # @param inFile handler of the input fasta file
    #
    @staticmethod
    def dbCumLength( inFile ):
        cumLength = 0
        for line in inFile:
            if line[0] != ">":
                cumLength += len(line.rstrip())

        return cumLength

    ## Return a list with the length of each sequence in the input fasta file
    #
    # @param inFile string name of the input fasta file
    #
    @staticmethod
    def dbLengths(inFile):
        lLengths = []
        currentLength = 0
        with open(inFile) as fH:
            for line in fH:
                if line[0] == ">":
                    if currentLength != 0:
                        lLengths.append(currentLength)
                    currentLength = 0
                else:
                    currentLength += len(line[:-1])
            lLengths.append(currentLength)

        return lLengths

    ## Retrieve the sequence headers present in the input fasta file
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer level of verbosity
    #
    # @return list of sequence headers
    #
    @staticmethod
    def dbHeaders(inFile, verbose = 0):
        lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
        if verbose:
            for header in lHeaders:
                print header

        return lHeaders

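    # Illustrative usage (assumption, not in the original file): retrieving the
    # headers of a hypothetical 'genome.fa' and checking for duplicates.
    #   lHeaders = FastaUtils.dbHeaders("genome.fa")
    #   if len(lHeaders) != len(set(lHeaders)):
    #       print "WARNING: duplicated headers"
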
    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
        nbSeq = FastaUtils.dbSize(inFileName)
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % nbSeq
            sys.stdout.flush()

        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % chkLgth
        cmd += " -o %s" % chkOver
        cmd += " -w %s" % wordN
        cmd += " %s" % inFileName
        returnStatus = os.system(cmd)
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)

        nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        with open("%s_cut" % inFileName) as inFile:
            outFasta = open(outFastaName, "w")
            outMap = open(outMapName, "w")

            for line in inFile:
                if line[0] == ">":
                    if verbose > 1:
                        print "rename '%s'" % (line[:-1]); sys.stdout.flush()
                    data = line[:-1].split(" ")
                    seqID = data[0].split(">")[1]
                    newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
                    oldHeader = data[2]
                    seqStart = data[4].split("..")[0]
                    seqEnd = data[4].split("..")[1]
                    outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
                    outFasta.write(">%s\n" % newHeader)

                else:
                    outFasta.write(line.upper())

        outFasta.close()
        outMap.close()

        # stats on the .Nstretch.map file
        genomeLength = FastaUtils.dbCumLength(open(inFileName))
        NstretchMapFile = inFileName + ".Nstretch.map"
        outNstretchStats = open('%s.NstretchStats.txt' % inFileName, "w")
        if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
            outNstretchStats.write("No N in stretch length > %s\n" % wordN)
        else:
            with open(NstretchMapFile) as f:
                dHeader2lLengths = {}
                for line in f:
                    data = line.rstrip().split()
                    header = data[1]
                    length = int(data[3]) - int(data[2]) + 1
                    if header not in dHeader2lLengths:
                        dHeader2lLengths[header] = []
                    dHeader2lLengths[header].append(length)

                for header in sorted(dHeader2lLengths):
                    lLengths = dHeader2lLengths[header]
                    outNstretchStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))

                cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])

                NstretchPrct = float(cumulAllStretch)/genomeLength*100
                outNstretchStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
        outNstretchStats.close()

        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(NstretchMapFile)

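    # Illustrative usage (assumption, not in the original file): chunking a genome
    # into 100-kb pieces overlapping by 10 kb. Note that this relies on the external
    # 'cutterDB' program being available in the PATH.
    #   FastaUtils.dbChunks("genome.fa", chkLgth = "100000", chkOver = "10000",
    #                       wordN = "11", outFilePrefix = "genome_chunks", clean = True)
    # This would write 'genome_chunks.fa' and 'genome_chunks.map'.
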
    ## Split the input fasta file into several output files
    #
    # @param inFile string name of the input fasta file
    # @param nbSeqPerBatch integer number of sequences per output file
    # @param newDir boolean put the sequences in a new directory called 'batches'
    # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
    # @param prefix prefix in output file name
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
        if not os.path.exists(inFile):
            msg = "ERROR: file '%s' doesn't exist" % inFile
            sys.stderr.write("%s\n" % msg)
            sys.exit(1)

        nbSeq = FastaUtils.dbSize(inFile)

        nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
        if verbose > 0:
            print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
            sys.stdout.flush()

        if nbSeqPerBatch != 1 and useSeqHeader:
            useSeqHeader = False

        if newDir == True:
            if os.path.exists("batches"):
                shutil.rmtree("batches")
            os.mkdir("batches")
            os.chdir("batches")
            os.system("ln -s ../%s ." % inFile)

        with open(inFile) as inFileHandler:
            countBatch = 0
            countSeq = 0
            for line in inFileHandler:
                if line[0] == ">":
                    countSeq += 1
                    if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
                        try:
                            outFile.close()
                        except: pass
                        countBatch += 1
                        if useSeqHeader:
                            outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
                        else:
                            outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
                        outFile = open(outFileName, "w")

                    if verbose > 1:
                        print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
                        sys.stdout.flush()
                outFile.write(line)

        if newDir:
            os.remove(os.path.basename(inFile))
            os.chdir("..")

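    # Illustrative usage (assumption, not in the original file): splitting a fasta
    # file into batches of 10 sequences inside a fresh 'batches/' directory.
    #   FastaUtils.dbSplit("genome.fa", nbSeqPerBatch = 10, newDir = True)
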
    ## Split the input fasta file into several output files
    #
    # @param inFileName string name of the input fasta file
    # @param maxSize integer max cumulative length for each output file
    #
    @staticmethod
    def splitFastaFileInBatches(inFileName, maxSize = 200000):
        iBioseqDB = BioseqDB(inFileName)
        lHeadersSizeTuples = []
        for iBioseq in iBioseqDB.db:
            lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))

        lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
        os.mkdir("batches")
        os.chdir("batches")

        iterator = 0
        for lHeader in lHeadersList:
            iterator += 1
            with open("batch_%s.fa" % iterator, 'w') as f:
                for header in lHeader:
                    iBioseq = iBioseqDB.fetch(header)
                    iBioseq.write(f)
        os.chdir("..")

    ## Split the input fasta file into several output files according to the cluster identifier of each sequence
    #
    # @param inFileName string name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
        if not os.path.exists(inFileName):
            print "ERROR: %s doesn't exist" % inFileName
            sys.exit(1)

        inFile = open(inFileName)

        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1]   # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID

            line = inFile.readline()
            while line:
                if line[0] == ">":
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]

                    if clusterID != prevClusterID:
                        outFile.close()

                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]

                    if createDir == True:
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )

                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                        outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )

                else:
                    outFile.write( line )

                line = inFile.readline()

            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()

            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()

    ## Filter a fasta file into two fasta files using the length of each sequence as a criterion
    #
    # @param len_min integer minimum length threshold used to split the sequences
    # @param inFileName string name of the input fasta file
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLengthFilter(len_min, inFileName, verbose = 0):
        file_db = open(inFileName)
        file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
        file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
        seq = Bioseq()
        numseq = 0
        nbsaveInf = 0
        nbsaveSup = 0
        seq.read(file_db)
        while seq.getHeader():
            l = seq.getLength()
            numseq = numseq + 1
            if l >= len_min:
                seq.write(file_dbSup)
                if verbose > 0:
                    nbsaveSup = nbsaveSup + 1
            else:
                seq.write(file_dbInf)
                if verbose > 0:
                    nbsaveInf = nbsaveInf + 1
            seq.read(file_db)

        file_db.close()
        file_dbInf.close()
        file_dbSup.close()
        if verbose > 0:
            print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))

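    # Illustrative usage (assumption, not in the original file): separating the
    # sequences shorter and longer than 500 bp; this would write 'genome.fa.Inf500'
    # and 'genome.fa.Sup500'.
    #   FastaUtils.dbLengthFilter(500, "genome.fa", verbose = 1)
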
    ## Extract the longest sequences from a fasta file
    #
    # @param num integer maximum number of sequences in the output file
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param minThresh integer minimum length threshold (default=0)
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
        bsDB = BioseqDB(inFileName)
        if verbose > 0:
            print "nb of input sequences: %i" % bsDB.getSize()

        if outFileName == "":
            outFileName = inFileName + ".best" + str(num)
        outFile = open( outFileName, "w" )

        if bsDB.getSize()==0:
            return 0

        num = int(num)
        if verbose > 0:
            print "keep the %i longest sequences" % num
            if minThresh > 0:
                print "with length > %i bp" % minThresh
            sys.stdout.flush()

        tmpLSeqLgth = bsDB.getListOfSequencesLength()
        tmpLSeqLgth.sort(reverse = True)

        # select the longest ones
        lSeqLgth = []
        for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
            if tmpLSeqLgth[i] >= minThresh:
                lSeqLgth.append( tmpLSeqLgth[i] )
        if verbose > 0:
            print "selected max length: %i" % max(lSeqLgth)
            print "selected min length: %i" % min(lSeqLgth)
            sys.stdout.flush()

        # save the longest
        inFile = open( inFileName )
        seqNum = 0
        nbSave = 0
        for bs in bsDB.db:
            seqNum += 1
            if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
                bs.write( outFile )
                if verbose > 1:
                    print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
                    sys.stdout.flush()
                nbSave += 1
            if nbSave == num:
                break
        inFile.close()
        outFile.close()
        if verbose > 0:
            print nbSave, "saved sequences in ", outFileName
            sys.stdout.flush()

        return 0

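    # Illustrative usage (assumption, not in the original file): keeping the 20
    # longest sequences of at least 1 kb; the output goes to 'genome.fa.best20'.
    #   FastaUtils.dbLongestSequences(20, "genome.fa", minThresh = 1000)
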
    ## Extract all the sequence headers from a fasta file and write them in a new file
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
    #
    @staticmethod
    def dbExtractSeqHeaders(inFileName, outFileName = ""):
        if not outFileName:
            outFileName = inFileName + ".headers"

        with open(outFileName, "w") as f:
            for header in FastaUtils.dbHeaders(inFileName):
                f.write("%s\n" % header)

        return 0

    ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        if not outFileName:
            outFileName = inFileName + '.extracted'
        outFile = open(outFileName, 'w')

        patternTosearch = re.compile(pattern)
        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseqNb = bioseqNb + 1
            m = patternTosearch.search(bioseq.header)
            if m:
                bioseq.write(outFile)
                if verbose > 1:
                    print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
                savedBioseqNb = savedBioseqNb + 1
            bioseq.read(inFile)
        inFile.close()

        outFile.close()

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)

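    # Illustrative usage (assumption, not in the original file): extracting every
    # sequence whose header matches a regular expression into 'genome.fa.extracted'.
    #   FastaUtils.dbExtractByPattern("^chr[12]$", "genome.fa")
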
    ## Extract the sequences whose header matches at least one pattern listed in a file, and write them in a new fasta file
    #
    # @param patternFileName string file containing the regular expressions to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        if not patternFileName:
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []

        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)

        lHeadersToKeep = []
        with open(patternFileName) as patternFile:
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])

        if not outFileName:
            outFileName = inFileName + ".extracted"

        with open(outFileName, "w") as outFile:
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if bioseq.header in lHeadersToKeep:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)

    ## Extract the sequences whose header does not match a given pattern from a fasta file and write them in a new fasta file
    #
    # @param pattern regular expression to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
        if not pattern:
            return

        patternToSearch = re.compile(pattern)

        if outFileName == "":
            outFileName = inFileName + '.cleaned'

        with open(outFileName, 'w') as outFile:
            bioseq = Bioseq()
            bioseqNb = 0
            savedBioseqNb = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNb += 1
                    if not patternToSearch.search(bioseq.header):
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)

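    # Illustrative usage (assumption, not in the original file): the complement of
    # dbExtractByPattern, writing to 'genome.fa.cleaned' the sequences whose header
    # does NOT match the pattern.
    #   FastaUtils.dbCleanByPattern("chloroplast|mitochondrion", "genome.fa")
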
    ## Extract the sequences whose header matches none of the patterns listed in a file, and write them in a new fasta file
    #
    # @param patternFileName string file containing the regular expressions to search in headers
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.cleaned')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
        if not patternFileName:
            print "ERROR: no file of pattern"
            sys.exit(1)

        bioseq = Bioseq()
        bioseqNb = 0
        savedBioseqNb = 0
        lHeaders = []
        with open(inFileName) as inFile:
            bioseq.read(inFile)
            while bioseq.sequence:
                lHeaders.append(bioseq.header)
                bioseq.read(inFile)

        with open(patternFileName) as patternFile:
            lHeadersToRemove = []
            for pattern in patternFile:
                pattern = pattern.rstrip()
                if verbose > 0:
                    print "pattern: ", pattern; sys.stdout.flush()

                patternToSearch = re.compile(pattern)
                lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])

        if not outFileName:
            outFileName = inFileName + '.cleaned'

        with open(outFileName, 'w') as outFile:
            bioseqNum = 0
            with open(inFileName) as inFile:
                bioseq.read(inFile)
                while bioseq.sequence:
                    bioseqNum += 1
                    if bioseq.header not in lHeadersToRemove:
                        bioseq.write(outFile)
                        savedBioseqNb += 1
                        if verbose > 1:
                            print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
                    bioseq.read(inFile)

        if verbose > 0:
            print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)

    ## Find the ORFs of each sequence in a fasta file and write them in a tab-delimited map file
    #
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer keep at most the orfMaxNb best ORFs per sequence
    # @param orfMinLength integer keep ORFs whose length is >= orfMinLength
    # @param outFileName string name of the output map file (default = inFileName + '.ORF.map')
    # @param verbose integer verbosity level (default = 0)
    #
    @staticmethod
    def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
        if not outFileName:
            outFileName = inFileName + ".ORF.map"
        outFile = open(outFileName, "w")

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open(inFileName)
        bioseq.read(inFile)
        while bioseq.sequence:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'

            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = orf[i][j - 1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        bestOrf.append((end - start, i + 1, start, end))

            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j - 1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        bestOrf.append((start - end, (i + 1) * -1, start, end))

            bestOrf.sort(reverse = True)
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
                                                    "|" + str(bestOrf[i][0]), bioseq.header,
                                                    bestOrf[i][2], bestOrf[i][3]))
            bioseq.read(inFile)

        inFile.close()
        outFile.close()

        return 0

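    # Illustrative usage (assumption, not in the original file): recording, for each
    # sequence, its 3 longest ORFs of at least 300 bp (both strands are scanned) in
    # 'TEs.fa.ORF.map'.
    #   FastaUtils.dbORF("TEs.fa", orfMaxNb = 3, orfMinLength = 300)
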
    ## Sort sequences by increasing length (write a new file)
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
        if verbose > 0:
            print "sort sequences by increasing length"
            sys.stdout.flush()
        if not os.path.exists(inFileName):
            print "ERROR: file '%s' doesn't exist" % (inFileName)
            sys.exit(1)

        # read each seq one by one
        # save them in distinct temporary files
        # with their length in the name
        inFileHandler = open(inFileName, "r")
        countSeq = 0
        bs = Bioseq()
        bs.read(inFileHandler)
        while bs.header:
            countSeq += 1
            tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
            bs.appendBioseqInFile(tmpFile)
            if verbose > 1:
                print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
            bs.header = ""
            bs.sequence = ""
            bs.read(inFileHandler)
        inFileHandler.close()

        # sort temporary file names
        # concatenate them into the output file
        if os.path.exists(outFileName):
            os.remove(outFileName)
        lFiles = glob.glob("*bp_*nb")
        lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
        for fileName in lFiles:
            cmd = "cat %s >> %s" % (fileName, outFileName)
            returnValue = os.system(cmd)
            if returnValue != 0:
                print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
                sys.exit(1)
            os.remove(fileName)

        return 0

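    # Illustrative usage (assumption, not in the original file); note that the method
    # writes temporary '<length>bp_<rank>nb' files in the current directory before
    # concatenating them into the output file.
    #   FastaUtils.sortSequencesByIncreasingLength("genome.fa", "genome_sorted.fa")
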
    ## Sort sequences by header
    #
    # @param inFileName string name of the input fasta file
    # @param outFileName string name of the output fasta file (default = input file name without extension + '_sortByHeaders.fa')
    #
    @staticmethod
    def sortSequencesByHeader(inFileName, outFileName = ""):
        if outFileName == "":
            outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
        iBioseqDB = BioseqDB(inFileName)
        with open(outFileName, "w") as f:
            for header in sorted(iBioseqDB.getHeaderList()):
                iBioseq = iBioseqDB.fetch(header)
                iBioseq.write(f)

    ## Return a dictionary whose keys are the sequence headers and whose values are the sequence lengths
    #
    # @param inFile string name of the input fasta file
    # @param verbose integer verbosity level
    #
    @staticmethod
    def getLengthPerHeader(inFile, verbose = 0):
        dHeader2Length = {}

        with open(inFile) as inFileHandler:
            currentSeqHeader = ""
            currentSeqLength = 0
            for line in inFileHandler:
                if line[0] == ">":
                    if currentSeqHeader != "":
                        dHeader2Length[currentSeqHeader] = currentSeqLength
                        currentSeqLength = 0
                    currentSeqHeader = line[1:-1]
                    if verbose > 0:
                        print "current header: %s" % currentSeqHeader
                        sys.stdout.flush()
                else:
                    currentSeqLength += len(line.replace("\n", ""))
            dHeader2Length[currentSeqHeader] = currentSeqLength

        return dHeader2Length

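    # Illustrative usage (assumption, not in the original file): computing the total
    # genome size from the per-sequence lengths.
    #   dLengths = FastaUtils.getLengthPerHeader("genome.fa")
    #   print "genome size: %i bp" % sum(dLengths.values())
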
    ## Convert headers from a fasta file having chunk coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        for line in inFileHandler:
            if line[0] == ">":
                if "{Fragment}" in line:
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
        inFileHandler.close()
        outFileHandler.close()

    ## Convert a fasta file to a length file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToLength(inFile, outFile = ""):
        if not outFile:
            outFile = "%s.length" % inFile

        if inFile:
            with open(inFile) as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
                        bioseq.read(inFH)

    ## Convert a fasta file to a seq file
    #
    # @param inFile string name of the input fasta file
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaToSeq(inFile, outFile = ""):
        if not outFile:
            outFile = "%s.seq" % inFile

        if inFile:
            with open(inFile) as inFH:
                with open(outFile, "w") as outFH:
                    bioseq = Bioseq()
                    bioseq.read(inFH)
                    while bioseq.sequence:
                        seqLen = bioseq.getLength()
                        outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
                                                          bioseq.sequence, bioseq.header, seqLen))
                        bioseq.read(inFH)

    ## Splice an input fasta file using coordinates in a Map file
    #
    # @note the coordinates should be merged beforehand!
    #
    @staticmethod
    def spliceFromCoords(genomeFile, coordFile, obsFile):
        genomeFileHandler = open(genomeFile)
        obsFileHandler = open(obsFile, "w")
        dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)

        bs = Bioseq()
        bs.read(genomeFileHandler)
        while bs.sequence:

            if dChr2Maps.has_key(bs.header):
                lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
                splicedSeq = []
                currentSite = 0
                for iMap in lCoords:
                    minSplice = iMap.getMin() - 1
                    if minSplice > currentSite:
                        splicedSeq += bs.sequence[currentSite : minSplice]
                    if currentSite <= iMap.getMax():
                        currentSite = iMap.getMax()
                splicedSeq += bs.sequence[currentSite:]
                bs.sequence = "".join(splicedSeq)
            bs.write(obsFileHandler)
            bs.read(genomeFileHandler)

        genomeFileHandler.close()
        obsFileHandler.close()

    ## Shuffle input sequences (single file or files in a directory)
    #
    @staticmethod
    def dbShuffle(inData, outData, verbose = 0):
        if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
            prg = "esl-shuffle"
        else:
            prg = "shuffle"
        genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
        if os.path.isfile(inData):
            if verbose > 0:
                print "shuffle input file '%s'" % inData
            cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
            print cmd
            returnStatus = os.system(cmd)
            if returnStatus:
                sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                sys.exit(1)

        elif os.path.isdir(inData):
            if verbose > 0:
                print "shuffle files in input directory '%s'" % inData
            if os.path.exists(outData):
                shutil.rmtree(outData)
            os.mkdir(outData)
            lInputFiles = glob.glob("%s/*.fa" % inData)
            nbFastaFiles = 0
            for inputFile in lInputFiles:
                nbFastaFiles += 1
                if verbose > 1:
                    print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
                fastaBaseName = os.path.basename(inputFile)
                prefix = os.path.splitext(fastaBaseName)[0]
                cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
                returnStatus = os.system(cmd)
                if returnStatus:
                    sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
                    sys.exit(1)

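    # Illustrative usage (assumption, not in the original file); this relies on the
    # external 'esl-shuffle' (Easel/HMMER) or 'shuffle' program being in the PATH.
    #   FastaUtils.dbShuffle("TEs.fa", "TEs_shuffled.fa", verbose = 1)
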
    ## Convert a cluster file (one line = one cluster = one header list) into a fasta file with cluster info in headers
    #
    # @param inClusterFileName string input cluster file name
    # @param inFastaFileName string input fasta file name
    # @param outFileName string output file name
    # @param verbosity integer verbosity
    #
    @staticmethod
    def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
        dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
        iFastaParser = FastaParser(inFastaFileName)
        with open(outFileName, "w") as f:
            for iSequence in iFastaParser.getIterator():

                header = iSequence.getName()
                if dHeader2ClusterClusterMember.get(header):
                    cluster = dHeader2ClusterClusterMember[header][0]
                    member = dHeader2ClusterClusterMember[header][1]
                else:
                    clusterIdForSingletonCluster += 1
                    cluster = clusterIdForSingletonCluster
                    member = 1

                newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
                iSequence.setName(newHeader)
                f.write(iSequence.printFasta())

    @staticmethod
    def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
        dHeader2ClusterClusterMember = {}
        clusterId = 0
        with open(inClusterFileName) as f:
            for line in f:
                lineWithoutLastChar = line.rstrip()
                lHeaders = lineWithoutLastChar.split("\t")
                clusterId += 1
                if verbosity > 0:
                    print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
                memberId = 0
                for header in lHeaders:
                    memberId += 1
                    dHeader2ClusterClusterMember[header] = (clusterId, memberId)
        if verbosity > 0:
            print "%i clusters" % clusterId
        return dHeader2ClusterClusterMember, clusterId

    @staticmethod
    def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
        """
        Write a map file from the fasta output of a clustering tool.
        Warning: only works if the input fasta headers are formatted like LTRharvest fasta output.
        """
        if not outMapFileName:
            outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])

        fileDb = open(fastaFileNameFromClustering)
        fileMap = open(outMapFileName, "w")
        seq = Bioseq()
        numseq = 0
        seq.read(fileDb)
        while seq.sequence:
            numseq = numseq + 1
            ID = seq.header.split(' ')[0].split('_')[0]
            chunk = seq.header.split(' ')[0].split('_')[1]
            start = seq.header.split(' ')[-1].split(',')[0][1:]
            end = seq.header.split(' ')[-1].split(',')[1][:-1]
            line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
            fileMap.write("%s\n" % line)
            seq.read(fileDb)

        fileDb.close()
        fileMap.close()
        print "saved in %s" % outMapFileName

    @staticmethod
    def getNstretchesRangesList(fastaFileName, nbN = 2):
        lNstretchesRanges = []
        if nbN != 0:
            iBSDB = BioseqDB(fastaFileName)
            for iBS in iBSDB.db:
                nbNFound = 0
                start = 1
                pos = 1
                lastPos = 0

                while pos <= iBS.getLength():
                    if nbNFound == 0:
                        start = pos

                    while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
                        nbNFound += 1
                        pos += 1
                        lastPos = pos

                    if pos - lastPos >= nbN:
                        if nbNFound >= nbN:
                            lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
                        nbNFound = 0
                    pos += 1

                if nbNFound >= nbN:
                    lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))

        lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)

        return lNstretchesRanges

    @staticmethod
    def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
        lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)

        outFormat = outFormat.lower()
        if outFormat in ["gff", "gff3"]:
            outFormat = "gff3"
        else:
            outFormat = "map"

        if outFileName == "":
            outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)

        with open(outFileName, "w") as fH:
            if outFormat == "gff3":
                fH.write("##gff-version 3\n")
            for iRange in lNstretchesRanges:
                seq = iRange.getSeqname()
                start = iRange.getStart()
                end = iRange.getEnd()
                if outFormat == "gff3":
                    fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
                else:
                    fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))

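    # Illustrative usage (assumption, not in the original file): reporting every
    # stretch of at least 10 N/X in GFF3 format, here into 'genome_Nstretches.gff3'.
    #   FastaUtils.writeNstretches("genome.fa", nbN = 10, outFormat = "gff3")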