Mercurial > repos > yufei-luo > s_mart
comparison commons/core/seq/FastaUtils.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children | 94ab73e8a190 |
comparison
equal
deleted
inserted
replaced
5:ea3082881bf8 | 6:769e306b7933 |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import os | |
33 import sys | |
34 import string | |
35 import math | |
36 import shutil | |
37 import re | |
38 import glob | |
39 from commons.core.seq.BioseqDB import BioseqDB | |
40 from commons.core.seq.Bioseq import Bioseq | |
41 from commons.core.coord.MapUtils import MapUtils | |
42 from commons.core.coord.Range import Range | |
43 from commons.core.checker.CheckerUtils import CheckerUtils | |
44 from commons.core.launcher.LauncherUtils import LauncherUtils | |
45 from commons.core.coord.ConvCoord import ConvCoord | |
46 from commons.core.parsing.FastaParser import FastaParser | |
47 | |
48 | |
49 ## Static methods for fasta file manipulation | |
50 # | |
51 class FastaUtils( object ): | |
52 | |
53 ## Count the number of sequences in the input fasta file | |
54 # | |
55 # @param inFile name of the input fasta file | |
56 # | |
57 # @return integer number of sequences in the input fasta file | |
58 # | |
59 @staticmethod | |
60 def dbSize( inFile ): | |
61 nbSeq = 0 | |
62 inFileHandler = open( inFile, "r" ) | |
63 line = inFileHandler.readline() | |
64 while line: | |
65 if line[0] == ">": | |
66 nbSeq = nbSeq + 1 | |
67 line = inFileHandler.readline() | |
68 inFileHandler.close() | |
69 | |
70 return nbSeq | |
71 | |
72 | |
73 ## Compute the cumulative sequence length in the input fasta file | |
74 # | |
75 # @param inFile handler of the input fasta file | |
76 # | |
77 @staticmethod | |
78 def dbCumLength( inFile ): | |
79 cumLength = 0 | |
80 line = inFile.readline() | |
81 while line: | |
82 if line[0] != ">": | |
83 cumLength += len(string.rstrip(line)) | |
84 line = inFile.readline() | |
85 | |
86 return cumLength | |
87 | |
88 | |
89 ## Return a list with the length of each sequence in the input fasta file | |
90 # | |
91 # @param inFile string name of the input fasta file | |
92 # | |
93 @staticmethod | |
94 def dbLengths( inFile ): | |
95 lLengths = [] | |
96 inFileHandler = open( inFile, "r" ) | |
97 currentLength = 0 | |
98 line = inFileHandler.readline() | |
99 while line: | |
100 if line[0] == ">": | |
101 if currentLength != 0: | |
102 lLengths.append( currentLength ) | |
103 currentLength = 0 | |
104 else: | |
105 currentLength += len(line[:-1]) | |
106 line = inFileHandler.readline() | |
107 lLengths.append( currentLength ) | |
108 inFileHandler.close() | |
109 return lLengths | |
110 | |
111 | |
112 ## Retrieve the sequence headers present in the input fasta file | |
113 # | |
114 # @param inFile string name of the input fasta file | |
115 # @param verbose integer level of verbosity | |
116 # | |
117 # @return list of sequence headers | |
118 # | |
119 @staticmethod | |
120 def dbHeaders( inFile, verbose=0 ): | |
121 lHeaders = [] | |
122 | |
123 inFileHandler = open( inFile, "r" ) | |
124 line = inFileHandler.readline() | |
125 while line: | |
126 if line[0] == ">": | |
127 lHeaders.append( string.rstrip(line[1:]) ) | |
128 if verbose > 0: | |
129 print string.rstrip(line[1:]) | |
130 line = inFileHandler.readline() | |
131 inFileHandler.close() | |
132 | |
133 return lHeaders | |
134 | |
135 | |
    ## Cut a data bank into chunks according to the input parameters
    # If a sequence is shorter than the threshold, it is only renamed (not cut)
    #
    # @param inFileName string name of the input fasta file
    # @param chkLgth string chunk length (in bp, default=200000)
    # @param chkOver string chunk overlap (in bp, default=10000)
    # @param wordN string N stretch word length (default=11, 0 for no detection)
    # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
    # @param clean boolean remove 'cut' and 'Nstretch' files
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def dbChunks( inFileName, chkLgth="200000", chkOver="10000", wordN="11", outFilePrefix="", clean=False, verbose=0 ):
        nbSeq = FastaUtils.dbSize( inFileName )
        if verbose > 0:
            print "cut the %i input sequences with cutterDB..." % ( nbSeq )
            sys.stdout.flush()

        # delegate the actual cutting to the external 'cutterDB' program,
        # which must be on the PATH; on success it writes '<inFileName>_cut'
        # (the chunks) and '<inFileName>.Nstretch.map' (N-stretch coordinates)
        prg = "cutterDB"
        cmd = prg
        cmd += " -l %s" % ( chkLgth )
        cmd += " -o %s" %( chkOver )
        cmd += " -w %s" % ( wordN )
        cmd += " %s" % ( inFileName )
        returnStatus = os.system( cmd )
        if returnStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit(1)

        nbChunks = FastaUtils.dbSize( "%s_cut" % ( inFileName ) )
        if verbose > 0:
            print "done (%i chunks)" % ( nbChunks )
            sys.stdout.flush()

        if verbose > 0:
            print "rename the headers..."
            sys.stdout.flush()

        if outFilePrefix == "":
            outFastaName = inFileName + "_chunks.fa"
            outMapName = inFileName + "_chunks.map"
        else:
            outFastaName = outFilePrefix + ".fa"
            outMapName = outFilePrefix + ".map"

        inFile = open( "%s_cut" % ( inFileName ), "r" )
        line = inFile.readline()

        outFasta = open( outFastaName, "w" )
        outMap = open( outMapName, "w" )

        # read line after line (no need for big RAM) and change the sequence headers
        while line:

            if line[0] == ">":
                if verbose > 1:
                    print "rename '%s'" % ( line[:-1] ); sys.stdout.flush()
                # assumes cutterDB emits headers of the form
                # '>ID ... <oldHeader> ... <start>..<end>' so that fields 0,
                # 2 and 4 of the space-split line carry the chunk number, the
                # original header and the coordinates — TODO confirm this
                # holds for all cutterDB versions
                data = line[:-1].split(" ")
                seqID = data[0].split(">")[1]
                # zero-pad the chunk number so names sort lexicographically
                newHeader = "chunk%s" % ( str(seqID).zfill( len(str(nbChunks)) ) )
                oldHeader = data[2]
                seqStart = data[4].split("..")[0]
                seqEnd = data[4].split("..")[1]
                outMap.write( "%s\t%s\t%s\t%s\n" % ( newHeader, oldHeader, seqStart, seqEnd ) )
                outFasta.write( ">%s\n" % ( newHeader ) )

            else:
                # sequence lines are upper-cased on the fly
                outFasta.write( line.upper() )

            line = inFile.readline()

        inFile.close()
        outFasta.close()
        outMap.close()

        # optionally remove the intermediate files produced by cutterDB
        if clean == True:
            os.remove(inFileName + "_cut")
            os.remove(inFileName + ".Nstretch.map")
216 | |
217 ## Split the input fasta file in several output files | |
218 # | |
219 # @param inFile string name of the input fasta file | |
220 # @param nbSeqPerBatch integer number of sequences per output file | |
221 # @param newDir boolean put the sequences in a new directory called 'batches' | |
222 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1') | |
223 # @param prefix prefix in output file name | |
224 # @param verbose integer verbosity level (default = 0) | |
225 # | |
226 @staticmethod | |
227 def dbSplit( inFile, nbSeqPerBatch, newDir, useSeqHeader=False, prefix="batch", verbose=0 ): | |
228 if not os.path.exists( inFile ): | |
229 msg = "ERROR: file '%s' doesn't exist" % ( inFile ) | |
230 sys.stderr.write( "%s\n" % ( msg ) ) | |
231 sys.exit(1) | |
232 | |
233 nbSeq = FastaUtils.dbSize( inFile ) | |
234 | |
235 nbBatches = int( math.ceil( nbSeq / float(nbSeqPerBatch) ) ) | |
236 if verbose > 0: | |
237 print "save the %i input sequences into %i batches" % ( nbSeq, nbBatches ) | |
238 sys.stdout.flush() | |
239 | |
240 if nbSeqPerBatch > 1 and useSeqHeader: | |
241 useSeqHeader = False | |
242 | |
243 if newDir == True: | |
244 if os.path.exists( "batches" ): | |
245 shutil.rmtree( "batches" ) | |
246 os.mkdir( "batches" ) | |
247 os.chdir( "batches" ) | |
248 os.system( "ln -s ../%s ." % ( inFile ) ) | |
249 | |
250 inFileHandler = open( inFile, "r" ) | |
251 inFileHandler.seek( 0, 0 ) | |
252 countBatch = 0 | |
253 countSeq = 0 | |
254 line = inFileHandler.readline() | |
255 while line: | |
256 if line == "": | |
257 break | |
258 if line[0] == ">": | |
259 countSeq += 1 | |
260 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1: | |
261 if "outFile" in locals(): | |
262 outFile.close() | |
263 countBatch += 1 | |
264 if nbSeqPerBatch == 1 and useSeqHeader: | |
265 outFileName = "%s.fa" % ( line[1:-1].replace(" ","_") ) | |
266 else: | |
267 outFileName = "%s_%s.fa" % ( prefix, str(countBatch).zfill( len(str(nbBatches)) ) ) | |
268 outFile = open( outFileName, "w" ) | |
269 if verbose > 1: | |
270 print "saving seq '%s' in file '%s'..." % ( line[1:40][:-1], outFileName ) | |
271 sys.stdout.flush() | |
272 outFile.write( line ) | |
273 line = inFileHandler.readline() | |
274 inFileHandler.close() | |
275 | |
276 if newDir == True: | |
277 os.remove( os.path.basename( inFile ) ) | |
278 os.chdir( ".." ) | |
279 | |
280 | |
281 ## Split the input fasta file in several output files | |
282 # | |
283 # @param inFileName string name of the input fasta file | |
284 # @param maxSize integer max cumulative length for each output file | |
285 # | |
286 @staticmethod | |
287 def splitFastaFileInBatches(inFileName, maxSize = 200000): | |
288 iBioseqDB = BioseqDB(inFileName) | |
289 lHeadersSizeTuples = [] | |
290 for iBioseq in iBioseqDB.db: | |
291 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength())) | |
292 | |
293 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize) | |
294 os.mkdir("batches") | |
295 os.chdir("batches") | |
296 | |
297 iterator = 0 | |
298 for lHeader in lHeadersList : | |
299 iterator += 1 | |
300 with open("batch_%s.fa" % iterator, 'w') as f : | |
301 for header in lHeader : | |
302 iBioseq = iBioseqDB.fetch(header) | |
303 iBioseq.write(f) | |
304 os.chdir("..") | |
305 | |
306 | |
    ## Split the input fasta file in several output files according to their cluster identifier
    #
    # @param inFileName string name of the input fasta file
    # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
    # @param simplifyHeader boolean simplify the headers
    # @param createDir boolean put the sequences in different directories
    # @param outPrefix string prefix of the output files (default='seqCluster')
    # @param verbose integer (default = 0)
    #
    @staticmethod
    def splitSeqPerCluster( inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix="seqCluster", verbose=0 ):
        if not os.path.exists( inFileName ):
            print "ERROR: %s doesn't exist" % ( inFileName )
            sys.exit(1)

        inFile = open( inFileName, "r" )

        # handle the first header separately to initialise the first output
        # file and 'prevClusterID' before entering the main loop
        line = inFile.readline()
        if line:
            name = line.split(" ")[0]
            # two header layouts are expected:
            #   '...Cluster<ID>Mb<seqID>'        (Piler / Recon / Blastclust)
            #   '...MbQ<seqID>Gr<ID>Cl...'       (Grouper)
            if "Cluster" in name:
                clusterID = name.split("Cluster")[1].split("Mb")[0]
                seqID = name.split("Mb")[1]
            else:
                clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
                if "Q" in name.split("Gr")[0]:
                    seqID = name.split("Gr")[0].split("MbQ")[1]
                elif "S" in name:
                    seqID = name.split("Gr")[0].split("MbS")[1]
                # NOTE(review): if the header contains neither 'Q' nor 'S',
                # 'seqID' stays unbound and the code below raises
                # UnboundLocalError — assumed headers always match; TODO confirm
            sClusterIDs = set( [ clusterID ] )
            if simplifyHeader == True:
                header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
            else:
                header = line[1:-1]
            if createDir == True:
                # one sub-directory per cluster; we stay chdir'ed into the
                # current cluster's directory while writing to it
                if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                    os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
            outFileName = "%s%s.fa" % ( outPrefix, clusterID )
            outFile = open( outFileName, "w" )
            outFile.write( ">%s\n" % ( header ) )
            prevClusterID = clusterID

            line = inFile.readline()
            while line:
                if line[0] == ">":
                    # same header parsing as above for each subsequent sequence
                    name = line.split(" ")[0]
                    if "Cluster" in name:
                        clusterID = name.split("Cluster")[1].split("Mb")[0]
                        seqID = name.split("Mb")[1]
                    else:
                        clusterID = name.split("Cl")[0].split("Gr")[1]
                        if "Q" in name.split("Gr")[0]:
                            seqID = name.split("Gr")[0].split("MbQ")[1]
                        elif "S" in name:
                            seqID = name.split("Gr")[0].split("MbS")[1]

                    # changed cluster: close the previous output file
                    if clusterID != prevClusterID:
                        outFile.close()

                    if simplifyHeader == True:
                        header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
                    else:
                        header = line[1:-1]

                    if createDir == True:
                        # leave the previous cluster's directory, then enter
                        # (creating if needed) the current cluster's directory
                        os.chdir( ".." )
                        if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
                            os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
                        os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )

                    # append when the cluster's file already exists (clusters
                    # need not be contiguous in the input file)
                    outFileName = "%s%s.fa" % ( outPrefix, clusterID )
                    if not os.path.exists( outFileName ):
                        outFile = open( outFileName, "w" )
                    else:
                        if clusterID != prevClusterID:
                            outFile.close()
                            outFile = open( outFileName, "a" )
                    outFile.write( ">%s\n" % ( header ) )
                    prevClusterID = clusterID
                    sClusterIDs.add( clusterID )

                else:
                    # sequence line: copy verbatim to the current output file
                    outFile.write( line )

                line = inFile.readline()

            outFile.close()
            if verbose > 0:
                print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()

            if createDir == True:
                os.chdir("..")
        else:
            print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
402 | |
403 | |
404 ## Filter a fasta file in two fasta files using the length of each sequence as a criteron | |
405 # | |
406 # @param len_min integer length sequence criterion to filter | |
407 # @param inFileName string name of the input fasta file | |
408 # @param verbose integer (default = 0) | |
409 # | |
410 @staticmethod | |
411 def dbLengthFilter( len_min, inFileName, verbose=0 ): | |
412 file_db = open( inFileName, "r" ) | |
413 file_dbInf = open( inFileName+".Inf"+str(len_min), "w" ) | |
414 file_dbSup = open( inFileName+".Sup"+str(len_min), "w" ) | |
415 seq = Bioseq() | |
416 numseq = 0 | |
417 nbsave = 0 | |
418 | |
419 seq.read( file_db ) | |
420 while seq.sequence: | |
421 l = seq.getLength() | |
422 numseq = numseq + 1 | |
423 if l >= len_min: | |
424 seq.write( file_dbSup ) | |
425 if verbose > 0: | |
426 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Sup !!' | |
427 nbsave=nbsave+1 | |
428 else: | |
429 seq.write( file_dbInf ) | |
430 if verbose > 0: | |
431 print 'sequence #',numseq,'=',l,'[',seq.header[0:40],'...] Inf !!' | |
432 nbsave=nbsave+1 | |
433 seq.read( file_db ) | |
434 | |
435 file_db.close() | |
436 file_dbInf.close() | |
437 file_dbSup.close() | |
438 if verbose > 0: | |
439 print nbsave,'saved sequences in ',inFileName+".Inf"+str(len_min)," and ", inFileName+".Sup"+str(len_min) | |
440 | |
441 | |
442 ## Extract the longest sequences from a fasta file | |
443 # | |
444 # @param num integer maximum number of sequences in the output file | |
445 # @param inFileName string name of the input fasta file | |
446 # @param outFileName string name of the output fasta file | |
447 # @param minThresh integer minimum length threshold (default=0) | |
448 # @param verbose integer (default = 0) | |
449 # | |
450 @staticmethod | |
451 def dbLongestSequences( num, inFileName, outFileName="", verbose=0, minThresh=0 ): | |
452 bsDB = BioseqDB( inFileName ) | |
453 if verbose > 0: | |
454 print "nb of input sequences: %i" % ( bsDB.getSize() ) | |
455 | |
456 if outFileName == "": | |
457 outFileName = inFileName + ".best" + str(num) | |
458 outFile = open( outFileName, "w" ) | |
459 | |
460 if bsDB.getSize()==0: | |
461 return 0 | |
462 | |
463 num = int(num) | |
464 if verbose > 0: | |
465 print "keep the %i longest sequences" % ( num ) | |
466 if minThresh > 0: | |
467 print "with length > %i bp" % ( minThresh ) | |
468 sys.stdout.flush() | |
469 | |
470 # retrieve the length of each input sequence | |
471 tmpLSeqLgth = [] | |
472 seqNum = 0 | |
473 for bs in bsDB.db: | |
474 seqNum += 1 | |
475 tmpLSeqLgth.append( bs.getLength() ) | |
476 if verbose > 1: | |
477 print "%d seq %s : %d bp" % ( seqNum, bs.header[0:40], bs.getLength() ) | |
478 sys.stdout.flush() | |
479 | |
480 # sort the lengths | |
481 tmpLSeqLgth.sort() | |
482 tmpLSeqLgth.reverse() | |
483 | |
484 # select the longest | |
485 lSeqLgth = [] | |
486 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ): | |
487 if tmpLSeqLgth[i] >= minThresh: | |
488 lSeqLgth.append( tmpLSeqLgth[i] ) | |
489 if verbose > 0: | |
490 print "selected max length: %i" % ( max(lSeqLgth) ) | |
491 print "selected min length: %i" % ( min(lSeqLgth) ) | |
492 sys.stdout.flush() | |
493 | |
494 # save the longest | |
495 inFile = open( inFileName ) | |
496 seqNum = 0 | |
497 nbSave = 0 | |
498 for bs in bsDB.db: | |
499 seqNum += 1 | |
500 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh: | |
501 bs.write( outFile ) | |
502 if verbose > 1: | |
503 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] ) | |
504 sys.stdout.flush() | |
505 nbSave += 1 | |
506 if nbSave == num: | |
507 break | |
508 inFile.close() | |
509 outFile.close() | |
510 if verbose > 0: | |
511 print nbSave, "saved sequences in ", outFileName | |
512 sys.stdout.flush() | |
513 | |
514 return 0 | |
515 | |
516 | |
517 ## Extract all the sequence headers from a fasta file and write them in a new fasta file | |
518 # | |
519 # @param inFileName string name of the input fasta file | |
520 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers') | |
521 # | |
522 @staticmethod | |
523 def dbExtractSeqHeaders( inFileName, outFileName="" ): | |
524 lHeaders = FastaUtils.dbHeaders( inFileName ) | |
525 | |
526 if outFileName == "": | |
527 outFileName = inFileName + ".headers" | |
528 | |
529 outFile = open( outFileName, "w" ) | |
530 for i in lHeaders: | |
531 outFile.write( i + "\n" ) | |
532 outFile.close() | |
533 | |
534 return 0 | |
535 | |
536 | |
537 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file | |
538 # | |
539 # @param pattern regular expression to search in headers | |
540 # @param inFileName string name of the input fasta file | |
541 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
542 # @param verbose integer verbosity level (default = 0) | |
543 # | |
544 @staticmethod | |
545 def dbExtractByPattern( pattern, inFileName, outFileName="", verbose=0 ): | |
546 if pattern == "": | |
547 return | |
548 | |
549 if outFileName == "": | |
550 outFileName = inFileName + '.extracted' | |
551 outFile = open( outFileName, 'w' ) | |
552 | |
553 patternTosearch = re.compile( pattern ) | |
554 bioseq = Bioseq() | |
555 bioseqNb = 0 | |
556 savedBioseqNb = 0 | |
557 inFile = open( inFileName, "r" ) | |
558 bioseq.read( inFile ) | |
559 while bioseq.sequence: | |
560 bioseqNb = bioseqNb + 1 | |
561 m = patternTosearch.search( bioseq.header ) | |
562 if m: | |
563 bioseq.write( outFile ) | |
564 if verbose > 1: | |
565 print 'sequence num',bioseqNb,'matched on',m.group(),'[',bioseq.header[0:40],'...] saved !!' | |
566 savedBioseqNb = savedBioseqNb + 1 | |
567 bioseq.read( inFile ) | |
568 inFile.close() | |
569 | |
570 outFile.close() | |
571 | |
572 if verbose > 0: | |
573 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
574 | |
575 | |
576 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
577 # | |
578 # @param patternFileName string file containing regular expression to search in headers | |
579 # @param inFileName string name of the input fasta file | |
580 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
581 # @param verbose integer verbosity level (default = 0) | |
582 # | |
583 @staticmethod | |
584 def dbExtractByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): | |
585 | |
586 if patternFileName == "": | |
587 print "ERROR: no file of pattern" | |
588 sys.exit(1) | |
589 | |
590 bioseq = Bioseq() | |
591 bioseqNb = 0 | |
592 savedBioseqNb = 0 | |
593 lHeaders = [] | |
594 | |
595 inFile = open( inFileName, "r" ) | |
596 bioseq.read( inFile ) | |
597 while bioseq.sequence != None: | |
598 lHeaders.append( bioseq.header ) | |
599 bioseq.read( inFile ) | |
600 inFile.close() | |
601 | |
602 lHeadersToKeep = [] | |
603 patternFile = open( patternFileName, "r" ) | |
604 for pattern in patternFile: | |
605 if verbose > 0: | |
606 print "pattern: ",pattern[:-1]; sys.stdout.flush() | |
607 | |
608 patternToSearch = re.compile(pattern[:-1]) | |
609 for h in lHeaders: | |
610 if patternToSearch.search(h): | |
611 lHeadersToKeep.append(h) | |
612 patternFile.close() | |
613 | |
614 if outFileName == "": | |
615 outFileName = inFileName + ".extracted" | |
616 outFile=open( outFileName, "w" ) | |
617 | |
618 inFile = open( inFileName, "r" ) | |
619 bioseq.read(inFile) | |
620 while bioseq.sequence: | |
621 bioseqNb += 1 | |
622 if bioseq.header in lHeadersToKeep: | |
623 bioseq.write(outFile) | |
624 if verbose > 1: | |
625 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() | |
626 savedBioseqNb += 1 | |
627 bioseq.read(inFile) | |
628 inFile.close() | |
629 | |
630 outFile.close() | |
631 | |
632 if verbose > 0: | |
633 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
634 | |
635 | |
636 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file | |
637 # | |
638 # @param pattern regular expression to search in headers | |
639 # @param inFileName string name of the input fasta file | |
640 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
641 # @param verbose integer verbosity level (default = 0) | |
642 # | |
643 @staticmethod | |
644 def dbCleanByPattern( pattern, inFileName, outFileName="", verbose=0 ): | |
645 if pattern == "": | |
646 return | |
647 | |
648 patternToSearch = re.compile(pattern) | |
649 | |
650 if outFileName == "": | |
651 outFileName = inFileName + '.cleaned' | |
652 outFile = open(outFileName,'w') | |
653 | |
654 bioseq = Bioseq() | |
655 bioseqNb = 0 | |
656 savedBioseqNb = 0 | |
657 inFile = open(inFileName) | |
658 bioseq.read(inFile) | |
659 while bioseq.sequence != None: | |
660 bioseqNb += 1 | |
661 if not patternToSearch.search(bioseq.header): | |
662 bioseq.write(outFile) | |
663 if verbose > 1: | |
664 print 'sequence num',bioseqNb,'[',bioseq.header[0:40],'...] saved !!' | |
665 savedBioseqNb += 1 | |
666 bioseq.read(inFile) | |
667 inFile.close() | |
668 | |
669 outFile.close() | |
670 | |
671 if verbose > 0: | |
672 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
673 | |
674 | |
675 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file | |
676 # | |
677 # @param patternFileName string file containing regular expression to search in headers | |
678 # @param inFileName string name of the input fasta file | |
679 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted') | |
680 # @param verbose integer verbosity level (default = 0) | |
681 # | |
682 @staticmethod | |
683 def dbCleanByFilePattern( patternFileName, inFileName, outFileName="", verbose=0 ): | |
684 if patternFileName == "": | |
685 print "ERROR: no file of pattern" | |
686 sys.exit(1) | |
687 | |
688 bioseq = Bioseq() | |
689 bioseqNb = 0 | |
690 savedBioseqNb = 0 | |
691 lHeaders = [] | |
692 inFile = open( inFileName, "r" ) | |
693 bioseq.read( inFile ) | |
694 while bioseq.sequence != None: | |
695 bioseqNb += 1 | |
696 lHeaders.append( bioseq.header ) | |
697 bioseq.read( inFile ) | |
698 inFile.close() | |
699 | |
700 patternFile = open( patternFileName, "r") | |
701 lHeadersToRemove = [] | |
702 for pattern in patternFile: | |
703 if verbose > 0: | |
704 print "pattern: ",pattern[:-1]; sys.stdout.flush() | |
705 | |
706 patternToSearch = re.compile( pattern[:-1] ) | |
707 for h in lHeaders: | |
708 if patternToSearch.search(h): | |
709 lHeadersToRemove.append(h) | |
710 patternFile.close() | |
711 | |
712 if outFileName == "": | |
713 outFileName = inFileName + '.cleaned' | |
714 outFile = open( outFileName, 'w' ) | |
715 | |
716 bioseqNum = 0 | |
717 inFile=open( inFileName ) | |
718 bioseq.read( inFile ) | |
719 while bioseq.sequence != None: | |
720 bioseqNum += 1 | |
721 if bioseq.header not in lHeadersToRemove: | |
722 bioseq.write( outFile ) | |
723 if verbose > 1: | |
724 print 'sequence num',bioseqNum,'/',bioseqNb,'[',bioseq.header[0:40],'...] saved !!'; sys.stdout.flush() | |
725 savedBioseqNb += 1 | |
726 bioseq.read( inFile ) | |
727 inFile.close() | |
728 | |
729 outFile.close() | |
730 | |
731 if verbose > 0: | |
732 print "%i sequences saved in file '%s'" % ( savedBioseqNb, outFileName ) | |
733 | |
734 | |
    ## Find sequence's ORFs from a fasta file and write them in a tab file
    #
    # @param inFileName string name of the input fasta file
    # @param orfMaxNb integer Select orfMaxNb best ORFs
    # @param orfMinLength integer Keep ORFs with length > orfMinLength
    # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
    # @param verbose integer verbosity level (default = 0)
    #
    # @return 0 on success
    #
    @staticmethod
    def dbORF( inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose=0 ):
        if outFileName == "":
            outFileName = inFileName + ".ORF.map"
        outFile = open( outFileName, "w" )

        bioseq = Bioseq()
        bioseqNb = 0

        inFile = open( inFileName )
        bioseq.read( inFile )
        while bioseq.sequence != None:
            bioseq.upCase()
            bioseqNb += 1
            if verbose > 0:
                print 'sequence num',bioseqNb,'=',bioseq.getLength(),'[',bioseq.header[0:40],'...]'

            # forward strand: Bioseq.findORF() presumably returns one list of
            # stop-codon positions per reading frame — its exact contract is
            # not visible from this file; TODO confirm against Bioseq
            orf = bioseq.findORF()
            bestOrf = []
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    # NOTE(review): the +4/+3 offsets appear to convert the
                    # frame-relative stop positions to 1-based ORF boundaries
                    # between two consecutive stops — assumed, not verifiable
                    # from here
                    start = orf[i][j-1] + 4
                    end = orf[i][j] + 3
                    if end - start >= orfMinLength:
                        # tuple sorts by length first; i+1 encodes the frame
                        bestOrf.append( ( end-start, i+1, start, end ) )

            # reverse strand: scan the reverse complement and map the
            # coordinates back onto the forward sequence
            bioseq.reverseComplement()

            orf = bioseq.findORF()
            seqLen = bioseq.getLength()
            for i in orf.keys():
                orfLen = len(orf[i])
                for j in xrange(1, orfLen):
                    start = seqLen - orf[i][j-1] - 3
                    end = seqLen - orf[i][j] - 2
                    if start - end >= orfMinLength:
                        # a negative frame number marks the reverse strand
                        bestOrf.append( ( start-end, (i+1)*-1, start, end ) )

            # keep the longest ORFs (descending), truncated to orfMaxNb if set
            bestOrf.sort()
            bestOrf.reverse()
            bestOrfNb = len(bestOrf)
            if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
                bestOrfNb = orfMaxNb
            for i in xrange(0, bestOrfNb):
                if verbose > 0:
                    print bestOrf[i]
                # map-file line: 'ORF|<frame>|<length>' <header> <start> <end>
                outFile.write("%s\t%s\t%d\t%d\n"%("ORF|"+str(bestOrf[i][1])+\
                              "|"+str(bestOrf[i][0]),bioseq.header,
                              bestOrf[i][2],bestOrf[i][3]))
            bioseq.read( inFile )

        inFile.close()
        outFile.close()

        return 0
799 | |
800 | |
801 ## Sort sequences by increasing length (write a new file) | |
802 # | |
803 # @param inFileName string name of the input fasta file | |
804 # @param outFileName string name of the output fasta file | |
805 # @param verbose integer verbosity level | |
806 # | |
807 @staticmethod | |
808 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose=0): | |
809 if verbose > 0: | |
810 print "sort sequences by increasing length" | |
811 sys.stdout.flush() | |
812 if not os.path.exists( inFileName ): | |
813 print "ERROR: file '%s' doesn't exist" % ( inFileName ) | |
814 sys.exit(1) | |
815 | |
816 # read each seq one by one | |
817 # save them in distinct temporary files | |
818 # with their length in the name | |
819 inFileHandler = open( inFileName, "r" ) | |
820 countSeq = 0 | |
821 bs = Bioseq() | |
822 bs.read( inFileHandler ) | |
823 while bs.header: | |
824 countSeq += 1 | |
825 tmpFile = "%ibp_%inb" % ( bs.getLength(), countSeq ) | |
826 bs.appendBioseqInFile( tmpFile ) | |
827 if verbose > 1: | |
828 print "%s (%i bp) saved in '%s'" % ( bs.header, bs.getLength(), tmpFile ) | |
829 bs.header = "" | |
830 bs.sequence = "" | |
831 bs.read( inFileHandler ) | |
832 inFileHandler.close() | |
833 | |
834 # sort temporary file names | |
835 # concatenate them into the output file | |
836 if os.path.exists( outFileName ): | |
837 os.remove( outFileName ) | |
838 lFiles = glob.glob( "*bp_*nb" ) | |
839 lFiles.sort( key=lambda s:int(s.split("bp_")[0]) ) | |
840 for fileName in lFiles: | |
841 cmd = "cat %s >> %s" % ( fileName, outFileName ) | |
842 returnValue = os.system( cmd ) | |
843 if returnValue != 0: | |
844 print "ERROR while concatenating '%s' with '%s'" % ( fileName, outFileName ) | |
845 sys.exit(1) | |
846 os.remove( fileName ) | |
847 | |
848 return 0 | |
849 | |
850 | |
851 ## Sort sequences by header | |
852 # | |
853 # @param inFileName string name of the input fasta file | |
854 # @param outFileName string name of the output fasta file | |
855 # @param verbose integer verbosity level | |
856 # | |
857 @staticmethod | |
858 def sortSequencesByHeader(inFileName, outFileName = "", verbose = 0): | |
859 if outFileName == "": | |
860 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0] | |
861 iBioseqDB = BioseqDB(inFileName) | |
862 f = open(outFileName, "w") | |
863 lHeaders = sorted(iBioseqDB.getHeaderList()) | |
864 for header in lHeaders: | |
865 iBioseq = iBioseqDB.fetch(header) | |
866 iBioseq.write(f) | |
867 f.close() | |
868 | |
869 | |
870 ## Return a dictionary which keys are the headers and values the length of the sequences | |
871 # | |
872 # @param inFile string name of the input fasta file | |
873 # @param verbose integer verbosity level | |
874 # | |
875 @staticmethod | |
876 def getLengthPerHeader( inFile, verbose=0 ): | |
877 dHeader2Length = {} | |
878 | |
879 inFileHandler = open( inFile, "r" ) | |
880 currentSeqHeader = "" | |
881 currentSeqLength = 0 | |
882 line = inFileHandler.readline() | |
883 while line: | |
884 if line[0] == ">": | |
885 if currentSeqHeader != "": | |
886 dHeader2Length[ currentSeqHeader ] = currentSeqLength | |
887 currentSeqLength = 0 | |
888 currentSeqHeader = line[1:-1] | |
889 if verbose > 0: | |
890 print "current header: %s" % ( currentSeqHeader ) | |
891 sys.stdout.flush() | |
892 else: | |
893 currentSeqLength += len( line.replace("\n","") ) | |
894 line = inFileHandler.readline() | |
895 dHeader2Length[ currentSeqHeader ] = currentSeqLength | |
896 inFileHandler.close() | |
897 | |
898 return dHeader2Length | |
899 | |
900 | |
    ## Convert headers from a fasta file having chunk coordinates
    #
    # @param inFile string name of the input fasta file
    # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
    # @param outFile string name of the output file
    #
    @staticmethod
    def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
        """Rewrite each fasta header from chunk coordinates to chromosome coordinates.

        Sequence lines are copied through unchanged; only '>' lines are rewritten.
        """
        inFileHandler = open(inFile, "r")
        outFileHandler = open(outFile, "w")
        # chunk name -> Map object carrying the chunk's location on its chromosome
        dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
        iConvCoord = ConvCoord()
        line = inFileHandler.readline()
        while line:
            if line[0] == ">":
                if "{Fragment}" in line:
                    # assumes header layout '>name chkN {Fragment} s1..e1,s2..e2,...'
                    # (space-separated, 4th field = comma-separated coordinate pairs)
                    # -- TODO confirm against the producer of these headers
                    chkName = line.split(" ")[1]
                    chrName = dChunk2Map[chkName].seqname
                    lCoordPairs = line.split(" ")[3].split(",")
                    lRangesOnChk = []
                    for i in lCoordPairs:
                        iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
                        lRangesOnChk.append(iRange)
                    # project every chunk-local range onto the chromosome
                    lRangesOnChr = []
                    for iRange in lRangesOnChk:
                        lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
                    # rebuild the header with the same layout, chromosome name and coordinates
                    newHeader = line[1:-1].split(" ")[0]
                    newHeader += " %s" % chrName
                    newHeader += " {Fragment}"
                    newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
                    for iRange in lRangesOnChr[1:]:
                        newHeader += ",%i..%i" % (iRange.start, iRange.end)
                    outFileHandler.write(">%s\n" % newHeader)
                else:
                    # assumes header layout '>name_chkN (strand) [start,end] ...'
                    # -- TODO confirm; single-range case
                    chkName = line.split("_")[1].split(" ")[0]
                    chrName = dChunk2Map[chkName].seqname
                    # text between '[' and ']' holds 'start,end' on the chunk
                    coords = line[line.find("[")+1 : line.find("]")]
                    start = int(coords.split(",")[0])
                    end = int(coords.split(",")[1])
                    iRangeOnChk = Range(chkName, start, end)
                    iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
                    newHeader = line[1:-1].split("_")[0]
                    newHeader += " %s" % chrName
                    # keep the original parenthesised field (e.g. strand) as-is
                    newHeader += " %s" % line[line.find("(") : line.find(")")+1]
                    newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
                    outFileHandler.write(">%s\n" % newHeader)
            else:
                outFileHandler.write(line)
            line = inFileHandler.readline()
        inFileHandler.close()
        outFileHandler.close()
952 | |
953 | |
954 ## Convert a fasta file to a length file | |
955 # | |
956 # @param inFile string name of the input fasta file | |
957 # @param outFile string name of the output file | |
958 # | |
959 @staticmethod | |
960 def convertFastaToLength(inFile, outFile = ""): | |
961 if outFile == "": | |
962 outFile = "%s.length" % inFile | |
963 | |
964 if inFile != "": | |
965 with open(inFile, "r") as inFH: | |
966 with open(outFile, "w") as outFH: | |
967 bioseq = Bioseq() | |
968 bioseq.read(inFH) | |
969 while bioseq.sequence != None: | |
970 seqLen = bioseq.getLength() | |
971 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen)) | |
972 bioseq.read(inFH) | |
973 | |
974 | |
975 ## Convert a fasta file to a seq file | |
976 # | |
977 # @param inFile string name of the input fasta file | |
978 # @param outFile string name of the output file | |
979 # | |
980 @staticmethod | |
981 def convertFastaToSeq(inFile, outFile = ""): | |
982 if outFile == "": | |
983 outFile = "%s.seq" % inFile | |
984 | |
985 if inFile != "": | |
986 with open(inFile, "r") as inFH: | |
987 with open(outFile, "w") as outFH: | |
988 bioseq = Bioseq() | |
989 bioseq.read(inFH) | |
990 while bioseq.sequence != None: | |
991 seqLen = bioseq.getLength() | |
992 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \ | |
993 bioseq.sequence, bioseq.header, seqLen)) | |
994 bioseq.read(inFH) | |
995 | |
996 | |
997 ## Splice an input fasta file using coordinates in a Map file | |
998 # | |
999 # @note the coordinates should be merged beforehand! | |
1000 # | |
1001 @staticmethod | |
1002 def spliceFromCoords( genomeFile, coordFile, obsFile ): | |
1003 genomeFileHandler = open( genomeFile, "r" ) | |
1004 obsFileHandler = open( obsFile, "w" ) | |
1005 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile( coordFile ) | |
1006 | |
1007 bs = Bioseq() | |
1008 bs.read( genomeFileHandler ) | |
1009 while bs.sequence: | |
1010 if dChr2Maps.has_key( bs.header ): | |
1011 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax( dChr2Maps[ bs.header ] ) | |
1012 splicedSeq = "" | |
1013 currentSite = 0 | |
1014 for iMap in lCoords: | |
1015 minSplice = iMap.getMin() - 1 | |
1016 if minSplice > currentSite: | |
1017 splicedSeq += bs.sequence[ currentSite : minSplice ] | |
1018 currentSite = iMap.getMax() | |
1019 splicedSeq += bs.sequence[ currentSite : ] | |
1020 bs.sequence = splicedSeq | |
1021 bs.write( obsFileHandler ) | |
1022 bs.read( genomeFileHandler ) | |
1023 | |
1024 genomeFileHandler.close() | |
1025 obsFileHandler.close() | |
1026 | |
1027 | |
1028 ## Shuffle input sequences (single file or files in a directory) | |
1029 # | |
1030 @staticmethod | |
1031 def dbShuffle( inData, outData, verbose=0 ): | |
1032 if CheckerUtils.isExecutableInUserPath("esl-shuffle"): | |
1033 prg = "esl-shuffle" | |
1034 else : prg = "shuffle" | |
1035 genericCmd = prg + " -d INPUT > OUTPUT" | |
1036 if os.path.isfile( inData ): | |
1037 if verbose > 0: | |
1038 print "shuffle input file '%s'" % inData | |
1039 cmd = genericCmd.replace("INPUT",inData).replace("OUTPUT",outData) | |
1040 print cmd | |
1041 returnStatus = os.system( cmd ) | |
1042 if returnStatus != 0: | |
1043 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) | |
1044 sys.exit(1) | |
1045 | |
1046 elif os.path.isdir( inData ): | |
1047 if verbose > 0: | |
1048 print "shuffle files in input directory '%s'" % inData | |
1049 if os.path.exists( outData ): | |
1050 shutil.rmtree( outData ) | |
1051 os.mkdir( outData ) | |
1052 lInputFiles = glob.glob( "%s/*.fa" %( inData ) ) | |
1053 nbFastaFiles = 0 | |
1054 for inputFile in lInputFiles: | |
1055 nbFastaFiles += 1 | |
1056 if verbose > 1: | |
1057 print "%3i / %3i" % ( nbFastaFiles, len(lInputFiles) ) | |
1058 fastaBaseName = os.path.basename( inputFile ) | |
1059 prefix, extension = os.path.splitext( fastaBaseName ) | |
1060 cmd = genericCmd.replace("INPUT",inputFile).replace("OUTPUT","%s/%s_shuffle.fa"%(outData,prefix)) | |
1061 returnStatus = os.system( cmd ) | |
1062 if returnStatus != 0: | |
1063 sys.stderr.write( "ERROR: 'shuffle' returned '%i'\n" % returnStatus ) | |
1064 sys.exit(1) | |
1065 | |
1066 | |
1067 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers | |
1068 # | |
1069 # @param inClusterFileName string input cluster file name | |
1070 # @param inFastaFileName string input fasta file name | |
1071 # @param outFileName string output file name | |
1072 # @param verbosity integer verbosity | |
1073 # | |
1074 @staticmethod | |
1075 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0): | |
1076 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity) | |
1077 iFastaParser = FastaParser(inFastaFileName) | |
1078 with open(outFileName, "w") as f: | |
1079 for iSequence in iFastaParser.getIterator(): | |
1080 | |
1081 header = iSequence.getName() | |
1082 if dHeader2ClusterClusterMember.get(header): | |
1083 cluster = dHeader2ClusterClusterMember[header][0] | |
1084 member = dHeader2ClusterClusterMember[header][1] | |
1085 else: | |
1086 clusterIdForSingletonCluster += 1 | |
1087 cluster = clusterIdForSingletonCluster | |
1088 member = 1 | |
1089 | |
1090 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header) | |
1091 iSequence.setName(newHeader) | |
1092 f.write(iSequence.printFasta()) | |
1093 | |
1094 @staticmethod | |
1095 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0): | |
1096 dHeader2ClusterClusterMember = {} | |
1097 clusterId = 0 | |
1098 with open(inClusterFileName) as f: | |
1099 line = f.readline() | |
1100 while line: | |
1101 lineWithoutLastChar = line.rstrip() | |
1102 lHeaders = lineWithoutLastChar.split("\t") | |
1103 clusterId += 1 | |
1104 if verbosity > 0: | |
1105 print "%i sequences in cluster %i" % (len(lHeaders), clusterId) | |
1106 memberId = 0 | |
1107 for header in lHeaders: | |
1108 memberId += 1 | |
1109 dHeader2ClusterClusterMember[header] = (clusterId, memberId) | |
1110 line = f.readline() | |
1111 if verbosity > 0: | |
1112 print "%i clusters" % clusterId | |
1113 return dHeader2ClusterClusterMember, clusterId | |
1114 | |
1115 @staticmethod | |
1116 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""): | |
1117 """ | |
1118 Write a map file from fasta output of clustering tool. | |
1119 Warning: only works if input fasta headers are formated like LTRharvest fasta output. | |
1120 """ | |
1121 if not outMapFileName: | |
1122 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0]) | |
1123 | |
1124 fileDb = open(fastaFileNameFromClustering , "r") | |
1125 fileMap = open(outMapFileName, "w") | |
1126 seq = Bioseq() | |
1127 numseq = 0 | |
1128 while 1: | |
1129 seq.read(fileDb) | |
1130 if seq.sequence == None: | |
1131 break | |
1132 numseq = numseq + 1 | |
1133 ID = seq.header.split(' ')[0].split('_')[0] | |
1134 chunk = seq.header.split(' ')[0].split('_')[1] | |
1135 start = seq.header.split(' ')[-1].split(',')[0][1:] | |
1136 end = seq.header.split(' ')[-1].split(',')[1][:-1] | |
1137 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end) | |
1138 fileMap.write(line + "\n") | |
1139 | |
1140 fileDb.close() | |
1141 fileMap.close() | |
1142 print "saved in %s" % outMapFileName | |
1143 |