comparison TEisotools-1.1.a/commons/core/seq/FastaUtils.py @ 13:feef9a0db09d draft

Uploaded
author urgi-team
date Wed, 20 Jul 2016 09:04:42 -0400
parents
children
comparison
equal deleted inserted replaced
12:22b0494ec883 13:feef9a0db09d
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import re
34 import sys
35 import math
36 import glob
37 import string
38 import shutil
39 from commons.core.utils.FileUtils import FileUtils
40 from commons.core.seq.Bioseq import Bioseq
41 from commons.core.seq.BioseqDB import BioseqDB
42 from commons.core.coord.MapUtils import MapUtils
43 from commons.core.coord.Range import Range
44 from commons.core.coord.ConvCoord import ConvCoord
45 from commons.core.parsing.FastaParser import FastaParser
46 from commons.core.checker.CheckerUtils import CheckerUtils
47 from commons.core.launcher.LauncherUtils import LauncherUtils
48
49
50 ## Static methods for fasta file manipulation
51 #
52 class FastaUtils( object ):
53
54 ## Count the number of sequences in the input fasta file
55 #
56 # @param inFile name of the input fasta file
57 #
58 # @return integer number of sequences in the input fasta file
59 #
60 @staticmethod
61 def dbSize( inFile ):
62 nbSeq = 0
63 with open(inFile) as fH:
64 for line in fH:
65 if line[0] == ">":
66 nbSeq += 1
67
68 return nbSeq
69
70
71 ## Compute the cumulative sequence length in the input fasta file
72 #
73 # @param inFile handler of the input fasta file
74 #
75 @staticmethod
76 def dbCumLength( inFile ):
77 cumLength = 0
78 for line in inFile:
79 if line[0] != ">":
80 cumLength += len(string.rstrip(line))
81
82 return cumLength
83
84
85 ## Return a list with the length of each sequence in the input fasta file
86 #
87 # @param inFile string name of the input fasta file
88 #
89 @staticmethod
90 def dbLengths(inFile):
91 lLengths = []
92 currentLength = 0
93 with open(inFile) as fH:
94 for line in fH:
95 if line[0] == ">":
96 if currentLength != 0:
97 lLengths.append(currentLength)
98 currentLength = 0
99 else:
100 currentLength += len(line[:-1])
101 lLengths.append(currentLength)
102
103 return lLengths
104
105
106 ## Retrieve the sequence headers present in the input fasta file
107 #
108 # @param inFile string name of the input fasta file
109 # @param verbose integer level of verbosity
110 #
111 # @return list of sequence headers
112 #
113 @staticmethod
114 def dbHeaders(inFile, verbose = 0):
115 lHeaders = [line[1:].rstrip() for line in open(inFile) if line[0] == ">"]
116 if verbose:
117 for header in lHeaders:
118 print header
119
120 return lHeaders
121
122
123 ## Cut a data bank into chunks according to the input parameters
124 # If a sequence is shorter than the threshold, it is only renamed (not cut)
125 #
126 # @param inFileName string name of the input fasta file
127 # @param chkLgth string chunk length (in bp, default=200000)
128 # @param chkOver string chunk overlap (in bp, default=10000)
129 # @param wordN string N stretch word length (default=11, 0 for no detection)
130 # @param outFilePrefix string prefix of the output files (default=inFileName + '_chunks.fa' and '_chunks.map')
131 # @param clean boolean remove 'cut' and 'Nstretch' files
132 # @param verbose integer (default = 0)
133 #
134 @staticmethod
135 def dbChunks(inFileName, chkLgth = "200000", chkOver = "10000", wordN = "11", outFilePrefix = "", clean = False, verbose = 0):
136 nbSeq = FastaUtils.dbSize(inFileName)
137 if verbose > 0:
138 print "cut the %i input sequences with cutterDB..." % nbSeq
139 sys.stdout.flush()
140
141 prg = "cutterDB"
142 cmd = prg
143 cmd += " -l %s" % chkLgth
144 cmd += " -o %s" % chkOver
145 cmd += " -w %s" % wordN
146 cmd += " %s" % inFileName
147 returnStatus = os.system(cmd)
148 if returnStatus != 0:
149 msg = "ERROR: '%s' returned '%i'" % (prg, returnStatus)
150 sys.stderr.write("%s\n" % msg)
151 sys.exit(1)
152
153 nbChunks = FastaUtils.dbSize("%s_cut" % inFileName)
154 if verbose > 0:
155 print "done (%i chunks)" % ( nbChunks )
156 sys.stdout.flush()
157
158 if verbose > 0:
159 print "rename the headers..."
160 sys.stdout.flush()
161
162 if outFilePrefix == "":
163 outFastaName = inFileName + "_chunks.fa"
164 outMapName = inFileName + "_chunks.map"
165 else:
166 outFastaName = outFilePrefix + ".fa"
167 outMapName = outFilePrefix + ".map"
168
169 with open("%s_cut" % inFileName) as inFile:
170 outFasta = open(outFastaName, "w")
171 outMap = open(outMapName, "w")
172
173 for line in inFile:
174 if line[0] == ">":
175 if verbose > 1:
176 print "rename '%s'" % (line[:-1]); sys.stdout.flush()
177 data = line[:-1].split(" ")
178 seqID = data[0].split(">")[1]
179 newHeader = "chunk%s" % (str(seqID).zfill(len(str(nbChunks))))
180 oldHeader = data[2]
181 seqStart = data[4].split("..")[0]
182 seqEnd = data[4].split("..")[1]
183 outMap.write("%s\t%s\t%s\t%s\n" % (newHeader, oldHeader, seqStart, seqEnd))
184 outFasta.write(">%s\n" % newHeader)
185
186 else:
187 outFasta.write(line.upper())
188
189 outFasta.close()
190 outMap.close()
191
192 #stats on .Nstretch.map file
193 genomeLength = FastaUtils.dbCumLength(open(inFileName))
194 NstretchMapFile = inFileName + ".Nstretch.map"
195 outNstrechStats = open('%s.NstretchStats.txt' % inFileName , "w")
196 if FileUtils.isEmpty(NstretchMapFile) or not FileUtils.isRessourceExists(NstretchMapFile):
197 outNstrechStats.write("No N in stretch length > %s\n" % wordN)
198 else:
199 with open(NstretchMapFile) as f:
200 dHeader2lLengths = {}
201 for line in f:
202 data = line.rstrip().split()
203 header = data[1]
204 length = int(data[3]) - int(data[2]) + 1
205 if header not in dHeader2lLengths:
206 dHeader2lLengths[header] = []
207 dHeader2lLengths[header].append(length)
208
209 for header in sorted(dHeader2lLengths):
210 lLengths = dHeader2lLengths[header]
211 outNstrechStats.write("%s\tmin: %s\tmax: %s\tcumul: %s\n" % (header, min(lLengths), max(lLengths), sum(lLengths)))
212
213 cumulAllStretch = sum([sum(lengths) for lengths in dHeader2lLengths.values()])
214
215 NstretchPrct = float(cumulAllStretch)/genomeLength*100
216 outNstrechStats.write("Total N in stretch length > %s: %s bp represent %6.2f %% of genome\n" % (wordN, cumulAllStretch, NstretchPrct))
217 outNstrechStats.close()
218
219 if clean == True:
220 os.remove(inFileName + "_cut")
221 os.remove(NstretchMapFile)
222
223
224 ## Split the input fasta file in several output files
225 #
226 # @param inFile string name of the input fasta file
227 # @param nbSeqPerBatch integer number of sequences per output file
228 # @param newDir boolean put the sequences in a new directory called 'batches'
229 # @param useSeqHeader boolean use sequence header (only if 'nbSeqPerBatch=1')
230 # @param prefix prefix in output file name
231 # @param verbose integer verbosity level (default = 0)
232 #
233 @staticmethod
234 def dbSplit(inFile, nbSeqPerBatch, newDir, useSeqHeader = False, prefix = "batch", verbose = 0):
235 if not os.path.exists(inFile):
236 msg = "ERROR: file '%s' doesn't exist" % inFile
237 sys.stderr.write("%s\n" % msg)
238 sys.exit(1)
239
240 nbSeq = FastaUtils.dbSize(inFile)
241
242 nbBatches = int(math.ceil(nbSeq / float(nbSeqPerBatch)))
243 if verbose > 0:
244 print "save the %i input sequences into %i batches" % (nbSeq, nbBatches)
245 sys.stdout.flush()
246
247 if nbSeqPerBatch != 1 and useSeqHeader:
248 useSeqHeader = False
249
250 if newDir == True:
251 if os.path.exists("batches"):
252 shutil.rmtree("batches")
253 os.mkdir("batches")
254 os.chdir("batches")
255 os.system("ln -s ../%s ." % inFile)
256
257 with open(inFile) as inFileHandler:
258 countBatch = 0
259 countSeq = 0
260 for line in inFileHandler:
261 if line[0] == ">":
262 countSeq += 1
263 if nbSeqPerBatch == 1 or countSeq % nbSeqPerBatch == 1:
264 try:
265 outFile.close()
266 except: pass
267 countBatch += 1
268 if useSeqHeader:
269 outFileName = "%s.fa" % (line[1:-1].replace(" ", "_"))
270 else:
271 outFileName = "%s_%s.fa" % (prefix, str(countBatch).zfill(len(str(nbBatches))))
272 outFile = open(outFileName, "w")
273
274 if verbose > 1:
275 print "saving seq '%s' in file '%s'..." % (line[1:].rstrip(), outFileName)
276 sys.stdout.flush()
277 outFile.write(line)
278
279 if newDir:
280 os.remove(os.path.basename(inFile))
281 os.chdir("..")
282
283
284 ## Split the input fasta file in several output files
285 #
286 # @param inFileName string name of the input fasta file
287 # @param maxSize integer max cumulative length for each output file
288 #
289 @staticmethod
290 def splitFastaFileInBatches(inFileName, maxSize = 200000):
291 iBioseqDB = BioseqDB(inFileName)
292 lHeadersSizeTuples = []
293 for iBioseq in iBioseqDB.db:
294 lHeadersSizeTuples.append((iBioseq.getHeader(), iBioseq.getLength()))
295
296 lHeadersList = LauncherUtils.createHomogeneousSizeList(lHeadersSizeTuples, maxSize)
297 os.mkdir("batches")
298 os.chdir("batches")
299
300 iterator = 0
301 for lHeader in lHeadersList:
302 iterator += 1
303 with open("batch_%s.fa" % iterator, 'w') as f:
304 for header in lHeader :
305 iBioseq = iBioseqDB.fetch(header)
306 iBioseq.write(f)
307 os.chdir("..")
308
309
310 ## Split the input fasta file in several output files according to their cluster identifier
311 #
312 # @param inFileName string name of the input fasta file
313 # @param clusteringMethod string name of the clustering method (Grouper, Recon, Piler, Blastclust)
314 # @param simplifyHeader boolean simplify the headers
315 # @param createDir boolean put the sequences in different directories
316 # @param outPrefix string prefix of the output files (default='seqCluster')
317 # @param verbose integer (default = 0)
318 #
319 @staticmethod
320 def splitSeqPerCluster(inFileName, clusteringMethod, simplifyHeader, createDir, outPrefix = "seqCluster", verbose = 0):
321 if not os.path.exists(inFileName):
322 print "ERROR: %s doesn't exist" % inFileName
323 sys.exit(1)
324
325 inFile = open(inFileName)
326
327 line = inFile.readline()
328 if line:
329 name = line.split(" ")[0]
330 if "Cluster" in name:
331 clusterID = name.split("Cluster")[1].split("Mb")[0]
332 seqID = name.split("Mb")[1]
333 else:
334 clusterID = name.split("Cl")[0].split("Gr")[1] # the notion of 'group' in Grouper corresponds to 'cluster' in Piler, Recon and Blastclust
335 if "Q" in name.split("Gr")[0]:
336 seqID = name.split("Gr")[0].split("MbQ")[1]
337 elif "S" in name:
338 seqID = name.split("Gr")[0].split("MbS")[1]
339 sClusterIDs = set( [ clusterID ] )
340 if simplifyHeader == True:
341 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
342 else:
343 header = line[1:-1]
344 if createDir == True:
345 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
346 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
347 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
348 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
349 outFile = open( outFileName, "w" )
350 outFile.write( ">%s\n" % ( header ) )
351 prevClusterID = clusterID
352
353 line = inFile.readline()
354 while line:
355 if line[0] == ">":
356 name = line.split(" ")[0]
357 if "Cluster" in name:
358 clusterID = name.split("Cluster")[1].split("Mb")[0]
359 seqID = name.split("Mb")[1]
360 else:
361 clusterID = name.split("Cl")[0].split("Gr")[1]
362 if "Q" in name.split("Gr")[0]:
363 seqID = name.split("Gr")[0].split("MbQ")[1]
364 elif "S" in name:
365 seqID = name.split("Gr")[0].split("MbS")[1]
366
367 if clusterID != prevClusterID:
368 outFile.close()
369
370 if simplifyHeader == True:
371 header = "%s_Cluster%s_Seq%s" % ( clusteringMethod, clusterID, seqID )
372 else:
373 header = line[1:-1]
374
375 if createDir == True:
376 os.chdir( ".." )
377 if not os.path.exists( "%s_cluster_%s" % ( inFileName, clusterID ) ):
378 os.mkdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
379 os.chdir( "%s_cluster_%s" % ( inFileName, clusterID ) )
380
381 outFileName = "%s%s.fa" % ( outPrefix, clusterID )
382 if not os.path.exists( outFileName ):
383 outFile = open( outFileName, "w" )
384 else:
385 if clusterID != prevClusterID:
386 outFile.close()
387 outFile = open( outFileName, "a" )
388 outFile.write( ">%s\n" % ( header ) )
389 prevClusterID = clusterID
390 sClusterIDs.add( clusterID )
391
392 else:
393 outFile.write( line )
394
395 line = inFile.readline()
396
397 outFile.close()
398 if verbose > 0:
399 print "number of clusters: %i" % ( len(sClusterIDs) ); sys.stdout.flush()
400
401 if createDir == True:
402 os.chdir("..")
403 else:
404 print "WARNING: empty input file - no cluster found"; sys.stdout.flush()
405
406
407 ## Filter a fasta file in two fasta files using the length of each sequence as a criterion
408 #
409 # @param len_min integer length sequence criterion to filter
410 # @param inFileName string name of the input fasta file
411 # @param verbose integer (default = 0)
412 #
413 @staticmethod
414 def dbLengthFilter(len_min, inFileName, verbose = 0):
415 file_db = open(inFileName,)
416 file_dbInf = open(inFileName + ".Inf" + str(len_min), "w")
417 file_dbSup = open(inFileName + ".Sup" + str(len_min), "w")
418 seq = Bioseq()
419 numseq = 0
420 nbsaveInf = 0
421 nbsaveSup = 0
422 seq.read(file_db)
423 while seq.getHeader():
424 l = seq.getLength()
425 numseq = numseq + 1
426 if l >= len_min:
427 seq.write(file_dbSup)
428 if verbose > 0:
429 nbsaveSup = nbsaveSup + 1
430 else:
431 seq.write(file_dbInf)
432 if verbose > 0:
433 nbsaveInf = nbsaveInf + 1
434 seq.read(file_db)
435
436 file_db.close()
437 file_dbInf.close()
438 file_dbSup.close()
439 if verbose > 0:
440 print "%i saved sequences in %s: %i sequences for %s.Inf%s and %i sequences for %s.Sup%s" % (nbsaveInf + nbsaveSup, inFileName, nbsaveInf, inFileName, str(len_min), nbsaveSup, inFileName, str(len_min))
441
442
443 ## Extract the longest sequences from a fasta file
444 #
445 # @param num integer maximum number of sequences in the output file
446 # @param inFileName string name of the input fasta file
447 # @param outFileName string name of the output fasta file
448 # @param minThresh integer minimum length threshold (default=0)
449 # @param verbose integer (default = 0)
450 #
451 @staticmethod
452 def dbLongestSequences(num, inFileName, outFileName = "", verbose = 0, minThresh = 0):
453 bsDB = BioseqDB(inFileName)
454 if verbose > 0:
455 print "nb of input sequences: %i" % bsDB.getSize()
456
457 if outFileName == "":
458 outFileName = inFileName + ".best" + str(num)
459 outFile = open( outFileName, "w" )
460
461 if bsDB.getSize()==0:
462 return 0
463
464 num = int(num)
465 if verbose > 0:
466 print "keep the %i longest sequences" % num
467 if minThresh > 0:
468 print "with length > %i bp" % minThresh
469 sys.stdout.flush()
470
471 tmpLSeqLgth = bsDB.getListOfSequencesLength()
472 tmpLSeqLgth.sort(reverse = True)
473
474 # select the longests
475 lSeqLgth = []
476 for i in xrange( 0, min(num,len(tmpLSeqLgth)) ):
477 if tmpLSeqLgth[i] >= minThresh:
478 lSeqLgth.append( tmpLSeqLgth[i] )
479 if verbose > 0:
480 print "selected max length: %i" % max(lSeqLgth)
481 print "selected min length: %i" % min(lSeqLgth)
482 sys.stdout.flush()
483
484 # save the longest
485 inFile = open( inFileName )
486 seqNum = 0
487 nbSave = 0
488 for bs in bsDB.db:
489 seqNum += 1
490 if bs.getLength() >= min(lSeqLgth) and bs.getLength() >= minThresh:
491 bs.write( outFile )
492 if verbose > 1:
493 print "%d seq %s : saved !" % ( seqNum, bs.header[0:40] )
494 sys.stdout.flush()
495 nbSave += 1
496 if nbSave == num:
497 break
498 inFile.close()
499 outFile.close()
500 if verbose > 0:
501 print nbSave, "saved sequences in ", outFileName
502 sys.stdout.flush()
503
504 return 0
505
506
507 ## Extract all the sequence headers from a fasta file and write them in a new file
508 #
509 # @param inFileName string name of the input fasta file
510 # @param outFileName string name of the output file recording the headers (default = inFileName + '.headers')
511 #
512 @staticmethod
513 def dbExtractSeqHeaders(inFileName, outFileName = ""):
514 if not outFileName:
515 outFileName = inFileName + ".headers"
516
517 with open(outFileName, "w") as f:
518 for header in FastaUtils.dbHeaders(inFileName):
519 f.write("%s\n" % header)
520
521 return 0
522
523
524 ## Extract sequences and their headers selected by a given pattern from a fasta file and write them in a new fasta file
525 #
526 # @param pattern regular expression to search in headers
527 # @param inFileName string name of the input fasta file
528 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
529 # @param verbose integer verbosity level (default = 0)
530 #
531 @staticmethod
532 def dbExtractByPattern(pattern, inFileName, outFileName = "", verbose = 0):
533 if not pattern:
534 return
535
536 if not outFileName:
537 outFileName = inFileName + '.extracted'
538 outFile = open(outFileName, 'w')
539
540 patternTosearch = re.compile(pattern)
541 bioseq = Bioseq()
542 bioseqNb = 0
543 savedBioseqNb = 0
544 inFile = open(inFileName)
545 bioseq.read(inFile)
546 while bioseq.sequence:
547 bioseqNb = bioseqNb + 1
548 m = patternTosearch.search(bioseq.header)
549 if m:
550 bioseq.write(outFile)
551 if verbose > 1:
552 print 'sequence num', bioseqNb, 'matched on', m.group(), '[', bioseq.header[0:40], '...] saved !!'
553 savedBioseqNb = savedBioseqNb + 1
554 bioseq.read(inFile)
555 inFile.close()
556
557 outFile.close()
558
559 if verbose > 0:
560 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
561
562
563 ## Extract sequences and their headers selected by patterns contained in a file, from a fasta file and write them in a new fasta file
564 #
565 # @param patternFileName string file containing regular expression to search in headers
566 # @param inFileName string name of the input fasta file
567 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
568 # @param verbose integer verbosity level (default = 0)
569 #
570 @staticmethod
571 def dbExtractByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
572 if not patternFileName:
573 print "ERROR: no file of pattern"
574 sys.exit(1)
575
576 bioseq = Bioseq()
577 bioseqNb = 0
578 savedBioseqNb = 0
579 lHeaders = []
580
581 with open(inFileName) as inFile:
582 bioseq.read(inFile)
583 while bioseq.sequence:
584 lHeaders.append(bioseq.header)
585 bioseq.read(inFile)
586
587 lHeadersToKeep = []
588 with open(patternFileName) as patternFile:
589 for pattern in patternFile:
590 pattern = pattern.rstrip()
591 if verbose > 0:
592 print "pattern: ", pattern; sys.stdout.flush()
593
594 patternToSearch = re.compile(pattern)
595 lHeadersToKeep.extend([h for h in lHeaders if patternToSearch.search(h)])
596
597 if not outFileName:
598 outFileName = inFileName + ".extracted"
599
600 with open(outFileName, "w") as outFile:
601 with open(inFileName) as inFile:
602 bioseq.read(inFile)
603 while bioseq.sequence:
604 bioseqNb += 1
605 if bioseq.header in lHeadersToKeep:
606 bioseq.write(outFile)
607 savedBioseqNb += 1
608 if verbose > 1:
609 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
610 bioseq.read(inFile)
611
612 if verbose > 0:
613 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
614
615
616 ## Extract sequences and their headers not selected by a given pattern from a fasta file and write them in a new fasta file
617 #
618 # @param pattern regular expression to search in headers
619 # @param inFileName string name of the input fasta file
620 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
621 # @param verbose integer verbosity level (default = 0)
622 #
623 @staticmethod
624 def dbCleanByPattern(pattern, inFileName, outFileName = "", verbose = 0):
625 if not pattern:
626 return
627
628 patternToSearch = re.compile(pattern)
629
630 if outFileName == "":
631 outFileName = inFileName + '.cleaned'
632
633 with open(outFileName, 'w') as outFile:
634 bioseq = Bioseq()
635 bioseqNb = 0
636 savedBioseqNb = 0
637 with open(inFileName) as inFile:
638 bioseq.read(inFile)
639 while bioseq.sequence:
640 bioseqNb += 1
641 if not patternToSearch.search(bioseq.header):
642 bioseq.write(outFile)
643 savedBioseqNb += 1
644 if verbose > 1:
645 print 'sequence num', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'
646 bioseq.read(inFile)
647
648 if verbose > 0:
649 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
650
651
652 ## Extract sequences and their headers not selected by patterns contained in a file, from a fasta file and write them in a new fasta file
653 #
654 # @param patternFileName string file containing regular expression to search in headers
655 # @param inFileName string name of the input fasta file
656 # @param outFileName string name of the output file recording the selected bioseq (default = inFileName + '.extracted')
657 # @param verbose integer verbosity level (default = 0)
658 #
659 @staticmethod
660 def dbCleanByFilePattern(patternFileName, inFileName, outFileName = "", verbose = 0):
661 if not patternFileName:
662 print "ERROR: no file of pattern"
663 sys.exit(1)
664
665 bioseq = Bioseq()
666 bioseqNb = 0
667 savedBioseqNb = 0
668 lHeaders = []
669 with open(inFileName) as inFile:
670 bioseq.read(inFile)
671 while bioseq.sequence:
672 lHeaders.append(bioseq.header)
673 bioseq.read(inFile)
674
675 with open(patternFileName) as patternFile:
676 lHeadersToRemove = []
677 for pattern in patternFile:
678 pattern = pattern.rstrip()
679 if verbose > 0:
680 print "pattern: ", pattern; sys.stdout.flush()
681
682 patternToSearch = re.compile(pattern)
683 lHeadersToRemove.extend([h for h in lHeaders if patternToSearch.search(h)])
684
685 if not outFileName:
686 outFileName = inFileName + '.cleaned'
687
688 with open(outFileName, 'w') as outFile:
689 bioseqNum = 0
690 with open(inFileName) as inFile:
691 bioseq.read(inFile)
692 while bioseq.sequence:
693 bioseqNum += 1
694 if bioseq.header not in lHeadersToRemove:
695 bioseq.write(outFile)
696 savedBioseqNb += 1
697 if verbose > 1:
698 print 'sequence num', bioseqNum, '/', bioseqNb, '[', bioseq.header[0:40], '...] saved !!'; sys.stdout.flush()
699 bioseq.read(inFile)
700
701 if verbose > 0:
702 print "%i sequences saved in file '%s'" % (savedBioseqNb, outFileName)
703
704
705 ## Find sequence's ORFs from a fasta file and write them in a tab file
706 #
707 # @param inFileName string name of the input fasta file
708 # @param orfMaxNb integer Select orfMaxNb best ORFs
709 # @param orfMinLength integer Keep ORFs with length > orfMinLength
710 # @param outFileName string name of the output fasta file (default = inFileName + '.orf.map')
711 # @param verbose integer verbosity level (default = 0)
712 #
713 @staticmethod
714 def dbORF(inFileName, orfMaxNb = 0, orfMinLength = 0, outFileName = "", verbose = 0):
715 if not outFileName:
716 outFileName = inFileName + ".ORF.map"
717 outFile = open(outFileName, "w")
718
719 bioseq = Bioseq()
720 bioseqNb = 0
721
722 inFile = open(inFileName)
723 bioseq.read(inFile)
724 while bioseq.sequence:
725 bioseq.upCase()
726 bioseqNb += 1
727 if verbose > 0:
728 print 'sequence num', bioseqNb, '=', bioseq.getLength(), '[', bioseq.header[0:40], '...]'
729
730 orf = bioseq.findORF()
731 bestOrf = []
732 for i in orf.keys():
733 orfLen = len(orf[i])
734 for j in xrange(1, orfLen):
735 start = orf[i][j - 1] + 4
736 end = orf[i][j] + 3
737 if end - start >= orfMinLength:
738 bestOrf.append((end - start, i + 1, start, end))
739
740 bioseq.reverseComplement()
741
742 orf = bioseq.findORF()
743 seqLen = bioseq.getLength()
744 for i in orf.keys():
745 orfLen = len(orf[i])
746 for j in xrange(1, orfLen):
747 start = seqLen - orf[i][j - 1] - 3
748 end = seqLen - orf[i][j] - 2
749 if start - end >= orfMinLength:
750 bestOrf.append((start - end, (i + 1) * -1, start, end))
751
752 bestOrf.sort(reverse = True)
753 bestOrfNb = len(bestOrf)
754 if orfMaxNb != 0 and orfMaxNb < bestOrfNb:
755 bestOrfNb = orfMaxNb
756 for i in xrange(0, bestOrfNb):
757 if verbose > 0:
758 print bestOrf[i]
759 outFile.write("%s\t%s\t%d\t%d\n" % ("ORF|" + str(bestOrf[i][1]) + \
760 "|" + str(bestOrf[i][0]), bioseq.header,
761 bestOrf[i][2], bestOrf[i][3]))
762 bioseq.read(inFile)
763
764 inFile.close()
765 outFile.close()
766
767 return 0
768
769
770 ## Sort sequences by increasing length (write a new file)
771 #
772 # @param inFileName string name of the input fasta file
773 # @param outFileName string name of the output fasta file
774 # @param verbose integer verbosity level
775 #
776 @staticmethod
777 def sortSequencesByIncreasingLength(inFileName, outFileName, verbose = 0):
778 if verbose > 0:
779 print "sort sequences by increasing length"
780 sys.stdout.flush()
781 if not os.path.exists(inFileName):
782 print "ERROR: file '%s' doesn't exist" % (inFileName)
783 sys.exit(1)
784
785 # read each seq one by one
786 # save them in distinct temporary files
787 # with their length in the name
788 inFileHandler = open(inFileName, "r")
789 countSeq = 0
790 bs = Bioseq()
791 bs.read(inFileHandler)
792 while bs.header:
793 countSeq += 1
794 tmpFile = "%ibp_%inb" % (bs.getLength(), countSeq)
795 bs.appendBioseqInFile(tmpFile)
796 if verbose > 1:
797 print "%s (%i bp) saved in '%s'" % (bs.header, bs.getLength(), tmpFile)
798 bs.header = ""
799 bs.sequence = ""
800 bs.read(inFileHandler)
801 inFileHandler.close()
802
803 # sort temporary file names
804 # concatenate them into the output file
805 if os.path.exists(outFileName):
806 os.remove(outFileName)
807 lFiles = glob.glob("*bp_*nb")
808 lFiles.sort(key = lambda s:int(s.split("bp_")[0]))
809 for fileName in lFiles:
810 cmd = "cat %s >> %s" % (fileName, outFileName)
811 returnValue = os.system(cmd)
812 if returnValue != 0:
813 print "ERROR while concatenating '%s' with '%s'" % (fileName, outFileName)
814 sys.exit(1)
815 os.remove(fileName)
816
817 return 0
818
819
820 ## Sort sequences by header
821 #
822 # @param inFileName string name of the input fasta file
823 # @param outFileName string name of the output fasta file
824 # @param verbose integer verbosity level
825 #
826 @staticmethod
827 def sortSequencesByHeader(inFileName, outFileName = ""):
828 if outFileName == "":
829 outFileName = "%s_sortByHeaders.fa" % os.path.splitext(inFileName)[0]
830 iBioseqDB = BioseqDB(inFileName)
831 with open(outFileName, "w") as f:
832 for header in sorted(iBioseqDB.getHeaderList()):
833 iBioseq = iBioseqDB.fetch(header)
834 iBioseq.write(f)
835
836
837 ## Return a dictionary which keys are the headers and values the length of the sequences
838 #
839 # @param inFile string name of the input fasta file
840 # @param verbose integer verbosity level
841 #
842 @staticmethod
843 def getLengthPerHeader(inFile, verbose = 0):
844 dHeader2Length = {}
845
846 with open(inFile) as inFileHandler:
847 currentSeqHeader = ""
848 currentSeqLength = 0
849 for line in inFileHandler:
850 if line[0] == ">":
851 if currentSeqHeader != "":
852 dHeader2Length[currentSeqHeader] = currentSeqLength
853 currentSeqLength = 0
854 currentSeqHeader = line[1:-1]
855 if verbose > 0:
856 print "current header: %s" % currentSeqHeader
857 sys.stdout.flush()
858 else:
859 currentSeqLength += len(line.replace("\n", ""))
860 dHeader2Length[currentSeqHeader] = currentSeqLength
861
862 return dHeader2Length
863
864
865 ## Convert headers from a fasta file having chunk coordinates
866 #
867 # @param inFile string name of the input fasta file
868 # @param mapFile string name of the map file with the coordinates of the chunks on the chromosomes
869 # @param outFile string name of the output file
870 #
871 @staticmethod
872 def convertFastaHeadersFromChkToChr(inFile, mapFile, outFile):
873 inFileHandler = open(inFile, "r")
874 outFileHandler = open(outFile, "w")
875 dChunk2Map = MapUtils.getDictPerNameFromMapFile(mapFile)
876 iConvCoord = ConvCoord()
877 for line in inFileHandler:
878 if line[0] == ">":
879 if "{Fragment}" in line:
880 chkName = line.split(" ")[1]
881 chrName = dChunk2Map[chkName].seqname
882 lCoordPairs = line.split(" ")[3].split(",")
883 lRangesOnChk = []
884 for i in lCoordPairs:
885 iRange = Range(chkName, int(i.split("..")[0]), int(i.split("..")[1]))
886 lRangesOnChk.append(iRange)
887 lRangesOnChr = []
888 for iRange in lRangesOnChk:
889 lRangesOnChr.append(iConvCoord.getRangeOnChromosome(iRange, dChunk2Map))
890 newHeader = line[1:-1].split(" ")[0]
891 newHeader += " %s" % chrName
892 newHeader += " {Fragment}"
893 newHeader += " %i..%i" % (lRangesOnChr[0].start, lRangesOnChr[0].end)
894 for iRange in lRangesOnChr[1:]:
895 newHeader += ",%i..%i" % (iRange.start, iRange.end)
896 outFileHandler.write(">%s\n" % newHeader)
897 else:
898 chkName = line.split("_")[1].split(" ")[0]
899 chrName = dChunk2Map[chkName].seqname
900 coords = line[line.find("[")+1 : line.find("]")]
901 start = int(coords.split(",")[0])
902 end = int(coords.split(",")[1])
903 iRangeOnChk = Range(chkName, start, end)
904 iRangeOnChr = iConvCoord.getRangeOnChromosome(iRangeOnChk, dChunk2Map)
905 newHeader = line[1:-1].split("_")[0]
906 newHeader += " %s" % chrName
907 newHeader += " %s" % line[line.find("(") : line.find(")")+1]
908 newHeader += " %i..%i" % (iRangeOnChr.getStart(), iRangeOnChr.getEnd())
909 outFileHandler.write(">%s\n" % newHeader)
910 else:
911 outFileHandler.write(line)
912 inFileHandler.close()
913 outFileHandler.close()
914
915
916 ## Convert a fasta file to a length file
917 #
918 # @param inFile string name of the input fasta file
919 # @param outFile string name of the output file
920 #
921 @staticmethod
922 def convertFastaToLength(inFile, outFile = ""):
923 if not outFile:
924 outFile = "%s.length" % inFile
925
926 if inFile:
927 with open(inFile) as inFH:
928 with open(outFile, "w") as outFH:
929 bioseq = Bioseq()
930 bioseq.read(inFH)
931 while bioseq.sequence:
932 seqLen = bioseq.getLength()
933 outFH.write("%s\t%d\n" % (bioseq.header.split()[0], seqLen))
934 bioseq.read(inFH)
935
936
937 ## Convert a fasta file to a seq file
938 #
939 # @param inFile string name of the input fasta file
940 # @param outFile string name of the output file
941 #
942 @staticmethod
943 def convertFastaToSeq(inFile, outFile = ""):
944 if not outFile:
945 outFile = "%s.seq" % inFile
946
947 if inFile:
948 with open(inFile) as inFH:
949 with open(outFile, "w") as outFH:
950 bioseq = Bioseq()
951 bioseq.read(inFH)
952 while bioseq.sequence:
953 seqLen = bioseq.getLength()
954 outFH.write("%s\t%s\t%s\t%d\n" % (bioseq.header.split()[0], \
955 bioseq.sequence, bioseq.header, seqLen))
956 bioseq.read(inFH)
957
958
959 ## Splice an input fasta file using coordinates in a Map file
960 #
961 # @note the coordinates should be merged beforehand!
962 #
963 @staticmethod
964 def spliceFromCoords(genomeFile, coordFile, obsFile):
965 genomeFileHandler = open(genomeFile)
966 obsFileHandler = open(obsFile, "w")
967 dChr2Maps = MapUtils.getDictPerSeqNameFromMapFile(coordFile)
968
969 bs = Bioseq()
970 bs.read(genomeFileHandler)
971 while bs.sequence:
972
973 if dChr2Maps.has_key(bs.header):
974 lCoords = MapUtils.getMapListSortedByIncreasingMinThenMax(dChr2Maps[bs.header])
975 splicedSeq = []
976 currentSite = 0
977 for iMap in lCoords:
978 minSplice = iMap.getMin() - 1
979 if minSplice > currentSite:
980 splicedSeq += bs.sequence[currentSite : minSplice]
981 if currentSite <= iMap.getMax():
982 currentSite = iMap.getMax()
983 splicedSeq += bs.sequence[currentSite:]
984 bs.sequence = "".join(splicedSeq)
985 bs.write(obsFileHandler)
986 bs.read(genomeFileHandler)
987
988 genomeFileHandler.close()
989 obsFileHandler.close()
990
991
992 ## Shuffle input sequences (single file or files in a directory)
993 #
994 @staticmethod
995 def dbShuffle(inData, outData, verbose = 0):
996 if CheckerUtils.isExecutableInUserPath("esl-shuffle"):
997 prg = "esl-shuffle"
998 else : prg = "shuffle"
999 genericCmd = prg + " --seed 1 -d INPUT > OUTPUT"
1000 if os.path.isfile(inData):
1001 if verbose > 0:
1002 print "shuffle input file '%s'" % inData
1003 cmd = genericCmd.replace("INPUT", inData).replace("OUTPUT", outData)
1004 print cmd
1005 returnStatus = os.system(cmd)
1006 if returnStatus:
1007 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
1008 sys.exit(1)
1009
1010 elif os.path.isdir(inData):
1011 if verbose > 0:
1012 print "shuffle files in input directory '%s'" % inData
1013 if os.path.exists(outData):
1014 shutil.rmtree(outData)
1015 os.mkdir(outData)
1016 lInputFiles = glob.glob("%s/*.fa" % inData)
1017 nbFastaFiles = 0
1018 for inputFile in lInputFiles:
1019 nbFastaFiles += 1
1020 if verbose > 1:
1021 print "%3i / %3i" % (nbFastaFiles, len(lInputFiles))
1022 fastaBaseName = os.path.basename(inputFile)
1023 prefix = os.path.splitext(fastaBaseName)[0]
1024 cmd = genericCmd.replace("INPUT", inputFile).replace("OUTPUT", "%s/%s_shuffle.fa" % (outData, prefix))
1025 returnStatus = os.system(cmd)
1026 if returnStatus:
1027 sys.stderr.write("ERROR: 'shuffle' returned '%i'\n" % returnStatus)
1028 sys.exit(1)
1029
1030
1031 ## Convert a cluster file (one line = one cluster = one headers list) into a fasta file with cluster info in headers
1032 #
1033 # @param inClusterFileName string input cluster file name
1034 # @param inFastaFileName string input fasta file name
1035 # @param outFileName string output file name
1036 # @param verbosity integer verbosity
1037 #
1038 @staticmethod
1039 def convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, outFileName, clusteringTool = "", verbosity = 0):
1040 dHeader2ClusterClusterMember, clusterIdForSingletonCluster = FastaUtils._createHeader2ClusterMemberDict(inClusterFileName, verbosity)
1041 iFastaParser = FastaParser(inFastaFileName)
1042 with open(outFileName, "w") as f:
1043 for iSequence in iFastaParser.getIterator():
1044
1045 header = iSequence.getName()
1046 if dHeader2ClusterClusterMember.get(header):
1047 cluster = dHeader2ClusterClusterMember[header][0]
1048 member = dHeader2ClusterClusterMember[header][1]
1049 else:
1050 clusterIdForSingletonCluster += 1
1051 cluster = clusterIdForSingletonCluster
1052 member = 1
1053
1054 newHeader = "%sCluster%sMb%s_%s" % (clusteringTool, cluster, member, header)
1055 iSequence.setName(newHeader)
1056 f.write(iSequence.printFasta())
1057
1058 @staticmethod
1059 def _createHeader2ClusterMemberDict(inClusterFileName, verbosity = 0):
1060 dHeader2ClusterClusterMember = {}
1061 clusterId = 0
1062 with open(inClusterFileName) as f:
1063 for line in f:
1064 lineWithoutLastChar = line.rstrip()
1065 lHeaders = lineWithoutLastChar.split("\t")
1066 clusterId += 1
1067 if verbosity > 0:
1068 print "%i sequences in cluster %i" % (len(lHeaders), clusterId)
1069 memberId = 0
1070 for header in lHeaders:
1071 memberId += 1
1072 dHeader2ClusterClusterMember[header] = (clusterId, memberId)
1073 if verbosity > 0:
1074 print "%i clusters" % clusterId
1075 return dHeader2ClusterClusterMember, clusterId
1076
1077 @staticmethod
1078 def convertClusteredFastaFileToMapFile(fastaFileNameFromClustering, outMapFileName = ""):
1079 """
1080 Write a map file from fasta output of clustering tool.
1081 Warning: only works if input fasta headers are formated like LTRharvest fasta output.
1082 """
1083 if not outMapFileName:
1084 outMapFileName = "%s.map" % (os.path.splitext(fastaFileNameFromClustering)[0])
1085
1086 fileDb = open(fastaFileNameFromClustering)
1087 fileMap = open(outMapFileName, "w")
1088 seq = Bioseq()
1089 numseq = 0
1090 seq.read(fileDb)
1091 while seq.sequence:
1092 numseq = numseq + 1
1093 ID = seq.header.split(' ')[0].split('_')[0]
1094 chunk = seq.header.split(' ')[0].split('_')[1]
1095 start = seq.header.split(' ')[-1].split(',')[0][1:]
1096 end = seq.header.split(' ')[-1].split(',')[1][:-1]
1097 line = '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
1098 fileMap.write("%s\n" % line)
1099 seq.read(fileDb)
1100
1101 fileDb.close()
1102 fileMap.close()
1103 print "saved in %s" % outMapFileName
1104
1105 @staticmethod
1106 def getNstretchesRangesList(fastaFileName, nbN = 2):
1107 lNstretchesRanges = []
1108 if nbN != 0:
1109 iBSDB = BioseqDB(fastaFileName)
1110 for iBS in iBSDB.db:
1111 nbNFound = 0
1112 start = 1
1113 pos = 1
1114 lastPos = 0
1115
1116 while pos <= iBS.getLength():
1117 if nbNFound == 0:
1118 start = pos
1119
1120 while pos <= iBS.getLength() and iBS.getNtFromPosition(pos).lower() in ['n', 'x']:
1121 nbNFound += 1
1122 pos += 1
1123 lastPos = pos
1124
1125 if pos - lastPos >= nbN:
1126 if nbNFound >= nbN:
1127 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
1128 nbNFound = 0
1129 pos += 1
1130
1131 if nbNFound >= nbN:
1132 lNstretchesRanges.append(Range(iBS.getHeader(), start, lastPos - 1))
1133
1134 lNstretchesRanges.sort(key = lambda iRange: (iRange.getSeqname(), iRange.getStart(), iRange.getEnd()), reverse = False)
1135
1136 return lNstretchesRanges
1137
1138
1139 @staticmethod
1140 def writeNstretches(fastaFileName, nbN = 2, outFileName = "", outFormat = "map"):
1141 lNstretchesRanges = FastaUtils.getNstretchesRangesList(fastaFileName, nbN)
1142
1143 outFormat = outFormat.lower()
1144 if outFormat in ["gff", "gff3"]:
1145 outFormat = "gff3"
1146 else:
1147 outFormat = "map"
1148
1149 if outFileName == "":
1150 outFileName = "%s_Nstretches.%s" % (os.path.splitext(os.path.split(fastaFileName)[1])[0], outFormat)
1151
1152 with open(outFileName, "w") as fH:
1153 if outFormat == "gff3":
1154 fH.write("##gff-version 3\n")
1155 for iRange in lNstretchesRanges:
1156 seq = iRange.getSeqname()
1157 start = iRange.getStart()
1158 end = iRange.getEnd()
1159 if outFormat == "gff3":
1160 fH.write("%s\tFastaUtils\tN_stretch\t%s\t%s\t.\t.\t.\tName=N_stretch_%s-%s\n" % (seq, start, end, start, end))
1161 else:
1162 fH.write("N_stretch\t%s\t%s\t%s\n" % (seq, start, end))
1163