comparison TEisotools-1.0/commons/core/seq/BioseqDB.py @ 6:20ec0d14798e draft

Uploaded
author urgi-team
date Wed, 20 Jul 2016 05:00:24 -0400
parents
children
comparison
equal deleted inserted replaced
5:4093a2fb58be 6:20ec0d14798e
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import sys
33 import re
34 from commons.core.seq.Bioseq import Bioseq
35 from commons.core.stat.Stat import Stat
36
37
38 ## Handle a collection of a Bioseq (header-sequence)
39 #
40 class BioseqDB( object ):
41
42 def __init__( self, name="" ):
43 self.idx = {}
44 self.idx_renamed = {}
45 self.db = []
46 self.name = name
47 if name != "":
48 faFile = open( name )
49 self.read( faFile )
50 faFile.close()
51 self.mean_seq_lgth = None
52 self.stat = Stat()
53
54
55 ## Equal operator
56 #
57 def __eq__( self, o ):
58 if type(o) is type(self):
59 selfSize = self.getSize()
60 if selfSize != o.getSize():
61 return False
62 nbEqualInstances = 0
63 for i in self.db:
64 atLeastOneIsEqual = False
65 for j in o.db:
66 if i == j:
67 atLeastOneIsEqual = True
68 continue
69 if atLeastOneIsEqual:
70 nbEqualInstances += 1
71 if nbEqualInstances == selfSize:
72 return True
73 return False
74
75 ## Not equal operator
76 #
77 def __ne__(self, o):
78 return not self.__eq__(o)
79
80 ## Change the name of the BioseqDB
81 #
82 # @param name the BioseqDB name
83 #
84 def setName(self, name):
85 self.name = name
86
87
88 ## Record each sequence of the input file as a list of Bioseq instances
89 #
90 # @param faFileHandler handler of a fasta file
91 #
92 def read( self, faFileHandler ):
93 while True:
94 seq = Bioseq()
95 seq.read( faFileHandler )
96 if seq.sequence == None:
97 break
98 self.add( seq )
99
100
101 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
102 #
103 # @param faFileHandler file handler of a fasta file
104 #
105 def write( self, faFileHandler ):
106 for bs in self.db:
107 bs.writeABioseqInAFastaFile( faFileHandler )
108
109
110 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long)
111 #
112 # @param outFaFileName file name of fasta file
113 # @param mode 'write' or 'append'
114 #
115 def save( self, outFaFileName, mode="w" ):
116 outFaFile = open( outFaFileName, mode )
117 self.write( outFaFile )
118 outFaFile.close()
119
120
121 ## Read a formatted fasta file and load it in the BioseqDB instance
122 #
123 # @param inFaFileName file name of fasta file
124 #
125 def load(self, inFaFileName):
126 fichier = open(inFaFileName)
127 self.read(fichier)
128 fichier.close()
129
130
131 ## Reverse each sequence of the collection
132 #
133 def reverse( self ):
134 for bs in self.db:
135 bs.reverse()
136
137
138 ## Turn each sequence into its complement
139 #
140 def complement( self ):
141 for bs in self.db:
142 bs.complement()
143
144
145 ## Reverse and complement each sequence
146 #
147 def reverseComplement( self ):
148 for bs in self.db:
149 bs.reverseComplement()
150
151
152 ## Set the collection from a list of Bioseq instances
153 #
154 def setData( self, lBioseqs ):
155 for i in lBioseqs:
156 self.add( i )
157
158
159 ## Initialization of each attribute of the collection
160 #
161 def reset( self ):
162 self.db = []
163 self.idx = {}
164 self.name = None
165 self.mean_seq_lgth = None
166 self.stat.reset()
167
168
169 ## Remove all the gap of the sequences of the collection
170 #
171 def cleanGap(self):
172 for iBioSeq in self.db:
173 iBioSeq.cleanGap()
174
175
176 ## Add a Bioseq instance and update the attributes
177 #
178 # @param bs a Bioseq instance
179 #
180 def add( self, bs ):
181 if self.idx.has_key( bs.header ):
182 sys.stderr.write( "ERROR: two sequences with same header '%s'\n" % ( bs.header ) )
183 sys.exit(1)
184 self.db.append( bs )
185 self.idx[ bs.header ] = len(self.db) - 1
186 self.idx_renamed[ bs.header.replace("::","-").replace(":","-").replace(",","-").replace(" ","_") ] = len(self.db) - 1
187
188
189 ## Give the Bioseq instance corresponding to specified index
190 #
191 # @return a Bioseq instance
192 #
193 def __getitem__(self,index):
194 if index < len(self.db):
195 return self.db[index]
196
197
198 ## Give the number of sequences in the bank
199 #
200 # @return an integer
201 #
202 def getSize( self ):
203 return len( self.db )
204
205
206 ## Give the cumulative sequence length in the bank
207 #
208 # @return an integer
209 #
210 def getLength( self ):
211 cumLength = 0
212 for iBioseq in self.db:
213 cumLength += iBioseq.getLength()
214
215 return cumLength
216
217
218 ## Return the length of a given sequence via its header
219 #
220 # @return an integer
221 #
222 def getSeqLength( self, header ):
223 return self.fetch(header).getLength()
224
225
226 ## Return a list with the sequence headers
227 #
228 def getHeaderList( self ):
229 lHeaders = []
230 for bs in self.db:
231 lHeaders.append( bs.header )
232 return lHeaders
233
234
235 ## Return a list with the sequences
236 #
237 def getSequencesList( self ):
238 lSeqs = []
239 for bs in self.db:
240 lSeqs.append( bs.getSequence() )
241 return lSeqs
242
243
244 ## Give the Bioseq instance of the BioseqDB specified by its header
245 #
246 # @warning name of this method not appropriate getBioseqByHeader is proposed
247 # @param header string
248 # @return a Bioseq instance
249 #
250 def fetch( self, header ):
251 idx = self.idx.get(header,None)
252 if idx is not None:
253 return self.db[idx]
254 else:
255 idx = self.idx_renamed.get(header,None)
256 if idx is not None:
257 return self.db[idx]
258 else:
259 raise Exception("Header: "+header+" not found")
260
261
262 ## Get a list of Bioseq instances based on a list of headers
263 #
264 # @param lHeader list
265 # @return a list of Bioseq instances
266 #
267 def fetchList( self, lheader ):
268 result = []
269 for headerName in lheader:
270 result.append(self.fetch( headerName ))
271 return result
272
273
274 ## Sort self on its Bioseq size, possibly by decreasing length
275 #
276 # @param reverse boolean
277 #
278 def sortByLength(self, reverse = False):
279 self.db.sort(key = lambda iBS: iBS.getLength(), reverse = reverse)
280
281
282 ## Give the Bioseq instance of the BioseqDB specified by its renamed header
283 # In renamed header "::", ":", "," character are been replaced by "-" and " " by "_"
284 #
285 # @param renamedHeader string
286 # @return a Bioseq instance
287 #
288 def getBioseqByRenamedHeader( self, renamedHeader ):
289 return self.db[self.idx_renamed[renamedHeader]]
290
291
292 ## Count the number of times the given nucleotide is present in the bank.
293 #
294 # @param nt character (nt or aa)
295 # @return an integer
296 #
297 def countNt( self, nt ):
298 total = 0
299 for iBioseq in self.db:
300 total+= iBioseq.countNt( nt )
301 return total
302
303
304 ## Count the number of times each nucleotide (A,T,G,C,N) is present in the bank.
305 #
306 # @return a dictionary with nucleotide as key and an integer as values
307 #
308 def countAllNt( self ):
309 dNt2Count = {}
310 for nt in ["A","T","G","C","N"]:
311 dNt2Count[ nt ] = self.countNt( nt )
312 return dNt2Count
313
314
315 ## Extract a sub BioseqDB of specified size which beginning at specified start
316 #
317 # @param start integer index of first included Bioseq
318 # @param size integer size of expected BioseqDB
319 # @return a BioseqDB
320 #
321 def extractPart(self, start, size):
322 iShorterBioseqDB = BioseqDB()
323 for iBioseq in self.db[start:(start + size)]:
324 iShorterBioseqDB.add(iBioseq)
325 return iShorterBioseqDB
326
327
328 ## Extract a sub BioseqDB with the specified number of best length Bioseq
329 #
330 # @param numBioseq integer the number of Bioseq searched
331 # @return a BioseqDB
332 #
333 def bestLength(self, numBioseq):
334 length_list = []
335 numseq = 0
336 for each_seq in self.db:
337 if each_seq.sequence == None:
338 l=0
339 else:
340 l = each_seq.getLength()
341 length_list.append(l)
342 numseq = numseq + 1
343
344 length_list.sort()
345 size = len(length_list)
346 if numBioseq < size:
347 len_min = length_list[size-numBioseq]
348 else:
349 len_min = length_list[0]
350
351 numseq = 0
352 nbsave = 0
353 bestSeqs = BioseqDB()
354 bestSeqs.setName(self.name)
355 for each_seq in self.db:
356 if each_seq.sequence == None:
357 l=0
358 else :
359 l = each_seq.getLength()
360 numseq = numseq + 1
361 if l >= len_min:
362 bestSeqs.add(each_seq)
363 nbsave = nbsave + 1
364 if nbsave == numBioseq :
365 break
366 return bestSeqs
367
368
369 ## Extract a sub BioseqDB from a file with Bioseq header containing the specified pattern
370 #
371 # @param pattern regular expression of wished Bioseq header
372 # @param inFileName name of fasta file in which we want extract the BioseqDB
373 #
374 def extractPatternOfFile(self, pattern, inFileName):
375 if pattern=="" :
376 return
377 srch=re.compile(pattern)
378 file_db=open(inFileName)
379 numseq=0
380 nbsave=0
381 while 1:
382 seq=Bioseq()
383 seq.read(file_db)
384 if seq.sequence==None:
385 break
386 numseq+=1
387 m=srch.search(seq.header)
388 if m:
389 self.add(seq)
390 nbsave+=1
391 file_db.close()
392
393
394 ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern
395 #
396 # @param pattern regular expression of wished Bioseq header
397 #
398 # @return a BioseqDB
399 #
400 def getByPattern(self,pattern):
401 if pattern=="" :
402 return
403 iBioseqDB=BioseqDB()
404 srch=re.compile(pattern)
405 for iBioseq in self.db:
406 if srch.search(iBioseq.header):
407 iBioseqDB.add(iBioseq)
408 return iBioseqDB
409
410
411 ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern
412 #
413 # @param pattern regular expression of not wished Bioseq header
414 #
415 # @return a BioseqDB
416 #
417 def getDiffFromPattern(self,pattern):
418 if pattern=="" :
419 return
420 iBioseqDB=BioseqDB()
421 srch=re.compile(pattern)
422 for iBioseq in self.db:
423 if not srch.search(iBioseq.header):
424 iBioseqDB.add(iBioseq)
425 return iBioseqDB
426
427 #TODO: to run several times to remove all concerned sequences when big data. How to fix it ?
428 ## Remove from the instance all Bioseq which header contains the specified pattern
429 #
430 # @param pattern regular expression of not wished Bioseq header
431 #
432 def rmByPattern(self,pattern):
433 if pattern=="" :
434 return
435 srch=re.compile(pattern)
436 for seq in self.db:
437 if srch.search(seq.header):
438 self.db.remove(seq)
439
440
441 ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern
442 #
443 # @warning this method is called extractPattern in pyRepet.seq.BioseqDB
444 #
445 # @param pattern regular expression of wished Bioseq header
446 # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq
447 #
448 def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB):
449 if pattern=="" :
450 return
451 srch=re.compile(pattern)
452 for seq in sourceBioseqDB.db:
453 m=srch.search(seq.header)
454 if m:
455 self.add(seq)
456
457
458 ## Up-case the sequence characters in all sequences
459 #
460 def upCase( self ):
461 for bs in self.db:
462 bs.upCase()
463
464
465 ## Split each gapped Bioseq in a list and store all in a dictionary
466 #
467 # @return a dict, keys are bioseq headers, values are list of Map instances
468 #
469 def getDictOfLMapsWithoutGaps( self ):
470 dSeq2Maps = {}
471
472 for bs in self.db:
473 dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap()
474
475 return dSeq2Maps
476
477 ## Give the list of the sequence length in the bank
478 #
479 # @return an list
480 #
481 def getListOfSequencesLength( self ):
482 lLength = []
483 for iBioseq in self.db:
484 lLength.append(iBioseq.getLength())
485
486 return lLength
487
488 ## Return sequence length for a list of sequence header
489 #
490 def getSeqLengthByListOfName( self, lHeaderName ):
491 lseqLength=[]
492 for headerName in lHeaderName:
493 lseqLength.append(self.getSeqLength( headerName ))
494 return lseqLength