Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/seq/BioseqDB.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:4093a2fb58be | 6:20ec0d14798e |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import sys | |
33 import re | |
34 from commons.core.seq.Bioseq import Bioseq | |
35 from commons.core.stat.Stat import Stat | |
36 | |
37 | |
38 ## Handle a collection of a Bioseq (header-sequence) | |
39 # | |
40 class BioseqDB( object ): | |
41 | |
42 def __init__( self, name="" ): | |
43 self.idx = {} | |
44 self.idx_renamed = {} | |
45 self.db = [] | |
46 self.name = name | |
47 if name != "": | |
48 faFile = open( name ) | |
49 self.read( faFile ) | |
50 faFile.close() | |
51 self.mean_seq_lgth = None | |
52 self.stat = Stat() | |
53 | |
54 | |
55 ## Equal operator | |
56 # | |
57 def __eq__( self, o ): | |
58 if type(o) is type(self): | |
59 selfSize = self.getSize() | |
60 if selfSize != o.getSize(): | |
61 return False | |
62 nbEqualInstances = 0 | |
63 for i in self.db: | |
64 atLeastOneIsEqual = False | |
65 for j in o.db: | |
66 if i == j: | |
67 atLeastOneIsEqual = True | |
68 continue | |
69 if atLeastOneIsEqual: | |
70 nbEqualInstances += 1 | |
71 if nbEqualInstances == selfSize: | |
72 return True | |
73 return False | |
74 | |
75 ## Not equal operator | |
76 # | |
77 def __ne__(self, o): | |
78 return not self.__eq__(o) | |
79 | |
80 ## Change the name of the BioseqDB | |
81 # | |
82 # @param name the BioseqDB name | |
83 # | |
84 def setName(self, name): | |
85 self.name = name | |
86 | |
87 | |
88 ## Record each sequence of the input file as a list of Bioseq instances | |
89 # | |
90 # @param faFileHandler handler of a fasta file | |
91 # | |
92 def read( self, faFileHandler ): | |
93 while True: | |
94 seq = Bioseq() | |
95 seq.read( faFileHandler ) | |
96 if seq.sequence == None: | |
97 break | |
98 self.add( seq ) | |
99 | |
100 | |
101 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long) | |
102 # | |
103 # @param faFileHandler file handler of a fasta file | |
104 # | |
105 def write( self, faFileHandler ): | |
106 for bs in self.db: | |
107 bs.writeABioseqInAFastaFile( faFileHandler ) | |
108 | |
109 | |
110 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long) | |
111 # | |
112 # @param outFaFileName file name of fasta file | |
113 # @param mode 'write' or 'append' | |
114 # | |
115 def save( self, outFaFileName, mode="w" ): | |
116 outFaFile = open( outFaFileName, mode ) | |
117 self.write( outFaFile ) | |
118 outFaFile.close() | |
119 | |
120 | |
121 ## Read a formatted fasta file and load it in the BioseqDB instance | |
122 # | |
123 # @param inFaFileName file name of fasta file | |
124 # | |
125 def load(self, inFaFileName): | |
126 fichier = open(inFaFileName) | |
127 self.read(fichier) | |
128 fichier.close() | |
129 | |
130 | |
131 ## Reverse each sequence of the collection | |
132 # | |
133 def reverse( self ): | |
134 for bs in self.db: | |
135 bs.reverse() | |
136 | |
137 | |
138 ## Turn each sequence into its complement | |
139 # | |
140 def complement( self ): | |
141 for bs in self.db: | |
142 bs.complement() | |
143 | |
144 | |
145 ## Reverse and complement each sequence | |
146 # | |
147 def reverseComplement( self ): | |
148 for bs in self.db: | |
149 bs.reverseComplement() | |
150 | |
151 | |
152 ## Set the collection from a list of Bioseq instances | |
153 # | |
154 def setData( self, lBioseqs ): | |
155 for i in lBioseqs: | |
156 self.add( i ) | |
157 | |
158 | |
159 ## Initialization of each attribute of the collection | |
160 # | |
161 def reset( self ): | |
162 self.db = [] | |
163 self.idx = {} | |
164 self.name = None | |
165 self.mean_seq_lgth = None | |
166 self.stat.reset() | |
167 | |
168 | |
169 ## Remove all the gap of the sequences of the collection | |
170 # | |
171 def cleanGap(self): | |
172 for iBioSeq in self.db: | |
173 iBioSeq.cleanGap() | |
174 | |
175 | |
176 ## Add a Bioseq instance and update the attributes | |
177 # | |
178 # @param bs a Bioseq instance | |
179 # | |
180 def add( self, bs ): | |
181 if self.idx.has_key( bs.header ): | |
182 sys.stderr.write( "ERROR: two sequences with same header '%s'\n" % ( bs.header ) ) | |
183 sys.exit(1) | |
184 self.db.append( bs ) | |
185 self.idx[ bs.header ] = len(self.db) - 1 | |
186 self.idx_renamed[ bs.header.replace("::","-").replace(":","-").replace(",","-").replace(" ","_") ] = len(self.db) - 1 | |
187 | |
188 | |
189 ## Give the Bioseq instance corresponding to specified index | |
190 # | |
191 # @return a Bioseq instance | |
192 # | |
193 def __getitem__(self,index): | |
194 if index < len(self.db): | |
195 return self.db[index] | |
196 | |
197 | |
198 ## Give the number of sequences in the bank | |
199 # | |
200 # @return an integer | |
201 # | |
202 def getSize( self ): | |
203 return len( self.db ) | |
204 | |
205 | |
206 ## Give the cumulative sequence length in the bank | |
207 # | |
208 # @return an integer | |
209 # | |
210 def getLength( self ): | |
211 cumLength = 0 | |
212 for iBioseq in self.db: | |
213 cumLength += iBioseq.getLength() | |
214 | |
215 return cumLength | |
216 | |
217 | |
218 ## Return the length of a given sequence via its header | |
219 # | |
220 # @return an integer | |
221 # | |
222 def getSeqLength( self, header ): | |
223 return self.fetch(header).getLength() | |
224 | |
225 | |
226 ## Return a list with the sequence headers | |
227 # | |
228 def getHeaderList( self ): | |
229 lHeaders = [] | |
230 for bs in self.db: | |
231 lHeaders.append( bs.header ) | |
232 return lHeaders | |
233 | |
234 | |
235 ## Return a list with the sequences | |
236 # | |
237 def getSequencesList( self ): | |
238 lSeqs = [] | |
239 for bs in self.db: | |
240 lSeqs.append( bs.getSequence() ) | |
241 return lSeqs | |
242 | |
243 | |
244 ## Give the Bioseq instance of the BioseqDB specified by its header | |
245 # | |
246 # @warning name of this method not appropriate getBioseqByHeader is proposed | |
247 # @param header string | |
248 # @return a Bioseq instance | |
249 # | |
250 def fetch( self, header ): | |
251 idx = self.idx.get(header,None) | |
252 if idx is not None: | |
253 return self.db[idx] | |
254 else: | |
255 idx = self.idx_renamed.get(header,None) | |
256 if idx is not None: | |
257 return self.db[idx] | |
258 else: | |
259 raise Exception("Header: "+header+" not found") | |
260 | |
261 | |
262 ## Get a list of Bioseq instances based on a list of headers | |
263 # | |
264 # @param lHeader list | |
265 # @return a list of Bioseq instances | |
266 # | |
267 def fetchList( self, lheader ): | |
268 result = [] | |
269 for headerName in lheader: | |
270 result.append(self.fetch( headerName )) | |
271 return result | |
272 | |
273 | |
274 ## Sort self on its Bioseq size, possibly by decreasing length | |
275 # | |
276 # @param reverse boolean | |
277 # | |
278 def sortByLength(self, reverse = False): | |
279 self.db.sort(key = lambda iBS: iBS.getLength(), reverse = reverse) | |
280 | |
281 | |
282 ## Give the Bioseq instance of the BioseqDB specified by its renamed header | |
283 # In renamed header "::", ":", "," character are been replaced by "-" and " " by "_" | |
284 # | |
285 # @param renamedHeader string | |
286 # @return a Bioseq instance | |
287 # | |
288 def getBioseqByRenamedHeader( self, renamedHeader ): | |
289 return self.db[self.idx_renamed[renamedHeader]] | |
290 | |
291 | |
292 ## Count the number of times the given nucleotide is present in the bank. | |
293 # | |
294 # @param nt character (nt or aa) | |
295 # @return an integer | |
296 # | |
297 def countNt( self, nt ): | |
298 total = 0 | |
299 for iBioseq in self.db: | |
300 total+= iBioseq.countNt( nt ) | |
301 return total | |
302 | |
303 | |
304 ## Count the number of times each nucleotide (A,T,G,C,N) is present in the bank. | |
305 # | |
306 # @return a dictionary with nucleotide as key and an integer as values | |
307 # | |
308 def countAllNt( self ): | |
309 dNt2Count = {} | |
310 for nt in ["A","T","G","C","N"]: | |
311 dNt2Count[ nt ] = self.countNt( nt ) | |
312 return dNt2Count | |
313 | |
314 | |
315 ## Extract a sub BioseqDB of specified size which beginning at specified start | |
316 # | |
317 # @param start integer index of first included Bioseq | |
318 # @param size integer size of expected BioseqDB | |
319 # @return a BioseqDB | |
320 # | |
321 def extractPart(self, start, size): | |
322 iShorterBioseqDB = BioseqDB() | |
323 for iBioseq in self.db[start:(start + size)]: | |
324 iShorterBioseqDB.add(iBioseq) | |
325 return iShorterBioseqDB | |
326 | |
327 | |
328 ## Extract a sub BioseqDB with the specified number of best length Bioseq | |
329 # | |
330 # @param numBioseq integer the number of Bioseq searched | |
331 # @return a BioseqDB | |
332 # | |
333 def bestLength(self, numBioseq): | |
334 length_list = [] | |
335 numseq = 0 | |
336 for each_seq in self.db: | |
337 if each_seq.sequence == None: | |
338 l=0 | |
339 else: | |
340 l = each_seq.getLength() | |
341 length_list.append(l) | |
342 numseq = numseq + 1 | |
343 | |
344 length_list.sort() | |
345 size = len(length_list) | |
346 if numBioseq < size: | |
347 len_min = length_list[size-numBioseq] | |
348 else: | |
349 len_min = length_list[0] | |
350 | |
351 numseq = 0 | |
352 nbsave = 0 | |
353 bestSeqs = BioseqDB() | |
354 bestSeqs.setName(self.name) | |
355 for each_seq in self.db: | |
356 if each_seq.sequence == None: | |
357 l=0 | |
358 else : | |
359 l = each_seq.getLength() | |
360 numseq = numseq + 1 | |
361 if l >= len_min: | |
362 bestSeqs.add(each_seq) | |
363 nbsave = nbsave + 1 | |
364 if nbsave == numBioseq : | |
365 break | |
366 return bestSeqs | |
367 | |
368 | |
369 ## Extract a sub BioseqDB from a file with Bioseq header containing the specified pattern | |
370 # | |
371 # @param pattern regular expression of wished Bioseq header | |
372 # @param inFileName name of fasta file in which we want extract the BioseqDB | |
373 # | |
374 def extractPatternOfFile(self, pattern, inFileName): | |
375 if pattern=="" : | |
376 return | |
377 srch=re.compile(pattern) | |
378 file_db=open(inFileName) | |
379 numseq=0 | |
380 nbsave=0 | |
381 while 1: | |
382 seq=Bioseq() | |
383 seq.read(file_db) | |
384 if seq.sequence==None: | |
385 break | |
386 numseq+=1 | |
387 m=srch.search(seq.header) | |
388 if m: | |
389 self.add(seq) | |
390 nbsave+=1 | |
391 file_db.close() | |
392 | |
393 | |
394 ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern | |
395 # | |
396 # @param pattern regular expression of wished Bioseq header | |
397 # | |
398 # @return a BioseqDB | |
399 # | |
400 def getByPattern(self,pattern): | |
401 if pattern=="" : | |
402 return | |
403 iBioseqDB=BioseqDB() | |
404 srch=re.compile(pattern) | |
405 for iBioseq in self.db: | |
406 if srch.search(iBioseq.header): | |
407 iBioseqDB.add(iBioseq) | |
408 return iBioseqDB | |
409 | |
410 | |
411 ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern | |
412 # | |
413 # @param pattern regular expression of not wished Bioseq header | |
414 # | |
415 # @return a BioseqDB | |
416 # | |
417 def getDiffFromPattern(self,pattern): | |
418 if pattern=="" : | |
419 return | |
420 iBioseqDB=BioseqDB() | |
421 srch=re.compile(pattern) | |
422 for iBioseq in self.db: | |
423 if not srch.search(iBioseq.header): | |
424 iBioseqDB.add(iBioseq) | |
425 return iBioseqDB | |
426 | |
427 #TODO: to run several times to remove all concerned sequences when big data. How to fix it ? | |
428 ## Remove from the instance all Bioseq which header contains the specified pattern | |
429 # | |
430 # @param pattern regular expression of not wished Bioseq header | |
431 # | |
432 def rmByPattern(self,pattern): | |
433 if pattern=="" : | |
434 return | |
435 srch=re.compile(pattern) | |
436 for seq in self.db: | |
437 if srch.search(seq.header): | |
438 self.db.remove(seq) | |
439 | |
440 | |
441 ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern | |
442 # | |
443 # @warning this method is called extractPattern in pyRepet.seq.BioseqDB | |
444 # | |
445 # @param pattern regular expression of wished Bioseq header | |
446 # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq | |
447 # | |
448 def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB): | |
449 if pattern=="" : | |
450 return | |
451 srch=re.compile(pattern) | |
452 for seq in sourceBioseqDB.db: | |
453 m=srch.search(seq.header) | |
454 if m: | |
455 self.add(seq) | |
456 | |
457 | |
458 ## Up-case the sequence characters in all sequences | |
459 # | |
460 def upCase( self ): | |
461 for bs in self.db: | |
462 bs.upCase() | |
463 | |
464 | |
465 ## Split each gapped Bioseq in a list and store all in a dictionary | |
466 # | |
467 # @return a dict, keys are bioseq headers, values are list of Map instances | |
468 # | |
469 def getDictOfLMapsWithoutGaps( self ): | |
470 dSeq2Maps = {} | |
471 | |
472 for bs in self.db: | |
473 dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap() | |
474 | |
475 return dSeq2Maps | |
476 | |
477 ## Give the list of the sequence length in the bank | |
478 # | |
479 # @return an list | |
480 # | |
481 def getListOfSequencesLength( self ): | |
482 lLength = [] | |
483 for iBioseq in self.db: | |
484 lLength.append(iBioseq.getLength()) | |
485 | |
486 return lLength | |
487 | |
488 ## Return sequence length for a list of sequence header | |
489 # | |
490 def getSeqLengthByListOfName( self, lHeaderName ): | |
491 lseqLength=[] | |
492 for headerName in lHeaderName: | |
493 lseqLength.append(self.getSeqLength( headerName )) | |
494 return lseqLength |