Mercurial > repos > yufei-luo > s_mart
comparison smart_toolShed/commons/core/seq/BioseqDB.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0f8dcca02ed |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import sys | |
33 import re | |
34 from commons.core.seq.Bioseq import Bioseq | |
35 from commons.core.stat.Stat import Stat | |
36 | |
37 | |
38 ## Handle a collection of a Bioseq (header-sequence) | |
39 # | |
40 class BioseqDB( object ): | |
41 | |
42 def __init__( self, name="" ): | |
43 self.idx = {} | |
44 self.idx_renamed = {} | |
45 self.db = [] | |
46 self.name = name | |
47 if name != "": | |
48 faFile = open( name ) | |
49 self.read( faFile ) | |
50 faFile.close() | |
51 self.mean_seq_lgth = None | |
52 self.stat = Stat() | |
53 | |
54 | |
55 ## Equal operator | |
56 # | |
57 def __eq__( self, o ): | |
58 selfSize = self.getSize() | |
59 if selfSize != o.getSize(): | |
60 return False | |
61 nbEqualInstances = 0 | |
62 for i in self.db: | |
63 atLeastOneIsEqual = False | |
64 for j in o.db: | |
65 if i == j: | |
66 atLeastOneIsEqual = True | |
67 continue | |
68 if atLeastOneIsEqual: | |
69 nbEqualInstances += 1 | |
70 if nbEqualInstances == selfSize: | |
71 return True | |
72 return False | |
73 | |
74 | |
75 ## Change the name of the BioseqDB | |
76 # | |
77 # @param name the BioseqDB name | |
78 # | |
79 def setName(self, name): | |
80 self.name = name | |
81 | |
82 | |
83 ## Record each sequence of the input file as a list of Bioseq instances | |
84 # | |
85 # @param faFileHandler handler of a fasta file | |
86 # | |
87 def read( self, faFileHandler ): | |
88 while True: | |
89 seq = Bioseq() | |
90 seq.read( faFileHandler ) | |
91 if seq.sequence == None: | |
92 break | |
93 self.add( seq ) | |
94 | |
95 | |
96 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long) | |
97 # | |
98 # @param faFileHandler file handler of a fasta file | |
99 # | |
100 def write( self, faFileHandler ): | |
101 for bs in self.db: | |
102 bs.writeABioseqInAFastaFile( faFileHandler ) | |
103 | |
104 | |
105 ## Write all Bioseq of BioseqDB in a formatted fasta file (60 character long) | |
106 # | |
107 # @param outFaFileName file name of fasta file | |
108 # @param mode 'write' or 'append' | |
109 # | |
110 def save( self, outFaFileName, mode="w" ): | |
111 outFaFile = open( outFaFileName, mode ) | |
112 self.write( outFaFile ) | |
113 outFaFile.close() | |
114 | |
115 | |
116 ## Read a formatted fasta file and load it in the BioseqDB instance | |
117 # | |
118 # @param inFaFileName file name of fasta file | |
119 # | |
120 def load(self, inFaFileName): | |
121 fichier = open(inFaFileName) | |
122 self.read(fichier) | |
123 fichier.close() | |
124 | |
125 | |
126 ## Reverse each sequence of the collection | |
127 # | |
128 def reverse( self ): | |
129 for bs in self.db: | |
130 bs.reverse() | |
131 | |
132 | |
133 ## Turn each sequence into its complement | |
134 # | |
135 def complement( self ): | |
136 for bs in self.db: | |
137 bs.complement() | |
138 | |
139 | |
140 ## Reverse and complement each sequence | |
141 # | |
142 def reverseComplement( self ): | |
143 for bs in self.db: | |
144 bs.reverseComplement() | |
145 | |
146 | |
147 ## Set the collection from a list of Bioseq instances | |
148 # | |
149 def setData( self, lBioseqs ): | |
150 for i in lBioseqs: | |
151 self.add( i ) | |
152 | |
153 | |
154 ## Initialization of each attribute of the collection | |
155 # | |
156 def reset( self ): | |
157 self.db = [] | |
158 self.idx = {} | |
159 self.name = None | |
160 self.mean_seq_lgth = None | |
161 self.stat.reset() | |
162 | |
163 | |
164 ## Remove all the gap of the sequences of the collection | |
165 # | |
166 def cleanGap(self): | |
167 for iBioSeq in self.db: | |
168 iBioSeq.cleanGap() | |
169 | |
170 | |
171 ## Add a Bioseq instance and update the attributes | |
172 # | |
173 # @param bs a Bioseq instance | |
174 # | |
175 def add( self, bs ): | |
176 if self.idx.has_key( bs.header ): | |
177 sys.stderr.write( "ERROR: two sequences with same header '%s'\n" % ( bs.header ) ) | |
178 sys.exit(1) | |
179 self.db.append( bs ) | |
180 self.idx[ bs.header ] = len(self.db) - 1 | |
181 self.idx_renamed[ bs.header.replace("::","-").replace(":","-").replace(",","-").replace(" ","_") ] = len(self.db) - 1 | |
182 | |
183 | |
184 ## Give the Bioseq instance corresponding to specified index | |
185 # | |
186 # @return a Bioseq instance | |
187 # | |
188 def __getitem__(self,index): | |
189 if index < len(self.db): | |
190 return self.db[index] | |
191 | |
192 | |
193 ## Give the number of sequences in the bank | |
194 # | |
195 # @return an integer | |
196 # | |
197 def getSize( self ): | |
198 return len( self.db ) | |
199 | |
200 | |
201 ## Give the cumulative sequence length in the bank | |
202 # | |
203 # @return an integer | |
204 # | |
205 def getLength( self ): | |
206 cumLength = 0 | |
207 for iBioseq in self.db: | |
208 cumLength += iBioseq.getLength() | |
209 | |
210 return cumLength | |
211 | |
212 | |
213 ## Return the length of a given sequence via its header | |
214 # | |
215 # @return an integer | |
216 # | |
217 def getSeqLength( self, header ): | |
218 return self.fetch(header).getLength() | |
219 | |
220 | |
221 ## Return a list with the sequence headers | |
222 # | |
223 def getHeaderList( self ): | |
224 lHeaders = [] | |
225 for bs in self.db: | |
226 lHeaders.append( bs.header ) | |
227 return lHeaders | |
228 | |
229 | |
230 ## Return a list with the sequences | |
231 # | |
232 def getSequencesList( self ): | |
233 lSeqs = [] | |
234 for bs in self.db: | |
235 lSeqs.append( bs.getSequence() ) | |
236 return lSeqs | |
237 | |
238 | |
239 ## Give the Bioseq instance of the BioseqDB specified by its header | |
240 # | |
241 # @warning name of this method not appropriate getBioseqByHeader is proposed | |
242 # @param header string | |
243 # @return a Bioseq instance | |
244 # | |
245 def fetch( self, header ): | |
246 return self.db[self.idx[header]] | |
247 | |
248 | |
249 ## Give the Bioseq instance of the BioseqDB specified by its renamed header | |
250 # In renamed header "::", ":", "," character are been replaced by "-" and " " by "_" | |
251 # | |
252 # @param renamedHeader string | |
253 # @return a Bioseq instance | |
254 # | |
255 def getBioseqByRenamedHeader( self, renamedHeader ): | |
256 return self.db[self.idx_renamed[renamedHeader]] | |
257 | |
258 | |
259 ## Count the number of times the given nucleotide is present in the bank. | |
260 # | |
261 # @param nt character (nt or aa) | |
262 # @return an integer | |
263 # | |
264 def countNt( self, nt ): | |
265 total = 0 | |
266 for iBioseq in self.db: | |
267 total+= iBioseq.countNt( nt ) | |
268 return total | |
269 | |
270 | |
271 ## Count the number of times each nucleotide (A,T,G,C,N) is present in the bank. | |
272 # | |
273 # @return a dictionary with nucleotide as key and an integer as values | |
274 # | |
275 def countAllNt( self ): | |
276 dNt2Count = {} | |
277 for nt in ["A","T","G","C","N"]: | |
278 dNt2Count[ nt ] = self.countNt( nt ) | |
279 return dNt2Count | |
280 | |
281 | |
282 ## Extract a sub BioseqDB of specified size which beginning at specified start | |
283 # | |
284 # @param start integer index of first included Bioseq | |
285 # @param size integer size of expected BioseqDB | |
286 # @return a BioseqDB | |
287 # | |
288 def extractPart(self, start, size): | |
289 iShorterBioseqDB = BioseqDB() | |
290 for iBioseq in self.db[start:(start + size)]: | |
291 iShorterBioseqDB.add(iBioseq) | |
292 return iShorterBioseqDB | |
293 | |
294 | |
295 ## Extract a sub BioseqDB with the specified number of best length Bioseq | |
296 # | |
297 # @param numBioseq integer the number of Bioseq searched | |
298 # @return a BioseqDB | |
299 # | |
300 def bestLength(self, numBioseq): | |
301 length_list = [] | |
302 numseq = 0 | |
303 for each_seq in self.db: | |
304 if each_seq.sequence == None: | |
305 l=0 | |
306 else: | |
307 l = each_seq.getLength() | |
308 length_list.append(l) | |
309 numseq = numseq + 1 | |
310 | |
311 length_list.sort() | |
312 size = len(length_list) | |
313 if numBioseq < size: | |
314 len_min = length_list[size-numBioseq] | |
315 else: | |
316 len_min = length_list[0] | |
317 | |
318 numseq = 0 | |
319 nbsave = 0 | |
320 bestSeqs = BioseqDB() | |
321 bestSeqs.setName(self.name) | |
322 for each_seq in self.db: | |
323 if each_seq.sequence == None: | |
324 l=0 | |
325 else : | |
326 l = each_seq.getLength() | |
327 numseq = numseq + 1 | |
328 if l >= len_min: | |
329 bestSeqs.add(each_seq) | |
330 nbsave = nbsave + 1 | |
331 if nbsave == numBioseq : | |
332 break | |
333 return bestSeqs | |
334 | |
335 | |
336 ## Extract a sub BioseqDB from a file with Bioseq header containing the specified pattern | |
337 # | |
338 # @param pattern regular expression of wished Bioseq header | |
339 # @param inFileName name of fasta file in which we want extract the BioseqDB | |
340 # | |
341 def extractPatternOfFile(self, pattern, inFileName): | |
342 if pattern=="" : | |
343 return | |
344 srch=re.compile(pattern) | |
345 file_db=open(inFileName) | |
346 numseq=0 | |
347 nbsave=0 | |
348 while 1: | |
349 seq=Bioseq() | |
350 seq.read(file_db) | |
351 if seq.sequence==None: | |
352 break | |
353 numseq+=1 | |
354 m=srch.search(seq.header) | |
355 if m: | |
356 self.add(seq) | |
357 nbsave+=1 | |
358 file_db.close() | |
359 | |
360 | |
361 ## Extract a sub BioseqDB from the instance with all Bioseq header containing the specified pattern | |
362 # | |
363 # @param pattern regular expression of wished Bioseq header | |
364 # | |
365 # @return a BioseqDB | |
366 # | |
367 def getByPattern(self,pattern): | |
368 if pattern=="" : | |
369 return | |
370 iBioseqDB=BioseqDB() | |
371 srch=re.compile(pattern) | |
372 for iBioseq in self.db: | |
373 if srch.search(iBioseq.header): | |
374 iBioseqDB.add(iBioseq) | |
375 return iBioseqDB | |
376 | |
377 | |
378 ## Extract a sub BioseqDB from the instance with all Bioseq header not containing the specified pattern | |
379 # | |
380 # @param pattern regular expression of not wished Bioseq header | |
381 # | |
382 # @return a BioseqDB | |
383 # | |
384 def getDiffFromPattern(self,pattern): | |
385 if pattern=="" : | |
386 return | |
387 iBioseqDB=BioseqDB() | |
388 srch=re.compile(pattern) | |
389 for iBioseq in self.db: | |
390 if not srch.search(iBioseq.header): | |
391 iBioseqDB.add(iBioseq) | |
392 return iBioseqDB | |
393 | |
394 #TODO: to run several times to remove all concerned sequences when big data. How to fix it ? | |
395 ## Remove from the instance all Bioseq which header contains the specified pattern | |
396 # | |
397 # @param pattern regular expression of not wished Bioseq header | |
398 # | |
399 def rmByPattern(self,pattern): | |
400 if pattern=="" : | |
401 return | |
402 srch=re.compile(pattern) | |
403 for seq in self.db: | |
404 if srch.search(seq.header): | |
405 self.db.remove(seq) | |
406 | |
407 | |
408 ## Copy a part from another BioseqDB in the BioseqDB if Bioseq have got header containing the specified pattern | |
409 # | |
410 # @warning this method is called extractPattern in pyRepet.seq.BioseqDB | |
411 # | |
412 # @param pattern regular expression of wished Bioseq header | |
413 # @param sourceBioseqDB the BioseqDB from which we want extract Bioseq | |
414 # | |
415 def addBioseqFromABioseqDBIfHeaderContainPattern(self, pattern, sourceBioseqDB): | |
416 if pattern=="" : | |
417 return | |
418 srch=re.compile(pattern) | |
419 for seq in sourceBioseqDB.db: | |
420 m=srch.search(seq.header) | |
421 if m: | |
422 self.add(seq) | |
423 | |
424 | |
425 ## Up-case the sequence characters in all sequences | |
426 # | |
427 def upCase( self ): | |
428 for bs in self.db: | |
429 bs.upCase() | |
430 | |
431 | |
432 ## Split each gapped Bioseq in a list and store all in a dictionary | |
433 # | |
434 # @return a dict, keys are bioseq headers, values are list of Map instances | |
435 # | |
436 def getDictOfLMapsWithoutGaps( self ): | |
437 dSeq2Maps = {} | |
438 | |
439 for bs in self.db: | |
440 dSeq2Maps[ bs.header ] = bs.getLMapWhithoutGap() | |
441 | |
442 return dSeq2Maps | |
443 | |
444 ## Give the list of the sequence length in the bank | |
445 # | |
446 # @return an list | |
447 # | |
448 def getListOfSequencesLength( self ): | |
449 lLength = [] | |
450 for iBioseq in self.db: | |
451 lLength.append(iBioseq.getLength()) | |
452 | |
453 return lLength | |
454 | |
455 ## Return sequence length for a list of sequence header | |
456 # | |
457 def getSeqLengthByListOfName( self, lHeaderName ): | |
458 lseqLength=[] | |
459 for headerName in lHeaderName: | |
460 lseqLength.append(self.getSeqLength( headerName )) | |
461 return lseqLength |