comparison smart_toolShed/commons/core/seq/test/Test_AlignedBioseqDB.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import unittest
33 import sys
34 import os
35 import time
36 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
37 from commons.core.seq.Bioseq import Bioseq
38 from commons.core.utils.FileUtils import FileUtils
39 from commons.core.coord.Align import Align
40 from commons.core.coord.Range import Range
41 from commons.core.stat.Stat import Stat
42
43
44 class Test_AlignedBioseqDB( unittest.TestCase ):
45
46 def setUp( self ):
47 self._i = AlignedBioseqDB()
48 self._uniqId = "%s_%s" % ( time.strftime("%Y%m%d%H%M%S") , os.getpid() )
49
50
51 def tearDown( self ):
52 self._i = None
53 self._uniqId = ""
54
55
56 def test_getLength(self):
57 iAlignedBioseqDB = AlignedBioseqDB()
58
59 iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" )
60 iAlignedBioseqDB.setData([iBioseq1])
61
62 expLenght = 29
63 obsLength = iAlignedBioseqDB.getLength()
64
65 self.assertEquals(expLenght, obsLength)
66
67
68 def test_getSeqLengthWithoutGaps( self ):
69 iAlignedBioseqDB = AlignedBioseqDB()
70 iAlignedBioseqDB.add( Bioseq( "seq3",
71 "AGCG-GACGATGCAGCAT--GCGAATGA--CGAT" ) )
72 expLenght = 29
73 obsLength = iAlignedBioseqDB.getSeqLengthWithoutGaps( "seq3" )
74
75 self.assertEquals(expLenght, obsLength)
76
77
78 def test_getListOccPerSite(self):
79 iBioseq1 = Bioseq( "seq1", "AGAAA")
80 iBioseq2 = Bioseq( "seq2", "TCAAG")
81 iBioseq3 = Bioseq( "seq3", "GGTAC")
82 iBioseq4 = Bioseq( "seq4", "CCTTA")
83
84 iAlignedBioseqDB = AlignedBioseqDB()
85 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
86
87 expList = [
88
89 {"A":1, "T":1, "G":1, "C":1},
90
91 {"G":2, "C":2},
92
93 {"A":2, "T":2 },
94
95 {"A":3, "T":1 },
96
97 {"A":2, "G":1, "C":1}
98 ]
99
100 obsList = iAlignedBioseqDB.getListOccPerSite()
101
102 self.assertEquals(expList, obsList)
103
104
105 def test_getListOccPerSite_with_none_sequence(self):
106 iBioseq1 = Bioseq( "seq1", "AGAAA")
107 iBioseq2 = Bioseq( "seq2", "TCAAG")
108 iBioseq3 = Bioseq( "seq3", "GGTAC")
109 iBioseq4 = Bioseq( "seq4", None)
110 iBioseq5 = Bioseq( "seq5", "CCTTA")
111
112 iAlignedBioseqDB = AlignedBioseqDB()
113 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4, iBioseq5])
114
115 expList = [
116 {"A":1, "T":1, "G":1},
117 {"G":2, "C":1},
118 {"A":2, "T":1 },
119 {"A":3},
120 {"A":1, "G":1, "C":1},
121 ]
122
123 obsList = iAlignedBioseqDB.getListOccPerSite()
124
125 self.assertEquals(expList, obsList)
126
127
128 def test_getListOccPerSite_on_three_sequence(self):
129 iBioseq1 = Bioseq( "seq1", "AGAAA")
130 iBioseq2 = Bioseq( "seq2", "TCAAG")
131 iBioseq3 = Bioseq( "seq3", "GGTAC")
132
133 iAlignedBioseqDB = AlignedBioseqDB()
134 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
135
136 expList = [
137 {"A":1, "T":1, "G":1},
138 {"G":2, "C":1},
139 {"A":2, "T":1 },
140 {"A":3},
141 {"A":1, "G":1, "C":1},
142 ]
143
144 obsList = iAlignedBioseqDB.getListOccPerSite()
145
146 self.assertEquals(expList, obsList)
147
148
149 def test_getConsensus_with_minNbNt_greater_than_nbInSeq(self):
150 iBioseq1 = Bioseq("seq1", "AGAT")
151 iBioseq2 = Bioseq("seq2", "TGCA")
152 iBioseq3 = Bioseq("seq3", "TACT")
153
154 iAlignedBioseqDB = AlignedBioseqDB()
155 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
156
157 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT")
158 obsConsensus = iAlignedBioseqDB.getConsensus(5)
159
160 self.assertEquals(expConsensus, obsConsensus)
161
162
163 def test_getConsensus_with_minPropNt_greater_than_1(self):
164 isSysExitRaised = False
165
166 iBioseq1 = Bioseq("seq1", "AGAT")
167 iBioseq2 = Bioseq("seq2", "TGCA")
168 iBioseq3 = Bioseq("seq3", "TACT")
169
170 iAlignedBioseqDB = AlignedBioseqDB()
171 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
172
173 expFileName = "expFileName"
174 expFileHandler = open(expFileName,"w")
175 expMessage = "ERROR: minPropNt=2.00 should be a proportion (below 1.0)\n"
176 expFileHandler.write(expMessage)
177 expFileHandler.close()
178
179 obsFileName = "obsFileName"
180 obsFileHandler = open(obsFileName,"w")
181
182 stdoutRef = sys.stdout
183 sys.stdout = obsFileHandler
184
185 try:
186 iAlignedBioseqDB.getConsensus(2,2)
187 except SystemExit:
188 isSysExitRaised = True
189
190 obsFileHandler.close()
191 expFileHandler.close()
192
193 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
194 self.assertTrue( isSysExitRaised )
195
196 sys.stdout = stdoutRef
197 os.remove ( obsFileName )
198 os.remove ( expFileName )
199 stdoutRef = sys.stdout
200
201
202 def test_getConsensus_with_AlignBioseqInstance_size_1(self):
203 isSysExitRaised = False
204
205 iBioseq1 = Bioseq("seq1", "AGAT")
206
207 iAlignedBioseqDB = AlignedBioseqDB()
208 iAlignedBioseqDB.setData([iBioseq1])
209
210 expFileName = "expFileName"
211 expFileHandler = open(expFileName,"w")
212 expMessage = "ERROR: can't make a consensus with less than 2 sequences\n"
213 expFileHandler.write(expMessage)
214 expFileHandler.close()
215
216 obsFileName = "obsFileName"
217 obsFileHandler = open(obsFileName,"w")
218
219 stdoutRef = sys.stdout
220 sys.stdout = obsFileHandler
221
222 try:
223 iAlignedBioseqDB.getConsensus(4)
224 except SystemExit:
225 isSysExitRaised = True
226
227 obsFileHandler.close()
228 expFileHandler.close()
229
230 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
231 self.assertTrue( isSysExitRaised )
232
233 sys.stdout = stdoutRef
234 os.remove ( obsFileName )
235 os.remove ( expFileName )
236 stdoutRef = sys.stdout
237
238
239 def test_getConsensus_with_gap_assertion_on_warning_msg(self):
240 iBioseq1 = Bioseq("seq1", "A-GA-T")
241 iBioseq2 = Bioseq("seq2", "T-GC-A")
242 iBioseq3 = Bioseq("seq3", "T-AC-T")
243
244 iAlignedBioseqDB = AlignedBioseqDB()
245 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
246
247 expFileName = "expFileName"
248 expFileHandler = open(expFileName,"w")
249 expMessage = "WARNING: 2 sites were removed (33.33%)\n"
250 expFileHandler.write(expMessage)
251 expFileHandler.close()
252
253 obsFileName = "obsFileName"
254 obsFileHandler = open(obsFileName,"w")
255
256 stdoutRef = sys.stdout
257 sys.stdout = obsFileHandler
258
259 iAlignedBioseqDB.getConsensus(2)
260
261 obsFileHandler.close()
262 expFileHandler.close()
263
264 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
265
266 sys.stdout = stdoutRef
267 os.remove ( obsFileName )
268 os.remove ( expFileName )
269 stdoutRef = sys.stdout
270
271
272 def test_getConsensus_with_gap_assertion_on_result(self):
273 iBioseq1 = Bioseq("seq1", "A-GA-T")
274 iBioseq2 = Bioseq("seq2", "T-GC-A")
275 iBioseq3 = Bioseq("seq3", "T-AC-T")
276
277 iAlignedBioseqDB = AlignedBioseqDB()
278 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
279
280 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT")
281 obsConsensus = iAlignedBioseqDB.getConsensus(2)
282
283 self.assertEquals(expConsensus, obsConsensus)
284
285
286 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_result(self):
287 iBioseq1 = Bioseq("seq1", "A--A-T")
288 iBioseq2 = Bioseq("seq2", "----A-")
289 iBioseq3 = Bioseq("seq3", "--A---")
290
291 iAlignedBioseqDB = AlignedBioseqDB()
292 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
293
294 expConsensus = None
295 obsConsensus = iAlignedBioseqDB.getConsensus(2)
296 self.assertEquals(expConsensus, obsConsensus)
297
298
299 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_warning_messages(self):
300 iBioseq1 = Bioseq("seq1", "A--A-T")
301 iBioseq2 = Bioseq("seq2", "----A-")
302 iBioseq3 = Bioseq("seq3", "--A---")
303
304 iAlignedBioseqDB = AlignedBioseqDB()
305 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
306
307 expFileName = "expFileName"
308 expFileHandler = open(expFileName,"w")
309 expMessage1 = "WARNING: 1 site was removed (16.67%)\n"
310 expMessage2 = "WARNING: no consensus can be built (no sequence left)\n"
311 expFileHandler.write(expMessage1)
312 expFileHandler.write(expMessage2)
313 expFileHandler.close()
314
315 obsFileName = "obsFileName"
316 obsFileHandler = open(obsFileName,"w")
317
318 stdoutRef = sys.stdout
319 sys.stdout = obsFileHandler
320
321 iAlignedBioseqDB.getConsensus(2)
322
323 obsFileHandler.close()
324 expFileHandler.close()
325
326 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
327
328 sys.stdout = stdoutRef
329 os.remove ( obsFileName )
330 os.remove ( expFileName )
331 stdoutRef = sys.stdout
332
333
334 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_result(self):
335 iBioseq1 = Bioseq("seq1", "AGAT")
336 iBioseq2 = Bioseq("seq2", "CCCA")
337 iBioseq3 = Bioseq("seq3", "TTCT")
338 iBioseq4 = Bioseq("seq4", "TTAT")
339
340 iAlignedBioseqDB = AlignedBioseqDB()
341 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
342
343 expConsensus = None
344 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.75)
345
346 self.assertEquals(expConsensus, obsConsensus)
347
348
349 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_warning_msg(self):
350 iBioseq1 = Bioseq("seq1", "AGAT")
351 iBioseq2 = Bioseq("seq2", "CCCA")
352 iBioseq3 = Bioseq("seq3", "TTCT")
353 iBioseq4 = Bioseq("seq4", "TTAT")
354
355 iAlignedBioseqDB = AlignedBioseqDB()
356 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
357
358 expFileName = "expFileName"
359 expFileHandler = open(expFileName,"w")
360 expMessage = "WARNING: no consensus can be built (75% of N's >= 40%)\n"
361 expFileHandler.write(expMessage)
362 expFileHandler.close()
363
364 obsFileName = "obsFileName"
365 obsFileHandler = open(obsFileName,"w")
366
367 stdoutRef = sys.stdout
368 sys.stdout = obsFileHandler
369
370 iAlignedBioseqDB.getConsensus(2,0.75)
371
372 obsFileHandler.close()
373 expFileHandler.close()
374
375 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
376
377 sys.stdout = stdoutRef
378 os.remove ( obsFileName )
379 os.remove ( expFileName )
380 stdoutRef = sys.stdout
381
382
383 def test_getConsensus_with_acceptable_N_proportion_assertion_on_warning_msg(self):
384 iBioseq1 = Bioseq("seq1", "AGATAA")
385 iBioseq2 = Bioseq("seq2", "CTCAAA")
386 iBioseq3 = Bioseq("seq3", "TTCTAA")
387 iBioseq4 = Bioseq("seq4", "GTATCC")
388
389 iAlignedBioseqDB = AlignedBioseqDB()
390 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
391
392 expFileName = "expFileName"
393 expFileHandler = open(expFileName,"w")
394 expMessage = "WARNING: 33% of N's\n"
395 expFileHandler.write(expMessage)
396 expFileHandler.close()
397
398 obsFileName = "obsFileName"
399 obsFileHandler = open(obsFileName,"w")
400
401 stdoutRef = sys.stdout
402 sys.stdout = obsFileHandler
403
404 iAlignedBioseqDB.getConsensus(2,0.6)
405
406 obsFileHandler.close()
407 expFileHandler.close()
408
409 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
410
411 sys.stdout = stdoutRef
412 os.remove ( obsFileName )
413 os.remove ( expFileName )
414 stdoutRef = sys.stdout
415
416
417 def test_getConsensus_with_acceptable_N_proportion_assertion_on_result(self):
418 iBioseq1 = Bioseq("seq1", "AGATAA")
419 iBioseq2 = Bioseq("seq2", "CTCAAA")
420 iBioseq3 = Bioseq("seq3", "TTCTAA")
421 iBioseq4 = Bioseq("seq4", "GTATCC")
422
423 iAlignedBioseqDB = AlignedBioseqDB()
424 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
425
426 expConsensus = Bioseq("consensus= length=6 nbAlign=4","NTNTAA")
427 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.6)
428
429 self.assertEquals(expConsensus, obsConsensus)
430
431
432 def test_getConsensus_with_chimeric_seq_in_alignment(self):
433 iBioseq1 = Bioseq("seq1", "AGAGTTGTAA")
434 iBioseq2 = Bioseq("seq2", "ATC----AAA")
435 iBioseq3 = Bioseq("seq3", "ATC----TAA")
436 iBioseq4 = Bioseq("seq4", "GTC----TAA")
437
438 iAlignedBioseqDB = AlignedBioseqDB()
439 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
440
441 expConsensus = Bioseq("consensus= length=6 nbAlign=4","ATCTAA")
442 obsConsensus = iAlignedBioseqDB.getConsensus(2)
443
444 self.assertEquals(expConsensus, obsConsensus)
445
446
447 def test_getConsensus_with_SATannot_data(self):
448 iBioseq1 = Bioseq("MbS69Gr8Cl511 chunk21 {Fragment} 165740..165916", "--TTTAGCCGAAGTCCATATGAGTCTTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGATTGCGGTTTAAGTTCTTATACTCAATCATACACATGACATCAAGTCATATTCGAATCCAAAACTCTAAGCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGATG")
449 iBioseq2 = Bioseq("MbS68Gr8Cl511 chunk21 {Fragment} 165916..166092", "GGTCTAGCCGAAGTCCATATGAGTCTTTGTCTTTGTATCTTCTAACAAGAAAACACTACTTAGGCTTTTAGGATAAGGTTGCAGTTTAAGTTTTTATACTAAATCATACACATCACATCAAGTCATATTCGACTCCCAAACACTAACCAAGCTTCTT--TGCTTCTCAAC-GCTTTGATG")
450 iBioseq3 = Bioseq("MbS67Gr8Cl511 chunk21 {Fragment} 166093..166269", "-GTTTAGCCGAAGTCCATATGAGTCGTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTACGCTTTTAGGATAAGATTGTTGTTTAAGTTCTTATACTTAATCATACACATGACATAAAGTCATATTCGACTCCAAAACACTAATCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGTT-")
451 iBioseq4 = Bioseq("MbS66Gr8Cl511 chunk21 {Fragment} 173353..173529", "--TTTAGCAAAATTCTATATGAGTCTTTATCTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGGTTGCGGGTTAAGTTCTTATACTCAATCATACACATGATATCAAGTCATATTCGACTCCAAAACACTAACCAAGCTTCTTCTTGCTTCTTAAAAGCTTTGAA-")
452
453 iAlignedBioseqDB = AlignedBioseqDB()
454 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
455
456 expConsensus = Bioseq("consensus= length=178 nbAlign=4 pile=511 pyramid=8", "GTTTAGCCGAAGTCCATATGAGTCTTTGTNTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGNTTGCNGTTTAAGTTCTTATACTNAATCATACACATGACATCAAGTCATATTCGACTCCAAAACACTAANCAAGCTTCTTCTTGCTTCTCAAAGCTTTGATG")
457 obsConsensus = iAlignedBioseqDB.getConsensus(2, 0.6, isHeaderSAtannot=True)
458
459 self.assertEquals(expConsensus, obsConsensus)
460
461
462 def test_getEntropy_equal_zero(self):
463 iBioseq1 = Bioseq("seq1", "AGAT")
464 iBioseq2 = Bioseq("seq2", "AGAT")
465 iBioseq3 = Bioseq("seq3", "AGAT")
466 iBioseq4 = Bioseq("seq4", "AGAT")
467
468 iAlignedBioseqDB = AlignedBioseqDB()
469 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
470
471 expStats = Stat()
472 expStats._min = 0
473 expStats._max = 0
474 expStats._sum = 0
475 expStats._sumOfSquares = 0
476 expStats._n = 4
477 expStats._lValues = [0, 0, 0, 0]
478
479 obsStats = iAlignedBioseqDB.getEntropy()
480
481 self.assertEquals(expStats, obsStats)
482
483
484 def test_getEntropy_different_nucl(self):
485 iBioseq1 = Bioseq("seq1", "AGAT")
486 iBioseq2 = Bioseq("seq2", "CTCA")
487 iBioseq3 = Bioseq("seq3", "TTCT")
488 iBioseq4 = Bioseq("seq4", "GTAT")
489
490 iAlignedBioseqDB = AlignedBioseqDB()
491 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
492
493 expStats = Stat()
494 expStats._min = 0.81127812445913283
495 expStats._max = 2.0
496 expStats._sum = 4.62255624892
497 expStats._sumOfSquares = 6.31634439045
498 expStats._n = 4
499 expStats._lValues = [2.0, 0.81127812445913283, 1.0, 0.81127812445913283]
500
501 obsStats = iAlignedBioseqDB.getEntropy()
502
503 self.assertEquals(expStats, obsStats)
504
505
506 def test_getEntropy_different_nucl_with_N(self):
507 iBioseq1 = Bioseq("seq1", "AGAT")
508 iBioseq2 = Bioseq("seq2", "CNCA")
509 iBioseq3 = Bioseq("seq3", "TTNT")
510 iBioseq4 = Bioseq("seq4", "GTNT")
511
512 iAlignedBioseqDB = AlignedBioseqDB()
513 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
514
515 expStats = Stat()
516 expStats._min = 0.811278124459
517 expStats._max = 2
518 expStats._sum = 6.11387090595
519 expStats._sumOfSquares = 10.1629200457
520 expStats._n = 4
521 expStats._lValues = [2, 1.4913146570363986, 1.8112781244591329, 0.81127812445913283]
522
523 obsStats = iAlignedBioseqDB.getEntropy()
524
525 self.assertEquals(expStats, obsStats)
526
527
528 def test_getEntropy_different_nucl_with_gap(self):
529 iBioseq1 = Bioseq("seq1", "AGAT")
530 iBioseq2 = Bioseq("seq2", "C-CA")
531 iBioseq3 = Bioseq("seq3", "T--T")
532 iBioseq4 = Bioseq("seq4", "-TNT")
533
534 iAlignedBioseqDB = AlignedBioseqDB()
535 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
536
537 expStats = Stat()
538 expStats._min = 0.811278124459
539 expStats._max = 1.65002242165
540 expStats._sum = 5.04626304683
541 expStats._sumOfSquares = 6.89285231586
542 expStats._n = 4
543 expStats._lValues = [1.5849625007211561, 1.0, 1.6500224216483541, 0.81127812445913283]
544
545 obsStats = iAlignedBioseqDB.getEntropy()
546
547 self.assertEquals(expStats, obsStats)
548
549
550 def test_saveAsBinaryMatrix( self ):
551 iBioseq1 = Bioseq("seq1", "AGAT")
552 iBioseq2 = Bioseq("seq2", "C-CA")
553 iBioseq3 = Bioseq("seq3", "T--T")
554 iBioseq4 = Bioseq("seq4", "-TNT")
555
556 self._i.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4 ] )
557
558 expFile = "dummyExpFile_%s" % ( self._uniqId )
559 expFileHandler = open( expFile, "w" )
560 expFileHandler.write( "seq1\t1\t1\t1\t1\n" )
561 expFileHandler.write( "seq2\t1\t0\t1\t1\n" )
562 expFileHandler.write( "seq3\t1\t0\t0\t1\n" )
563 expFileHandler.write( "seq4\t0\t1\t1\t1\n" )
564 expFileHandler.close()
565
566 obsFile = "dummyObsFile_%s" % ( self._uniqId )
567 self._i.saveAsBinaryMatrix( obsFile )
568
569 self.assertTrue( FileUtils.are2FilesIdentical( expFile, obsFile ) )
570 for f in [ expFile, obsFile ]:
571 os.remove( f )
572
573
574 def test_getAlignList( self ):
575 iBioseq1 = Bioseq( "seq1", "AGAAT" )
576 iBioseq2 = Bioseq( "seq2", "-G-AC" )
577 self._i.setData( [ iBioseq1, iBioseq2 ] )
578
579 iAlign1 = Align( Range( "seq1", 2, 2 ),
580 Range( "seq2", 1, 1 ),
581 0, 1, 100 )
582 iAlign2 = Align( Range( "seq1", 4, 5 ),
583 Range( "seq2", 2, 3 ),
584 0, 1, 50 )
585 lExp = [ iAlign1, iAlign2 ]
586
587 lObs = self._i.getAlignList( "seq1", "seq2" )
588 self.assertEquals( lExp, lObs )
589
590
591 def test_removeGaps(self):
592 iBioseq1 = Bioseq( "seq1", "AGAAT-" )
593 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
594 self._i.setData( [ iBioseq1, iBioseq2 ] )
595
596 exp = AlignedBioseqDB()
597 exp.setData( [ Bioseq( "seq1", "AGAAT" ),
598 Bioseq( "seq2", "AGACG" ) ] )
599
600 self._i.removeGaps()
601
602 self.assertEquals(exp, self._i)
603
604 def test_computeMeanPcentIdentity_between_two_sequences_50pcent(self):
605 iBioseq1 = Bioseq( "seq1", "AGAAT-" )
606 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
607 self._i.setData( [ iBioseq1, iBioseq2 ] )
608 expIdentity = 50.0
609 obsIdentity = self._i.computeMeanPcentIdentity()
610 self.assertEquals(expIdentity, obsIdentity)
611
612 def test_computeMeanPcentIdentity_between_two_sequences_57pcent(self):
613 iBioseq1 = Bioseq( "seq1", "AGAAT-T" )
614 iBioseq2 = Bioseq( "seq2", "AG-ACGT" )
615 self._i.setData( [ iBioseq1, iBioseq2 ] )
616 expIdentity = 57.0
617 obsIdentity = self._i.computeMeanPcentIdentity()
618 self.assertEquals(expIdentity, obsIdentity)
619
620 def test_compueNeamPcentIdentity_between_two_sequences_gaps_on_same_position(self):
621 iBioseq1 = Bioseq( "seq1", "AGAAT-T" )
622 iBioseq2 = Bioseq( "seq2", "AG-AC-T" )
623 self._i.setData( [ iBioseq1, iBioseq2 ] )
624 expIdentity = 57.0
625 obsIdentity = self._i.computeMeanPcentIdentity()
626 self.assertEquals(expIdentity, obsIdentity)
627
628 def test_computeMeanPcentIdentity_between_three_sequences_50_pcent(self):
629 iBioseqRef = Bioseq( "seqRef", "AGAAT-" )
630 iBioseq1 = Bioseq( "seq1", "AG-ACG" )
631 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
632
633 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] )
634
635 expIdentity = 50.0
636 obsIdentity = self._i.computeMeanPcentIdentity()
637 self.assertEquals(expIdentity, obsIdentity)
638
639 def test_computeMeanPcentIdentity_between_three_sequences_42_pcent(self):
640 iBioseqRef = Bioseq( "seqRef", "AGAAT-" )
641 iBioseq1 = Bioseq( "seq1", "AG-ACG" )
642 iBioseq2 = Bioseq( "seq2", "AG-CC-" )
643
644 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] )
645
646 expIdentity = 42.0
647 obsIdentity = self._i.computeMeanPcentIdentity()
648 self.assertEquals(expIdentity, obsIdentity)
649
650 #TODO: test with tthis data:
651 #>BlastclustCluster2Mb1_chunk7 (dbseq-nr 1) [101523,104351]
652 #------------------------------------------------------------
653 #------------------------------------------------------------
654 #------------------------------------------------------------
655 #------------------------------------------------------------
656 #------------------------------------------------------------
657 #------------------------------------------------------------
658 #-------------------------------TAAATCCAACACTGAGAAGAATTATTTAA
659 #AGAAGGTTTTTATTTAACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATG
660 #TAAATTGGATTGAGGAGAAGCCTCAGTATTTTAACAATATTTGTATTTCGGGAGGGCCTC
661 #GCTTTTGGATTCTTAAGATTAGGAAAAGATTCAAAGAGAGCAGTCAAATCTGCTTATGTC
662 #AGGCTTTAGGATTTTTACAAGAACGGCTAAGTGCCGCTGCCAAAGAATTCTTTATTAGAA
663 #ACGGACGCTGCGCAGCTGGGATACATTATGGAAATAACTTAAGTGCATTACTGTTTTCTT
664 #TGAAATAATTGAAGTGGCCGATTTGGACCACCCAAATACGTACCTCCCCTTCCCTTAAAG
665 #TGAAACCTATACTTGGGCTGGGATTGATAGATATTGTATGGAAAATGGAGTTTGTTCTAT
666 #TTCTATTTGTTGTGGTATGTTTTGATTTTTTCTTATTTTAAGTTTTGCATACAGCAACAT
667 #AA--ATGGCATAA--ATTATACATA-TCTTAAATATAAGTACAATAAAATTAGTCA--AT
668 #TATTACAATGGTCATTAGTATG-TAAAGTGTAAT-ATTGTTCTC-TGA-AATCATCTTAA
669 #CATTGGCGTTGTGTTTTACAATTTTTATTTTTGTTATATTAAGTGGTGTGTCAAGTGGTG
670 #TCAAATCTATTTCTGGTTTTAACATATTTTCACTTAAAATTATACTATTGATTTCTATTT
671 #TGCATTGCATTACTCCTTTCATTTTGTTTTCTTCTATTTTTATATTATTAATTGAATTTG
672 #GGTAATTTTGGTTATGAATGGTTTGGGTTAAGTTCCAGTAATTTGTAATAATTTAAGTTT
673 #AGTAATTTTGGGTTAAACAAGTCAAGTCTGGAAAGCTGGCATAGCCATTCTAAATCTTCT
674 #ACGTATTCCGTAAACTGCATTAGCTAGAACAAAAGTGGTTATTTTCATAGTTCTGTTTTT
675 #TTGGTTTCTTAAGTAAT--TAAT--TTGTATGTTTGATCAAAAAGGGTAACTATTATTTT
676 #TGTAAT--TAATAACTGATTGGACATTTTTCAACAATTTTCTGTCAATTAGCAT----AT
677 #CGTAATTATTAGAA-AATACTATT--TCTGGGTAA-CATAATCAAGTCGTTCAAGGTAAT
678 #AGGGCCATT-TAATGTTAA-AACTTCACTTCTACTATTTTGTATGGGAAGA--CAAAATA
679 #TATTTTCAT--TGATCATATTAATTGTGGAAGCCATTTGTATGTGTACCCTTTGTATACT
680 #ATTTTTAAGCATTCGTGTTGTCCGGGGCTGTTGACCGAAAATTTTCGTTTAGTTCAGAAT
681 #TATTTTGATTTTGTTCATTATTAATGTGTACGGGTTGGACCTGTTGTTGCCCTTCATTAA
682 #TGAACCCATGATTTTGATATGGATTGTATTGATTTTGGTCATAATACTGTTGTTCGTAGG
683 #TTGGGTCTGAAATAATTAGGTACGTGCCACATTGGTGGGTTGTAAGTCGTCTGCTCTGAT
684 #AGCTAGGTCTAAATGGTTGAATGTATTGATTAAAATTAGGCTGGAAGGGTTGGGAATATT
685 #GTGGGTAACTTGGGCGAGTTTGAACGTTTGTATTGAATTTATGTGCTTTCGAATTCTGAT
686 #TAAAATTTGAATTTTTATTACGGTATTCTGGGTTTATAAAATTCAAAATTAATGAGTTTT
687 #TCATAAATTCCCTCATTTTGTGCAATGGTTATTAAGGATCTTAAGTCTGTAATA-TCGTG
688 #ATGAG-ATAA-AATAGTAAATATAGTAG-AGTCTTTA--ATAGCCTGAATAAAAATAAGA
689 #AAATCCGATTGGTTACCTTCCAGGTGTAGTTTCGAAATTAATAATTGACGTCGTCTTTCC
690 #CCTTCTTCGGAGAATGTTTTTAGGTTTCCTCTGTATGGTGTCTCCCTGAAGCTCTCCAGA
691 #AGTTTGTAGTTTGGTGTTTGAGTTTTAAATTCCGCGATGAGTCTTGCTTTGAGGGTAGGC
692 #CAATGCTCGGAAGTCCCAAAGATCGTATTATCCGTCCAAGTTACTTTTGATGGCTCCCAG
693 #TAGAATCCTCTGTTTACGAAAATTACGTAAATCCACTCTGCTGACGACGGTGTGAAGTGT
694 #TTCTGGATCACCCTTAAATGTCATAATGTCTTTAAGCTGCCGACGGGCTTCGGCAAGGTT
695 #GTTGTCTCTTAGCGCAACGATTTAAGGTGCGGCAATAATTATAATAATTGGTTGAGACAT
696 #TTTTATTGTAAAATTTTAAATTTTGTGTTTTATTGTTTTATTGTTTCGAATTCAATGCTA
697 #TTATTTATTTTTTTTTTTGTTTTCTAGTTTCACTTTTTTTTTTATTGTATTGCTTTATTG
698 #TTCTTATTTTATTTTTATATGTTGGTTTTAAAACTTAGTTGCCTTTGGACTTAATGTTTT
699 #TGTTTCGTATTTCACTTCCACTTTAAATTGGATAACAGAATTGGAATTAAAATCCAAGTT
700 #GAAGAGTTTCCACGAATTTATTTGGGAAATGTTTCGAGCACTGGAATCCAGTGACTGGAT
701 #TATAAAATTTAACTTATTTCC-ACTCGAAGGTTCTTTTTT--CGG--------ATACTTT
702 #TTGTATCAGT--TGACTAAGAGCAACACTGAGAAGAATTATTTAAAGAAGGTTTTTATTT
703 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATGTAAATTGGATTGAGG
704 #AGAAGCCTCAGTATT--TCAACAATATTTGTATT--TCGGGAGGGCCTCGCTCTTGGAT-
705 #-TCTTAAGATTAGGAAAAAATTCA-AAGAGAGCAGTCA-AATC-TGCTTATGTCAGGCTT
706 #TAGGATTTTTACAAGAAGGGC-TAAGTGCCGCTGAAACGGATGC----------------
707 #------------------------------------------------------------
708 #------------------------------------------------------------
709 #------------
710 #>BlastclustCluster2Mb2_chunk7 (dbseq-nr 1) [99136,100579]
711 #GTAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA
712 #ATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATA
713 #ATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATC
714 #ATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA
715 #ATCATAATAATCATAATAATCATAATAATCATAATAATAATAATAATCATAATCATAATC
716 #ATAATAAGCGATAAAAAAATTAAAAAATAAAAATTAAAACCCACTGCAATCACGTTGGAC
717 #GGCGAGTCACAGACGTCAGAATAGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCA
718 #AGAAGGTTTTTATTGAACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATC
719 #ATAAT---AATCATAATAATCATAATAATCATAATAATCATAATAAT-------------
720 #------------------------------------------------------------
721 #------------------------------------------------------------
722 #------------------------------------------------------------
723 #------------------------------------------------------------
724 #------------------------------------------------------------
725 #-----------------------------------------------CATA-ATAATCAT
726 #AATAAT--CATAATAATCATA-ATAATCATAATAATCATAATAATCATAATAATCATAAT
727 #AATCATAATAATCATAATAATCATAA----TAATCATAATAATCATAATAATCATAATAA
728 #------------------------------------------------------------
729 #------------------------------------------------------------
730 #------------------------------------------------------------
731 #------------------------------------------------------------
732 #------------------------------------------------------------
733 #------------------------------------------------------------
734 #------TCATAA-TAATCATAATAATCGTAA---TAATCATAA----TAATCATAATAAT
735 #CATAATAATCATAA-TAAT----CAT-----AATAATCAT-----AATAATCATAATAAT
736 #CATAATAATCATAATAATCATAATAATCATAATAATCATAAT-AA-TCAT--AA--TAAT
737 #-----CATAATAATCATAATAA--TCA----TAATAATC---AT---AATAATCATAATA
738 #-AT---CATAATAATCATAATAATC-----------------------------------
739 #------------------------------------------------------------
740 #------------------------------------------------------------
741 #------------------------------------------------------------
742 #------------------------------------------------------------
743 #------------------------------------------------------------
744 #------------------------------------------------------------
745 #-----------------------------------ATAATAATCATAAT-AATCA-----
746 #TAATAA------TCATAAT----AATCATAAT-AATCATAATAA-TCA-TAATAATCATA
747 #ATAATCATAATAATCATAATAATAATAATAATCATAATCATAATCATAATAAGCATAAAA
748 #AAAT--------------------------------------------------------
749 #------------------------------------------------------------
750 #------------------------------------------------------------
751 #------------------------------------------------------------
752 #------------------------------------------------------------
753 #------------------------------------------------------------
754 #------------------------------------------------------------
755 #------------------------------------------------------------
756 #------------------------------------------------------------
757 #------------------------------------------------------------
758 #------------------------------------------------------------
759 #------------------------------------------------------------
760 #TAAAAAATAAAAATTAAAACCCACTGCAA---TCACGTTGGACGGCGAGTCACAGACGTC
761 #A-GAAT-AGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCAAGAAGGTTTTTATTG
762 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATCATAAT---AATCATA
763 #ATAA---TCA-TAATAATCAT-AATAATCATAATAATCATAA-----TAA-TCATA-ATA
764 #ATCATAATAATCATAATAA--TCATAATA-ATCA-TAATAATCATAATAATCATAATCAT
765 #CATAATAATCATAATAAT--CATAA-T-------AATC--ATAATAATCATAATAATCAT
766 #AATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAAT
767 #CATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAAT
768 #AATCATAATAAT
769
770 test_suite = unittest.TestSuite()
771 test_suite.addTest( unittest.makeSuite( Test_AlignedBioseqDB ) )
772 if __name__ == "__main__":
773 unittest.TextTestRunner(verbosity=2).run( test_suite )