Mercurial > repos > yufei-luo > s_mart
comparison smart_toolShed/commons/core/seq/test/Test_AlignedBioseqDB.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0f8dcca02ed |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import unittest | |
33 import sys | |
34 import os | |
35 import time | |
36 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB | |
37 from commons.core.seq.Bioseq import Bioseq | |
38 from commons.core.utils.FileUtils import FileUtils | |
39 from commons.core.coord.Align import Align | |
40 from commons.core.coord.Range import Range | |
41 from commons.core.stat.Stat import Stat | |
42 | |
43 | |
44 class Test_AlignedBioseqDB( unittest.TestCase ): | |
45 | |
46 def setUp( self ): | |
47 self._i = AlignedBioseqDB() | |
48 self._uniqId = "%s_%s" % ( time.strftime("%Y%m%d%H%M%S") , os.getpid() ) | |
49 | |
50 | |
51 def tearDown( self ): | |
52 self._i = None | |
53 self._uniqId = "" | |
54 | |
55 | |
56 def test_getLength(self): | |
57 iAlignedBioseqDB = AlignedBioseqDB() | |
58 | |
59 iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" ) | |
60 iAlignedBioseqDB.setData([iBioseq1]) | |
61 | |
62 expLenght = 29 | |
63 obsLength = iAlignedBioseqDB.getLength() | |
64 | |
65 self.assertEquals(expLenght, obsLength) | |
66 | |
67 | |
68 def test_getSeqLengthWithoutGaps( self ): | |
69 iAlignedBioseqDB = AlignedBioseqDB() | |
70 iAlignedBioseqDB.add( Bioseq( "seq3", | |
71 "AGCG-GACGATGCAGCAT--GCGAATGA--CGAT" ) ) | |
72 expLenght = 29 | |
73 obsLength = iAlignedBioseqDB.getSeqLengthWithoutGaps( "seq3" ) | |
74 | |
75 self.assertEquals(expLenght, obsLength) | |
76 | |
77 | |
78 def test_getListOccPerSite(self): | |
79 iBioseq1 = Bioseq( "seq1", "AGAAA") | |
80 iBioseq2 = Bioseq( "seq2", "TCAAG") | |
81 iBioseq3 = Bioseq( "seq3", "GGTAC") | |
82 iBioseq4 = Bioseq( "seq4", "CCTTA") | |
83 | |
84 iAlignedBioseqDB = AlignedBioseqDB() | |
85 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
86 | |
87 expList = [ | |
88 | |
89 {"A":1, "T":1, "G":1, "C":1}, | |
90 | |
91 {"G":2, "C":2}, | |
92 | |
93 {"A":2, "T":2 }, | |
94 | |
95 {"A":3, "T":1 }, | |
96 | |
97 {"A":2, "G":1, "C":1} | |
98 ] | |
99 | |
100 obsList = iAlignedBioseqDB.getListOccPerSite() | |
101 | |
102 self.assertEquals(expList, obsList) | |
103 | |
104 | |
105 def test_getListOccPerSite_with_none_sequence(self): | |
106 iBioseq1 = Bioseq( "seq1", "AGAAA") | |
107 iBioseq2 = Bioseq( "seq2", "TCAAG") | |
108 iBioseq3 = Bioseq( "seq3", "GGTAC") | |
109 iBioseq4 = Bioseq( "seq4", None) | |
110 iBioseq5 = Bioseq( "seq5", "CCTTA") | |
111 | |
112 iAlignedBioseqDB = AlignedBioseqDB() | |
113 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4, iBioseq5]) | |
114 | |
115 expList = [ | |
116 {"A":1, "T":1, "G":1}, | |
117 {"G":2, "C":1}, | |
118 {"A":2, "T":1 }, | |
119 {"A":3}, | |
120 {"A":1, "G":1, "C":1}, | |
121 ] | |
122 | |
123 obsList = iAlignedBioseqDB.getListOccPerSite() | |
124 | |
125 self.assertEquals(expList, obsList) | |
126 | |
127 | |
128 def test_getListOccPerSite_on_three_sequence(self): | |
129 iBioseq1 = Bioseq( "seq1", "AGAAA") | |
130 iBioseq2 = Bioseq( "seq2", "TCAAG") | |
131 iBioseq3 = Bioseq( "seq3", "GGTAC") | |
132 | |
133 iAlignedBioseqDB = AlignedBioseqDB() | |
134 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
135 | |
136 expList = [ | |
137 {"A":1, "T":1, "G":1}, | |
138 {"G":2, "C":1}, | |
139 {"A":2, "T":1 }, | |
140 {"A":3}, | |
141 {"A":1, "G":1, "C":1}, | |
142 ] | |
143 | |
144 obsList = iAlignedBioseqDB.getListOccPerSite() | |
145 | |
146 self.assertEquals(expList, obsList) | |
147 | |
148 | |
149 def test_getConsensus_with_minNbNt_greater_than_nbInSeq(self): | |
150 iBioseq1 = Bioseq("seq1", "AGAT") | |
151 iBioseq2 = Bioseq("seq2", "TGCA") | |
152 iBioseq3 = Bioseq("seq3", "TACT") | |
153 | |
154 iAlignedBioseqDB = AlignedBioseqDB() | |
155 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
156 | |
157 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT") | |
158 obsConsensus = iAlignedBioseqDB.getConsensus(5) | |
159 | |
160 self.assertEquals(expConsensus, obsConsensus) | |
161 | |
162 | |
163 def test_getConsensus_with_minPropNt_greater_than_1(self): | |
164 isSysExitRaised = False | |
165 | |
166 iBioseq1 = Bioseq("seq1", "AGAT") | |
167 iBioseq2 = Bioseq("seq2", "TGCA") | |
168 iBioseq3 = Bioseq("seq3", "TACT") | |
169 | |
170 iAlignedBioseqDB = AlignedBioseqDB() | |
171 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
172 | |
173 expFileName = "expFileName" | |
174 expFileHandler = open(expFileName,"w") | |
175 expMessage = "ERROR: minPropNt=2.00 should be a proportion (below 1.0)\n" | |
176 expFileHandler.write(expMessage) | |
177 expFileHandler.close() | |
178 | |
179 obsFileName = "obsFileName" | |
180 obsFileHandler = open(obsFileName,"w") | |
181 | |
182 stdoutRef = sys.stdout | |
183 sys.stdout = obsFileHandler | |
184 | |
185 try: | |
186 iAlignedBioseqDB.getConsensus(2,2) | |
187 except SystemExit: | |
188 isSysExitRaised = True | |
189 | |
190 obsFileHandler.close() | |
191 expFileHandler.close() | |
192 | |
193 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
194 self.assertTrue( isSysExitRaised ) | |
195 | |
196 sys.stdout = stdoutRef | |
197 os.remove ( obsFileName ) | |
198 os.remove ( expFileName ) | |
199 stdoutRef = sys.stdout | |
200 | |
201 | |
202 def test_getConsensus_with_AlignBioseqInstance_size_1(self): | |
203 isSysExitRaised = False | |
204 | |
205 iBioseq1 = Bioseq("seq1", "AGAT") | |
206 | |
207 iAlignedBioseqDB = AlignedBioseqDB() | |
208 iAlignedBioseqDB.setData([iBioseq1]) | |
209 | |
210 expFileName = "expFileName" | |
211 expFileHandler = open(expFileName,"w") | |
212 expMessage = "ERROR: can't make a consensus with less than 2 sequences\n" | |
213 expFileHandler.write(expMessage) | |
214 expFileHandler.close() | |
215 | |
216 obsFileName = "obsFileName" | |
217 obsFileHandler = open(obsFileName,"w") | |
218 | |
219 stdoutRef = sys.stdout | |
220 sys.stdout = obsFileHandler | |
221 | |
222 try: | |
223 iAlignedBioseqDB.getConsensus(4) | |
224 except SystemExit: | |
225 isSysExitRaised = True | |
226 | |
227 obsFileHandler.close() | |
228 expFileHandler.close() | |
229 | |
230 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
231 self.assertTrue( isSysExitRaised ) | |
232 | |
233 sys.stdout = stdoutRef | |
234 os.remove ( obsFileName ) | |
235 os.remove ( expFileName ) | |
236 stdoutRef = sys.stdout | |
237 | |
238 | |
239 def test_getConsensus_with_gap_assertion_on_warning_msg(self): | |
240 iBioseq1 = Bioseq("seq1", "A-GA-T") | |
241 iBioseq2 = Bioseq("seq2", "T-GC-A") | |
242 iBioseq3 = Bioseq("seq3", "T-AC-T") | |
243 | |
244 iAlignedBioseqDB = AlignedBioseqDB() | |
245 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
246 | |
247 expFileName = "expFileName" | |
248 expFileHandler = open(expFileName,"w") | |
249 expMessage = "WARNING: 2 sites were removed (33.33%)\n" | |
250 expFileHandler.write(expMessage) | |
251 expFileHandler.close() | |
252 | |
253 obsFileName = "obsFileName" | |
254 obsFileHandler = open(obsFileName,"w") | |
255 | |
256 stdoutRef = sys.stdout | |
257 sys.stdout = obsFileHandler | |
258 | |
259 iAlignedBioseqDB.getConsensus(2) | |
260 | |
261 obsFileHandler.close() | |
262 expFileHandler.close() | |
263 | |
264 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
265 | |
266 sys.stdout = stdoutRef | |
267 os.remove ( obsFileName ) | |
268 os.remove ( expFileName ) | |
269 stdoutRef = sys.stdout | |
270 | |
271 | |
272 def test_getConsensus_with_gap_assertion_on_result(self): | |
273 iBioseq1 = Bioseq("seq1", "A-GA-T") | |
274 iBioseq2 = Bioseq("seq2", "T-GC-A") | |
275 iBioseq3 = Bioseq("seq3", "T-AC-T") | |
276 | |
277 iAlignedBioseqDB = AlignedBioseqDB() | |
278 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
279 | |
280 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT") | |
281 obsConsensus = iAlignedBioseqDB.getConsensus(2) | |
282 | |
283 self.assertEquals(expConsensus, obsConsensus) | |
284 | |
285 | |
286 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_result(self): | |
287 iBioseq1 = Bioseq("seq1", "A--A-T") | |
288 iBioseq2 = Bioseq("seq2", "----A-") | |
289 iBioseq3 = Bioseq("seq3", "--A---") | |
290 | |
291 iAlignedBioseqDB = AlignedBioseqDB() | |
292 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
293 | |
294 expConsensus = None | |
295 obsConsensus = iAlignedBioseqDB.getConsensus(2) | |
296 self.assertEquals(expConsensus, obsConsensus) | |
297 | |
298 | |
299 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_warning_messages(self): | |
300 iBioseq1 = Bioseq("seq1", "A--A-T") | |
301 iBioseq2 = Bioseq("seq2", "----A-") | |
302 iBioseq3 = Bioseq("seq3", "--A---") | |
303 | |
304 iAlignedBioseqDB = AlignedBioseqDB() | |
305 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3]) | |
306 | |
307 expFileName = "expFileName" | |
308 expFileHandler = open(expFileName,"w") | |
309 expMessage1 = "WARNING: 1 site was removed (16.67%)\n" | |
310 expMessage2 = "WARNING: no consensus can be built (no sequence left)\n" | |
311 expFileHandler.write(expMessage1) | |
312 expFileHandler.write(expMessage2) | |
313 expFileHandler.close() | |
314 | |
315 obsFileName = "obsFileName" | |
316 obsFileHandler = open(obsFileName,"w") | |
317 | |
318 stdoutRef = sys.stdout | |
319 sys.stdout = obsFileHandler | |
320 | |
321 iAlignedBioseqDB.getConsensus(2) | |
322 | |
323 obsFileHandler.close() | |
324 expFileHandler.close() | |
325 | |
326 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
327 | |
328 sys.stdout = stdoutRef | |
329 os.remove ( obsFileName ) | |
330 os.remove ( expFileName ) | |
331 stdoutRef = sys.stdout | |
332 | |
333 | |
334 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_result(self): | |
335 iBioseq1 = Bioseq("seq1", "AGAT") | |
336 iBioseq2 = Bioseq("seq2", "CCCA") | |
337 iBioseq3 = Bioseq("seq3", "TTCT") | |
338 iBioseq4 = Bioseq("seq4", "TTAT") | |
339 | |
340 iAlignedBioseqDB = AlignedBioseqDB() | |
341 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
342 | |
343 expConsensus = None | |
344 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.75) | |
345 | |
346 self.assertEquals(expConsensus, obsConsensus) | |
347 | |
348 | |
349 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_warning_msg(self): | |
350 iBioseq1 = Bioseq("seq1", "AGAT") | |
351 iBioseq2 = Bioseq("seq2", "CCCA") | |
352 iBioseq3 = Bioseq("seq3", "TTCT") | |
353 iBioseq4 = Bioseq("seq4", "TTAT") | |
354 | |
355 iAlignedBioseqDB = AlignedBioseqDB() | |
356 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
357 | |
358 expFileName = "expFileName" | |
359 expFileHandler = open(expFileName,"w") | |
360 expMessage = "WARNING: no consensus can be built (75% of N's >= 40%)\n" | |
361 expFileHandler.write(expMessage) | |
362 expFileHandler.close() | |
363 | |
364 obsFileName = "obsFileName" | |
365 obsFileHandler = open(obsFileName,"w") | |
366 | |
367 stdoutRef = sys.stdout | |
368 sys.stdout = obsFileHandler | |
369 | |
370 iAlignedBioseqDB.getConsensus(2,0.75) | |
371 | |
372 obsFileHandler.close() | |
373 expFileHandler.close() | |
374 | |
375 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
376 | |
377 sys.stdout = stdoutRef | |
378 os.remove ( obsFileName ) | |
379 os.remove ( expFileName ) | |
380 stdoutRef = sys.stdout | |
381 | |
382 | |
383 def test_getConsensus_with_acceptable_N_proportion_assertion_on_warning_msg(self): | |
384 iBioseq1 = Bioseq("seq1", "AGATAA") | |
385 iBioseq2 = Bioseq("seq2", "CTCAAA") | |
386 iBioseq3 = Bioseq("seq3", "TTCTAA") | |
387 iBioseq4 = Bioseq("seq4", "GTATCC") | |
388 | |
389 iAlignedBioseqDB = AlignedBioseqDB() | |
390 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
391 | |
392 expFileName = "expFileName" | |
393 expFileHandler = open(expFileName,"w") | |
394 expMessage = "WARNING: 33% of N's\n" | |
395 expFileHandler.write(expMessage) | |
396 expFileHandler.close() | |
397 | |
398 obsFileName = "obsFileName" | |
399 obsFileHandler = open(obsFileName,"w") | |
400 | |
401 stdoutRef = sys.stdout | |
402 sys.stdout = obsFileHandler | |
403 | |
404 iAlignedBioseqDB.getConsensus(2,0.6) | |
405 | |
406 obsFileHandler.close() | |
407 expFileHandler.close() | |
408 | |
409 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) ) | |
410 | |
411 sys.stdout = stdoutRef | |
412 os.remove ( obsFileName ) | |
413 os.remove ( expFileName ) | |
414 stdoutRef = sys.stdout | |
415 | |
416 | |
417 def test_getConsensus_with_acceptable_N_proportion_assertion_on_result(self): | |
418 iBioseq1 = Bioseq("seq1", "AGATAA") | |
419 iBioseq2 = Bioseq("seq2", "CTCAAA") | |
420 iBioseq3 = Bioseq("seq3", "TTCTAA") | |
421 iBioseq4 = Bioseq("seq4", "GTATCC") | |
422 | |
423 iAlignedBioseqDB = AlignedBioseqDB() | |
424 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
425 | |
426 expConsensus = Bioseq("consensus= length=6 nbAlign=4","NTNTAA") | |
427 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.6) | |
428 | |
429 self.assertEquals(expConsensus, obsConsensus) | |
430 | |
431 | |
432 def test_getConsensus_with_chimeric_seq_in_alignment(self): | |
433 iBioseq1 = Bioseq("seq1", "AGAGTTGTAA") | |
434 iBioseq2 = Bioseq("seq2", "ATC----AAA") | |
435 iBioseq3 = Bioseq("seq3", "ATC----TAA") | |
436 iBioseq4 = Bioseq("seq4", "GTC----TAA") | |
437 | |
438 iAlignedBioseqDB = AlignedBioseqDB() | |
439 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
440 | |
441 expConsensus = Bioseq("consensus= length=6 nbAlign=4","ATCTAA") | |
442 obsConsensus = iAlignedBioseqDB.getConsensus(2) | |
443 | |
444 self.assertEquals(expConsensus, obsConsensus) | |
445 | |
446 | |
447 def test_getConsensus_with_SATannot_data(self): | |
448 iBioseq1 = Bioseq("MbS69Gr8Cl511 chunk21 {Fragment} 165740..165916", "--TTTAGCCGAAGTCCATATGAGTCTTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGATTGCGGTTTAAGTTCTTATACTCAATCATACACATGACATCAAGTCATATTCGAATCCAAAACTCTAAGCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGATG") | |
449 iBioseq2 = Bioseq("MbS68Gr8Cl511 chunk21 {Fragment} 165916..166092", "GGTCTAGCCGAAGTCCATATGAGTCTTTGTCTTTGTATCTTCTAACAAGAAAACACTACTTAGGCTTTTAGGATAAGGTTGCAGTTTAAGTTTTTATACTAAATCATACACATCACATCAAGTCATATTCGACTCCCAAACACTAACCAAGCTTCTT--TGCTTCTCAAC-GCTTTGATG") | |
450 iBioseq3 = Bioseq("MbS67Gr8Cl511 chunk21 {Fragment} 166093..166269", "-GTTTAGCCGAAGTCCATATGAGTCGTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTACGCTTTTAGGATAAGATTGTTGTTTAAGTTCTTATACTTAATCATACACATGACATAAAGTCATATTCGACTCCAAAACACTAATCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGTT-") | |
451 iBioseq4 = Bioseq("MbS66Gr8Cl511 chunk21 {Fragment} 173353..173529", "--TTTAGCAAAATTCTATATGAGTCTTTATCTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGGTTGCGGGTTAAGTTCTTATACTCAATCATACACATGATATCAAGTCATATTCGACTCCAAAACACTAACCAAGCTTCTTCTTGCTTCTTAAAAGCTTTGAA-") | |
452 | |
453 iAlignedBioseqDB = AlignedBioseqDB() | |
454 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
455 | |
456 expConsensus = Bioseq("consensus= length=178 nbAlign=4 pile=511 pyramid=8", "GTTTAGCCGAAGTCCATATGAGTCTTTGTNTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGNTTGCNGTTTAAGTTCTTATACTNAATCATACACATGACATCAAGTCATATTCGACTCCAAAACACTAANCAAGCTTCTTCTTGCTTCTCAAAGCTTTGATG") | |
457 obsConsensus = iAlignedBioseqDB.getConsensus(2, 0.6, isHeaderSAtannot=True) | |
458 | |
459 self.assertEquals(expConsensus, obsConsensus) | |
460 | |
461 | |
462 def test_getEntropy_equal_zero(self): | |
463 iBioseq1 = Bioseq("seq1", "AGAT") | |
464 iBioseq2 = Bioseq("seq2", "AGAT") | |
465 iBioseq3 = Bioseq("seq3", "AGAT") | |
466 iBioseq4 = Bioseq("seq4", "AGAT") | |
467 | |
468 iAlignedBioseqDB = AlignedBioseqDB() | |
469 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
470 | |
471 expStats = Stat() | |
472 expStats._min = 0 | |
473 expStats._max = 0 | |
474 expStats._sum = 0 | |
475 expStats._sumOfSquares = 0 | |
476 expStats._n = 4 | |
477 expStats._lValues = [0, 0, 0, 0] | |
478 | |
479 obsStats = iAlignedBioseqDB.getEntropy() | |
480 | |
481 self.assertEquals(expStats, obsStats) | |
482 | |
483 | |
484 def test_getEntropy_different_nucl(self): | |
485 iBioseq1 = Bioseq("seq1", "AGAT") | |
486 iBioseq2 = Bioseq("seq2", "CTCA") | |
487 iBioseq3 = Bioseq("seq3", "TTCT") | |
488 iBioseq4 = Bioseq("seq4", "GTAT") | |
489 | |
490 iAlignedBioseqDB = AlignedBioseqDB() | |
491 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
492 | |
493 expStats = Stat() | |
494 expStats._min = 0.81127812445913283 | |
495 expStats._max = 2.0 | |
496 expStats._sum = 4.62255624892 | |
497 expStats._sumOfSquares = 6.31634439045 | |
498 expStats._n = 4 | |
499 expStats._lValues = [2.0, 0.81127812445913283, 1.0, 0.81127812445913283] | |
500 | |
501 obsStats = iAlignedBioseqDB.getEntropy() | |
502 | |
503 self.assertEquals(expStats, obsStats) | |
504 | |
505 | |
506 def test_getEntropy_different_nucl_with_N(self): | |
507 iBioseq1 = Bioseq("seq1", "AGAT") | |
508 iBioseq2 = Bioseq("seq2", "CNCA") | |
509 iBioseq3 = Bioseq("seq3", "TTNT") | |
510 iBioseq4 = Bioseq("seq4", "GTNT") | |
511 | |
512 iAlignedBioseqDB = AlignedBioseqDB() | |
513 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
514 | |
515 expStats = Stat() | |
516 expStats._min = 0.811278124459 | |
517 expStats._max = 2 | |
518 expStats._sum = 6.11387090595 | |
519 expStats._sumOfSquares = 10.1629200457 | |
520 expStats._n = 4 | |
521 expStats._lValues = [2, 1.4913146570363986, 1.8112781244591329, 0.81127812445913283] | |
522 | |
523 obsStats = iAlignedBioseqDB.getEntropy() | |
524 | |
525 self.assertEquals(expStats, obsStats) | |
526 | |
527 | |
528 def test_getEntropy_different_nucl_with_gap(self): | |
529 iBioseq1 = Bioseq("seq1", "AGAT") | |
530 iBioseq2 = Bioseq("seq2", "C-CA") | |
531 iBioseq3 = Bioseq("seq3", "T--T") | |
532 iBioseq4 = Bioseq("seq4", "-TNT") | |
533 | |
534 iAlignedBioseqDB = AlignedBioseqDB() | |
535 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4]) | |
536 | |
537 expStats = Stat() | |
538 expStats._min = 0.811278124459 | |
539 expStats._max = 1.65002242165 | |
540 expStats._sum = 5.04626304683 | |
541 expStats._sumOfSquares = 6.89285231586 | |
542 expStats._n = 4 | |
543 expStats._lValues = [1.5849625007211561, 1.0, 1.6500224216483541, 0.81127812445913283] | |
544 | |
545 obsStats = iAlignedBioseqDB.getEntropy() | |
546 | |
547 self.assertEquals(expStats, obsStats) | |
548 | |
549 | |
550 def test_saveAsBinaryMatrix( self ): | |
551 iBioseq1 = Bioseq("seq1", "AGAT") | |
552 iBioseq2 = Bioseq("seq2", "C-CA") | |
553 iBioseq3 = Bioseq("seq3", "T--T") | |
554 iBioseq4 = Bioseq("seq4", "-TNT") | |
555 | |
556 self._i.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4 ] ) | |
557 | |
558 expFile = "dummyExpFile_%s" % ( self._uniqId ) | |
559 expFileHandler = open( expFile, "w" ) | |
560 expFileHandler.write( "seq1\t1\t1\t1\t1\n" ) | |
561 expFileHandler.write( "seq2\t1\t0\t1\t1\n" ) | |
562 expFileHandler.write( "seq3\t1\t0\t0\t1\n" ) | |
563 expFileHandler.write( "seq4\t0\t1\t1\t1\n" ) | |
564 expFileHandler.close() | |
565 | |
566 obsFile = "dummyObsFile_%s" % ( self._uniqId ) | |
567 self._i.saveAsBinaryMatrix( obsFile ) | |
568 | |
569 self.assertTrue( FileUtils.are2FilesIdentical( expFile, obsFile ) ) | |
570 for f in [ expFile, obsFile ]: | |
571 os.remove( f ) | |
572 | |
573 | |
574 def test_getAlignList( self ): | |
575 iBioseq1 = Bioseq( "seq1", "AGAAT" ) | |
576 iBioseq2 = Bioseq( "seq2", "-G-AC" ) | |
577 self._i.setData( [ iBioseq1, iBioseq2 ] ) | |
578 | |
579 iAlign1 = Align( Range( "seq1", 2, 2 ), | |
580 Range( "seq2", 1, 1 ), | |
581 0, 1, 100 ) | |
582 iAlign2 = Align( Range( "seq1", 4, 5 ), | |
583 Range( "seq2", 2, 3 ), | |
584 0, 1, 50 ) | |
585 lExp = [ iAlign1, iAlign2 ] | |
586 | |
587 lObs = self._i.getAlignList( "seq1", "seq2" ) | |
588 self.assertEquals( lExp, lObs ) | |
589 | |
590 | |
591 def test_removeGaps(self): | |
592 iBioseq1 = Bioseq( "seq1", "AGAAT-" ) | |
593 iBioseq2 = Bioseq( "seq2", "AG-ACG" ) | |
594 self._i.setData( [ iBioseq1, iBioseq2 ] ) | |
595 | |
596 exp = AlignedBioseqDB() | |
597 exp.setData( [ Bioseq( "seq1", "AGAAT" ), | |
598 Bioseq( "seq2", "AGACG" ) ] ) | |
599 | |
600 self._i.removeGaps() | |
601 | |
602 self.assertEquals(exp, self._i) | |
603 | |
604 def test_computeMeanPcentIdentity_between_two_sequences_50pcent(self): | |
605 iBioseq1 = Bioseq( "seq1", "AGAAT-" ) | |
606 iBioseq2 = Bioseq( "seq2", "AG-ACG" ) | |
607 self._i.setData( [ iBioseq1, iBioseq2 ] ) | |
608 expIdentity = 50.0 | |
609 obsIdentity = self._i.computeMeanPcentIdentity() | |
610 self.assertEquals(expIdentity, obsIdentity) | |
611 | |
612 def test_computeMeanPcentIdentity_between_two_sequences_57pcent(self): | |
613 iBioseq1 = Bioseq( "seq1", "AGAAT-T" ) | |
614 iBioseq2 = Bioseq( "seq2", "AG-ACGT" ) | |
615 self._i.setData( [ iBioseq1, iBioseq2 ] ) | |
616 expIdentity = 57.0 | |
617 obsIdentity = self._i.computeMeanPcentIdentity() | |
618 self.assertEquals(expIdentity, obsIdentity) | |
619 | |
620 def test_compueNeamPcentIdentity_between_two_sequences_gaps_on_same_position(self): | |
621 iBioseq1 = Bioseq( "seq1", "AGAAT-T" ) | |
622 iBioseq2 = Bioseq( "seq2", "AG-AC-T" ) | |
623 self._i.setData( [ iBioseq1, iBioseq2 ] ) | |
624 expIdentity = 57.0 | |
625 obsIdentity = self._i.computeMeanPcentIdentity() | |
626 self.assertEquals(expIdentity, obsIdentity) | |
627 | |
628 def test_computeMeanPcentIdentity_between_three_sequences_50_pcent(self): | |
629 iBioseqRef = Bioseq( "seqRef", "AGAAT-" ) | |
630 iBioseq1 = Bioseq( "seq1", "AG-ACG" ) | |
631 iBioseq2 = Bioseq( "seq2", "AG-ACG" ) | |
632 | |
633 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] ) | |
634 | |
635 expIdentity = 50.0 | |
636 obsIdentity = self._i.computeMeanPcentIdentity() | |
637 self.assertEquals(expIdentity, obsIdentity) | |
638 | |
639 def test_computeMeanPcentIdentity_between_three_sequences_42_pcent(self): | |
640 iBioseqRef = Bioseq( "seqRef", "AGAAT-" ) | |
641 iBioseq1 = Bioseq( "seq1", "AG-ACG" ) | |
642 iBioseq2 = Bioseq( "seq2", "AG-CC-" ) | |
643 | |
644 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] ) | |
645 | |
646 expIdentity = 42.0 | |
647 obsIdentity = self._i.computeMeanPcentIdentity() | |
648 self.assertEquals(expIdentity, obsIdentity) | |
649 | |
650 #TODO: test with tthis data: | |
651 #>BlastclustCluster2Mb1_chunk7 (dbseq-nr 1) [101523,104351] | |
652 #------------------------------------------------------------ | |
653 #------------------------------------------------------------ | |
654 #------------------------------------------------------------ | |
655 #------------------------------------------------------------ | |
656 #------------------------------------------------------------ | |
657 #------------------------------------------------------------ | |
658 #-------------------------------TAAATCCAACACTGAGAAGAATTATTTAA | |
659 #AGAAGGTTTTTATTTAACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATG | |
660 #TAAATTGGATTGAGGAGAAGCCTCAGTATTTTAACAATATTTGTATTTCGGGAGGGCCTC | |
661 #GCTTTTGGATTCTTAAGATTAGGAAAAGATTCAAAGAGAGCAGTCAAATCTGCTTATGTC | |
662 #AGGCTTTAGGATTTTTACAAGAACGGCTAAGTGCCGCTGCCAAAGAATTCTTTATTAGAA | |
663 #ACGGACGCTGCGCAGCTGGGATACATTATGGAAATAACTTAAGTGCATTACTGTTTTCTT | |
664 #TGAAATAATTGAAGTGGCCGATTTGGACCACCCAAATACGTACCTCCCCTTCCCTTAAAG | |
665 #TGAAACCTATACTTGGGCTGGGATTGATAGATATTGTATGGAAAATGGAGTTTGTTCTAT | |
666 #TTCTATTTGTTGTGGTATGTTTTGATTTTTTCTTATTTTAAGTTTTGCATACAGCAACAT | |
667 #AA--ATGGCATAA--ATTATACATA-TCTTAAATATAAGTACAATAAAATTAGTCA--AT | |
668 #TATTACAATGGTCATTAGTATG-TAAAGTGTAAT-ATTGTTCTC-TGA-AATCATCTTAA | |
669 #CATTGGCGTTGTGTTTTACAATTTTTATTTTTGTTATATTAAGTGGTGTGTCAAGTGGTG | |
670 #TCAAATCTATTTCTGGTTTTAACATATTTTCACTTAAAATTATACTATTGATTTCTATTT | |
671 #TGCATTGCATTACTCCTTTCATTTTGTTTTCTTCTATTTTTATATTATTAATTGAATTTG | |
672 #GGTAATTTTGGTTATGAATGGTTTGGGTTAAGTTCCAGTAATTTGTAATAATTTAAGTTT | |
673 #AGTAATTTTGGGTTAAACAAGTCAAGTCTGGAAAGCTGGCATAGCCATTCTAAATCTTCT | |
674 #ACGTATTCCGTAAACTGCATTAGCTAGAACAAAAGTGGTTATTTTCATAGTTCTGTTTTT | |
675 #TTGGTTTCTTAAGTAAT--TAAT--TTGTATGTTTGATCAAAAAGGGTAACTATTATTTT | |
676 #TGTAAT--TAATAACTGATTGGACATTTTTCAACAATTTTCTGTCAATTAGCAT----AT | |
677 #CGTAATTATTAGAA-AATACTATT--TCTGGGTAA-CATAATCAAGTCGTTCAAGGTAAT | |
678 #AGGGCCATT-TAATGTTAA-AACTTCACTTCTACTATTTTGTATGGGAAGA--CAAAATA | |
679 #TATTTTCAT--TGATCATATTAATTGTGGAAGCCATTTGTATGTGTACCCTTTGTATACT | |
680 #ATTTTTAAGCATTCGTGTTGTCCGGGGCTGTTGACCGAAAATTTTCGTTTAGTTCAGAAT | |
681 #TATTTTGATTTTGTTCATTATTAATGTGTACGGGTTGGACCTGTTGTTGCCCTTCATTAA | |
682 #TGAACCCATGATTTTGATATGGATTGTATTGATTTTGGTCATAATACTGTTGTTCGTAGG | |
683 #TTGGGTCTGAAATAATTAGGTACGTGCCACATTGGTGGGTTGTAAGTCGTCTGCTCTGAT | |
684 #AGCTAGGTCTAAATGGTTGAATGTATTGATTAAAATTAGGCTGGAAGGGTTGGGAATATT | |
685 #GTGGGTAACTTGGGCGAGTTTGAACGTTTGTATTGAATTTATGTGCTTTCGAATTCTGAT | |
686 #TAAAATTTGAATTTTTATTACGGTATTCTGGGTTTATAAAATTCAAAATTAATGAGTTTT | |
687 #TCATAAATTCCCTCATTTTGTGCAATGGTTATTAAGGATCTTAAGTCTGTAATA-TCGTG | |
688 #ATGAG-ATAA-AATAGTAAATATAGTAG-AGTCTTTA--ATAGCCTGAATAAAAATAAGA | |
689 #AAATCCGATTGGTTACCTTCCAGGTGTAGTTTCGAAATTAATAATTGACGTCGTCTTTCC | |
690 #CCTTCTTCGGAGAATGTTTTTAGGTTTCCTCTGTATGGTGTCTCCCTGAAGCTCTCCAGA | |
691 #AGTTTGTAGTTTGGTGTTTGAGTTTTAAATTCCGCGATGAGTCTTGCTTTGAGGGTAGGC | |
692 #CAATGCTCGGAAGTCCCAAAGATCGTATTATCCGTCCAAGTTACTTTTGATGGCTCCCAG | |
693 #TAGAATCCTCTGTTTACGAAAATTACGTAAATCCACTCTGCTGACGACGGTGTGAAGTGT | |
694 #TTCTGGATCACCCTTAAATGTCATAATGTCTTTAAGCTGCCGACGGGCTTCGGCAAGGTT | |
695 #GTTGTCTCTTAGCGCAACGATTTAAGGTGCGGCAATAATTATAATAATTGGTTGAGACAT | |
696 #TTTTATTGTAAAATTTTAAATTTTGTGTTTTATTGTTTTATTGTTTCGAATTCAATGCTA | |
697 #TTATTTATTTTTTTTTTTGTTTTCTAGTTTCACTTTTTTTTTTATTGTATTGCTTTATTG | |
698 #TTCTTATTTTATTTTTATATGTTGGTTTTAAAACTTAGTTGCCTTTGGACTTAATGTTTT | |
699 #TGTTTCGTATTTCACTTCCACTTTAAATTGGATAACAGAATTGGAATTAAAATCCAAGTT | |
700 #GAAGAGTTTCCACGAATTTATTTGGGAAATGTTTCGAGCACTGGAATCCAGTGACTGGAT | |
701 #TATAAAATTTAACTTATTTCC-ACTCGAAGGTTCTTTTTT--CGG--------ATACTTT | |
702 #TTGTATCAGT--TGACTAAGAGCAACACTGAGAAGAATTATTTAAAGAAGGTTTTTATTT | |
703 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATGTAAATTGGATTGAGG | |
704 #AGAAGCCTCAGTATT--TCAACAATATTTGTATT--TCGGGAGGGCCTCGCTCTTGGAT- | |
705 #-TCTTAAGATTAGGAAAAAATTCA-AAGAGAGCAGTCA-AATC-TGCTTATGTCAGGCTT | |
706 #TAGGATTTTTACAAGAAGGGC-TAAGTGCCGCTGAAACGGATGC---------------- | |
707 #------------------------------------------------------------ | |
708 #------------------------------------------------------------ | |
709 #------------ | |
710 #>BlastclustCluster2Mb2_chunk7 (dbseq-nr 1) [99136,100579] | |
711 #GTAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA | |
712 #ATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATA | |
713 #ATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATC | |
714 #ATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA | |
715 #ATCATAATAATCATAATAATCATAATAATCATAATAATAATAATAATCATAATCATAATC | |
716 #ATAATAAGCGATAAAAAAATTAAAAAATAAAAATTAAAACCCACTGCAATCACGTTGGAC | |
717 #GGCGAGTCACAGACGTCAGAATAGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCA | |
718 #AGAAGGTTTTTATTGAACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATC | |
719 #ATAAT---AATCATAATAATCATAATAATCATAATAATCATAATAAT------------- | |
720 #------------------------------------------------------------ | |
721 #------------------------------------------------------------ | |
722 #------------------------------------------------------------ | |
723 #------------------------------------------------------------ | |
724 #------------------------------------------------------------ | |
725 #-----------------------------------------------CATA-ATAATCAT | |
726 #AATAAT--CATAATAATCATA-ATAATCATAATAATCATAATAATCATAATAATCATAAT | |
727 #AATCATAATAATCATAATAATCATAA----TAATCATAATAATCATAATAATCATAATAA | |
728 #------------------------------------------------------------ | |
729 #------------------------------------------------------------ | |
730 #------------------------------------------------------------ | |
731 #------------------------------------------------------------ | |
732 #------------------------------------------------------------ | |
733 #------------------------------------------------------------ | |
734 #------TCATAA-TAATCATAATAATCGTAA---TAATCATAA----TAATCATAATAAT | |
735 #CATAATAATCATAA-TAAT----CAT-----AATAATCAT-----AATAATCATAATAAT | |
736 #CATAATAATCATAATAATCATAATAATCATAATAATCATAAT-AA-TCAT--AA--TAAT | |
737 #-----CATAATAATCATAATAA--TCA----TAATAATC---AT---AATAATCATAATA | |
738 #-AT---CATAATAATCATAATAATC----------------------------------- | |
739 #------------------------------------------------------------ | |
740 #------------------------------------------------------------ | |
741 #------------------------------------------------------------ | |
742 #------------------------------------------------------------ | |
743 #------------------------------------------------------------ | |
744 #------------------------------------------------------------ | |
745 #-----------------------------------ATAATAATCATAAT-AATCA----- | |
746 #TAATAA------TCATAAT----AATCATAAT-AATCATAATAA-TCA-TAATAATCATA | |
747 #ATAATCATAATAATCATAATAATAATAATAATCATAATCATAATCATAATAAGCATAAAA | |
748 #AAAT-------------------------------------------------------- | |
749 #------------------------------------------------------------ | |
750 #------------------------------------------------------------ | |
751 #------------------------------------------------------------ | |
752 #------------------------------------------------------------ | |
753 #------------------------------------------------------------ | |
754 #------------------------------------------------------------ | |
755 #------------------------------------------------------------ | |
756 #------------------------------------------------------------ | |
757 #------------------------------------------------------------ | |
758 #------------------------------------------------------------ | |
759 #------------------------------------------------------------ | |
760 #TAAAAAATAAAAATTAAAACCCACTGCAA---TCACGTTGGACGGCGAGTCACAGACGTC | |
761 #A-GAAT-AGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCAAGAAGGTTTTTATTG | |
762 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATCATAAT---AATCATA | |
763 #ATAA---TCA-TAATAATCAT-AATAATCATAATAATCATAA-----TAA-TCATA-ATA | |
764 #ATCATAATAATCATAATAA--TCATAATA-ATCA-TAATAATCATAATAATCATAATCAT | |
765 #CATAATAATCATAATAAT--CATAA-T-------AATC--ATAATAATCATAATAATCAT | |
766 #AATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAAT | |
767 #CATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAAT | |
768 #AATCATAATAAT | |
769 | |
770 test_suite = unittest.TestSuite() | |
771 test_suite.addTest( unittest.makeSuite( Test_AlignedBioseqDB ) ) | |
772 if __name__ == "__main__": | |
773 unittest.TextTestRunner(verbosity=2).run( test_suite ) |