6
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import unittest
|
|
33 import sys
|
|
34 import os
|
|
35 import time
|
|
36 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
|
|
37 from commons.core.seq.Bioseq import Bioseq
|
|
38 from commons.core.utils.FileUtils import FileUtils
|
|
39 from commons.core.coord.Align import Align
|
|
40 from commons.core.coord.Range import Range
|
|
41 from commons.core.stat.Stat import Stat
|
|
42
|
|
43
|
|
44 class Test_AlignedBioseqDB( unittest.TestCase ):
|
|
45
|
|
46 def setUp( self ):
|
|
47 self._i = AlignedBioseqDB()
|
|
48 self._uniqId = "%s_%s" % ( time.strftime("%Y%m%d%H%M%S") , os.getpid() )
|
|
49
|
|
50
|
|
51 def tearDown( self ):
|
|
52 self._i = None
|
|
53 self._uniqId = ""
|
|
54
|
|
55
|
|
56 def test_getLength(self):
|
|
57 iAlignedBioseqDB = AlignedBioseqDB()
|
|
58
|
|
59 iBioseq1 = Bioseq( "seq1", "AGCGGACGATGCAGCATGCGAATGACGAT" )
|
|
60 iAlignedBioseqDB.setData([iBioseq1])
|
|
61
|
|
62 expLenght = 29
|
|
63 obsLength = iAlignedBioseqDB.getLength()
|
|
64
|
|
65 self.assertEquals(expLenght, obsLength)
|
|
66
|
|
67
|
|
68 def test_getSeqLengthWithoutGaps( self ):
|
|
69 iAlignedBioseqDB = AlignedBioseqDB()
|
|
70 iAlignedBioseqDB.add( Bioseq( "seq3",
|
|
71 "AGCG-GACGATGCAGCAT--GCGAATGA--CGAT" ) )
|
|
72 expLenght = 29
|
|
73 obsLength = iAlignedBioseqDB.getSeqLengthWithoutGaps( "seq3" )
|
|
74
|
|
75 self.assertEquals(expLenght, obsLength)
|
|
76
|
|
77
|
|
78 def test_getListOccPerSite(self):
|
|
79 iBioseq1 = Bioseq( "seq1", "AGAAA")
|
|
80 iBioseq2 = Bioseq( "seq2", "TCAAG")
|
|
81 iBioseq3 = Bioseq( "seq3", "GGTAC")
|
|
82 iBioseq4 = Bioseq( "seq4", "CCTTA")
|
|
83
|
|
84 iAlignedBioseqDB = AlignedBioseqDB()
|
|
85 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
86
|
|
87 expList = [
|
|
88
|
|
89 {"A":1, "T":1, "G":1, "C":1},
|
|
90
|
|
91 {"G":2, "C":2},
|
|
92
|
|
93 {"A":2, "T":2 },
|
|
94
|
|
95 {"A":3, "T":1 },
|
|
96
|
|
97 {"A":2, "G":1, "C":1}
|
|
98 ]
|
|
99
|
|
100 obsList = iAlignedBioseqDB.getListOccPerSite()
|
|
101
|
|
102 self.assertEquals(expList, obsList)
|
|
103
|
|
104
|
|
105 def test_getListOccPerSite_with_none_sequence(self):
|
|
106 iBioseq1 = Bioseq( "seq1", "AGAAA")
|
|
107 iBioseq2 = Bioseq( "seq2", "TCAAG")
|
|
108 iBioseq3 = Bioseq( "seq3", "GGTAC")
|
|
109 iBioseq4 = Bioseq( "seq4", None)
|
|
110 iBioseq5 = Bioseq( "seq5", "CCTTA")
|
|
111
|
|
112 iAlignedBioseqDB = AlignedBioseqDB()
|
|
113 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4, iBioseq5])
|
|
114
|
|
115 expList = [
|
|
116 {"A":1, "T":1, "G":1},
|
|
117 {"G":2, "C":1},
|
|
118 {"A":2, "T":1 },
|
|
119 {"A":3},
|
|
120 {"A":1, "G":1, "C":1},
|
|
121 ]
|
|
122
|
|
123 obsList = iAlignedBioseqDB.getListOccPerSite()
|
|
124
|
|
125 self.assertEquals(expList, obsList)
|
|
126
|
|
127
|
|
128 def test_getListOccPerSite_on_three_sequence(self):
|
|
129 iBioseq1 = Bioseq( "seq1", "AGAAA")
|
|
130 iBioseq2 = Bioseq( "seq2", "TCAAG")
|
|
131 iBioseq3 = Bioseq( "seq3", "GGTAC")
|
|
132
|
|
133 iAlignedBioseqDB = AlignedBioseqDB()
|
|
134 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
135
|
|
136 expList = [
|
|
137 {"A":1, "T":1, "G":1},
|
|
138 {"G":2, "C":1},
|
|
139 {"A":2, "T":1 },
|
|
140 {"A":3},
|
|
141 {"A":1, "G":1, "C":1},
|
|
142 ]
|
|
143
|
|
144 obsList = iAlignedBioseqDB.getListOccPerSite()
|
|
145
|
|
146 self.assertEquals(expList, obsList)
|
|
147
|
|
148
|
|
149 def test_getConsensus_with_minNbNt_greater_than_nbInSeq(self):
|
|
150 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
151 iBioseq2 = Bioseq("seq2", "TGCA")
|
|
152 iBioseq3 = Bioseq("seq3", "TACT")
|
|
153
|
|
154 iAlignedBioseqDB = AlignedBioseqDB()
|
|
155 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
156
|
|
157 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT")
|
|
158 obsConsensus = iAlignedBioseqDB.getConsensus(5)
|
|
159
|
|
160 self.assertEquals(expConsensus, obsConsensus)
|
|
161
|
|
162
|
|
163 def test_getConsensus_with_minPropNt_greater_than_1(self):
|
|
164 isSysExitRaised = False
|
|
165
|
|
166 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
167 iBioseq2 = Bioseq("seq2", "TGCA")
|
|
168 iBioseq3 = Bioseq("seq3", "TACT")
|
|
169
|
|
170 iAlignedBioseqDB = AlignedBioseqDB()
|
|
171 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
172
|
|
173 expFileName = "expFileName"
|
|
174 expFileHandler = open(expFileName,"w")
|
|
175 expMessage = "ERROR: minPropNt=2.00 should be a proportion (below 1.0)\n"
|
|
176 expFileHandler.write(expMessage)
|
|
177 expFileHandler.close()
|
|
178
|
|
179 obsFileName = "obsFileName"
|
|
180 obsFileHandler = open(obsFileName,"w")
|
|
181
|
|
182 stdoutRef = sys.stdout
|
|
183 sys.stdout = obsFileHandler
|
|
184
|
|
185 try:
|
|
186 iAlignedBioseqDB.getConsensus(2,2)
|
|
187 except SystemExit:
|
|
188 isSysExitRaised = True
|
|
189
|
|
190 obsFileHandler.close()
|
|
191 expFileHandler.close()
|
|
192
|
|
193 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
194 self.assertTrue( isSysExitRaised )
|
|
195
|
|
196 sys.stdout = stdoutRef
|
|
197 os.remove ( obsFileName )
|
|
198 os.remove ( expFileName )
|
|
199 stdoutRef = sys.stdout
|
|
200
|
|
201
|
|
202 def test_getConsensus_with_AlignBioseqInstance_size_1(self):
|
|
203 isSysExitRaised = False
|
|
204
|
|
205 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
206
|
|
207 iAlignedBioseqDB = AlignedBioseqDB()
|
|
208 iAlignedBioseqDB.setData([iBioseq1])
|
|
209
|
|
210 expFileName = "expFileName"
|
|
211 expFileHandler = open(expFileName,"w")
|
|
212 expMessage = "ERROR: can't make a consensus with less than 2 sequences\n"
|
|
213 expFileHandler.write(expMessage)
|
|
214 expFileHandler.close()
|
|
215
|
|
216 obsFileName = "obsFileName"
|
|
217 obsFileHandler = open(obsFileName,"w")
|
|
218
|
|
219 stdoutRef = sys.stdout
|
|
220 sys.stdout = obsFileHandler
|
|
221
|
|
222 try:
|
|
223 iAlignedBioseqDB.getConsensus(4)
|
|
224 except SystemExit:
|
|
225 isSysExitRaised = True
|
|
226
|
|
227 obsFileHandler.close()
|
|
228 expFileHandler.close()
|
|
229
|
|
230 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
231 self.assertTrue( isSysExitRaised )
|
|
232
|
|
233 sys.stdout = stdoutRef
|
|
234 os.remove ( obsFileName )
|
|
235 os.remove ( expFileName )
|
|
236 stdoutRef = sys.stdout
|
|
237
|
|
238
|
|
239 def test_getConsensus_with_gap_assertion_on_warning_msg(self):
|
|
240 iBioseq1 = Bioseq("seq1", "A-GA-T")
|
|
241 iBioseq2 = Bioseq("seq2", "T-GC-A")
|
|
242 iBioseq3 = Bioseq("seq3", "T-AC-T")
|
|
243
|
|
244 iAlignedBioseqDB = AlignedBioseqDB()
|
|
245 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
246
|
|
247 expFileName = "expFileName"
|
|
248 expFileHandler = open(expFileName,"w")
|
|
249 expMessage = "WARNING: 2 sites were removed (33.33%)\n"
|
|
250 expFileHandler.write(expMessage)
|
|
251 expFileHandler.close()
|
|
252
|
|
253 obsFileName = "obsFileName"
|
|
254 obsFileHandler = open(obsFileName,"w")
|
|
255
|
|
256 stdoutRef = sys.stdout
|
|
257 sys.stdout = obsFileHandler
|
|
258
|
|
259 iAlignedBioseqDB.getConsensus(2)
|
|
260
|
|
261 obsFileHandler.close()
|
|
262 expFileHandler.close()
|
|
263
|
|
264 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
265
|
|
266 sys.stdout = stdoutRef
|
|
267 os.remove ( obsFileName )
|
|
268 os.remove ( expFileName )
|
|
269 stdoutRef = sys.stdout
|
|
270
|
|
271
|
|
272 def test_getConsensus_with_gap_assertion_on_result(self):
|
|
273 iBioseq1 = Bioseq("seq1", "A-GA-T")
|
|
274 iBioseq2 = Bioseq("seq2", "T-GC-A")
|
|
275 iBioseq3 = Bioseq("seq3", "T-AC-T")
|
|
276
|
|
277 iAlignedBioseqDB = AlignedBioseqDB()
|
|
278 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
279
|
|
280 expConsensus = Bioseq("consensus= length=4 nbAlign=3","TGCT")
|
|
281 obsConsensus = iAlignedBioseqDB.getConsensus(2)
|
|
282
|
|
283 self.assertEquals(expConsensus, obsConsensus)
|
|
284
|
|
285
|
|
286 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_result(self):
|
|
287 iBioseq1 = Bioseq("seq1", "A--A-T")
|
|
288 iBioseq2 = Bioseq("seq2", "----A-")
|
|
289 iBioseq3 = Bioseq("seq3", "--A---")
|
|
290
|
|
291 iAlignedBioseqDB = AlignedBioseqDB()
|
|
292 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
293
|
|
294 expConsensus = None
|
|
295 obsConsensus = iAlignedBioseqDB.getConsensus(2)
|
|
296 self.assertEquals(expConsensus, obsConsensus)
|
|
297
|
|
298
|
|
299 def test_getConsensus_with_gaps_and_no_consensus_built_assertion_on_warning_messages(self):
|
|
300 iBioseq1 = Bioseq("seq1", "A--A-T")
|
|
301 iBioseq2 = Bioseq("seq2", "----A-")
|
|
302 iBioseq3 = Bioseq("seq3", "--A---")
|
|
303
|
|
304 iAlignedBioseqDB = AlignedBioseqDB()
|
|
305 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3])
|
|
306
|
|
307 expFileName = "expFileName"
|
|
308 expFileHandler = open(expFileName,"w")
|
|
309 expMessage1 = "WARNING: 1 site was removed (16.67%)\n"
|
|
310 expMessage2 = "WARNING: no consensus can be built (no sequence left)\n"
|
|
311 expFileHandler.write(expMessage1)
|
|
312 expFileHandler.write(expMessage2)
|
|
313 expFileHandler.close()
|
|
314
|
|
315 obsFileName = "obsFileName"
|
|
316 obsFileHandler = open(obsFileName,"w")
|
|
317
|
|
318 stdoutRef = sys.stdout
|
|
319 sys.stdout = obsFileHandler
|
|
320
|
|
321 iAlignedBioseqDB.getConsensus(2)
|
|
322
|
|
323 obsFileHandler.close()
|
|
324 expFileHandler.close()
|
|
325
|
|
326 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
327
|
|
328 sys.stdout = stdoutRef
|
|
329 os.remove ( obsFileName )
|
|
330 os.remove ( expFileName )
|
|
331 stdoutRef = sys.stdout
|
|
332
|
|
333
|
|
334 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_result(self):
|
|
335 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
336 iBioseq2 = Bioseq("seq2", "CCCA")
|
|
337 iBioseq3 = Bioseq("seq3", "TTCT")
|
|
338 iBioseq4 = Bioseq("seq4", "TTAT")
|
|
339
|
|
340 iAlignedBioseqDB = AlignedBioseqDB()
|
|
341 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
342
|
|
343 expConsensus = None
|
|
344 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.75)
|
|
345
|
|
346 self.assertEquals(expConsensus, obsConsensus)
|
|
347
|
|
348
|
|
349 def test_getConsensus_with_unacceptable_N_proportion_assertion_on_warning_msg(self):
|
|
350 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
351 iBioseq2 = Bioseq("seq2", "CCCA")
|
|
352 iBioseq3 = Bioseq("seq3", "TTCT")
|
|
353 iBioseq4 = Bioseq("seq4", "TTAT")
|
|
354
|
|
355 iAlignedBioseqDB = AlignedBioseqDB()
|
|
356 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
357
|
|
358 expFileName = "expFileName"
|
|
359 expFileHandler = open(expFileName,"w")
|
|
360 expMessage = "WARNING: no consensus can be built (75% of N's >= 40%)\n"
|
|
361 expFileHandler.write(expMessage)
|
|
362 expFileHandler.close()
|
|
363
|
|
364 obsFileName = "obsFileName"
|
|
365 obsFileHandler = open(obsFileName,"w")
|
|
366
|
|
367 stdoutRef = sys.stdout
|
|
368 sys.stdout = obsFileHandler
|
|
369
|
|
370 iAlignedBioseqDB.getConsensus(2,0.75)
|
|
371
|
|
372 obsFileHandler.close()
|
|
373 expFileHandler.close()
|
|
374
|
|
375 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
376
|
|
377 sys.stdout = stdoutRef
|
|
378 os.remove ( obsFileName )
|
|
379 os.remove ( expFileName )
|
|
380 stdoutRef = sys.stdout
|
|
381
|
|
382
|
|
383 def test_getConsensus_with_acceptable_N_proportion_assertion_on_warning_msg(self):
|
|
384 iBioseq1 = Bioseq("seq1", "AGATAA")
|
|
385 iBioseq2 = Bioseq("seq2", "CTCAAA")
|
|
386 iBioseq3 = Bioseq("seq3", "TTCTAA")
|
|
387 iBioseq4 = Bioseq("seq4", "GTATCC")
|
|
388
|
|
389 iAlignedBioseqDB = AlignedBioseqDB()
|
|
390 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
391
|
|
392 expFileName = "expFileName"
|
|
393 expFileHandler = open(expFileName,"w")
|
|
394 expMessage = "WARNING: 33% of N's\n"
|
|
395 expFileHandler.write(expMessage)
|
|
396 expFileHandler.close()
|
|
397
|
|
398 obsFileName = "obsFileName"
|
|
399 obsFileHandler = open(obsFileName,"w")
|
|
400
|
|
401 stdoutRef = sys.stdout
|
|
402 sys.stdout = obsFileHandler
|
|
403
|
|
404 iAlignedBioseqDB.getConsensus(2,0.6)
|
|
405
|
|
406 obsFileHandler.close()
|
|
407 expFileHandler.close()
|
|
408
|
|
409 self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
|
|
410
|
|
411 sys.stdout = stdoutRef
|
|
412 os.remove ( obsFileName )
|
|
413 os.remove ( expFileName )
|
|
414 stdoutRef = sys.stdout
|
|
415
|
|
416
|
|
417 def test_getConsensus_with_acceptable_N_proportion_assertion_on_result(self):
|
|
418 iBioseq1 = Bioseq("seq1", "AGATAA")
|
|
419 iBioseq2 = Bioseq("seq2", "CTCAAA")
|
|
420 iBioseq3 = Bioseq("seq3", "TTCTAA")
|
|
421 iBioseq4 = Bioseq("seq4", "GTATCC")
|
|
422
|
|
423 iAlignedBioseqDB = AlignedBioseqDB()
|
|
424 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
425
|
|
426 expConsensus = Bioseq("consensus= length=6 nbAlign=4","NTNTAA")
|
|
427 obsConsensus = iAlignedBioseqDB.getConsensus(2,0.6)
|
|
428
|
|
429 self.assertEquals(expConsensus, obsConsensus)
|
|
430
|
|
431
|
|
432 def test_getConsensus_with_chimeric_seq_in_alignment(self):
|
|
433 iBioseq1 = Bioseq("seq1", "AGAGTTGTAA")
|
|
434 iBioseq2 = Bioseq("seq2", "ATC----AAA")
|
|
435 iBioseq3 = Bioseq("seq3", "ATC----TAA")
|
|
436 iBioseq4 = Bioseq("seq4", "GTC----TAA")
|
|
437
|
|
438 iAlignedBioseqDB = AlignedBioseqDB()
|
|
439 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
440
|
|
441 expConsensus = Bioseq("consensus= length=6 nbAlign=4","ATCTAA")
|
|
442 obsConsensus = iAlignedBioseqDB.getConsensus(2)
|
|
443
|
|
444 self.assertEquals(expConsensus, obsConsensus)
|
|
445
|
|
446
|
|
447 def test_getConsensus_with_SATannot_data(self):
|
|
448 iBioseq1 = Bioseq("MbS69Gr8Cl511 chunk21 {Fragment} 165740..165916", "--TTTAGCCGAAGTCCATATGAGTCTTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGATTGCGGTTTAAGTTCTTATACTCAATCATACACATGACATCAAGTCATATTCGAATCCAAAACTCTAAGCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGATG")
|
|
449 iBioseq2 = Bioseq("MbS68Gr8Cl511 chunk21 {Fragment} 165916..166092", "GGTCTAGCCGAAGTCCATATGAGTCTTTGTCTTTGTATCTTCTAACAAGAAAACACTACTTAGGCTTTTAGGATAAGGTTGCAGTTTAAGTTTTTATACTAAATCATACACATCACATCAAGTCATATTCGACTCCCAAACACTAACCAAGCTTCTT--TGCTTCTCAAC-GCTTTGATG")
|
|
450 iBioseq3 = Bioseq("MbS67Gr8Cl511 chunk21 {Fragment} 166093..166269", "-GTTTAGCCGAAGTCCATATGAGTCGTTGTGTTTGTATCTTCTAACAAGGAAACACTACTTACGCTTTTAGGATAAGATTGTTGTTTAAGTTCTTATACTTAATCATACACATGACATAAAGTCATATTCGACTCCAAAACACTAATCAAGCTTCTTCTTGCTTCTCAAA-GCTTTGTT-")
|
|
451 iBioseq4 = Bioseq("MbS66Gr8Cl511 chunk21 {Fragment} 173353..173529", "--TTTAGCAAAATTCTATATGAGTCTTTATCTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGGTTGCGGGTTAAGTTCTTATACTCAATCATACACATGATATCAAGTCATATTCGACTCCAAAACACTAACCAAGCTTCTTCTTGCTTCTTAAAAGCTTTGAA-")
|
|
452
|
|
453 iAlignedBioseqDB = AlignedBioseqDB()
|
|
454 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
455
|
|
456 expConsensus = Bioseq("consensus= length=178 nbAlign=4 pile=511 pyramid=8", "GTTTAGCCGAAGTCCATATGAGTCTTTGTNTTTGTATCTTCTAACAAGGAAACACTACTTAGGCTTTTAGGATAAGNTTGCNGTTTAAGTTCTTATACTNAATCATACACATGACATCAAGTCATATTCGACTCCAAAACACTAANCAAGCTTCTTCTTGCTTCTCAAAGCTTTGATG")
|
|
457 obsConsensus = iAlignedBioseqDB.getConsensus(2, 0.6, isHeaderSAtannot=True)
|
|
458
|
|
459 self.assertEquals(expConsensus, obsConsensus)
|
|
460
|
|
461
|
|
462 def test_getEntropy_equal_zero(self):
|
|
463 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
464 iBioseq2 = Bioseq("seq2", "AGAT")
|
|
465 iBioseq3 = Bioseq("seq3", "AGAT")
|
|
466 iBioseq4 = Bioseq("seq4", "AGAT")
|
|
467
|
|
468 iAlignedBioseqDB = AlignedBioseqDB()
|
|
469 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
470
|
|
471 expStats = Stat()
|
|
472 expStats._min = 0
|
|
473 expStats._max = 0
|
|
474 expStats._sum = 0
|
|
475 expStats._sumOfSquares = 0
|
|
476 expStats._n = 4
|
|
477 expStats._lValues = [0, 0, 0, 0]
|
|
478
|
|
479 obsStats = iAlignedBioseqDB.getEntropy()
|
|
480
|
|
481 self.assertEquals(expStats, obsStats)
|
|
482
|
|
483
|
|
484 def test_getEntropy_different_nucl(self):
|
|
485 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
486 iBioseq2 = Bioseq("seq2", "CTCA")
|
|
487 iBioseq3 = Bioseq("seq3", "TTCT")
|
|
488 iBioseq4 = Bioseq("seq4", "GTAT")
|
|
489
|
|
490 iAlignedBioseqDB = AlignedBioseqDB()
|
|
491 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
492
|
|
493 expStats = Stat()
|
|
494 expStats._min = 0.81127812445913283
|
|
495 expStats._max = 2.0
|
|
496 expStats._sum = 4.62255624892
|
|
497 expStats._sumOfSquares = 6.31634439045
|
|
498 expStats._n = 4
|
|
499 expStats._lValues = [2.0, 0.81127812445913283, 1.0, 0.81127812445913283]
|
|
500
|
|
501 obsStats = iAlignedBioseqDB.getEntropy()
|
|
502
|
|
503 self.assertEquals(expStats, obsStats)
|
|
504
|
|
505
|
|
506 def test_getEntropy_different_nucl_with_N(self):
|
|
507 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
508 iBioseq2 = Bioseq("seq2", "CNCA")
|
|
509 iBioseq3 = Bioseq("seq3", "TTNT")
|
|
510 iBioseq4 = Bioseq("seq4", "GTNT")
|
|
511
|
|
512 iAlignedBioseqDB = AlignedBioseqDB()
|
|
513 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
514
|
|
515 expStats = Stat()
|
|
516 expStats._min = 0.811278124459
|
|
517 expStats._max = 2
|
|
518 expStats._sum = 6.11387090595
|
|
519 expStats._sumOfSquares = 10.1629200457
|
|
520 expStats._n = 4
|
|
521 expStats._lValues = [2, 1.4913146570363986, 1.8112781244591329, 0.81127812445913283]
|
|
522
|
|
523 obsStats = iAlignedBioseqDB.getEntropy()
|
|
524
|
|
525 self.assertEquals(expStats, obsStats)
|
|
526
|
|
527
|
|
528 def test_getEntropy_different_nucl_with_gap(self):
|
|
529 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
530 iBioseq2 = Bioseq("seq2", "C-CA")
|
|
531 iBioseq3 = Bioseq("seq3", "T--T")
|
|
532 iBioseq4 = Bioseq("seq4", "-TNT")
|
|
533
|
|
534 iAlignedBioseqDB = AlignedBioseqDB()
|
|
535 iAlignedBioseqDB.setData([iBioseq1, iBioseq2, iBioseq3, iBioseq4])
|
|
536
|
|
537 expStats = Stat()
|
|
538 expStats._min = 0.811278124459
|
|
539 expStats._max = 1.65002242165
|
|
540 expStats._sum = 5.04626304683
|
|
541 expStats._sumOfSquares = 6.89285231586
|
|
542 expStats._n = 4
|
|
543 expStats._lValues = [1.5849625007211561, 1.0, 1.6500224216483541, 0.81127812445913283]
|
|
544
|
|
545 obsStats = iAlignedBioseqDB.getEntropy()
|
|
546
|
|
547 self.assertEquals(expStats, obsStats)
|
|
548
|
|
549
|
|
550 def test_saveAsBinaryMatrix( self ):
|
|
551 iBioseq1 = Bioseq("seq1", "AGAT")
|
|
552 iBioseq2 = Bioseq("seq2", "C-CA")
|
|
553 iBioseq3 = Bioseq("seq3", "T--T")
|
|
554 iBioseq4 = Bioseq("seq4", "-TNT")
|
|
555
|
|
556 self._i.setData( [ iBioseq1, iBioseq2, iBioseq3, iBioseq4 ] )
|
|
557
|
|
558 expFile = "dummyExpFile_%s" % ( self._uniqId )
|
|
559 expFileHandler = open( expFile, "w" )
|
|
560 expFileHandler.write( "seq1\t1\t1\t1\t1\n" )
|
|
561 expFileHandler.write( "seq2\t1\t0\t1\t1\n" )
|
|
562 expFileHandler.write( "seq3\t1\t0\t0\t1\n" )
|
|
563 expFileHandler.write( "seq4\t0\t1\t1\t1\n" )
|
|
564 expFileHandler.close()
|
|
565
|
|
566 obsFile = "dummyObsFile_%s" % ( self._uniqId )
|
|
567 self._i.saveAsBinaryMatrix( obsFile )
|
|
568
|
|
569 self.assertTrue( FileUtils.are2FilesIdentical( expFile, obsFile ) )
|
|
570 for f in [ expFile, obsFile ]:
|
|
571 os.remove( f )
|
|
572
|
|
573
|
|
574 def test_getAlignList( self ):
|
|
575 iBioseq1 = Bioseq( "seq1", "AGAAT" )
|
|
576 iBioseq2 = Bioseq( "seq2", "-G-AC" )
|
|
577 self._i.setData( [ iBioseq1, iBioseq2 ] )
|
|
578
|
|
579 iAlign1 = Align( Range( "seq1", 2, 2 ),
|
|
580 Range( "seq2", 1, 1 ),
|
|
581 0, 1, 100 )
|
|
582 iAlign2 = Align( Range( "seq1", 4, 5 ),
|
|
583 Range( "seq2", 2, 3 ),
|
|
584 0, 1, 50 )
|
|
585 lExp = [ iAlign1, iAlign2 ]
|
|
586
|
|
587 lObs = self._i.getAlignList( "seq1", "seq2" )
|
|
588 self.assertEquals( lExp, lObs )
|
|
589
|
|
590
|
|
591 def test_removeGaps(self):
|
|
592 iBioseq1 = Bioseq( "seq1", "AGAAT-" )
|
|
593 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
|
|
594 self._i.setData( [ iBioseq1, iBioseq2 ] )
|
|
595
|
|
596 exp = AlignedBioseqDB()
|
|
597 exp.setData( [ Bioseq( "seq1", "AGAAT" ),
|
|
598 Bioseq( "seq2", "AGACG" ) ] )
|
|
599
|
|
600 self._i.removeGaps()
|
|
601
|
|
602 self.assertEquals(exp, self._i)
|
|
603
|
|
604 def test_computeMeanPcentIdentity_between_two_sequences_50pcent(self):
|
|
605 iBioseq1 = Bioseq( "seq1", "AGAAT-" )
|
|
606 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
|
|
607 self._i.setData( [ iBioseq1, iBioseq2 ] )
|
|
608 expIdentity = 50.0
|
|
609 obsIdentity = self._i.computeMeanPcentIdentity()
|
|
610 self.assertEquals(expIdentity, obsIdentity)
|
|
611
|
|
612 def test_computeMeanPcentIdentity_between_two_sequences_57pcent(self):
|
|
613 iBioseq1 = Bioseq( "seq1", "AGAAT-T" )
|
|
614 iBioseq2 = Bioseq( "seq2", "AG-ACGT" )
|
|
615 self._i.setData( [ iBioseq1, iBioseq2 ] )
|
|
616 expIdentity = 57.0
|
|
617 obsIdentity = self._i.computeMeanPcentIdentity()
|
|
618 self.assertEquals(expIdentity, obsIdentity)
|
|
619
|
|
620 def test_compueNeamPcentIdentity_between_two_sequences_gaps_on_same_position(self):
|
|
621 iBioseq1 = Bioseq( "seq1", "AGAAT-T" )
|
|
622 iBioseq2 = Bioseq( "seq2", "AG-AC-T" )
|
|
623 self._i.setData( [ iBioseq1, iBioseq2 ] )
|
|
624 expIdentity = 57.0
|
|
625 obsIdentity = self._i.computeMeanPcentIdentity()
|
|
626 self.assertEquals(expIdentity, obsIdentity)
|
|
627
|
|
628 def test_computeMeanPcentIdentity_between_three_sequences_50_pcent(self):
|
|
629 iBioseqRef = Bioseq( "seqRef", "AGAAT-" )
|
|
630 iBioseq1 = Bioseq( "seq1", "AG-ACG" )
|
|
631 iBioseq2 = Bioseq( "seq2", "AG-ACG" )
|
|
632
|
|
633 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] )
|
|
634
|
|
635 expIdentity = 50.0
|
|
636 obsIdentity = self._i.computeMeanPcentIdentity()
|
|
637 self.assertEquals(expIdentity, obsIdentity)
|
|
638
|
|
639 def test_computeMeanPcentIdentity_between_three_sequences_42_pcent(self):
|
|
640 iBioseqRef = Bioseq( "seqRef", "AGAAT-" )
|
|
641 iBioseq1 = Bioseq( "seq1", "AG-ACG" )
|
|
642 iBioseq2 = Bioseq( "seq2", "AG-CC-" )
|
|
643
|
|
644 self._i.setData( [ iBioseqRef, iBioseq1, iBioseq2 ] )
|
|
645
|
|
646 expIdentity = 42.0
|
|
647 obsIdentity = self._i.computeMeanPcentIdentity()
|
|
648 self.assertEquals(expIdentity, obsIdentity)
|
|
649
|
|
650 #TODO: test with tthis data:
|
|
651 #>BlastclustCluster2Mb1_chunk7 (dbseq-nr 1) [101523,104351]
|
|
652 #------------------------------------------------------------
|
|
653 #------------------------------------------------------------
|
|
654 #------------------------------------------------------------
|
|
655 #------------------------------------------------------------
|
|
656 #------------------------------------------------------------
|
|
657 #------------------------------------------------------------
|
|
658 #-------------------------------TAAATCCAACACTGAGAAGAATTATTTAA
|
|
659 #AGAAGGTTTTTATTTAACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATG
|
|
660 #TAAATTGGATTGAGGAGAAGCCTCAGTATTTTAACAATATTTGTATTTCGGGAGGGCCTC
|
|
661 #GCTTTTGGATTCTTAAGATTAGGAAAAGATTCAAAGAGAGCAGTCAAATCTGCTTATGTC
|
|
662 #AGGCTTTAGGATTTTTACAAGAACGGCTAAGTGCCGCTGCCAAAGAATTCTTTATTAGAA
|
|
663 #ACGGACGCTGCGCAGCTGGGATACATTATGGAAATAACTTAAGTGCATTACTGTTTTCTT
|
|
664 #TGAAATAATTGAAGTGGCCGATTTGGACCACCCAAATACGTACCTCCCCTTCCCTTAAAG
|
|
665 #TGAAACCTATACTTGGGCTGGGATTGATAGATATTGTATGGAAAATGGAGTTTGTTCTAT
|
|
666 #TTCTATTTGTTGTGGTATGTTTTGATTTTTTCTTATTTTAAGTTTTGCATACAGCAACAT
|
|
667 #AA--ATGGCATAA--ATTATACATA-TCTTAAATATAAGTACAATAAAATTAGTCA--AT
|
|
668 #TATTACAATGGTCATTAGTATG-TAAAGTGTAAT-ATTGTTCTC-TGA-AATCATCTTAA
|
|
669 #CATTGGCGTTGTGTTTTACAATTTTTATTTTTGTTATATTAAGTGGTGTGTCAAGTGGTG
|
|
670 #TCAAATCTATTTCTGGTTTTAACATATTTTCACTTAAAATTATACTATTGATTTCTATTT
|
|
671 #TGCATTGCATTACTCCTTTCATTTTGTTTTCTTCTATTTTTATATTATTAATTGAATTTG
|
|
672 #GGTAATTTTGGTTATGAATGGTTTGGGTTAAGTTCCAGTAATTTGTAATAATTTAAGTTT
|
|
673 #AGTAATTTTGGGTTAAACAAGTCAAGTCTGGAAAGCTGGCATAGCCATTCTAAATCTTCT
|
|
674 #ACGTATTCCGTAAACTGCATTAGCTAGAACAAAAGTGGTTATTTTCATAGTTCTGTTTTT
|
|
675 #TTGGTTTCTTAAGTAAT--TAAT--TTGTATGTTTGATCAAAAAGGGTAACTATTATTTT
|
|
676 #TGTAAT--TAATAACTGATTGGACATTTTTCAACAATTTTCTGTCAATTAGCAT----AT
|
|
677 #CGTAATTATTAGAA-AATACTATT--TCTGGGTAA-CATAATCAAGTCGTTCAAGGTAAT
|
|
678 #AGGGCCATT-TAATGTTAA-AACTTCACTTCTACTATTTTGTATGGGAAGA--CAAAATA
|
|
679 #TATTTTCAT--TGATCATATTAATTGTGGAAGCCATTTGTATGTGTACCCTTTGTATACT
|
|
680 #ATTTTTAAGCATTCGTGTTGTCCGGGGCTGTTGACCGAAAATTTTCGTTTAGTTCAGAAT
|
|
681 #TATTTTGATTTTGTTCATTATTAATGTGTACGGGTTGGACCTGTTGTTGCCCTTCATTAA
|
|
682 #TGAACCCATGATTTTGATATGGATTGTATTGATTTTGGTCATAATACTGTTGTTCGTAGG
|
|
683 #TTGGGTCTGAAATAATTAGGTACGTGCCACATTGGTGGGTTGTAAGTCGTCTGCTCTGAT
|
|
684 #AGCTAGGTCTAAATGGTTGAATGTATTGATTAAAATTAGGCTGGAAGGGTTGGGAATATT
|
|
685 #GTGGGTAACTTGGGCGAGTTTGAACGTTTGTATTGAATTTATGTGCTTTCGAATTCTGAT
|
|
686 #TAAAATTTGAATTTTTATTACGGTATTCTGGGTTTATAAAATTCAAAATTAATGAGTTTT
|
|
687 #TCATAAATTCCCTCATTTTGTGCAATGGTTATTAAGGATCTTAAGTCTGTAATA-TCGTG
|
|
688 #ATGAG-ATAA-AATAGTAAATATAGTAG-AGTCTTTA--ATAGCCTGAATAAAAATAAGA
|
|
689 #AAATCCGATTGGTTACCTTCCAGGTGTAGTTTCGAAATTAATAATTGACGTCGTCTTTCC
|
|
690 #CCTTCTTCGGAGAATGTTTTTAGGTTTCCTCTGTATGGTGTCTCCCTGAAGCTCTCCAGA
|
|
691 #AGTTTGTAGTTTGGTGTTTGAGTTTTAAATTCCGCGATGAGTCTTGCTTTGAGGGTAGGC
|
|
692 #CAATGCTCGGAAGTCCCAAAGATCGTATTATCCGTCCAAGTTACTTTTGATGGCTCCCAG
|
|
693 #TAGAATCCTCTGTTTACGAAAATTACGTAAATCCACTCTGCTGACGACGGTGTGAAGTGT
|
|
694 #TTCTGGATCACCCTTAAATGTCATAATGTCTTTAAGCTGCCGACGGGCTTCGGCAAGGTT
|
|
695 #GTTGTCTCTTAGCGCAACGATTTAAGGTGCGGCAATAATTATAATAATTGGTTGAGACAT
|
|
696 #TTTTATTGTAAAATTTTAAATTTTGTGTTTTATTGTTTTATTGTTTCGAATTCAATGCTA
|
|
697 #TTATTTATTTTTTTTTTTGTTTTCTAGTTTCACTTTTTTTTTTATTGTATTGCTTTATTG
|
|
698 #TTCTTATTTTATTTTTATATGTTGGTTTTAAAACTTAGTTGCCTTTGGACTTAATGTTTT
|
|
699 #TGTTTCGTATTTCACTTCCACTTTAAATTGGATAACAGAATTGGAATTAAAATCCAAGTT
|
|
700 #GAAGAGTTTCCACGAATTTATTTGGGAAATGTTTCGAGCACTGGAATCCAGTGACTGGAT
|
|
701 #TATAAAATTTAACTTATTTCC-ACTCGAAGGTTCTTTTTT--CGG--------ATACTTT
|
|
702 #TTGTATCAGT--TGACTAAGAGCAACACTGAGAAGAATTATTTAAAGAAGGTTTTTATTT
|
|
703 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAA-TTCA-AATGTAAATTGGATTGAGG
|
|
704 #AGAAGCCTCAGTATT--TCAACAATATTTGTATT--TCGGGAGGGCCTCGCTCTTGGAT-
|
|
705 #-TCTTAAGATTAGGAAAAAATTCA-AAGAGAGCAGTCA-AATC-TGCTTATGTCAGGCTT
|
|
706 #TAGGATTTTTACAAGAAGGGC-TAAGTGCCGCTGAAACGGATGC----------------
|
|
707 #------------------------------------------------------------
|
|
708 #------------------------------------------------------------
|
|
709 #------------
|
|
710 #>BlastclustCluster2Mb2_chunk7 (dbseq-nr 1) [99136,100579]
|
|
711 #GTAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA
|
|
712 #ATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATA
|
|
713 #ATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATC
|
|
714 #ATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATA
|
|
715 #ATCATAATAATCATAATAATCATAATAATCATAATAATAATAATAATCATAATCATAATC
|
|
716 #ATAATAAGCGATAAAAAAATTAAAAAATAAAAATTAAAACCCACTGCAATCACGTTGGAC
|
|
717 #GGCGAGTCACAGACGTCAGAATAGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCA
|
|
718 #AGAAGGTTTTTATTGAACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATC
|
|
719 #ATAAT---AATCATAATAATCATAATAATCATAATAATCATAATAAT-------------
|
|
720 #------------------------------------------------------------
|
|
721 #------------------------------------------------------------
|
|
722 #------------------------------------------------------------
|
|
723 #------------------------------------------------------------
|
|
724 #------------------------------------------------------------
|
|
725 #-----------------------------------------------CATA-ATAATCAT
|
|
726 #AATAAT--CATAATAATCATA-ATAATCATAATAATCATAATAATCATAATAATCATAAT
|
|
727 #AATCATAATAATCATAATAATCATAA----TAATCATAATAATCATAATAATCATAATAA
|
|
728 #------------------------------------------------------------
|
|
729 #------------------------------------------------------------
|
|
730 #------------------------------------------------------------
|
|
731 #------------------------------------------------------------
|
|
732 #------------------------------------------------------------
|
|
733 #------------------------------------------------------------
|
|
734 #------TCATAA-TAATCATAATAATCGTAA---TAATCATAA----TAATCATAATAAT
|
|
735 #CATAATAATCATAA-TAAT----CAT-----AATAATCAT-----AATAATCATAATAAT
|
|
736 #CATAATAATCATAATAATCATAATAATCATAATAATCATAAT-AA-TCAT--AA--TAAT
|
|
737 #-----CATAATAATCATAATAA--TCA----TAATAATC---AT---AATAATCATAATA
|
|
738 #-AT---CATAATAATCATAATAATC-----------------------------------
|
|
739 #------------------------------------------------------------
|
|
740 #------------------------------------------------------------
|
|
741 #------------------------------------------------------------
|
|
742 #------------------------------------------------------------
|
|
743 #------------------------------------------------------------
|
|
744 #------------------------------------------------------------
|
|
745 #-----------------------------------ATAATAATCATAAT-AATCA-----
|
|
746 #TAATAA------TCATAAT----AATCATAAT-AATCATAATAA-TCA-TAATAATCATA
|
|
747 #ATAATCATAATAATCATAATAATAATAATAATCATAATCATAATCATAATAAGCATAAAA
|
|
748 #AAAT--------------------------------------------------------
|
|
749 #------------------------------------------------------------
|
|
750 #------------------------------------------------------------
|
|
751 #------------------------------------------------------------
|
|
752 #------------------------------------------------------------
|
|
753 #------------------------------------------------------------
|
|
754 #------------------------------------------------------------
|
|
755 #------------------------------------------------------------
|
|
756 #------------------------------------------------------------
|
|
757 #------------------------------------------------------------
|
|
758 #------------------------------------------------------------
|
|
759 #------------------------------------------------------------
|
|
760 #TAAAAAATAAAAATTAAAACCCACTGCAA---TCACGTTGGACGGCGAGTCACAGACGTC
|
|
761 #A-GAAT-AGTGGTGCGTAAATCCAACGCCGAGAAGAATTACTTCAAGAAGGTTTTTATTG
|
|
762 #AACTTCTTTATTCGGATATCAGTTTAAGACTAAAAATTAATAATCATAAT---AATCATA
|
|
763 #ATAA---TCA-TAATAATCAT-AATAATCATAATAATCATAA-----TAA-TCATA-ATA
|
|
764 #ATCATAATAATCATAATAA--TCATAATA-ATCA-TAATAATCATAATAATCATAATCAT
|
|
765 #CATAATAATCATAATAAT--CATAA-T-------AATC--ATAATAATCATAATAATCAT
|
|
766 #AATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAAT
|
|
767 #CATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAATAATCATAAT
|
|
768 #AATCATAATAAT
|
|
769
|
|
770 test_suite = unittest.TestSuite()
|
|
771 test_suite.addTest( unittest.makeSuite( Test_AlignedBioseqDB ) )
|
|
772 if __name__ == "__main__":
|
|
773 unittest.TextTestRunner(verbosity=2).run( test_suite )
|