comparison commons/core/seq/test/Test_FastaUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children 94ab73e8a190
comparison
equal deleted inserted replaced
5:ea3082881bf8 6:769e306b7933
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 from commons.core.seq.FastaUtils import FastaUtils
33 from commons.core.seq.test.Utils_for_T_FastaUtils import Utils_for_T_FastaUtils
34 from commons.core.utils.FileUtils import FileUtils
35 import glob
36 import os
37 import shutil
38 import unittest
39
40
41 class Test_FastaUtils( unittest.TestCase ):
42
43
def test_dbSize_for_empty_file(self):
    """FastaUtils.dbSize() must return 0 for the 'empty file' fixture."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)
    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Remove the fixture even when dbSize() raises: sibling tests
        # reuse the same file name.
        os.remove(fileName)

    self.assertEqual(0, obsNb)
53
54
def test_dbSize_one_sequence(self):
    """FastaUtils.dbSize() must return 1 for the 'one sequence' fixture."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName)
    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Remove the fixture even when dbSize() raises: sibling tests
        # reuse the same file name.
        os.remove(fileName)

    self.assertEqual(1, obsNb)
64
65
def test_dbSize_four_sequences(self):
    """FastaUtils.dbSize() must return 4 for the 'four sequences' fixture."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)
    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Remove the fixture even when dbSize() raises: sibling tests
        # reuse the same file name.
        os.remove(fileName)

    self.assertEqual(4, obsNb)
75
76
def test_dbChunks(self):
    """Compare the four files written by FastaUtils.dbChunks() with fixtures.

    The observed files are named after the input file with the suffixes
    '_chunks.fa', '_chunks.map', '_cut' and '.Nstretch.map'; each one is
    compared with an expected file built by Utils_for_T_FastaUtils.
    """
    inFileName = "dummyBigSeqFastaFile.fa"
    expChunksFileName = 'exp' + inFileName + '_chunks.fa'
    expChunksMapFileName = 'exp' + inFileName + '_chunks.map'
    expCutFileName = 'exp' + inFileName + '_cut'
    expNStretchFileName = 'exp' + inFileName + '.Nstretch.map'
    obsChunksFileName = inFileName + '_chunks.fa'
    obsChunksMapFileName = inFileName + '_chunks.map'
    obsCutFileName = inFileName + '_cut'
    obsNStretchFileName = inFileName + '.Nstretch.map'

    try:
        Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_Chunks(expChunksFileName)
        Utils_for_T_FastaUtils._createMapFile_of_Chunks(expChunksMapFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_cut(expCutFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_Nstretch(expNStretchFileName)

        FastaUtils.dbChunks(inFileName, '60', '10', '11', '', False, 0)

        self.assertTrue(FileUtils.are2FilesIdentical(expChunksFileName, obsChunksFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expChunksMapFileName, obsChunksMapFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expCutFileName, obsCutFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expNStretchFileName, obsNStretchFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName,
                     expChunksFileName, expChunksMapFileName,
                     expCutFileName, expNStretchFileName,
                     obsChunksFileName, obsChunksMapFileName,
                     obsCutFileName, obsNStretchFileName):
            if os.path.exists(name):
                os.remove(name)
110
111
def test_dbChunks_with_clean_and_prefix(self):
    """Check FastaUtils.dbChunks() when given the 'outFile_chunks' prefix.

    Only 'outFile_chunks.fa' and 'outFile_chunks.map' are compared with
    the expected fixtures.
    """
    inFileName = "dummyBigSeqFastaFile.fa"
    expChunksFileName = 'exp' + inFileName + '_chunks.fa'
    expChunksMapFileName = 'exp' + inFileName + '_chunks.map'
    obsChunksFileName = "outFile_chunks.fa"
    obsChunksMapFileName = "outFile_chunks.map"

    try:
        Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_Chunks(expChunksFileName)
        Utils_for_T_FastaUtils._createMapFile_of_Chunks(expChunksMapFileName)

        FastaUtils.dbChunks(inFileName, '60', '10', '11', 'outFile_chunks', True, 0)

        self.assertTrue(FileUtils.are2FilesIdentical(expChunksFileName, obsChunksFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expChunksMapFileName, obsChunksMapFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName, expChunksFileName, expChunksMapFileName,
                     obsChunksFileName, obsChunksMapFileName):
            if os.path.exists(name):
                os.remove(name)
133
134
def test_dbCumLength_with_empty_file(self):
    """FastaUtils.dbCumLength() must return 0 for the 'empty file' fixture."""
    inFileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFileName)
    try:
        # dbCumLength() takes an open file handle, not a file name.
        with open(inFileName, "r") as inFileHandler:
            obsCumulLength = FastaUtils.dbCumLength(inFileHandler)
    finally:
        os.remove(inFileName)

    self.assertEqual(0, obsCumulLength)
147
def test_dbCumLength_four_sequences(self):
    """FastaUtils.dbCumLength() must return 1168 for the 'four sequences' fixture."""
    inFileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFileName)
    try:
        # dbCumLength() takes an open file handle, not a file name.
        with open(inFileName, "r") as inFileHandler:
            obsCumulLength = FastaUtils.dbCumLength(inFileHandler)
    finally:
        os.remove(inFileName)

    self.assertEqual(1168, obsCumulLength)
160
161
def test_dbLengths(self):
    """FastaUtils.dbLengths() must return [7, 12] for two sequences of 7 and 12 bases."""
    inFileName = "dummyFastaFile.fa"
    with open(inFileName, "w") as inF:
        inF.write(">seq1\nATGACGT\n")
        inF.write(">seq2\nATGGCGAGACGT\n")
    try:
        lObs = FastaUtils.dbLengths(inFileName)
    finally:
        os.remove(inFileName)

    self.assertEqual([7, 12], lObs)
172
173
def test_dbHeaders_with_empty_file(self):
    """FastaUtils.dbHeaders() must return an empty list for the 'empty file' fixture."""
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFile)
    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        os.remove(inFile)

    self.assertEqual([], lObs)
181
182
def test_dbHeaders_with_one_sequence_without_header(self):
    """FastaUtils.dbHeaders() must return an empty list for the 'sequence without header' fixture."""
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_sequence_without_header(inFile)
    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        os.remove(inFile)

    self.assertEqual([], lObs)
190
191
def test_dbHeaders_four_sequences(self):
    """FastaUtils.dbHeaders() must return 'seq 1' .. 'seq 4' for the 'four sequences' fixture."""
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)
    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        os.remove(inFile)

    self.assertEqual(["seq 1", "seq 2", "seq 3", "seq 4"], lObs)
199
200
def test_dbSplit_no_in_file(self):
    """FastaUtils.dbSplit() must raise SystemExit for an input file this test never creates."""
    missingFile = "dummyFastaFile.fa"
    try:
        FastaUtils.dbSplit(missingFile, 1, False)
    except SystemExit:
        exitWasRaised = True
    else:
        exitWasRaised = False
    self.assertTrue(exitWasRaised)
209
210
def test_dbSplit_emptyFile(self):
    """FastaUtils.dbSplit() must not create 'batch_1.fa' for the 'empty file' fixture."""
    inFile = "dummyFastaFile.fa"
    obsBatchFile = "batch_1.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFile)
    try:
        FastaUtils.dbSplit(inFile, 10, False, 1)
        self.assertFalse(os.path.exists(obsBatchFile))
    finally:
        # Also drop an unexpected batch file so a failure does not leak it.
        for name in (inFile, obsBatchFile):
            if os.path.exists(name):
                os.remove(name)
217
218
def test_dbSplit_oneSequence_tenSequencesPerBatch(self):
    """Splitting the 'one sequence' fixture by batches of 10 must yield 'batch_1.fa' identical to the input fixture."""
    inFile = "dummyFastaFile.fa"
    expBatchFile = "dummyExpBatch_1.fa"
    obsBatchFile = "batch_1.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(inFile)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expBatchFile)

        FastaUtils.dbSplit(inFile, 10, False)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatchFile, obsBatchFile))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFile, expBatchFile, obsBatchFile):
            if os.path.exists(name):
                os.remove(name)
234
235
def test_dbSplit_fourSequences_threeSequencesPerBatch(self):
    """Splitting the 'four sequences' fixture by batches of 3 must yield 'batch_1.fa' and 'batch_2.fa' matching the fixtures."""
    inFile = "dummyFastaFile.fa"
    expBatch1File = "dummyExpBatch_1.fa"
    expBatch2File = "dummyExpBatch_2.fa"
    obsBatch1File = "batch_1.fa"
    obsBatch2File = "batch_2.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)
        Utils_for_T_FastaUtils._createBatch1_three_sequences(expBatch1File)
        Utils_for_T_FastaUtils._createBatch2_one_sequence(expBatch2File)

        FastaUtils.dbSplit(inFile, 3, False)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatch1File, obsBatch1File))
        self.assertTrue(FileUtils.are2FilesIdentical(expBatch2File, obsBatch2File))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFile, expBatch1File, expBatch2File, obsBatch1File, obsBatch2File):
            if os.path.exists(name):
                os.remove(name)
255
256
def test_dbSplit_fourSequences_twoSequencesPerBatch_inBatchDirectory(self):
    """Splitting the 'four sequences' fixture by batches of 2 must yield 'batches/batch_1.fa' and 'batches/batch_2.fa'."""
    inFile = "dummyFastaFile.fa"
    expBatch1File = "dummyExp_batch_1.fa"
    expBatch2File = "dummyExp_batch_2.fa"
    obsBatch1File = "batches/batch_1.fa"
    obsBatch2File = "batches/batch_2.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)
        Utils_for_T_FastaUtils._createBatch1_two_sequences(expBatch1File)
        Utils_for_T_FastaUtils._createBatch2_two_sequences(expBatch2File)

        FastaUtils.dbSplit(inFile, 2, True, 1)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatch1File, obsBatch1File))
        self.assertTrue(FileUtils.are2FilesIdentical(expBatch2File, obsBatch2File))
    finally:
        for name in (inFile, expBatch1File, expBatch2File):
            if os.path.exists(name):
                os.remove(name)
        # Remove the output directory as well (it used to be left behind),
        # as the other "batches" tests of this class do.
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
276
277
def test_dbSplit_tenSequences_oneSequencePerBatch_inBatchDirectory(self):
    """Splitting the 'ten sequences' fixture one sequence per batch must yield 'batches/batch_01.fa' .. 'batches/batch_10.fa'.

    Batch number N is compared with a fixture whose header is 'seq N'.
    """
    inFile = "dummyFastaFile.fa"
    lSuffixes = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10']
    lExpBatchFiles = ["exp_batch_%s.fa" % (s) for s in lSuffixes]

    try:
        Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inFile)

        FastaUtils.dbSplit(inFile, 1, True)

        for nb, s in enumerate(lSuffixes, 1):
            expBatchFile = "exp_batch_%s.fa" % (s)
            Utils_for_T_FastaUtils._createBatch_one_small_sequence(expBatchFile, "seq " + str(nb))
            obsBatchFile = "batches/batch_%s.fa" % (s)
            self.assertTrue(FileUtils.are2FilesIdentical(expBatchFile, obsBatchFile))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in [inFile] + lExpBatchFiles:
            if os.path.exists(name):
                os.remove(name)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
296
297
def test_dbSplit_twoSequences_oneSequencePerBatch_useSeqHeader(self):
    """With True as 4th argument, dbSplit() must write 'seq_1.fa' and 'seq_2.fa' for the two-sequence fixture."""
    inFile = "dummyFastaFile.fa"
    lExpFileNames = ["seq_1.fa", "seq_2.fa"]
    lExpFiles = ["dummyExp_seq_1.fa", "dummyExp_seq_2.fa"]

    try:
        Utils_for_T_FastaUtils.createFastaFile_twoSequences(inFile)
        Utils_for_T_FastaUtils.createFastaFile_seq_1(lExpFiles[0])
        Utils_for_T_FastaUtils.createFastaFile_seq_2(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 1, False, True)

        lObsFiles = sorted(glob.glob("seq*.fa"))
        # Compare the names in one go: too few output files now give an
        # assertion failure instead of an IndexError.
        self.assertEqual(lExpFileNames, lObsFiles[:len(lExpFileNames)])
        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in [inFile] + lExpFiles + glob.glob("seq*.fa"):
            if os.path.exists(name):
                os.remove(name)
317
318
def test_dbSplit_twoSequences_otherPrefix(self):
    """With "query" as 5th argument, dbSplit() must write 'query_1.fa' and 'query_2.fa' for the two-sequence fixture."""
    inFile = "dummyFastaFile.fa"
    lExpFileNames = ["query_1.fa", "query_2.fa"]
    lExpFiles = ["dummyExp_seq_1.fa", "dummyExp_seq_2.fa"]

    try:
        Utils_for_T_FastaUtils.createFastaFile_twoSequences(inFile)
        Utils_for_T_FastaUtils.createFastaFile_seq_1(lExpFiles[0])
        Utils_for_T_FastaUtils.createFastaFile_seq_2(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 1, False, False, "query")

        lObsFiles = sorted(glob.glob("query_*.fa"))
        # Compare the names in one go: too few output files now give an
        # assertion failure instead of an IndexError.
        self.assertEqual(lExpFileNames, lObsFiles[:len(lExpFileNames)])
        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in [inFile] + lExpFiles + glob.glob("query_*.fa"):
            if os.path.exists(name):
                os.remove(name)
338
339
def test_splitFastaFileInBatches(self):
    """Check the three batches written by splitFastaFileInBatches(inFileName, 60).

    Five sequences (seq1, seq2, seq3, seq6, seq5) are written to the
    input; the expected batches are [seq6], [seq1] and [seq5, seq2, seq3],
    with the seq6 sequence wrapped onto two lines.
    """
    inFileName = "dummyFastaFile.fa"
    expBatch1 = "expBatch_1.fa"
    expBatch2 = "expBatch_2.fa"
    expBatch3 = "expBatch_3.fa"
    obsBatch1 = "batches/batch_1.fa"
    obsBatch2 = "batches/batch_2.fa"
    obsBatch3 = "batches/batch_3.fa"

    try:
        with open(inFileName, "w") as f:
            f.write(">seq1\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
            f.write(">seq2\n")
            f.write("ATCGCTAGCTAGCTCG\n")
            f.write(">seq3\n")
            f.write("GTTTGGATCGCT\n")
            f.write(">seq6\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
            f.write(">seq5\n")
            f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n")
        with open(expBatch1, "w") as f:
            f.write(">seq6\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCT\n")
            f.write("GTTTGGATCGCTCTCTGCTCGGAAATCC\n")
        with open(expBatch2, "w") as f:
            f.write(">seq1\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
        with open(expBatch3, "w") as f:
            f.write(">seq5\n")
            f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n")
            f.write(">seq2\n")
            f.write("ATCGCTAGCTAGCTCG\n")
            f.write(">seq3\n")
            f.write("GTTTGGATCGCT\n")

        FastaUtils.splitFastaFileInBatches(inFileName, 60)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatch1, obsBatch1))
        self.assertTrue(FileUtils.are2FilesIdentical(expBatch2, obsBatch2))
        self.assertTrue(FileUtils.are2FilesIdentical(expBatch3, obsBatch3))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName, expBatch1, expBatch2, expBatch3):
            if os.path.exists(name):
                os.remove(name)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
386
387
def test_splitFastaFileInBatches_one_seq(self):
    """For a single 16-base sequence, splitFastaFileInBatches(inFileName, 60) must write one batch equal to the input."""
    inFileName = "dummyFastaFile.fa"
    expBatch1 = "expBatch_1.fa"
    obsBatch1 = "batches/batch_1.fa"

    try:
        # Input and expected batch have the same content.
        for name in (inFileName, expBatch1):
            with open(name, "w") as f:
                f.write(">seq2\n")
                f.write("ATCGCTAGCTAGCTCG\n")

        FastaUtils.splitFastaFileInBatches(inFileName, 60)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatch1, obsBatch1))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName, expBatch1):
            if os.path.exists(name):
                os.remove(name)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
407
408
def test_splitSeqPerCluster_no_in_file(self):
    """FastaUtils.splitSeqPerCluster() must raise SystemExit for an input file this test never creates."""
    missingFile = "dummyFastaFile.fa"
    try:
        FastaUtils.splitSeqPerCluster(missingFile, "Piler", False, False, "seqCluster")
    except SystemExit:
        exitWasRaised = True
    else:
        exitWasRaised = False
    self.assertTrue(exitWasRaised)
417
418
def test_splitSeqPerCluster_in_file_empty(self):
    """For an empty input file, splitSeqPerCluster() must not create any 'seqCluster*.fa' file."""
    inFileName = "dummyFastaFile.fa"
    # Create a zero-byte input file.
    with open(inFileName, 'w'):
        pass

    try:
        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")
        self.assertEqual(glob.glob("seqCluster*.fa"), [])
    finally:
        os.remove(inFileName)
429
430
def test_splitSeqPerCluster_four_sequences_without_dir(self):
    """Check the three 'seqCluster*.fa' files written by splitSeqPerCluster() in the current directory."""
    inFileName = "dummyFastaFile.fa"
    expFirstClusterFileName = "exp_seqCluster1.fa"
    expSecondClusterFileName = "exp_seqCluster2.fa"
    expThirdClusterFileName = "exp_seqCluster3.574.fa"
    obsFirstClusterFileName = "seqCluster1.fa"
    obsSecondClusterFileName = "seqCluster2.fa"
    obsThirdClusterFileName = "seqCluster3.574.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(expFirstClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(expSecondClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(expThirdClusterFileName)

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName,
                     expFirstClusterFileName, expSecondClusterFileName, expThirdClusterFileName,
                     obsFirstClusterFileName, obsSecondClusterFileName, obsThirdClusterFileName):
            if os.path.exists(name):
                os.remove(name)
459
460
def test_splitSeqPerCluster_four_sequences_without_dir_no_split(self):
    """With the 'same cluster' fixture, splitSeqPerCluster() must write 'seqCluster1.fa' identical to its input."""
    inFileName = "dummyFastaFile.fa"
    expClusterFileName = "exp_seqCluster.fa"
    obsClusterFileName = "seqCluster1.fa"

    try:
        # The expected output is built by the same helper as the input.
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(expClusterFileName)

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName, expClusterFileName, obsClusterFileName):
            if os.path.exists(name):
                os.remove(name)
477
478
def test_splitSeqPerCluster_four_sequences_without_dir_shuffle(self):
    """Same expectations as the non-shuffled test, with the input built by the '..._shuffle' fixture helper."""
    inFileName = "dummyFastaFile.fa"
    expFirstClusterFileName = "exp_seqCluster1.fa"
    expSecondClusterFileName = "exp_seqCluster2.fa"
    expThirdClusterFileName = "exp_seqCluster3.574.fa"
    obsFirstClusterFileName = "seqCluster1.fa"
    obsSecondClusterFileName = "seqCluster2.fa"
    obsThirdClusterFileName = "seqCluster3.574.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_shuffle(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(expFirstClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(expSecondClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(expThirdClusterFileName)

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName,
                     expFirstClusterFileName, expSecondClusterFileName, expThirdClusterFileName,
                     obsFirstClusterFileName, obsSecondClusterFileName, obsThirdClusterFileName):
            if os.path.exists(name):
                os.remove(name)
507
508
def test_splitSeqPerCluster_four_sequences_simplify_header(self):
    """With True as 3rd argument, splitSeqPerCluster() output must match the '..._with_simplify_header' fixtures."""
    inFileName = "dummyFastaFile.fa"
    expFirstClusterFileName = "exp_seqCluster1.fa"
    expSecondClusterFileName = "exp_seqCluster2.fa"
    expThirdClusterFileName = "exp_seqCluster3.574.fa"
    obsFirstClusterFileName = "seqCluster1.fa"
    obsSecondClusterFileName = "seqCluster2.fa"
    obsThirdClusterFileName = "seqCluster3.574.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result_with_simplify_header(expFirstClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result_with_simplify_header(expSecondClusterFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result_with_simplify_header(expThirdClusterFileName)

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", True, False, "seqCluster")

        self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (inFileName,
                     expFirstClusterFileName, expSecondClusterFileName, expThirdClusterFileName,
                     obsFirstClusterFileName, obsSecondClusterFileName, obsThirdClusterFileName):
            if os.path.exists(name):
                os.remove(name)
537
538
def test_splitSeqPerCluster_four_sequences_with_dir(self):
    """With True as 4th argument, each cluster file is expected in '<input>_cluster_<id>/seqCluster<id>.fa'."""
    inFileName = "dummyFastaFile.fa"
    # Cluster identifier -> helper writing the expected content for it.
    lClusters = [
        ('1', Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result),
        ('2', Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result),
        ('3.574', Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result),
    ]

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, True, "seqCluster")

        for i, createExpFile in lClusters:
            expClusterFileName = "exp_cluster" + i + ".fa"
            createExpFile(expClusterFileName)
            obsClusterFileName = inFileName + "_cluster_" + i + "/seqCluster" + i + ".fa"
            self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        if os.path.exists(inFileName):
            os.remove(inFileName)
        for i, _ in lClusters:
            expClusterFileName = "exp_cluster" + i + ".fa"
            if os.path.exists(expClusterFileName):
                os.remove(expClusterFileName)
            clusterDir = inFileName + "_cluster_" + i
            if os.path.isdir(clusterDir):
                shutil.rmtree(clusterDir)
559
560
def test_dbLengthFilter_with_one_sequence(self):
    """dbLengthFilter(12, ...) on the 'one sequence' fixture: '.Inf12' must be empty and '.Sup12' hold the sequence."""
    fileName = "dummyFastaFile.fa"
    expFileNameInf = "exp_dummyFastaFile.fa.Inf12"
    expFileNameSup = "exp_dummyFastaFile.fa.Sup12"
    obsFileNameInf = "dummyFastaFile.fa.Inf12"
    obsFileNameSup = "dummyFastaFile.fa.Sup12"

    try:
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName)
        Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expFileNameInf)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameSup)

        FastaUtils.dbLengthFilter(12, fileName, verbose=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf))
        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileNameInf, expFileNameSup, obsFileNameInf, obsFileNameSup):
            if os.path.exists(name):
                os.remove(name)
583
def test_dbLengthFilter_with_four_sequence(self):
    """dbLengthFilter(130, ...) on the 'four sequences' fixture: compare '.Inf130' and '.Sup130' with fixtures."""
    fileName = "dummyFastaFile.fa"
    expFileNameInf = "exp_dummyFastaFile.fa.Inf130"
    expFileNameSup = "exp_dummyFastaFile.fa.Sup130"
    obsFileNameInf = "dummyFastaFile.fa.Inf130"
    obsFileNameSup = "dummyFastaFile.fa.Sup130"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameInf)
        Utils_for_T_FastaUtils._createResult_of_dbLengthFilter_sup(expFileNameSup)

        FastaUtils.dbLengthFilter(130, fileName, verbose=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf))
        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileNameInf, expFileNameSup, obsFileNameInf, obsFileNameSup):
            if os.path.exists(name):
                os.remove(name)
606
def test_dbLongestSequences_with_empty_file(self):
    """FastaUtils.dbLongestSequences(1, ...) must return 0 for the 'empty file' fixture."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)
    try:
        obsResult = FastaUtils.dbLongestSequences(1, fileName)
    finally:
        os.remove(fileName)

    self.assertEqual(0, obsResult)
618
def test_dbLongestSequences_with_one_longest_sequence(self):
    """dbLongestSequences(1, ...) on the 'four sequences' fixture must write 'seq 3' to '<input>.best1'."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa.best1"
    obsFileName = "dummyFastaFile.fa.best1"
    fullLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)
        with open(expFileName, 'w') as f:
            f.write(">seq 3\n")
            # Seven full 60-base lines followed by a 7-base remainder.
            f.write(fullLine * 7)
            f.write("ATATTCG\n")

        FastaUtils.dbLongestSequences(1, fileName, outFileName="", verbose=0, minThresh=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
645
def test_dbLongestSequences_with_two_longest_sequence(self):
    """dbLongestSequences(2, ...) on the 'three sequences' fixture must write 'seq 2' and 'seq 4' to '<input>.best2'."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa.best1"
    obsFileName = "dummyFastaFile.fa.best2"
    fullLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        with open(expFileName, 'w') as f:
            # Both records: five full 60-base lines plus a 7-base remainder.
            for header in (">seq 2\n", ">seq 4\n"):
                f.write(header)
                f.write(fullLine * 5)
                f.write("ATATTCG\n")

        FastaUtils.dbLongestSequences(2, fileName, outFileName="", verbose=0, minThresh=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
675
def test_dbExtractSeqHeaders(self):
    """dbExtractSeqHeaders() on the 'three sequences' fixture must write 'seq 1', 'seq 2', 'seq 4' to '<input>.headers'."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        with open(expFileName, 'w') as f:
            f.write("seq 1\n")
            f.write("seq 2\n")
            f.write("seq 4\n")

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
694
def test_dbExtractSeqHeaders_with_empty_file(self):
    """dbExtractSeqHeaders() on the 'empty file' fixture must write an empty '<input>.headers' file."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)
        # The expected output is a zero-byte file.
        with open(expFileName, 'w'):
            pass

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
711
def test_dbExtractSeqHeaders_without_header(self):
    """dbExtractSeqHeaders() on the 'sequence without header' fixture must write an empty '<input>.headers' file."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_sequence_without_header(fileName)
        # The expected output is a zero-byte file.
        with open(expFileName, 'w'):
            pass

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
728
def test_dbExtractByPattern_without_pattern(self):
    """FastaUtils.dbExtractByPattern() must return None when given an empty pattern."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
    try:
        obsResult = FastaUtils.dbExtractByPattern("", fileName)
    finally:
        os.remove(fileName)

    self.assertEqual(None, obsResult)
740
def test_dbExtractByPattern(self):
    """With pattern 'seq', '<input>.extracted' must be identical to the 'three sequences' fixture."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.extracted"

    try:
        # The expected output is built by the same helper as the input.
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(expFileName)

        FastaUtils.dbExtractByPattern('seq', fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
756
def test_dbExtractByPattern_with_2_as_pattern(self):
    """With pattern ' 2', '<input>.extracted' must contain only the 'seq 2' record."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.extracted"
    fullLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        with open(expFileName, 'w') as f:
            f.write(">seq 2\n")
            # Five full 60-base lines followed by a 7-base remainder.
            f.write(fullLine * 5)
            f.write("ATATTCG\n")

        FastaUtils.dbExtractByPattern(' 2', fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
780
def test_dbExtractByPattern_with_sandie_as_pattern(self):
    """With pattern 'sandie', '<input>.extracted' must be identical to the 'empty file' fixture."""
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.extracted"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expFileName)

        FastaUtils.dbExtractByPattern('sandie', fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
796
def test_dbExtractByFilePattern_empty_pattern_filename(self):
    """FastaUtils.dbExtractByFilePattern() must raise SystemExit when the pattern file name is empty."""
    emptyPatternFileName = ""
    try:
        FastaUtils.dbExtractByFilePattern(emptyPatternFileName, None, "")
    except SystemExit:
        exitWasRaised = True
    else:
        exitWasRaised = False
    self.assertTrue(exitWasRaised)
805
def test_dbExtractByFilePattern(self):
    """With the fixture pattern file, '<input>.extracted' must hold records 'seq 1', 'seq 3', 'seq 8' and 'seq 10'."""
    fileName = "dummyFastaFile.fa"
    patternFileName = "dummyPatternFile.txt"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.extracted"
    seqLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_ten_sequences(fileName)
        Utils_for_T_FastaUtils._createPatternFile(patternFileName)
        with open(expFileName, 'w') as f:
            for nb in (1, 3, 8, 10):
                f.write(">seq %d\n" % nb)
                f.write(seqLine)

        FastaUtils.dbExtractByFilePattern(patternFileName, fileName, "")

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Always clean up so that a failure here cannot pollute other tests.
        for name in (fileName, patternFileName, expFileName, obsFileName):
            if os.path.exists(name):
                os.remove(name)
834
def test_dbCleanByPattern_without_pattern(self):
    """FastaUtils.dbCleanByPattern() must return None when given an empty pattern."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
    try:
        obsResult = FastaUtils.dbCleanByPattern("", fileName)
    finally:
        os.remove(fileName)

    self.assertEqual(None, obsResult)
846
def test_dbCleanByPattern(self):
    # Input: the ten-sequence fixture.
    inFasta = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inFasta)

    # Expected output: every sequence except "seq 2".
    expected = "exp_dummyFastaFile.fa"
    residues = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"
    keptNumbers = (1, 3, 4, 5, 6, 7, 8, 9, 10)
    with open(expected, 'w') as expHandle:
        for seqNumber in keptNumbers:
            expHandle.write(">seq %d\n" % seqNumber)
            expHandle.write(residues)

    FastaUtils.dbCleanByPattern('2', inFasta)

    observed = "dummyFastaFile.fa.cleaned"
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, expected, observed):
        os.remove(path)
881
def test_dbCleanByPattern_with_expectedFile_empty(self):
    # Input: the ten-sequence fixture.
    inFasta = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inFasta)

    # Expected output: an empty file (the pattern 'seq' is used for cleaning).
    expected = "exp_dummyFastaFile.fa"
    with open(expected, 'w') as expHandle:
        expHandle.write("")

    FastaUtils.dbCleanByPattern('seq', inFasta)

    observed = "dummyFastaFile.fa.cleaned"
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, expected, observed):
        os.remove(path)
899
def test_dbCleanByFilePattern_empty_pattern_filename(self):
    # Calling dbCleanByFilePattern with an empty pattern file name (and no
    # input fasta) is expected to terminate through SystemExit.
    emptyPatternFile = ""
    self.assertRaises(SystemExit,
                      FastaUtils.dbCleanByFilePattern,
                      emptyPatternFile, None, "")
908
def test_dbCleanByFilePattern(self):
    # Input: the ten-sequence fixture plus the pattern file fixture.
    inFasta = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inFasta)
    patterns = "dummyPatternFile.txt"
    Utils_for_T_FastaUtils._createPatternFile(patterns)

    # Expected output: "seq 2", "seq 4" to "seq 7" and "seq 9" remain.
    expected = "exp_dummyFastaFile.fa"
    residues = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"
    with open(expected, 'w') as expHandle:
        for seqNumber in (2, 4, 5, 6, 7, 9):
            expHandle.write(">seq %d\n" % seqNumber)
            expHandle.write(residues)

    FastaUtils.dbCleanByFilePattern(patterns, inFasta, "")

    observed = "dummyFastaFile.fa.cleaned"
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, patterns, expected, observed):
        os.remove(path)
941
def test_dbORF_without_ORF(self):
    # Input: a single sequence; the expected ORF map for it is empty.
    inFasta = "dummy.fa"
    with open(inFasta, "w") as fastaHandle:
        fastaHandle.writelines([
            ">dummy\n",
            "GGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTT\n",
        ])

    expected = "exp.ORF.map"
    with open(expected, "w") as expHandle:
        expHandle.write("")

    FastaUtils.dbORF(inFasta, 0, 0)

    # dbORF output is looked up next to the input, with ".ORF.map" appended.
    observed = "%s.ORF.map" % inFasta
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, observed, expected):
        os.remove(path)
960
def test_dbORF_with_one_ORF(self):
    # Input: one short sequence for which exactly one ORF line is expected.
    inFasta = "dummyFastaFile.fa"
    with open(inFasta, 'w') as fastaHandle:
        fastaHandle.write(">seq1\n")
        fastaHandle.write("GAAAATATGGGGTAGATAAGGGATCTGGGTTAATTTTTT\n")

    expected = "exp_dummyORFFile.ORF.map"
    with open(expected, 'w') as expHandle:
        expHandle.write("ORF|1|17\tseq1\t16\t33\n")

    FastaUtils.dbORF(inFasta, 0, 0)

    observed = inFasta + ".ORF.map"
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, observed, expected):
        os.remove(path)
981
def test_dbORF_with_real_ORF(self):
    # Input: three sequences written verbatim, one list entry per file line.
    inFasta = "dummy.fa"
    fastaLines = [
        ">DmelChr4_Blaster_Recon_13_Map_4\n",
        "AAGTTGGACATTGAGGGCTTTCTTCGCCGTGTTTCGTTCTTTTCGACAAACAGCAGTGCT\n",
        "TTGCGGATCATTTTGTTTGAACAACCGACAATGCGACCAATTTCAGCGTAGGTTTTACCT\n",
        "TCAGAGATCACGTTTTTAATCAAATTTCTTTTTTCGACGGTACAATGCTTTCCGCGACCC\n",
        "ATGACTAGAGAATTTTTGGTCTTCGTTTGGAAAAAATTCAATTAAAACCTTTAATACAAC\n",
        "TCCTTTTTTCAAAATTTTTCGAAAAAAACCCAAAGCAATCACTCCTATTAATTTTATTCA\n",
        "GCAAATACGTGTTCAGTGCTATTTTTGTTACCGCCTCATTTCGCGCACTTTTGCAGCAAG\n",
        "TGCCCAAAAACAAAAAGAACCGTTACATTGAGAGACTAAAAATTTCTTGCTCAGAGAGCC\n",
        "AACATATGGTACTTATTATTCATGCAATCTGACTTAAAAAAATATAAACATTTAATAATT\n",
        "TTTTTTAGGAAATCAACTTTCCACCTGCAGTAGTGCTATTATTTTAACCGCAGCTGTATA\n",
        ">DmelChr4_Blaster_Piler_3.5_Map_7\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
        "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGT\n",
        "TAGGGCTAGGGTTAGGGGTTAGGGTTAGGGTTAGGCTTAGGGTTAGGGTTAGGGTTAGGG\n",
        "TTAGGGTTAGGGTTAGGGTTAGGAGTTAGGGTGTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
        "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
        "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGCTAGGGTTAGGGTTAG\n",
        "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
        ">DmelChr4_Blaster_Grouper_10_Map_13\n",
        "GCAAAGACACTAGAATAACAAGATGCGTAACGGCCATACATTGGTTTGGCACTATGCAGC\n",
        "CACTTTTTTGGTGACGGCCAAAATTACTCTCTTTCCGCTCACTCCCGCTGAGAGCGTAAG\n",
        "AAATCTAAAAATATAATTTGCTTGCTTGTGTGAGTAAAAACAAGAGACGAGAACGCGTAT\n",
        "AAGTGTGCGTGTTGTGCTAGAAGACGATTTTCGGGACCGAAATCAATTCTGATCGAAGAA\n",
        "ACGAATTTACATGGTACATATTAGGGTAGTTTTTGCCAATTTCCTAGCAATATGATAAAA\n",
        "TAAAAAAATTTTTAAAAATTCGCGCCCTGACTATTATAATTTTAAAGCTTTTTAAAATTT\n",
        "GTTTGTTAAAATCGCCGCTCGAATTAGCTACCGTTTACACATTTATATTTATGTTTAATT\n",
        "CTAATTTGTCTCTCATCTGACAATTTTTTAAGAAAGCGAAATATTTTTTTTTTGAAACAC\n",
        "TTTTAATGTTAATGTTACATCATATTAAGTCAAATGATTTAATAAATATACTAAATAATT\n",
        "AAATATGATAACTGTTTATTGCAAAAGTAATATCAAAGACACTAGAATTATTCTAGTGTC\n",
        "TTTGCTTTGTTCATATCTTGAGGCACGAAGTGCGGACACAAGCACTCAACAATCATTGCC\n",
        "TTATTAATTTTTCACACGCCGCAAGATGAATACTCTAATGACAAATATTCTTATATAAAG\n",
        "TCATTTTTGAAATTTATTTTTGTGATAATATGTACATAGATTTGGCTATTTCTAATCTAT\n",
        "TTTCAAATAATAATAACGTTAAGGCAATGCAAAACAAGAATTTTTTTAGTCGCATGGTGC\n",
        "CAATTGATCAAAAATAATATAGATTTAAAGTCTAAGAACTTCTAAGGTGAAGGGCATATT\n",
        "TTGTCAAATTTACAATGCATGAGCGAGCATACGTGTGCACACATACAGTTGTCTGCTATC\n",
        "ACTTTGTGCGTTGAAAA\n",
    ]
    with open(inFasta, "w") as fastaHandle:
        fastaHandle.writelines(fastaLines)

    # Expected map lines, grouped per sequence in output order. Each tuple
    # holds the two numbers of the "ORF|a|b" name, then the two coordinates.
    # NOTE(review): a/b look like reading frame and ORF size - confirm
    # against FastaUtils.dbORF.
    expectedOrfs = [
        ("DmelChr4_Blaster_Recon_13_Map_4", [
            (3, 263, 189, 452),
            (2, 206, 185, 391),
            (-3, 164, 382, 218),
            (-1, 161, 297, 136),
            (1, 113, 400, 513),
            (1, 113, 112, 225),
            (3, 107, 81, 188),
            (1, 107, 292, 399),
            (-1, 104, 432, 328),
            (-2, 104, 515, 411),
        ]),
        ("DmelChr4_Blaster_Piler_3.5_Map_7", [
            (3, 116, 393, 509),
            (-3, 116, 505, 389),
            (-2, 86, 518, 432),
            (1, 80, 436, 516),
        ]),
        ("DmelChr4_Blaster_Grouper_10_Map_13", [
            (-3, 170, 222, 52),
            (-1, 161, 260, 99),
            (3, 155, 702, 857),
            (3, 152, 288, 440),
            (1, 137, 622, 759),
            (2, 128, 539, 667),
            (1, 125, 760, 885),
            (2, 122, 14, 136),
            (-2, 113, 847, 734),
            (1, 110, 154, 264),
        ]),
    ]
    expected = "exp.ORF.map"
    with open(expected, "w") as expHandle:
        for seqName, orfs in expectedOrfs:
            for first, second, start, end in orfs:
                expHandle.write("ORF|%d|%d\t%s\t%d\t%d\n"
                                % (first, second, seqName, start, end))

    FastaUtils.dbORF(inFasta, 10, 30)

    observed = "%s.ORF.map" % inFasta
    self.assertTrue(FileUtils.are2FilesIdentical(expected, observed))

    for path in (inFasta, observed, expected):
        os.remove(path)
1062
def test_sortSequencesByIncreasingLength(self):
    # Three sequences of 60, 120 and 32 residues are written out of length
    # order; the expected file lists them as 32, 60, 120.
    line60 = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"
    line32 = "ATATTCGCGCATCGATCGATCGGCGGCTATAT\n"
    records = {
        "seq1_length_60": line60,
        "seq2_length_120": line60 + line60,
        "seq3_length_32": line32,
    }

    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "obs_dummyFastaFile.fa"

    try:
        with open(fileName, 'w') as inHandle:
            for header in ("seq1_length_60", "seq2_length_120", "seq3_length_32"):
                inHandle.write(">%s\n%s" % (header, records[header]))

        with open(expFileName, 'w') as expHandle:
            for header in ("seq3_length_32", "seq1_length_60", "seq2_length_120"):
                expHandle.write(">%s\n%s" % (header, records[header]))

        FastaUtils.sortSequencesByIncreasingLength(fileName, obsFileName, 0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # The original test never removed the input fasta, leaving
        # "dummyFastaFile.fa" behind for every later test that reuses the
        # name. Remove all three files, and do so even on failure.
        for path in (fileName, expFileName, obsFileName):
            if os.path.exists(path):
                os.remove(path)
1095
def test_sortSequencesByIncreasingLength_in_file_do_not_exists(self):
    # "dummyFile.fa" is not created by this test; the call is expected to
    # terminate through SystemExit.
    missingFasta = "dummyFile.fa"
    self.assertRaises(SystemExit,
                      FastaUtils.sortSequencesByIncreasingLength,
                      missingFasta, "", 0)
1105
def test_sortSequencesByHeader(self):
    # Headers are written as seq1::test-test, seq3, seq2; the expected file
    # lists them as seq1::test-test, seq2, seq3.
    line60 = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"
    line32 = "ATATTCGCGCATCGATCGATCGGCGGCTATAT\n"
    records = {
        "seq1::test-test": line60,
        "seq2": line60 + line60,
        "seq3": line32,
    }

    fileName = "dummyFastaFile.fa"
    expFileName = "expFastaFile.fa"
    obsFileName = "obsFastaFile.fa"

    try:
        with open(fileName, "w") as inHandle:
            for header in ("seq1::test-test", "seq3", "seq2"):
                inHandle.write(">%s\n%s" % (header, records[header]))

        with open(expFileName, "w") as expHandle:
            for header in ("seq1::test-test", "seq2", "seq3"):
                expHandle.write(">%s\n%s" % (header, records[header]))

        FastaUtils.sortSequencesByHeader(fileName, obsFileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # The original test never removed the input fasta, leaving
        # "dummyFastaFile.fa" behind for later tests. Remove all three
        # files, and do so even on failure.
        for path in (fileName, expFileName, obsFileName):
            if os.path.exists(path):
                os.remove(path)
1134
def test_sortSequencesByHeader_no_outFileName(self):
    # Headers are written as seq12, seq1, seq2; the expected file lists them
    # as seq1, seq12, seq2. No output name is passed: the result is looked
    # up in "dummyFastaFile_sortByHeaders.fa".
    line60 = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"
    line32 = "ATATTCGCGCATCGATCGATCGGCGGCTATAT\n"
    records = {
        "seq12": line60,
        "seq1": line32,
        "seq2": line60 + line60,
    }

    fileName = "dummyFastaFile.fa"
    expFileName = "expFastaFile.fa"
    obsFileName = "dummyFastaFile_sortByHeaders.fa"

    try:
        with open(fileName, "w") as inHandle:
            for header in ("seq12", "seq1", "seq2"):
                inHandle.write(">%s\n%s" % (header, records[header]))

        with open(expFileName, "w") as expHandle:
            for header in ("seq1", "seq12", "seq2"):
                expHandle.write(">%s\n%s" % (header, records[header]))

        FastaUtils.sortSequencesByHeader(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # The original test never removed the input fasta, leaving
        # "dummyFastaFile.fa" behind for later tests. Remove all three
        # files, and do so even on failure.
        for path in (fileName, expFileName, obsFileName):
            if os.path.exists(path):
                os.remove(path)
1163
def test_getLengthPerHeader( self ):
    # seq3 spans two lines of 15 residues, hence its expected length of 30.
    inFile = "dummyFile.fa"
    with open( inFile, "w" ) as inFileHandler:
        inFileHandler.write(">seq1\nAGCGATGCGT\n")
        inFileHandler.write(">seq2\nAGCGATG\n")
        inFileHandler.write(">seq3\nAGCGATGGTGCGTGC\n")
        inFileHandler.write("AGCGATGGTGCGTGC\n")

    dExp = { "seq1": 10, "seq2": 7, "seq3": 30 }

    try:
        dObs = FastaUtils.getLengthPerHeader( inFile, 0 )
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias.
        self.assertEqual( dExp, dObs )
    finally:
        # Remove the fixture even when the call or the assertion fails.
        if os.path.exists( inFile ):
            os.remove( inFile )
1180
def test_convertFastaHeadersFromChkToChr_grouper(self):
    # Headers carrying chunk names and "a..b" fragment coordinates are
    # expected to come out with the chromosome name and shifted coordinates,
    # using the chunk-to-chromosome map file.
    def dump(path, lines):
        with open(path, "w") as handle:
            handle.writelines(lines)

    inFile = "dummyFastaFile.fa"
    dump(inFile, [
        ">MbQ1Gr1Cl0 chunk6 {Fragment} 95523..96053\n",
        "AGCGTGCA\n",
        ">MbQ77Gr8Cl0 chunk7 {Fragment} 123657..122568,121935..121446\n",
        "AGCATGC\n",
        ">MbS78Gr8Cl0 chunk7 {Fragment} 140078..139519,139470..138985,138651..138183\n",
        "CGTGCG\n",
        ">MbQ79Gr8Cl0 chunk7 {Fragment} 48021..48587,48669..49153,57346..57834\n",
        "AGCGTGC\n",
    ])

    mapFile = "dummyMapFile.map"
    dump(mapFile, [
        "chunk5\tdmel_chr4\t760001\t960000\n",
        "chunk6\tdmel_chr4\t950001\t1150000\n",
        "chunk7\tdmel_chr4\t1140001\t1281640\n",
    ])

    expFile = "expFile.fa"
    dump(expFile, [
        ">MbQ1Gr1Cl0 dmel_chr4 {Fragment} 1045523..1046053\n",
        "AGCGTGCA\n",
        ">MbQ77Gr8Cl0 dmel_chr4 {Fragment} 1263657..1262568,1261935..1261446\n",
        "AGCATGC\n",
        ">MbS78Gr8Cl0 dmel_chr4 {Fragment} 1280078..1279519,1279470..1278985,1278651..1278183\n",
        "CGTGCG\n",
        ">MbQ79Gr8Cl0 dmel_chr4 {Fragment} 1188021..1188587,1188669..1189153,1197346..1197834\n",
        "AGCGTGC\n",
    ])

    obsFile = "obsFile.fa"
    FastaUtils.convertFastaHeadersFromChkToChr(inFile, mapFile, obsFile)

    self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))

    for path in (inFile, mapFile, expFile, obsFile):
        os.remove(path)
1215
def test_convertFastaHeadersFromChkToChr_blastclust(self):
    # Blastclust-style headers ("name_chunkN (dbseq-nr k) [a,b]") are
    # expected to come out as "name chromosome (dbseq-nr k) a..b" with
    # coordinates shifted according to the map file.
    def dump(path, lines):
        with open(path, "w") as handle:
            handle.writelines(lines)

    inFile = "dummyFastaFile.fa"
    dump(inFile, [
        ">BlastclustCluster12Mb63_chunk1 (dbseq-nr 0) [1,10]\n",
        "AGCGTGCA\n",
        ">BlastclustCluster12Mb53_chunk2 (dbseq-nr 2) [1,10]\n",
        "AGCATGC\n",
        ">BlastclustCluster12Mb26_chunk2 (dbseq-nr 2) [12,18]\n",
        "CGTGCG\n",
        ">BlastclustCluster12Mb35_chunk3 (dbseq-nr 0) [10,1]\n",
        "AGCGTGC\n",
    ])

    mapFile = "dummyMapFile.map"
    dump(mapFile, [
        "chunk1\tchromosome1\t1\t20\n",
        "chunk2\tchromosome1\t16\t35\n",
        "chunk3\tchromosome2\t1\t20\n",
    ])

    expFile = "expFile.fa"
    dump(expFile, [
        ">BlastclustCluster12Mb63 chromosome1 (dbseq-nr 0) 1..10\n",
        "AGCGTGCA\n",
        ">BlastclustCluster12Mb53 chromosome1 (dbseq-nr 2) 16..25\n",
        "AGCATGC\n",
        ">BlastclustCluster12Mb26 chromosome1 (dbseq-nr 2) 27..33\n",
        "CGTGCG\n",
        ">BlastclustCluster12Mb35 chromosome2 (dbseq-nr 0) 10..1\n",
        "AGCGTGC\n",
    ])

    obsFile = "obsFile.fa"
    FastaUtils.convertFastaHeadersFromChkToChr(inFile, mapFile, obsFile)

    self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))

    for path in (inFile, mapFile, expFile, obsFile):
        os.remove(path)
1250
def test_convertFastaToLength( self ):
    # Expected output: one "<first header word>\t<sequence length>" line per
    # input sequence, in input order.
    inFile = "dummyFastaFile.fa"
    with open(inFile, "w") as fastaHandle:
        fastaHandle.writelines([
            ">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n",
            "AGCGTGCA\n",
            ">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n",
            "AGCATGCAA\n",
            ">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n",
            "CGTGCGAAAA\n",
            ">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n",
            "AGCGTG\n",
        ])

    expFile = "expFile.length"
    with open(expFile, "w") as expHandle:
        expHandle.writelines([
            "ReconCluster12Mb63\t8\n",
            "ReconCluster12Mb53\t9\n",
            "ReconCluster12Mb26\t10\n",
            "ReconCluster12Mb35\t6\n",
        ])

    obsFile = "obsFile.length"
    FastaUtils.convertFastaToLength(inFile, obsFile)

    self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))

    for path in (inFile, expFile, obsFile):
        os.remove(path)
1280
def test_convertFastaToSeq( self ):
    # Expected output: one tab-separated line per input sequence holding the
    # first header word, the sequence, the full header and the length.
    inFile = "dummyFastaFile.fa"
    with open(inFile, "w") as fastaHandle:
        fastaHandle.writelines([
            ">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n",
            "AGCGTGCA\n",
            ">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n",
            "AGCATGCAA\n",
            ">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n",
            "CGTGCGAAAA\n",
            ">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n",
            "AGCGTG\n",
        ])

    expFile = "expFile.seq"
    with open(expFile, "w") as expHandle:
        expHandle.writelines([
            "ReconCluster12Mb63\tAGCGTGCA\tReconCluster12Mb63 chunk1 {Fragment} 1..10\t8\n",
            "ReconCluster12Mb53\tAGCATGCAA\tReconCluster12Mb53 chunk2 {Fragment} 1..10\t9\n",
            "ReconCluster12Mb26\tCGTGCGAAAA\tReconCluster12Mb26 chunk2 {Fragment} 12..18\t10\n",
            "ReconCluster12Mb35\tAGCGTG\tReconCluster12Mb35 chunk3 {Fragment} 10..1\t6\n",
        ])

    obsFile = "obsFile.seq"
    FastaUtils.convertFastaToSeq(inFile, obsFile)

    self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))

    for path in (inFile, expFile, obsFile):
        os.remove(path)
1310
def test_spliceFromCoords( self ):
    # The coordinate file lists regions per chromosome (some given with
    # start > end); the expected genome is what is left once those regions
    # are removed from each sequence.
    def dump(path, lines):
        with open(path, "w") as handle:
            handle.writelines(lines)

    coordFile = "dummyCoordFile"
    dump(coordFile, [
        "TE1\tchr1\t2\t5\n",
        "TE2\tchr1\t15\t11\n",
        "TE3\tchr2\t1\t3\n",
        "TE1\tchr2\t8\t10\n",
        "TE4\tchr3\t3\t1\n",
        "TE4\tchr3\t6\t4\n",
    ])

    genomeFile = "dummyGenomeFile"
    dump(genomeFile, [
        ">chr1\n",
        "AGGGGAAAAACCCCCAAAAA\n",
        ">chr2\n",
        "GGGAAAAGGG\n",
        ">chr3\n",
        "GGGGGGTTTT\n",
    ])

    expFile = "dummyExpFile"
    dump(expFile, [
        ">chr1\n",
        "AAAAAAAAAAA\n",
        ">chr2\n",
        "AAAA\n",
        ">chr3\n",
        "TTTT\n",
    ])

    obsFile = "dummyObsFile"
    FastaUtils.spliceFromCoords(genomeFile, coordFile, obsFile)

    self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))

    for path in (coordFile, genomeFile, expFile, obsFile):
        os.remove(path)
1350
def test_dbShuffle_inputFile( self ):
    # Shuffle a one-sequence fasta file into obsFile; FastaUtils.dbSize of
    # the result is expected to be 1.
    inFile = "dummyInFile.fa"
    with open(inFile, "w") as fastaHandle:
        fastaHandle.write(">seq1\n")
        fastaHandle.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n")

    obsFile = "dummyObsFile.fa"
    FastaUtils.dbShuffle(inFile, obsFile, 1)

    observedSize = FastaUtils.dbSize(obsFile)
    self.assertTrue(observedSize == 1)

    os.remove(inFile)
    os.remove(obsFile)
1365
def test_dbShuffle_inputDir( self ):
    # Start from a fresh input directory holding a single fasta file.
    inDir = "dummyInDir"
    if os.path.exists(inDir):
        shutil.rmtree(inDir)
    os.mkdir(inDir)

    inFile = "%s/dummyInFile.fa" % inDir
    with open(inFile, "w") as fastaHandle:
        fastaHandle.write(">seq1\n")
        fastaHandle.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n")

    obsDir = "dummyObsDir"
    FastaUtils.dbShuffle(inDir, obsDir, 1)

    # Exactly one "<input name>_shuffle.fa" file is expected in obsDir.
    obsFile = "dummyInFile_shuffle.fa"
    matches = glob.glob("%s/%s" % (obsDir, obsFile))
    self.assertTrue(len(matches) == 1)

    shutil.rmtree(inDir)
    shutil.rmtree(obsDir)
1385
def test_convertClusterFileToFastaFile(self):
    # Sequence names used by the fixtures.
    r10 = "DTX-incomp_DmelChr4-B-R10-Map3_reversed"
    r9 = "DTX-incomp_DmelChr4-B-R9-Map3_reversed"
    g9 = "DTX-incomp_DmelChr4-B-G9-Map3"
    r5 = "PotentialHostGene-chim_DmelChr4-B-R5-Map5"
    r4 = "PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed"
    g220 = "RLX-incomp_DmelChr4-B-G220-Map3"

    # Cluster file: one cluster per line, members separated by tabs.
    inClusterFileName = "in.tab"
    with open(inClusterFileName, "w") as clusterHandle:
        for members in ((r10, r9, g9), (r5, r4), (g220,)):
            clusterHandle.write("\t".join(members) + "\n")

    # (name, sequence, expected header prefix), in fasta file order.
    shortSeq = "ATCGCATCGATCGATC"
    longSeq = "ATCGCATCGATCGATCATCGCATCGATCGATC"
    records = [
        (r10, shortSeq, "BlastclustCluster1Mb1"),
        (r9, shortSeq, "BlastclustCluster1Mb2"),
        (g220, "ATCGCC", "BlastclustCluster3Mb1"),
        (r5, longSeq, "BlastclustCluster2Mb1"),
        (r4, longSeq, "BlastclustCluster2Mb2"),
        (g9, shortSeq, "BlastclustCluster1Mb3"),
    ]

    inFastaFileName = "in.fa"
    with open(inFastaFileName, "w") as fastaHandle:
        for name, sequence, _prefix in records:
            fastaHandle.write(">%s\n" % name)
            fastaHandle.write("%s\n" % sequence)

    expFileName = "exp.fa"
    with open(expFileName, "w") as expHandle:
        for name, sequence, prefix in records:
            expHandle.write(">%s_%s\n" % (prefix, name))
            expHandle.write("%s\n" % sequence)

    obsFileName = "obs.fa"
    FastaUtils.convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, obsFileName, "Blastclust")

    self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))

    for path in (inClusterFileName, inFastaFileName, expFileName, obsFileName):
        os.remove(path)
1429
1430
def test_convertClusterFileToFastaFile_withoutUnclusterizedSequences(self):
    # Sequence names used by the fixtures.
    r10 = "DTX-incomp_DmelChr4-B-R10-Map3_reversed"
    r9 = "DTX-incomp_DmelChr4-B-R9-Map3_reversed"
    g9 = "DTX-incomp_DmelChr4-B-G9-Map3"
    r5 = "PotentialHostGene-chim_DmelChr4-B-R5-Map5"
    r4 = "PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed"
    g220 = "RLX-incomp_DmelChr4-B-G220-Map3"

    # Cluster file: the RLX sequence is absent from it, yet the expected
    # fasta still names it as the single member of cluster 3.
    inClusterFileName = "in.tab"
    with open(inClusterFileName, "w") as clusterHandle:
        for members in ((r10, r9, g9), (r5, r4)):
            clusterHandle.write("\t".join(members) + "\n")

    # (name, sequence, expected header prefix), in fasta file order.
    shortSeq = "ATCGCATCGATCGATC"
    longSeq = "ATCGCATCGATCGATCATCGCATCGATCGATC"
    records = [
        (r10, shortSeq, "BlastclustCluster1Mb1"),
        (r9, shortSeq, "BlastclustCluster1Mb2"),
        (g220, "ATCGCC", "BlastclustCluster3Mb1"),
        (r5, longSeq, "BlastclustCluster2Mb1"),
        (r4, longSeq, "BlastclustCluster2Mb2"),
        (g9, shortSeq, "BlastclustCluster1Mb3"),
    ]

    inFastaFileName = "in.fa"
    with open(inFastaFileName, "w") as fastaHandle:
        for name, sequence, _prefix in records:
            fastaHandle.write(">%s\n" % name)
            fastaHandle.write("%s\n" % sequence)

    expFileName = "exp.fa"
    with open(expFileName, "w") as expHandle:
        for name, sequence, prefix in records:
            expHandle.write(">%s_%s\n" % (prefix, name))
            expHandle.write("%s\n" % sequence)

    obsFileName = "obs.fa"
    FastaUtils.convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, obsFileName, "Blastclust")

    self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))

    for path in (inClusterFileName, inFastaFileName, expFileName, obsFileName):
        os.remove(path)
1473
def test_convertClusterFileToMapFile(self):
    # Same scenario for both clustering tool prefixes. The last sequence
    # line of the input deliberately has no trailing newline.
    fastaTemplate = (
        ">%(algo)sCluster1Mb1_chunk1 (dbseq-nr 1) [1,14]\n"
        "gaattgtttactta\n"
        ">%(algo)sCluster3Mb1_chunk5 (dbseq-nr 8) [1000,1014]\n"
        "gaattgtttactta\n"
        ">%(algo)sCluster1Mb2_chunk1 (dbseq-nr 1) [30,44]\n"
        "gaattgtttactta\n"
        ">%(algo)sCluster2Mb1_chunk2 (dbseq-nr 1) [100,114]\n"
        "gaattgtttactta"
    )
    mapTemplate = (
        "%(algo)sCluster1Mb1\tchunk1\t1\t14\n"
        "%(algo)sCluster3Mb1\tchunk5\t1000\t1014\n"
        "%(algo)sCluster1Mb2\tchunk1\t30\t44\n"
        "%(algo)sCluster2Mb1\tchunk2\t100\t114\n"
    )

    for clustAlgo in ("Blastclust", "MCL"):
        substitutions = {"algo": clustAlgo}

        inFileName = "dummy%sOut.fa" % clustAlgo
        with open(inFileName, "w") as fastaHandle:
            fastaHandle.write(fastaTemplate % substitutions)

        fileExp = "%sToMapExpected.map" % clustAlgo
        with open(fileExp, "w") as expHandle:
            expHandle.write(mapTemplate % substitutions)

        # Observed map file: input name with its extension swapped for ".map".
        baseName = os.path.splitext(inFileName)[0]
        fileObs = "%s.map" % baseName
        FastaUtils.convertClusteredFastaFileToMapFile(inFileName, fileObs)

        self.assertTrue(FileUtils.are2FilesIdentical(fileObs, fileExp))

        for path in (inFileName, fileObs, fileExp):
            os.remove(path)
1504
# Hand control to unittest's command-line runner when this module is
# executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()