Mercurial > repos > yufei-luo > s_mart
comparison smart_toolShed/commons/core/seq/test/Test_FastaUtils.py @ 0:e0f8dcca02ed
Uploaded S-MART tool, a toolbox that manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e0f8dcca02ed |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 from commons.core.seq.FastaUtils import FastaUtils | |
33 from commons.core.seq.test.Utils_for_T_FastaUtils import Utils_for_T_FastaUtils | |
34 from commons.core.utils.FileUtils import FileUtils | |
35 import glob | |
36 import os | |
37 import shutil | |
38 import unittest | |
39 | |
40 | |
41 class Test_FastaUtils( unittest.TestCase ): | |
42 | |
43 | |
def test_dbSize_for_empty_file(self):
    """dbSize() must return 0 for the file written by
    _createFastaFile_for_empty_file().
    """
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)

    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Delete the fixture even if dbSize() raises: other tests of this
        # class reuse the same file name.
        os.remove(fileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(0, obsNb)
53 | |
54 | |
def test_dbSize_one_sequence(self):
    """dbSize() must return 1 for the file written by
    _createFastaFile_one_sequence().
    """
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName)

    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Always drop the fixture, including when dbSize() raises.
        os.remove(fileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, obsNb)
64 | |
65 | |
def test_dbSize_four_sequences(self):
    """dbSize() must return 4 for the file written by
    _createFastaFile_four_sequences().
    """
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)

    try:
        obsNb = FastaUtils.dbSize(fileName)
    finally:
        # Always drop the fixture, including when dbSize() raises.
        os.remove(fileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(4, obsNb)
75 | |
76 | |
def test_dbChunks(self):
    """Compare the four files produced by dbChunks() with reference files.

    The observed files are looked up as the input name followed by
    '_chunks.fa', '_chunks.map', '_cut' and '.Nstretch.map'; each one is
    compared with a reference built by Utils_for_T_FastaUtils.
    """
    inFileName = "dummyBigSeqFastaFile.fa"
    lSuffixes = ['_chunks.fa', '_chunks.map', '_cut', '.Nstretch.map']
    lExpFiles = ['exp' + inFileName + suffix for suffix in lSuffixes]
    lObsFiles = [inFileName + suffix for suffix in lSuffixes]

    try:
        Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_Chunks(lExpFiles[0])
        Utils_for_T_FastaUtils._createMapFile_of_Chunks(lExpFiles[1])
        Utils_for_T_FastaUtils._createFastaFile_of_cut(lExpFiles[2])
        Utils_for_T_FastaUtils._createFastaFile_of_Nstretch(lExpFiles[3])

        FastaUtils.dbChunks(inFileName, '60', '10', '11', '', False, 0)

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Clean up whatever exists, so a failed assertion does not leave
        # files behind for the following tests.
        for f in [inFileName] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
110 | |
111 | |
def test_dbChunks_with_clean_and_prefix(self):
    """Run dbChunks() with 'outFile_chunks' as fifth argument and True as
    sixth, then compare 'outFile_chunks.fa' and 'outFile_chunks.map' with
    reference files built by Utils_for_T_FastaUtils.
    """
    inFileName = "dummyBigSeqFastaFile.fa"
    expChunksFileName = 'exp' + inFileName + '_chunks.fa'
    expChunksMapFileName = 'exp' + inFileName + '_chunks.map'
    obsChunksFileName = "outFile_chunks.fa"
    obsChunksMapFileName = "outFile_chunks.map"

    try:
        Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_Chunks(expChunksFileName)
        Utils_for_T_FastaUtils._createMapFile_of_Chunks(expChunksMapFileName)

        FastaUtils.dbChunks(inFileName, '60', '10', '11', 'outFile_chunks', True, 0)

        self.assertTrue(FileUtils.are2FilesIdentical(expChunksFileName, obsChunksFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expChunksMapFileName, obsChunksMapFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFileName, expChunksFileName, expChunksMapFileName,
                  obsChunksFileName, obsChunksMapFileName]:
            if os.path.exists(f):
                os.remove(f)
133 | |
134 | |
def test_dbCumLength_with_empty_file(self):
    """dbCumLength() must return 0 when given an open handle on the file
    written by _createFastaFile_for_empty_file().
    """
    inFileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFileName)

    try:
        # The 'with' block closes the handle even if dbCumLength() raises.
        with open(inFileName, "r") as inFileHandler:
            obsCumulLength = FastaUtils.dbCumLength(inFileHandler)
    finally:
        os.remove(inFileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(0, obsCumulLength)
147 | |
def test_dbCumLength_four_sequences(self):
    """dbCumLength() must return 1168 for the file written by
    _createFastaFile_four_sequences().
    """
    inFileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFileName)

    try:
        # The 'with' block closes the handle even if dbCumLength() raises.
        with open(inFileName, "r") as inFileHandler:
            obsCumulLength = FastaUtils.dbCumLength(inFileHandler)
    finally:
        os.remove(inFileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1168, obsCumulLength)
160 | |
161 | |
def test_dbLengths(self):
    """dbLengths() must return [7, 12] for a two-record FASTA file whose
    sequences are 7 and 12 characters long.
    """
    inFileName = "dummyFastaFile.fa"
    with open(inFileName, "w") as inF:
        inF.write(">seq1\nATGACGT\n")
        inF.write(">seq2\nATGGCGAGACGT\n")

    try:
        lObs = FastaUtils.dbLengths(inFileName)
    finally:
        # Previously the file was removed only after a successful
        # assertion, leaving it behind on failure.
        os.remove(inFileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual([7, 12], lObs)
172 | |
173 | |
def test_dbHeaders_with_empty_file(self):
    """dbHeaders() must return an empty list for the file written by
    _createFastaFile_for_empty_file().
    """
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFile)

    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        # Remove the fixture before asserting so a failure cannot leak it.
        os.remove(inFile)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual([], lObs)
181 | |
182 | |
def test_dbHeaders_with_one_sequence_without_header(self):
    """dbHeaders() must return an empty list for the file written by
    _createFastaFile_sequence_without_header().
    """
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_sequence_without_header(inFile)

    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        # Remove the fixture before asserting so a failure cannot leak it.
        os.remove(inFile)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual([], lObs)
190 | |
191 | |
def test_dbHeaders_four_sequences(self):
    """dbHeaders() must return 'seq 1' .. 'seq 4', in that order, for the
    file written by _createFastaFile_four_sequences().
    """
    inFile = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)

    try:
        lObs = FastaUtils.dbHeaders(inFile)
    finally:
        # Remove the fixture before asserting so a failure cannot leak it.
        os.remove(inFile)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(["seq 1", "seq 2", "seq 3", "seq 4"], lObs)
199 | |
200 | |
def test_dbSplit_no_in_file(self):
    """dbSplit() is expected to raise SystemExit here; this test never
    creates the input file it names.
    """
    missingFile = "dummyFastaFile.fa"
    self.assertRaises(SystemExit, FastaUtils.dbSplit, missingFile, 1, False)
209 | |
210 | |
def test_dbSplit_emptyFile(self):
    """dbSplit() must not create 'batch_1.fa' when the input is the file
    written by _createFastaFile_for_empty_file().
    """
    inFile = "dummyFastaFile.fa"
    unexpectedBatchFile = "batch_1.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFile)

    try:
        FastaUtils.dbSplit(inFile, 10, False, 1)
        self.assertFalse(os.path.exists(unexpectedBatchFile))
    finally:
        # Also remove the batch file if it was wrongly created, so that a
        # failure here does not disturb the other dbSplit tests.
        for f in [inFile, unexpectedBatchFile]:
            if os.path.exists(f):
                os.remove(f)
217 | |
218 | |
def test_dbSplit_oneSequence_tenSequencesPerBatch(self):
    """Split the one-sequence fixture with dbSplit(inFile, 10, False) and
    check that 'batch_1.fa' is identical to a copy of the same fixture.
    """
    inFile = "dummyFastaFile.fa"
    expBatchFile = "dummyExpBatch_1.fa"
    obsBatchFile = "batch_1.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(inFile)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expBatchFile)

        FastaUtils.dbSplit(inFile, 10, False)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatchFile, obsBatchFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFile, expBatchFile, obsBatchFile]:
            if os.path.exists(f):
                os.remove(f)
234 | |
235 | |
def test_dbSplit_fourSequences_threeSequencesPerBatch(self):
    """Split the four-sequence fixture with dbSplit(inFile, 3, False) and
    compare 'batch_1.fa' and 'batch_2.fa' with the reference batches.
    """
    inFile = "dummyFastaFile.fa"
    lExpFiles = ["dummyExpBatch_1.fa", "dummyExpBatch_2.fa"]
    lObsFiles = ["batch_1.fa", "batch_2.fa"]

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)
        Utils_for_T_FastaUtils._createBatch1_three_sequences(lExpFiles[0])
        Utils_for_T_FastaUtils._createBatch2_one_sequence(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 3, False)

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFile] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
255 | |
256 | |
def test_dbSplit_fourSequences_twoSequencesPerBatch_inBatchDirectory(self):
    """Split the four-sequence fixture with dbSplit(inFile, 2, True, 1) and
    compare 'batches/batch_1.fa' and 'batches/batch_2.fa' with the
    reference batches.
    """
    inFile = "dummyFastaFile.fa"
    lExpFiles = ["dummyExp_batch_1.fa", "dummyExp_batch_2.fa"]
    lObsFiles = ["batches/batch_1.fa", "batches/batch_2.fa"]

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFile)
        Utils_for_T_FastaUtils._createBatch1_two_sequences(lExpFiles[0])
        Utils_for_T_FastaUtils._createBatch2_two_sequences(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 2, True, 1)

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        for f in [inFile] + lExpFiles:
            if os.path.exists(f):
                os.remove(f)
        # The 'batches' directory itself used to be left behind; remove it
        # together with its content.
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
276 | |
277 | |
def test_dbSplit_tenSequences_oneSequencePerBatch_inBatchDirectory(self):
    """Split the ten-sequence fixture with dbSplit(inFile, 1, True) and
    compare 'batches/batch_01.fa' .. 'batches/batch_10.fa' with reference
    files built for headers 'seq 1' .. 'seq 10'.
    """
    inFile = "dummyFastaFile.fa"
    lExpFiles = []

    try:
        Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inFile)

        FastaUtils.dbSplit(inFile, 1, True)

        for nb in range(1, 11):
            # Batch files are expected with a zero-padded, two-digit index.
            suffix = "%02d" % nb
            expBatchFile = "exp_batch_%s.fa" % suffix
            obsBatchFile = "batches/batch_%s.fa" % suffix
            lExpFiles.append(expBatchFile)
            Utils_for_T_FastaUtils._createBatch_one_small_sequence(expBatchFile, "seq " + str(nb))
            self.assertTrue(FileUtils.are2FilesIdentical(expBatchFile, obsBatchFile))
    finally:
        # Clean up even when one comparison fails midway through the loop.
        for f in [inFile] + lExpFiles:
            if os.path.exists(f):
                os.remove(f)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
296 | |
297 | |
def test_dbSplit_twoSequences_oneSequencePerBatch_useSeqHeader(self):
    """Split the two-sequence fixture with dbSplit(inFile, 1, False, True)
    and check that the files matching 'seq*.fa' start with 'seq_1.fa' and
    'seq_2.fa', each identical to its reference file.
    """
    inFile = "dummyFastaFile.fa"
    lExpFileNames = ["seq_1.fa", "seq_2.fa"]
    lExpFiles = ["dummyExp_seq_1.fa", "dummyExp_seq_2.fa"]
    lObsFiles = []

    try:
        Utils_for_T_FastaUtils.createFastaFile_twoSequences(inFile)
        Utils_for_T_FastaUtils.createFastaFile_seq_1(lExpFiles[0])
        Utils_for_T_FastaUtils.createFastaFile_seq_2(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 1, False, True)

        lObsFiles = sorted(glob.glob("seq*.fa"))
        # Compare the names as a list so that a missing output shows up as
        # a failed assertion rather than an IndexError.
        self.assertEqual(lExpFileNames, lObsFiles[:len(lExpFileNames)])
        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFile] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
317 | |
318 | |
def test_dbSplit_twoSequences_otherPrefix(self):
    """Split the two-sequence fixture with
    dbSplit(inFile, 1, False, False, "query") and check that the files
    matching 'query_*.fa' start with 'query_1.fa' and 'query_2.fa', each
    identical to its reference file.
    """
    inFile = "dummyFastaFile.fa"
    lExpFileNames = ["query_1.fa", "query_2.fa"]
    lExpFiles = ["dummyExp_seq_1.fa", "dummyExp_seq_2.fa"]
    lObsFiles = []

    try:
        Utils_for_T_FastaUtils.createFastaFile_twoSequences(inFile)
        Utils_for_T_FastaUtils.createFastaFile_seq_1(lExpFiles[0])
        Utils_for_T_FastaUtils.createFastaFile_seq_2(lExpFiles[1])

        FastaUtils.dbSplit(inFile, 1, False, False, "query")

        lObsFiles = sorted(glob.glob("query_*.fa"))
        # Compare the names as a list so that a missing output shows up as
        # a failed assertion rather than an IndexError.
        self.assertEqual(lExpFileNames, lObsFiles[:len(lExpFileNames)])
        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFile] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
338 | |
339 | |
def test_splitFastaFileInBatches(self):
    """Run splitFastaFileInBatches(inFileName, 60) on a five-record FASTA
    file and compare 'batches/batch_1.fa' .. 'batches/batch_3.fa' with the
    three reference files written below.
    """
    inFileName = "dummyFastaFile.fa"
    lExpBatches = ["expBatch_1.fa", "expBatch_2.fa", "expBatch_3.fa"]
    lObsBatches = ["batches/batch_1.fa", "batches/batch_2.fa", "batches/batch_3.fa"]

    try:
        with open(inFileName, "w") as f:
            f.write(">seq1\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
            f.write(">seq2\n")
            f.write("ATCGCTAGCTAGCTCG\n")
            f.write(">seq3\n")
            f.write("GTTTGGATCGCT\n")
            f.write(">seq6\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
            f.write(">seq5\n")
            f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n")
        with open(lExpBatches[0], "w") as f:
            f.write(">seq6\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCT\n")
            f.write("GTTTGGATCGCTCTCTGCTCGGAAATCC\n")
        with open(lExpBatches[1], "w") as f:
            f.write(">seq1\n")
            f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n")
        with open(lExpBatches[2], "w") as f:
            f.write(">seq5\n")
            f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n")
            f.write(">seq2\n")
            f.write("ATCGCTAGCTAGCTCG\n")
            f.write(">seq3\n")
            f.write("GTTTGGATCGCT\n")

        FastaUtils.splitFastaFileInBatches(inFileName, 60)

        for expBatch, obsBatch in zip(lExpBatches, lObsBatches):
            self.assertTrue(FileUtils.are2FilesIdentical(expBatch, obsBatch))
    finally:
        # Clean up on failure as well, so later tests do not find stale
        # batches.
        for f in [inFileName] + lExpBatches:
            if os.path.exists(f):
                os.remove(f)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
386 | |
387 | |
def test_splitFastaFileInBatches_one_seq(self):
    """Run splitFastaFileInBatches(inFileName, 60) on a single-record FASTA
    file and check that 'batches/batch_1.fa' is identical to a copy of
    that record.
    """
    inFileName = "dummyFastaFile.fa"
    expBatch1 = "expBatch_1.fa"
    obsBatch1 = "batches/batch_1.fa"

    try:
        # Input and expected output hold the same single record.
        for name in [inFileName, expBatch1]:
            with open(name, "w") as f:
                f.write(">seq2\n")
                f.write("ATCGCTAGCTAGCTCG\n")

        FastaUtils.splitFastaFileInBatches(inFileName, 60)

        self.assertTrue(FileUtils.are2FilesIdentical(expBatch1, obsBatch1))
    finally:
        # Clean up on failure as well.
        for f in [inFileName, expBatch1]:
            if os.path.exists(f):
                os.remove(f)
        if os.path.isdir("batches"):
            shutil.rmtree("batches")
407 | |
408 | |
def test_splitSeqPerCluster_no_in_file(self):
    """splitSeqPerCluster() is expected to raise SystemExit here; this test
    never creates the input file it names.
    """
    missingFile = "dummyFastaFile.fa"
    self.assertRaises(
        SystemExit,
        FastaUtils.splitSeqPerCluster,
        missingFile, "Piler", False, False, "seqCluster"
    )
417 | |
418 | |
def test_splitSeqPerCluster_in_file_empty(self):
    """splitSeqPerCluster() on a zero-byte input must leave no file
    matching 'seqCluster*.fa' in the current directory.
    """
    inFileName = "dummyFastaFile.fa"
    # Create a zero-byte input file.
    with open(inFileName, 'w'):
        pass

    try:
        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")
        lObsFiles = glob.glob("seqCluster*.fa")
    finally:
        # Remove the input even when the call under test raises.
        os.remove(inFileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(lObsFiles, [])
429 | |
430 | |
def test_splitSeqPerCluster_four_sequences_without_dir(self):
    """Run splitSeqPerCluster(inFileName, "Piler", False, False,
    "seqCluster") and compare 'seqCluster1.fa', 'seqCluster2.fa' and
    'seqCluster3.574.fa' with the reference cluster files.
    """
    inFileName = "dummyFastaFile.fa"
    lObsFiles = ["seqCluster1.fa", "seqCluster2.fa", "seqCluster3.574.fa"]
    # Reference files use the observed names prefixed with 'exp_'.
    lExpFiles = ["exp_" + name for name in lObsFiles]

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(lExpFiles[0])
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(lExpFiles[1])
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(lExpFiles[2])

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFileName] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
459 | |
460 | |
def test_splitSeqPerCluster_four_sequences_without_dir_no_split(self):
    """Run splitSeqPerCluster() on the "same cluster" fixture and check
    that 'seqCluster1.fa' is identical to a second copy of that fixture.
    """
    inFileName = "dummyFastaFile.fa"
    expClusterFileName = "exp_seqCluster.fa"
    obsClusterFileName = "seqCluster1.fa"

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(expClusterFileName)

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFileName, expClusterFileName, obsClusterFileName]:
            if os.path.exists(f):
                os.remove(f)
477 | |
478 | |
def test_splitSeqPerCluster_four_sequences_without_dir_shuffle(self):
    """Run splitSeqPerCluster() on the "shuffle" fixture and compare
    'seqCluster1.fa', 'seqCluster2.fa' and 'seqCluster3.574.fa' with the
    reference cluster files.
    """
    inFileName = "dummyFastaFile.fa"
    lObsFiles = ["seqCluster1.fa", "seqCluster2.fa", "seqCluster3.574.fa"]
    # Reference files use the observed names prefixed with 'exp_'.
    lExpFiles = ["exp_" + name for name in lObsFiles]

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_shuffle(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(lExpFiles[0])
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(lExpFiles[1])
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(lExpFiles[2])

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", False, False, "seqCluster")

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFileName] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
507 | |
508 | |
def test_splitSeqPerCluster_four_sequences_simplify_header(self):
    """Run splitSeqPerCluster(inFileName, "Piler", True, False,
    "seqCluster") and compare the three cluster files with the
    "simplify header" reference files.
    """
    inFileName = "dummyFastaFile.fa"
    lObsFiles = ["seqCluster1.fa", "seqCluster2.fa", "seqCluster3.574.fa"]
    # Reference files use the observed names prefixed with 'exp_'.
    lExpFiles = ["exp_" + name for name in lObsFiles]

    try:
        Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
        Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result_with_simplify_header(lExpFiles[0])
        Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result_with_simplify_header(lExpFiles[1])
        Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result_with_simplify_header(lExpFiles[2])

        FastaUtils.splitSeqPerCluster(inFileName, "Piler", True, False, "seqCluster")

        for expFile, obsFile in zip(lExpFiles, lObsFiles):
            self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [inFileName] + lExpFiles + lObsFiles:
            if os.path.exists(f):
                os.remove(f)
537 | |
538 | |
def test_splitSeqPerCluster_four_sequences_with_dir(self):
    """Run splitSeqPerCluster(inFileName, "Piler", False, True,
    "seqCluster") and, for clusters '1', '2' and '3.574', compare
    '<inFileName>_cluster_<id>/seqCluster<id>.fa' with its reference file.
    """
    inFileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName)
    FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, True, "seqCluster")
    os.remove(inFileName)

    # Cluster identifier -> helper writing the matching reference file.
    lClusterBuilders = [
        ('1', Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result),
        ('2', Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result),
        ('3.574', Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result),
    ]

    for clusterId, buildExpected in lClusterBuilders:
        expClusterFileName = "exp_cluster" + clusterId + ".fa"
        buildExpected(expClusterFileName)

        clusterDir = inFileName + "_cluster_" + clusterId
        obsClusterFileName = clusterDir + "/seqCluster" + clusterId + ".fa"
        self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))

        os.remove(expClusterFileName)
        os.remove(obsClusterFileName)
        os.rmdir(clusterDir)
559 | |
560 | |
def test_dbLengthFilter_with_one_sequence(self):
    """Run dbLengthFilter(12, fileName, verbose=0) on the one-sequence
    fixture and compare the '.Inf12' and '.Sup12' files with references:
    the empty-file fixture for Inf, the one-sequence fixture for Sup.
    """
    fileName = "dummyFastaFile.fa"
    expFileNameInf = "exp_dummyFastaFile.fa.Inf12"
    expFileNameSup = "exp_dummyFastaFile.fa.Sup12"
    obsFileNameInf = "dummyFastaFile.fa.Inf12"
    obsFileNameSup = "dummyFastaFile.fa.Sup12"

    try:
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName)
        Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expFileNameInf)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameSup)

        FastaUtils.dbLengthFilter(12, fileName, verbose=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf))
        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [fileName, expFileNameInf, expFileNameSup,
                  obsFileNameInf, obsFileNameSup]:
            if os.path.exists(f):
                os.remove(f)
583 | |
def test_dbLengthFilter_with_four_sequence(self):
    """Run dbLengthFilter(130, fileName, verbose=0) on the four-sequence
    fixture and compare the '.Inf130' and '.Sup130' files with references
    built by Utils_for_T_FastaUtils.
    """
    fileName = "dummyFastaFile.fa"
    expFileNameInf = "exp_dummyFastaFile.fa.Inf130"
    expFileNameSup = "exp_dummyFastaFile.fa.Sup130"
    obsFileNameInf = "dummyFastaFile.fa.Inf130"
    obsFileNameSup = "dummyFastaFile.fa.Sup130"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)
        Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameInf)
        Utils_for_T_FastaUtils._createResult_of_dbLengthFilter_sup(expFileNameSup)

        FastaUtils.dbLengthFilter(130, fileName, verbose=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf))
        self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for f in [fileName, expFileNameInf, expFileNameSup,
                  obsFileNameInf, obsFileNameSup]:
            if os.path.exists(f):
                os.remove(f)
606 | |
def test_dbLongestSequences_with_empty_file(self):
    """dbLongestSequences(1, fileName) must return 0 for the file written
    by _createFastaFile_for_empty_file().
    """
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)

    try:
        obsResult = FastaUtils.dbLongestSequences(1, fileName)
    finally:
        # Previously the fixture was removed only after a successful
        # assertion.
        os.remove(fileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(0, obsResult)
618 | |
def test_dbLongestSequences_with_one_longest_sequence(self):
    """Ask dbLongestSequences() for 1 sequence of the four-sequence fixture
    and check that 'dummyFastaFile.fa.best1' holds the 'seq 3' record
    written below (seven full lines plus 'ATATTCG').
    """
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa.best1"
    obsFileName = "dummyFastaFile.fa.best1"
    fullLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName)
        with open(expFileName, 'w') as f:
            f.write(">seq 3\n")
            f.write(fullLine * 7)
            f.write("ATATTCG\n")

        FastaUtils.dbLongestSequences(1, fileName, outFileName="", verbose=0, minThresh=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
645 | |
def test_dbLongestSequences_with_two_longest_sequence(self):
    """Ask dbLongestSequences() for 2 sequences of the three-sequence
    fixture and check that 'dummyFastaFile.fa.best2' holds the 'seq 2' and
    'seq 4' records written below (five full lines plus 'ATATTCG' each).
    """
    fileName = "dummyFastaFile.fa"
    # Named '.best2' to match the observed file; it was '.best1' before.
    expFileName = "exp_dummyFastaFile.fa.best2"
    obsFileName = "dummyFastaFile.fa.best2"
    fullLine = "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        with open(expFileName, 'w') as f:
            for header in [">seq 2\n", ">seq 4\n"]:
                f.write(header)
                f.write(fullLine * 5)
                f.write("ATATTCG\n")

        FastaUtils.dbLongestSequences(2, fileName, outFileName="", verbose=0, minThresh=0)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
675 | |
def test_dbExtractSeqHeaders(self):
    """dbExtractSeqHeaders() on the three-sequence fixture must write
    'dummyFastaFile.fa.headers' containing 'seq 1', 'seq 2' and 'seq 4',
    one per line.
    """
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        with open(expFileName, 'w') as f:
            f.write("seq 1\n")
            f.write("seq 2\n")
            f.write("seq 4\n")

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
694 | |
def test_dbExtractSeqHeaders_with_empty_file(self):
    """dbExtractSeqHeaders() on the empty-file fixture must write a
    'dummyFastaFile.fa.headers' identical to a zero-byte reference file.
    """
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName)
        # The reference is a zero-byte file.
        with open(expFileName, 'w') as f:
            f.write("")

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
711 | |
def test_dbExtractSeqHeaders_without_header(self):
    """dbExtractSeqHeaders() on the "sequence without header" fixture must
    write a 'dummyFastaFile.fa.headers' identical to a zero-byte reference
    file.
    """
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.headers"

    try:
        Utils_for_T_FastaUtils._createFastaFile_sequence_without_header(fileName)
        # The reference is a zero-byte file.
        with open(expFileName, 'w') as f:
            f.write("")

        FastaUtils.dbExtractSeqHeaders(fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
728 | |
def test_dbExtractByPattern_without_pattern(self):
    """dbExtractByPattern() called with an empty pattern must return None."""
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)

    try:
        obsResult = FastaUtils.dbExtractByPattern("", fileName)
    finally:
        # Previously the fixture was removed only after a successful
        # assertion.
        os.remove(fileName)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(None, obsResult)
740 | |
def test_dbExtractByPattern(self):
    """dbExtractByPattern('seq', fileName) on the three-sequence fixture
    must write a 'dummyFastaFile.fa.extracted' identical to that fixture.
    """
    fileName = "dummyFastaFile.fa"
    expFileName = "exp_dummyFastaFile.fa"
    obsFileName = "dummyFastaFile.fa.extracted"

    try:
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)
        Utils_for_T_FastaUtils._createFastaFile_three_sequences(expFileName)

        FastaUtils.dbExtractByPattern('seq', fileName)

        self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))
    finally:
        # Remove every file that exists, whether or not the test passed.
        for name in [fileName, expFileName, obsFileName]:
            if os.path.exists(name):
                os.remove(name)
756 | |
def test_dbExtractByPattern_with_2_as_pattern(self):
    # Pattern ' 2' is expected to keep only the record headed '>seq 2'.
    inputFasta = "dummyFastaFile.fa"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.extracted"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(inputFasta)
    with open(expectedPath, 'w') as expHandle:
        expHandle.writelines([
            ">seq 2\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            "ATATTCG\n",
        ])

    FastaUtils.dbExtractByPattern(' 2', inputFasta)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, expectedPath, observedPath):
        os.remove(path)
780 | |
def test_dbExtractByPattern_with_sandie_as_pattern(self):
    # Pattern 'sandie' is compared against the file produced by
    # _createFastaFile_for_empty_file, i.e. no record is expected to match.
    inputFasta = "dummyFastaFile.fa"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.extracted"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(inputFasta)
    Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expectedPath)

    FastaUtils.dbExtractByPattern('sandie', inputFasta)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, expectedPath, observedPath):
        os.remove(path)
796 | |
def test_dbExtractByFilePattern_empty_pattern_filename(self):
    # An empty pattern-file name must make dbExtractByFilePattern raise
    # SystemExit. assertRaises replaces the manual try/except/flag idiom and
    # reports "SystemExit not raised" instead of "False is not true".
    patternFileName = ""
    self.assertRaises(SystemExit,
                      FastaUtils.dbExtractByFilePattern,
                      patternFileName, None, "")
805 | |
def test_dbExtractByFilePattern(self):
    # Using the patterns written by _createPatternFile, records 'seq 1',
    # 'seq 3', 'seq 8' and 'seq 10' of the ten-sequence fixture are expected
    # in the '.extracted' output.
    inputFasta = "dummyFastaFile.fa"
    patternPath = "dummyPatternFile.txt"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.extracted"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inputFasta)
    Utils_for_T_FastaUtils._createPatternFile(patternPath)
    with open(expectedPath, 'w') as expHandle:
        expHandle.writelines([
            ">seq 1\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 3\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 8\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 10\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
        ])

    FastaUtils.dbExtractByFilePattern(patternPath, inputFasta, "")

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, patternPath, expectedPath, observedPath):
        os.remove(path)
834 | |
def test_dbCleanByPattern_without_pattern(self):
    # With an empty pattern, dbCleanByPattern is expected to return None.
    fileName = "dummyFastaFile.fa"
    Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName)

    obsResult = FastaUtils.dbCleanByPattern("", fileName)

    expResult = None

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(expResult, obsResult)

    os.remove(fileName)
846 | |
def test_dbCleanByPattern(self):
    # Cleaning the ten-sequence fixture with pattern '2' is expected to drop
    # only the 'seq 2' record from the '.cleaned' output.
    inputFasta = "dummyFastaFile.fa"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.cleaned"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inputFasta)
    with open(expectedPath, 'w') as expHandle:
        expHandle.writelines([
            ">seq 1\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 3\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 4\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 5\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 6\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 7\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 8\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 9\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 10\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
        ])

    FastaUtils.dbCleanByPattern('2', inputFasta)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, expectedPath, observedPath):
        os.remove(path)
881 | |
def test_dbCleanByPattern_with_expectedFile_empty(self):
    # Cleaning the ten-sequence fixture with pattern 'seq' is expected to
    # leave an empty '.cleaned' file.
    inputFasta = "dummyFastaFile.fa"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.cleaned"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inputFasta)
    with open(expectedPath, 'w') as expHandle:
        expHandle.write("")

    FastaUtils.dbCleanByPattern('seq', inputFasta)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, expectedPath, observedPath):
        os.remove(path)
899 | |
def test_dbCleanByFilePattern_empty_pattern_filename(self):
    # An empty pattern-file name must make dbCleanByFilePattern raise
    # SystemExit. assertRaises replaces the manual try/except/flag idiom and
    # reports "SystemExit not raised" instead of "False is not true".
    patternFileName = ""
    self.assertRaises(SystemExit,
                      FastaUtils.dbCleanByFilePattern,
                      patternFileName, None, "")
908 | |
def test_dbCleanByFilePattern(self):
    # Using the patterns written by _createPatternFile, the '.cleaned' output
    # is expected to retain records 'seq 2', 4, 5, 6, 7 and 9 of the
    # ten-sequence fixture.
    inputFasta = "dummyFastaFile.fa"
    patternPath = "dummyPatternFile.txt"
    expectedPath = "exp_dummyFastaFile.fa"
    observedPath = "dummyFastaFile.fa.cleaned"
    Utils_for_T_FastaUtils._createFastaFile_ten_sequences(inputFasta)
    Utils_for_T_FastaUtils._createPatternFile(patternPath)
    with open(expectedPath, 'w') as expHandle:
        expHandle.writelines([
            ">seq 2\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 4\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 5\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 6\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 7\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
            ">seq 9\n",
            "ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n",
        ])

    FastaUtils.dbCleanByFilePattern(patternPath, inputFasta, "")

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, patternPath, expectedPath, observedPath):
        os.remove(path)
941 | |
def test_dbORF_without_ORF(self):
    # For this input, dbORF(fileName, 0, 0) is expected to write an empty
    # '<input>.ORF.map' file.
    inputFasta = "dummy.fa"
    expectedPath = "exp.ORF.map"
    observedPath = "%s.ORF.map" % inputFasta
    with open(inputFasta, "w") as fastaHandle:
        fastaHandle.writelines([
            ">dummy\n",
            "GGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTT\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.write("")

    FastaUtils.dbORF(inputFasta, 0, 0)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, observedPath, expectedPath):
        os.remove(path)
960 | |
def test_dbORF_with_one_ORF(self):
    # For this input, dbORF(fileName, 0, 0) is expected to report a single
    # map line: 'ORF|1|17' on seq1 from 16 to 33.
    inputFasta = "dummyFastaFile.fa"
    expectedPath = "exp_dummyORFFile.ORF.map"
    observedPath = inputFasta + ".ORF.map"
    with open(inputFasta, 'w') as fastaHandle:
        fastaHandle.writelines([
            ">seq1\n",
            "GAAAATATGGGGTAGATAAGGGATCTGGGTTAATTTTTT\n",
        ])
    with open(expectedPath, 'w') as expHandle:
        expHandle.write("ORF|1|17\tseq1\t16\t33\n")

    FastaUtils.dbORF(inputFasta, 0, 0)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, observedPath, expectedPath):
        os.remove(path)
981 | |
def test_dbORF_with_real_ORF(self):
    # Runs dbORF(fileName, 10, 30) on three sequences and compares the
    # resulting '<input>.ORF.map' with a fixed list of expected map lines.
    # NOTE(review): the meaning of the arguments 10 and 30 is not visible
    # here -- confirm against FastaUtils.dbORF.
    inputFasta = "dummy.fa"
    expectedPath = "exp.ORF.map"
    observedPath = "%s.ORF.map" % inputFasta
    with open(inputFasta, "w") as fastaHandle:
        fastaHandle.writelines([
            ">DmelChr4_Blaster_Recon_13_Map_4\n",
            "AAGTTGGACATTGAGGGCTTTCTTCGCCGTGTTTCGTTCTTTTCGACAAACAGCAGTGCT\n",
            "TTGCGGATCATTTTGTTTGAACAACCGACAATGCGACCAATTTCAGCGTAGGTTTTACCT\n",
            "TCAGAGATCACGTTTTTAATCAAATTTCTTTTTTCGACGGTACAATGCTTTCCGCGACCC\n",
            "ATGACTAGAGAATTTTTGGTCTTCGTTTGGAAAAAATTCAATTAAAACCTTTAATACAAC\n",
            "TCCTTTTTTCAAAATTTTTCGAAAAAAACCCAAAGCAATCACTCCTATTAATTTTATTCA\n",
            "GCAAATACGTGTTCAGTGCTATTTTTGTTACCGCCTCATTTCGCGCACTTTTGCAGCAAG\n",
            "TGCCCAAAAACAAAAAGAACCGTTACATTGAGAGACTAAAAATTTCTTGCTCAGAGAGCC\n",
            "AACATATGGTACTTATTATTCATGCAATCTGACTTAAAAAAATATAAACATTTAATAATT\n",
            "TTTTTTAGGAAATCAACTTTCCACCTGCAGTAGTGCTATTATTTTAACCGCAGCTGTATA\n",
            ">DmelChr4_Blaster_Piler_3.5_Map_7\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n",
            "AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGT\n",
            "TAGGGCTAGGGTTAGGGGTTAGGGTTAGGGTTAGGCTTAGGGTTAGGGTTAGGGTTAGGG\n",
            "TTAGGGTTAGGGTTAGGGTTAGGAGTTAGGGTGTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
            "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
            "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGCTAGGGTTAGGGTTAG\n",
            "GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n",
            ">DmelChr4_Blaster_Grouper_10_Map_13\n",
            "GCAAAGACACTAGAATAACAAGATGCGTAACGGCCATACATTGGTTTGGCACTATGCAGC\n",
            "CACTTTTTTGGTGACGGCCAAAATTACTCTCTTTCCGCTCACTCCCGCTGAGAGCGTAAG\n",
            "AAATCTAAAAATATAATTTGCTTGCTTGTGTGAGTAAAAACAAGAGACGAGAACGCGTAT\n",
            "AAGTGTGCGTGTTGTGCTAGAAGACGATTTTCGGGACCGAAATCAATTCTGATCGAAGAA\n",
            "ACGAATTTACATGGTACATATTAGGGTAGTTTTTGCCAATTTCCTAGCAATATGATAAAA\n",
            "TAAAAAAATTTTTAAAAATTCGCGCCCTGACTATTATAATTTTAAAGCTTTTTAAAATTT\n",
            "GTTTGTTAAAATCGCCGCTCGAATTAGCTACCGTTTACACATTTATATTTATGTTTAATT\n",
            "CTAATTTGTCTCTCATCTGACAATTTTTTAAGAAAGCGAAATATTTTTTTTTTGAAACAC\n",
            "TTTTAATGTTAATGTTACATCATATTAAGTCAAATGATTTAATAAATATACTAAATAATT\n",
            "AAATATGATAACTGTTTATTGCAAAAGTAATATCAAAGACACTAGAATTATTCTAGTGTC\n",
            "TTTGCTTTGTTCATATCTTGAGGCACGAAGTGCGGACACAAGCACTCAACAATCATTGCC\n",
            "TTATTAATTTTTCACACGCCGCAAGATGAATACTCTAATGACAAATATTCTTATATAAAG\n",
            "TCATTTTTGAAATTTATTTTTGTGATAATATGTACATAGATTTGGCTATTTCTAATCTAT\n",
            "TTTCAAATAATAATAACGTTAAGGCAATGCAAAACAAGAATTTTTTTAGTCGCATGGTGC\n",
            "CAATTGATCAAAAATAATATAGATTTAAAGTCTAAGAACTTCTAAGGTGAAGGGCATATT\n",
            "TTGTCAAATTTACAATGCATGAGCGAGCATACGTGTGCACACATACAGTTGTCTGCTATC\n",
            "ACTTTGTGCGTTGAAAA\n",
        ])

    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            "ORF|3|263\tDmelChr4_Blaster_Recon_13_Map_4\t189\t452\n",
            "ORF|2|206\tDmelChr4_Blaster_Recon_13_Map_4\t185\t391\n",
            "ORF|-3|164\tDmelChr4_Blaster_Recon_13_Map_4\t382\t218\n",
            "ORF|-1|161\tDmelChr4_Blaster_Recon_13_Map_4\t297\t136\n",
            "ORF|1|113\tDmelChr4_Blaster_Recon_13_Map_4\t400\t513\n",
            "ORF|1|113\tDmelChr4_Blaster_Recon_13_Map_4\t112\t225\n",
            "ORF|3|107\tDmelChr4_Blaster_Recon_13_Map_4\t81\t188\n",
            "ORF|1|107\tDmelChr4_Blaster_Recon_13_Map_4\t292\t399\n",
            "ORF|-1|104\tDmelChr4_Blaster_Recon_13_Map_4\t432\t328\n",
            "ORF|-2|104\tDmelChr4_Blaster_Recon_13_Map_4\t515\t411\n",
            "ORF|3|116\tDmelChr4_Blaster_Piler_3.5_Map_7\t393\t509\n",
            "ORF|-3|116\tDmelChr4_Blaster_Piler_3.5_Map_7\t505\t389\n",
            "ORF|-2|86\tDmelChr4_Blaster_Piler_3.5_Map_7\t518\t432\n",
            "ORF|1|80\tDmelChr4_Blaster_Piler_3.5_Map_7\t436\t516\n",
            "ORF|-3|170\tDmelChr4_Blaster_Grouper_10_Map_13\t222\t52\n",
            "ORF|-1|161\tDmelChr4_Blaster_Grouper_10_Map_13\t260\t99\n",
            "ORF|3|155\tDmelChr4_Blaster_Grouper_10_Map_13\t702\t857\n",
            "ORF|3|152\tDmelChr4_Blaster_Grouper_10_Map_13\t288\t440\n",
            "ORF|1|137\tDmelChr4_Blaster_Grouper_10_Map_13\t622\t759\n",
            "ORF|2|128\tDmelChr4_Blaster_Grouper_10_Map_13\t539\t667\n",
            "ORF|1|125\tDmelChr4_Blaster_Grouper_10_Map_13\t760\t885\n",
            "ORF|2|122\tDmelChr4_Blaster_Grouper_10_Map_13\t14\t136\n",
            "ORF|-2|113\tDmelChr4_Blaster_Grouper_10_Map_13\t847\t734\n",
            "ORF|1|110\tDmelChr4_Blaster_Grouper_10_Map_13\t154\t264\n",
        ])

    FastaUtils.dbORF(inputFasta, 10, 30)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in (inputFasta, observedPath, expectedPath):
        os.remove(path)
1062 | |
def test_sortSequencesByIncreasingLength(self):
    # Three records of 60, 120 and 32 bases are expected to be rewritten
    # shortest first (seq3, seq1, seq2).
    fileName = "dummyFastaFile.fa"
    with open(fileName, 'w') as f:
        f.write(">seq1_length_60\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq2_length_120\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq3_length_32\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")

    expFileName = "exp_dummyFastaFile.fa"
    with open(expFileName, 'w') as f:
        f.write(">seq3_length_32\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")
        f.write(">seq1_length_60\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq2_length_120\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")

    obsFileName = "obs_dummyFastaFile.fa"

    FastaUtils.sortSequencesByIncreasingLength(fileName, obsFileName, 0)

    self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))

    # The input fixture used to be left on disk, where it could leak into
    # later tests reusing the same name. The existence check guards against
    # the function under test having already consumed the input.
    if os.path.exists(fileName):
        os.remove(fileName)
    os.remove(expFileName)
    os.remove(obsFileName)
1095 | |
def test_sortSequencesByIncreasingLength_in_file_do_not_exists(self):
    # "dummyFile.fa" is not created by this test; the call is expected to
    # raise SystemExit. assertRaises replaces the manual try/except/flag
    # idiom and reports "SystemExit not raised" on failure.
    fileName = "dummyFile.fa"
    self.assertRaises(SystemExit,
                      FastaUtils.sortSequencesByIncreasingLength,
                      fileName, "", 0)
1105 | |
def test_sortSequencesByHeader(self):
    # Records headed 'seq1::test-test', 'seq3', 'seq2' are expected to be
    # rewritten in the order 'seq1::test-test', 'seq2', 'seq3'.
    fileName = "dummyFastaFile.fa"
    with open(fileName, "w") as f:
        f.write(">seq1::test-test\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq3\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")
        f.write(">seq2\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
    expFileName = "expFastaFile.fa"
    with open(expFileName, "w") as f:
        f.write(">seq1::test-test\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq2\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq3\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")

    obsFileName = "obsFastaFile.fa"
    FastaUtils.sortSequencesByHeader(fileName, obsFileName)
    self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))

    # The input fixture used to be left on disk, where it could leak into
    # later tests reusing the same name. The existence check guards against
    # the function under test having already consumed the input.
    if os.path.exists(fileName):
        os.remove(fileName)
    os.remove(expFileName)
    os.remove(obsFileName)
1134 | |
def test_sortSequencesByHeader_no_outFileName(self):
    # Called without an output name, the result is looked for in
    # "dummyFastaFile_sortByHeaders.fa". Records 'seq12', 'seq1', 'seq2' are
    # expected in the order 'seq1', 'seq12', 'seq2'.
    fileName = "dummyFastaFile.fa"
    with open(fileName, "w") as f:
        f.write(">seq12\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq1\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")
        f.write(">seq2\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
    expFileName = "expFastaFile.fa"
    with open(expFileName, "w") as f:
        f.write(">seq1\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n")
        f.write(">seq12\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write(">seq2\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")
        f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n")

    obsFileName = "dummyFastaFile_sortByHeaders.fa"
    FastaUtils.sortSequencesByHeader(fileName)
    self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName))

    # The input fixture used to be left on disk, where it could leak into
    # later tests reusing the same name. The existence check guards against
    # the function under test having already consumed the input.
    if os.path.exists(fileName):
        os.remove(fileName)
    os.remove(expFileName)
    os.remove(obsFileName)
1163 | |
def test_getLengthPerHeader(self):
    # getLengthPerHeader(inFile, 0) is expected to return a dict mapping
    # each header to its sequence length; seq3 spans two lines of 15 bases.
    inFile = "dummyFile.fa"
    # Context manager guarantees the fixture is flushed and closed even if
    # a write fails.
    with open(inFile, "w") as inFileHandler:
        inFileHandler.write(">seq1\nAGCGATGCGT\n")
        inFileHandler.write(">seq2\nAGCGATG\n")
        inFileHandler.write(">seq3\nAGCGATGGTGCGTGC\n")
        inFileHandler.write("AGCGATGGTGCGTGC\n")

    dExp = {"seq1": 10, "seq2": 7, "seq3": 30}

    dObs = FastaUtils.getLengthPerHeader(inFile, 0)

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(dExp, dObs)

    os.remove(inFile)
1180 | |
def test_convertFastaHeadersFromChkToChr_grouper(self):
    # Headers of the form '<name> chunkN {Fragment} a..b[,c..d...]' are
    # expected to be rewritten with the chromosome name and coordinates
    # taken from the chunk map file.
    fastaPath = "dummyFastaFile.fa"
    mapPath = "dummyMapFile.map"
    expectedPath = "expFile.fa"
    observedPath = "obsFile.fa"
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">MbQ1Gr1Cl0 chunk6 {Fragment} 95523..96053\n",
            "AGCGTGCA\n",
            ">MbQ77Gr8Cl0 chunk7 {Fragment} 123657..122568,121935..121446\n",
            "AGCATGC\n",
            ">MbS78Gr8Cl0 chunk7 {Fragment} 140078..139519,139470..138985,138651..138183\n",
            "CGTGCG\n",
            ">MbQ79Gr8Cl0 chunk7 {Fragment} 48021..48587,48669..49153,57346..57834\n",
            "AGCGTGC\n",
        ])
    with open(mapPath, "w") as mapHandle:
        mapHandle.writelines([
            "chunk5\tdmel_chr4\t760001\t960000\n",
            "chunk6\tdmel_chr4\t950001\t1150000\n",
            "chunk7\tdmel_chr4\t1140001\t1281640\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            ">MbQ1Gr1Cl0 dmel_chr4 {Fragment} 1045523..1046053\n",
            "AGCGTGCA\n",
            ">MbQ77Gr8Cl0 dmel_chr4 {Fragment} 1263657..1262568,1261935..1261446\n",
            "AGCATGC\n",
            ">MbS78Gr8Cl0 dmel_chr4 {Fragment} 1280078..1279519,1279470..1278985,1278651..1278183\n",
            "CGTGCG\n",
            ">MbQ79Gr8Cl0 dmel_chr4 {Fragment} 1188021..1188587,1188669..1189153,1197346..1197834\n",
            "AGCGTGC\n",
        ])

    FastaUtils.convertFastaHeadersFromChkToChr(fastaPath, mapPath, observedPath)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in [fastaPath, mapPath, expectedPath, observedPath]:
        os.remove(path)
1215 | |
def test_convertFastaHeadersFromChkToChr_blastclust(self):
    # Headers of the form '<name>_chunkN (dbseq-nr k) [a,b]' are expected to
    # become '<name> <chromosome> (dbseq-nr k) a'..b'' using the chunk map.
    fastaPath = "dummyFastaFile.fa"
    mapPath = "dummyMapFile.map"
    expectedPath = "expFile.fa"
    observedPath = "obsFile.fa"
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">BlastclustCluster12Mb63_chunk1 (dbseq-nr 0) [1,10]\n",
            "AGCGTGCA\n",
            ">BlastclustCluster12Mb53_chunk2 (dbseq-nr 2) [1,10]\n",
            "AGCATGC\n",
            ">BlastclustCluster12Mb26_chunk2 (dbseq-nr 2) [12,18]\n",
            "CGTGCG\n",
            ">BlastclustCluster12Mb35_chunk3 (dbseq-nr 0) [10,1]\n",
            "AGCGTGC\n",
        ])
    with open(mapPath, "w") as mapHandle:
        mapHandle.writelines([
            "chunk1\tchromosome1\t1\t20\n",
            "chunk2\tchromosome1\t16\t35\n",
            "chunk3\tchromosome2\t1\t20\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            ">BlastclustCluster12Mb63 chromosome1 (dbseq-nr 0) 1..10\n",
            "AGCGTGCA\n",
            ">BlastclustCluster12Mb53 chromosome1 (dbseq-nr 2) 16..25\n",
            "AGCATGC\n",
            ">BlastclustCluster12Mb26 chromosome1 (dbseq-nr 2) 27..33\n",
            "CGTGCG\n",
            ">BlastclustCluster12Mb35 chromosome2 (dbseq-nr 0) 10..1\n",
            "AGCGTGC\n",
        ])

    FastaUtils.convertFastaHeadersFromChkToChr(fastaPath, mapPath, observedPath)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in [fastaPath, mapPath, expectedPath, observedPath]:
        os.remove(path)
1250 | |
def test_convertFastaToLength(self):
    # Each record is expected to yield one '<first header word>\t<length>'
    # line in the output file.
    fastaPath = "dummyFastaFile.fa"
    expectedPath = "expFile.length"
    observedPath = "obsFile.length"
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n",
            "AGCGTGCA\n",
            ">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n",
            "AGCATGCAA\n",
            ">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n",
            "CGTGCGAAAA\n",
            ">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n",
            "AGCGTG\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            "ReconCluster12Mb63\t8\n",
            "ReconCluster12Mb53\t9\n",
            "ReconCluster12Mb26\t10\n",
            "ReconCluster12Mb35\t6\n",
        ])

    FastaUtils.convertFastaToLength(fastaPath, observedPath)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in [fastaPath, expectedPath, observedPath]:
        os.remove(path)
1280 | |
def test_convertFastaToSeq(self):
    # Each record is expected to yield one tab-separated line holding the
    # first header word, the sequence, the full header and the length.
    fastaPath = "dummyFastaFile.fa"
    expectedPath = "expFile.seq"
    observedPath = "obsFile.seq"
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n",
            "AGCGTGCA\n",
            ">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n",
            "AGCATGCAA\n",
            ">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n",
            "CGTGCGAAAA\n",
            ">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n",
            "AGCGTG\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            "ReconCluster12Mb63\tAGCGTGCA\tReconCluster12Mb63 chunk1 {Fragment} 1..10\t8\n",
            "ReconCluster12Mb53\tAGCATGCAA\tReconCluster12Mb53 chunk2 {Fragment} 1..10\t9\n",
            "ReconCluster12Mb26\tCGTGCGAAAA\tReconCluster12Mb26 chunk2 {Fragment} 12..18\t10\n",
            "ReconCluster12Mb35\tAGCGTG\tReconCluster12Mb35 chunk3 {Fragment} 10..1\t6\n",
        ])

    FastaUtils.convertFastaToSeq(fastaPath, observedPath)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in [fastaPath, expectedPath, observedPath]:
        os.remove(path)
1310 | |
def test_spliceFromCoords(self):
    # The coordinate file lists ranges per chromosome (some written with
    # start > end); the expected output holds each chromosome with those
    # ranges removed from its sequence.
    coordPath = "dummyCoordFile"
    genomePath = "dummyGenomeFile"
    expectedPath = "dummyExpFile"
    observedPath = "dummyObsFile"
    with open(coordPath, "w") as coordHandle:
        coordHandle.writelines([
            "TE1\tchr1\t2\t5\n",
            "TE2\tchr1\t15\t11\n",
            "TE3\tchr2\t1\t3\n",
            "TE1\tchr2\t8\t10\n",
            "TE4\tchr3\t3\t1\n",
            "TE4\tchr3\t6\t4\n",
        ])
    with open(genomePath, "w") as genomeHandle:
        genomeHandle.writelines([
            ">chr1\n",
            "AGGGGAAAAACCCCCAAAAA\n",
            ">chr2\n",
            "GGGAAAAGGG\n",
            ">chr3\n",
            "GGGGGGTTTT\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            ">chr1\n",
            "AAAAAAAAAAA\n",
            ">chr2\n",
            "AAAA\n",
            ">chr3\n",
            "TTTT\n",
        ])

    FastaUtils.spliceFromCoords(genomePath, coordPath, observedPath)

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)

    for path in [coordPath, genomePath, expectedPath, observedPath]:
        os.remove(path)
1350 | |
def test_dbShuffle_inputFile(self):
    # Shuffling a one-sequence FASTA file must produce an output file for
    # which dbSize reports exactly one sequence.
    inFile = "dummyInFile.fa"
    with open(inFile, "w") as inFileHandler:
        inFileHandler.write(">seq1\n")
        inFileHandler.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n")

    obsFile = "dummyObsFile.fa"
    FastaUtils.dbShuffle(inFile, obsFile, 1)

    # assertEqual (rather than assertTrue(a == b)) reports the observed
    # size when the check fails.
    self.assertEqual(1, FastaUtils.dbSize(obsFile))

    for f in [inFile, obsFile]:
        os.remove(f)
1365 | |
def test_dbShuffle_inputDir(self):
    # When given a directory, dbShuffle is expected to write
    # '<obsDir>/dummyInFile_shuffle.fa' for the input 'dummyInFile.fa'.
    inDir = "dummyInDir"
    # Start from a clean directory in case a previous run left one behind.
    if os.path.exists(inDir):
        shutil.rmtree(inDir)
    os.mkdir(inDir)
    inFile = "%s/dummyInFile.fa" % inDir
    with open(inFile, "w") as inFileHandler:
        inFileHandler.write(">seq1\n")
        inFileHandler.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n")

    obsDir = "dummyObsDir"
    FastaUtils.dbShuffle(inDir, obsDir, 1)

    obsFile = "dummyInFile_shuffle.fa"
    # assertEqual (rather than assertTrue(a == b)) reports the number of
    # matching files when the check fails.
    self.assertEqual(1, len(glob.glob("%s/%s" % (obsDir, obsFile))))

    for d in [inDir, obsDir]:
        shutil.rmtree(d)
1385 | |
def test_convertClusterFileToFastaFile(self):
    # Each line of the cluster file is one cluster; every FASTA header is
    # expected to gain a 'BlastclustCluster<i>Mb<j>_' prefix, keeping the
    # record order of the input FASTA file.
    clusterPath = "in.tab"
    fastaPath = "in.fa"
    expectedPath = "exp.fa"
    observedPath = "obs.fa"
    with open(clusterPath, "w") as clusterHandle:
        clusterHandle.writelines([
            "DTX-incomp_DmelChr4-B-R10-Map3_reversed\tDTX-incomp_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_DmelChr4-B-G9-Map3\n",
            "PotentialHostGene-chim_DmelChr4-B-R5-Map5\tPotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
            "RLX-incomp_DmelChr4-B-G220-Map3\n",
        ])
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">DTX-incomp_DmelChr4-B-R10-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">DTX-incomp_DmelChr4-B-R9-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">RLX-incomp_DmelChr4-B-G220-Map3\n",
            "ATCGCC\n",
            ">PotentialHostGene-chim_DmelChr4-B-R5-Map5\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">DTX-incomp_DmelChr4-B-G9-Map3\n",
            "ATCGCATCGATCGATC\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            ">BlastclustCluster1Mb1_DTX-incomp_DmelChr4-B-R10-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">BlastclustCluster1Mb2_DTX-incomp_DmelChr4-B-R9-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">BlastclustCluster3Mb1_RLX-incomp_DmelChr4-B-G220-Map3\n",
            "ATCGCC\n",
            ">BlastclustCluster2Mb1_PotentialHostGene-chim_DmelChr4-B-R5-Map5\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">BlastclustCluster2Mb2_PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">BlastclustCluster1Mb3_DTX-incomp_DmelChr4-B-G9-Map3\n",
            "ATCGCATCGATCGATC\n",
        ])

    FastaUtils.convertClusterFileToFastaFile(clusterPath, fastaPath, observedPath, "Blastclust")

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)
    for path in (clusterPath, fastaPath, expectedPath, observedPath):
        os.remove(path)
1429 | |
1430 | |
def test_convertClusterFileToFastaFile_withoutUnclusterizedSequences(self):
    # The cluster file omits 'RLX-incomp_DmelChr4-B-G220-Map3', yet the
    # expected output still labels that record 'BlastclustCluster3Mb1_',
    # i.e. a sequence absent from the cluster file is expected to get a
    # cluster of its own.
    clusterPath = "in.tab"
    fastaPath = "in.fa"
    expectedPath = "exp.fa"
    observedPath = "obs.fa"
    with open(clusterPath, "w") as clusterHandle:
        clusterHandle.writelines([
            "DTX-incomp_DmelChr4-B-R10-Map3_reversed\tDTX-incomp_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_DmelChr4-B-G9-Map3\n",
            "PotentialHostGene-chim_DmelChr4-B-R5-Map5\tPotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
        ])
    with open(fastaPath, "w") as fastaHandle:
        fastaHandle.writelines([
            ">DTX-incomp_DmelChr4-B-R10-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">DTX-incomp_DmelChr4-B-R9-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">RLX-incomp_DmelChr4-B-G220-Map3\n",
            "ATCGCC\n",
            ">PotentialHostGene-chim_DmelChr4-B-R5-Map5\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">DTX-incomp_DmelChr4-B-G9-Map3\n",
            "ATCGCATCGATCGATC\n",
        ])
    with open(expectedPath, "w") as expHandle:
        expHandle.writelines([
            ">BlastclustCluster1Mb1_DTX-incomp_DmelChr4-B-R10-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">BlastclustCluster1Mb2_DTX-incomp_DmelChr4-B-R9-Map3_reversed\n",
            "ATCGCATCGATCGATC\n",
            ">BlastclustCluster3Mb1_RLX-incomp_DmelChr4-B-G220-Map3\n",
            "ATCGCC\n",
            ">BlastclustCluster2Mb1_PotentialHostGene-chim_DmelChr4-B-R5-Map5\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">BlastclustCluster2Mb2_PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n",
            "ATCGCATCGATCGATCATCGCATCGATCGATC\n",
            ">BlastclustCluster1Mb3_DTX-incomp_DmelChr4-B-G9-Map3\n",
            "ATCGCATCGATCGATC\n",
        ])

    FastaUtils.convertClusterFileToFastaFile(clusterPath, fastaPath, observedPath, "Blastclust")

    filesMatch = FileUtils.are2FilesIdentical(expectedPath, observedPath)
    self.assertTrue(filesMatch)
    for path in (clusterPath, fastaPath, expectedPath, observedPath):
        os.remove(path)
1473 | |
def test_convertClusterFileToMapFile(self):
    """Convert a clustered FASTA file to a map file for each clustering algorithm prefix.

    For each of "Blastclust" and "MCL", a FASTA file is written whose
    headers look like "<algo>Cluster<i>Mb<j>_<chunk> (dbseq-nr N) [start,end]".
    The expected map file holds one tab-separated line per header:
    "<algo>Cluster<i>Mb<j>", chunk name, start, end.  The file produced by
    FastaUtils.convertClusteredFastaFileToMapFile must be identical to it.

    Files are written through ``with`` blocks and removed in a ``finally``
    clause, so neither open handles nor scratch files outlive a failing
    iteration.
    """
    for clustAlgo in ["Blastclust", "MCL"]:
        inFileName = "dummy%sOut.fa" % clustAlgo
        fileExp = "%sToMapExpected.map" % clustAlgo
        fileObs = "%s.map" % os.path.splitext(inFileName)[0]
        try:
            with open(inFileName, "w") as inF:
                inF.write(">%sCluster1Mb1_chunk1 (dbseq-nr 1) [1,14]\n" % clustAlgo)
                inF.write("gaattgtttactta\n")
                inF.write(">%sCluster3Mb1_chunk5 (dbseq-nr 8) [1000,1014]\n" % clustAlgo)
                inF.write("gaattgtttactta\n")
                inF.write(">%sCluster1Mb2_chunk1 (dbseq-nr 1) [30,44]\n" % clustAlgo)
                inF.write("gaattgtttactta\n")
                inF.write(">%sCluster2Mb1_chunk2 (dbseq-nr 1) [100,114]\n" % clustAlgo)
                # The last record has no trailing newline, as in the original fixture.
                inF.write("gaattgtttactta")

            with open(fileExp, "w") as outF:
                outF.write("%sCluster1Mb1\tchunk1\t1\t14\n" % clustAlgo)
                outF.write("%sCluster3Mb1\tchunk5\t1000\t1014\n" % clustAlgo)
                outF.write("%sCluster1Mb2\tchunk1\t30\t44\n" % clustAlgo)
                outF.write("%sCluster2Mb1\tchunk2\t100\t114\n" % clustAlgo)

            FastaUtils.convertClusteredFastaFileToMapFile(inFileName, fileObs)

            self.assertTrue(FileUtils.are2FilesIdentical(fileObs, fileExp))
        finally:
            # Clean up even when the conversion raises or the assertion fails;
            # skip files that were never created so the original error is not masked.
            for fileName in (inFileName, fileObs, fileExp):
                if os.path.exists(fileName):
                    os.remove(fileName)
1504 | |
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()