Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/commons/core/seq/test/Test_FastaUtils.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox that manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/commons/core/seq/test/Test_FastaUtils.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,1506 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. 
+ + +from commons.core.seq.FastaUtils import FastaUtils +from commons.core.seq.test.Utils_for_T_FastaUtils import Utils_for_T_FastaUtils +from commons.core.utils.FileUtils import FileUtils +import glob +import os +import shutil +import unittest + + +class Test_FastaUtils( unittest.TestCase ): + + + def test_dbSize_for_empty_file(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName) + + obsNb = FastaUtils.dbSize( fileName ) + + expNb = 0 + os.remove(fileName) + self.assertEquals(expNb, obsNb) + + + def test_dbSize_one_sequence(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName) + + obsNb = FastaUtils.dbSize( fileName ) + + expNb = 1 + os.remove(fileName) + self.assertEquals(expNb, obsNb) + + + def test_dbSize_four_sequences(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName) + + obsNb = FastaUtils.dbSize( fileName ) + + expNb = 4 + os.remove(fileName) + self.assertEquals(expNb, obsNb) + + + def test_dbChunks(self): + inFileName = "dummyBigSeqFastaFile.fa" + expChunksFileName = 'exp' + inFileName +'_chunks.fa' + expChunksMapFileName = 'exp' + inFileName +'_chunks.map' + expCutFileName = 'exp' + inFileName +'_cut' + expNStretchFileName = 'exp' + inFileName +'.Nstretch.map' + Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName) + Utils_for_T_FastaUtils._createFastaFile_of_Chunks(expChunksFileName) + Utils_for_T_FastaUtils._createMapFile_of_Chunks(expChunksMapFileName) + Utils_for_T_FastaUtils._createFastaFile_of_cut(expCutFileName) + Utils_for_T_FastaUtils._createFastaFile_of_Nstretch(expNStretchFileName) + + FastaUtils.dbChunks(inFileName, '60', '10', '11', '', False, 0) + + obsChunksFileName = inFileName +'_chunks.fa' + obsChunksMapFileName = inFileName +'_chunks.map' + obsCutFileName = inFileName +'_cut' + obsNStretchFileName = inFileName +'.Nstretch.map' + + 
self.assertTrue(FileUtils.are2FilesIdentical(expChunksFileName, obsChunksFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expChunksMapFileName, obsChunksMapFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expCutFileName, obsCutFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expNStretchFileName, obsNStretchFileName)) + + os.remove(inFileName) + os.remove(expChunksFileName) + os.remove(expChunksMapFileName) + os.remove(expCutFileName) + os.remove(expNStretchFileName) + os.remove(obsChunksFileName) + os.remove(obsChunksMapFileName) + os.remove(obsCutFileName) + os.remove(obsNStretchFileName) + + + def test_dbChunks_with_clean_and_prefix(self): + inFileName = "dummyBigSeqFastaFile.fa" + expChunksFileName = 'exp' + inFileName +'_chunks.fa' + expChunksMapFileName = 'exp' + inFileName +'_chunks.map' + Utils_for_T_FastaUtils._createFastaFile_big_sequence(inFileName) + Utils_for_T_FastaUtils._createFastaFile_of_Chunks(expChunksFileName) + Utils_for_T_FastaUtils._createMapFile_of_Chunks(expChunksMapFileName) + + FastaUtils.dbChunks(inFileName, '60', '10', '11', 'outFile_chunks', True, 0) + + obsChunksFileName = "outFile_chunks.fa" + obsChunksMapFileName = "outFile_chunks.map" + + self.assertTrue(FileUtils.are2FilesIdentical(expChunksFileName, obsChunksFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expChunksMapFileName, obsChunksMapFileName)) + + os.remove(inFileName) + os.remove(expChunksFileName) + os.remove(expChunksMapFileName) + os.remove(obsChunksFileName) + os.remove(obsChunksMapFileName) + + + def test_dbCumLength_with_empty_file(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(inFileName) + + expCumulLength = 0 + + inFileHandler = open(inFileName, "r") + obsCumulLength = FastaUtils.dbCumLength(inFileHandler) + inFileHandler.close() + os.remove(inFileName) + + self.assertEquals(expCumulLength, obsCumulLength) + + def test_dbCumLength_four_sequences(self): + inFileName = 
"dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences(inFileName) + + expCumulLength = 1168 + + inFileHandler = open(inFileName, "r") + obsCumulLength = FastaUtils.dbCumLength(inFileHandler) + inFileHandler.close() + os.remove(inFileName) + + self.assertEquals(expCumulLength, obsCumulLength) + + + def test_dbLengths( self ): + inFileName = "dummyFastaFile.fa" + inF = open( inFileName, "w" ) + inF.write(">seq1\nATGACGT\n") + inF.write(">seq2\nATGGCGAGACGT\n") + inF.close() + lExp = [ 7, 12 ] + lObs = FastaUtils.dbLengths( inFileName ) + self.assertEquals( lExp, lObs ) + os.remove( inFileName ) + + + def test_dbHeaders_with_empty_file(self): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file( inFile ) + lExp = [] + lObs = FastaUtils.dbHeaders( inFile ) + self.assertEquals( lExp, lObs ) + os.remove( inFile ) + + + def test_dbHeaders_with_one_sequence_without_header(self): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_sequence_without_header( inFile ) + lExp = [] + lObs = FastaUtils.dbHeaders( inFile ) + self.assertEquals( lExp, lObs ) + os.remove( inFile ) + + + def test_dbHeaders_four_sequences(self): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences( inFile ) + lExp = [ "seq 1", "seq 2", "seq 3", "seq 4" ] + lObs = FastaUtils.dbHeaders( inFile ) + self.assertEquals( lExp, lObs ) + os.remove( inFile ) + + + def test_dbSplit_no_in_file( self ): + inFileName = "dummyFastaFile.fa" + isSysExitRaised = False + try: + FastaUtils.dbSplit( inFileName, 1, False ) + except SystemExit: + isSysExitRaised = True + self.assertTrue( isSysExitRaised ) + + + def test_dbSplit_emptyFile( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file( inFile ) + FastaUtils.dbSplit( inFile, 10, False, 1 ) + self.assertTrue( not os.path.exists( "batch_1.fa" ) ) + os.remove( inFile ) + + + def 
test_dbSplit_oneSequence_tenSequencesPerBatch( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_one_sequence( inFile ) + + expBatchFile = "dummyExpBatch_1.fa" + Utils_for_T_FastaUtils._createFastaFile_one_sequence( expBatchFile ) + + FastaUtils.dbSplit( inFile, 10, False ) + + obsBatchFile = "batch_1.fa" + + self.assertTrue( FileUtils.are2FilesIdentical( expBatchFile, obsBatchFile ) ) + + for f in [ inFile, expBatchFile, obsBatchFile ]: + os.remove( f ) + + + def test_dbSplit_fourSequences_threeSequencesPerBatch( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences( inFile ) + + expBatch1File = "dummyExpBatch_1.fa" + expBatch2File = "dummyExpBatch_2.fa" + Utils_for_T_FastaUtils._createBatch1_three_sequences( expBatch1File ) + Utils_for_T_FastaUtils._createBatch2_one_sequence( expBatch2File ) + + FastaUtils.dbSplit( inFile, 3, False ) + + obsBatch1File = "batch_1.fa" + obsBatch2File = "batch_2.fa" + + self.assertTrue( FileUtils.are2FilesIdentical( expBatch1File, obsBatch1File ) ) + self.assertTrue( FileUtils.are2FilesIdentical( expBatch2File, obsBatch2File ) ) + + for f in [ inFile, expBatch1File, expBatch2File, obsBatch1File, obsBatch2File ]: + os.remove( f ) + + + def test_dbSplit_fourSequences_twoSequencesPerBatch_inBatchDirectory( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences( inFile ) + + expBatch1File = "dummyExp_batch_1.fa" + expBatch2File = "dummyExp_batch_2.fa" + Utils_for_T_FastaUtils._createBatch1_two_sequences( expBatch1File ) + Utils_for_T_FastaUtils._createBatch2_two_sequences( expBatch2File ) + + FastaUtils.dbSplit( inFile, 2, True, 1 ) + + obsBatch1File = "batches/batch_1.fa" + obsBatch2File = "batches/batch_2.fa" + + self.assertTrue( FileUtils.are2FilesIdentical( expBatch1File, obsBatch1File ) ) + self.assertTrue( FileUtils.are2FilesIdentical( expBatch2File, obsBatch2File ) ) + + for f in [ inFile, expBatch1File, 
expBatch2File, obsBatch1File, obsBatch2File ]: + os.remove( f ) + + + def test_dbSplit_tenSequences_oneSequencePerBatch_inBatchDirectory( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_ten_sequences( inFile ) + + FastaUtils.dbSplit( inFile, 1, True ) + + nb = 1 + for s in [ '01', '02', '03', '04', '05', '06', '07', '08', '09', '10' ]: + expBatchFile = "exp_batch_%s.fa" % ( s ) + Utils_for_T_FastaUtils._createBatch_one_small_sequence( expBatchFile, "seq " + str(nb) ) + nb += 1 + obsBatchFile = "batches/batch_%s.fa" % ( s ) + self.assertTrue( FileUtils.are2FilesIdentical( expBatchFile, obsBatchFile ) ) + os.remove( expBatchFile ) + os.remove( obsBatchFile ) + + os.remove( inFile ) + os.rmdir( "batches" ) + + + def test_dbSplit_twoSequences_oneSequencePerBatch_useSeqHeader( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils.createFastaFile_twoSequences( inFile ) + + lExpFileNames = [ "seq_1.fa", "seq_2.fa" ] + lExpFiles = [ "dummyExp_seq_1.fa", "dummyExp_seq_2.fa" ] + Utils_for_T_FastaUtils.createFastaFile_seq_1( lExpFiles[0] ) + Utils_for_T_FastaUtils.createFastaFile_seq_2( lExpFiles[1] ) + + FastaUtils.dbSplit( inFile, 1, False, True ) + + lObsFiles = glob.glob( "seq*.fa" ) + lObsFiles.sort() + for i in range( 0, len(lExpFileNames) ): + self.assertEqual( lExpFileNames[i], lObsFiles[i] ) + self.assertTrue( FileUtils.are2FilesIdentical( lExpFiles[i], lObsFiles[i] ) ) + + for f in [ inFile ] + lExpFiles + lObsFiles: + os.remove( f ) + + + def test_dbSplit_twoSequences_otherPrefix( self ): + inFile = "dummyFastaFile.fa" + Utils_for_T_FastaUtils.createFastaFile_twoSequences( inFile ) + + lExpFileNames = [ "query_1.fa", "query_2.fa" ] + lExpFiles = [ "dummyExp_seq_1.fa", "dummyExp_seq_2.fa" ] + Utils_for_T_FastaUtils.createFastaFile_seq_1( lExpFiles[0] ) + Utils_for_T_FastaUtils.createFastaFile_seq_2( lExpFiles[1] ) + + FastaUtils.dbSplit( inFile, 1, False, False, "query" ) + + lObsFiles = glob.glob( "query_*.fa" ) + 
lObsFiles.sort() + for i in range( 0, len(lExpFileNames) ): + self.assertEqual( lExpFileNames[i], lObsFiles[i] ) + self.assertTrue( FileUtils.are2FilesIdentical( lExpFiles[i], lObsFiles[i] ) ) + + for f in [ inFile ] + lExpFiles + lObsFiles: + os.remove( f ) + + + def test_splitFastaFileInBatches(self): + inFileName = "dummyFastaFile.fa" + with open(inFileName, "w") as f: + f.write(">seq1\n") + f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n") + f.write(">seq2\n") + f.write("ATCGCTAGCTAGCTCG\n") + f.write(">seq3\n") + f.write("GTTTGGATCGCT\n") + f.write(">seq6\n") + f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n") + f.write(">seq5\n") + f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n") + expBatch1 = "expBatch_1.fa" + with open(expBatch1, "w") as f: + f.write(">seq6\n") + f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCCTCT\n") + f.write("GTTTGGATCGCTCTCTGCTCGGAAATCC\n") + expBatch2 = "expBatch_2.fa" + with open(expBatch2, "w") as f: + f.write(">seq1\n") + f.write("ATCGCTAGCTAGCTCGATCTAGTCAGTCTGTTTGGATCGCTCTCTGCTCGGAAATCC\n") + expBatch3 = "expBatch_3.fa" + with open(expBatch3, "w") as f: + f.write(">seq5\n") + f.write("TTGGATCGCTCTCTGCTCGGAAATCCCGTC\n") + f.write(">seq2\n") + f.write("ATCGCTAGCTAGCTCG\n") + f.write(">seq3\n") + f.write("GTTTGGATCGCT\n") + + FastaUtils.splitFastaFileInBatches(inFileName, 60) + + obsBatch1 = "batches/batch_1.fa" + obsBatch2 = "batches/batch_2.fa" + obsBatch3 = "batches/batch_3.fa" + + self.assertTrue(FileUtils.are2FilesIdentical(expBatch1, obsBatch1)) + self.assertTrue(FileUtils.are2FilesIdentical(expBatch2, obsBatch2)) + self.assertTrue(FileUtils.are2FilesIdentical(expBatch3, obsBatch3)) + + os.remove(inFileName) + os.remove(expBatch1) + os.remove(expBatch2) + os.remove(expBatch3) + shutil.rmtree("batches") + + + def test_splitFastaFileInBatches_one_seq(self): + inFileName = "dummyFastaFile.fa" + with open(inFileName, "w") as f: + f.write(">seq2\n") 
+ f.write("ATCGCTAGCTAGCTCG\n") + expBatch1 = "expBatch_1.fa" + with open(expBatch1, "w") as f: + f.write(">seq2\n") + f.write("ATCGCTAGCTAGCTCG\n") + + FastaUtils.splitFastaFileInBatches(inFileName, 60) + + obsBatch1 = "batches/batch_1.fa" + + self.assertTrue(FileUtils.are2FilesIdentical(expBatch1, obsBatch1)) + + os.remove(inFileName) + os.remove(expBatch1) + shutil.rmtree("batches") + + + def test_splitSeqPerCluster_no_in_file(self): + inFileName = "dummyFastaFile.fa" + isSysExitRaised = False + try: + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, False, "seqCluster") + except SystemExit: + isSysExitRaised = True + self.assertTrue(isSysExitRaised) + + + def test_splitSeqPerCluster_in_file_empty(self): + inFileName = "dummyFastaFile.fa" + with open(inFileName, 'w'): + pass + + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, False, "seqCluster") + + self.assertEquals(glob.glob("seqCluster*.fa"), []) + + os.remove(inFileName) + + + def test_splitSeqPerCluster_four_sequences_without_dir(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName) + + expFirstClusterFileName = "exp_seqCluster1.fa" + Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(expFirstClusterFileName) + expSecondClusterFileName = "exp_seqCluster2.fa" + Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(expSecondClusterFileName) + expThirdClusterFileName = "exp_seqCluster3.574.fa" + Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(expThirdClusterFileName) + + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, False, "seqCluster") + obsFirstClusterFileName = "seqCluster1.fa" + obsSecondClusterFileName = "seqCluster2.fa" + obsThirdClusterFileName = "seqCluster3.574.fa" + + os.remove(inFileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName)) + 
self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName)) + + os.remove(expFirstClusterFileName) + os.remove(expSecondClusterFileName) + os.remove(expThirdClusterFileName) + os.remove(obsFirstClusterFileName) + os.remove(obsSecondClusterFileName) + os.remove(obsThirdClusterFileName) + + + def test_splitSeqPerCluster_four_sequences_without_dir_no_split(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(inFileName) + + expClusterFileName = "exp_seqCluster.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_in_same_cluster(expClusterFileName) + + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, False, "seqCluster") + obsClusterFileName = "seqCluster1.fa" + + os.remove(inFileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName)) + + os.remove(expClusterFileName) + os.remove(obsClusterFileName) + + + def test_splitSeqPerCluster_four_sequences_without_dir_shuffle(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header_shuffle(inFileName) + + expFirstClusterFileName = "exp_seqCluster1.fa" + Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(expFirstClusterFileName) + expSecondClusterFileName = "exp_seqCluster2.fa" + Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(expSecondClusterFileName) + expThirdClusterFileName = "exp_seqCluster3.574.fa" + Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(expThirdClusterFileName) + + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, False, "seqCluster") + obsFirstClusterFileName = "seqCluster1.fa" + obsSecondClusterFileName = "seqCluster2.fa" + obsThirdClusterFileName = "seqCluster3.574.fa" + + 
os.remove(inFileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName)) + + os.remove(expFirstClusterFileName) + os.remove(expSecondClusterFileName) + os.remove(expThirdClusterFileName) + os.remove(obsFirstClusterFileName) + os.remove(obsSecondClusterFileName) + os.remove(obsThirdClusterFileName) + + + def test_splitSeqPerCluster_four_sequences_simplify_header(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName) + + expFirstClusterFileName = "exp_seqCluster1.fa" + Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result_with_simplify_header(expFirstClusterFileName) + expSecondClusterFileName = "exp_seqCluster2.fa" + Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result_with_simplify_header(expSecondClusterFileName) + expThirdClusterFileName = "exp_seqCluster3.574.fa" + Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result_with_simplify_header(expThirdClusterFileName) + + FastaUtils.splitSeqPerCluster( inFileName, "Piler", True, False, "seqCluster") + obsFirstClusterFileName = "seqCluster1.fa" + obsSecondClusterFileName = "seqCluster2.fa" + obsThirdClusterFileName = "seqCluster3.574.fa" + + os.remove(inFileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expFirstClusterFileName, obsFirstClusterFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expSecondClusterFileName, obsSecondClusterFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(expThirdClusterFileName, obsThirdClusterFileName)) + + os.remove(expFirstClusterFileName) + os.remove(expSecondClusterFileName) + os.remove(expThirdClusterFileName) + os.remove(obsFirstClusterFileName) + os.remove(obsSecondClusterFileName) + 
os.remove(obsThirdClusterFileName) + + + def test_splitSeqPerCluster_four_sequences_with_dir(self): + inFileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_of_four_sequences_with_specific_header(inFileName) + FastaUtils.splitSeqPerCluster( inFileName, "Piler", False, True, "seqCluster") + os.remove(inFileName) + + for i in ['1', '2', '3.574']: + expClusterFileName = "exp_cluster" + i + ".fa" + if i == '1': + Utils_for_T_FastaUtils._createFastaFile_of_first_cluster_result(expClusterFileName) + if i == '2': + Utils_for_T_FastaUtils._createFastaFile_of_second_cluster_result(expClusterFileName) + if i == '3.574': + Utils_for_T_FastaUtils._createFastaFile_of_third_cluster_result(expClusterFileName) + + obsClusterFileName= inFileName + "_cluster_" + i + "/seqCluster" + i + ".fa" + self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName)) + os.remove(expClusterFileName) + os.remove(obsClusterFileName) + os.rmdir( inFileName + "_cluster_" + i ) + + + def test_dbLengthFilter_with_one_sequence(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_one_sequence(fileName) + + expFileNameInf = "exp_dummyFastaFile.fa.Inf12" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expFileNameInf) + expFileNameSup = "exp_dummyFastaFile.fa.Sup12" + Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameSup) + + FastaUtils.dbLengthFilter(12, fileName, verbose=0) + + obsFileNameInf = "dummyFastaFile.fa.Inf12" + obsFileNameSup = "dummyFastaFile.fa.Sup12" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf)) + self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup)) + + os.remove(fileName) + os.remove(expFileNameInf) + os.remove(expFileNameSup) + os.remove(obsFileNameInf) + os.remove(obsFileNameSup) + + def test_dbLengthFilter_with_four_sequence(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName) + + 
expFileNameInf = "exp_dummyFastaFile.fa.Inf130" + Utils_for_T_FastaUtils._createFastaFile_one_sequence(expFileNameInf) + expFileNameSup = "exp_dummyFastaFile.fa.Sup130" + Utils_for_T_FastaUtils._createResult_of_dbLengthFilter_sup(expFileNameSup) + + FastaUtils.dbLengthFilter(130, fileName, verbose=0) + + obsFileNameInf = "dummyFastaFile.fa.Inf130" + obsFileNameSup = "dummyFastaFile.fa.Sup130" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileNameInf, obsFileNameInf)) + self.assertTrue(FileUtils.are2FilesIdentical(expFileNameSup, obsFileNameSup)) + + os.remove(fileName) + os.remove(expFileNameInf) + os.remove(expFileNameSup) + os.remove(obsFileNameInf) + os.remove(obsFileNameSup) + + def test_dbLongestSequences_with_empty_file(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName) + + expResult = 0 + + obsResult = FastaUtils.dbLongestSequences( 1, fileName ) + + self.assertEquals(expResult, obsResult) + + os.remove(fileName) + + def test_dbLongestSequences_with_one_longest_sequence(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_four_sequences(fileName) + + expFileName = "exp_dummyFastaFile.fa.best1" + f = open(expFileName, 'w') + f.write(">seq 3\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCG\n") + f.close() + + FastaUtils.dbLongestSequences( 1, fileName, outFileName="", verbose=0, minThresh=0 ) + + obsFileName = "dummyFastaFile.fa.best1" + + 
self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbLongestSequences_with_two_longest_sequence(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + expFileName = "exp_dummyFastaFile.fa.best1" + f = open(expFileName, 'w') + f.write(">seq 2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCG\n") + f.write(">seq 4\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCG\n") + f.close() + + FastaUtils.dbLongestSequences( 2, fileName, outFileName="", verbose=0, minThresh=0 ) + obsFileName = "dummyFastaFile.fa.best2" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractSeqHeaders(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write("seq 1\n") + f.write("seq 2\n") + f.write("seq 4\n") + f.close() + + FastaUtils.dbExtractSeqHeaders(fileName) + obsFileName = "dummyFastaFile.fa.headers" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + 
os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractSeqHeaders_with_empty_file(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(fileName) + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write("") + f.close() + + FastaUtils.dbExtractSeqHeaders(fileName) + obsFileName = "dummyFastaFile.fa.headers" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractSeqHeaders_without_header(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_sequence_without_header(fileName) + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write("") + f.close() + + FastaUtils.dbExtractSeqHeaders(fileName) + obsFileName = "dummyFastaFile.fa.headers" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractByPattern_without_pattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + + obsResult = FastaUtils.dbExtractByPattern( "", fileName) + + expResult = None + + self.assertEquals(expResult, obsResult) + + os.remove(fileName) + + def test_dbExtractByPattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + expFileName = "exp_dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(expFileName) + + FastaUtils.dbExtractByPattern( 'seq', fileName) + + obsFileName = "dummyFastaFile.fa.extracted" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractByPattern_with_2_as_pattern(self): + fileName = "dummyFastaFile.fa" + 
Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write(">seq 2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCG\n") + f.close() + + FastaUtils.dbExtractByPattern( ' 2', fileName) + + obsFileName = "dummyFastaFile.fa.extracted" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractByPattern_with_sandie_as_pattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + expFileName = "exp_dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_for_empty_file(expFileName) + + FastaUtils.dbExtractByPattern( 'sandie', fileName) + + obsFileName = "dummyFastaFile.fa.extracted" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbExtractByFilePattern_empty_pattern_filename(self): + patternFileName = "" + isSysExitRaised = False + try: + FastaUtils.dbExtractByFilePattern(patternFileName , None, "") + except SystemExit: + isSysExitRaised = True + self.assertTrue(isSysExitRaised) + + def test_dbExtractByFilePattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils. 
_createFastaFile_ten_sequences(fileName) + patternFileName = "dummyPatternFile.txt" + Utils_for_T_FastaUtils._createPatternFile(patternFileName) + + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write(">seq 1\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 3\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 8\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 10\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.close() + + obsFileName = "dummyFastaFile.fa.extracted" + + FastaUtils.dbExtractByFilePattern( patternFileName, fileName, "") + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(patternFileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbCleanByPattern_without_pattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils._createFastaFile_three_sequences(fileName) + + obsResult = FastaUtils.dbCleanByPattern( "", fileName) + + expResult = None + + self.assertEquals(expResult, obsResult) + + os.remove(fileName) + + def test_dbCleanByPattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils. 
_createFastaFile_ten_sequences(fileName) + + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write(">seq 1\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 3\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 4\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 5\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 6\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 7\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 8\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 9\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 10\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.close() + + obsFileName = "dummyFastaFile.fa.cleaned" + FastaUtils.dbCleanByPattern( '2', fileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbCleanByPattern_with_expectedFile_empty(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils. 
_createFastaFile_ten_sequences(fileName) + + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write("") + f.close() + + obsFileName = "dummyFastaFile.fa.cleaned" + FastaUtils.dbCleanByPattern( 'seq', fileName) + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbCleanByFilePattern_empty_pattern_filename(self): + patternFileName = "" + isSysExitRaised = False + try: + FastaUtils.dbCleanByFilePattern(patternFileName , None, "") + except SystemExit: + isSysExitRaised = True + self.assertTrue(isSysExitRaised) + + def test_dbCleanByFilePattern(self): + fileName = "dummyFastaFile.fa" + Utils_for_T_FastaUtils. _createFastaFile_ten_sequences(fileName) + patternFileName = "dummyPatternFile.txt" + Utils_for_T_FastaUtils._createPatternFile(patternFileName) + + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write(">seq 2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 4\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 5\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 6\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 7\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq 9\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.close() + + obsFileName = "dummyFastaFile.fa.cleaned" + + FastaUtils.dbCleanByFilePattern( patternFileName, fileName, "") + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(patternFileName) + os.remove(expFileName) + os.remove(obsFileName) + + def test_dbORF_without_ORF(self): + fileName = "dummy.fa" + with open(fileName, "w") as f: + f.write(">dummy\n") + 
f.write("GGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTTGGGTT\n") + + expFileName = "exp.ORF.map" + with open(expFileName, "w") as f: + f.write("") + obsFileName = "%s.ORF.map" % fileName + + FastaUtils.dbORF(fileName, 0, 0) + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(obsFileName) + os.remove(expFileName) + + def test_dbORF_with_one_ORF(self): + fileName = "dummyFastaFile.fa" + f = open(fileName, 'w') + f.write(">seq1\n") + f.write("GAAAATATGGGGTAGATAAGGGATCTGGGTTAATTTTTT\n") + f.close() + + expFileName = "exp_dummyORFFile.ORF.map" + f = open(expFileName, 'w') + f.write("ORF|1|17\tseq1\t16\t33\n") + f.close() + + FastaUtils.dbORF(fileName, 0, 0) + obsFileName = fileName + ".ORF.map" + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(fileName) + os.remove(obsFileName) + os.remove(expFileName) + + def test_dbORF_with_real_ORF(self): + fileName = "dummy.fa" + with open(fileName, "w") as f: + f.write(">DmelChr4_Blaster_Recon_13_Map_4\n") + f.write("AAGTTGGACATTGAGGGCTTTCTTCGCCGTGTTTCGTTCTTTTCGACAAACAGCAGTGCT\n") + f.write("TTGCGGATCATTTTGTTTGAACAACCGACAATGCGACCAATTTCAGCGTAGGTTTTACCT\n") + f.write("TCAGAGATCACGTTTTTAATCAAATTTCTTTTTTCGACGGTACAATGCTTTCCGCGACCC\n") + f.write("ATGACTAGAGAATTTTTGGTCTTCGTTTGGAAAAAATTCAATTAAAACCTTTAATACAAC\n") + f.write("TCCTTTTTTCAAAATTTTTCGAAAAAAACCCAAAGCAATCACTCCTATTAATTTTATTCA\n") + f.write("GCAAATACGTGTTCAGTGCTATTTTTGTTACCGCCTCATTTCGCGCACTTTTGCAGCAAG\n") + f.write("TGCCCAAAAACAAAAAGAACCGTTACATTGAGAGACTAAAAATTTCTTGCTCAGAGAGCC\n") + f.write("AACATATGGTACTTATTATTCATGCAATCTGACTTAAAAAAATATAAACATTTAATAATT\n") + f.write("TTTTTTAGGAAATCAACTTTCCACCTGCAGTAGTGCTATTATTTTAACCGCAGCTGTATA\n") + f.write(">DmelChr4_Blaster_Piler_3.5_Map_7\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + 
f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTT\n") + f.write("AGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGGTTAGGGTTAGGGTTAGGGTTAGGGT\n") + f.write("TAGGGCTAGGGTTAGGGGTTAGGGTTAGGGTTAGGCTTAGGGTTAGGGTTAGGGTTAGGG\n") + f.write("TTAGGGTTAGGGTTAGGGTTAGGAGTTAGGGTGTAGGGTTAGGGTTAGGGTTAGGGTTAG\n") + f.write("GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n") + f.write("GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGCTAGGGTTAGGGTTAG\n") + f.write("GGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG\n") + f.write(">DmelChr4_Blaster_Grouper_10_Map_13\n") + f.write("GCAAAGACACTAGAATAACAAGATGCGTAACGGCCATACATTGGTTTGGCACTATGCAGC\n") + f.write("CACTTTTTTGGTGACGGCCAAAATTACTCTCTTTCCGCTCACTCCCGCTGAGAGCGTAAG\n") + f.write("AAATCTAAAAATATAATTTGCTTGCTTGTGTGAGTAAAAACAAGAGACGAGAACGCGTAT\n") + f.write("AAGTGTGCGTGTTGTGCTAGAAGACGATTTTCGGGACCGAAATCAATTCTGATCGAAGAA\n") + f.write("ACGAATTTACATGGTACATATTAGGGTAGTTTTTGCCAATTTCCTAGCAATATGATAAAA\n") + f.write("TAAAAAAATTTTTAAAAATTCGCGCCCTGACTATTATAATTTTAAAGCTTTTTAAAATTT\n") + f.write("GTTTGTTAAAATCGCCGCTCGAATTAGCTACCGTTTACACATTTATATTTATGTTTAATT\n") + f.write("CTAATTTGTCTCTCATCTGACAATTTTTTAAGAAAGCGAAATATTTTTTTTTTGAAACAC\n") + f.write("TTTTAATGTTAATGTTACATCATATTAAGTCAAATGATTTAATAAATATACTAAATAATT\n") + f.write("AAATATGATAACTGTTTATTGCAAAAGTAATATCAAAGACACTAGAATTATTCTAGTGTC\n") + f.write("TTTGCTTTGTTCATATCTTGAGGCACGAAGTGCGGACACAAGCACTCAACAATCATTGCC\n") + f.write("TTATTAATTTTTCACACGCCGCAAGATGAATACTCTAATGACAAATATTCTTATATAAAG\n") + f.write("TCATTTTTGAAATTTATTTTTGTGATAATATGTACATAGATTTGGCTATTTCTAATCTAT\n") + f.write("TTTCAAATAATAATAACGTTAAGGCAATGCAAAACAAGAATTTTTTTAGTCGCATGGTGC\n") + f.write("CAATTGATCAAAAATAATATAGATTTAAAGTCTAAGAACTTCTAAGGTGAAGGGCATATT\n") + 
f.write("TTGTCAAATTTACAATGCATGAGCGAGCATACGTGTGCACACATACAGTTGTCTGCTATC\n") + f.write("ACTTTGTGCGTTGAAAA\n") + + expFileName = "exp.ORF.map" + with open(expFileName, "w") as f: + f.write("ORF|3|263\tDmelChr4_Blaster_Recon_13_Map_4\t189\t452\n") + f.write("ORF|2|206\tDmelChr4_Blaster_Recon_13_Map_4\t185\t391\n") + f.write("ORF|-3|164\tDmelChr4_Blaster_Recon_13_Map_4\t382\t218\n") + f.write("ORF|-1|161\tDmelChr4_Blaster_Recon_13_Map_4\t297\t136\n") + f.write("ORF|1|113\tDmelChr4_Blaster_Recon_13_Map_4\t400\t513\n") + f.write("ORF|1|113\tDmelChr4_Blaster_Recon_13_Map_4\t112\t225\n") + f.write("ORF|3|107\tDmelChr4_Blaster_Recon_13_Map_4\t81\t188\n") + f.write("ORF|1|107\tDmelChr4_Blaster_Recon_13_Map_4\t292\t399\n") + f.write("ORF|-1|104\tDmelChr4_Blaster_Recon_13_Map_4\t432\t328\n") + f.write("ORF|-2|104\tDmelChr4_Blaster_Recon_13_Map_4\t515\t411\n") + f.write("ORF|3|116\tDmelChr4_Blaster_Piler_3.5_Map_7\t393\t509\n") + f.write("ORF|-3|116\tDmelChr4_Blaster_Piler_3.5_Map_7\t505\t389\n") + f.write("ORF|-2|86\tDmelChr4_Blaster_Piler_3.5_Map_7\t518\t432\n") + f.write("ORF|1|80\tDmelChr4_Blaster_Piler_3.5_Map_7\t436\t516\n") + f.write("ORF|-3|170\tDmelChr4_Blaster_Grouper_10_Map_13\t222\t52\n") + f.write("ORF|-1|161\tDmelChr4_Blaster_Grouper_10_Map_13\t260\t99\n") + f.write("ORF|3|155\tDmelChr4_Blaster_Grouper_10_Map_13\t702\t857\n") + f.write("ORF|3|152\tDmelChr4_Blaster_Grouper_10_Map_13\t288\t440\n") + f.write("ORF|1|137\tDmelChr4_Blaster_Grouper_10_Map_13\t622\t759\n") + f.write("ORF|2|128\tDmelChr4_Blaster_Grouper_10_Map_13\t539\t667\n") + f.write("ORF|1|125\tDmelChr4_Blaster_Grouper_10_Map_13\t760\t885\n") + f.write("ORF|2|122\tDmelChr4_Blaster_Grouper_10_Map_13\t14\t136\n") + f.write("ORF|-2|113\tDmelChr4_Blaster_Grouper_10_Map_13\t847\t734\n") + f.write("ORF|1|110\tDmelChr4_Blaster_Grouper_10_Map_13\t154\t264\n") + obsFileName = "%s.ORF.map" % fileName + + FastaUtils.dbORF(fileName, 10, 30) + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) 
+ + os.remove(fileName) + os.remove(obsFileName) + os.remove(expFileName) + + def test_sortSequencesByIncreasingLength(self): + fileName = "dummyFastaFile.fa" + f = open(fileName, 'w') + f.write(">seq1_length_60\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq2_length_120\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq3_length_32\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.close() + + expFileName = "exp_dummyFastaFile.fa" + f = open(expFileName, 'w') + f.write(">seq3_length_32\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.write(">seq1_length_60\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq2_length_120\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + + f.close() + + obsFileName = "obs_dummyFastaFile.fa" + + FastaUtils.sortSequencesByIncreasingLength(fileName, obsFileName, 0) + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(expFileName) + os.remove(obsFileName) + + def test_sortSequencesByIncreasingLength_in_file_do_not_exists(self): + fileName = "dummyFile.fa" + isSysExitRaised = False + try: + FastaUtils.sortSequencesByIncreasingLength(fileName, "", 0) + except SystemExit: + isSysExitRaised = True + + self.assertTrue(isSysExitRaised) + + def test_sortSequencesByHeader(self): + fileName = "dummyFastaFile.fa" + f = open(fileName, "w") + f.write(">seq1::test-test\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq3\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.write(">seq2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + 
f.close() + expFileName = "expFastaFile.fa" + f = open(expFileName, "w") + f.write(">seq1::test-test\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq3\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.close() + + obsFileName = "obsFastaFile.fa" + FastaUtils.sortSequencesByHeader(fileName, obsFileName) + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(expFileName) + os.remove(obsFileName) + + def test_sortSequencesByHeader_no_outFileName(self): + fileName = "dummyFastaFile.fa" + f = open(fileName, "w") + f.write(">seq12\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq1\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.write(">seq2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.close() + expFileName = "expFastaFile.fa" + f = open(expFileName, "w") + f.write(">seq1\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATAT\n") + f.write(">seq12\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write(">seq2\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.write("ATATTCGCGCATCGATCGATCGGCGGCTATATGCTAGTCAGCTAGCTAGTGTGAGTAGTA\n") + f.close() + + obsFileName = "dummyFastaFile_sortByHeaders.fa" + FastaUtils.sortSequencesByHeader(fileName) + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + + os.remove(expFileName) + os.remove(obsFileName) + + def test_getLengthPerHeader( self ): + inFile = "dummyFile.fa" + inFileHandler = open( inFile, "w" ) + inFileHandler.write(">seq1\nAGCGATGCGT\n") + inFileHandler.write(">seq2\nAGCGATG\n") + 
inFileHandler.write(">seq3\nAGCGATGGTGCGTGC\n") + inFileHandler.write("AGCGATGGTGCGTGC\n") + inFileHandler.close() + + dExp = { "seq1": 10, "seq2": 7, "seq3": 30 } + + dObs = FastaUtils.getLengthPerHeader( inFile, 0 ) + + self.assertEquals( dExp, dObs ) + + os.remove( inFile ) + + def test_convertFastaHeadersFromChkToChr_grouper(self): + inFile = "dummyFastaFile.fa" + with open(inFile, "w") as f: + f.write(">MbQ1Gr1Cl0 chunk6 {Fragment} 95523..96053\n") + f.write("AGCGTGCA\n") + f.write(">MbQ77Gr8Cl0 chunk7 {Fragment} 123657..122568,121935..121446\n") + f.write("AGCATGC\n") + f.write(">MbS78Gr8Cl0 chunk7 {Fragment} 140078..139519,139470..138985,138651..138183\n") + f.write("CGTGCG\n") + f.write(">MbQ79Gr8Cl0 chunk7 {Fragment} 48021..48587,48669..49153,57346..57834\n") + f.write("AGCGTGC\n") + mapFile = "dummyMapFile.map" + with open(mapFile, "w") as f: + f.write("chunk5\tdmel_chr4\t760001\t960000\n") + f.write("chunk6\tdmel_chr4\t950001\t1150000\n") + f.write("chunk7\tdmel_chr4\t1140001\t1281640\n") + expFile = "expFile.fa" + with open(expFile, "w") as f: + f.write(">MbQ1Gr1Cl0 dmel_chr4 {Fragment} 1045523..1046053\n") + f.write("AGCGTGCA\n") + f.write(">MbQ77Gr8Cl0 dmel_chr4 {Fragment} 1263657..1262568,1261935..1261446\n") + f.write("AGCATGC\n") + f.write(">MbS78Gr8Cl0 dmel_chr4 {Fragment} 1280078..1279519,1279470..1278985,1278651..1278183\n") + f.write("CGTGCG\n") + f.write(">MbQ79Gr8Cl0 dmel_chr4 {Fragment} 1188021..1188587,1188669..1189153,1197346..1197834\n") + f.write("AGCGTGC\n") + obsFile = "obsFile.fa" + + FastaUtils.convertFastaHeadersFromChkToChr(inFile, mapFile, obsFile) + + self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile)) + + for file in [inFile, mapFile, expFile, obsFile]: + os.remove(file) + + def test_convertFastaHeadersFromChkToChr_blastclust(self): + inFile = "dummyFastaFile.fa" + with open(inFile, "w") as f: + f.write(">BlastclustCluster12Mb63_chunk1 (dbseq-nr 0) [1,10]\n") + f.write("AGCGTGCA\n") + 
f.write(">BlastclustCluster12Mb53_chunk2 (dbseq-nr 2) [1,10]\n") + f.write("AGCATGC\n") + f.write(">BlastclustCluster12Mb26_chunk2 (dbseq-nr 2) [12,18]\n") + f.write("CGTGCG\n") + f.write(">BlastclustCluster12Mb35_chunk3 (dbseq-nr 0) [10,1]\n") + f.write("AGCGTGC\n") + mapFile = "dummyMapFile.map" + with open(mapFile, "w") as f: + f.write("chunk1\tchromosome1\t1\t20\n") + f.write("chunk2\tchromosome1\t16\t35\n") + f.write("chunk3\tchromosome2\t1\t20\n") + expFile = "expFile.fa" + with open(expFile, "w") as f: + f.write(">BlastclustCluster12Mb63 chromosome1 (dbseq-nr 0) 1..10\n") + f.write("AGCGTGCA\n") + f.write(">BlastclustCluster12Mb53 chromosome1 (dbseq-nr 2) 16..25\n") + f.write("AGCATGC\n") + f.write(">BlastclustCluster12Mb26 chromosome1 (dbseq-nr 2) 27..33\n") + f.write("CGTGCG\n") + f.write(">BlastclustCluster12Mb35 chromosome2 (dbseq-nr 0) 10..1\n") + f.write("AGCGTGC\n") + obsFile = "obsFile.fa" + + FastaUtils.convertFastaHeadersFromChkToChr(inFile, mapFile, obsFile) + + self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile)) + + for file in [inFile, mapFile, expFile, obsFile]: + os.remove(file) + + def test_convertFastaToLength( self ): + inFile = "dummyFastaFile.fa" + inFileHandler = open(inFile, "w") + inFileHandler.write(">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n") + inFileHandler.write("AGCGTGCA\n") + inFileHandler.write(">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n") + inFileHandler.write("AGCATGCAA\n") + inFileHandler.write(">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n") + inFileHandler.write("CGTGCGAAAA\n") + inFileHandler.write(">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n") + inFileHandler.write("AGCGTG\n") + inFileHandler.close() + + expFile = "expFile.length" + expFileHandler = open(expFile, "w") + expFileHandler.write("ReconCluster12Mb63\t8\n") + expFileHandler.write("ReconCluster12Mb53\t9\n") + expFileHandler.write("ReconCluster12Mb26\t10\n") + expFileHandler.write("ReconCluster12Mb35\t6\n") + expFileHandler.close() + + 
obsFile = "obsFile.length" + + FastaUtils.convertFastaToLength(inFile, obsFile) + + self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile)) + + for f in [inFile, expFile, obsFile]: + os.remove(f) + + def test_convertFastaToSeq( self ): + inFile = "dummyFastaFile.fa" + inFileHandler = open(inFile, "w") + inFileHandler.write(">ReconCluster12Mb63 chunk1 {Fragment} 1..10\n") + inFileHandler.write("AGCGTGCA\n") + inFileHandler.write(">ReconCluster12Mb53 chunk2 {Fragment} 1..10\n") + inFileHandler.write("AGCATGCAA\n") + inFileHandler.write(">ReconCluster12Mb26 chunk2 {Fragment} 12..18\n") + inFileHandler.write("CGTGCGAAAA\n") + inFileHandler.write(">ReconCluster12Mb35 chunk3 {Fragment} 10..1\n") + inFileHandler.write("AGCGTG\n") + inFileHandler.close() + + expFile = "expFile.seq" + expFileHandler = open(expFile, "w") + expFileHandler.write("ReconCluster12Mb63\tAGCGTGCA\tReconCluster12Mb63 chunk1 {Fragment} 1..10\t8\n") + expFileHandler.write("ReconCluster12Mb53\tAGCATGCAA\tReconCluster12Mb53 chunk2 {Fragment} 1..10\t9\n") + expFileHandler.write("ReconCluster12Mb26\tCGTGCGAAAA\tReconCluster12Mb26 chunk2 {Fragment} 12..18\t10\n") + expFileHandler.write("ReconCluster12Mb35\tAGCGTG\tReconCluster12Mb35 chunk3 {Fragment} 10..1\t6\n") + expFileHandler.close() + + obsFile = "obsFile.seq" + + FastaUtils.convertFastaToSeq(inFile, obsFile) + + self.assertTrue(FileUtils.are2FilesIdentical(expFile, obsFile)) + + for f in [inFile, expFile, obsFile]: + os.remove(f) + + def test_spliceFromCoords( self ): + coordFile = "dummyCoordFile" + coordFileHandler = open( coordFile, "w" ) + coordFileHandler.write("TE1\tchr1\t2\t5\n") + coordFileHandler.write("TE2\tchr1\t15\t11\n") + coordFileHandler.write("TE3\tchr2\t1\t3\n") + coordFileHandler.write("TE1\tchr2\t8\t10\n") + coordFileHandler.write("TE4\tchr3\t3\t1\n") + coordFileHandler.write("TE4\tchr3\t6\t4\n") + coordFileHandler.close() + + genomeFile = "dummyGenomeFile" + genomeFileHandler = open( genomeFile, "w" ) + 
genomeFileHandler.write(">chr1\n") + genomeFileHandler.write("AGGGGAAAAACCCCCAAAAA\n") + genomeFileHandler.write(">chr2\n") + genomeFileHandler.write("GGGAAAAGGG\n") + genomeFileHandler.write(">chr3\n") + genomeFileHandler.write("GGGGGGTTTT\n") + genomeFileHandler.close() + + expFile = "dummyExpFile" + expFileHandler = open( expFile, "w" ) + expFileHandler.write(">chr1\n") + expFileHandler.write("AAAAAAAAAAA\n") + expFileHandler.write(">chr2\n") + expFileHandler.write("AAAA\n") + expFileHandler.write(">chr3\n") + expFileHandler.write("TTTT\n") + expFileHandler.close() + + obsFile = "dummyObsFile" + + FastaUtils.spliceFromCoords( genomeFile, + coordFile, + obsFile ) + self.assertTrue( FileUtils.are2FilesIdentical( expFile, obsFile ) ) + for f in [ coordFile, genomeFile, expFile, obsFile ]: + os.remove( f ) + + def test_dbShuffle_inputFile( self ): + inFile = "dummyInFile.fa" + inFileHandler = open( inFile, "w" ) + inFileHandler.write(">seq1\n") + inFileHandler.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n") + inFileHandler.close() + + obsFile = "dummyObsFile.fa" + FastaUtils.dbShuffle( inFile, obsFile, 1 ) + + self.assertTrue( FastaUtils.dbSize( obsFile ) == 1 ) + + for f in [ inFile, obsFile ]: + os.remove( f ) + + def test_dbShuffle_inputDir( self ): + inDir = "dummyInDir" + if os.path.exists( inDir ): + shutil.rmtree( inDir ) + os.mkdir( inDir ) + inFile = "%s/dummyInFile.fa" % inDir + inFileHandler = open( inFile, "w" ) + inFileHandler.write(">seq1\n") + inFileHandler.write("AGCGATCGACAGCGCATCGCGCATCGCATCGCTACGCATAC\n") + inFileHandler.close() + + obsDir = "dummyObsDir" + FastaUtils.dbShuffle( inDir, obsDir, 1 ) + + obsFile = "dummyInFile_shuffle.fa" + self.assertTrue( len( glob.glob("%s/%s" % (obsDir,obsFile)) ) == 1 ) + + for d in [ inDir, obsDir ]: + shutil.rmtree( d ) + + def test_convertClusterFileToFastaFile(self): + inClusterFileName = "in.tab" + with open(inClusterFileName, "w") as f: + 
f.write("DTX-incomp_DmelChr4-B-R10-Map3_reversed\tDTX-incomp_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("PotentialHostGene-chim_DmelChr4-B-R5-Map5\tPotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + f.write("RLX-incomp_DmelChr4-B-G220-Map3\n") + inFastaFileName = "in.fa" + with open(inFastaFileName, "w") as f: + f.write(">DTX-incomp_DmelChr4-B-R10-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">DTX-incomp_DmelChr4-B-R9-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">RLX-incomp_DmelChr4-B-G220-Map3\n") + f.write("ATCGCC\n") + f.write(">PotentialHostGene-chim_DmelChr4-B-R5-Map5\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">DTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("ATCGCATCGATCGATC\n") + expFileName = "exp.fa" + with open(expFileName, "w") as f: + f.write(">BlastclustCluster1Mb1_DTX-incomp_DmelChr4-B-R10-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">BlastclustCluster1Mb2_DTX-incomp_DmelChr4-B-R9-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">BlastclustCluster3Mb1_RLX-incomp_DmelChr4-B-G220-Map3\n") + f.write("ATCGCC\n") + f.write(">BlastclustCluster2Mb1_PotentialHostGene-chim_DmelChr4-B-R5-Map5\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">BlastclustCluster2Mb2_PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">BlastclustCluster1Mb3_DTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("ATCGCATCGATCGATC\n") + obsFileName = "obs.fa" + + FastaUtils.convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, obsFileName, "Blastclust") + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + os.remove(inClusterFileName) + os.remove(inFastaFileName) + os.remove(expFileName) + os.remove(obsFileName) + + + def 
test_convertClusterFileToFastaFile_withoutUnclusterizedSequences(self): + inClusterFileName = "in.tab" + with open(inClusterFileName, "w") as f: + f.write("DTX-incomp_DmelChr4-B-R10-Map3_reversed\tDTX-incomp_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("PotentialHostGene-chim_DmelChr4-B-R5-Map5\tPotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + inFastaFileName = "in.fa" + with open(inFastaFileName, "w") as f: + f.write(">DTX-incomp_DmelChr4-B-R10-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">DTX-incomp_DmelChr4-B-R9-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">RLX-incomp_DmelChr4-B-G220-Map3\n") + f.write("ATCGCC\n") + f.write(">PotentialHostGene-chim_DmelChr4-B-R5-Map5\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">DTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("ATCGCATCGATCGATC\n") + expFileName = "exp.fa" + with open(expFileName, "w") as f: + f.write(">BlastclustCluster1Mb1_DTX-incomp_DmelChr4-B-R10-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">BlastclustCluster1Mb2_DTX-incomp_DmelChr4-B-R9-Map3_reversed\n") + f.write("ATCGCATCGATCGATC\n") + f.write(">BlastclustCluster3Mb1_RLX-incomp_DmelChr4-B-G220-Map3\n") + f.write("ATCGCC\n") + f.write(">BlastclustCluster2Mb1_PotentialHostGene-chim_DmelChr4-B-R5-Map5\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">BlastclustCluster2Mb2_PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n") + f.write("ATCGCATCGATCGATCATCGCATCGATCGATC\n") + f.write(">BlastclustCluster1Mb3_DTX-incomp_DmelChr4-B-G9-Map3\n") + f.write("ATCGCATCGATCGATC\n") + obsFileName = "obs.fa" + + FastaUtils.convertClusterFileToFastaFile(inClusterFileName, inFastaFileName, obsFileName, "Blastclust") + + self.assertTrue(FileUtils.are2FilesIdentical(expFileName, obsFileName)) + os.remove(inClusterFileName) + os.remove(inFastaFileName) + 
os.remove(expFileName) + os.remove(obsFileName) + + def test_convertClusterFileToMapFile(self): + for clustAlgo in ["Blastclust", "MCL"]: + inFileName = "dummy%sOut.fa" % clustAlgo + inF = open(inFileName, "w") + inF.write(">%sCluster1Mb1_chunk1 (dbseq-nr 1) [1,14]\n" % clustAlgo) + inF.write("gaattgtttactta\n") + inF.write(">%sCluster3Mb1_chunk5 (dbseq-nr 8) [1000,1014]\n" % clustAlgo) + inF.write("gaattgtttactta\n") + inF.write(">%sCluster1Mb2_chunk1 (dbseq-nr 1) [30,44]\n" % clustAlgo) + inF.write("gaattgtttactta\n") + inF.write(">%sCluster2Mb1_chunk2 (dbseq-nr 1) [100,114]\n" % clustAlgo) + inF.write("gaattgtttactta") + inF.close() + + fileExp = "%sToMapExpected.map" % clustAlgo + outF = open(fileExp, "w") + outF.write("%sCluster1Mb1\tchunk1\t1\t14\n" % clustAlgo) + outF.write("%sCluster3Mb1\tchunk5\t1000\t1014\n" % clustAlgo) + outF.write("%sCluster1Mb2\tchunk1\t30\t44\n" % clustAlgo) + outF.write("%sCluster2Mb1\tchunk2\t100\t114\n" % clustAlgo) + outF.close() + + fileObs = "%s.map" % os.path.splitext(inFileName)[0] + FastaUtils.convertClusteredFastaFileToMapFile(inFileName, fileObs) + + self.assertTrue(FileUtils.are2FilesIdentical(fileObs, fileExp)) + + os.remove(inFileName) + os.remove(fileObs) + os.remove(fileExp) + +if __name__ == "__main__": + unittest.main() \ No newline at end of file