view commons/core/seq/test/Test_BioseqUtils.py @ 6:769e306b7933

Change the repository level.
author yufei-luo
date Fri, 18 Jan 2013 04:54:14 -0500
parents
children
line wrap: on
line source

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import unittest
import os
from commons.core.seq.Bioseq import Bioseq
from commons.core.seq.BioseqUtils import BioseqUtils
from commons.core.utils.FileUtils import FileUtils


class Test_BioseqUtils( unittest.TestCase ):
    """Unit tests for the static helpers of BioseqUtils.

    Covers sequence translation, frame tagging of headers, stop-codon
    replacement, FASTA reading/writing, sequence-length lookups and sorting.

    Every file a test creates is registered via _registerTmpFile() so that
    tearDown() deletes it even when the test fails before its last line.
    """

    # Nucleotide fixtures. DNA_SEQ_N is DNA_SEQ_1 with an 'N' as first base;
    # DNA_SEQ_2 differs from DNA_SEQ_1 by a single base.
    DNA_SEQ_N = "NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
    DNA_SEQ_1 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
    DNA_SEQ_2 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTACGACTAATCAACAATATAATGCGAGTAGAGCTTGA"

    # Expected translations of DNA_SEQ_1, listed for frames 1 to 6 in order.
    FRAMES_SEQ_1 = ( "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
                     "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL",
                     "WLLVDQFMITMISRRCLVAPTNQQYNASRA*",
                     "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
                     "QALLALYC*LVGATRHLREIIVIIN*STRSH",
                     "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT" )

    # Expected translations of DNA_SEQ_2, listed for frames 1 to 6 in order.
    FRAMES_SEQ_2 = ( "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
                     "VASS*SVYDHNDFT*VSRGYD*STI*CE*SL",
                     "WLLVDQFMITMISRRCLVATTNQQYNASRA*",
                     "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
                     "QALLALYC*LVVATRHLREIIVIIN*STRSH",
                     "KLYSHYIVD*S*PRDTYVKSL*S*TDQLEAT" )

    # Nucleotide fixture of length 44 used by the length-lookup tests.
    DNA_SEQ_44 = "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT"


    def setUp( self ):
        """Start each test with an empty registry of temporary files."""
        self._lTmpFiles = []


    def tearDown( self ):
        """Delete every registered temporary file that still exists."""
        for fileName in self._lTmpFiles:
            if os.path.exists( fileName ):
                os.remove( fileName )


    def _registerTmpFile( self, fileName ):
        """Schedule fileName for deletion in tearDown() and return it."""
        self._lTmpFiles.append( fileName )
        return fileName


    def _writeTmpFile( self, fileName, content ):
        """Create fileName with the given text content and return its name.

        The file is registered before being written so that it is cleaned up
        even if the write itself fails.
        """
        self._registerTmpFile( fileName )
        with open( fileName, "w" ) as handle:
            handle.write( content )
        return fileName


    def _makeBioseq( self, header=None, sequence=None ):
        """Return a new Bioseq, assigning only the attributes that are given."""
        bioseq = Bioseq()
        if header is not None:
            bioseq.header = header
        if sequence is not None:
            bioseq.sequence = sequence
        return bioseq


    def _makeTranslatedBioseqs( self, name, description, lSequences ):
        """Return one Bioseq per sequence, with frames numbered from 1.

        Each header is "<name>_<frame>", followed by a space and the
        description when the description is not empty.
        """
        lBioseq = []
        for frame, sequence in enumerate( lSequences, 1 ):
            header = "%s_%d" % ( name, frame )
            if description:
                header += " " + description
            lBioseq.append( self._makeBioseq( header, sequence ) )
        return lBioseq


    def _assertTranslation( self, sequence, frame, expSequence ):
        """Translate sequence in place in the given frame and check the result."""
        bioseq = self._makeBioseq( sequence=sequence )
        BioseqUtils.translateSequence( bioseq, frame )
        self.assertEqual( expSequence, bioseq.sequence )


    def test_translateSequence_one_nt( self ):
        self._assertTranslation( "G", 1, "" )


    def test_translateSequence_frame1( self ):
        self._assertTranslation( self.DNA_SEQ_N, 1, "XGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" )


    def test_translateSequence_frame2( self ):
        self._assertTranslation( self.DNA_SEQ_N, 2, "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" )


    def test_translateSequence_frame3( self ):
        self._assertTranslation( self.DNA_SEQ_N, 3, "WLLVDQFMITMISRRCLVAPTNQQYNASRA*" )


    def test_setFrameInfoOnHeader( self ):
        bioseq = self._makeBioseq( header="header1 description1 description2" )
        BioseqUtils.setFrameInfoOnHeader( bioseq, 1 )
        self.assertEqual( "header1_1 description1 description2", bioseq.header )


    def test_setFrameInfoOnHeader_header_without_space( self ):
        bioseq = self._makeBioseq( header="header" )
        BioseqUtils.setFrameInfoOnHeader( bioseq, 1 )
        self.assertEqual( "header_1", bioseq.header )


    def test_TranslateInAllFrame( self ):
        bioseq = self._makeBioseq( "header1", self.DNA_SEQ_1 )
        expLBioseq = self._makeTranslatedBioseqs( "header1", "", self.FRAMES_SEQ_1 )

        obsLBioseq = BioseqUtils.translateInAllFrame( bioseq )

        self.assertEqual( expLBioseq, obsLBioseq )


    def test_replaceStopCodonsByX( self ):
        bioseq = self._makeBioseq( sequence="VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" )
        BioseqUtils.replaceStopCodonsByX( bioseq )
        self.assertEqual( "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL", bioseq.sequence )


    def test_translateBioseqListInAllFrames_with_empty_list( self ):
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( [] )
        self.assertEqual( [], obsLBioseq )


    def test_translateBioseqListInAllFrames_with_one_item( self ):
        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_SEQ_1 ) ]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )

        expLBioseq = self._makeTranslatedBioseqs( "header1", "description", self.FRAMES_SEQ_1 )

        self.assertEqual( expLBioseq, obsLBioseq )


    def test_translateBioseqListInAllFrames( self ):
        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_SEQ_1 ),
                    self._makeBioseq( "header2", self.DNA_SEQ_2 ) ]
        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )

        # The six frames of the first input come first, then those of the second.
        expLBioseq = self._makeTranslatedBioseqs( "header1", "description", self.FRAMES_SEQ_1 )
        expLBioseq += self._makeTranslatedBioseqs( "header2", "", self.FRAMES_SEQ_2 )

        self.assertEqual( expLBioseq, obsLBioseq )


    def test_replaceStopCodonsByXInBioseqList_empty_list( self ):
        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( [] )
        self.assertEqual( obsLBioseq, [] )


    def test_replaceStopCodonsByXInBioseqList_without_stop_codon( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]

        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )

        expLBioseq = [ self._makeBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]
        self.assertEqual( obsLBioseq, expLBioseq )


    def test_replaceStopCodonsByXInBioseqList( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                    self._makeBioseq( "header2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ) ]

        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )

        expLBioseq = [ self._makeBioseq( "header1 description", "CGFXLISLXSQXFHVGVSWLRLINNIMRVEL" ),
                       self._makeBioseq( "header2", "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL" ) ]
        self.assertEqual( obsLBioseq, expLBioseq )


    def test_writeBioseqListIntoFastaFile( self ):
        obsFileName = self._registerTmpFile( "dummyWrittenFastaFile.fa" )
        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_SEQ_1 ),
                    self._makeBioseq( "header2", self.DNA_SEQ_2 ) ]

        BioseqUtils.writeBioseqListIntoFastaFile( lBioseq, obsFileName )

        # Expected layout: each sequence is wrapped after 60 characters.
        expFileName = self._writeTmpFile(
            "dummyFastaFile.fa",
            ">header1 description\n"
            "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTC\n"
            "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n"
            ">header2\n"
            "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTA\n"
            "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n" )

        self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )


    def test_extractBioseqListFromFastaFile( self ):
        fileName = self._writeTmpFile(
            "dummyFastaFile.fa",
            ">header1_1 description1\n"
            "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL\n"
            ">header1_2 description2\n"
            "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL\n"
            ">header1_3 description3\n"
            "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*\n" )

        expLBioseq = [ self._makeBioseq( "header1_1 description1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                       self._makeBioseq( "header1_2 description2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
                       self._makeBioseq( "header1_3 description3", "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*" ) ]

        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
        self.assertEqual( expLBioseq, obsLBioseq )


    def test_extractBioseqListFromFastaFile_empty_seq( self ):
        fileName = self._writeTmpFile( "dummyFastaFile.fa", ">header1_1 description1\n" )

        expLBioseq = [ self._makeBioseq( "header1_1 description1", "" ) ]

        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
        self.assertEqual( expLBioseq, obsLBioseq )


    def test_extractBioseqListFromFastaFile_empty_file( self ):
        fileName = self._writeTmpFile( "dummyFastaFile.fa", "" )

        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
        self.assertEqual( [], obsLBioseq )


    def test_getSeqLengthWithSeqName( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                    self._makeBioseq( "header2", self.DNA_SEQ_44 ) ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header1 description" )

        self.assertEqual( 31, obsLength )


    def test_getSeqLengthWithSeqName_second_item( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                    self._makeBioseq( "header2", self.DNA_SEQ_44 ) ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )

        self.assertEqual( 44, obsLength )


    def test_getSeqLengthWithSeqName_empty_list( self ):
        obsLength = BioseqUtils.getSeqLengthWithSeqName( [], "header2" )
        self.assertEqual( 0, obsLength )


    def test_getSeqLengthWithSeqName_empty_sequence( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                    self._makeBioseq( "header2", "" ) ]

        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )

        self.assertEqual( 0, obsLength )


    def test_getSeqLengthWithSeqName_sequence_unknown( self ):
        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
                    self._makeBioseq( "header2", self.DNA_SEQ_44 ) ]

        # "header3" matches no header in the list.
        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header3" )

        self.assertEqual( 0, obsLength )


    def test_getLengthPerSeqFromFile( self ):
        inFile = self._writeTmpFile(
            "dummyInFile",
            ">seq1\nAGCGATGCAGCTA\n"
            ">seq2\nGCGATGCGCATCGACGCGA\n" )

        dExp = { "seq1": 13, "seq2": 19 }

        dObs = BioseqUtils.getLengthPerSeqFromFile( inFile )

        self.assertEqual( dExp, dObs )


    def test_getBioseqListSortedByDecreasingLength( self ):
        lBioseqs = [ Bioseq( "TE2", "ACC" ),
                     Bioseq( "TE3", "TA" ),
                     Bioseq( "TE1", "AGCG" ) ]
        lExp = [ Bioseq( "TE1", "AGCG" ),
                 Bioseq( "TE2", "ACC" ),
                 Bioseq( "TE3", "TA" ) ]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLength( lBioseqs )
        self.assertEqual( lExp, lObs )


    def test_getBioseqListSortedByDecreasingLengthWithoutGaps( self ):
        lBioseqs = [ Bioseq( "TE2", "-ACC-" ),
                     Bioseq( "TE3", "TA---" ),
                     Bioseq( "TE1", "-AGCG" ) ]
        lExp = [ Bioseq( "TE1", "-AGCG" ),
                 Bioseq( "TE2", "-ACC-" ),
                 Bioseq( "TE3", "TA---" ) ]
        lObs = BioseqUtils.getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs )
        self.assertEqual( lExp, lObs )
        
        
# Module-level suite holding every test of Test_BioseqUtils; it is built at
# import time and run verbosely when this file is executed as a script.
# TestLoader.loadTestsFromTestCase replaces unittest.makeSuite, which is
# deprecated and no longer available in recent Python releases.
test_suite = unittest.TestSuite()
test_suite.addTest( unittest.TestLoader().loadTestsFromTestCase( Test_BioseqUtils ) )
if __name__ == "__main__":
    unittest.TextTestRunner( verbosity=2 ).run( test_suite )