diff smart_toolShed/commons/core/seq/test/Test_BioseqUtils.py @ 0:e0f8dcca02ed

Uploaded the S-MART tool, a toolbox for managing RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/commons/core/seq/test/Test_BioseqUtils.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,498 @@
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+
+import unittest
+import os
+from commons.core.seq.Bioseq import Bioseq
+from commons.core.seq.BioseqUtils import BioseqUtils
+from commons.core.utils.FileUtils import FileUtils
+
+
+class Test_BioseqUtils( unittest.TestCase ):
+    """Unit tests for the static helpers of BioseqUtils.
+
+    Fixtures used by several tests are kept as class constants. Temporary
+    files are registered per test and deleted in tearDown, so they are
+    removed even when an assertion fails.
+    """
+
+    # Nucleotide fixtures: DNA_N differs from DNA_1 only by its first base
+    # ('N' instead of 'T'), DNA_2 by a single C -> A substitution.
+    DNA_1 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
+    DNA_N = "NGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
+    DNA_2 = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTACGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
+
+    # Expected translations of DNA_1 and DNA_2, listed for frames 1 to 6.
+    PROT_1 = ( "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
+               "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL",
+               "WLLVDQFMITMISRRCLVAPTNQQYNASRA*",
+               "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
+               "QALLALYC*LVGATRHLREIIVIIN*STRSH",
+               "KLYSHYIVD*SEPRDTYVKSL*S*TDQLEAT" )
+    PROT_2 = ( "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL",
+               "VASS*SVYDHNDFT*VSRGYD*STI*CE*SL",
+               "WLLVDQFMITMISRRCLVATTNQQYNASRA*",
+               "SSSTRIILLISRSHETPT*NHCDHKLIN*KP",
+               "QALLALYC*LVVATRHLREIIVIIN*STRSH",
+               "KLYSHYIVD*S*PRDTYVKSL*S*TDQLEAT" )
+
+
+    def setUp( self ):
+        # Names of the files created by the running test; see tearDown.
+        self._lTmpFiles = []
+
+
+    def tearDown( self ):
+        # Runs whether the test passed or not, so no dummy file is left behind.
+        for fileName in self._lTmpFiles:
+            if os.path.exists( fileName ):
+                os.remove( fileName )
+
+
+    def _registerTmpFile( self, fileName ):
+        """Schedule fileName for deletion at the end of the test and return it."""
+        self._lTmpFiles.append( fileName )
+        return fileName
+
+
+    def _writeTmpFile( self, fileName, content ):
+        """Create fileName holding content, register it for deletion, return its name."""
+        self._registerTmpFile( fileName )
+        with open( fileName, "w" ) as handle:
+            handle.write( content )
+        return fileName
+
+
+    @staticmethod
+    def _makeBioseq( header, sequence ):
+        """Return a Bioseq whose header and sequence attributes are set explicitly."""
+        bioseq = Bioseq()
+        bioseq.header = header
+        bioseq.sequence = sequence
+        return bioseq
+
+
+    def _expectedFrames( self, headerPattern, lProteins ):
+        """Build the expected Bioseq list of an all-frame translation.
+
+        headerPattern -- header containing a '%d' placeholder for the frame number
+        lProteins -- translated sequences in frame order, the first one being frame 1
+        """
+        return [ self._makeBioseq( headerPattern % frame, protein )
+                 for frame, protein in enumerate( lProteins, 1 ) ]
+
+
+    def _assertTranslation( self, nucleotides, frame, expSequence ):
+        """Translate nucleotides in place in the given frame and compare the result."""
+        bioseq = Bioseq()
+        bioseq.sequence = nucleotides
+        BioseqUtils.translateSequence( bioseq, frame )
+        self.assertEqual( expSequence, bioseq.sequence )
+
+
+    def _assertFrameHeader( self, header, frame, expHeader ):
+        """Tag header with the frame number in place and compare the result."""
+        bioseq = Bioseq()
+        bioseq.header = header
+        BioseqUtils.setFrameInfoOnHeader( bioseq, frame )
+        self.assertEqual( expHeader, bioseq.header )
+
+
+    def _lengthFixture( self, secondSequence ):
+        """Return two Bioseq ('header1 description', 'header2'); only the second sequence varies."""
+        return [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
+                 self._makeBioseq( "header2", secondSequence ) ]
+
+
+    def test_translateSequence_one_nt( self ):
+        """A one-nucleotide sequence is translated into an empty sequence."""
+        self._assertTranslation( "G", 1, "" )
+
+
+    def test_translateSequence_frame1( self ):
+        """In frame 1, the codon holding the leading 'N' is expected to give 'X'."""
+        self._assertTranslation( self.DNA_N, 1, "XGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" )
+
+
+    def test_translateSequence_frame2( self ):
+        """Translation in frame 2."""
+        self._assertTranslation( self.DNA_N, 2, "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" )
+
+
+    def test_translateSequence_frame3( self ):
+        """Translation in frame 3."""
+        self._assertTranslation( self.DNA_N, 3, "WLLVDQFMITMISRRCLVAPTNQQYNASRA*" )
+
+
+    def test_setFrameInfoOnHeader(self):
+        """The frame suffix is inserted after the first word, before the description."""
+        self._assertFrameHeader( "header1 description1 description2", 1,
+                                 "header1_1 description1 description2" )
+
+
+    def test_setFrameInfoOnHeader_header_without_space(self):
+        """With a one-word header the frame suffix is simply appended."""
+        self._assertFrameHeader( "header", 1, "header_1" )
+
+
+    def test_TranslateInAllFrame( self ):
+        """One Bioseq gives six translated Bioseq, headers suffixed _1 to _6."""
+        bioseq = self._makeBioseq( "header1", self.DNA_1 )
+        expLBioseq = self._expectedFrames( "header1_%d", self.PROT_1 )
+        obsLBioseq = BioseqUtils.translateInAllFrame( bioseq )
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_replaceStopCodonsByX( self ):
+        """Every '*' of the sequence is replaced by 'X'."""
+        bioseq = Bioseq()
+        bioseq.sequence = "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL"
+        BioseqUtils.replaceStopCodonsByX( bioseq )
+        self.assertEqual( "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL", bioseq.sequence )
+
+
+    def test_translateBioseqListInAllFrames_with_empty_list( self ):
+        """An empty input list gives an empty output list."""
+        self.assertEqual( [], BioseqUtils.translateBioseqListInAllFrames( [] ) )
+
+
+    def test_translateBioseqListInAllFrames_with_one_item( self ):
+        """The frame suffix is added to the first word of a header that has a description."""
+        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_1 ) ]
+        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )
+        expLBioseq = self._expectedFrames( "header1_%d description", self.PROT_1 )
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_translateBioseqListInAllFrames( self ):
+        """The six frames of each input Bioseq are returned in input order."""
+        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_1 ),
+                    self._makeBioseq( "header2", self.DNA_2 ) ]
+        obsLBioseq = BioseqUtils.translateBioseqListInAllFrames( lBioseq )
+        expLBioseq = ( self._expectedFrames( "header1_%d description", self.PROT_1 )
+                       + self._expectedFrames( "header2_%d", self.PROT_2 ) )
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_replaceStopCodonsByXInBioseqList_empty_list( self ):
+        """An empty input list gives an empty output list."""
+        self.assertEqual( [], BioseqUtils.replaceStopCodonsByXInBioseqList( [] ) )
+
+
+    def test_replaceStopCodonsByXInBioseqList_without_stop_codon( self ):
+        """A sequence without '*' is returned unchanged."""
+        lBioseq = [ self._makeBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]
+        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )
+        expLBioseq = [ self._makeBioseq( "header1 description", "CGFLISLSQFHVGVSWLRLINNIMRVEL" ) ]
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_replaceStopCodonsByXInBioseqList( self ):
+        """'*' is replaced by 'X' in every sequence; headers are kept as they are."""
+        lBioseq = [ self._makeBioseq( "header1 description", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
+                    self._makeBioseq( "header2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ) ]
+        obsLBioseq = BioseqUtils.replaceStopCodonsByXInBioseqList( lBioseq )
+        expLBioseq = [ self._makeBioseq( "header1 description", "CGFXLISLXSQXFHVGVSWLRLINNIMRVEL" ),
+                       self._makeBioseq( "header2", "VASSXSVYDHNDFTXVSRGSDXSTIXCEXSL" ) ]
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_writeBioseqListIntoFastaFile(self):
+        """The written file must equal a reference FASTA file whose sequence lines hold 60 characters."""
+        # Registered before the call so the file is removed even if the writer fails half-way.
+        obsFileName = self._registerTmpFile( "dummyWrittenFastaFile.fa" )
+        lBioseq = [ self._makeBioseq( "header1 description", self.DNA_1 ),
+                    self._makeBioseq( "header2", self.DNA_2 ) ]
+
+        BioseqUtils.writeBioseqListIntoFastaFile( lBioseq, obsFileName )
+
+        expFileName = self._writeTmpFile(
+            "dummyFastaFile.fa",
+            ">header1 description\n"
+            "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTC\n"
+            "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n"
+            ">header2\n"
+            "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTA\n"
+            "CGACTAATCAACAATATAATGCGAGTAGAGCTTGA\n" )
+        self.assertTrue( FileUtils.are2FilesIdentical( expFileName, obsFileName ) )
+
+
+    def test_extractBioseqListFromFastaFile( self ):
+        """Each FASTA record becomes one Bioseq with its full header line and its sequence."""
+        fileName = self._writeTmpFile(
+            "dummyFastaFile.fa",
+            ">header1_1 description1\n"
+            "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL\n"
+            ">header1_2 description2\n"
+            "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL\n"
+            ">header1_3 description3\n"
+            "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*\n" )
+        expLBioseq = [ self._makeBioseq( "header1_1 description1", "CGF*LISL*SQ*FHVGVSWLRLINNIMRVEL" ),
+                       self._makeBioseq( "header1_2 description2", "VASS*SVYDHNDFT*VSRGSD*STI*CE*SL" ),
+                       self._makeBioseq( "header1_3 description3", "CWLLVDQFMITMISRRCLVAPTNQQYNASRA*" ) ]
+
+        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_extractBioseqListFromFastaFile_empty_seq( self ):
+        """A header line without sequence gives a Bioseq with an empty sequence."""
+        fileName = self._writeTmpFile( "dummyFastaFile.fa", ">header1_1 description1\n" )
+        expLBioseq = [ self._makeBioseq( "header1_1 description1", "" ) ]
+
+        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
+        self.assertEqual( expLBioseq, obsLBioseq )
+
+
+    def test_extractBioseqListFromFastaFile_empty_file( self ):
+        """An empty file gives an empty list."""
+        fileName = self._writeTmpFile( "dummyFastaFile.fa", "" )
+
+        obsLBioseq = BioseqUtils.extractBioseqListFromFastaFile( fileName )
+        self.assertEqual( [], obsLBioseq )
+
+
+    def test_getSeqLengthWithSeqName ( self ):
+        """The length of the first Bioseq is found through its full header."""
+        lBioseq = self._lengthFixture( "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" )
+        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header1 description" )
+        self.assertEqual( 31, obsLength )
+
+
+    def test_getSeqLengthWithSeqName_second_item ( self ):
+        """The lookup also finds a Bioseq that is not first in the list."""
+        lBioseq = self._lengthFixture( "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" )
+        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )
+        self.assertEqual( 44, obsLength )
+
+
+    def test_getSeqLengthWithSeqName_empty_list ( self ):
+        """With an empty list the expected length is 0."""
+        obsLength = BioseqUtils.getSeqLengthWithSeqName( [], "header2" )
+        self.assertEqual( 0, obsLength )
+
+
+    def test_getSeqLengthWithSeqName_empty_sequence ( self ):
+        """A Bioseq found with an empty sequence has length 0."""
+        lBioseq = self._lengthFixture( "" )
+        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header2" )
+        self.assertEqual( 0, obsLength )
+
+
+    def test_getSeqLengthWithSeqName_sequence_unknown ( self ):
+        """A name absent from the list gives 0."""
+        lBioseq = self._lengthFixture( "ATGCGTGCGTAAATGCGTATGCGTATGCGTTCGCGAATGCGTGT" )
+        obsLength = BioseqUtils.getSeqLengthWithSeqName( lBioseq, "header3" )
+        self.assertEqual( 0, obsLength )
+
+
+    def test_getLengthPerSeqFromFile( self ):
+        """The FASTA file is summarised as a dictionary {sequence name: length}."""
+        inFile = self._writeTmpFile( "dummyInFile",
+                                     ">seq1\nAGCGATGCAGCTA\n"
+                                     ">seq2\nGCGATGCGCATCGACGCGA\n" )
+        dExp = { "seq1": 13, "seq2": 19 }
+
+        dObs = BioseqUtils.getLengthPerSeqFromFile( inFile )
+        self.assertEqual( dExp, dObs )
+
+
+    def test_getBioseqListSortedByDecreasingLength( self ):
+        """The longest sequence comes first."""
+        lBioseqs = [ Bioseq( "TE2", "ACC" ),
+                    Bioseq( "TE3", "TA" ),
+                    Bioseq( "TE1", "AGCG" ) ]
+        lExp = [ Bioseq( "TE1", "AGCG" ),
+                Bioseq( "TE2", "ACC" ),
+                Bioseq( "TE3", "TA" ) ]
+        lObs = BioseqUtils.getBioseqListSortedByDecreasingLength( lBioseqs )
+        self.assertEqual( lExp, lObs )
+
+
+    def test_getBioseqListSortedByDecreasingLengthWithoutGaps( self ):
+        """All inputs have the same gapped length: only the count of non-'-' characters can order them."""
+        lBioseqs = [ Bioseq( "TE2", "-ACC-" ),
+                    Bioseq( "TE3", "TA---" ),
+                    Bioseq( "TE1", "-AGCG" ) ]
+        lExp = [ Bioseq( "TE1", "-AGCG" ),
+                Bioseq( "TE2", "-ACC-" ),
+                Bioseq( "TE3", "TA---" ) ]
+        lObs = BioseqUtils.getBioseqListSortedByDecreasingLengthWithoutGaps( lBioseqs )
+        self.assertEqual( lExp, lObs )
+        
+# Module-level suite; the name `test_suite` is kept unchanged for any code
+# importing it. TestLoader.loadTestsFromTestCase is used instead of
+# unittest.makeSuite, which is deprecated since Python 3.11 and removed in 3.13.
+test_suite = unittest.TestSuite()
+test_suite.addTest( unittest.TestLoader().loadTestsFromTestCase( Test_BioseqUtils ) )
+if __name__ == "__main__":
+    import sys
+    result = unittest.TextTestRunner(verbosity=2).run( test_suite )
+    # Report failures through the exit status so that calling scripts can detect them.
+    if result.wasSuccessful():
+        sys.exit( 0 )
+    sys.exit( 1 )