Mercurial > repos > yufei-luo > s_mart
diff commons/tools/tests/Test_F_PostAnalyzeTELib.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/commons/tools/tests/Test_F_PostAnalyzeTELib.py Tue Apr 30 14:33:21 2013 -0400
# @@ -0,0 +1,294 @@
from commons.core.utils.FileUtils import FileUtils
from commons.core.sql.DbFactory import DbFactory
from commons.tools.PostAnalyzeTELib import PostAnalyzeTELib
import subprocess
import unittest
import os


class Test_F_PostAnalyzeTELib(unittest.TestCase):
    """Functional tests for PostAnalyzeTELib.

    Each test writes its input and expected-output files into the current
    working directory, runs one analysis (through the class API or through
    the PostAnalyzeTELib.py script) and compares the produced files with the
    expected ones using FileUtils.are2FilesIdentical.

    Tests for analyses 3 and 4 need a database reachable through DbFactory
    and the REPET_DATA environment variable.
    """

    def setUp(self):
        """Initialise the file names, table names and genome size shared by the tests."""
        self._expStatFileName = "expStats.tab"
        self._obsStatFileName = ""
        self._genomeSize = 1281640
        self._pathTableName = "dummyDmelChr4_chr_allTEs_nr_noSSR_join_path"
        self._seqTableName = "dummyDmelChr4_denovoLibTEs_seq"
        # Extra files created by the running test; they are removed in
        # tearDown so that they do not leak when an assertion fails.
        self._filesToClean = []

    def tearDown(self):
        """Remove the expected/observed statistics files and every registered file."""
        fileNames = [self._expStatFileName, self._obsStatFileName]
        fileNames.extend(self._filesToClean)
        for fileName in fileNames:
            try:
                os.remove(fileName)
            except OSError:
                # Best-effort cleanup: the file may never have been created
                # (e.g. the observed file name is still empty, or the
                # analysis failed before writing its output).
                pass

    def test_run_analysis1(self):
        """Run analysis 1 through the class API on a three-sequence fasta file."""
        libFileName = "TElib.fa"
        expClusterFileName = "expClusters.tab"
        expGlobalStatFileName = "expGlobalStats.txt"
        obsClusterFileName = "TElib.tab"
        obsGlobalStatFileName = "TElib.globalStatsPerCluster.txt"
        self._obsStatFileName = "TElib.statsPerCluster.tab"
        self._registerForCleanup(libFileName, expClusterFileName, expGlobalStatFileName,
                                 obsClusterFileName, obsGlobalStatFileName)

        self._writeInputFasta_analysis1(libFileName)
        self._writeExpClusterFile_analysis1(expClusterFileName)
        self._writeExpGlobalStats_analysis1(expGlobalStatFileName)
        self._writeExpStatsFile_analysis1(self._expStatFileName)

        iPATEL = PostAnalyzeTELib(analysis=1, fastaFileName=libFileName, doClean=True)
        iPATEL.run()

        self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expGlobalStatFileName, obsGlobalStatFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(self._expStatFileName, self._obsStatFileName))

    def test_run_as_script_analysis1(self):
        """Run the PostAnalyzeTELib.py script on a three-sequence fasta file."""
        libFileName = "TElib.fa"
        expClusterFileName = "expClusters.tab"
        expGlobalStatFileName = "expGlobalStats.txt"
        obsClusterFileName = "TElib.tab"
        obsGlobalStatFileName = "TElib.globalStatsPerCluster.txt"
        self._obsStatFileName = "TElib.statsPerCluster.tab"
        self._registerForCleanup(libFileName, expClusterFileName, expGlobalStatFileName,
                                 obsClusterFileName, obsGlobalStatFileName)

        with open(libFileName, "w") as f:
            f.write(">transib2\n")
            f.write("GGCCAGTCACAATGGGGGTTTCACTGGTGTGTCATGCACATTTAATAGGGGTAAGACTGA\n")
            f.write("ATAAAAAATGATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGG\n")
            f.write("GATTCGTTTCATTCACCGGATCTCTTGCGTCCGCCTCCGCCGTGCGACCTCCGCATTC\n")
            f.write(">transib3\n")
            f.write("ATAAAAAATGATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGG\n")
            f.write("TGAAACTCGTCAGCGTCGTTTCCAAGTCCT\n")
            f.write(">transib4\n")
            f.write("GGCCAGTCACAATGGGGGTTTCACTGGTGTGTCATGCACATTTAATAGGGGTAAGACTGA\n")
            f.write("ATAAAAAATGATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGG\n")
            f.write("GATTCGTTTCATTCACCGGATCTCTTGCGTCCGCCTCCGCCGTGCGACCTCCGCATTCAT\n")
            f.write("AAAAAATGATTATTTGCATGAAATGGGGATGAGAGAGAAGGAAAGAGTTTCATCCTGG\n")
        with open(expClusterFileName, "w") as f:
            f.write("transib4 \n")
            f.write("transib2 \n")
            f.write("transib3 \n")
        with open(self._expStatFileName, "w") as f:
            f.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
            f.write("1\t1\t238\t238\t238\t238\n")
            f.write("2\t1\t178\t178\t178\t178\n")
            f.write("3\t1\t90\t90\t90\t90\n")
        with open(expGlobalStatFileName, "w") as f:
            f.write("nb of clusters: 3\n")
            f.write("nb of clusters with 1 sequence: 3\n")
            f.write("nb of clusters with 2 sequences: 0\n")
            f.write("nb of clusters with >2 sequences: 0 (0 sequences)\n")
            f.write("nb of sequences: 3\n")
            f.write("nb of sequences in the largest cluster: 1\n")
            f.write("nb of sequences in the smallest cluster: 1\n")
            f.write("size of the smallest sequence: 90\n")
            f.write("size of the largest sequence: 238\n")
            f.write("average sequences size: 168\n")
            f.write("median sequences size: 178\n")

        # The script is looked up through the shell; the command line is
        # built from a constant file name only.
        cmd = "PostAnalyzeTELib.py -i %s -L 98 -S 95 -b -c -v 3" % libFileName
        process = subprocess.Popen(cmd, shell=True)
        process.communicate()

        self.assertTrue(FileUtils.are2FilesIdentical(expClusterFileName, obsClusterFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(expGlobalStatFileName, obsGlobalStatFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(self._expStatFileName, self._obsStatFileName))

    def test_run_analysis2(self):
        """Run analysis 2 through the class API on a cluster file."""
        clusterFileName = "ConsensusClusters.tab"
        self._obsStatFileName = "ConsensusClusters.classifStatsPerCluster.tab"
        self._registerForCleanup(clusterFileName)

        self._writeClusterFile_analysis2(clusterFileName)
        self._writeExpStatsFile_analysis2(self._expStatFileName)

        iPATEL = PostAnalyzeTELib(analysis=2, clusterFileName=clusterFileName, verbosity=3)
        iPATEL.run()
        # Single parenthesised argument: prints the same text under Python 2
        # and is also valid Python 3 syntax.
        print("exp: %s, obs: %s" % (self._expStatFileName, self._obsStatFileName))
        self.assertTrue(FileUtils.are2FilesIdentical(self._expStatFileName, self._obsStatFileName))

    def test_run_analysis3(self):
        """Run analysis 3 through the class API on the path and seq tables."""
        expGlobalStatFileName = "expGlobalStats.txt"
        obsGlobalStatFileName = "%s.globalAnnotStatsPerTE.txt" % self._pathTableName
        self._obsStatFileName = "%s.annotStatsPerTE.tab" % self._pathTableName
        self._registerForCleanup(expGlobalStatFileName, obsGlobalStatFileName)

        iDb = self._createDbTables()
        try:
            self._writeExpGlobalStats_analysis3(expGlobalStatFileName)
            self._writeExpStatsFile_analysis3(self._expStatFileName)

            iPATEL = PostAnalyzeTELib(analysis=3, pathTableName=self._pathTableName, seqTableName=self._seqTableName, genomeSize=self._genomeSize)
            iPATEL.run()

            self.assertTrue(FileUtils.are2FilesIdentical(expGlobalStatFileName, obsGlobalStatFileName))
            self.assertTrue(FileUtils.are2FilesIdentical(self._expStatFileName, self._obsStatFileName))
        finally:
            # Drop the tables even when the analysis or an assertion fails.
            self._dropDbTables(iDb)

    def test_run_analysis4(self):
        """Run analysis 4 through the class API on a cluster file and the path and seq tables."""
        clusterFileName = "clusters.tab"
        self._obsStatFileName = "%s.annotStatsPerCluster.tab" % self._pathTableName
        self._registerForCleanup(clusterFileName)

        iDb = self._createDbTables()
        try:
            self._writeClusterFile_analysis4(clusterFileName)
            self._writeExpStatsFile_analysis4(self._expStatFileName)

            iPATEL = PostAnalyzeTELib(analysis=4, clusterFileName=clusterFileName, pathTableName=self._pathTableName, seqTableName=self._seqTableName, genomeSize=self._genomeSize)
            iPATEL.run()

            self.assertTrue(FileUtils.are2FilesIdentical(self._expStatFileName, self._obsStatFileName))
        finally:
            # Drop the tables even when the analysis or an assertion fails.
            self._dropDbTables(iDb)

    def _registerForCleanup(self, *fileNames):
        """Record files that tearDown must remove, whatever the test outcome."""
        self._filesToClean.extend(fileNames)

    def _createDbTables(self):
        """Open a database connection and load the path and seq tables from REPET_DATA.

        Returns the open database object; the caller must hand it back to
        _dropDbTables. Raises KeyError if REPET_DATA is not set.
        """
        iDb = DbFactory.createInstance()
        iDb.createTable(self._pathTableName, "path", "%s/Tools/DmelChr4_chr_allTEs_nr_noSSR_join_path.path" % os.environ['REPET_DATA'], True)
        iDb.createTable(self._seqTableName, "seq", "%s/TEannot/DmelChr4_denovoLibTEs.fa" % os.environ['REPET_DATA'], True)
        return iDb

    def _dropDbTables(self, iDb):
        """Drop the path and seq tables and close the database connection."""
        iDb.dropTable(self._pathTableName)
        iDb.dropTable(self._seqTableName)
        iDb.close()

    def _writeInputFasta_analysis1(self, fileName):
        """Write the three-sequence input fasta file used by test_run_analysis1."""
        with open(fileName, "w") as f:
            f.write(">DTX-incomp_DmelChr4-B-R9-Map3_reversed\n")
            f.write("CATTAGATTCAAGGCATCATGGATCAGCACATTTACACAGATATCCTGGAAAATGTGATG\n")
            f.write("CTGCCATATGCCGGGGATGAAATGCCGTTGGTTTGGACATTTCAACAGGATAACGATTCA\n")
            f.write("AAACACACGAGCAAGAAAGCTTGAAAGTGGTTTGAGCAGAAATCGATCCGAGTAATGAAA\n")
            f.write("TGGCCTGCTCTGTCATCCGACTTGAATCCAATCGAAAACCTTTGGGCGGACGTGGAAAAA\n")
            f.write(">DTX-incomp_DmelChr4-B-R10-Map3\n")
            f.write("CATTAGATTCAAGGCATCATGGATCAGCACATTTACACAGATATCCTGGAAAATGTGATG\n")
            f.write("CTGCCATATGCCGGGGATGAAATGCCGTTGGTTTGGACATTTCAACAGGATAACGATTCA\n")
            f.write("AAACACACGAGCAAGAAAGCTTGAAAGTGGTTTGAGCAGAAATCGATCCGAGTAATGAAA\n")
            f.write("TGGCCTGCTCTGTCATCCGACTTGAATCCAATCGAAAACCTTTGGGCGGACGTGGAAAAA\n")
            f.write(">PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n")
            f.write("TACCAAAGACACTAGAATAACAAGATGCGTAACGCCATACGATTTTTTGGCACACGATTT\n")
            f.write("TTTCGCCGTGGCTCTAGAGGTGGCTCCAGGCTCTCTCGAATTTTTGTTAGAGAGCGAGAG\n")
            f.write("AGCGGAGAGCGCTACAGCGAACAGCTCTTTTCAACGCATAAAGTGATAGCAGACAACTGT\n")

    def _writeExpClusterFile_analysis1(self, fileName):
        """Write the expected cluster file (one cluster per line) for analysis 1."""
        with open(fileName, "w") as f:
            f.write("DTX-incomp_DmelChr4-B-R10-Map3 DTX-incomp_DmelChr4-B-R9-Map3_reversed \n")
            f.write("PotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed \n")

    def _writeExpStatsFile_analysis1(self, fileName):
        """Write the expected per-cluster statistics table for analysis 1."""
        #TODO: header in option ?
        with open(fileName, "w") as f:
            f.write("cluster\tsequencesNb\tsizeOfSmallestSeq\tsizeOfLargestSeq\taverageSize\tmedSize\n")
            f.write("1\t2\t240\t240\t240\t240\n")
            f.write("2\t1\t180\t180\t180\t180\n")

    def _writeExpGlobalStats_analysis1(self, fileName):
        """Write the expected global statistics report for analysis 1."""
        #TODO: file or STDOUT ?
        with open(fileName, "w") as f:
            f.write("nb of clusters: 2\n")
            f.write("nb of clusters with 1 sequence: 1\n")
            f.write("nb of clusters with 2 sequences: 1\n")
            f.write("nb of clusters with >2 sequences: 0 (0 sequences)\n")
            f.write("nb of sequences: 3\n")
            f.write("nb of sequences in the largest cluster: 2\n")
            f.write("nb of sequences in the smallest cluster: 1\n")
            f.write("size of the smallest sequence: 180\n")
            f.write("size of the largest sequence: 240\n")
            f.write("average sequences size: 220\n")
            f.write("median sequences size: 240\n")

    def _writeClusterFile_analysis2(self, fileName):
        """Write the tab-separated input cluster file (four clusters) for analysis 2."""
        with open(fileName, "w") as f:
            f.write("DTX-incomp_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_DmelChr4-B-R10-Map3_reversed\tPotentialHostGene-chim_DmelChr4-B-R4-Map5_reversed\n")
            f.write("DTX-incomp_Blc1_DmelChr4-B-R9-Map3_reversed\tDTX-incomp_Blc1_DmelChr4-B-R10-Map3\n")
            f.write("DXX-comp_DmelChr4-B-R9-Map3\tDTX-comp_DmelChr4-B-R10-Map3_reversed\tDTX-incomp_DmelChr4-B-R10-Map3_reversed\tnoCat_DmelChr4-B-G1-Map3\tnoCat_DmelChr4-B-R1-Map4\n")
            f.write("RXX-MITE_DmelChr4-B-G7-Map3\tRXX-MITE_DmelChr4-B-G5-Map3\tRXX-MITE_DmelChr4-B-G2-Map3\tRXX-MITE_DmelChr4-B-G23-Map3\tRXX-MITE_DmelChr4-B-G6-Map3\n")

    def _writeExpStatsFile_analysis2(self, fileName):
        """Write the expected per-cluster classification statistics for analysis 2."""
        #TODO: header in option ?
        with open(fileName, "w") as f:
            f.write("cluster\tnoCat\tPotentialChimeric\tcomp\tincomp\tclassifs (nbTEs)\n")
            f.write("1\t0\t1\t0\t2\tDTX (2)\tPotentialHostGene (1)\n")
            f.write("2\t0\t0\t0\t2\tDTX (2)\n")
            f.write("3\t2\t0\t2\t1\tDTX (2)\tDXX (1)\n")
            f.write("4\t0\t0\t0\t0\tMITE (5)\n")

    def _writeExpStatsFile_analysis3(self, fileName):
        """Write the expected per-TE annotation statistics table for analysis 3."""
        #TODO: header in option ?
        with open(fileName, "w") as f:
            f.write("TE\tlength\tcovg\tfrags\tfullLgthFrags\tcopies\tfullLgthCopies\tmeanId\tmeanLgth\tmeanLgthPerc\n")
            f.write("DmelChr4-B-G1-Map3_NoCat\t542\t3701\t12\t4\t10\t4\t95.72\t370.10\t68.28\n")
            f.write("DmelChr4-B-G11-Map20_classII-TIR-incomp\t1240\t8216\t27\t0\t22\t0\t88.80\t375.00\t30.24\n")
            f.write("DmelChr4-B-G7-Map3_classII-TIR-incomp\t1944\t15212\t49\t1\t42\t1\t89.44\t382.36\t19.67\n")
            f.write("DmelChr4-B-G9-Map3_NoCat\t1590\t11564\t24\t0\t21\t1\t92.03\t550.67\t34.63\n")
            f.write("DmelChr4-B-P0.0-Map3_classII-TIR-incomp\t1042\t4001\t13\t3\t11\t3\t85.11\t366.36\t35.16\n")
            f.write("DmelChr4-B-R1-Map4_NoCat\t2367\t66031\t484\t0\t361\t0\t77.84\t182.91\t7.73\n")
            f.write("DmelChr4-B-R12-Map3_NoCat\t2284\t4938\t3\t2\t3\t2\t99.26\t1646.00\t72.07\n")
            f.write("DmelChr4-B-R19-Map4_NoCat\t705\t3328\t10\t3\t10\t3\t88.51\t332.80\t47.21\n")
            f.write("DmelChr4-B-R2-Map6_NoCat\t4638\t20539\t34\t2\t29\t3\t80.93\t708.24\t15.27\n")
            f.write("DmelChr4-B-R4-Map5_NoCat\t1067\t7292\t35\t1\t28\t1\t86.50\t260.54\t24.42\n")
            f.write("DmelChr4-B-R9-Map3_NoCat\t714\t5453\t19\t2\t16\t2\t81.18\t340.81\t47.73\n")

    def _writeExpGlobalStats_analysis3(self, fileName):
        """Write the expected global annotation statistics report for analysis 3."""
        with open(fileName, "w") as f:
            f.write("nb of sequences: 11\n")
            f.write("nb of matched sequences: 11\n")
            f.write("cumulative coverage: 150275 bp\n")
            f.write("coverage percentage: 11.73%\n")
            f.write("\n")
            f.write("total nb of TE fragments: 710\n")
            f.write("total nb full-length fragments: 18 (2.54%)\n")
            f.write("total nb of TE copies: 553\n")
            f.write("total nb full-length copies: 20 (3.62%)\n")
            f.write("families with full-length fragments: 8 (72.73%)\n")
            f.write(" with only one full-length fragment: 2\n")
            f.write(" with only two full-length fragments: 3\n")
            f.write(" with only three full-length fragments: 2\n")
            f.write(" with more than three full-length fragments: 1\n")
            f.write("families with full-length copies: 9 (81.82%)\n")
            f.write(" with only one full-length copy: 3\n")
            f.write(" with only two full-length copies: 2\n")
            f.write(" with only three full-length copies: 3\n")
            f.write(" with more than three full-length copies: 1\n")
            f.write("mean of median identity of all families: 88.30 +- 8.33\n")
            f.write("mean of median length percentage of all families: 30.83 +- 32.30\n")

    def _writeClusterFile_analysis4(self, fileName):
        """Write the tab-separated input cluster file (two numbered clusters) for analysis 4."""
        with open(fileName, "w") as f:
            f.write("1\tDmelChr4-B-R1-Map4_NoCat\tDmelChr4-B-R2-Map6_NoCat\tDmelChr4-B-R4-Map5_NoCat\n")
            f.write("2\tDmelChr4-B-G7-Map3_classII-TIR-incomp\tDmelChr4-B-P0.0-Map3_classII-TIR-incomp\n")

    def _writeExpStatsFile_analysis4(self, fileName):
        """Write the expected per-cluster annotation statistics table for analysis 4."""
        with open(fileName, "w") as f:
            f.write("Cluster\tcovg\tfrags\tcopies\n")
            f.write("1\t93862\t553\t418\n")
            f.write("2\t19213\t62\t53\n")

    def _writeConfigFile(self, configFileName):
        """Write a configuration file built from the REPET_* environment variables.

        Not called by any test in this file. Raises KeyError if one of
        REPET_HOST, REPET_USER, REPET_PW or REPET_DB is not set.
        """
        with open(configFileName, "w") as fHandle:
            fHandle.write("[repet_env]\n")
            fHandle.write("repet_host: %s\n" % os.environ["REPET_HOST"])
            fHandle.write("repet_user: %s\n" % os.environ["REPET_USER"])
            fHandle.write("repet_pw: %s\n" % os.environ["REPET_PW"])
            fHandle.write("repet_db: %s\n" % os.environ["REPET_DB"])
            fHandle.write("repet_port: 3306\n")
            fHandle.write("[analysis1]\n")
            fHandle.write("fasta_name: %s\n" % self._expStatFileName)

            fHandle.write("[analysis2]\n")
            fHandle.write("clusterFileName: %s\n" % self._expStatFileName)

            fHandle.write("[analysis3]\n")
            fHandle.write("pathTableName: %s\n" % self._pathTableName)
            fHandle.write("seqTableName: %s\n" % self._seqTableName)
            fHandle.write("genomeSize: %s\n" % self._genomeSize)


if __name__ == "__main__":
    unittest.main()