Mercurial > repos > yufei-luo > s_mart
diff commons/core/parsing/test/Test_Multifasta2SNPFile.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/parsing/test/Test_Multifasta2SNPFile.py Fri Jan 18 04:54:14 2013 -0500 @@ -0,0 +1,1786 @@ +import os +import shutil +import unittest +from commons.core.utils.FileUtils import FileUtils +from commons.core.parsing.Multifasta2SNPFile import Multifasta2SNPFile +from commons.core.parsing.Multifasta2SNPFile import ReferenceBioseqAndLinesBioseqDBWrapper +from commons.core.seq.Bioseq import Bioseq +from commons.core.seq.BioseqDB import BioseqDB +from smac_pipe.tests.Utils4Test import Utils4Test + + +class Test_Multifasta2SNPFile(unittest.TestCase): +# TODO TEST LOGFILE + def setUp(self): + os.chdir("%s/commons/core/parsing/test/" % os.environ["REPET_PATH"]) + self._inFileName = "multifasta_input.fasta" + + self._expSubSNPFileName = "%s/commons/core/parsing/test/expSubSNP.csv" % os.environ["REPET_PATH"] + self._expAlleleFileName = "%s/commons/core/parsing/test/expAllele.csv" % os.environ["REPET_PATH"] + + self._expIndividualFileName = "%s/commons/core/parsing/test/expIndividual.csv" % os.environ["REPET_PATH"] + self._expSequenceFSAFileName = "%s/commons/core/parsing/test/expSequences.fsa" % os.environ["REPET_PATH"] + self._expSequenceCSVFileName = "%s/commons/core/parsing/test/expSequences.csv" % os.environ["REPET_PATH"] + self._expBatchFileName = "%s/commons/core/parsing/test/expBatch.txt" % os.environ["REPET_PATH"] + self._expBatchLineFileName = "%s/commons/core/parsing/test/expBatchLine.csv" % os.environ["REPET_PATH"] + + self._realInputFileName = "data/real_multifasta_input.fasta" + self._realExpSubSNPFileName = "data/realExpSubSNP.csv" + self._realExpSequenceFSAFileName = "data/realExpSequences.fsa" + self._realExpBatchLineFileName = "data/realExpBatchLine.csv" + self._realExpIndividualFileName = "data/realExpIndividual.csv" + + self._inputDirSeveralBatches = "%s/commons/core/parsing/test/severalBatchDir" % os.environ["REPET_PATH"] + + self._obsSubSNPFileName = "SubSNP.csv" + self._obsAlleleFileName = 
"Allele.csv" + self._obsIndividualFileName = "Individual.csv" + self._obsSequenceFSAFileName = "Sequences.fsa" + self._obsSequenceCSVFileName = "Sequences.csv" + self._obsBatchFileName = "Batch.txt" + self._obsBatchLineFileName = "BatchLine.csv" + + self._fileUtils = FileUtils() + + def tearDown(self): + os.chdir("%s/commons/core/parsing/test/" % os.environ["REPET_PATH"]) + logFileName = "multifasta2SNP.log" + if self._fileUtils.isRessourceExists(self._inFileName): + os.remove(self._inFileName) + if self._fileUtils.isRessourceExists(self._obsSubSNPFileName): + os.remove(self._obsSubSNPFileName) + if self._fileUtils.isRessourceExists(self._obsSubSNPFileName + "_filtered"): + os.remove(self._obsSubSNPFileName + "_filtered") + if self._fileUtils.isRessourceExists(self._obsAlleleFileName): + os.remove(self._obsAlleleFileName) + if self._fileUtils.isRessourceExists(self._obsIndividualFileName): + os.remove(self._obsIndividualFileName) + if self._fileUtils.isRessourceExists(self._obsSequenceFSAFileName): + os.remove(self._obsSequenceFSAFileName) + if self._fileUtils.isRessourceExists(self._obsSequenceCSVFileName): + os.remove(self._obsSequenceCSVFileName) + if self._fileUtils.isRessourceExists(self._obsBatchFileName): + os.remove(self._obsBatchFileName) + if self._fileUtils.isRessourceExists(self._obsBatchLineFileName): + os.remove(self._obsBatchLineFileName) + + if self._fileUtils.isRessourceExists(self._expSubSNPFileName): + os.remove(self._expSubSNPFileName) + if self._fileUtils.isRessourceExists(self._realExpSubSNPFileName + "_filtered"): + os.remove(self._realExpSubSNPFileName + "_filtered") + if self._fileUtils.isRessourceExists(self._expAlleleFileName): + os.remove(self._expAlleleFileName) + if self._fileUtils.isRessourceExists(self._expIndividualFileName): + os.remove(self._expIndividualFileName) + if self._fileUtils.isRessourceExists(self._expSequenceFSAFileName): + os.remove(self._expSequenceFSAFileName) + if 
self._fileUtils.isRessourceExists(self._expSequenceCSVFileName): + os.remove(self._expSequenceCSVFileName) + if self._fileUtils.isRessourceExists(self._expBatchFileName): + os.remove(self._expBatchFileName) + if self._fileUtils.isRessourceExists(self._expBatchLineFileName): + os.remove(self._expBatchLineFileName) + + if self._fileUtils.isRessourceExists(logFileName): + os.remove(logFileName) + if self._fileUtils.isRessourceExists(self._inputDirSeveralBatches): + shutil.rmtree(self._inputDirSeveralBatches) + + + def test_runOneBatch(self): + self._writeInputFile() + self._writeExpSubSNPFile() + self._writeExpAlleleFile() + self._writeExpIndividualFile() + self._writeExpSequenceFile() + self._writeExpBatchFile() + self._writeExpBatchLineFile() + + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "Batch1", "methyltransferase") + multifasta2SNPFile.runOneBatch(self._inFileName) + + self.assertTrue(FileUtils.isRessourceExists(self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expIndividualFileName, self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceFSAFileName, self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchLineFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchLineFileName, self._obsBatchLineFileName)) + 
self.assertTrue(FileUtils.isRessourceExists(self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSubSNPFileName, self._obsSubSNPFileName)) + + def test_runOneBatch_with_a_real_input_file(self): + self._writeRealExpAlleleFile() + self._writeRealExpSequenceCSVFile() + self._writeRealExpBatchFile() + + multifasta2SNPFile = Multifasta2SNPFile("Pinus pinaster", "INRA_Pinus_pinaster_HDZ31-1", "PpHDZ31") + multifasta2SNPFile.runOneBatch(self._realInputFileName) + + self.assertTrue(FileUtils.isRessourceExists(self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._realExpIndividualFileName, self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._realExpSequenceFSAFileName, self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchLineFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._realExpBatchLineFileName, self._obsBatchLineFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._realExpSubSNPFileName , self._obsSubSNPFileName)) + + def test_runOneBatch_with_errors_in_refSeq(self): + self._writeInputFileWithSeqErrorsInRefSeq() + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "Batch1", "methyltransferase") + self.assertRaises(Exception, 
multifasta2SNPFile.runOneBatch, self._inFileName, self._obsSubSNPFileName) + + def test_runOneBatch_with_errors_in_lineSeq(self): + self._writeInputFileWithSeqErrorsInOneLineSeq() + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "Batch1", "methyltransferase") + self.assertRaises(Exception, multifasta2SNPFile.runOneBatch, self._inFileName, self._obsSubSNPFileName) + + def test_runOneBatch_with_a_several_lineSeq(self): + self._writeInputFileWithASeveralLineSeq() + self._writeExpSubSNPFileSeveralLineSeq() + self._writeExpAlleleFile() + self._writeExpIndividualFile() + self._writeExpSequenceFileSeveralLineSeq() + self._writeExpBatchFile() + self._writeExpBatchLineFile() + + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "Batch1", "methyltransferase") + multifasta2SNPFile.runOneBatch(self._inFileName) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSubSNPFileName, self._obsSubSNPFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expIndividualFileName, self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceFSAFileName, self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchLineFileName)) + 
self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchLineFileName, self._obsBatchLineFileName)) + + def test_runOneBatch_with_2_seqs_with_the_same_name(self): + self._writeInputFileWith2SeqsWithTheSameName() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + isSysExitRaised = False + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + + try: + multifasta2SNPFile.runOneBatch(self._inFileName) + except SystemExit: + isSysExitRaised = True + + self.assertTrue(isSysExitRaised) + + def test_runOneBatch_with_indels_and_snps(self): + self._writeInputFileWithSnpsAndIndels() + self._writeExpSubSNPFileWithSnpsAndIndels() + self._writeExpAlleleFileWithSnpsAndIndels() + self._writeExpIndividualFile() + self._writeExpSequenceFileWithDeletion() + self._writeExpBatchFile() + self._writeExpBatchLineFile() + + batchName = "Batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + multifasta2SNPFile.runOneBatch(self._inFileName) + + self.assertTrue(FileUtils.isRessourceExists(self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expIndividualFileName, self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceFSAFileName, self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsBatchLineFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchLineFileName, self._obsBatchLineFileName)) + + 
self.assertTrue(FileUtils.isRessourceExists(self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSubSNPFileName, self._obsSubSNPFileName)) + + def test_runOneBatchWithPotentialDooblons(self): + self._writeInputFileBatchWithPotentialDooblons() + + batchName = "Batch_AU247387" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + multifasta2SNPFile.runOneBatch(self._inFileName) + self.assertTrue(FileUtils.isRessourceExists(self._obsSubSNPFileName)) + + expSubSNPFile = "data/ExpPotDooblonsSubSNP.csv" + + Utils4Test.removeOneSpecifiedColumn(expSubSNPFile, ";", 8) + Utils4Test.removeOneSpecifiedColumn(self._obsSubSNPFileName, ";", 8) + + Utils4Test.removeOneSpecifiedColumn(expSubSNPFile + "_filtered", ";", 9) + Utils4Test.removeOneSpecifiedColumn(self._obsSubSNPFileName + "_filtered", ";", 9) + + Utils4Test.removeOneSpecifiedColumn(expSubSNPFile + "_filtered_filtered", ";", 13) + Utils4Test.removeOneSpecifiedColumn(self._obsSubSNPFileName + "_filtered_filtered", ";", 13) + + comparableExpSubSNPFile = expSubSNPFile + "_filtered_filtered_filtered" + comparableObsSubSNPFile = self._obsSubSNPFileName + "_filtered_filtered_filtered" + + self.assertTrue(FileUtils.isRessourceExists(comparableExpSubSNPFile)) + self.assertTrue(FileUtils.isRessourceExists(comparableObsSubSNPFile)) + self.assertTrue(FileUtils.are2FilesIdentical(comparableExpSubSNPFile, comparableObsSubSNPFile)) + + if(self._fileUtils.isRessourceExists(self._obsSubSNPFileName + "_filtered")): + os.remove(self._obsSubSNPFileName + "_filtered") + if(self._fileUtils.isRessourceExists(expSubSNPFile + "_filtered")): + os.remove(expSubSNPFile + "_filtered") + + if(self._fileUtils.isRessourceExists(self._obsSubSNPFileName + "_filtered_filtered")): + 
os.remove(self._obsSubSNPFileName + "_filtered_filtered") + if(self._fileUtils.isRessourceExists(expSubSNPFile + "_filtered_filtered")): + os.remove(expSubSNPFile + "_filtered_filtered") + + if self._fileUtils.isRessourceExists(comparableExpSubSNPFile): + os.remove(comparableExpSubSNPFile) + if self._fileUtils.isRessourceExists(comparableObsSubSNPFile): + os.remove(comparableObsSubSNPFile) + + def test_runSeveralBatches(self): + self._writeInputFileSeveralBatches() + self._writeExpSubSNPFileSeveralBatches() + self._writeExpAlleleFileSeveralBatches() + self._writeExpIndividualFile() + self._writeExpSequenceSeveralBatches() + self._writeExpBatchFileSeveralBatches() + self._writeExpBatchLineFileSeveralBatches() + + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana") + multifasta2SNPFile.runSeveralBatches(self._inputDirSeveralBatches) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._inputDirSeveralBatches + "/" + self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" +self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expIndividualFileName, self._inputDirSeveralBatches + "/" + self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceFSAFileName, self._inputDirSeveralBatches + "/" + self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._inputDirSeveralBatches + "/" + self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsBatchFileName)) + 
self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._inputDirSeveralBatches + "/" + self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsBatchLineFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchLineFileName, self._inputDirSeveralBatches + "/" + self._obsBatchLineFileName)) + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSubSNPFileName, self._inputDirSeveralBatches + "/" + self._obsSubSNPFileName)) + + def test_runSeveralBatches_different_lines_between_files(self): + self._writeInputFileSeveralBatches_different_lines_between_files() + self._writeExpSubSNPFileSeveralBatches_different_lines_between_files() + self._writeExpAlleleFileSeveralBatches() + self._writeExpIndividualFile_different_lines_between_files() + self._writeExpSequenceSeveralBatches() + self._writeExpBatchFileSeveralBatches() + self._writeExpBatchLineFileSeveralBatches_different_lines_between_files() + + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana") + multifasta2SNPFile.runSeveralBatches(self._inputDirSeveralBatches) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsAlleleFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expAlleleFileName, self._inputDirSeveralBatches + "/" + self._obsAlleleFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" +self._obsIndividualFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expIndividualFileName, self._inputDirSeveralBatches + "/" + self._obsIndividualFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSequenceFSAFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceFSAFileName, self._inputDirSeveralBatches + "/" + 
self._obsSequenceFSAFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSequenceCSVFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSequenceCSVFileName, self._inputDirSeveralBatches + "/" + self._obsSequenceCSVFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsBatchFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchFileName, self._inputDirSeveralBatches + "/" + self._obsBatchFileName)) + + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsBatchLineFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expBatchLineFileName, self._inputDirSeveralBatches + "/" + self._obsBatchLineFileName)) + self.assertTrue(FileUtils.isRessourceExists(self._inputDirSeveralBatches + "/" + self._obsSubSNPFileName)) + self.assertTrue(FileUtils.are2FilesIdentical(self._expSubSNPFileName, self._inputDirSeveralBatches + "/" + self._obsSubSNPFileName)) + + def test_runSeveralBatches_different_lines_and_same_refseq_between_files(self): + self._writeInputFileSeveralBatches_different_lines_and_same_refseq_between_files() + self._writeExpSubSNPFileSeveralBatches_different_lines_between_files() + self._writeExpAlleleFileSeveralBatches() + self._writeExpIndividualFile_different_lines_between_files() + self._writeExpSequenceSeveralBatchesForSameRefSeq() + self._writeExpBatchFileSeveralBatchesForSameRefSeq() + self._writeExpBatchLineFileSeveralBatches_different_lines_between_files() + + multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana") + try: + multifasta2SNPFile.runSeveralBatches(self._inputDirSeveralBatches) + except Exception, e : + self.assertRaises(Exception, e) + + def test_detectSNPAndIndels(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, 
batchName, gene) + refBioseq.sequence = "ATTCGCGTATGCGTATGCTT" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "ATCCGCGTATGCGTATGATT" ) + bs2 = Bioseq( "line2", "ATTCGTGTATGCGTATGGTT" ) + + alignedBioseqDB.setData( [ bs1, bs2 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + dExpAllele = {'C': 1, 'A': 2, 'T': 3, 'G': 4 } + lExpSNP = [{'subSNPName': batchName + "_SNP_3_line1", 'position': 3, 'lineName': 1, 'allele': 1, '5flank': "AT", '3flank': "CGCGTATGCGTATGATT", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_3_line2", 'position': 3, 'lineName': 2, 'allele': 3, '5flank': "AT", '3flank': "CGTGTATGCGTATGGTT", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line2", 'position': 6, 'lineName': 2, 'allele': 3, '5flank': "ATTCG", '3flank': "GTATGCGTATGGTT", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line1", 'position': 6, 'lineName': 1, 'allele': 1, '5flank': "ATCCG", '3flank': "GTATGCGTATGATT",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_18_line1", 'position': 18, 'lineName': 1, 'allele': 2, '5flank': "ATCCGCGTATGCGTATG", '3flank': "TT", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_18_line2", 'position': 18, 'lineName': 2, 'allele': 4, '5flank': "ATTCGTGTATGCGTATG", '3flank': "TT", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 
'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"},] + + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(dExpAllele, multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_detectSNPAndIndels_no_polym(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "ATTCGCGTATGCGTATGCTT" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "ATTCGCGTATGCGTATGCTT" ) + bs2 = Bioseq( "line2", "ATTCGCGTATGCGTATGCTT" ) + + alignedBioseqDB.setData( [ bs1, bs2 ] ) + + instance = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + + multifasta2SNPFile.detectSNPsAndIndels(instance) + + lExpSNP = [] + + self.assertEquals(lExpSNP, multifasta2SNPFile._lSubSNPFileResults) + + def test_detectSNPAndIndels_with_only_dels(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "ATTACCGAA" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "A--ACCGAA" ) + bs2 = Bioseq( "line2", "---ACCGAA" ) + + alignedBioseqDB.setData( [ bs1, bs2 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + 
dExpAllele = {'A--': 1, '---': 2} + lExpSNP = [{'subSNPName': batchName + "_DEL_1_line2", 'position': 1, 'lineName': 2, 'allele': 2, '5flank': "", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 3}, + {'subSNPName': batchName + "_DEL_1_line1", 'position': 1, 'lineName': 1, 'allele': 1, '5flank': "", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 3}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"}] + + self.assertEquals(dExpAllele, multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_detectSNPAndIndels_with_dels_and_snps(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "ATTACCGAA" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "A--ACCGAA" ) + bs2 = Bioseq( "line2", "---ACCGAA" ) + bs3 = Bioseq( "line3", "ATTACCGGA" ) + bs4 = Bioseq( "line4", "----CCGAA" ) + + alignedBioseqDB.setData( [ bs1, bs2, bs3, bs4 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + dExpAllele = {'G': 1, 'A--A': 2, '---A': 3, '----': 4, 'ATTA': 5, 'A': 6} + lExpSNP = [{'subSNPName': batchName + "_DEL_1_line2", 
'position': 1, 'lineName': 2, 'allele': 3, '5flank': "", '3flank': "CCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line1", 'position': 1, 'lineName': 1, 'allele': 2, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_SNP_8_line3", 'position': 8, 'lineName': 3, 'allele': 1, '5flank': "ATTACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line1", 'position': 8, 'lineName': 1, 'allele': 6, '5flank': "A--ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line2", 'position': 8, 'lineName': 2, 'allele': 6, '5flank': "---ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line4", 'position': 8, 'lineName': 4, 'allele': 6, '5flank': "----CCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_DEL_1_line4", 'position': 1, 'lineName': 4, 'allele': 4, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line3", 'position': 1, 'lineName': 3, 'allele': 5, '5flank': "", '3flank': "CCGGA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 3, 'individualName': "line3", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 4, 'individualName': "line4", 'scientificName': "Arabidopsis thaliana"}] + + self.assertEquals(dExpAllele, 
multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_detectSNPAndIndels_with_only_inserts(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "A--ACCGAA" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "A--ACCGAA" ) + bs2 = Bioseq( "line2", "AG-ACCGAA" ) + bs3 = Bioseq( "line3", "ATTACCGAA" ) + + alignedBioseqDB.setData( [ bs1, bs2, bs3 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + dExpAllele = {'G-': 1, 'TT': 2, '--': 3} + lExpSNP = [{'subSNPName': batchName + "_INS_1_line2", 'position': 1, 'lineName': 2, 'allele': 1, '5flank': "A", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line3", 'position': 1, 'lineName': 3, 'allele': 2, '5flank': "A", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line1", 'position': 1, 'lineName': 1, 'allele': 3, '5flank': "A", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"}, + 
{'individualNumber': 3, 'individualName': "line3", 'scientificName': "Arabidopsis thaliana"}] + + self.assertEquals(dExpAllele, multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_detectSNPAndIndels_with_snps_and_inserts(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "A--ACCGAA" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "A--ACCGAA" ) + bs2 = Bioseq( "line2", "AG-ACCGAA" ) + bs3 = Bioseq( "line3", "ATTACCGCA" ) + + alignedBioseqDB.setData( [ bs1, bs2, bs3 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + dExpAllele = {'C': 1, 'G-': 2, 'TT': 3, '--': 4, 'A' : 5} + lExpSNP = [{'subSNPName': batchName + "_SNP_6_line3", 'position': 6, 'lineName': 3, 'allele': 1, '5flank': "ATTACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line1", 'position': 6, 'lineName': 1, 'allele': 5, '5flank': "A--ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line2", 'position': 6, 'lineName': 2, 'allele': 5, '5flank': "AG-ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_INS_1_line2", 'position': 1, 'lineName': 2, 
'allele': 2, '5flank': "A", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line3", 'position': 1, 'lineName': 3, 'allele': 3, '5flank': "A", '3flank': "ACCGCA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line1", 'position': 1, 'lineName': 1, 'allele': 4, '5flank': "A", '3flank': "ACCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 3, 'individualName': "line3", 'scientificName': "Arabidopsis thaliana"}] + + self.assertEquals(dExpAllele, multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_detectSNPAndIndels_with_snps_inserts_and_dels(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + refBioseq.sequence = "A--ACCGAATATAC" + refBioseq.header = "reference" + + bs1 = Bioseq( "line1", "A--ACCGAATATAC" ) + bs2 = Bioseq( "line2", "AG-ACCGAAT--AC" ) + bs3 = Bioseq( "line3", "ATTACCGCA-----" ) + + alignedBioseqDB.setData( [ bs1, bs2, bs3 ] ) + + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + multifasta2SNPFile._dBatchResults = {'BatchNumber': 1, 'BatchName': "Batch1", 'GeneName': "methyltransferase", 'RefSeqName': "Sequence_de_Reference"} + 
multifasta2SNPFile.detectSNPsAndIndels(multifasta2SNPFile._wrapper) + + dExpAllele = {'C': 1, 'G-': 2, 'T--AC': 3, 'TT': 4, '-----': 5, '--': 6, 'TATAC': 7, 'A': 8} + lExpSNP = [{'subSNPName': batchName + "_SNP_6_line3", 'position': 6, 'lineName': 3, 'allele': 1, '5flank': "ATTACCG", '3flank': "A-----", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line1", 'position': 6, 'lineName': 1, 'allele': 8, '5flank': "A--ACCG", '3flank': "ATATAC", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_6_line2", 'position': 6, 'lineName': 2, 'allele': 8, '5flank': "AG-ACCG", '3flank': "AT--AC", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + + {'subSNPName': batchName + "_INS_1_line2", 'position': 1, 'lineName': 2, 'allele': 2, '5flank': "A", '3flank': "ACCGAAT--AC", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line3", 'position': 1, 'lineName': 3, 'allele': 4, '5flank': "A", '3flank': "ACCGCA-----", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + {'subSNPName': batchName + "_INS_1_line1", 'position': 1, 'lineName': 1, 'allele': 6, '5flank': "A", '3flank': "ACCGAATATAC", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "INSERTION", 'length': 2}, + + {'subSNPName': batchName + "_DEL_8_line2", 'position': 8, 'lineName': 2, 'allele': 3, '5flank': "AG-ACCGAA", '3flank': "", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 5}, + {'subSNPName': batchName + "_DEL_8_line3", 'position': 8, 'lineName': 3, 'allele': 5, '5flank': "ATTACCGCA", '3flank': "", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 5}, + {'subSNPName': batchName + "_DEL_8_line1", 'position': 8, 'lineName': 1, 'allele': 7, '5flank': "A--ACCGAA", '3flank': "", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : 
"DELETION", 'length': 5}] + lExpIndividual = [{'individualNumber': 1, 'individualName': "line1", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 2, 'individualName': "line2", 'scientificName': "Arabidopsis thaliana"}, + {'individualNumber': 3, 'individualName': "line3", 'scientificName': "Arabidopsis thaliana"}] + + self.assertEquals(dExpAllele, multifasta2SNPFile._dAlleleFileResults) + self.assertEquals(multifasta2SNPFile._sortSubSNPResultByBatchPositionAndLineName(lExpSNP), multifasta2SNPFile._lSubSNPFileResults) + self.assertEquals(lExpIndividual, multifasta2SNPFile._lIndividualFileResults) + + def test_createWrapperFromFile_with_upcase_and_lowcase_nucleotide(self): + self._writeInputFileWithUpcaseAndLowcaseNucleotide() + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + + expLineBioseqDB = BioseqDB() + expRefBioseq = Bioseq("Sequence_de_Reference",\ + "CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA") + iBioSeq = Bioseq("Line1","CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA") + expLineBioseqDB.add ( iBioSeq ) + iBioSeq = Bioseq("Line2","CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATACGCAGTAGCCAAACCTCCACAATA") + expLineBioseqDB.add ( iBioSeq ) + + expBioseqDBWrapper = ReferenceBioseqAndLinesBioseqDBWrapper (expRefBioseq, expLineBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + + obsBioseqDBWrapper = multifasta2SNPFile.createWrapperFromFile(self._inFileName) + + self.assertEquals(obsBioseqDBWrapper._iReferenceBioseq, expBioseqDBWrapper._iReferenceBioseq) + self.assertEquals(obsBioseqDBWrapper._iLinesBioseqDB, expBioseqDBWrapper._iLinesBioseqDB) + + def test_checkHeaderAlphabet(self): + # header ALPHABET [^a-zA-Z0-9_-:] + batchName = "batch1" + taxon = "Arabidopsis thaliana" + gene = "methyltransferase" + multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene) + 
# Remainder of test_checkHeaderAlphabet (its `def` and the construction of
# `multifasta2SNPFile` precede this chunk).
strToBeCheck="abcdefghijklmnopqrstuvwxyz0912834567_:-"
self.assertTrue ( multifasta2SNPFile.checkHeaderAlphabet(strToBeCheck))
strToBeCheck="ABCDEFGHIJKLMNOPQRSTUVWXYZ0912834567_:-"
self.assertTrue ( multifasta2SNPFile.checkHeaderAlphabet(strToBeCheck))


def test_checkHeaderAlphabet_empty_string(self):
    """An empty header is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.checkHeaderAlphabet(""))


def test_checkHeaderAlphabet_space(self):
    """A header made of a single blank is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.checkHeaderAlphabet(" "))


def test_checkHeaderAlphabet_non_aphabetical(self):
    """A header containing '}' is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.checkHeaderAlphabet("}"))


def test_isDNA_bases(self):
    """A string made only of A, C, G and T is accepted."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    strToBeCheck = "TGTGGCTTCTAGTTGATCAGTTTATGATCACAATGATTTCACGTAGGTGTCTCGTGGCTCCGACTAATCAACAATATAATGCGAGTAGAGCTTGA"
    self.assertTrue(multifasta2SNPFile.isDNA_bases(strToBeCheck))


def test_isDNA_bases_non_DNA_letter(self):
    """A sequence containing 'X' is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.isDNA_bases("XTAGTTGATCA"))


def test_isDNA_bases_carriage_return(self):
    """A sequence containing a newline is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.isDNA_bases("TA\nGTTGATCA"))


def test_isDNA_bases_empty_string(self):
    """An empty sequence is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.isDNA_bases(""))


def test_isDNA_bases_space(self):
    """A sequence made of a single blank is rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.isDNA_bases(" "))


def test_isDNA_bases_IUPAC_letter_but_non_DNA_bases(self):
    """The letters U, M, W, S and B are rejected."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertFalse(multifasta2SNPFile.isDNA_bases("UMWSB"))


def test_getLineAsAHeader(self):
    """The leading '>' is dropped and blanks become underscores."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    obsHeader = multifasta2SNPFile.getLineAsAHeader(">test on good header")
    self.assertEqual(obsHeader, "test_on_good_header")


def test_getLineAsAHeader_warning_bad_header_tag_omitted(self):
    """A header line without the leading '>' must raise."""
    # The former try/except could never fail: it passed when nothing was raised,
    # and otherwise handed the exception *instance* to assertRaises as the callable.
    # NOTE(review): this assumes getLineAsAHeader raises on a missing '>' - confirm
    # against Multifasta2SNPFile.
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertRaises(Exception, multifasta2SNPFile.getLineAsAHeader,
                      "test on bad header with tag omitted")


def test_getLineAsAHeader_warning_repeated_blanks_removed(self):
    """Blanks and tabs in the header are replaced by single underscores."""
    # NOTE(review): the test name implies runs of repeated blanks in the input;
    # verify this literal against the repository copy of the file.
    lineToBeCheck = ">test on header \twith warning"
    expHeader = "test_on_header_with_warning"
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    obsHeader = multifasta2SNPFile.getLineAsAHeader(lineToBeCheck)
    self.assertEqual(obsHeader, expHeader)
    # The former trailing assertRaises(Exception, getLineAsAHeader(line), ...) was
    # removed: it evaluated the call eagerly (which returns normally, as asserted
    # above) and only "passed" because the returned string is not callable.
    # TODO: check the warning written to the log file instead.


def test_getLineAsAHeader_fatal_error_bad_header(self):
    """A header containing a backslash must raise."""
    # NOTE(review): assumes a backslash is outside the accepted header alphabet
    # and makes getLineAsAHeader raise - confirm against Multifasta2SNPFile.
    # The backslash is escaped explicitly; the runtime value is unchanged.
    lineToBeCheck = ">test\\on bad header with fatal error"
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertRaises(Exception, multifasta2SNPFile.getLineAsAHeader, lineToBeCheck)


def test_isHeaderInRefSeqList(self):
    """Looking up a header already present in the reference list must raise."""
    # NOTE(review): the original wrapped the call in a try/except that could not
    # fail; this assumes isHeaderInRefSeqList raises for a known header - confirm.
    bs1 = Bioseq("line1", "A--ACCGAATATAC")
    bs2 = Bioseq("line2", "AG-ACCGAAT--AC")
    bs3 = Bioseq("line3", "ATTACCGCA-----")

    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    multifasta2SNPFile._lRefSequences = [bs1, bs2, bs3]
    self.assertRaises(Exception, multifasta2SNPFile.isHeaderInRefSeqList, "line1")


def test_completeAlleleSetWithCurrentAllele_one_allele_added(self):
    """An unknown allele is added with the next free number."""
    dAlleleSetInInput = {"A": 1, "T": 2, "G": 3}
    dAlleleExpSet = {"A": 1, "T": 2, "G": 3, "C": 4}
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    dAlleleObsSet = multifasta2SNPFile._completeAlleleSetWithCurrentAllele(dAlleleSetInInput, "C")
    self.assertEqual(dAlleleObsSet, dAlleleExpSet)


# Head of the next test; its remaining statements follow this chunk, so the
# local names are kept as they are.
def test_completeAlleleSetWithCurrentAllele_no_allele_added(self):
    dAlleleSetInInput = {"A" : 1,
                         "T" : 2,
                         "G" : 3}
    alleleToAdd = "T"
# Remainder of test_completeAlleleSetWithCurrentAllele_no_allele_added (its `def`,
# `dAlleleSetInInput` and `alleleToAdd` precede this chunk).
dAlleleExpSet = {"A" : 1,
                 "T" : 2,
                 "G" : 3}
batchName = "batch1"
taxon = "Arabidopsis thaliana"
gene = "methyltransferase"
multifasta2SNPFile = Multifasta2SNPFile(taxon, batchName, gene)
dAlleleObsSet = multifasta2SNPFile._completeAlleleSetWithCurrentAllele(dAlleleSetInInput, alleleToAdd)
self.assertEqual(dAlleleObsSet, dAlleleExpSet)


def test_completeAlleleSetWithCurrentAllele_with_an_empty_allele_set(self):
    """The first allele added to an empty set gets number 1."""
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    dAlleleObsSet = multifasta2SNPFile._completeAlleleSetWithCurrentAllele({}, "T")
    self.assertEqual(dAlleleObsSet, {"T": 1})


def test_completeBatchLineListWithCurrentIndividual(self):
    """The batch line of a known individual is appended to the existing entries."""
    #TODO: this test only pass with a batchNumber of 1
    iCurrentBatchNumber = 1
    lBatchLineResults = [{'IndividualNumber': "1", 'BatchNumber': iCurrentBatchNumber},
                         {'IndividualNumber': "2", 'BatchNumber': iCurrentBatchNumber}]
    lIndividualResults = [{'individualNumber': 1, 'individualName': "Individual1", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 2, 'individualName': "Individual2", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 3, 'individualName': "Individual3", 'scientificName': "Arabidopsis thaliana"}]
    lExpBatchLineResults = [{'IndividualNumber': "1", 'BatchNumber': iCurrentBatchNumber},
                            {'IndividualNumber': "2", 'BatchNumber': iCurrentBatchNumber},
                            {'IndividualNumber': "3", 'BatchNumber': iCurrentBatchNumber}]
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    lBatchLineResults = multifasta2SNPFile._completeBatchLineListWithCurrentIndividual(lBatchLineResults, lIndividualResults, "Individual3")
    self.assertEqual(lBatchLineResults, lExpBatchLineResults)


def test_completeBatchLineListWithCurrentIndividual_no_entries_in_batchline_results_in_input(self):
    """Starting from an empty batch-line list yields a single entry."""
    lIndividualResults = [{'individualNumber': 1, 'individualName': "Individual1", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 2, 'individualName': "Individual2", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 3, 'individualName': "Individual3", 'scientificName': "Arabidopsis thaliana"}]
    lExpBatchLineResults = [{'IndividualNumber': "2", 'BatchNumber': 1}]
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    lBatchLineResults = multifasta2SNPFile._completeBatchLineListWithCurrentIndividual([], lIndividualResults, "Individual2")
    self.assertEqual(lBatchLineResults, lExpBatchLineResults)


# NOTE(review): the three tests below used `try: ... except Exception, e:
# self.assertRaises(Exception, e)`, which passes whether or not anything is
# raised. They now require the exception; confirm that
# _completeBatchLineListWithCurrentIndividual raises in each situation.

def test_completeBatchLineListWithCurrentIndividual_no_individual_in_individualList(self):
    """An empty individual list must raise."""
    lBatchLineResults = [{'IndividualNumber': "1", 'BatchNumber': 1},
                         {'IndividualNumber': "2", 'BatchNumber': 1}]
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertRaises(Exception,
                      multifasta2SNPFile._completeBatchLineListWithCurrentIndividual,
                      lBatchLineResults, [], "Individual3")


def test_completeBatchLineListWithCurrentIndividual_individual_added_has_no_individual_number(self):
    """An individual record without 'individualNumber' must raise."""
    lBatchLineResults = [{'IndividualNumber': "1", 'BatchNumber': "1"},
                         {'IndividualNumber': "2", 'BatchNumber': "1"}]
    lIndividualResults = [{'individualNumber': 1, 'individualName': "Individual1", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 2, 'individualName': "Individual2", 'scientificName': "Arabidopsis thaliana"},
                          {'individualName': "Individual3", 'scientificName': "Arabidopsis thaliana"}]
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertRaises(Exception,
                      multifasta2SNPFile._completeBatchLineListWithCurrentIndividual,
                      lBatchLineResults, lIndividualResults, "Individual3")


def test_completeBatchLineListWithCurrentIndividual_individual_not_present_in_individualList(self):
    """A line name absent from the individual list must raise."""
    lBatchLineResults = [{'IndividualNumber': "1", 'BatchNumber': "1"},
                         {'IndividualNumber': "2", 'BatchNumber': "1"}]
    lIndividualResults = [{'individualNumber': 1, 'individualName': "Individual1", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 2, 'individualName': "Individual2", 'scientificName': "Arabidopsis thaliana"},
                          {'individualNumber': 3, 'individualName': "Individual3", 'scientificName': "Arabidopsis thaliana"}]
    multifasta2SNPFile = Multifasta2SNPFile("Arabidopsis thaliana", "batch1", "methyltransferase")
    self.assertRaises(Exception,
                      multifasta2SNPFile._completeBatchLineListWithCurrentIndividual,
                      lBatchLineResults, lIndividualResults, "Michael Corleone")


# Head of test_findASubSNPInAListWithHisName; its remaining statements follow
# this chunk, so the local names are kept as they are.
def test_findASubSNPInAListWithHisName(self):
    lSubSNPList = [{'subSNPName': "SubSNP_batch1_1_line2", 'position': 1, 'lineName': 2, 'allele': 2, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION"},
                   {'subSNPName': "SubSNP_batch1_2_line1", 'position': 1, 'lineName': 1, 'allele': 1, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION"},
                   {'subSNPName': "SubSNP_batch1_6_line1", 'position': 6, 'lineName': 1, 'allele': 3, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP"}]
    name = "SubSNP_batch1_2_line1"
# Remainder of test_findASubSNPInAListWithHisName (its `def`, `lSubSNPList` and
# `name` precede this chunk).
dExpSubSNP = {'subSNPName': "SubSNP_batch1_2_line1", 'position': 1, 'lineName': 1, 'allele': 1, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION"}
expIndice = 1

multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")

dObsSubSNP, obsIndice = multifasta2SNPFile.findASubSNPInAListWithHisName(name, lSubSNPList)

self.assertEqual(expIndice, obsIndice)
self.assertEqual(dExpSubSNP, dObsSubSNP)


def _buildAlignmentWrapper(self, iConverter, refSequence, lLineSequences):
    """Wrap an in-memory alignment for iConverter.

    The reference gets the header "reference"; the line sequences get the
    headers "line1", "line2", ... in the order given.
    """
    iRefBioseq = Bioseq()
    iRefBioseq.sequence = refSequence
    iRefBioseq.header = "reference"
    iLinesBioseqDB = BioseqDB()
    iLinesBioseqDB.setData([Bioseq("line%d" % rank, sequence)
                            for rank, sequence in enumerate(lLineSequences, 1)])
    return ReferenceBioseqAndLinesBioseqDBWrapper(iRefBioseq, iLinesBioseqDB, iConverter._logFile, self._inFileName)


def test_findASubSNPInAListWithHisName_SubSNP_not_found(self):
    """Searching for an unknown sub-SNP name must raise."""
    # NOTE(review): the original try/except passed whether or not anything was
    # raised; confirm that findASubSNPInAListWithHisName raises when the name is absent.
    lSubSNPList = [{'subSNPName': "SubSNP_batch1_1_line2", 'position': 1, 'lineName': 2, 'allele': 2, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION"},
                   {'subSNPName': "SubSNP_batch1_2_line1", 'position': 1, 'lineName': 1, 'allele': 1, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION"},
                   {'subSNPName': "SubSNP_batch1_6_line1", 'position': 6, 'lineName': 1, 'allele': 3, 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP"}]
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    self.assertRaises(Exception, multifasta2SNPFile.findASubSNPInAListWithHisName,
                      "SubSNP_fake", lSubSNPList)


def test_clusteriseIndels(self):
    """Two overlapping indels end up with the same merged bounds."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    lObsIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                      {'name': "indel2", 'start': 12, 'end': 15},
                      {'name': "indel3", 'start': 5, 'end': 10}]
    dIndel = {'start': 1, 'end': 6}

    lObsIndelsList = multifasta2SNPFile.clusteriseIndels(dIndel, lObsIndelsList)
    lexpIndelsList = [{'name': "indel1", 'start': 1, 'end': 10},
                      {'name': "indel2", 'start': 12, 'end': 15},
                      {'name': "indel3", 'start': 1, 'end': 10}]

    self.assertEqual(lexpIndelsList, lObsIndelsList)


def test_clusteriseIndels_no_overlap(self):
    """Without any overlap the indel list is returned unchanged."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    lObsIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                      {'name': "indel2", 'start': 12, 'end': 15},
                      {'name': "indel3", 'start': 25, 'end': 30}]
    dIndel = {'start': 1, 'end': 6}

    lObsIndelsList = multifasta2SNPFile.clusteriseIndels(dIndel, lObsIndelsList)
    lexpIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                      {'name': "indel2", 'start': 12, 'end': 15},
                      {'name': "indel3", 'start': 25, 'end': 30}]

    self.assertEqual(lexpIndelsList, lObsIndelsList)


def test_clusteriseIndels_many_overlaps_complicated(self):
    """A chain of overlaps gives every indel the bounds 1..40."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    lObsIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                      {'name': "indel2", 'start': 12, 'end': 15},
                      {'name': "indel3", 'start': 5, 'end': 10},
                      {'name': "indel4", 'start': 9, 'end': 40}]
    dIndel = {'start': 5, 'end': 10}

    lObsIndelsList = multifasta2SNPFile.clusteriseIndels(dIndel, lObsIndelsList)
    lexpIndelsList = [{'name': "indel1", 'start': 1, 'end': 40},
                      {'name': "indel2", 'start': 1, 'end': 40},
                      {'name': "indel3", 'start': 1, 'end': 40},
                      {'name': "indel4", 'start': 1, 'end': 40}]

    self.assertEqual(lexpIndelsList, lObsIndelsList)


def test_updateBoundsForAnIndelInAnIndelList(self):
    """Only the indel with the matching name receives the new bounds."""
    lIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                   {'name': "indel2", 'start': 12, 'end': 15},
                   {'name': "indel3", 'start': 5, 'end': 10},
                   {'name': "indel4", 'start': 9, 'end': 40}]
    dIndelWithNewBounds = {'name': "indel2", 'start': 7, 'end': 19}
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    lObsNewIndelsList = multifasta2SNPFile.updateBoundsForAnIndelInAnIndelList(lIndelsList, dIndelWithNewBounds)
    lExpNewIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                         {'name': "indel2", 'start': 7, 'end': 19},
                         {'name': "indel3", 'start': 5, 'end': 10},
                         {'name': "indel4", 'start': 9, 'end': 40}]
    self.assertEqual(lExpNewIndelsList, lObsNewIndelsList)


def test_updateBoundsForAnIndelInAnIndelList_no_update_to_do(self):
    """Passing the bounds an indel already has leaves the list unchanged."""
    lIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                   {'name': "indel2", 'start': 12, 'end': 15},
                   {'name': "indel3", 'start': 5, 'end': 10},
                   {'name': "indel4", 'start': 9, 'end': 40}]
    dIndelWithNewBounds = {'name': "indel2", 'start': 12, 'end': 15}
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    lObsNewIndelsList = multifasta2SNPFile.updateBoundsForAnIndelInAnIndelList(lIndelsList, dIndelWithNewBounds)
    lExpNewIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                         {'name': "indel2", 'start': 12, 'end': 15},
                         {'name': "indel3", 'start': 5, 'end': 10},
                         {'name': "indel4", 'start': 9, 'end': 40}]
    self.assertEqual(lExpNewIndelsList, lObsNewIndelsList)


def test_updateBoundsForAnIndelInAnIndelList_indel_2_update_does_not_exist(self):
    """Updating an indel whose name is not in the list must raise."""
    # NOTE(review): the original try/except passed whether or not anything was
    # raised; confirm that updateBoundsForAnIndelInAnIndelList raises for an unknown name.
    lIndelsList = [{'name': "indel1", 'start': 1, 'end': 6},
                   {'name': "indel2", 'start': 12, 'end': 15},
                   {'name': "indel3", 'start': 5, 'end': 10},
                   {'name': "indel4", 'start': 9, 'end': 40}]
    dIndelWithNewBounds = {'name': "DeNiro", 'start': 12, 'end': 15}
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    self.assertRaises(Exception, multifasta2SNPFile.updateBoundsForAnIndelInAnIndelList,
                      lIndelsList, dIndelWithNewBounds)


def test_mergeBoundsFor2Indels(self):
    """Two overlapping indels both receive the union of their bounds."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    dIndel1 = {'start': 1, 'end': 4}
    dIndel2 = {'start': 2, 'end': 15}
    dIndel1, dIndel2 = multifasta2SNPFile.mergeBoundsForTwoOverlappingIndels(dIndel1, dIndel2)
    self.assertEqual({'start': 1, 'end': 15}, dIndel1)
    self.assertEqual({'start': 1, 'end': 15}, dIndel2)


def test_mergeBoundsFor2Indels_no_overlap(self):
    """Indels that do not overlap keep their own bounds."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    dIndel1 = {'start': 1, 'end': 4}
    dIndel2 = {'start': 5, 'end': 15}
    dIndel1, dIndel2 = multifasta2SNPFile.mergeBoundsForTwoOverlappingIndels(dIndel1, dIndel2)
    self.assertEqual({'start': 1, 'end': 4}, dIndel1)
    self.assertEqual({'start': 5, 'end': 15}, dIndel2)


def test_getUngappedPositionInRefSeq(self):
    """Aligned positions 1, 5 and 10 map to 1, 3 and 7 on a gapped reference."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    multifasta2SNPFile._wrapper = self._buildAlignmentWrapper(multifasta2SNPFile, "A--TTACC-GAA",
                                                              ["AACTTTCCAGAA", "AACTTACC-GAA"])

    obsUngappedPositionFor1 = multifasta2SNPFile.getUngappedPositionInRefSeq(1)
    obsUngappedPositionFor5 = multifasta2SNPFile.getUngappedPositionInRefSeq(5)
    obsUngappedPositionFor10 = multifasta2SNPFile.getUngappedPositionInRefSeq(10)

    self.assertEqual(1, obsUngappedPositionFor1)
    self.assertEqual(3, obsUngappedPositionFor5)
    self.assertEqual(7, obsUngappedPositionFor10)


def test_getUngappedPositionInRefSeq_no_gap(self):
    """With a gap-free reference the positions are returned unchanged."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    multifasta2SNPFile._wrapper = self._buildAlignmentWrapper(multifasta2SNPFile, "AACTTACCAGAA",
                                                              ["AACTTTCCAGAA", "AACTTACC-GAA"])

    obsUngappedPositionFor1 = multifasta2SNPFile.getUngappedPositionInRefSeq(1)
    obsUngappedPositionFor5 = multifasta2SNPFile.getUngappedPositionInRefSeq(5)
    obsUngappedPositionFor10 = multifasta2SNPFile.getUngappedPositionInRefSeq(10)

    self.assertEqual(1, obsUngappedPositionFor1)
    self.assertEqual(5, obsUngappedPositionFor5)
    self.assertEqual(10, obsUngappedPositionFor10)


def _assertLengthMismatchIsRejected(self, line2Sequence):
    """Check that wrapping a line whose length differs from the reference raises.

    The reference is "AACTTACCAGAA", line1 has the same length and line2 is
    line2Sequence. The exception message is compared with the expected text.
    """
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    refBioseq = Bioseq()
    alignedBioseqDB = BioseqDB()
    refBioseq.sequence = "AACTTACCAGAA"
    refBioseq.header = "reference"
    bs1 = Bioseq("line1", "AACTTTCCAGAA")
    bs2 = Bioseq("line2", line2Sequence)
    alignedBioseqDB.setData([bs1, bs2])

    try:
        multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName)
    except Exception as e:
        # `message` is what the original compared; fall back to str(e) where
        # the attribute does not exist.
        obsMsg = getattr(e, "message", str(e))
    else:
        # Previously the test passed silently when nothing was raised.
        self.fail("no exception raised although %s differs in length from the reference" % bs2.header)

    expMsg = "File: " + self._inFileName + ", problem with the sequence " + bs2.header + ": its length is different from the reference seq! All the sequences must have the same length.\n"
    expMsg += "refseq length: " + str(len(refBioseq.sequence)) + "\n"
    expMsg += "seq length: " + str(len(bs2.sequence)) + "\n"
    self.assertEqual(expMsg, obsMsg)


def test_checkAllSeq_sequences_with_different_sizes_one_seq_longer(self):
    """A line longer than the reference is rejected with the expected message."""
    self._assertLengthMismatchIsRejected("AACTTACC-GAATTTC")


def test_checkAllSeq_sequences_with_different_sizes_one_seq_shorter(self):
    """A line shorter than the reference is rejected with the expected message."""
    self._assertLengthMismatchIsRejected("AACTTACC")


def test_getFlanksOfASubSNP(self):
    """Flanks of a 3-base polymorphism at position 3 of line1, maximum flank length 7."""
    multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse")
    multifasta2SNPFile._wrapper = self._buildAlignmentWrapper(multifasta2SNPFile, "AACTTACCAGAA",
                                                              ["AACTTTCCAGAA", "AACTTACC-GAA"])
    subsnpPosition = 3
    polymLength = 3
    lineName = "line1"

    obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, polymLength, 7)
    self.assertEqual("AA", obs5flank)
    self.assertEqual("TCCAGAA", obs3flank)
def test_getFlanksOfASubSNP_flank_truncated(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + refBioseq.sequence = "AACTTACCAGAA" + refBioseq.header = "reference" + bs1 = Bioseq( "line1", "AACTTTCCAGAA" ) + bs2 = Bioseq( "line2", "AACTTACC-GAA" ) + alignedBioseqDB.setData( [ bs1, bs2 ] ) + multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse") + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + subsnpPosition = 3 + polymLength = 3 + lineName = "line1" + exp5flank = "AA" + exp3flank = "TCCAGAA" + + obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, polymLength, 500) + self.assertEquals(exp5flank, obs5flank) + self.assertEquals(exp3flank, obs3flank) + + def test_getFlanksOfASubSNP_empty_seq(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + refBioseq.sequence = "" + refBioseq.header = "reference" + bs1 = Bioseq( "line1", "" ) + bs2 = Bioseq( "line2", "" ) + alignedBioseqDB.setData( [ bs1, bs2 ] ) + multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse") + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + subsnpPosition = 3 + polymLength = 3 + lineName = "line1" + exp5flank = "" + exp3flank = "" + + obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, polymLength, 500) + self.assertEquals(exp5flank, obs5flank) + self.assertEquals(exp3flank, obs3flank) + + def test_getFlanksOfASubSNP_flank_of_first_base(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + refBioseq.sequence = "AACTTACCAGAA" + refBioseq.header = "reference" + bs1 = Bioseq( "line1", "AACTTTCCAGAA" ) + bs2 = Bioseq( "line2", "AACTTACC-GAA" ) + alignedBioseqDB.setData( [ bs1, bs2 ] ) + multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse") + multifasta2SNPFile._wrapper = 
ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + subsnpPosition = 1 + polymLength = 1 + lineName = "line1" + exp5flank = "" + exp3flank = "ACTTTCCAGAA" + + obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, polymLength, 500) + self.assertEquals(exp5flank, obs5flank) + self.assertEquals(exp3flank, obs3flank) + + def test_getFlanksOfASubSNP_flank_of_first_base_with_polym_on_all_sequence(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + refBioseq.sequence = "AACTTACCAGAA" + refBioseq.header = "reference" + bs1 = Bioseq( "line1", "AACTTTCCAGAA" ) + bs2 = Bioseq( "line2", "AACTTACC-GAA" ) + alignedBioseqDB.setData( [ bs1, bs2 ] ) + multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse") + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + subsnpPosition = 1 + polymLength = 12 + lineName = "line1" + exp5flank = "" + exp3flank = "" + obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, polymLength, 500) + self.assertEquals(exp5flank, obs5flank) + self.assertEquals(exp3flank, obs3flank) + + def test_getFlanksOfASubSNP_flank_of_last_base_with_polym_on_all_sequence(self): + refBioseq = Bioseq() + alignedBioseqDB = BioseqDB() + refBioseq.sequence = "AACTTACCAGAA" + refBioseq.header = "reference" + bs1 = Bioseq( "line1", "AACTTTCCAGAA" ) + bs2 = Bioseq( "line2", "AACTTACC-GAA" ) + alignedBioseqDB.setData( [ bs1, bs2 ] ) + multifasta2SNPFile = Multifasta2SNPFile("batch1", "gene1", "mouse") + multifasta2SNPFile._wrapper = ReferenceBioseqAndLinesBioseqDBWrapper(refBioseq, alignedBioseqDB, multifasta2SNPFile._logFile, self._inFileName) + subsnpPosition = 12 + polymLength = 1 + lineName = "line1" + exp5flank = "AACTTTCCAGA" + exp3flank = "" + obs5flank, obs3flank = multifasta2SNPFile.getFlanksOfASubSNP(lineName, subsnpPosition, 
polymLength, 500) + self.assertEquals(exp5flank, obs5flank) + self.assertEquals(exp3flank, obs3flank) +# + def test_subSNPExistsInSubSNPList_subSNP_exists(self): + batchName = "batch1" + lSubSNP = [{'subSNPName': batchName + "_DEL_1_line2", 'position': 1, 'lineName': 2, 'allele': 3, '5flank': "", '3flank': "CCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line1", 'position': 1, 'lineName': 1, 'allele': 2, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_SNP_8_line3", 'position': 8, 'lineName': 3, 'allele': 1, '5flank': "ATTACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line1", 'position': 8, 'lineName': 1, 'allele': 6, '5flank': "A--ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line2", 'position': 8, 'lineName': 2, 'allele': 6, '5flank': "---ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line4", 'position': 8, 'lineName': 4, 'allele': 6, '5flank': "----CCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_DEL_1_line4", 'position': 1, 'lineName': 4, 'allele': 4, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line3", 'position': 1, 'lineName': 3, 'allele': 5, '5flank': "", '3flank': "CCGGA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}] + multifasta2SNPFile = Multifasta2SNPFile(batchName, "gene1", "mouse") + + dSearchedSubSNP = {'subSNPName': batchName + "_DEL_1_line1", 'position': 1, 'lineName': 1, 'allele': 2, '5flank': "", '3flank': 
"CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4} + + expResult = multifasta2SNPFile.subSNPExistsInSubSNPList(dSearchedSubSNP, lSubSNP) + obsResult = True + + self.assertEquals(expResult, obsResult) + + def test_subSNPExistsInSubSNPList_subSNP_does_not_exist(self): + batchName = "batch1" + lSubSNP = [{'subSNPName': batchName + "_DEL_1_line2", 'position': 1, 'lineName': 2, 'allele': 3, '5flank': "", '3flank': "CCGAA", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line1", 'position': 1, 'lineName': 1, 'allele': 2, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_SNP_8_line3", 'position': 8, 'lineName': 3, 'allele': 1, '5flank': "ATTACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line1", 'position': 8, 'lineName': 1, 'allele': 6, '5flank': "A--ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line2", 'position': 8, 'lineName': 2, 'allele': 6, '5flank': "---ACCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_SNP_8_line4", 'position': 8, 'lineName': 4, 'allele': 6, '5flank': "----CCG", '3flank': "A", 'batchNumber': 1, 'confidenceValue' : "A", 'type' : "SNP", 'length': 1}, + {'subSNPName': batchName + "_DEL_1_line4", 'position': 1, 'lineName': 4, 'allele': 4, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}, + {'subSNPName': batchName + "_DEL_1_line3", 'position': 1, 'lineName': 3, 'allele': 5, '5flank': "", '3flank': "CCGGA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4}] + multifasta2SNPFile = Multifasta2SNPFile(batchName, "gene1", "mouse") + + 
dSearchedSubSNP = {'subSNPName': batchName + "_DEL_12_line1", 'position': 12, 'lineName': 1, 'allele': 2, '5flank': "", '3flank': "CCGAA",'batchNumber': 1, 'confidenceValue' : "A", 'type' : "DELETION", 'length': 4} + + expResult = multifasta2SNPFile.subSNPExistsInSubSNPList(dSearchedSubSNP, lSubSNP) + obsResult = False + + self.assertEquals(expResult, obsResult) + + def _writeExpSubSNPFile(self): + expFileHandle = open(self._expSubSNPFileName, "w") + expFileHandle.write("SubSNPName;ConfidenceValue;Type;Position;5flank;3flank;Length;BatchNumber;IndividualNumber;PrimerType;PrimerNumber;Forward_or_Reverse;AlleleNumber\n") + expFileHandle.write("Batch1_SNP_4_Line1;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;1\n") + expFileHandle.write("Batch1_SNP_4_Line2;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.write("Batch1_SNP_21_Line1;A;SNP;21;CCTTAGCCATTGCTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_21_Line2;A;SNP;21;CCTAAGCCATTGCTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_25_Line1;A;SNP;25;CCTTAGCCATTGCTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_25_Line2;A;SNP;25;CCTAAGCCATTGCTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_36_Line1;A;SNP;36;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_36_Line2;A;SNP;36;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_51_Line1;A;SNP;51;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_51_Line2;A;SNP;51;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;1;2;Sequence;;;4\n") + expFileHandle.close() + + def _writeExpSubSNPFileWithSnpsAndIndels(self): + expFileHandle = 
open(self._expSubSNPFileName, "w") + expFileHandle.write("SubSNPName;ConfidenceValue;Type;Position;5flank;3flank;Length;BatchNumber;IndividualNumber;PrimerType;PrimerNumber;Forward_or_Reverse;AlleleNumber\n") + expFileHandle.write("Batch1_INS_1_Line1;A;INSERTION;1;C;TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;2;1;1;Sequence;;;8\n") + expFileHandle.write("Batch1_INS_1_Line2;A;INSERTION;1;C;AAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;2;1;2;Sequence;;;6\n") + expFileHandle.write("Batch1_SNP_2_Line1;A;SNP;2;C--;AGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;1\n") + expFileHandle.write("Batch1_SNP_2_Line2;A;SNP;2;CCT;AGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.write("Batch1_DEL_8_Line1;A;DELETION;8;C--TAGCCA;CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;3;1;1;Sequence;;;5\n") + expFileHandle.write("Batch1_DEL_8_Line2;A;DELETION;8;CCTAAGCCA;CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;3;1;2;Sequence;;;7\n") + expFileHandle.write("Batch1_SNP_19_Line1;A;SNP;19;C--TAGCCA---CTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_19_Line2;A;SNP;19;CCTAAGCCATT-CTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_23_Line1;A;SNP;23;C--TAGCCA---CTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_23_Line2;A;SNP;23;CCTAAGCCATT-CTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_34_Line1;A;SNP;34;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_34_Line2;A;SNP;34;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_49_Line1;A;SNP;49;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_49_Line2;A;SNP;49;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;1;2;Sequence;;;4\n") + 
expFileHandle.close() + + def _writeExpSubSNPFileSeveralBatches(self): + expFileHandle = open(self._expSubSNPFileName, "w") + expFileHandle.write("SubSNPName;ConfidenceValue;Type;Position;5flank;3flank;Length;BatchNumber;IndividualNumber;PrimerType;PrimerNumber;Forward_or_Reverse;AlleleNumber\n") + expFileHandle.write("Batch_Gene1_SNP_4_Line1;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;1\n") + expFileHandle.write("Batch_Gene1_SNP_4_Line2;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.write("Batch_Gene1_SNP_21_Line1;A;SNP;21;CCTTAGCCATTGCTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_21_Line2;A;SNP;21;CCTAAGCCATTGCTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_25_Line1;A;SNP;25;CCTTAGCCATTGCTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene1_SNP_25_Line2;A;SNP;25;CCTAAGCCATTGCTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_36_Line1;A;SNP;36;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene1_SNP_36_Line2;A;SNP;36;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_51_Line1;A;SNP;51;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_51_Line2;A;SNP;51;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;1;2;Sequence;;;4\n") + + expFileHandle.write("Batch_Gene2_INS_1_Line1;A;INSERTION;1;C;TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;2;2;1;Sequence;;;8\n") + expFileHandle.write("Batch_Gene2_INS_1_Line2;A;INSERTION;1;C;AAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;2;2;2;Sequence;;;6\n") + 
expFileHandle.write("Batch_Gene2_SNP_2_Line1;A;SNP;2;C--;AGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;2;1;Sequence;;;1\n") + expFileHandle.write("Batch_Gene2_SNP_2_Line2;A;SNP;2;CCT;AGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;2;2;Sequence;;;4\n") + expFileHandle.write("Batch_Gene2_DEL_8_Line1;A;DELETION;8;C--TAGCCA;CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;3;2;1;Sequence;;;5\n") + expFileHandle.write("Batch_Gene2_DEL_8_Line2;A;DELETION;8;CCTAAGCCA;CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;3;2;2;Sequence;;;7\n") + expFileHandle.write("Batch_Gene2_SNP_19_Line1;A;SNP;19;C--TAGCCA---CTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;2;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_19_Line2;A;SNP;19;CCTAAGCCATT-CTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;2;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_23_Line1;A;SNP;23;C--TAGCCA---CTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;2;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene2_SNP_23_Line2;A;SNP;23;CCTAAGCCATT-CTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;2;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_34_Line1;A;SNP;34;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;2;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene2_SNP_34_Line2;A;SNP;34;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;2;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_49_Line1;A;SNP;49;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;2;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_49_Line2;A;SNP;49;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;2;2;Sequence;;;4\n") + expFileHandle.close() + + def _writeExpSubSNPFileSeveralBatches_different_lines_between_files(self): + expFileHandle = open(self._expSubSNPFileName, "w") + expFileHandle.write("SubSNPName;ConfidenceValue;Type;Position;5flank;3flank;Length;BatchNumber;IndividualNumber;PrimerType;PrimerNumber;Forward_or_Reverse;AlleleNumber\n") + 
expFileHandle.write("Batch_Gene1_SNP_4_Line1;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;1\n") + expFileHandle.write("Batch_Gene1_SNP_4_Line2;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.write("Batch_Gene1_SNP_21_Line1;A;SNP;21;CCTTAGCCATTGCTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_21_Line2;A;SNP;21;CCTAAGCCATTGCTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_25_Line1;A;SNP;25;CCTTAGCCATTGCTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene1_SNP_25_Line2;A;SNP;25;CCTAAGCCATTGCTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_36_Line1;A;SNP;36;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch_Gene1_SNP_36_Line2;A;SNP;36;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_51_Line1;A;SNP;51;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch_Gene1_SNP_51_Line2;A;SNP;51;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;1;2;Sequence;;;4\n") + + expFileHandle.write("Batch_Gene2_INS_1_Line3;A;INSERTION;1;C;TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;2;2;3;Sequence;;;8\n") + expFileHandle.write("Batch_Gene2_INS_1_Line4;A;INSERTION;1;C;AAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;2;2;4;Sequence;;;6\n") + expFileHandle.write("Batch_Gene2_SNP_2_Line3;A;SNP;2;C--;AGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;1;2;3;Sequence;;;1\n") + expFileHandle.write("Batch_Gene2_SNP_2_Line4;A;SNP;2;CCT;AGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;1;2;4;Sequence;;;4\n") + expFileHandle.write("Batch_Gene2_DEL_8_Line3;A;DELETION;8;C--TAGCCA;CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC;3;2;3;Sequence;;;5\n") + 
expFileHandle.write("Batch_Gene2_DEL_8_Line4;A;DELETION;8;CCTAAGCCA;CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA;3;2;4;Sequence;;;7\n") + expFileHandle.write("Batch_Gene2_SNP_19_Line3;A;SNP;19;C--TAGCCA---CTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATC;1;2;3;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_19_Line4;A;SNP;19;CCTAAGCCATT-CTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATA;1;2;4;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_23_Line3;A;SNP;23;C--TAGCCA---CTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATC;1;2;3;Sequence;;;3\n") + expFileHandle.write("Batch_Gene2_SNP_23_Line4;A;SNP;23;CCTAAGCCATT-CTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATA;1;2;4;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_34_Line3;A;SNP;34;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATC;1;2;3;Sequence;;;3\n") + expFileHandle.write("Batch_Gene2_SNP_34_Line4;A;SNP;34;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATA;1;2;4;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_49_Line3;A;SNP;49;C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;;1;2;3;Sequence;;;2\n") + expFileHandle.write("Batch_Gene2_SNP_49_Line4;A;SNP;49;CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;;1;2;4;Sequence;;;4\n") + expFileHandle.close() + + def _writeExpSubSNPFileSeveralLineSeq(self): + expFileHandle = open(self._expSubSNPFileName, "w") + expFileHandle.write("SubSNPName;ConfidenceValue;Type;Position;5flank;3flank;Length;BatchNumber;IndividualNumber;PrimerType;PrimerNumber;Forward_or_Reverse;AlleleNumber\n") + expFileHandle.write("Batch1_SNP_4_Line1;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA;1;1;1;Sequence;;;1\n") + expFileHandle.write("Batch1_SNP_4_Line2;A;SNP;4;CCT;AGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATACGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.write("Batch1_SNP_21_Line1;A;SNP;21;CCTTAGCCATTGCTTGGTGA;TATGAAGGCAGTAGGCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA;1;1;1;Sequence;;;2\n") + 
expFileHandle.write("Batch1_SNP_21_Line2;A;SNP;21;CCTAAGCCATTGCTTGGTGA;TATCAAGGCAGTAGCCAAACCTCCACAATACGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_25_Line1;A;SNP;25;CCTTAGCCATTGCTTGGTGACTAT;AAGGCAGTAGGCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_25_Line2;A;SNP;25;CCTAAGCCATTGCTTGGTGACTAT;AAGGCAGTAGCCAAACCTCCACAATACGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_36_Line1;A;SNP;36;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAG;CAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA;1;1;1;Sequence;;;3\n") + expFileHandle.write("Batch1_SNP_36_Line2;A;SNP;36;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAG;CAAACCTCCACAATACGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_51_Line1;A;SNP;51;CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAAT;CGCAGTAGCCAAACCTCCACAATA;1;1;1;Sequence;;;2\n") + expFileHandle.write("Batch1_SNP_51_Line2;A;SNP;51;CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAAT;CGCAGTAGCCAAACCTCCACAATA;1;1;2;Sequence;;;4\n") + expFileHandle.close() + + + def _writeExpAlleleFile(self): + expFileHandle = open(self._expAlleleFileName, "w") + expFileHandle.write("AlleleNumber;Value;Motif;NbCopy;Comment\n") + expFileHandle.write("1;T;;;\n") + expFileHandle.write("2;C;;;\n") + expFileHandle.write("3;G;;;\n") + expFileHandle.write("4;A;;;\n") + expFileHandle.close() + + def _writeExpAlleleFileWithSnpsAndIndels(self): + expFileHandle = open(self._expAlleleFileName, "w") + expFileHandle.write("AlleleNumber;Value;Motif;NbCopy;Comment\n") + expFileHandle.write("1;T;;;\n") + expFileHandle.write("2;C;;;\n") + expFileHandle.write("3;G;;;\n") + expFileHandle.write("4;A;;;\n") + expFileHandle.write("5;---;;;\n") + expFileHandle.write("6;CT;;;\n") + expFileHandle.write("7;TT-;;;\n") + expFileHandle.write("8;--;;;\n") + expFileHandle.close() + + + def _writeExpAlleleFileSeveralBatches(self): + expFileHandle = open(self._expAlleleFileName, "w") + 
expFileHandle.write("AlleleNumber;Value;Motif;NbCopy;Comment\n") + expFileHandle.write("1;T;;;\n") + expFileHandle.write("2;C;;;\n") + expFileHandle.write("3;G;;;\n") + expFileHandle.write("4;A;;;\n") + expFileHandle.write("5;---;;;\n") + expFileHandle.write("6;CT;;;\n") + expFileHandle.write("7;TT-;;;\n") + expFileHandle.write("8;--;;;\n") + expFileHandle.close() + + def _writeExpIndividualFile(self): + expFileHandle = open(self._expIndividualFileName, "w") + expFileHandle.write("IndividualNumber;IndividualName;Description;AberrAneuploide;FractionLength;DeletionLineSynthesis;UrlEarImage;TypeLine;ChromNumber;ArmChrom;DeletionBin;ScientificName;local_germplasm_name;submitter_code;local_institute;donor_institute;donor_acc_id\n") + expFileHandle.write("1;Line1;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.write("2;Line2;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.close() + + def _writeExpIndividualFile_different_lines_between_files(self): + expFileHandle = open(self._expIndividualFileName, "w") + expFileHandle.write("IndividualNumber;IndividualName;Description;AberrAneuploide;FractionLength;DeletionLineSynthesis;UrlEarImage;TypeLine;ChromNumber;ArmChrom;DeletionBin;ScientificName;local_germplasm_name;submitter_code;local_institute;donor_institute;donor_acc_id\n") + expFileHandle.write("1;Line1;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.write("2;Line2;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.write("3;Line3;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.write("4;Line4;;;;;;;;;;Arabidopsis thaliana;;;;;\n") + expFileHandle.close() + + def _writeExpSequenceFile(self): + SequenceFSAFileHandle = open(self._expSequenceFSAFileName, "w") + SequenceFSAFileHandle.write(">Sequence_de_Reference\n") + SequenceFSAFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceCSVFileHandle = open(self._expSequenceCSVFileName, "w") + 
SequenceCSVFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceCSVFileHandle.write("Sequence_de_Reference;Reference;;;;;Arabidopsis thaliana\n") + + def _writeExpSequenceFileSeveralLineSeq(self): + SequenceFSAFileHandle = open(self._expSequenceFSAFileName, "w") + SequenceFSAFileHandle.write(">Sequence_de_Reference\n") + SequenceFSAFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA\n") + SequenceCSVFileHandle = open(self._expSequenceCSVFileName, "w") + SequenceCSVFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceCSVFileHandle.write("Sequence_de_Reference;Reference;;;;;Arabidopsis thaliana\n") + + def _writeExpSequenceFileWithDeletion(self): + SequenceFSAFileHandle = open(self._expSequenceFSAFileName, "w") + SequenceFSAFileHandle.write(">Sequence_de_Reference\n") + SequenceFSAFileHandle.write("CAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceCSVFileHandle = open(self._expSequenceCSVFileName, "w") + SequenceCSVFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceCSVFileHandle.write("Sequence_de_Reference;Reference;;;;;Arabidopsis thaliana\n") + + def _writeExpSequenceSeveralBatches(self): + SequenceFSAFileHandle = open(self._expSequenceFSAFileName, "w") + SequenceFSAFileHandle.write(">Sequence_de_Reference1\n") + SequenceFSAFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceFSAFileHandle.write(">Sequence_de_Reference2\n") + SequenceFSAFileHandle.write("CAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceCSVFileHandle = open(self._expSequenceCSVFileName, "w") + SequenceCSVFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceCSVFileHandle.write("Sequence_de_Reference1;Reference;;;;;Arabidopsis thaliana\n") + 
SequenceCSVFileHandle.write("Sequence_de_Reference2;Reference;;;;;Arabidopsis thaliana\n") + + def _writeExpSequenceSeveralBatchesForSameRefSeq(self): + SequenceFSAFileHandle = open(self._expSequenceFSAFileName, "w") + SequenceFSAFileHandle.write(">Sequence_de_Reference1\n") + SequenceFSAFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceFSAFileHandle.write(">Sequence_de_Reference1\n") + SequenceFSAFileHandle.write("CAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + SequenceCSVFileHandle = open(self._expSequenceCSVFileName, "w") + SequenceCSVFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceCSVFileHandle.write("Sequence_de_Reference1;Reference;;;;;Arabidopsis thaliana\n") + SequenceCSVFileHandle.write("Sequence_de_Reference1;Reference;;;;;Arabidopsis thaliana\n") + + def _writeExpBatchFile(self): + BatchFileHandle = open(self._expBatchFileName, "w") + BatchFileHandle.write("BatchNumber: 1\n") + BatchFileHandle.write("BatchName: Batch1\n") + BatchFileHandle.write("GeneName: methyltransferase\n") + BatchFileHandle.write("Description: \n") + BatchFileHandle.write("ContactNumber: 1\n") + BatchFileHandle.write("ProtocolNumber: 1\n") + BatchFileHandle.write("ThematicNumber: 1\n") + BatchFileHandle.write("RefSeqName: Sequence_de_Reference\n") + BatchFileHandle.write("AlignmentFileName: \n") + BatchFileHandle.write("SeqName: \n") + BatchFileHandle.write("//\n") + BatchFileHandle.close() + + def _writeExpBatchFileSeveralBatches(self): + BatchFileHandle = open(self._expBatchFileName, "w") + BatchFileHandle.write("BatchNumber: 1\n") + BatchFileHandle.write("BatchName: Batch_Gene1\n") + BatchFileHandle.write("GeneName: Gene1\n") + BatchFileHandle.write("Description: \n") + BatchFileHandle.write("ContactNumber: 1\n") + BatchFileHandle.write("ProtocolNumber: 1\n") + BatchFileHandle.write("ThematicNumber: 1\n") + BatchFileHandle.write("RefSeqName: Sequence_de_Reference1\n") + 
BatchFileHandle.write("AlignmentFileName: \n") + BatchFileHandle.write("SeqName: \n") + BatchFileHandle.write("//\n") + BatchFileHandle.write("BatchNumber: 2\n") + BatchFileHandle.write("BatchName: Batch_Gene2\n") + BatchFileHandle.write("GeneName: Gene2\n") + BatchFileHandle.write("Description: \n") + BatchFileHandle.write("ContactNumber: 1\n") + BatchFileHandle.write("ProtocolNumber: 1\n") + BatchFileHandle.write("ThematicNumber: 1\n") + BatchFileHandle.write("RefSeqName: Sequence_de_Reference2\n") + BatchFileHandle.write("AlignmentFileName: \n") + BatchFileHandle.write("SeqName: \n") + BatchFileHandle.write("//\n") + BatchFileHandle.close() + + def _writeExpBatchFileSeveralBatchesForSameRefSeq(self): + BatchFileHandle = open(self._expBatchFileName, "w") + BatchFileHandle.write("BatchNumber: 1\n") + BatchFileHandle.write("BatchName: Batch_Gene1\n") + BatchFileHandle.write("GeneName: Gene1\n") + BatchFileHandle.write("Description: \n") + BatchFileHandle.write("ContactNumber: 1\n") + BatchFileHandle.write("ProtocolNumber: 1\n") + BatchFileHandle.write("ThematicNumber: 1\n") + BatchFileHandle.write("RefSeqName: Sequence_de_Reference1\n") + BatchFileHandle.write("AlignmentFileName: \n") + BatchFileHandle.write("SeqName: \n") + BatchFileHandle.write("//\n") + BatchFileHandle.write("BatchNumber: 2\n") + BatchFileHandle.write("BatchName: Batch_Gene2\n") + BatchFileHandle.write("GeneName: Gene2\n") + BatchFileHandle.write("Description: \n") + BatchFileHandle.write("ContactNumber: 1\n") + BatchFileHandle.write("ProtocolNumber: 1\n") + BatchFileHandle.write("ThematicNumber: 1\n") + BatchFileHandle.write("RefSeqName: Sequence_de_Reference1\n") + BatchFileHandle.write("AlignmentFileName: \n") + BatchFileHandle.write("SeqName: \n") + BatchFileHandle.write("//\n") + BatchFileHandle.close() + + + BatchFileHandle.close() + + def _writeExpBatchLineFile(self): + BatchLineFileHandle = open(self._expBatchLineFileName, "w") + 
BatchLineFileHandle.write("IndividualNumber;Pos5;Pos3;BatchNumber;Sequence\n") + BatchLineFileHandle.write("1;;;1;\n") + BatchLineFileHandle.write("2;;;1;\n") + BatchLineFileHandle.close() + + + def _writeExpBatchLineFileSeveralBatches(self): + BatchLineFileHandle = open(self._expBatchLineFileName, "w") + BatchLineFileHandle.write("IndividualNumber;Pos5;Pos3;BatchNumber;Sequence\n") + BatchLineFileHandle.write("1;;;1;\n") + BatchLineFileHandle.write("2;;;1;\n") + BatchLineFileHandle.write("1;;;2;\n") + BatchLineFileHandle.write("2;;;2;\n") + BatchLineFileHandle.close() + + def _writeExpBatchLineFileSeveralBatches_different_lines_between_files(self): + BatchLineFileHandle = open(self._expBatchLineFileName, "w") + BatchLineFileHandle.write("IndividualNumber;Pos5;Pos3;BatchNumber;Sequence\n") + BatchLineFileHandle.write("1;;;1;\n") + BatchLineFileHandle.write("2;;;1;\n") + BatchLineFileHandle.write("3;;;2;\n") + BatchLineFileHandle.write("4;;;2;\n") + BatchLineFileHandle.close() + + def _writeInputFile(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + def _writeInputFileWithSnpsAndIndels(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("C--AAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + def _writeInputFileWithSeqErrorsInRefSeq(self): + inFileHandle = open(self._inFileName, "w") + 
inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTA7GCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + def _writeInputFileWithSeqErrorsInOneLineSeq(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATXAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + def _writeInputFileWithASeveralLineSeq(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATCCGCAGTAGCCAAACCTCCACAATA\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA\nCGCAGTAGCCAAA\nCCTCCACAATA\n") + inFileHandle.close() + + + def _writeInputFileWithUpcaseAndLowcaseNucleotide(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTAAGCCATTGCTTGGtGATTATGAAGgCAGTAGTCAAACCTCCACAATC\nCGCAGTAGCCAAA\nCCTCCACAATA\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCtTGGTGACTATGAAGGcAGTAGGCAAACCTCCACAATC\nCGCAGTAGCCAAA\nCCTCCACAATA\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCAtTGCTTGGTGACTATCaAGGCAGTAGCCAAACCTCCACAATA\nCGCAGTAGCCAAA\nCCTCCACAATA\n") + inFileHandle.close() + + def 
_writeInputFileWith2SeqsWithTheSameName(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">Sequence_de_Reference\n") + inFileHandle.write("CCTAAGCCATTGCTTGGtGATTATGAAGgCAGTAGTCAAACCTCCACAATC\nCGCAGTAGCCAAA\nCCTCCACAATA\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCtTGGTGACTATGAAGGcAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCAtTGCTTGGTGACTATCaAGGCAGTAGCCAAACCTCCACAATA\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCAtTGCTTGGTGACTATCaAGGCAGTAGCCAAACCTCCACAATA\n") + inFileHandle.close() + + def _writeInputFileBatchWithPotentialDooblons(self): + inFileHandle = open(self._inFileName, "w") + inFileHandle.write(">AU247387ref\n") + inFileHandle.write("CACTATAGCTCCTAACATTCCTGAAGTGAAGATCACGGAGGACCTGGCTGTCAATGTTGCCCGCTCGCTGAGATATGAGATCAACAGGGGCTTTGCTAGCCTGAGGGCGATTGGTCAAGGCCGTGACCTGAAGAAATTCCTGATTGTACGTTCTGGTTACTCTTCAATTTGGGCATGCTTAATTATCTCCTCAATTTCAATTTGGCCATGCTTAATGTTGGGTGCTTTCTTTATAGCCTGCTCACCAACATGTGATCTGTTCTTTGTATGCTCAGGTGGTTGCATGGCTTCGTTCTCTTTAGCCTTCGCTGTTTGTGGCTTTGTTATGTGACCAAGCACTTGCTATACTGTCTATTTGTTCGCAGGTGATTGCAGGTCTGTGGATCCTCTGGGTTCTTTCTGCCCTTGGGAGCTGCTGCAATTTCCTCACCTTGTTCTACATAGGTAATGTGCTTCGCTGCTACAGCCTGAACTTGTGCTGCAACAGATGTGCAGTAACTGTACCTAGCATTGTTTACCCATACGAGTTGTGAACTGATGACATCCTCTCGCTTTCTTACTTGCAGTCTTCATGGTTCTCTACACTGTGCCGGTTCTGTACGAGAAGTACGAGGACAAGATCGATGCTTTTGGAGAGAAG\n") + inFileHandle.write(">10102\n") + 
inFileHandle.write("NNNtatagctcctaacattcctgaagtgaagatcacrgaggacnnggctgtcaatgttgcccgctcgctgagatatgagatcaacaggggcttygctagcttgagggcgattggNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">10954\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtcaatgttgcccgctcgctgagatatgagatcaacaggggctttgctagcctgagggcgattggtcaaggccgtgacctgaagaaattcctgattgtacgt---------------------------ttaat---------------------------------------------------------------------------------------------tggttgcatggcttcgttctctttagccttcgctgtttgtggctttgttatgtgaccaagcacttgctatactgtctatttgttcgcaggtgattgcaggtctgtggatcctct---------ctgcccttgggagctgctgcaatttcctcaccttgttctacataggtaatgtgcttcgctgctacagcctgaacttg--------cagatgtgcagtaactgtacctagcattgtttacccat------------------------tctcgctttcttacNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">ABERAVON\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNggtcaaggccgtgacctgaagaaattcctgattgtacgt---------------------------ttaat---------------------------------------------------------------------------------------------tggttgcatggcttcgttctctttagccttcgctgtttgtggctttgttatgtgaccNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">CARILLON\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNcaacattgcccgctcgctgagatatgagatcaacaggggcttctttactttgaaggagatcggtcagggccgtgatctgaagaaattcctcattgtatgttctggttactcttcaatttgggcatgcttaat---------------------------------gttgggtgctttctttat--cctgctcaccaacatgtgatctgttctttgtatgctcaggtggttgccgg---------------------------------------------------------------------------------------------------cctctgggttctttctgttcttgggagctcttgcaacttcttgacattggcatatataggtaat------------------tttaacttgtgctgcaacacttgagttcataaccaccctag------ttgtccatacgagttgtgaactgatgacatccgttctttttcccragtgcagtcttcgtggtgctctacacggtgccagttctgtatgaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">CONCERTO\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNctttgttatgtgaccaagcacttgctatactgtctatttgttcgcaggtgattgcaggtctgtggatcctct---------ctgcccttgggagctgctgcaatttcctcaccttgttctacataggtaatgtgcttcgctgctacagcctgaacttg--------cagatgtgcagtaactgtacctagcattgtttacccat------------------------tctcgctttcttacttgcagtcttcatggttctctacactgtgccNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">F14-13\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtsaatgttgcccgctcgctgagatatgagatcaacaggggctttgctagcctgagggcgattggtcaaggccgtgacctgaagaaaNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">GAGNY\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNcattgcccgctcgctgagatatgagatcaacaggggcttctttactttgaaggagatyggtcagggccgtgayctgaagaaattcctsattgtaygtNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">GREECE\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtsaacattgcccgctcgctgagatatgagatcaacaggggcttctttactttgaaggagatyggycagggccgtgatctgaagaaattcctcattgtatgtNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">IMAGINE\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtcaatgttgcccgctcgctgagatatgagatcaacaggggctttgctagcctgagggcgattggtcaaggccgtgacctgaagaaattcctgattgtacgt---------------------------ttaat---------------------------------------------------------------------------------------------tggttgcatggcttcgttctctttagccttcgctgtttgtggctttgttatgtgaccaagcacttgctatactgtctatttgttcgcaggtgattgcaggtctgtggatcctct---------ctgcccttgggagctgctgcaatttcctcaccttgttctacataggtaatgtgcttcgctgctacagcctgaacttg--------cagatgtgcagtaactgtacctagcattgtttacccat------------------------tctcgctttcttacttgcagtcttcatggttctctacactgtgccNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">IRELAND\n") + inFileHandle.write("NNNTATAGCTCCTAACATTCCTGAAGTGACGATTCCAGAGGACACGATTGTGAACATTGCCCGCTCGCTGAGATATGAGATCAACAGGGGCTTCTTTACTTTGATGGAGATTGGCCAGGGCCGTGATCTGAAGAAATTCCTCATTGTATGT---------------------------TTGTTTATCTCCTCAATTTCAATTTGGCCATGCTTAATGTTGGGTGCTTTCTGTATAGCCTGCTCACCAAGGTGTGATCTCTTCTTTGTATACACAGGTGGTTGCTGG---------------------------------------------------------------------------------------------------CCTCTGGGTTCTTTCTGTTCTTGGGAGCTCTTGCAACTTCTTGACNTTGGCATATATAGGTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">NEMOF\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtcaatgttgcccgctcgctgagatatgagatcaacaggggctttgctagcctgagggcgattggtcaaggccgtgacctgaagaaattcctgattgtacgt---------------------------ttaat---------------------------------------------------------------------------------------------tggttgcatggcttcgttctctttagccttcgctgtttgtggctttgttatgtgaccaagcacttgctatactgtctatttgttcgcaggtgattgcaggtctgtggatcctct---------ctgcccttgggNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">NEMOH\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtcaatgttgcccgctcgctgagatatgagatcaacaggggctttgctagcctgagggcgattggtcaaggccgtgacctgaagaaattcctgattgtacgt---------------------------ttaat---------------------------------------------------------------------------------------------tggttgcatggcttcgttctctttagccttcgctgtttgtggctttgttatgtgaccaagcacttgctatactgtctatttgttcgcaggtgattgcaggtctgtggatcctct---------ctgcccttgggagctgctgcaatttcctcaccttgttctacataggtaatgtgcttcgctgctacagcctgaacttg--------cagatgtgcagtaactgtacctagcattgtttacccat------------------------tctcgctttcttacNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">POLAND\n") + inFileHandle.write("NNNTATAGCTCCTAACATTCCTGAAGTGAAGATCACGGAGGACCTGGCTGTCAATGTTGCCCGCTCGCTGAGATATGAGATCAACAGGGGCTTTGCTAGCCTGAGGGCGATTGGTCAAGGCCGTGACCTGAAGAAATTCCTGATTGTAYGT---------------------------TTAAT---------------------------------------------------------------------------------------------TGGTTGCATGGCTTCGTTCTCTTTAGCCTTCGCTGTTTGTGGCTTTGTTATGTGACCAAGCNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + 
inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">SPAIN\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNtcaacattgcccgctcgctgagatatgagatcaacaggggcttctttactttgaaggagatcggtcagggccgtgatctgaagaaattcctcattgtatgttctggttactcttcaatttgggcatgcttaat---------------------------------gttgggtgctttctttat--cctgctcaccaacatgtgatctgttctttgtatgctcaggtggttgccgg---------------------------------------------------------------------------------------------------cctctgggttctttctgttcttgggagctcttgcaacttcttgacattggcatatataggtaat------------------tttaacttgtgctgcaacacttgagttcataaccaccctag------ttgtccatacgagttgtgaactgatgacatccgttctttttcccgagtgcagtcttcgtggtgctctacacggtgccagttctgtatgagaagtacgacgacaaggttgatgcttttggtgagaag\n") + inFileHandle.write(">TRANSATE\n") + inFileHandle.write("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNcgctcgctgagatatgagatcaacaggggcttctttactttgaaggagatYggccagggtcgcgacctcaagaaattcctcattgtatgttgcttgt-ctcttcaatttcaacatgcttgat---------------------------------gttgggtgctttctttat--cctgctcaccaacatgtgatctcttctttgtatgctcaggtggttgcggg---------------------------------------------------------------------------------------------------tctctgggttctttctgttcttgggagctcttgcaacttcttgacattggcatatataggtaaK------------------tataRcttgtgctgcaacacttgagttcataaccNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n") + inFileHandle.write(">VIGOR\n") + 
inFileHandle.write("NNNTATAGCTCCTAACATTCCTGAAGTGAAGATCACGGAGGACCTGGCTGTCAATGTTGCCCGCTCGCTGAGATATGAGATCAACAGGGGCTTTGCTAGCCTGAGGGCGATTGGTCAAGGCCGTGACCTGAAGAAATTCCTGATTGTACGT---------------------------TTAAT---------------------------------------------------------------------------------------------TGGTTGCATGGCTTCGTTCTCTTTAGCCTTCGCTGTTTGTGGCTTTGTTATGTGACCAAGCACTTGCTATACTGTCTATTTGTTCGCAGGTGATTGCAGGTCTGTGGATCCTCT---------CTGCCCTTGGGAGCTGCTGCAATTTCCTCACCTTGTTCTACATAGGTAATGTGCTTCGCTGCTACAGCCTGAACTTG--------CAGATGTGCAGTAACTGTACCTAGCATTGTTTACCCAT------------------------TCTCGCTTTCTTACTTGCAGTCTTCATGGTTCTCTACACTGTGCCGGTTCTGTACGAGAAGTACGAGGACAAGATCGATGCTTTTGGAGAGAAG\n") + inFileHandle.close() + + def _writeRealExpAlleleFile(self): + expFileHandle = open(self._expAlleleFileName, "w") + expFileHandle.write("AlleleNumber;Value;Motif;NbCopy;Comment\n") + expFileHandle.write("1;G;;;\n") + expFileHandle.write("2;T;;;\n") + expFileHandle.write("3;A;;;\n") + expFileHandle.write("4;C;;;\n") + expFileHandle.write("5;-;;;\n") + expFileHandle.close(); + + def _writeRealExpSequenceCSVFile(self): + SequenceFSAFileHandle = open(self._expSequenceCSVFileName, "w") + SequenceFSAFileHandle.write("SequenceName;SeqType;BankName;BankVersion;ACNumber;Locus;ScientificName\n") + SequenceFSAFileHandle.write("PpHDZ31_ref;Reference;;;;;Pinus pinaster\n") + SequenceFSAFileHandle.close() + + def _writeRealExpBatchFile(self): + FileHandle = open(self._expBatchFileName, "w") + FileHandle.write("BatchNumber: 1\n") + FileHandle.write("BatchName: INRA_Pinus_pinaster_HDZ31-1\n") + FileHandle.write("GeneName: PpHDZ31\n") + FileHandle.write("Description: \n") + FileHandle.write("ContactNumber: 1\n") + FileHandle.write("ProtocolNumber: 1\n") + FileHandle.write("ThematicNumber: 1\n") + FileHandle.write("RefSeqName: PpHDZ31_ref\n") + FileHandle.write("AlignmentFileName: \n") + FileHandle.write("SeqName: \n") + FileHandle.write("//\n") + FileHandle.close() + + + def _writeInputFileSeveralBatches(self): + if(not 
FileUtils.isRessourceExists(self._inputDirSeveralBatches)): + os.mkdir(self._inputDirSeveralBatches) + + inFileHandle = open(self._inputDirSeveralBatches+"/Gene1.fasta","w") + inFileHandle.write(">Sequence_de_Reference1\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + inFileHandle2 = open(self._inputDirSeveralBatches+"/Gene2.fasta","w") + inFileHandle2.write(">Sequence_de_Reference2\n") + inFileHandle2.write("C--AAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle2.write(">Line1\n") + inFileHandle2.write("C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle2.write(">Line2\n") + inFileHandle2.write("CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle2.close() + + def _writeInputFileSeveralBatches_different_lines_between_files(self): + if(not FileUtils.isRessourceExists(self._inputDirSeveralBatches)): + os.mkdir(self._inputDirSeveralBatches) + + inFileHandle = open(self._inputDirSeveralBatches+"/Gene1.fasta","w") + inFileHandle.write(">Sequence_de_Reference1\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + inFileHandle2 = open(self._inputDirSeveralBatches+"/Gene2.fasta","w") + inFileHandle2.write(">Sequence_de_Reference2\n") + inFileHandle2.write("C--AAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle2.write(">Line3\n") + inFileHandle2.write("C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle2.write(">Line4\n") + 
inFileHandle2.write("CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle2.close() + + def _writeInputFileSeveralBatches_different_lines_and_same_refseq_between_files(self): + if(not FileUtils.isRessourceExists(self._inputDirSeveralBatches)): + os.mkdir(self._inputDirSeveralBatches) + + inFileHandle = open(self._inputDirSeveralBatches+"/Gene1.fasta","w") + inFileHandle.write(">Sequence_de_Reference1\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle.write(">Line1\n") + inFileHandle.write("CCTTAGCCATTGCTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle.write(">Line2\n") + inFileHandle.write("CCTAAGCCATTGCTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle.close() + + inFileHandle2 = open(self._inputDirSeveralBatches+"/Gene2.fasta","w") + inFileHandle2.write(">Sequence_de_Reference1\n") + inFileHandle2.write("C--AAGCCATTGCTTGGTGATTATGAAGGCAGTAGTCAAACCTCCACAATC\n") + inFileHandle2.write(">Line3\n") + inFileHandle2.write("C--TAGCCA---CTTGGTGACTATGAAGGCAGTAGGCAAACCTCCACAATC\n") + inFileHandle2.write(">Line4\n") + inFileHandle2.write("CCTAAGCCATT-CTTGGTGACTATCAAGGCAGTAGCCAAACCTCCACAATA") + inFileHandle2.close() + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file