Mercurial > repos > onnodg > add_taxonomic_labels
diff tests/test_add_taxonomic_labels.py @ 0:abd214795fa5 draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:07:01 +0000 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_add_taxonomic_labels.py Tue Oct 14 09:07:01 2025 +0000 @@ -0,0 +1,132 @@ +""" +Test suite for add taxonomic labels tool. +""" + +import pytest +import tempfile +import os + +from Stage_1_translated.NLOOR_scripts.add_header_tool.add_taxonomic_labels import add_labels + + +class TestTaxonomyProcessing: + """Test the main taxonomy processing functionality.""" + + @pytest.fixture + def sample_input_data(self): + """Provide sample input data for testing.""" + return """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy +M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species +M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16) source=NCBI sequenceID=KC539736 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Ulmaceae genus=Ulmus species=Ulmus laevis markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 89 1.44e-38 152 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species""" + + @pytest.fixture + def temp_files(self): + """Create temporary input and output files.""" + input_file = tempfile.NamedTemporaryFile(mode='w', delete=False) + input_file.close() + output_file = tempfile.NamedTemporaryFile(mode='w', delete=False) + output_file.close() + + 
yield input_file.name, output_file.name + + # Cleanup + os.unlink(input_file.name) + os.unlink(output_file.name) + + def test_header_creation(self, temp_files): + """Test that the correct header is written to output file.""" + input_file, output_file = temp_files + + # Create an input file that holds only a header line (no data rows) + with open(input_file, 'w') as f: + f.write("#Query ID header line\n") + + add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) + + with open(output_file, 'r') as f: + header = f.readline().strip() + + expected_header = '#Query ID\t#Subject\t#Subject accession\t#Subject Taxonomy ID\t#Identity percentage\t#Coverage\t#evalue\t#bitscore\t#Source\t#Taxonomy' + assert header == expected_header + + def test_basic_taxonomy_processing(self, temp_files, sample_input_data): + """Test basic taxonomy line processing.""" + input_file, output_file = temp_files + + with open(input_file, 'w') as f: + f.write(sample_input_data) + + add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) # kingdom, phylum, class, order, family, genus, species + + with open(output_file, 'r') as f: + lines = f.readlines() + + # Should have header + 2 data lines + assert len(lines) == 3 + + # Check first data line + first_data_line = lines[1].strip() + assert 'M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 NCBI Viridiplantae / Streptophyta / Magnoliopsida / Malpighiales / Salicaceae / Populus / Populus tremula' in first_data_line + + # Check second data line + second_data_line = lines[2].strip() + assert 'M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16) source=NCBI sequenceID=KC539736 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta 
subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Ulmaceae genus=Ulmus species=Ulmus laevis markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 89 1.44e-38 152 NCBI Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Ulmaceae / Ulmus / Ulmus laevis' in second_data_line + + def test_single_taxon_level(self, temp_files): + """Test processing with only one taxonomic level.""" + input_file, output_file = temp_files + + test_data = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy +M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species""" + with open(input_file, 'w') as f: + f.write(test_data) + + add_labels(input_file, output_file, [13]) + + with open(output_file, 'r') as f: + lines = f.readlines() + + data_line = lines[1].strip() + assert data_line.endswith('Populus tremula') # Only one level was requested, so no ' / ' separator should follow it + + def test_default_taxon_levels(self, temp_files, sample_input_data): + """Test processing with default taxonomic levels.""" + input_file, output_file = temp_files + + with open(input_file, 'w') as f: + f.write(sample_input_data) + + # Levels are passed explicitly here; presumably this list mirrors the tool's default - confirm against add_labels + add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) + + with open(output_file, 'r') as f: + lines = f.readlines() + + # Check that the taxonomy string has the expected structure + first_data_line = lines[1] + taxonomy_part = 
first_data_line.split('\t')[-1].strip() + + # Should have ' / ' separators between levels (except the last one) + separator_count = taxonomy_part.count(' / ') + assert separator_count == 6 # 7 levels - 1 = 6 separators + + +class TestEdgeCases: + """Test edge cases and error conditions.""" + + def test_different_input_file(self, tmp_path): + """Test processing an input file with unexpected data.""" + input_file = tmp_path / "empty_input.txt" + output_file = tmp_path / "output.txt" + + input_file.write_text("""#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy +M01687:476:000000000-LL5F5:1:1102:11130:1143 source=NCBI sequenceID=KP794848 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Rosaceae genus=Sorbus species=Sorbus aucuparia markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 100 5.00e-43 167 Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Rosaceae / Sorbus / Sorbus aucuparia""") + + with pytest.raises(ValueError, match="Line does not contain expected fields: superkingdom, markercode, or Genbank"): + add_labels(str(input_file), str(output_file), [1, 2, 4, 7, 11, 12, 13]) + + +# Example of how to run these tests +if __name__ == "__main__": + # Run all tests in this file + pytest.main([__file__]) \ No newline at end of file
