diff tests/test_add_taxonomic_labels.py @ 0:abd214795fa5 draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
author onnodg
date Tue, 14 Oct 2025 09:07:01 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test_add_taxonomic_labels.py	Tue Oct 14 09:07:01 2025 +0000
@@ -0,0 +1,132 @@
+"""
+Test suite for add taxonomic labels tool.
+"""
+
+import pytest
+import tempfile
+import os
+
+from Stage_1_translated.NLOOR_scripts.add_header_tool.add_taxonomic_labels import add_labels
+
+
+class TestTaxonomyProcessing:
+    """Check the header and the taxonomy column that add_labels writes for well-formed input rows."""
+
+    @pytest.fixture
+    def sample_input_data(self):
+        """Return a header line plus two data rows whose #Taxonomy column still holds 'unknown ...' placeholders."""
+        return """#Query ID	#Subject	#Subject accession	#Subject Taxonomy ID	#Identity percentage	#Coverage	#evalue	#bitscore	#Source	#Taxonomy
+M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1)	source=NCBI   sequenceID=HM590330   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Malpighiales   suborder=NA   infraorder=NA   superfamily=NA   family=Salicaceae   genus=Populus   species=Populus tremula   markercode=trnL   lat=50.47   lon=-104.37	source=NCBI	N/A	100.000	91	8.47e-10	54.7	Genbank	unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species
+M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16)	source=NCBI   sequenceID=KC539736   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Rosales   suborder=NA   infraorder=NA   superfamily=NA   family=Ulmaceae   genus=Ulmus   species=Ulmus laevis   markercode=trnL   lat=NA   lon=NA	source=NCBI	N/A	100.000	89	1.44e-38	152	Genbank	unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species"""
+
+    @pytest.fixture
+    def temp_files(self):
+        """Yield (input_path, output_path) for two empty temporary files and delete both afterwards."""
+        input_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
+        input_file.close()  # closed immediately; the tests reopen the file by name
+        output_file = tempfile.NamedTemporaryFile(mode='w', delete=False)
+        output_file.close()
+
+        yield input_file.name, output_file.name
+
+        # Cleanup: delete=False above means the files must be removed by hand
+        os.unlink(input_file.name)
+        os.unlink(output_file.name)
+
+    def test_header_creation(self, temp_files):
+        """The first line of the output must be the full ten-column header."""
+        input_file, output_file = temp_files
+
+        # Input holds a single header-like line and no data rows
+        with open(input_file, 'w') as f:
+            f.write("#Query ID	header line\n")
+
+        add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13])
+
+        with open(output_file, 'r') as f:
+            header = f.readline().strip()
+
+        expected_header = '#Query ID\t#Subject\t#Subject accession\t#Subject Taxonomy ID\t#Identity percentage\t#Coverage\t#evalue\t#bitscore\t#Source\t#Taxonomy'
+        assert header == expected_header
+
+    def test_basic_taxonomy_processing(self, temp_files, sample_input_data):
+        """Both sample rows must come out with the source and taxonomy columns filled from the subject field."""
+        input_file, output_file = temp_files
+
+        with open(input_file, 'w') as f:
+            f.write(sample_input_data)
+
+        add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13])  # kingdom, phylum, class, order, family, genus, species
+
+        with open(output_file, 'r') as f:
+            lines = f.readlines()
+
+        # Should have header + 2 data lines
+        assert len(lines) == 3
+
+        # Check first data line
+        first_data_line = lines[1].strip()
+        assert 'M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1)	source=NCBI   sequenceID=HM590330   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Malpighiales   suborder=NA   infraorder=NA   superfamily=NA   family=Salicaceae   genus=Populus   species=Populus tremula   markercode=trnL   lat=50.47   lon=-104.37	source=NCBI	N/A	100.000	91	8.47e-10	54.7	NCBI	Viridiplantae / Streptophyta / Magnoliopsida / Malpighiales / Salicaceae / Populus / Populus tremula' in first_data_line
+
+        # Check second data line
+        second_data_line = lines[2].strip()
+        assert 'M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16)	source=NCBI   sequenceID=KC539736   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Rosales   suborder=NA   infraorder=NA   superfamily=NA   family=Ulmaceae   genus=Ulmus   species=Ulmus laevis   markercode=trnL   lat=NA   lon=NA	source=NCBI	N/A	100.000	89	1.44e-38	152	NCBI	Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Ulmaceae / Ulmus / Ulmus laevis' in second_data_line
+
+    def test_single_taxon_level(self, temp_files):
+        """With one requested level the taxonomy column must hold that single value and no separator."""
+        input_file, output_file = temp_files
+
+        test_data = """#Query ID	#Subject	#Subject accession	#Subject Taxonomy ID	#Identity percentage	#Coverage	#evalue	#bitscore	#Source	#Taxonomy
+M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1)	source=NCBI   sequenceID=HM590330   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Malpighiales   suborder=NA   infraorder=NA   superfamily=NA   family=Salicaceae   genus=Populus   species=Populus tremula   markercode=trnL   lat=50.47   lon=-104.37	source=NCBI	N/A	100.000	91	8.47e-10	54.7	Genbank	unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species"""
+        with open(input_file, 'w') as f:
+            f.write(test_data)
+
+        add_labels(input_file, output_file, [13])
+
+        with open(output_file, 'r') as f:
+            lines = f.readlines()
+
+        taxonomy_part = lines[1].split('\t')[-1].strip()  # last column, as in test_default_taxon_levels
+        assert taxonomy_part == 'Populus tremula'  # exact match: endswith() would also accept a full ' / '-joined lineage
+
+    def test_default_taxon_levels(self, temp_files, sample_input_data):
+        """Seven requested levels must be joined by exactly six ' / ' separators."""
+        input_file, output_file = temp_files
+
+        with open(input_file, 'w') as f:
+            f.write(sample_input_data)
+
+        # The seven levels are passed explicitly; no default argument is exercised here
+        add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13])
+
+        with open(output_file, 'r') as f:
+            lines = f.readlines()
+
+        # Check that the taxonomy string has the expected structure
+        first_data_line = lines[1]
+        taxonomy_part = first_data_line.split('\t')[-1].strip()
+
+        # Should have ' / ' separators between levels (except the last one)
+        separator_count = taxonomy_part.count(' / ')
+        assert separator_count == 6  # 7 levels - 1 = 6 separators
+
+
+class TestEdgeCases:
+    """Inputs that add_labels is expected to reject."""
+
+    def test_different_input_file(self, tmp_path):
+        """A data row without the 'Genbank' source column must make add_labels raise ValueError."""
+        src_path = tmp_path / "empty_input.txt"
+        dst_path = tmp_path / "output.txt"
+        expected_error = "Line does not contain expected fields: superkingdom, markercode, or Genbank"
+        src_path.write_text("""#Query ID	#Subject	#Subject accession	#Subject Taxonomy ID	#Identity percentage	#Coverage	#evalue	#bitscore	#Source	#Taxonomy
+M01687:476:000000000-LL5F5:1:1102:11130:1143	source=NCBI   sequenceID=KP794848   superkingdom=Eukaryota   kingdom=Viridiplantae   phylum=Streptophyta   subphylum=Streptophytina   class=Magnoliopsida   subclass=NA   infraclass=NA   order=Rosales   suborder=NA   infraorder=NA   superfamily=NA   family=Rosaceae   genus=Sorbus   species=Sorbus aucuparia   markercode=trnL   lat=NA   lon=NA	source=NCBI	N/A	100.000	100	5.00e-43	167	Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Rosaceae / Sorbus / Sorbus aucuparia""")
+
+        with pytest.raises(ValueError, match=expected_error):
+            add_labels(str(src_path), str(dst_path), [1, 2, 4, 7, 11, 12, 13])
+
+
+# Allow running this file directly; the process exit status mirrors the pytest result
+if __name__ == "__main__":
+    # pytest.main returns the exit code; propagate it so a failing run does not exit 0
+    raise SystemExit(pytest.main([__file__]))
\ No newline at end of file