Mercurial > repos > onnodg > add_taxonomic_labels
comparison tests/test_add_taxonomic_labels.py @ 0:abd214795fa5 draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/add_header_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:07:01 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:abd214795fa5 |
|---|---|
| 1 """ | |
| 2 Test suite for add taxonomic labels tool. | |
| 3 """ | |
| 4 | |
| 5 import pytest | |
| 6 import tempfile | |
| 7 import os | |
| 8 | |
| 9 from Stage_1_translated.NLOOR_scripts.add_header_tool.add_taxonomic_labels import add_labels | |
| 10 | |
| 11 | |
| 12 class TestTaxonomyProcessing: | |
| 13 """Test the main taxonomy processing functionality.""" | |
| 14 | |
| 15 @pytest.fixture | |
| 16 def sample_input_data(self): | |
| 17 """Return a header line followed by two data rows (Populus tremula and Ulmus laevis hits).""" | |
| 18 return """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | |
| 19 M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species | |
| 20 M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16) source=NCBI sequenceID=KC539736 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Ulmaceae genus=Ulmus species=Ulmus laevis markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 89 1.44e-38 152 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species""" | |
| 21 | |
| 22 @pytest.fixture | |
| 23 def temp_files(self): | |
| 24 """Yield the paths of two closed temporary files (input, output) and delete both afterwards.""" | |
| 25 input_file = tempfile.NamedTemporaryFile(mode='w', delete=False) | |
| 26 input_file.close() | |
| 27 output_file = tempfile.NamedTemporaryFile(mode='w', delete=False) | |
| 28 output_file.close() | |
| 29 | |
| 30 yield input_file.name, output_file.name | |
| 31 | |
| 32 # Cleanup (code after the yield): remove both temporary files created above | |
| 33 os.unlink(input_file.name) | |
| 34 os.unlink(output_file.name) | |
| 35 | |
| 36 def test_header_creation(self, temp_files): | |
| 37 """Test that the correct header is written to output file.""" | |
| 38 input_file, output_file = temp_files | |
| 39 | |
| 40 # Write an input file that holds only a header line (no data rows) | |
| 41 with open(input_file, 'w') as f: | |
| 42 f.write("#Query ID header line\n") | |
| 43 | |
| 44 add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) | |
| 45 | |
| 46 with open(output_file, 'r') as f: | |
| 47 header = f.readline().strip() | |
| 48 | |
| 49 expected_header = '#Query ID\t#Subject\t#Subject accession\t#Subject Taxonomy ID\t#Identity percentage\t#Coverage\t#evalue\t#bitscore\t#Source\t#Taxonomy' | |
| 50 assert header == expected_header | |
| 51 | |
| 52 def test_basic_taxonomy_processing(self, temp_files, sample_input_data): | |
| 53 """Test basic taxonomy line processing.""" | |
| 54 input_file, output_file = temp_files | |
| 55 | |
| 56 with open(input_file, 'w') as f: | |
| 57 f.write(sample_input_data) | |
| 58 | |
| 59 add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) # kingdom, phylum, class, order, family, genus, species | |
| 60 | |
| 61 with open(output_file, 'r') as f: | |
| 62 lines = f.readlines() | |
| 63 | |
| 64 # Should have header + 2 data lines | |
| 65 assert len(lines) == 3 | |
| 66 | |
| 67 # Check first data line | |
| 68 first_data_line = lines[1].strip() | |
| 69 assert 'M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 NCBI Viridiplantae / Streptophyta / Magnoliopsida / Malpighiales / Salicaceae / Populus / Populus tremula' in first_data_line | |
| 70 | |
| 71 # Check second data line | |
| 72 second_data_line = lines[2].strip() | |
| 73 assert 'M01687:476:000000000-LL5F5:1:2115:26447:7735_CONS(16) source=NCBI sequenceID=KC539736 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Ulmaceae genus=Ulmus species=Ulmus laevis markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 89 1.44e-38 152 NCBI Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Ulmaceae / Ulmus / Ulmus laevis' in second_data_line | |
| 74 | |
| 75 def test_single_taxon_level(self, temp_files): | |
| 76 """Test processing with only one taxonomic level.""" | |
| 77 input_file, output_file = temp_files | |
| 78 | |
| 79 test_data = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | |
| 80 M01687:476:000000000-LL5F5:1:1101:20413:7447_PairEnd(1) source=NCBI sequenceID=HM590330 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malpighiales suborder=NA infraorder=NA superfamily=NA family=Salicaceae genus=Populus species=Populus tremula markercode=trnL lat=50.47 lon=-104.37 source=NCBI N/A 100.000 91 8.47e-10 54.7 Genbank unknown kingdom / unknown phylum / unknown class / unknown order / unknown family / unknown genus / unknown species""" | |
| 81 with open(input_file, 'w') as f: | |
| 82 f.write(test_data) | |
| 83 | |
| 84 add_labels(input_file, output_file, [13]) | |
| 85 | |
| 86 with open(output_file, 'r') as f: | |
| 87 lines = f.readlines() | |
| 88 | |
| 89 data_line = lines[1].strip() | |
| 90 assert data_line.endswith('Populus tremula') # Only one level requested, so the line must end with the species name and no trailing ' / ' | |
| 91 | |
| 92 def test_default_taxon_levels(self, temp_files, sample_input_data): | |
| 93 """Test processing with default taxonomic levels.""" | |
| 94 input_file, output_file = temp_files | |
| 95 | |
| 96 with open(input_file, 'w') as f: | |
| 97 f.write(sample_input_data) | |
| 98 | |
| 99 # Pass the seven levels explicitly; presumably these mirror the tool's defaults -- TODO confirm | |
| 100 add_labels(input_file, output_file, [1, 2, 4, 7, 11, 12, 13]) | |
| 101 | |
| 102 with open(output_file, 'r') as f: | |
| 103 lines = f.readlines() | |
| 104 | |
| 105 # Check that the taxonomy string has the expected structure | |
| 106 first_data_line = lines[1] | |
| 107 taxonomy_part = first_data_line.split('\t')[-1].strip() | |
| 108 | |
| 109 # Should have ' / ' separators between levels (except the last one) | |
| 110 separator_count = taxonomy_part.count(' / ') | |
| 111 assert separator_count == 6 # 7 levels - 1 = 6 separators | |
| 112 | |
| 113 | |
| 114 class TestEdgeCases: | |
| 115 """Test edge cases and error conditions.""" | |
| 116 | |
| 117 def test_different_input_file(self, tmp_path): | |
| 118 """Test that a data row with no 'Genbank' source field before the taxonomy makes add_labels raise ValueError.""" | |
| 119 input_file = tmp_path / "empty_input.txt" | |
| 120 output_file = tmp_path / "output.txt" | |
| 121 | |
| 122 input_file.write_text("""#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | |
| 123 M01687:476:000000000-LL5F5:1:1102:11130:1143 source=NCBI sequenceID=KP794848 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Rosales suborder=NA infraorder=NA superfamily=NA family=Rosaceae genus=Sorbus species=Sorbus aucuparia markercode=trnL lat=NA lon=NA source=NCBI N/A 100.000 100 5.00e-43 167 Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Rosaceae / Sorbus / Sorbus aucuparia""") | |
| 124 | |
| 125 with pytest.raises(ValueError, match="Line does not contain expected fields: superkingdom, markercode, or Genbank"): | |
| 126 add_labels(str(input_file), str(output_file), [1, 2, 4, 7, 11, 12, 13]) | |
| 127 | |
| 128 | |
| 129 # Allow running this test module directly as a script instead of through the pytest command | |
| 130 if __name__ == "__main__": | |
| 131 # Run all tests in this file (NOTE(review): the value returned by pytest.main is discarded here) | |
| 132 pytest.main([__file__]) |
