comparison tests/test_blast_annotations_processor.py @ 2:9ca209477dfd draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
| author | onnodg |
|---|---|
| date | Mon, 15 Dec 2025 16:43:36 +0000 |
| parents | a3989edf0a4a |
| children | |
| 1:2acf82433aa4 | 2:9ca209477dfd |
|---|---|
| 1 """ | 1 """ |
| 2 Test suite for BLAST annotation processor. | 2 Test suite for BLAST annotation processor. |
| 3 """ | 3 """ |
| 4 | 4 import re |
| 5 import ast | |
| 5 import pytest | 6 import pytest |
| 6 import os | 7 import os |
| 7 import sys | 8 import sys |
| 8 import json | 9 import json |
| 9 import pandas as pd | 10 import pandas as pd |
| 11 | 12 |
| 12 # Add the module to path for importing | 13 # Add the module to path for importing |
| 13 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | 14 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 14 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import ( | 15 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import ( |
| 15 process_single_file, | 16 process_single_file, |
| 16 resolve_taxon_majority, | 17 resolve_tax_majority, |
| 17 TAXONOMIC_LEVELS | 18 TAXONOMIC_LEVELS, |
| 19 check_header_string | |
| 18 ) | 20 ) |
| 19 | 21 |
| 20 | 22 |
| 21 class TestBlastAnnotationProcessor: | 23 class TestBlastAnnotationProcessor: |
| 22 """Test class for BLAST annotation processor""" | 24 """Test class for BLAST annotation processor""" |
| 25 def test_data_dir(self): | 27 def test_data_dir(self): |
| 26 """Setup test data directory structure""" | 28 """Setup test data directory structure""" |
| 27 base_dir = Path("test-data") | 29 base_dir = Path("test-data") |
| 28 base_dir.mkdir(exist_ok=True) | 30 base_dir.mkdir(exist_ok=True) |
| 29 | 31 |
| 30 # Create subdirectories | |
| 31 for subdir in ["input", "expected", "output"]: | 32 for subdir in ["input", "expected", "output"]: |
| 32 (base_dir / subdir).mkdir(exist_ok=True) | 33 (base_dir / subdir).mkdir(exist_ok=True) |
| 33 | 34 |
| 34 return base_dir | 35 return base_dir |
| 35 | 36 |
| 36 @pytest.fixture(scope="class") | 37 @pytest.fixture(scope="class") |
| 37 def sample_files(self, test_data_dir): | 38 def sample_files(self, test_data_dir): |
| 38 """Create sample input files for testing""" | 39 """Create sample input files for testing""" |
| 39 input_dir = test_data_dir / "input" | 40 input_dir = test_data_dir / "input" |
| 40 | 41 |
| 41 # Sample annotated BLAST file | |
| 42 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | 42 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy |
| 43 read1(100) subject2 subject2 subject2 90.0 95 1e-45 180 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis | 43 read1(100) subject2 id2 subject2 90.0 95 1e-45 180 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis |
| 44 read1(100) subject1 subject1 subject1 95.889 100 1e-50 200 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis | 44 read1(100) subject1 id1 subject1 95.889 100 1e-50 200 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis |
| 45 read2(50) subject3 subject3 subject3 85.0 90 1e-40 160 database2 Bacteria / Proteobacteria / Gammaproteobacteria / Enterobacterales / Enterobacteriaceae / Escherichia / Escherichia_coli | 45 read2(50) subject3 id3 subject3 85.0 90 1e-40 160 database2 Bacteria / Proteobacteria / Gammaproteobacteria / Enterobacterales / Enterobacteriaceae / Escherichia / Escherichia_coli |
| 46 read3(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii | 46 read3(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii |
| 47 read4(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_blabla | 47 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_blabla |
| 48 read4(25) subject4 subject4 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi | 48 read4(25) subject4 id4.1 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi |
| 49 read4(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi | 49 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi |
| 50 read4(25) subject4 subject4 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii | 50 read4(25) subject4 id4.2 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii |
| 51 """ | 51 """ |
| 52 | 52 |
| 53 # Sample unannotated FASTA file (headers must match BLAST q_id) | |
| 54 fasta_content = """>read1(100) count=100; | 53 fasta_content = """>read1(100) count=100; |
| 55 ATCGATCGATCGATCG | 54 ATCGATCGATCGATCG |
| 56 >read2(50) count=50; | 55 >read2(50) count=50; |
| 57 GCTAGCTAGCTAGCTA | 56 GCTAGCTAGCTAGCTA |
| 58 >read3(25) count=25; | 57 >read3(25) count=25; |
| 77 @pytest.fixture(scope="class") | 76 @pytest.fixture(scope="class") |
| 78 def processed_output(self, test_data_dir, sample_files): | 77 def processed_output(self, test_data_dir, sample_files): |
| 79 """Run the processor on sample files and return output paths""" | 78 """Run the processor on sample files and return output paths""" |
| 80 output_dir = test_data_dir / "output" | 79 output_dir = test_data_dir / "output" |
| 81 | 80 |
| 82 # Create arguments object | |
| 83 class Args: | 81 class Args: |
| 84 def __init__(self): | 82 def __init__(self): |
| 85 self.input_anno = sample_files['blast'] | 83 self.input_anno = sample_files['blast'] |
| 86 self.input_unanno = sample_files['fasta'] | 84 self.input_unanno = sample_files['fasta'] |
| 87 self.eval_plot = str(output_dir / "eval_plot.png") | 85 self.eval_plot = str(output_dir / "eval_plot.png") |
| 88 self.taxa_output = str(output_dir / "taxa_output.txt") | 86 self.taxa_output = str(output_dir / "taxa_output.txt") |
| 89 self.circle_data = str(output_dir / "circle_data.json") | 87 self.circle_data = str(output_dir / "circle_data.json") |
| 90 self.header_anno = str(output_dir / "header_anno.xlsx") | 88 self.header_anno = str(output_dir / "header_anno.xlsx") |
| 91 self.anno_stats = str(output_dir / "anno_stats.txt") | 89 self.anno_stats = str(output_dir / "anno_stats.txt") |
| 90 self.filtered_fasta = str(output_dir / "filtered_fasta.fasta") | |
| 91 self.log = str(output_dir / "log.txt") | |
| 92 self.uncertain_threshold = 0.9 | 92 self.uncertain_threshold = 0.9 |
| 93 self.eval_threshold = 1e-10 | 93 self.eval_threshold = 1e-10 |
| 94 self.min_bitscore = 60 | |
| 95 self.min_support = 1 | |
| 96 self.ignore_rank = 'unknown' | |
| 97 self.ignore_taxonomy = 'environmental' | |
| 98 self.bitscore_perc_cutoff = 10 | |
| 99 self.ignore_obiclean_type ='singleton' | |
| 100 self.ignore_illuminapairend_type = 'pairend' | |
| 101 self.min_identity = 70 | |
| 102 self.min_coverage = 70 | |
| 103 self.ignore_seqids = '' | |
| 94 self.use_counts = True | 104 self.use_counts = True |
| 95 | 105 |
| 96 args = Args() | 106 args = Args() |
| 97 | 107 log_messages = [] |
| 98 # Process the files | 108 process_single_file(args.input_anno, args.input_unanno, args, log_messages) |
| 99 process_single_file(args.input_anno, args.input_unanno, args) | |
| 100 | 109 |
| 101 return args | 110 return args |
| 102 | 111 |
| 103 def test_data_integrity_best_values(self, processed_output): | |
| 104 """ | |
| 105 Test 1: Data Integrity - Best Values Selection | |
| 106 | |
| 107 Verifies that for each read, the best e-value corresponds to the correct | |
| 108 bitscore, identity, coverage, and taxonomic annotation. | |
| 109 """ | |
| 110 # Read the Excel output to verify the best values are correctly selected | |
| 111 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') | |
| 112 print(df) | |
| 113 # For read1(100), verify best e-value (1e-50) corresponds to correct values | |
| 114 read1_row = df[df['header'].str.contains('read1')].iloc[0] | |
| 115 assert read1_row['bitscore'] == float(200), "best bitscore doesn't match" | |
| 116 assert read1_row['e_value'] == pytest.approx(1e-50, rel=1e-8, abs=1e-49), "Best e-value not correctly selected for read1" | |
| 117 assert read1_row['identity percentage'] == float(95.889), "Identity doesn't match best bitscore for read1" | |
| 118 assert 'Bacillus_subtilis' in read1_row['taxa'], "Taxa doesn't match best hit for read1" | |
| 119 | |
| 120 read4_row = df[df['header'].str.contains('read4')].iloc[0] | |
| 121 assert read4_row['bitscore'] == float(160), "best bitscore doesn't match" | |
| 122 assert 'Methanobrevibacter_smithii' in read4_row['taxa'], "Taxa doesn't match best hit for read1" | |
| 123 print("✓ Test 1 PASSED: Best values correctly associated for each read") | |
| 124 | 112 |
| 125 def test_read_count_consistency(self, processed_output): | 113 def test_read_count_consistency(self, processed_output): |
| 126 """ | 114 """ |
| 127 Test 2: Read Count Consistency | 115 Test 1: Read Count Consistency |
| 128 | 116 |
| 129 Verifies that read counts from FASTA headers are correctly preserved | 117 Verifies that read counts from FASTA headers are correctly preserved |
| 130 and aggregated in all output files. | 118 and aggregated in all output files. |
| 131 """ | 119 """ |
| 132 # Check Excel output | |
| 133 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') | 120 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') |
| 134 # Verify counts are correctly extracted and preserved | |
| 135 expected_counts = {'read1': 100, 'read2': 50, 'read3': 25, 'read4':25} | 121 expected_counts = {'read1': 100, 'read2': 50, 'read3': 25, 'read4':25} |
| 136 | 122 |
| 137 skipped_reads = [] | 123 skipped_reads = [] |
| 138 | 124 |
| 139 for read_name, expected_count in expected_counts.items(): | 125 for read_name, expected_count in expected_counts.items(): |
| 142 skipped_reads.append(read_name) # remember we skip this read | 128 skipped_reads.append(read_name) # remember we skip this read |
| 143 continue | 129 continue |
| 144 row = subset.iloc[0] | 130 row = subset.iloc[0] |
| 145 assert row['count'] == expected_count, f"Count mismatch for {read_name}" | 131 assert row['count'] == expected_count, f"Count mismatch for {read_name}" |
| 146 | 132 |
| 147 # Check annotation stats | |
| 148 with open(processed_output.anno_stats, 'r') as f: | 133 with open(processed_output.anno_stats, 'r') as f: |
| 149 stats_content = f.read() | 134 stats_content = f.read() |
| 150 # Total unique count should be 200 (100+50+25+25) | 135 # Total unique count should be 200 (100+50+25+25) |
| 151 assert 'total_unique\t200' in stats_content, "Total unique count incorrect in stats" | 136 assert 'total_unique: 200' in stats_content, "Total unique count incorrect in stats" |
| 152 if skipped_reads: | 137 if skipped_reads: |
| 153 assert all(read not in df['header'].values for read in skipped_reads) | 138 assert all(read not in df['header'].values for read in skipped_reads) |
| 154 print("✓ Test 2 PASSED: Read counts consistent across all outputs") | 139 print("✓ Test 1 PASSED: Read counts consistent across all outputs") |
| 155 | 140 |
| 156 def test_lowest_common_ancestor(self, processed_output): | 141 def test_lowest_common_ancestor(self, processed_output): |
| 157 """ | 142 """ |
| 158 Test 3: Lowest Common Ancestor Selection | 143 Test 2: Lowest Common Ancestor Selection |
| 159 | 144 |
| 160 Tests lowest common ancestor selection on a realistic set of conflicting taxonomies | 145 Tests lowest common ancestor selection on a realistic set of conflicting taxonomies |
| 161 """ | 146 """ |
| 162 # Test the function directly with known conflicts | |
| 163 test_conflicts = { | 147 test_conflicts = { |
| 164 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 10, | 148 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 10, |
| 165 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 1, | 149 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 1, |
| 166 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita c': 1, | 150 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita c': 1, |
| 167 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita d': 1, | 151 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita d': 1, |
| 168 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita e': 1, | 152 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita e': 1, |
| 169 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia a': 187, | 153 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia a': 450, |
| 170 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia b': 2, | 154 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia b': 2, |
| 171 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia c': 2, | 155 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia c': 2, |
| 172 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia d': 2, | 156 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia d': 2, |
| 173 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia e': 2, | 157 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia e': 2, |
| 174 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia f': 12, | 158 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia f': 12, |
| 175 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 | 159 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 |
| 176 } | 160 } |
| 177 resolved_short, resolved_long = resolve_taxon_majority(test_conflicts, 0.9) | 161 resolved_short1, resolved_long1 = resolve_tax_majority(test_conflicts, 0.9) |
| 178 assert 'Ciceronia a' in resolved_short, "Majority taxon not selected" | 162 assert 'Ciceronia a' in resolved_short1, "Majority taxon not selected" |
| 179 | 163 |
| 180 test_90_percent_conflicts = { | 164 test_90_percent_conflicts = { |
| 181 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 90, | 165 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 90, |
| 182 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 10, | 166 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 10, |
| 183 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 | 167 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 |
| 184 } | 168 } |
| 185 resolved_short, resolved_long = resolve_taxon_majority(test_90_percent_conflicts, 0.9) | 169 resolved_short, resolved_long = resolve_tax_majority(test_90_percent_conflicts, 0.9) |
| 186 assert 'Cicerbita a' in resolved_short, "Conflict not resolved to uncertain taxa" | 170 assert 'Viridiplantae / Streptophyta / Uncertain taxa' in resolved_long, "Conflict not resolved to uncertain taxa" |
| 187 | 171 |
| 188 print("✓ Test 3 PASSED: Lowest common ancestor works correctly") | 172 print("✓ Test 2 PASSED: Lowest common ancestor works correctly") |
| 189 | 173 |
| 190 | 174 |
| 191 def test_taxonomic_conflict_resolution(self, processed_output): | 175 def test_taxonomic_conflict_resolution(self, processed_output): |
| 192 """ | 176 """ |
| 193 Test 4: Taxonomic Conflict Resolution | 177 Test 3: Taxonomic Conflict Resolution |
| 194 | 178 |
| 195 Tests the uncertainty threshold mechanism for resolving taxonomic conflicts. | 179 Tests the uncertainty threshold mechanism for resolving taxonomic conflicts. |
| 196 Uses a controlled scenario where multiple hits have different taxa. | 180 Uses a controlled scenario where multiple hits have different taxa. |
| 197 """ | 181 """ |
| 198 # Test the function directly with known conflicts | |
| 199 test_conflicts = { | 182 test_conflicts = { |
| 200 'Bacteria / Firmicutes / Bacilli': 2, | 183 'Bacteria / Firmicutes / Bacilli': 2, |
| 201 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 | 184 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 |
| 202 } | 185 } |
| 203 | 186 |
| 204 resolved_short, resolved_long = resolve_taxon_majority(test_conflicts, 0.9) | 187 resolved_short, resolved_long = resolve_tax_majority(test_conflicts, 0.9) |
| 205 | 188 |
| 206 # With threshold 0.9, the majority share (2/3 = 0.67) is below the threshold, so the result is uncertain | 189 # With threshold 0.9, the majority share (2/3 = 0.67) is below the threshold, so the result is uncertain |
| 207 assert 'Uncertain taxa' in resolved_short, "Conflict not resolved to uncertain taxa" | 190 assert 'Uncertain taxa' in resolved_short, "Conflict not resolved to uncertain taxa" |
| 208 | 191 |
| 209 # Test with higher confidence | |
| 210 test_high_confidence = { | 192 test_high_confidence = { |
| 211 'Bacteria / Firmicutes / Bacilli': 9, | 193 'Bacteria / Firmicutes / Bacilli': 9, |
| 212 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 | 194 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 |
| 213 } | 195 } |
| 214 | 196 |
| 215 resolved_short, resolved_long = resolve_taxon_majority(test_high_confidence, 0.9) | 197 resolved_short, resolved_long = resolve_tax_majority(test_high_confidence, 0.9) |
| 216 assert 'Firmicutes' in resolved_short, "High confidence case not resolved correctly" | 198 assert 'Firmicutes' in resolved_short, "High confidence case not resolved correctly" |
| 217 | 199 |
| 218 print("✓ Test 4 PASSED: Taxonomic conflict resolution working correctly") | 200 print("✓ Test 3 PASSED: Taxonomic conflict resolution working correctly") |
| 219 | 201 |
| 220 def test_output_file_structures(self, processed_output): | 202 def test_output_file_structures(self, processed_output): |
| 221 """ | 203 """ |
| 222 Test 5: Output File Structure Validation | 204 Test 4: Output File Structure Validation |
| 223 | 205 |
| 224 Verifies that all output files are created with correct structure and format. | 206 Verifies that all output files are created with correct structure and format. |
| 225 """ | 207 """ |
| 226 # Test Excel file structure | |
| 227 excel_file = processed_output.header_anno | 208 excel_file = processed_output.header_anno |
| 228 assert os.path.exists(excel_file), "Excel output file not created" | 209 assert os.path.exists(excel_file), "Excel output file not created" |
| 229 | 210 |
| 230 # Check both sheets exist | |
| 231 xl_file = pd.ExcelFile(excel_file) | 211 xl_file = pd.ExcelFile(excel_file) |
| 232 expected_sheets = ['Individual_Reads', 'Merged_by_Taxa'] | 212 expected_sheets = ['Individual_Reads', 'Merged_by_Taxa'] |
| 233 assert all(sheet in xl_file.sheet_names for sheet in expected_sheets), "Missing Excel sheets" | 213 assert all(sheet in xl_file.sheet_names for sheet in expected_sheets), "Missing Excel sheets" |
| 234 | 214 |
| 235 # Test Individual_Reads sheet structure | |
| 236 df_individual = pd.read_excel(excel_file, sheet_name='Individual_Reads') | 215 df_individual = pd.read_excel(excel_file, sheet_name='Individual_Reads') |
| 237 expected_cols = ['header', 'e_value', 'identity percentage', 'coverage', | 216 expected_cols = ['header', 'seq_id', 'source', 'count', 'taxa', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'] |
| 238 'bitscore', 'count', 'source', 'taxa'] | |
| 239 assert all(col in df_individual.columns for col in expected_cols), "Missing columns in Individual_Reads" | 217 assert all(col in df_individual.columns for col in expected_cols), "Missing columns in Individual_Reads" |
| 240 | 218 |
| 241 # Test taxa output structure | |
| 242 with open(processed_output.taxa_output, 'r') as f: | 219 with open(processed_output.taxa_output, 'r') as f: |
| 243 taxa_lines = f.readlines() | 220 taxa_lines = f.readlines() |
| 244 | 221 assert len(taxa_lines) == 2, "Taxa output should contain exactly 2 lines" |
| 245 # Should have header line and data lines | |
| 246 assert len(taxa_lines) > 2, "Taxa output too short" | |
| 247 assert 'percentage_rooted\tnumber_rooted' in taxa_lines[1], "Taxa output header incorrect" | 222 assert 'percentage_rooted\tnumber_rooted' in taxa_lines[1], "Taxa output header incorrect" |
| 248 | 223 |
| 249 # Test circle data JSON structure | 224 with open(processed_output.anno_stats, 'r') as f: |
| 225 anno_stats = f.readlines() | |
| 226 assert 'FASTA: headers kept after filters and min_support=1: 4\n' in anno_stats, "FASTA filter summary line missing from anno_stats" |
| 227 filter_f = 4 | |
| 228 | |
| 229 | |
| 250 with open(processed_output.circle_data, 'r') as f: | 230 with open(processed_output.circle_data, 'r') as f: |
| 251 circle_data = json.load(f) | 231 circle_data = json.load(f) |
| 252 | 232 |
| 253 assert isinstance(circle_data, list), "Circle data should be a list" | 233 assert isinstance(circle_data, list), "Circle data should be a list" |
| 254 assert len(circle_data) == len(TAXONOMIC_LEVELS), "Circle data should have entry per taxonomic level" | 234 assert len(circle_data) == len(TAXONOMIC_LEVELS), "Circle data should have entry per taxonomic level" |
| 255 | 235 |
| 256 print("✓ Test 5 PASSED: All output files have correct structure") | 236 with open(processed_output.filtered_fasta, 'r') as f: |
| 257 | 237 filtered_fasta = f.readlines() |
| 258 def test_evalue_filtering(self, test_data_dir): | 238 assert len(filtered_fasta) == filter_f * 2 |
| 259 """ | 239 |
| 260 Test 6: E-value Threshold Filtering | 240 print("✓ Test 4 PASSED: All output files have correct structure") |
| 261 | 241 |
| 262 Tests that hits above the e-value threshold are correctly filtered out. | 242 |
| 263 """ | |
| 264 input_dir = test_data_dir / "input" | |
| 265 output_dir = test_data_dir / "output" | |
| 266 | |
| 267 # Create test file with mix of good and bad e-values | |
| 268 blast_content_mixed = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | |
| 269 read1(100) subject1 95.0 100 50 75 1e-50 200 database1 Viridiplantae / Streptophyta / Magnoliopsida / Fagales / Juglandaceae / Uncertain taxa / Uncertain taxa | |
| 270 read1(100) subject2 90.0 95 45 70 1e-5 180 database1 Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Rosaceae / Sorbus / Sorbus aucuparia | |
| 271 read2(50) subject3 85.0 90 40 65 1e-3 160 database2 Viridiplantae / Streptophyta / Magnoliopsida / Solanales / Solanaceae / Uncertain taxa / Uncertain taxa | |
| 272 """ | |
| 273 | |
| 274 fasta_content = """>read1(100) count=100; | |
| 275 ATCG | |
| 276 >read2(50) count=50; | |
| 277 GCTA | |
| 278 """ | |
| 279 | |
| 280 blast_file = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sorted_test.tabular" | |
| 281 fasta_file = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sorted_test.fasta" | |
| 282 | |
| 283 with open(blast_file, 'w') as f: | |
| 284 f.write(blast_content_mixed) | |
| 285 with open(fasta_file, 'w') as f: | |
| 286 f.write(fasta_content) | |
| 287 | |
| 288 # Process with strict e-value threshold | |
| 289 class Args: | |
| 290 def __init__(self): | |
| 291 self.input_anno = str(blast_file) | |
| 292 self.input_unanno = str(fasta_file) | |
| 293 self.header_anno = str(output_dir / "evalue_test.xlsx") | |
| 294 self.eval_plot = None | |
| 295 self.taxa_output = None | |
| 296 self.circle_data = None | |
| 297 self.anno_stats = None | |
| 298 self.uncertain_threshold = 0.9 | |
| 299 self.eval_threshold = 1e-10 # Very strict threshold | |
| 300 self.use_counts = True | |
| 301 | |
| 302 args = Args() | |
| 303 process_single_file(args.input_anno, args.input_unanno, args) | |
| 304 | |
| 305 # Check that only the 1e-50 hit remains | |
| 306 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads') | |
| 307 | |
| 308 # Should only have read1 (with 1e-50), read2 should be filtered out | |
| 309 assert len(df) == 1, f"Expected 1 read after filtering, got {len(df)}" | |
| 310 assert df.iloc[0]['e_value'] == pytest.approx(1e-50, rel=1e-8, abs=1e-12), "Wrong hit survived e-value filtering" | |
| 311 | |
| 312 print("✓ Test 6 PASSED: E-value filtering working correctly") | |
| 313 | 243 |
| 314 def test_header_synchronization(self, test_data_dir): | 244 def test_header_synchronization(self, test_data_dir): |
| 315 """ | 245 """ |
| 316 Test 7: Header Synchronization Between Files | 246 Test 5: Header Synchronization Between Files |
| 317 | 247 |
| 318 Tests that the processor correctly handles mismatched headers between | 248 Tests that the processor correctly handles mismatched headers between |
| 319 annotated and unannotated files. | 249 annotated and unannotated files. |
| 320 """ | 250 """ |
| 321 input_dir = test_data_dir / "input" | 251 input_dir = test_data_dir / "input" |
| 323 | 253 |
| 324 # Create mismatched files | 254 # Create mismatched files |
| 325 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy | 255 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy |
| 326 read1(100) source=NCBI sequenceID=KR738003 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-30 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum | 256 read1(100) source=NCBI sequenceID=KR738003 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-30 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum |
| 327 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum | 257 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum |
| 258 read2.1(50) 1 2 3 4 5 6 7 8 9 | |
| 328 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum | 259 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum |
| 329 """ | 260 """ |
| 330 | 261 |
| 331 fasta_content = """>read1(100) count=100; | 262 fasta_content = """>read1(100) count=100; |
| 332 ATCG | 263 ATCG |
| 350 self.input_unanno = fasta_file | 281 self.input_unanno = fasta_file |
| 351 self.header_anno = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sync_test.xlsx" | 282 self.header_anno = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sync_test.xlsx" |
| 352 self.eval_plot = None | 283 self.eval_plot = None |
| 353 self.taxa_output = None | 284 self.taxa_output = None |
| 354 self.circle_data = None | 285 self.circle_data = None |
| 286 self.filtered_fasta = str(output_dir / "filtered_fasta.fasta") | |
| 355 self.anno_stats = str(output_dir / "sync_stats.txt") | 287 self.anno_stats = str(output_dir / "sync_stats.txt") |
| 288 self.log = str(output_dir / "log.txt") | |
| 356 self.uncertain_threshold = 0.9 | 289 self.uncertain_threshold = 0.9 |
| 357 self.eval_threshold = 1e-10 | 290 self.eval_threshold = 1e-10 |
| 358 self.use_counts = True | 291 self.use_counts = True |
| 292 self.min_bitscore = 50 | |
| 293 self.min_support = 1 | |
| 294 self.ignore_rank = 'unknown' | |
| 295 self.ignore_taxonomy = 'environmental' | |
| 296 self.bitscore_perc_cutoff = 10 | |
| 297 self.ignore_obiclean_type = 'singleton' | |
| 298 self.ignore_illuminapairend_type = 'pairend' | |
| 299 self.min_identity = 30 | |
| 300 self.min_coverage = 30 | |
| 301 self.ignore_seqids = '' | |
| 359 | 302 |
| 360 args = Args() | 303 args = Args() |
| 361 process_single_file(args.input_anno, args.input_unanno, args) | 304 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[]) |
| 362 # Check that processing handled the mismatch correctly | |
| 363 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads') | 305 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads') |
| 364 extracted = df['header'].str.extract(r'(read\d+)') | 306 extracted = df['header'].str.extract(r'(read\d+)') |
| 365 # final list | |
| 366 headers = extracted[0].tolist() | 307 headers = extracted[0].tolist() |
| 367 # Should have read1 and read3, read2 should be skipped | 308 # Should have read1 and read3, read2 should be skipped |
| 368 assert 'read1' in headers, "read1 should be present" | 309 assert 'read1' in headers, "read1 should be present" |
| 310 assert 'read2' not in headers, "read2 should not be present" | |
| 311 assert 'read2.1' not in headers, "read2.1 should not be present" | |
| 369 assert 'read3' in headers, "read3 should be present" | 312 assert 'read3' in headers, "read3 should be present" |
| 370 | 313 |
| 371 print("✓ Test 7 PASSED: Header synchronization handled correctly") | 314 print("✓ Test 5 PASSED: Header synchronization handled correctly") |
| 315 | |
| 316 def test_check_header_string_all_behaviors(self): | |
| 317 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import \ | |
| 318 check_header_string | |
| 319 | |
| 320 # clean header — allowed | |
| 321 assert check_header_string(">readA count=10;", "", "") is True | |
| 322 # blocks singleton | |
| 323 assert check_header_string(">r obiclean_status={'XXX': 's'}", "singleton", "") is False | |
| 324 # blocks variant | |
| 325 assert check_header_string(">r obiclean_status={'XXX': 'i'}", "variant", "") is False | |
| 326 # blocks head | |
| 327 assert check_header_string(">r obiclean_status={'XXX': 'h'}", "head", "") is False | |
| 328 # blocks pairend | |
| 329 assert check_header_string(">r PairEnd", "pairend", "") is False | |
| 330 # blocks consensus | |
| 331 assert check_header_string(">r CONS", "consensus", "") is False | |
| 332 # blocks custom string | |
| 333 assert check_header_string(">r FooBar", "FooBar", "") is False | |
| 334 # blocks when string is in second param | |
| 335 assert check_header_string(">r blah", "", "blah") is False | |
| 336 # blocks when multiple ignore values contain it | |
| 337 assert check_header_string(">r PairEnd obiclean_status={'XXX': 's'}", "pairend,singleton", "") is False | |
| 338 # allows when no match | |
| 339 assert check_header_string(">r something", "pairend", "") is True | |
| 372 | 340 |
| 373 def test_excel_merged_vs_individual(self, processed_output): | 341 def test_excel_merged_vs_individual(self, processed_output): |
| 374 """ | 342 """ |
| 375 Test 8: Excel Merged vs Individual Sheet Consistency | 343 Test 6: Excel Merged vs Individual Sheet Consistency |
| 376 | 344 |
| 377 Verifies that the merged sheet correctly aggregates data from the individual sheet. | 345 Verifies that the merged sheet correctly aggregates data from the individual sheet. |
| 378 """ | 346 """ |
| 379 df_individual = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') | 347 df_individual = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') |
| 380 df_merged = pd.read_excel(processed_output.header_anno, sheet_name='Merged_by_Taxa') | 348 df_merged = pd.read_excel(processed_output.header_anno, sheet_name='Merged_by_Taxa') |
| 381 | 349 |
| 382 # Count unique taxa in individual sheet | |
| 383 individual_taxa = df_individual['taxa'].nunique() | 350 individual_taxa = df_individual['taxa'].nunique() |
| 384 | 351 |
| 385 # Should match number of rows in merged sheet | |
| 386 assert len(df_merged) == individual_taxa, "Merged sheet doesn't match unique taxa count" | 352 assert len(df_merged) == individual_taxa, "Merged sheet doesn't match unique taxa count" |
| 387 | 353 |
| 388 # Check that counts are properly aggregated | 354 # Check that counts are properly aggregated |
| 389 # For taxa with multiple reads, counts should be summed | 355 # For taxa with multiple reads, counts should be summed |
| 390 for _, merged_row in df_merged.iterrows(): | 356 for _, merged_row in df_merged.iterrows(): |
| 394 expected_count = individual_rows['count'].sum() | 360 expected_count = individual_rows['count'].sum() |
| 395 actual_count = merged_row['count'] | 361 actual_count = merged_row['count'] |
| 396 | 362 |
| 397 assert actual_count == expected_count, f"Count mismatch for taxa {taxa}: expected {expected_count}, got {actual_count}" | 363 assert actual_count == expected_count, f"Count mismatch for taxa {taxa}: expected {expected_count}, got {actual_count}" |
| 398 | 364 |
| 399 print("✓ Test 8 PASSED: Excel merged sheet correctly aggregates individual data") | 365 print("✓ Test 6 PASSED: Excel merged sheet correctly aggregates individual data") |
| 400 | 366 |
| 401 def test_annotation_statistics_accuracy(self, processed_output, sample_files): | 367 def test_annotation_statistics_accuracy(self, processed_output, sample_files): |
| 402 """ | 368 """ |
| 403 Test 9: Annotation Statistics Accuracy | 369 Test 7: Annotation Statistics Accuracy |
| 404 | 370 |
| 405 Verifies that calculated annotation statistics match the actual data. | 371 Verifies that calculated annotation statistics match the actual data. |
| 406 """ | 372 Adapted for the new plain-text log file instead of tab-separated output. |
| 407 # Read stats file | 373 """ |
| 408 stats = {} | 374 stats = {} |
| 375 | |
| 409 with open(processed_output.anno_stats, 'r') as f: | 376 with open(processed_output.anno_stats, 'r') as f: |
| 410 lines = f.readlines()[1:] # Skip header | 377 lines = f.readlines() |
| 411 for line in lines: | 378 |
| 412 key, value = line.strip().split('\t') | 379 for line in lines: |
| 413 try: | 380 line = line.strip() |
| 414 stats[key] = float(value) | 381 if not line or ":" not in line: |
| 415 except ValueError: | 382 continue |
| 416 stats[key] = value | 383 |
| 417 | 384 key, value = line.split(":", 1) |
| 418 # Manual verification | 385 key = key.strip() |
| 419 assert stats['total_sequences'] == 4.0, "Total sequences count incorrect" | 386 value = value.strip() |
| 420 assert stats['annotated_sequences'] == 3.0, "Annotated sequences count incorrect" | 387 |
| 421 assert stats['total_unique'] == 200, "Total unique count incorrect" | 388 try: |
| 422 assert stats['unique_annotated'] == 150, "Unique annotated count incorrect" | 389 stats[key] = float(value) |
| 423 assert stats['percentage_annotated'] == 75.0, "Percentage annotated incorrect" | 390 except ValueError: |
| 424 assert stats['percentage_unique_annotated'] == 75.0, "Percentage unique annotated incorrect" | 391 stats[key] = value |
| 425 | 392 |
| 426 print("✓ Test 9 PASSED: Annotation statistics are accurate") | 393 assert stats["total_sequences"] == 4.0, "Total sequences count incorrect" |
| 427 | 394 assert stats["annotated_sequences"] == 3.0, "Annotated sequence count incorrect" |
| 395 assert stats["total_unique"] == 200.0, "Total unique count incorrect" | |
| 396 assert stats["unique_annotated"] == 150.0, "Unique annotated count incorrect" | |
| 397 assert stats["percentage_annotated"] == 75.0, "Percentage annotated incorrect" | |
| 398 assert stats["percentage_unique_annotated"] == 75.0, "Percentage unique annotated incorrect" | |
| 399 | |
| 400 print("✓ Test 7 PASSED: Annotation statistics are accurate") | |
| 401 | |
| 402 def test_combined_all_filters(self, test_data_dir): | |
| 403 """ | |
| 404 Single integrated test that validates all FASTA + BLAST filter rules. | |
| 405 Every read is designed to fail exactly one filter, except the two readOK reads. |
| 406 """ | |
| 407 input_dir = test_data_dir / "input" | |
| 408 output_dir = test_data_dir / "output" | |
| 409 | |
| 410 fasta = input_dir / "combined_filters.fasta" | |
| 411 blast = input_dir / "combined_filters.tabular" | |
| 412 | |
| 413 fasta.write_text( | |
| 414 ">lowSupport(1) count=1;\nACGT\n" | |
| 415 ">obicleanFail(10) count=10; obiclean_status={'XXX': 's'};\nACGT\n" | |
| 416 ">pairendFail_PairEnd(10) count=10;\nACGT\n" | |
| 417 ">identityFail(10) count=10;\nACGT\n" | |
| 418 ">coverageFail(10) count=10;\nACGT\n" | |
| 419 ">bitscoreFail(10) count=10;\nACGT\n" | |
| 420 ">bscutoffHigh(10) count=10;\nACGT\n" | |
| 421 ">envTaxFail(10) count=10;\nACGT\n" | |
| 422 ">rankFail(10) count=10;\nACGT\n" | |
| 423 ">seqidFail(10) count=10;\nACGT\n" | |
| 424 ">readOK(10) count=10;\nACGT\n" | |
| 425 ">readOK_multiple_id(10) count=10;\nACGT\n" | |
| 426 ) | |
| 427 | |
| 428 blast.write_text( | |
| 429 # min_support (count=1 < 5) | |
| 430 "lowSupport(1)\ts\tid1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 431 | |
| 432 # ignore_obiclean_type = singleton | |
| 433 "obicleanFail(10)\ts\tid2\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 434 | |
| 435 # ignore_illuminapairend_type = pairend |
| 436 "pairendFail_PairEnd(10)\ts\tid3\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 437 | |
| 438 # min_identity = 90 → identity = 50 fails | |
| 439 "identityFail(10)\ts\tid4\t123\t50\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 440 | |
| 441 # min_coverage = 50 → coverage = 20 fails | |
| 442 "coverageFail(10)\ts\tid5\t123\t99\t20\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 443 | |
| 444 # min_bitscore = 60 → bitscore = 10 fails | |
| 445 "bitscoreFail(10)\ts\tid6\t123\t99\t99\t1e-50\t10\tsrc\tA / B / C / D / E / F / G\n" | |
| 446 | |
| 447 # bitscore_perc_cutoff: best = 200 → cutoff = 180 → bitscore 150 fails | |
| 448 "bscutoffHigh(10)\ts\tid7.1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G.1\n" | |
| 449 "bscutoffHigh(10)\ts\tid7.2\t123\t99\t99\t1e-50\t150\tsrc\tA / B / C / D / E / F / G.2\n" | |
| 450 | |
| 451 # ignore_taxonomy = 'environmental' | |
| 452 "envTaxFail(10)\ts\tid8\t123\t99\t99\t1e-50\t200\tsrc\tEnvironmental / B / C / D / E / F / G\n" | |
| 453 | |
| 454 # ignore_rank = 'unknown' | |
| 455 "rankFail(10)\ts\tid9\t123\t99\t99\t1e-50\t200\tsrc\tUnknown / B / C / D / E / F / G\n" | |
| 456 | |
| 457 # ignore_seqids = BADSEQ | |
| 458 "seqidFail(10)\ts\tBADSEQ\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 459 | |
| 460 # readOK (valid, full taxonomy) | |
| 461 "readOK(10)\ts\tidGood\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 462 | |
| 463 # readOK_multiple_id (valid, full taxonomy, multiple id's) | |
| 464 "readOK_multiple_id(10)\ts\tidGood.1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 465 "readOK_multiple_id(10)\ts\tidGood.2\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n" | |
| 466 ) | |
| 467 | |
| 468 class Args: | |
| 469 def __init__(self): | |
| 470 self.input_anno = str(blast) | |
| 471 self.input_unanno = str(fasta) | |
| 472 self.header_anno = str(output_dir / "combined.xlsx") | |
| 473 self.filtered_fasta = str(output_dir / "combined.fasta") | |
| 474 self.anno_stats = str(output_dir / "combined_stats.txt") | |
| 475 self.eval_plot = None | |
| 476 self.taxa_output = None | |
| 477 self.circle_data = None | |
| 478 self.log = str(output_dir / "log.txt") | |
| 479 self.uncertain_threshold = 0.9 | |
| 480 self.eval_threshold = 1e-10 | |
| 481 self.use_counts = True | |
| 482 self.min_bitscore = 60 | |
| 483 self.min_support = 5 | |
| 484 self.ignore_rank = 'unknown' | |
| 485 self.ignore_taxonomy = 'environmental' | |
| 486 self.bitscore_perc_cutoff = 10 | |
| 487 self.ignore_obiclean_type = 'singleton' | |
| 488 self.ignore_illuminapairend_type = 'pairend' | |
| 489 self.min_identity = 90 | |
| 490 self.min_coverage = 50 | |
| 491 self.ignore_seqids = 'BADSEQ' | |
| 492 | |
| 493 args = Args() | |
| 494 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[]) | |
| 495 | |
| 496 with open(args.filtered_fasta) as f: | |
| 497 headers = [l.strip() for l in f if l.startswith(">")] | |
| 498 assert '>obicleanFail(10) count=10;' not in headers | |
| 499 assert '>pairendFail_PairEnd(10) count=10;' not in headers | |
| 500 assert len(headers) == 9, "FASTA filtering only applies to header-based rules" | |
| 501 | |
| 502 df = pd.read_excel(args.header_anno, sheet_name="Individual_Reads") | |
| 503 seq_ids = { | |
| 504 sid | |
| 505 for val in df["seq_id"] | |
| 506 for sid in (ast.literal_eval(val) if isinstance(val, str) else val) | |
| 507 } | |
| 508 | |
| 509 expected = {'idGood.1', 'idGood', 'id7.1', 'idGood.2'} | |
| 510 | |
| 511 assert seq_ids == expected, f"Expected surviving seq_ids {expected}, got {seq_ids}" | |
| 512 | |
| 513 | |
| 514 def test_log_filters_count(self, processed_output): | |
| 515 """ | |
| 516 Verify that the BLAST filter counters in the log file match expected structure. | |
| 517 """ | |
| 518 with open(processed_output.anno_stats) as f: | |
| 519 log = f.read() | |
| 520 | |
| 521 assert "=== PARAMETERS USED ===" in log | |
| 522 assert "input_anno:" in log | |
| 523 assert "input_unanno:" in log | |
| 524 | |
| 525 assert "FASTA: total headers: 4" in log | |
| 526 assert "FASTA: headers kept after filters" in log | |
| 527 | |
| 528 assert "BLAST: total hits read: 8" in log | |
| 529 assert "BLAST: hits kept after quality filters: 7" in log | |
| 530 | |
| 531 assert "ANNOTATION: total FASTA headers considered: 4" in log | |
| 532 assert "ANNOTATION: reads with BLAST hits: 3" in log | |
| 533 assert "ANNOTATION: reads without BLAST hits: 1" in log | |
| 534 | |
| 535 assert "E-value plot written to:" in log | |
| 536 assert "Taxa summary written to:" in log | |
| 537 assert "Header annotations written to:" in log | |
| 538 assert "Circle diagram JSON written to:" in log | |
| 539 | |
| 540 assert "=== ANNOTATION STATISTICS ===" in log | |
| 541 assert "percentage_annotated: 75.0" in log | |
| 542 assert "unique_annotated: 150" in log | |
| 543 assert "total_unique: 200" in log | |
| 544 | |
| 545 def test_missing_blast_file_graceful(self, test_data_dir): | |
| 546 """ | |
| 547 Crash / robustness test. | |
| 548 | |
| 549 When the BLAST file does NOT exist, the processor should: | |
| 550 - not crash | |
| 551 - write an anno_stats log mentioning the error | |
| 552 - return without creating header_anno | |
| 553 """ | |
| 554 input_dir = test_data_dir / "input" | |
| 555 output_dir = test_data_dir / "output" | |
| 556 | |
| 557 fasta = input_dir / "missing_blast_test.fasta" | |
| 558 fasta.write_text(">read1(10) count=10;\nACGT\n") | |
| 559 | |
| 560 missing_blast = input_dir / "this_file_does_not_exist.tabular" | |
| 561 | |
| 562 class Args: | |
| 563 def __init__(self): | |
| 564 self.input_anno = str(missing_blast) | |
| 565 self.input_unanno = str(fasta) | |
| 566 self.header_anno = str(output_dir / "missing_blast_header.xlsx") | |
| 567 self.filtered_fasta = str(output_dir / "missing_blast_filtered.fasta") | |
| 568 self.anno_stats = str(output_dir / "missing_blast_stats.txt") | |
| 569 self.eval_plot = None | |
| 570 self.taxa_output = None | |
| 571 self.circle_data = None | |
| 572 self.log = str(output_dir / "log.txt") | |
| 573 self.uncertain_threshold = 0.9 | |
| 574 self.eval_threshold = 1e-10 | |
| 575 self.use_counts = True | |
| 576 self.min_bitscore = 0 | |
| 577 self.min_support = 1 | |
| 578 self.ignore_rank = 'unknown' | |
| 579 self.ignore_taxonomy = 'environmental' | |
| 580 self.bitscore_perc_cutoff = 10 | |
| 581 self.ignore_obiclean_type = 'singleton' | |
| 582 self.ignore_illuminapairend_type = 'pairend' | |
| 583 self.min_identity = 0 | |
| 584 self.min_coverage = 0 | |
| 585 self.ignore_seqids = '' | |
| 586 | |
| 587 args = Args() | |
| 588 | |
| 589 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[]) | |
| 590 | |
| 591 assert not os.path.exists(args.header_anno), "Header file should not be created when BLAST is missing" | |
| 592 | |
| 593 assert os.path.exists(args.anno_stats), "anno_stats log should be created on error" | |
| 594 | |
| 595 from pathlib import Path | |
| 596 log_text = Path(args.anno_stats).read_text() | |
| 597 assert "Error: Input file" in log_text, "Missing BLAST file error not logged" | |
| 598 assert "Starting processing for FASTA" in log_text, "FASTA processing log missing" | |
| 428 | 599 |
| 429 | 600 |
| 430 if __name__ == "__main__": | 601 if __name__ == "__main__": |
| 431 # Run all tests in this file | |
| 432 pytest.main([__file__]) | 602 pytest.main([__file__]) |
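The assertions in `test_lowest_common_ancestor` and `test_taxonomic_conflict_resolution` pin down the observable contract of `resolve_tax_majority` without showing its body. The sketch below is a minimal implementation consistent with those assertions; the exact short/long return split, any tie-breaking, and any padding of `resolved_long` out to all `TAXONOMIC_LEVELS` in the real processor are assumptions here, not its actual source.

```python
def resolve_tax_majority(conflicts, threshold):
    """Sketch: weighted majority vote over ' / '-separated taxonomy paths,
    falling back to the lowest common ancestor plus 'Uncertain taxa'."""
    total = sum(conflicts.values())
    best_path, best_count = max(conflicts.items(), key=lambda kv: kv[1])
    # A share that meets the threshold exactly counts as a majority
    # (9/10 resolves at threshold 0.9 in the high-confidence test case).
    if best_count / total >= threshold:
        return best_path, best_path
    # No qualifying majority: keep only the leading ranks shared by all paths.
    split_paths = [path.split(' / ') for path in conflicts]
    lca = []
    for ranks in zip(*split_paths):
        if len(set(ranks)) > 1:
            break
        lca.append(ranks[0])
    resolved = ' / '.join(lca + ['Uncertain taxa'])
    return resolved, resolved
```

On `test_90_percent_conflicts` this yields `'Viridiplantae / Streptophyta / Uncertain taxa'`, matching the `resolved_long` assertion; since the tests check membership with `in`, an implementation that pads `resolved_long` to all seven ranks would pass as well.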

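Similarly, `test_check_header_string_all_behaviors` fixes the behaviour of `check_header_string` purely by example. Below is a sketch that satisfies every assertion in that test; the keyword-to-substring mapping and the two-parameter signature are inferred from the assertions, not taken from the processor source.

```python
# Tokens inferred from the test's headers: obiclean statuses appear as
# quoted letters, while pairend/consensus markers are literal substrings.
OBICLEAN_TOKENS = {"singleton": "'s'", "variant": "'i'", "head": "'h'"}
MARKER_TOKENS = {"pairend": "PairEnd", "consensus": "CONS"}

def check_header_string(header, ignore_types, ignore_strings):
    """Sketch: return True when a FASTA header passes every ignore filter.

    Both arguments are comma-separated lists; known keywords are mapped
    to the substring they stand for, anything else is matched verbatim.
    """
    for raw in (ignore_types, ignore_strings):
        for key in (k.strip() for k in raw.split(',') if k.strip()):
            token = OBICLEAN_TOKENS.get(key) or MARKER_TOKENS.get(key, key)
            if token in header:
                return False  # header carries an ignored marker
    return True
```

Both parameters are treated uniformly here, which is what the "blocks when string is in second param" case suggests: any non-empty entry in either list rejects a header that contains the corresponding substring.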