comparison tests/test_blast_annotations_processor.py @ 2:9ca209477dfd draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:43:36 +0000
parents a3989edf0a4a
children
1:2acf82433aa4 2:9ca209477dfd
1 """ 1 """
2 Test suite for BLAST annotation processor. 2 Test suite for BLAST annotation processor.
3 """ 3 """
4 4 import re
5 import ast
5 import pytest 6 import pytest
6 import os 7 import os
7 import sys 8 import sys
8 import json 9 import json
9 import pandas as pd 10 import pandas as pd
11 12
12 # Add the module to path for importing 13 # Add the module to path for importing
13 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 14 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import ( 15 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import (
15 process_single_file, 16 process_single_file,
16 resolve_taxon_majority, 17 resolve_tax_majority,
17 TAXONOMIC_LEVELS 18 TAXONOMIC_LEVELS,
19 check_header_string
18 ) 20 )
19 21
20 22
21 class TestBlastAnnotationProcessor: 23 class TestBlastAnnotationProcessor:
22 """Test class for BLAST annotation processor""" 24 """Test class for BLAST annotation processor"""
25 def test_data_dir(self): 27 def test_data_dir(self):
26 """Setup test data directory structure""" 28 """Setup test data directory structure"""
27 base_dir = Path("test-data") 29 base_dir = Path("test-data")
28 base_dir.mkdir(exist_ok=True) 30 base_dir.mkdir(exist_ok=True)
29 31
30 # Create subdirectories
31 for subdir in ["input", "expected", "output"]: 32 for subdir in ["input", "expected", "output"]:
32 (base_dir / subdir).mkdir(exist_ok=True) 33 (base_dir / subdir).mkdir(exist_ok=True)
33 34
34 return base_dir 35 return base_dir
35 36
36 @pytest.fixture(scope="class") 37 @pytest.fixture(scope="class")
37 def sample_files(self, test_data_dir): 38 def sample_files(self, test_data_dir):
38 """Create sample input files for testing""" 39 """Create sample input files for testing"""
39 input_dir = test_data_dir / "input" 40 input_dir = test_data_dir / "input"
40 41
41 # Sample annotated BLAST file
42 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy 42 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy
43 read1(100) subject2 subject2 subject2 90.0 95 1e-45 180 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis 43 read1(100) subject2 id2 subject2 90.0 95 1e-45 180 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis
44 read1(100) subject1 subject1 subject1 95.889 100 1e-50 200 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis 44 read1(100) subject1 id1 subject1 95.889 100 1e-50 200 database1 Bacteria / Firmicutes / Bacilli / Bacillales / Bacillaceae / Bacillus / Bacillus_subtilis
45 read2(50) subject3 subject3 subject3 85.0 90 1e-40 160 database2 Bacteria / Proteobacteria / Gammaproteobacteria / Enterobacterales / Enterobacteriaceae / Escherichia / Escherichia_coli 45 read2(50) subject3 id3 subject3 85.0 90 1e-40 160 database2 Bacteria / Proteobacteria / Gammaproteobacteria / Enterobacterales / Enterobacteriaceae / Escherichia / Escherichia_coli
46 read3(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii 46 read3(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii
47 read4(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_blabla 47 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_blabla
48 read4(25) subject4 subject4 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi 48 read4(25) subject4 id4.1 subject4 80.0 85 1e-40 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_eclhi
49 read4(25) subject4 subject4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi 49 read4(25) subject4 id4 subject4 80.0 85 1e-35 140 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_elchi
50 read4(25) subject4 subject4 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii 50 read4(25) subject4 id4.2 subject4 90.0 87 1e-50 160 database1 Archaea / Euryarchaeota / Methanobacteria / Methanobacteriales / Methanobacteriaceae / Methanobrevibacter / Methanobrevibacter_smithii
51 """ 51 """
52 52
53 # Sample unannotated FASTA file (headers must match BLAST q_id)
54 fasta_content = """>read1(100) count=100; 53 fasta_content = """>read1(100) count=100;
55 ATCGATCGATCGATCG 54 ATCGATCGATCGATCG
56 >read2(50) count=50; 55 >read2(50) count=50;
57 GCTAGCTAGCTAGCTA 56 GCTAGCTAGCTAGCTA
58 >read3(25) count=25; 57 >read3(25) count=25;
77 @pytest.fixture(scope="class") 76 @pytest.fixture(scope="class")
78 def processed_output(self, test_data_dir, sample_files): 77 def processed_output(self, test_data_dir, sample_files):
79 """Run the processor on sample files and return output paths""" 78 """Run the processor on sample files and return output paths"""
80 output_dir = test_data_dir / "output" 79 output_dir = test_data_dir / "output"
81 80
82 # Create arguments object
83 class Args: 81 class Args:
84 def __init__(self): 82 def __init__(self):
85 self.input_anno = sample_files['blast'] 83 self.input_anno = sample_files['blast']
86 self.input_unanno = sample_files['fasta'] 84 self.input_unanno = sample_files['fasta']
87 self.eval_plot = str(output_dir / "eval_plot.png") 85 self.eval_plot = str(output_dir / "eval_plot.png")
88 self.taxa_output = str(output_dir / "taxa_output.txt") 86 self.taxa_output = str(output_dir / "taxa_output.txt")
89 self.circle_data = str(output_dir / "circle_data.json") 87 self.circle_data = str(output_dir / "circle_data.json")
90 self.header_anno = str(output_dir / "header_anno.xlsx") 88 self.header_anno = str(output_dir / "header_anno.xlsx")
91 self.anno_stats = str(output_dir / "anno_stats.txt") 89 self.anno_stats = str(output_dir / "anno_stats.txt")
90 self.filtered_fasta = str(output_dir / "filtered_fasta.fasta")
91 self.log = str(output_dir / "log.txt")
92 self.uncertain_threshold = 0.9 92 self.uncertain_threshold = 0.9
93 self.eval_threshold = 1e-10 93 self.eval_threshold = 1e-10
94 self.min_bitscore = 60
95 self.min_support = 1
96 self.ignore_rank = 'unknown'
97 self.ignore_taxonomy = 'environmental'
98 self.bitscore_perc_cutoff = 10
99 self.ignore_obiclean_type = 'singleton'
100 self.ignore_illuminapairend_type = 'pairend'
101 self.min_identity = 70
102 self.min_coverage = 70
103 self.ignore_seqids = ''
94 self.use_counts = True 104 self.use_counts = True
95 105
96 args = Args() 106 args = Args()
97 107 log_messages = []
98 # Process the files 108 process_single_file(args.input_anno, args.input_unanno, args, log_messages)
99 process_single_file(args.input_anno, args.input_unanno, args)
100 109
101 return args 110 return args
102 111
103 def test_data_integrity_best_values(self, processed_output):
104 """
105 Test 1: Data Integrity - Best Values Selection
106
107 Verifies that for each read, the best e-value corresponds to the correct
108 bitscore, identity, coverage, and taxonomic annotation.
109 """
110 # Read the Excel output to verify the best values are correctly selected
111 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads')
112 print(df)
113 # For read1(100), verify best e-value (1e-50) corresponds to correct values
114 read1_row = df[df['header'].str.contains('read1')].iloc[0]
115 assert read1_row['bitscore'] == float(200), "best bitscore doesn't match"
116 assert read1_row['e_value'] == pytest.approx(1e-50, rel=1e-8, abs=1e-49), "Best e-value not correctly selected for read1"
117 assert read1_row['identity percentage'] == float(95.889), "Identity doesn't match best bitscore for read1"
118 assert 'Bacillus_subtilis' in read1_row['taxa'], "Taxa doesn't match best hit for read1"
119
120 read4_row = df[df['header'].str.contains('read4')].iloc[0]
121 assert read4_row['bitscore'] == float(160), "best bitscore doesn't match"
122 assert 'Methanobrevibacter_smithii' in read4_row['taxa'], "Taxa doesn't match best hit for read1"
123 print("✓ Test 1 PASSED: Best values correctly associated for each read")
124 112
125 def test_read_count_consistency(self, processed_output): 113 def test_read_count_consistency(self, processed_output):
126 """ 114 """
127 Test 2: Read Count Consistency 115 Test 1: Read Count Consistency
128 116
129 Verifies that read counts from FASTA headers are correctly preserved 117 Verifies that read counts from FASTA headers are correctly preserved
130 and aggregated in all output files. 118 and aggregated in all output files.
131 """ 119 """
132 # Check Excel output
133 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') 120 df = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads')
134 # Verify counts are correctly extracted and preserved
135 expected_counts = {'read1': 100, 'read2': 50, 'read3': 25, 'read4':25} 121 expected_counts = {'read1': 100, 'read2': 50, 'read3': 25, 'read4':25}
136 122
137 skipped_reads = [] 123 skipped_reads = []
138 124
139 for read_name, expected_count in expected_counts.items(): 125 for read_name, expected_count in expected_counts.items():
142 skipped_reads.append(read_name) # remember we skip this read 128 skipped_reads.append(read_name) # remember we skip this read
143 continue 129 continue
144 row = subset.iloc[0] 130 row = subset.iloc[0]
145 assert row['count'] == expected_count, f"Count mismatch for {read_name}" 131 assert row['count'] == expected_count, f"Count mismatch for {read_name}"
146 132
147 # Check annotation stats
148 with open(processed_output.anno_stats, 'r') as f: 133 with open(processed_output.anno_stats, 'r') as f:
149 stats_content = f.read() 134 stats_content = f.read()
150 # Total unique count should be 200 (100+50+25+25) 135 # Total unique count should be 200 (100+50+25+25)
151 assert 'total_unique\t200' in stats_content, "Total unique count incorrect in stats" 136 assert 'total_unique: 200' in stats_content, "Total unique count incorrect in stats"
152 if skipped_reads: 137 if skipped_reads:
153 assert all(read not in df['header'].values for read in skipped_reads) 138 assert all(read not in df['header'].values for read in skipped_reads)
154 print("✓ Test 2 PASSED: Read counts consistent across all outputs") 139 print("✓ Test 1 PASSED: Read counts consistent across all outputs")
155 140
156 def test_lowest_common_ancester(self, processed_output): 141 def test_lowest_common_ancester(self, processed_output):
157 """ 142 """
158 Test 3: Big Input Files 143 Test 2: Big Input Files
159 144
160 Tests the functioning of lowest common ancestor selection with realistic input file sizes 145 Tests the functioning of lowest common ancestor selection with realistic input file sizes
161 """ 146 """
162 # Test the function directly with known conflicts
163 test_conflicts = { 147 test_conflicts = {
164 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 10, 148 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 10,
165 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 1, 149 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 1,
166 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita c': 1, 150 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita c': 1,
167 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita d': 1, 151 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita d': 1,
168 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita e': 1, 152 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita e': 1,
169 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia a': 187, 153 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia a': 450,
170 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia b': 2, 154 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia b': 2,
171 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia c': 2, 155 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia c': 2,
172 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia d': 2, 156 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia d': 2,
173 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia e': 2, 157 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia e': 2,
174 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia f': 12, 158 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Ciceronia / Ciceronia f': 12,
175 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 159 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6
176 } 160 }
177 resolved_short, resolved_long = resolve_taxon_majority(test_conflicts, 0.9) 161 resolved_short1, resolved_long1 = resolve_tax_majority(test_conflicts, 0.9)
178 assert 'Ciceronia a' in resolved_short, "Conflict not resolved to uncertain taxa" 162 assert 'Ciceronia a' in resolved_short1, "Conflict not resolved to uncertain taxa"
179 163
180 test_90_precent_conflicts = { 164 test_90_precent_conflicts = {
181 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 90, 165 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita a': 90,
182 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 10, 166 'Viridiplantae / Streptophyta / Magnoliopsida / Asterales / Asteraceae / Cicerbita / Cicerbita b': 10,
183 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6 167 'Viridiplantae / Streptophyta / Bryopsida / Funariales / Funariaceae / Funaria / Uncertain taxa': 6
184 } 168 }
185 resolved_short, resolved_long = resolve_taxon_majority(test_90_precent_conflicts, 0.9) 169 resolved_short, resolved_long = resolve_tax_majority(test_90_precent_conflicts, 0.9)
186 assert 'Cicerbita a' in resolved_short, "Conflict not resolved to uncertain taxa" 170 assert 'Viridiplantae / Streptophyta / Uncertain taxa' in resolved_long, "Conflict not resolved to uncertain taxa"
187 171
188 print("✓ Test 3 PASSED: Lowest common ancestor works correctly") 172 print("✓ Test 2 PASSED: Lowest common ancestor works correctly")
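# --- Illustrative sketch (not part of the repository) ---
# This test, and test_taxonomic_conflict_resolution below, treat
# resolve_tax_majority as taking a {lineage_string: weight} dict plus an
# uncertainty threshold and returning a (short, long) pair. A minimal resolver
# consistent with those assertions could look like the hypothetical helper
# below; the tool's real implementation may well differ.
def resolve_majority_sketch(lineage_counts, threshold, n_levels=7):
    total = sum(lineage_counts.values())
    best_lineage, best_count = max(lineage_counts.items(), key=lambda kv: kv[1])
    if best_count / total >= threshold:
        # A single lineage carries enough weight: keep it as-is.
        resolved = [p.strip() for p in best_lineage.split(" / ")]
    else:
        # Otherwise keep only the deepest prefix shared by all lineages
        # (the lowest common ancestor) and mark the remainder as uncertain.
        split = [[p.strip() for p in lin.split(" / ")] for lin in lineage_counts]
        resolved = []
        for ranks in zip(*split):
            if len(set(ranks)) == 1:
                resolved.append(ranks[0])
            else:
                break
        resolved.append("Uncertain taxa")
    short_form = " / ".join(resolved)
    long_form = " / ".join(resolved + ["Uncertain taxa"] * (n_levels - len(resolved)))
    return short_form, long_form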
189 173
190 174
191 def test_taxonomic_conflict_resolution(self, processed_output): 175 def test_taxonomic_conflict_resolution(self, processed_output):
192 """ 176 """
193 Test 4: Taxonomic Conflict Resolution 177 Test 3: Taxonomic Conflict Resolution
194 178
195 Tests the uncertainty threshold mechanism for resolving taxonomic conflicts. 179 Tests the uncertainty threshold mechanism for resolving taxonomic conflicts.
196 Uses a controlled scenario where multiple hits have different taxa. 180 Uses a controlled scenario where multiple hits have different taxa.
197 """ 181 """
198 # Test the function directly with known conflicts
199 test_conflicts = { 182 test_conflicts = {
200 'Bacteria / Firmicutes / Bacilli': 2, 183 'Bacteria / Firmicutes / Bacilli': 2,
201 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 184 'Bacteria / Proteobacteria / Gammaproteobacteria': 1
202 } 185 }
203 186
204 resolved_short, resolved_long = resolve_taxon_majority(test_conflicts, 0.9) 187 resolved_short, resolved_long = resolve_tax_majority(test_conflicts, 0.9)
205 188
206 # With threshold 0.9, the majority share (2/3 = 0.67) is below the cutoff, so the result should be uncertain 189 # With threshold 0.9, the majority share (2/3 = 0.67) is below the cutoff, so the result should be uncertain
207 assert 'Uncertain taxa' in resolved_short, "Conflict not resolved to uncertain taxa" 190 assert 'Uncertain taxa' in resolved_short, "Conflict not resolved to uncertain taxa"
208 191
209 # Test with higher confidence
210 test_high_confidence = { 192 test_high_confidence = {
211 'Bacteria / Firmicutes / Bacilli': 9, 193 'Bacteria / Firmicutes / Bacilli': 9,
212 'Bacteria / Proteobacteria / Gammaproteobacteria': 1 194 'Bacteria / Proteobacteria / Gammaproteobacteria': 1
213 } 195 }
214 196
215 resolved_short, resolved_long = resolve_taxon_majority(test_high_confidence, 0.9) 197 resolved_short, resolved_long = resolve_tax_majority(test_high_confidence, 0.9)
216 assert 'Firmicutes' in resolved_short, "High confidence case not resolved correctly" 198 assert 'Firmicutes' in resolved_short, "High confidence case not resolved correctly"
217 199
218 print("✓ Test 4 PASSED: Taxonomic conflict resolution working correctly") 200 print("✓ Test 3 PASSED: Taxonomic conflict resolution working correctly")
219 201
220 def test_output_file_structures(self, processed_output): 202 def test_output_file_structures(self, processed_output):
221 """ 203 """
222 Test 5: Output File Structure Validation 204 Test 4: Output File Structure Validation
223 205
224 Verifies that all output files are created with correct structure and format. 206 Verifies that all output files are created with correct structure and format.
225 """ 207 """
226 # Test Excel file structure
227 excel_file = processed_output.header_anno 208 excel_file = processed_output.header_anno
228 assert os.path.exists(excel_file), "Excel output file not created" 209 assert os.path.exists(excel_file), "Excel output file not created"
229 210
230 # Check both sheets exist
231 xl_file = pd.ExcelFile(excel_file) 211 xl_file = pd.ExcelFile(excel_file)
232 expected_sheets = ['Individual_Reads', 'Merged_by_Taxa'] 212 expected_sheets = ['Individual_Reads', 'Merged_by_Taxa']
233 assert all(sheet in xl_file.sheet_names for sheet in expected_sheets), "Missing Excel sheets" 213 assert all(sheet in xl_file.sheet_names for sheet in expected_sheets), "Missing Excel sheets"
234 214
235 # Test Individual_Reads sheet structure
236 df_individual = pd.read_excel(excel_file, sheet_name='Individual_Reads') 215 df_individual = pd.read_excel(excel_file, sheet_name='Individual_Reads')
237 expected_cols = ['header', 'e_value', 'identity percentage', 'coverage', 216 expected_cols = ['header', 'seq_id', 'source', 'count', 'taxa', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
238 'bitscore', 'count', 'source', 'taxa']
239 assert all(col in df_individual.columns for col in expected_cols), "Missing columns in Individual_Reads" 217 assert all(col in df_individual.columns for col in expected_cols), "Missing columns in Individual_Reads"
240 218
241 # Test taxa output structure
242 with open(processed_output.taxa_output, 'r') as f: 219 with open(processed_output.taxa_output, 'r') as f:
243 taxa_lines = f.readlines() 220 taxa_lines = f.readlines()
244 221 assert len(taxa_lines) == 2, "Taxa output should contain exactly 2 lines"
245 # Should have header line and data lines
246 assert len(taxa_lines) > 2, "Taxa output too short"
247 assert 'percentage_rooted\tnumber_rooted' in taxa_lines[1], "Taxa output header incorrect" 222 assert 'percentage_rooted\tnumber_rooted' in taxa_lines[1], "Taxa output header incorrect"
248 223
249 # Test circle data JSON structure 224 with open(processed_output.anno_stats, 'r') as f:
225 anno_stats = f.readlines()
226 assert 'FASTA: headers kept after filters and min_support=1: 4\n' in anno_stats, "Filtered FASTA header count incorrect in anno_stats"
227 filter_f = 4
228
229
250 with open(processed_output.circle_data, 'r') as f: 230 with open(processed_output.circle_data, 'r') as f:
251 circle_data = json.load(f) 231 circle_data = json.load(f)
252 232
253 assert isinstance(circle_data, list), "Circle data should be a list" 233 assert isinstance(circle_data, list), "Circle data should be a list"
254 assert len(circle_data) == len(TAXONOMIC_LEVELS), "Circle data should have entry per taxonomic level" 234 assert len(circle_data) == len(TAXONOMIC_LEVELS), "Circle data should have entry per taxonomic level"
255 235
256 print("✓ Test 5 PASSED: All output files have correct structure") 236 with open(processed_output.filtered_fasta, 'r') as f:
257 237 filtered_fasta = f.readlines()
258 def test_evalue_filtering(self, test_data_dir): 238 assert len(filtered_fasta) == filter_f * 2
259 """ 239
260 Test 6: E-value Threshold Filtering 240 print("✓ Test 4 PASSED: All output files have correct structure")
261 241
262 Tests that hits above the e-value threshold are correctly filtered out. 242
263 """
264 input_dir = test_data_dir / "input"
265 output_dir = test_data_dir / "output"
266
267 # Create test file with mix of good and bad e-values
268 blast_content_mixed = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy
269 read1(100) subject1 95.0 100 50 75 1e-50 200 database1 Viridiplantae / Streptophyta / Magnoliopsida / Fagales / Juglandaceae / Uncertain taxa / Uncertain taxa
270 read1(100) subject2 90.0 95 45 70 1e-5 180 database1 Viridiplantae / Streptophyta / Magnoliopsida / Rosales / Rosaceae / Sorbus / Sorbus aucuparia
271 read2(50) subject3 85.0 90 40 65 1e-3 160 database2 Viridiplantae / Streptophyta / Magnoliopsida / Solanales / Solanaceae / Uncertain taxa / Uncertain taxa
272 """
273
274 fasta_content = """>read1(100) count=100;
275 ATCG
276 >read2(50) count=50;
277 GCTA
278 """
279
280 blast_file = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sorted_test.tabular"
281 fasta_file = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sorted_test.fasta"
282
283 with open(blast_file, 'w') as f:
284 f.write(blast_content_mixed)
285 with open(fasta_file, 'w') as f:
286 f.write(fasta_content)
287
288 # Process with strict e-value threshold
289 class Args:
290 def __init__(self):
291 self.input_anno = str(blast_file)
292 self.input_unanno = str(fasta_file)
293 self.header_anno = str(output_dir / "evalue_test.xlsx")
294 self.eval_plot = None
295 self.taxa_output = None
296 self.circle_data = None
297 self.anno_stats = None
298 self.uncertain_threshold = 0.9
299 self.eval_threshold = 1e-10 # Very strict threshold
300 self.use_counts = True
301
302 args = Args()
303 process_single_file(args.input_anno, args.input_unanno, args)
304
305 # Check that only the 1e-50 hit remains
306 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads')
307
308 # Should only have read1 (with 1e-50), read2 should be filtered out
309 assert len(df) == 1, f"Expected 1 read after filtering, got {len(df)}"
310 assert df.iloc[0]['e_value'] == pytest.approx(1e-50, rel=1e-8, abs=1e-12), "Wrong hit survived e-value filtering"
311
312 print("✓ Test 6 PASSED: E-value filtering working correctly")
313 243
314 def test_header_synchronization(self, test_data_dir): 244 def test_header_synchronization(self, test_data_dir):
315 """ 245 """
316 Test 7: Header Synchronization Between Files 246 Test 5: Header Synchronization Between Files
317 247
318 Tests that the processor correctly handles mismatched headers between 248 Tests that the processor correctly handles mismatched headers between
319 annotated and unannotated files. 249 annotated and unannotated files.
320 """ 250 """
321 input_dir = test_data_dir / "input" 251 input_dir = test_data_dir / "input"
323 253
324 # Create mismatched files 254 # Create mismatched files
325 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy 255 blast_content = """#Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy
326 read1(100) source=NCBI sequenceID=KR738003 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-30 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum 256 read1(100) source=NCBI sequenceID=KR738003 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-30 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum
327 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum 257 read1(100) source=NCBI sequenceID=KR738670 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 100.000 100 7.35e-14 54.7 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum
258 read2.1(50) 1 2 3 4 5 6 7 8 9
328 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum 259 read3(25) source=NCBI sequenceID=KR737595 superkingdom=Eukaryota kingdom=Viridiplantae phylum=Streptophyta subphylum=Streptophytina class=Magnoliopsida subclass=NA infraclass=NA order=Malvales suborder=NA infraorder=NA superfamily=NA family=Malvaceae genus=Hibiscus species=Hibiscus trionum markercode=trnL lat=0.304 lon=36.87 source=NCBI N/A 97.561 87 1.68e-14 71.3 Viridiplantae / Streptophyta / Magnoliopsida / Malvales / Malvaceae / Hibiscus / Hibiscus trionum
329 """ 260 """
330 261
331 fasta_content = """>read1(100) count=100; 262 fasta_content = """>read1(100) count=100;
332 ATCG 263 ATCG
350 self.input_unanno = fasta_file 281 self.input_unanno = fasta_file
351 self.header_anno = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sync_test.xlsx" 282 self.header_anno = "Stage_1_translated/NLOOR_scripts/process_annotations_tool/test-data/sync_test.xlsx"
352 self.eval_plot = None 283 self.eval_plot = None
353 self.taxa_output = None 284 self.taxa_output = None
354 self.circle_data = None 285 self.circle_data = None
286 self.filtered_fasta = str(output_dir / "filtered_fasta.fasta")
355 self.anno_stats = str(output_dir / "sync_stats.txt") 287 self.anno_stats = str(output_dir / "sync_stats.txt")
288 self.log = str(output_dir / "log.txt")
356 self.uncertain_threshold = 0.9 289 self.uncertain_threshold = 0.9
357 self.eval_threshold = 1e-10 290 self.eval_threshold = 1e-10
358 self.use_counts = True 291 self.use_counts = True
292 self.min_bitscore = 50
293 self.min_support = 1
294 self.ignore_rank = 'unknown'
295 self.ignore_taxonomy = 'environmental'
296 self.bitscore_perc_cutoff = 10
297 self.ignore_obiclean_type = 'singleton'
298 self.ignore_illuminapairend_type = 'pairend'
299 self.min_identity = 30
300 self.min_coverage = 30
301 self.ignore_seqids = ''
359 302
360 args = Args() 303 args = Args()
361 process_single_file(args.input_anno, args.input_unanno, args) 304 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[])
362 # Check that processing handled the mismatch correctly
363 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads') 305 df = pd.read_excel(args.header_anno, sheet_name='Individual_Reads')
364 extracted = df['header'].str.extract(r'(read\d+)') 306 extracted = df['header'].str.extract(r'(read\d+)')
365 # final list
366 headers = extracted[0].tolist() 307 headers = extracted[0].tolist()
367 # Should have read1 and read3, read2 should be skipped 308 # Should have read1 and read3, read2 should be skipped
368 assert 'read1' in headers, "read1 should be present" 309 assert 'read1' in headers, "read1 should be present"
310 assert 'read2' not in headers, "read2 should not be present"
311 assert 'read2.1' not in headers, "read2.1 should not be present"
369 assert 'read3' in headers, "read3 should be present" 312 assert 'read3' in headers, "read3 should be present"
370 313
371 print("✓ Test 7 PASSED: Header synchronization handled correctly") 314 print("✓ Test 5 PASSED: Header synchronization handled correctly")
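# --- Illustrative sketch (not part of the repository) ---
# The synchronization test relies on FASTA headers such as
# ">read1(100) count=100;" matching the BLAST "#Query ID" column ("read1(100)").
# A minimal header parser under that assumption (parse_fasta_header is a
# hypothetical name, not the tool's API; it reuses the module-level `re`):
def parse_fasta_header(header):
    qid_match = re.match(r">?\s*(\S+)", header)
    qid = qid_match.group(1) if qid_match else header.strip()
    count_match = re.search(r"count=(\d+)", header)
    count = int(count_match.group(1)) if count_match else 1
    return qid, count

# parse_fasta_header(">read1(100) count=100;") -> ("read1(100)", 100)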
315
316 def test_check_header_string_all_behaviors(self):
317 from Stage_1_translated.NLOOR_scripts.process_annotations_tool.blast_annotations_processor import \
318 check_header_string
319
320 # clean header — allowed
321 assert check_header_string(">readA count=10;", "", "") is True
322 # blocks singleton
323 assert check_header_string(">r obiclean_status={'XXX': 's'}", "singleton", "") is False
324 # blocks variant
325 assert check_header_string(">r obiclean_status={'XXX': 'i'}", "variant", "") is False
326 # blocks head
327 assert check_header_string(">r obiclean_status={'XXX': 'h'}", "head", "") is False
328 # blocks pairend
329 assert check_header_string(">r PairEnd", "pairend", "") is False
330 # blocks consensus
331 assert check_header_string(">r CONS", "consensus", "") is False
332 # blocks custom string
333 assert check_header_string(">r FooBar", "FooBar", "") is False
334 # blocks when string is in second param
335 assert check_header_string(">r blah", "", "blah") is False
336 # blocks when multiple ignore values contain it
337 assert check_header_string(">r PairEnd obiclean_status={'XXX': 's'}", "pairend,singleton", "") is False
338 # allows when no match
339 assert check_header_string(">r something", "pairend", "") is True
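# --- Illustrative sketch (not part of the repository) ---
# check_header_string is exercised above purely through its return value:
# True means the header is kept, False means an ignore token matched. A
# minimal filter reproducing exactly those behaviors (check_header_sketch is
# a hypothetical name; the tool's implementation may differ):
def check_header_sketch(header, ignore_types, ignore_strings):
    low = header.lower()
    obiclean_codes = {"singleton": "'s'", "variant": "'i'", "head": "'h'"}
    tokens = [t.strip().lower()
              for t in f"{ignore_types},{ignore_strings}".split(",") if t.strip()]
    for tok in tokens:
        if tok in obiclean_codes:
            status = re.search(r"obiclean_status=\{[^}]*\}", low)
            if status and obiclean_codes[tok] in status.group(0):
                return False
        elif tok == "consensus" and "cons" in low:
            return False
        elif tok in low:
            # Plain substring match covers "pairend" and custom strings.
            return False
    return True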
372 340
373 def test_excel_merged_vs_individual(self, processed_output): 341 def test_excel_merged_vs_individual(self, processed_output):
374 """ 342 """
375 Test 8: Excel Merged vs Individual Sheet Consistency 343 Test 6: Excel Merged vs Individual Sheet Consistency
376 344
377 Verifies that the merged sheet correctly aggregates data from the individual sheet. 345 Verifies that the merged sheet correctly aggregates data from the individual sheet.
378 """ 346 """
379 df_individual = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads') 347 df_individual = pd.read_excel(processed_output.header_anno, sheet_name='Individual_Reads')
380 df_merged = pd.read_excel(processed_output.header_anno, sheet_name='Merged_by_Taxa') 348 df_merged = pd.read_excel(processed_output.header_anno, sheet_name='Merged_by_Taxa')
381 349
382 # Count unique taxa in individual sheet
383 individual_taxa = df_individual['taxa'].nunique() 350 individual_taxa = df_individual['taxa'].nunique()
384 351
385 # Should match number of rows in merged sheet
386 assert len(df_merged) == individual_taxa, "Merged sheet doesn't match unique taxa count" 352 assert len(df_merged) == individual_taxa, "Merged sheet doesn't match unique taxa count"
387 353
388 # Check that counts are properly aggregated 354 # Check that counts are properly aggregated
389 # For taxa with multiple reads, counts should be summed 355 # For taxa with multiple reads, counts should be summed
390 for _, merged_row in df_merged.iterrows(): 356 for _, merged_row in df_merged.iterrows():
394 expected_count = individual_rows['count'].sum() 360 expected_count = individual_rows['count'].sum()
395 actual_count = merged_row['count'] 361 actual_count = merged_row['count']
396 362
397 assert actual_count == expected_count, f"Count mismatch for taxa {taxa}: expected {expected_count}, got {actual_count}" 363 assert actual_count == expected_count, f"Count mismatch for taxa {taxa}: expected {expected_count}, got {actual_count}"
398 364
399 print("✓ Test 8 PASSED: Excel merged sheet correctly aggregates individual data") 365 print("✓ Test 6 PASSED: Excel merged sheet correctly aggregates individual data")
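# --- Illustrative sketch (not part of the repository) ---
# The consistency check above amounts to one merged row per unique taxa with
# summed counts. Expressed with pandas (merge_by_taxa_sketch is hypothetical;
# the tool builds the Merged_by_Taxa sheet itself and may keep more columns):
def merge_by_taxa_sketch(individual_df):
    return individual_df.groupby("taxa", as_index=False)["count"].sum()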
400 366
401 def test_annotation_statistics_accuracy(self, processed_output, sample_files): 367 def test_annotation_statistics_accuracy(self, processed_output, sample_files):
402 """ 368 """
403 Test 9: Annotation Statistics Accuracy 369 Test 7: Annotation Statistics Accuracy
404 370
405 Verifies that calculated annotation statistics match the actual data. 371 Verifies that calculated annotation statistics match the actual data.
406 """ 372 Adapted for the new plain-text log file instead of tab-separated output.
407 # Read stats file 373 """
408 stats = {} 374 stats = {}
375
409 with open(processed_output.anno_stats, 'r') as f: 376 with open(processed_output.anno_stats, 'r') as f:
410 lines = f.readlines()[1:] # Skip header 377 lines = f.readlines()
411 for line in lines: 378
412 key, value = line.strip().split('\t') 379 for line in lines:
413 try: 380 line = line.strip()
414 stats[key] = float(value) 381 if not line or ":" not in line:
415 except ValueError: 382 continue
416 stats[key] = value 383
417 384 key, value = line.split(":", 1)
418 # Manual verification 385 key = key.strip()
419 assert stats['total_sequences'] == 4.0, "Total sequences count incorrect" 386 value = value.strip()
420 assert stats['annotated_sequences'] == 3.0, "Annotated sequences count incorrect" 387
421 assert stats['total_unique'] == 200, "Total unique count incorrect" 388 try:
422 assert stats['unique_annotated'] == 150, "Unique annotated count incorrect" 389 stats[key] = float(value)
423 assert stats['percentage_annotated'] == 75.0, "Percentage annotated incorrect" 390 except ValueError:
424 assert stats['percentage_unique_annotated'] == 75.0, "Percentage unique annotated incorrect" 391 stats[key] = value
425 392
426 print("✓ Test 9 PASSED: Annotation statistics are accurate") 393 assert stats["total_sequences"] == 4.0, "Total sequences count incorrect"
427 394 assert stats["annotated_sequences"] == 3.0, "Annotated sequence count incorrect"
395 assert stats["total_unique"] == 200.0, "Total unique count incorrect"
396 assert stats["unique_annotated"] == 150.0, "Unique annotated count incorrect"
397 assert stats["percentage_annotated"] == 75.0, "Percentage annotated incorrect"
398 assert stats["percentage_unique_annotated"] == 75.0, "Percentage unique annotated incorrect"
399
400 print("✓ Test 7 PASSED: Annotation statistics are accurate")
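# --- Illustrative sketch (not part of the repository) ---
# The parser above assumes the stats log is plain text with "key: value"
# lines. Based on the assertions in this test and in test_log_filters_count,
# an excerpt would look roughly like this (exact layout is an assumption):
#
#   === ANNOTATION STATISTICS ===
#   total_sequences: 4
#   annotated_sequences: 3
#   total_unique: 200
#   unique_annotated: 150
#   percentage_annotated: 75.0
#   percentage_unique_annotated: 75.0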
401
402 def test_combined_all_filters(self, test_data_dir):
403 """
404 Single integrated test that validates all FASTA + BLAST filter rules.
405 Every read is designed to fail exactly one filter, except the readOK reads.
406 """
407 input_dir = test_data_dir / "input"
408 output_dir = test_data_dir / "output"
409
410 fasta = input_dir / "combined_filters.fasta"
411 blast = input_dir / "combined_filters.tabular"
412
413 fasta.write_text(
414 ">lowSupport(1) count=1;\nACGT\n"
415 ">obicleanFail(10) count=10; obiclean_status={'XXX': 's'};\nACGT\n"
416 ">pairendFail_PairEnd(10) count=10;\nACGT\n"
417 ">identityFail(10) count=10;\nACGT\n"
418 ">coverageFail(10) count=10;\nACGT\n"
419 ">bitscoreFail(10) count=10;\nACGT\n"
420 ">bscutoffHigh(10) count=10;\nACGT\n"
421 ">envTaxFail(10) count=10;\nACGT\n"
422 ">rankFail(10) count=10;\nACGT\n"
423 ">seqidFail(10) count=10;\nACGT\n"
424 ">readOK(10) count=10;\nACGT\n"
425 ">readOK_multiple_id(10) count=10;\nACGT\n"
426 )
427
428 blast.write_text(
429 # min_support (count=1 < 5)
430 "lowSupport(1)\ts\tid1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
431
432 # ignore_obiclean_type = singleton
433 "obicleanFail(10)\ts\tid2\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
434
435 # ignore_illuminapairedend_type = pairend
436 "pairendFail_PairEnd(10)\ts\tid3\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
437
438 # min_identity = 90 → identity = 50 fails
439 "identityFail(10)\ts\tid4\t123\t50\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
440
441 # min_coverage = 50 → coverage = 20 fails
442 "coverageFail(10)\ts\tid5\t123\t99\t20\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
443
444 # min_bitscore = 60 → bitscore = 10 fails
445 "bitscoreFail(10)\ts\tid6\t123\t99\t99\t1e-50\t10\tsrc\tA / B / C / D / E / F / G\n"
446
447 # bitscore_perc_cutoff: best = 200 → cutoff = 180 → bitscore 150 fails
448 "bscutoffHigh(10)\ts\tid7.1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G.1\n"
449 "bscutoffHigh(10)\ts\tid7.2\t123\t99\t99\t1e-50\t150\tsrc\tA / B / C / D / E / F / G.2\n"
450
451 # ignore_taxonomy = 'environmental'
452 "envTaxFail(10)\ts\tid8\t123\t99\t99\t1e-50\t200\tsrc\tEnvironmental / B / C / D / E / F / G\n"
453
454 # ignore_rank = 'unknown'
455 "rankFail(10)\ts\tid9\t123\t99\t99\t1e-50\t200\tsrc\tUnknown / B / C / D / E / F / G\n"
456
457 # ignore_seqids = BADSEQ
458 "seqidFail(10)\ts\tBADSEQ\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
459
460 # readOK (valid, full taxonomy)
461 "readOK(10)\ts\tidGood\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
462
463 # readOK_multiple_id (valid, full taxonomy, multiple id's)
464 "readOK_multiple_id(10)\ts\tidGood.1\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
465 "readOK_multiple_id(10)\ts\tidGood.2\t123\t99\t99\t1e-50\t200\tsrc\tA / B / C / D / E / F / G\n"
466 )
467
468 class Args:
469 def __init__(self):
470 self.input_anno = str(blast)
471 self.input_unanno = str(fasta)
472 self.header_anno = str(output_dir / "combined.xlsx")
473 self.filtered_fasta = str(output_dir / "combined.fasta")
474 self.anno_stats = str(output_dir / "combined_stats.txt")
475 self.eval_plot = None
476 self.taxa_output = None
477 self.circle_data = None
478 self.log = str(output_dir / "log.txt")
479 self.uncertain_threshold = 0.9
480 self.eval_threshold = 1e-10
481 self.use_counts = True
482 self.min_bitscore = 60
483 self.min_support = 5
484 self.ignore_rank = 'unknown'
485 self.ignore_taxonomy = 'environmental'
486 self.bitscore_perc_cutoff = 10
487 self.ignore_obiclean_type = 'singleton'
488 self.ignore_illuminapairend_type = 'pairend'
489 self.min_identity = 90
490 self.min_coverage = 50
491 self.ignore_seqids = 'BADSEQ'
492
493 args = Args()
494 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[])
495
496 with open(args.filtered_fasta) as f:
497 headers = [l.strip() for l in f if l.startswith(">")]
498 assert '>obicleanFail(10) count=10;' not in headers
499 assert '>pairendFail_PairEnd(10) count=10;' not in headers
500 assert len(headers) == 9, "FASTA filtering only applies to header-based rules"
501
502 df = pd.read_excel(args.header_anno, sheet_name="Individual_Reads")
503 seq_ids = {
504 sid
505 for val in df["seq_id"]
506 for sid in (ast.literal_eval(val) if isinstance(val, str) else val)
507 }
508
509 expected = {'idGood.1', 'idGood', 'id7.1', 'idGood.2'}
510
511 assert seq_ids == expected, f"Expected surviving seq_ids {expected}, got {seq_ids}"
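# --- Illustrative sketch (not part of the repository) ---
# The bscutoffHigh case above encodes the bitscore_perc_cutoff rule: with a
# 10% cutoff, hits for a query scoring below best_bitscore * (1 - 10/100) are
# dropped (best 200 -> cutoff 180, so the 150-bit hit fails). A hypothetical
# per-query filter consistent with that expectation:
def apply_bitscore_cutoff_sketch(hits, perc_cutoff):
    # hits: list of (seq_id, bitscore) tuples for a single query.
    if not hits:
        return []
    best = max(score for _, score in hits)
    cutoff = best * (1 - perc_cutoff / 100.0)
    return [(sid, score) for sid, score in hits if score >= cutoff]

# apply_bitscore_cutoff_sketch([("id7.1", 200), ("id7.2", 150)], 10) -> [("id7.1", 200)]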
512
513
514 def test_log_filters_count(self, processed_output):
515 """
516 Verify that the BLAST filter counters in the log file match expected structure.
517 """
518 with open(processed_output.anno_stats) as f:
519 log = f.read()
520
521 assert "=== PARAMETERS USED ===" in log
522 assert "input_anno:" in log
523 assert "input_unanno:" in log
524
525 assert "FASTA: total headers: 4" in log
526 assert "FASTA: headers kept after filters" in log
527
528 assert "BLAST: total hits read: 8" in log
529 assert "BLAST: hits kept after quality filters: 7" in log
530
531 assert "ANNOTATION: total FASTA headers considered: 4" in log
532 assert "ANNOTATION: reads with BLAST hits: 3" in log
533 assert "ANNOTATION: reads without BLAST hits: 1" in log
534
535 assert "E-value plot written to:" in log
536 assert "Taxa summary written to:" in log
537 assert "Header annotations written to:" in log
538 assert "Circle diagram JSON written to:" in log
539
540 assert "=== ANNOTATION STATISTICS ===" in log
541 assert "percentage_annotated: 75.0" in log
542 assert "unique_annotated: 150" in log
543 assert "total_unique: 200" in log
544
545 def test_missing_blast_file_graceful(self, test_data_dir):
546 """
547 Crash / robustness test.
548
549 When the BLAST file does NOT exist, the processor should:
550 - not crash
551 - write an anno_stats log mentioning the error
552 - return without creating header_anno
553 """
554 input_dir = test_data_dir / "input"
555 output_dir = test_data_dir / "output"
556
557 fasta = input_dir / "missing_blast_test.fasta"
558 fasta.write_text(">read1(10) count=10;\nACGT\n")
559
560 missing_blast = input_dir / "this_file_does_not_exist.tabular"
561
562 class Args:
563 def __init__(self):
564 self.input_anno = str(missing_blast)
565 self.input_unanno = str(fasta)
566 self.header_anno = str(output_dir / "missing_blast_header.xlsx")
567 self.filtered_fasta = str(output_dir / "missing_blast_filtered.fasta")
568 self.anno_stats = str(output_dir / "missing_blast_stats.txt")
569 self.eval_plot = None
570 self.taxa_output = None
571 self.circle_data = None
572 self.log = str(output_dir / "log.txt")
573 self.uncertain_threshold = 0.9
574 self.eval_threshold = 1e-10
575 self.use_counts = True
576 self.min_bitscore = 0
577 self.min_support = 1
578 self.ignore_rank = 'unknown'
579 self.ignore_taxonomy = 'environmental'
580 self.bitscore_perc_cutoff = 10
581 self.ignore_obiclean_type = 'singleton'
582 self.ignore_illuminapairend_type = 'pairend'
583 self.min_identity = 0
584 self.min_coverage = 0
585 self.ignore_seqids = ''
586
587 args = Args()
588
589 process_single_file(args.input_anno, args.input_unanno, args, log_messages=[])
590
591 assert not os.path.exists(args.header_anno), "Header file should not be created when BLAST is missing"
592
593 assert os.path.exists(args.anno_stats), "anno_stats log should be created on error"
594
595 from pathlib import Path
596 log_text = Path(args.anno_stats).read_text()
597 assert "Error: Input file" in log_text, "Missing BLAST file error not logged"
598 assert "Starting processing for FASTA" in log_text, "FASTA processing log missing"
428 599
429 600
430 if __name__ == "__main__": 601 if __name__ == "__main__":
431 # Run all tests in this file
432 pytest.main([__file__]) 602 pytest.main([__file__])