comparison COBRAxy/test_gpr_translation_comprehensive.py @ 493:a92d21f92956 draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Tue, 30 Sep 2025 15:00:21 +0000 |
| parents | |
| children | |
comparison of revisions 492:4ed95023af20 and 493:a92d21f92956
```python
#!/usr/bin/env python3
"""
Comprehensive test suite for GPR translation functionality in COBRAxy.

This test suite covers:
- Basic 1:1, 1:many, many:1 gene mappings
- Complex GPR expressions with AND/OR logic
- Translation issues tracking
- OR-only GPR flattening functionality
- Edge cases and nested expressions
- Statistical reporting
"""

import cobra
import pandas as pd
import sys
import os
import logging
from typing import Dict, List, Tuple
import re

# Add the COBRAxy root directory (the directory containing this test) to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from utils import model_utils

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

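# ---------------------------------------------------------------------------
# Mapping terminology used throughout this suite (illustrative, based on the
# mapping table built in create_comprehensive_mapping below):
#   1:1    one source gene maps to exactly one target,
#          e.g. GENE1 -> TARGET1
#   1:many one source gene maps to several targets,
#          e.g. GENE2 -> TARGET2A, TARGET2B (typically rendered as an OR)
#   many:1 several source genes share a single target,
#          e.g. GENE4 -> SHARED_TARGET1 and GENE5 -> SHARED_TARGET1, so an
#          OR-only GPR such as 'GENE4 or GENE5' can be flattened after
#          translation to just 'SHARED_TARGET1'
# ---------------------------------------------------------------------------
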
class GPRTranslationTester:
    """Comprehensive GPR translation test suite"""

    def __init__(self):
        self.test_results = {}
        self.failed_tests = []

    def create_comprehensive_test_model(self) -> cobra.Model:
        """Create a comprehensive test model with diverse GPR patterns"""
        model = cobra.Model('comprehensive_test_model')

        # Create metabolites
        metabolites = []
        for i in range(30):
            met = cobra.Metabolite(f'met_{chr(65+i%26)}{i//26}', compartment='c')
            metabolites.append(met)

        reactions_data = [
            # === BASIC CASES ===
            ('BASIC_1to1', 'GENE1', 0, 1),  # Simple 1:1 mapping
            ('BASIC_1tomany', 'GENE2', 1, 2),  # 1:many mapping
            ('BASIC_manyto1', 'GENE3', 2, 3),  # many:1 mapping
            ('BASIC_unmapped', 'UNMAPPED_GENE', 3, 4),  # unmapped gene

            # === SIMPLE OR CASES (candidates for flattening) ===
            ('OR_simple', 'GENE4 or GENE5', 4, 5),  # Simple OR with many:1
            ('OR_three', 'GENE6 or GENE7 or GENE8', 5, 6),  # Three genes OR
            ('OR_parentheses', '(GENE9 or GENE10)', 6, 7),  # OR with parentheses
            ('OR_duplicates', 'GENE11 or GENE12 or GENE11', 7, 8),  # OR with duplicates after translation

            # === COMPLEX OR CASES (candidates for flattening) ===
            ('OR_nested_simple', '(GENE13 or GENE14) or (GENE15 or GENE16)', 8, 9),  # Nested OR only
            ('OR_many_parentheses', '((GENE17 or GENE18) or GENE19) or GENE20', 9, 10),  # Multiple nesting levels
            ('OR_mixed_mapping', 'GENE21 or GENE22 or GENE23', 10, 11),  # Mixed 1:1, 1:many, many:1

            # === AND CASES (should NOT be flattened) ===
            ('AND_simple', 'GENE24 and GENE25', 11, 12),  # Simple AND
            ('AND_complex', '(GENE26 and GENE27) and GENE28', 12, 13),  # Complex AND

            # === MIXED AND/OR (should NOT be flattened) ===
            ('MIXED_basic', 'GENE29 and (GENE30 or GENE31)', 13, 14),  # AND with OR
            ('MIXED_complex', '(GENE32 or GENE33) and (GENE34 or GENE35)', 14, 15),  # OR and AND
            ('MIXED_nested', '((GENE36 and GENE37) or GENE38) and GENE39', 15, 16),  # Complex nesting

            # === EDGE CASES ===
            ('EDGE_single', 'GENE40', 16, 17),  # Single gene
            ('EDGE_empty', '', 17, 18),  # Empty GPR
            ('EDGE_whitespace', ' GENE41 or GENE42 ', 18, 19),  # Whitespace
            ('EDGE_case_sensitive', 'Gene43 OR gene44', 19, 20),  # Case variations

            # === STRESS TESTS ===
            ('STRESS_long_or', 'GENE45 or GENE46 or GENE47 or GENE48 or GENE49 or GENE50', 20, 21),  # Long OR chain
            ('STRESS_deep_nest', '(((GENE51 or GENE52) or GENE53) or GENE54)', 21, 22),  # Deep nesting
            ('STRESS_complex', '(GENE55 or (GENE56 or GENE57)) or ((GENE58 or GENE59) or GENE60)', 22, 23),  # Complex structure

            # === TRANSLATION ISSUE TRIGGERS ===
            ('ISSUE_1many_or', 'GENE61 or GENE62', 23, 24),  # 1:many in OR (should be flattened)
            ('ISSUE_manyto1_and', 'GENE63 and GENE64', 24, 25),  # many:1 in AND (should NOT be flattened)
            ('ISSUE_mixed_problems', '(GENE65 or GENE66) and GENE67', 25, 26),  # Mixed problems

            # === REAL-WORLD INSPIRED CASES ===
            ('REAL_metabolism', '(ENSG001 or ENSG002) or (ENSG003 or ENSG004)', 26, 27),  # Metabolic pathway
            ('REAL_transport', 'TRANSPORTER1 and (COFACTOR1 or COFACTOR2)', 27, 28),  # Transport reaction
            ('REAL_complex_enzyme', '((SUBUNIT1 and SUBUNIT2) or SUBUNIT3) and COFACTOR3', 28, 29),  # Complex enzyme
        ]

        # Create reactions
        for rxn_id, gpr, met_in, met_out in reactions_data:
            rxn = cobra.Reaction(rxn_id)
            if met_in < len(metabolites) and met_out < len(metabolites):
                rxn.add_metabolites({metabolites[met_in]: -1, metabolites[met_out]: 1})
            rxn.gene_reaction_rule = gpr
            model.add_reactions([rxn])

        return model

    def create_comprehensive_mapping(self) -> pd.DataFrame:
        """Create a comprehensive gene mapping covering all test scenarios"""
        mapping_data = {
            'hgnc_symbol': [],
            'ensg': []
        }

        # === BASIC MAPPINGS ===
        # 1:1 mappings
        one_to_one = [
            ('GENE1', 'TARGET1'),
            ('GENE24', 'TARGET24'),
            ('GENE25', 'TARGET25'),
            ('GENE26', 'TARGET26'),
            ('GENE27', 'TARGET27'),
            ('GENE28', 'TARGET28'),
            ('GENE29', 'TARGET29'),
            ('GENE40', 'TARGET40'),
            ('GENE41', 'TARGET41'),
            ('GENE42', 'TARGET42'),
        ]

        # 1:many mappings (one source gene maps to multiple targets)
        one_to_many = [
            ('GENE2', 'TARGET2A'),
            ('GENE2', 'TARGET2B'),
            ('GENE30', 'TARGET30A'),
            ('GENE30', 'TARGET30B'),
            ('GENE61', 'TARGET61A'),
            ('GENE61', 'TARGET61B'),
            ('GENE61', 'TARGET61C'),  # Maps to 3 targets
            ('GENE65', 'TARGET65A'),
            ('GENE65', 'TARGET65B'),
        ]

        # many:1 mappings (multiple source genes map to one target)
        many_to_one = [
            ('GENE3', 'SHARED_TARGET1'),
            ('GENE4', 'SHARED_TARGET1'),
            ('GENE5', 'SHARED_TARGET1'),
            ('GENE6', 'SHARED_TARGET2'),
            ('GENE7', 'SHARED_TARGET2'),
            ('GENE8', 'SHARED_TARGET2'),
            ('GENE9', 'SHARED_TARGET3'),
            ('GENE10', 'SHARED_TARGET3'),
            ('GENE11', 'SHARED_TARGET4'),
            ('GENE12', 'SHARED_TARGET4'),
            ('GENE13', 'SHARED_TARGET5'),
            ('GENE14', 'SHARED_TARGET5'),
            ('GENE15', 'SHARED_TARGET5'),
            ('GENE16', 'SHARED_TARGET5'),
            ('GENE17', 'SHARED_TARGET6'),
            ('GENE18', 'SHARED_TARGET6'),
            ('GENE19', 'SHARED_TARGET6'),
            ('GENE20', 'SHARED_TARGET6'),
            ('GENE45', 'SHARED_TARGET7'),
            ('GENE46', 'SHARED_TARGET7'),
            ('GENE47', 'SHARED_TARGET7'),
            ('GENE48', 'SHARED_TARGET7'),
            ('GENE49', 'SHARED_TARGET7'),
            ('GENE50', 'SHARED_TARGET7'),
            ('GENE51', 'SHARED_TARGET8'),
            ('GENE52', 'SHARED_TARGET8'),
            ('GENE53', 'SHARED_TARGET8'),
            ('GENE54', 'SHARED_TARGET8'),
            ('GENE55', 'SHARED_TARGET9'),
            ('GENE56', 'SHARED_TARGET9'),
            ('GENE57', 'SHARED_TARGET9'),
            ('GENE58', 'SHARED_TARGET9'),
            ('GENE59', 'SHARED_TARGET9'),
            ('GENE60', 'SHARED_TARGET9'),
            ('GENE63', 'SHARED_TARGET10'),
            ('GENE64', 'SHARED_TARGET10'),
            ('GENE66', 'SHARED_TARGET11'),
        ]

        # Mixed mappings for complex cases
        mixed_mappings = [
            ('GENE21', 'TARGET21'),  # 1:1
            ('GENE22', 'TARGET22A'),  # 1:many
            ('GENE22', 'TARGET22B'),
            ('GENE23', 'SHARED_TARGET1'),  # many:1 (shares with GENE3-5)
            ('GENE31', 'SHARED_TARGET12'),
            ('GENE32', 'SHARED_TARGET13'),
            ('GENE33', 'SHARED_TARGET13'),
            ('GENE34', 'TARGET34'),
            ('GENE35', 'TARGET35'),
            ('GENE36', 'TARGET36'),
            ('GENE37', 'TARGET37'),
            ('GENE38', 'TARGET38'),
            ('GENE39', 'TARGET39'),
            ('GENE62', 'TARGET62A'),
            ('GENE62', 'TARGET62B'),
            ('GENE67', 'TARGET67'),
        ]

        # Case sensitivity tests
        case_mappings = [
            ('Gene43', 'TARGET43'),
            ('gene44', 'TARGET44'),
        ]

        # Real-world inspired mappings
        real_mappings = [
            ('ENSG001', 'HUMAN_GENE1'),
            ('ENSG002', 'HUMAN_GENE2'),
            ('ENSG003', 'HUMAN_GENE1'),  # many:1
            ('ENSG004', 'HUMAN_GENE2'),  # many:1
            ('TRANSPORTER1', 'SLC_FAMILY1'),
            ('COFACTOR1', 'COFACTOR_A'),
            ('COFACTOR2', 'COFACTOR_A'),  # many:1
            ('COFACTOR3', 'COFACTOR_B'),
            ('SUBUNIT1', 'COMPLEX_SUBUNIT1'),
            ('SUBUNIT2', 'COMPLEX_SUBUNIT2'),
            ('SUBUNIT3', 'COMPLEX_ALTERNATIVE'),
        ]

        # Combine all mappings
        all_mappings = one_to_one + one_to_many + many_to_one + mixed_mappings + case_mappings + real_mappings

        for source, target in all_mappings:
            mapping_data['hgnc_symbol'].append(source)
            mapping_data['ensg'].append(target)

        return pd.DataFrame(mapping_data)

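    # The DataFrame returned above has two columns; a few example rows:
    #
    #     hgnc_symbol    ensg
    #     GENE1          TARGET1           (1:1)
    #     GENE2          TARGET2A          (1:many, two rows for GENE2)
    #     GENE2          TARGET2B
    #     GENE4          SHARED_TARGET1    (many:1, shared with GENE3/GENE5)
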
    def analyze_mapping_statistics(self, mapping_df: pd.DataFrame) -> Dict:
        """Analyze mapping statistics"""
        stats = {}

        source_counts = mapping_df.groupby('hgnc_symbol')['ensg'].count()
        target_counts = mapping_df.groupby('ensg')['hgnc_symbol'].count()

        stats['total_mappings'] = len(mapping_df)
        stats['unique_sources'] = len(source_counts)
        stats['unique_targets'] = len(target_counts)

        stats['one_to_one'] = (source_counts == 1).sum()
        stats['one_to_many'] = (source_counts > 1).sum()
        stats['many_to_one_targets'] = (target_counts > 1).sum()

        stats['one_to_many_details'] = {}
        for gene, count in source_counts[source_counts > 1].items():
            targets = mapping_df[mapping_df['hgnc_symbol'] == gene]['ensg'].tolist()
            stats['one_to_many_details'][gene] = targets

        stats['many_to_one_details'] = {}
        for target, count in target_counts[target_counts > 1].items():
            sources = mapping_df[mapping_df['ensg'] == target]['hgnc_symbol'].tolist()
            stats['many_to_one_details'][target] = sources

        return stats

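    # Shape of the statistics returned above (values depend on the mapping):
    #   {
    #       'total_mappings': ..., 'unique_sources': ..., 'unique_targets': ...,
    #       'one_to_one': ..., 'one_to_many': ..., 'many_to_one_targets': ...,
    #       'one_to_many_details': {'GENE2': ['TARGET2A', 'TARGET2B'], ...},
    #       'many_to_one_details': {'SHARED_TARGET1': ['GENE3', 'GENE4', ...], ...},
    #   }
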
    def predict_translation_issues(self, model: cobra.Model, mapping_df: pd.DataFrame) -> Dict:
        """Predict which reactions will have translation issues"""
        predictions = {}
        mapping_dict = {}

        # Build mapping dictionary
        for _, row in mapping_df.iterrows():
            source = row['hgnc_symbol']
            target = row['ensg']
            if source not in mapping_dict:
                mapping_dict[source] = []
            mapping_dict[source].append(target)

        for rxn in model.reactions:
            if not rxn.gene_reaction_rule or rxn.gene_reaction_rule.strip() == '':
                continue

            # Extract genes from GPR
            token_pattern = r'\b[A-Za-z0-9:_.-]+\b'
            tokens = re.findall(token_pattern, rxn.gene_reaction_rule)
            logical_operators = {'and', 'or', 'AND', 'OR', '(', ')'}
            genes = [t for t in tokens if t not in logical_operators]

            issues = []
            has_1_to_many = False
            has_many_to_1 = False
            has_unmapped = False

            for gene in set(genes):
                norm_gene = model_utils._normalize_gene_id(gene)
                if norm_gene in mapping_dict:
                    targets = mapping_dict[norm_gene]
                    if len(targets) > 1:
                        has_1_to_many = True
                        issues.append(f"1:many - {gene} -> {targets}")
                else:
                    has_unmapped = True
                    issues.append(f"unmapped - {gene}")

            # Check for many:1 mappings
            target_to_sources = {}
            for gene in set(genes):
                norm_gene = model_utils._normalize_gene_id(gene)
                if norm_gene in mapping_dict:
                    for target in mapping_dict[norm_gene]:
                        if target not in target_to_sources:
                            target_to_sources[target] = []
                        target_to_sources[target].append(gene)

            for target, sources in target_to_sources.items():
                if len(sources) > 1:
                    has_many_to_1 = True
                    issues.append(f"many:1 - {sources} -> {target}")

            if issues:
                predictions[rxn.id] = {
                    'issues': issues,
                    'has_1_to_many': has_1_to_many,
                    'has_many_to_1': has_many_to_1,
                    'has_unmapped': has_unmapped,
                    'is_or_only': self._check_if_or_only(rxn.gene_reaction_rule),
                    # Flattening is only expected for OR-only GPRs that contain a
                    # 1:many or many:1 mapping; the parentheses avoid the
                    # and/or operator-precedence trap.
                    'predicted_flattening': (has_1_to_many or has_many_to_1)
                                            and self._check_if_or_only(rxn.gene_reaction_rule)
                }

        return predictions

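    # Example of one predictions entry: for 'OR_simple' ('GENE4 or GENE5',
    # where both genes map to SHARED_TARGET1) the expected shape is roughly
    # (gene order inside the issue string may vary):
    #   {'issues': ["many:1 - ['GENE4', 'GENE5'] -> SHARED_TARGET1"],
    #    'has_1_to_many': False, 'has_many_to_1': True, 'has_unmapped': False,
    #    'is_or_only': True, 'predicted_flattening': True}
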
    def _check_if_or_only(self, gpr: str) -> bool:
        """Check if the GPR contains only OR operators (and parentheses)"""
        if not gpr or gpr.strip() == '':
            return False

        # A GPR is OR-only when it contains no AND operator. Matching the
        # operator directly (word boundaries, case-insensitive) avoids the
        # pitfall of first stripping gene-name tokens, which would also strip
        # the 'and'/'or' words themselves.
        return not bool(re.search(r'\band\b', gpr, flags=re.IGNORECASE))

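    # Examples:
    #   _check_if_or_only("GENE4 or GENE5")     -> True
    #   _check_if_or_only("(GENE9 or GENE10)")  -> True
    #   _check_if_or_only("GENE24 and GENE25")  -> False
    #   _check_if_or_only("")                   -> False
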
    def run_comprehensive_test(self) -> Dict:
        """Run the comprehensive translation test"""
        print("="*80)
        print("COMPREHENSIVE GPR TRANSLATION TEST SUITE")
        print("="*80)

        # Create test model and mapping
        print("\n1. Creating test model and mapping...")
        model = self.create_comprehensive_test_model()
        mapping_df = self.create_comprehensive_mapping()

        print(f" ✓ Created model with {len(model.reactions)} reactions")
        print(f" ✓ Created mapping with {len(mapping_df)} entries")

        # Analyze mapping statistics
        print("\n2. Analyzing mapping statistics...")
        mapping_stats = self.analyze_mapping_statistics(mapping_df)
        print(f" ✓ Unique source genes: {mapping_stats['unique_sources']}")
        print(f" ✓ Unique target genes: {mapping_stats['unique_targets']}")
        print(f" ✓ 1:1 mappings: {mapping_stats['one_to_one']}")
        print(f" ✓ 1:many mappings: {mapping_stats['one_to_many']}")
        print(f" ✓ Many:1 target genes: {mapping_stats['many_to_one_targets']}")

        # Predict translation issues
        print("\n3. Predicting translation issues...")
        predicted_issues = self.predict_translation_issues(model, mapping_df)
        predicted_or_only = sum(1 for pred in predicted_issues.values() if pred['is_or_only'])
        predicted_flattening = sum(1 for pred in predicted_issues.values() if pred['predicted_flattening'])

        print(f" ✓ Reactions with predicted issues: {len(predicted_issues)}")
        print(f" ✓ OR-only reactions: {predicted_or_only}")
        print(f" ✓ Predicted for flattening: {predicted_flattening}")

        # Display original GPRs
        print("\n4. Original model GPRs:")
        for rxn in sorted(model.reactions, key=lambda x: x.id):
            status = "🔍" if rxn.id in predicted_issues else "✓"
            or_only = "🔗" if predicted_issues.get(rxn.id, {}).get('is_or_only', False) else " "
            print(f" {status}{or_only} {rxn.id:20} : {rxn.gene_reaction_rule}")

        # Run translation
        print("\n5. Running translation...")
        try:
            translated_model, translation_issues = model_utils.translate_model_genes(
                model=model,
                mapping_df=mapping_df,
                target_nomenclature='ensg',
                source_nomenclature='hgnc_symbol',
                allow_many_to_one=True
            )
            print(" ✓ Translation completed successfully")
        except Exception as e:
            print(f" ❌ Translation failed: {e}")
            import traceback
            traceback.print_exc()
            return {'success': False, 'error': str(e)}

        # Display translated GPRs
        print("\n6. Translated model GPRs:")
        for rxn in sorted(translated_model.reactions, key=lambda x: x.id):
            has_issues = "🚨" if rxn.id in translation_issues else "✓"
            print(f" {has_issues} {rxn.id:20} : {rxn.gene_reaction_rule}")

        # Analyze translation issues
        print("\n7. Translation issues analysis:")
        if translation_issues:
            for rxn_id, issues_str in sorted(translation_issues.items()):
                predicted = predicted_issues.get(rxn_id, {})
                prediction_status = "✓ PREDICTED" if rxn_id in predicted_issues else "❓ UNEXPECTED"
                print(f" 🚨 {rxn_id:20} ({prediction_status})")
                # Split issues string by semicolon separator
                if issues_str:
                    issues_list = [issue.strip() for issue in issues_str.split(';') if issue.strip()]
                    for issue in issues_list:
                        print(f" - {issue}")
                else:
                    print(" - No specific issues reported")
        else:
            print(" ✅ No translation issues detected")

        # Compare predictions vs actual
        print("\n8. Prediction accuracy:")
        true_positive = set(predicted_issues.keys()) & set(translation_issues.keys())
        false_positive = set(predicted_issues.keys()) - set(translation_issues.keys())
        false_negative = set(translation_issues.keys()) - set(predicted_issues.keys())

        print(f" ✓ Correctly predicted issues: {len(true_positive)}")
        print(f" ⚠ False positives: {len(false_positive)}")
        print(f" ❌ False negatives: {len(false_negative)}")

        if false_positive:
            print(" False positive reactions:")
            for rxn_id in false_positive:
                print(f" - {rxn_id}")

        if false_negative:
            print(" False negative reactions:")
            for rxn_id in false_negative:
                print(f" - {rxn_id}")

        # Test specific functionality
        print("\n9. Testing OR-only GPR flattening...")
        flattening_tests = self.test_or_only_flattening(translated_model, translation_issues)

        # Summary statistics
        print("\n10. Summary:")
        results = {
            'success': True,
            'model_reactions': len(model.reactions),
            'mapping_entries': len(mapping_df),
            'predicted_issues': len(predicted_issues),
            'actual_issues': len(translation_issues),
            'prediction_accuracy': {
                'true_positive': len(true_positive),
                'false_positive': len(false_positive),
                'false_negative': len(false_negative),
                'precision': len(true_positive) / len(predicted_issues) if predicted_issues else 0,
                'recall': len(true_positive) / len(translation_issues) if translation_issues else 0,
            },
            'mapping_stats': mapping_stats,
            'flattening_tests': flattening_tests,
            'models': {
                'original': model,
                'translated': translated_model
            },
            'issues': {
                'predicted': predicted_issues,
                'actual': translation_issues
            }
        }

        precision = results['prediction_accuracy']['precision']
        recall = results['prediction_accuracy']['recall']
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        print(f" 📊 Total reactions: {len(model.reactions)}")
        print(f" 📊 Reactions with issues: {len(translation_issues)}")
        print(f" 📊 Prediction precision: {precision:.2%}")
        print(f" 📊 Prediction recall: {recall:.2%}")
        print(f" 📊 Prediction F1-score: {f1:.2%}")
        print(f" 📊 OR-only flattening tests: {flattening_tests['passed']}/{flattening_tests['total']}")

        print("\n" + "="*80)
        print("TEST SUITE COMPLETED")
        print("="*80)

        return results

    def test_or_only_flattening(self, model: cobra.Model, translation_issues: Dict) -> Dict:
        """Test the OR-only GPR flattening functionality"""
        test_cases = [
            # (original_gpr, expected_after_flattening, should_be_flattened)
            ("SHARED_TARGET1 or SHARED_TARGET1", "SHARED_TARGET1", True),
            ("(SHARED_TARGET2 or SHARED_TARGET2) or SHARED_TARGET2", "SHARED_TARGET2", True),
            ("TARGET1 or TARGET2 or TARGET1", "TARGET1 or TARGET2", True),
            ("(TARGET1 or TARGET2) and TARGET3", "(TARGET1 or TARGET2) and TARGET3", False),  # Contains AND
            ("TARGET1 and TARGET1", "TARGET1", True),  # Should simplify AND duplicates too
        ]

        results = {'total': 0, 'passed': 0, 'failed': [], 'details': []}

        print(" Testing OR-only flattening functionality:")

        # Test the helper functions directly
        for original, expected, should_flatten in test_cases:
            results['total'] += 1

            # Test _is_or_only_expression
            is_or_only = model_utils._is_or_only_expression(original)

            # Test _flatten_or_only_gpr if it should be OR-only
            if should_flatten and 'and' not in original.lower():
                flattened = model_utils._flatten_or_only_gpr(original)
                passed = flattened == expected
            else:
                # AND-containing GPRs are not flattened; only check that the
                # OR-only detection agrees with the presence of an 'and'.
                passed = (not should_flatten) or (is_or_only == ('and' not in original.lower()))
                flattened = original

            status = "✓" if passed else "❌"
            results['details'].append({
                'original': original,
                'expected': expected,
                'flattened': flattened,
                'is_or_only': is_or_only,
                'should_flatten': should_flatten,
                'passed': passed
            })

            if passed:
                results['passed'] += 1
            else:
                results['failed'].append(f"{original} -> {flattened} (expected: {expected})")

            print(f" {status} '{original}' -> '{flattened}' (OR-only: {is_or_only})")

        # Test actual model reactions that should have been flattened
        for rxn in model.reactions:
            if rxn.id in translation_issues:
                original_gpr = rxn.gene_reaction_rule
                is_or_only = model_utils._is_or_only_expression(original_gpr)
                if is_or_only:
                    print(f" 🔍 Real case: {rxn.id} has OR-only GPR: '{original_gpr}'")

        return results

def run_individual_tests():
    """Run individual component tests"""
    print("\n" + "="*80)
    print("INDIVIDUAL COMPONENT TESTS")
    print("="*80)

    # Test 1: OR-only detection
    print("\n1. Testing OR-only detection...")
    or_only_cases = [
        ("GENE1 or GENE2", True),
        ("(GENE1 or GENE2)", True),
        ("GENE1 or GENE2 or GENE3", True),
        ("(GENE1 or GENE2) or GENE3", True),
        ("((GENE1 or GENE2) or GENE3) or GENE4", True),
        ("GENE1 and GENE2", False),
        ("GENE1 or (GENE2 and GENE3)", False),
        ("(GENE1 or GENE2) and GENE3", False),
        ("GENE1", False),  # Single gene
        ("", False),  # Empty
    ]

    for gpr, expected in or_only_cases:
        result = model_utils._is_or_only_expression(gpr)
        status = "✓" if result == expected else "❌"
        print(f" {status} '{gpr}' -> {result} (expected: {expected})")

    # Test 2: GPR flattening
    print("\n2. Testing GPR flattening...")
    flattening_cases = [
        ("GENE1 or GENE1", "GENE1"),
        ("(GENE1 or GENE1) or GENE2", "GENE1 or GENE2"),
        ("GENE1 or GENE2 or GENE1", "GENE1 or GENE2"),
        ("(GENE1 or GENE2) or (GENE1 or GENE3)", "GENE1 or GENE2 or GENE3"),
        ("((A or A) or B) or C", "A or B or C"),
    ]

    for original, expected in flattening_cases:
        result = model_utils._flatten_or_only_gpr(original)
        status = "✓" if result == expected else "❌"
        print(f" {status} '{original}' -> '{result}' (expected: '{expected}')")

def main():
    """Main test function"""
    print("COBRAxy GPR Translation Comprehensive Test Suite")
    print("=" * 80)

    # Run individual component tests first
    run_individual_tests()

    # Run comprehensive test suite
    tester = GPRTranslationTester()
    results = tester.run_comprehensive_test()

    # Save results for further analysis if needed
    if results['success']:
        print("\n✅ All tests completed successfully!")
        print("📁 Test models and results available in results object")

        # Optionally save to file
        try:
            import pickle
            with open('/tmp/gpr_translation_test_results.pkl', 'wb') as f:
                pickle.dump(results, f)
            print("📁 Detailed results saved to /tmp/gpr_translation_test_results.pkl")
        except Exception:
            # Saving the results is best-effort only
            pass
    else:
        print(f"\n❌ Tests failed: {results.get('error', 'Unknown error')}")
        return False

    return True

if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)
```
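The suite is normally run as a script (`python test_gpr_translation_comprehensive.py`), but the tester class can also be driven from an interactive session. A minimal sketch, assuming the COBRAxy checkout (and therefore `utils.model_utils`) is importable from the working directory:

```python
# Minimal interactive usage sketch; assumes this file and the COBRAxy
# 'utils' package are both importable (e.g. run from the COBRAxy root).
from test_gpr_translation_comprehensive import GPRTranslationTester

tester = GPRTranslationTester()
results = tester.run_comprehensive_test()

# The results dict exposes the original/translated models and the issues.
translated = results['models']['translated']
print(len(results['issues']['actual']), "reactions reported translation issues")
```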
