changeset 493:a92d21f92956 draft
Uploaded
| author   | francesco_lapi |
|----------|----------------|
| date     | Tue, 30 Sep 2025 15:00:21 +0000 |
| parents  | 4ed95023af20 |
| children | 5397559097dc |
| files    | COBRAxy/test_gpr_translation_comprehensive.py COBRAxy/utils/model_utils.py |
| diffstat | 2 files changed, 699 insertions(+), 2 deletions(-) |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/COBRAxy/test_gpr_translation_comprehensive.py Tue Sep 30 15:00:21 2025 +0000
@@ -0,0 +1,618 @@
+#!/usr/bin/env python3
+"""
+Comprehensive test suite for GPR translation functionality in COBRAxy.
+
+This test suite covers:
+- Basic 1:1, 1:many, many:1 gene mappings
+- Complex GPR expressions with AND/OR logic
+- Translation issues tracking
+- OR-only GPR flattening functionality
+- Edge cases and nested expressions
+- Statistical reporting
+"""
+
+import cobra
+import pandas as pd
+import sys
+import os
+import logging
+from typing import Dict, List, Tuple
+import re
+
+# Add the COBRAxy utils directory to the path
+sys.path.append('/hdd/home/flapi/COBRAxy')
+from utils import model_utils
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
+logger = logging.getLogger(__name__)
+
+class GPRTranslationTester:
+    """Comprehensive GPR translation test suite"""
+
+    def __init__(self):
+        self.test_results = {}
+        self.failed_tests = []
+
+    def create_comprehensive_test_model(self) -> cobra.Model:
+        """Create a comprehensive test model with diverse GPR patterns"""
+        model = cobra.Model('comprehensive_test_model')
+
+        # Create metabolites
+        metabolites = []
+        for i in range(30):
+            met = cobra.Metabolite(f'met_{chr(65+i%26)}{i//26}', compartment='c')
+            metabolites.append(met)
+
+        reactions_data = [
+            # === BASIC CASES ===
+            ('BASIC_1to1', 'GENE1', 0, 1),  # Simple 1:1 mapping
+            ('BASIC_1tomany', 'GENE2', 1, 2),  # 1:many mapping
+            ('BASIC_manyto1', 'GENE3', 2, 3),  # many:1 mapping
+            ('BASIC_unmapped', 'UNMAPPED_GENE', 3, 4),  # unmapped gene
+
+            # === SIMPLE OR CASES (candidates for flattening) ===
+            ('OR_simple', 'GENE4 or GENE5', 4, 5),  # Simple OR with many:1
+            ('OR_three', 'GENE6 or GENE7 or GENE8', 5, 6),  # Three genes OR
+            ('OR_parentheses', '(GENE9 or GENE10)', 6, 7),  # OR with parentheses
+            ('OR_duplicates', 'GENE11 or GENE12 or GENE11', 7, 8),  # OR with duplicates after translation
+
+            # === COMPLEX OR CASES (candidates for flattening) ===
+            ('OR_nested_simple', '(GENE13 or GENE14) or (GENE15 or GENE16)', 8, 9),  # Nested OR only
+            ('OR_many_parentheses', '((GENE17 or GENE18) or GENE19) or GENE20', 9, 10),  # Multiple nesting levels
+            ('OR_mixed_mapping', 'GENE21 or GENE22 or GENE23', 10, 11),  # Mixed 1:1, 1:many, many:1
+
+            # === AND CASES (should NOT be flattened) ===
+            ('AND_simple', 'GENE24 and GENE25', 11, 12),  # Simple AND
+            ('AND_complex', '(GENE26 and GENE27) and GENE28', 12, 13),  # Complex AND
+
+            # === MIXED AND/OR (should NOT be flattened) ===
+            ('MIXED_basic', 'GENE29 and (GENE30 or GENE31)', 13, 14),  # AND with OR
+            ('MIXED_complex', '(GENE32 or GENE33) and (GENE34 or GENE35)', 14, 15),  # OR and AND
+            ('MIXED_nested', '((GENE36 and GENE37) or GENE38) and GENE39', 15, 16),  # Complex nesting
+
+            # === EDGE CASES ===
+            ('EDGE_single', 'GENE40', 16, 17),  # Single gene
+            ('EDGE_empty', '', 17, 18),  # Empty GPR
+            ('EDGE_whitespace', ' GENE41 or GENE42 ', 18, 19),  # Whitespace
+            ('EDGE_case_sensitive', 'Gene43 OR gene44', 19, 20),  # Case variations
+
+            # === STRESS TESTS ===
+            ('STRESS_long_or', 'GENE45 or GENE46 or GENE47 or GENE48 or GENE49 or GENE50', 20, 21),  # Long OR chain
+            ('STRESS_deep_nest', '(((GENE51 or GENE52) or GENE53) or GENE54)', 21, 22),  # Deep nesting
+            ('STRESS_complex', '(GENE55 or (GENE56 or GENE57)) or ((GENE58 or GENE59) or GENE60)', 22, 23),  # Complex structure
+
+            # === TRANSLATION ISSUE TRIGGERS ===
+            ('ISSUE_1many_or', 'GENE61 or GENE62', 23, 24),  # 1:many in OR (should be flattened)
+            ('ISSUE_manyto1_and', 'GENE63 and GENE64', 24, 25),  # many:1 in AND (should NOT be flattened)
+            ('ISSUE_mixed_problems', '(GENE65 or GENE66) and GENE67', 25, 26),  # Mixed problems
+
+            # === REAL-WORLD INSPIRED CASES ===
+            ('REAL_metabolism', '(ENSG001 or ENSG002) or (ENSG003 or ENSG004)', 26, 27),  # Metabolic pathway
+            ('REAL_transport', 'TRANSPORTER1 and (COFACTOR1 or COFACTOR2)', 27, 28),  # Transport reaction
+            ('REAL_complex_enzyme', '((SUBUNIT1 and SUBUNIT2) or SUBUNIT3) and COFACTOR3', 28, 29),  # Complex enzyme
+        ]
+
+        # Create reactions
+        for rxn_id, gpr, met_in, met_out in reactions_data:
+            rxn = cobra.Reaction(rxn_id)
+            if met_in < len(metabolites) and met_out < len(metabolites):
+                rxn.add_metabolites({metabolites[met_in]: -1, metabolites[met_out]: 1})
+            rxn.gene_reaction_rule = gpr
+            model.add_reactions([rxn])
+
+        return model
+
+    def create_comprehensive_mapping(self) -> pd.DataFrame:
+        """Create a comprehensive gene mapping covering all test scenarios"""
+        mapping_data = {
+            'hgnc_symbol': [],
+            'ensg': []
+        }
+
+        # === BASIC MAPPINGS ===
+        # 1:1 mappings
+        one_to_one = [
+            ('GENE1', 'TARGET1'),
+            ('GENE24', 'TARGET24'),
+            ('GENE25', 'TARGET25'),
+            ('GENE26', 'TARGET26'),
+            ('GENE27', 'TARGET27'),
+            ('GENE28', 'TARGET28'),
+            ('GENE29', 'TARGET29'),
+            ('GENE40', 'TARGET40'),
+            ('GENE41', 'TARGET41'),
+            ('GENE42', 'TARGET42'),
+        ]
+
+        # 1:many mappings (one source gene maps to multiple targets)
+        one_to_many = [
+            ('GENE2', 'TARGET2A'),
+            ('GENE2', 'TARGET2B'),
+            ('GENE30', 'TARGET30A'),
+            ('GENE30', 'TARGET30B'),
+            ('GENE61', 'TARGET61A'),
+            ('GENE61', 'TARGET61B'),
+            ('GENE61', 'TARGET61C'),  # Maps to 3 targets
+            ('GENE65', 'TARGET65A'),
+            ('GENE65', 'TARGET65B'),
+        ]
+
+        # many:1 mappings (multiple source genes map to one target)
+        many_to_one = [
+            ('GENE3', 'SHARED_TARGET1'),
+            ('GENE4', 'SHARED_TARGET1'),
+            ('GENE5', 'SHARED_TARGET1'),
+            ('GENE6', 'SHARED_TARGET2'),
+            ('GENE7', 'SHARED_TARGET2'),
+            ('GENE8', 'SHARED_TARGET2'),
+            ('GENE9', 'SHARED_TARGET3'),
+            ('GENE10', 'SHARED_TARGET3'),
+            ('GENE11', 'SHARED_TARGET4'),
+            ('GENE12', 'SHARED_TARGET4'),
+            ('GENE13', 'SHARED_TARGET5'),
+            ('GENE14', 'SHARED_TARGET5'),
+            ('GENE15', 'SHARED_TARGET5'),
+            ('GENE16', 'SHARED_TARGET5'),
+            ('GENE17', 'SHARED_TARGET6'),
+            ('GENE18', 'SHARED_TARGET6'),
+            ('GENE19', 'SHARED_TARGET6'),
+            ('GENE20', 'SHARED_TARGET6'),
+            ('GENE45', 'SHARED_TARGET7'),
+            ('GENE46', 'SHARED_TARGET7'),
+            ('GENE47', 'SHARED_TARGET7'),
+            ('GENE48', 'SHARED_TARGET7'),
+            ('GENE49', 'SHARED_TARGET7'),
+            ('GENE50', 'SHARED_TARGET7'),
+            ('GENE51', 'SHARED_TARGET8'),
+            ('GENE52', 'SHARED_TARGET8'),
+            ('GENE53', 'SHARED_TARGET8'),
+            ('GENE54', 'SHARED_TARGET8'),
+            ('GENE55', 'SHARED_TARGET9'),
+            ('GENE56', 'SHARED_TARGET9'),
+            ('GENE57', 'SHARED_TARGET9'),
+            ('GENE58', 'SHARED_TARGET9'),
+            ('GENE59', 'SHARED_TARGET9'),
+            ('GENE60', 'SHARED_TARGET9'),
+            ('GENE63', 'SHARED_TARGET10'),
+            ('GENE64', 'SHARED_TARGET10'),
+            ('GENE66', 'SHARED_TARGET11'),
+        ]
+
+        # Mixed mappings for complex cases
+        mixed_mappings = [
+            ('GENE21', 'TARGET21'),  # 1:1
+            ('GENE22', 'TARGET22A'),  # 1:many
+            ('GENE22', 'TARGET22B'),
+            ('GENE23', 'SHARED_TARGET1'),  # many:1 (shares with GENE3-5)
+            ('GENE31', 'SHARED_TARGET12'),
+            ('GENE32', 'SHARED_TARGET13'),
+            ('GENE33', 'SHARED_TARGET13'),
+            ('GENE34', 'TARGET34'),
+            ('GENE35', 'TARGET35'),
+            ('GENE36', 'TARGET36'),
+            ('GENE37', 'TARGET37'),
+            ('GENE38', 'TARGET38'),
+            ('GENE39', 'TARGET39'),
+            ('GENE62', 'TARGET62A'),
+            ('GENE62', 'TARGET62B'),
+            ('GENE67', 'TARGET67'),
+        ]
+
+        # Case sensitivity tests
+        case_mappings = [
+            ('Gene43', 'TARGET43'),
+            ('gene44', 'TARGET44'),
+        ]
+
+        # Real-world inspired mappings
+        real_mappings = [
+            ('ENSG001', 'HUMAN_GENE1'),
+            ('ENSG002', 'HUMAN_GENE2'),
+            ('ENSG003', 'HUMAN_GENE1'),  # many:1
+            ('ENSG004', 'HUMAN_GENE2'),  # many:1
+            ('TRANSPORTER1', 'SLC_FAMILY1'),
+            ('COFACTOR1', 'COFACTOR_A'),
+            ('COFACTOR2', 'COFACTOR_A'),  # many:1
+            ('COFACTOR3', 'COFACTOR_B'),
+            ('SUBUNIT1', 'COMPLEX_SUBUNIT1'),
+            ('SUBUNIT2', 'COMPLEX_SUBUNIT2'),
+            ('SUBUNIT3', 'COMPLEX_ALTERNATIVE'),
+        ]
+
+        # Combine all mappings
+        all_mappings = one_to_one + one_to_many + many_to_one + mixed_mappings + case_mappings + real_mappings
+
+        for source, target in all_mappings:
+            mapping_data['hgnc_symbol'].append(source)
+            mapping_data['ensg'].append(target)
+
+        return pd.DataFrame(mapping_data)
+
+    def analyze_mapping_statistics(self, mapping_df: pd.DataFrame) -> Dict:
+        """Analyze mapping statistics"""
+        stats = {}
+
+        source_counts = mapping_df.groupby('hgnc_symbol')['ensg'].count()
+        target_counts = mapping_df.groupby('ensg')['hgnc_symbol'].count()
+
+        stats['total_mappings'] = len(mapping_df)
+        stats['unique_sources'] = len(source_counts)
+        stats['unique_targets'] = len(target_counts)
+
+        stats['one_to_one'] = (source_counts == 1).sum()
+        stats['one_to_many'] = (source_counts > 1).sum()
+        stats['many_to_one_targets'] = (target_counts > 1).sum()
+
+        stats['one_to_many_details'] = {}
+        for gene, count in source_counts[source_counts > 1].items():
+            targets = mapping_df[mapping_df['hgnc_symbol'] == gene]['ensg'].tolist()
+            stats['one_to_many_details'][gene] = targets
+
+        stats['many_to_one_details'] = {}
+        for target, count in target_counts[target_counts > 1].items():
+            sources = mapping_df[mapping_df['ensg'] == target]['hgnc_symbol'].tolist()
+            stats['many_to_one_details'][target] = sources
+
+        return stats
+
+    def predict_translation_issues(self, model: cobra.Model, mapping_df: pd.DataFrame) -> Dict:
+        """Predict which reactions will have translation issues"""
+        predictions = {}
+        mapping_dict = {}
+
+        # Build mapping dictionary
+        for _, row in mapping_df.iterrows():
+            source = row['hgnc_symbol']
+            target = row['ensg']
+            if source not in mapping_dict:
+                mapping_dict[source] = []
+            mapping_dict[source].append(target)
+
+        for rxn in model.reactions:
+            if not rxn.gene_reaction_rule or rxn.gene_reaction_rule.strip() == '':
+                continue
+
+            # Extract genes from GPR
+            token_pattern = r'\b[A-Za-z0-9:_.-]+\b'
+            tokens = re.findall(token_pattern, rxn.gene_reaction_rule)
+            logical_operators = {'and', 'or', 'AND', 'OR', '(', ')'}
+            genes = [t for t in tokens if t not in logical_operators]
+
+            issues = []
+            has_1_to_many = False
+            has_many_to_1 = False
+            has_unmapped = False
+
+            for gene in set(genes):
+                norm_gene = model_utils._normalize_gene_id(gene)
+                if norm_gene in mapping_dict:
+                    targets = mapping_dict[norm_gene]
+                    if len(targets) > 1:
+                        has_1_to_many = True
+                        issues.append(f"1:many - {gene} -> {targets}")
+                else:
+                    has_unmapped = True
+                    issues.append(f"unmapped - {gene}")
+
+            # Check for many:1 mappings
+            target_to_sources = {}
+            for gene in set(genes):
+                norm_gene = model_utils._normalize_gene_id(gene)
+                if norm_gene in mapping_dict:
+                    for target in mapping_dict[norm_gene]:
+                        if target not in target_to_sources:
+                            target_to_sources[target] = []
+                        target_to_sources[target].append(gene)
+
+            for target, sources in target_to_sources.items():
+                if len(sources) > 1:
+                    has_many_to_1 = True
+                    issues.append(f"many:1 - {sources} -> {target}")
+
+            if issues:
+                predictions[rxn.id] = {
+                    'issues': issues,
+                    'has_1_to_many': has_1_to_many,
+                    'has_many_to_1': has_many_to_1,
+                    'has_unmapped': has_unmapped,
+                    'is_or_only': self._check_if_or_only(rxn.gene_reaction_rule),
+                    'predicted_flattening': has_1_to_many or has_many_to_1 and self._check_if_or_only(rxn.gene_reaction_rule)
+                }
+
+        return predictions
+
+    def _check_if_or_only(self, gpr: str) -> bool:
+        """Check if GPR contains only OR operators (and parentheses)"""
+        if not gpr or gpr.strip() == '':
+            return False
+
+        # Remove gene names and whitespace, keep only logical operators
+        token_pattern = r'\b[A-Za-z0-9:_.-]+\b'
+        logic_only = re.sub(token_pattern, '', gpr)
+        logic_only = re.sub(r'\s+', ' ', logic_only.strip())
+
+        # Check for AND operators
+        and_pattern = r'\b(and|AND)\b'
+        return not bool(re.search(and_pattern, logic_only))
+
+    def run_comprehensive_test(self) -> Dict:
+        """Run the comprehensive translation test"""
+        print("="*80)
+        print("COMPREHENSIVE GPR TRANSLATION TEST SUITE")
+        print("="*80)
+
+        # Create test model and mapping
+        print("\n1. Creating test model and mapping...")
+        model = self.create_comprehensive_test_model()
+        mapping_df = self.create_comprehensive_mapping()
+
+        print(f" ✓ Created model with {len(model.reactions)} reactions")
+        print(f" ✓ Created mapping with {len(mapping_df)} entries")
+
+        # Analyze mapping statistics
+        print("\n2. Analyzing mapping statistics...")
+        mapping_stats = self.analyze_mapping_statistics(mapping_df)
+        print(f" ✓ Unique source genes: {mapping_stats['unique_sources']}")
+        print(f" ✓ Unique target genes: {mapping_stats['unique_targets']}")
+        print(f" ✓ 1:1 mappings: {mapping_stats['one_to_one']}")
+        print(f" ✓ 1:many mappings: {mapping_stats['one_to_many']}")
+        print(f" ✓ Many:1 target genes: {mapping_stats['many_to_one_targets']}")
+
+        # Predict translation issues
+        print("\n3. Predicting translation issues...")
+        predicted_issues = self.predict_translation_issues(model, mapping_df)
+        predicted_or_only = sum(1 for pred in predicted_issues.values() if pred['is_or_only'])
+        predicted_flattening = sum(1 for pred in predicted_issues.values() if pred['predicted_flattening'])
+
+        print(f" ✓ Reactions with predicted issues: {len(predicted_issues)}")
+        print(f" ✓ OR-only reactions: {predicted_or_only}")
+        print(f" ✓ Predicted for flattening: {predicted_flattening}")
+
+        # Display original GPRs
+        print("\n4. Original model GPRs:")
+        for rxn in sorted(model.reactions, key=lambda x: x.id):
+            status = "🔍" if rxn.id in predicted_issues else "✓"
+            or_only = "🔗" if predicted_issues.get(rxn.id, {}).get('is_or_only', False) else " "
+            print(f" {status}{or_only} {rxn.id:20} : {rxn.gene_reaction_rule}")
+
+        # Run translation
+        print("\n5. Running translation...")
+        try:
+            translated_model, translation_issues = model_utils.translate_model_genes(
+                model=model,
+                mapping_df=mapping_df,
+                target_nomenclature='ensg',
+                source_nomenclature='hgnc_symbol',
+                allow_many_to_one=True
+            )
+            print(" ✓ Translation completed successfully")
+        except Exception as e:
+            print(f" ❌ Translation failed: {e}")
+            import traceback
+            traceback.print_exc()
+            return {'success': False, 'error': str(e)}
+
+        # Display translated GPRs
+        print("\n6. Translated model GPRs:")
+        for rxn in sorted(translated_model.reactions, key=lambda x: x.id):
+            has_issues = "🚨" if rxn.id in translation_issues else "✓"
+            print(f" {has_issues} {rxn.id:20} : {rxn.gene_reaction_rule}")
+
+        # Analyze translation issues
+        print("\n7. Translation issues analysis:")
+        if translation_issues:
+            for rxn_id, issues_str in sorted(translation_issues.items()):
+                predicted = predicted_issues.get(rxn_id, {})
+                prediction_status = "✓ PREDICTED" if rxn_id in predicted_issues else "❓ UNEXPECTED"
+                print(f" 🚨 {rxn_id:20} ({prediction_status})")
+                # Split issues string by semicolon separator
+                if issues_str:
+                    issues_list = [issue.strip() for issue in issues_str.split(';') if issue.strip()]
+                    for issue in issues_list:
+                        print(f" - {issue}")
+                else:
+                    print(f" - No specific issues reported")
+        else:
+            print(" ✅ No translation issues detected")
+
+        # Compare predictions vs actual
+        print("\n8. Prediction accuracy:")
+        true_positive = set(predicted_issues.keys()) & set(translation_issues.keys())
+        false_positive = set(predicted_issues.keys()) - set(translation_issues.keys())
+        false_negative = set(translation_issues.keys()) - set(predicted_issues.keys())
+
+        print(f" ✓ Correctly predicted issues: {len(true_positive)}")
+        print(f" ⚠ False positives: {len(false_positive)}")
+        print(f" ❌ False negatives: {len(false_negative)}")
+
+        if false_positive:
+            print(" False positive reactions:")
+            for rxn_id in false_positive:
+                print(f" - {rxn_id}")
+
+        if false_negative:
+            print(" False negative reactions:")
+            for rxn_id in false_negative:
+                print(f" - {rxn_id}")
+
+        # Test specific functionality
+        print("\n9. Testing OR-only GPR flattening...")
+        flattening_tests = self.test_or_only_flattening(translated_model, translation_issues)
+
+        # Summary statistics
+        print("\n10. Summary:")
+        results = {
+            'success': True,
+            'model_reactions': len(model.reactions),
+            'mapping_entries': len(mapping_df),
+            'predicted_issues': len(predicted_issues),
+            'actual_issues': len(translation_issues),
+            'prediction_accuracy': {
+                'true_positive': len(true_positive),
+                'false_positive': len(false_positive),
+                'false_negative': len(false_negative),
+                'precision': len(true_positive) / len(predicted_issues) if predicted_issues else 0,
+                'recall': len(true_positive) / len(translation_issues) if translation_issues else 0,
+            },
+            'mapping_stats': mapping_stats,
+            'flattening_tests': flattening_tests,
+            'models': {
+                'original': model,
+                'translated': translated_model
+            },
+            'issues': {
+                'predicted': predicted_issues,
+                'actual': translation_issues
+            }
+        }
+
+        precision = results['prediction_accuracy']['precision']
+        recall = results['prediction_accuracy']['recall']
+        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
+
+        print(f" 📊 Total reactions: {len(model.reactions)}")
+        print(f" 📊 Reactions with issues: {len(translation_issues)}")
+        print(f" 📊 Prediction precision: {precision:.2%}")
+        print(f" 📊 Prediction recall: {recall:.2%}")
+        print(f" 📊 Prediction F1-score: {f1:.2%}")
+        print(f" 📊 OR-only flattening tests: {flattening_tests['passed']}/{flattening_tests['total']}")
+
+        print("\n" + "="*80)
+        print("TEST SUITE COMPLETED")
+        print("="*80)
+
+        return results
+
+    def test_or_only_flattening(self, model: cobra.Model, translation_issues: Dict) -> Dict:
+        """Test the OR-only GPR flattening functionality"""
+        test_cases = [
+            # (original_gpr, expected_after_flattening, should_be_flattened)
+            ("SHARED_TARGET1 or SHARED_TARGET1", "SHARED_TARGET1", True),
+            ("(SHARED_TARGET2 or SHARED_TARGET2) or SHARED_TARGET2", "SHARED_TARGET2", True),
+            ("TARGET1 or TARGET2 or TARGET1", "TARGET1 or TARGET2", True),
+            ("(TARGET1 or TARGET2) and TARGET3", "(TARGET1 or TARGET2) and TARGET3", False),  # Contains AND
+            ("TARGET1 and TARGET1", "TARGET1", True),  # Should simplify AND duplicates too
+        ]
+
+        results = {'total': 0, 'passed': 0, 'failed': [], 'details': []}
+
+        print(" Testing OR-only flattening functionality:")
+
+        # Test the helper functions directly
+        for original, expected, should_flatten in test_cases:
+            results['total'] += 1
+
+            # Test _is_or_only_expression
+            is_or_only = model_utils._is_or_only_expression(original)
+
+            # Test _flatten_or_only_gpr if it should be OR-only
+            if should_flatten and 'and' not in original.lower():
+                flattened = model_utils._flatten_or_only_gpr(original)
+                passed = flattened == expected
+            else:
+                passed = not should_flatten or is_or_only == (not 'and' in original.lower())
+                flattened = original
+
+            status = "✓" if passed else "❌"
+            results['details'].append({
+                'original': original,
+                'expected': expected,
+                'flattened': flattened,
+                'is_or_only': is_or_only,
+                'should_flatten': should_flatten,
+                'passed': passed
+            })
+
+            if passed:
+                results['passed'] += 1
+            else:
+                results['failed'].append(f"{original} -> {flattened} (expected: {expected})")
+
+            print(f" {status} '{original}' -> '{flattened}' (OR-only: {is_or_only})")
+
+        # Test actual model reactions that should have been flattened
+        for rxn in model.reactions:
+            if rxn.id in translation_issues:
+                original_gpr = rxn.gene_reaction_rule
+                is_or_only = model_utils._is_or_only_expression(original_gpr)
+                if is_or_only:
+                    print(f" 🔍 Real case: {rxn.id} has OR-only GPR: '{original_gpr}'")
+
+        return results
+
+def run_individual_tests():
+    """Run individual component tests"""
+    print("\n" + "="*80)
+    print("INDIVIDUAL COMPONENT TESTS")
+    print("="*80)
+
+    # Test 1: OR-only detection
+    print("\n1. Testing OR-only detection...")
+    or_only_cases = [
+        ("GENE1 or GENE2", True),
+        ("(GENE1 or GENE2)", True),
+        ("GENE1 or GENE2 or GENE3", True),
+        ("(GENE1 or GENE2) or GENE3", True),
+        ("((GENE1 or GENE2) or GENE3) or GENE4", True),
+        ("GENE1 and GENE2", False),
+        ("GENE1 or (GENE2 and GENE3)", False),
+        ("(GENE1 or GENE2) and GENE3", False),
+        ("GENE1", False),  # Single gene
+        ("", False),  # Empty
+    ]
+
+    for gpr, expected in or_only_cases:
+        result = model_utils._is_or_only_expression(gpr)
+        status = "✓" if result == expected else "❌"
+        print(f" {status} '{gpr}' -> {result} (expected: {expected})")
+
+    # Test 2: GPR flattening
+    print("\n2. Testing GPR flattening...")
+    flattening_cases = [
+        ("GENE1 or GENE1", "GENE1"),
+        ("(GENE1 or GENE1) or GENE2", "GENE1 or GENE2"),
+        ("GENE1 or GENE2 or GENE1", "GENE1 or GENE2"),
+        ("(GENE1 or GENE2) or (GENE1 or GENE3)", "GENE1 or GENE2 or GENE3"),
+        ("((A or A) or B) or C", "A or B or C"),
+    ]
+
+    for original, expected in flattening_cases:
+        result = model_utils._flatten_or_only_gpr(original)
+        status = "✓" if result == expected else "❌"
+        print(f" {status} '{original}' -> '{result}' (expected: '{expected}')")
+
+def main():
+    """Main test function"""
+    print("COBRAxy GPR Translation Comprehensive Test Suite")
+    print("=" * 80)
+
+    # Run individual component tests first
+    run_individual_tests()
+
+    # Run comprehensive test suite
+    tester = GPRTranslationTester()
+    results = tester.run_comprehensive_test()
+
+    # Save results for further analysis if needed
+    if results['success']:
+        print(f"\n✅ All tests completed successfully!")
+        print(f"📁 Test models and results available in results object")
+
+        # Optionally save to file
+        try:
+            import pickle
+            with open('/tmp/gpr_translation_test_results.pkl', 'wb') as f:
+                pickle.dump(results, f)
+            print(f"📁 Detailed results saved to /tmp/gpr_translation_test_results.pkl")
+        except:
+            pass
+    else:
+        print(f"\n❌ Tests failed: {results.get('error', 'Unknown error')}")
+        return False
+
+    return True
+
+if __name__ == "__main__":
+    success = main()
+    sys.exit(0 if success else 1)
\ No newline at end of file
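
The new test module is a standalone script (it calls `main()` under `if __name__ == "__main__"`). A minimal, hypothetical way to drive it from an interactive session instead, assuming the COBRAxy checkout is the working directory (standing in for the hard-coded `/hdd/home/flapi/COBRAxy` path in the script), might look like this:

```python
# Hypothetical interactive run of the new test module; assumes the COBRAxy
# checkout is the working directory so that `utils.model_utils` is importable.
import sys
sys.path.insert(0, '.')  # stand-in for the script's hard-coded absolute path

import test_gpr_translation_comprehensive as gpr_tests

gpr_tests.run_individual_tests()                      # fast helper-level checks
results = gpr_tests.GPRTranslationTester().run_comprehensive_test()
print(results['prediction_accuracy'])                 # precision/recall summary
```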
--- a/COBRAxy/utils/model_utils.py Tue Sep 30 14:02:17 2025 +0000
+++ b/COBRAxy/utils/model_utils.py Tue Sep 30 15:00:21 2025 +0000
@@ -366,7 +366,10 @@
     """
     metabolites = set()
     # optional coefficient followed by a token ending with _<letters>
-    pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[a-z]+)'
+    if reaction_formula[-1] == ']' and reaction_formula[-3] == '[':
+        pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)'
+    else:
+        pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)'
     matches = re.findall(pattern, reaction_formula)
     metabolites.update(matches)
     return metabolites
@@ -376,6 +379,8 @@
     """Extract the compartment from a metabolite ID."""
     if '_' in metabolite_id:
         return metabolite_id.split('_')[-1]
+    if metabolite_id[-1] == ']' and metabolite_id[-3] == '[':
+        return metabolite_id[-2]
     return 'c'  # default cytoplasm
 
 
@@ -598,6 +603,66 @@
     g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE)
     return g
 
+def _is_or_only_expression(expr: str) -> bool:
+    """
+    Check if a GPR expression contains only OR operators (no AND operators).
+
+    Args:
+        expr: GPR expression string
+
+    Returns:
+        bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise
+    """
+    if not expr or not expr.strip():
+        return False
+
+    # Normalize the expression
+    normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ')
+
+    # Check if it contains any AND operators
+    has_and = ' and ' in normalized.lower()
+
+    # Check if it contains OR operators
+    has_or = ' or ' in normalized.lower()
+
+    # Must have OR operators and no AND operators
+    return has_or and not has_and
+
+
+def _flatten_or_only_gpr(expr: str) -> str:
+    """
+    Flatten a GPR expression that contains only OR operators by:
+    1. Removing all parentheses
+    2. Extracting unique gene names
+    3. Joining them with ' or '
+
+    Args:
+        expr: GPR expression string with only OR operators
+
+    Returns:
+        str: Flattened GPR expression
+    """
+    if not expr or not expr.strip():
+        return expr
+
+    # Extract all gene tokens (exclude logical operators and parentheses)
+    gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
+    logical = {'and', 'or', 'AND', 'OR', '(', ')'}
+
+    tokens = re.findall(gene_pattern, expr)
+    genes = [t for t in tokens if t not in logical]
+
+    # Create set to remove duplicates, then convert back to list to maintain some order
+    unique_genes = list(dict.fromkeys(genes))  # Preserves insertion order
+
+    if len(unique_genes) == 0:
+        return expr
+    elif len(unique_genes) == 1:
+        return unique_genes[0]
+    else:
+        return ' or '.join(unique_genes)
+
+
 def _simplify_boolean_expression(expr: str) -> str:
     """
     Simplify a boolean expression by removing duplicates while strictly preserving semantics.
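
The two helpers introduced in the hunk above are small string utilities. A minimal sketch of their expected behaviour, imported the same way the test suite does and matching the expectations in its `flattening_cases`:

```python
# Minimal sketch of the new helpers; expected values follow the test suite's
# or_only_cases / flattening_cases.
from utils import model_utils

gpr = "(GENE1 or GENE2) or (GENE1 or GENE3)"
print(model_utils._is_or_only_expression(gpr))    # True: 'or' present, no 'and'
print(model_utils._flatten_or_only_gpr(gpr))      # 'GENE1 or GENE2 or GENE3'

mixed = "(GENE1 or GENE2) and GENE3"
print(model_utils._is_or_only_expression(mixed))  # False: contains 'and', left alone
```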
@@ -783,7 +848,7 @@
     model_copy = model.copy()
 
     # statistics
-    stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0}
+    stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0}
     unmapped = []
     multi = []
 
@@ -802,6 +867,15 @@
             reaction_translation_issues[rxn.id] = rxn_issues
 
         if new_gpr != gpr:
+            # Check if this GPR has translation issues and contains only OR operators
+            if rxn_issues and _is_or_only_expression(new_gpr):
+                # Flatten the GPR: remove parentheses and create set of unique genes
+                flattened_gpr = _flatten_or_only_gpr(new_gpr)
+                if flattened_gpr != new_gpr:
+                    stats['flattened_or_gprs'] += 1
+                    logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'")
+                    new_gpr = flattened_gpr
+
             simplified_gpr = _simplify_boolean_expression(new_gpr)
             if simplified_gpr != new_gpr:
                 stats['simplified_gprs'] += 1
@@ -985,6 +1059,7 @@
     logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})")
     logger.info(f"Not found tokens: {stats.get('not_found', 0)}")
     logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}")
+    logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}")
 
     final_ids = {g.id for g in final_genes}
     logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}")
@@ -995,5 +1070,9 @@
         logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):")
         for orig, targets in multi_mapping_genes[:10]:
             logger.info(f" {orig} -> {', '.join(targets)}")
+
+    # Log summary of flattened GPRs if any
+    if stats.get('flattened_or_gprs', 0) > 0:
+        logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)")
\ No newline at end of file
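
Taken together, the new flattening path only fires inside `translate_model_genes` when a reaction's translated GPR both has recorded translation issues and contains no `and`. A minimal end-to-end sketch using the same keyword arguments as the test suite (the toy model and mapping below are illustrative and not part of the changeset):

```python
# Illustrative end-to-end use of translate_model_genes with a many:1 mapping;
# the call signature mirrors the test suite, the toy model/mapping are made up here.
import cobra
import pandas as pd
from utils import model_utils

model = cobra.Model('demo')
rxn = cobra.Reaction('OR_simple')
rxn.gene_reaction_rule = 'GENE4 or GENE5'   # both sources map to the same target
model.add_reactions([rxn])

mapping_df = pd.DataFrame({
    'hgnc_symbol': ['GENE4', 'GENE5'],
    'ensg':        ['SHARED_TARGET1', 'SHARED_TARGET1'],
})

translated, issues = model_utils.translate_model_genes(
    model=model,
    mapping_df=mapping_df,
    target_nomenclature='ensg',
    source_nomenclature='hgnc_symbol',
    allow_many_to_one=True,
)

print(translated.reactions.get_by_id('OR_simple').gene_reaction_rule)
# expected to collapse to 'SHARED_TARGET1', either via the new OR-only
# flattening or the existing _simplify_boolean_expression de-duplication
print(issues)  # per-reaction notes on many:1 / 1:many translations
```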