comparison COBRAxy/src/utils/model_utils.py @ 539:2fb97466e404 draft

Uploaded
author francesco_lapi
date Sat, 25 Oct 2025 14:55:13 +0000
parents
children fcdbc81feb45
comparison
equal deleted inserted replaced
538:fd53d42348bd 539:2fb97466e404
1 """
2 Utilities for generating and manipulating COBRA models and related metadata.
3
4 This module includes helpers to:
5 - extract rules, reactions, bounds, objective coefficients, and compartments
6 - build a COBRA model from a tabular file
7 - set objective and medium from dataframes
8 - validate a model and convert gene identifiers
9 - translate model GPRs using mapping tables
10 """
import logging
import os
import re
import sys
from collections import defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple, Union

import cobra
import pandas as pd
from cobra import Model as cobraModel, Reaction, Metabolite

import utils.rule_parsing as rulesUtils
import utils.reaction_parsing as reactionUtils
22
23
24 ############################ check_methods ####################################
def gene_type(l :str, name :str) -> str:
    """
    Determine the type of gene ID.

    Args:
        l (str): The gene identifier to check.
        name (str): The name of the dataset, used in error messages.

    Returns:
        str: The type of gene ID ('HGNC_ID', 'ENSG', 'HGNC_symbol', or 'entrez_id').

    Raises:
        sys.exit: If the gene ID type is not supported, the execution is aborted.
    """
    if check_hgnc(l):
        return 'HGNC_ID'
    elif check_ensembl(l):
        return 'ENSG'
    elif check_symbol(l):
        return 'HGNC_symbol'
    elif check_entrez(l):
        return 'entrez_id'
    else:
        # Fixed user-facing message: the old concatenation produced
        # "Supported IDtypes" (missing space) and misspelled "Ensembl".
        sys.exit('Execution aborted:\n' +
                 'gene ID type in ' + name + ' not supported. Supported ID ' +
                 'types are: HUGO ID, Ensembl ID, HUGO symbol, Entrez ID\n')
51
def check_hgnc(l :str) -> bool:
    """
    Check if a gene identifier follows the HGNC format ('HGNC:' + digits).

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the HGNC format, False otherwise.
    """
    # An identifier must be longer than the bare 'HGNC:' prefix and carry
    # only digits after it.
    return len(l) > 5 and l.upper().startswith('HGNC:') and l[5:].isdigit()
69
def check_ensembl(l :str) -> bool:
    """
    Check if a gene identifier follows the Ensembl format (case-insensitive
    'ENS' prefix).

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Ensembl format, False otherwise.
    """
    prefix = l[:3].upper()
    return prefix == 'ENS'
81
82
def check_symbol(l :str) -> bool:
    """
    Check if a gene identifier follows the symbol format: an alphabetic first
    character followed by at least one alphanumeric character.

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the symbol format, False otherwise.
    """
    # Note: ''.isalnum() is False, so single-character strings are rejected,
    # matching the original behavior.
    return bool(l) and l[0].isalpha() and l[1:].isalnum()
100
def check_entrez(l :str) -> bool:
    """
    Check if a gene identifier follows the Entrez ID format (digits only,
    non-empty).

    Args:
        l (str): The gene identifier to check.

    Returns:
        bool: True if the gene identifier follows the Entrez ID format, False otherwise.
    """
    return len(l) > 0 and l.isdigit()
115
116 ################################- DATA GENERATION -################################
117 ReactionId = str
def generate_rules(model: cobraModel, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]:
    """
    Generate a dictionary mapping reaction IDs to GPR rules from the model.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, parse rules into a nested list structure; otherwise keep raw strings.

    Returns:
        Dict[ReactionId, rulesUtils.OpList]: Parsed rules by reaction ID.
        Dict[ReactionId, str]: Raw rules by reaction ID.
    """
    rules = {}
    for rxn in model.reactions:
        raw_rule = rxn.gene_reaction_rule
        if not raw_rule:
            # Reactions without a GPR are left out entirely.
            continue
        rules[rxn.id] = rulesUtils.parseRuleToNestedList(raw_rule) if asParsed else raw_rule
    return rules
138
def generate_reactions(model :cobraModel, *, asParsed = True) -> Dict[ReactionId, str]:
    """
    Generate a dictionary mapping reaction IDs to reaction formulas from the model.

    Args:
        model: COBRA model to derive data from.
        asParsed: If True, convert formulas into a parsed representation; otherwise keep raw strings.

    Returns:
        Dict[ReactionId, str]: Reactions by reaction ID (parsed if requested).
    """
    raw_formulas = {}
    for rxn in model.reactions:
        formula = rxn.reaction
        if formula:
            raw_formulas[rxn.id] = formula

    if asParsed:
        return reactionUtils.create_reaction_dict(raw_formulas)
    return raw_formulas
160
def get_medium(model:cobraModel) -> pd.DataFrame:
    """
    Extract the uptake reactions representing the model medium.

    A reaction qualifies when no metabolite has a positive stoichiometric
    coefficient (uptake only) and its lower bound is negative.

    Args:
        model: COBRA model to inspect.

    Returns:
        pd.DataFrame with a single column 'reaction' listing uptake reaction IDs.
    """
    # any() short-circuits on the first positive coefficient instead of
    # scanning every metabolite with a manual flag (old code also carried a
    # stray semicolon).
    uptake_ids = [
        rxn.id
        for rxn in model.reactions
        if rxn.lower_bound < 0
        and not any(rxn.get_coefficient(met.id) > 0 for met in rxn.metabolites)
    ]

    df_medium = pd.DataFrame()
    df_medium["reaction"] = uptake_ids
    return df_medium
180
def extract_objective_coefficients(model: cobraModel) -> pd.DataFrame:
    """
    Extract objective coefficients for each reaction.

    Args:
        model: COBRA model

    Returns:
        pd.DataFrame with columns: ReactionID, ObjectiveCoefficient
    """
    # model.objective.expression is a linear expression; reactions absent
    # from it get a 0.0 coefficient.
    objective_coeffs = model.objective.expression.as_coefficients_dict()

    records = [
        {
            "ReactionID": rxn.id,
            "ObjectiveCoefficient": objective_coeffs.get(rxn.forward_variable, 0.0),
        }
        for rxn in model.reactions
    ]
    return pd.DataFrame(records)
203
def generate_bounds(model:cobraModel) -> pd.DataFrame:
    """
    Build a DataFrame of lower/upper bounds for all reactions.

    Args:
        model: COBRA model to read bounds from.

    Returns:
        pd.DataFrame indexed by reaction IDs with columns ['lower_bound', 'upper_bound'].
    """
    # The index is built inline; the old code iterated model.reactions twice,
    # once just to collect IDs.
    bounds = pd.DataFrame(
        columns=["lower_bound", "upper_bound"],
        index=[reaction.id for reaction in model.reactions],
    )

    for reaction in model.reactions:
        bounds.loc[reaction.id] = [reaction.lower_bound, reaction.upper_bound]
    return bounds
221
222
223
def generate_compartments(model: cobraModel) -> pd.DataFrame:
    """
    Generates a DataFrame containing pathway information for each reaction.
    Creates columns for each pathway position (Pathway_1, Pathway_2, etc.).

    Args:
        model: the COBRA model to extract pathway data from.

    Returns:
        pd.DataFrame: DataFrame with ReactionID and Pathway_i columns, or
        None when no reaction carries a 'pathways' annotation (the previous
        docstring incorrectly promised a ReactionID-only DataFrame).
    """
    # First pass: collect pathways per reaction and the widest pathway list.
    reaction_pathways = {}
    max_pathways = 0

    for reaction in model.reactions:
        annotation = reaction.annotation.get('pathways')
        if isinstance(annotation, list):
            # Filter out empty/None values.
            valid_pathways = [p for p in annotation if p]
            reaction_pathways[reaction.id] = valid_pathways
            max_pathways = max(max_pathways, len(valid_pathways))
        elif annotation:
            # Single truthy pathway value.
            reaction_pathways[reaction.id] = [annotation]
            max_pathways = max(max_pathways, 1)
        else:
            # Missing, None, or empty annotation.
            reaction_pathways[reaction.id] = []

    # No pathway anywhere in the model: keep the legacy None return so
    # callers can detect "nothing to export".
    if max_pathways == 0:
        return None

    pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)]

    # Second pass: one row per reaction, padding short lists with None.
    pathway_data = []
    for reaction_id, pathways in reaction_pathways.items():
        row = {"ReactionID": reaction_id}
        for i, col_name in enumerate(pathway_columns):
            row[col_name] = pathways[i] if i < len(pathways) else None
        pathway_data.append(row)

    return pd.DataFrame(pathway_data)
288
def set_annotation_pathways_from_data(model: cobraModel, df: pd.DataFrame):
    """Set reaction pathways based on 'Pathway_1', 'Pathway_2', ... columns in the dataframe."""
    pathway_cols = [c for c in df.columns if c.startswith('Pathway_')]
    if not pathway_cols:
        print("No 'Pathway_' columns found, skipping pathway annotation")
        return

    annotated = {}

    for _, row in df.iterrows():
        rxn_id = str(row['ReactionID']).strip()
        if rxn_id not in model.reactions:
            continue

        # Collect non-empty pathway cells for this reaction.
        values = [
            str(row[col]).strip()
            for col in pathway_cols
            if pd.notna(row[col]) and str(row[col]).strip()
        ]
        if not values:
            continue

        rxn = model.reactions.get_by_id(rxn_id)
        # A single pathway is stored as a scalar, multiple as a list.
        rxn.annotation['pathways'] = values[0] if len(values) == 1 else values
        annotated[rxn_id] = values

    print(f"Annotated {len(annotated)} reactions with pathways.")
319
def build_cobra_model_from_csv(csv_path: str, model_id: str = "new_model") -> cobraModel:
    """
    Build a COBRApy model from a tabular file with reaction data.

    Required columns: 'ReactionID', 'Formula'. Also read (without existence
    checks): 'lower_bound', 'upper_bound', 'GPR', and — via the helpers
    called at the end — 'ObjectiveCoefficient', 'InMedium', 'Pathway_*'.

    Args:
        csv_path: Path to the tab-separated (or comma-separated) file.
        model_id: ID for the newly created model.

    Returns:
        cobra.Model: The constructed COBRApy model.
    """

    # Try to detect separator: tab wins if present in the header line.
    with open(csv_path, 'r') as f:
        first_line = f.readline()
        sep = '\t' if '\t' in first_line else ','

    df = pd.read_csv(csv_path, sep=sep)

    # Check required columns
    required_cols = ['ReactionID', 'Formula']
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns: {missing_cols}. Available columns: {list(df.columns)}")

    model = cobraModel(model_id)

    # met_id -> Metabolite, and compartment code -> compartment code
    # (cobra accepts an id->name mapping; here id and name coincide).
    metabolites_dict = {}
    compartments_dict = {}

    print(f"Building model from {len(df)} reactions...")

    # Pass 1: harvest every metabolite/compartment mentioned by any formula,
    # so all Metabolite objects exist before reactions are parsed.
    for idx, row in df.iterrows():
        reaction_formula = str(row['Formula']).strip()
        # str(NaN) == 'nan': blank formulas are silently skipped here.
        if not reaction_formula or reaction_formula == 'nan':
            continue

        metabolites = extract_metabolites_from_reaction(reaction_formula)

        for met_id in metabolites:
            compartment = extract_compartment_from_metabolite(met_id)

            if compartment not in compartments_dict:
                compartments_dict[compartment] = compartment

            if met_id not in metabolites_dict:
                # Display name: ID with the compartment suffix stripped and
                # doubled underscores collapsed.
                metabolites_dict[met_id] = Metabolite(
                    id=met_id,
                    compartment=compartment,
                    name=met_id.replace(f"_{compartment}", "").replace("__", "_")
                )

    model.compartments = compartments_dict

    model.add_metabolites(list(metabolites_dict.values()))

    print(f"Added {len(metabolites_dict)} metabolites and {len(compartments_dict)} compartments")

    reactions_added = 0
    reactions_skipped = 0

    # Pass 2: create one Reaction per row.
    for idx, row in df.iterrows():

        reaction_id = str(row['ReactionID']).strip()
        reaction_formula = str(row['Formula']).strip()

        # NOTE(review): pass 1 tolerates blank formulas (continue) but this
        # raises for the same rows — confirm whether the raise is intended.
        if not reaction_formula or reaction_formula == 'nan':
            raise ValueError(f"Missing reaction formula for {reaction_id}")

        reaction = Reaction(reaction_id)
        reaction.name = reaction_id

        # Missing bounds default to the conventional ±1000.
        reaction.lower_bound = float(row['lower_bound']) if pd.notna(row['lower_bound']) else -1000.0
        reaction.upper_bound = float(row['upper_bound']) if pd.notna(row['upper_bound']) else 1000.0

        if pd.notna(row['GPR']) and str(row['GPR']).strip():
            reaction.gene_reaction_rule = str(row['GPR']).strip()

        # A malformed formula skips the reaction instead of aborting the build.
        try:
            parse_reaction_formula(reaction, reaction_formula, metabolites_dict)
        except Exception as e:
            print(f"Error parsing reaction {reaction_id}: {e}")
            reactions_skipped += 1
            continue

        model.add_reactions([reaction])
        reactions_added += 1


    print(f"Added {reactions_added} reactions, skipped {reactions_skipped} reactions")

    # set objective function
    set_objective_from_csv(model, df, obj_col="ObjectiveCoefficient")

    set_medium_from_data(model, df)

    set_annotation_pathways_from_data(model, df)

    print(f"Model completed: {len(model.reactions)} reactions, {len(model.metabolites)} metabolites")

    return model
421
422
423 # Estrae tutti gli ID metaboliti nella formula (gestisce prefissi numerici + underscore)
424 #def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]:
425 # """
426 # Extract metabolite IDs from a reaction formula.
427 # Robust pattern: tokens ending with _<compartment> (e.g., _c, _m, _e),
428 # allowing leading digits/underscores.
429 # """
430 # metabolites = set()
431 # # optional coefficient followed by a token ending with _<letters>
432 # if reaction_formula[-1] == ']' and reaction_formula[-3] == '[':
433 # pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+[[A-Za-z0-9]]+)'
434 # else:
435 # pattern = r'(?:\d+(?:\.\d+)?\s+)?([A-Za-z0-9_]+_[A-Za-z0-9]+)'
436 # matches = re.findall(pattern, reaction_formula)
437 # metabolites.update(matches)
438 # return metabolites
439
440
def extract_metabolites_from_reaction(reaction_formula: str) -> Set[str]:
    """
    Extract metabolite IDs from a reaction formula.

    Handles:
    - optional stoichiometric coefficients (integers or decimals)
    - compartment tags at the end of the metabolite, either [c] or _c

    Returns the IDs including the compartment suffix exactly as written.
    """
    met_pattern = re.compile(
        r'(?:^|(?<=\s)|(?<=\+)|(?<=,)|(?<==)|(?<=:))'  # left boundary (start, space, +, comma, =, :)
        r'(?:\d+(?:\.\d+)?\s+)?'  # optional coefficient (requires space after)
        r'([A-Za-z0-9][A-Za-z0-9_]*(?:\[[A-Za-z0-9]+\]|_[A-Za-z0-9]+))'  # metabolite + compartment (can start with number)
    )
    # findall with one capture group yields the metabolite tokens directly.
    return set(met_pattern.findall(reaction_formula))
457
458
459
def extract_compartment_from_metabolite(metabolite_id: str) -> str:
    """Extract the compartment from a metabolite ID.

    Supports single-character suffix styles '<met>_c' and '<met>[c]'.
    Anything else — including IDs too short to carry a tag — falls back to
    the cytoplasm.

    Args:
        metabolite_id: Metabolite identifier, e.g. 'glc__D_c' or 'glc[c]'.

    Returns:
        str: Compartment code, 'c' when undetectable.
    """
    # Length guards: the previous negative indexing raised IndexError on
    # identifiers shorter than 2 (or 3) characters.
    if len(metabolite_id) >= 2 and metabolite_id[-2] == '_':
        return metabolite_id.split('_')[-1]
    if len(metabolite_id) >= 3 and metabolite_id[-1] == ']' and metabolite_id[-3] == '[':
        return metabolite_id[-2]
    return 'c'  # default cytoplasm
467
468
def parse_reaction_formula(reaction: Reaction, formula: str, metabolites_dict: Dict[str, Metabolite]):
    """Parse a reaction formula and set metabolites with their coefficients.

    Args:
        reaction: Reaction object to populate (metabolites added in place).
        formula: Text formula using one of the arrows '<=>', '<--', '-->', '<-'.
        metabolites_dict: Known metabolites by ID; IDs absent from this dict
            are silently ignored.

    Raises:
        ValueError: If no recognized arrow is present, or splitting on the
            arrow does not yield exactly two sides.
    """
    # Longer arrows must be tested before '<-' so they win; the dead
    # 'reversible' local of the old version (assigned, never read) is gone.
    for arrow in ('<=>', '<--', '-->', '<-'):
        if arrow in formula:
            parts = formula.split(arrow)
            break
    else:
        raise ValueError(f"Unrecognized reaction format: {formula}")

    # Handle cases where one side might be empty (exchange reactions)
    if len(parts) != 2:
        raise ValueError(f"Invalid reaction format, expected 2 parts: {formula}")

    left, right = parts[0].strip(), parts[1].strip()

    reactants = parse_metabolites_side(left) if left else {}
    products = parse_metabolites_side(right) if right else {}

    # Reactants get negative coefficients, products positive.
    metabolites_to_add = {}

    for met_id, coeff in reactants.items():
        if met_id in metabolites_dict:
            metabolites_to_add[metabolites_dict[met_id]] = -coeff

    for met_id, coeff in products.items():
        if met_id in metabolites_dict:
            metabolites_to_add[metabolites_dict[met_id]] = coeff

    reaction.add_metabolites(metabolites_to_add)
507
508
def parse_metabolites_side(side_str: str) -> Dict[str, float]:
    """Parse one side of a reaction and extract metabolites with coefficients."""
    # Metabolite following an explicit coefficient (may start with '_').
    coeff_met_re = re.compile(r'[A-Za-z0-9_]+(?:\[[A-Za-z0-9]+\]|_[A-Za-z0-9]+)')
    # Bare metabolite with implicit coefficient 1 (may start with a digit).
    bare_met_re = re.compile(r'[A-Za-z0-9][A-Za-z0-9_]*(?:\[[A-Za-z0-9]+\]|_[A-Za-z0-9]+)')

    parsed: Dict[str, float] = {}
    if not side_str or not side_str.strip():
        return parsed

    for raw_term in side_str.split('+'):
        term = raw_term.strip()
        if not term:
            continue

        # "<coeff> <metabolite>" form: exactly two whitespace-separated pieces.
        pieces = term.split()
        if len(pieces) == 2:
            coeff_text, candidate = pieces
            try:
                coefficient = float(coeff_text)
            except ValueError:
                coefficient = None
            if coefficient is not None and coeff_met_re.match(candidate):
                parsed[candidate] = coefficient
                continue

        # Bare metabolite (no coefficient); otherwise warn and skip the term.
        if bare_met_re.match(term):
            parsed[term] = 1.0
        else:
            print(f"Warning: Could not parse metabolite term: '{term}'")

    return parsed
543
544
545
def set_objective_from_csv(model: cobra.Model, df: pd.DataFrame, obj_col: str = "ObjectiveCoefficient"):
    """
    Sets the model's objective function based on a column of coefficients in the CSV.
    Can be any reaction(s), not necessarily biomass.
    """
    objective = {}

    for _, row in df.iterrows():
        rxn_id = str(row['ReactionID']).strip()
        coeff = float(row[obj_col]) if pd.notna(row[obj_col]) else 0.0
        if coeff == 0:
            continue
        if rxn_id not in model.reactions:
            print(f"Warning: reaction {rxn_id} not found in model, skipping for objective.")
            continue
        objective[model.reactions.get_by_id(rxn_id)] = coeff

    if not objective:
        raise ValueError("No reactions found with non-zero objective coefficient.")

    model.objective = objective
    print(f"Objective set with {len(objective)} reactions.")
567
568
569
570
def set_medium_from_data(model: cobraModel, df: pd.DataFrame):
    """Set the medium based on the 'InMedium' column in the dataframe.

    Only reactions present in the model whose lower bound is negative are
    included; the medium value is the absolute lower bound.
    """
    if 'InMedium' not in df.columns:
        print("No 'InMedium' column found, skipping medium setup")
        return

    medium_reactions = df[df['InMedium'] == True]['ReactionID'].tolist()

    # Hoisted ID set: the old code rebuilt [r.id for r in model.reactions]
    # inside the loop, making the membership test O(n^2).
    model_rxn_ids = {r.id for r in model.reactions}

    medium_dict = {}
    for rxn_id in medium_reactions:
        if rxn_id in model_rxn_ids:
            reaction = model.reactions.get_by_id(rxn_id)
            if reaction.lower_bound < 0:
                medium_dict[rxn_id] = abs(reaction.lower_bound)

    if medium_dict:
        model.medium = medium_dict
        print(f"Medium set with {len(medium_dict)} components")
    else:
        print("No medium components found")
def validate_model(model: cobraModel) -> Dict[str, Any]:
    """Validate the model and return basic statistics.

    Args:
        model: COBRA model to inspect.

    Returns:
        Dict with structural counts (reactions, metabolites, genes,
        compartments), objective description, medium size, reversible and
        exchange reaction counts, plus the optimization outcome in
        'growth_rate'/'status' (status carries the error text on failure).
    """
    # Annotation fix: the old signature used the builtin any() instead of
    # typing.Any.
    validation = {
        'num_reactions': len(model.reactions),
        'num_metabolites': len(model.metabolites),
        'num_genes': len(model.genes),
        'num_compartments': len(model.compartments),
        'objective': str(model.objective),
        'medium_size': len(model.medium),
        'reversible_reactions': len([r for r in model.reactions if r.reversibility]),
        'exchange_reactions': len([r for r in model.reactions if r.id.startswith('EX_')]),
    }

    try:
        # Growth test. The broad catch is deliberate: a failed optimization
        # must not abort validation; the error is reported via 'status'.
        solution = model.optimize()
        validation['growth_rate'] = solution.objective_value
        validation['status'] = solution.status
    except Exception as e:
        validation['growth_rate'] = None
        validation['status'] = f"Error: {e}"

    return validation
614
def convert_genes(model, annotation):
    """Rename genes using a selected annotation key in gene.notes; returns a model copy.

    Args:
        model: COBRA model whose genes should be renamed.
        annotation: Key to look up in each gene's ``notes`` dict.

    Returns:
        A renamed copy of the model, or -1 when the annotation key is missing
        (legacy sentinel kept for backward compatibility).
    """
    from cobra.manipulation import rename_genes
    model_copy = model.copy()
    try:
        gene_map = {gene.id: gene.notes[annotation] for gene in model_copy.genes}
    except KeyError:
        # Narrowed from a bare except: only a missing annotation key is an
        # expected failure here; anything else should surface.
        print("No annotation in gene dict!")
        return -1
    rename_genes(model_copy, gene_map)

    return model_copy
627
628 # ---------- Utility helpers ----------
629 def _normalize_colname(col: str) -> str:
630 return col.strip().lower().replace(' ', '_')
631
632 def _choose_columns(mapping_df: 'pd.DataFrame') -> Dict[str, str]:
633 """
634 Find useful columns and return a dict {ensg: colname1, hgnc_id: colname2, ...}.
635 Raise ValueError if no suitable mapping is found.
636 """
637 cols = { _normalize_colname(c): c for c in mapping_df.columns }
638 chosen = {}
639 # candidate names for each category
640 candidates = {
641 'ensg': ['ensg', 'ensembl_gene_id', 'ensembl'],
642 'hgnc_id': ['hgnc_id', 'hgnc', 'hgnc:'],
643 'hgnc_symbol': ['hgnc_symbol', 'hgnc symbol', 'symbol'],
644 'entrez_id': ['entrez', 'entrez_id', 'entrezgene'],
645 'gene_number': ['gene_number']
646 }
647 for key, names in candidates.items():
648 for n in names:
649 if n in cols:
650 chosen[key] = cols[n]
651 break
652 return chosen
653
def _validate_target_uniqueness(mapping_df: 'pd.DataFrame',
                                source_col: str,
                                target_col: str,
                                model_source_genes: Optional[Set[str]] = None,
                                logger: Optional[logging.Logger] = None) -> None:
    """
    Check that, within the filtered mapping_df, each target maps to at most one source.
    Logs all duplicates and returns early when any are found.

    Args:
        mapping_df: Mapping table containing the source and target columns.
        source_col: Column holding source gene IDs.
        target_col: Column holding target gene IDs.
        model_source_genes: Optional normalized source IDs to restrict the check to.
        logger: Optional logger; the module logger is used when omitted.
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    if mapping_df.empty:
        logger.warning("Mapping dataframe is empty for the requested source genes; skipping uniqueness validation.")
        return

    # normalize temporary columns for grouping (without altering the original df)
    tmp = mapping_df[[source_col, target_col]].copy()
    tmp['_src_norm'] = tmp[source_col].astype(str).apply(_normalize_gene_id)
    tmp['_tgt_norm'] = tmp[target_col].astype(str).str.strip()

    # optionally filter to the set of model source genes
    if model_source_genes is not None:
        tmp = tmp[tmp['_src_norm'].isin(model_source_genes)]

    if tmp.empty:
        logger.warning("After filtering to model source genes, mapping table is empty — nothing to validate.")
        return

    # build reverse mapping: target -> set(sources)
    grouped = tmp.groupby('_tgt_norm')['_src_norm'].agg(lambda s: set(s.dropna()))
    # find targets with more than one source
    problematic = {t: sorted(list(s)) for t, s in grouped.items() if len(s) > 1}

    if problematic:
        # report every offending target with its colliding sources
        msg_lines = ["Mapping validation failed: some target IDs are associated with multiple source IDs."]
        for tgt, sources in problematic.items():
            msg_lines.append(f" - target '{tgt}' <- sources: {', '.join(sources)}")
        logger.warning("\n".join(msg_lines))
        # BUG FIX: previously control fell through and the success message
        # below was logged even when duplicates were found.
        return

    # if everything is fine
    logger.info("Mapping validation passed: no target ID is associated with multiple source IDs (within filtered set).")
700
701
702 def _normalize_gene_id(g: str) -> str:
703 """Normalize a gene ID for use as a key (removes prefixes like 'HGNC:' and strips)."""
704 if g is None:
705 return ""
706 g = str(g).strip()
707 # remove common prefixes
708 g = re.sub(r'^(HGNC:)', '', g, flags=re.IGNORECASE)
709 g = re.sub(r'^(ENSG:)', '', g, flags=re.IGNORECASE)
710 return g
711
712 def _is_or_only_expression(expr: str) -> bool:
713 """
714 Check if a GPR expression contains only OR operators (no AND operators).
715
716 Args:
717 expr: GPR expression string
718
719 Returns:
720 bool: True if expression contains only OR (and parentheses) and has multiple genes, False otherwise
721 """
722 if not expr or not expr.strip():
723 return False
724
725 # Normalize the expression
726 normalized = expr.replace(' AND ', ' and ').replace(' OR ', ' or ')
727
728 # Check if it contains any AND operators
729 has_and = ' and ' in normalized.lower()
730
731 # Check if it contains OR operators
732 has_or = ' or ' in normalized.lower()
733
734 # Must have OR operators and no AND operators
735 return has_or and not has_and
736
737
738 def _flatten_or_only_gpr(expr: str) -> str:
739 """
740 Flatten a GPR expression that contains only OR operators by:
741 1. Removing all parentheses
742 2. Extracting unique gene names
743 3. Joining them with ' or '
744
745 Args:
746 expr: GPR expression string with only OR operators
747
748 Returns:
749 str: Flattened GPR expression
750 """
751 if not expr or not expr.strip():
752 return expr
753
754 # Extract all gene tokens (exclude logical operators and parentheses)
755 gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
756 logical = {'and', 'or', 'AND', 'OR', '(', ')'}
757
758 tokens = re.findall(gene_pattern, expr)
759 genes = [t for t in tokens if t not in logical]
760
761 # Create set to remove duplicates, then convert back to list to maintain some order
762 unique_genes = list(dict.fromkeys(genes)) # Preserves insertion order
763
764 if len(unique_genes) == 0:
765 return expr
766 elif len(unique_genes) == 1:
767 return unique_genes[0]
768 else:
769 return ' or '.join(unique_genes)
770
771
772 def _simplify_boolean_expression(expr: str) -> str:
773 """
774 Simplify a boolean expression by removing duplicates while strictly preserving semantics.
775 This function handles simple duplicates within parentheses while being conservative about
776 complex expressions that could change semantics.
777 """
778 if not expr or not expr.strip():
779 return expr
780
781 # Normalize operators and whitespace
782 expr = expr.replace(' AND ', ' and ').replace(' OR ', ' or ')
783 expr = ' '.join(expr.split()) # Normalize whitespace
784
785 def simplify_parentheses_content(match_obj):
786 """Helper function to simplify content within parentheses."""
787 content = match_obj.group(1) # Content inside parentheses
788
789 # Only simplify if it's a pure OR or pure AND chain
790 if ' or ' in content and ' and ' not in content:
791 # Pure OR chain - safe to deduplicate
792 parts = [p.strip() for p in content.split(' or ') if p.strip()]
793 unique_parts = []
794 seen = set()
795 for part in parts:
796 if part not in seen:
797 unique_parts.append(part)
798 seen.add(part)
799
800 if len(unique_parts) == 1:
801 return unique_parts[0] # Remove unnecessary parentheses for single items
802 else:
803 return '(' + ' or '.join(unique_parts) + ')'
804
805 elif ' and ' in content and ' or ' not in content:
806 # Pure AND chain - safe to deduplicate
807 parts = [p.strip() for p in content.split(' and ') if p.strip()]
808 unique_parts = []
809 seen = set()
810 for part in parts:
811 if part not in seen:
812 unique_parts.append(part)
813 seen.add(part)
814
815 if len(unique_parts) == 1:
816 return unique_parts[0] # Remove unnecessary parentheses for single items
817 else:
818 return '(' + ' and '.join(unique_parts) + ')'
819 else:
820 # Mixed operators or single item - return with parentheses as-is
821 return '(' + content + ')'
822
823 def remove_duplicates_simple(parts_str: str, separator: str) -> str:
824 """Remove duplicates from a simple chain of operations."""
825 parts = [p.strip() for p in parts_str.split(separator) if p.strip()]
826
827 # Remove duplicates while preserving order
828 unique_parts = []
829 seen = set()
830 for part in parts:
831 if part not in seen:
832 unique_parts.append(part)
833 seen.add(part)
834
835 if len(unique_parts) == 1:
836 return unique_parts[0]
837 else:
838 return f' {separator} '.join(unique_parts)
839
840 try:
841 import re
842
843 # First, simplify content within parentheses
844 # This handles cases like (A or A) -> A and (B and B) -> B
845 expr_simplified = re.sub(r'\(([^()]+)\)', simplify_parentheses_content, expr)
846
847 # Check if the resulting expression has mixed operators
848 has_and = ' and ' in expr_simplified
849 has_or = ' or ' in expr_simplified
850
851 # Only simplify top-level if it's pure AND or pure OR
852 if has_and and not has_or and '(' not in expr_simplified:
853 # Pure AND chain at top level - safe to deduplicate
854 return remove_duplicates_simple(expr_simplified, 'and')
855 elif has_or and not has_and and '(' not in expr_simplified:
856 # Pure OR chain at top level - safe to deduplicate
857 return remove_duplicates_simple(expr_simplified, 'or')
858 else:
859 # Mixed operators or has parentheses - return the simplified version (with parentheses content cleaned)
860 return expr_simplified
861
862 except Exception:
863 # If anything goes wrong, return the original expression
864 return expr
865
866
867 def translate_model_genes(model: 'cobra.Model',
868 mapping_df: 'pd.DataFrame',
869 target_nomenclature: str,
870 source_nomenclature: str = 'hgnc_id',
871 allow_many_to_one: bool = False,
872 logger: Optional[logging.Logger] = None) -> Tuple['cobra.Model', Dict[str, str]]:
873 """
874 Translate model genes from source_nomenclature to target_nomenclature using a mapping table.
875 mapping_df should contain columns enabling mapping (e.g., ensg, hgnc_id, hgnc_symbol, entrez).
876
877 Args:
878 model: COBRA model to translate.
879 mapping_df: DataFrame containing the mapping information.
880 target_nomenclature: Desired target key (e.g., 'hgnc_symbol').
881 source_nomenclature: Current source key in the model (default 'hgnc_id').
882 allow_many_to_one: If True, allow many-to-one mappings and handle duplicates in GPRs.
883 logger: Optional logger.
884
885 Returns:
886 Tuple containing:
887 - Translated COBRA model
888 - Dictionary mapping reaction IDs to translation issue descriptions
889 """
890 if logger is None:
891 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
892 logger = logging.getLogger(__name__)
893
894 logger.info(f"Translating genes from '{source_nomenclature}' to '{target_nomenclature}'")
895
896 # normalize column names and choose relevant columns
897 chosen = _choose_columns(mapping_df)
898 if not chosen:
899 raise ValueError("Could not detect useful columns in mapping_df. Expected at least one of: ensg, hgnc_id, hgnc_symbol, entrez.")
900
901 # map source/target to actual dataframe column names (allow user-specified source/target keys)
902 # normalize input args
903 src_key = source_nomenclature.strip().lower()
904 tgt_key = target_nomenclature.strip().lower()
905
906 # try to find the actual column names for requested keys
907 col_for_src = None
908 col_for_tgt = None
909 # first, try exact match
910 for k, actual in chosen.items():
911 if k == src_key:
912 col_for_src = actual
913 if k == tgt_key:
914 col_for_tgt = actual
915
916 # if not found, try mapping common names
917 if col_for_src is None:
918 possible_src_names = {k: v for k, v in chosen.items()}
919 # try to match by contained substring
920 for k, actual in possible_src_names.items():
921 if src_key in k:
922 col_for_src = actual
923 break
924
925 if col_for_tgt is None:
926 for k, actual in chosen.items():
927 if tgt_key in k:
928 col_for_tgt = actual
929 break
930
931 if col_for_src is None:
932 raise ValueError(f"Source column for '{source_nomenclature}' not found in mapping dataframe.")
933 if col_for_tgt is None:
934 raise ValueError(f"Target column for '{target_nomenclature}' not found in mapping dataframe.")
935
936 model_source_genes = { _normalize_gene_id(g.id) for g in model.genes }
937 logger.info(f"Filtering mapping to {len(model_source_genes)} source genes present in model (normalized).")
938
939 tmp_map = mapping_df[[col_for_src, col_for_tgt]].dropna().copy()
940 tmp_map[col_for_src + "_norm"] = tmp_map[col_for_src].astype(str).apply(_normalize_gene_id)
941
942 filtered_map = tmp_map[tmp_map[col_for_src + "_norm"].isin(model_source_genes)].copy()
943
944 if filtered_map.empty:
945 logger.warning("No mapping rows correspond to source genes present in the model after filtering. Proceeding with empty mapping (no translation will occur).")
946
947 if not allow_many_to_one:
948 _validate_target_uniqueness(filtered_map, col_for_src, col_for_tgt, model_source_genes=model_source_genes, logger=logger)
949
    # Build the source -> target gene mapping
951 gene_mapping = _create_gene_mapping(filtered_map, col_for_src, col_for_tgt, logger)
952
953 # copy model
954 model_copy = model.copy()
955
956 # statistics
957 stats = {'translated': 0, 'one_to_one': 0, 'one_to_many': 0, 'not_found': 0, 'simplified_gprs': 0, 'flattened_or_gprs': 0}
958 unmapped = []
959 multi = []
960
961 # Dictionary to store translation issues per reaction
962 reaction_translation_issues = {}
963
964 original_genes = {g.id for g in model_copy.genes}
965 logger.info(f"Original genes count: {len(original_genes)}")
966
967 # translate GPRs
968 for rxn in model_copy.reactions:
969 gpr = rxn.gene_reaction_rule
970 if gpr and gpr.strip():
971 new_gpr, rxn_issues = _translate_gpr(gpr, gene_mapping, stats, unmapped, multi, logger, track_issues=True)
972 if rxn_issues:
973 reaction_translation_issues[rxn.id] = rxn_issues
974
975 if new_gpr != gpr:
976 # Check if this GPR has translation issues and contains only OR operators
977 if rxn_issues and _is_or_only_expression(new_gpr):
978 # Flatten the GPR: remove parentheses and create set of unique genes
979 flattened_gpr = _flatten_or_only_gpr(new_gpr)
980 if flattened_gpr != new_gpr:
981 stats['flattened_or_gprs'] += 1
982 logger.debug(f"Flattened OR-only GPR with issues for {rxn.id}: '{new_gpr}' -> '{flattened_gpr}'")
983 new_gpr = flattened_gpr
984
985 simplified_gpr = _simplify_boolean_expression(new_gpr)
986 if simplified_gpr != new_gpr:
987 stats['simplified_gprs'] += 1
988 logger.debug(f"Simplified GPR for {rxn.id}: '{new_gpr}' -> '{simplified_gpr}'")
989 rxn.gene_reaction_rule = simplified_gpr
990 logger.debug(f"Reaction {rxn.id}: '{gpr}' -> '{simplified_gpr}'")
991
992 # update model genes based on new GPRs
993 _update_model_genes(model_copy, logger)
994
995 # final logging
996 _log_translation_statistics(stats, unmapped, multi, original_genes, model_copy.genes, logger)
997
998 logger.info("Translation finished")
999 return model_copy, reaction_translation_issues
1000
1001
1002 # ---------- helper functions ----------
def _create_gene_mapping(mapping_df, source_col: str, target_col: str, logger: logging.Logger) -> Dict[str, List[str]]:
    """
    Build a mapping dict from source gene IDs to lists of target gene IDs.

    Source IDs are normalized via _normalize_gene_id (strips prefixes such as
    'HGNC:'); target IDs are only whitespace-trimmed. Duplicate (source, target)
    pairs collapse to a single entry, and first-seen order of targets is kept.
    """
    pairs = mapping_df[[source_col, target_col]].dropna().copy()
    # Normalize source IDs; targets keep their original form minus whitespace.
    pairs[source_col] = pairs[source_col].astype(str).apply(_normalize_gene_id)
    pairs[target_col] = pairs[target_col].astype(str).str.strip()
    pairs = pairs.drop_duplicates()

    logger.info(f"Creating mapping from {len(pairs)} rows")

    mapping: Dict[str, List[str]] = {}
    for src, tgt in zip(pairs[source_col], pairs[target_col]):
        targets = mapping.setdefault(src, [])
        if tgt not in targets:
            targets.append(tgt)

    # Summary stats for the log: how many keys map uniquely vs. to several IDs.
    one_to_one = sum(1 for v in mapping.values() if len(v) == 1)
    one_to_many = sum(1 for v in mapping.values() if len(v) > 1)
    logger.info(f"Mapping: {len(mapping)} source keys, {one_to_one} 1:1, {one_to_many} 1:many")
    return mapping
1029
1030
def _translate_gpr(gpr_string: str,
                   gene_mapping: Dict[str, List[str]],
                   stats: Dict[str, int],
                   unmapped_genes: List[str],
                   multi_mapping_genes: List[Tuple[str, List[str]]],
                   logger: logging.Logger,
                   track_issues: bool = False) -> Union[str, Tuple[str, str]]:
    """
    Translate gene identifiers inside a GPR string using gene_mapping.

    Args:
        gpr_string: Original gene_reaction_rule text.
        gene_mapping: Normalized source ID -> list of target IDs.
        stats: Mutated in place; counters 'translated', 'one_to_one',
            'one_to_many', 'not_found' are incremented per unique token.
        unmapped_genes: Mutated in place; tokens with no mapping are appended
            once (left as-is in the output GPR).
        multi_mapping_genes: Mutated in place; (token, targets) pairs for
            1:many translations.
        logger: Logger for debug messages.
        track_issues: When True, also return a ';'-joined description of
            1:many and many:1 translation issues found in this GPR.

    Returns:
        The translated GPR string, or (new_gpr, issue_text) if track_issues.
    """
    # Generic token pattern: letters, digits, :, _, -, . (captures HGNC:1234, ENSG000..., symbols)
    token_pattern = r'\b[A-Za-z0-9:_.-]+\b'
    tokens = re.findall(token_pattern, gpr_string)

    logical = {'and', 'or', 'AND', 'OR', '(', ')'}
    tokens = [t for t in tokens if t not in logical]

    issues = []
    replacements: Dict[str, str] = {}

    # Decide the replacement for each unique token first (no string edits yet).
    for token in sorted(set(tokens), key=lambda x: -len(x)):
        norm = _normalize_gene_id(token)
        if norm in gene_mapping:
            targets = gene_mapping[norm]
            stats['translated'] += 1
            if len(targets) == 1:
                stats['one_to_one'] += 1
                replacements[token] = targets[0]
            else:
                stats['one_to_many'] += 1
                multi_mapping_genes.append((token, targets))
                replacements[token] = "(" + " or ".join(targets) + ")"
                if track_issues:
                    issues.append(f"{token} -> {' or '.join(targets)}")
        else:
            stats['not_found'] += 1
            if token not in unmapped_genes:
                unmapped_genes.append(token)
            logger.debug(f"Token not found in mapping (left as-is): {token}")

    # Single-pass substitution: each token of the ORIGINAL string is replaced
    # at most once. The previous sequential re.sub approach re-scanned the
    # partially translated string, so a target ID produced by one replacement
    # could be matched and translated again by a later token's rule.
    new_gpr = re.sub(token_pattern,
                     lambda m: replacements.get(m.group(0), m.group(0)),
                     gpr_string)

    # Check for many-to-one cases (multiple source genes mapping to same target)
    if track_issues:
        original_to_target: Dict[str, List[str]] = {}
        for orig_token in tokens:
            norm = _normalize_gene_id(orig_token)
            if norm in gene_mapping:
                for target in gene_mapping[norm]:
                    sources = original_to_target.setdefault(target, [])
                    if orig_token not in sources:
                        sources.append(orig_token)

        # Identify many-to-one mappings in this specific GPR
        for target, original_genes in original_to_target.items():
            if len(original_genes) > 1:
                issues.append(f"{' or '.join(original_genes)} -> {target}")

        return new_gpr, "; ".join(issues) if issues else ""

    return new_gpr
1101
1102
def _update_model_genes(model: 'cobra.Model', logger: logging.Logger):
    """
    Synchronize model.genes with the genes referenced by gene_reaction_rule strings.

    Removes gene objects no longer mentioned in any GPR and creates gene
    objects for newly referenced (normalized) IDs.
    """
    gene_pattern = r'\b[A-Za-z0-9:_.-]+\b'
    logical = {'and', 'or', 'AND', 'OR', '(', ')'}

    # Collect every gene ID mentioned in a GPR, normalized the same way as
    # the mapping so comparisons against model gene IDs are consistent.
    referenced: Set[str] = set()
    for rxn in model.reactions:
        rule = rxn.gene_reaction_rule
        if not (rule and rule.strip()):
            continue
        referenced.update(
            _normalize_gene_id(tok)
            for tok in re.findall(gene_pattern, rule)
            if tok not in logical
        )

    current_ids = {g.id for g in model.genes}

    # Drop genes that no GPR references any more.
    removed = 0
    for gid in [g for g in current_ids if g not in referenced]:
        try:
            model.genes.remove(model.genes.get_by_id(gid))
            removed += 1
        except Exception:
            # safe-ignore: gene may already be gone or container may refuse
            pass

    # Create gene objects for IDs present in GPRs but absent from the model.
    added = 0
    for gid in referenced:
        if gid in current_ids:
            continue
        gene = cobra.Gene(gid)
        try:
            model.genes.add(gene)
        except Exception:
            # fallback: if model.genes doesn't support add, try append or model.add_genes
            try:
                model.genes.append(gene)
            except Exception:
                try:
                    model.add_genes([gene])
                except Exception:
                    logger.warning(f"Could not add gene object for {gid}")
        added += 1

    logger.info(f"Model genes updated: removed {removed}, added {added}")
1156
1157
def export_model_to_tabular(model: cobraModel,
                            output_path: str,
                            translation_issues: Dict = None,
                            include_objective: bool = True,
                            save_function = None) -> pd.DataFrame:
    """
    Export a COBRA model to a tab-separated tabular file.

    Args:
        model: COBRA model to export.
        output_path: Destination path for the tabular file.
        translation_issues: Optional {reaction_id: issues} dict from gene
            translation; when given, adds a TranslationIssues column.
        include_objective: Whether to include the objective coefficient column.
        save_function: Optional callable (df, path) used to persist the table;
            defaults to DataFrame.to_csv with a tab separator.

    Returns:
        pd.DataFrame: The merged tabular data.
    """
    # Extract each model component into its own frame keyed on ReactionID.
    df_rules = pd.DataFrame(list(generate_rules(model, asParsed=False).items()),
                            columns=["ReactionID", "GPR"])
    df_reactions = pd.DataFrame(list(generate_reactions(model, asParsed=False).items()),
                                columns=["ReactionID", "Formula"])
    df_bounds = generate_bounds(model).reset_index().rename(columns={"index": "ReactionID"})
    df_medium = get_medium(model).rename(columns={"reaction": "ReactionID"})
    df_medium["InMedium"] = True
    compartments = generate_compartments(model)

    # Assemble the table: reactions + rules + bounds form the core.
    merged = df_reactions.merge(df_rules, on="ReactionID", how="outer")
    merged = merged.merge(df_bounds, on="ReactionID", how="outer")

    if include_objective:
        merged = merged.merge(extract_objective_coefficients(model),
                              on="ReactionID", how="outer")

    # Compartments/pathways column is optional (None when absent).
    if compartments is not None:
        merged = merged.merge(compartments, on="ReactionID", how="outer")

    merged = merged.merge(df_medium, on="ReactionID", how="left")

    if translation_issues:
        issue_rows = [{"ReactionID": rxn_id, "TranslationIssues": issues}
                      for rxn_id, issues in translation_issues.items()]
        df_issues = pd.DataFrame(issue_rows)
        if not df_issues.empty:
            merged = merged.merge(df_issues, on="ReactionID", how="left")
            merged["TranslationIssues"] = merged["TranslationIssues"].fillna("")

    # Medium reactions first, then everything else.
    merged["InMedium"] = merged["InMedium"].fillna(False)
    merged = merged.sort_values(by="InMedium", ascending=False)

    if save_function:
        save_function(merged, output_path)
    else:
        merged.to_csv(output_path, sep="\t", index=False)

    return merged
1228
1229
1230 def _log_translation_statistics(stats: Dict[str, int],
1231 unmapped_genes: List[str],
1232 multi_mapping_genes: List[Tuple[str, List[str]]],
1233 original_genes: Set[str],
1234 final_genes,
1235 logger: logging.Logger):
1236 logger.info("=== TRANSLATION STATISTICS ===")
1237 logger.info(f"Translated: {stats.get('translated', 0)} (1:1 = {stats.get('one_to_one', 0)}, 1:many = {stats.get('one_to_many', 0)})")
1238 logger.info(f"Not found tokens: {stats.get('not_found', 0)}")
1239 logger.info(f"Simplified GPRs: {stats.get('simplified_gprs', 0)}")
1240 logger.info(f"Flattened OR-only GPRs with issues: {stats.get('flattened_or_gprs', 0)}")
1241
1242 final_ids = {g.id for g in final_genes}
1243 logger.info(f"Genes in model: {len(original_genes)} -> {len(final_ids)}")
1244
1245 if unmapped_genes:
1246 logger.warning(f"Unmapped tokens ({len(unmapped_genes)}): {', '.join(unmapped_genes[:20])}{(' ...' if len(unmapped_genes)>20 else '')}")
1247 if multi_mapping_genes:
1248 logger.info(f"Multi-mapping examples ({len(multi_mapping_genes)}):")
1249 for orig, targets in multi_mapping_genes[:10]:
1250 logger.info(f" {orig} -> {', '.join(targets)}")
1251
1252 # Log summary of flattened GPRs if any
1253 if stats.get('flattened_or_gprs', 0) > 0:
1254 logger.info(f"Flattened {stats['flattened_or_gprs']} OR-only GPRs that had translation issues (removed parentheses, created unique gene sets)")
1255
1256