Mercurial > repos > bimib > cobraxy
changeset 414:5086145cfb96 draft
Uploaded
author | francesco_lapi |
---|---|
date | Mon, 08 Sep 2025 21:54:14 +0000 |
parents | 7a3ccf066b2c |
children | 4a248b45273c |
files | COBRAxy/custom_data_generator_beta.py COBRAxy/ras_to_bounds_beta.py COBRAxy/utils/general_utils.py |
diffstat | 3 files changed, 282 insertions(+), 167 deletions(-) [+] |
line wrap: on
line diff
--- a/COBRAxy/custom_data_generator_beta.py Mon Sep 08 21:37:14 2025 +0000 +++ b/COBRAxy/custom_data_generator_beta.py Mon Sep 08 21:54:14 2025 +0000 @@ -72,7 +72,125 @@ raise utils.DataErr(file_path, f"Formato \"{file_path.ext}\" non riconosciuto, sono supportati solo file JSON e XML") +################################- DATA GENERATION -################################ +ReactionId = str +def generate_rules(model: cobra.Model, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]: + """ + Generates a dictionary mapping reaction ids to rules from the model. + Args: + model : the model to derive data from. + asParsed : if True parses the rules to an optimized runtime format, otherwise leaves them as strings. + + Returns: + Dict[ReactionId, rulesUtils.OpList] : the generated dictionary of parsed rules. + Dict[ReactionId, str] : the generated dictionary of raw rules. + """ + # Is the below approach convoluted? yes + # Ok but is it inefficient? probably + # Ok but at least I don't have to repeat the check at every rule (I'm clinically insane) + _ruleGetter = lambda reaction : reaction.gene_reaction_rule + ruleExtractor = (lambda reaction : + rulesUtils.parseRuleToNestedList(_ruleGetter(reaction))) if asParsed else _ruleGetter + + return { + reaction.id : ruleExtractor(reaction) + for reaction in model.reactions + if reaction.gene_reaction_rule } + +def generate_reactions(model :cobra.Model, *, asParsed = True) -> Dict[ReactionId, str]: + """ + Generates a dictionary mapping reaction ids to reaction formulas from the model. + + Args: + model : the model to derive data from. + asParsed : if True parses the reactions to an optimized runtime format, otherwise leaves them as they are. + + Returns: + Dict[ReactionId, str] : the generated dictionary. + """ + + unparsedReactions = { + reaction.id : reaction.reaction + for reaction in model.reactions + if reaction.reaction + } + + if not asParsed: return unparsedReactions + + return reactionUtils.create_reaction_dict(unparsedReactions) + +def get_medium(model:cobra.Model) -> pd.DataFrame: + trueMedium=[] + for r in model.reactions: + positiveCoeff=0 + for m in r.metabolites: + if r.get_coefficient(m.id)>0: + positiveCoeff=1; + if (positiveCoeff==0 and r.lower_bound<0): + trueMedium.append(r.id) + + df_medium = pd.DataFrame() + df_medium["reaction"] = trueMedium + return df_medium + +def generate_bounds(model:cobra.Model) -> pd.DataFrame: + + rxns = [] + for reaction in model.reactions: + rxns.append(reaction.id) + + bounds = pd.DataFrame(columns = ["lower_bound", "upper_bound"], index=rxns) + + for reaction in model.reactions: + bounds.loc[reaction.id] = [reaction.lower_bound, reaction.upper_bound] + return bounds + + + +def generate_compartments(model: cobra.Model) -> pd.DataFrame: + """ + Generates a DataFrame containing compartment information for each reaction. + Creates columns for each compartment position (Compartment_1, Compartment_2, etc.) + + Args: + model: the COBRA model to extract compartment data from. + + Returns: + pd.DataFrame: DataFrame with ReactionID and compartment columns + """ + pathway_data = [] + + # First pass: determine the maximum number of pathways any reaction has + max_pathways = 0 + reaction_pathways = {} + + for reaction in model.reactions: + # Get unique pathways from all metabolites in the reaction + if type(reaction.annotation['pathways']) == list: + reaction_pathways[reaction.id] = reaction.annotation['pathways'] + max_pathways = max(max_pathways, len(reaction.annotation['pathways'])) + else: + reaction_pathways[reaction.id] = [reaction.annotation['pathways']] + + # Create column names for pathways + pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)] + + # Second pass: create the data + for reaction_id, pathways in reaction_pathways.items(): + row = {"ReactionID": reaction_id} + + # Fill pathway columns + for i in range(max_pathways): + col_name = pathway_columns[i] + if i < len(pathways): + row[col_name] = pathways[i] + else: + row[col_name] = None # or "" if you prefer empty strings + + pathway_data.append(row) + + return pd.DataFrame(pathway_data) ###############################- FILE SAVING -################################ @@ -178,12 +296,12 @@ model = utils.convert_genes(model, ARGS.gene_format.replace("HGNC_", "HGNC ")) # generate data - rules = utils.generate_rules(model, asParsed = False) - reactions = utils.generate_reactions(model, asParsed = False) - bounds = utils.generate_bounds(model) - medium = utils.get_medium(model) + rules = generate_rules(model, asParsed = False) + reactions = generate_reactions(model, asParsed = False) + bounds = generate_bounds(model) + medium = get_medium(model) if ARGS.name == "ENGRO2": - compartments = utils.generate_compartments(model) + compartments = generate_compartments(model) df_rules = pd.DataFrame(list(rules.items()), columns = ["ReactionID", "Rule"]) df_reactions = pd.DataFrame(list(reactions.items()), columns = ["ReactionID", "Reaction"])
--- a/COBRAxy/ras_to_bounds_beta.py Mon Sep 08 21:37:14 2025 +0000 +++ b/COBRAxy/ras_to_bounds_beta.py Mon Sep 08 21:54:14 2025 +0000 @@ -10,6 +10,7 @@ import sys import csv from joblib import Parallel, delayed, cpu_count +import utils.rule_parsing as rulesUtils # , medium @@ -149,6 +150,126 @@ new_bounds.loc[reaction, "upper_bound"] = valMax return new_bounds +################################- DATA GENERATION -################################ +ReactionId = str +def generate_rules(model: cobra.Model, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]: + """ + Generates a dictionary mapping reaction ids to rules from the model. + + Args: + model : the model to derive data from. + asParsed : if True parses the rules to an optimized runtime format, otherwise leaves them as strings. + + Returns: + Dict[ReactionId, rulesUtils.OpList] : the generated dictionary of parsed rules. + Dict[ReactionId, str] : the generated dictionary of raw rules. + """ + # Is the below approach convoluted? yes + # Ok but is it inefficient? probably + # Ok but at least I don't have to repeat the check at every rule (I'm clinically insane) + _ruleGetter = lambda reaction : reaction.gene_reaction_rule + ruleExtractor = (lambda reaction : + rulesUtils.parseRuleToNestedList(_ruleGetter(reaction))) if asParsed else _ruleGetter + + return { + reaction.id : ruleExtractor(reaction) + for reaction in model.reactions + if reaction.gene_reaction_rule } + +def generate_reactions(model :cobra.Model, *, asParsed = True) -> Dict[ReactionId, str]: + """ + Generates a dictionary mapping reaction ids to reaction formulas from the model. + + Args: + model : the model to derive data from. + asParsed : if True parses the reactions to an optimized runtime format, otherwise leaves them as they are. + + Returns: + Dict[ReactionId, str] : the generated dictionary. + """ + + unparsedReactions = { + reaction.id : reaction.reaction + for reaction in model.reactions + if reaction.reaction + } + + if not asParsed: return unparsedReactions + + return reactionUtils.create_reaction_dict(unparsedReactions) + +def get_medium(model:cobra.Model) -> pd.DataFrame: + trueMedium=[] + for r in model.reactions: + positiveCoeff=0 + for m in r.metabolites: + if r.get_coefficient(m.id)>0: + positiveCoeff=1; + if (positiveCoeff==0 and r.lower_bound<0): + trueMedium.append(r.id) + + df_medium = pd.DataFrame() + df_medium["reaction"] = trueMedium + return df_medium + +def generate_bounds(model:cobra.Model) -> pd.DataFrame: + + rxns = [] + for reaction in model.reactions: + rxns.append(reaction.id) + + bounds = pd.DataFrame(columns = ["lower_bound", "upper_bound"], index=rxns) + + for reaction in model.reactions: + bounds.loc[reaction.id] = [reaction.lower_bound, reaction.upper_bound] + return bounds + + + +def generate_compartments(model: cobra.Model) -> pd.DataFrame: + """ + Generates a DataFrame containing compartment information for each reaction. + Creates columns for each compartment position (Compartment_1, Compartment_2, etc.) + + Args: + model: the COBRA model to extract compartment data from. + + Returns: + pd.DataFrame: DataFrame with ReactionID and compartment columns + """ + pathway_data = [] + + # First pass: determine the maximum number of pathways any reaction has + max_pathways = 0 + reaction_pathways = {} + + for reaction in model.reactions: + # Get unique pathways from all metabolites in the reaction + if type(reaction.annotation['pathways']) == list: + reaction_pathways[reaction.id] = reaction.annotation['pathways'] + max_pathways = max(max_pathways, len(reaction.annotation['pathways'])) + else: + reaction_pathways[reaction.id] = [reaction.annotation['pathways']] + + # Create column names for pathways + pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)] + + # Second pass: create the data + for reaction_id, pathways in reaction_pathways.items(): + row = {"ReactionID": reaction_id} + + # Fill pathway columns + for i in range(max_pathways): + col_name = pathway_columns[i] + if i < len(pathways): + row[col_name] = pathways[i] + else: + row[col_name] = None # or "" if you prefer empty strings + + pathway_data.append(row) + + return pd.DataFrame(pathway_data) + def save_model(model, filename, output_folder, file_format='csv'): """ Save a COBRA model to file in the specified format. @@ -170,10 +291,10 @@ # Special handling for tabular format using utils functions filepath = os.path.join(output_folder, f"{filename}.csv") - rules = utils.generate_rules(model, asParsed = False) - reactions = utils.generate_reactions(model, asParsed = False) - bounds = utils.generate_bounds(model) - medium = utils.get_medium(model) + rules = generate_rules(model, asParsed = False) + reactions = generate_reactions(model, asParsed = False) + bounds = generate_bounds(model) + medium = get_medium(model) try: compartments = utils.generate_compartments(model) @@ -269,7 +390,7 @@ pass -def generate_bounds(model: cobra.Model, ras=None, output_folder='output/', save_models=False, save_models_path='saved_models/', save_models_format='csv') -> pd.DataFrame: +def generate_bounds_model(model: cobra.Model, ras=None, output_folder='output/', save_models=False, save_models_path='saved_models/', save_models_format='csv') -> pd.DataFrame: """ Generate reaction bounds for a metabolic model based on medium conditions and optional RAS adjustments. @@ -369,12 +490,12 @@ print(f"{key}: {value}") if(ARGS.ras_selector == True): - generate_bounds(model, ras=ras_combined, output_folder=ARGS.output_path, + generate_bounds_model(model, ras=ras_combined, output_folder=ARGS.output_path, save_models=ARGS.save_models, save_models_path=ARGS.save_models_path, save_models_format=ARGS.save_models_format) class_assignments.to_csv(ARGS.cell_class, sep='\t', index=False) else: - generate_bounds(model, output_folder=ARGS.output_path, + generate_bounds_model(model, output_folder=ARGS.output_path, save_models=ARGS.save_models, save_models_path=ARGS.save_models_path, save_models_format=ARGS.save_models_format)
--- a/COBRAxy/utils/general_utils.py Mon Sep 08 21:37:14 2025 +0000 +++ b/COBRAxy/utils/general_utils.py Mon Sep 08 21:54:14 2025 +0000 @@ -17,11 +17,6 @@ import gzip import bz2 from io import StringIO -import os -sys.path.insert(0, os.path.dirname(__file__)) -import rule_parsing as rulesUtils -import reaction_parsing as reactionUtils - @@ -780,38 +775,40 @@ # Seconda passata: aggiungi le reazioni reactions_added = 0 + reactions_skipped = 0 for idx, row in df.iterrows(): - reaction_id = str(row['ReactionID']).strip() - reaction_formula = str(row['Reaction']).strip() - - # Salta reazioni senza formula - if not reaction_formula or reaction_formula == 'nan': - raise ValueError(f"Formula della reazione mancante {reaction_id}") - - # Crea la reazione - reaction = Reaction(reaction_id) - reaction.name = reaction_id - - # Imposta bounds - reaction.lower_bound = float(row['lower_bound']) if pd.notna(row['lower_bound']) else -1000.0 - reaction.upper_bound = float(row['upper_bound']) if pd.notna(row['upper_bound']) else 1000.0 - - # Aggiungi gene rule se presente - if pd.notna(row['Rule']) and str(row['Rule']).strip(): - reaction.gene_reaction_rule = str(row['Rule']).strip() - - # Parse della formula della reazione try: - parse_reaction_formula(reaction, reaction_formula, metabolites_dict) - except Exception as e: - print(f"Errore nel parsing della reazione {reaction_id}: {e}") - reactions_skipped += 1 - continue - - # Aggiungi la reazione al modello - model.add_reactions([reaction]) - reactions_added += 1 + reaction_id = str(row['ReactionID']).strip() + reaction_formula = str(row['Reaction']).strip() + + # Salta reazioni senza formula + if not reaction_formula or reaction_formula == 'nan': + raise ValueError(f"Formula della reazione mancante {reaction_id}") + + # Crea la reazione + reaction = Reaction(reaction_id) + reaction.name = reaction_id + + # Imposta bounds + reaction.lower_bound = float(row['lower_bound']) if pd.notna(row['lower_bound']) else -1000.0 + reaction.upper_bound = float(row['upper_bound']) if pd.notna(row['upper_bound']) else 1000.0 + + # Aggiungi gene rule se presente + if pd.notna(row['Rule']) and str(row['Rule']).strip(): + reaction.gene_reaction_rule = str(row['Rule']).strip() + + # Parse della formula della reazione + try: + parse_reaction_formula(reaction, reaction_formula, metabolites_dict) + except Exception as e: + print(f"Errore nel parsing della reazione {reaction_id}: {e}") + reactions_skipped += 1 + continue + + # Aggiungi la reazione al modello + model.add_reactions([reaction]) + reactions_added += 1 print(f"Aggiunte {reactions_added} reazioni, saltate {reactions_skipped} reazioni") @@ -979,124 +976,3 @@ validation['status'] = f"Error: {e}" return validation - - -################################- DATA GENERATION -################################ -ReactionId = str -def generate_rules(model: cobra.Model, *, asParsed = True) -> Union[Dict[ReactionId, rulesUtils.OpList], Dict[ReactionId, str]]: - """ - Generates a dictionary mapping reaction ids to rules from the model. - - Args: - model : the model to derive data from. - asParsed : if True parses the rules to an optimized runtime format, otherwise leaves them as strings. - - Returns: - Dict[ReactionId, rulesUtils.OpList] : the generated dictionary of parsed rules. - Dict[ReactionId, str] : the generated dictionary of raw rules. - """ - # Is the below approach convoluted? yes - # Ok but is it inefficient? probably - # Ok but at least I don't have to repeat the check at every rule (I'm clinically insane) - _ruleGetter = lambda reaction : reaction.gene_reaction_rule - ruleExtractor = (lambda reaction : - rulesUtils.parseRuleToNestedList(_ruleGetter(reaction))) if asParsed else _ruleGetter - - return { - reaction.id : ruleExtractor(reaction) - for reaction in model.reactions - if reaction.gene_reaction_rule } - -def generate_reactions(model :cobra.Model, *, asParsed = True) -> Dict[ReactionId, str]: - """ - Generates a dictionary mapping reaction ids to reaction formulas from the model. - - Args: - model : the model to derive data from. - asParsed : if True parses the reactions to an optimized runtime format, otherwise leaves them as they are. - - Returns: - Dict[ReactionId, str] : the generated dictionary. - """ - - unparsedReactions = { - reaction.id : reaction.reaction - for reaction in model.reactions - if reaction.reaction - } - - if not asParsed: return unparsedReactions - - return reactionUtils.create_reaction_dict(unparsedReactions) - -def get_medium(model:cobra.Model) -> pd.DataFrame: - trueMedium=[] - for r in model.reactions: - positiveCoeff=0 - for m in r.metabolites: - if r.get_coefficient(m.id)>0: - positiveCoeff=1; - if (positiveCoeff==0 and r.lower_bound<0): - trueMedium.append(r.id) - - df_medium = pd.DataFrame() - df_medium["reaction"] = trueMedium - return df_medium - -def generate_bounds(model:cobra.Model) -> pd.DataFrame: - - rxns = [] - for reaction in model.reactions: - rxns.append(reaction.id) - - bounds = pd.DataFrame(columns = ["lower_bound", "upper_bound"], index=rxns) - - for reaction in model.reactions: - bounds.loc[reaction.id] = [reaction.lower_bound, reaction.upper_bound] - return bounds - - - -def generate_compartments(model: cobra.Model) -> pd.DataFrame: - """ - Generates a DataFrame containing compartment information for each reaction. - Creates columns for each compartment position (Compartment_1, Compartment_2, etc.) - - Args: - model: the COBRA model to extract compartment data from. - - Returns: - pd.DataFrame: DataFrame with ReactionID and compartment columns - """ - pathway_data = [] - - # First pass: determine the maximum number of pathways any reaction has - max_pathways = 0 - reaction_pathways = {} - - for reaction in model.reactions: - # Get unique pathways from all metabolites in the reaction - if type(reaction.annotation['pathways']) == list: - reaction_pathways[reaction.id] = reaction.annotation['pathways'] - max_pathways = max(max_pathways, len(reaction.annotation['pathways'])) - else: - reaction_pathways[reaction.id] = [reaction.annotation['pathways']] - - # Create column names for pathways - pathway_columns = [f"Pathway_{i+1}" for i in range(max_pathways)] - - # Second pass: create the data - for reaction_id, pathways in reaction_pathways.items(): - row = {"ReactionID": reaction_id} - - # Fill pathway columns - for i in range(max_pathways): - col_name = pathway_columns[i] - if i < len(pathways): - row[col_name] = pathways[i] - else: - row[col_name] = None # or "" if you prefer empty strings - - pathway_data.append(row) - - return pd.DataFrame(pathway_data) \ No newline at end of file