comparison COBRAxy/flux_simulation_beta.py @ 461:73f02860f7d7 draft

Uploaded
author luca_milaz
date Mon, 22 Sep 2025 13:51:19 +0000
parents a6e45049c1b9
children 5f02f7e4ea9f
comparison
460:6a7010997b32 (parent) → 461:73f02860f7d7 (this changeset)
--- a/COBRAxy/flux_simulation_beta.py
+++ b/COBRAxy/flux_simulation_beta.py
@@ -13,110 +13,109 @@
 import argparse
 import utils.general_utils as utils
 from typing import List
 import os
 import pandas as pd
+import numpy as np
 import cobra
 import utils.CBS_backend as CBS_backend
 from joblib import Parallel, delayed, cpu_count
 from cobra.sampling import OptGPSampler
 import sys
 import utils.model_utils as model_utils


 ################################# process args ###############################
-def process_args(args :List[str] = None) -> argparse.Namespace:
+def process_args(args: List[str] = None) -> argparse.Namespace:
     """
     Processes command-line arguments.

     Args:
         args (list): List of command-line arguments.

     Returns:
         Namespace: An object containing parsed arguments.
     """
-    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
-                                     description = 'process some value\'s')
+    parser = argparse.ArgumentParser(usage='%(prog)s [options]',
+                                     description='process some value\'s')

-    parser.add_argument("-mo", "--model_upload", type = str,
-        help = "path to input file with custom rules, if provided")
+    parser.add_argument("-mo", "--model_upload", type=str,
+        help="path to input file with custom rules, if provided")

-    parser.add_argument("-mab", "--model_and_bounds", type = str,
-        choices = ['True', 'False'],
-        required = True,
-        help = "upload mode: True for model+bounds, False for complete models")
+    parser.add_argument("-mab", "--model_and_bounds", type=str,
+        choices=['True', 'False'],
+        required=True,
+        help="upload mode: True for model+bounds, False for complete models")

-
-    parser.add_argument('-ol', '--out_log',
-        help = "Output log")
+    parser.add_argument('-ol', '--out_log',
+        help="Output log")

     parser.add_argument('-td', '--tool_dir',
-        type = str,
-        required = True,
-        help = 'your tool directory')
+        type=str,
+        required=True,
+        help='your tool directory')

     parser.add_argument('-in', '--input',
-        required = True,
+        required=True,
         type=str,
-        help = 'input bounds files or complete model files')
+        help='input bounds files or complete model files')

     parser.add_argument('-ni', '--name',
-        required = True,
+        required=True,
         type=str,
-        help = 'cell names')
+        help='cell names')

     parser.add_argument('-a', '--algorithm',
-        type = str,
-        choices = ['OPTGP', 'CBS'],
-        required = True,
-        help = 'choose sampling algorithm')
+        type=str,
+        choices=['OPTGP', 'CBS'],
+        required=True,
+        help='choose sampling algorithm')

     parser.add_argument('-th', '--thinning',
-        type = int,
-        default= 100,
+        type=int,
+        default=100,
         required=False,
-        help = 'choose thinning')
+        help='choose thinning')

     parser.add_argument('-ns', '--n_samples',
-        type = int,
-        required = True,
-        help = 'choose how many samples')
+        type=int,
+        required=True,
+        help='choose how many samples (set to 0 for optimization only)')

     parser.add_argument('-sd', '--seed',
-        type = int,
-        required = True,
-        help = 'seed')
+        type=int,
+        required=True,
+        help='seed for random number generation')

     parser.add_argument('-nb', '--n_batches',
-        type = int,
-        required = True,
-        help = 'choose how many batches')
+        type=int,
+        required=True,
+        help='choose how many batches')

     parser.add_argument('-opt', '--perc_opt',
-        type = float,
+        type=float,
         default=0.9,
-        required = False,
-        help = 'choose the fraction of optimality for FVA (0-1)')
+        required=False,
+        help='choose the fraction of optimality for FVA (0-1)')

     parser.add_argument('-ot', '--output_type',
-        type = str,
-        required = True,
-        help = 'output type')
+        type=str,
+        required=True,
+        help='output type for sampling results')

     parser.add_argument('-ota', '--output_type_analysis',
-        type = str,
-        required = False,
-        help = 'output type analysis')
+        type=str,
+        required=False,
+        help='output type analysis (optimization methods)')

     parser.add_argument('-idop', '--output_path',
-        type = str,
+        type=str,
         default='flux_simulation',
-        help = 'output path for maps')
+        help='output path for maps')

     ARGS = parser.parse_args(args)
     return ARGS
-
 ########################### warning ###########################################
 def warning(s :str) -> None:
     """
     Log a warning message to an output log file and print it to the console.

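For orientation, a minimal sketch of how the revised options fit together; the flag values and paths below are illustrative assumptions, not part of the changeset, and the module is assumed importable from the tool directory. Under the new help text, `-ns 0` skips sampling and runs only the requested optimization analyses.

    # Hypothetical driver call; all values are placeholders.
    from flux_simulation_beta import main

    main([
        "-mab", "False",                 # complete-model mode
        "-td", "/path/to/COBRAxy",
        "-in", "modelA.csv,modelB.csv",
        "-ni", "cellA,cellB",
        "-a", "CBS",
        "-ns", "0",                      # 0 = skip sampling, optimization only
        "-nb", "1",
        "-sd", "0",
        "-ot", "mean",
        "-ota", "pFBA,FVA,sensitivity",
        "-ol", "flux_simulation.log",
        "-idop", "flux_simulation",
    ])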
@@ -170,83 +169,140 @@
         sys.exit('Execution aborted: wrong format of ' + name + '\n')
     return dataset



-def OPTGP_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, thinning:int=100, n_batches:int=1, seed:int=0)-> None:
+def OPTGP_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, thinning: int = 100, n_batches: int = 1, seed: int = 0) -> None:
     """
     Samples from the OPTGP (Optimal Global Perturbation) algorithm and saves the results to CSV files.

     Args:
         model (cobra.Model): The COBRA model to sample from.
         model_name (str): The name of the model, used in naming output files.
         n_samples (int, optional): Number of samples per batch. Default is 1000.
         thinning (int, optional): Thinning parameter for the sampler. Default is 100.
         n_batches (int, optional): Number of batches to run. Default is 1.
         seed (int, optional): Random seed for reproducibility. Default is 0.

     Returns:
         None
     """
-
-    for i in range(0, n_batches):
+    import numpy as np
+
+    # Get reaction IDs for consistent column ordering
+    reaction_ids = [rxn.id for rxn in model.reactions]
+
+    # Sample and save each batch as numpy file
+    for i in range(n_batches):
         optgp = OptGPSampler(model, thinning, seed)
         samples = optgp.sample(n_samples)
-        samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv', index=False)
-        seed+=1
-    samplesTotal = pd.DataFrame()
-    for i in range(0, n_batches):
-        samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv')
-        samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True)
+
+        # Save as numpy array (more memory efficient)
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        np.save(batch_filename, samples.values)
+
+        seed += 1

+    # Merge all batches into a single DataFrame
+    all_samples = []
+
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        batch_data = np.load(batch_filename)
+        all_samples.append(batch_data)
+
+    # Concatenate all batches
+    samplesTotal_array = np.vstack(all_samples)
+
+    # Convert back to DataFrame with proper column names
+    samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids)
+
+    # Save the final merged result as CSV
     write_to_file(samplesTotal.T, model_name, True)

-    for i in range(0, n_batches):
-        os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv')
-
-
-def CBS_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, n_batches:int=1, seed:int=0)-> None:
+    # Clean up temporary numpy files
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        if os.path.exists(batch_filename):
+            os.remove(batch_filename)
+
+
+def CBS_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, n_batches: int = 1, seed: int = 0) -> None:
     """
     Samples using the CBS (Constraint-based Sampling) algorithm and saves the results to CSV files.

     Args:
         model (cobra.Model): The COBRA model to sample from.
         model_name (str): The name of the model, used in naming output files.
         n_samples (int, optional): Number of samples per batch. Default is 1000.
         n_batches (int, optional): Number of batches to run. Default is 1.
         seed (int, optional): Random seed for reproducibility. Default is 0.

     Returns:
         None
     """
-
-    df_FVA = cobra.flux_analysis.flux_variability_analysis(model,fraction_of_optimum=0).round(6)
-
-    df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples*n_batches, df_FVA, seed=seed)
-
-    for i in range(0, n_batches):
-        samples = pd.DataFrame(columns =[reaction.id for reaction in model.reactions], index = range(n_samples))
+    import numpy as np
+
+    # Get reaction IDs for consistent column ordering
+    reaction_ids = [reaction.id for reaction in model.reactions]
+
+    # Perform FVA analysis once for all batches
+    df_FVA = cobra.flux_analysis.flux_variability_analysis(model, fraction_of_optimum=0).round(6)
+
+    # Generate random objective functions for all samples across all batches
+    df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples * n_batches, df_FVA, seed=seed)
+
+    # Sample and save each batch as numpy file
+    for i in range(n_batches):
+        samples = pd.DataFrame(columns=reaction_ids, index=range(n_samples))
+
         try:
-            CBS_backend.randomObjectiveFunctionSampling(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples], samples)
+            CBS_backend.randomObjectiveFunctionSampling(
+                model,
+                n_samples,
+                df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples],
+                samples
+            )
         except Exception as e:
             utils.logWarning(
-                "Warning: GLPK solver has failed for " + model_name + ". Trying with COBRA interface. Error:" + str(e),
-                ARGS.out_log)
-            CBS_backend.randomObjectiveFunctionSampling_cobrapy(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples],
-                samples)
-        utils.logWarning(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', ARGS.out_log)
-        samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', index=False)
-
-    samplesTotal = pd.DataFrame()
-    for i in range(0, n_batches):
-        samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv')
-        samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True)
+                f"Warning: GLPK solver has failed for {model_name}. Trying with COBRA interface. Error: {str(e)}",
+                ARGS.out_log
+            )
+            CBS_backend.randomObjectiveFunctionSampling_cobrapy(
+                model,
+                n_samples,
+                df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples],
+                samples
+            )

+        # Save as numpy array (more memory efficient)
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        utils.logWarning(batch_filename, ARGS.out_log)
+        np.save(batch_filename, samples.values)
+
+    # Merge all batches into a single DataFrame
+    all_samples = []
+
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        batch_data = np.load(batch_filename)
+        all_samples.append(batch_data)
+
+    # Concatenate all batches
+    samplesTotal_array = np.vstack(all_samples)
+
+    # Convert back to DataFrame with proper column names
+    samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids)
+
+    # Save the final merged result as CSV
     write_to_file(samplesTotal.T, model_name, True)

-    for i in range(0, n_batches):
-        os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv')
+    # Clean up temporary numpy files
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        if os.path.exists(batch_filename):
+            os.remove(batch_filename)



 def model_sampler_with_bounds(model_input_original: cobra.Model, bounds_path: str, cell_name: str) -> List[pd.DataFrame]:
     """
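Both samplers now persist batches the same way: each batch is dumped as a raw `.npy` array and the reaction-ID column labels are reapplied once, after all batches are stacked. A self-contained sketch of that pattern, with a hypothetical helper name and path layout:

    import os
    import numpy as np
    import pandas as pd

    def merge_batches(out_dir: str, name: str, n_batches: int, columns) -> pd.DataFrame:
        """Stack per-batch .npy dumps back into one labelled DataFrame."""
        batches = []
        for i in range(n_batches):
            path = os.path.join(out_dir, f"{name}_{i}.npy")
            batches.append(np.load(path))   # raw float matrix, no CSV parsing
            os.remove(path)                 # temporary batch file is spent
        return pd.DataFrame(np.vstack(batches), columns=columns)

Relative to the old CSV round-trip, `np.save`/`np.load` avoids text serialization and parsing and preserves dtypes exactly; the merge reduces to a single `np.vstack`.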
@@ -409,11 +465,11 @@
     df_sensitivity.loc[model_name] = newRow
     df_sensitivity = df_sensitivity.astype(float).round(6)
     return df_pFBA, df_FVA, df_sensitivity

 ############################# main ###########################################
-def main(args :List[str] = None) -> None:
+def main(args: List[str] = None) -> None:
     """
     Initialize and run sampling/analysis based on the frontend input arguments.

     Returns:
         None
@@ -433,16 +489,20 @@
     # output types (required) -> list
     ARGS.output_types = ARGS.output_type.split(",") if ARGS.output_type else []
     # optional analysis output types -> list or empty
     ARGS.output_type_analysis = ARGS.output_type_analysis.split(",") if ARGS.output_type_analysis else []

+    # Determine if sampling should be performed
+    perform_sampling = ARGS.n_samples > 0
+
     print("=== INPUT FILES ===")
     print(f"{ARGS.input_files}")
     print(f"{ARGS.file_names}")
     print(f"{ARGS.output_type}")
     print(f"{ARGS.output_types}")
     print(f"{ARGS.output_type_analysis}")
+    print(f"Sampling enabled: {perform_sampling} (n_samples: {ARGS.n_samples})")

     if ARGS.model_and_bounds == "True":
         # MODE 1: Model + bounds (separate files)
         print("=== MODE 1: Model + Bounds (separate files) ===")

@@ -475,42 +535,56 @@
         results = Parallel(n_jobs=num_processors)(
             delayed(perform_sampling_and_analysis)(model_utils.build_cobra_model_from_csv(model_file), cell_name)
             for model_file, cell_name in zip(ARGS.input_files, ARGS.file_names)
         )

-
-    all_mean = pd.concat([result[0] for result in results], ignore_index=False)
-    all_median = pd.concat([result[1] for result in results], ignore_index=False)
-    all_quantiles = pd.concat([result[2] for result in results], ignore_index=False)
-
-    if("mean" in ARGS.output_types):
-        all_mean = all_mean.fillna(0.0)
-        all_mean = all_mean.sort_index()
-        write_to_file(all_mean.T, "mean", True)
-
-    if("median" in ARGS.output_types):
-        all_median = all_median.fillna(0.0)
-        all_median = all_median.sort_index()
-        write_to_file(all_median.T, "median", True)
-
-    if("quantiles" in ARGS.output_types):
-        all_quantiles = all_quantiles.fillna(0.0)
-        all_quantiles = all_quantiles.sort_index()
-        write_to_file(all_quantiles.T, "quantiles", True)
-
-    index_result = 3
-    if("pFBA" in ARGS.output_type_analysis):
+    # Handle sampling outputs (only if sampling was performed)
+    if perform_sampling:
+        print("=== PROCESSING SAMPLING RESULTS ===")
+
+        all_mean = pd.concat([result[0] for result in results], ignore_index=False)
+        all_median = pd.concat([result[1] for result in results], ignore_index=False)
+        all_quantiles = pd.concat([result[2] for result in results], ignore_index=False)
+
+        if "mean" in ARGS.output_types:
+            all_mean = all_mean.fillna(0.0)
+            all_mean = all_mean.sort_index()
+            write_to_file(all_mean.T, "mean", True)
+
+        if "median" in ARGS.output_types:
+            all_median = all_median.fillna(0.0)
+            all_median = all_median.sort_index()
+            write_to_file(all_median.T, "median", True)
+
+        if "quantiles" in ARGS.output_types:
+            all_quantiles = all_quantiles.fillna(0.0)
+            all_quantiles = all_quantiles.sort_index()
+            write_to_file(all_quantiles.T, "quantiles", True)
+    else:
+        print("=== SAMPLING SKIPPED (n_samples = 0) ===")
+
+    # Handle optimization analysis outputs (always available)
+    print("=== PROCESSING OPTIMIZATION RESULTS ===")
+
+    # Determine the starting index for optimization results
+    # If sampling was performed, optimization results start at index 3
+    # If no sampling, optimization results start at index 0
+    index_result = 3 if perform_sampling else 0
+
+    if "pFBA" in ARGS.output_type_analysis:
         all_pFBA = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_pFBA = all_pFBA.sort_index()
         write_to_file(all_pFBA.T, "pFBA", True)
-        index_result+=1
-    if("FVA" in ARGS.output_type_analysis):
-        all_FVA= pd.concat([result[index_result] for result in results], ignore_index=False)
+        index_result += 1
+
+    if "FVA" in ARGS.output_type_analysis:
+        all_FVA = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_FVA = all_FVA.sort_index()
         write_to_file(all_FVA.T, "FVA", True)
-        index_result+=1
-    if("sensitivity" in ARGS.output_type_analysis):
+        index_result += 1
+
+    if "sensitivity" in ARGS.output_type_analysis:
         all_sensitivity = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_sensitivity = all_sensitivity.sort_index()
         write_to_file(all_sensitivity.T, "sensitivity", True)

     return
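The `index_result` bookkeeping encodes the layout of each worker's result tuple: with sampling enabled, slots 0-2 hold the mean/median/quantiles frames and the analysis frames follow; with `n_samples = 0`, the analysis frames start at slot 0. A hedged illustration of that convention (the helper name is hypothetical):

    def unpack_result(result: tuple, perform_sampling: bool, analyses: list) -> dict:
        """Map a worker's result tuple to named DataFrames."""
        out = {}
        idx = 3 if perform_sampling else 0  # sampling frames occupy slots 0-2
        if perform_sampling:
            out["mean"], out["median"], out["quantiles"] = result[:3]
        for name in analyses:               # e.g. ["pFBA", "FVA", "sensitivity"]
            out[name] = result[idx]
            idx += 1
        return out

The order of `analyses` must match the order in which the frames were appended, i.e. pFBA, then FVA, then sensitivity, exactly as the `if` chain above consumes them.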