comparison COBRAxy/flux_simulation_beta.py @ 461:73f02860f7d7 draft

Uploaded
author luca_milaz
date Mon, 22 Sep 2025 13:51:19 +0000
parents a6e45049c1b9
children 5f02f7e4ea9f
comparison
460:6a7010997b32 (parent) → 461:73f02860f7d7 (this changeset)
--- a/COBRAxy/flux_simulation_beta.py
+++ b/COBRAxy/flux_simulation_beta.py
@@ -13,110 +13,109 @@
 import argparse
 import utils.general_utils as utils
 from typing import List
 import os
 import pandas as pd
+import numpy as np
 import cobra
 import utils.CBS_backend as CBS_backend
 from joblib import Parallel, delayed, cpu_count
 from cobra.sampling import OptGPSampler
 import sys
 import utils.model_utils as model_utils


 ################################# process args ###############################
-def process_args(args :List[str] = None) -> argparse.Namespace:
+def process_args(args: List[str] = None) -> argparse.Namespace:
     """
     Processes command-line arguments.

     Args:
         args (list): List of command-line arguments.

     Returns:
         Namespace: An object containing parsed arguments.
     """
-    parser = argparse.ArgumentParser(usage = '%(prog)s [options]',
-                                     description = 'process some value\'s')
+    parser = argparse.ArgumentParser(usage='%(prog)s [options]',
+                                     description='process some value\'s')

-    parser.add_argument("-mo", "--model_upload", type = str,
-        help = "path to input file with custom rules, if provided")
+    parser.add_argument("-mo", "--model_upload", type=str,
+        help="path to input file with custom rules, if provided")

-    parser.add_argument("-mab", "--model_and_bounds", type = str,
-        choices = ['True', 'False'],
-        required = True,
-        help = "upload mode: True for model+bounds, False for complete models")
+    parser.add_argument("-mab", "--model_and_bounds", type=str,
+        choices=['True', 'False'],
+        required=True,
+        help="upload mode: True for model+bounds, False for complete models")

-
-    parser.add_argument('-ol', '--out_log',
-        help = "Output log")
+    parser.add_argument('-ol', '--out_log',
+        help="Output log")

     parser.add_argument('-td', '--tool_dir',
-        type = str,
-        required = True,
-        help = 'your tool directory')
+        type=str,
+        required=True,
+        help='your tool directory')

     parser.add_argument('-in', '--input',
-        required = True,
+        required=True,
         type=str,
-        help = 'input bounds files or complete model files')
+        help='input bounds files or complete model files')

     parser.add_argument('-ni', '--name',
-        required = True,
+        required=True,
         type=str,
-        help = 'cell names')
+        help='cell names')

     parser.add_argument('-a', '--algorithm',
-        type = str,
-        choices = ['OPTGP', 'CBS'],
-        required = True,
-        help = 'choose sampling algorithm')
+        type=str,
+        choices=['OPTGP', 'CBS'],
+        required=True,
+        help='choose sampling algorithm')

     parser.add_argument('-th', '--thinning',
-        type = int,
-        default= 100,
+        type=int,
+        default=100,
         required=False,
-        help = 'choose thinning')
+        help='choose thinning')

     parser.add_argument('-ns', '--n_samples',
-        type = int,
-        required = True,
-        help = 'choose how many samples')
+        type=int,
+        required=True,
+        help='choose how many samples (set to 0 for optimization only)')

     parser.add_argument('-sd', '--seed',
-        type = int,
-        required = True,
-        help = 'seed')
+        type=int,
+        required=True,
+        help='seed for random number generation')

     parser.add_argument('-nb', '--n_batches',
-        type = int,
-        required = True,
-        help = 'choose how many batches')
+        type=int,
+        required=True,
+        help='choose how many batches')

     parser.add_argument('-opt', '--perc_opt',
-        type = float,
+        type=float,
         default=0.9,
-        required = False,
-        help = 'choose the fraction of optimality for FVA (0-1)')
+        required=False,
+        help='choose the fraction of optimality for FVA (0-1)')

     parser.add_argument('-ot', '--output_type',
-        type = str,
-        required = True,
-        help = 'output type')
+        type=str,
+        required=True,
+        help='output type for sampling results')

     parser.add_argument('-ota', '--output_type_analysis',
-        type = str,
-        required = False,
-        help = 'output type analysis')
+        type=str,
+        required=False,
+        help='output type analysis (optimization methods)')

     parser.add_argument('-idop', '--output_path',
-        type = str,
+        type=str,
         default='flux_simulation',
-        help = 'output path for maps')
+        help='output path for maps')

     ARGS = parser.parse_args(args)
     return ARGS
-
 ########################### warning ###########################################
 def warning(s :str) -> None:
     """
     Log a warning message to an output log file and print it to the console.

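For orientation, a minimal sketch of how the revised options fit together; the flag values and paths below are illustrative assumptions, not part of the changeset, and the module is assumed importable from the tool directory. Under the new help text, `-ns 0` skips sampling and runs only the requested optimization analyses.

    # Hypothetical driver call; all values are placeholders.
    from flux_simulation_beta import main

    main([
        "-mab", "False",                 # complete-model mode
        "-td", "/path/to/COBRAxy",
        "-in", "modelA.csv,modelB.csv",
        "-ni", "cellA,cellB",
        "-a", "CBS",
        "-ns", "0",                      # 0 = skip sampling, optimization only
        "-nb", "1",
        "-sd", "0",
        "-ot", "mean",
        "-ota", "pFBA,FVA,sensitivity",
        "-ol", "flux_simulation.log",
        "-idop", "flux_simulation",
    ])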
@@ -170,83 +169,140 @@
         sys.exit('Execution aborted: wrong format of ' + name + '\n')
     return dataset



-def OPTGP_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, thinning:int=100, n_batches:int=1, seed:int=0)-> None:
+def OPTGP_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, thinning: int = 100, n_batches: int = 1, seed: int = 0) -> None:
     """
     Samples from the OPTGP (Optimal Global Perturbation) algorithm and saves the results to CSV files.

     Args:
         model (cobra.Model): The COBRA model to sample from.
         model_name (str): The name of the model, used in naming output files.
         n_samples (int, optional): Number of samples per batch. Default is 1000.
         thinning (int, optional): Thinning parameter for the sampler. Default is 100.
         n_batches (int, optional): Number of batches to run. Default is 1.
         seed (int, optional): Random seed for reproducibility. Default is 0.

     Returns:
         None
     """
-
-    for i in range(0, n_batches):
+    import numpy as np
+
+    # Get reaction IDs for consistent column ordering
+    reaction_ids = [rxn.id for rxn in model.reactions]
+
+    # Sample and save each batch as numpy file
+    for i in range(n_batches):
         optgp = OptGPSampler(model, thinning, seed)
         samples = optgp.sample(n_samples)
-        samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv', index=False)
-        seed+=1
-    samplesTotal = pd.DataFrame()
-    for i in range(0, n_batches):
-        samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv')
-        samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True)
+
+        # Save as numpy array (more memory efficient)
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        np.save(batch_filename, samples.values)
+
+        seed += 1

+    # Merge all batches into a single DataFrame
+    all_samples = []
+
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        batch_data = np.load(batch_filename)
+        all_samples.append(batch_data)
+
+    # Concatenate all batches
+    samplesTotal_array = np.vstack(all_samples)
+
+    # Convert back to DataFrame with proper column names
+    samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids)
+
+    # Save the final merged result as CSV
     write_to_file(samplesTotal.T, model_name, True)

-    for i in range(0, n_batches):
-        os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv')
-
-
-def CBS_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, n_batches:int=1, seed:int=0)-> None:
+    # Clean up temporary numpy files
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy"
+        if os.path.exists(batch_filename):
+            os.remove(batch_filename)
+
+
+def CBS_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, n_batches: int = 1, seed: int = 0) -> None:
     """
     Samples using the CBS (Constraint-based Sampling) algorithm and saves the results to CSV files.

     Args:
         model (cobra.Model): The COBRA model to sample from.
         model_name (str): The name of the model, used in naming output files.
         n_samples (int, optional): Number of samples per batch. Default is 1000.
         n_batches (int, optional): Number of batches to run. Default is 1.
         seed (int, optional): Random seed for reproducibility. Default is 0.

     Returns:
         None
     """
-
-    df_FVA = cobra.flux_analysis.flux_variability_analysis(model,fraction_of_optimum=0).round(6)
-
-    df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples*n_batches, df_FVA, seed=seed)
-
-    for i in range(0, n_batches):
-        samples = pd.DataFrame(columns =[reaction.id for reaction in model.reactions], index = range(n_samples))
+    import numpy as np
+
+    # Get reaction IDs for consistent column ordering
+    reaction_ids = [reaction.id for reaction in model.reactions]
+
+    # Perform FVA analysis once for all batches
+    df_FVA = cobra.flux_analysis.flux_variability_analysis(model, fraction_of_optimum=0).round(6)
+
+    # Generate random objective functions for all samples across all batches
+    df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples * n_batches, df_FVA, seed=seed)
+
+    # Sample and save each batch as numpy file
+    for i in range(n_batches):
+        samples = pd.DataFrame(columns=reaction_ids, index=range(n_samples))
+
         try:
-            CBS_backend.randomObjectiveFunctionSampling(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples], samples)
+            CBS_backend.randomObjectiveFunctionSampling(
+                model,
+                n_samples,
+                df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples],
+                samples
+            )
         except Exception as e:
             utils.logWarning(
-                "Warning: GLPK solver has failed for " + model_name + ". Trying with COBRA interface. Error:" + str(e),
-                ARGS.out_log)
-            CBS_backend.randomObjectiveFunctionSampling_cobrapy(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples],
-                samples)
-        utils.logWarning(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', ARGS.out_log)
-        samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', index=False)
-
-    samplesTotal = pd.DataFrame()
-    for i in range(0, n_batches):
-        samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv')
-        samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True)
+                f"Warning: GLPK solver has failed for {model_name}. Trying with COBRA interface. Error: {str(e)}",
+                ARGS.out_log
+            )
+            CBS_backend.randomObjectiveFunctionSampling_cobrapy(
+                model,
+                n_samples,
+                df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples],
+                samples
+            )

+        # Save as numpy array (more memory efficient)
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        utils.logWarning(batch_filename, ARGS.out_log)
+        np.save(batch_filename, samples.values)
+
+    # Merge all batches into a single DataFrame
+    all_samples = []
+
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        batch_data = np.load(batch_filename)
+        all_samples.append(batch_data)
+
+    # Concatenate all batches
+    samplesTotal_array = np.vstack(all_samples)
+
+    # Convert back to DataFrame with proper column names
+    samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids)
+
+    # Save the final merged result as CSV
     write_to_file(samplesTotal.T, model_name, True)

-    for i in range(0, n_batches):
-        os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv')
+    # Clean up temporary numpy files
+    for i in range(n_batches):
+        batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy"
+        if os.path.exists(batch_filename):
+            os.remove(batch_filename)



 def model_sampler_with_bounds(model_input_original: cobra.Model, bounds_path: str, cell_name: str) -> List[pd.DataFrame]:
     """
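Both samplers now persist batches the same way: each batch is dumped as a raw `.npy` array and the reaction-ID column labels are reapplied once, after all batches are stacked. A self-contained sketch of that pattern, with a hypothetical helper name and path layout:

    import os
    import numpy as np
    import pandas as pd

    def merge_batches(out_dir: str, name: str, n_batches: int, columns) -> pd.DataFrame:
        """Stack per-batch .npy dumps back into one labelled DataFrame."""
        batches = []
        for i in range(n_batches):
            path = os.path.join(out_dir, f"{name}_{i}.npy")
            batches.append(np.load(path))   # raw float matrix, no CSV parsing
            os.remove(path)                 # temporary batch file is spent
        return pd.DataFrame(np.vstack(batches), columns=columns)

Relative to the old CSV round-trip, `np.save`/`np.load` avoids text serialization and parsing and preserves dtypes exactly; the merge reduces to a single `np.vstack`.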
@@ -409,11 +465,11 @@
     df_sensitivity.loc[model_name] = newRow
     df_sensitivity = df_sensitivity.astype(float).round(6)
     return df_pFBA, df_FVA, df_sensitivity

 ############################# main ###########################################
-def main(args :List[str] = None) -> None:
+def main(args: List[str] = None) -> None:
     """
     Initialize and run sampling/analysis based on the frontend input arguments.

     Returns:
         None
@@ -433,16 +489,20 @@
     # output types (required) -> list
     ARGS.output_types = ARGS.output_type.split(",") if ARGS.output_type else []
     # optional analysis output types -> list or empty
     ARGS.output_type_analysis = ARGS.output_type_analysis.split(",") if ARGS.output_type_analysis else []

+    # Determine if sampling should be performed
+    perform_sampling = ARGS.n_samples > 0
+
     print("=== INPUT FILES ===")
     print(f"{ARGS.input_files}")
     print(f"{ARGS.file_names}")
     print(f"{ARGS.output_type}")
     print(f"{ARGS.output_types}")
     print(f"{ARGS.output_type_analysis}")
+    print(f"Sampling enabled: {perform_sampling} (n_samples: {ARGS.n_samples})")

     if ARGS.model_and_bounds == "True":
         # MODE 1: Model + bounds (separate files)
         print("=== MODE 1: Model + Bounds (separate files) ===")

@@ -475,42 +535,56 @@
         results = Parallel(n_jobs=num_processors)(
             delayed(perform_sampling_and_analysis)(model_utils.build_cobra_model_from_csv(model_file), cell_name)
             for model_file, cell_name in zip(ARGS.input_files, ARGS.file_names)
         )

-
-    all_mean = pd.concat([result[0] for result in results], ignore_index=False)
-    all_median = pd.concat([result[1] for result in results], ignore_index=False)
-    all_quantiles = pd.concat([result[2] for result in results], ignore_index=False)
-
-    if("mean" in ARGS.output_types):
-        all_mean = all_mean.fillna(0.0)
-        all_mean = all_mean.sort_index()
-        write_to_file(all_mean.T, "mean", True)
-
-    if("median" in ARGS.output_types):
-        all_median = all_median.fillna(0.0)
-        all_median = all_median.sort_index()
-        write_to_file(all_median.T, "median", True)
-
-    if("quantiles" in ARGS.output_types):
-        all_quantiles = all_quantiles.fillna(0.0)
-        all_quantiles = all_quantiles.sort_index()
-        write_to_file(all_quantiles.T, "quantiles", True)
-
-    index_result = 3
-    if("pFBA" in ARGS.output_type_analysis):
+    # Handle sampling outputs (only if sampling was performed)
+    if perform_sampling:
+        print("=== PROCESSING SAMPLING RESULTS ===")
+
+        all_mean = pd.concat([result[0] for result in results], ignore_index=False)
+        all_median = pd.concat([result[1] for result in results], ignore_index=False)
+        all_quantiles = pd.concat([result[2] for result in results], ignore_index=False)
+
+        if "mean" in ARGS.output_types:
+            all_mean = all_mean.fillna(0.0)
+            all_mean = all_mean.sort_index()
+            write_to_file(all_mean.T, "mean", True)
+
+        if "median" in ARGS.output_types:
+            all_median = all_median.fillna(0.0)
+            all_median = all_median.sort_index()
+            write_to_file(all_median.T, "median", True)
+
+        if "quantiles" in ARGS.output_types:
+            all_quantiles = all_quantiles.fillna(0.0)
+            all_quantiles = all_quantiles.sort_index()
+            write_to_file(all_quantiles.T, "quantiles", True)
+    else:
+        print("=== SAMPLING SKIPPED (n_samples = 0) ===")
+
+    # Handle optimization analysis outputs (always available)
+    print("=== PROCESSING OPTIMIZATION RESULTS ===")
+
+    # Determine the starting index for optimization results
+    # If sampling was performed, optimization results start at index 3
+    # If no sampling, optimization results start at index 0
+    index_result = 3 if perform_sampling else 0
+
+    if "pFBA" in ARGS.output_type_analysis:
         all_pFBA = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_pFBA = all_pFBA.sort_index()
         write_to_file(all_pFBA.T, "pFBA", True)
-        index_result+=1
-    if("FVA" in ARGS.output_type_analysis):
-        all_FVA= pd.concat([result[index_result] for result in results], ignore_index=False)
+        index_result += 1
+
+    if "FVA" in ARGS.output_type_analysis:
+        all_FVA = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_FVA = all_FVA.sort_index()
         write_to_file(all_FVA.T, "FVA", True)
-        index_result+=1
-    if("sensitivity" in ARGS.output_type_analysis):
+        index_result += 1
+
+    if "sensitivity" in ARGS.output_type_analysis:
         all_sensitivity = pd.concat([result[index_result] for result in results], ignore_index=False)
         all_sensitivity = all_sensitivity.sort_index()
         write_to_file(all_sensitivity.T, "sensitivity", True)

     return
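The `index_result` bookkeeping encodes the layout of each worker's result tuple: with sampling enabled, slots 0-2 hold the mean/median/quantiles frames and the analysis frames follow; with `n_samples = 0`, the analysis frames start at slot 0. A hedged illustration of that convention (the helper name is hypothetical):

    def unpack_result(result: tuple, perform_sampling: bool, analyses: list) -> dict:
        """Map a worker's result tuple to named DataFrames."""
        out = {}
        idx = 3 if perform_sampling else 0  # sampling frames occupy slots 0-2
        if perform_sampling:
            out["mean"], out["median"], out["quantiles"] = result[:3]
        for name in analyses:               # e.g. ["pFBA", "FVA", "sensitivity"]
            out[name] = result[idx]
            idx += 1
        return out

The order of `analyses` must match the order in which the frames were appended, i.e. pFBA, then FVA, then sensitivity, exactly as the `if` chain above consumes them.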