Mercurial > repos > bimib > cobraxy
comparison COBRAxy/flux_simulation_beta.py @ 461:73f02860f7d7 draft
Uploaded
author | luca_milaz |
---|---|
date | Mon, 22 Sep 2025 13:51:19 +0000 |
parents | a6e45049c1b9 |
children | 5f02f7e4ea9f |
comparison
equal
deleted
inserted
replaced
460:6a7010997b32 | 461:73f02860f7d7 |
---|---|
13 import argparse | 13 import argparse |
14 import utils.general_utils as utils | 14 import utils.general_utils as utils |
15 from typing import List | 15 from typing import List |
16 import os | 16 import os |
17 import pandas as pd | 17 import pandas as pd |
18 import numpy as np | |
18 import cobra | 19 import cobra |
19 import utils.CBS_backend as CBS_backend | 20 import utils.CBS_backend as CBS_backend |
20 from joblib import Parallel, delayed, cpu_count | 21 from joblib import Parallel, delayed, cpu_count |
21 from cobra.sampling import OptGPSampler | 22 from cobra.sampling import OptGPSampler |
22 import sys | 23 import sys |
23 import utils.model_utils as model_utils | 24 import utils.model_utils as model_utils |
24 | 25 |
25 | 26 |
26 ################################# process args ############################### | 27 ################################# process args ############################### |
27 def process_args(args :List[str] = None) -> argparse.Namespace: | 28 def process_args(args: List[str] = None) -> argparse.Namespace: |
28 """ | 29 """ |
29 Processes command-line arguments. | 30 Processes command-line arguments. |
30 | 31 |
31 Args: | 32 Args: |
32 args (list): List of command-line arguments. | 33 args (list): List of command-line arguments. |
33 | 34 |
34 Returns: | 35 Returns: |
35 Namespace: An object containing parsed arguments. | 36 Namespace: An object containing parsed arguments. |
36 """ | 37 """ |
37 parser = argparse.ArgumentParser(usage = '%(prog)s [options]', | 38 parser = argparse.ArgumentParser(usage='%(prog)s [options]', |
38 description = 'process some value\'s') | 39 description='process some value\'s') |
39 | 40 |
40 parser.add_argument("-mo", "--model_upload", type = str, | 41 parser.add_argument("-mo", "--model_upload", type=str, |
41 help = "path to input file with custom rules, if provided") | 42 help="path to input file with custom rules, if provided") |
42 | 43 |
43 parser.add_argument("-mab", "--model_and_bounds", type = str, | 44 parser.add_argument("-mab", "--model_and_bounds", type=str, |
44 choices = ['True', 'False'], | 45 choices=['True', 'False'], |
45 required = True, | 46 required=True, |
46 help = "upload mode: True for model+bounds, False for complete models") | 47 help="upload mode: True for model+bounds, False for complete models") |
47 | 48 |
48 | 49 parser.add_argument('-ol', '--out_log', |
49 parser.add_argument('-ol', '--out_log', | 50 help="Output log") |
50 help = "Output log") | |
51 | 51 |
52 parser.add_argument('-td', '--tool_dir', | 52 parser.add_argument('-td', '--tool_dir', |
53 type = str, | 53 type=str, |
54 required = True, | 54 required=True, |
55 help = 'your tool directory') | 55 help='your tool directory') |
56 | 56 |
57 parser.add_argument('-in', '--input', | 57 parser.add_argument('-in', '--input', |
58 required = True, | 58 required=True, |
59 type=str, | 59 type=str, |
60 help = 'input bounds files or complete model files') | 60 help='input bounds files or complete model files') |
61 | 61 |
62 parser.add_argument('-ni', '--name', | 62 parser.add_argument('-ni', '--name', |
63 required = True, | 63 required=True, |
64 type=str, | 64 type=str, |
65 help = 'cell names') | 65 help='cell names') |
66 | 66 |
67 parser.add_argument('-a', '--algorithm', | 67 parser.add_argument('-a', '--algorithm', |
68 type = str, | 68 type=str, |
69 choices = ['OPTGP', 'CBS'], | 69 choices=['OPTGP', 'CBS'], |
70 required = True, | 70 required=True, |
71 help = 'choose sampling algorithm') | 71 help='choose sampling algorithm') |
72 | 72 |
73 parser.add_argument('-th', '--thinning', | 73 parser.add_argument('-th', '--thinning', |
74 type = int, | 74 type=int, |
75 default= 100, | 75 default=100, |
76 required=False, | 76 required=False, |
77 help = 'choose thinning') | 77 help='choose thinning') |
78 | 78 |
79 parser.add_argument('-ns', '--n_samples', | 79 parser.add_argument('-ns', '--n_samples', |
80 type = int, | 80 type=int, |
81 required = True, | 81 required=True, |
82 help = 'choose how many samples') | 82 help='choose how many samples (set to 0 for optimization only)') |
83 | 83 |
84 parser.add_argument('-sd', '--seed', | 84 parser.add_argument('-sd', '--seed', |
85 type = int, | 85 type=int, |
86 required = True, | 86 required=True, |
87 help = 'seed') | 87 help='seed for random number generation') |
88 | 88 |
89 parser.add_argument('-nb', '--n_batches', | 89 parser.add_argument('-nb', '--n_batches', |
90 type = int, | 90 type=int, |
91 required = True, | 91 required=True, |
92 help = 'choose how many batches') | 92 help='choose how many batches') |
93 | 93 |
94 parser.add_argument('-opt', '--perc_opt', | 94 parser.add_argument('-opt', '--perc_opt', |
95 type = float, | 95 type=float, |
96 default=0.9, | 96 default=0.9, |
97 required = False, | 97 required=False, |
98 help = 'choose the fraction of optimality for FVA (0-1)') | 98 help='choose the fraction of optimality for FVA (0-1)') |
99 | 99 |
100 parser.add_argument('-ot', '--output_type', | 100 parser.add_argument('-ot', '--output_type', |
101 type = str, | 101 type=str, |
102 required = True, | 102 required=True, |
103 help = 'output type') | 103 help='output type for sampling results') |
104 | 104 |
105 parser.add_argument('-ota', '--output_type_analysis', | 105 parser.add_argument('-ota', '--output_type_analysis', |
106 type = str, | 106 type=str, |
107 required = False, | 107 required=False, |
108 help = 'output type analysis') | 108 help='output type analysis (optimization methods)') |
109 | 109 |
110 parser.add_argument('-idop', '--output_path', | 110 parser.add_argument('-idop', '--output_path', |
111 type = str, | 111 type=str, |
112 default='flux_simulation', | 112 default='flux_simulation', |
113 help = 'output path for maps') | 113 help='output path for maps') |
114 | 114 |
115 ARGS = parser.parse_args(args) | 115 ARGS = parser.parse_args(args) |
116 return ARGS | 116 return ARGS |
117 | |
118 ########################### warning ########################################### | 117 ########################### warning ########################################### |
119 def warning(s :str) -> None: | 118 def warning(s :str) -> None: |
120 """ | 119 """ |
121 Log a warning message to an output log file and print it to the console. | 120 Log a warning message to an output log file and print it to the console. |
122 | 121 |
170 sys.exit('Execution aborted: wrong format of ' + name + '\n') | 169 sys.exit('Execution aborted: wrong format of ' + name + '\n') |
171 return dataset | 170 return dataset |
172 | 171 |
173 | 172 |
174 | 173 |
175 def OPTGP_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, thinning:int=100, n_batches:int=1, seed:int=0)-> None: | 174 def OPTGP_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, thinning: int = 100, n_batches: int = 1, seed: int = 0) -> None: |
176 """ | 175 """ |
177 Samples from the OPTGP (Optimal Global Perturbation) algorithm and saves the results to CSV files. | 176 Samples from the OPTGP (Optimal Global Perturbation) algorithm and saves the results to CSV files. |
178 | 177 |
179 Args: | 178 Args: |
180 model (cobra.Model): The COBRA model to sample from. | 179 model (cobra.Model): The COBRA model to sample from. |
181 model_name (str): The name of the model, used in naming output files. | 180 model_name (str): The name of the model, used in naming output files. |
182 n_samples (int, optional): Number of samples per batch. Default is 1000. | 181 n_samples (int, optional): Number of samples per batch. Default is 1000. |
183 thinning (int, optional): Thinning parameter for the sampler. Default is 100. | 182 thinning (int, optional): Thinning parameter for the sampler. Default is 100. |
184 n_batches (int, optional): Number of batches to run. Default is 1. | 183 n_batches (int, optional): Number of batches to run. Default is 1. |
185 seed (int, optional): Random seed for reproducibility. Default is 0. | 184 seed (int, optional): Random seed for reproducibility. Default is 0. |
186 | 185 |
187 Returns: | 186 Returns: |
188 None | 187 None |
189 """ | 188 """ |
190 | 189 import numpy as np |
191 for i in range(0, n_batches): | 190 |
191 # Get reaction IDs for consistent column ordering | |
192 reaction_ids = [rxn.id for rxn in model.reactions] | |
193 | |
194 # Sample and save each batch as numpy file | |
195 for i in range(n_batches): | |
192 optgp = OptGPSampler(model, thinning, seed) | 196 optgp = OptGPSampler(model, thinning, seed) |
193 samples = optgp.sample(n_samples) | 197 samples = optgp.sample(n_samples) |
194 samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv', index=False) | 198 |
195 seed+=1 | 199 # Save as numpy array (more memory efficient) |
196 samplesTotal = pd.DataFrame() | 200 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy" |
197 for i in range(0, n_batches): | 201 np.save(batch_filename, samples.values) |
198 samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv') | 202 |
199 samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True) | 203 seed += 1 |
200 | 204 |
205 # Merge all batches into a single DataFrame | |
206 all_samples = [] | |
207 | |
208 for i in range(n_batches): | |
209 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy" | |
210 batch_data = np.load(batch_filename) | |
211 all_samples.append(batch_data) | |
212 | |
213 # Concatenate all batches | |
214 samplesTotal_array = np.vstack(all_samples) | |
215 | |
216 # Convert back to DataFrame with proper column names | |
217 samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids) | |
218 | |
219 # Save the final merged result as CSV | |
201 write_to_file(samplesTotal.T, model_name, True) | 220 write_to_file(samplesTotal.T, model_name, True) |
202 | 221 |
203 for i in range(0, n_batches): | 222 # Clean up temporary numpy files |
204 os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_OPTGP.csv') | 223 for i in range(n_batches): |
205 | 224 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_OPTGP.npy" |
206 | 225 if os.path.exists(batch_filename): |
207 def CBS_sampler(model:cobra.Model, model_name:str, n_samples:int=1000, n_batches:int=1, seed:int=0)-> None: | 226 os.remove(batch_filename) |
227 | |
228 | |
229 def CBS_sampler(model: cobra.Model, model_name: str, n_samples: int = 1000, n_batches: int = 1, seed: int = 0) -> None: | |
208 """ | 230 """ |
209 Samples using the CBS (Constraint-based Sampling) algorithm and saves the results to CSV files. | 231 Samples using the CBS (Constraint-based Sampling) algorithm and saves the results to CSV files. |
210 | 232 |
211 Args: | 233 Args: |
212 model (cobra.Model): The COBRA model to sample from. | 234 model (cobra.Model): The COBRA model to sample from. |
213 model_name (str): The name of the model, used in naming output files. | 235 model_name (str): The name of the model, used in naming output files. |
214 n_samples (int, optional): Number of samples per batch. Default is 1000. | 236 n_samples (int, optional): Number of samples per batch. Default is 1000. |
215 n_batches (int, optional): Number of batches to run. Default is 1. | 237 n_batches (int, optional): Number of batches to run. Default is 1. |
216 seed (int, optional): Random seed for reproducibility. Default is 0. | 238 seed (int, optional): Random seed for reproducibility. Default is 0. |
217 | 239 |
218 Returns: | 240 Returns: |
219 None | 241 None |
220 """ | 242 """ |
221 | 243 import numpy as np |
222 df_FVA = cobra.flux_analysis.flux_variability_analysis(model,fraction_of_optimum=0).round(6) | 244 |
223 | 245 # Get reaction IDs for consistent column ordering |
224 df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples*n_batches, df_FVA, seed=seed) | 246 reaction_ids = [reaction.id for reaction in model.reactions] |
225 | 247 |
226 for i in range(0, n_batches): | 248 # Perform FVA analysis once for all batches |
227 samples = pd.DataFrame(columns =[reaction.id for reaction in model.reactions], index = range(n_samples)) | 249 df_FVA = cobra.flux_analysis.flux_variability_analysis(model, fraction_of_optimum=0).round(6) |
250 | |
251 # Generate random objective functions for all samples across all batches | |
252 df_coefficients = CBS_backend.randomObjectiveFunction(model, n_samples * n_batches, df_FVA, seed=seed) | |
253 | |
254 # Sample and save each batch as numpy file | |
255 for i in range(n_batches): | |
256 samples = pd.DataFrame(columns=reaction_ids, index=range(n_samples)) | |
257 | |
228 try: | 258 try: |
229 CBS_backend.randomObjectiveFunctionSampling(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples], samples) | 259 CBS_backend.randomObjectiveFunctionSampling( |
260 model, | |
261 n_samples, | |
262 df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples], | |
263 samples | |
264 ) | |
230 except Exception as e: | 265 except Exception as e: |
231 utils.logWarning( | 266 utils.logWarning( |
232 "Warning: GLPK solver has failed for " + model_name + ". Trying with COBRA interface. Error:" + str(e), | 267 f"Warning: GLPK solver has failed for {model_name}. Trying with COBRA interface. Error: {str(e)}", |
233 ARGS.out_log) | 268 ARGS.out_log |
234 CBS_backend.randomObjectiveFunctionSampling_cobrapy(model, n_samples, df_coefficients.iloc[:,i*n_samples:(i+1)*n_samples], | 269 ) |
235 samples) | 270 CBS_backend.randomObjectiveFunctionSampling_cobrapy( |
236 utils.logWarning(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', ARGS.out_log) | 271 model, |
237 samples.to_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv', index=False) | 272 n_samples, |
238 | 273 df_coefficients.iloc[:, i * n_samples:(i + 1) * n_samples], |
239 samplesTotal = pd.DataFrame() | 274 samples |
240 for i in range(0, n_batches): | 275 ) |
241 samples_batch = pd.read_csv(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv') | 276 |
242 samplesTotal = pd.concat([samplesTotal, samples_batch], ignore_index = True) | 277 # Save as numpy array (more memory efficient) |
243 | 278 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy" |
279 utils.logWarning(batch_filename, ARGS.out_log) | |
280 np.save(batch_filename, samples.values) | |
281 | |
282 # Merge all batches into a single DataFrame | |
283 all_samples = [] | |
284 | |
285 for i in range(n_batches): | |
286 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy" | |
287 batch_data = np.load(batch_filename) | |
288 all_samples.append(batch_data) | |
289 | |
290 # Concatenate all batches | |
291 samplesTotal_array = np.vstack(all_samples) | |
292 | |
293 # Convert back to DataFrame with proper column names | |
294 samplesTotal = pd.DataFrame(samplesTotal_array, columns=reaction_ids) | |
295 | |
296 # Save the final merged result as CSV | |
244 write_to_file(samplesTotal.T, model_name, True) | 297 write_to_file(samplesTotal.T, model_name, True) |
245 | 298 |
246 for i in range(0, n_batches): | 299 # Clean up temporary numpy files |
247 os.remove(ARGS.output_path + "/" + model_name + '_'+ str(i)+'_CBS.csv') | 300 for i in range(n_batches): |
301 batch_filename = f"{ARGS.output_path}/{model_name}_{i}_CBS.npy" | |
302 if os.path.exists(batch_filename): | |
303 os.remove(batch_filename) | |
248 | 304 |
249 | 305 |
250 | 306 |
251 def model_sampler_with_bounds(model_input_original: cobra.Model, bounds_path: str, cell_name: str) -> List[pd.DataFrame]: | 307 def model_sampler_with_bounds(model_input_original: cobra.Model, bounds_path: str, cell_name: str) -> List[pd.DataFrame]: |
252 """ | 308 """ |
409 df_sensitivity.loc[model_name] = newRow | 465 df_sensitivity.loc[model_name] = newRow |
410 df_sensitivity = df_sensitivity.astype(float).round(6) | 466 df_sensitivity = df_sensitivity.astype(float).round(6) |
411 return df_pFBA, df_FVA, df_sensitivity | 467 return df_pFBA, df_FVA, df_sensitivity |
412 | 468 |
413 ############################# main ########################################### | 469 ############################# main ########################################### |
414 def main(args :List[str] = None) -> None: | 470 def main(args: List[str] = None) -> None: |
415 """ | 471 """ |
416 Initialize and run sampling/analysis based on the frontend input arguments. | 472 Initialize and run sampling/analysis based on the frontend input arguments. |
417 | 473 |
418 Returns: | 474 Returns: |
419 None | 475 None |
433 # output types (required) -> list | 489 # output types (required) -> list |
434 ARGS.output_types = ARGS.output_type.split(",") if ARGS.output_type else [] | 490 ARGS.output_types = ARGS.output_type.split(",") if ARGS.output_type else [] |
435 # optional analysis output types -> list or empty | 491 # optional analysis output types -> list or empty |
436 ARGS.output_type_analysis = ARGS.output_type_analysis.split(",") if ARGS.output_type_analysis else [] | 492 ARGS.output_type_analysis = ARGS.output_type_analysis.split(",") if ARGS.output_type_analysis else [] |
437 | 493 |
494 # Determine if sampling should be performed | |
495 perform_sampling = ARGS.n_samples > 0 | |
496 | |
438 print("=== INPUT FILES ===") | 497 print("=== INPUT FILES ===") |
439 print(f"{ARGS.input_files}") | 498 print(f"{ARGS.input_files}") |
440 print(f"{ARGS.file_names}") | 499 print(f"{ARGS.file_names}") |
441 print(f"{ARGS.output_type}") | 500 print(f"{ARGS.output_type}") |
442 print(f"{ARGS.output_types}") | 501 print(f"{ARGS.output_types}") |
443 print(f"{ARGS.output_type_analysis}") | 502 print(f"{ARGS.output_type_analysis}") |
503 print(f"Sampling enabled: {perform_sampling} (n_samples: {ARGS.n_samples})") | |
444 | 504 |
445 if ARGS.model_and_bounds == "True": | 505 if ARGS.model_and_bounds == "True": |
446 # MODE 1: Model + bounds (separate files) | 506 # MODE 1: Model + bounds (separate files) |
447 print("=== MODE 1: Model + Bounds (separate files) ===") | 507 print("=== MODE 1: Model + Bounds (separate files) ===") |
448 | 508 |
475 results = Parallel(n_jobs=num_processors)( | 535 results = Parallel(n_jobs=num_processors)( |
476 delayed(perform_sampling_and_analysis)(model_utils.build_cobra_model_from_csv(model_file), cell_name) | 536 delayed(perform_sampling_and_analysis)(model_utils.build_cobra_model_from_csv(model_file), cell_name) |
477 for model_file, cell_name in zip(ARGS.input_files, ARGS.file_names) | 537 for model_file, cell_name in zip(ARGS.input_files, ARGS.file_names) |
478 ) | 538 ) |
479 | 539 |
480 | 540 # Handle sampling outputs (only if sampling was performed) |
481 all_mean = pd.concat([result[0] for result in results], ignore_index=False) | 541 if perform_sampling: |
482 all_median = pd.concat([result[1] for result in results], ignore_index=False) | 542 print("=== PROCESSING SAMPLING RESULTS ===") |
483 all_quantiles = pd.concat([result[2] for result in results], ignore_index=False) | 543 |
484 | 544 all_mean = pd.concat([result[0] for result in results], ignore_index=False) |
485 if("mean" in ARGS.output_types): | 545 all_median = pd.concat([result[1] for result in results], ignore_index=False) |
486 all_mean = all_mean.fillna(0.0) | 546 all_quantiles = pd.concat([result[2] for result in results], ignore_index=False) |
487 all_mean = all_mean.sort_index() | 547 |
488 write_to_file(all_mean.T, "mean", True) | 548 if "mean" in ARGS.output_types: |
489 | 549 all_mean = all_mean.fillna(0.0) |
490 if("median" in ARGS.output_types): | 550 all_mean = all_mean.sort_index() |
491 all_median = all_median.fillna(0.0) | 551 write_to_file(all_mean.T, "mean", True) |
492 all_median = all_median.sort_index() | 552 |
493 write_to_file(all_median.T, "median", True) | 553 if "median" in ARGS.output_types: |
494 | 554 all_median = all_median.fillna(0.0) |
495 if("quantiles" in ARGS.output_types): | 555 all_median = all_median.sort_index() |
496 all_quantiles = all_quantiles.fillna(0.0) | 556 write_to_file(all_median.T, "median", True) |
497 all_quantiles = all_quantiles.sort_index() | 557 |
498 write_to_file(all_quantiles.T, "quantiles", True) | 558 if "quantiles" in ARGS.output_types: |
499 | 559 all_quantiles = all_quantiles.fillna(0.0) |
500 index_result = 3 | 560 all_quantiles = all_quantiles.sort_index() |
501 if("pFBA" in ARGS.output_type_analysis): | 561 write_to_file(all_quantiles.T, "quantiles", True) |
562 else: | |
563 print("=== SAMPLING SKIPPED (n_samples = 0) ===") | |
564 | |
565 # Handle optimization analysis outputs (always available) | |
566 print("=== PROCESSING OPTIMIZATION RESULTS ===") | |
567 | |
568 # Determine the starting index for optimization results | |
569 # If sampling was performed, optimization results start at index 3 | |
570 # If no sampling, optimization results start at index 0 | |
571 index_result = 3 if perform_sampling else 0 | |
572 | |
573 if "pFBA" in ARGS.output_type_analysis: | |
502 all_pFBA = pd.concat([result[index_result] for result in results], ignore_index=False) | 574 all_pFBA = pd.concat([result[index_result] for result in results], ignore_index=False) |
503 all_pFBA = all_pFBA.sort_index() | 575 all_pFBA = all_pFBA.sort_index() |
504 write_to_file(all_pFBA.T, "pFBA", True) | 576 write_to_file(all_pFBA.T, "pFBA", True) |
505 index_result+=1 | 577 index_result += 1 |
506 if("FVA" in ARGS.output_type_analysis): | 578 |
507 all_FVA= pd.concat([result[index_result] for result in results], ignore_index=False) | 579 if "FVA" in ARGS.output_type_analysis: |
580 all_FVA = pd.concat([result[index_result] for result in results], ignore_index=False) | |
508 all_FVA = all_FVA.sort_index() | 581 all_FVA = all_FVA.sort_index() |
509 write_to_file(all_FVA.T, "FVA", True) | 582 write_to_file(all_FVA.T, "FVA", True) |
510 index_result+=1 | 583 index_result += 1 |
511 if("sensitivity" in ARGS.output_type_analysis): | 584 |
585 if "sensitivity" in ARGS.output_type_analysis: | |
512 all_sensitivity = pd.concat([result[index_result] for result in results], ignore_index=False) | 586 all_sensitivity = pd.concat([result[index_result] for result in results], ignore_index=False) |
513 all_sensitivity = all_sensitivity.sort_index() | 587 all_sensitivity = all_sensitivity.sort_index() |
514 write_to_file(all_sensitivity.T, "sensitivity", True) | 588 write_to_file(all_sensitivity.T, "sensitivity", True) |
515 | 589 |
516 return | 590 return |