comparison COBRAxy/ras_generator.py @ 505:96f512dff490 draft

Uploaded
author francesco_lapi
date Wed, 01 Oct 2025 11:15:00 +0000
parents c6ea189ea7e9
children 5956dcf94277
comparison
equal deleted inserted replaced
504:ef3df9b697b1 505:96f512dff490
5 computes RAS per reaction for each sample/cell line, and writes a tabular output. 5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
6 """ 6 """
7 from __future__ import division 7 from __future__ import division
8 import sys 8 import sys
9 import argparse 9 import argparse
10 import collections
11 import pandas as pd 10 import pandas as pd
12 import pickle as pk 11 import numpy as np
13 import utils.general_utils as utils 12 import utils.general_utils as utils
14 import utils.rule_parsing as ruleUtils 13 from typing import List, Dict
15 from typing import Union, Optional, List, Dict, Tuple, TypeVar 14 import ast
16 15
17 ERRORS = [] 16 ERRORS = []
18 ########################## argparse ########################################## 17 ########################## argparse ##########################################
19 ARGS :argparse.Namespace 18 ARGS :argparse.Namespace
20 def process_args(args:List[str] = None) -> argparse.Namespace: 19 def process_args(args:List[str] = None) -> argparse.Namespace:
80 Raises: 79 Raises:
81 pd.errors.EmptyDataError: If the CSV file is empty. 80 pd.errors.EmptyDataError: If the CSV file is empty.
82 sys.exit: If the CSV file has the wrong format, the execution is aborted. 81 sys.exit: If the CSV file has the wrong format, the execution is aborted.
83 """ 82 """
84 try: 83 try:
85 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python') 84 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
85 dataset = dataset.astype(float)
86 except pd.errors.EmptyDataError: 86 except pd.errors.EmptyDataError:
87 sys.exit('Execution aborted: wrong format of ' + name + '\n') 87 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
88 if len(dataset.columns) < 2: 88 if len(dataset.columns) < 2:
89 sys.exit('Execution aborted: wrong format of ' + name + '\n') 89 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
90 return dataset 90 return dataset
91 91
92 ############################ load id e rules ################################## 92
93 def load_id_rules(reactions :Dict[str, Dict[str, List[str]]]) -> Tuple[List[str], List[Dict[str, List[str]]]]: 93 def load_custom_rules() -> Dict[str,str]:
94 """
95 Load IDs and rules from a dictionary of reactions.
96
97 Args:
98 reactions (dict): A dictionary where keys are IDs and values are rules.
99
100 Returns:
101 tuple: A tuple containing two lists, the first list containing IDs and the second list containing rules.
102 """
103 ids, rules = [], []
104 for key, value in reactions.items():
105 ids.append(key)
106 rules.append(value)
107 return (ids, rules)
108
109
110 ############################ gene #############################################
111 def data_gene(gene: pd.DataFrame, type_gene: str, name: str, gene_custom: Optional[Dict[str, str]]) -> Dict[str, str]:
112 """
113 Process gene data to ensure correct formatting and handle duplicates.
114
115 Args:
116 gene (DataFrame): DataFrame containing gene data.
117 type_gene (str): Type of gene data (e.g., 'hugo_id', 'ensembl_gene_id', 'symbol', 'entrez_id').
118 name (str): Name of the dataset.
119 gene_custom (dict or None): Custom gene data dictionary if provided.
120
121 Returns:
122 dict: A dictionary containing gene data with gene IDs as keys and corresponding values.
123 """
124
125 for i in range(len(gene)):
126 tmp = gene.iloc[i, 0]
127 gene.iloc[i, 0] = tmp.strip().split('.')[0]
128
129 gene_dup = [item for item, count in
130 collections.Counter(gene[gene.columns[0]]).items() if count > 1]
131 pat_dup = [item for item, count in
132 collections.Counter(list(gene.columns)).items() if count > 1]
133
134 gene_in_rule = None
135
136 if gene_dup:
137 if gene_custom == None:
138
139 if str(ARGS.rules_selector) == 'HMRcore':
140 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/HMRcore_genes.p', 'rb'))
141
142 elif str(ARGS.rules_selector) == 'Recon':
143 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/Recon_genes.p', 'rb'))
144
145 elif str(ARGS.rules_selector) == 'ENGRO2':
146 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/ENGRO2_genes.p', 'rb'))
147
148 utils.logWarning(f"{ARGS.tool_dir}'/local/pickle files/ENGRO2_genes.p'", ARGS.out_log)
149
150 gene_in_rule = gene_in_rule.get(type_gene)
151
152 else:
153 gene_in_rule = gene_custom
154
155 tmp = []
156 for i in gene_dup:
157 if gene_in_rule.get(i) == 'ok':
158 tmp.append(i)
159 if tmp:
160 sys.exit('Execution aborted because gene ID '
161 +str(tmp)+' in '+name+' is duplicated\n')
162
163 if pat_dup: utils.logWarning(f"Warning: duplicated label\n{pat_dup} in {name}", ARGS.out_log)
164 return (gene.set_index(gene.columns[0])).to_dict()
165
166 ############################ resolve ##########################################
167 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]:
168 """
169 Replace gene identifiers in a parsed rule expression with values from a dict.
170
171 Args:
172 l: Parsed rule as a nested list structure (strings, lists, and operators).
173 d: Dict mapping gene IDs to numeric values.
174
175 Returns:
176 tuple: (new_expression, not_found_genes)
177 """
178 tmp = []
179 err = []
180 while l:
181 if isinstance(l[0], list):
182 tmp_rules, tmp_err = replace_gene_value(l[0], d)
183 tmp.append(tmp_rules)
184 err.extend(tmp_err)
185 else:
186 value = replace_gene(l[0], d)
187 tmp.append(value)
188 if value == None:
189 err.append(l[0])
190 l = l[1:]
191 return (tmp, err)
192
193 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]:
194 """
195 Replace a single gene identifier with its corresponding value from a dictionary.
196
197 Args:
198 l (str): Gene identifier to replace.
199 d (dict): Dict mapping gene IDs to numeric values.
200
201 Returns:
202 float/int/None: Corresponding value from the dictionary if found, None otherwise.
203
204 Raises:
205 sys.exit: If the value associated with the gene identifier is not valid.
206 """
207 if l =='and' or l == 'or':
208 return l
209 else:
210 value = d.get(l, None)
211 if not(value == None or isinstance(value, (int, float))):
212 sys.exit('Execution aborted: ' + value + ' value not valid\n')
213 return value
214
215 T = TypeVar("T", bound = Optional[Union[int, float]])
216 def computes(val1 :T, op :str, val2 :T, cn :bool) -> T:
217 """
218 Compute the RAS value between two value and an operator ('and' or 'or').
219
220 Args:
221 val1(Optional(Union[float, int])): First value.
222 op (str): Operator ('and' or 'or').
223 val2(Optional(Union[float, int])): Second value.
224 cn (bool): Control boolean value.
225
226 Returns:
227 Optional(Union[float, int]): Result of the computation.
228 """
229 if val1 != None and val2 != None:
230 if op == 'and':
231 return min(val1, val2)
232 else:
233 return val1 + val2
234 elif op == 'and':
235 if cn is True:
236 if val1 != None:
237 return val1
238 elif val2 != None:
239 return val2
240 else:
241 return None
242 else:
243 return None
244 else:
245 if val1 != None:
246 return val1
247 elif val2 != None:
248 return val2
249 else:
250 return None
251
252 # ris should be Literal[None] but Literal is not supported in Python 3.7
253 def control(ris, l :List[Union[int, float, list]], cn :bool) -> Union[bool, int, float]: #Union[Literal[False], int, float]:
254 """
255 Control the format of the expression.
256
257 Args:
258 ris: Intermediate result.
259 l (list): Expression to control.
260 cn (bool): Control boolean value.
261
262 Returns:
263 Union[Literal[False], int, float]: Result of the control.
264 """
265 if len(l) == 1:
266 if isinstance(l[0], (float, int)) or l[0] == None:
267 return l[0]
268 elif isinstance(l[0], list):
269 return control(None, l[0], cn)
270 else:
271 return False
272 elif len(l) > 2:
273 return control_list(ris, l, cn)
274 else:
275 return False
276
277 def control_list(ris, l :List[Optional[Union[float, int, list]]], cn :bool) -> Optional[bool]: #Optional[Literal[False]]:
278 """
279 Control the format of a list of expressions.
280
281 Args:
282 ris: Intermediate result.
283 l (list): List of expressions to control.
284 cn (bool): Control boolean value.
285
286 Returns:
287 Optional[Literal[False]]: Result of the control.
288 """
289 while l:
290 if len(l) == 1:
291 return False
292 elif (isinstance(l[0], (float, int)) or
293 l[0] == None) and l[1] in ['and', 'or']:
294 if isinstance(l[2], (float, int)) or l[2] == None:
295 ris = computes(l[0], l[1], l[2], cn)
296 elif isinstance(l[2], list):
297 tmp = control(None, l[2], cn)
298 if tmp is False:
299 return False
300 else:
301 ris = computes(l[0], l[1], tmp, cn)
302 else:
303 return False
304 l = l[3:]
305 elif l[0] in ['and', 'or']:
306 if isinstance(l[1], (float, int)) or l[1] == None:
307 ris = computes(ris, l[0], l[1], cn)
308 elif isinstance(l[1], list):
309 tmp = control(None,l[1], cn)
310 if tmp is False:
311 return False
312 else:
313 ris = computes(ris, l[0], tmp, cn)
314 else:
315 return False
316 l = l[2:]
317 elif isinstance(l[0], list) and l[1] in ['and', 'or']:
318 if isinstance(l[2], (float, int)) or l[2] == None:
319 tmp = control(None, l[0], cn)
320 if tmp is False:
321 return False
322 else:
323 ris = computes(tmp, l[1], l[2], cn)
324 elif isinstance(l[2], list):
325 tmp = control(None, l[0], cn)
326 tmp2 = control(None, l[2], cn)
327 if tmp is False or tmp2 is False:
328 return False
329 else:
330 ris = computes(tmp, l[1], tmp2, cn)
331 else:
332 return False
333 l = l[3:]
334 else:
335 return False
336 return ris
337
338 ResolvedRules = Dict[str, List[Optional[Union[float, int]]]]
339 def resolve(genes: Dict[str, str], rules: List[str], ids: List[str], resolve_none: bool, name: str) -> Tuple[Optional[ResolvedRules], Optional[list]]:
340 """
341 Resolve rules using gene data to compute scores for each rule.
342
343 Args:
344 genes (dict): Dictionary containing gene data with gene IDs as keys and corresponding values.
345 rules (list): List of rules to resolve.
346 ids (list): List of IDs corresponding to the rules.
347 resolve_none (bool): Flag indicating whether to resolve None values in the rules.
348 name (str): Name of the dataset.
349
350 Returns:
351 tuple: A tuple containing resolved rules as a dictionary and a list of gene IDs not found in the data.
352 """
353 resolve_rules = {}
354 not_found = []
355 flag = False
356 for key, value in genes.items():
357 tmp_resolve = []
358 for i in range(len(rules)):
359 tmp = rules[i]
360 if tmp:
361 tmp, err = replace_gene_value(tmp, value)
362 if err:
363 not_found.extend(err)
364 ris = control(None, tmp, resolve_none)
365 if ris is False or ris == None:
366 tmp_resolve.append(None)
367 else:
368 tmp_resolve.append(ris)
369 flag = True
370 else:
371 tmp_resolve.append(None)
372 resolve_rules[key] = tmp_resolve
373
374 if flag is False:
375 utils.logWarning(
376 f"Warning: no computable score (due to missing gene values) for class {name}, the class has been disregarded",
377 ARGS.out_log)
378
379 return (None, None)
380
381 return (resolve_rules, list(set(not_found)))
382 ############################ create_ras #######################################
383 def create_ras(resolve_rules: Optional[ResolvedRules], dataset_name: str, rules: List[str], ids: List[str], file: str) -> None:
384 """
385 Create a RAS (Reaction Activity Score) file from resolved rules.
386
387 Args:
388 resolve_rules (dict): Dictionary containing resolved rules.
389 dataset_name (str): Name of the dataset.
390 rules (list): List of rules.
391 file (str): Path to the output RAS file.
392
393 Returns:
394 None
395 """
396 if resolve_rules is None:
397 utils.logWarning(f"Couldn't generate RAS for current dataset: {dataset_name}", ARGS.out_log)
398
399 for geni in resolve_rules.values():
400 for i, valori in enumerate(geni):
401 if valori == None:
402 geni[i] = 'None'
403
404 output_ras = pd.DataFrame.from_dict(resolve_rules)
405
406 output_ras.insert(0, 'Reactions', ids)
407 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False)
408
409 text_file = open(file, "w")
410
411 text_file.write(output_to_csv)
412 text_file.close()
413
414 ################################- NEW RAS COMPUTATION -################################
415 Expr = Optional[Union[int, float]]
416 Ras = Expr
417 def ras_for_cell_lines(dataset: pd.DataFrame, rules: Dict[str, ruleUtils.OpList]) -> Dict[str, Dict[str, Ras]]:
418 """
419 Generates the RAS scores for each cell line found in the dataset.
420
421 Args:
422 dataset (pd.DataFrame): Dataset containing gene values.
423 rules (dict): The dict containing reaction ids as keys and rules as values.
424
425 Note:
426 Modifies dataset in place by setting the first column as index.
427
428 Returns:
429 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary
430 where each key corresponds to a reaction ID and each value is its computed RAS score.
431 """
432 ras_values_by_cell_line = {}
433 dataset.set_index(dataset.columns[0], inplace=True)
434
435 for cell_line_name in dataset.columns: #[1:]:
436 cell_line = dataset[cell_line_name].to_dict()
437 ras_values_by_cell_line[cell_line_name]= get_ras_values(rules, cell_line)
438 return ras_values_by_cell_line
439
440 def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]:
441 """
442 Computes the RAS (Reaction Activity Score) values for each rule in the given dict.
443
444 Args:
445 value_rules (dict): A dictionary where keys are reaction ids and values are OpLists.
446 dataset : gene expression data of one cell line.
447
448 Returns:
449 dict: A dictionary where keys are reaction ids and values are the computed RAS values for each rule.
450 """
451 return {key: ras_op_list(op_list, dataset) for key, op_list in value_rules.items()}
452
453 def get_gene_expr(dataset :Dict[str, Expr], name :str) -> Expr:
454 """
455 Extracts the gene expression of the given gene from a cell line dataset.
456
457 Args:
458 dataset : gene expression data of one cell line.
459 name : gene name.
460
461 Returns:
462 Expr : the gene's expression value.
463 """
464 expr = dataset.get(name, None)
465 if expr is None: ERRORS.append(name)
466
467 return expr
468
469 def ras_op_list(op_list: ruleUtils.OpList, dataset: Dict[str, Expr]) -> Ras:
470 """
471 Computes recursively the RAS (Reaction Activity Score) value for the given OpList, considering the specified flag to control None behavior.
472
473 Args:
474 op_list (OpList): The OpList representing a rule with gene values.
475 dataset : gene expression data of one cell line.
476
477 Returns:
478 Ras: The computed RAS value for the given OpList.
479 """
480 op = op_list.op
481 ras_value :Ras = None
482 if not op: return get_gene_expr(dataset, op_list[0])
483 if op is ruleUtils.RuleOp.AND and not ARGS.none and None in op_list: return None
484
485 for i in range(len(op_list)):
486 item = op_list[i]
487 if isinstance(item, ruleUtils.OpList):
488 item = ras_op_list(item, dataset)
489
490 else:
491 item = get_gene_expr(dataset, item)
492
493 if item is None:
494 if op is ruleUtils.RuleOp.AND and not ARGS.none: return None
495 continue
496
497 if ras_value is None:
498 ras_value = item
499 else:
500 ras_value = ras_value + item if op is ruleUtils.RuleOp.OR else min(ras_value, item)
501
502 return ras_value
503
504 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None:
505 """
506 Save computed RAS scores to ARGS.ras_output as a TSV file.
507
508 Args:
509 rasScores : the computed ras scores.
510 reactions : the list of reaction IDs, used as the first column.
511
512 Returns:
513 None
514 """
515 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly
516 for reactId, score in scores.items():
517 if score is None: scores[reactId] = "None"
518
519 output_ras = pd.DataFrame.from_dict(rasScores)
520 output_ras.insert(0, 'Reactions', reactions)
521 output_ras.to_csv(ARGS.ras_output, sep = '\t', index = False)
522
523 ############################ MAIN #############################################
524 #TODO: not used but keep, it will be when the new translator dicts will be used.
525 def translateGene(geneName :str, encoding :str, geneTranslator :Dict[str, Dict[str, str]]) -> str:
526 """
527 Translate gene from any supported encoding to HugoID.
528
529 Args:
530 geneName (str): the name of the gene in its current encoding.
531 encoding (str): the encoding.
532 geneTranslator (Dict[str, Dict[str, str]]): the dict containing all supported gene names
533 and encodings in the current model, mapping each to the corresponding HugoID encoding.
534
535 Raises:
536 ValueError: When the gene isn't supported in the model.
537
538 Returns:
539 str: the gene in HugoID encoding.
540 """
541 supportedGenesInEncoding = geneTranslator[encoding]
542 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName]
543 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.")
544
545 def load_custom_rules() -> Dict[str, ruleUtils.OpList]:
546 """ 94 """
547 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be 95 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
548 performed, significantly impacting the runtime. 96 performed, significantly impacting the runtime.
549 97
550 Returns: 98 Returns:
568 for line in rows[1:]: 116 for line in rows[1:]:
569 if len(line) <= idx_gpr: 117 if len(line) <= idx_gpr:
570 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) 118 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
571 continue 119 continue
572 120
573 if line[idx_gpr] == "": 121 dict_rule[line[id_idx]] = line[idx_gpr]
574 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) 122
575 else:
576 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr])
577
578 except Exception as e: 123 except Exception as e:
579 # If parsing with tabs fails, try comma delimiter 124 # If parsing with tabs fails, try comma delimiter
580 try: 125 try:
581 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) 126 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
582 127
592 for line in rows[1:]: 137 for line in rows[1:]:
593 if len(line) <= idx_gpr: 138 if len(line) <= idx_gpr:
594 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) 139 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
595 continue 140 continue
596 141
597 if line[idx_gpr] == "": 142 dict_rule[line[id_idx]] =line[idx_gpr]
598 dict_rule[line[id_idx]] = ruleUtils.OpList([""])
599 else:
600 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr])
601 143
602 except Exception as e2: 144 except Exception as e2:
603 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}") 145 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")
604 146
605 if not dict_rule: 147 if not dict_rule:
606 raise ValueError("No valid rules found in the uploaded file. Please check the file format.") 148 raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
607 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. 149 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
608 return dict_rule 150 return dict_rule
609 151
610 152
153
154 def computeRAS(
155 dataset,gene_rules,rules_total_string,
156 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
157 and_function=np.min, # type of operation to do in case of an and expression(min, sum)
158 ignore_nan = True
159 ):
160
161
162 logic_operators = ['and', 'or', '(', ')']
163 reactions=list(gene_rules.keys())
164
165 # Build the dictionary for the GPRs
166 df_reactions = pd.DataFrame(index=reactions)
167 gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions]
168 df_reactions['rule'] = gene_rules
169 df_reactions = df_reactions.reset_index()
170 df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))
171
172 dict_rule_reactions = df_reactions.to_dict()['index']
173
174 # build useful structures for RAS computation
175 genes =dataset.index.intersection(rules_total_string)
176
177 #check if there is one gene at least
178 if len(genes)==0:
179 print("ERROR: no gene of the count matrix is in the metabolic model!")
180 print(" are you sure that the gene annotation is the same for the model and the count matrix?")
181 return -1
182
183 cell_ids = list(dataset.columns)
184 count_df_filtered = dataset.loc[genes]
185 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_"))
186
187 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)
188
189 # for loop on rules
190 ind = 0
191 for rule, reaction_ids in dict_rule_reactions.items():
192 if len(rule) != 0:
193 # there is one gene at least in the formula
194 rule_split = rule.split()
195 rule_split_elements = list(set(rule_split)) # genes in formula
196
197 # which genes are in the count matrix?
198 genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
199 genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes]
200
201 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
202 if len(rule_split) == 1:
203 #one gene --> one reaction
204 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
205 else:
206 if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
207 ras_df[ind] = np.nan
208 else:
209 # more genes in the formula
210 check_only_and=("and" in rule and "or" not in rule) #only and
211 check_only_or=("or" in rule and "and" not in rule) #only or
212 if check_only_and or check_only_or:
213 #or/and sequence
214 matrix = count_df_filtered.loc[genes_in_count_matrix].values
215 #compute for all cells
216 if check_only_and:
217 ras_df[ind] = and_function(matrix, axis=0)
218 else:
219 ras_df[ind] = or_function(matrix, axis=0)
220 else:
221 # complex expression (e.g. A or (B and C))
222 data = count_df_filtered.loc[genes_in_count_matrix] # dataframe of genes in the GPRs
223
224 rule = rule.replace("-", "_")
225 tree = ast.parse(rule, mode="eval").body
226 values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
227 for j, values in enumerate(values_by_cell):
228 ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)
229
230 ind +=1
231
232 #create the dataframe of ras (rules x samples)
233 ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids)
234 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
235
236 #create the reaction dataframe for ras (reactions x samples)
237 ras_df = ras_df.explode("Reactions").set_index("Reactions")
238
239 return ras_df
240
241 # function to evalute a complex boolean expression e.g. A or (B and C)
242 def _evaluate_ast( node, values,or_function,and_function):
243 if isinstance(node, ast.BoolOp):
244 # Ricorsione sugli argomenti
245 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
246
247 vals = [v for v in vals if v is not None]
248 if not vals:
249 return None
250
251 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
252
253 if isinstance(node.op, ast.Or):
254 return or_function(vals)
255 elif isinstance(node.op, ast.And):
256 return and_function(vals)
257
258 elif isinstance(node, ast.Name):
259 return values.get(node.id, None)
260 elif isinstance(node, ast.Constant):
261 return node.value
262 else:
263 raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
264
611 def main(args:List[str] = None) -> None: 265 def main(args:List[str] = None) -> None:
612 """ 266 """
613 Initializes everything and sets the program in motion based on the fronted input arguments. 267 Initializes everything and sets the program in motion based on the fronted input arguments.
614 268
615 Returns: 269 Returns:
617 """ 271 """
618 # get args from frontend (related xml) 272 # get args from frontend (related xml)
619 global ARGS 273 global ARGS
620 ARGS = process_args(args) 274 ARGS = process_args(args)
621 275
622 # read dataset 276 # read dataset and remove versioning from gene names
623 dataset = read_dataset(ARGS.input, "dataset") 277 dataset = read_dataset(ARGS.input, "dataset")
624 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) 278 dataset.index = [str(el.split(".")[0]) for el in dataset.index]
625 279
626 # remove versioning from gene names 280 #load GPR rules
627 dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0]
628
629 rules = load_custom_rules() 281 rules = load_custom_rules()
630 reactions = list(rules.keys()) 282
631 283 #create a list of all the gpr
632 save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) 284 rules_total_string=""
633 if ERRORS: utils.logWarning( 285 for id,rule in rules.items():
634 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", 286 rules_total_string+=rule.replace("(","").replace(")","") + " "
635 ARGS.out_log) 287 rules_total_string=list(set(rules_total_string.split(" ")))
636 288
637 289 #check if nan value must be ignored in the GPR
290 print(ARGS.none,"\n\n\n*************",ARGS.none==True)
291 if ARGS.none:
292 # #e.g. (A or nan --> A)
293 ignore_nan = True
294 else:
295 #e.g. (A or nan --> nan)
296 ignore_nan = False
297
298 #compure ras
299 ras_df=computeRAS(dataset,rules,
300 rules_total_string,
301 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
302 and_function=np.min,
303 ignore_nan=ignore_nan)
304
305 #save to csv and replace nan with None
306 ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t')
307
308 #print
638 print("Execution succeeded") 309 print("Execution succeeded")
639 310
640 ############################################################################### 311 ###############################################################################
641 if __name__ == "__main__": 312 if __name__ == "__main__":
642 main() 313 main()