Mercurial > repos > bimib > cobraxy
comparison COBRAxy/ras_generator.py @ 505:96f512dff490 draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Wed, 01 Oct 2025 11:15:00 +0000 |
| parents | c6ea189ea7e9 |
| children | 5956dcf94277 |
comparison
equal
deleted
inserted
replaced
| 504:ef3df9b697b1 | 505:96f512dff490 |
|---|---|
| 5 computes RAS per reaction for each sample/cell line, and writes a tabular output. | 5 computes RAS per reaction for each sample/cell line, and writes a tabular output. |
| 6 """ | 6 """ |
| 7 from __future__ import division | 7 from __future__ import division |
| 8 import sys | 8 import sys |
| 9 import argparse | 9 import argparse |
| 10 import collections | |
| 11 import pandas as pd | 10 import pandas as pd |
| 12 import pickle as pk | 11 import numpy as np |
| 13 import utils.general_utils as utils | 12 import utils.general_utils as utils |
| 14 import utils.rule_parsing as ruleUtils | 13 from typing import List, Dict |
| 15 from typing import Union, Optional, List, Dict, Tuple, TypeVar | 14 import ast |
| 16 | 15 |
| 17 ERRORS = [] | 16 ERRORS = [] |
| 18 ########################## argparse ########################################## | 17 ########################## argparse ########################################## |
| 19 ARGS :argparse.Namespace | 18 ARGS :argparse.Namespace |
| 20 def process_args(args:List[str] = None) -> argparse.Namespace: | 19 def process_args(args:List[str] = None) -> argparse.Namespace: |
| 80 Raises: | 79 Raises: |
| 81 pd.errors.EmptyDataError: If the CSV file is empty. | 80 pd.errors.EmptyDataError: If the CSV file is empty. |
| 82 sys.exit: If the CSV file has the wrong format, the execution is aborted. | 81 sys.exit: If the CSV file has the wrong format, the execution is aborted. |
| 83 """ | 82 """ |
| 84 try: | 83 try: |
| 85 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python') | 84 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0) |
| 85 dataset = dataset.astype(float) | |
| 86 except pd.errors.EmptyDataError: | 86 except pd.errors.EmptyDataError: |
| 87 sys.exit('Execution aborted: wrong format of ' + name + '\n') | 87 sys.exit('Execution aborted: wrong file format of ' + name + '\n') |
| 88 if len(dataset.columns) < 2: | 88 if len(dataset.columns) < 2: |
| 89 sys.exit('Execution aborted: wrong format of ' + name + '\n') | 89 sys.exit('Execution aborted: wrong file format of ' + name + '\n') |
| 90 return dataset | 90 return dataset |
| 91 | 91 |
| 92 ############################ load id e rules ################################## | 92 |
| 93 def load_id_rules(reactions :Dict[str, Dict[str, List[str]]]) -> Tuple[List[str], List[Dict[str, List[str]]]]: | 93 def load_custom_rules() -> Dict[str,str]: |
| 94 """ | |
| 95 Load IDs and rules from a dictionary of reactions. | |
| 96 | |
| 97 Args: | |
| 98 reactions (dict): A dictionary where keys are IDs and values are rules. | |
| 99 | |
| 100 Returns: | |
| 101 tuple: A tuple containing two lists, the first list containing IDs and the second list containing rules. | |
| 102 """ | |
| 103 ids, rules = [], [] | |
| 104 for key, value in reactions.items(): | |
| 105 ids.append(key) | |
| 106 rules.append(value) | |
| 107 return (ids, rules) | |
| 108 | |
| 109 | |
| 110 ############################ gene ############################################# | |
| 111 def data_gene(gene: pd.DataFrame, type_gene: str, name: str, gene_custom: Optional[Dict[str, str]]) -> Dict[str, str]: | |
| 112 """ | |
| 113 Process gene data to ensure correct formatting and handle duplicates. | |
| 114 | |
| 115 Args: | |
| 116 gene (DataFrame): DataFrame containing gene data. | |
| 117 type_gene (str): Type of gene data (e.g., 'hugo_id', 'ensembl_gene_id', 'symbol', 'entrez_id'). | |
| 118 name (str): Name of the dataset. | |
| 119 gene_custom (dict or None): Custom gene data dictionary if provided. | |
| 120 | |
| 121 Returns: | |
| 122 dict: A dictionary containing gene data with gene IDs as keys and corresponding values. | |
| 123 """ | |
| 124 | |
| 125 for i in range(len(gene)): | |
| 126 tmp = gene.iloc[i, 0] | |
| 127 gene.iloc[i, 0] = tmp.strip().split('.')[0] | |
| 128 | |
| 129 gene_dup = [item for item, count in | |
| 130 collections.Counter(gene[gene.columns[0]]).items() if count > 1] | |
| 131 pat_dup = [item for item, count in | |
| 132 collections.Counter(list(gene.columns)).items() if count > 1] | |
| 133 | |
| 134 gene_in_rule = None | |
| 135 | |
| 136 if gene_dup: | |
| 137 if gene_custom == None: | |
| 138 | |
| 139 if str(ARGS.rules_selector) == 'HMRcore': | |
| 140 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/HMRcore_genes.p', 'rb')) | |
| 141 | |
| 142 elif str(ARGS.rules_selector) == 'Recon': | |
| 143 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/Recon_genes.p', 'rb')) | |
| 144 | |
| 145 elif str(ARGS.rules_selector) == 'ENGRO2': | |
| 146 gene_in_rule = pk.load(open(ARGS.tool_dir + '/local/pickle files/ENGRO2_genes.p', 'rb')) | |
| 147 | |
| 148 utils.logWarning(f"{ARGS.tool_dir}'/local/pickle files/ENGRO2_genes.p'", ARGS.out_log) | |
| 149 | |
| 150 gene_in_rule = gene_in_rule.get(type_gene) | |
| 151 | |
| 152 else: | |
| 153 gene_in_rule = gene_custom | |
| 154 | |
| 155 tmp = [] | |
| 156 for i in gene_dup: | |
| 157 if gene_in_rule.get(i) == 'ok': | |
| 158 tmp.append(i) | |
| 159 if tmp: | |
| 160 sys.exit('Execution aborted because gene ID ' | |
| 161 +str(tmp)+' in '+name+' is duplicated\n') | |
| 162 | |
| 163 if pat_dup: utils.logWarning(f"Warning: duplicated label\n{pat_dup} in {name}", ARGS.out_log) | |
| 164 return (gene.set_index(gene.columns[0])).to_dict() | |
| 165 | |
| 166 ############################ resolve ########################################## | |
| 167 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: | |
| 168 """ | |
| 169 Replace gene identifiers in a parsed rule expression with values from a dict. | |
| 170 | |
| 171 Args: | |
| 172 l: Parsed rule as a nested list structure (strings, lists, and operators). | |
| 173 d: Dict mapping gene IDs to numeric values. | |
| 174 | |
| 175 Returns: | |
| 176 tuple: (new_expression, not_found_genes) | |
| 177 """ | |
| 178 tmp = [] | |
| 179 err = [] | |
| 180 while l: | |
| 181 if isinstance(l[0], list): | |
| 182 tmp_rules, tmp_err = replace_gene_value(l[0], d) | |
| 183 tmp.append(tmp_rules) | |
| 184 err.extend(tmp_err) | |
| 185 else: | |
| 186 value = replace_gene(l[0], d) | |
| 187 tmp.append(value) | |
| 188 if value == None: | |
| 189 err.append(l[0]) | |
| 190 l = l[1:] | |
| 191 return (tmp, err) | |
| 192 | |
| 193 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]: | |
| 194 """ | |
| 195 Replace a single gene identifier with its corresponding value from a dictionary. | |
| 196 | |
| 197 Args: | |
| 198 l (str): Gene identifier to replace. | |
| 199 d (dict): Dict mapping gene IDs to numeric values. | |
| 200 | |
| 201 Returns: | |
| 202 float/int/None: Corresponding value from the dictionary if found, None otherwise. | |
| 203 | |
| 204 Raises: | |
| 205 sys.exit: If the value associated with the gene identifier is not valid. | |
| 206 """ | |
| 207 if l =='and' or l == 'or': | |
| 208 return l | |
| 209 else: | |
| 210 value = d.get(l, None) | |
| 211 if not(value == None or isinstance(value, (int, float))): | |
| 212 sys.exit('Execution aborted: ' + value + ' value not valid\n') | |
| 213 return value | |
| 214 | |
| 215 T = TypeVar("T", bound = Optional[Union[int, float]]) | |
| 216 def computes(val1 :T, op :str, val2 :T, cn :bool) -> T: | |
| 217 """ | |
| 218 Compute the RAS value between two value and an operator ('and' or 'or'). | |
| 219 | |
| 220 Args: | |
| 221 val1(Optional(Union[float, int])): First value. | |
| 222 op (str): Operator ('and' or 'or'). | |
| 223 val2(Optional(Union[float, int])): Second value. | |
| 224 cn (bool): Control boolean value. | |
| 225 | |
| 226 Returns: | |
| 227 Optional(Union[float, int]): Result of the computation. | |
| 228 """ | |
| 229 if val1 != None and val2 != None: | |
| 230 if op == 'and': | |
| 231 return min(val1, val2) | |
| 232 else: | |
| 233 return val1 + val2 | |
| 234 elif op == 'and': | |
| 235 if cn is True: | |
| 236 if val1 != None: | |
| 237 return val1 | |
| 238 elif val2 != None: | |
| 239 return val2 | |
| 240 else: | |
| 241 return None | |
| 242 else: | |
| 243 return None | |
| 244 else: | |
| 245 if val1 != None: | |
| 246 return val1 | |
| 247 elif val2 != None: | |
| 248 return val2 | |
| 249 else: | |
| 250 return None | |
| 251 | |
| 252 # ris should be Literal[None] but Literal is not supported in Python 3.7 | |
| 253 def control(ris, l :List[Union[int, float, list]], cn :bool) -> Union[bool, int, float]: #Union[Literal[False], int, float]: | |
| 254 """ | |
| 255 Control the format of the expression. | |
| 256 | |
| 257 Args: | |
| 258 ris: Intermediate result. | |
| 259 l (list): Expression to control. | |
| 260 cn (bool): Control boolean value. | |
| 261 | |
| 262 Returns: | |
| 263 Union[Literal[False], int, float]: Result of the control. | |
| 264 """ | |
| 265 if len(l) == 1: | |
| 266 if isinstance(l[0], (float, int)) or l[0] == None: | |
| 267 return l[0] | |
| 268 elif isinstance(l[0], list): | |
| 269 return control(None, l[0], cn) | |
| 270 else: | |
| 271 return False | |
| 272 elif len(l) > 2: | |
| 273 return control_list(ris, l, cn) | |
| 274 else: | |
| 275 return False | |
| 276 | |
| 277 def control_list(ris, l :List[Optional[Union[float, int, list]]], cn :bool) -> Optional[bool]: #Optional[Literal[False]]: | |
| 278 """ | |
| 279 Control the format of a list of expressions. | |
| 280 | |
| 281 Args: | |
| 282 ris: Intermediate result. | |
| 283 l (list): List of expressions to control. | |
| 284 cn (bool): Control boolean value. | |
| 285 | |
| 286 Returns: | |
| 287 Optional[Literal[False]]: Result of the control. | |
| 288 """ | |
| 289 while l: | |
| 290 if len(l) == 1: | |
| 291 return False | |
| 292 elif (isinstance(l[0], (float, int)) or | |
| 293 l[0] == None) and l[1] in ['and', 'or']: | |
| 294 if isinstance(l[2], (float, int)) or l[2] == None: | |
| 295 ris = computes(l[0], l[1], l[2], cn) | |
| 296 elif isinstance(l[2], list): | |
| 297 tmp = control(None, l[2], cn) | |
| 298 if tmp is False: | |
| 299 return False | |
| 300 else: | |
| 301 ris = computes(l[0], l[1], tmp, cn) | |
| 302 else: | |
| 303 return False | |
| 304 l = l[3:] | |
| 305 elif l[0] in ['and', 'or']: | |
| 306 if isinstance(l[1], (float, int)) or l[1] == None: | |
| 307 ris = computes(ris, l[0], l[1], cn) | |
| 308 elif isinstance(l[1], list): | |
| 309 tmp = control(None,l[1], cn) | |
| 310 if tmp is False: | |
| 311 return False | |
| 312 else: | |
| 313 ris = computes(ris, l[0], tmp, cn) | |
| 314 else: | |
| 315 return False | |
| 316 l = l[2:] | |
| 317 elif isinstance(l[0], list) and l[1] in ['and', 'or']: | |
| 318 if isinstance(l[2], (float, int)) or l[2] == None: | |
| 319 tmp = control(None, l[0], cn) | |
| 320 if tmp is False: | |
| 321 return False | |
| 322 else: | |
| 323 ris = computes(tmp, l[1], l[2], cn) | |
| 324 elif isinstance(l[2], list): | |
| 325 tmp = control(None, l[0], cn) | |
| 326 tmp2 = control(None, l[2], cn) | |
| 327 if tmp is False or tmp2 is False: | |
| 328 return False | |
| 329 else: | |
| 330 ris = computes(tmp, l[1], tmp2, cn) | |
| 331 else: | |
| 332 return False | |
| 333 l = l[3:] | |
| 334 else: | |
| 335 return False | |
| 336 return ris | |
| 337 | |
| 338 ResolvedRules = Dict[str, List[Optional[Union[float, int]]]] | |
| 339 def resolve(genes: Dict[str, str], rules: List[str], ids: List[str], resolve_none: bool, name: str) -> Tuple[Optional[ResolvedRules], Optional[list]]: | |
| 340 """ | |
| 341 Resolve rules using gene data to compute scores for each rule. | |
| 342 | |
| 343 Args: | |
| 344 genes (dict): Dictionary containing gene data with gene IDs as keys and corresponding values. | |
| 345 rules (list): List of rules to resolve. | |
| 346 ids (list): List of IDs corresponding to the rules. | |
| 347 resolve_none (bool): Flag indicating whether to resolve None values in the rules. | |
| 348 name (str): Name of the dataset. | |
| 349 | |
| 350 Returns: | |
| 351 tuple: A tuple containing resolved rules as a dictionary and a list of gene IDs not found in the data. | |
| 352 """ | |
| 353 resolve_rules = {} | |
| 354 not_found = [] | |
| 355 flag = False | |
| 356 for key, value in genes.items(): | |
| 357 tmp_resolve = [] | |
| 358 for i in range(len(rules)): | |
| 359 tmp = rules[i] | |
| 360 if tmp: | |
| 361 tmp, err = replace_gene_value(tmp, value) | |
| 362 if err: | |
| 363 not_found.extend(err) | |
| 364 ris = control(None, tmp, resolve_none) | |
| 365 if ris is False or ris == None: | |
| 366 tmp_resolve.append(None) | |
| 367 else: | |
| 368 tmp_resolve.append(ris) | |
| 369 flag = True | |
| 370 else: | |
| 371 tmp_resolve.append(None) | |
| 372 resolve_rules[key] = tmp_resolve | |
| 373 | |
| 374 if flag is False: | |
| 375 utils.logWarning( | |
| 376 f"Warning: no computable score (due to missing gene values) for class {name}, the class has been disregarded", | |
| 377 ARGS.out_log) | |
| 378 | |
| 379 return (None, None) | |
| 380 | |
| 381 return (resolve_rules, list(set(not_found))) | |
| 382 ############################ create_ras ####################################### | |
| 383 def create_ras(resolve_rules: Optional[ResolvedRules], dataset_name: str, rules: List[str], ids: List[str], file: str) -> None: | |
| 384 """ | |
| 385 Create a RAS (Reaction Activity Score) file from resolved rules. | |
| 386 | |
| 387 Args: | |
| 388 resolve_rules (dict): Dictionary containing resolved rules. | |
| 389 dataset_name (str): Name of the dataset. | |
| 390 rules (list): List of rules. | |
| 391 file (str): Path to the output RAS file. | |
| 392 | |
| 393 Returns: | |
| 394 None | |
| 395 """ | |
| 396 if resolve_rules is None: | |
| 397 utils.logWarning(f"Couldn't generate RAS for current dataset: {dataset_name}", ARGS.out_log) | |
| 398 | |
| 399 for geni in resolve_rules.values(): | |
| 400 for i, valori in enumerate(geni): | |
| 401 if valori == None: | |
| 402 geni[i] = 'None' | |
| 403 | |
| 404 output_ras = pd.DataFrame.from_dict(resolve_rules) | |
| 405 | |
| 406 output_ras.insert(0, 'Reactions', ids) | |
| 407 output_to_csv = pd.DataFrame.to_csv(output_ras, sep = '\t', index = False) | |
| 408 | |
| 409 text_file = open(file, "w") | |
| 410 | |
| 411 text_file.write(output_to_csv) | |
| 412 text_file.close() | |
| 413 | |
| 414 ################################- NEW RAS COMPUTATION -################################ | |
| 415 Expr = Optional[Union[int, float]] | |
| 416 Ras = Expr | |
| 417 def ras_for_cell_lines(dataset: pd.DataFrame, rules: Dict[str, ruleUtils.OpList]) -> Dict[str, Dict[str, Ras]]: | |
| 418 """ | |
| 419 Generates the RAS scores for each cell line found in the dataset. | |
| 420 | |
| 421 Args: | |
| 422 dataset (pd.DataFrame): Dataset containing gene values. | |
| 423 rules (dict): The dict containing reaction ids as keys and rules as values. | |
| 424 | |
| 425 Note: | |
| 426 Modifies dataset in place by setting the first column as index. | |
| 427 | |
| 428 Returns: | |
| 429 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary | |
| 430 where each key corresponds to a reaction ID and each value is its computed RAS score. | |
| 431 """ | |
| 432 ras_values_by_cell_line = {} | |
| 433 dataset.set_index(dataset.columns[0], inplace=True) | |
| 434 | |
| 435 for cell_line_name in dataset.columns: #[1:]: | |
| 436 cell_line = dataset[cell_line_name].to_dict() | |
| 437 ras_values_by_cell_line[cell_line_name]= get_ras_values(rules, cell_line) | |
| 438 return ras_values_by_cell_line | |
| 439 | |
| 440 def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]: | |
| 441 """ | |
| 442 Computes the RAS (Reaction Activity Score) values for each rule in the given dict. | |
| 443 | |
| 444 Args: | |
| 445 value_rules (dict): A dictionary where keys are reaction ids and values are OpLists. | |
| 446 dataset : gene expression data of one cell line. | |
| 447 | |
| 448 Returns: | |
| 449 dict: A dictionary where keys are reaction ids and values are the computed RAS values for each rule. | |
| 450 """ | |
| 451 return {key: ras_op_list(op_list, dataset) for key, op_list in value_rules.items()} | |
| 452 | |
| 453 def get_gene_expr(dataset :Dict[str, Expr], name :str) -> Expr: | |
| 454 """ | |
| 455 Extracts the gene expression of the given gene from a cell line dataset. | |
| 456 | |
| 457 Args: | |
| 458 dataset : gene expression data of one cell line. | |
| 459 name : gene name. | |
| 460 | |
| 461 Returns: | |
| 462 Expr : the gene's expression value. | |
| 463 """ | |
| 464 expr = dataset.get(name, None) | |
| 465 if expr is None: ERRORS.append(name) | |
| 466 | |
| 467 return expr | |
| 468 | |
| 469 def ras_op_list(op_list: ruleUtils.OpList, dataset: Dict[str, Expr]) -> Ras: | |
| 470 """ | |
| 471 Computes recursively the RAS (Reaction Activity Score) value for the given OpList, considering the specified flag to control None behavior. | |
| 472 | |
| 473 Args: | |
| 474 op_list (OpList): The OpList representing a rule with gene values. | |
| 475 dataset : gene expression data of one cell line. | |
| 476 | |
| 477 Returns: | |
| 478 Ras: The computed RAS value for the given OpList. | |
| 479 """ | |
| 480 op = op_list.op | |
| 481 ras_value :Ras = None | |
| 482 if not op: return get_gene_expr(dataset, op_list[0]) | |
| 483 if op is ruleUtils.RuleOp.AND and not ARGS.none and None in op_list: return None | |
| 484 | |
| 485 for i in range(len(op_list)): | |
| 486 item = op_list[i] | |
| 487 if isinstance(item, ruleUtils.OpList): | |
| 488 item = ras_op_list(item, dataset) | |
| 489 | |
| 490 else: | |
| 491 item = get_gene_expr(dataset, item) | |
| 492 | |
| 493 if item is None: | |
| 494 if op is ruleUtils.RuleOp.AND and not ARGS.none: return None | |
| 495 continue | |
| 496 | |
| 497 if ras_value is None: | |
| 498 ras_value = item | |
| 499 else: | |
| 500 ras_value = ras_value + item if op is ruleUtils.RuleOp.OR else min(ras_value, item) | |
| 501 | |
| 502 return ras_value | |
| 503 | |
| 504 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: | |
| 505 """ | |
| 506 Save computed RAS scores to ARGS.ras_output as a TSV file. | |
| 507 | |
| 508 Args: | |
| 509 rasScores : the computed ras scores. | |
| 510 reactions : the list of reaction IDs, used as the first column. | |
| 511 | |
| 512 Returns: | |
| 513 None | |
| 514 """ | |
| 515 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly | |
| 516 for reactId, score in scores.items(): | |
| 517 if score is None: scores[reactId] = "None" | |
| 518 | |
| 519 output_ras = pd.DataFrame.from_dict(rasScores) | |
| 520 output_ras.insert(0, 'Reactions', reactions) | |
| 521 output_ras.to_csv(ARGS.ras_output, sep = '\t', index = False) | |
| 522 | |
| 523 ############################ MAIN ############################################# | |
| 524 #TODO: not used but keep, it will be when the new translator dicts will be used. | |
| 525 def translateGene(geneName :str, encoding :str, geneTranslator :Dict[str, Dict[str, str]]) -> str: | |
| 526 """ | |
| 527 Translate gene from any supported encoding to HugoID. | |
| 528 | |
| 529 Args: | |
| 530 geneName (str): the name of the gene in its current encoding. | |
| 531 encoding (str): the encoding. | |
| 532 geneTranslator (Dict[str, Dict[str, str]]): the dict containing all supported gene names | |
| 533 and encodings in the current model, mapping each to the corresponding HugoID encoding. | |
| 534 | |
| 535 Raises: | |
| 536 ValueError: When the gene isn't supported in the model. | |
| 537 | |
| 538 Returns: | |
| 539 str: the gene in HugoID encoding. | |
| 540 """ | |
| 541 supportedGenesInEncoding = geneTranslator[encoding] | |
| 542 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] | |
| 543 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.") | |
| 544 | |
| 545 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: | |
| 546 """ | 94 """ |
| 547 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be | 95 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be |
| 548 performed, significantly impacting the runtime. | 96 performed, significantly impacting the runtime. |
| 549 | 97 |
| 550 Returns: | 98 Returns: |
| 568 for line in rows[1:]: | 116 for line in rows[1:]: |
| 569 if len(line) <= idx_gpr: | 117 if len(line) <= idx_gpr: |
| 570 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 118 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| 571 continue | 119 continue |
| 572 | 120 |
| 573 if line[idx_gpr] == "": | 121 dict_rule[line[id_idx]] = line[idx_gpr] |
| 574 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) | 122 |
| 575 else: | |
| 576 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) | |
| 577 | |
| 578 except Exception as e: | 123 except Exception as e: |
| 579 # If parsing with tabs fails, try comma delimiter | 124 # If parsing with tabs fails, try comma delimiter |
| 580 try: | 125 try: |
| 581 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) | 126 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) |
| 582 | 127 |
| 592 for line in rows[1:]: | 137 for line in rows[1:]: |
| 593 if len(line) <= idx_gpr: | 138 if len(line) <= idx_gpr: |
| 594 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 139 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| 595 continue | 140 continue |
| 596 | 141 |
| 597 if line[idx_gpr] == "": | 142 dict_rule[line[id_idx]] =line[idx_gpr] |
| 598 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) | |
| 599 else: | |
| 600 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) | |
| 601 | 143 |
| 602 except Exception as e2: | 144 except Exception as e2: |
| 603 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}") | 145 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}") |
| 604 | 146 |
| 605 if not dict_rule: | 147 if not dict_rule: |
| 606 raise ValueError("No valid rules found in the uploaded file. Please check the file format.") | 148 raise ValueError("No valid rules found in the uploaded file. Please check the file format.") |
| 607 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. | 149 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. |
| 608 return dict_rule | 150 return dict_rule |
| 609 | 151 |
| 610 | 152 |
| 153 | |
| 154 def computeRAS( | |
| 155 dataset,gene_rules,rules_total_string, | |
| 156 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean) | |
| 157 and_function=np.min, # type of operation to do in case of an and expression(min, sum) | |
| 158 ignore_nan = True | |
| 159 ): | |
| 160 | |
| 161 | |
| 162 logic_operators = ['and', 'or', '(', ')'] | |
| 163 reactions=list(gene_rules.keys()) | |
| 164 | |
| 165 # Build the dictionary for the GPRs | |
| 166 df_reactions = pd.DataFrame(index=reactions) | |
| 167 gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions] | |
| 168 df_reactions['rule'] = gene_rules | |
| 169 df_reactions = df_reactions.reset_index() | |
| 170 df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x))) | |
| 171 | |
| 172 dict_rule_reactions = df_reactions.to_dict()['index'] | |
| 173 | |
| 174 # build useful structures for RAS computation | |
| 175 genes =dataset.index.intersection(rules_total_string) | |
| 176 | |
| 177 #check if there is one gene at least | |
| 178 if len(genes)==0: | |
| 179 print("ERROR: no gene of the count matrix is in the metabolic model!") | |
| 180 print(" are you sure that the gene annotation is the same for the model and the count matrix?") | |
| 181 return -1 | |
| 182 | |
| 183 cell_ids = list(dataset.columns) | |
| 184 count_df_filtered = dataset.loc[genes] | |
| 185 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_")) | |
| 186 | |
| 187 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan) | |
| 188 | |
| 189 # for loop on rules | |
| 190 ind = 0 | |
| 191 for rule, reaction_ids in dict_rule_reactions.items(): | |
| 192 if len(rule) != 0: | |
| 193 # there is one gene at least in the formula | |
| 194 rule_split = rule.split() | |
| 195 rule_split_elements = list(set(rule_split)) # genes in formula | |
| 196 | |
| 197 # which genes are in the count matrix? | |
| 198 genes_in_count_matrix = [el for el in rule_split_elements if el in genes] | |
| 199 genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes] | |
| 200 | |
| 201 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix | |
| 202 if len(rule_split) == 1: | |
| 203 #one gene --> one reaction | |
| 204 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix] | |
| 205 else: | |
| 206 if len(genes_notin_count_matrix) > 0 and ignore_nan == False: | |
| 207 ras_df[ind] = np.nan | |
| 208 else: | |
| 209 # more genes in the formula | |
| 210 check_only_and=("and" in rule and "or" not in rule) #only and | |
| 211 check_only_or=("or" in rule and "and" not in rule) #only or | |
| 212 if check_only_and or check_only_or: | |
| 213 #or/and sequence | |
| 214 matrix = count_df_filtered.loc[genes_in_count_matrix].values | |
| 215 #compute for all cells | |
| 216 if check_only_and: | |
| 217 ras_df[ind] = and_function(matrix, axis=0) | |
| 218 else: | |
| 219 ras_df[ind] = or_function(matrix, axis=0) | |
| 220 else: | |
| 221 # complex expression (e.g. A or (B and C)) | |
| 222 data = count_df_filtered.loc[genes_in_count_matrix] # dataframe of genes in the GPRs | |
| 223 | |
| 224 rule = rule.replace("-", "_") | |
| 225 tree = ast.parse(rule, mode="eval").body | |
| 226 values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns] | |
| 227 for j, values in enumerate(values_by_cell): | |
| 228 ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function) | |
| 229 | |
| 230 ind +=1 | |
| 231 | |
| 232 #create the dataframe of ras (rules x samples) | |
| 233 ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids) | |
| 234 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()] | |
| 235 | |
| 236 #create the reaction dataframe for ras (reactions x samples) | |
| 237 ras_df = ras_df.explode("Reactions").set_index("Reactions") | |
| 238 | |
| 239 return ras_df | |
| 240 | |
| 241 # function to evalute a complex boolean expression e.g. A or (B and C) | |
| 242 def _evaluate_ast( node, values,or_function,and_function): | |
| 243 if isinstance(node, ast.BoolOp): | |
| 244 # Ricorsione sugli argomenti | |
| 245 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values] | |
| 246 | |
| 247 vals = [v for v in vals if v is not None] | |
| 248 if not vals: | |
| 249 return None | |
| 250 | |
| 251 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals] | |
| 252 | |
| 253 if isinstance(node.op, ast.Or): | |
| 254 return or_function(vals) | |
| 255 elif isinstance(node.op, ast.And): | |
| 256 return and_function(vals) | |
| 257 | |
| 258 elif isinstance(node, ast.Name): | |
| 259 return values.get(node.id, None) | |
| 260 elif isinstance(node, ast.Constant): | |
| 261 return node.value | |
| 262 else: | |
| 263 raise ValueError(f"Error in boolean expression: {ast.dump(node)}") | |
| 264 | |
| 611 def main(args:List[str] = None) -> None: | 265 def main(args:List[str] = None) -> None: |
| 612 """ | 266 """ |
| 613 Initializes everything and sets the program in motion based on the fronted input arguments. | 267 Initializes everything and sets the program in motion based on the fronted input arguments. |
| 614 | 268 |
| 615 Returns: | 269 Returns: |
| 617 """ | 271 """ |
| 618 # get args from frontend (related xml) | 272 # get args from frontend (related xml) |
| 619 global ARGS | 273 global ARGS |
| 620 ARGS = process_args(args) | 274 ARGS = process_args(args) |
| 621 | 275 |
| 622 # read dataset | 276 # read dataset and remove versioning from gene names |
| 623 dataset = read_dataset(ARGS.input, "dataset") | 277 dataset = read_dataset(ARGS.input, "dataset") |
| 624 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | 278 dataset.index = [str(el.split(".")[0]) for el in dataset.index] |
| 625 | 279 |
| 626 # remove versioning from gene names | 280 #load GPR rules |
| 627 dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0] | |
| 628 | |
| 629 rules = load_custom_rules() | 281 rules = load_custom_rules() |
| 630 reactions = list(rules.keys()) | 282 |
| 631 | 283 #create a list of all the gpr |
| 632 save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) | 284 rules_total_string="" |
| 633 if ERRORS: utils.logWarning( | 285 for id,rule in rules.items(): |
| 634 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | 286 rules_total_string+=rule.replace("(","").replace(")","") + " " |
| 635 ARGS.out_log) | 287 rules_total_string=list(set(rules_total_string.split(" "))) |
| 636 | 288 |
| 637 | 289 #check if nan value must be ignored in the GPR |
| 290 print(ARGS.none,"\n\n\n*************",ARGS.none==True) | |
| 291 if ARGS.none: | |
| 292 # #e.g. (A or nan --> A) | |
| 293 ignore_nan = True | |
| 294 else: | |
| 295 #e.g. (A or nan --> nan) | |
| 296 ignore_nan = False | |
| 297 | |
| 298 #compure ras | |
| 299 ras_df=computeRAS(dataset,rules, | |
| 300 rules_total_string, | |
| 301 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean) | |
| 302 and_function=np.min, | |
| 303 ignore_nan=ignore_nan) | |
| 304 | |
| 305 #save to csv and replace nan with None | |
| 306 ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t') | |
| 307 | |
| 308 #print | |
| 638 print("Execution succeeded") | 309 print("Execution succeeded") |
| 639 | 310 |
| 640 ############################################################################### | 311 ############################################################################### |
| 641 if __name__ == "__main__": | 312 if __name__ == "__main__": |
| 642 main() | 313 main() |
