Mercurial > repos > bimib > cobraxy
comparison COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Fri, 12 Sep 2025 17:28:45 +0000 |
| parents | 4a385fdb9e58 |
| children |
comparison
equal
deleted
inserted
replaced
| 455:4e2bc80764b6 | 456:a6e45049c1b9 |
|---|---|
| 1 """ | |
| 2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules. | |
| 3 | |
| 4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs), | |
| 5 computes RAS per reaction for each sample/cell line, and writes a tabular output. | |
| 6 """ | |
| 1 from __future__ import division | 7 from __future__ import division |
| 2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason. | |
| 3 import sys | 8 import sys |
| 4 import argparse | 9 import argparse |
| 5 import collections | 10 import collections |
| 6 import pandas as pd | 11 import pandas as pd |
| 7 import pickle as pk | 12 import pickle as pk |
| 8 import utils.general_utils as utils | 13 import utils.general_utils as utils |
| 9 import utils.rule_parsing as ruleUtils | 14 import utils.rule_parsing as ruleUtils |
| 10 from typing import Union, Optional, List, Dict, Tuple, TypeVar | 15 from typing import Union, Optional, List, Dict, Tuple, TypeVar |
| 11 import os | |
| 12 | 16 |
| 13 ERRORS = [] | 17 ERRORS = [] |
| 14 ########################## argparse ########################################## | 18 ########################## argparse ########################################## |
| 15 ARGS :argparse.Namespace | 19 ARGS :argparse.Namespace |
| 16 def process_args(args:List[str] = None) -> argparse.Namespace: | 20 def process_args(args:List[str] = None) -> argparse.Namespace: |
| 29 | 33 |
| 30 parser.add_argument("-rl", "--model_upload", type = str, | 34 parser.add_argument("-rl", "--model_upload", type = str, |
| 31 help = "path to input file containing the rules") | 35 help = "path to input file containing the rules") |
| 32 | 36 |
| 33 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") | 37 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") |
| 34 # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in | 38 # Galaxy converts files into .dat, this helps infer the original extension when needed. |
| 35 | 39 |
| 36 parser.add_argument( | 40 parser.add_argument( |
| 37 '-n', '--none', | 41 '-n', '--none', |
| 38 type = utils.Bool("none"), default = True, | 42 type = utils.Bool("none"), default = True, |
| 39 help = 'compute Nan values') | 43 help = 'compute Nan values') |
| 47 '-ol', '--out_log', | 51 '-ol', '--out_log', |
| 48 type = str, | 52 type = str, |
| 49 help = "Output log") | 53 help = "Output log") |
| 50 | 54 |
| 51 parser.add_argument( | 55 parser.add_argument( |
| 52 '-in', '--input', #id รจ diventato in | 56 '-in', '--input', |
| 53 type = str, | 57 type = str, |
| 54 help = 'input dataset') | 58 help = 'input dataset') |
| 55 | 59 |
| 56 parser.add_argument( | 60 parser.add_argument( |
| 57 '-ra', '--ras_output', | 61 '-ra', '--ras_output', |
| 251 return (gene.set_index(gene.columns[0])).to_dict() | 255 return (gene.set_index(gene.columns[0])).to_dict() |
| 252 | 256 |
| 253 ############################ resolve ########################################## | 257 ############################ resolve ########################################## |
| 254 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: | 258 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: |
| 255 """ | 259 """ |
| 256 Replace gene identifiers with corresponding values from a dictionary. | 260 Replace gene identifiers in a parsed rule expression with values from a dict. |
| 257 | 261 |
| 258 Args: | 262 Args: |
| 259 l (str): String of gene identifier. | 263 l: Parsed rule as a nested list structure (strings, lists, and operators). |
| 260 d (str): String corresponding to its value. | 264 d: Dict mapping gene IDs to numeric values. |
| 261 | 265 |
| 262 Returns: | 266 Returns: |
| 263 tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. | 267 tuple: (new_expression, not_found_genes) |
| 264 """ | 268 """ |
| 265 tmp = [] | 269 tmp = [] |
| 266 err = [] | 270 err = [] |
| 267 while l: | 271 while l: |
| 268 if isinstance(l[0], list): | 272 if isinstance(l[0], list): |
| 275 if value == None: | 279 if value == None: |
| 276 err.append(l[0]) | 280 err.append(l[0]) |
| 277 l = l[1:] | 281 l = l[1:] |
| 278 return (tmp, err) | 282 return (tmp, err) |
| 279 | 283 |
| 280 def replace_gene(l :str, d :str) -> Union[int, float]: | 284 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]: |
| 281 """ | 285 """ |
| 282 Replace a single gene identifier with its corresponding value from a dictionary. | 286 Replace a single gene identifier with its corresponding value from a dictionary. |
| 283 | 287 |
| 284 Args: | 288 Args: |
| 285 l (str): Gene identifier to replace. | 289 l (str): Gene identifier to replace. |
| 286 d (str): String corresponding to its value. | 290 d (dict): Dict mapping gene IDs to numeric values. |
| 287 | 291 |
| 288 Returns: | 292 Returns: |
| 289 float/int: Corresponding value from the dictionary if found, None otherwise. | 293 float/int/None: Corresponding value from the dictionary if found, None otherwise. |
| 290 | 294 |
| 291 Raises: | 295 Raises: |
| 292 sys.exit: If the value associated with the gene identifier is not valid. | 296 sys.exit: If the value associated with the gene identifier is not valid. |
| 293 """ | 297 """ |
| 294 if l =='and' or l == 'or': | 298 if l =='and' or l == 'or': |
| 506 Generates the RAS scores for each cell line found in the dataset. | 510 Generates the RAS scores for each cell line found in the dataset. |
| 507 | 511 |
| 508 Args: | 512 Args: |
| 509 dataset (pd.DataFrame): Dataset containing gene values. | 513 dataset (pd.DataFrame): Dataset containing gene values. |
| 510 rules (dict): The dict containing reaction ids as keys and rules as values. | 514 rules (dict): The dict containing reaction ids as keys and rules as values. |
| 511 | 515 |
| 512 Side effects: | 516 Note: |
| 513 dataset : mut | 517 Modifies dataset in place by setting the first column as index. |
| 514 | 518 |
| 515 Returns: | 519 Returns: |
| 516 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary | 520 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary |
| 517 where each key corresponds to a reaction ID and each value is its computed RAS score. | 521 where each key corresponds to a reaction ID and each value is its computed RAS score. |
| 518 """ | 522 """ |
| 588 | 592 |
| 589 return ras_value | 593 return ras_value |
| 590 | 594 |
| 591 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: | 595 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: |
| 592 """ | 596 """ |
| 593 Save computed ras scores to the given path, as a tsv file. | 597 Save computed RAS scores to ARGS.ras_output as a TSV file. |
| 594 | 598 |
| 595 Args: | 599 Args: |
| 596 rasScores : the computed ras scores. | 600 rasScores : the computed ras scores. |
| 597 path : the output tsv file's path. | 601 reactions : the list of reaction IDs, used as the first column. |
| 598 | 602 |
| 599 Returns: | 603 Returns: |
| 600 None | 604 None |
| 601 """ | 605 """ |
| 602 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly | 606 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe metod, sadly |
| 625 Returns: | 629 Returns: |
| 626 str: the gene in HugoID encoding. | 630 str: the gene in HugoID encoding. |
| 627 """ | 631 """ |
| 628 supportedGenesInEncoding = geneTranslator[encoding] | 632 supportedGenesInEncoding = geneTranslator[encoding] |
| 629 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] | 633 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] |
| 630 raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") | 634 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.") |
| 631 | 635 |
| 632 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: | 636 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: |
| 633 """ | 637 """ |
| 634 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be | 638 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be |
| 635 performed, significantly impacting the runtime. | 639 performed, significantly impacting the runtime. |
| 636 | 640 |
| 637 Returns: | 641 Returns: |
| 638 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. | 642 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. |
| 639 """ | 643 """ |
| 640 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in galaxy as a .dat | 644 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat |
| 641 | |
| 642 #try: filenamePath = utils.FilePath.fromStrPath(ARGS.model_upload_name) # file's name in input, to determine its original ext | |
| 643 #except utils.PathErr as err: | |
| 644 # utils.logWarning(f"Cannot determine file extension from filename '{ARGS.model_upload_name}'. Assuming tabular format.", ARGS.out_log) | |
| 645 # filenamePath = None | |
| 646 | |
| 647 #if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath) | |
| 648 | 645 |
| 649 dict_rule = {} | 646 dict_rule = {} |
| 650 | 647 |
| 651 try: | 648 try: |
| 652 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) | 649 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) |
| 656 if not rows: | 653 if not rows: |
| 657 raise ValueError("Model tabular is file is empty.") | 654 raise ValueError("Model tabular is file is empty.") |
| 658 | 655 |
| 659 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") | 656 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
| 660 | 657 |
| 661 # Proviamo prima con delimitatore tab | 658 # First, try using a tab delimiter |
| 662 for line in rows[1:]: | 659 for line in rows[1:]: |
| 663 if len(line) <= idx_gpr: | 660 if len(line) <= idx_gpr: |
| 664 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 661 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| 665 continue | 662 continue |
| 666 | 663 |
| 668 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) | 665 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) |
| 669 else: | 666 else: |
| 670 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) | 667 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) |
| 671 | 668 |
| 672 except Exception as e: | 669 except Exception as e: |
| 673 # Se fallisce con tab, proviamo con virgola | 670 # If parsing with tabs fails, try comma delimiter |
| 674 try: | 671 try: |
| 675 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) | 672 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) |
| 676 | 673 |
| 677 if len(rows) <= 1: | 674 if len(rows) <= 1: |
| 678 raise ValueError("Model tabular with 1 column is not supported.") | 675 raise ValueError("Model tabular with 1 column is not supported.") |
| 680 if not rows: | 677 if not rows: |
| 681 raise ValueError("Model tabular is file is empty.") | 678 raise ValueError("Model tabular is file is empty.") |
| 682 | 679 |
| 683 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") | 680 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
| 684 | 681 |
| 685 # Proviamo prima con delimitatore tab | 682 # Try again parsing row content with the GPR column using comma-separated values |
| 686 for line in rows[1:]: | 683 for line in rows[1:]: |
| 687 if len(line) <= idx_gpr: | 684 if len(line) <= idx_gpr: |
| 688 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) | 685 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| 689 continue | 686 continue |
| 690 | 687 |
| 727 if ERRORS: utils.logWarning( | 724 if ERRORS: utils.logWarning( |
| 728 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | 725 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", |
| 729 ARGS.out_log) | 726 ARGS.out_log) |
| 730 | 727 |
| 731 | 728 |
| 732 ############ | 729 print("Execution succeeded") |
| 733 | |
| 734 # handle custom models | |
| 735 #model :utils.Model = ARGS.rules_selector | |
| 736 | |
| 737 #if model is utils.Model.Custom: | |
| 738 # rules = load_custom_rules() | |
| 739 # reactions = list(rules.keys()) | |
| 740 | |
| 741 # save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) | |
| 742 # if ERRORS: utils.logWarning( | |
| 743 # f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | |
| 744 # ARGS.out_log) | |
| 745 | |
| 746 # return | |
| 747 | |
| 748 # This is the standard flow of the ras_generator program, for non-custom models. | |
| 749 #name = "RAS Dataset" | |
| 750 #type_gene = gene_type(dataset.iloc[0, 0], name) | |
| 751 | |
| 752 #rules = model.getRules(ARGS.tool_dir) | |
| 753 #genes = data_gene(dataset, type_gene, name, None) | |
| 754 #ids, rules = load_id_rules(rules.get(type_gene)) | |
| 755 | |
| 756 #resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name) | |
| 757 #create_ras(resolve_rules, name, rules, ids, ARGS.ras_output) | |
| 758 | |
| 759 #if err: utils.logWarning( | |
| 760 # f"Warning: gene(s) {err} not found in class \"{name}\", " + | |
| 761 # "the expression level for this gene will be considered NaN", | |
| 762 # ARGS.out_log) | |
| 763 | |
| 764 print("Execution succeded") | |
| 765 | 730 |
| 766 ############################################################################### | 731 ############################################################################### |
| 767 if __name__ == "__main__": | 732 if __name__ == "__main__": |
| 768 main() | 733 main() |
