cobraxy: COBRAxy/ras_generator

comparison COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft

Uploaded

author	francesco_lapi
date	Fri, 12 Sep 2025 17:28:45 +0000
parents	4a385fdb9e58
children

comparison

equal deleted inserted replaced

-:4e2bc80764b6
+:a6e45049c1b9
+"""
+Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
+The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
+computes RAS per reaction for each sample/cell line, and writes a tabular output.
+"""
 from __future__ import division
-# galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason.
 import sys
 import argparse
 import collections
 import pandas as pd
 import pickle as pk
 import utils.general_utils as utils
 import utils.rule_parsing as ruleUtils
 from typing import Union, Optional, List, Dict, Tuple, TypeVar
-import os
 ERRORS = []
 ########################## argparse ##########################################
 ARGS :argparse.Namespace
 def process_args(args:List[str] = None) -> argparse.Namespace:
 parser.add_argument("-rl", "--model_upload", type = str,
 help = "path to input file containing the rules")
 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
-# ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in
+# Galaxy converts uploaded files into .dat; the custom name above helps infer the original extension when needed.
 parser.add_argument(
 '-n', '--none',
 type = utils.Bool("none"), default = True,
 help = 'compute Nan values')
 '-ol', '--out_log',
 type = str,
 help = "Output log")
 parser.add_argument(
-'-in', '--input', #id è diventato in
+'-in', '--input',
 type = str,
 help = 'input dataset')
 parser.add_argument(
 '-ra', '--ras_output',
 return (gene.set_index(gene.columns[0])).to_dict()
 ############################ resolve ##########################################
 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]:
 """
-Replace gene identifiers with corresponding values from a dictionary.
+Replace gene identifiers in a parsed rule expression with values from a dict.
 Args:
-l (str): String of gene identifier.
+l: Parsed rule as a nested list structure (strings, lists, and operators).
-d (str): String corresponding to its value.
+d: Dict mapping gene IDs to numeric values.
 Returns:
-tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement.
+tuple: (new_expression, not_found_genes)
 """
 tmp = []
 err = []
 while l:
 if isinstance(l[0], list):
 if value == None:
 err.append(l[0])
 l = l[1:]
 return (tmp, err)
-def replace_gene(l :str, d :str) -> Union[int, float]:
+def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]:
 """
 Replace a single gene identifier with its corresponding value from a dictionary.
 Args:
 l (str): Gene identifier to replace.
-d (str): String corresponding to its value.
+d (dict): Dict mapping gene IDs to numeric values.
 Returns:
-float/int: Corresponding value from the dictionary if found, None otherwise.
+float/int/None: Corresponding value from the dictionary if found, None otherwise.
 Raises:
 sys.exit: If the value associated with the gene identifier is not valid.
 """
 if l =='and' or l == 'or':
 Generates the RAS scores for each cell line found in the dataset.
 Args:
 dataset (pd.DataFrame): Dataset containing gene values.
 rules (dict): The dict containing reaction ids as keys and rules as values.
-Side effects:
+Note:
-dataset : mut
+Modifies dataset in place by setting the first column as index.
 Returns:
 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary
 where each key corresponds to a reaction ID and each value is its computed RAS score.
 """
 return ras_value
 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None:
 """
-Save computed ras scores to the given path, as a tsv file.
+Save computed RAS scores to ARGS.ras_output as a TSV file.
 Args:
 rasScores : the computed ras scores.
-path : the output tsv file's path.
+reactions : the list of reaction IDs, used as the first column.
 Returns:
 None
 """
 for scores in rasScores.values(): # this is actually a lot faster than using the out-of-the-box DataFrame method, sadly
 Returns:
 str: the gene in HugoID encoding.
 """
 supportedGenesInEncoding = geneTranslator[encoding]
 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName]
-raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!")
+raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.")
 def load_custom_rules() -> Dict[str, ruleUtils.OpList]:
 """
 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
 performed, significantly impacting the runtime.
 Returns:
 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
 """
-datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in galaxy as a .dat
+datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload)  # actual file, stored in Galaxy as a .dat
-#try: filenamePath = utils.FilePath.fromStrPath(ARGS.model_upload_name) # file's name in input, to determine its original ext
-#except utils.PathErr as err:
-#    utils.logWarning(f"Cannot determine file extension from filename '{ARGS.model_upload_name}'. Assuming tabular format.", ARGS.out_log)
-#    filenamePath = None
-#if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath)
 dict_rule = {}
 try:
 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
 if not rows:
 raise ValueError("Model tabular is file is empty.")
 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
-# Proviamo prima con delimitatore tab
+# Rows were read using a tab delimiter; build the rule for each data row
 for line in rows[1:]:
 if len(line) <= idx_gpr:
 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 continue
 dict_rule[line[id_idx]] = ruleUtils.OpList([""])
 else:
 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr])
 except Exception as e:
-# Se fallisce con tab, proviamo con virgola
+# If parsing with tabs fails, try comma delimiter
 try:
 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
 if len(rows) <= 1:
 raise ValueError("Model tabular with 1 column is not supported.")
 if not rows:
 raise ValueError("Model tabular is file is empty.")
 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
-# Proviamo prima con delimitatore tab
+# Parse the GPR column of each data row again, this time from the comma-separated rows
 for line in rows[1:]:
 if len(line) <= idx_gpr:
 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
 continue
 if ERRORS: utils.logWarning(
 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}",
 ARGS.out_log)
-############
+print("Execution succeeded")
-# handle custom models
-#model :utils.Model = ARGS.rules_selector
-#if model is utils.Model.Custom:
-#    rules = load_custom_rules()
-#    reactions = list(rules.keys())
-#    save_as_tsv(ras_for_cell_lines(dataset, rules), reactions)
-#    if ERRORS: utils.logWarning(
-#        f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}",
-#        ARGS.out_log)
-#    return
-# This is the standard flow of the ras_generator program, for non-custom models.
-#name = "RAS Dataset"
-#type_gene = gene_type(dataset.iloc[0, 0], name)
-#rules      = model.getRules(ARGS.tool_dir)
-#genes      = data_gene(dataset, type_gene, name, None)
-#ids, rules = load_id_rules(rules.get(type_gene))
-#resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name)
-#create_ras(resolve_rules, name, rules, ids, ARGS.ras_output)
-#if err: utils.logWarning(
-#    f"Warning: gene(s) {err} not found in class \"{name}\", " +
-#    "the expression level for this gene will be considered NaN",
-#    ARGS.out_log)
-print("Execution succeded")
 ###############################################################################
 if __name__ == "__main__":
 main()

Mercurial > repos > bimib > cobraxy

comparison COBRAxy/ras_generator_beta.py @ 456:a6e45049c1b9 draft