comparison COBRAxy/ras_generator.py @ 489:97eea560a10f draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Mon, 29 Sep 2025 10:33:26 +0000 |
| parents | 187cee1a00e2 |
| children | c6ea189ea7e9 |
comparison
| 488:e0bcc61b2feb | 489:97eea560a10f |
|---|---|
| 1 """ | |
| 2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules. | |
| 3 | |
| 4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs), | |
| 5 computes RAS per reaction for each sample/cell line, and writes a tabular output. | |
| 6 """ | |
| 1 from __future__ import division | 7 from __future__ import division |
| 2 # galaxy complains this ^^^ needs to be at the very beginning of the file, for some reason. | |
| 3 import sys | 8 import sys |
| 4 import argparse | 9 import argparse |
| 5 import collections | 10 import collections |
| 6 import pandas as pd | 11 import pandas as pd |
| 7 import pickle as pk | 12 import pickle as pk |
| 8 import utils.general_utils as utils | 13 import utils.general_utils as utils |
| 9 import utils.rule_parsing as ruleUtils | 14 import utils.rule_parsing as ruleUtils |
| 10 from typing import Union, Optional, List, Dict, Tuple, TypeVar | 15 from typing import Union, Optional, List, Dict, Tuple, TypeVar |
| 11 import os | |
| 12 | 16 |
| 13 ERRORS = [] | 17 ERRORS = [] |
| 14 ########################## argparse ########################################## | 18 ########################## argparse ########################################## |
| 15 ARGS :argparse.Namespace | 19 ARGS :argparse.Namespace |
| 16 def process_args(args:List[str] = None) -> argparse.Namespace: | 20 def process_args(args:List[str] = None) -> argparse.Namespace: |
| 25 """ | 29 """ |
| 26 parser = argparse.ArgumentParser( | 30 parser = argparse.ArgumentParser( |
| 27 usage = '%(prog)s [options]', | 31 usage = '%(prog)s [options]', |
| 28 description = "process some value's genes to create a comparison's map.") | 32 description = "process some value's genes to create a comparison's map.") |
| 29 | 33 |
| 30 parser.add_argument( | 34 parser.add_argument("-rl", "--model_upload", type = str, |
| 31 '-rs', '--rules_selector', | 35 help = "path to input file containing the rules") |
| 32 type = utils.Model, default = utils.Model.ENGRO2, choices = list(utils.Model), | 36 |
| 33 help = 'chose which type of dataset you want use') | 37 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name") |
| 34 | 38 # Galaxy converts files into .dat, this helps infer the original extension when needed. |
| 35 parser.add_argument("-rl", "--rule_list", type = str, | |
| 36 help = "path to input file with custom rules, if provided") | |
| 37 | |
| 38 parser.add_argument("-rn", "--rules_name", type = str, help = "custom rules name") | |
| 39 # ^ I need this because galaxy converts my files into .dat but I need to know what extension they were in | |
| 40 | 39 |
| 41 parser.add_argument( | 40 parser.add_argument( |
| 42 '-n', '--none', | 41 '-n', '--none', |
| 43 type = utils.Bool("none"), default = True, | 42 type = utils.Bool("none"), default = True, |
| 44 help = 'compute Nan values') | 43 help = 'compute Nan values') |
| 52 '-ol', '--out_log', | 51 '-ol', '--out_log', |
| 53 type = str, | 52 type = str, |
| 54 help = "Output log") | 53 help = "Output log") |
| 55 | 54 |
| 56 parser.add_argument( | 55 parser.add_argument( |
| 57 '-in', '--input', # 'id' has become 'in' | 56 '-in', '--input', |
| 58 type = str, | 57 type = str, |
| 59 help = 'input dataset') | 58 help = 'input dataset') |
| 60 | 59 |
| 61 parser.add_argument( | 60 parser.add_argument( |
| 62 '-ra', '--ras_output', | 61 '-ra', '--ras_output', |
| 256 return (gene.set_index(gene.columns[0])).to_dict() | 255 return (gene.set_index(gene.columns[0])).to_dict() |
| 257 | 256 |
| 258 ############################ resolve ########################################## | 257 ############################ resolve ########################################## |
| 259 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: | 258 def replace_gene_value(l :str, d :str) -> Tuple[Union[int, float], list]: |
| 260 """ | 259 """ |
| 261 Replace gene identifiers with corresponding values from a dictionary. | 260 Replace gene identifiers in a parsed rule expression with values from a dict. |
| 262 | 261 |
| 263 Args: | 262 Args: |
| 264 l (str): String of gene identifier. | 263 l: Parsed rule as a nested list structure (strings, lists, and operators). |
| 265 d (str): String corresponding to its value. | 264 d: Dict mapping gene IDs to numeric values. |
| 266 | 265 |
| 267 Returns: | 266 Returns: |
| 268 tuple: A tuple containing two lists: the first list contains replaced values, and the second list contains any errors encountered during replacement. | 267 tuple: (new_expression, not_found_genes) |
| 269 """ | 268 """ |
| 270 tmp = [] | 269 tmp = [] |
| 271 err = [] | 270 err = [] |
| 272 while l: | 271 while l: |
| 273 if isinstance(l[0], list): | 272 if isinstance(l[0], list): |
| 280 if value == None: | 279 if value == None: |
| 281 err.append(l[0]) | 280 err.append(l[0]) |
| 282 l = l[1:] | 281 l = l[1:] |
| 283 return (tmp, err) | 282 return (tmp, err) |
| 284 | 283 |
| 285 def replace_gene(l :str, d :str) -> Union[int, float]: | 284 def replace_gene(l: str, d: Dict[str, Union[int, float]]) -> Union[int, float, None]: |
| 286 """ | 285 """ |
| 287 Replace a single gene identifier with its corresponding value from a dictionary. | 286 Replace a single gene identifier with its corresponding value from a dictionary. |
| 288 | 287 |
| 289 Args: | 288 Args: |
| 290 l (str): Gene identifier to replace. | 289 l (str): Gene identifier to replace. |
| 291 d (str): String corresponding to its value. | 290 d (dict): Dict mapping gene IDs to numeric values. |
| 292 | 291 |
| 293 Returns: | 292 Returns: |
| 294 float/int: Corresponding value from the dictionary if found, None otherwise. | 293 float/int/None: Corresponding value from the dictionary if found, None otherwise. |
| 295 | 294 |
| 296 Raises: | 295 Raises: |
| 297 sys.exit: If the value associated with the gene identifier is not valid. | 296 sys.exit: If the value associated with the gene identifier is not valid. |
| 298 """ | 297 """ |
| 299 if l =='and' or l == 'or': | 298 if l =='and' or l == 'or': |
| 511 Generates the RAS scores for each cell line found in the dataset. | 510 Generates the RAS scores for each cell line found in the dataset. |
| 512 | 511 |
| 513 Args: | 512 Args: |
| 514 dataset (pd.DataFrame): Dataset containing gene values. | 513 dataset (pd.DataFrame): Dataset containing gene values. |
| 515 rules (dict): The dict containing reaction ids as keys and rules as values. | 514 rules (dict): The dict containing reaction ids as keys and rules as values. |
| 516 | 515 |
| 517 Side effects: | 516 Note: |
| 518 dataset : mut | 517 Modifies dataset in place by setting the first column as index. |
| 519 | 518 |
| 520 Returns: | 519 Returns: |
| 521 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary | 520 dict: A dictionary where each key corresponds to a cell line name and each value is a dictionary |
| 522 where each key corresponds to a reaction ID and each value is its computed RAS score. | 521 where each key corresponds to a reaction ID and each value is its computed RAS score. |
| 523 """ | 522 """ |
| 524 ras_values_by_cell_line = {} | 523 ras_values_by_cell_line = {} |
| 525 dataset.set_index(dataset.columns[0], inplace=True) | 524 dataset.set_index(dataset.columns[0], inplace=True) |
| 526 # Consider all columns except the first, which holds the HUGO ids and must be discarded | 525 |
| 527 for cell_line_name in dataset.columns[1:]: | 526 for cell_line_name in dataset.columns: #[1:]: |
| 528 cell_line = dataset[cell_line_name].to_dict() | 527 cell_line = dataset[cell_line_name].to_dict() |
| 529 ras_values_by_cell_line[cell_line_name]= get_ras_values(rules, cell_line) | 528 ras_values_by_cell_line[cell_line_name]= get_ras_values(rules, cell_line) |
| 530 return ras_values_by_cell_line | 529 return ras_values_by_cell_line |
| 531 | 530 |
| 532 def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]: | 531 def get_ras_values(value_rules: Dict[str, ruleUtils.OpList], dataset: Dict[str, Expr]) -> Dict[str, Ras]: |
| 593 | 592 |
| 594 return ras_value | 593 return ras_value |
| 595 | 594 |
| 596 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: | 595 def save_as_tsv(rasScores: Dict[str, Dict[str, Ras]], reactions :List[str]) -> None: |
| 597 """ | 596 """ |
| 598 Save computed ras scores to the given path, as a tsv file. | 597 Save computed RAS scores to ARGS.ras_output as a TSV file. |
| 599 | 598 |
| 600 Args: | 599 Args: |
| 601 rasScores : the computed ras scores. | 600 rasScores : the computed ras scores. |
| 602 path : the output tsv file's path. | 601 reactions : the list of reaction IDs, used as the first column. |
| 603 | 602 |
| 604 Returns: | 603 Returns: |
| 605 None | 604 None |
| 606 """ | 605 """ |
| 607 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe method, sadly | 606 for scores in rasScores.values(): # this is actually a lot faster than using the ootb dataframe method, sadly |
| 630 Returns: | 629 Returns: |
| 631 str: the gene in HugoID encoding. | 630 str: the gene in HugoID encoding. |
| 632 """ | 631 """ |
| 633 supportedGenesInEncoding = geneTranslator[encoding] | 632 supportedGenesInEncoding = geneTranslator[encoding] |
| 634 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] | 633 if geneName in supportedGenesInEncoding: return supportedGenesInEncoding[geneName] |
| 635 raise ValueError(f"Gene \"{geneName}\" non trovato, verifica di star utilizzando il modello corretto!") | 634 raise ValueError(f"Gene '{geneName}' not found. Please verify you are using the correct model.") |
| 636 | 635 |
| 637 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: | 636 def load_custom_rules() -> Dict[str, ruleUtils.OpList]: |
| 638 """ | 637 """ |
| 639 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be | 638 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be |
| 640 performed, significantly impacting the runtime. | 639 performed, significantly impacting the runtime. |
| 641 | 640 |
| 642 Returns: | 641 Returns: |
| 643 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. | 642 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules. |
| 644 """ | 643 """ |
| 645 datFilePath = utils.FilePath.fromStrPath(ARGS.rule_list) # actual file, stored in galaxy as a .dat | 644 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat |
| 646 | 645 |
| 647 try: filenamePath = utils.FilePath.fromStrPath(ARGS.rules_name) # file's name in input, to determine its original ext | 646 dict_rule = {} |
| 648 except utils.PathErr as err: | 647 |
| 649 raise utils.PathErr(filenamePath, f"Please make sure your file's name is a valid file path, {err.msg}") | 648 try: |
| 650 | 649 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False) |
| 651 if filenamePath.ext is utils.FileFormat.PICKLE: return utils.readPickle(datFilePath) | 650 if len(rows) <= 1: |
| 652 | 651 raise ValueError("Model tabular with 1 column is not supported.") |
| | 652 |
| | 653 if not rows: |
| | 654 raise ValueError("Model tabular file is empty.") |
| | 655 |
| | 656 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
| | 657 |
| | 658 # First, try using a tab delimiter |
| | 659 for line in rows[1:]: |
| | 660 if len(line) <= idx_gpr: |
| | 661 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| | 662 continue |
| | 663 |
| | 664 if line[idx_gpr] == "": |
| | 665 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) |
| | 666 else: |
| | 667 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) |
| | 668 |
| | 669 except Exception as e: |
| | 670 # If parsing with tabs fails, try comma delimiter |
| | 671 try: |
| | 672 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False) |
| | 673 |
| | 674 if len(rows) <= 1: |
| | 675 raise ValueError("Model tabular with 1 column is not supported.") |
| | 676 |
| | 677 if not rows: |
| | 678 raise ValueError("Model tabular file is empty.") |
| | 679 |
| | 680 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR") |
| | 681 |
| | 682 # Try again parsing row content with the GPR column using comma-separated values |
| | 683 for line in rows[1:]: |
| | 684 if len(line) <= idx_gpr: |
| | 685 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log) |
| | 686 continue |
| | 687 |
| | 688 if line[idx_gpr] == "": |
| | 689 dict_rule[line[id_idx]] = ruleUtils.OpList([""]) |
| | 690 else: |
| | 691 dict_rule[line[id_idx]] = ruleUtils.parseRuleToNestedList(line[idx_gpr]) |
| | 692 |
| | 693 except Exception as e2: |
| | 694 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}") |
| | 695 |
| | 696 if not dict_rule: |
| | 697 raise ValueError("No valid rules found in the uploaded file. Please check the file format.") |
| 653 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. | 698 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed. |
| 654 return { line[0] : ruleUtils.parseRuleToNestedList(line[1]) for line in utils.readCsv(datFilePath) } | 699 return dict_rule |
| | 700 |
| 655 | 701 |
| 656 def main(args:List[str] = None) -> None: | 702 def main(args:List[str] = None) -> None: |
| 657 """ | 703 """ |
| 658 Initializes everything and sets the program in motion based on the frontend input arguments. | 704 Initializes everything and sets the program in motion based on the frontend input arguments. |
| 659 | 705 |
| 669 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) | 715 dataset.iloc[:, 0] = (dataset.iloc[:, 0]).astype(str) |
| 670 | 716 |
| 671 # remove versioning from gene names | 717 # remove versioning from gene names |
| 672 dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0] | 718 dataset.iloc[:, 0] = dataset.iloc[:, 0].str.split('.').str[0] |
| 673 | 719 |
| 674 # handle custom models | 720 rules = load_custom_rules() |
| 675 model :utils.Model = ARGS.rules_selector | 721 reactions = list(rules.keys()) |
| 676 | 722 |
| 677 if model is utils.Model.Custom: | 723 save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) |
| 678 rules = load_custom_rules() | 724 if ERRORS: utils.logWarning( |
| 679 reactions = list(rules.keys()) | 725 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", |
| 680 | 726 ARGS.out_log) |
| 681 save_as_tsv(ras_for_cell_lines(dataset, rules), reactions) | 727 |
| 682 if ERRORS: utils.logWarning( | 728 |
| 683 f"The following genes are mentioned in the rules but don't appear in the dataset: {ERRORS}", | 729 print("Execution succeeded") |
| 684 ARGS.out_log) | |
| 685 | |
| 686 return | |
| 687 | |
| 688 # This is the standard flow of the ras_generator program, for non-custom models. | |
| 689 name = "RAS Dataset" | |
| 690 type_gene = gene_type(dataset.iloc[0, 0], name) | |
| 691 | |
| 692 rules = model.getRules(ARGS.tool_dir) | |
| 693 genes = data_gene(dataset, type_gene, name, None) | |
| 694 ids, rules = load_id_rules(rules.get(type_gene)) | |
| 695 | |
| 696 resolve_rules, err = resolve(genes, rules, ids, ARGS.none, name) | |
| 697 create_ras(resolve_rules, name, rules, ids, ARGS.ras_output) | |
| 698 | |
| 699 if err: utils.logWarning( | |
| 700 f"Warning: gene(s) {err} not found in class \"{name}\", " + | |
| 701 "the expression level for this gene will be considered NaN", | |
| 702 ARGS.out_log) | |
| 703 | |
| 704 print("Execution succeded") | |
| 705 | 730 |
| 706 ############################################################################### | 731 ############################################################################### |
| 707 if __name__ == "__main__": | 732 if __name__ == "__main__": |
| 708 main() | 733 main() |
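
For orientation, here is a minimal sketch of how the reworked entry point might be driven after this change. It is an assumption-laden illustration, not part of the commit: it supposes that the flags visible in this comparison (`-rl/--model_upload`, `-rn/--model_upload_name`, `-in/--input`, `-ra/--ras_output`, `-ol/--out_log`) are sufficient, that `load_custom_rules()` only needs an id column plus a `GPR` column (as implied by `utils.findIdxByName(rows[0], "GPR")`), and that all file names, reaction ids and gene names are placeholders.

```python
# Hypothetical driver for the updated ras_generator entry point.
# All paths, reaction ids and gene names below are placeholders; other
# required flags may exist outside the lines shown in this comparison.
import csv

import ras_generator  # requires COBRAxy (and its utils package) on the path

# Minimal custom-rules ("model tabular") file: an id column plus a "GPR" column,
# which load_custom_rules() locates by header name.
with open("model_rules.tsv", "w", newline="") as f:
    writer = csv.writer(f, delimiter="\t")
    writer.writerow(["ReactionID", "GPR"])
    writer.writerow(["R1", "GeneA or GeneB"])   # OR rule
    writer.writerow(["R2", "GeneA and GeneC"])  # AND rule
    writer.writerow(["R3", ""])                 # reaction without a rule

ras_generator.main([
    "-rl", "model_rules.tsv",   # --model_upload: rules file (Galaxy stores it as .dat)
    "-rn", "model_rules.tsv",   # --model_upload_name: original name of the upload
    "-in", "expression.tsv",    # genes x samples expression dataset
    "-ra", "ras_scores.tsv",    # output RAS table
    "-ol", "run.log",           # log file
])
```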

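The aggregation step itself (`get_ras_values` and the operators it applies) is elided from this comparison, so the sketch below is purely illustrative: it assumes the common convention of resolving `and` with a minimum and `or` with a sum over expression values, and a nested-list rule shape like the one `ruleUtils.parseRuleToNestedList` appears to produce. The real operators, and the handling of missing genes controlled by `--none`, may differ.

```python
# Illustrative GPR -> RAS evaluation under an ASSUMED convention:
# "and" -> min of the operands, "or" -> sum of the operands.
from typing import Dict, List, Optional, Union

Expr = Optional[Union[int, float]]

def eval_gpr(rule: List, expression: Dict[str, Expr]) -> Expr:
    """Evaluate one nested GPR rule against one sample's expression values."""
    values, op = [], None
    for tok in rule:
        if tok in ("and", "or"):
            op = tok
            continue
        # A token is either a nested sub-rule (list) or a gene identifier (str).
        v = eval_gpr(tok, expression) if isinstance(tok, list) else expression.get(tok)
        if v is not None:  # missing genes are simply skipped in this sketch
            values.append(v)
    if not values:
        return None
    return min(values) if op == "and" else sum(values)

# One sample, one rule: (GeneA) or (GeneB and GeneC)
rule = ["GeneA", "or", ["GeneB", "and", "GeneC"]]
sample = {"GeneA": 2.0, "GeneB": 5.0, "GeneC": 1.0}
print(eval_gpr(rule, sample))  # 3.0 = 2.0 + min(5.0, 1.0) under the assumed convention
```

A single-gene rule (no operator) falls through to the `sum` branch over one value, i.e. it returns the gene's own expression level.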