cobraxy: COBRAxy/rps_generator.py comparison

comparison COBRAxy/rps_generator.py @ 293:7b8d9de81a86 draft

Uploaded

author	francesco_lapi
date	Thu, 15 May 2025 18:23:52 +0000
parents	5dd2ab4637aa
children	3dccdf56cb24

comparison

equal deleted inserted replaced

-:31bc171a6ba5
+:7b8d9de81a86
-import re
-import sys
-import csv
 import math
 import argparse
 import numpy  as np
 import pickle as pk
 import pandas as pd
-from enum   import Enum
+from typing import Optional, List, Dict
-from typing import Optional, List, Dict, Tuple
 import utils.general_utils as utils
 import utils.reaction_parsing as reactionUtils
 ########################## argparse ##########################################
 help = 'your tool directory')
 parser.add_argument('-ol', '--out_log',
 help = "Output log")
 parser.add_argument('-id', '--input',
 type = str,
+required = True,
 help = 'input dataset')
 parser.add_argument('-rp', '--rps_output',
 type = str,
 required = True,
 help = 'rps output')
 """
 if str(name_data) == 'Dataset':
 return str(name_data) + '_' + str(count)
 else:
 return str(name_data)
 ############################ get_abund_data ####################################
 def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]:
 """
 Extracts abundance data and turns it into a series for a specific cell line from the dataset, whose rows are
 cell_line_name = dataset.columns[cell_line_index]
 abundances_series = dataset[cell_line_name][1:]
 return abundances_series
 ############################ clean_metabolite_name ####################################
 def clean_metabolite_name(name :str) -> str:
 """
 Removes some characters from a metabolite's name, provided as input, and makes it lowercase in order to simplify
 the search of a match in the dictionary of synonyms.
 Returns:
 str : a new string with the cleaned name.
 """
 return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower()
 ############################ get_metabolite_id ####################################
 def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str:
 """
 Looks through a dictionary of synonyms to find a match for a given metabolite's name.
 missing_list.append(metabolite)
 return missing_list
 ############################ calculate_rps ####################################
-def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str]) -> Dict[str, float]:
+def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str], substrateFreqTable: Dict[str, int]) -> Dict[str, float]:
 """
 Calculate the Reaction Propensity Scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample.
 The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient
-for each reaction using the provided coefficient and abundance values.
+for each reaction using the provided coefficient and abundance values. The value is then normalized, based on how frequent the metabolite is in the selected model's reactions,
+and log-transformed.
 Parameters:
 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
 abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances.
 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation.
 missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1.
+substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).
 Returns:
 dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores.
 """
 rps_scores = {}
 for reaction_name, substrates in reactions.items():
 total_contribution = 1
-metab_significant = False
+metab_significant  = False
 for metabolite, stoichiometry in substrates.items():
-temp = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite]
+abundance = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite]
 if metabolite not in black_list and metabolite not in missing_list:
 metab_significant = True
-total_contribution *= temp ** stoichiometry
+total_contribution += math.log((abundance + np.finfo(float).eps) / substrateFreqTable[metabolite]) * stoichiometry
 rps_scores[reaction_name] = total_contribution if metab_significant else math.nan
 return rps_scores
 ############################ rps_for_cell_lines ####################################
-def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]]) -> None:
+def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]], substrateFreqTable: Dict[str, int]) -> None:
 """
 Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file.
 Parameters:
 dataset : the dataset's data, by rows
 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values.
 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation.
 syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms.
+substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value).
 Returns:
 None
 """
 cell_lines = dataset[0][1:]
 missing_list = check_missing_metab(reactions, abundances_dict, len((cell_lines)))
 rps_scores :Dict[Dict[str, float]] = {}
 for pos, cell_line_name in enumerate(cell_lines):
 abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() }
-rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list)
+rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list, substrateFreqTable)
 df = pd.DataFrame.from_dict(rps_scores)
 df.index.name = 'Reactions'
 df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True)
 ############################ main ####################################
 def main(args:List[str] = None) -> None:
 """
 Initializes everything and sets the program in motion based on the frontend input arguments.
 dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader = False)
 if ARGS.reaction_choice == 'default':
 reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb'))
+substrateFreqTable = pk.load(open(ARGS.tool_dir + '/local/pickle files/substrate_frequencies.pickle', 'rb'))
 elif ARGS.reaction_choice == 'custom':
 reactions = reactionUtils.parse_custom_reactions(ARGS.custom)
+substrateFreqTable = {}
-rps_for_cell_lines(dataset, reactions, black_list, syn_dict)
+for _, substrates in reactions.items():
+for substrateName, _ in substrates.items():
+if substrateName not in substrateFreqTable: substrateFreqTable[substrateName] = 0
+substrateFreqTable[substrateName] += 1
+rps_for_cell_lines(dataset, reactions, black_list, syn_dict, substrateFreqTable)
 print('Execution succeded')
 ##############################################################################
-if __name__ == "__main__":
+if __name__ == "__main__": main()
-main()

Mercurial > repos > bimib > cobraxy

comparison COBRAxy/rps_generator.py @ 293:7b8d9de81a86 draft