Mercurial > repos > bimib > cobraxy
comparison COBRAxy/rps_generator.py @ 293:7b8d9de81a86 draft
Uploaded
| author | francesco_lapi |
|---|---|
| date | Thu, 15 May 2025 18:23:52 +0000 |
| parents | 5dd2ab4637aa |
| children | 3dccdf56cb24 |
comparison
equal
deleted
inserted
replaced
| 292:31bc171a6ba5 | 293:7b8d9de81a86 |
|---|---|
| 1 import re | |
| 2 import sys | |
| 3 import csv | |
| 4 import math | 1 import math |
| 5 import argparse | 2 import argparse |
| 6 | 3 |
| 7 import numpy as np | 4 import numpy as np |
| 8 import pickle as pk | 5 import pickle as pk |
| 9 import pandas as pd | 6 import pandas as pd |
| 10 | 7 |
| 11 from enum import Enum | 8 from typing import Optional, List, Dict |
| 12 from typing import Optional, List, Dict, Tuple | |
| 13 | 9 |
| 14 import utils.general_utils as utils | 10 import utils.general_utils as utils |
| 15 import utils.reaction_parsing as reactionUtils | 11 import utils.reaction_parsing as reactionUtils |
| 16 | 12 |
| 17 ########################## argparse ########################################## | 13 ########################## argparse ########################################## |
| 43 help = 'your tool directory') | 39 help = 'your tool directory') |
| 44 parser.add_argument('-ol', '--out_log', | 40 parser.add_argument('-ol', '--out_log', |
| 45 help = "Output log") | 41 help = "Output log") |
| 46 parser.add_argument('-id', '--input', | 42 parser.add_argument('-id', '--input', |
| 47 type = str, | 43 type = str, |
| 44 required = True, | |
| 48 help = 'input dataset') | 45 help = 'input dataset') |
| 49 parser.add_argument('-rp', '--rps_output', | 46 parser.add_argument('-rp', '--rps_output', |
| 50 type = str, | 47 type = str, |
| 51 required = True, | 48 required = True, |
| 52 help = 'rps output') | 49 help = 'rps output') |
| 68 """ | 65 """ |
| 69 if str(name_data) == 'Dataset': | 66 if str(name_data) == 'Dataset': |
| 70 return str(name_data) + '_' + str(count) | 67 return str(name_data) + '_' + str(count) |
| 71 else: | 68 else: |
| 72 return str(name_data) | 69 return str(name_data) |
| 73 | |
| 74 | 70 |
| 75 ############################ get_abund_data #################################### | 71 ############################ get_abund_data #################################### |
| 76 def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]: | 72 def get_abund_data(dataset: pd.DataFrame, cell_line_index:int) -> Optional[pd.Series]: |
| 77 """ | 73 """ |
| 78 Extracts abundance data and turns it into a series for a specific cell line from the dataset, which rows are | 74 Extracts abundance data and turns it into a series for a specific cell line from the dataset, which rows are |
| 94 cell_line_name = dataset.columns[cell_line_index] | 90 cell_line_name = dataset.columns[cell_line_index] |
| 95 abundances_series = dataset[cell_line_name][1:] | 91 abundances_series = dataset[cell_line_name][1:] |
| 96 | 92 |
| 97 return abundances_series | 93 return abundances_series |
| 98 | 94 |
| 99 | |
| 100 ############################ clean_metabolite_name #################################### | 95 ############################ clean_metabolite_name #################################### |
| 101 def clean_metabolite_name(name :str) -> str: | 96 def clean_metabolite_name(name :str) -> str: |
| 102 """ | 97 """ |
| 103 Removes some characters from a metabolite's name, provided as input, and makes it lowercase in order to simplify | 98 Removes some characters from a metabolite's name, provided as input, and makes it lowercase in order to simplify |
| 104 the search of a match in the dictionary of synonyms. | 99 the search of a match in the dictionary of synonyms. |
| 108 | 103 |
| 109 Returns: | 104 Returns: |
| 110 str : a new string with the cleaned name. | 105 str : a new string with the cleaned name. |
| 111 """ | 106 """ |
| 112 return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower() | 107 return "".join(ch for ch in name if ch not in ",;-_'([{ }])").lower() |
| 113 | |
| 114 | 108 |
| 115 ############################ get_metabolite_id #################################### | 109 ############################ get_metabolite_id #################################### |
| 116 def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str: | 110 def get_metabolite_id(name :str, syn_dict :Dict[str, List[str]]) -> str: |
| 117 """ | 111 """ |
| 118 Looks through a dictionary of synonyms to find a match for a given metabolite's name. | 112 Looks through a dictionary of synonyms to find a match for a given metabolite's name. |
| 155 missing_list.append(metabolite) | 149 missing_list.append(metabolite) |
| 156 | 150 |
| 157 return missing_list | 151 return missing_list |
| 158 | 152 |
| 159 ############################ calculate_rps #################################### | 153 ############################ calculate_rps #################################### |
| 160 def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str]) -> Dict[str, float]: | 154 def calculate_rps(reactions: Dict[str, Dict[str, int]], abundances: Dict[str, float], black_list: List[str], missing_list: List[str], substrateFreqTable: Dict[str, int]) -> Dict[str, float]: |
| 161 """ | 155 """ |
| 162 Calculate the Reaction Propensity scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample. | 156 Calculate the Reaction Propensity scores (RPS) based on the availability of reaction substrates, for (ideally) each input model reaction and for each sample. |
| 163 The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient | 157 The score is computed as the product of the concentrations of the reacting substances, with each concentration raised to a power equal to its stoichiometric coefficient |
| 164 for each reaction using the provided coefficient and abundance values. | 158 for each reaction using the provided coefficient and abundance values. The value is then normalized, based on how frequent the metabolite is in the selected model's reactions, |
| 159 and log-transformed. | |
| 165 | 160 |
| 166 Parameters: | 161 Parameters: |
| 167 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. | 162 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. |
| 168 abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances. | 163 abundances (dict): A dictionary representing metabolite abundances where keys are metabolite names and values are their corresponding abundances. |
| 169 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. | 164 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. |
| 170 missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1. | 165 missing_list (list): A list containing metabolite names that were missing in the original abundances dictionary and thus their values were set to 1. |
| 171 | 166 substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value). |
| 167 | |
| 172 Returns: | 168 Returns: |
| 173 dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores. | 169 dict: A dictionary containing Reaction Propensity Scores (RPS) where keys are reaction names and values are the corresponding RPS scores. |
| 174 """ | 170 """ |
| 175 rps_scores = {} | 171 rps_scores = {} |
| 176 | 172 |
| 177 for reaction_name, substrates in reactions.items(): | 173 for reaction_name, substrates in reactions.items(): |
| 178 total_contribution = 1 | 174 total_contribution = 1 |
| 179 metab_significant = False | 175 metab_significant = False |
| 180 for metabolite, stoichiometry in substrates.items(): | 176 for metabolite, stoichiometry in substrates.items(): |
| 181 temp = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite] | 177 abundance = 1 if math.isnan(abundances[metabolite]) else abundances[metabolite] |
| 182 if metabolite not in black_list and metabolite not in missing_list: | 178 if metabolite not in black_list and metabolite not in missing_list: |
| 183 metab_significant = True | 179 metab_significant = True |
| 184 total_contribution *= temp ** stoichiometry | 180 |
| 181 total_contribution += math.log((abundance + np.finfo(float).eps) / substrateFreqTable[metabolite]) * stoichiometry | |
| 185 | 182 |
| 186 rps_scores[reaction_name] = total_contribution if metab_significant else math.nan | 183 rps_scores[reaction_name] = total_contribution if metab_significant else math.nan |
| 187 | 184 |
| 188 return rps_scores | 185 return rps_scores |
| 189 | 186 |
| 190 | |
| 191 ############################ rps_for_cell_lines #################################### | 187 ############################ rps_for_cell_lines #################################### |
| 192 def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]]) -> None: | 188 def rps_for_cell_lines(dataset: List[List[str]], reactions: Dict[str, Dict[str, int]], black_list: List[str], syn_dict: Dict[str, List[str]], substrateFreqTable: Dict[str, int]) -> None: |
| 193 """ | 189 """ |
| 194 Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file. | 190 Calculate Reaction Propensity Scores (RPS) for each cell line represented in the dataframe and creates an output file. |
| 195 | 191 |
| 196 Parameters: | 192 Parameters: |
| 197 dataset : the dataset's data, by rows | 193 dataset : the dataset's data, by rows |
| 198 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. | 194 reactions (dict): A dictionary representing reactions where keys are reaction names and values are dictionaries containing metabolite names as keys and stoichiometric coefficients as values. |
| 199 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. | 195 black_list (list): A list containing metabolite names that should be excluded from the RPS calculation. |
| 200 syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms. | 196 syn_dict (dict): A dictionary where keys are general metabolite names and values are lists of possible synonyms. |
| 197 substrateFreqTable (dict): A dictionary where each metabolite name (key) is associated with how many times it shows up in the model's reactions (value). | |
| 201 | 198 |
| 202 Returns: | 199 Returns: |
| 203 None | 200 None |
| 204 """ | 201 """ |
| 205 cell_lines = dataset[0][1:] | 202 cell_lines = dataset[0][1:] |
| 213 missing_list = check_missing_metab(reactions, abundances_dict, len((cell_lines))) | 210 missing_list = check_missing_metab(reactions, abundances_dict, len((cell_lines))) |
| 214 | 211 |
| 215 rps_scores :Dict[Dict[str, float]] = {} | 212 rps_scores :Dict[Dict[str, float]] = {} |
| 216 for pos, cell_line_name in enumerate(cell_lines): | 213 for pos, cell_line_name in enumerate(cell_lines): |
| 217 abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() } | 214 abundances = { metab : abundances[pos] for metab, abundances in abundances_dict.items() } |
| 218 rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list) | 215 rps_scores[cell_line_name] = calculate_rps(reactions, abundances, black_list, missing_list, substrateFreqTable) |
| 219 | 216 |
| 220 df = pd.DataFrame.from_dict(rps_scores) | 217 df = pd.DataFrame.from_dict(rps_scores) |
| 221 | 218 |
| 222 df.index.name = 'Reactions' | 219 df.index.name = 'Reactions' |
| 223 df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True) | 220 df.to_csv(ARGS.rps_output, sep='\t', na_rep='None', index=True) |
| 224 | |
| 225 | 221 |
| 226 ############################ main #################################### | 222 ############################ main #################################### |
| 227 def main(args:List[str] = None) -> None: | 223 def main(args:List[str] = None) -> None: |
| 228 """ | 224 """ |
| 229 Initializes everything and sets the program in motion based on the fronted input arguments. | 225 Initializes everything and sets the program in motion based on the fronted input arguments. |
| 243 | 239 |
| 244 dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader = False) | 240 dataset = utils.readCsv(utils.FilePath.fromStrPath(ARGS.input), '\t', skipHeader = False) |
| 245 | 241 |
| 246 if ARGS.reaction_choice == 'default': | 242 if ARGS.reaction_choice == 'default': |
| 247 reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb')) | 243 reactions = pk.load(open(ARGS.tool_dir + '/local/pickle files/reactions.pickle', 'rb')) |
| 244 substrateFreqTable = pk.load(open(ARGS.tool_dir + '/local/pickle files/substrate_frequencies.pickle', 'rb')) | |
| 248 | 245 |
| 249 elif ARGS.reaction_choice == 'custom': | 246 elif ARGS.reaction_choice == 'custom': |
| 250 reactions = reactionUtils.parse_custom_reactions(ARGS.custom) | 247 reactions = reactionUtils.parse_custom_reactions(ARGS.custom) |
| 251 | 248 substrateFreqTable = {} |
| 252 rps_for_cell_lines(dataset, reactions, black_list, syn_dict) | 249 for _, substrates in reactions.items(): |
| 250 for substrateName, _ in substrates.items(): | |
| 251 if substrateName not in substrateFreqTable: substrateFreqTable[substrateName] = 0 | |
| 252 substrateFreqTable[substrateName] += 1 | |
| 253 | |
| 254 rps_for_cell_lines(dataset, reactions, black_list, syn_dict, substrateFreqTable) | |
| 253 print('Execution succeded') | 255 print('Execution succeded') |
| 254 | 256 |
| 255 ############################################################################## | 257 ############################################################################## |
| 256 if __name__ == "__main__": | 258 if __name__ == "__main__": main() |
| 257 main() |
