489
|
1 """
|
|
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
|
|
3
|
|
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
|
|
5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
|
|
6 """
|
93
|
7 from __future__ import division
|
|
8 import sys
|
|
9 import argparse
|
|
10 import pandas as pd
|
505
|
11 import numpy as np
|
93
|
12 import utils.general_utils as utils
|
505
|
13 from typing import List, Dict
|
|
14 import ast
|
93
|
15
|
|
# Accumulator for non-fatal error messages (not populated in this chunk).
ERRORS = []
########################## argparse ##########################################
# Parsed command-line arguments; assigned by main() before any other function
# reads it (load_custom_rules and the warning logger rely on it).
ARGS: argparse.Namespace
|
147
|
def process_args(args: List[str] = None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the RAS generation tool.

    Args:
        args: Argument vector to parse; when None, argparse falls back to
            sys.argv[1:].

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    arg_parser = argparse.ArgumentParser(
        usage='%(prog)s [options]',
        description="process some value's genes to create a comparison's map.")

    arg_parser.add_argument(
        "-rl", "--model_upload", type=str,
        help="path to input file containing the rules")

    # Galaxy converts files into .dat, this helps infer the original extension when needed.
    arg_parser.add_argument(
        "-rn", "--model_upload_name", type=str,
        help="custom rules name")

    arg_parser.add_argument(
        '-n', '--none',
        type=utils.Bool("none"), default=True,
        help='compute Nan values')

    arg_parser.add_argument(
        '-td', '--tool_dir',
        type=str, required=True,
        help='your tool directory')

    arg_parser.add_argument(
        '-ol', '--out_log',
        type=str,
        help="Output log")

    arg_parser.add_argument(
        '-in', '--input',
        type=str,
        help='input dataset')

    arg_parser.add_argument(
        '-ra', '--ras_output',
        type=str, required=True,
        help='ras output')

    return arg_parser.parse_args(args)
|
93
|
66
|
|
67 ############################ dataset input ####################################
|
|
def read_dataset(data: str, name: str) -> pd.DataFrame:
    """
    Read a tab-separated dataset (genes x samples) into a pandas DataFrame.

    The first column is used as the index (gene identifiers) and every
    remaining column is cast to float (expression values).

    Args:
        data (str): Path to the tabular file containing the dataset.
        name (str): Name of the dataset, used in error messages.

    Returns:
        pandas.DataFrame: DataFrame containing the dataset.

    Raises:
        SystemExit: If the file is empty, contains non-numeric expression
            values, or has fewer than two sample columns (wrong format).
    """
    try:
        dataset = pd.read_csv(data, sep='\t', header=0, engine='python', index_col=0)
        # Expression values must be numeric: a failure here means the file is
        # not a genes-x-samples numeric table. Previously this ValueError
        # escaped as a raw traceback instead of the "wrong format" abort.
        dataset = dataset.astype(float)
    except (pd.errors.EmptyDataError, ValueError):
        sys.exit('Execution aborted: wrong file format of ' + name + '\n')
    if len(dataset.columns) < 2:
        sys.exit('Execution aborted: wrong file format of ' + name + '\n')
    return dataset
|
|
91
|
|
92
|
505
|
def _parse_rules_file(datFilePath, delimiter: str) -> Dict[str, str]:
    """
    Parse the rules file with one specific delimiter.

    Args:
        datFilePath: utils.FilePath of the uploaded rules file.
        delimiter (str): Field separator to try ("\t" or ",").

    Returns:
        Dict[str, str]: dict mapping reaction IDs to GPR strings.

    Raises:
        ValueError: If the file is empty or contains only a header row.
    """
    rows = utils.readCsv(datFilePath, delimiter = delimiter, skipHeader=False)

    # Check emptiness first: previously the header-only check ran first, so an
    # empty file produced the misleading "1 column" error message.
    if not rows:
        raise ValueError("Model tabular file is empty.")

    if len(rows) <= 1:
        raise ValueError("Model tabular with 1 column is not supported.")

    id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")

    dict_rule = {}
    for line in rows[1:]:
        # Rows shorter than the GPR column are malformed; warn and skip.
        if len(line) <= idx_gpr:
            utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
            continue
        dict_rule[line[id_idx]] = line[idx_gpr]
    return dict_rule


def load_custom_rules() -> Dict[str, str]:
    """
    Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
    performed, significantly impacting the runtime.

    The file is parsed first assuming tab delimiters; on any failure a second
    attempt is made with comma delimiters.

    Returns:
        Dict[str, str]: dict mapping reaction IDs to rules.

    Raises:
        ValueError: If neither delimiter yields rules, or no valid rule is found.
    """
    datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat

    try:
        # First, try using a tab delimiter
        dict_rule = _parse_rules_file(datFilePath, "\t")
    except Exception as e:
        # If parsing with tabs fails, try comma delimiter
        try:
            dict_rule = _parse_rules_file(datFilePath, ",")
        except Exception as e2:
            raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")

    if not dict_rule:
        raise ValueError("No valid rules found in the uploaded file. Please check the file format.")

    # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
    return dict_rule
|
|
151
|
401
|
152
|
505
|
153
|
|
def computeRAS(
    dataset, gene_rules, rules_total_string,
    or_function=np.sum,  # type of operation to do in case of an or expression (max, sum, mean)
    and_function=np.min,  # type of operation to do in case of an and expression (min, sum)
    ignore_nan = True
    ):
    """
    Compute Reaction Activity Scores (RAS) for every reaction and sample.

    Args:
        dataset (pd.DataFrame): expression matrix, genes on the index,
            samples/cell lines on the columns.
        gene_rules (dict): maps reaction IDs to GPR rule strings.
        rules_total_string (list): all tokens appearing in the rules
            (despite the name, this is a list of strings, not a string).
        or_function: aggregation applied across genes for "or" expressions.
        and_function: aggregation applied across genes for "and" expressions.
        ignore_nan (bool): if True, genes missing from the dataset are simply
            dropped from a rule; if False, any missing gene makes the whole
            reaction's RAS NaN.

    Returns:
        pd.DataFrame: RAS matrix (reactions x samples), or -1 when no gene of
        the dataset appears in the model.
    """

    # NOTE(review): unused — kept for reference of the recognized tokens.
    logic_operators = ['and', 'or', '(', ')']
    reactions = list(gene_rules.keys())

    # Build the dictionary for the GPRs: normalize operator case and pad
    # parentheses so that str.split() tokenizes rules cleanly, then group
    # identical rules so each distinct rule is evaluated only once.
    df_reactions = pd.DataFrame(index=reactions)
    gene_rules = [gene_rules[reaction_id].replace("OR", "or").replace("AND", "and").replace("(", "( ").replace(")", " )") for reaction_id in reactions]
    df_reactions['rule'] = gene_rules
    df_reactions = df_reactions.reset_index()
    df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))

    # rule string -> sorted list of reaction IDs sharing that rule
    dict_rule_reactions = df_reactions.to_dict()['index']

    # build useful structures for RAS computation:
    # genes shared between the count matrix and the model rules
    genes = dataset.index.intersection(rules_total_string)

    # check if there is one gene at least
    if len(genes) == 0:
        print("ERROR: no gene of the count matrix is in the metabolic model!")
        print(" are you sure that the gene annotation is the same for the model and the count matrix?")
        return -1

    cell_ids = list(dataset.columns)
    count_df_filtered = dataset.loc[genes]
    # "-" is not valid in Python identifiers, so indices are rewritten for the
    # ast-based evaluation of complex rules further below.
    # NOTE(review): rule text is only rewritten ("-" -> "_") in the complex
    # branch; the simple branches look genes up with their original names.
    # Confirm gene IDs never contain "-" on those paths, else .loc raises.
    count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_"))

    # one row per distinct rule, one column per sample; NaN = not computable
    ras_df = np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)

    # for loop on rules
    ind = 0
    for rule, reaction_ids in dict_rule_reactions.items():
        if len(rule) != 0:
            # there is one gene at least in the formula
            rule_split = rule.split()
            rule_split_elements = list(set(rule_split))  # genes in formula

            # which genes are in the count matrix?
            genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
            genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes]

            if len(genes_in_count_matrix) > 0:  # there is at least one gene in the count matrix
                if len(rule_split) == 1:
                    # one gene --> one reaction: RAS is the expression itself
                    ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
                else:
                    if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
                        # a gene is missing and NaN must propagate to the RAS
                        ras_df[ind] = np.nan
                    else:
                        # more genes in the formula
                        check_only_and = ("and" in rule and "or" not in rule)  # only and
                        check_only_or = ("or" in rule and "and" not in rule)  # only or
                        if check_only_and or check_only_or:
                            # or/and sequence: a single aggregation over the
                            # gene axis handles every sample at once
                            matrix = count_df_filtered.loc[genes_in_count_matrix].values
                            # compute for all cells
                            if check_only_and:
                                ras_df[ind] = and_function(matrix, axis=0)
                            else:
                                ras_df[ind] = or_function(matrix, axis=0)
                        else:
                            # complex expression (e.g. A or (B and C)):
                            # evaluated per sample via the ast walker
                            data = count_df_filtered.loc[genes_in_count_matrix]  # dataframe of genes in the GPRs

                            rule = rule.replace("-", "_")
                            tree = ast.parse(rule, mode="eval").body
                            values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
                            for j, values in enumerate(values_by_cell):
                                ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)

        ind += 1

    # create the dataframe of ras (rules x samples)
    ras_df = pd.DataFrame(data=ras_df, index=range(len(dict_rule_reactions)), columns=cell_ids)
    ras_df['Reactions'] = [reaction_ids for rule, reaction_ids in dict_rule_reactions.items()]

    # create the reaction dataframe for ras (reactions x samples):
    # explode duplicates the rule's row for every reaction that shares it
    ras_df = ras_df.explode("Reactions").set_index("Reactions")

    return ras_df
|
|
240
|
|
241 # function to evalute a complex boolean expression e.g. A or (B and C)
|
|
def _evaluate_ast(node, values, or_function, and_function):
    """
    Recursively evaluate a boolean GPR expression tree (e.g. "A or (B and C)").

    Args:
        node: an ast node (BoolOp / Name / Constant).
        values (dict): gene name -> expression value for a single sample.
        or_function: aggregation applied to the resolved operands of an "or".
        and_function: aggregation applied to the resolved operands of an "and".

    Returns:
        The aggregated value, or None when no operand could be resolved.

    Raises:
        ValueError: if the expression contains an unsupported node type.
    """
    if isinstance(node, ast.BoolOp):
        # Recurse on the operands, discarding genes that resolved to None
        # (i.e. genes absent from the count matrix).
        resolved = []
        for child in node.values:
            child_value = _evaluate_ast(child, values, or_function, and_function)
            if child_value is not None:
                resolved.append(child_value)

        if not resolved:
            return None

        resolved = [
            np.array(v) if isinstance(v, (list, np.ndarray)) else v
            for v in resolved
        ]

        if isinstance(node.op, ast.Or):
            return or_function(resolved)
        if isinstance(node.op, ast.And):
            return and_function(resolved)
        # BoolOp operators are only Or/And; mirror the original's implicit None.
        return None

    if isinstance(node, ast.Name):
        # Unknown genes resolve to None and are filtered out above.
        return values.get(node.id, None)

    if isinstance(node, ast.Constant):
        return node.value

    raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
|
|
264
|
147
|
def main(args: List[str] = None) -> None:
    """
    Initializes everything and sets the program in motion based on the frontend input arguments.

    Reads the expression dataset and the GPR rules, computes the RAS matrix,
    and writes it to ARGS.ras_output as a tab-separated file.

    Returns:
        None
    """
    # get args from frontend (related xml)
    global ARGS
    ARGS = process_args(args)

    # read dataset and remove versioning from gene names (e.g. "ENSG1.5" -> "ENSG1")
    dataset = read_dataset(ARGS.input, "dataset")
    dataset.index = [str(el.split(".")[0]) for el in dataset.index]

    # load GPR rules
    rules = load_custom_rules()

    # collect every token appearing in the GPRs (parentheses stripped); the
    # intersection with the dataset index later selects the model's genes
    rule_tokens = set()
    for rule in rules.values():
        rule_tokens.update(rule.replace("(", "").replace(")", "").split(" "))
    rules_total_string = list(rule_tokens)

    # check if nan values must be ignored in the GPR:
    # True: (A or nan -> A); False: (A or nan -> nan)
    # (debug print of ARGS.none removed)
    ignore_nan = bool(ARGS.none)

    # compute ras
    ras_df = computeRAS(dataset, rules,
                        rules_total_string,
                        or_function=np.sum,  # type of operation to do in case of an or expression (max, sum, mean)
                        and_function=np.min,
                        ignore_nan=ignore_nan)

    # save to csv and replace nan with None
    ras_df.replace(np.nan, "None").to_csv(ARGS.ras_output, sep='\t')

    print("Execution succeeded")
|
93
|
310
|
|
###############################################################################
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()