comparison COBRAxy/ras_generator.py @ 510:c17c6c9d112c draft

Uploaded
author francesco_lapi
date Tue, 07 Oct 2025 09:48:18 +0000
parents 5956dcf94277
children f32d3c9089fc
comparison
equal deleted inserted replaced
509:5956dcf94277 510:c17c6c9d112c
174 # build useful structures for RAS computation 174 # build useful structures for RAS computation
175 genes =dataset.index.intersection(rules_total_string) 175 genes =dataset.index.intersection(rules_total_string)
176 176
177 #check if there is one gene at least 177 #check if there is one gene at least
178 if len(genes)==0: 178 if len(genes)==0:
179 print("ERROR: no gene of the count matrix is in the metabolic model!") 179 raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.")
180 print(" are you sure that the gene annotation is the same for the model and the count matrix?")
181 return -1
182 180
183 cell_ids = list(dataset.columns) 181 cell_ids = list(dataset.columns)
184 count_df_filtered = dataset.loc[genes] 182 count_df_filtered = dataset.loc[genes]
185 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_")) 183 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_").replace(":", "_"))
186 184
187 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan) 185 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)
188 186
189 # for loop on rules 187 # for loop on rules
188 genes_not_mapped=[]
190 ind = 0 189 ind = 0
191 for rule, reaction_ids in dict_rule_reactions.items(): 190 for rule, reaction_ids in dict_rule_reactions.items():
192 if len(rule) != 0: 191 if len(rule) != 0:
193 # there is one gene at least in the formula 192 # there is one gene at least in the formula
194 rule = rule.replace("-","_") 193 warning_rule="_"
194 if "-" in rule:
195 warning_rule="-"
196 if ":" in rule:
197 warning_rule=":"
198 rule_orig=rule.split().copy() #original rule in list
199 rule = rule.replace(warning_rule,"_")
200 #modified rule
195 rule_split = rule.split() 201 rule_split = rule.split()
196 rule_split_elements = list(set(rule_split)) # genes in formula 202 rule_split_elements = list(filter(lambda x: x not in logic_operators, rule_split)) # remove of all logical operators
203 rule_split_elements = list(set(rule_split_elements)) # genes in formula
197 204
198 # which genes are in the count matrix? 205 # which genes are in the count matrix?
199 genes_in_count_matrix = [el for el in rule_split_elements if el in genes] 206 genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
200 genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes] 207 genes_notin_count_matrix = []
201 208 for el in rule_split_elements:
202 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix 209 if el not in genes: #not present in original dataset
210 if el.replace("_",warning_rule) in rule_orig:
211 genes_notin_count_matrix.append(el.replace("_",warning_rule))
212 else:
213 genes_notin_count_matrix.append(el)
214 genes_not_mapped.extend(genes_notin_count_matrix)
215
216 # add genes not present in the data
217 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
203 if len(rule_split) == 1: 218 if len(rule_split) == 1:
204 #one gene --> one reaction 219 #one gene --> one reaction
205 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix] 220 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
206 else: 221 else:
207 if len(genes_notin_count_matrix) > 0 and ignore_nan == False: 222 if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
233 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()] 248 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
234 249
235 #create the reaction dataframe for ras (reactions x samples) 250 #create the reaction dataframe for ras (reactions x samples)
236 ras_df = ras_df.explode("Reactions").set_index("Reactions") 251 ras_df = ras_df.explode("Reactions").set_index("Reactions")
237 252
238 return ras_df 253 #total genes not mapped from the gpr
239 254 genes_not_mapped = sorted(set(genes_not_mapped))
255
256 return ras_df,genes_not_mapped
257
258 # function to evalute a complex boolean expression e.g. A or (B and C)
240 # function to evalute a complex boolean expression e.g. A or (B and C) 259 # function to evalute a complex boolean expression e.g. A or (B and C)
241 def _evaluate_ast( node, values,or_function,and_function): 260 def _evaluate_ast( node, values,or_function,and_function):
242 if isinstance(node, ast.BoolOp): 261 if isinstance(node, ast.BoolOp):
243 # Ricorsione sugli argomenti 262
244 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values] 263 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
245 264
246 vals = [v for v in vals if v is not None] 265 vals = [v for v in vals if v is not None]
247 if not vals: 266 if not vals:
248 return None 267 return np.nan
249 268
250 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals] 269 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
251 270
252 if isinstance(node.op, ast.Or): 271 if isinstance(node.op, ast.Or):
253 return or_function(vals) 272 return or_function(vals)
255 return and_function(vals) 274 return and_function(vals)
256 275
257 elif isinstance(node, ast.Name): 276 elif isinstance(node, ast.Name):
258 return values.get(node.id, None) 277 return values.get(node.id, None)
259 elif isinstance(node, ast.Constant): 278 elif isinstance(node, ast.Constant):
260 return node.value 279 key = str(node.value) #convert in str
280 return values.get(key, None)
261 else: 281 else:
262 raise ValueError(f"Error in boolean expression: {ast.dump(node)}") 282 raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
263 283
264 def main(args:List[str] = None) -> None: 284 def main(args:List[str] = None) -> None:
265 """ 285 """
272 global ARGS 292 global ARGS
273 ARGS = process_args(args) 293 ARGS = process_args(args)
274 294
275 # read dataset and remove versioning from gene names 295 # read dataset and remove versioning from gene names
276 dataset = read_dataset(ARGS.input, "dataset") 296 dataset = read_dataset(ARGS.input, "dataset")
277 dataset.index = [str(el.split(".")[0]) for el in dataset.index] 297 orig_gene_list=dataset.index.copy()
298 dataset.index = [str(el.split(".")[0]) for el in dataset.index]
299
300 if any(dataset.index.duplicated(keep=False)):
301 list_str=", ".join(orig_gene_list[dataset.index.duplicated(keep=False)])
302 raise ValueError(f"ERROR: Duplicate entries in the gene dataset. The following genes are duplicated: "+list_str)
278 303
279 #load GPR rules 304 #load GPR rules
280 rules = load_custom_rules() 305 rules = load_custom_rules()
281 306
282 #create a list of all the gpr 307 #create a list of all the gpr
284 for id,rule in rules.items(): 309 for id,rule in rules.items():
285 rules_total_string+=rule.replace("(","").replace(")","") + " " 310 rules_total_string+=rule.replace("(","").replace(")","") + " "
286 rules_total_string=list(set(rules_total_string.split(" "))) 311 rules_total_string=list(set(rules_total_string.split(" ")))
287 312
288 #check if nan value must be ignored in the GPR 313 #check if nan value must be ignored in the GPR
289 print(ARGS.none,"\n\n\n*************",ARGS.none==True)
290 if ARGS.none: 314 if ARGS.none:
291 # #e.g. (A or nan --> A) 315 # #e.g. (A or nan --> A)
292 ignore_nan = True 316 ignore_nan = True
293 else: 317 else:
294 #e.g. (A or nan --> nan) 318 #e.g. (A or nan --> nan)
295 ignore_nan = False 319 ignore_nan = False
296 320
297 #compure ras 321 #compure ras
298 ras_df=computeRAS(dataset,rules, 322 ras_df,genes_not_mapped=computeRAS(dataset,rules,
299 rules_total_string, 323 rules_total_string,
300 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean) 324 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
301 and_function=np.min, 325 and_function=np.min,
302 ignore_nan=ignore_nan) 326 ignore_nan=ignore_nan)
303 327
304 #save to csv and replace nan with None 328 #save to csv and replace nan with None
305 ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t') 329 ras_df.replace([np.nan,None],"None").to_csv(ARGS.ras_output, sep = '\t')
306 330
307 #print 331 #report genes not present in the data
332 if len(genes_not_mapped)>0:
333 genes_not_mapped_str=", ".join(genes_not_mapped)
334 utils.logWarning(
335 f"The following genes are mentioned in the GPR rules but don't appear in the dataset: "+genes_not_mapped_str,
336 ARGS.out_log)
337
308 print("Execution succeeded") 338 print("Execution succeeded")
309 339
310 ############################################################################### 340 ###############################################################################
311 if __name__ == "__main__": 341 if __name__ == "__main__":
312 main() 342 main()