annotate COBRAxy/ras_generator.py @ 511:0cb727788cae draft default tip

Uploaded
author francesco_lapi
date Tue, 07 Oct 2025 10:31:33 +0000
parents c17c6c9d112c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
1 """
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
3
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
6 """
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
7 from __future__ import division
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
8 import sys
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
9 import argparse
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
10 import pandas as pd
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
11 import numpy as np
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
12 import utils.general_utils as utils
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
13 from typing import List, Dict
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
14 import ast
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
15
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
16 ERRORS = []
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
17 ########################## argparse ##########################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
18 ARGS :argparse.Namespace
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
19 def process_args(args:List[str] = None) -> argparse.Namespace:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
20 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
21 Processes command-line arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
22
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
23 Args:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
24 args (list): List of command-line arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
25
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
26 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
27 Namespace: An object containing parsed arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
28 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
29 parser = argparse.ArgumentParser(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
30 usage = '%(prog)s [options]',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
31 description = "process some value's genes to create a comparison's map.")
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
32
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
33 parser.add_argument("-rl", "--model_upload", type = str,
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
34 help = "path to input file containing the rules")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
35
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
36 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
37 # Galaxy converts files into .dat, this helps infer the original extension when needed.
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
38
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
39 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
40 '-n', '--none',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
41 type = utils.Bool("none"), default = True,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
42 help = 'compute Nan values')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
43
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
44 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
45 '-td', '--tool_dir',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
46 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
47 required = True, help = 'your tool directory')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
48
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
49 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
50 '-ol', '--out_log',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
51 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
52 help = "Output log")
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
53
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
54 parser.add_argument(
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
55 '-in', '--input',
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
56 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
57 help = 'input dataset')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
58
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
59 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
60 '-ra', '--ras_output',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
61 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
62 required = True, help = 'ras output')
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
63
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
64
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
65 return parser.parse_args(args)
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
66
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
67 ############################ dataset input ####################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
68 def read_dataset(data :str, name :str) -> pd.DataFrame:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
69 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
70 Read a dataset from a CSV file and return it as a pandas DataFrame.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
71
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
72 Args:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
73 data (str): Path to the CSV file containing the dataset.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
74 name (str): Name of the dataset, used in error messages.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
75
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
76 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
77 pandas.DataFrame: DataFrame containing the dataset.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
78
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
79 Raises:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
80 pd.errors.EmptyDataError: If the CSV file is empty.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
81 sys.exit: If the CSV file has the wrong format, the execution is aborted.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
82 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
83 try:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
84 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
85 dataset = dataset.astype(float)
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
86 except pd.errors.EmptyDataError:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
87 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
88 if len(dataset.columns) < 2:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
89 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
90 return dataset
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
91
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
92
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
93 def load_custom_rules() -> Dict[str,str]:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
94 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
95 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
96 performed, significantly impacting the runtime.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
97
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
98 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
99 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
100 """
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
101 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
102
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
103 dict_rule = {}
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
104
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
105 try:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
106 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
107 if len(rows) <= 1:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
108 raise ValueError("Model tabular with 1 column is not supported.")
381
0a3ca20848f3 Uploaded
francesco_lapi
parents: 309
diff changeset
109
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
110 if not rows:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
111 raise ValueError("Model tabular is file is empty.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
112
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
113 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
114
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
115 # First, try using a tab delimiter
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
116 for line in rows[1:]:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
117 if len(line) <= idx_gpr:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
118 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
119 continue
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
120
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
121 dict_rule[line[id_idx]] = line[idx_gpr]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
122
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
123 except Exception as e:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
124 # If parsing with tabs fails, try comma delimiter
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
125 try:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
126 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
127
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
128 if len(rows) <= 1:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
129 raise ValueError("Model tabular with 1 column is not supported.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
130
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
131 if not rows:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
132 raise ValueError("Model tabular is file is empty.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
133
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
134 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
135
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
136 # Try again parsing row content with the GPR column using comma-separated values
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
137 for line in rows[1:]:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
138 if len(line) <= idx_gpr:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
139 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
140 continue
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
141
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
142 dict_rule[line[id_idx]] =line[idx_gpr]
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
143
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
144 except Exception as e2:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
145 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
146
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
147 if not dict_rule:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
148 raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
149 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
150 return dict_rule
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
151
401
6c7ddf68381a Uploaded
francesco_lapi
parents: 400
diff changeset
152
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
153
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
154 def computeRAS(
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
155 dataset,gene_rules,rules_total_string,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
156 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
157 and_function=np.min, # type of operation to do in case of an and expression(min, sum)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
158 ignore_nan = True
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
159 ):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
160
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
161
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
162 logic_operators = ['and', 'or', '(', ')']
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
163 reactions=list(gene_rules.keys())
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
164
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
165 # Build the dictionary for the GPRs
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
166 df_reactions = pd.DataFrame(index=reactions)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
167 gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
168 df_reactions['rule'] = gene_rules
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
169 df_reactions = df_reactions.reset_index()
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
170 df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
171
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
172 dict_rule_reactions = df_reactions.to_dict()['index']
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
173
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
174 # build useful structures for RAS computation
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
175 genes =dataset.index.intersection(rules_total_string)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
176
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
177 #check if there is one gene at least
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
178 if len(genes)==0:
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
179 raise ValueError("ERROR: No genes from the count matrix match the metabolic model. Check that gene annotations are consistent between model and dataset.")
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
180
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
181 cell_ids = list(dataset.columns)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
182 count_df_filtered = dataset.loc[genes]
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
183 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_").replace(":", "_"))
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
184
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
185 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
186
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
187 # for loop on rules
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
188 genes_not_mapped=[]
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
189 ind = 0
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
190 for rule, reaction_ids in dict_rule_reactions.items():
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
191 if len(rule) != 0:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
192 # there is one gene at least in the formula
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
193 warning_rule="_"
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
194 if "-" in rule:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
195 warning_rule="-"
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
196 if ":" in rule:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
197 warning_rule=":"
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
198 rule_orig=rule.split().copy() #original rule in list
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
199 rule = rule.replace(warning_rule,"_")
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
200 #modified rule
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
201 rule_split = rule.split()
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
202 rule_split_elements = list(filter(lambda x: x not in logic_operators, rule_split)) # remove of all logical operators
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
203 rule_split_elements = list(set(rule_split_elements)) # genes in formula
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
204
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
205 # which genes are in the count matrix?
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
206 genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
207 genes_notin_count_matrix = []
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
208 for el in rule_split_elements:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
209 if el not in genes: #not present in original dataset
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
210 if el.replace("_",warning_rule) in rule_orig:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
211 genes_notin_count_matrix.append(el.replace("_",warning_rule))
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
212 else:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
213 genes_notin_count_matrix.append(el)
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
214 genes_not_mapped.extend(genes_notin_count_matrix)
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
215
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
216 # add genes not present in the data
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
217 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
218 if len(rule_split) == 1:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
219 #one gene --> one reaction
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
220 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
221 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
222 if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
223 ras_df[ind] = np.nan
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
224 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
225 # more genes in the formula
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
226 check_only_and=("and" in rule and "or" not in rule) #only and
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
227 check_only_or=("or" in rule and "and" not in rule) #only or
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
228 if check_only_and or check_only_or:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
229 #or/and sequence
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
230 matrix = count_df_filtered.loc[genes_in_count_matrix].values
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
231 #compute for all cells
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
232 if check_only_and:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
233 ras_df[ind] = and_function(matrix, axis=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
234 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
235 ras_df[ind] = or_function(matrix, axis=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
236 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
237 # complex expression (e.g. A or (B and C))
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
238 data = count_df_filtered.loc[genes_in_count_matrix] # dataframe of genes in the GPRs
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
239 tree = ast.parse(rule, mode="eval").body
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
240 values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
241 for j, values in enumerate(values_by_cell):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
242 ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
243
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
244 ind +=1
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
245
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
246 #create the dataframe of ras (rules x samples)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
247 ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
248 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
249
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
250 #create the reaction dataframe for ras (reactions x samples)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
251 ras_df = ras_df.explode("Reactions").set_index("Reactions")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
252
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
253 #total genes not mapped from the gpr
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
254 genes_not_mapped = sorted(set(genes_not_mapped))
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
255
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
256 return ras_df,genes_not_mapped
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
257
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
258 # function to evalute a complex boolean expression e.g. A or (B and C)
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
259 # function to evalute a complex boolean expression e.g. A or (B and C)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
260 def _evaluate_ast( node, values,or_function,and_function):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
261 if isinstance(node, ast.BoolOp):
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
262
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
263 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
264
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
265 vals = [v for v in vals if v is not None]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
266 if not vals:
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
267 return np.nan
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
268
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
269 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
270
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
271 if isinstance(node.op, ast.Or):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
272 return or_function(vals)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
273 elif isinstance(node.op, ast.And):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
274 return and_function(vals)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
275
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
276 elif isinstance(node, ast.Name):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
277 return values.get(node.id, None)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
278 elif isinstance(node, ast.Constant):
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
279 key = str(node.value) #convert in str
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
280 return values.get(key, None)
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
281 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
282 raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
283
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
284 def main(args:List[str] = None) -> None:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
285 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
286 Initializes everything and sets the program in motion based on the fronted input arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
287
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
288 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
289 None
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
290 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
291 # get args from frontend (related xml)
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
292 global ARGS
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
293 ARGS = process_args(args)
309
38c9a958ea78 Uploaded
francesco_lapi
parents: 266
diff changeset
294
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
295 # read dataset and remove versioning from gene names
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
296 dataset = read_dataset(ARGS.input, "dataset")
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
297 orig_gene_list=dataset.index.copy()
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
298 dataset.index = [str(el.split(".")[0]) for el in dataset.index]
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
299
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
300 if any(dataset.index.duplicated(keep=False)):
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
301 list_str=", ".join(orig_gene_list[dataset.index.duplicated(keep=False)])
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
302 raise ValueError(f"ERROR: Duplicate entries in the gene dataset. The following genes are duplicated: "+list_str)
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
303
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
304 #load GPR rules
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
305 rules = load_custom_rules()
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
306
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
307 #create a list of all the gpr
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
308 rules_total_string=""
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
309 for id,rule in rules.items():
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
310 rules_total_string+=rule.replace("(","").replace(")","") + " "
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
311 rules_total_string=list(set(rules_total_string.split(" ")))
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
312
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
313 #check if nan value must be ignored in the GPR
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
314 if ARGS.none:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
315 # #e.g. (A or nan --> A)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
316 ignore_nan = True
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
317 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
318 #e.g. (A or nan --> nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
319 ignore_nan = False
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
320
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
321 #compure ras
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
322 ras_df,genes_not_mapped=computeRAS(dataset,rules,
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
323 rules_total_string,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
324 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
325 and_function=np.min,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
326 ignore_nan=ignore_nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
327
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
328 #save to csv and replace nan with None
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
329 ras_df.replace([np.nan,None],"None").to_csv(ARGS.ras_output, sep = '\t')
381
0a3ca20848f3 Uploaded
francesco_lapi
parents: 309
diff changeset
330
510
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
331 #report genes not present in the data
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
332 if len(genes_not_mapped)>0:
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
333 genes_not_mapped_str=", ".join(genes_not_mapped)
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
334 utils.logWarning(
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
335 f"The following genes are mentioned in the GPR rules but don't appear in the dataset: "+genes_not_mapped_str,
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
336 ARGS.out_log)
c17c6c9d112c Uploaded
francesco_lapi
parents: 509
diff changeset
337
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
338 print("Execution succeeded")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
339
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
340 ###############################################################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
341 if __name__ == "__main__":
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
342 main()