annotate COBRAxy/ras_generator.py @ 508:ca98c149ec61 draft default tip

Uploaded
author francesco_lapi
date Wed, 01 Oct 2025 14:21:26 +0000
parents 96f512dff490
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
1 """
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
2 Generate Reaction Activity Scores (RAS) from a gene expression dataset and GPR rules.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
3
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
4 The script reads a tabular dataset (genes x samples) and a rules file (GPRs),
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
5 computes RAS per reaction for each sample/cell line, and writes a tabular output.
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
6 """
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
7 from __future__ import division
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
8 import sys
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
9 import argparse
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
10 import pandas as pd
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
11 import numpy as np
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
12 import utils.general_utils as utils
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
13 from typing import List, Dict
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
14 import ast
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
15
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
16 ERRORS = []
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
17 ########################## argparse ##########################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
18 ARGS :argparse.Namespace
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
19 def process_args(args:List[str] = None) -> argparse.Namespace:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
20 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
21 Processes command-line arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
22
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
23 Args:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
24 args (list): List of command-line arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
25
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
26 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
27 Namespace: An object containing parsed arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
28 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
29 parser = argparse.ArgumentParser(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
30 usage = '%(prog)s [options]',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
31 description = "process some value's genes to create a comparison's map.")
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
32
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
33 parser.add_argument("-rl", "--model_upload", type = str,
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
34 help = "path to input file containing the rules")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
35
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
36 parser.add_argument("-rn", "--model_upload_name", type = str, help = "custom rules name")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
37 # Galaxy converts files into .dat, this helps infer the original extension when needed.
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
38
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
39 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
40 '-n', '--none',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
41 type = utils.Bool("none"), default = True,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
42 help = 'compute Nan values')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
43
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
44 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
45 '-td', '--tool_dir',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
46 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
47 required = True, help = 'your tool directory')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
48
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
49 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
50 '-ol', '--out_log',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
51 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
52 help = "Output log")
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
53
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
54 parser.add_argument(
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
55 '-in', '--input',
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
56 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
57 help = 'input dataset')
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
58
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
59 parser.add_argument(
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
60 '-ra', '--ras_output',
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
61 type = str,
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
62 required = True, help = 'ras output')
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
63
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
64
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
65 return parser.parse_args(args)
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
66
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
67 ############################ dataset input ####################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
68 def read_dataset(data :str, name :str) -> pd.DataFrame:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
69 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
70 Read a dataset from a CSV file and return it as a pandas DataFrame.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
71
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
72 Args:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
73 data (str): Path to the CSV file containing the dataset.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
74 name (str): Name of the dataset, used in error messages.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
75
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
76 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
77 pandas.DataFrame: DataFrame containing the dataset.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
78
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
79 Raises:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
80 pd.errors.EmptyDataError: If the CSV file is empty.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
81 sys.exit: If the CSV file has the wrong format, the execution is aborted.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
82 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
83 try:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
84 dataset = pd.read_csv(data, sep = '\t', header = 0, engine='python', index_col=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
85 dataset = dataset.astype(float)
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
86 except pd.errors.EmptyDataError:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
87 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
88 if len(dataset.columns) < 2:
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
89 sys.exit('Execution aborted: wrong file format of ' + name + '\n')
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
90 return dataset
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
91
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
92
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
93 def load_custom_rules() -> Dict[str,str]:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
94 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
95 Opens custom rules file and extracts the rules. If the file is in .csv format an additional parsing step will be
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
96 performed, significantly impacting the runtime.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
97
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
98 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
99 Dict[str, ruleUtils.OpList] : dict mapping reaction IDs to rules.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
100 """
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
101 datFilePath = utils.FilePath.fromStrPath(ARGS.model_upload) # actual file, stored in Galaxy as a .dat
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
102
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
103 dict_rule = {}
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
104
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
105 try:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
106 rows = utils.readCsv(datFilePath, delimiter = "\t", skipHeader=False)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
107 if len(rows) <= 1:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
108 raise ValueError("Model tabular with 1 column is not supported.")
381
0a3ca20848f3 Uploaded
francesco_lapi
parents: 309
diff changeset
109
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
110 if not rows:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
111 raise ValueError("Model tabular is file is empty.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
112
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
113 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
114
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
115 # First, try using a tab delimiter
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
116 for line in rows[1:]:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
117 if len(line) <= idx_gpr:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
118 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
119 continue
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
120
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
121 dict_rule[line[id_idx]] = line[idx_gpr]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
122
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
123 except Exception as e:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
124 # If parsing with tabs fails, try comma delimiter
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
125 try:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
126 rows = utils.readCsv(datFilePath, delimiter = ",", skipHeader=False)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
127
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
128 if len(rows) <= 1:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
129 raise ValueError("Model tabular with 1 column is not supported.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
130
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
131 if not rows:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
132 raise ValueError("Model tabular is file is empty.")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
133
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
134 id_idx, idx_gpr = utils.findIdxByName(rows[0], "GPR")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
135
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
136 # Try again parsing row content with the GPR column using comma-separated values
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
137 for line in rows[1:]:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
138 if len(line) <= idx_gpr:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
139 utils.logWarning(f"Skipping malformed line: {line}", ARGS.out_log)
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
140 continue
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
141
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
142 dict_rule[line[id_idx]] =line[idx_gpr]
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
143
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
144 except Exception as e2:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
145 raise ValueError(f"Unable to parse rules file. Tried both tab and comma delimiters. Original errors: Tab: {e}, Comma: {e2}")
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
146
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
147 if not dict_rule:
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
148 raise ValueError("No valid rules found in the uploaded file. Please check the file format.")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
149 # csv rules need to be parsed, those in a pickle format are taken to be pre-parsed.
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
150 return dict_rule
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
151
401
6c7ddf68381a Uploaded
francesco_lapi
parents: 400
diff changeset
152
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
153
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
154 def computeRAS(
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
155 dataset,gene_rules,rules_total_string,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
156 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
157 and_function=np.min, # type of operation to do in case of an and expression(min, sum)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
158 ignore_nan = True
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
159 ):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
160
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
161
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
162 logic_operators = ['and', 'or', '(', ')']
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
163 reactions=list(gene_rules.keys())
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
164
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
165 # Build the dictionary for the GPRs
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
166 df_reactions = pd.DataFrame(index=reactions)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
167 gene_rules=[gene_rules[reaction_id].replace("OR","or").replace("AND","and").replace("(","( ").replace(")"," )") for reaction_id in reactions]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
168 df_reactions['rule'] = gene_rules
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
169 df_reactions = df_reactions.reset_index()
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
170 df_reactions = df_reactions.groupby('rule').agg(lambda x: sorted(list(x)))
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
171
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
172 dict_rule_reactions = df_reactions.to_dict()['index']
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
173
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
174 # build useful structures for RAS computation
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
175 genes =dataset.index.intersection(rules_total_string)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
176
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
177 #check if there is one gene at least
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
178 if len(genes)==0:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
179 print("ERROR: no gene of the count matrix is in the metabolic model!")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
180 print(" are you sure that the gene annotation is the same for the model and the count matrix?")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
181 return -1
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
182
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
183 cell_ids = list(dataset.columns)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
184 count_df_filtered = dataset.loc[genes]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
185 count_df_filtered = count_df_filtered.rename(index=lambda x: x.replace("-", "_"))
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
186
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
187 ras_df=np.full((len(dict_rule_reactions), len(cell_ids)), np.nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
188
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
189 # for loop on rules
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
190 ind = 0
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
191 for rule, reaction_ids in dict_rule_reactions.items():
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
192 if len(rule) != 0:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
193 # there is one gene at least in the formula
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
194 rule_split = rule.split()
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
195 rule_split_elements = list(set(rule_split)) # genes in formula
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
196
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
197 # which genes are in the count matrix?
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
198 genes_in_count_matrix = [el for el in rule_split_elements if el in genes]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
199 genes_notin_count_matrix = [el for el in rule_split_elements if el not in genes]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
200
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
201 if len(genes_in_count_matrix) > 0: #there is at least one gene in the count matrix
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
202 if len(rule_split) == 1:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
203 #one gene --> one reaction
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
204 ras_df[ind] = count_df_filtered.loc[genes_in_count_matrix]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
205 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
206 if len(genes_notin_count_matrix) > 0 and ignore_nan == False:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
207 ras_df[ind] = np.nan
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
208 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
209 # more genes in the formula
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
210 check_only_and=("and" in rule and "or" not in rule) #only and
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
211 check_only_or=("or" in rule and "and" not in rule) #only or
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
212 if check_only_and or check_only_or:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
213 #or/and sequence
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
214 matrix = count_df_filtered.loc[genes_in_count_matrix].values
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
215 #compute for all cells
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
216 if check_only_and:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
217 ras_df[ind] = and_function(matrix, axis=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
218 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
219 ras_df[ind] = or_function(matrix, axis=0)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
220 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
221 # complex expression (e.g. A or (B and C))
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
222 data = count_df_filtered.loc[genes_in_count_matrix] # dataframe of genes in the GPRs
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
223
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
224 rule = rule.replace("-", "_")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
225 tree = ast.parse(rule, mode="eval").body
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
226 values_by_cell = [dict(zip(data.index, data[col].values)) for col in data.columns]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
227 for j, values in enumerate(values_by_cell):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
228 ras_df[ind, j] = _evaluate_ast(tree, values, or_function, and_function)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
229
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
230 ind +=1
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
231
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
232 #create the dataframe of ras (rules x samples)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
233 ras_df= pd.DataFrame(data=ras_df,index=range(len(dict_rule_reactions)), columns=cell_ids)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
234 ras_df['Reactions'] = [reaction_ids for rule,reaction_ids in dict_rule_reactions.items()]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
235
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
236 #create the reaction dataframe for ras (reactions x samples)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
237 ras_df = ras_df.explode("Reactions").set_index("Reactions")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
238
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
239 return ras_df
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
240
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
241 # function to evalute a complex boolean expression e.g. A or (B and C)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
242 def _evaluate_ast( node, values,or_function,and_function):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
243 if isinstance(node, ast.BoolOp):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
244 # Ricorsione sugli argomenti
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
245 vals = [_evaluate_ast(v, values,or_function,and_function) for v in node.values]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
246
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
247 vals = [v for v in vals if v is not None]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
248 if not vals:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
249 return None
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
250
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
251 vals = [np.array(v) if isinstance(v, (list, np.ndarray)) else v for v in vals]
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
252
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
253 if isinstance(node.op, ast.Or):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
254 return or_function(vals)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
255 elif isinstance(node.op, ast.And):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
256 return and_function(vals)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
257
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
258 elif isinstance(node, ast.Name):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
259 return values.get(node.id, None)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
260 elif isinstance(node, ast.Constant):
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
261 return node.value
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
262 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
263 raise ValueError(f"Error in boolean expression: {ast.dump(node)}")
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
264
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
265 def main(args:List[str] = None) -> None:
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
266 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
267 Initializes everything and sets the program in motion based on the fronted input arguments.
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
268
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
269 Returns:
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
270 None
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
271 """
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
272 # get args from frontend (related xml)
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
273 global ARGS
147
3fca9b568faf Uploaded
bimib
parents: 93
diff changeset
274 ARGS = process_args(args)
309
38c9a958ea78 Uploaded
francesco_lapi
parents: 266
diff changeset
275
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
276 # read dataset and remove versioning from gene names
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
277 dataset = read_dataset(ARGS.input, "dataset")
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
278 dataset.index = [str(el.split(".")[0]) for el in dataset.index]
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
279
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
280 #load GPR rules
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
281 rules = load_custom_rules()
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
282
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
283 #create a list of all the gpr
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
284 rules_total_string=""
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
285 for id,rule in rules.items():
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
286 rules_total_string+=rule.replace("(","").replace(")","") + " "
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
287 rules_total_string=list(set(rules_total_string.split(" ")))
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
288
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
289 #check if nan value must be ignored in the GPR
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
290 print(ARGS.none,"\n\n\n*************",ARGS.none==True)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
291 if ARGS.none:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
292 # #e.g. (A or nan --> A)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
293 ignore_nan = True
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
294 else:
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
295 #e.g. (A or nan --> nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
296 ignore_nan = False
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
297
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
298 #compure ras
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
299 ras_df=computeRAS(dataset,rules,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
300 rules_total_string,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
301 or_function=np.sum, # type of operation to do in case of an or expression (max, sum, mean)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
302 and_function=np.min,
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
303 ignore_nan=ignore_nan)
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
304
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
305 #save to csv and replace nan with None
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
306 ras_df.replace(np.nan,"None").to_csv(ARGS.ras_output, sep = '\t')
381
0a3ca20848f3 Uploaded
francesco_lapi
parents: 309
diff changeset
307
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
308 #print
489
97eea560a10f Uploaded
francesco_lapi
parents: 406
diff changeset
309 print("Execution succeeded")
93
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
310
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
311 ###############################################################################
7e703e546998 Uploaded
luca_milaz
parents:
diff changeset
312 if __name__ == "__main__":
505
96f512dff490 Uploaded
francesco_lapi
parents: 490
diff changeset
313 main()