Mercurial > repos > ebi-gxa > decoupler_pseudobulk
comparison decoupler_pathway_inference.py @ 15:09c833d9b03b draft default tip
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b581a5b4ba88c5bf06f6223ba9aec51a8564796c
author | ebi-gxa |
---|---|
date | Fri, 29 Nov 2024 11:34:16 +0000 |
parents | 599ad3dce864 |
children |
comparison
equal
deleted
inserted
replaced
14:ef054892d47f | 15:09c833d9b03b |
---|---|
8 # define arguments for the script | 8 # define arguments for the script |
9 parser = argparse.ArgumentParser() | 9 parser = argparse.ArgumentParser() |
10 | 10 |
11 # add AnnData input file option | 11 # add AnnData input file option |
12 parser.add_argument( | 12 parser.add_argument( |
13 "-i", "--input_anndata", help="AnnData input file", required=True | 13 "-i", "--input", |
14 ) | 14 help=( |
15 | 15 "AnnData or table input file. Table input is meant for a single " |
16 "comparison, not gene x cells" | |
17 ), | |
18 required=True | |
19 ) | |
16 # add network input file option | 20 # add network input file option |
17 parser.add_argument( | 21 parser.add_argument( |
18 "-n", "--input_network", help="Network input file", required=True | 22 "-n", "--input_network", help="Network input file", required=True |
19 ) | 23 ) |
20 | 24 |
30 parser.add_argument( | 34 parser.add_argument( |
31 "-a", | 35 "-a", |
32 "--activities_path", | 36 "--activities_path", |
33 help="Path to save Activities AnnData file", | 37 help="Path to save Activities AnnData file", |
34 default=None, | 38 default=None, |
39 ) | |
40 | |
41 # Column for stat to use when providing a table | |
42 parser.add_argument( | |
43 "--stat", | |
44 help="Stat to use when providing a table. Default is 'log2FC'.", | |
45 default="log2FC", | |
46 ) | |
47 # Optional column for p-value or FDR in the table | |
48 parser.add_argument( | |
49 "--p_value_column", | |
50 required=False, | |
51 help="Column name in the table with p-values or FDRs.", | |
52 default=None, | |
53 ) | |
54 # Optional column for FDR threshold when given a table | |
55 parser.add_argument( | |
56 "--p_value_threshold", | |
57 required=False, | |
58 type=float, | |
59 help="Column name in the table with FDRs.", | |
60 default=0.05 | |
35 ) | 61 ) |
36 | 62 |
37 # Column name in net with source nodes | 63 # Column name in net with source nodes |
38 parser.add_argument( | 64 parser.add_argument( |
39 "-s", | 65 "-s", |
91 | 117 |
92 # check that either -o or --output is specified | 118 # check that either -o or --output is specified |
93 if args.output is None: | 119 if args.output is None: |
94 raise ValueError("Please specify either -o or --output") | 120 raise ValueError("Please specify either -o or --output") |
95 | 121 |
96 # read in the AnnData input file | 122 # detect based on input file extension if the input file is AnnData or matrix |
97 adata = ad.read_h5ad(args.input_anndata) | 123 if args.input.endswith(".h5ad"): |
124 input_type = "AnnData" | |
125 elif args.input.endswith(".tsv") or args.input.endswith(".csv"): | |
126 input_type = "matrix" | |
127 else: | |
128 raise ValueError( | |
129 "Invalid input file. Please provide a valid AnnData or matrix file." | |
130 ) | |
98 | 131 |
99 # read in the input file network input file | 132 # read in the input file network input file |
100 network = pd.read_csv(args.input_network, sep="\t") | 133 network = pd.read_csv(args.input_network, sep="\t") |
101 | 134 |
102 if ( | 135 if ( |
106 ): | 139 ): |
107 raise ValueError( | 140 raise ValueError( |
108 "Source, target, and weight columns are not present in the network" | 141 "Source, target, and weight columns are not present in the network" |
109 ) | 142 ) |
110 | 143 |
111 | |
112 print(type(args.min_n)) | 144 print(type(args.min_n)) |
113 | 145 |
114 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns: | 146 if input_type == "AnnData": |
115 # Storing index in a column called 'index_bak' | 147 # read in the AnnData input file |
116 adata.var['index_bak'] = adata.var.index | 148 adata = ad.read_h5ad(args.input) |
117 adata.var.set_index(args.var_gene_symbols_field, inplace=True) | 149 |
118 | 150 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns: |
151 # Storing index in a column called 'index_bak' | |
152 adata.var['index_bak'] = adata.var.index | |
153 adata.var.set_index(args.var_gene_symbols_field, inplace=True) | |
154 else: | |
155 # read in the matrix input file, genes in rows and columns for stats | |
156 adata = pd.read_csv(args.input, sep="\t", index_col=0) | |
157 if args.stat not in adata.columns: | |
158 raise ValueError(f"Stat column {args.stat} not found in input table header.") | |
159 if args.p_value_column and args.p_value_column not in adata.columns: | |
160 raise ValueError( | |
161 f"P-value column {args.p_value_column} not found in input table header." | |
162 ) | |
163 if args.p_value_column and args.p_value_threshold is not None: | |
164 adata = adata[adata[args.p_value_column] <= args.p_value_threshold] | |
165 | |
166 adata = adata[[args.stat]].T | |
119 | 167 |
120 if args.method == "mlm": | 168 if args.method == "mlm": |
121 dc.run_mlm( | 169 res = dc.run_mlm( |
122 mat=adata, | 170 mat=adata, |
123 net=network, | 171 net=network, |
124 source=args.source, | 172 source=args.source, |
125 target=args.target, | 173 target=args.target, |
126 weight=args.weight, | 174 weight=args.weight, |
127 verbose=True, | 175 verbose=True, |
128 min_n=args.min_n, | 176 min_n=args.min_n, |
129 use_raw=args.use_raw, | 177 use_raw=args.use_raw, |
130 ) | 178 ) |
131 | 179 |
132 if args.output is not None: | |
133 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the | |
134 # output network files | |
135 combined_df = pd.concat( | |
136 [adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1 | |
137 ) | |
138 | |
139 # Save the combined dataframe to a file | |
140 combined_df.to_csv(args.output + ".tsv", sep="\t") | |
141 | |
142 # if args.activities_path is specified, generate the activities AnnData | |
143 # and save the AnnData object to the specified path | |
144 if args.activities_path is not None: | |
145 acts = dc.get_acts(adata, obsm_key="mlm_estimate") | |
146 acts.write_h5ad(args.activities_path) | |
147 | |
148 elif args.method == "ulm": | 180 elif args.method == "ulm": |
149 dc.run_ulm( | 181 res = dc.run_ulm( |
150 mat=adata, | 182 mat=adata, |
151 net=network, | 183 net=network, |
152 source=args.source, | 184 source=args.source, |
153 target=args.target, | 185 target=args.target, |
154 weight=args.weight, | 186 weight=args.weight, |
155 verbose=True, | 187 verbose=True, |
156 min_n=args.min_n, | 188 min_n=args.min_n, |
157 use_raw=args.use_raw, | 189 use_raw=args.use_raw, |
158 ) | 190 ) |
159 | 191 elif args.method == "consensus": |
160 if args.output is not None: | 192 res = dc.run_consensus( |
161 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the | 193 mat=adata, |
162 # output network files | 194 net=network, |
195 source=args.source, | |
196 target=args.target, | |
197 weight=args.weight, | |
198 verbose=True, | |
199 min_n=args.min_n, | |
200 use_raw=args.use_raw, | |
201 ) | |
202 | |
203 if args.output is not None: | |
204 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the | |
205 # output network files | |
206 if input_type == "AnnData": | |
163 combined_df = pd.concat( | 207 combined_df = pd.concat( |
164 [adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1 | 208 [adata.obsm[f"{args.method}_estimate"], |
209 adata.obsm[f"{args.method}_pvals"]], axis=1 | |
165 ) | 210 ) |
166 | 211 else: |
167 # Save the combined dataframe to a file | 212 tf_est, tf_pvals = res |
168 combined_df.to_csv(args.output + ".tsv", sep="\t") | 213 combined_df = pd.DataFrame( |
169 | 214 { |
170 # if args.activities_path is specified, generate the activities AnnData | 215 # index is written, so no need for the set names |
171 # and save the AnnData object to the specified path | 216 f"{args.method}_estimate": tf_est.iloc[0], |
172 if args.activities_path is not None: | 217 f"{args.method}_pvals": tf_pvals.iloc[0], |
173 acts = dc.get_acts(adata, obsm_key="ulm_estimate") | 218 } |
174 acts.write_h5ad(args.activities_path) | 219 ) |
220 # sort ascending on the p-values | |
221 combined_df.sort_values(by=f"{args.method}_pvals", inplace=True) | |
222 | |
223 # Save the combined dataframe to a file | |
224 combined_df.to_csv(args.output + ".tsv", sep="\t") | |
225 | |
226 # if args.activities_path is specified, generate the activities AnnData | |
227 # and save the AnnData object to the specified path | |
228 if args.activities_path is not None and input_type == "AnnData": | |
229 acts = dc.get_acts(adata, obsm_key=f"{args.method}_estimate") | |
230 acts.write_h5ad(args.activities_path) |