comparison decoupler_pathway_inference.py @ 15:09c833d9b03b draft default tip

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b581a5b4ba88c5bf06f6223ba9aec51a8564796c
author ebi-gxa
date Fri, 29 Nov 2024 11:34:16 +0000
parents 599ad3dce864
children
comparison
equal deleted inserted replaced
14:ef054892d47f 15:09c833d9b03b
8 # define arguments for the script 8 # define arguments for the script
9 parser = argparse.ArgumentParser() 9 parser = argparse.ArgumentParser()
10 10
11 # add AnnData input file option 11 # add AnnData input file option
12 parser.add_argument( 12 parser.add_argument(
13 "-i", "--input_anndata", help="AnnData input file", required=True 13 "-i", "--input",
14 ) 14 help=(
15 15 "AnnData or table input file. Table input is meant for a single "
16 "comparison, not gene x cells"
17 ),
18 required=True
19 )
16 # add network input file option 20 # add network input file option
17 parser.add_argument( 21 parser.add_argument(
18 "-n", "--input_network", help="Network input file", required=True 22 "-n", "--input_network", help="Network input file", required=True
19 ) 23 )
20 24
30 parser.add_argument( 34 parser.add_argument(
31 "-a", 35 "-a",
32 "--activities_path", 36 "--activities_path",
33 help="Path to save Activities AnnData file", 37 help="Path to save Activities AnnData file",
34 default=None, 38 default=None,
39 )
40
41 # Column for stat to use when providing a table
42 parser.add_argument(
43 "--stat",
44 help="Stat to use when providing a table. Default is 'log2FC'.",
45 default="log2FC",
46 )
47 # Optional column for p-value or FDR in the table
48 parser.add_argument(
49 "--p_value_column",
50 required=False,
51 help="Column name in the table with p-values or FDRs.",
52 default=None,
53 )
54 # Optional column for FDR threshold when given a table
55 parser.add_argument(
56 "--p_value_threshold",
57 required=False,
58 type=float,
59 help="Column name in the table with FDRs.",
60 default=0.05
35 ) 61 )
36 62
37 # Column name in net with source nodes 63 # Column name in net with source nodes
38 parser.add_argument( 64 parser.add_argument(
39 "-s", 65 "-s",
91 117
92 # check that either -o or --output is specified 118 # check that either -o or --output is specified
93 if args.output is None: 119 if args.output is None:
94 raise ValueError("Please specify either -o or --output") 120 raise ValueError("Please specify either -o or --output")
95 121
96 # read in the AnnData input file 122 # detect based on input file extension if the input file is AnnData or matrix
97 adata = ad.read_h5ad(args.input_anndata) 123 if args.input.endswith(".h5ad"):
124 input_type = "AnnData"
125 elif args.input.endswith(".tsv") or args.input.endswith(".csv"):
126 input_type = "matrix"
127 else:
128 raise ValueError(
129 "Invalid input file. Please provide a valid AnnData or matrix file."
130 )
98 131
99 # read in the input file network input file 132 # read in the input file network input file
100 network = pd.read_csv(args.input_network, sep="\t") 133 network = pd.read_csv(args.input_network, sep="\t")
101 134
102 if ( 135 if (
106 ): 139 ):
107 raise ValueError( 140 raise ValueError(
108 "Source, target, and weight columns are not present in the network" 141 "Source, target, and weight columns are not present in the network"
109 ) 142 )
110 143
111
112 print(type(args.min_n)) 144 print(type(args.min_n))
113 145
114 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns: 146 if input_type == "AnnData":
115 # Storing index in a column called 'index_bak' 147 # read in the AnnData input file
116 adata.var['index_bak'] = adata.var.index 148 adata = ad.read_h5ad(args.input)
117 adata.var.set_index(args.var_gene_symbols_field, inplace=True) 149
118 150 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns:
151 # Storing index in a column called 'index_bak'
152 adata.var['index_bak'] = adata.var.index
153 adata.var.set_index(args.var_gene_symbols_field, inplace=True)
154 else:
155 # read in the matrix input file, genes in rows and columns for stats
156 adata = pd.read_csv(args.input, sep="\t", index_col=0)
157 if args.stat not in adata.columns:
158 raise ValueError(f"Stat column {args.stat} not found in input table header.")
159 if args.p_value_column and args.p_value_column not in adata.columns:
160 raise ValueError(
161 f"P-value column {args.p_value_column} not found in input table header."
162 )
163 if args.p_value_column and args.p_value_threshold is not None:
164 adata = adata[adata[args.p_value_column] <= args.p_value_threshold]
165
166 adata = adata[[args.stat]].T
119 167
120 if args.method == "mlm": 168 if args.method == "mlm":
121 dc.run_mlm( 169 res = dc.run_mlm(
122 mat=adata, 170 mat=adata,
123 net=network, 171 net=network,
124 source=args.source, 172 source=args.source,
125 target=args.target, 173 target=args.target,
126 weight=args.weight, 174 weight=args.weight,
127 verbose=True, 175 verbose=True,
128 min_n=args.min_n, 176 min_n=args.min_n,
129 use_raw=args.use_raw, 177 use_raw=args.use_raw,
130 ) 178 )
131 179
132 if args.output is not None:
133 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the
134 # output network files
135 combined_df = pd.concat(
136 [adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1
137 )
138
139 # Save the combined dataframe to a file
140 combined_df.to_csv(args.output + ".tsv", sep="\t")
141
142 # if args.activities_path is specified, generate the activities AnnData
143 # and save the AnnData object to the specified path
144 if args.activities_path is not None:
145 acts = dc.get_acts(adata, obsm_key="mlm_estimate")
146 acts.write_h5ad(args.activities_path)
147
148 elif args.method == "ulm": 180 elif args.method == "ulm":
149 dc.run_ulm( 181 res = dc.run_ulm(
150 mat=adata, 182 mat=adata,
151 net=network, 183 net=network,
152 source=args.source, 184 source=args.source,
153 target=args.target, 185 target=args.target,
154 weight=args.weight, 186 weight=args.weight,
155 verbose=True, 187 verbose=True,
156 min_n=args.min_n, 188 min_n=args.min_n,
157 use_raw=args.use_raw, 189 use_raw=args.use_raw,
158 ) 190 )
159 191 elif args.method == "consensus":
160 if args.output is not None: 192 res = dc.run_consensus(
161 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the 193 mat=adata,
162 # output network files 194 net=network,
195 source=args.source,
196 target=args.target,
197 weight=args.weight,
198 verbose=True,
199 min_n=args.min_n,
200 use_raw=args.use_raw,
201 )
202
203 if args.output is not None:
204 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the
205 # output network files
206 if input_type == "AnnData":
163 combined_df = pd.concat( 207 combined_df = pd.concat(
164 [adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1 208 [adata.obsm[f"{args.method}_estimate"],
209 adata.obsm[f"{args.method}_pvals"]], axis=1
165 ) 210 )
166 211 else:
167 # Save the combined dataframe to a file 212 tf_est, tf_pvals = res
168 combined_df.to_csv(args.output + ".tsv", sep="\t") 213 combined_df = pd.DataFrame(
169 214 {
170 # if args.activities_path is specified, generate the activities AnnData 215 # index is written, so no need for the set names
171 # and save the AnnData object to the specified path 216 f"{args.method}_estimate": tf_est.iloc[0],
172 if args.activities_path is not None: 217 f"{args.method}_pvals": tf_pvals.iloc[0],
173 acts = dc.get_acts(adata, obsm_key="ulm_estimate") 218 }
174 acts.write_h5ad(args.activities_path) 219 )
220 # sort ascending on the p-values
221 combined_df.sort_values(by=f"{args.method}_pvals", inplace=True)
222
223 # Save the combined dataframe to a file
224 combined_df.to_csv(args.output + ".tsv", sep="\t")
225
226 # if args.activities_path is specified, generate the activities AnnData
227 # and save the AnnData object to the specified path
228 if args.activities_path is not None and input_type == "AnnData":
229 acts = dc.get_acts(adata, obsm_key=f"{args.method}_estimate")
230 acts.write_h5ad(args.activities_path)