decoupler_pseudobulk: comparison of decoupler_pathway_inference.py @ 15:09c833d9b03b (draft)
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b581a5b4ba88c5bf06f6223ba9aec51a8564796c
| author | ebi-gxa |
|---|---|
| date | Fri, 29 Nov 2024 11:34:16 +0000 |
| parents | 599ad3dce864 |
| children | |
| 14:ef054892d47f | 15:09c833d9b03b |
|---|---|
| 8 # define arguments for the script | 8 # define arguments for the script |
| 9 parser = argparse.ArgumentParser() | 9 parser = argparse.ArgumentParser() |
| 10 | 10 |
| 11 # add AnnData input file option | 11 # add AnnData input file option |
| 12 parser.add_argument( | 12 parser.add_argument( |
| 13 "-i", "--input_anndata", help="AnnData input file", required=True | 13 "-i", "--input", |
| 14 ) | 14 help=( |
| 15 | 15 "AnnData or table input file. Table input is meant for a single " |
| 16 "comparison, not gene x cells" | |
| 17 ), | |
| 18 required=True | |
| 19 ) | |
| 16 # add network input file option | 20 # add network input file option |
| 17 parser.add_argument( | 21 parser.add_argument( |
| 18 "-n", "--input_network", help="Network input file", required=True | 22 "-n", "--input_network", help="Network input file", required=True |
| 19 ) | 23 ) |
| 20 | 24 |
| 30 parser.add_argument( | 34 parser.add_argument( |
| 31 "-a", | 35 "-a", |
| 32 "--activities_path", | 36 "--activities_path", |
| 33 help="Path to save Activities AnnData file", | 37 help="Path to save Activities AnnData file", |
| 34 default=None, | 38 default=None, |
| | 39 ) |
| | 40 |
| | 41 # Column for stat to use when providing a table |
| | 42 parser.add_argument( |
| | 43 "--stat", |
| | 44 help="Stat to use when providing a table. Default is 'log2FC'.", |
| | 45 default="log2FC", |
| | 46 ) |
| | 47 # Optional column for p-value or FDR in the table |
| | 48 parser.add_argument( |
| | 49 "--p_value_column", |
| | 50 required=False, |
| | 51 help="Column name in the table with p-values or FDRs.", |
| | 52 default=None, |
| | 53 ) |
| | 54 # Optional p-value/FDR threshold to apply when given a table |
| | 55 parser.add_argument( |
| | 56 "--p_value_threshold", |
| | 57 required=False, |
| | 58 type=float, |
| | 59 help="Threshold applied to the p-value/FDR column when providing a table.", |
| | 60 default=0.05 |
| 35 ) | 61 ) |
| 36 | 62 |
| 37 # Column name in net with source nodes | 63 # Column name in net with source nodes |
| 38 parser.add_argument( | 64 parser.add_argument( |
| 39 "-s", | 65 "-s", |
| 91 | 117 |
| 92 # check that either -o or --output is specified | 118 # check that either -o or --output is specified |
| 93 if args.output is None: | 119 if args.output is None: |
| 94 raise ValueError("Please specify either -o or --output") | 120 raise ValueError("Please specify either -o or --output") |
| 95 | 121 |
| 96 # read in the AnnData input file | 122 # detect based on input file extension if the input file is AnnData or matrix |
| 97 adata = ad.read_h5ad(args.input_anndata) | 123 if args.input.endswith(".h5ad"): |
| | 124 input_type = "AnnData" |
| | 125 elif args.input.endswith(".tsv") or args.input.endswith(".csv"): |
| | 126 input_type = "matrix" |
| | 127 else: |
| | 128 raise ValueError( |
| | 129 "Invalid input file. Please provide a valid AnnData or matrix file." |
| | 130 ) |
| 98 | 131 |
| 99 # read in the network input file | 132 # read in the network input file |
| 100 network = pd.read_csv(args.input_network, sep="\t") | 133 network = pd.read_csv(args.input_network, sep="\t") |
| 101 | 134 |
| 102 if ( | 135 if ( |
| 106 ): | 139 ): |
| 107 raise ValueError( | 140 raise ValueError( |
| 108 "Source, target, and weight columns are not present in the network" | 141 "Source, target, and weight columns are not present in the network" |
| 109 ) | 142 ) |
| 110 | 143 |
| 111 | |
| 112 print(type(args.min_n)) | 144 print(type(args.min_n)) |
| 113 | 145 |
| 114 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns: | 146 if input_type == "AnnData": |
| 115 # Storing index in a column called 'index_bak' | 147 # read in the AnnData input file |
| 116 adata.var['index_bak'] = adata.var.index | 148 adata = ad.read_h5ad(args.input) |
| 117 adata.var.set_index(args.var_gene_symbols_field, inplace=True) | 149 |
| 118 | 150 if args.var_gene_symbols_field and args.var_gene_symbols_field in adata.var.columns: |
| | 151 # Storing index in a column called 'index_bak' |
| | 152 adata.var['index_bak'] = adata.var.index |
| | 153 adata.var.set_index(args.var_gene_symbols_field, inplace=True) |
| | 154 else: |
| | 155 # read in the matrix input file, genes in rows and columns for stats |
| | 156 adata = pd.read_csv(args.input, sep="\t", index_col=0) |
| | 157 if args.stat not in adata.columns: |
| | 158 raise ValueError(f"Stat column {args.stat} not found in input table header.") |
| | 159 if args.p_value_column and args.p_value_column not in adata.columns: |
| | 160 raise ValueError( |
| | 161 f"P-value column {args.p_value_column} not found in input table header." |
| | 162 ) |
| | 163 if args.p_value_column and args.p_value_threshold is not None: |
| | 164 adata = adata[adata[args.p_value_column] <= args.p_value_threshold] |
| | 165 |
| | 166 adata = adata[[args.stat]].T |
| 119 | 167 |
| 120 if args.method == "mlm": | 168 if args.method == "mlm": |
| 121 dc.run_mlm( | 169 res = dc.run_mlm( |
| 122 mat=adata, | 170 mat=adata, |
| 123 net=network, | 171 net=network, |
| 124 source=args.source, | 172 source=args.source, |
| 125 target=args.target, | 173 target=args.target, |
| 126 weight=args.weight, | 174 weight=args.weight, |
| 127 verbose=True, | 175 verbose=True, |
| 128 min_n=args.min_n, | 176 min_n=args.min_n, |
| 129 use_raw=args.use_raw, | 177 use_raw=args.use_raw, |
| 130 ) | 178 ) |
| 131 | 179 |
| 132 if args.output is not None: | |
| 133 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the | |
| 134 # output network files | |
| 135 combined_df = pd.concat( | |
| 136 [adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1 | |
| 137 ) | |
| 138 | |
| 139 # Save the combined dataframe to a file | |
| 140 combined_df.to_csv(args.output + ".tsv", sep="\t") | |
| 141 | |
| 142 # if args.activities_path is specified, generate the activities AnnData | |
| 143 # and save the AnnData object to the specified path | |
| 144 if args.activities_path is not None: | |
| 145 acts = dc.get_acts(adata, obsm_key="mlm_estimate") | |
| 146 acts.write_h5ad(args.activities_path) | |
| 147 | |
| 148 elif args.method == "ulm": | 180 elif args.method == "ulm": |
| 149 dc.run_ulm( | 181 res = dc.run_ulm( |
| 150 mat=adata, | 182 mat=adata, |
| 151 net=network, | 183 net=network, |
| 152 source=args.source, | 184 source=args.source, |
| 153 target=args.target, | 185 target=args.target, |
| 154 weight=args.weight, | 186 weight=args.weight, |
| 155 verbose=True, | 187 verbose=True, |
| 156 min_n=args.min_n, | 188 min_n=args.min_n, |
| 157 use_raw=args.use_raw, | 189 use_raw=args.use_raw, |
| 158 ) | 190 ) |
| 159 | 191 elif args.method == "consensus": |
| 160 if args.output is not None: | 192 res = dc.run_consensus( |
| 161 # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the | 193 mat=adata, |
| 162 # output network files | 194 net=network, |
| | 195 source=args.source, |
| | 196 target=args.target, |
| | 197 weight=args.weight, |
| | 198 verbose=True, |
| | 199 min_n=args.min_n, |
| | 200 use_raw=args.use_raw, |
| | 201 ) |
| | 202 |
| | 203 if args.output is not None: |
| | 204 # write the activity estimates and p-values for the chosen |
| | 205 # method to the output TSV file |
| | 206 if input_type == "AnnData": |
| 163 combined_df = pd.concat( | 207 combined_df = pd.concat( |
| 164 [adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1 | 208 [adata.obsm[f"{args.method}_estimate"], |
| | 209 adata.obsm[f"{args.method}_pvals"]], axis=1 |
| 165 ) | 210 ) |
| 166 | 211 else: |
| 167 # Save the combined dataframe to a file | 212 tf_est, tf_pvals = res |
| 168 combined_df.to_csv(args.output + ".tsv", sep="\t") | 213 combined_df = pd.DataFrame( |
| 169 | 214 { |
| 170 # if args.activities_path is specified, generate the activities AnnData | 215 # index is written, so no need for the set names |
| 171 # and save the AnnData object to the specified path | 216 f"{args.method}_estimate": tf_est.iloc[0], |
| 172 if args.activities_path is not None: | 217 f"{args.method}_pvals": tf_pvals.iloc[0], |
| 173 acts = dc.get_acts(adata, obsm_key="ulm_estimate") | 218 } |
| 174 acts.write_h5ad(args.activities_path) | 219 ) |
| | 220 # sort ascending on the p-values |
| | 221 combined_df.sort_values(by=f"{args.method}_pvals", inplace=True) |
| | 222 |
| | 223 # Save the combined dataframe to a file |
| | 224 combined_df.to_csv(args.output + ".tsv", sep="\t") |
| | 225 |
| | 226 # if args.activities_path is specified, generate the activities AnnData |
| | 227 # and save the AnnData object to the specified path |
| | 228 if args.activities_path is not None and input_type == "AnnData": |
| | 229 acts = dc.get_acts(adata, obsm_key=f"{args.method}_estimate") |
| | 230 acts.write_h5ad(args.activities_path) |
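
For orientation, the sketch below walks through the table-input path that this changeset adds, outside the Galaxy wrapper. It mirrors what the script does with `--stat`, `--p_value_column` and `--p_value_threshold` (filter by FDR, keep one stat column, transpose to a single-row matrix, run decoupler, write a TSV). The file names `comparison.tsv` and `network.tsv`, the `padj` column and the 0.05 cutoff are illustrative assumptions, not values taken from the tool.

```python
import pandas as pd
import decoupler as dc

# Per-gene statistics for a single comparison: genes in rows, stats in
# columns (assumed columns "log2FC" and "padj"; file name is hypothetical).
table = pd.read_csv("comparison.tsv", sep="\t", index_col=0)

# Optional FDR filter, as done when --p_value_column/--p_value_threshold are set.
table = table[table["padj"] <= 0.05]

# decoupler expects observations x genes, so keep the stat column and
# transpose it into a one-row matrix (the single comparison is the "observation").
mat = table[["log2FC"]].T

# Network with source/target/weight columns, e.g. a TF-target or pathway collection.
net = pd.read_csv("network.tsv", sep="\t")

# With a plain DataFrame input, run_ulm returns (estimates, p-values)
# instead of writing into .obsm as it does for AnnData input.
estimates, pvals = dc.run_ulm(
    mat=mat,
    net=net,
    source="source",
    target="target",
    weight="weight",
    verbose=True,
    min_n=2,  # lowered for small example networks; the script exposes this setting
)

# Mirror the script's TSV output: one row per source, estimate and p-value
# columns, sorted ascending on the p-values.
combined = pd.DataFrame(
    {"ulm_estimate": estimates.iloc[0], "ulm_pvals": pvals.iloc[0]}
).sort_values("ulm_pvals")
combined.to_csv("pathway_inference.tsv", sep="\t")
```

With an AnnData input the same call stores its results in `adata.obsm["ulm_estimate"]` and `adata.obsm["ulm_pvals"]` rather than returning them, which is why the script branches on `input_type` before assembling `combined_df` and only writes the activities AnnData (`dc.get_acts`) in that case.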
