Mercurial > repos > ebi-gxa > score_genes_aucell
comparison decoupler_aucell_score.py @ 3:e887a7e8c5b4 draft
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 11fb36a94b8262ef8e78f1c6dd46c4146eb59341
author | ebi-gxa |
---|---|
date | Mon, 15 Apr 2024 13:20:21 +0000 |
parents | e024d8280886 |
children | 515ac51db6e5 |
comparison
equal
deleted
inserted
replaced
2:c700f0381e84 | 3:e887a7e8c5b4 |
---|---|
51 # Convert the list of dictionaries to a DataFrame | 51 # Convert the list of dictionaries to a DataFrame |
52 return pd.DataFrame(gene_sets) | 52 return pd.DataFrame(gene_sets) |
53 | 53 |
54 | 54 |
55 def score_genes_aucell( | 55 def score_genes_aucell( |
56 adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False | 56 adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False, min_n_genes=5 |
57 ): | 57 ): |
58 """Score genes using Aucell. | 58 """Score genes using Aucell. |
59 | 59 |
60 Parameters | 60 Parameters |
61 ---------- | 61 ---------- |
78 "gene_id": gene_list, | 78 "gene_id": gene_list, |
79 "geneset": score_name, | 79 "geneset": score_name, |
80 } | 80 } |
81 ) | 81 ) |
82 # run decoupler's run_aucell | 82 # run decoupler's run_aucell |
83 dc.run_aucell( | 83 # catch the value error |
84 adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw | 84 try: |
85 ) | 85 dc.run_aucell( |
86 # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names | 86 adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw |
87 adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name] | 87 ) |
88 # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names | |
89 adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name] | |
90 except ValueError as ve: | |
91 print(f"Gene list {score_name} failed, skipping: {str(ve)}") | |
88 | 92 |
89 | 93 |
90 def run_for_genelists( | 94 def run_for_genelists( |
91 adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols" | 95 adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols", min_n_genes=5 |
92 ): | 96 ): |
93 if len(gene_lists) == len(score_names): | 97 if len(gene_lists) == len(score_names): |
94 for gene_list, score_names in zip(gene_lists, score_names): | 98 for gene_list, score_names in zip(gene_lists, score_names): |
95 genes = gene_list.split(",") | 99 genes = gene_list.split(",") |
96 ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index | 100 ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index |
97 score_genes_aucell( | 101 score_genes_aucell( |
98 adata, | 102 adata, |
99 ens_gene_ids, | 103 ens_gene_ids, |
100 f"AUCell_{score_names}", | 104 f"AUCell_{score_names}", |
101 use_raw, | 105 use_raw, |
106 min_n_genes | |
102 ) | 107 ) |
103 else: | 108 else: |
104 raise ValueError( | 109 raise ValueError( |
105 "The number of gene lists (separated by :) and score names (separated by :) must be the same" | 110 "The number of gene lists (separated by :) and score names (separated by :) must be the same" |
106 ) | 111 ) |
141 "--gene_symbols_field", | 146 "--gene_symbols_field", |
142 type=str, | 147 type=str, |
143 help="Name of the gene symbols field in the AnnData object", | 148 help="Name of the gene symbols field in the AnnData object", |
144 required=True, | 149 required=True, |
145 ) | 150 ) |
151 # argument for min_n Minimum of targets per source. If less, sources are removed. | |
152 parser.add_argument( | |
153 "--min_n", | |
154 type=int, | |
155 required=False, | |
156 default=5, | |
157 help="Minimum of targets per source. If less, sources are removed.", | |
158 ) | |
146 parser.add_argument("--use_raw", action="store_true", help="Use raw data") | 159 parser.add_argument("--use_raw", action="store_true", help="Use raw data") |
147 parser.add_argument( | 160 parser.add_argument( |
148 "--write_anndata", action="store_true", help="Write the modified AnnData object" | 161 "--write_anndata", action="store_true", help="Write the modified AnnData object" |
149 ) | 162 ) |
150 | 163 |
156 | 169 |
157 if args.gmt_file is not None: | 170 if args.gmt_file is not None: |
158 # Load MSigDB file in GMT format | 171 # Load MSigDB file in GMT format |
159 msigdb = read_gmt(args.gmt_file) | 172 msigdb = read_gmt(args.gmt_file) |
160 | 173 |
161 gene_sets_to_score = args.gene_sets_to_score.split(",") if args.gene_sets_to_score else [] | 174 gene_sets_to_score = ( |
175 args.gene_sets_to_score.split(",") if args.gene_sets_to_score else [] | |
176 ) | |
162 # Score genes by their ensembl ids using the score_genes_aucell function | 177 # Score genes by their ensembl ids using the score_genes_aucell function |
163 for _, row in msigdb.iterrows(): | 178 for _, row in msigdb.iterrows(): |
164 gene_set_name = row["gene_set_name"] | 179 gene_set_name = row["gene_set_name"] |
165 if not gene_sets_to_score or gene_set_name in gene_sets_to_score: | 180 if not gene_sets_to_score or gene_set_name in gene_sets_to_score: |
166 genes = row["genes"].split(",") | 181 genes = row["genes"].split(",") |
167 # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set | 182 # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set |
168 ens_gene_ids = adata.var[ | 183 ens_gene_ids = adata.var[ |
169 adata.var[args.gene_symbols_field].isin(genes) | 184 adata.var[args.gene_symbols_field].isin(genes) |
170 ].index | 185 ].index |
171 score_genes_aucell( | 186 score_genes_aucell( |
172 adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw | 187 adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw, args.min_n |
173 ) | 188 ) |
174 elif args.gene_lists_to_score is not None and args.score_names is not None: | 189 elif args.gene_lists_to_score is not None and args.score_names is not None: |
175 gene_lists = args.gene_lists_to_score.split(":") | 190 gene_lists = args.gene_lists_to_score.split(":") |
176 score_names = args.score_names.split(",") | 191 score_names = args.score_names.split(",") |
177 run_for_genelists( | 192 run_for_genelists( |
178 adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field | 193 adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field, args.min_n |
179 ) | 194 ) |
180 | 195 |
181 # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns | 196 # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns |
182 if args.write_anndata: | 197 if args.write_anndata: |
183 adata.write_h5ad(args.output_file) | 198 adata.write_h5ad(args.output_file) |