# HG changeset patch # User ebi-gxa # Date 1699529768 0 # Node ID 1e8697931d7304f7fc83ef9da11148cfb6b8e6c3 planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit c8c39f14eeee6e7a6d097fd0cb9430b12793eb8b diff -r 000000000000 -r 1e8697931d73 decoupler_aucell_score.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decoupler_aucell_score.py Thu Nov 09 11:36:08 2023 +0000 @@ -0,0 +1,181 @@ +import argparse +import os +import tempfile + +import anndata +import decoupler as dc +import pandas as pd + + +def read_gmt(gmt_file): + """ + Reads a GMT file into a Pandas DataFrame. + + Parameters + ---------- + gmt_file : str + Path to the GMT file. + + Returns + ------- + pd.DataFrame + A DataFrame with the gene sets. Each row represents a gene set, and the columns are "gene_set_name", "gene_set_url", and "genes". + >>> line = "HALLMARK_NOTCH_SIGNALING\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\tJAG1\\tNOTCH3\\tNOTCH2\\tAPH1A\\tHES1\\tCCND1\\tFZD1\\tPSEN2\\tFZD7\\tDTX1\\tDLL1\\tFZD5\\tMAML2\\tNOTCH1\\tPSENEN\\tWNT5A\\tCUL1\\tWNT2\\tDTX4\\tSAP30\\tPPARD\\tKAT2A\\tHEYL\\tSKP1\\tRBX1\\tTCF7L2\\tARRB1\\tLFNG\\tPRKCA\\tDTX2\\tST3GAL6\\tFBXW11\\n" + >>> line2 = "HALLMARK_APICAL_SURFACE\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\tB4GALT1\\tRHCG\\tMAL\\tLYPD3\\tPKHD1\\tATP6V0A4\\tCRYBG1\\tSHROOM2\\tSRPX\\tMDGA1\\tTMEM8B\\tTHY1\\tPCSK9\\tEPHB4\\tDCBLD2\\tGHRL\\tLYN\\tGAS1\\tFLOT2\\tPLAUR\\tAKAP7\\tATP8B1\\tEFNA5\\tSLC34A3\\tAPP\\tGSTM3\\tHSPB1\\tSLC2A4\\tIL2RB\\tRTN4RL1\\tNCOA6\\tSULF2\\tADAM10\\tBRCA1\\tGATA3\\tAFAP1L2\\tIL2RG\\tCD160\\tADIPOR2\\tSLC22A12\\tNTNG1\\tSCUBE1\\tCX3CL1\\tCROCC\\n" + >>> temp_dir = tempfile.gettempdir() + >>> temp_gmt = os.path.join(temp_dir, "temp_file.gmt") + >>> with open(temp_gmt, "w") as f: + ... f.write(line) + ... f.write(line2) + 288 + 380 + >>> df = read_gmt(temp_gmt) + >>> df.shape[0] + 2 + >>> df.columns == ["gene_set_name", "genes"] + array([ True, True]) + >>> df.loc[df["gene_set_name"] == "HALLMARK_APICAL_SURFACE"].genes.tolist()[0].startswith("B4GALT1") + True + """ + # Read the GMT file into a list of lines + with open(gmt_file, "r") as f: + lines = f.readlines() + + # Create a list of dictionaries, where each dictionary represents a gene set + gene_sets = [] + for line in lines: + fields = line.strip().split("\t") + gene_set = {"gene_set_name": fields[0], "genes": ",".join(fields[2:])} + gene_sets.append(gene_set) + + # Convert the list of dictionaries to a DataFrame + return pd.DataFrame(gene_sets) + + +def score_genes_aucell( + adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False +): + """Score genes using Aucell. + + Parameters + ---------- + adata : anndata.AnnData + gene_list : list + score_names : str + use_raw : bool, optional + + >>> import scanpy as sc + >>> import decoupler as dc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> gene_list = adata.var[adata.var.index.str.startswith("RP")].index.tolist() + >>> score_genes_aucell(adata, gene_list, "ribosomal_aucell", use_raw=False) + >>> "ribosomal_aucell" in adata.obs.columns + True + """ + # make a data.frame with two columns, geneset and gene_id, geneset filled with score_names and gene_id with gene_list, one row per element + geneset_df = pd.DataFrame( + { + "gene_id": gene_list, + "geneset": score_name, + } + ) + # run decoupler's run_aucell + dc.run_aucell( + adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw + ) + # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names + adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name] + + +def run_for_genelists( + adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols" +): + if len(gene_lists) == len(score_names): + for gene_list, score_names in zip(gene_lists, score_names): + genes = gene_list.split(",") + ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index + score_genes_aucell( + adata, + ens_gene_ids, + f"AUCell_{score_names}", + use_raw, + ) + else: + raise ValueError( + "The number of gene lists (separated by :) and score names (separated by :) must be the same" + ) + + +if __name__ == "__main__": + # Create command-line arguments parser + parser = argparse.ArgumentParser(description="Score genes using Aucell") + parser.add_argument("--input_file", type=str, help="Path to input AnnData file") + parser.add_argument("--output_file", type=str, help="Path to output file") + parser.add_argument("--gmt_file", type=str, help="Path to GMT file", required=False) + # add argument for gene sets to score + parser.add_argument( + "--gene_sets_to_score", + type=str, + required=False, + help="Comma separated list of gene sets to score (the need to be in the gmt file)", + ) + # add argument for gene list (comma separated) to score + parser.add_argument( + "--gene_lists_to_score", + type=str, + required=False, + help="Comma separated list of genes to score. You can have more than one set of genes, separated by colon :", + ) + # argument for the score name when using the gene list + parser.add_argument( + "--score_names", + type=str, + required=False, + help="Name of the score column when using the gene list. You can have more than one set of score names, separated by colon :. It should be the same length as the number of gene lists.", + ) + parser.add_argument( + "--gene_symbols_field", + type=str, + help="Name of the gene symbols field in the AnnData object", + ) + parser.add_argument("--use_raw", action="store_true", help="Use raw data") + parser.add_argument( + "--write_anndata", action="store_true", help="Write the modified AnnData object" + ) + + # Parse command-line arguments + args = parser.parse_args() + + # Load input AnnData object + adata = anndata.read_h5ad(args.input_file) + + if args.gene_sets_to_score is not None and args.gmt_file is not None: + # Load MSigDB file in GMT format + msigdb = read_gmt(args.gmt_file) + + gene_sets_to_score = args.gene_sets_to_score.split(",") + # Score genes by their ensembl ids using the score_genes_aucell function + for _, row in msigdb.iterrows(): + gene_set_name = row["gene_set_name"] + if gene_set_name in gene_sets_to_score: + genes = row["genes"].split(",") + # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set + ens_gene_ids = adata.var[ + adata.var[args.gene_symbols_field].isin(genes) + ].index + score_genes_aucell( + adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw + ) + elif args.gene_lists_to_score is not None and args.score_names is not None: + gene_lists = args.gene_lists_to_score.split(":") + score_names = args.score_names.split(",") + run_for_genelists( + adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field + ) + + # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns + if args.write_anndata: + adata.write_h5ad(args.output_file) + else: + new_columns = [col for col in adata.obs.columns if col.startswith("AUCell_")] + adata.obs[new_columns].to_csv(args.output_file, sep="\t", index=True) diff -r 000000000000 -r 1e8697931d73 decoupler_aucell_score.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decoupler_aucell_score.xml Thu Nov 09 11:36:08 2023 +0000 @@ -0,0 +1,137 @@ + + + + scores cells using the AUCell method for gene sets. + + + decoupler + + + python '$__tool_directory__/decoupler_aucell_score.py' + --input_file '$input_file' + #if $gene_lists_source.source == "gmt" + --gmt_file '$gmt_file' + --gene_sets_to_score '$gene_sets_to_score' + #else: + --gene_lists_to_score '$gene_lists_to_score' + --score_names '$score_names' + #end if + --gene_symbols_field '$gene_symbols_field' + $use_raw + #if $write_anndata: + --write_anndata + --output_file anndata_aucell.h5ad + #else: + --output_file anndata_aucell_per_cell.tsv + #end if + + + + + + + + + + + + + + + + + + + + + + + + write_anndata + + + not write_anndata + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +**Description** + +This tool scores cells using the AUCell method for gene sets. + +**Input** + +The input file should be an AnnData object in H5AD format. The tool accepts an H5AD file containing raw or normalized data. + +The tool takes one of two input formats for specifying the gene sets to score: + +1. A GMT file containing gene sets. You can specify which gene sets to use within the GMT file using the "gene_sets_to_score" parameter. +2. A list of genes and their associated score names. You can specify the genes to score using the "gene_lists_to_score" parameter, and their associated score names using the "score_names" parameter. + +In both cases, you must specify the name of the field in the AnnData object that contains the gene symbols using the "gene_symbols_field" parameter. + +You can also specify whether to use the raw data in the AnnData object instead of the X matrix using the "use_raw" parameter. + +**Output** + +The tool outputs an AnnData object containing the scores in the "obs" field, or a tab-separated text file containing the scores for each cell. + +If the "write_anndata" parameter is set to "true", the tool will write the modified AnnData object to an H5AD file. Otherwise, it will output a tab-separated text file containing the scores for each cell. + +**Example** + + + + + 10.1093/bioadv/vbac016 + + \ No newline at end of file diff -r 000000000000 -r 1e8697931d73 decoupler_pseudobulk.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decoupler_pseudobulk.py Thu Nov 09 11:36:08 2023 +0000 @@ -0,0 +1,367 @@ +import argparse + +import anndata +import decoupler +import pandas as pd + + +def get_pseudobulk( + adata, + sample_col, + groups_col, + layer=None, + mode="sum", + min_cells=10, + min_counts=1000, + use_raw=False, +): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> adata.X = abs(adata.X).astype(int) + >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain") + """ + + return decoupler.get_pseudobulk( + adata, + sample_col=sample_col, + groups_col=groups_col, + layer=layer, + mode=mode, + use_raw=use_raw, + min_cells=min_cells, + min_counts=min_counts, + ) + + +def prepend_c_to_index(index_value): + if index_value and index_value[0].isdigit(): + return "C" + index_value + return index_value + + +# write results for loading into DESeq2 +def write_DESeq2_inputs(pdata, layer=None, output_dir="", factor_fields=None): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> adata.X = abs(adata.X).astype(int) + >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain") + >>> write_DESeq2_inputs(pseudobulk) + """ + # add / to output_dir if is not empty or if it doesn't end with / + if output_dir != "" and not output_dir.endswith("/"): + output_dir = output_dir + "/" + obs_for_deseq = pdata.obs.copy() + # replace any index starting with digits to start with C instead. + obs_for_deseq.rename(index=prepend_c_to_index, inplace=True) + # avoid dash that is read as point on R colnames. + obs_for_deseq.index = obs_for_deseq.index.str.replace("-", "_") + obs_for_deseq.index = obs_for_deseq.index.str.replace(" ", "_") + col_metadata_file = f"{output_dir}col_metadata.tsv" + # write obs to a col_metadata file + if factor_fields: + # only output the index plus the columns in factor_fields in that order + obs_for_deseq[factor_fields].to_csv(col_metadata_file, sep="\t", index=True) + else: + obs_for_deseq.to_csv(col_metadata_file, sep="\t", index=True) + # write var to a gene_metadata file + pdata.var.to_csv(f"{output_dir}gene_metadata.tsv", sep="\t", index=True) + # write the counts matrix of a specified layer to file + if layer is None: + # write the X numpy matrix transposed to file + df = pd.DataFrame(pdata.X.T, index=pdata.var.index, columns=obs_for_deseq.index) + else: + df = pd.DataFrame( + pdata.layers[layer].T, index=pdata.var.index, columns=obs_for_deseq.index + ) + df.to_csv(f"{output_dir}counts_matrix.tsv", sep="\t", index_label="") + + +def plot_pseudobulk_samples( + pseudobulk_data, + groupby, + figsize=(10, 10), + save_path=None, +): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> adata.X = abs(adata.X).astype(int) + >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain") + >>> plot_pseudobulk_samples(pseudobulk, groupby=["bulk_labels", "louvain"], figsize=(10, 10)) + """ + fig = decoupler.plot_psbulk_samples( + pseudobulk_data, groupby=groupby, figsize=figsize, return_fig=True + ) + if save_path: + fig.savefig(f"{save_path}/pseudobulk_samples.png") + else: + fig.show() + + +def plot_filter_by_expr( + pseudobulk_data, group, min_count=None, min_total_count=None, save_path=None +): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> adata.X = abs(adata.X).astype(int) + >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain") + >>> plot_filter_by_expr(pseudobulk, group="bulk_labels", min_count=10, min_total_count=200) + """ + fig = decoupler.plot_filter_by_expr( + pseudobulk_data, + group=group, + min_count=min_count, + min_total_count=min_total_count, + return_fig=True, + ) + if save_path: + fig.savefig(f"{save_path}/filter_by_expr.png") + else: + fig.show() + + +def filter_by_expr(pdata, min_count=None, min_total_count=None): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> adata.X = abs(adata.X).astype(int) + >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain") + >>> pdata_filt = filter_by_expr(pseudobulk, min_count=10, min_total_count=200) + """ + genes = decoupler.filter_by_expr( + pdata, min_count=min_count, min_total_count=min_total_count + ) + return pdata[:, genes].copy() + + +def check_fields(fields, adata, obs=True, context=None): + """ + >>> import scanpy as sc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> check_fields(["bulk_labels", "louvain"], adata, obs=True) + """ + + legend = "" + if context: + legend = f", passed in {context}," + if obs: + if not set(fields).issubset(set(adata.obs.columns)): + raise ValueError( + f"Some of the following fields {legend} are not present in adata.obs: {fields}. Possible fields are: {list(set(adata.obs.columns))}" + ) + else: + if not set(fields).issubset(set(adata.var.columns)): + raise ValueError( + f"Some of the following fields {legend} are not present in adata.var: {fields}. Possible fields are: {list(set(adata.var.columns))}" + ) + + +def main(args): + # Load AnnData object from file + adata = anndata.read_h5ad(args.adata_file) + + # Merge adata.obs fields specified in args.adata_obs_fields_to_merge + if args.adata_obs_fields_to_merge: + # first split potential groups by ":" and iterate over them + for group in args.adata_obs_fields_to_merge.split(":"): + fields = group.split(",") + check_fields(fields, adata) + adata = merge_adata_obs_fields(fields, adata) + + check_fields([args.groupby, args.sample_key], adata) + + factor_fields = None + if args.factor_fields: + factor_fields = args.factor_fields.split(",") + check_fields(factor_fields, adata) + + print(f"Using mode: {args.mode}") + # Perform pseudobulk analysis + pseudobulk_data = get_pseudobulk( + adata, + sample_col=args.sample_key, + groups_col=args.groupby, + layer=args.layer, + mode=args.mode, + use_raw=args.use_raw, + min_cells=args.min_cells, + min_counts=args.min_counts, + ) + + # Plot pseudobulk samples + plot_pseudobulk_samples( + pseudobulk_data, + args.groupby, + save_path=args.save_path, + figsize=args.plot_samples_figsize, + ) + + plot_filter_by_expr( + pseudobulk_data, + group=args.groupby, + min_count=args.min_counts, + min_total_count=args.min_total_counts, + save_path=args.save_path, + ) + + # Filter by expression if enabled + if args.filter_expr: + filtered_adata = filter_by_expr( + pseudobulk_data, + min_count=args.min_counts, + min_total_count=args.min_total_counts, + ) + + pseudobulk_data = filtered_adata + + # Save the pseudobulk data + if args.anndata_output_path: + pseudobulk_data.write_h5ad(args.anndata_output_path, compression="gzip") + + write_DESeq2_inputs( + pseudobulk_data, output_dir=args.deseq2_output_path, factor_fields=factor_fields + ) + + +def merge_adata_obs_fields(obs_fields_to_merge, adata): + """ + Merge adata.obs fields specified in args.adata_obs_fields_to_merge + + Parameters + ---------- + obs_fields_to_merge : str + Fields in adata.obs to merge, comma separated + adata : anndata.AnnData + The AnnData object + + Returns + ------- + anndata.AnnData + The merged AnnData object + + docstring tests: + >>> import scanpy as sc + >>> ad = sc.datasets.pbmc68k_reduced() + >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad) + >>> ad.obs.columns + Index(['bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', + 'G2M_score', 'phase', 'louvain', 'bulk_labels_louvain'], + dtype='object') + """ + field_name = "_".join(obs_fields_to_merge) + for field in obs_fields_to_merge: + if field not in adata.obs.columns: + raise ValueError(f"The '{field}' column is not present in adata.obs.") + if field_name not in adata.obs.columns: + adata.obs[field_name] = adata.obs[field].astype(str) + else: + adata.obs[field_name] = ( + adata.obs[field_name] + "_" + adata.obs[field].astype(str) + ) + return adata + + +if __name__ == "__main__": + # Create argument parser + parser = argparse.ArgumentParser( + description="Perform pseudobulk analysis on an AnnData object" + ) + + # Add arguments + parser.add_argument("adata_file", type=str, help="Path to the AnnData file") + parser.add_argument( + "-m", + "--adata_obs_fields_to_merge", + type=str, + help="Fields in adata.obs to merge, comma separated. You can have more than one set of fields, separated by semi-colon ;", + ) + parser.add_argument( + "--groupby", + type=str, + required=True, + help="The column in adata.obs that defines the groups", + ) + parser.add_argument( + "--sample_key", + required=True, + type=str, + help="The column in adata.obs that defines the samples", + ) + # add argument for layer + parser.add_argument( + "--layer", + type=str, + default=None, + help="The name of the layer of the AnnData object to use", + ) + # add argument for mode + parser.add_argument( + "--mode", + type=str, + default="sum", + help="The mode for Decoupler pseudobulk analysis", + choices=["sum", "mean", "median"], + ) + # add boolean argument for use_raw + parser.add_argument( + "--use_raw", + action="store_true", + default=False, + help="Whether to use the raw part of the AnnData object", + ) + # add argument for min_cells + parser.add_argument( + "--min_cells", + type=int, + default=10, + help="Minimum number of cells for pseudobulk analysis", + ) + parser.add_argument( + "--save_path", type=str, help="Path to save the plot (optional)" + ) + parser.add_argument( + "--min_counts", + type=int, + help="Minimum count threshold for filtering by expression", + ) + parser.add_argument( + "--min_total_counts", + type=int, + help="Minimum total count threshold for filtering by expression", + ) + parser.add_argument( + "--anndata_output_path", + type=str, + help="Path to save the filtered AnnData object or pseudobulk data", + ) + parser.add_argument( + "--filter_expr", action="store_true", help="Enable filtering by expression" + ) + parser.add_argument( + "--factor_fields", + type=str, + help="Comma separated list of fields for the factors", + ) + parser.add_argument( + "--deseq2_output_path", + type=str, + help="Path to save the DESeq2 inputs", + required=True, + ) + parser.add_argument( + "--plot_samples_figsize", + type=int, + default=[10, 10], + nargs=2, + help="Size of the samples plot as a tuple (two arguments)", + ) + parser.add_argument("--plot_filtering_figsize", type=int, default=[10, 10], nargs=2) + + # Parse the command line arguments + args = parser.parse_args() + + # Call the main function + main(args) diff -r 000000000000 -r 1e8697931d73 get_test_data.sh --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_test_data.sh Thu Nov 09 11:36:08 2023 +0000 @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +BASENAME_FILE='mito_counted_anndata.h5ad' + +MTX_LINK='https://zenodo.org/record/7053673/files/Mito-counted_AnnData' + +# convenience for getting data +function get_data { + local link=$1 + local fname=$2 + + if [ ! -f $fname ]; then + echo "$fname not available locally, downloading.." + wget -O $fname --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 $link + fi +} + +# get matrix data +mkdir -p test-data +pushd test-data +get_data $MTX_LINK $BASENAME_FILE diff -r 000000000000 -r 1e8697931d73 test-data/mouse_hallmark_ss.gmt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mouse_hallmark_ss.gmt Thu Nov 09 11:36:08 2023 +0000 @@ -0,0 +1,2 @@ +HALLMARK_NOTCH_SIGNALING http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_NOTCH_SIGNALING Jag1 Notch1 Notch2 Notch3 Ccnd1 Tcf7l2 Wnt5a Lfng Psenen Psen2 Heyl Fzd1 Rbx1 Hes1 Arrb1 Ppard Prkca Wnt2 Fzd5 Dtx1 Sap30 Dtx2 Kat2a Dll1 Fzd7 St3gal6 Fbxw11 Cul1 Aph1a Dtx4 Skp1 Maml2 +HALLMARK_APICAL_SURFACE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_APICAL_SURFACE Adam10 Gata3 B4galt1 Hspb1 App Il2rg Atp8b1 Brca1 Slc34a3 Atp6v0a4 Ghrl Ncoa6 Rhcg Scube1 Efna5 Crybg1 Ephb4 Flot2 Gas1 Gstm5 Il2rb Shroom2 Lyn Mal Plaur Slc2a4 Thy1 Akap7 Srpx Cx3cl1 Dcbld2 Lypd3 Pkhd1 Sulf2 Pcsk9 Tmem8b Rtn4rl1 Ntng1 Slc22a12 Adipor2 Afap1l2 Mdga1 Cd160 Crocc