# HG changeset patch # User ebi-gxa # Date 1699529757 0 # Node ID 4fa5f370599fe541058b84365257ab676b3d1c46 # Parent 130e25d3ce92fde2bf925c0629528c0533d73f1a planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit c8c39f14eeee6e7a6d097fd0cb9430b12793eb8b diff -r 130e25d3ce92 -r 4fa5f370599f decoupler_aucell_score.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/decoupler_aucell_score.py Thu Nov 09 11:35:57 2023 +0000 @@ -0,0 +1,181 @@ +import argparse +import os +import tempfile + +import anndata +import decoupler as dc +import pandas as pd + + +def read_gmt(gmt_file): + """ + Reads a GMT file into a Pandas DataFrame. + + Parameters + ---------- + gmt_file : str + Path to the GMT file. + + Returns + ------- + pd.DataFrame + A DataFrame with the gene sets. Each row represents a gene set, and the columns are "gene_set_name", "gene_set_url", and "genes". + >>> line = "HALLMARK_NOTCH_SIGNALING\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\tJAG1\\tNOTCH3\\tNOTCH2\\tAPH1A\\tHES1\\tCCND1\\tFZD1\\tPSEN2\\tFZD7\\tDTX1\\tDLL1\\tFZD5\\tMAML2\\tNOTCH1\\tPSENEN\\tWNT5A\\tCUL1\\tWNT2\\tDTX4\\tSAP30\\tPPARD\\tKAT2A\\tHEYL\\tSKP1\\tRBX1\\tTCF7L2\\tARRB1\\tLFNG\\tPRKCA\\tDTX2\\tST3GAL6\\tFBXW11\\n" + >>> line2 = "HALLMARK_APICAL_SURFACE\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\tB4GALT1\\tRHCG\\tMAL\\tLYPD3\\tPKHD1\\tATP6V0A4\\tCRYBG1\\tSHROOM2\\tSRPX\\tMDGA1\\tTMEM8B\\tTHY1\\tPCSK9\\tEPHB4\\tDCBLD2\\tGHRL\\tLYN\\tGAS1\\tFLOT2\\tPLAUR\\tAKAP7\\tATP8B1\\tEFNA5\\tSLC34A3\\tAPP\\tGSTM3\\tHSPB1\\tSLC2A4\\tIL2RB\\tRTN4RL1\\tNCOA6\\tSULF2\\tADAM10\\tBRCA1\\tGATA3\\tAFAP1L2\\tIL2RG\\tCD160\\tADIPOR2\\tSLC22A12\\tNTNG1\\tSCUBE1\\tCX3CL1\\tCROCC\\n" + >>> temp_dir = tempfile.gettempdir() + >>> temp_gmt = os.path.join(temp_dir, "temp_file.gmt") + >>> with open(temp_gmt, "w") as f: + ... f.write(line) + ... f.write(line2) + 288 + 380 + >>> df = read_gmt(temp_gmt) + >>> df.shape[0] + 2 + >>> df.columns == ["gene_set_name", "genes"] + array([ True, True]) + >>> df.loc[df["gene_set_name"] == "HALLMARK_APICAL_SURFACE"].genes.tolist()[0].startswith("B4GALT1") + True + """ + # Read the GMT file into a list of lines + with open(gmt_file, "r") as f: + lines = f.readlines() + + # Create a list of dictionaries, where each dictionary represents a gene set + gene_sets = [] + for line in lines: + fields = line.strip().split("\t") + gene_set = {"gene_set_name": fields[0], "genes": ",".join(fields[2:])} + gene_sets.append(gene_set) + + # Convert the list of dictionaries to a DataFrame + return pd.DataFrame(gene_sets) + + +def score_genes_aucell( + adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False +): + """Score genes using Aucell. + + Parameters + ---------- + adata : anndata.AnnData + gene_list : list + score_names : str + use_raw : bool, optional + + >>> import scanpy as sc + >>> import decoupler as dc + >>> adata = sc.datasets.pbmc68k_reduced() + >>> gene_list = adata.var[adata.var.index.str.startswith("RP")].index.tolist() + >>> score_genes_aucell(adata, gene_list, "ribosomal_aucell", use_raw=False) + >>> "ribosomal_aucell" in adata.obs.columns + True + """ + # make a data.frame with two columns, geneset and gene_id, geneset filled with score_names and gene_id with gene_list, one row per element + geneset_df = pd.DataFrame( + { + "gene_id": gene_list, + "geneset": score_name, + } + ) + # run decoupler's run_aucell + dc.run_aucell( + adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw + ) + # copy .obsm['aucell_estimate'] matrix columns to adata.obs using the column names + adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name] + + +def run_for_genelists( + adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols" +): + if len(gene_lists) == len(score_names): + for gene_list, score_names in zip(gene_lists, score_names): + genes = gene_list.split(",") + ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index + score_genes_aucell( + adata, + ens_gene_ids, + f"AUCell_{score_names}", + use_raw, + ) + else: + raise ValueError( + "The number of gene lists (separated by :) and score names (separated by :) must be the same" + ) + + +if __name__ == "__main__": + # Create command-line arguments parser + parser = argparse.ArgumentParser(description="Score genes using Aucell") + parser.add_argument("--input_file", type=str, help="Path to input AnnData file") + parser.add_argument("--output_file", type=str, help="Path to output file") + parser.add_argument("--gmt_file", type=str, help="Path to GMT file", required=False) + # add argument for gene sets to score + parser.add_argument( + "--gene_sets_to_score", + type=str, + required=False, + help="Comma separated list of gene sets to score (the need to be in the gmt file)", + ) + # add argument for gene list (comma separated) to score + parser.add_argument( + "--gene_lists_to_score", + type=str, + required=False, + help="Comma separated list of genes to score. You can have more than one set of genes, separated by colon :", + ) + # argument for the score name when using the gene list + parser.add_argument( + "--score_names", + type=str, + required=False, + help="Name of the score column when using the gene list. You can have more than one set of score names, separated by colon :. It should be the same length as the number of gene lists.", + ) + parser.add_argument( + "--gene_symbols_field", + type=str, + help="Name of the gene symbols field in the AnnData object", + ) + parser.add_argument("--use_raw", action="store_true", help="Use raw data") + parser.add_argument( + "--write_anndata", action="store_true", help="Write the modified AnnData object" + ) + + # Parse command-line arguments + args = parser.parse_args() + + # Load input AnnData object + adata = anndata.read_h5ad(args.input_file) + + if args.gene_sets_to_score is not None and args.gmt_file is not None: + # Load MSigDB file in GMT format + msigdb = read_gmt(args.gmt_file) + + gene_sets_to_score = args.gene_sets_to_score.split(",") + # Score genes by their ensembl ids using the score_genes_aucell function + for _, row in msigdb.iterrows(): + gene_set_name = row["gene_set_name"] + if gene_set_name in gene_sets_to_score: + genes = row["genes"].split(",") + # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set + ens_gene_ids = adata.var[ + adata.var[args.gene_symbols_field].isin(genes) + ].index + score_genes_aucell( + adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw + ) + elif args.gene_lists_to_score is not None and args.score_names is not None: + gene_lists = args.gene_lists_to_score.split(":") + score_names = args.score_names.split(",") + run_for_genelists( + adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field + ) + + # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns + if args.write_anndata: + adata.write_h5ad(args.output_file) + else: + new_columns = [col for col in adata.obs.columns if col.startswith("AUCell_")] + adata.obs[new_columns].to_csv(args.output_file, sep="\t", index=True) diff -r 130e25d3ce92 -r 4fa5f370599f test-data/mouse_hallmark_ss.gmt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mouse_hallmark_ss.gmt Thu Nov 09 11:35:57 2023 +0000 @@ -0,0 +1,2 @@ +HALLMARK_NOTCH_SIGNALING http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_NOTCH_SIGNALING Jag1 Notch1 Notch2 Notch3 Ccnd1 Tcf7l2 Wnt5a Lfng Psenen Psen2 Heyl Fzd1 Rbx1 Hes1 Arrb1 Ppard Prkca Wnt2 Fzd5 Dtx1 Sap30 Dtx2 Kat2a Dll1 Fzd7 St3gal6 Fbxw11 Cul1 Aph1a Dtx4 Skp1 Maml2 +HALLMARK_APICAL_SURFACE http://www.gsea-msigdb.org/gsea/msigdb/mouse/geneset/HALLMARK_APICAL_SURFACE Adam10 Gata3 B4galt1 Hspb1 App Il2rg Atp8b1 Brca1 Slc34a3 Atp6v0a4 Ghrl Ncoa6 Rhcg Scube1 Efna5 Crybg1 Ephb4 Flot2 Gas1 Gstm5 Il2rb Shroom2 Lyn Mal Plaur Slc2a4 Thy1 Akap7 Srpx Cx3cl1 Dcbld2 Lypd3 Pkhd1 Sulf2 Pcsk9 Tmem8b Rtn4rl1 Ntng1 Slc22a12 Adipor2 Afap1l2 Mdga1 Cd160 Crocc