Repository 'decoupler_pseudobulk'
hg clone https://toolshed.g2.bx.psu.edu/repos/ebi-gxa/decoupler_pseudobulk

Changeset 7:68a2b5445558 (2024-04-16)
Previous changeset 6:ed2a77422e00 (2024-04-15) Next changeset 8:93f61ea19336 (2024-07-15)
Commit message:
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b01245159f9cb67101497bb974b2c13bcee019b7
modified:
decoupler_aucell_score.py
b
diff -r ed2a77422e00 -r 68a2b5445558 decoupler_aucell_score.py
--- a/decoupler_aucell_score.py Mon Apr 15 13:20:32 2024 +0000
+++ b/decoupler_aucell_score.py Tue Apr 16 11:49:25 2024 +0000
[
b'@@ -5,11 +5,12 @@\n import anndata\n import decoupler as dc\n import pandas as pd\n+import numba as nb\n \n \n-def read_gmt(gmt_file):\n+def read_gmt_long(gmt_file):\n     """\n-    Reads a GMT file into a Pandas DataFrame.\n+    Reads a GMT file and produce a Pandas DataFrame in long format, ready to be passed to the AUCell method.\n \n     Parameters\n     ----------\n@@ -19,7 +20,7 @@\n     Returns\n     -------\n     pd.DataFrame\n-        A DataFrame with the gene sets. Each row represents a gene set, and the columns are "gene_set_name", "gene_set_url", and "genes".\n+        A DataFrame with the gene sets. Each row represents a gene set to gene assignment, and the columns are "gene_set_name" and "genes".\n     >>> line = "HALLMARK_NOTCH_SIGNALING\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\\tJAG1\\\\tNOTCH3\\\\tNOTCH2\\\\tAPH1A\\\\tHES1\\\\tCCND1\\\\tFZD1\\\\tPSEN2\\\\tFZD7\\\\tDTX1\\\\tDLL1\\\\tFZD5\\\\tMAML2\\\\tNOTCH1\\\\tPSENEN\\\\tWNT5A\\\\tCUL1\\\\tWNT2\\\\tDTX4\\\\tSAP30\\\\tPPARD\\\\tKAT2A\\\\tHEYL\\\\tSKP1\\\\tRBX1\\\\tTCF7L2\\\\tARRB1\\\\tLFNG\\\\tPRKCA\\\\tDTX2\\\\tST3GAL6\\\\tFBXW11\\\\n"\n     >>> line2 = "HALLMARK_APICAL_SURFACE\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\\tB4GALT1\\\\tRHCG\\\\tMAL\\\\tLYPD3\\\\tPKHD1\\\\tATP6V0A4\\\\tCRYBG1\\\\tSHROOM2\\\\tSRPX\\\\tMDGA1\\\\tTMEM8B\\\\tTHY1\\\\tPCSK9\\\\tEPHB4\\\\tDCBLD2\\\\tGHRL\\\\tLYN\\\\tGAS1\\\\tFLOT2\\\\tPLAUR\\\\tAKAP7\\\\tATP8B1\\\\tEFNA5\\\\tSLC34A3\\\\tAPP\\\\tGSTM3\\\\tHSPB1\\\\tSLC2A4\\\\tIL2RB\\\\tRTN4RL1\\\\tNCOA6\\\\tSULF2\\\\tADAM10\\\\tBRCA1\\\\tGATA3\\\\tAFAP1L2\\\\tIL2RG\\\\tCD160\\\\tADIPOR2\\\\tSLC22A12\\\\tNTNG1\\\\tSCUBE1\\\\tCX3CL1\\\\tCROCC\\\\n"\n     >>> temp_dir = tempfile.gettempdir()\n@@ -29,81 +30,91 @@\n     ...   f.write(line2)\n     288\n     380\n-    >>> df = read_gmt(temp_gmt)\n+    >>> df = read_gmt_long(temp_gmt)\n     >>> df.shape[0]\n-    2\n-    >>> df.columns == ["gene_set_name", "genes"]\n-    array([ True,  True])\n-    >>> df.loc[df["gene_set_name"] == "HALLMARK_APICAL_SURFACE"].genes.tolist()[0].startswith("B4GALT1")\n-    True\n+    76\n+    >>> len(df.loc[df["gene_set"] == "HALLMARK_APICAL_SURFACE"].gene.tolist())\n+    44\n     """\n+    # Create a list of dictionaries, where each dictionary represents a gene set\n+    gene_sets = {}\n+\n     # Read the GMT file into a list of lines\n     with open(gmt_file, "r") as f:\n-        lines = f.readlines()\n+        while True:\n+            line = f.readline()\n+            if not line:\n+                break\n+            fields = line.strip().split("\\t")\n+            gene_sets[fields[0]]= fields[2:]\n \n-    # Create a list of dictionaries, where each dictionary represents a gene set\n-    gene_sets = []\n-    for line in lines:\n-        fields = line.strip().split("\\t")\n-        gene_set = {"gene_set_name": fields[0], "genes": ",".join(fields[2:])}\n-        gene_sets.append(gene_set)\n-\n-    # Convert the list of dictionaries to a DataFrame\n-    return pd.DataFrame(gene_sets)\n+    return pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n \n \n-def score_genes_aucell(\n-    adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False, min_n_genes=5\n-):\n+def score_genes_aucell_mt(adata: anndata.AnnData, gene_set_gene: pd.DataFrame, use_raw=False, min_n_genes=5, var_gene_symbols_field=None):\n     """Score genes using Aucell.\n \n     Parameters\n     ----------\n     adata : anndata.AnnData\n-    gene_list : list\n-    score_names : str\n-    use_raw : bool, optional\n+    gene_set_gene: pd.DataFrame with columns gene_set and gene\n+    use_raw : bool, optional, False by default.\n+    min_n_genes : int, optional, 5 by default.\n+    var_gene_symbols_field : str, optional, None by default. The field in var where gene symbols are stored\n \n     >>> import scanpy as sc\n     >>> import decoupler as dc\n     >>> adata = sc.datasets.pbmc68k_reduced()\n-    >>> gene_list = adata.var[adata.var.index.str.startswith("RP")].index.tolist()\n-    >>> score_genes_aucell(adata, gene_list, "ribosomal_aucell", use_raw=False)\n-    >>> "ribosomal_aucell" in adata.obs.columns\n+    >>> r_ge'..b'_set_gene[\'gene\'] = gene_set_gene[\'gene_id\']\n+    \n     # run decoupler\'s run_aucell\n-    # catch the value error\n-    try:\n-        dc.run_aucell(\n-            adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw\n+    dc.run_aucell(\n+            adata, net=gene_set_gene, source="gene_set", target="gene", use_raw=use_raw, min_n=min_n_genes\n         )\n-        # copy .obsm[\'aucell_estimate\'] matrix columns to adata.obs using the column names\n-        adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name]\n-    except ValueError as ve:\n-        print(f"Gene list {score_name} failed, skipping: {str(ve)}")\n+    for gs in gene_set_gene.gene_set.unique():\n+        if gs in adata.obsm[\'aucell_estimate\'].keys():\n+            adata.obs[f"AUCell_{gs}"] = adata.obsm["aucell_estimate"][gs]\n \n \n def run_for_genelists(\n-    adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols", min_n_genes=5\n+    adata, gene_lists, score_names, use_raw=False, gene_symbols_field=None, min_n_genes=5\n ):\n     if len(gene_lists) == len(score_names):\n         for gene_list, score_names in zip(gene_lists, score_names):\n             genes = gene_list.split(",")\n-            ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index\n-            score_genes_aucell(\n+            gene_sets = {}\n+            gene_sets[score_names] = genes\n+            gene_set_gene_df = pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n+            \n+            score_genes_aucell_mt(\n                 adata,\n-                ens_gene_ids,\n-                f"AUCell_{score_names}",\n+                gene_set_gene_df,\n                 use_raw,\n-                min_n_genes\n+                min_n_genes,\n+                var_gene_symbols_field=gene_symbols_field\n             )\n     else:\n         raise ValueError(\n@@ -160,32 +171,31 @@\n     parser.add_argument(\n         "--write_anndata", action="store_true", help="Write the modified AnnData object"\n     )\n+    # argument for number of max concurrent processes\n+    parser.add_argument("--max_threads", type=int, required=False, default=1, help="Number of max concurrent threads")\n+\n \n     # Parse command-line arguments\n     args = parser.parse_args()\n \n+    nb.set_num_threads(n=args.max_threads)\n+\n     # Load input AnnData object\n     adata = anndata.read_h5ad(args.input_file)\n \n     if args.gmt_file is not None:\n         # Load MSigDB file in GMT format\n-        msigdb = read_gmt(args.gmt_file)\n+        # msigdb = read_gmt(args.gmt_file)\n+        msigdb = read_gmt_long(args.gmt_file)\n \n         gene_sets_to_score = (\n             args.gene_sets_to_score.split(",") if args.gene_sets_to_score else []\n         )\n-        # Score genes by their ensembl ids using the score_genes_aucell function\n-        for _, row in msigdb.iterrows():\n-            gene_set_name = row["gene_set_name"]\n-            if not gene_sets_to_score or gene_set_name in gene_sets_to_score:\n-                genes = row["genes"].split(",")\n-                # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set\n-                ens_gene_ids = adata.var[\n-                    adata.var[args.gene_symbols_field].isin(genes)\n-                ].index\n-                score_genes_aucell(\n-                    adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw, args.min_n\n-                )\n+        if gene_sets_to_score:\n+            # we limit the GMT file read to the genesets specified in the gene_sets_to_score argument\n+            msigdb = msigdb[msigdb["gene_set"].isin(gene_sets_to_score)]\n+        \n+        score_genes_aucell_mt(adata, msigdb, args.use_raw, args.min_n, var_gene_symbols_field=args.gene_symbols_field)\n     elif args.gene_lists_to_score is not None and args.score_names is not None:\n         gene_lists = args.gene_lists_to_score.split(":")\n         score_names = args.score_names.split(",")\n'