Previous changeset 6:ed2a77422e00 (2024-04-15) Next changeset 8:93f61ea19336 (2024-07-15) |
Commit message:
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b01245159f9cb67101497bb974b2c13bcee019b7 |
modified:
decoupler_aucell_score.py |
b |
diff -r ed2a77422e00 -r 68a2b5445558 decoupler_aucell_score.py --- a/decoupler_aucell_score.py Mon Apr 15 13:20:32 2024 +0000 +++ b/decoupler_aucell_score.py Tue Apr 16 11:49:25 2024 +0000 |
[ |
b'@@ -5,11 +5,12 @@\n import anndata\n import decoupler as dc\n import pandas as pd\n+import numba as nb\n \n \n-def read_gmt(gmt_file):\n+def read_gmt_long(gmt_file):\n """\n- Reads a GMT file into a Pandas DataFrame.\n+ Reads a GMT file and produce a Pandas DataFrame in long format, ready to be passed to the AUCell method.\n \n Parameters\n ----------\n@@ -19,7 +20,7 @@\n Returns\n -------\n pd.DataFrame\n- A DataFrame with the gene sets. Each row represents a gene set, and the columns are "gene_set_name", "gene_set_url", and "genes".\n+ A DataFrame with the gene sets. Each row represents a gene set to gene assignment, and the columns are "gene_set_name" and "genes".\n >>> line = "HALLMARK_NOTCH_SIGNALING\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\\tJAG1\\\\tNOTCH3\\\\tNOTCH2\\\\tAPH1A\\\\tHES1\\\\tCCND1\\\\tFZD1\\\\tPSEN2\\\\tFZD7\\\\tDTX1\\\\tDLL1\\\\tFZD5\\\\tMAML2\\\\tNOTCH1\\\\tPSENEN\\\\tWNT5A\\\\tCUL1\\\\tWNT2\\\\tDTX4\\\\tSAP30\\\\tPPARD\\\\tKAT2A\\\\tHEYL\\\\tSKP1\\\\tRBX1\\\\tTCF7L2\\\\tARRB1\\\\tLFNG\\\\tPRKCA\\\\tDTX2\\\\tST3GAL6\\\\tFBXW11\\\\n"\n >>> line2 = "HALLMARK_APICAL_SURFACE\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\\tB4GALT1\\\\tRHCG\\\\tMAL\\\\tLYPD3\\\\tPKHD1\\\\tATP6V0A4\\\\tCRYBG1\\\\tSHROOM2\\\\tSRPX\\\\tMDGA1\\\\tTMEM8B\\\\tTHY1\\\\tPCSK9\\\\tEPHB4\\\\tDCBLD2\\\\tGHRL\\\\tLYN\\\\tGAS1\\\\tFLOT2\\\\tPLAUR\\\\tAKAP7\\\\tATP8B1\\\\tEFNA5\\\\tSLC34A3\\\\tAPP\\\\tGSTM3\\\\tHSPB1\\\\tSLC2A4\\\\tIL2RB\\\\tRTN4RL1\\\\tNCOA6\\\\tSULF2\\\\tADAM10\\\\tBRCA1\\\\tGATA3\\\\tAFAP1L2\\\\tIL2RG\\\\tCD160\\\\tADIPOR2\\\\tSLC22A12\\\\tNTNG1\\\\tSCUBE1\\\\tCX3CL1\\\\tCROCC\\\\n"\n >>> temp_dir = tempfile.gettempdir()\n@@ -29,81 +30,91 @@\n ... f.write(line2)\n 288\n 380\n- >>> df = read_gmt(temp_gmt)\n+ >>> df = read_gmt_long(temp_gmt)\n >>> df.shape[0]\n- 2\n- >>> df.columns == ["gene_set_name", "genes"]\n- array([ True, True])\n- >>> df.loc[df["gene_set_name"] == "HALLMARK_APICAL_SURFACE"].genes.tolist()[0].startswith("B4GALT1")\n- True\n+ 76\n+ >>> len(df.loc[df["gene_set"] == "HALLMARK_APICAL_SURFACE"].gene.tolist())\n+ 44\n """\n+ # Create a list of dictionaries, where each dictionary represents a gene set\n+ gene_sets = {}\n+\n # Read the GMT file into a list of lines\n with open(gmt_file, "r") as f:\n- lines = f.readlines()\n+ while True:\n+ line = f.readline()\n+ if not line:\n+ break\n+ fields = line.strip().split("\\t")\n+ gene_sets[fields[0]]= fields[2:]\n \n- # Create a list of dictionaries, where each dictionary represents a gene set\n- gene_sets = []\n- for line in lines:\n- fields = line.strip().split("\\t")\n- gene_set = {"gene_set_name": fields[0], "genes": ",".join(fields[2:])}\n- gene_sets.append(gene_set)\n-\n- # Convert the list of dictionaries to a DataFrame\n- return pd.DataFrame(gene_sets)\n+ return pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n \n \n-def score_genes_aucell(\n- adata: anndata.AnnData, gene_list: list, score_name: str, use_raw=False, min_n_genes=5\n-):\n+def score_genes_aucell_mt(adata: anndata.AnnData, gene_set_gene: pd.DataFrame, use_raw=False, min_n_genes=5, var_gene_symbols_field=None):\n """Score genes using Aucell.\n \n Parameters\n ----------\n adata : anndata.AnnData\n- gene_list : list\n- score_names : str\n- use_raw : bool, optional\n+ gene_set_gene: pd.DataFrame with columns gene_set and gene\n+ use_raw : bool, optional, False by default.\n+ min_n_genes : int, optional, 5 by default.\n+ var_gene_symbols_field : str, optional, None by default. The field in var where gene symbols are stored\n \n >>> import scanpy as sc\n >>> import decoupler as dc\n >>> adata = sc.datasets.pbmc68k_reduced()\n- >>> gene_list = adata.var[adata.var.index.str.startswith("RP")].index.tolist()\n- >>> score_genes_aucell(adata, gene_list, "ribosomal_aucell", use_raw=False)\n- >>> "ribosomal_aucell" in adata.obs.columns\n+ >>> r_ge'..b'_set_gene[\'gene\'] = gene_set_gene[\'gene_id\']\n+ \n # run decoupler\'s run_aucell\n- # catch the value error\n- try:\n- dc.run_aucell(\n- adata, net=geneset_df, source="geneset", target="gene_id", use_raw=use_raw\n+ dc.run_aucell(\n+ adata, net=gene_set_gene, source="gene_set", target="gene", use_raw=use_raw, min_n=min_n_genes\n )\n- # copy .obsm[\'aucell_estimate\'] matrix columns to adata.obs using the column names\n- adata.obs[score_name] = adata.obsm["aucell_estimate"][score_name]\n- except ValueError as ve:\n- print(f"Gene list {score_name} failed, skipping: {str(ve)}")\n+ for gs in gene_set_gene.gene_set.unique():\n+ if gs in adata.obsm[\'aucell_estimate\'].keys():\n+ adata.obs[f"AUCell_{gs}"] = adata.obsm["aucell_estimate"][gs]\n \n \n def run_for_genelists(\n- adata, gene_lists, score_names, use_raw=False, gene_symbols_field="gene_symbols", min_n_genes=5\n+ adata, gene_lists, score_names, use_raw=False, gene_symbols_field=None, min_n_genes=5\n ):\n if len(gene_lists) == len(score_names):\n for gene_list, score_names in zip(gene_lists, score_names):\n genes = gene_list.split(",")\n- ens_gene_ids = adata.var[adata.var[gene_symbols_field].isin(genes)].index\n- score_genes_aucell(\n+ gene_sets = {}\n+ gene_sets[score_names] = genes\n+ gene_set_gene_df = pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n+ \n+ score_genes_aucell_mt(\n adata,\n- ens_gene_ids,\n- f"AUCell_{score_names}",\n+ gene_set_gene_df,\n use_raw,\n- min_n_genes\n+ min_n_genes,\n+ var_gene_symbols_field=gene_symbols_field\n )\n else:\n raise ValueError(\n@@ -160,32 +171,31 @@\n parser.add_argument(\n "--write_anndata", action="store_true", help="Write the modified AnnData object"\n )\n+ # argument for number of max concurrent processes\n+ parser.add_argument("--max_threads", type=int, required=False, default=1, help="Number of max concurrent threads")\n+\n \n # Parse command-line arguments\n args = parser.parse_args()\n \n+ nb.set_num_threads(n=args.max_threads)\n+\n # Load input AnnData object\n adata = anndata.read_h5ad(args.input_file)\n \n if args.gmt_file is not None:\n # Load MSigDB file in GMT format\n- msigdb = read_gmt(args.gmt_file)\n+ # msigdb = read_gmt(args.gmt_file)\n+ msigdb = read_gmt_long(args.gmt_file)\n \n gene_sets_to_score = (\n args.gene_sets_to_score.split(",") if args.gene_sets_to_score else []\n )\n- # Score genes by their ensembl ids using the score_genes_aucell function\n- for _, row in msigdb.iterrows():\n- gene_set_name = row["gene_set_name"]\n- if not gene_sets_to_score or gene_set_name in gene_sets_to_score:\n- genes = row["genes"].split(",")\n- # Convert gene symbols to ensembl ids by using the columns gene_symbols and index in adata.var specific to the gene set\n- ens_gene_ids = adata.var[\n- adata.var[args.gene_symbols_field].isin(genes)\n- ].index\n- score_genes_aucell(\n- adata, ens_gene_ids, f"AUCell_{gene_set_name}", args.use_raw, args.min_n\n- )\n+ if gene_sets_to_score:\n+ # we limit the GMT file read to the genesets specified in the gene_sets_to_score argument\n+ msigdb = msigdb[msigdb["gene_set"].isin(gene_sets_to_score)]\n+ \n+ score_genes_aucell_mt(adata, msigdb, args.use_raw, args.min_n, var_gene_symbols_field=args.gene_symbols_field)\n elif args.gene_lists_to_score is not None and args.score_names is not None:\n gene_lists = args.gene_lists_to_score.split(":")\n score_names = args.score_names.split(",")\n' |