Repository 'decoupler_pseudobulk'
hg clone https://toolshed.g2.bx.psu.edu/repos/ebi-gxa/decoupler_pseudobulk

Changeset 0:59a7f3f83aec (2023-09-24)
Next changeset 1:046d8ff974ff (2023-09-27)
Commit message:
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit 20f4a739092bd05106d5de170523ad61d66e41fc
added:
decoupler_pseudobulk.py
decoupler_pseudobulk.xml
get_test_data.sh
b
diff -r 000000000000 -r 59a7f3f83aec decoupler_pseudobulk.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decoupler_pseudobulk.py Sun Sep 24 08:44:24 2023 +0000
[
b'@@ -0,0 +1,365 @@\n+import argparse\n+\n+import anndata\n+import decoupler\n+import pandas as pd\n+\n+\n+def get_pseudobulk(\n+    adata,\n+    sample_col,\n+    groups_col,\n+    layer=None,\n+    mode="sum",\n+    min_cells=10,\n+    min_counts=1000,\n+    use_raw=False,\n+):\n+    """\n+    >>> import scanpy as sc\n+    >>> adata = sc.datasets.pbmc68k_reduced()\n+    >>> adata.X = abs(adata.X).astype(int)\n+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")\n+    """\n+\n+    return decoupler.get_pseudobulk(\n+        adata,\n+        sample_col=sample_col,\n+        groups_col=groups_col,\n+        layer=layer,\n+        mode=mode,\n+        use_raw=use_raw,\n+        min_cells=min_cells,\n+        min_counts=min_counts,\n+    )\n+\n+\n+def prepend_c_to_index(index_value):\n+    if index_value and index_value[0].isdigit():\n+        return "C" + index_value\n+    return index_value\n+\n+\n+# write results for loading into DESeq2\n+def write_DESeq2_inputs(pdata, layer=None, output_dir="", factor_fields=None):\n+    """\n+    >>> import scanpy as sc\n+    >>> adata = sc.datasets.pbmc68k_reduced()\n+    >>> adata.X = abs(adata.X).astype(int)\n+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")\n+    >>> write_DESeq2_inputs(pseudobulk)\n+    """\n+    # add / to output_dir if is not empty or if it doesn\'t end with /\n+    if output_dir != "" and not output_dir.endswith("/"):\n+        output_dir = output_dir + "/"\n+    obs_for_deseq = pdata.obs.copy()\n+    # replace any index starting with digits to start with C instead.\n+    obs_for_deseq.rename(index=prepend_c_to_index, inplace=True)\n+    # avoid dash that is read as point on R colnames.\n+    obs_for_deseq.index = obs_for_deseq.index.str.replace("-", "_")\n+    obs_for_deseq.index = obs_for_deseq.index.str.replace(" ", "_")\n+    col_metadata_file = f"{output_dir}col_metadata.csv"\n+    # write obs to a col_metadata file\n+    if factor_fields:\n+        # only output 
the index plus the columns in factor_fields in that order\n+        obs_for_deseq[factor_fields].to_csv(col_metadata_file, sep=",", index=True)\n+    else:\n+        obs_for_deseq.to_csv(col_metadata_file, sep=",", index=True)\n+    # write var to a gene_metadata file\n+    pdata.var.to_csv(f"{output_dir}gene_metadata.csv", sep=",", index=True)\n+    # write the counts matrix of a specified layer to file\n+    if layer is None:\n+        # write the X numpy matrix transposed to file\n+        df = pd.DataFrame(pdata.X.T, index=pdata.var.index, columns=obs_for_deseq.index)\n+    else:\n+        df = pd.DataFrame(\n+            pdata.layers[layer].T, index=pdata.var.index, columns=obs_for_deseq.index\n+        )\n+    df.to_csv(f"{output_dir}counts_matrix.csv", sep=",", index_label="")\n+\n+\n+def plot_pseudobulk_samples(\n+    pseudobulk_data,\n+    groupby,\n+    figsize=(10, 10),\n+    save_path=None,\n+):\n+    """\n+    >>> import scanpy as sc\n+    >>> adata = sc.datasets.pbmc68k_reduced()\n+    >>> adata.X = abs(adata.X).astype(int)\n+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")\n+    >>> plot_pseudobulk_samples(pseudobulk, groupby=["bulk_labels", "louvain"], figsize=(10, 10))\n+    """\n+    fig = decoupler.plot_psbulk_samples(\n+        pseudobulk_data, groupby=groupby, figsize=figsize, return_fig=True\n+    )\n+    if save_path:\n+        fig.savefig(f"{save_path}/pseudobulk_samples.png")\n+    else:\n+        fig.show()\n+\n+\n+def plot_filter_by_expr(\n+    pseudobulk_data, group, min_count=None, min_total_count=None, save_path=None\n+):\n+    """\n+    >>> import scanpy as sc\n+    >>> adata = sc.datasets.pbmc68k_reduced()\n+    >>> adata.X = abs(adata.X).astype(int)\n+    >>> pseudobulk = get_pseudobulk(adata, "bulk_labels", "louvain")\n+    >>> plot_filter_by_expr(pseudobulk, group="bulk_labels", min_count=10, min_total_count=200)\n+    """\n+    fig = decoupler.plot_filter_by_expr(\n+        pseudobulk_data,\n+        
group=group,\n+        min_count=min_count,\n+        min_total_count=min_total_count,\n+        return_fig=True,\n+    )\n+    if sav'..b'        The AnnData object\n+\n+    Returns\n+    -------\n+    anndata.AnnData\n+        The merged AnnData object\n+\n+    docstring tests:\n+    >>> import scanpy as sc\n+    >>> ad = sc.datasets.pbmc68k_reduced()\n+    >>> ad = merge_adata_obs_fields(["bulk_labels","louvain"], ad)\n+    >>> ad.obs.columns\n+    Index([\'bulk_labels\', \'n_genes\', \'percent_mito\', \'n_counts\', \'S_score\',\n+           \'G2M_score\', \'phase\', \'louvain\', \'bulk_labels_louvain\'],\n+          dtype=\'object\')\n+    """\n+    field_name = "_".join(obs_fields_to_merge)\n+    for field in obs_fields_to_merge:\n+        if field not in adata.obs.columns:\n+            raise ValueError(f"The \'{field}\' column is not present in adata.obs.")\n+        if field_name not in adata.obs.columns:\n+            adata.obs[field_name] = adata.obs[field].astype(str)\n+        else:\n+            adata.obs[field_name] = (\n+                adata.obs[field_name] + "_" + adata.obs[field].astype(str)\n+            )\n+    return adata\n+\n+\n+if __name__ == "__main__":\n+    # Create argument parser\n+    parser = argparse.ArgumentParser(\n+        description="Perform pseudobulk analysis on an AnnData object"\n+    )\n+\n+    # Add arguments\n+    parser.add_argument("adata_file", type=str, help="Path to the AnnData file")\n+    parser.add_argument(\n+        "-m",\n+        "--adata_obs_fields_to_merge",\n+        type=str,\n+        help="Fields in adata.obs to merge, comma separated",\n+    )\n+    parser.add_argument(\n+        "--groupby",\n+        type=str,\n+        required=True,\n+        help="The column in adata.obs that defines the groups",\n+    )\n+    parser.add_argument(\n+        "--sample_key",\n+        required=True,\n+        type=str,\n+        help="The column in adata.obs that defines the samples",\n+    )\n+    # add argument for 
layer\n+    parser.add_argument(\n+        "--layer",\n+        type=str,\n+        default=None,\n+        help="The name of the layer of the AnnData object to use",\n+    )\n+    # add argument for mode\n+    parser.add_argument(\n+        "--mode",\n+        type=str,\n+        default="sum",\n+        help="The mode for Decoupler pseudobulk analysis",\n+        choices=["sum", "mean", "median"],\n+    )\n+    # add boolean argument for use_raw\n+    parser.add_argument(\n+        "--use_raw",\n+        action="store_true",\n+        default=False,\n+        help="Whether to use the raw part of the AnnData object",\n+    )\n+    # add argument for min_cells\n+    parser.add_argument(\n+        "--min_cells",\n+        type=int,\n+        default=10,\n+        help="Minimum number of cells for pseudobulk analysis",\n+    )\n+    parser.add_argument(\n+        "--save_path", type=str, help="Path to save the plot (optional)"\n+    )\n+    parser.add_argument(\n+        "--min_counts",\n+        type=int,\n+        help="Minimum count threshold for filtering by expression",\n+    )\n+    parser.add_argument(\n+        "--min_total_counts",\n+        type=int,\n+        help="Minimum total count threshold for filtering by expression",\n+    )\n+    parser.add_argument(\n+        "--anndata_output_path",\n+        type=str,\n+        help="Path to save the filtered AnnData object or pseudobulk data",\n+    )\n+    parser.add_argument(\n+        "--filter_expr", action="store_true", help="Enable filtering by expression"\n+    )\n+    parser.add_argument(\n+        "--factor_fields",\n+        type=str,\n+        help="Comma separated list of fields for the factors",\n+    )\n+    parser.add_argument(\n+        "--deseq2_output_path",\n+        type=str,\n+        help="Path to save the DESeq2 inputs",\n+        required=True,\n+    )\n+    parser.add_argument(\n+        "--plot_samples_figsize",\n+        type=int,\n+        default=[10, 10],\n+        nargs=2,\n+      
  help="Size of the samples plot as a tuple (two arguments)",\n+    )\n+    parser.add_argument("--plot_filtering_figsize", type=int, default=[10, 10], nargs=2)\n+\n+    # Parse the command line arguments\n+    args = parser.parse_args()\n+\n+    # Call the main function\n+    main(args)\n'
b
diff -r 000000000000 -r 59a7f3f83aec decoupler_pseudobulk.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/decoupler_pseudobulk.xml Sun Sep 24 08:44:24 2023 +0000
[
b'@@ -0,0 +1,161 @@\n+<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy0" profile="20.05">\n+    <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description>\n+    <requirements>\n+        <requirement type="package" version="1.4.0">decoupler</requirement>\n+    </requirements>\n+    <command detect_errors="exit_code"><![CDATA[\n+mkdir deseq_output_dir &&\n+mkdir plots_output_dir &&\n+python \'$__tool_directory__/decoupler_pseudobulk.py\' $input_file\n+    #if $adata_obs_fields_to_merge:\n+    --adata_obs_fields_to_merge $adata_obs_fields_to_merge\n+    #end if\n+    --groupby $groupby\n+    --sample_key $sample_key\n+    #if $layer:\n+    --layer $layer\n+    #end if\n+    --mode $mode\n+    #if $use_raw:\n+    --use_raw\n+    #end if\n+    #if $min_cells:\n+    --min_cells $min_cells\n+    #end if\n+    #if $produce_plots:\n+    --save_path plots_output_dir\n+    #end if\n+    #if $min_counts:\n+    --min_counts $min_counts\n+    #end if\n+    #if $min_total_counts:\n+    --min_total_counts $min_total_counts\n+    #end if\n+    #if $produce_anndata:\n+    --anndata_output_path $pbulk_anndata\n+    #end if\n+    #if $filter_expr:\n+    --filter_expr\n+    #end if\n+    #if $factor_fields:\n+    --factor_fields \'$factor_fields\'\n+    #end if\n+    --deseq2_output_path deseq_output_dir\n+    --plot_samples_figsize $plot_samples_figsize\n+    --plot_filtering_figsize $plot_filtering_figsize\n+]]></command>\n+    <environment_variables>\n+        <environment_variable name="NUMBA_CACHE_DIR">\\$_GALAXY_JOB_TMP_DIR</environment_variable>\n+        <environment_variable name="MPLCONFIGDIR">\\$_GALAXY_JOB_TMP_DIR</environment_variable>\n+    </environment_variables>\n+    <inputs>\n+        <param type="data" name="input_file" format="data" label="Input AnnData file"/>\n+        <param type="text" name="adata_obs_fields_to_merge" label="Obs Fields to Merge" optional="true" help="Fields in adata.obs to 
merge, comma separated (optional). They will be available as field1_field2_field3 in the AnnData Obs dataframe."/>\n+        <param type="text" name="groupby" label="Groupby column" help="The column in adata.obs that defines the groups. Merged columns in the above field are available here."/>\n+        <param type="text" name="sample_key" label="Sample Key column" help="The column in adata.obs that defines the samples. Merged columns in the above field are available here."/>\n+        <param type="text" name="layer" label="Layer" optional="true" help="The name of the layer of the AnnData object to use. It needs to be present in the AnnData object."/>\n+        <param type="select" name="mode" label="Decoupler pseudobulk Mode" optional="true" help="Determines how counts are aggregated across cells with the specificied groups: sum, mean or median.">\n+            <option value="sum" selected="true">sum</option>\n+            <option value="mean">mean</option>\n+            <option value="median">median</option>\n+        </param>\n+        <param type="text" name="factor_fields" label="Factor Fields" optional="true" help="Fields in adata.obs to use as factors, comma separated (optional). For EdgeR make sure that the first field is the main contrast field desired and the rest of the fields are the covariates desired. 
Decoupler produces two fields in the intermediate AnnData (which can be added here if desired for covariates): psbulk_n_cells and psbulk_counts."/>\n+        <param type="boolean" name="use_raw" label="Use Raw" optional="true"/>\n+        <param type="integer" name="min_cells" label="Minimum Cells" optional="true"/>\n+        <param type="boolean" name="produce_plots" label="Produce plots"/>\n+        <param type="boolean" name="produce_anndata" label="Produce AnnData with Pseudo-bulk"/>\n+        <param type="integer" name="min_counts" label="Minimum Counts" optional="true"/>\n+        <param type="integer" name="min_total_counts" label="Minimum Total Counts" optional="true"/>\n+        <param type="boolean" name="filter_expr" labe'..b'+    <tests>\n+        <test expect_num_outputs="6">\n+            <param name="input_file" value="mito_counted_anndata.h5ad"/>\n+            <param name="adata_obs_fields_to_merge" value="batch,sex"/>\n+            <param name="groupby" value="batch_sex"/>\n+            <param name="sample_key" value="genotype"/>\n+            <param name="factor_fields" value="genotype,batch_sex"/>\n+            <param name="mode" value="sum"/>\n+            <param name="min_cells" value="10"/>\n+            <param name="produce_plots" value="true"/>\n+            <param name="produce_anndata" value="true"/>\n+            <param name="min_counts" value="10"/>\n+            <param name="min_total_counts" value="1000"/>\n+            <param name="filter_expr" value="true"/>\n+            <param name="plot_samples_figsize" value="10 10"/>\n+            <param name="plot_filtering_figsize" value="10 10"/>\n+            <output name="pbulk_anndata" ftype="h5ad">\n+                <assert_contents>\n+                    <has_h5_keys keys="obs/psbulk_n_cells"/>\n+                </assert_contents>\n+            </output>\n+            <output name="count_matrix" ftype="csv">\n+                <assert_contents>\n+                    <has_n_lines 
n="3620"/>\n+                </assert_contents>\n+            </output>\n+            <output name="samples_metadata" ftype="csv">\n+                <assert_contents>\n+                    <has_n_lines n="8"/>\n+                </assert_contents>\n+            </output>\n+            <output name="genes_metadata" ftype="csv">\n+                <assert_contents>\n+                    <has_n_lines n="3620"/>\n+                </assert_contents>\n+            </output>\n+            <output name="plot_output" ftype="png">\n+                <assert_contents>\n+                    <has_size value="31853" delta="3000"/>\n+                </assert_contents>\n+            </output>\n+            <output name="filter_by_expr_plot" ftype="png">\n+                <assert_contents>\n+                    <has_size value="21656" delta="2000"/>\n+                </assert_contents>\n+            </output>\n+        </test>\n+    </tests>\n+    <help>\n+        <![CDATA[\n+        This tool performs pseudobulk analysis and filtering using Decoupler-py. Provide the input AnnData file and specify the necessary parameters.\n+\n+        - Input AnnData file: The input AnnData file to be processed.\n+        - Obs Fields to Merge: Fields in adata.obs to merge, comma separated (optional).\n+        - Groupby column: The column in adata.obs that defines the groups.\n+        - Sample Key column: The column in adata.obs that defines the samples.\n+        - Layer (optional): The name of the layer of the AnnData object to use.\n+        - Mode: The mode for Decoupler pseudobulk analysis (sum, mean, median). Sum by default.\n+        - Factor Fields (optional): Fields in adata.obs to use as factors, comma separated (optional). 
For EdgeR make sure that the first field is the main contrast field desired and the rest of the fields are the covariates desired.\n+        - Use Raw: Whether to use the raw part of the AnnData object.\n+        - Minimum Cells: Minimum number of cells for pseudobulk analysis (optional).\n+        - Minimum Counts: Minimum count threshold for filtering by expression (optional).\n+        - Minimum Total Counts: Minimum total count threshold for filtering by expression (optional).\n+        - Enable Filtering by Expression: Check this box to enable filtering by expression.\n+        - Plot Samples Figsize: Size of the samples plot as a tuple (two arguments).\n+        - Plot Filtering Figsize: Size of the filtering plot as a tuple (two arguments).\n+\n+        The tool will output the filtered AnnData, count matrix, samples metadata, genes metadata (in DESeq2 format), and the pseudobulk plot and filter by expression plot (if enabled).\n+\n+        ]]>\n+    </help>\n+    <citations>\n+        <citation type="doi">doi.org/10.1093/bioadv/vbac016</citation>\n+    </citations>\n+</tool>\n'
b
diff -r 000000000000 -r 59a7f3f83aec get_test_data.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/get_test_data.sh Sun Sep 24 08:44:24 2023 +0000
[
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+BASENAME_FILE='mito_counted_anndata.h5ad'  # local file name the download is saved under (inside test-data/)
+
+MTX_LINK='https://zenodo.org/record/7053673/files/Mito-counted_AnnData'  # remote URL handed to get_data as the download source
+
+# convenience for getting data
+function get_data {
+  local link=$1
+  local fname=$2
+
+  if [ ! -f $fname ]; then
+    echo "$fname not available locally, downloading.."
+    wget -O $fname --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 $link
+  fi
+}
+
+# fetch the AnnData test file into test-data/; stop if the directory cannot be entered
+mkdir -p test-data
+pushd test-data || exit 1
+get_data "$MTX_LINK" "$BASENAME_FILE"