Repository 'decoupler_pseudobulk'
hg clone https://toolshed.g2.bx.psu.edu/repos/ebi-gxa/decoupler_pseudobulk

Changeset 8:93f61ea19336 (2024-07-15)
Previous changeset 7:68a2b5445558 (2024-04-16) Next changeset 9:bd4b54b75888 (2024-09-15)
Commit message:
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit eea5c13f9e6e070a2359c59400773b01f9cd7567
modified:
decoupler_aucell_score.py
decoupler_pathway_inference.py
decoupler_pseudobulk.py
decoupler_pseudobulk.xml
b
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_aucell_score.py
--- a/decoupler_aucell_score.py Tue Apr 16 11:49:25 2024 +0000
+++ b/decoupler_aucell_score.py Mon Jul 15 10:56:42 2024 +0000
[
b'@@ -1,16 +1,15 @@\n import argparse\n-import os\n-import tempfile\n \n import anndata\n import decoupler as dc\n+import numba as nb\n import pandas as pd\n-import numba as nb\n \n \n def read_gmt_long(gmt_file):\n-    """\n-    Reads a GMT file and produce a Pandas DataFrame in long format, ready to be passed to the AUCell method.\n+    r"""\n+    Reads a GMT file and produce a Pandas DataFrame in long format, ready to\n+    be passed to the AUCell method.\n \n     Parameters\n     ----------\n@@ -20,9 +19,29 @@\n     Returns\n     -------\n     pd.DataFrame\n-        A DataFrame with the gene sets. Each row represents a gene set to gene assignment, and the columns are "gene_set_name" and "genes".\n-    >>> line = "HALLMARK_NOTCH_SIGNALING\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\\tJAG1\\\\tNOTCH3\\\\tNOTCH2\\\\tAPH1A\\\\tHES1\\\\tCCND1\\\\tFZD1\\\\tPSEN2\\\\tFZD7\\\\tDTX1\\\\tDLL1\\\\tFZD5\\\\tMAML2\\\\tNOTCH1\\\\tPSENEN\\\\tWNT5A\\\\tCUL1\\\\tWNT2\\\\tDTX4\\\\tSAP30\\\\tPPARD\\\\tKAT2A\\\\tHEYL\\\\tSKP1\\\\tRBX1\\\\tTCF7L2\\\\tARRB1\\\\tLFNG\\\\tPRKCA\\\\tDTX2\\\\tST3GAL6\\\\tFBXW11\\\\n"\n-    >>> line2 = "HALLMARK_APICAL_SURFACE\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\\tB4GALT1\\\\tRHCG\\\\tMAL\\\\tLYPD3\\\\tPKHD1\\\\tATP6V0A4\\\\tCRYBG1\\\\tSHROOM2\\\\tSRPX\\\\tMDGA1\\\\tTMEM8B\\\\tTHY1\\\\tPCSK9\\\\tEPHB4\\\\tDCBLD2\\\\tGHRL\\\\tLYN\\\\tGAS1\\\\tFLOT2\\\\tPLAUR\\\\tAKAP7\\\\tATP8B1\\\\tEFNA5\\\\tSLC34A3\\\\tAPP\\\\tGSTM3\\\\tHSPB1\\\\tSLC2A4\\\\tIL2RB\\\\tRTN4RL1\\\\tNCOA6\\\\tSULF2\\\\tADAM10\\\\tBRCA1\\\\tGATA3\\\\tAFAP1L2\\\\tIL2RG\\\\tCD160\\\\tADIPOR2\\\\tSLC22A12\\\\tNTNG1\\\\tSCUBE1\\\\tCX3CL1\\\\tCROCC\\\\n"\n+        A DataFrame with the gene sets. Each row represents a gene set to gene\n+        assignment, and the columns are "gene_set_name" and "genes".\n+    >>> import os\n+    >>> import tempfile\n+    >>> line = "HALLMARK_NOTCH_SIGNALING\\\n+    ... 
\\thttp://www.gsea-msigdb.org/\\\n+    ... gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\n+    ... \\tJAG1\\tNOTCH3\\tNOTCH2\\tAPH1A\\tHES1\\tCCND1\\\n+    ... \\tFZD1\\tPSEN2\\tFZD7\\tDTX1\\tDLL1\\tFZD5\\tMAML2\\\n+    ... \\tNOTCH1\\tPSENEN\\tWNT5A\\tCUL1\\tWNT2\\tDTX4\\\n+    ... \\tSAP30\\tPPARD\\tKAT2A\\tHEYL\\tSKP1\\tRBX1\\tTCF7L2\\\n+    ... \\tARRB1\\tLFNG\\tPRKCA\\tDTX2\\tST3GAL6\\tFBXW11\\n"\n+    >>> line2 = "HALLMARK_APICAL_SURFACE\\\n+    ... \\thttp://www.gsea-msigdb.org/\\\n+    ... gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\n+    ... \\tB4GALT1\\tRHCG\\tMAL\\tLYPD3\\tPKHD1\\tATP6V0A4\\\n+    ... \\tCRYBG1\\tSHROOM2\\tSRPX\\tMDGA1\\tTMEM8B\\tTHY1\\\n+    ... \\tPCSK9\\tEPHB4\\tDCBLD2\\tGHRL\\tLYN\\tGAS1\\tFLOT2\\\n+    ... \\tPLAUR\\tAKAP7\\tATP8B1\\tEFNA5\\tSLC34A3\\tAPP\\\n+    ... \\tGSTM3\\tHSPB1\\tSLC2A4\\tIL2RB\\tRTN4RL1\\tNCOA6\\\n+    ... \\tSULF2\\tADAM10\\tBRCA1\\tGATA3\\tAFAP1L2\\tIL2RG\\\n+    ... \\tCD160\\tADIPOR2\\tSLC22A12\\tNTNG1\\tSCUBE1\\tCX3CL1\\\n+    ... 
\\tCROCC\\n"\n     >>> temp_dir = tempfile.gettempdir()\n     >>> temp_gmt = os.path.join(temp_dir, "temp_file.gmt")\n     >>> with open(temp_gmt, "w") as f:\n@@ -36,7 +55,8 @@\n     >>> len(df.loc[df["gene_set"] == "HALLMARK_APICAL_SURFACE"].gene.tolist())\n     44\n     """\n-    # Create a list of dictionaries, where each dictionary represents a gene set\n+    # Create a list of dictionaries, where each dictionary represents a\n+    # gene set\n     gene_sets = {}\n \n     # Read the GMT file into a list of lines\n@@ -46,12 +66,20 @@\n             if not line:\n                 break\n             fields = line.strip().split("\\t")\n-            gene_sets[fields[0]]= fields[2:]\n+            gene_sets[fields[0]] = fields[2:]\n \n-    return pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n+    return pd.concat(\n+        pd.DataFrame({"gene_set": k, "gene": v}) for k, v in gene_sets.items()\n+    )\n \n \n-def score_genes_aucell_mt(adata: anndata.AnnData, gene_set_gene: pd.DataFrame, use_raw=False, min_n_genes=5, var_gene_symbols_field=None):\n+def score_genes_aucell_mt(\n+    adata: anndata.AnnData,\n+    gene_set_gene: pd.DataFrame,\n+    use_raw=False,\n+    min_n_genes=5,\n+    var_gene_symbols_field=None,\n+):\n     """Score genes using Aucell.\n \n     Parameters\n@@ -60,1'..b'e than one set of genes, separated by colon :",\n+        help="Comma separated list of genes to score. You can have more \\\n+            than one set of genes, separated by colon :",\n     )\n     # argument for the score name when using the gene list\n     parser.add_argument(\n         "--score_names",\n         type=str,\n         required=False,\n-        help="Name of the score column when using the gene list. You can have more than one set of score names, separated by colon :. It should be the same length as the number of gene lists.",\n+        help="Name of the score column when using the gene list. 
You can \\\n+            have more than one set of score names, separated by colon :. \\\n+                It should be the same length as the number of gene lists.",\n     )\n     parser.add_argument(\n         "--gene_symbols_field",\n@@ -159,7 +227,8 @@\n         help="Name of the gene symbols field in the AnnData object",\n         required=True,\n     )\n-    # argument for min_n Minimum of targets per source. If less, sources are removed.\n+    # argument for min_n Minimum of targets per source. If less, sources\n+    # are removed.\n     parser.add_argument(\n         "--min_n",\n         type=int,\n@@ -169,11 +238,18 @@\n     )\n     parser.add_argument("--use_raw", action="store_true", help="Use raw data")\n     parser.add_argument(\n-        "--write_anndata", action="store_true", help="Write the modified AnnData object"\n+        "--write_anndata",\n+        action="store_true",\n+        help="Write the modified AnnData object",\n     )\n     # argument for number of max concurrent processes\n-    parser.add_argument("--max_threads", type=int, required=False, default=1, help="Number of max concurrent threads")\n-\n+    parser.add_argument(\n+        "--max_threads",\n+        type=int,\n+        required=False,\n+        default=1,\n+        help="Number of max concurrent threads",\n+    )\n \n     # Parse command-line arguments\n     args = parser.parse_args()\n@@ -189,23 +265,40 @@\n         msigdb = read_gmt_long(args.gmt_file)\n \n         gene_sets_to_score = (\n-            args.gene_sets_to_score.split(",") if args.gene_sets_to_score else []\n+            args.gene_sets_to_score.split(",")\n+            if args.gene_sets_to_score\n+            else []\n         )\n         if gene_sets_to_score:\n-            # we limit the GMT file read to the genesets specified in the gene_sets_to_score argument\n+            # we limit the GMT file read to the genesets specified in the\n+            # gene_sets_to_score argument\n             msigdb = 
msigdb[msigdb["gene_set"].isin(gene_sets_to_score)]\n-        \n-        score_genes_aucell_mt(adata, msigdb, args.use_raw, args.min_n, var_gene_symbols_field=args.gene_symbols_field)\n+\n+        score_genes_aucell_mt(\n+            adata,\n+            msigdb,\n+            args.use_raw,\n+            args.min_n,\n+            var_gene_symbols_field=args.gene_symbols_field,\n+        )\n     elif args.gene_lists_to_score is not None and args.score_names is not None:\n         gene_lists = args.gene_lists_to_score.split(":")\n         score_names = args.score_names.split(",")\n         run_for_genelists(\n-            adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field, args.min_n\n+            adata,\n+            gene_lists,\n+            score_names,\n+            args.use_raw,\n+            args.gene_symbols_field,\n+            args.min_n,\n         )\n \n-    # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns\n+    # Save the modified AnnData object or generate a file with cells as rows\n+    # and the new score_names columns\n     if args.write_anndata:\n         adata.write_h5ad(args.output_file)\n     else:\n-        new_columns = [col for col in adata.obs.columns if col.startswith("AUCell_")]\n+        new_columns = [\n+            col for col in adata.obs.columns if col.startswith("AUCell_")\n+        ]\n         adata.obs[new_columns].to_csv(args.output_file, sep="\\t", index=True)\n'
b
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pathway_inference.py
--- a/decoupler_pathway_inference.py Tue Apr 16 11:49:25 2024 +0000
+++ b/decoupler_pathway_inference.py Mon Jul 15 10:56:42 2024 +0000
[
@@ -20,24 +20,34 @@
 
 # output file prefix
 parser.add_argument(
-    "-o", "--output",
+    "-o",
+    "--output",
     help="output files prefix",
     default=None,
 )
 
 # path to save Activities AnnData file
 parser.add_argument(
-    "-a", "--activities_path", help="Path to save Activities AnnData file", default=None
+    "-a",
+    "--activities_path",
+    help="Path to save Activities AnnData file",
+    default=None,
 )
 
 # Column name in net with source nodes
 parser.add_argument(
-    "-s", "--source", help="Column name in net with source nodes.", default="source"
+    "-s",
+    "--source",
+    help="Column name in net with source nodes.",
+    default="source",
 )
 
 # Column name in net with target nodes
 parser.add_argument(
-    "-t", "--target", help="Column name in net with target nodes.", default="target"
+    "-t",
+    "--target",
+    help="Column name in net with target nodes.",
+    default="target",
 )
 
 # Column name in net with weights.
@@ -47,17 +57,27 @@
 
 # add boolean argument for use_raw
 parser.add_argument(
-    "--use_raw", action="store_true", default=False, help="Whether to use the raw part of the AnnData object"
+    "--use_raw",
+    action="store_true",
+    default=False,
+    help="Whether to use the raw part of the AnnData object",
 )
 
 # add argument for min_cells
 parser.add_argument(
-    "--min_n", help="Minimum of targets per source. If less, sources are removed.", default=5, type=int
+    "--min_n",
+    help="Minimum of targets per source. If less, sources are removed.",
+    default=5,
+    type=int,
 )
 
 # add activity inference method option
 parser.add_argument(
-    "-m", "--method", help="Activity inference method", default="mlm", required=True
+    "-m",
+    "--method",
+    help="Activity inference method",
+    default="mlm",
+    required=True,
 )
 args = parser.parse_args()
 
@@ -69,7 +89,7 @@
 adata = ad.read_h5ad(args.input_anndata)
 
 # read in the input file network input file
-network = pd.read_csv(args.input_network, sep='\t')
+network = pd.read_csv(args.input_network, sep="\t")
 
 if (
     args.source not in network.columns
@@ -92,17 +112,21 @@
         weight=args.weight,
         verbose=True,
         min_n=args.min_n,
-        use_raw=args.use_raw 
+        use_raw=args.use_raw,
     )
 
     if args.output is not None:
-        # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files
-        combined_df = pd.concat([adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1)
+        # write adata.obsm["mlm_estimate"] and adata.obsm["mlm_pvals"] to the
+        # output TSV file
+        combined_df = pd.concat(
+            [adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1
+        )
 
         # Save the combined dataframe to a file
         combined_df.to_csv(args.output + ".tsv", sep="\t")
 
-    # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path
+    # if args.activities_path is specified, generate the activities AnnData
+    # and save the AnnData object to the specified path
     if args.activities_path is not None:
         acts = dc.get_acts(adata, obsm_key="mlm_estimate")
         acts.write_h5ad(args.activities_path)
@@ -116,17 +140,21 @@
         weight=args.weight,
         verbose=True,
         min_n=args.min_n,
-        use_raw=args.use_raw 
+        use_raw=args.use_raw,
     )
 
     if args.output is not None:
-        # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files
-        combined_df = pd.concat([adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1)
+        # write adata.obsm["ulm_estimate"] and adata.obsm["ulm_pvals"] to the
+        # output TSV file
+        combined_df = pd.concat(
+            [adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1
+        )
 
         # Save the combined dataframe to a file
         combined_df.to_csv(args.output + ".tsv", sep="\t")
 
-    # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path
+    # if args.activities_path is specified, generate the activities AnnData
+    # and save the AnnData object to the specified path
     if args.activities_path is not None:
         acts = dc.get_acts(adata, obsm_key="ulm_estimate")
         acts.write_h5ad(args.activities_path)
b
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pseudobulk.py
--- a/decoupler_pseudobulk.py Tue Apr 16 11:49:25 2024 +0000
+++ b/decoupler_pseudobulk.py Mon Jul 15 10:56:42 2024 +0000
[
b'@@ -40,8 +40,108 @@\n     return index_value\n \n \n+def genes_to_ignore_per_contrast_field(\n+    count_matrix_df,\n+    samples_metadata,\n+    sample_metadata_col_contrasts,\n+    min_counts_per_sample=5,\n+    use_cpms=False,\n+):\n+    """\n+    # This function calculates the genes to ignore per contrast field\n+    # (e.g., bulk_labels, louvain).\n+    # It does this by first getting the count matrix for each group,\n+    # then identifying genes with a count below a specified threshold.\n+    # The genes to ignore are those that are present in more than a specified\n+    # number of groups.\n+\n+    >>> import pandas as pd\n+    >>> samples_metadata = pd.DataFrame({\'sample\':\n+    ...                                    [\'S1\', \'S2\', \'S3\',\n+    ...                                     \'S4\', \'S5\', \'S6\'],\n+    ...                                  \'contrast_field\':\n+    ...                                    [\'A\', \'A\', \'A\', \'B\', \'B\', \'B\']})\n+    >>> count_matrix_df = pd.DataFrame(\n+    ...                       {\'S1\':\n+    ...                          [30, 1, 40, 50, 30],\n+    ...                        \'S2\':\n+    ...                          [40, 2, 60, 50, 80],\n+    ...                        \'S3\':\n+    ...                          [80, 1, 60, 50, 50],\n+    ...                        \'S4\': [1, 50, 50, 50, 2],\n+    ...                        \'S5\': [3, 40, 40, 40, 2],\n+    ...                        \'S6\': [0, 50, 50, 50, 1]})\n+    >>> count_matrix_df.index = [\'Gene1\', \'Gene2\', \'Gene3\', \'Gene4\', \'Gene5\']\n+    >>> df = genes_to_ignore_per_contrast_field(count_matrix_df,\n+    ...             samples_metadata, min_counts_per_sample=5,\n+    ...             
sample_metadata_col_contrasts=\'contrast_field\')\n+    >>> df[df[\'contrast_field\'] == \'A\'].genes_to_ignore.tolist()[0]\n+    \'Gene2\'\n+    >>> df[df[\'contrast_field\'] == \'B\'].genes_to_ignore.tolist()[0]\n+    \'Gene1\'\n+    >>> df[df[\'contrast_field\'] == \'B\'].genes_to_ignore.tolist()[1]\n+    \'Gene5\'\n+    """\n+\n+    # Initialize a dictionary to store the genes to ignore per contrast field\n+    contrast_fields = []\n+    genes_to_ignore = []\n+\n+    # Iterate over the contrast fields\n+    for contrast_field in samples_metadata[\n+        sample_metadata_col_contrasts\n+    ].unique():\n+        # Get the count matrix for the current contrast field\n+        count_matrix_field = count_matrix_df.loc[\n+            :,\n+            (\n+                samples_metadata[sample_metadata_col_contrasts]\n+                == contrast_field\n+            ).tolist(),\n+        ]\n+\n+        # We derive min_counts from the number of samples with that\n+        # contrast_field value\n+        min_counts = count_matrix_field.shape[1] * min_counts_per_sample\n+\n+        if use_cpms:\n+            # Convert counts to counts per million (CPM)\n+            count_matrix_field = (\n+                count_matrix_field.div(count_matrix_field.sum(axis=1), axis=0)\n+                * 1e6\n+            )\n+            min_counts = 1  # use 1 CPM\n+\n+        # Calculate the total number of cells in the current contrast field\n+        # (this produces a vector of counts per gene)\n+        total_counts_per_gene = count_matrix_field.sum(axis=1)\n+\n+        # Identify genes with a count below the specified threshold\n+        genes = total_counts_per_gene[\n+            total_counts_per_gene < min_counts\n+        ].index.tolist()\n+        if len(genes) > 0:\n+            # genes_to_ignore[contrast_field] = " ".join(genes)\n+            for gene in genes:\n+                genes_to_ignore.append(gene)\n+                contrast_fields.append(contrast_field)\n+    
# transform gene_to_ignore to a DataFrame\n+    # genes_to_ignore_df = pd.DataFrame(genes_to_ignore.items(),\n+    #                           columns=["contrast_field", "genes_to_ignore"])\n+    genes_to_ignore_df = pd.DataFrame(\n+        {"contrast_field": contrast_fields, "genes_to_ignore": genes_to_ignore}\n+    )\n+    return genes_to_ignore_df\n+\n+\n # write results for loading into DESeq2\n-def write_DESeq2_'..b'count, min_total_count=min_total_count\n@@ -150,12 +275,16 @@\n     if obs:\n         if not set(fields).issubset(set(adata.obs.columns)):\n             raise ValueError(\n-                f"Some of the following fields {legend} are not present in adata.obs: {fields}. Possible fields are: {list(set(adata.obs.columns))}"\n+                f"Some of the following fields {legend} are not present \\\n+                    in adata.obs: {fields}. \\\n+                        Possible fields are: {list(set(adata.obs.columns))}"\n             )\n     else:\n         if not set(fields).issubset(set(adata.var.columns)):\n             raise ValueError(\n-                f"Some of the following fields {legend} are not present in adata.var: {fields}. Possible fields are: {list(set(adata.var.columns))}"\n+                f"Some of the following fields {legend} are not present \\\n+                    in adata.var: {fields}. 
\\\n+                        Possible fields are: {list(set(adata.var.columns))}"\n             )\n \n \n@@ -219,10 +348,15 @@\n \n     # Save the pseudobulk data\n     if args.anndata_output_path:\n-        pseudobulk_data.write_h5ad(args.anndata_output_path, compression="gzip")\n+        pseudobulk_data.write_h5ad(\n+            args.anndata_output_path, compression="gzip"\n+        )\n \n     write_DESeq2_inputs(\n-        pseudobulk_data, output_dir=args.deseq2_output_path, factor_fields=factor_fields\n+        pseudobulk_data,\n+        output_dir=args.deseq2_output_path,\n+        factor_fields=factor_fields,\n+        min_counts_per_sample_marking=args.min_counts_per_sample_marking,\n     )\n \n \n@@ -254,7 +388,9 @@\n     field_name = "_".join(obs_fields_to_merge)\n     for field in obs_fields_to_merge:\n         if field not in adata.obs.columns:\n-            raise ValueError(f"The \'{field}\' column is not present in adata.obs.")\n+            raise ValueError(\n+                f"The \'{field}\' column is not present in adata.obs."\n+            )\n         if field_name not in adata.obs.columns:\n             adata.obs[field_name] = adata.obs[field].astype(str)\n         else:\n@@ -271,12 +407,16 @@\n     )\n \n     # Add arguments\n-    parser.add_argument("adata_file", type=str, help="Path to the AnnData file")\n+    parser.add_argument(\n+        "adata_file", type=str, help="Path to the AnnData file"\n+    )\n     parser.add_argument(\n         "-m",\n         "--adata_obs_fields_to_merge",\n         type=str,\n-        help="Fields in adata.obs to merge, comma separated. You can have more than one set of fields, separated by semi-colon ;",\n+        help="Fields in adata.obs to merge, comma separated. 
\\\n+            You can have more than one set of fields, \\\n+                separated by semi-colon ;",\n     )\n     parser.add_argument(\n         "--groupby",\n@@ -328,6 +468,13 @@\n         help="Minimum count threshold for filtering by expression",\n     )\n     parser.add_argument(\n+        "--min_counts_per_sample_marking",\n+        type=int,\n+        default=20,\n+        help="Minimum count threshold per sample for \\\n+            marking genes to be ignored after DE",\n+    )\n+    parser.add_argument(\n         "--min_total_counts",\n         type=int,\n         help="Minimum total count threshold for filtering by expression",\n@@ -338,7 +485,9 @@\n         help="Path to save the filtered AnnData object or pseudobulk data",\n     )\n     parser.add_argument(\n-        "--filter_expr", action="store_true", help="Enable filtering by expression"\n+        "--filter_expr",\n+        action="store_true",\n+        help="Enable filtering by expression",\n     )\n     parser.add_argument(\n         "--factor_fields",\n@@ -358,7 +507,9 @@\n         nargs=2,\n         help="Size of the samples plot as a tuple (two arguments)",\n     )\n-    parser.add_argument("--plot_filtering_figsize", type=int, default=[10, 10], nargs=2)\n+    parser.add_argument(\n+        "--plot_filtering_figsize", type=int, default=[10, 10], nargs=2\n+    )\n \n     # Parse the command line arguments\n     args = parser.parse_args()\n'
b
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pseudobulk.xml
--- a/decoupler_pseudobulk.xml Tue Apr 16 11:49:25 2024 +0000
+++ b/decoupler_pseudobulk.xml Mon Jul 15 10:56:42 2024 +0000
b
@@ -1,4 +1,4 @@
-<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy2" profile="20.05">
+<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy3" profile="20.05">
     <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description>
     <requirements>
         <requirement type="package" version="1.4.0">decoupler</requirement>
@@ -28,6 +28,9 @@
     #if $min_counts:
     --min_counts $min_counts
     #end if
+    #if $min_counts_per_sample:
+    --min_counts_per_sample_marking $min_counts_per_sample
+    #end if
     #if $min_total_counts:
     --min_total_counts $min_total_counts
     #end if
@@ -66,6 +69,7 @@
         <param type="boolean" name="produce_anndata" label="Produce AnnData with Pseudo-bulk"/>
         <param type="integer" name="min_counts" label="Minimum Counts" optional="true"/>
         <param type="integer" name="min_total_counts" label="Minimum Total Counts" optional="true"/>
+        <param type="integer" name="min_counts_per_sample" value="20" label="Minimum counts per gene per contrast field" help="Used to signal genes that should be excluded per contrast field after DE, to avoid very lowly expressed genes in specific contrasts. Genes are not excluded from the result, but a separate file tagging them is produced."/>
         <param type="boolean" name="filter_expr" label="Enable Filtering by Expression"/>
         <param type="text" name="plot_samples_figsize" label="Plot Samples Figsize" value="10 10" help="X and Y sizes in points separated by a space"/>
         <param type="text" name="plot_filtering_figsize" label="Plot Filtering Figsize" value="10 10" help="X and Y sizes in points separated by a space"/>
@@ -83,9 +87,12 @@
         <data name="filter_by_expr_plot" format="png" label="${tool.name} on ${on_string}: Filter by Expression plot" from_work_dir="plots_output_dir/filter_by_expr.png">
             <filter>produce_plots</filter>
         </data>
+        <data name="genes_ignore_per_contrast_field" format="tabular" label="${tool.name} on ${on_string}: Genes to ignore by contrast field" from_work_dir="deseq_output_dir/genes_to_ignore_per_contrast_field.tsv">
+            <filter>factor_fields</filter>
+        </data>
     </outputs>
     <tests>
-        <test expect_num_outputs="6">
+        <test expect_num_outputs="7">
             <param name="input_file" value="mito_counted_anndata.h5ad"/>
             <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/>
             <param name="groupby" value="batch_sex"/>
@@ -96,6 +103,7 @@
             <param name="produce_plots" value="true"/>
             <param name="produce_anndata" value="true"/>
             <param name="min_counts" value="10"/>
+            <param name="min_counts_per_sample" value="50"/>
             <param name="min_total_counts" value="1000"/>
             <param name="filter_expr" value="true"/>
             <param name="plot_samples_figsize" value="10 10"/>
@@ -128,6 +136,11 @@
                     <has_size value="31853" delta="3000"/>
                 </assert_contents>
             </output>
+            <output name="genes_ignore_per_contrast_field" ftype="tabular">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
             <output name="filter_by_expr_plot" ftype="png">
                 <assert_contents>
                     <has_size value="21656" delta="2000"/>