Previous changeset 7:68a2b5445558 (2024-04-16) Next changeset 9:bd4b54b75888 (2024-09-15) |
Commit message:
planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit eea5c13f9e6e070a2359c59400773b01f9cd7567 |
modified:
decoupler_aucell_score.py decoupler_pathway_inference.py decoupler_pseudobulk.py decoupler_pseudobulk.xml |
b |
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_aucell_score.py --- a/decoupler_aucell_score.py Tue Apr 16 11:49:25 2024 +0000 +++ b/decoupler_aucell_score.py Mon Jul 15 10:56:42 2024 +0000 |
[ |
b'@@ -1,16 +1,15 @@\n import argparse\n-import os\n-import tempfile\n \n import anndata\n import decoupler as dc\n+import numba as nb\n import pandas as pd\n-import numba as nb\n \n \n def read_gmt_long(gmt_file):\n- """\n- Reads a GMT file and produce a Pandas DataFrame in long format, ready to be passed to the AUCell method.\n+ r"""\n+ Reads a GMT file and produce a Pandas DataFrame in long format, ready to\n+ be passed to the AUCell method.\n \n Parameters\n ----------\n@@ -20,9 +19,29 @@\n Returns\n -------\n pd.DataFrame\n- A DataFrame with the gene sets. Each row represents a gene set to gene assignment, and the columns are "gene_set_name" and "genes".\n- >>> line = "HALLMARK_NOTCH_SIGNALING\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\\tJAG1\\\\tNOTCH3\\\\tNOTCH2\\\\tAPH1A\\\\tHES1\\\\tCCND1\\\\tFZD1\\\\tPSEN2\\\\tFZD7\\\\tDTX1\\\\tDLL1\\\\tFZD5\\\\tMAML2\\\\tNOTCH1\\\\tPSENEN\\\\tWNT5A\\\\tCUL1\\\\tWNT2\\\\tDTX4\\\\tSAP30\\\\tPPARD\\\\tKAT2A\\\\tHEYL\\\\tSKP1\\\\tRBX1\\\\tTCF7L2\\\\tARRB1\\\\tLFNG\\\\tPRKCA\\\\tDTX2\\\\tST3GAL6\\\\tFBXW11\\\\n"\n- >>> line2 = "HALLMARK_APICAL_SURFACE\\\\thttp://www.gsea-msigdb.org/gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\\tB4GALT1\\\\tRHCG\\\\tMAL\\\\tLYPD3\\\\tPKHD1\\\\tATP6V0A4\\\\tCRYBG1\\\\tSHROOM2\\\\tSRPX\\\\tMDGA1\\\\tTMEM8B\\\\tTHY1\\\\tPCSK9\\\\tEPHB4\\\\tDCBLD2\\\\tGHRL\\\\tLYN\\\\tGAS1\\\\tFLOT2\\\\tPLAUR\\\\tAKAP7\\\\tATP8B1\\\\tEFNA5\\\\tSLC34A3\\\\tAPP\\\\tGSTM3\\\\tHSPB1\\\\tSLC2A4\\\\tIL2RB\\\\tRTN4RL1\\\\tNCOA6\\\\tSULF2\\\\tADAM10\\\\tBRCA1\\\\tGATA3\\\\tAFAP1L2\\\\tIL2RG\\\\tCD160\\\\tADIPOR2\\\\tSLC22A12\\\\tNTNG1\\\\tSCUBE1\\\\tCX3CL1\\\\tCROCC\\\\n"\n+ A DataFrame with the gene sets. Each row represents a gene set to gene\n+ assignment, and the columns are "gene_set_name" and "genes".\n+ >>> import os\n+ >>> import tempfile\n+ >>> line = "HALLMARK_NOTCH_SIGNALING\\\n+ ... \\thttp://www.gsea-msigdb.org/\\\n+ ... gsea/msigdb/human/geneset/HALLMARK_NOTCH_SIGNALING\\\n+ ... \\tJAG1\\tNOTCH3\\tNOTCH2\\tAPH1A\\tHES1\\tCCND1\\\n+ ... \\tFZD1\\tPSEN2\\tFZD7\\tDTX1\\tDLL1\\tFZD5\\tMAML2\\\n+ ... \\tNOTCH1\\tPSENEN\\tWNT5A\\tCUL1\\tWNT2\\tDTX4\\\n+ ... \\tSAP30\\tPPARD\\tKAT2A\\tHEYL\\tSKP1\\tRBX1\\tTCF7L2\\\n+ ... \\tARRB1\\tLFNG\\tPRKCA\\tDTX2\\tST3GAL6\\tFBXW11\\n"\n+ >>> line2 = "HALLMARK_APICAL_SURFACE\\\n+ ... \\thttp://www.gsea-msigdb.org/\\\n+ ... gsea/msigdb/human/geneset/HALLMARK_APICAL_SURFACE\\\n+ ... \\tB4GALT1\\tRHCG\\tMAL\\tLYPD3\\tPKHD1\\tATP6V0A4\\\n+ ... \\tCRYBG1\\tSHROOM2\\tSRPX\\tMDGA1\\tTMEM8B\\tTHY1\\\n+ ... \\tPCSK9\\tEPHB4\\tDCBLD2\\tGHRL\\tLYN\\tGAS1\\tFLOT2\\\n+ ... \\tPLAUR\\tAKAP7\\tATP8B1\\tEFNA5\\tSLC34A3\\tAPP\\\n+ ... \\tGSTM3\\tHSPB1\\tSLC2A4\\tIL2RB\\tRTN4RL1\\tNCOA6\\\n+ ... \\tSULF2\\tADAM10\\tBRCA1\\tGATA3\\tAFAP1L2\\tIL2RG\\\n+ ... \\tCD160\\tADIPOR2\\tSLC22A12\\tNTNG1\\tSCUBE1\\tCX3CL1\\\n+ ... \\tCROCC\\n"\n >>> temp_dir = tempfile.gettempdir()\n >>> temp_gmt = os.path.join(temp_dir, "temp_file.gmt")\n >>> with open(temp_gmt, "w") as f:\n@@ -36,7 +55,8 @@\n >>> len(df.loc[df["gene_set"] == "HALLMARK_APICAL_SURFACE"].gene.tolist())\n 44\n """\n- # Create a list of dictionaries, where each dictionary represents a gene set\n+ # Create a list of dictionaries, where each dictionary represents a\n+ # gene set\n gene_sets = {}\n \n # Read the GMT file into a list of lines\n@@ -46,12 +66,20 @@\n if not line:\n break\n fields = line.strip().split("\\t")\n- gene_sets[fields[0]]= fields[2:]\n+ gene_sets[fields[0]] = fields[2:]\n \n- return pd.concat(pd.DataFrame({\'gene_set\':k, \'gene\':v}) for k, v in gene_sets.items())\n+ return pd.concat(\n+ pd.DataFrame({"gene_set": k, "gene": v}) for k, v in gene_sets.items()\n+ )\n \n \n-def score_genes_aucell_mt(adata: anndata.AnnData, gene_set_gene: pd.DataFrame, use_raw=False, min_n_genes=5, var_gene_symbols_field=None):\n+def score_genes_aucell_mt(\n+ adata: anndata.AnnData,\n+ gene_set_gene: pd.DataFrame,\n+ use_raw=False,\n+ min_n_genes=5,\n+ var_gene_symbols_field=None,\n+):\n """Score genes using Aucell.\n \n Parameters\n@@ -60,1'..b'e than one set of genes, separated by colon :",\n+ help="Comma separated list of genes to score. You can have more \\\n+ than one set of genes, separated by colon :",\n )\n # argument for the score name when using the gene list\n parser.add_argument(\n "--score_names",\n type=str,\n required=False,\n- help="Name of the score column when using the gene list. You can have more than one set of score names, separated by colon :. It should be the same length as the number of gene lists.",\n+ help="Name of the score column when using the gene list. You can \\\n+ have more than one set of score names, separated by colon :. \\\n+ It should be the same length as the number of gene lists.",\n )\n parser.add_argument(\n "--gene_symbols_field",\n@@ -159,7 +227,8 @@\n help="Name of the gene symbols field in the AnnData object",\n required=True,\n )\n- # argument for min_n Minimum of targets per source. If less, sources are removed.\n+ # argument for min_n Minimum of targets per source. If less, sources\n+ # are removed.\n parser.add_argument(\n "--min_n",\n type=int,\n@@ -169,11 +238,18 @@\n )\n parser.add_argument("--use_raw", action="store_true", help="Use raw data")\n parser.add_argument(\n- "--write_anndata", action="store_true", help="Write the modified AnnData object"\n+ "--write_anndata",\n+ action="store_true",\n+ help="Write the modified AnnData object",\n )\n # argument for number of max concurrent processes\n- parser.add_argument("--max_threads", type=int, required=False, default=1, help="Number of max concurrent threads")\n-\n+ parser.add_argument(\n+ "--max_threads",\n+ type=int,\n+ required=False,\n+ default=1,\n+ help="Number of max concurrent threads",\n+ )\n \n # Parse command-line arguments\n args = parser.parse_args()\n@@ -189,23 +265,40 @@\n msigdb = read_gmt_long(args.gmt_file)\n \n gene_sets_to_score = (\n- args.gene_sets_to_score.split(",") if args.gene_sets_to_score else []\n+ args.gene_sets_to_score.split(",")\n+ if args.gene_sets_to_score\n+ else []\n )\n if gene_sets_to_score:\n- # we limit the GMT file read to the genesets specified in the gene_sets_to_score argument\n+ # we limit the GMT file read to the genesets specified in the\n+ # gene_sets_to_score argument\n msigdb = msigdb[msigdb["gene_set"].isin(gene_sets_to_score)]\n- \n- score_genes_aucell_mt(adata, msigdb, args.use_raw, args.min_n, var_gene_symbols_field=args.gene_symbols_field)\n+\n+ score_genes_aucell_mt(\n+ adata,\n+ msigdb,\n+ args.use_raw,\n+ args.min_n,\n+ var_gene_symbols_field=args.gene_symbols_field,\n+ )\n elif args.gene_lists_to_score is not None and args.score_names is not None:\n gene_lists = args.gene_lists_to_score.split(":")\n score_names = args.score_names.split(",")\n run_for_genelists(\n- adata, gene_lists, score_names, args.use_raw, args.gene_symbols_field, args.min_n\n+ adata,\n+ gene_lists,\n+ score_names,\n+ args.use_raw,\n+ args.gene_symbols_field,\n+ args.min_n,\n )\n \n- # Save the modified AnnData object or generate a file with cells as rows and the new score_names columns\n+ # Save the modified AnnData object or generate a file with cells as rows\n+ # and the new score_names columns\n if args.write_anndata:\n adata.write_h5ad(args.output_file)\n else:\n- new_columns = [col for col in adata.obs.columns if col.startswith("AUCell_")]\n+ new_columns = [\n+ col for col in adata.obs.columns if col.startswith("AUCell_")\n+ ]\n adata.obs[new_columns].to_csv(args.output_file, sep="\\t", index=True)\n' |
b |
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pathway_inference.py --- a/decoupler_pathway_inference.py Tue Apr 16 11:49:25 2024 +0000 +++ b/decoupler_pathway_inference.py Mon Jul 15 10:56:42 2024 +0000 |
[ |
@@ -20,24 +20,34 @@ # output file prefix parser.add_argument( - "-o", "--output", + "-o", + "--output", help="output files prefix", default=None, ) # path to save Activities AnnData file parser.add_argument( - "-a", "--activities_path", help="Path to save Activities AnnData file", default=None + "-a", + "--activities_path", + help="Path to save Activities AnnData file", + default=None, ) # Column name in net with source nodes parser.add_argument( - "-s", "--source", help="Column name in net with source nodes.", default="source" + "-s", + "--source", + help="Column name in net with source nodes.", + default="source", ) # Column name in net with target nodes parser.add_argument( - "-t", "--target", help="Column name in net with target nodes.", default="target" + "-t", + "--target", + help="Column name in net with target nodes.", + default="target", ) # Column name in net with weights. @@ -47,17 +57,27 @@ # add boolean argument for use_raw parser.add_argument( - "--use_raw", action="store_true", default=False, help="Whether to use the raw part of the AnnData object" + "--use_raw", + action="store_true", + default=False, + help="Whether to use the raw part of the AnnData object", ) # add argument for min_cells parser.add_argument( - "--min_n", help="Minimum of targets per source. If less, sources are removed.", default=5, type=int + "--min_n", + help="Minimum of targets per source. If less, sources are removed.", + default=5, + type=int, ) # add activity inference method option parser.add_argument( - "-m", "--method", help="Activity inference method", default="mlm", required=True + "-m", + "--method", + help="Activity inference method", + default="mlm", + required=True, ) args = parser.parse_args() @@ -69,7 +89,7 @@ adata = ad.read_h5ad(args.input_anndata) # read in the input file network input file -network = pd.read_csv(args.input_network, sep='\t') +network = pd.read_csv(args.input_network, sep="\t") if ( args.source not in network.columns @@ -92,17 +112,21 @@ weight=args.weight, verbose=True, min_n=args.min_n, - use_raw=args.use_raw + use_raw=args.use_raw, ) if args.output is not None: - # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files - combined_df = pd.concat([adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1) + # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the + # output network files + combined_df = pd.concat( + [adata.obsm["mlm_estimate"], adata.obsm["mlm_pvals"]], axis=1 + ) # Save the combined dataframe to a file combined_df.to_csv(args.output + ".tsv", sep="\t") - # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path + # if args.activities_path is specified, generate the activities AnnData + # and save the AnnData object to the specified path if args.activities_path is not None: acts = dc.get_acts(adata, obsm_key="mlm_estimate") acts.write_h5ad(args.activities_path) @@ -116,17 +140,21 @@ weight=args.weight, verbose=True, min_n=args.min_n, - use_raw=args.use_raw + use_raw=args.use_raw, ) if args.output is not None: - # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the output network files - combined_df = pd.concat([adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1) + # write adata.obsm[mlm_key] and adata.obsm[mlm_pvals_key] to the + # output network files + combined_df = pd.concat( + [adata.obsm["ulm_estimate"], adata.obsm["ulm_pvals"]], axis=1 + ) # Save the combined dataframe to a file combined_df.to_csv(args.output + ".tsv", sep="\t") - # if args.activities_path is specified, generate the activities AnnData and save the AnnData object to the specified path + # if args.activities_path is specified, generate the activities AnnData + # and save the AnnData object to the specified path if args.activities_path is not None: acts = dc.get_acts(adata, obsm_key="ulm_estimate") acts.write_h5ad(args.activities_path) |
b |
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pseudobulk.py --- a/decoupler_pseudobulk.py Tue Apr 16 11:49:25 2024 +0000 +++ b/decoupler_pseudobulk.py Mon Jul 15 10:56:42 2024 +0000 |
[ |
b'@@ -40,8 +40,108 @@\n return index_value\n \n \n+def genes_to_ignore_per_contrast_field(\n+ count_matrix_df,\n+ samples_metadata,\n+ sample_metadata_col_contrasts,\n+ min_counts_per_sample=5,\n+ use_cpms=False,\n+):\n+ """\n+ # This function calculates the genes to ignore per contrast field\n+ # (e.g., bulk_labels, louvain).\n+ # It does this by first getting the count matrix for each group,\n+ # then identifying genes with a count below a specified threshold.\n+ # The genes to ignore are those that are present in more than a specified\n+ # number of groups.\n+\n+ >>> import pandas as pd\n+ >>> samples_metadata = pd.DataFrame({\'sample\':\n+ ... [\'S1\', \'S2\', \'S3\',\n+ ... \'S4\', \'S5\', \'S6\'],\n+ ... \'contrast_field\':\n+ ... [\'A\', \'A\', \'A\', \'B\', \'B\', \'B\']})\n+ >>> count_matrix_df = pd.DataFrame(\n+ ... {\'S1\':\n+ ... [30, 1, 40, 50, 30],\n+ ... \'S2\':\n+ ... [40, 2, 60, 50, 80],\n+ ... \'S3\':\n+ ... [80, 1, 60, 50, 50],\n+ ... \'S4\': [1, 50, 50, 50, 2],\n+ ... \'S5\': [3, 40, 40, 40, 2],\n+ ... \'S6\': [0, 50, 50, 50, 1]})\n+ >>> count_matrix_df.index = [\'Gene1\', \'Gene2\', \'Gene3\', \'Gene4\', \'Gene5\']\n+ >>> df = genes_to_ignore_per_contrast_field(count_matrix_df,\n+ ... samples_metadata, min_counts_per_sample=5,\n+ ... sample_metadata_col_contrasts=\'contrast_field\')\n+ >>> df[df[\'contrast_field\'] == \'A\'].genes_to_ignore.tolist()[0]\n+ \'Gene2\'\n+ >>> df[df[\'contrast_field\'] == \'B\'].genes_to_ignore.tolist()[0]\n+ \'Gene1\'\n+ >>> df[df[\'contrast_field\'] == \'B\'].genes_to_ignore.tolist()[1]\n+ \'Gene5\'\n+ """\n+\n+ # Initialize a dictionary to store the genes to ignore per contrast field\n+ contrast_fields = []\n+ genes_to_ignore = []\n+\n+ # Iterate over the contrast fields\n+ for contrast_field in samples_metadata[\n+ sample_metadata_col_contrasts\n+ ].unique():\n+ # Get the count matrix for the current contrast field\n+ count_matrix_field = count_matrix_df.loc[\n+ :,\n+ (\n+ samples_metadata[sample_metadata_col_contrasts]\n+ == contrast_field\n+ ).tolist(),\n+ ]\n+\n+ # We derive min_counts from the number of samples with that\n+ # contrast_field value\n+ min_counts = count_matrix_field.shape[1] * min_counts_per_sample\n+\n+ if use_cpms:\n+ # Convert counts to counts per million (CPM)\n+ count_matrix_field = (\n+ count_matrix_field.div(count_matrix_field.sum(axis=1), axis=0)\n+ * 1e6\n+ )\n+ min_counts = 1 # use 1 CPM\n+\n+ # Calculate the total number of cells in the current contrast field\n+ # (this produces a vector of counts per gene)\n+ total_counts_per_gene = count_matrix_field.sum(axis=1)\n+\n+ # Identify genes with a count below the specified threshold\n+ genes = total_counts_per_gene[\n+ total_counts_per_gene < min_counts\n+ ].index.tolist()\n+ if len(genes) > 0:\n+ # genes_to_ignore[contrast_field] = " ".join(genes)\n+ for gene in genes:\n+ genes_to_ignore.append(gene)\n+ contrast_fields.append(contrast_field)\n+ # transform gene_to_ignore to a DataFrame\n+ # genes_to_ignore_df = pd.DataFrame(genes_to_ignore.items(),\n+ # columns=["contrast_field", "genes_to_ignore"])\n+ genes_to_ignore_df = pd.DataFrame(\n+ {"contrast_field": contrast_fields, "genes_to_ignore": genes_to_ignore}\n+ )\n+ return genes_to_ignore_df\n+\n+\n # write results for loading into DESeq2\n-def write_DESeq2_'..b'count, min_total_count=min_total_count\n@@ -150,12 +275,16 @@\n if obs:\n if not set(fields).issubset(set(adata.obs.columns)):\n raise ValueError(\n- f"Some of the following fields {legend} are not present in adata.obs: {fields}. Possible fields are: {list(set(adata.obs.columns))}"\n+ f"Some of the following fields {legend} are not present \\\n+ in adata.obs: {fields}. \\\n+ Possible fields are: {list(set(adata.obs.columns))}"\n )\n else:\n if not set(fields).issubset(set(adata.var.columns)):\n raise ValueError(\n- f"Some of the following fields {legend} are not present in adata.var: {fields}. Possible fields are: {list(set(adata.var.columns))}"\n+ f"Some of the following fields {legend} are not present \\\n+ in adata.var: {fields}. \\\n+ Possible fields are: {list(set(adata.var.columns))}"\n )\n \n \n@@ -219,10 +348,15 @@\n \n # Save the pseudobulk data\n if args.anndata_output_path:\n- pseudobulk_data.write_h5ad(args.anndata_output_path, compression="gzip")\n+ pseudobulk_data.write_h5ad(\n+ args.anndata_output_path, compression="gzip"\n+ )\n \n write_DESeq2_inputs(\n- pseudobulk_data, output_dir=args.deseq2_output_path, factor_fields=factor_fields\n+ pseudobulk_data,\n+ output_dir=args.deseq2_output_path,\n+ factor_fields=factor_fields,\n+ min_counts_per_sample_marking=args.min_counts_per_sample_marking,\n )\n \n \n@@ -254,7 +388,9 @@\n field_name = "_".join(obs_fields_to_merge)\n for field in obs_fields_to_merge:\n if field not in adata.obs.columns:\n- raise ValueError(f"The \'{field}\' column is not present in adata.obs.")\n+ raise ValueError(\n+ f"The \'{field}\' column is not present in adata.obs."\n+ )\n if field_name not in adata.obs.columns:\n adata.obs[field_name] = adata.obs[field].astype(str)\n else:\n@@ -271,12 +407,16 @@\n )\n \n # Add arguments\n- parser.add_argument("adata_file", type=str, help="Path to the AnnData file")\n+ parser.add_argument(\n+ "adata_file", type=str, help="Path to the AnnData file"\n+ )\n parser.add_argument(\n "-m",\n "--adata_obs_fields_to_merge",\n type=str,\n- help="Fields in adata.obs to merge, comma separated. You can have more than one set of fields, separated by semi-colon ;",\n+ help="Fields in adata.obs to merge, comma separated. \\\n+ You can have more than one set of fields, \\\n+ separated by semi-colon ;",\n )\n parser.add_argument(\n "--groupby",\n@@ -328,6 +468,13 @@\n help="Minimum count threshold for filtering by expression",\n )\n parser.add_argument(\n+ "--min_counts_per_sample_marking",\n+ type=int,\n+ default=20,\n+ help="Minimum count threshold per sample for \\\n+ marking genes to be ignored after DE",\n+ )\n+ parser.add_argument(\n "--min_total_counts",\n type=int,\n help="Minimum total count threshold for filtering by expression",\n@@ -338,7 +485,9 @@\n help="Path to save the filtered AnnData object or pseudobulk data",\n )\n parser.add_argument(\n- "--filter_expr", action="store_true", help="Enable filtering by expression"\n+ "--filter_expr",\n+ action="store_true",\n+ help="Enable filtering by expression",\n )\n parser.add_argument(\n "--factor_fields",\n@@ -358,7 +507,9 @@\n nargs=2,\n help="Size of the samples plot as a tuple (two arguments)",\n )\n- parser.add_argument("--plot_filtering_figsize", type=int, default=[10, 10], nargs=2)\n+ parser.add_argument(\n+ "--plot_filtering_figsize", type=int, default=[10, 10], nargs=2\n+ )\n \n # Parse the command line arguments\n args = parser.parse_args()\n' |
b |
diff -r 68a2b5445558 -r 93f61ea19336 decoupler_pseudobulk.xml --- a/decoupler_pseudobulk.xml Tue Apr 16 11:49:25 2024 +0000 +++ b/decoupler_pseudobulk.xml Mon Jul 15 10:56:42 2024 +0000 |
b |
@@ -1,4 +1,4 @@ -<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy2" profile="20.05"> +<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy3" profile="20.05"> <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description> <requirements> <requirement type="package" version="1.4.0">decoupler</requirement> @@ -28,6 +28,9 @@ #if $min_counts: --min_counts $min_counts #end if + #if $min_counts_per_sample: + --min_counts_per_sample_marking $min_counts_per_sample + #end if #if $min_total_counts: --min_total_counts $min_total_counts #end if @@ -66,6 +69,7 @@ <param type="boolean" name="produce_anndata" label="Produce AnnData with Pseudo-bulk"/> <param type="integer" name="min_counts" label="Minimum Counts" optional="true"/> <param type="integer" name="min_total_counts" label="Minimum Total Counts" optional="true"/> + <param type="integer" name="min_counts_per_sample" value="20" label="Minimum counts per gene per contrast field" help="Used to signal genes that should be excluded per contrast field after DE, to avoid very lowly expressed genes in specific contrasts. Genes are not excluded from the result, but a separate file tagging them is produced."/> <param type="boolean" name="filter_expr" label="Enable Filtering by Expression"/> <param type="text" name="plot_samples_figsize" label="Plot Samples Figsize" value="10 10" help="X and Y sizes in points separated by a space"/> <param type="text" name="plot_filtering_figsize" label="Plot Filtering Figsize" value="10 10" help="X and Y sizes in points separated by a space"/> @@ -83,9 +87,12 @@ <data name="filter_by_expr_plot" format="png" label="${tool.name} on ${on_string}: Filter by Expression plot" from_work_dir="plots_output_dir/filter_by_expr.png"> <filter>produce_plots</filter> </data> + <data name="genes_ignore_per_contrast_field" format="tabular" label="{tool.name} on ${on_string}: Genes to ignore by contrast field" from_work_dir="deseq_output_dir/genes_to_ignore_per_contrast_field.tsv"> + <filter>factor_fields</filter> + </data> </outputs> <tests> - <test expect_num_outputs="6"> + <test expect_num_outputs="7"> <param name="input_file" value="mito_counted_anndata.h5ad"/> <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/> <param name="groupby" value="batch_sex"/> @@ -96,6 +103,7 @@ <param name="produce_plots" value="true"/> <param name="produce_anndata" value="true"/> <param name="min_counts" value="10"/> + <param name="min_counts_per_sample" value="50"/> <param name="min_total_counts" value="1000"/> <param name="filter_expr" value="true"/> <param name="plot_samples_figsize" value="10 10"/> @@ -128,6 +136,11 @@ <has_size value="31853" delta="3000"/> </assert_contents> </output> + <output name="genes_ignore_per_contrast_field" ftype="tabular"> + <assert_contents> + <has_n_lines n="5"/> + </assert_contents> + </output> <output name="filter_by_expr_plot" ftype="png"> <assert_contents> <has_size value="21656" delta="2000"/> |