view decoupler_pseudobulk.xml @ 15:09c833d9b03b draft default tip

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/ commit b581a5b4ba88c5bf06f6223ba9aec51a8564796c
author ebi-gxa
date Fri, 29 Nov 2024 11:34:16 +0000
parents ef054892d47f
children
line wrap: on
line source

<tool id="decoupler_pseudobulk" name="Decoupler pseudo-bulk" version="1.4.0+galaxy8" profile="20.05">
    <description>aggregates single cell RNA-seq data for running bulk RNA-seq methods</description>
    <!-- Single package dependency; its version (1.4.0) matches the prefix of the tool version attribute. -->
    <requirements>
        <requirement type="package" version="1.4.0">decoupler</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
## Work directories written by the script; Galaxy collects outputs from them via from_work_dir.
mkdir deseq_output_dir &&
mkdir plots_output_dir &&
python '$__tool_directory__/decoupler_pseudobulk.py' '$input_file'
    #if $adata_obs_fields_to_merge:
    --adata_obs_fields_to_merge '$adata_obs_fields_to_merge'
    #end if
    --groupby '$groupby'
    --sample_key '$sample_key'
    #if $layer:
    --layer '$layer'
    #end if
    --mode '$mode'
    #if $use_raw:
    --use_raw
    #end if
    ## Numeric thresholds are tested through their string form: a bare truthiness test
    ## is false for 0, which would silently drop an explicit zero threshold.
    #if str($min_cells) != '':
    --min_cells $min_cells
    #end if
    #if $produce_plots:
    --save_path plots_output_dir
    #end if
    #if str($min_counts) != '':
    --min_counts $min_counts
    #end if
    #if str($min_counts_per_sample) != '':
    --min_counts_per_sample_marking $min_counts_per_sample
    #end if
    #if str($min_total_counts) != '':
    --min_total_counts $min_total_counts
    #end if
    #if $produce_anndata:
    --anndata_output_path '$pbulk_anndata'
    #end if
    #if $filter_expr:
    --filter_expr
    #end if
    #if $factor_fields:
    --factor_fields '$factor_fields'
    #end if
    ## Per-contrast gene filtering needs both the contrasts file and the percentage threshold.
    #if $filter_per_contrast.filter == 'yes':
    --contrasts_file '$filter_per_contrast.contrasts_file'
    --min_gene_exp_perc_per_cell '$filter_per_contrast.min_cells_perc_per_contrast_cond'
    #end if
    --deseq2_output_path deseq_output_dir
    ## Figure sizes are intentionally left unquoted: each value holds two space-separated
    ## numbers that must reach the script as two separate arguments.
    --plot_samples_figsize $plot_samples_figsize
    --plot_filtering_figsize $plot_filtering_figsize
]]></command>
    <!-- Redirect the numba cache and the matplotlib config directory to the job's temporary directory
         (presumably because the default locations are not writable inside the job environment; confirm). -->
    <environment_variables>
        <environment_variable name="NUMBA_CACHE_DIR">\$_GALAXY_JOB_TMP_DIR</environment_variable>
        <environment_variable name="MPLCONFIGDIR">\$_GALAXY_JOB_TMP_DIR</environment_variable>
    </environment_variables>
    <inputs>
        <param type="data" name="input_file" format="data" label="Input AnnData file"/>
        <!-- Optional per-contrast marking of lowly expressed genes; the extra inputs only appear when enabled. -->
        <conditional name="filter_per_contrast">
            <param name="filter" type="select" label="Produce a list of genes to filter out per contrast?" help="If Yes, a contrasts file is required, and genes expressed in fewer than the given percentage of cells in every condition of a contrast are listed in a separate output for later filtering.">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param type="data" name="contrasts_file" format="txt,tabular" label="Contrasts file" help="A file with header and arithmetic operations between existing values in the Groupby column in the AnnData file."/>
                <param type="float" name="min_cells_perc_per_contrast_cond" value="20" label="Min. percentage of cells that need to be expressing a gene in any of the conditions of a contrast" help="Genes whose expression across all conditions of a contrast are below this threshold are tagged for removal from the contrast on a separate file"/>
            </when>
            <when value="no">
            </when>
        </conditional>
        <param type="text" name="adata_obs_fields_to_merge" label="Obs Fields to Merge" optional="true" help="Fields in adata.obs to merge, comma separated (optional). They will be available as field1_field2_field3 in the AnnData Obs dataframe. You can have multiple groups to merge, separated by colon (:)."/>
        <param type="text" name="groupby" label="Groupby column" help="The column in adata.obs that defines the groups. Merged columns in the above field are available here."/>
        <param type="text" name="sample_key" label="Sample Key column" help="The column in adata.obs that defines the samples. Merged columns in the above field are available here."/>
        <param type="text" name="layer" label="Layer" optional="true" help="The name of the layer of the AnnData object to use. It needs to be present in the AnnData object."/>
        <param type="select" name="mode" label="Decoupler pseudobulk Mode" optional="true" help="Determines how counts are aggregated across cells within the specified groups: sum, mean or median.">
            <option value="sum" selected="true">sum</option>
            <option value="mean">mean</option>
            <option value="median">median</option>
        </param>
        <param type="text" name="factor_fields" label="Factor Fields" optional="true" help="Fields in adata.obs to use as factors, comma separated (optional). For EdgeR make sure that the first field is the main contrast field desired and the rest of the fields are the covariates desired. Decoupler produces two fields in the intermediate AnnData (which can be added here if desired for covariates): psbulk_n_cells and psbulk_counts."/>
        <param type="boolean" name="use_raw" label="Use Raw" optional="true"/>
        <param type="integer" name="min_cells" label="Minimum Cells" optional="true"/>
        <param type="boolean" name="produce_plots" label="Produce plots"/>
        <param type="boolean" name="produce_anndata" label="Produce AnnData with Pseudo-bulk"/>
        <param type="integer" name="min_counts" label="Minimum Counts" optional="true"/>
        <param type="integer" name="min_total_counts" label="Minimum Total Counts" optional="true"/>
        <param type="integer" name="min_counts_per_sample" value="20" label="Minimum counts per gene per contrast field" help="Used to signal genes that should be excluded per contrast field after DE, to avoid very lowly expressed genes in specific contrasts. Genes are not excluded from the result, but a separate file tagging them is produced."/>
        <param type="boolean" name="filter_expr" label="Enable Filtering by Expression"/>
        <!-- The two figure sizes are placed unquoted on the command line so that they split into two
             arguments; the validators restrict them to exactly two numbers separated by whitespace. -->
        <param type="text" name="plot_samples_figsize" label="Plot Samples Figsize" value="10 10" help="X and Y sizes in points separated by a space">
            <validator type="regex" message="Provide exactly two numbers separated by a space, e.g. 10 10">^\s*[0-9]+(\.[0-9]+)?\s+[0-9]+(\.[0-9]+)?\s*$</validator>
        </param>
        <param type="text" name="plot_filtering_figsize" label="Plot Filtering Figsize" value="10 10" help="X and Y sizes in points separated by a space">
            <validator type="regex" message="Provide exactly two numbers separated by a space, e.g. 10 10">^\s*[0-9]+(\.[0-9]+)?\s+[0-9]+(\.[0-9]+)?\s*$</validator>
        </param>
    </inputs>
    <outputs>
        <!-- Written directly by the script to the path given on the command line; only when requested. -->
        <data name="pbulk_anndata"
              format="h5ad"
              label="${tool.name} on ${on_string}: Pseudo-bulk AnnData">
            <filter>produce_anndata</filter>
        </data>

        <!-- Tables always collected from the DESeq2-style output directory. -->
        <data name="count_matrix"
              format="tabular"
              from_work_dir="deseq_output_dir/counts_matrix.tsv"
              label="${tool.name} on ${on_string}: Count Matrix"/>
        <data name="samples_metadata"
              format="tabular"
              from_work_dir="deseq_output_dir/col_metadata.tsv"
              label="${tool.name} on ${on_string}: Samples Metadata (factors file)"/>
        <data name="genes_metadata"
              format="tabular"
              from_work_dir="deseq_output_dir/gene_metadata.tsv"
              label="${tool.name} on ${on_string}: Genes Metadata"/>

        <!-- Plots, only exposed when plotting is enabled. -->
        <data name="plot_output"
              format="png"
              from_work_dir="plots_output_dir/pseudobulk_samples.png"
              label="${tool.name} on ${on_string}: Pseudobulk plot">
            <filter>produce_plots</filter>
        </data>
        <data name="filter_by_expr_plot"
              format="png"
              from_work_dir="plots_output_dir/filter_by_expr.png"
              label="${tool.name} on ${on_string}: Filter by Expression plot">
            <filter>produce_plots</filter>
        </data>

        <!-- Gene tagging tables for filtering after differential expression. -->
        <data name="genes_ignore_per_contrast_field"
              format="tabular"
              from_work_dir="deseq_output_dir/genes_to_ignore_per_contrast_field.tsv"
              label="${tool.name} on ${on_string}: Genes to ignore by contrast field">
            <filter>factor_fields</filter>
        </data>
        <!-- NOTE(review): this table is collected from plots_output_dir, which is only handed to the
             script (as the save path) when "Produce plots" is enabled. Confirm the script still
             writes it there when plots are disabled. -->
        <data name="genes_ignore_per_contrast"
              format="tabular"
              from_work_dir="plots_output_dir/genes_to_filter_by_contrast.tsv"
              label="${tool.name} on ${on_string}: Genes to ignore by contrast">
            <filter>filter_per_contrast['filter'] == 'yes'</filter>
        </data>
    </outputs>
    <tests>
        <test expect_num_outputs="7">
            <!-- Baseline run with plots, AnnData output and factor fields enabled but without the
                 per-contrast filter, so the "Genes to ignore by contrast" output is not expected. -->
            <param name="input_file" value="mito_counted_anndata.h5ad"/>
            <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/>
            <param name="groupby" value="batch_sex"/>
            <param name="sample_key" value="genotype"/>
            <param name="factor_fields" value="genotype,batch_sex"/>
            <param name="mode" value="sum"/>
            <param name="min_cells" value="10"/>
            <param name="produce_plots" value="true"/>
            <param name="produce_anndata" value="true"/>
            <param name="min_counts" value="10"/>
            <param name="min_counts_per_sample" value="50"/>
            <param name="min_total_counts" value="1000"/>
            <param name="filter_expr" value="true"/>
            <param name="plot_samples_figsize" value="10 10"/>
            <param name="plot_filtering_figsize" value="10 10"/>
            <output name="pbulk_anndata" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/psbulk_n_cells"/>
                </assert_contents>
            </output>
            <output name="count_matrix" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="3620"/>
                    <has_n_columns n="8"/>
                </assert_contents>
            </output>
            <output name="samples_metadata" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="8"/>
                    <has_n_columns n="3"/>
                </assert_contents>
            </output>
            <output name="genes_metadata" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="3620"/>
                    <has_n_columns n="13"/>
                </assert_contents>
            </output>
            <!-- PNG outputs are checked by approximate file size only. -->
            <output name="plot_output" ftype="png">
                <assert_contents>
                    <has_size value="31853" delta="3000"/>
                </assert_contents>
            </output>
            <output name="genes_ignore_per_contrast_field" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="filter_by_expr_plot" ftype="png">
                <assert_contents>
                    <has_size value="21656" delta="2000"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="8">
            <!-- Same settings as the baseline test plus the per-contrast filter, which adds the
                 "Genes to ignore by contrast" output (eight outputs in total). -->
            <param name="input_file" value="mito_counted_anndata.h5ad"/>
            <!-- Parameters that live inside the conditional are nested so they resolve unambiguously. -->
            <conditional name="filter_per_contrast">
                <param name="filter" value="yes"/>
                <param name="contrasts_file" value="test_contrasts.txt" ftype="txt"/>
                <param name="min_cells_perc_per_contrast_cond" value="25"/>
            </conditional>
            <param name="adata_obs_fields_to_merge" value="batch,sex:batch,genotype"/>
            <param name="groupby" value="batch_sex"/>
            <param name="sample_key" value="genotype"/>
            <param name="factor_fields" value="genotype,batch_sex"/>
            <param name="mode" value="sum"/>
            <param name="min_cells" value="10"/>
            <param name="produce_plots" value="true"/>
            <param name="produce_anndata" value="true"/>
            <param name="min_counts" value="10"/>
            <param name="min_counts_per_sample" value="50"/>
            <param name="min_total_counts" value="1000"/>
            <param name="filter_expr" value="true"/>
            <param name="plot_samples_figsize" value="10 10"/>
            <param name="plot_filtering_figsize" value="10 10"/>
            <output name="pbulk_anndata" ftype="h5ad">
                <assert_contents>
                    <has_h5_keys keys="obs/psbulk_n_cells"/>
                </assert_contents>
            </output>
            <output name="count_matrix" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="3620"/>
                    <has_n_columns n="8"/>
                </assert_contents>
            </output>
            <output name="samples_metadata" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="8"/>
                    <has_n_columns n="3"/>
                </assert_contents>
            </output>
            <output name="genes_metadata" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="3620"/>
                    <has_n_columns n="13"/>
                </assert_contents>
            </output>
            <!-- PNG outputs are checked by approximate file size only. -->
            <output name="plot_output" ftype="png">
                <assert_contents>
                    <has_size value="31853" delta="3000"/>
                </assert_contents>
            </output>
            <output name="genes_ignore_per_contrast_field" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="filter_by_expr_plot" ftype="png">
                <assert_contents>
                    <has_size value="21656" delta="2000"/>
                </assert_contents>
            </output>
            <output name="genes_ignore_per_contrast" ftype="tabular">
                <assert_contents>
                    <has_n_lines n="35478"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
        <![CDATA[
        This tool generates a count matrix for pseudo-bulk analysis (to be done with a separate tool like EdgeR or DESeq2) and filtering using Decoupler-py. Provide the input AnnData file and specify the necessary parameters.

        - Input AnnData file: The input AnnData file to be processed.
        - Contrasts file: optional file with a header and a single column, with arithmetic operations (contrasts definitions) as expected by EdgeR or DESeq2 based on the existing groups in the AnnData. It is only required if you want to get a list of genes to filter out later on, based on the percentage of cells that express those genes per contrast's conditions.
        - Min % expression per contrast (in at least one condition of the contrast): Percentage of cells (within at least one condition of a contrast) that need to express a gene for that gene not to be marked for later filtering. This requires the contrast file to be provided.
        - Obs Fields to Merge: Fields in adata.obs to merge, comma separated (optional).
        - Groupby column: The column in adata.obs that defines the groups.
        - Sample Key column: The column in adata.obs that defines the samples.
        - Layer (optional): The name of the layer of the AnnData object to use.
        - Mode: The mode for Decoupler pseudobulk analysis (sum, mean, median). Sum by default.
        - Factor Fields (optional): Fields in adata.obs to use as factors, comma separated (optional). For EdgeR make sure that the first field is the main contrast field desired and the rest of the fields are the covariates desired.
        - Use Raw: Whether to use the raw part of the AnnData object.
        - Minimum Cells: Minimum number of cells for pseudobulk analysis (optional).
        - Minimum Counts: Minimum count threshold for filtering by expression (optional).
        - Minimum Total Counts: Minimum total count threshold for filtering by expression (optional).
        - Enable Filtering by Expression: Check this box to enable filtering by expression.
        - Plot Samples Figsize: Size of the samples plot as a tuple (two arguments).
        - Plot Filtering Figsize: Size of the filtering plot as a tuple (two arguments).

        The tool will output the filtered AnnData, count matrix, samples metadata, genes metadata (in DESeq2 format), and the pseudobulk plot and filter by expression plot (if enabled). Files for filtering genes later on are also generated (to ignore after the DE model).

        You can obtain more information about Decoupler pseudobulk at the developers documentation `here <https://decoupler-py.readthedocs.io/en/latest/notebooks/pseudobulk.html>`_ .

        ]]>
    </help>
    <!-- Publication to cite when using this tool (presumably the decoupler paper; confirm the DOI target). -->
    <citations>
        <citation type="doi">10.1093/bioadv/vbac016</citation>
    </citations>
</tool>