view normalize_select_features_scale.xml @ 1:6bccf5f85f92 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/seurat_v5 commit 566984b588e88225f0b3f2dae88c6fd084315e7c
author iuc
date Tue, 05 Nov 2024 11:54:52 +0000
parents c3170652bd98
children
line wrap: on
line source

<tool id="seurat_preprocessing" name="Seurat Preprocessing" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>- Normalize, Find Variable Features, Scale and Regress</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <expand macro="version_command"/>
    <!-- The whole command line is supplied by the @CMD@ token (presumably defined in the imported macros.xml); job failure is detected from the process exit code. -->
    <command detect_errors="exit_code"><![CDATA[
@CMD@
    ]]></command>
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_imports@
@CMD_read_inputs@

## NormalizeData branch: builds the R call that normalizes the selected assay.
## Every optional argument is emitted before the always-present scale.factor so
## the generated call never ends with a dangling comma when block.size is unset.
#if $method.method == 'NormalizeData'
seurat_obj<-NormalizeData(
    seurat_obj,
    #if $method.assay != ''
    assay = '$method.assay',
    #end if
    normalization.method = '$method.normalization_method.normalization_method',
    ## margin is only read when the CLR method is selected
    #if $method.normalization_method.normalization_method == 'CLR'
    margin = $method.normalization_method.margin,
    #end if
    #if $method.block_size
    block.size = $method.block_size,
    #end if
    scale.factor = $method.scale_factor
)

#else if $method.method == 'FindVariableFeatures'
## FindVariableFeatures branch: selects variable features and, on request,
## writes the top N of them through the CMD_write_variable_tab macro token.
## Shorthand for the nested selection-method conditional.
#set $sel = $method.selection_method
seurat_obj <- FindVariableFeatures(
    seurat_obj,
    #if $method.assay != ''
    assay = '$method.assay',
    #end if
    selection.method = '$sel.selection_method',
    ## loess.span and clip.max are only offered for the vst method
    #if $sel.selection_method == 'vst'
    loess.span = $sel.loess_span,
        #if $sel.clip_max
    clip.max = $sel.clip_max,
        #end if
    #end if
    ## nfeatures is offered for both vst and dispersion, never for mean.var.plot
    #if str($sel.selection_method) in ['vst', 'dispersion']
        #if $sel.nfeatures
    nfeatures = $sel.nfeatures,
        #end if
    #end if
    num.bin = $method.num_bin,
    binning.method = '$method.binning_method'
)

    #if $method.output_topN.output_topN == 'true'
    N <- $method.output_topN.topN
    top_N <- head(VariableFeatures(seurat_obj), N)
    @CMD_write_variable_tab@
    #end if

#else if $method.method == 'ScaleData'
## ScaleData branch: scales/centers the chosen features and optionally
## regresses out variables. Leaving 'features' unset (the variable_features
## choice) lets ScaleData fall back to its own default feature set.

    #if $method.scale_features.scale_features == 'list_genes'
    features_list<-paste(readLines('$method.scale_features.features_to_scale'), collapse=",")
    #else if $method.scale_features.scale_features == 'all_genes'
    all.genes<-rownames(seurat_obj)
    #end if

seurat_obj<-ScaleData(
    seurat_obj,
    #if $method.scale_features.scale_features == 'all_genes'
    features = all.genes,
    #else if $method.scale_features.scale_features == 'list_genes'
    features = c(unlist(strsplit(features_list, ","))),
    #end if
    #if $method.assay != ''
    assay = '$method.assay',
    #end if
    #if $method.regress.regress == 'true'
    vars.to.regress = c(unlist(strsplit(gsub(" ", "", '$method.regress.vars_to_regress'), ","))),
    model.use = '$method.regress.model_use',
    use.umi = $method.regress.use_umi,
    #end if
    #if $method.split_by != ''
    split.by = '$method.split_by',
    #end if
    do.scale = $method.do_scale,
    ## do_scale is a boolean param whose rendered values are TRUE/FALSE, so the
    ## comparison must be against 'TRUE'; comparing with 'true' never matched and
    ## the three scaling arguments below were silently dropped.
    #if str($method.do_scale) == 'TRUE'
    scale.max = $method.scale_max,
    block.size = $method.block_size,
    min.cells.to.block = $method.min_cells_to_block,
    #end if
    ## do.center is unconditional and goes last so the call never ends in a comma
    do.center = $method.do_center
)

#else if $method.method == 'SCTransform'
## SCTransform branch: single-step normalization / variable feature selection /
## scaling, with an optional top-N variable feature table afterwards.

    #if $method.residual_features.residual_features_options == 'selected_features'
    features_list<-paste(readLines('$method.residual_features.residual_features'), collapse=",")
    #end if

seurat_obj<-SCTransform(
    seurat_obj,
    #if $method.assay != ''
    assay = '$method.assay',
    #end if
    new.assay.name = '$method.new_assay_name',
    #if $method.residual_features.residual_features_options == 'NULL'
        #if $method.residual_features.variable_features.variable_features == 'set_number'
        variable.features.n = $method.residual_features.variable_features.variable_features_n,
        #else if $method.residual_features.variable_features.variable_features == 'use_cutoff'
        variable.features.rv.th = $method.residual_features.variable_features.variable_features_rv_th,
        #end if
    #else if $method.residual_features.residual_features_options == 'selected_features'
    residual.features = c(unlist(strsplit(features_list, ","))),
    #end if
    #if $method.vars_to_regress != ''
    vars.to.regress = c(unlist(strsplit(gsub(" ", "", '$method.vars_to_regress'), ","))),
    #end if
    do.scale = $method.do_scale,
    do.center = $method.do_center,
    ## Compare the rendered text with '' instead of relying on truthiness, so a
    ## bound of 0 counts as set. Both bounds are required to emit clip.range.
    #if str($method.min_clip_range) != '' and str($method.max_clip_range) != ''
    clip.range = c($method.min_clip_range, $method.max_clip_range),
    #end if
    do.correct.umi = $method.adv.do_correct_umi,
    ncells = $method.adv.ncells,
    seed.use = $method.adv.seed_use,
    vst.flavor = '$method.adv.vst_flavor',
    ## return.only.var.genes only exists when memory is not conserved; it is
    ## emitted before the unconditional conserve.memory so the call never ends
    ## with a dangling comma.
    #if $method.adv.conserve_memory.conserve_memory == 'FALSE'
    return.only.var.genes = $method.adv.conserve_memory.return_only_var_genes,
    #end if
    conserve.memory = $method.adv.conserve_memory.conserve_memory
)

    #if $method.output_topN.output_topN == 'true'
    N = $method.output_topN.topN
    top_N<-head(VariableFeatures(seurat_obj), N)
    @CMD_write_variable_tab@
    #end if

#end if

@CMD_rds_write_outputs@

]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="input_rds"/>
        <conditional name="method">
            <param name="method" type="select" label="Method used">
                <option value="NormalizeData">Normalize with 'NormalizeData'</option>
                <option value="FindVariableFeatures">Identify highly variable genes with 'FindVariableFeatures'</option>
                <option value="ScaleData">Scale and regress with 'ScaleData'</option>
                <option value="SCTransform">Complete all preprocessing with 'SCTransform'</option>
            </param>
            <when value="NormalizeData">
                <!-- All parameters of this branch come from the select_assay and normalize macros (not defined in this file). The script template reads assay, normalization_method, scale_factor and block_size from them, plus margin when the method is CLR. -->
                <expand macro="select_assay"/>
                <expand macro="normalize"/>
            </when>
            <when value="FindVariableFeatures">
                <!-- Inputs for the FindVariableFeatures branch of the script template. -->
                <expand macro="select_assay"/>
                <!-- Method-specific options: vst exposes loess_span, clip_max and nfeatures; dispersion exposes only nfeatures; mean.var.plot has none. -->
                <conditional name="selection_method">
                    <param name="selection_method" type="select" label="Method to select variable features" help="(selection.method)">
                        <option value="vst" selected="true">vst</option>
                        <option value="mean.var.plot">mean.var.plot</option>
                        <option value="dispersion">dispersion</option>
                    </param>
                    <when value="vst">
                        <param name="loess_span" type="float" value="0.3" label="Loess span parameter for fitting variance-mean relationship" help="(loess.span)"/>
                        <!-- Optional values left blank are omitted from the generated R call. -->
                        <param name="clip_max" type="float" optional="true" value="" label="Maximum value after standardisation" help="leave blank to use default, the square root of number of cells (clip.max)"/>
                        <param argument="nfeatures" type="integer" optional="true" value="2000" label="Number of features to select as top variable features"/>
                    </when>
                    <when value="mean.var.plot"></when>
                    <when value="dispersion">
                        <param argument="nfeatures" type="integer" optional="true" value="2000" label="Number of features to select as top variable features"/>
                    </when>
                </conditional>
                <!-- num_bin and binning_method are always passed to the R call, whatever the selection method. -->
                <param name="num_bin" type="integer" value="20" label="Number of bins to use" help="(num.bin)"/>
                <param name="binning_method" type="select" label="Method to compute bins" help="(binning.method)">
                    <option value="equal_width" selected="true">equal width</option>
                    <option value="equal_frequency">equal frequency</option>
                </param>
                <!-- When set to Yes, the template takes the first topN entries of VariableFeatures() and writes them out; topN itself comes from the set_topN macro. -->
                <conditional name="output_topN">
                    <param name="output_topN" type="select" label="Output list of most variable features">
                        <option value="true">Yes</option>
                        <option value="false" selected="true">No</option>
                    </param>
                    <when value="true">
                        <expand macro="set_topN"/>
                    </when>
                    <when value="false">
                    </when>
                </conditional>
            </when>
            <when value="ScaleData">
                <!-- Inputs for the ScaleData branch of the script template. -->
                <expand macro="select_assay"/>
                <!-- Regression options are only passed to ScaleData when this is set to Yes. -->
                <conditional name="regress">
                    <param name="regress" type="select" label="Regress out a variable">
                        <option value="true">Yes</option>
                        <option value="false" selected="true">No</option>
                    </param>
                    <when value="true">
                        <!-- The template strips spaces and splits this value on commas to build an R character vector. -->
                        <param name="vars_to_regress" type="text" optional="true" value="" label="Variable(s) to regress out" help="comma-separated list e.g. percent.mt, nCount_RNA (vars.to.regress)">
                            <expand macro="valid_list"/>
                        </param>
                        <!-- NOTE(review): declared optional although the template always emits model.use when regressing; confirm that deselecting every option is meant to be allowed. -->
                        <param name="model_use" type="select" optional="true" label="Model to use for regression" help="(model.use)">
                            <option value="linear" selected="true">linear</option>
                            <option value="poisson">poisson</option>
                            <option value="negbinom">negbinom</option>
                        </param>
                        <param name="use_umi" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Regress on UMI count data" help="recommended if regression model is not linear (use.umi)"/>
                    </when>
                    <when value="false">
                    </when>
                </conditional>
                <!-- Which features to scale: variable_features passes no 'features' argument, all_genes passes rownames of the object, list_genes reads one feature per line from the supplied file. -->
                <conditional name="scale_features">
                    <param name="scale_features" type="select" label="Features to scale">
                        <option value="variable_features" selected="true">Variable Features</option>
                        <option value="all_genes">All Features</option>
                        <option value="list_genes">Enter a list of features</option>
                    </param>
                    <when value="variable_features">
                    </when>
                    <when value="all_genes">
                    </when>
                    <when value="list_genes">
                        <param name="features_to_scale" type="data" format="txt,tabular" label="List of features to scale" help="text file with one feature on each line"/>
                    </when>
                </conditional>
                <!-- split_by is only passed to R when non-empty. -->
                <param name="split_by" type="text" optional="true" value="" label="Scale cells in groups by a variable, vector or factor" help="(split.by)"/>
                <!-- Booleans render as the R literals TRUE/FALSE (truevalue/falsevalue) and are inserted unquoted into the script. -->
                <param name="do_scale" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Scale the data" help="(do.scale)"/>
                <param name="do_center" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Center the data" help="(do.center)"/>
                <!-- The template places the three parameters below inside a check on do_scale. -->
                <param name="scale_max" type="float" value="10" label="Set max value for scaled data" help="(scale.max)"/>
                <param name="block_size" type="integer" value="1000" label="Number of features to scale in a single computation" help="(block.size)"/>
                <param name="min_cells_to_block" type="integer" value="3000" label="Use blocks for scaling if object has more than this number of cells" help="(min.cells.to.block)"/>
            </when>
            <when value="SCTransform">
                <!-- Inputs for the SCTransform branch of the script template. -->
                <expand macro="select_assay_RNA"/>
                <param name="new_assay_name" type="text" value="SCT" label="Name for new assay" help="to contain the normalized data (new.assay.name)">
                    <expand macro="valid_name"/>
                </param>
                <!-- Either let SCTransform pick variable features (by count or residual-variance cutoff) or supply an explicit feature list file. -->
                <conditional name="residual_features">
                    <param name="residual_features_options" type="select" label="Genes to calculate residual features for" help="(residual.features)">
                        <option value="NULL" selected="true">all genes</option>
                        <option value="selected_features">selected features</option>
                    </param>
                    <when value="NULL">
                        <conditional name="variable_features">
                            <param name="variable_features" type="select" label="How to set variable features">
                                <option value="set_number" selected="true">set number of variable features</option>
                                <option value="use_cutoff">set cutoff for residual variance</option>
                            </param>
                            <when value="set_number">
                                <param name="variable_features_n" type="integer" value="3000" label="Use this many features as variable features" help="after ranking residual variance for all genes (variable.features.n)"/>
                            </when>
                            <when value="use_cutoff">
                                <param name="variable_features_rv_th" type="float" value="1.3" label="Use this residual variance cutoff" help="after calculating residual variance for all genes (variable.features.rv.th)"/>
                            </when>
                        </conditional>
                    </when>
                    <when value="selected_features">
                        <param name="residual_features" type="data" format="txt,tabular" label="List of genes to use" help="text file with one feature on each line. These genes will be set as VariableFeatures in returned object (residual.features)"/>
                    </when>
                </conditional>
                <!-- When set to Yes, the template writes the first topN variable features to a table; topN comes from the set_topN macro. -->
                <conditional name="output_topN">
                    <param name="output_topN" type="select" label="Output list of most variable features">
                        <option value="true">Yes</option>
                        <option value="false" selected="true">No</option>
                    </param>
                    <when value="true">
                        <expand macro="set_topN"/>
                    </when>
                    <when value="false">
                    </when>
                </conditional>
                <!-- The template strips spaces and splits this value on commas; an empty value omits vars.to.regress. -->
                <param name="vars_to_regress" type="text" optional="true" value="" label="Variable(s) to regress out" help="comma-separated list e.g. percent.mt, nCount_RNA (vars.to.regress)">
                    <expand macro="valid_list"/>
                </param>
                <param name="do_scale" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="false" label="Scale the residuals" help="(do.scale)"/>
                <param name="do_center" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Center the residuals" help="(do.center)"/>
                <!-- clip.range is only passed to R when both bounds are provided. -->
                <param name="min_clip_range" type="float" optional="true" value="" label="Minimum for residual variances" help="clip values below this or leave empty to use default of -sqrt(n/30) (clip.range)"/>
                <param name="max_clip_range" type="float" optional="true" value="" label="Maximum for residual variances" help="clip values above this or leave empty to use default of sqrt(n/30) (clip.range)"/>
                <section name="adv" title="Advanced Options">
                    <param name="do_correct_umi" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Place corrected UMI matrix in assay counts slot" help="(do.correct.umi)"/>
                    <param argument="ncells" type="integer" value="5000" label="Number of subsampling cells used to build NB regression"/>
                    <param name="seed_use" type="integer" value="1448145" label="Set random seed" help="(seed.use)"/>
                    <!-- Option labels match the values handed to vst.flavor (v1 / v2). -->
                    <param name="vst_flavor" type="select" label="Vst Flavor" help="version of sctransform to use (vst.flavor)">
                        <option value="v1">v1</option>
                        <option value="v2" selected="true">v2</option>
                    </param>
                    <!-- return_only_var_genes is only offered (and only passed to R) when memory is not being conserved. -->
                    <conditional name="conserve_memory">
                        <param name="conserve_memory" type="select" label="Conserve memory" help="by not creating residual matrix for all genes (conserve.memory)">
                            <option value="FALSE" selected="true">No</option>
                            <option value="TRUE">Yes</option>
                        </param>
                        <when value="FALSE">
                            <param name="return_only_var_genes" type="boolean" truevalue="TRUE" falsevalue="FALSE" checked="true" label="Only return variable genes in scale.data matrices" help="(return.only.var.genes)"/>
                        </when>
                        <when value="TRUE"></when>
                    </conditional>
                </section>
            </when>
        </conditional>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <!-- Output datasets are declared by macros defined outside this file; the tests below refer to outputs named rds_out, hidden_output and variable_tabular. -->
    <outputs>
        <expand macro="seurat_outputs"/>
        <expand macro="variable_out"/>
    </outputs>
    <tests>
        <test expect_num_outputs="2">
            <!-- test1: NormalizeData -->
            <!-- LogNormalize with scale factor 10000 on the filtered object. The log output must mention NormalizeData and the RDS is compared by size against the reference normalized.rds. -->
            <param name="seurat_rds" location="https://zenodo.org/records/13732784/files/filtered.rds"/>
            <conditional name="method">
                <param name="method" value="NormalizeData"/>
                <conditional name="normalization_method">
                    <param name="normalization_method" value="LogNormalize"/>
                </conditional>
                <param name="scale_factor" value="10000"/>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="NormalizeData"/>
                </assert_contents>
            </output>
            <output name="rds_out" location="https://zenodo.org/records/13732784/files/normalized.rds" ftype="rds" compare="sim_size"/>
        </test>
        <test expect_num_outputs="3">
            <!-- test2: FindVariableFeatures -->
            <!-- vst selection of 100 features on the normalized object, with the top-N table enabled (hence three outputs). topN is not set here, so the 10 expected lines presumably reflect the set_topN macro default; confirm against macros.xml. -->
            <param name="seurat_rds" location="https://zenodo.org/records/13732784/files/normalized.rds"/>
            <conditional name="method">
                <param name="method" value="FindVariableFeatures"/>
                <conditional name="selection_method">
                    <param name="selection_method" value="vst"/>
                    <param name="nfeatures" value="100"/>
                </conditional>
                <param name="num_bin" value="20"/>
                <param name="binning_method" value="equal_width"/>
                <conditional name="output_topN">
                    <param name="output_topN" value="true"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="FindVariableFeatures"/>
                </assert_contents>
            </output>
            <output name="rds_out" location="https://zenodo.org/records/13732784/files/variablefeatures.rds" ftype="rds" compare="sim_size"/>
            <output name="variable_tabular" location="https://zenodo.org/records/13732784/files/variable_top10.txt" ftype="txt">
                <assert_contents>
                    <has_n_lines n="10"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="2">
            <!-- test3: ScaleData applied to all features of the object produced by the FindVariableFeatures step; every other option keeps its default. -->
            <param name="seurat_rds" location="https://zenodo.org/records/13732784/files/variablefeatures.rds"/>
            <conditional name="method">
                <param name="method" value="ScaleData"/>
                <conditional name="scale_features">
                    <param name="scale_features" value="all_genes"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <!-- the log must mention the function that was run -->
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="ScaleData"/>
                </assert_contents>
            </output>
            <!-- the RDS is compared by size only against the reference file -->
            <output name="rds_out" location="https://zenodo.org/records/13732784/files/scaled.rds" ftype="rds" compare="sim_size"/>
        </test>
        <test expect_num_outputs="2">
            <!-- test4: SCTransform -->
            <!-- Runs on the filtered (un-normalized) object, regressing out percent.mt and restricting residuals to the features listed in residual_features.txt. The log must mention SCTransform and the RDS is compared by size. -->
            <param name="seurat_rds" location="https://zenodo.org/records/13732784/files/filtered.rds"/>
            <conditional name="method">
                <param name="method" value="SCTransform"/>
                <param name="vars_to_regress" value="percent.mt"/>
                <conditional name="residual_features">
                    <param name="residual_features_options" value="selected_features"/>
                    <param name="residual_features" location="https://zenodo.org/records/13741333/files/residual_features.txt"/>
                </conditional>
            </conditional>
            <section name="advanced_common">
                <param name="show_log" value="true"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="SCTransform"/>
                </assert_contents>
            </output>
            <output name="rds_out" location="https://zenodo.org/records/13732784/files/SCTransformed.rds" ftype="rds" compare="sim_size"/>
        </test>
    </tests>
    <help><![CDATA[
Seurat
======

Seurat is an R package designed for QC, analysis, and exploration of single-cell RNA-seq data.

Seurat aims to enable users to identify and interpret sources of heterogeneity from single-cell transcriptomic measurements, and to integrate diverse types of single-cell data.

NormalizeData
=============

Normalize the count data present in a given assay.

Methods:

“LogNormalize”: Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. This is then natural-log transformed using log1p

“CLR”: Applies a centered log ratio transformation

“RC”: Relative counts. Feature counts for each cell are divided by the total counts for that cell and multiplied by the scale.factor. No log-transformation is applied. For counts per million (CPM) set scale.factor = 1e6


More details on the `seurat documentation
<https://satijalab.org/seurat/reference/normalizedata>`__

FindVariableFeatures
====================

Identify features that are outliers on a 'mean variability plot'.

Methods:

“vst”: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).

“mean.var.plot” (mvp): First, uses a function to calculate average expression (mean.function, using FastExpMean) and dispersion (dispersion.function, using FastLogVMR) for each feature. Next, divides features into num.bin (default 20) bins based on their average expression, and calculates z-scores for dispersion within each bin. The purpose of this is to identify variable features while controlling for the strong relationship between variability and average expression.

“dispersion” (disp): selects the genes with the highest dispersion values

More details on the `seurat documentation
<https://satijalab.org/seurat/reference/findvariablefeatures>`__

Scale and regress the data with ScaleData
=========================================

Scale and center features in the dataset.

If variables are provided in vars.to.regress, they are individually regressed against each feature, and the resulting residuals are then scaled and centered.

More details on the `seurat documentation
<https://satijalab.org/seurat/reference/scaledata>`__

SCTransform
===========

Use this function as an alternative to the NormalizeData, FindVariableFeatures, ScaleData workflow.

Results are saved in a new assay (named SCT by default) with counts being (corrected) counts, data being log1p(counts), and scale.data being Pearson residuals; sctransform::vst intermediate results are saved in the misc slot of the new assay.

More details on the `seurat documentation
<https://satijalab.org/seurat/reference/sctransform>`__

    ]]></help>
    <expand macro="citations"/>
</tool>