Mercurial > repos > iuc > scanpy_normalize
diff normalize.xml @ 17:5dada6f76047 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 91121b1e72696f17478dae383badaa71e9f96dbb
author | iuc |
---|---|
date | Sat, 14 Sep 2024 12:42:55 +0000 |
parents | d844935c906c |
children |
line wrap: on
line diff
--- a/normalize.xml Tue Aug 20 09:52:24 2024 +0000 +++ b/normalize.xml Sat Sep 14 12:42:55 2024 +0000 @@ -1,45 +1,40 @@ -<tool id="scanpy_normalize" name="Normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@"> - <description>and impute with scanpy</description> +<tool id="scanpy_normalize" name="Scanpy normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>and impute</description> <macros> <import>macros.xml</import> </macros> <expand macro="bio_tools"/> - <expand macro="requirements"/> + <expand macro="requirements"> + <requirement type="package" version="3.0.0">magic-impute</requirement> + </expand> <expand macro="version_command"/> <command detect_errors="exit_code"><![CDATA[ @CMD@ ]]></command> <configfiles> <configfile name="script_file"><![CDATA[ -@CMD_imports@ -@CMD_read_inputs@ +@CMD_IMPORTS@ +@CMD_READ_INPUTS@ -#if $method.method == "pp.normalize_total" +#if str($method.method) == 'pp.normalize_total': sc.pp.normalize_total( adata, - #if str($method.target_sum) != '' + #if str($method.target_sum) != '': target_sum=$method.target_sum, #end if exclude_highly_expressed=$method.exclude_highly_expressed.exclude_highly_expressed, - #if $method.exclude_highly_expressed.exclude_highly_expressed == "True" + #if str($method.exclude_highly_expressed.exclude_highly_expressed) == 'True': max_fraction=$method.exclude_highly_expressed.max_fraction, #end if - #if $method.key_added + #if str($method.key_added) != '': key_added='$method.key_added', #end if - #if $method.layers - #if str($method.layers) != 'all' - layers[str(x.strip()) for x in str($method.layers).split(',')], - #else - layers='$method.layers', - #end if - #end if - #if str($method.layer_norm) != "None" - layer_norm='$method.layer_norm', + #if str($method.layer) != '': + layer='$method.layer', #end if inplace=True) -#else if $method.method == "pp.recipe_zheng17" +#else if str($method.method) == 'pp.recipe_zheng17': sc.pp.recipe_zheng17( 
adata=adata, n_top_genes=$method.n_top_genes, @@ -47,52 +42,62 @@ plot=False, copy=False) -#else if $method.method == "pp.recipe_weinreb17" +#else if str($method.method) == 'pp.recipe_weinreb17': sc.pp.recipe_weinreb17( adata=adata, log=$method.log, - mean_threshold=$method.mean_threshold, - cv_threshold=$method.cv_threshold, - n_pcs=$method.n_pcs, - svd_solver='$method.svd_solver', - random_state=$method.random_state, + mean_threshold=0.01, + cv_threshold=2, + n_pcs=50, + svd_solver='randomized', + random_state=0, copy=False) -#else if $method.method == "pp.recipe_seurat" +#else if str($method.method) == 'pp.recipe_seurat': sc.pp.recipe_seurat( adata=adata, log=$method.log, plot=False, copy=False) -#else if $method.method == "external.pp.magic" +#else if str($method.method) == 'external.pp.magic': +print("stats before magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}") + sc.external.pp.magic( adata=adata, name_list='$method.name_list', knn=$method.knn, - #if str($method.decay) != '' + #if str($method.decay) != '': decay=$method.decay, #end if - #if str($method.knn_max) != '' + #if str($method.knn_max) != '': knn_max=$method.knn_max, #end if - #if $method.t == -1 + #if $method.t == -1: t='auto', #else t=$method.t, #end if - #if str($method.n_pca) != '' + #if str($method.n_pca) != '': n_pca=$method.n_pca, #end if solver='$method.solver', knn_dist='$method.knn_dist', + #if str($method.random_state) != '': random_state=$method.random_state, + #else + random_state=None, + #end if copy=False) + + #if str($method.name_list) == 'all_genes': +print("stats after magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}") + #end if #end if -@CMD_anndata_write_outputs@ - -]]></configfile> +@CMD_ANNDATA_WRITE_OUTPUTS@ + ]]> + </configfile> </configfiles> <inputs> <expand macro="inputs_anndata"/> @@ -108,63 +113,50 @@ <param argument="target_sum" type="float" value="" 
optional="true" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/> <conditional name="exclude_highly_expressed"> <param argument="exclude_highly_expressed" type="select" label="Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell" help=" A gene is considered highly expressed, if it has more than max_fraction of the total counts in at least one cell. The not-excluded genes will sum up to target_sum"> + <option value="False" selected="true">No</option> <option value="True">Yes</option> - <option value="False" selected="true">No</option> </param> <when value="True"> - <param argument="max_fraction" type="float" value="0.05" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/> + <param argument="max_fraction" type="float" value="0.05" label="Consider cells as highly expressed that have more counts than this value of the original total counts in at least one cell."/> </when> <when value="False"/> </conditional> - <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored" help=""> - <expand macro="sanitize_query" /> + <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored"> + <expand macro="sanitize_query"/> </param> - <param argument="layers" type="text" value="" optional="true" label="List of layers to normalize" help="'All' will normalize all layers. 
The list should be comma-separated."> - <expand macro="sanitize_query" /> - </param> - <param argument="layer_norm" type="select" label="How to normalize layers?"> - <option value="None">None: after normalization, for each layer in layers each cell has a total count equal to the median of the median of the total counts (cells) before normalization of the layer.</option> - <option value="after">After: for each layer in layers each cell has a total count equal to target_sum.</option> - <option value="X">X: for each layer in layers each cell has a total count equal to the median of total counts for observations (cells) of adata.X before normalization.</option> + <param argument="layer" type="text" value="" label="Layer to normalize instead of X. If not provided, X is normalized."> + <expand macro="sanitize_query"/> </param> </when> <when value="pp.recipe_zheng17"> - <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep" help=""/> - <expand macro="param_log"/> + <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep"/> + <expand macro="param_log" checked="true"/> </when> <when value="pp.recipe_weinreb17"> - <expand macro="param_log"/> - <param argument="mean_threshold" type="float" value="0.01" label="Mean threshold" help=""/> - <param argument="cv_threshold" type="float" value="2" label="CV threshold" help=""/> - <param argument="n_pcs" type="integer" min="0" value="50" label="Number of principal component" help=""/> - <expand macro="svd_solver"/> - <expand macro="pca_random_state"/> + <expand macro="param_log" checked="true"/> </when> <when value="pp.recipe_seurat"> - <expand macro="param_log"/> + <expand macro="param_log" checked="true"/> </when> <when value="external.pp.magic"> <param name="name_list" type="select" label="Denoised genes to return" help="Selecting all genes may require a large amount of memory"> - <option value="all_genes">All genes</option> + <option 
value="all_genes" selected="true">All genes</option> <option value="pca_only">PCA only</option> </param> - <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel" help=""/> - <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" - help="If not set, alpha decaying kernel is not used" /> - <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection" - help="If not set, will be set to 3 * knn" /> - <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion" - help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data." /> + <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel"/> + <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" help="If not set, alpha decaying kernel is not used"/> + <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection" help="If not set, will be set to 3 * knn"/> + <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion" help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data."/> <param argument="n_pca" type="integer" value="100" optional="true" label="Number of principal components to use for calculating neighborhoods" - help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed." 
/> + help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed."/> <param name="solver" type="select" label="Which solver to use" help="Selecting all genes may require a large amount of memory"> - <option value="exact">"exact", the implementation described in van Dijk et al. (2018) </option> + <option value="exact" selected="true">"exact", the implementation described in van Dijk et al. (2018) </option> <option value="approximate">"approximate", is faster that performs imputation in the PCA space and then projects back to the gene space</option> </param> <param name="knn_dist" type="select" label="Distance metric to use for the data" help="See scipy.spatial.distance.pdist documentation for more options https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html"> <expand macro="distance_metric_options"/> </param> - <expand macro="param_random_state"/> + <param argument="random_state" type="integer" optional="true" label="Random seed" help="Defaults to the global numpy random number generator."/> </when> </conditional> <expand macro="inputs_common_advanced"/> @@ -173,41 +165,39 @@ <expand macro="anndata_outputs"/> </outputs> <tests> + + <!-- test 1 --> <test expect_num_outputs="2"> - <!-- test 1 --> - <param name="adata" value="krumsiek11.h5ad" /> + <param name="adata" value="krumsiek11.h5ad"/> <conditional name="method"> <param name="method" value="pp.normalize_total"/> - <conditional name="exclude_highly_expressed"> - <param name="exclude_highly_expressed" value="False"/> - </conditional> <param name="key_added" value="n_counts"/> - <param name="layers" value="all"/> - <param name="layer_norm" value="None"/> </conditional> <section name="advanced_common"> - <param name="show_log" value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> <has_text_matching 
expression="sc.pp.normalize_total"/> <has_text_matching expression="exclude_highly_expressed=False"/> <has_text_matching expression="key_added='n_counts'"/> - <has_text_matching expression="layers='all'"/> </assert_contents> </output> - <output name="anndata_out" file="pp.normalize_total.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/n_counts"/> + </assert_contents> + </output> </test> + + <!-- test 2 --> <test expect_num_outputs="2"> - <!-- test 2 --> <param name="adata" value="random-randint.h5ad"/> <conditional name="method"> <param name="method" value="pp.recipe_zheng17"/> - <param name="n_top_genes" value="1000"/> - <param name="log" value="True"/> </conditional> <section name="advanced_common"> - <param name="show_log" value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> @@ -216,45 +206,50 @@ <has_text_matching expression="log=True"/> </assert_contents> </output> - <output name="anndata_out" file="pp.recipe_zheng17.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.15"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/n_counts_all"/> + <has_h5_keys keys="var/n_counts,var/mean,var/std"/> + <has_h5_keys keys="uns/log1p"/> + </assert_contents> + </output> </test> + + <!-- test 3 --> <test expect_num_outputs="2"> - <!-- test 3 --> - <param name="adata" value="paul15_subsample.h5ad" /> + <param name="adata" value="paul15_subsample.h5ad"/> <conditional name="method"> <param name="method" value="pp.recipe_weinreb17"/> - <param name="log" value="True"/> - <param name="mean_threshold" value="0.01"/> - <param name="cv_threshold" value="2.0"/> - <param name="n_pcs" value="50"/> - <param name="svd_solver" value="randomized"/> - <param name="random_state" value="0"/> </conditional> <section name="advanced_common"> - <param name="show_log" 
value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> <has_text_matching expression="sc.pp.recipe_weinreb17"/> <has_text_matching expression="log=True"/> <has_text_matching expression="mean_threshold=0.01"/> - <has_text_matching expression="cv_threshold=2.0"/> + <has_text_matching expression="cv_threshold=2"/> <has_text_matching expression="n_pcs=50"/> <has_text_matching expression="svd_solver='randomized'"/> <has_text_matching expression="random_state=0"/> </assert_contents> </output> - <output name="anndata_out" file="pp.recipe_weinreb17.paul15_subsample.updated.h5ad" ftype="h5ad" compare="sim_size"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="uns/log1p"/> + </assert_contents> + </output> </test> + + <!-- test 4 --> <test expect_num_outputs="2"> - <!-- test 4 --> - <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad" /> + <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad"/> <conditional name="method"> <param name="method" value="pp.recipe_seurat"/> - <param name="log" value="True"/> - </conditional> + </conditional> <section name="advanced_common"> - <param name="show_log" value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> @@ -262,19 +257,25 @@ <has_text_matching expression="log=True"/> </assert_contents> </output> - <output name="anndata_out" file="pp.recipe_seurat.recipe_zheng17.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.25"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/n_genes"/> + <has_h5_keys keys="var/n_cells"/> + <has_h5_keys keys="uns/log1p"/> + </assert_contents> + </output> </test> + + <!-- test 5 --> <test expect_num_outputs="2"> - <!-- test 5 --> - <param name="adata" value="krumsiek11.h5ad" /> + <param name="adata" value="krumsiek11.h5ad"/> <conditional name="method"> <param name="method" 
value="external.pp.magic"/> - <param name="name_list" value="all_genes"/> <param name="t" value="-1"/> <param name="n_pca" value="5"/> </conditional> <section name="advanced_common"> - <param name="show_log" value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> @@ -282,13 +283,20 @@ <has_text_matching expression="name_list='all_genes'"/> <has_text_matching expression="t='auto'"/> <has_text_matching expression="n_pca=5"/> + <has_text_matching expression="stats before magic: min= -0.01630 max= 1.01060 mean= 0.28644"/> + <has_text_matching expression="stats after magic: min= -0.00857 max= 1.00546 mean= 0.28645"/> </assert_contents> </output> - <output name="anndata_out" file="external.pp.magic.all_genes.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obs/cell_type"/> + </assert_contents> + </output> </test> + + <!-- test 6 --> <test expect_num_outputs="2"> - <!-- test 6 --> - <param name="adata" value="krumsiek11.h5ad" /> + <param name="adata" value="krumsiek11.h5ad"/> <conditional name="method"> <param name="method" value="external.pp.magic"/> <param name="name_list" value="pca_only"/> @@ -296,7 +304,7 @@ <param name="n_pca" value="5"/> </conditional> <section name="advanced_common"> - <param name="show_log" value="true" /> + <param name="show_log" value="true"/> </section> <output name="hidden_output"> <assert_contents> @@ -306,23 +314,26 @@ <has_text_matching expression="n_pca=5"/> </assert_contents> </output> - <output name="anndata_out" file="external.pp.magic.pca_only.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + <output name="anndata_out" ftype="h5ad"> + <assert_contents> + <has_h5_keys keys="obsm/X_magic"/> + </assert_contents> + </output> <assert_stdout> <has_text text="X_magic"/> </assert_stdout> </test> </tests> <help><![CDATA[ -Normalize total counts per cell (`pp.normalize_per_cell`) 
-========================================================= +Normalize total counts per cell (`pp.normalize_total`) +====================================================== -Normalize each cell by total counts over all genes, so that every cell has -the same total count after normalization. +Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization. Similar functions are used, for example, by Seurat, Cell Ranger or SPRING. More details on the `scanpy documentation -<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_per_cell.html>`__ +<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_total.html>`__ Normalization and filtering as of Zheng et al. (2017), the Cell Ranger R Kit of 10x Genomics (`pp.recipe_zheng17`) @@ -369,7 +380,7 @@ MAGIC is an algorithm for denoising and transcript recovery of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold. -The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018). +The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018). - Firstly, we use the adaptive kernel described in Moon et al. (2019) for improved stability. - Secondly, data diffusion is applied in the PCA space, rather than the data space, for speed and memory improvements.