Mercurial > repos > iuc > scanpy_normalize

diff normalize.xml @ 17:5dada6f76047 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 91121b1e72696f17478dae383badaa71e9f96dbb
author: iuc
date: Sat, 14 Sep 2024 12:42:55 +0000
parents: d844935c906c
--- a/normalize.xml	Tue Aug 20 09:52:24 2024 +0000
+++ b/normalize.xml	Sat Sep 14 12:42:55 2024 +0000
@@ -1,45 +1,40 @@
-<tool id="scanpy_normalize" name="Normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
-    <description>and impute with scanpy</description>
+<tool id="scanpy_normalize" name="Scanpy normalize" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>and impute</description>
     <macros>
         <import>macros.xml</import>
     </macros>
     <expand macro="bio_tools"/>
-    <expand macro="requirements"/>
+    <expand macro="requirements">
+        <requirement type="package" version="3.0.0">magic-impute</requirement>
+    </expand>
     <expand macro="version_command"/>
     <command detect_errors="exit_code"><![CDATA[
 @CMD@
       ]]></command>
     <configfiles>
         <configfile name="script_file"><![CDATA[
-@CMD_imports@
-@CMD_read_inputs@
+@CMD_IMPORTS@
+@CMD_READ_INPUTS@
 
-#if $method.method == "pp.normalize_total"
+#if str($method.method) == 'pp.normalize_total':
 sc.pp.normalize_total(
     adata,
-    #if str($method.target_sum) != ''
+    #if str($method.target_sum) != '':
     target_sum=$method.target_sum,
     #end if
     exclude_highly_expressed=$method.exclude_highly_expressed.exclude_highly_expressed,
-    #if $method.exclude_highly_expressed.exclude_highly_expressed == "True"
+    #if str($method.exclude_highly_expressed.exclude_highly_expressed) == 'True':
     max_fraction=$method.exclude_highly_expressed.max_fraction,
     #end if
-    #if $method.key_added
+    #if str($method.key_added) != '':
     key_added='$method.key_added',
     #end if
-    #if $method.layers
-        #if str($method.layers) != 'all'
-    layers[str(x.strip()) for x in str($method.layers).split(',')],
-        #else
-    layers='$method.layers',
-        #end if
-    #end if
-    #if str($method.layer_norm) != "None"
-        layer_norm='$method.layer_norm',
+    #if str($method.layer) != '':
+        layer='$method.layer',
     #end if
     inplace=True)
 
-#else if $method.method == "pp.recipe_zheng17"
+#else if str($method.method) == 'pp.recipe_zheng17':
 sc.pp.recipe_zheng17(
     adata=adata,
     n_top_genes=$method.n_top_genes,
@@ -47,52 +42,62 @@
     plot=False,
     copy=False)
 
-#else if $method.method == "pp.recipe_weinreb17"
+#else if str($method.method) == 'pp.recipe_weinreb17':
 sc.pp.recipe_weinreb17(
     adata=adata,
     log=$method.log,
-    mean_threshold=$method.mean_threshold,
-    cv_threshold=$method.cv_threshold,
-    n_pcs=$method.n_pcs,
-    svd_solver='$method.svd_solver',
-    random_state=$method.random_state,
+    mean_threshold=0.01,
+    cv_threshold=2,
+    n_pcs=50,
+    svd_solver='randomized',
+    random_state=0,
     copy=False)
 
-#else if $method.method == "pp.recipe_seurat"
+#else if str($method.method) == 'pp.recipe_seurat':
 sc.pp.recipe_seurat(
     adata=adata,
     log=$method.log,
     plot=False,
     copy=False)
 
-#else if $method.method == "external.pp.magic"
+#else if str($method.method) == 'external.pp.magic':
+print("stats before magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}")
+
 sc.external.pp.magic(
     adata=adata,
     name_list='$method.name_list',
     knn=$method.knn,
-    #if str($method.decay) != ''
+    #if str($method.decay) != '':
     decay=$method.decay,
     #end if
-    #if str($method.knn_max) != ''
+    #if str($method.knn_max) != '':
     knn_max=$method.knn_max,
     #end if
-    #if $method.t == -1
+    #if $method.t == -1:
     t='auto',
     #else
     t=$method.t,
     #end if
-    #if str($method.n_pca) != ''
+    #if str($method.n_pca) != '':
     n_pca=$method.n_pca,
     #end if
     solver='$method.solver',
     knn_dist='$method.knn_dist',
+    #if str($method.random_state) != '':
     random_state=$method.random_state,
+    #else
+    random_state=None,
+    #end if
     copy=False)
+
+    #if str($method.name_list) == 'all_genes':
+print("stats after magic:", "min=", f"{adata.X.min():.5f}", "max=", f"{adata.X.max():.5f}", "mean=", f"{adata.X.mean():.5f}")
+    #end if
 #end if
 
-@CMD_anndata_write_outputs@
-
-]]></configfile>
+@CMD_ANNDATA_WRITE_OUTPUTS@
+        ]]>
+        </configfile>
     </configfiles>
     <inputs>
         <expand macro="inputs_anndata"/>
@@ -108,63 +113,50 @@
                 <param argument="target_sum" type="float" value="" optional="true" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/>
                 <conditional name="exclude_highly_expressed">
                     <param argument="exclude_highly_expressed" type="select" label="Exclude (very) highly expressed genes for the computation of the normalization factor (size factor) for each cell" help="A gene is considered highly expressed if it has more than max_fraction of the total counts in at least one cell. The non-excluded genes will sum up to target_sum.">
+                        <option value="False" selected="true">No</option>
                         <option value="True">Yes</option>
-                        <option value="False" selected="true">No</option>
                     </param>
                     <when value="True">
-                        <param argument="max_fraction" type="float" value="0.05" label="Target sum" help="If not provided, after normalization, each observation (cell) has a total count equal to the median of the total counts (cells) before normalization."/>
+                        <param argument="max_fraction" type="float" value="0.05" label="Consider genes as highly expressed that have more counts than this fraction of the original total counts in at least one cell."/>
                     </when>
                     <when value="False"/>
                 </conditional>
-                <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored" help="">
-                    <expand macro="sanitize_query" />
+                <param argument="key_added" type="text" value="" optional="true" label="Name of the field in 'adata.obs' where the normalization factor is stored">
+                    <expand macro="sanitize_query"/>
                 </param>
-                <param argument="layers" type="text" value="" optional="true" label="List of layers to normalize" help="'All' will normalize all layers. The list should be comma-separated.">
-                    <expand macro="sanitize_query" />
-                </param>
-                <param argument="layer_norm" type="select" label="How to normalize layers?">
-                    <option value="None">None: after normalization, for each layer in layers each cell has a total count equal to the median of the median of the total counts (cells) before normalization of the layer.</option>
-                    <option value="after">After: for each layer in layers each cell has a total count equal to target_sum.</option>
-                    <option value="X">X: for each layer in layers each cell has a total count equal to the median of total counts for observations (cells) of adata.X before normalization.</option>
+                <param argument="layer" type="text" value="" label="Layer to normalize instead of X. If not provided, X is normalized.">
+                    <expand macro="sanitize_query"/>
                 </param>
             </when>
             <when value="pp.recipe_zheng17">
-                <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep" help=""/>
-                <expand macro="param_log"/>
+                <param argument="n_top_genes" type="integer" min="0" value="1000" label="Number of genes to keep"/>
+                <expand macro="param_log" checked="true"/>
             </when>
             <when value="pp.recipe_weinreb17">
-                <expand macro="param_log"/>
-                <param argument="mean_threshold" type="float" value="0.01" label="Mean threshold" help=""/>
-                <param argument="cv_threshold" type="float" value="2" label="CV threshold" help=""/>
-                <param argument="n_pcs" type="integer" min="0" value="50" label="Number of principal component" help=""/>
-                <expand macro="svd_solver"/>
-                <expand macro="pca_random_state"/>
+                <expand macro="param_log" checked="true"/>
             </when>
             <when value="pp.recipe_seurat">
-                <expand macro="param_log"/>
+                <expand macro="param_log" checked="true"/>
             </when>
             <when value="external.pp.magic">
                 <param name="name_list" type="select" label="Denoised genes to return" help="Selecting all genes may require a large amount of memory">
-                    <option value="all_genes">All genes</option>
+                    <option value="all_genes" selected="true">All genes</option>
                     <option value="pca_only">PCA only</option>
                 </param>
-                <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel" help=""/>
-                <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" 
-                    help="If not set, alpha decaying kernel is not used" />
-                <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection"
-                    help="If not set, will be set to 3 * knn" />
-                <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion"
-                    help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data." />
+                <param argument="knn" type="integer" min="1" value="5" label="Number of nearest neighbors on which to build kernel"/>
+                <param argument="decay" type="integer" optional="true" value="1" label="Set decay rate of kernel tails" help="If not set, alpha decaying kernel is not used"/>
+                <param argument="knn_max" type="integer" min="1" optional="true" value="" label="Maximum number of nearest neighbors with nonzero connection" help="If not set, will be set to 3 * knn"/>
+                <param argument="t" type="integer" min="-1" value="3" label="Power to which the diffusion operator is powered. This sets the level of diffusion" help="If ‘-1’, this parameter is selected according to the Procrustes disparity of the diffused data."/>
                 <param argument="n_pca" type="integer" value="100" optional="true" label="Number of principal components to use for calculating neighborhoods"
-                    help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed." />
+                    help="For extremely large datasets, using n_pca less than 20 allows neighborhoods to be calculated in roughly log(n_samples) time. If not set, no PCA is performed."/>
                 <param name="solver" type="select" label="Which solver to use" help="The 'approximate' solver is faster but may return negative values.">
-                    <option value="exact">"exact", the implementation described in van Dijk et al. (2018) </option>
+                    <option value="exact" selected="true">"exact", the implementation described in van Dijk et al. (2018) </option>
                     <option value="approximate">"approximate", a faster implementation that performs imputation in the PCA space and then projects back to the gene space</option>
                 </param>
                 <param name="knn_dist" type="select" label="Distance metric to use for the data" help="See scipy.spatial.distance.pdist documentation for more options https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html">
                     <expand macro="distance_metric_options"/>
                 </param>
-                <expand macro="param_random_state"/>
+                <param argument="random_state" type="integer" optional="true" label="Random seed" help="Defaults to the global numpy random number generator."/>
             </when>
         </conditional>
         <expand macro="inputs_common_advanced"/>
@@ -173,41 +165,39 @@
         <expand macro="anndata_outputs"/>
     </outputs>
     <tests>
+
+        <!-- test 1 -->
         <test expect_num_outputs="2">
-            <!-- test 1 -->
-            <param name="adata" value="krumsiek11.h5ad" />
+            <param name="adata" value="krumsiek11.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.normalize_total"/>
-                <conditional name="exclude_highly_expressed">
-                    <param name="exclude_highly_expressed" value="False"/>
-                </conditional>
                 <param name="key_added" value="n_counts"/>
-                <param name="layers" value="all"/>
-                <param name="layer_norm" value="None"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
                     <has_text_matching expression="sc.pp.normalize_total"/>
                     <has_text_matching expression="exclude_highly_expressed=False"/>
                     <has_text_matching expression="key_added='n_counts'"/>
-                    <has_text_matching expression="layers='all'"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.normalize_total.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/n_counts"/>
+                </assert_contents>
+            </output>
         </test>
+
+        <!-- test 2 -->
         <test expect_num_outputs="2">
-            <!-- test 2 -->
             <param name="adata" value="random-randint.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.recipe_zheng17"/>
-                <param name="n_top_genes" value="1000"/>
-                <param name="log" value="True"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
@@ -216,45 +206,50 @@
                     <has_text_matching expression="log=True"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.recipe_zheng17.random-randint.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.15"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/n_counts_all"/>
+                    <has_h5_keys keys="var/n_counts,var/mean,var/std"/>
+                    <has_h5_keys keys="uns/log1p"/>
+                </assert_contents>
+            </output>
         </test>
+
+        <!-- test 3 -->
         <test expect_num_outputs="2">
-            <!-- test 3 -->
-            <param name="adata" value="paul15_subsample.h5ad" />
+            <param name="adata" value="paul15_subsample.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.recipe_weinreb17"/>
-                <param name="log" value="True"/>
-                <param name="mean_threshold" value="0.01"/>
-                <param name="cv_threshold" value="2.0"/>
-                <param name="n_pcs" value="50"/>
-                <param name="svd_solver" value="randomized"/>
-                <param name="random_state" value="0"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
                     <has_text_matching expression="sc.pp.recipe_weinreb17"/>
                     <has_text_matching expression="log=True"/>
                     <has_text_matching expression="mean_threshold=0.01"/>
-                    <has_text_matching expression="cv_threshold=2.0"/>
+                    <has_text_matching expression="cv_threshold=2"/>
                     <has_text_matching expression="n_pcs=50"/>
                     <has_text_matching expression="svd_solver='randomized'"/>
                     <has_text_matching expression="random_state=0"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.recipe_weinreb17.paul15_subsample.updated.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="uns/log1p"/>
+                </assert_contents>
+            </output>
         </test>
+
+        <!-- test 4 -->
         <test expect_num_outputs="2">
-            <!-- test 4 -->
-            <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad" />
+            <param name="adata" value="pp.recipe_zheng17.random-randint.h5ad"/>
             <conditional name="method">
                 <param name="method" value="pp.recipe_seurat"/>
-                <param name="log" value="True"/>
-            </conditional>
+            </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
@@ -262,19 +257,25 @@
                     <has_text_matching expression="log=True"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="pp.recipe_seurat.recipe_zheng17.h5ad" ftype="h5ad" compare="sim_size" delta="1000000" delta_frac="0.25"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/n_genes"/>
+                    <has_h5_keys keys="var/n_cells"/>
+                    <has_h5_keys keys="uns/log1p"/>
+                </assert_contents>
+            </output>
         </test>
+
+        <!-- test 5 -->
         <test expect_num_outputs="2">
-            <!-- test 5 -->
-            <param name="adata" value="krumsiek11.h5ad" />
+            <param name="adata" value="krumsiek11.h5ad"/>
             <conditional name="method">
                 <param name="method" value="external.pp.magic"/>
-                <param name="name_list" value="all_genes"/>
                 <param name="t" value="-1"/>
                 <param name="n_pca" value="5"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
@@ -282,13 +283,20 @@
                     <has_text_matching expression="name_list='all_genes'"/>
                     <has_text_matching expression="t='auto'"/>
                     <has_text_matching expression="n_pca=5"/>
+                    <has_text_matching expression="stats before magic: min= -0.01630 max= 1.01060 mean= 0.28644"/>
+                    <has_text_matching expression="stats after magic: min= -0.00857 max= 1.00546 mean= 0.28645"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="external.pp.magic.all_genes.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obs/cell_type"/>
+                </assert_contents>
+            </output>
         </test>
+
+        <!-- test 6 -->
         <test expect_num_outputs="2">
-            <!-- test 6 -->
-            <param name="adata" value="krumsiek11.h5ad" />
+            <param name="adata" value="krumsiek11.h5ad"/>
             <conditional name="method">
                 <param name="method" value="external.pp.magic"/>
                 <param name="name_list" value="pca_only"/>
@@ -296,7 +304,7 @@
                 <param name="n_pca" value="5"/>
             </conditional>
             <section name="advanced_common">
-                <param name="show_log" value="true" />
+                <param name="show_log" value="true"/>
             </section>
             <output name="hidden_output">
                 <assert_contents>
@@ -306,23 +314,26 @@
                     <has_text_matching expression="n_pca=5"/>
                 </assert_contents>
             </output>
-            <output name="anndata_out" file="external.pp.magic.pca_only.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+            <output name="anndata_out" ftype="h5ad">
+                <assert_contents>
+                    <has_h5_keys keys="obsm/X_magic"/>
+                </assert_contents>
+            </output>
             <assert_stdout>
                 <has_text text="X_magic"/>
             </assert_stdout>
         </test>
     </tests>
     <help><![CDATA[
-Normalize total counts per cell (`pp.normalize_per_cell`)
-=========================================================
+Normalize total counts per cell (`pp.normalize_total`)
+======================================================
 
-Normalize each cell by total counts over all genes, so that every cell has
-the same total count after normalization.
+Normalize each cell by total counts over all genes, so that every cell has the same total count after normalization. If choosing target_sum=1e6, this is CPM normalization.
 
 Similar functions are used, for example, by Seurat, Cell Ranger or SPRING.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_per_cell.html>`__
+<https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.normalize_total.html>`__
 
 
 Normalization and filtering as of Zheng et al. (2017), the Cell Ranger R Kit of 10x Genomics (`pp.recipe_zheng17`)
@@ -369,7 +380,7 @@
 
 MAGIC is an algorithm for denoising and transcript recovery of single cells applied to single-cell sequencing data. MAGIC builds a graph from the data and uses diffusion to smooth out noise and recover the data manifold.
 
-The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018). 
+The algorithm implemented here has changed primarily in two ways compared to the algorithm described in Van Dijk D et al. (2018).
 
 - Firstly, we use the adaptive kernel described in Moon et al. (2019) for improved stability.
 - Secondly, data diffusion is applied in the PCA space, rather than the data space, for speed and memory improvements.
author	iuc
date	Sat, 14 Sep 2024 12:42:55 +0000
parents	d844935c906c
children