diff remove_confounders.xml @ 1:a89ee42625ad draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 8ef5f7c6f8728608a3f05bb51e11b642b84a05f5"
author iuc
date Wed, 16 Oct 2019 06:30:25 -0400
parents 9ca360dde8e3
children 94c8f42efc47
line wrap: on
line diff
--- a/remove_confounders.xml	Mon Mar 04 10:16:47 2019 -0500
+++ b/remove_confounders.xml	Wed Oct 16 06:30:25 2019 -0400
@@ -1,18 +1,7 @@
-<tool id="scanpy_remove_confounders" name="Remove confounders with scanpy" version="@version@">
-    <description></description>
+<tool id="scanpy_remove_confounders" name="Remove confounders" version="@version@">
+    <description>with scanpy</description>
     <macros>
         <import>macros.xml</import>
-        <xml name="score_genes_params">
-            <param argument="n_bins" type="integer" value="25" label="Number of expression level bins for sampling" help=""/>
-            <param argument="random_state" type="integer" value="0" label="Random seed for sampling" help=""/>
-            <expand macro="param_use_raw"/>
-        </xml>
-        <token name="@CMD_score_genes_inputs@"><![CDATA[
-    n_bins=$method.n_bins,
-    random_state=$method.random_state,
-    use_raw=$method.use_raw,
-    copy=False
-        ]]></token>
     </macros>
     <expand macro="requirements"/>
     <command detect_errors="exit_code"><![CDATA[
@@ -26,30 +15,49 @@
 #if $method.method == "pp.regress_out"
 sc.pp.regress_out(
    adata=adata,
-   keys='$method.reg_keys',
+   #set $keys = [str(x.strip()) for x in str($method.keys).split(',')]
+   keys=$keys,
    copy=False)
-#elif $method.method == "tl.score_genes"
-sc.tl.score_genes(
-    adata=adata,
-    #set $gene_list = [str(x.strip()) for x in str($method.gene_list).split(',')]
-    gene_list=$gene_list,
-    ctrl_size=$method.ctrl_size,
-    score_name='$method.score_name',
-    #if $method.gene_pool
-        #set $gene_pool = [str(x.strip()) for x in $method.gene_pool.split(',')]
-    gene_pool=$gene_pool,
+
+#else if $method.method == "pp.mnn_correct"
+    #for i, filepath in enumerate($method.extra_adata)
+adata_$i = ad.read('$filepath')
+    #end for
+
+adata = sc.pp.mnn_correct(
+    adata,
+    #for i, filepath in enumerate($method.extra_adata)
+    adata_$i,
+    #end for
+    #if str($method.var_subset) != ''
+    #set $var_subset=([x.strip() for x in str($method.var_subset).split(',')])
+    var_subset=$var_subset,
     #end if
-    @CMD_score_genes_inputs@)
-adata.obs.to_csv('$obs', sep='\t')
-#elif $method.method == "tl.score_genes_cell_cycle"
-sc.tl.score_genes_cell_cycle(
-    adata=adata,
-    #set $s_genes = [str(x.strip()) for x in $method.s_genes.split(',')]
-    s_genes=$s_genes,
-    #set $g2m_genes = [str(x.strip()) for x in $method.g2m_genes.split(',')]
-    g2m_genes=$g2m_genes,
-    @CMD_score_genes_inputs@)
-adata.obs.to_csv('$obs', sep='\t')
+    batch_key='$method.batch_key',
+    index_unique='$method.index_unique',
+    #if str($method.batch_categories) != ''
+    #set $batch_categories=([x.strip() for x in str($method.batch_categories).split(',')])
+    batch_categories=$batch_categories,
+    #end if
+    k=$method.k,
+    sigma=$method.sigma,
+    cos_norm_in=$method.cos_norm_in,
+    cos_norm_out=$method.cos_norm_out,
+    svd_dim=int('$method.svd_dim') if '$method.svd_dim' else None,
+    var_adj=$method.var_adj,
+    compute_angle=$method.compute_angle,
+    mnn_order='$method.mnn_order',
+    svd_mode='$method.svd_mode',
+    do_concatenate=True,
+    save_raw=True,
+    n_jobs=\${GALAXY_SLOTS:-4})[0]
+
+#else if $method.method == "pp.combat"
+sc.pp.combat(
+    adata,
+    key='$method.key',
+    inplace=True)
+
 #end if
 
 @CMD_anndata_write_outputs@
@@ -60,111 +68,84 @@
         <conditional name="method">
             <param argument="method" type="select" label="Method used for plotting">
                 <option value="pp.regress_out">Regress out unwanted sources of variation, using `pp.regress_out`</option>
-                <!--<option value="pp.mnn_correct">, using `pp.mnn_correct`</option>!-->
-                <!--<option value="pp.dca">, using `pp.mnn_correct`</option>!-->
-                <!--<option value="pp.magic">, using `pp.magic`</option>!-->
-                <!--<option value="tl.sim">, using `tl.sim`</option>!-->
-                <!--<option value="pp.calculate_qc_metrics">, using `pp.calculate_qc_metrics`</option>!-->
-                <option value="tl.score_genes">Score a set of genes, using `tl.score_genes`</option>
-                <option value="tl.score_genes_cell_cycle">Score cell cycle genes, using `tl.score_genes_cell_cycle`</option>
-                <!--<option value="tl.cyclone">, using `tl.cyclone`</option>!-->
-                <!--<option value="tl.andbag">, using `tl.andbag`</option>!-->
+                <option value="pp.mnn_correct">Correct batch effects by matching mutual nearest neighbors, using `pp.mnn_correct`</option>
+                <option value="pp.combat">Correct batch effects with ComBat function, using `pp.combat`</option>
             </param>
             <when value="pp.regress_out">
-                <param argument="reg_keys" type="text" value="" label="Keys for observation annotation on which to regress on" help=""/>
+                <param argument="keys" type="text" value="" label="Keys for observation annotation on which to regress on" help="Keys separated by a comma"/>
             </when>
-            <when value="tl.score_genes">
-                <param argument="gene_list" type="text" value="" label="The list of gene names used for score calculation" help="Genes separated by a comma"/>
-                <param argument="ctrl_size" type="integer" value="50" label="Number of reference genes to be sampled"
-                    help="If `len(gene_list)` is not too low, you can set `ctrl_size=len(gene_list)`."/>
-                <param argument="gene_pool" type="text" value="" optional="true" label="Genes for sampling the reference set"
-                    help="Default is all genes. Genes separated by a comma"/>
-                <expand macro="score_genes_params"/>
-                <param argument="score_name" type="text" value="score" label="Name of the field to be added in `.obs`" help=""/>
+            <when value="pp.mnn_correct">
+                <param name="extra_adata" type="data" multiple="true" optional="true" format="h5ad" label="Extra annotated data matrix" help="They should have same number of variables."/>
+                <param argument="var_subset" type="text" value="" optional="true" label="The subset of vars to be used when performing MNN correction" help="List of comma-separated key from `.var_names`. If not set, all vars are used"/>
+                <param argument="batch_key" type="text" value="batch" label="Batch key for the concatenate"/>
+                <param name="index_unique" type="select" label="Separator to join the existing index names with the batch category" help="Index names are made unique by appending the batch category with the selected separator">
+                    <option value="-">-</option>
+                    <option value="_">_</option>
+                    <option value=" "> </option>
+                    <option value="/">/</option>
+                </param>
+                <param argument="batch_categories" type="text" value="" optional="true" label="Batch categories for the concatenate" help="List of comma-separated key"/>
+                <param argument="k" type="integer" value="20" label="Number of mutual nearest neighbors"/>
+                <param argument="sigma" type="float" value="1" label="The bandwidth of the Gaussian smoothing kernel used to compute the correction vectors"/>
+                <param argument="cos_norm_in" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Should cosine normalization be performed on the input data prior to calculating distances between cells?"/>
+                <param argument="cos_norm_out" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Should cosine normalization be performed prior to computing corrected expression values?"/>
+                <param argument="svd_dim" type="integer" value="" optional="true" label="Number of dimensions to use for summarizing biological substructure within each batch" help="If not set, biological components will not be removed from the correction vectors."/>
+                <param argument="var_adj" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Adjust variance of the correction vectors?" help="This step takes most computing time."/>
+                <param argument="compute_angle" type="boolean" truevalue="True" falsevalue="False" checked="false" label="compute the angle between each cell’s correction vector and the biological subspace of the reference batch?"/>
+                <param argument="mnn_order" type="text" value="" optional="true" label="The order in which batches are to be corrected" help="List of comma-separated key. If not set, datas are corrected sequentially"/>
+                <param name="svd_mode" type="select" label="SVD mode">
+                    <option value="svd">svd: SVD using a non-randomized SVD-via-ID algorithm</option>
+                    <option value="rsvd" selected="true">rsvd: SVD using a randomized SVD-via-ID algorithm</option>
+                    <option value="irlb">irlb: truncated SVD by implicitly restarted Lanczos bidiagonalization</option>
+                </param>
             </when>
-            <when value="tl.score_genes_cell_cycle">
-                <param name="s_genes" type="text" value="" label="List of genes associated with S phase" help="Genes separated by a comma"/>
-                <param name="g2m_genes" type="text" value="" label="List of genes associated with G2M phase" help="Genes separated by a comma"/>
-                <expand macro="score_genes_params"/>
+            <when value="pp.combat">
+                <param argument="key" type="text" value="batch" label="Key to a categorical annotation from adata.obs that will be used for batch effect removal"/>
             </when>
         </conditional>
-        <expand macro="anndata_output_format"/>
     </inputs>
     <outputs>
         <expand macro="anndata_outputs"/>
-        <data name="obs" format="tabular" label="${tool.name} on ${on_string}: Observations annotation">
-            <filter>method['method'] == 'tl.score_genes' or method['method'] == 'tl.score_genes_cell_cycle'</filter>
-        </data>
     </outputs>
     <tests>
         <test>
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+            <!-- test 1 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.regress_out"/>
+                <param name="keys" value="cell_type"/>
+            </conditional>
+            <assert_stdout>
+                <has_text_matching expression="sc.pp.regress_out"/>
+                <has_text_matching expression="keys=\['cell_type'\]"/>
+            </assert_stdout>
+            <output name="anndata_out" file="pp.regress_out.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+        </test>
+        <!--<test>
+            < test 2 >
+            <param name="adata" value="krumsiek11.h5ad" />
+            <conditional name="method">
+                <param name="method" value="pp.mnn_correct"/>
                 <param name="reg_keys" value="cell_type"/>
             </conditional>
-            <param name="anndata_output_format" value="h5ad" />
             <assert_stdout>
-                <has_text_matching expression="sc.pp.regress_out"/>
+                <has_text_matching expression="sc.pp.mnn_correct"/>
                 <has_text_matching expression="keys='cell_type'"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.regress_out.krumsiek11.h5ad" ftype="h5" compare="sim_size"/>
-        </test>
+            <output name="anndata_out" file="pp.mnn_correct.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/>
+        </test>-->
         <test>
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+            <!-- test 2 -->
+            <param name="adata" value="blobs.h5ad" />
             <conditional name="method">
-                <param name="method" value="tl.score_genes"/>
-                <param name="gene_list" value="Gata2, Fog1"/>
-                <param name="ctrl_size" value="2"/>
-                <param name="n_bins" value="2"/>
-                <param name="random_state" value="2"/>
-                <param name="use_raw" value="False"/>
-                <param name="score_name" value="score"/>
+                <param name="method" value="pp.combat"/>
+                <param name="key" value="blobs"/>
             </conditional>
-            <param name="anndata_output_format" value="h5ad"/>
             <assert_stdout>
-                <has_text_matching expression="sc.tl.score_genes" />
-                <has_text_matching expression="gene_list=\['Gata2', 'Fog1'\]" />
-                <has_text_matching expression="ctrl_size=2" />
-                <has_text_matching expression="score_name='score'" />
-                <has_text_matching expression="n_bins=2" />
-                <has_text_matching expression="random_state=2" />
-                <has_text_matching expression="use_raw=False" />
-                <has_text_matching expression="copy=False" />
+                <has_text_matching expression="sc.pp.combat"/>
+                <has_text_matching expression="key='blobs'"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="tl.score_genes.krumsiek11.h5ad" ftype="h5" compare="sim_size"/>
-            <output name="obs" file="tl.score_genes.krumsiek11.obs.tabular" ftype="tabular" compare="sim_size"/>
-        </test>
-        <test>
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
-            <conditional name="method">
-                <param name="method" value="tl.score_genes_cell_cycle"/>
-                <param name="s_genes" value="Gata2, Fog1, EgrNab"/>
-                <param name="g2m_genes" value="Gata2, Fog1, EgrNab"/>
-                <param name="n_bins" value="2"/>
-                <param name="random_state" value="1"/>
-                <param name="use_raw" value="False"/>
-            </conditional>
-            <param name="anndata_output_format" value="h5ad"/>
-            <assert_stdout>
-                <has_text_matching expression="sc.tl.score_genes_cell_cycle"/>
-                <has_text_matching expression="s_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
-                <has_text_matching expression="g2m_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/>
-                <has_text_matching expression="n_bins=2"/>
-                <has_text_matching expression="random_state=1"/>
-                <has_text_matching expression="use_raw=False"/>
-            </assert_stdout>
-            <output name="anndata_out_h5ad" file="tl.score_genes_cell_cycle.krumsiek11.h5ad" ftype="h5" compare="sim_size"/>
-            <output name="obs" file="tl.score_genes_cell_cycle.krumsiek11.obs.tabular" ftype="tabular" compare="sim_size"/>
+            <output name="anndata_out" file="pp.combat.blobs.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
     </tests>
     <help><![CDATA[
@@ -175,30 +156,29 @@
 inspired by Seurat's `regressOut` function in R.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.regress_out.html#scanpy.api.pp.regress_out>`__
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.regress_out.html>`__
 
-Score a set of genes, using `tl.score_genes`
-============================================
+Correct batch effects by matching mutual nearest neighbors, using `pp.mnn_correct`
+==================================================================================
 
-The score is the average expression of a set of genes subtracted with the
-average expression of a reference set of genes. The reference set is
-randomly sampled from the `gene_pool` for each binned expression value.
+This uses the implementation of mnnpy. Depending on `do_concatenate`, it returns either AnnData objects in the
+original order containing corrected expression values, or a concatenated matrix or AnnData object.
 
-This reproduces the approach in Seurat (Satija et al, 2015) and has been implemented
-for Scanpy by Davide Cittaro.
+Be reminded that it is not advised to use the corrected data matrices for differential expression testing.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.tl.score_genes.html#scanpy.api.tl.score_genes>`__
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.api.pp.mnn_correct.html>`__
+
 
-Score cell cycle genes, using `tl.score_genes_cell_cycle`
-=========================================================
+Correct batch effects with ComBat function, using `pp.combat`
+=============================================================
 
-Given two lists of genes associated to S phase and G2M phase, calculates
-scores and assigns a cell cycle phase (G1, S or G2M). See
-`score_genes` for more explanation.
+Corrects for batch effects by fitting linear models and gains statistical power via an empirical Bayes (EB) framework where information is borrowed across genes. This uses the `combat.py` implementation of ComBat.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.tl.score_genes_cell_cycle.html#scanpy.api.tl.score_genes_cell_cycle>`__
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.combat.html>`__
+
+
     ]]></help>
     <expand macro="citations"/>
 </tool>