Mercurial > repos > iuc > scanpy_filter

diff filter.xml @ 1:6a76b60e05f5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 8ef5f7c6f8728608a3f05bb51e11b642b84a05f5"
author: iuc
date: Wed, 16 Oct 2019 06:32:33 -0400
parents: 6ea5a05a260a
children: e62673c32a5d
--- a/filter.xml	Mon Mar 04 10:15:02 2019 -0500
+++ b/filter.xml	Wed Oct 16 06:32:33 2019 -0400
@@ -1,5 +1,5 @@
-<tool id="scanpy_filter" name="Filter with scanpy" version="@galaxy_version@">
-    <description></description>
+<tool id="scanpy_filter" name="Filter" version="@galaxy_version@">
+    <description>with scanpy</description>
     <macros>
         <import>macros.xml</import>
     </macros>
@@ -14,94 +14,74 @@
 @CMD_read_inputs@
 
 #if $method.method == 'pp.filter_cells'
-res = sc.pp.filter_cells(
-    #if $modify_anndata.modify_anndata == 'true'
+sc.pp.filter_cells(
     adata,
-    #else
-    adata.X,
-    #end if
     #if $method.filter.filter == 'min_counts'
     min_counts=$method.filter.min_counts,
-    #elif $method.filter.filter == 'max_counts'
+    #else if $method.filter.filter == 'max_counts'
     max_counts=$method.filter.max_counts,
-    #elif $method.filter.filter == 'min_genes'
+    #else if $method.filter.filter == 'min_genes'
     min_genes=$method.filter.min_genes,
-    #elif $method.filter.filter == 'max_genes'
+    #else if $method.filter.filter == 'max_genes'
     max_genes=$method.filter.max_genes,
     #end if
     copy=False)
 
-    #if $modify_anndata.modify_anndata == 'true'
-df = adata.obs
-    #else
-df = pd.DataFrame(data=dict(cell_subset=res[0], number_per_cell=res[1]))
-    #end if
-
-    #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts'
-df.to_csv('$counts_per_cell', sep='\t')
-    #elif $method.filter.filter == 'min_genes' or $method.filter.filter == 'max_genes'
-df.to_csv('$genes_per_cell', sep='\t')
-    #end if
-
-#elif $method.method == 'pp.filter_genes'
-res = sc.pp.filter_genes(
-    #if $modify_anndata.modify_anndata == 'true'
+#else if $method.method == 'pp.filter_genes'
+sc.pp.filter_genes(
     adata,
-    #else
-    adata.X,
-    #end if
     #if $method.filter.filter == 'min_counts'
     min_counts=$method.filter.min_counts,
-    #elif $method.filter.filter == 'max_counts'
+    #else if $method.filter.filter == 'max_counts'
     max_counts=$method.filter.max_counts,
-    #elif $method.filter.filter == 'min_cells'
+    #else if $method.filter.filter == 'min_cells'
     min_cells=$method.filter.min_cells,
-    #elif $method.filter.filter == 'max_cells'
+    #else if $method.filter.filter == 'max_cells'
     max_cells=$method.filter.max_cells,
     #end if
     copy=False)
 
-    #if $modify_anndata.modify_anndata == 'true'
-df = adata.var
-    #else
-df = pd.DataFrame(data=dict(gene_subset=res[0], number_per_gene=res[1]))
+#else if $method.method == 'tl.filter_rank_genes_groups'
+sc.tl.filter_rank_genes_groups(
+    adata,
+    #if str($method.key) != ''
+    key='$method.key',
     #end if
-
-    #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts'
-df.to_csv('$counts_per_gene', sep='\t')
-    #elif $method.filter.filter == 'min_cells' or $method.filter.filter == 'max_cells'
-df.to_csv('$cells_per_gene', sep='\t')
+    #if str($method.groupby) != ''
+    groupby='$method.groupby',
     #end if
+    use_raw=$method.use_raw,
+    log=$method.log,
+    key_added='$method.key_added',
+    min_in_group_fraction=$method.min_in_group_fraction,
+    max_out_group_fraction=$method.max_out_group_fraction,
+    min_fold_change=$method.min_fold_change)
 
-#elif $method.method == 'pp.filter_genes_dispersion'
-res = sc.pp.filter_genes_dispersion(
-    #if $modify_anndata.modify_anndata == 'true'
-    adata,
-    #else
-    adata.X,
-    #end if
+#else if $method.method == "pp.highly_variable_genes"
+sc.pp.highly_variable_genes(
+    adata=adata,
     flavor='$method.flavor.flavor',
-    #if $method.flavor.flavor=='seurat'
+    #if $method.flavor.flavor == 'seurat'
+        #if str($method.flavor.min_mean) != ''
     min_mean=$method.flavor.min_mean,
+        #end if
+        #if str($method.flavor.max_mean) != ''
     max_mean=$method.flavor.max_mean,
+        #end if
+        #if str($method.flavor.min_disp) != ''
     min_disp=$method.flavor.min_disp,
-        #if $method.flavor.max_disp
+        #end if
+        #if str($method.flavor.max_disp) != ''
     max_disp=$method.flavor.max_disp,
         #end if
-    #else
+    #else if $method.flavor.flavor == 'cell_ranger'
     n_top_genes=$method.flavor.n_top_genes,
     #end if
     n_bins=$method.n_bins,
-    log=$method.log,
-    copy=False)
+    subset=$method.subset,
+    inplace=True)
 
-    #if $modify_anndata.modify_anndata == 'true'
-adata.var.to_csv('$per_gene', sep='\t')
-    #else
-pd.DataFrame(res).to_csv('$per_gene', sep='\t')
-    #end if
-
-#elif $method.method == 'pp.subsample'
+#else if $method.method == 'pp.subsample'
 sc.pp.subsample(
     data=adata,
     #if $method.type.type == 'fraction'
@@ -112,9 +92,21 @@
     random_state=$method.random_state,
     copy=False)
 
+#else if $method.method == "pp.downsample_counts"
+sc.pp.downsample_counts(
+    adata=adata,
+    #if str($method.counts_per_cell) != ''
+    counts_per_cell=$method.counts_per_cell,
+    #end if
+    #if str($method.total_counts) != ''
+    total_counts=$method.total_counts,
+    #end if
+    random_state=$method.random_state,
+    replace=$method.replace,
+    copy=False)
 #end if
 
-@CMD_anndata_write_modify_outputs@
+@CMD_anndata_write_outputs@
 ]]></configfile>
     </configfiles>
     <inputs>
@@ -123,11 +115,10 @@
             <param argument="method" type="select" label="Method used for filtering">
                 <option value="pp.filter_cells">Filter cell outliers based on counts and numbers of genes expressed, using `pp.filter_cells`</option>
                 <option value="pp.filter_genes">Filter genes based on number of cells or counts, using `pp.filter_genes`</option>
-                <option value="pp.filter_genes_dispersion">Extract highly variable genes, using `pp.filter_genes_dispersion`</option>
-                <!--<option value="pp.highly_variable_genes">, using `tl.highly_variable_genes`</option>!-->
+                <option value="tl.filter_rank_genes_groups">Filter out genes based on fold change and fraction of cells expressing the gene within and outside the groupby categories, using `tl.filter_rank_genes_groups`</option>
+                <option value="pp.highly_variable_genes">Annotate (and filter) highly variable genes, using `pp.highly_variable_genes`</option>
                 <option value="pp.subsample">Subsample to a fraction of the number of observations, using `pp.subsample`</option>
-                <!--<option value="queries.gene_coordinates">, using `queries.gene_coordinates`</option>!-->
-                <!--<option value="queries.mitochondrial_genes">, using `queries.mitochondrial_genes`</option>!-->
+                <option value="pp.downsample_counts">Downsample counts from count matrix, using `pp.downsample_counts`</option>
             </param>
             <when value="pp.filter_cells">
                 <conditional name="filter">
@@ -160,37 +151,47 @@
                         <option value="max_cells">Maximum number of cells expressed</option>
                     </param>
                     <when value="min_counts">
-                        <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a gene to pass filtering" help=""/>
+                        <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a gene to pass filtering"/>
                     </when>
                     <when value="max_counts">
-                        <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a gene to pass filtering" help=""/>
+                        <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a gene to pass filtering"/>
                     </when>
                     <when value="min_cells">
-                        <param argument="min_cells" type="integer" min="0" value="" label="Minimum number of cells expressed required for a gene to pass filtering" help=""/>
+                        <param argument="min_cells" type="integer" min="0" value="" label="Minimum number of cells expressed required for a gene to pass filtering"/>
                     </when>    
                     <when value="max_cells">
-                        <param argument="max_cells" type="integer" min="0" value="" label="Maximum number of cells expressed required for a gene to pass filtering" help=""/>
+                        <param argument="max_cells" type="integer" min="0" value="" label="Maximum number of cells expressed required for a gene to pass filtering"/>
                     </when>
                 </conditional>
             </when>
-            <when value="pp.filter_genes_dispersion">
+            <when value="tl.filter_rank_genes_groups">
+                <param argument="key" type="text" optional="true" label="Key in adata.uns where the rank_genes_groups output is stored"/>
+                <param argument="groupby" type="text" optional="true" label="The key of the observations grouping to consider"/>
+                <expand macro="param_use_raw"/>
+                <expand macro="param_log"/>
+                <param argument="key_added" type="text" value="rank_genes_groups_filtered" label="Key that will contain new values"/>
+                <param argument="min_in_group_fraction" type="float" min="0" max="1" value="0.25" label="Minimum fraction of cells expressing the gene within the categories"/>
+                <param argument="max_out_group_fraction" type="float" min="0" max="1" value="0.5" label="Maximum fraction of cells expressing the gene outside the categories"/>
+                <param argument="min_fold_change" type="integer" value="2" label="Minimum fold change"/>
+            </when>
+            <when value="pp.highly_variable_genes">
                 <conditional name='flavor'>
-                    <param argument="flavor" type="select" label="Flavor for computing normalized dispersion" help="">
+                    <param argument="flavor" type="select" label="Flavor for computing normalized dispersion">
                         <option value="seurat">seurat: expects non-logarithmized data</option>
                         <option value="cell_ranger">cell_ranger: usually called for logarithmized data</option>
                     </param>
                     <when value="seurat">
-                        <param argument="min_mean" type="float" value="0.0125" label="Minimal mean cutoff" help=""/>
-                        <param argument="max_mean" type="float" value="3" label="Maximal mean cutoff" help=""/>
-                        <param argument="min_disp" type="float" value="0.5" label="Minimal normalized dispersion cutoff" help=""/>
-                        <param argument="max_disp" type="float" value="" optional="true" label="Maximal normalized dispersion cutoff" help=""/>
+                        <param argument="min_mean" type="float" value="0.0125" label="Minimal mean cutoff"/>
+                        <param argument="max_mean" type="float" value="3" label="Maximal mean cutoff"/>
+                        <param argument="min_disp" type="float" value="0.5" label="Minimal normalized dispersion cutoff"/>
+                        <param argument="max_disp" type="float" value="" optional="true" label="Maximal normalized dispersion cutoff"/>
                     </when>
                     <when value="cell_ranger">
-                        <param argument="n_top_genes" type="integer" value="" label="Number of highly-variable genes to keep" help=""/>
+                        <param argument="n_top_genes" type="integer" value="" label="Number of highly-variable genes to keep"/>
                     </when>
                 </conditional>
                 <param argument="n_bins" type="integer" value="20" label="Number of bins for binning the mean gene expression" help="Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1"/>
-                <expand macro="param_log"/>
+                <param argument="subset" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Subset in place to highly variable genes?" help="Otherwise, highly variable genes are only annotated, not filtered."/>
             </when>
             <when value="pp.subsample">
                 <conditional name="type">
@@ -199,44 +200,29 @@
                         <option value="n_obs">By number of observation</option>
                     </param>
                     <when value="fraction">
-                        <param argument="fraction" type="float" value="" label="Subsample to this `fraction` of the number of observations" help=""/>
+                        <param argument="fraction" type="float" value="" label="Subsample to this `fraction` of the number of observations"/>
                     </when>
                     <when value="n_obs">
-                        <param argument="n_obs" type="integer" min="0" value="" label="Subsample to this number of observations" help=""/>
+                        <param argument="n_obs" type="integer" min="0" value="" label="Subsample to this number of observations"/>
                     </when>
                 </conditional>
-                <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling" help=""/>
+                <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
+            </when>
+            <when value="pp.downsample_counts">
+                <param argument="counts_per_cell" type="integer" min="0" optional="true" label="Target total counts per cell" help="If a cell has more than `counts_per_cell`, it will be downsampled to this number. Resulting counts can be specified on a per cell basis by passing an array."/>
+                <param argument="total_counts" type="integer" min="0" optional="true" label="Target total counts" help="If the count matrix has more than `total_counts`, it will be downsampled to have this number."/>
+                <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling"/>
+                <param argument="replace" type="boolean" truevalue="True" falsevalue="False" checked="false" label="Sample the counts with replacement?"/>
             </when>
         </conditional>
-        <expand macro="anndata_modify_output_input"/>
     </inputs>
     <outputs>
-        <expand macro="anndata_modify_outputs"/>
-        <!-- for pp.filter_cells -->
-        <data name="counts_per_cell" format="tabular" label="${tool.name} on ${on_string}: Counts per cells after filtering">
-            <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter>
-        </data>
-        <data name="genes_per_cell" format="tabular" label="${tool.name} on ${on_string}: Number of genes per cell after filtering">
-            <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_genes' or method['filter']['filter'] == 'max_genes')</filter>
-        </data>
-        <!-- for pp.filter_genes -->
-        <data name="counts_per_gene" format="tabular" label="${tool.name} on ${on_string}: Counts per genes after filtering">
-            <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter>
-        </data>
-        <data name="cells_per_gene" format="tabular" label="${tool.name} on ${on_string}: Number of cells per genes after filtering">
-            <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_cells' or method['filter']['filter'] == 'max_cells')</filter>
-        </data>
-        <!-- for pp.filter_genes_dispersion -->
-        <data name="per_gene" format="tabular" label="${tool.name} on ${on_string}: Means, dispersions and normalized dispersions per gene">
-            <filter>method['method'] == 'pp.filter_genes_dispersion'</filter>
-        </data>
+        <expand macro="anndata_outputs"/>
     </outputs>
     <tests>
-        <test expect_num_outputs="2">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 1 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.filter_cells"/>
                 <conditional name="filter">
@@ -244,67 +230,15 @@
                     <param name="min_counts" value="3"/>
                 </conditional>
             </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="h5ad" />
-            </conditional>
             <assert_stdout>
                 <has_text_matching expression="sc.pp.filter_cells"/>
                 <has_text_matching expression="min_counts=3"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/>
-            <output name="counts_per_cell">
-                <assert_contents>
-                    <has_text_matching expression="cell_type\tn_counts" />
-                    <has_text_matching expression="46\tprogenitor\t3.028" />
-                    <has_text_matching expression="85\tEry\t3.7001" />
-                    <has_text_matching expression="150\tMk\t4.095" />
-                    <has_n_columns n="3" />
-                </assert_contents>
-            </output>
+            <output name="anndata_out" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="2">
-            <conditional name="input">
-                <param name="format" value="loom" />
-                <param name="adata" value="krumsiek11.loom" />
-                <param name="sparse" value="True"/>
-                <param name="cleanup" value="False"/>
-                <param name="x_name"  value="spliced"/>
-                <param name="obs_names" value="CellID" />
-                <param name="var_names" value="Gene"/>
-            </conditional>
-            <conditional name="method">
-                <param name="method" value="pp.filter_cells"/>
-                <conditional name="filter">
-                    <param name="filter" value="min_counts"/>
-                    <param name="min_counts" value="3"/>
-                </conditional>
-            </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="loom" />
-            </conditional>
-            <assert_stdout>
-                <has_text_matching expression="sc.pp.filter_cells"/>
-                <has_text_matching expression="min_counts=3"/>
-            </assert_stdout>
-            <output name="anndata_out_loom" file="pp.filter_cells.krumsiek11-min_counts.loom" ftype="loom" compare="sim_size"/>
-            <output name="counts_per_cell">
-                <assert_contents>
-                    <has_text_matching expression="cell_type\tn_counts" />
-                    <has_text_matching expression="46\tprogenitor\t3.028" />
-                    <has_text_matching expression="85\tEry\t3.7001" />
-                    <has_text_matching expression="97\tMo\t3.925" />
-                    <has_text_matching expression="150\tMk\t4.095" />
-                    <has_n_columns n="3" />
-                </assert_contents>
-            </output>
-        </test>
-        <test expect_num_outputs="1">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad"/>
-            </conditional>
+        <test>
+            <!-- test 2 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.filter_cells"/>
                 <conditional name="filter">
@@ -312,21 +246,16 @@
                     <param name="max_genes" value="100"/>
                 </conditional>
             </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="false"/>
-            </conditional>
             <assert_stdout>
                 <has_text_matching expression="sc.pp.filter_cells"/>
-                <has_text_matching expression="adata.X"/>
+                <has_text_matching expression="adata"/>
                 <has_text_matching expression="max_genes=100"/>
             </assert_stdout>
-            <output name="genes_per_cell" file="pp.filter_cells.number_per_cell.krumsiek11-max_genes.tabular"/>
+            <output name="anndata_out" file="pp.filter_cells.krumsiek11-max_genes.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="2">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 3 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.filter_genes"/>
                 <conditional name="filter">
@@ -334,102 +263,84 @@
                     <param name="min_counts" value="3"/>
                 </conditional>
             </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="h5ad" />
-            </conditional>
             <assert_stdout>
                 <has_text_matching expression="sc.pp.filter_genes"/>
                 <has_text_matching expression="min_counts=3"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/>
-            <output name="counts_per_gene" file="pp.filter_genes.number_per_gene.krumsiek11-min_counts.tabular"/>
+            <output name="anndata_out" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="1">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="pbmc68k_reduced.h5ad"/>
-            </conditional>
+        <test>
+            <!-- test 4 -->
+            <param name="adata" value="tl.rank_genes_groups.krumsiek11.h5ad" />
             <conditional name="method">
-                <param name="method" value="pp.filter_genes"/>
-                <conditional name="filter">
-                    <param name="filter" value="max_cells"/>
-                    <param name="max_cells" value="500"/>
-                </conditional>
-            </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="false"/>
+                <param name="method" value="tl.filter_rank_genes_groups"/>
+                <param name="key" value="rank_genes_groups"/>
+                <param name="use_raw" value="False"/>
+                <param name="log" value="False"/>
+                <param name="key_added" value="rank_genes_groups_filtered"/>
+                <param name="min_in_group_fraction" value="0.25"/>
+                <param name="max_out_group_fraction" value="0.5"/>
+                <param name="min_fold_change" value="3"/>
             </conditional>
             <assert_stdout>
-                <has_text_matching expression="sc.pp.filter_genes"/>
-                <has_text_matching expression="adata.X"/>
-                <has_text_matching expression="max_cells=500"/>
+                <has_text_matching expression="tl.filter_rank_genes_groups"/>
+                <has_text_matching expression="key='rank_genes_groups'"/>
+                <has_text_matching expression="use_raw=False"/>
+                <has_text_matching expression="log=False"/>
+                <has_text_matching expression="key_added='rank_genes_groups_filtered'"/>
+                <has_text_matching expression="min_in_group_fraction=0.25"/>
+                <has_text_matching expression="max_out_group_fraction=0.5"/>
+                <has_text_matching expression="min_fold_change=3"/>
             </assert_stdout>
-            <output name="cells_per_gene" file="pp.filter_genes.number_per_gene.pbmc68k_reduced-max_cells.tabular"/>
+            <output name="anndata_out" file="pp.filter_rank_genes_groups.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="2">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 5 -->
+            <param name="adata" value="blobs.h5ad"/>
             <conditional name="method">
-                <param name="method" value="pp.filter_genes_dispersion"/>
+                <param name="method" value="pp.highly_variable_genes"/>
                 <conditional name="flavor">
                     <param name="flavor" value="seurat"/>
                     <param name="min_mean" value="0.0125"/>
                     <param name="max_mean" value="3"/>
                     <param name="min_disp" value="0.5"/>
                 </conditional>
-                <param name="n_bins" value="20" />
-                <param name="log" value="true"/>
-            </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="h5ad" />
+                <param name="n_bins" value="20"/>
+                <param name="subset" value="false"/>
             </conditional>
             <assert_stdout>
-                <has_text_matching expression="sc.pp.filter_genes_dispersion"/>
+                <has_text_matching expression="sc.pp.highly_variable_genes"/>
                 <has_text_matching expression="flavor='seurat'"/>
                 <has_text_matching expression="min_mean=0.0125"/>
-                <has_text_matching expression="max_mean=3.0"/>
+                <has_text_matching expression="max_mean=3"/>
                 <has_text_matching expression="min_disp=0.5"/>
                 <has_text_matching expression="n_bins=20"/>
-                <has_text_matching expression="log=True"/>
+                <has_text_matching expression="subset=False"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.filter_genes_dispersion.krumsiek11-seurat.h5ad" ftype="h5" compare="sim_size"/>
-            <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-seurat.tabular"/>
+            <output name="anndata_out" file="pp.highly_variable_genes.seurat.blobs.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="1">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 6 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
-                <param name="method" value="pp.filter_genes_dispersion"/>
+                <param name="method" value="pp.highly_variable_genes"/>
                 <conditional name="flavor">
                     <param name="flavor" value="cell_ranger"/>
                     <param name="n_top_genes" value="2"/>
                 </conditional>
                 <param name="n_bins" value="20"/>
-                <param name="log" value="true"/>
-            </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="false"/>
             </conditional>
             <assert_stdout>
-                <has_text_matching expression="sc.pp.filter_genes_dispersion"/>
+                <has_text_matching expression="sc.pp.highly_variable_genes"/>
                 <has_text_matching expression="flavor='cell_ranger'"/>
                 <has_text_matching expression="n_top_genes=2"/>
                 <has_text_matching expression="n_bins=20"/>
-                <has_text_matching expression="og=True"/>
             </assert_stdout>
-            <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-cell_ranger.tabular"/>
+            <output name="anndata_out" file="pp.highly_variable_genes.krumsiek11-cell_ranger.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="1">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 7 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.subsample"/>
                 <conditional name="type">
@@ -438,22 +349,16 @@
                 </conditional>
                 <param name="random_state" value="0"/>
             </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="h5ad" />
-            </conditional>
             <assert_stdout>
                 <has_text_matching expression="sc.pp.subsample"/>
                 <has_text_matching expression="fraction=0.5"/>
                 <has_text_matching expression="random_state=0"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5" compare="sim_size"/>
+            <output name="anndata_out" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
-        <test expect_num_outputs="1">
-            <conditional name="input">
-                <param name="format" value="h5ad" />
-                <param name="adata" value="krumsiek11.h5ad" />
-            </conditional>
+        <test>
+            <!-- test 8 -->
+            <param name="adata" value="krumsiek11.h5ad" />
             <conditional name="method">
                 <param name="method" value="pp.subsample"/>
                 <conditional name="type">
@@ -462,16 +367,29 @@
                 </conditional>
                 <param name="random_state" value="0"/>
             </conditional>
-            <conditional name="modify_anndata">
-                <param name="modify_anndata" value="true"/>
-                <param name="anndata_output_format" value="h5ad" />
-            </conditional>
             <assert_stdout>
                 <has_text_matching expression="sc.pp.subsample"/>
                 <has_text_matching expression="n_obs=10"/>
                 <has_text_matching expression="random_state=0"/>
             </assert_stdout>
-            <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5" compare="sim_size"/>
+            <output name="anndata_out" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5ad" compare="sim_size"/>
+        </test>
+        <test>
+            <!-- test 9 -->
+            <param name="adata" value="random-randint.h5ad" />
+            <conditional name="method">
+                <param name="method" value="pp.downsample_counts"/>
+                <param name="total_counts" value="20000"/>
+                <param name="random_state" value="0"/>
+                <param name="replace" value="false"/>
+            </conditional>
+            <assert_stdout>
+                <has_text_matching expression="sc.pp.downsample_counts"/>
+                <has_text_matching expression="total_counts=20000"/>
+                <has_text_matching expression="random_state=0"/>
+                <has_text_matching expression="replace=False"/>
+            </assert_stdout>
+            <output name="anndata_out" file="pp.downsample_counts.random-randint.h5ad" ftype="h5ad" compare="sim_size"/>
         </test>
     </tests>
     <help><![CDATA[
@@ -487,12 +405,7 @@
 `max_counts`, `max_genes` per call.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_cells.html#scanpy.api.pp.filter_cells>`__
-
-Return
-------
-
-number_per_cell : Number per cell (either `n_counts` or `n_genes` per cell)
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.filter_cells.html>`__
 
 
 Filter genes based on number of cells or counts (`pp.filter_genes`)
@@ -506,42 +419,38 @@
 `max_counts`, `max_cells` per call.
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes.html#scanpy.api.pp.filter_genes>`__
-
-Return
-------
-
-number_per_gene : Number per genes (either `n_counts` or `n_genes` per cell)
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.filter_genes.html>`__
 
 
-Extract highly variable genes (`pp.filter_genes_dispersion`)
-============================================================
-
-If trying out parameters, pass the data matrix instead of AnnData.
-
-Depending on `flavor`, this reproduces the R-implementations of Seurat and Cell Ranger.
-
-The normalized dispersion is obtained by scaling with the mean and standard
-deviation of the dispersions for genes falling into a given bin for mean
-expression of genes. This means that for each bin of mean expression, highly
-variable genes are selected.
-
-Use `flavor='cell_ranger'` with care and in the same way as in `pp.recipe_zheng17`.
+Filters out genes based on fold change and fraction of cells expressing the gene within and outside the groupby categories (`tl.filter_rank_genes_groups`)
+==========================================================================================================================================================
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes_dispersion.html#scanpy.api.pp.filter_genes_dispersion>`__
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.tl.filter_rank_genes_groups.html>`__
+
 
-Returns
--------
-- The annotated matrix filtered, with the annotations
-- A table with the means, dispersions, and normalized dispersions per gene, logarithmized when `log` is `True`.
+Annotate highly variable genes (`pp.highly_variable_genes`)
+===========================================================
+
+It expects logarithmized data.
+
+Depending on flavor, this reproduces the R-implementations of Seurat or Cell Ranger. The normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for genes falling into a given bin for mean expression of genes. This means that for each bin of mean expression, highly variable genes are selected.
 
 
 Subsample to a fraction of the number of observations (`pp.subsample`)
 ======================================================================
 
 More details on the `scanpy documentation
-<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.subsample.html#scanpy.api.pp.subsample>`__
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.subsample.html>`__
+
+Downsample counts (`pp.downsample_counts`)
+==========================================
+
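+Downsample counts from the count matrix. If `counts_per_cell` is specified, each cell is downsampled to at most that many counts (cells with fewer counts are unaffected). If `total_counts` is specified, the whole expression matrix is downsampled to contain at most `total_counts`. This
+has been implemented by M. D. Luecken.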
+
+More details on the `scanpy documentation
+<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.downsample_counts.html>`__
 
 
     ]]></help>
author	iuc
date	Wed, 16 Oct 2019 06:32:33 -0400
parents	6ea5a05a260a
children	e62673c32a5d