comparison filter.xml @ 0:6ea5a05a260a draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 92f85afaed0097d1879317a9f513093fce5481d6
author iuc
date Mon, 04 Mar 2019 10:15:02 -0500
parents
children 6a76b60e05f5
comparison
equal deleted inserted replaced
-1:000000000000 0:6ea5a05a260a
1 <tool id="scanpy_filter" name="Filter with scanpy" version="@galaxy_version@">
2 <description></description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements"/>
7 <expand macro="version_command"/>
8 <command detect_errors="exit_code"><![CDATA[
9 @CMD@
10 ]]></command>
11 <configfiles>
12 <configfile name="script_file"><![CDATA[
13 @CMD_imports@
14 @CMD_read_inputs@
15
16 #if $method.method == 'pp.filter_cells'
17 res = sc.pp.filter_cells(
18 #if $modify_anndata.modify_anndata == 'true'
19 adata,
20 #else
21 adata.X,
22 #end if
23 #if $method.filter.filter == 'min_counts'
24 min_counts=$method.filter.min_counts,
25 #elif $method.filter.filter == 'max_counts'
26 max_counts=$method.filter.max_counts,
27 #elif $method.filter.filter == 'min_genes'
28 min_genes=$method.filter.min_genes,
29 #elif $method.filter.filter == 'max_genes'
30 max_genes=$method.filter.max_genes,
31 #end if
32 copy=False)
33
34 #if $modify_anndata.modify_anndata == 'true'
35 df = adata.obs
36 #else
37 df = pd.DataFrame(data=dict(cell_subset=res[0], number_per_cell=res[1]))
38 #end if
39
40 #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts'
41 df.to_csv('$counts_per_cell', sep='\t')
42 #elif $method.filter.filter == 'min_genes' or $method.filter.filter == 'max_genes'
43 df.to_csv('$genes_per_cell', sep='\t')
44 #end if
45
46 #elif $method.method == 'pp.filter_genes'
47 res = sc.pp.filter_genes(
48 #if $modify_anndata.modify_anndata == 'true'
49 adata,
50 #else
51 adata.X,
52 #end if
53 #if $method.filter.filter == 'min_counts'
54 min_counts=$method.filter.min_counts,
55 #elif $method.filter.filter == 'max_counts'
56 max_counts=$method.filter.max_counts,
57 #elif $method.filter.filter == 'min_cells'
58 min_cells=$method.filter.min_cells,
59 #elif $method.filter.filter == 'max_cells'
60 max_cells=$method.filter.max_cells,
61 #end if
62 copy=False)
63
64 #if $modify_anndata.modify_anndata == 'true'
65 df = adata.var
66 #else
67 df = pd.DataFrame(data=dict(gene_subset=res[0], number_per_gene=res[1]))
68 #end if
69
70 #if $method.filter.filter == 'min_counts' or $method.filter.filter == 'max_counts'
71 df.to_csv('$counts_per_gene', sep='\t')
72 #elif $method.filter.filter == 'min_cells' or $method.filter.filter == 'max_cells'
73 df.to_csv('$cells_per_gene', sep='\t')
74 #end if
75
76 #elif $method.method == 'pp.filter_genes_dispersion'
77 res = sc.pp.filter_genes_dispersion(
78 #if $modify_anndata.modify_anndata == 'true'
79 adata,
80 #else
81 adata.X,
82 #end if
83 flavor='$method.flavor.flavor',
84 #if $method.flavor.flavor=='seurat'
85 min_mean=$method.flavor.min_mean,
86 max_mean=$method.flavor.max_mean,
87 min_disp=$method.flavor.min_disp,
88 #if str($method.flavor.max_disp) != ''
89 max_disp=$method.flavor.max_disp,
90 #end if
91 #else
92 n_top_genes=$method.flavor.n_top_genes,
93 #end if
94 n_bins=$method.n_bins,
95 log=$method.log,
96 copy=False)
97
98 #if $modify_anndata.modify_anndata == 'true'
99 adata.var.to_csv('$per_gene', sep='\t')
100 #else
101 pd.DataFrame(res).to_csv('$per_gene', sep='\t')
102 #end if
103
104 #elif $method.method == 'pp.subsample'
105 sc.pp.subsample(
106 data=adata,
107 #if $method.type.type == 'fraction'
108 fraction=$method.type.fraction,
109 #elif $method.type.type == 'n_obs'
110 n_obs=$method.type.n_obs,
111 #end if
112 random_state=$method.random_state,
113 copy=False)
114
115 #end if
116
117 @CMD_anndata_write_modify_outputs@
118 ]]></configfile>
119 </configfiles>
120 <inputs>
121 <expand macro="inputs_anndata"/>
122 <conditional name="method">
123 <param argument="method" type="select" label="Method used for filtering">
124 <option value="pp.filter_cells">Filter cell outliers based on counts and numbers of genes expressed, using `pp.filter_cells`</option>
125 <option value="pp.filter_genes">Filter genes based on number of cells or counts, using `pp.filter_genes`</option>
126 <option value="pp.filter_genes_dispersion">Extract highly variable genes, using `pp.filter_genes_dispersion`</option>
127 <!--<option value="pp.highly_variable_genes">, using `tl.highly_variable_genes`</option>!-->
128 <option value="pp.subsample">Subsample to a fraction of the number of observations, using `pp.subsample`</option>
129 <!--<option value="queries.gene_coordinates">, using `queries.gene_coordinates`</option>!-->
130 <!--<option value="queries.mitochondrial_genes">, using `queries.mitochondrial_genes`</option>!-->
131 </param>
132 <when value="pp.filter_cells">
133 <conditional name="filter">
134 <param argument="filter" type="select" label="Filter">
135 <option value="min_counts">Minimum number of counts</option>
136 <option value="max_counts">Maximum number of counts</option>
137 <option value="min_genes">Minimum number of genes expressed</option>
138 <option value="max_genes">Maximum number of genes expressed</option>
139 </param>
140 <when value="min_counts">
141 <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a cell to pass filtering" help=""/>
142 </when>
143 <when value="max_counts">
144 <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a cell to pass filtering" help=""/>
145 </when>
146 <when value="min_genes">
147 <param argument="min_genes" type="integer" min="0" value="" label="Minimum number of genes expressed required for a cell to pass filtering" help=""/>
148 </when>
149 <when value="max_genes">
150 <param argument="max_genes" type="integer" min="0" value="" label="Maximum number of genes expressed required for a cell to pass filtering" help=""/>
151 </when>
152 </conditional>
153 </when>
154 <when value="pp.filter_genes">
155 <conditional name="filter">
156 <param argument="filter" type="select" label="Filter">
157 <option value="min_counts">Minimum number of counts</option>
158 <option value="max_counts">Maximum number of counts</option>
159 <option value="min_cells">Minimum number of cells expressed</option>
160 <option value="max_cells">Maximum number of cells expressed</option>
161 </param>
162 <when value="min_counts">
163 <param argument="min_counts" type="integer" min="0" value="" label="Minimum number of counts required for a gene to pass filtering" help=""/>
164 </when>
165 <when value="max_counts">
166 <param argument="max_counts" type="integer" min="0" value="" label="Maximum number of counts required for a gene to pass filtering" help=""/>
167 </when>
168 <when value="min_cells">
169 <param argument="min_cells" type="integer" min="0" value="" label="Minimum number of cells expressed required for a gene to pass filtering" help=""/>
170 </when>
171 <when value="max_cells">
172 <param argument="max_cells" type="integer" min="0" value="" label="Maximum number of cells expressed required for a gene to pass filtering" help=""/>
173 </when>
174 </conditional>
175 </when>
176 <when value="pp.filter_genes_dispersion">
177 <conditional name='flavor'>
178 <param argument="flavor" type="select" label="Flavor for computing normalized dispersion" help="">
179 <option value="seurat">seurat: expects non-logarithmized data</option>
180 <option value="cell_ranger">cell_ranger: usually called for logarithmized data</option>
181 </param>
182 <when value="seurat">
183 <param argument="min_mean" type="float" value="0.0125" label="Minimal mean cutoff" help=""/>
184 <param argument="max_mean" type="float" value="3" label="Maximal mean cutoff" help=""/>
185 <param argument="min_disp" type="float" value="0.5" label="Minimal normalized dispersion cutoff" help=""/>
186 <param argument="max_disp" type="float" value="" optional="true" label="Maximal normalized dispersion cutoff" help=""/>
187 </when>
188 <when value="cell_ranger">
189 <param argument="n_top_genes" type="integer" value="" label="Number of highly-variable genes to keep" help=""/>
190 </when>
191 </conditional>
192 <param argument="n_bins" type="integer" value="20" label="Number of bins for binning the mean gene expression" help="Normalization is done with respect to each bin. If just a single gene falls into a bin, the normalized dispersion is artificially set to 1"/>
193 <expand macro="param_log"/>
194 </when>
195 <when value="pp.subsample">
196 <conditional name="type">
197 <param name="type" type="select" label="Type of subsampling">
198 <option value="fraction">By fraction</option>
199 <option value="n_obs">By number of observation</option>
200 </param>
201 <when value="fraction">
202 <param argument="fraction" type="float" value="" label="Subsample to this `fraction` of the number of observations" help=""/>
203 </when>
204 <when value="n_obs">
205 <param argument="n_obs" type="integer" min="0" value="" label="Subsample to this number of observations" help=""/>
206 </when>
207 </conditional>
208 <param argument="random_state" type="integer" value="0" label="Random seed to change subsampling" help=""/>
209 </when>
210 </conditional>
211 <expand macro="anndata_modify_output_input"/>
212 </inputs>
213 <outputs>
214 <expand macro="anndata_modify_outputs"/>
215 <!-- Outputs for pp.filter_cells: one table, selected by the chosen filter (counts vs. genes) -->
216 <data name="counts_per_cell" format="tabular" label="${tool.name} on ${on_string}: Counts per cells after filtering">
217 <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter>
218 </data>
219 <data name="genes_per_cell" format="tabular" label="${tool.name} on ${on_string}: Number of genes per cell after filtering">
220 <filter>method['method'] == 'pp.filter_cells' and (method['filter']['filter'] == 'min_genes' or method['filter']['filter'] == 'max_genes')</filter>
221 </data>
222 <!-- Outputs for pp.filter_genes: one table, selected by the chosen filter (counts vs. cells) -->
223 <data name="counts_per_gene" format="tabular" label="${tool.name} on ${on_string}: Counts per genes after filtering">
224 <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_counts' or method['filter']['filter'] == 'max_counts')</filter>
225 </data>
226 <data name="cells_per_gene" format="tabular" label="${tool.name} on ${on_string}: Number of cells per genes after filtering">
227 <filter>method['method'] == 'pp.filter_genes' and (method['filter']['filter'] == 'min_cells' or method['filter']['filter'] == 'max_cells')</filter>
228 </data>
229 <!-- Output for pp.filter_genes_dispersion: produced whenever this method is selected -->
230 <data name="per_gene" format="tabular" label="${tool.name} on ${on_string}: Means, dispersions and normalized dispersions per gene">
231 <filter>method['method'] == 'pp.filter_genes_dispersion'</filter>
232 </data>
233 </outputs>
234 <tests>
235 <test expect_num_outputs="2">
236 <conditional name="input">
237 <param name="format" value="h5ad" />
238 <param name="adata" value="krumsiek11.h5ad" />
239 </conditional>
240 <conditional name="method">
241 <param name="method" value="pp.filter_cells"/>
242 <conditional name="filter">
243 <param name="filter" value="min_counts"/>
244 <param name="min_counts" value="3"/>
245 </conditional>
246 </conditional>
247 <conditional name="modify_anndata">
248 <param name="modify_anndata" value="true"/>
249 <param name="anndata_output_format" value="h5ad" />
250 </conditional>
251 <assert_stdout>
252 <has_text_matching expression="sc.pp.filter_cells"/>
253 <has_text_matching expression="min_counts=3"/>
254 </assert_stdout>
255 <output name="anndata_out_h5ad" file="pp.filter_cells.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/>
256 <output name="counts_per_cell">
257 <assert_contents>
258 <has_text_matching expression="cell_type\tn_counts" />
259 <has_text_matching expression="46\tprogenitor\t3.028" />
260 <has_text_matching expression="85\tEry\t3.7001" />
261 <has_text_matching expression="150\tMk\t4.095" />
262 <has_n_columns n="3" />
263 </assert_contents>
264 </output>
265 </test>
266 <test expect_num_outputs="2">
267 <conditional name="input">
268 <param name="format" value="loom" />
269 <param name="adata" value="krumsiek11.loom" />
270 <param name="sparse" value="True"/>
271 <param name="cleanup" value="False"/>
272 <param name="x_name" value="spliced"/>
273 <param name="obs_names" value="CellID" />
274 <param name="var_names" value="Gene"/>
275 </conditional>
276 <conditional name="method">
277 <param name="method" value="pp.filter_cells"/>
278 <conditional name="filter">
279 <param name="filter" value="min_counts"/>
280 <param name="min_counts" value="3"/>
281 </conditional>
282 </conditional>
283 <conditional name="modify_anndata">
284 <param name="modify_anndata" value="true"/>
285 <param name="anndata_output_format" value="loom" />
286 </conditional>
287 <assert_stdout>
288 <has_text_matching expression="sc.pp.filter_cells"/>
289 <has_text_matching expression="min_counts=3"/>
290 </assert_stdout>
291 <output name="anndata_out_loom" file="pp.filter_cells.krumsiek11-min_counts.loom" ftype="loom" compare="sim_size"/>
292 <output name="counts_per_cell">
293 <assert_contents>
294 <has_text_matching expression="cell_type\tn_counts" />
295 <has_text_matching expression="46\tprogenitor\t3.028" />
296 <has_text_matching expression="85\tEry\t3.7001" />
297 <has_text_matching expression="97\tMo\t3.925" />
298 <has_text_matching expression="150\tMk\t4.095" />
299 <has_n_columns n="3" />
300 </assert_contents>
301 </output>
302 </test>
303 <test expect_num_outputs="1">
304 <conditional name="input">
305 <param name="format" value="h5ad" />
306 <param name="adata" value="krumsiek11.h5ad"/>
307 </conditional>
308 <conditional name="method">
309 <param name="method" value="pp.filter_cells"/>
310 <conditional name="filter">
311 <param name="filter" value="max_genes"/>
312 <param name="max_genes" value="100"/>
313 </conditional>
314 </conditional>
315 <conditional name="modify_anndata">
316 <param name="modify_anndata" value="false"/>
317 </conditional>
318 <assert_stdout>
319 <has_text_matching expression="sc.pp.filter_cells"/>
320 <has_text_matching expression="adata.X"/>
321 <has_text_matching expression="max_genes=100"/>
322 </assert_stdout>
323 <output name="genes_per_cell" file="pp.filter_cells.number_per_cell.krumsiek11-max_genes.tabular"/>
324 </test>
325 <test expect_num_outputs="2">
326 <conditional name="input">
327 <param name="format" value="h5ad" />
328 <param name="adata" value="krumsiek11.h5ad" />
329 </conditional>
330 <conditional name="method">
331 <param name="method" value="pp.filter_genes"/>
332 <conditional name="filter">
333 <param name="filter" value="min_counts"/>
334 <param name="min_counts" value="3"/>
335 </conditional>
336 </conditional>
337 <conditional name="modify_anndata">
338 <param name="modify_anndata" value="true"/>
339 <param name="anndata_output_format" value="h5ad" />
340 </conditional>
341 <assert_stdout>
342 <has_text_matching expression="sc.pp.filter_genes"/>
343 <has_text_matching expression="min_counts=3"/>
344 </assert_stdout>
345 <output name="anndata_out_h5ad" file="pp.filter_genes.krumsiek11-min_counts.h5ad" ftype="h5" compare="sim_size"/>
346 <output name="counts_per_gene" file="pp.filter_genes.number_per_gene.krumsiek11-min_counts.tabular"/>
347 </test>
348 <test expect_num_outputs="1">
349 <conditional name="input">
350 <param name="format" value="h5ad" />
351 <param name="adata" value="pbmc68k_reduced.h5ad"/>
352 </conditional>
353 <conditional name="method">
354 <param name="method" value="pp.filter_genes"/>
355 <conditional name="filter">
356 <param name="filter" value="max_cells"/>
357 <param name="max_cells" value="500"/>
358 </conditional>
359 </conditional>
360 <conditional name="modify_anndata">
361 <param name="modify_anndata" value="false"/>
362 </conditional>
363 <assert_stdout>
364 <has_text_matching expression="sc.pp.filter_genes"/>
365 <has_text_matching expression="adata.X"/>
366 <has_text_matching expression="max_cells=500"/>
367 </assert_stdout>
368 <output name="cells_per_gene" file="pp.filter_genes.number_per_gene.pbmc68k_reduced-max_cells.tabular"/>
369 </test>
370 <test expect_num_outputs="2">
371 <conditional name="input">
372 <param name="format" value="h5ad" />
373 <param name="adata" value="krumsiek11.h5ad" />
374 </conditional>
375 <conditional name="method">
376 <param name="method" value="pp.filter_genes_dispersion"/>
377 <conditional name="flavor">
378 <param name="flavor" value="seurat"/>
379 <param name="min_mean" value="0.0125"/>
380 <param name="max_mean" value="3"/>
381 <param name="min_disp" value="0.5"/>
382 </conditional>
383 <param name="n_bins" value="20" />
384 <param name="log" value="true"/>
385 </conditional>
386 <conditional name="modify_anndata">
387 <param name="modify_anndata" value="true"/>
388 <param name="anndata_output_format" value="h5ad" />
389 </conditional>
390 <assert_stdout>
391 <has_text_matching expression="sc.pp.filter_genes_dispersion"/>
392 <has_text_matching expression="flavor='seurat'"/>
393 <has_text_matching expression="min_mean=0.0125"/>
394 <has_text_matching expression="max_mean=3.0"/>
395 <has_text_matching expression="min_disp=0.5"/>
396 <has_text_matching expression="n_bins=20"/>
397 <has_text_matching expression="log=True"/>
398 </assert_stdout>
399 <output name="anndata_out_h5ad" file="pp.filter_genes_dispersion.krumsiek11-seurat.h5ad" ftype="h5" compare="sim_size"/>
400 <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-seurat.tabular"/>
401 </test>
402 <test expect_num_outputs="1">
403 <conditional name="input">
404 <param name="format" value="h5ad" />
405 <param name="adata" value="krumsiek11.h5ad" />
406 </conditional>
407 <conditional name="method">
408 <param name="method" value="pp.filter_genes_dispersion"/>
409 <conditional name="flavor">
410 <param name="flavor" value="cell_ranger"/>
411 <param name="n_top_genes" value="2"/>
412 </conditional>
413 <param name="n_bins" value="20"/>
414 <param name="log" value="true"/>
415 </conditional>
416 <conditional name="modify_anndata">
417 <param name="modify_anndata" value="false"/>
418 </conditional>
419 <assert_stdout>
420 <has_text_matching expression="sc.pp.filter_genes_dispersion"/>
421 <has_text_matching expression="flavor='cell_ranger'"/>
422 <has_text_matching expression="n_top_genes=2"/>
423 <has_text_matching expression="n_bins=20"/>
424 <has_text_matching expression="log=True"/>
425 </assert_stdout>
426 <output name="per_gene" file="pp.filter_genes_dispersion.per_gene.krumsiek11-cell_ranger.tabular"/>
427 </test>
428 <test expect_num_outputs="1">
429 <conditional name="input">
430 <param name="format" value="h5ad" />
431 <param name="adata" value="krumsiek11.h5ad" />
432 </conditional>
433 <conditional name="method">
434 <param name="method" value="pp.subsample"/>
435 <conditional name="type">
436 <param name="type" value="fraction" />
437 <param name="fraction" value="0.5"/>
438 </conditional>
439 <param name="random_state" value="0"/>
440 </conditional>
441 <conditional name="modify_anndata">
442 <param name="modify_anndata" value="true"/>
443 <param name="anndata_output_format" value="h5ad" />
444 </conditional>
445 <assert_stdout>
446 <has_text_matching expression="sc.pp.subsample"/>
447 <has_text_matching expression="fraction=0.5"/>
448 <has_text_matching expression="random_state=0"/>
449 </assert_stdout>
450 <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_fraction.h5ad" ftype="h5" compare="sim_size"/>
451 </test>
452 <test expect_num_outputs="1">
453 <conditional name="input">
454 <param name="format" value="h5ad" />
455 <param name="adata" value="krumsiek11.h5ad" />
456 </conditional>
457 <conditional name="method">
458 <param name="method" value="pp.subsample"/>
459 <conditional name="type">
460 <param name="type" value="n_obs" />
461 <param name="n_obs" value="10"/>
462 </conditional>
463 <param name="random_state" value="0"/>
464 </conditional>
465 <conditional name="modify_anndata">
466 <param name="modify_anndata" value="true"/>
467 <param name="anndata_output_format" value="h5ad" />
468 </conditional>
469 <assert_stdout>
470 <has_text_matching expression="sc.pp.subsample"/>
471 <has_text_matching expression="n_obs=10"/>
472 <has_text_matching expression="random_state=0"/>
473 </assert_stdout>
474 <output name="anndata_out_h5ad" file="pp.subsample.krumsiek11_n_obs.h5ad" ftype="h5" compare="sim_size"/>
475 </test>
476 </tests>
477 <help><![CDATA[
478
479 Filter cell outliers based on counts and numbers of genes expressed (`pp.filter_cells`)
480 ========================================================================================
481
482 For instance, only keep cells with at least `min_counts` counts or
483 `min_genes` genes expressed. This is to filter measurement outliers, i.e.,
484 "unreliable" observations.
485
486 Only provide one of the optional parameters `min_counts`, `min_genes`,
487 `max_counts`, `max_genes` per call.
488
489 More details on the `scanpy documentation
490 <https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_cells.html#scanpy.api.pp.filter_cells>`__
491
492 Return
493 ------
494
495 number_per_cell : Number per cell (either `n_counts` or `n_genes` per cell)
496
497
498 Filter genes based on number of cells or counts (`pp.filter_genes`)
499 ===================================================================
500
501 Keep genes that have at least `min_counts` counts or are expressed in at
502 least `min_cells` cells or have at most `max_counts` counts or are expressed
503 in at most `max_cells` cells.
504
505 Only provide one of the optional parameters `min_counts`, `min_cells`,
506 `max_counts`, `max_cells` per call.
507
508 More details on the `scanpy documentation
509 <https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes.html#scanpy.api.pp.filter_genes>`__
510
511 Return
512 ------
513
514 number_per_gene : Number per gene (either `n_counts` or `n_cells` per gene)
515
516
517 Extract highly variable genes (`pp.filter_genes_dispersion`)
518 ============================================================
519
520 If trying out parameters, pass the data matrix instead of AnnData.
521
522 Depending on `flavor`, this reproduces the R-implementations of Seurat and Cell Ranger.
523
524 The normalized dispersion is obtained by scaling with the mean and standard
525 deviation of the dispersions for genes falling into a given bin for mean
526 expression of genes. This means that for each bin of mean expression, highly
527 variable genes are selected.
528
529 Use `flavor='cell_ranger'` with care and in the same way as in `pp.recipe_zheng17`.
530
531 More details on the `scanpy documentation
532 <https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.filter_genes_dispersion.html#scanpy.api.pp.filter_genes_dispersion>`__
533
534 Returns
535 -------
536 - The annotated matrix filtered, with the annotations
537 - A table with the means, dispersions, and normalized dispersions per gene, logarithmized when `log` is `True`.
538
539
540 Subsample to a fraction of the number of observations (`pp.subsample`)
541 ======================================================================
542
543 More details on the `scanpy documentation
544 <https://scanpy.readthedocs.io/en/latest/api/scanpy.api.pp.subsample.html#scanpy.api.pp.subsample>`__
545
546
547 ]]></help>
548 <expand macro="citations"/>
549 </tool>