Mercurial > repos > iuc > scanpy_inspect
diff inspect.xml @ 1:a755eaa1cc32 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/scanpy/ commit 8ef5f7c6f8728608a3f05bb51e11b642b84a05f5"
author | iuc |
---|---|
date | Wed, 16 Oct 2019 06:31:52 -0400 |
parents | 5d2e17328afe |
children | 7d22964a8639 |
line wrap: on
line diff
--- a/inspect.xml Mon Mar 04 10:15:38 2019 -0500 +++ b/inspect.xml Wed Oct 16 06:31:52 2019 -0400 @@ -1,7 +1,52 @@ -<tool id="scanpy_inspect" name="Inspect with scanpy" version="@galaxy_version@"> - <description></description> +<tool id="scanpy_inspect" name="Inspect and manipulate" version="@galaxy_version@"> + <description> with scanpy</description> <macros> <import>macros.xml</import> + <xml name="score_genes_params"> + <param argument="n_bins" type="integer" value="25" label="Number of expression level bins for sampling" help=""/> + <param argument="random_state" type="integer" value="0" label="Random seed for sampling" help=""/> + <expand macro="param_use_raw"/> + </xml> + <token name="@CMD_score_genes_inputs@"><![CDATA[ + n_bins=$method.n_bins, + random_state=$method.random_state, + use_raw=$method.use_raw, + copy=False + ]]></token> + <xml name="corr_method"> + <param argument="corr_method" type="select" label="P-value correction method"> + <option value="benjamini-hochberg">Benjamini-Hochberg</option> + <option value="bonferroni">Bonferroni</option> + </param> + </xml> + <xml name="fit_intercept"> + <param argument="fit_intercept" type="boolean" truevalue="True" falsevalue="False" checked="true" + label="Should a constant (a.k.a. bias or intercept) be added to the decision function?" 
help=""/> + </xml> + <xml name="max_iter"> + <param argument="max_iter" type="integer" min="0" value="100" label="Maximum number of iterations taken for the solvers to converge" help=""/> + </xml> + <xml name="multi_class"> + <param argument="multi_class" type="select" label="Multi class" help=""> + <option value="ovr">ovr: a binary problem is fit for each label</option> + <option value="multinomial">multinomial: the multinomial loss fit across the entire probability distribution, even when the data is binary</option> + <option value="auto">auto: selects ‘ovr’ if the data is binary and otherwise selects ‘multinomial’</option> + </param> + </xml> + <xml name="penalty"> + <param argument="penalty" type="select" label="Norm used in the penalization" help=""> + <option value="l1">l1</option> + <option value="l2">l2</option> + <option value="customized">customized</option> + </param> + </xml> + <xml name="custom_penalty"> + <param argument="pen" type="text" value="" label="Norm used in the penalization" help=""/> + </xml> + <xml name="random_state"> + <param argument="random_state" type="integer" value="" optional="true" + label="The seed of the pseudo random number generator to use when shuffling the data" help=""/> + </xml> </macros> <expand macro="requirements"/> <expand macro="version_command"/> @@ -13,22 +58,195 @@ @CMD_imports@ @CMD_read_inputs@ -#if $method.method == "tl.paga" -sc.tl.paga( +#if $method.method == "pp.calculate_qc_metrics" +sc.pp.calculate_qc_metrics( + adata=adata, + expr_type='$method.expr_type', + var_type='$method.var_type', + #if str($method.qc_vars) != '' + #set $qc_vars = [str(x.strip()) for x in str($method.qc_vars).split(',')] + qc_vars=$qc_vars, + #end if + #if str($method.percent_top) != '' + #set $percent_top = [int(x.strip()) for x in str($method.percent_top).split(',')] + percent_top=$percent_top, + #end if + inplace=True) + +#else if $method.method == "tl.score_genes" +sc.tl.score_genes( adata=adata, - groups='$method.groups', 
- use_rna_velocity =$method.use_rna_velocity, - model='$method.model', + #set $gene_list = [str(x.strip()) for x in str($method.gene_list).split(',')] + gene_list=$gene_list, + ctrl_size=$method.ctrl_size, + score_name='$method.score_name', + #if $method.gene_pool + #set $gene_pool = [str(x.strip()) for x in $method.gene_pool.split(',')] + gene_pool=$gene_pool, + #end if + @CMD_score_genes_inputs@) + +#else if $method.method == "tl.score_genes_cell_cycle" + #if str($method.s_genes.format) == 'file' +with open('$method.s_genes.file', 'r') as s_genes_f: + s_genes = [str(x.strip()) for x in s_genes_f.readlines()] +print(s_genes) + #end if + + #if str($method.g2m_genes.format) == 'file' +with open('$method.g2m_genes.file', 'r') as g2m_genes_f: + g2m_genes = [str(x.strip()) for x in g2m_genes_f.readlines()] +print(g2m_genes) + #end if + +sc.tl.score_genes_cell_cycle( + adata=adata, + #if str($method.s_genes.format) == 'text' + #set $s_genes = [str(x.strip()) for x in $method.s_genes.text.split(',')] + s_genes=$s_genes, + #else if str($method.s_genes.format) == 'file' + s_genes=s_genes, + #end if + #if str($method.g2m_genes.format) == 'text' + #set $g2m_genes = [str(x.strip()) for x in $method.g2m_genes.text.split(',')] + g2m_genes=$g2m_genes, + #else if str($method.g2m_genes.format) == 'file' + g2m_genes=g2m_genes, + #end if + @CMD_score_genes_inputs@) + +#else if $method.method == 'pp.neighbors' +sc.pp.neighbors( + adata=adata, + n_neighbors=$method.n_neighbors, + #if str($method.n_pcs) != '' + n_pcs=$method.n_pcs, + #end if + #if str($method.use_rep) != '' + use_rep='$method.use_rep', + #end if + knn=$method.knn, + random_state=$method.random_state, + method='$method.pp_neighbors_method', + metric='$method.metric', copy=False) -#elif $method.method == "tl.dpt" -sc.tl.dpt( + +#else if $method.method == 'tl.rank_genes_groups' +sc.tl.rank_genes_groups( adata=adata, - n_dcs=$method.n_dcs, - n_branchings=$method.n_branchings, - min_group_size=$method.min_group_size, - 
allow_kendall_tau_shift=$method.allow_kendall_tau_shift, + groupby='$method.groupby', + use_raw=$method.use_raw, + #if str($method.groups) != '' + #set $group=[x.strip() for x in str($method.groups).split(',')] + groups=$group, + #end if + #if $method.ref.rest == 'rest' + reference='$method.ref.rest', + #else + reference='$method.ref.reference', + #end if + n_genes=$method.n_genes, + method='$method.tl_rank_genes_groups_method.method', + #if $method.tl_rank_genes_groups_method.method != 'logreg' + corr_method='$method.tl_rank_genes_groups_method.corr_method', + #else + solver='$method.tl_rank_genes_groups_method.solver.solver', + #if $method.tl_rank_genes_groups_method.solver.solver == 'newton-cg' + penalty='l2', + fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept, + max_iter=$method.tl_rank_genes_groups_method.solver.max_iter, + multi_class='$method.tl_rank_genes_groups_method.solver.multi_class', + #else if $method.tl_rank_genes_groups_method.solver.solver == 'lbfgs' + penalty='l2', + fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept, + max_iter=$method.tl_rank_genes_groups_method.solver.max_iter, + multi_class='$method.tl_rank_genes_groups_method.solver.multi_class', + #else if $method.tl_rank_genes_groups_method.solver.solver == 'liblinear' + #if $method.tl_rank_genes_groups_method.solver.penalty.penalty == 'l1' + penalty='l1', + #else if $method.tl_rank_genes_groups_method.solver.penalty.penalty == 'l2' + penalty='l2', + dual=$method.tl_rank_genes_groups_method.solver.penalty.dual, + #else + penalty='$method.tl_rank_genes_groups_method.solver.penalty.pen', + #end if + fit_intercept=$method.tl_rank_genes_groups_method.solver.intercept_scaling.fit_intercept, + #if $method.tl_rank_genes_groups_method.solver.intercept_scaling.fit_intercept == 'True' + intercept_scaling=$method.tl_rank_genes_groups_method.solver.intercept_scaling.intercept_scaling, + #end if + #if str($method.tl_rank_genes_groups_method.solver.random_state) != '' + 
random_state=$method.tl_rank_genes_groups_method.solver.random_state, + #end if + #else if $method.tl_rank_genes_groups_method.solver.solver == 'sag' + penalty='l2', + fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept, + #if str($method.tl_rank_genes_groups_method.solver.random_state) != '' + random_state=$method.tl_rank_genes_groups_method.solver.random_state, + #end if + max_iter=$method.tl_rank_genes_groups_method.solver.max_iter, + multi_class='$method.tl_rank_genes_groups_method.solver.multi_class', + #else if $method.tl_rank_genes_groups_method.solver.solver == 'saga' + #if $method.tl_rank_genes_groups_method.solver.penalty.penalty == 'l1' + penalty='l1', + #else if $method.tl_rank_genes_groups_method.solver.penalty.penalty == 'l2' + penalty='l2', + #else + penalty='$method.tl_rank_genes_groups_method.solver.penalty.pen', + #end if + fit_intercept=$method.tl_rank_genes_groups_method.solver.fit_intercept, + multi_class='$method.tl_rank_genes_groups_method.solver.multi_class', + #end if + tol=$method.tl_rank_genes_groups_method.tol, + C=$method.tl_rank_genes_groups_method.c, + #end if + only_positive=$method.only_positive) + +#else if $method.method == "tl.marker_gene_overlap" +reference_markers = {} +#for $i, $s in enumerate($method.reference_markers) + #set $list=[x.strip() for x in str($s.values).split(',')] +reference_markers['$s.key'] = $list +#end for + +sc.tl.marker_gene_overlap( + adata, + reference_markers, + #if str($method.key) != '' + key='$method.key', + #end if + method='$method.overlap.method', + #if $method.overlap.method == 'overlap_count' and str($method.overlap.normalize) != 'None' + normalize='$method.overlap.normalize', + #end if + #if str($method.top_n_markers) != '' + top_n_markers=$method.top_n_markers, + #end if + #if str($method.adj_pval_threshold) != '' + adj_pval_threshold=$method.adj_pval_threshold, + #end if + #if str($method.key_added) != '' + key_added='$method.key_added', + #end if + inplace=True) + +#else if 
$method.method == "pp.log1p" +sc.pp.log1p( + data=adata, copy=False) -adata.obs.to_csv('$obs', sep='\t') + +#else if $method.method == "pp.scale" +sc.pp.scale( + data=adata, + zero_center=$method.zero_center, + #if $method.max_value + max_value=$method.max_value, + #end if + copy=False) + +#else if $method.method == "pp.sqrt" +sc.pp.sqrt( + data=adata, + copy=False) #end if @CMD_anndata_write_outputs@ @@ -37,143 +255,647 @@ <inputs> <expand macro="inputs_anndata"/> <conditional name="method"> - <param argument="method" type="select" label="Method used for plotting"> - <!--<option value="tl.paga_compare_paths">, using `tl.paga_compare_paths`</option>!--> - <!--<option value="tl.paga_degrees">, using `tl.paga_degrees`</option>!--> - <!--<option value="tl.paga_expression_entropies">, using `tl.paga_expression_entropies`</option>!--> - <option value="tl.paga">Generate cellular maps of differentiation manifolds with complex topologies, using `tl.paga`</option> - <option value="tl.dpt">Infer progression of cells through geodesic distance along the graph, using `tl.dpt`</option> + <param argument="method" type="select" label="Method used for inspecting"> + <option value="pp.calculate_qc_metrics">Calculate quality control metrics, using `pp.calculate_qc_metrics`</option> + <option value="pp.neighbors">Compute a neighborhood graph of observations, using `pp.neighbors`</option> + <option value="tl.score_genes">Score a set of genes, using `tl.score_genes`</option> + <option value="tl.score_genes_cell_cycle">Score cell cycle genes, using `tl.score_genes_cell_cycle`</option> + <option value="tl.rank_genes_groups">Rank genes for characterizing groups, using `tl.rank_genes_groups`</option> + <!--<option value="tl.marker_gene_overlap">Calculate an overlap score between data-deriven marker genes and provided markers, using `tl.marker_gene_overlap`</option>--> + <option value="pp.log1p">Logarithmize the data matrix, using `pp.log1p`</option> + <option value="pp.scale">Scale data to 
unit variance and zero mean, using `pp.scale`</option> + <option value="pp.sqrt">Square root the data matrix, using `pp.sqrt`</option> </param> - <when value="tl.paga"> - <param argument="groups" type="text" value="louvain" label="Key for categorical in the input" help="You can pass your predefined groups by choosing any categorical annotation of observations (`adata.obs`)."/> - <param argument="use_rna_velocity" type="boolean" truevalue="False" falsevalue="False" checked="false" label="Use RNA velocity to orient edges in the abstracted graph and estimate transitions?" help="Requires that `adata.uns` contains a directed single-cell graph with key `['velocyto_transitions']`. This feature might be subject to change in the future."/> - <param argument="model" type="select" label="PAGA connectivity model" help=""> - <option value="v1.2">v1.2</option> - <option value="v1.0">v1.0</option> + <when value="pp.calculate_qc_metrics"> + <param argument="expr_type" type="text" value="counts" label="Name of kind of values in X"/> + <param argument="var_type" type="text" value="genes" label="The kind of thing the variables are"/> + <param argument="qc_vars" type="text" value="" label="Keys for boolean columns of `.var` which identify variables you could want to control for" + help="Keys separated by a comma"/> + <param argument="percent_top" type="text" value="" label="Proportions of top genes to cover" + help=" Values (integers) are considered 1-indexed, `50` finds cumulative proportion to the 50th most expressed genes. Values separated by a comma. + If empty don't calculate"/> + </when> + <when value="pp.neighbors"> + <param argument="n_neighbors" type="integer" min="0" value="15" label="The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation" help="Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. 
If `knn` is `True`, number of nearest neighbors to be searched. If `knn` is `False`, a Gaussian kernel width is set to the distance of the `n_neighbors` neighbor."/> + <param argument="n_pcs" type="integer" min="0" value="" optional="true" label="Number of PCs to use" help=""/> + <param argument="use_rep" type="text" value="" optional="true" label="Indicated representation to use" help="If not set, the representation is chosen automatically: for n_vars below 50, X is used, otherwise X_pca (uns) is used. If X_pca is not present, it's computed with default parameter"/> + <param argument="knn" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Use a hard threshold to restrict the number of neighbors to n_neighbors?" help="If true, it considers a knn graph. Otherwise, it uses a Gaussian Kernel to assign low weights to neighbors more distant than the `n_neighbors` nearest neighbor."/> + <param argument="random_state" type="integer" value="0" label="Numpy random seed" help=""/> + <param name="pp_neighbors_method" argument="method" type="select" label="Method for computing connectivities" help=""> + <option value="umap">umap (McInnes et al, 2018)</option> + <option value="gauss">gauss: Gauss kernel following (Coifman et al 2005) with adaptive width (Haghverdi et al 2016)</option> + </param> + <param argument="metric" type="select" label="Distance metric" help=""> + <expand macro="distance_metric_options"/> </param> </when> - <when value="tl.dpt"> - <param argument="n_dcs" type="integer" min="0" value="10" label="Number of diffusion components to use" help=""/> - <param argument="n_branchings" type="integer" min="0" value="0" label="Number of branchings to detect" help=""/> - <param argument="min_group_size" type="float" min="0" value="0.01" label="Min group size" help="During recursive splitting of branches ('dpt groups') for `n_branchings` > 1, do not consider groups that contain less than `min_group_size` data points. 
If a float, `min_group_size` refers to a fraction of the total number of data points."/> - <param argument="allow_kendall_tau_shift" type="boolean" truevalue="True" falsevalue="False" checked="true" label="Allow Kendal tau shift?" help="If a very small branch is detected upon splitting, shift away from maximum correlation in Kendall tau criterion of Haghverdi et al (2016) to stabilize the splitting."/> + <when value="tl.score_genes"> + <param argument="gene_list" type="text" value="" label="The list of gene names used for score calculation" help="Genes separated by a comma"/> + <param argument="ctrl_size" type="integer" value="50" label="Number of reference genes to be sampled" + help="If `len(gene_list)` is not too low, you can set `ctrl_size=len(gene_list)`."/> + <param argument="gene_pool" type="text" value="" optional="true" label="Genes for sampling the reference set" + help="Default is all genes. Genes separated by a comma"/> + <expand macro="score_genes_params"/> + <param argument="score_name" type="text" value="score" label="Name of the field to be added in `.obs`" help=""/> + </when> + <when value="tl.score_genes_cell_cycle"> + <conditional name='s_genes'> + <param name="format" type="select" label="Format for the list of genes associated with S phase"> + <option value="file">File</option> + <option value="text" selected="true">Text</option> + </param> + <when value="text"> + <param name="text" type="text" value="" label="List of genes associated with S phase" help="Genes separated by a comma"/> + </when> + <when value="file"> + <param name="file" type="data" format="txt" label="File with the list of genes associated with S phase" help="One gene per line"/> + </when> + </conditional> + <conditional name='g2m_genes'> + <param name="format" type="select" label="Format for the list of genes associated with G2M phase"> + <option value="file">File</option> + <option value="text" selected="true">Text</option> + </param> + <when value="text"> + <param name="text" 
type="text" value="" label="List of genes associated with G2M phase" help="Genes separated by a comma"/> + </when> + <when value="file"> + <param name="file" type="data" format="txt" label="File with the list of genes associated with G2M phase" help="One gene per line"/> + </when> + </conditional> + <expand macro="score_genes_params"/> </when> + <when value="tl.rank_genes_groups"> + <param argument="groupby" type="text" value="" label="The key of the observations grouping to consider" help=""/> + <expand macro="param_use_raw"/> + <param argument="groups" type="text" value="" label="Subset of groups to which comparison shall be restricted" help="Group names separated by a comma, e.g. g1,g2,g3. If not passed, a ranking will be generated for all groups."/> + <conditional name="ref"> + <param name="rest" type="select" label="Comparison"> + <option value="rest">Compare each group to the union of the rest of the group</option> + <option value="group_id">Compare with respect to a specific group</option> + </param> + <when value="rest"/> + <when value="group_id"> + <param argument="reference" type="text" value="" label="Group identifier with respect to which compare"/> + </when> + </conditional> + <param argument="n_genes" type="integer" min="0" value="100" label="The number of genes that appear in the returned tables" help=""/> + <conditional name="tl_rank_genes_groups_method"> + <param argument="method" type="select" label="Method"> + <option value="t-test">t-test</option> + <option value="wilcoxon">Wilcoxon-Rank-Sum</option> + <option value="t-test_overestim_var" selected="true">t-test with overestimate of variance of each group</option> + <option value="logreg">Logistic regression</option> + </param> + <when value="t-test"> + <expand macro="corr_method"/> + </when> + <when value="wilcoxon"> + <expand macro="corr_method"/> + </when> + <when value="t-test_overestim_var"> + <expand macro="corr_method"/> + </when> + <when value="logreg"> + <conditional name="solver"> + <param argument="solver" 
type="select" label="Algorithm to use in the optimization problem" help="For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones. For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas ‘liblinear’ and ‘saga’ handle L1 penalty."> + <option value="newton-cg">newton-cg</option> + <option value="lbfgs">lbfgs</option> + <option value="liblinear">liblinear</option> + <option value="sag">sag</option> + <option value="saga">saga</option> + </param> + <when value="newton-cg"> + <expand macro="fit_intercept"/> + <expand macro="max_iter"/> + <expand macro="multi_class"/> + </when> + <when value="lbfgs"> + <expand macro="fit_intercept"/> + <expand macro="max_iter"/> + <expand macro="multi_class"/> + </when> + <when value="liblinear"> + <conditional name="penalty"> + <expand macro="penalty"/> + <when value="l1"/> + <when value="l2"> + <param argument="dual" type="boolean" truevalue="True" falsevalue="False" checked="false" + label="Dual (not primal) formulation?" help="Prefer primal when n_samples > n_features"/> + </when> + <when value="customized"> + <expand macro="custom_penalty"/> + </when> + </conditional> + <conditional name="intercept_scaling"> + <param argument="fit_intercept" type="select" + label="Should a constant (a.k.a. bias or intercept) be added to the decision function?" help=""> + <option value="True">Yes</option> + <option value="False">No</option> + </param> + <when value="True"> + <param argument="intercept_scaling" type="float" value="1.0" + label="Intercept scaling" + help="x becomes [x, self.intercept_scaling], i.e. a 'synthetic' feature with constant value equal to intercept_scaling is appended to the instance vector. 
The intercept becomes intercept_scaling * synthetic_feature_weight."/> + </when> + <when value="False"/> + </conditional> + <expand macro="random_state"/> + </when> + <when value="sag"> + <expand macro="fit_intercept"/> + <expand macro="random_state"/> + <expand macro="max_iter"/> + <expand macro="multi_class"/> + </when> + <when value="saga"> + <conditional name="penalty"> + <expand macro="penalty"/> + <when value="l1"/> + <when value="l2"/> + <when value="customized"> + <expand macro="custom_penalty"/> + </when> + </conditional> + <expand macro="fit_intercept"/> + <expand macro="multi_class"/> + </when> + </conditional> + <param argument="tol" type="float" value="1e-4" label="Tolerance for stopping criteria" help=""/> + <param argument="c" type="float" value="1.0" label="Inverse of regularization strength" + help="It must be a positive float. Like in support vector machines, smaller values specify stronger regularization."/> + </when> + </conditional> + <param argument="only_positive" type="boolean" truevalue="True" falsevalue="False" checked="true" + label="Only consider positive differences?" 
help=""/> + </when> + <!--<when value="tl.marker_gene_overlap"> + <repeat name="reference_markers" title="Marker genes"> + <param name="key" type="text" value="" label="Cell identity name" help=""/> + <param name="values" type="text" value="" label="List of genes" help="Comma-separated names from `var`"/> + </repeat> + <param argument="key" type="text" value="rank_genes_groups" label="Key in adata.uns where the rank_genes_groups output is stored"/> + <conditional name="overlap"> + <param argument="method" type="select" label="Method to calculate marker gene overlap"> + <option value="overlap_count">overlap_count: Intersection of the gene set</option> + <option value="overlap_coef">overlap_coef: Overlap coefficient</option> + <option value="jaccard">jaccard: Jaccard index</option> + </param> + <when value="overlap_count"> + <param argument="normalize" type="select" label="Normalization option for the marker gene overlap output"> + <option value="None">None</option> + <option value="reference">reference: Normalization of the data by the total number of marker genes given in the reference annotation per group</option> + <option value="data">data: Normalization of the data by the total number of marker genes used for each cluster</option> + </param> + </when> + <when value="overlap_coef"/> + <when value="jaccard"/> + </conditional> + <param argument="top_n_markers" type="integer" optional="true" label="Number of top data-derived marker genes to use" help="By default all calculated marker genes are used. If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/> + <param argument="adj_pval_threshold" type="float" optional="true" label="Significance threshold on the adjusted p-values to select marker genes" help=" This can only be used when adjusted p-values are calculated by 'tl.rank_genes_groups'. 
If adj_pval_threshold is set along with top_n_markers, then adj_pval_threshold is ignored."/> + <param argument="key_added" type="text" value="" optional="true" label="Key that will contain the marker overlap scores in 'uns'"/> + </when>--> + <when value="pp.log1p"/> + <when value="pp.scale"> + <param argument="zero_center" type="boolean" truevalue="True" falsevalue="False" checked="true" + label="Zero center?" help="If not, it omits zero-centering variables, which allows to handle sparse input efficiently."/> + <param argument="max_value" type="float" value="" optional="true" label="Maximum value" + help="Clip (truncate) to this value after scaling. If not set, it does not clip."/> + </when> + <when value="pp.sqrt"/> </conditional> - <expand macro="anndata_output_format"/> </inputs> <outputs> <expand macro="anndata_outputs"/> - <data name="obs" format="tabular" label="${tool.name} on ${on_string}: Observations annotation"> - <filter>method['method'] == 'tl.dpt'</filter> - </data> </outputs> <tests> <test> - <conditional name="input"> - <param name="format" value="h5ad" /> - <param name="adata" value="pp.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" /> - </conditional> + <!-- test 1 --> + <param name="adata" value="sparce_csr_matrix.h5ad" /> <conditional name="method"> - <param name="method" value="tl.paga"/> - <param name="groups" value="paul15_clusters"/> - <param name="use_rna_velocity" value="False"/> - <param name="model" value="v1.2"/> + <param name="method" value="pp.calculate_qc_metrics"/> + <param name="expr_type" value="counts"/> + <param name="var_type" value="genes"/> + <param name="qc_vars" value="mito,negative"/> + <param name="percent_top" value=""/> </conditional> - <param name="anndata_output_format" value="h5ad" /> <assert_stdout> - <has_text_matching expression="sc.tl.paga"/> - <has_text_matching expression="groups='paul15_clusters'"/> - <has_text_matching expression="use_rna_velocity =False"/> - <has_text_matching 
expression="model='v1.2'"/> + <has_text_matching expression="sc.pp.calculate_qc_metrics" /> + <has_text_matching expression="expr_type='counts'" /> + <has_text_matching expression="var_type='genes'" /> + <has_text_matching expression="qc_vars=\['mito', 'negative'\]" /> </assert_stdout> - <output name="anndata_out_h5ad" file="tl.paga.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5" compare="sim_size"> + <output name="anndata_out" file="pp.calculate_qc_metrics.sparce_csr_matrix.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 2 --> + <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.neighbors"/> + <param name="n_neighbors" value="15"/> + <param name="knn" value="True"/> + <param name="random_state" value="0"/> + <param name="pp_neighbors_method" value="umap"/> + <param name="metric" value="euclidean"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.neighbors"/> + <has_text_matching expression="n_neighbors=15"/> + <has_text_matching expression="knn=True"/> + <has_text_matching expression="random_state=0"/> + <has_text_matching expression="method='umap'"/> + <has_text_matching expression="metric='euclidean'"/> + </assert_stdout> + <output name="anndata_out" file="pp.neighbors_umap_euclidean.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5ad" compare="sim_size"> <assert_contents> <has_h5_keys keys="X, obs, obsm, uns, var" /> </assert_contents> </output> </test> <test> - <conditional name="input"> - <param name="format" value="h5ad" /> - <param name="adata" value="tl.diffmap.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" /> + <!-- test 3 --> + <param name="adata" value="pp.recipe_weinreb17.paul15_subsample.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.neighbors"/> + <param name="n_neighbors" value="15"/> + <param name="knn" value="True"/> + <param 
name="pp_neighbors_method" value="gauss"/> + <param name="metric" value="braycurtis"/> </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.neighbors"/> + <has_text_matching expression="n_neighbors=15"/> + <has_text_matching expression="knn=True"/> + <has_text_matching expression="random_state=0"/> + <has_text_matching expression="method='gauss'"/> + <has_text_matching expression="metric='braycurtis'"/> + </assert_stdout> + <output name="anndata_out" file="pp.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 4 --> + <param name="adata" value="krumsiek11.h5ad" /> <conditional name="method"> - <param name="method" value="tl.dpt"/> - <param name="n_dcs" value="15"/> - <param name="n_branchings" value="1"/> - <param name="min_group_size" value="0.01"/> - <param name="allow_kendall_tau_shift" value="True"/> + <param name="method" value="tl.score_genes"/> + <param name="gene_list" value="Gata2, Fog1"/> + <param name="ctrl_size" value="2"/> + <param name="n_bins" value="2"/> + <param name="random_state" value="2"/> + <param name="use_raw" value="False"/> + <param name="score_name" value="score"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.tl.score_genes" /> + <has_text_matching expression="gene_list=\['Gata2', 'Fog1'\]" /> + <has_text_matching expression="ctrl_size=2" /> + <has_text_matching expression="score_name='score'" /> + <has_text_matching expression="n_bins=2" /> + <has_text_matching expression="random_state=2" /> + <has_text_matching expression="use_raw=False" /> + <has_text_matching expression="copy=False" /> + </assert_stdout> + <output name="anndata_out" file="tl.score_genes.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 5 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.score_genes_cell_cycle"/> + <conditional name='s_genes'> + 
<param name="format" value="text"/> + <param name="text" value="Gata2, Fog1, EgrNab"/> + </conditional> + <conditional name='g2m_genes'> + <param name="format" value="text"/> + <param name="text" value="Gata2, Fog1, EgrNab"/> + </conditional> + <param name="n_bins" value="2"/> + <param name="random_state" value="1"/> + <param name="use_raw" value="False"/> </conditional> - <param name="anndata_output_format" value="h5ad" /> + <assert_stdout> + <has_text_matching expression="sc.tl.score_genes_cell_cycle"/> + <has_text_matching expression="s_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/> + <has_text_matching expression="g2m_genes=\['Gata2', 'Fog1', 'EgrNab'\]"/> + <has_text_matching expression="n_bins=2"/> + <has_text_matching expression="random_state=1"/> + <has_text_matching expression="use_raw=False"/> + </assert_stdout> + <output name="anndata_out" file="tl.score_genes_cell_cycle.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 6 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.rank_genes_groups"/> + <param name="groupby" value="cell_type"/> + <param name="use_raw" value="True"/> + <conditional name="ref"> + <param name="rest" value="rest"/> + </conditional> + <param name="n_genes" value="100"/> + <conditional name="tl_rank_genes_groups_method"> + <param name="method" value="t-test_overestim_var"/> + <param name="corr_method" value="benjamini-hochberg"/> + </conditional> + <param name="only_positive" value="true"/> + </conditional> <assert_stdout> - <has_text_matching expression="sc.tl.dpt"/> - <has_text_matching expression="n_dcs=15"/> - <has_text_matching expression="n_branchings=1"/> - <has_text_matching expression="min_group_size=0.01"/> - <has_text_matching expression="allow_kendall_tau_shift=True"/> + <has_text_matching expression="sc.tl.rank_genes_groups"/> + <has_text_matching expression="groupby='cell_type'"/> + <has_text_matching expression="use_raw=True"/> + 
<has_text_matching expression="reference='rest'"/> + <has_text_matching expression="n_genes=100"/> + <has_text_matching expression="method='t-test_overestim_var'"/> + <has_text_matching expression="corr_method='benjamini-hochberg'"/> + <has_text_matching expression="only_positive=True"/> </assert_stdout> - <output name="anndata_out_h5ad" file="tl.dpt.diffmap.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.h5ad" ftype="h5" compare="sim_size"> + <output name="anndata_out" file="tl.rank_genes_groups.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 7 --> + <param name="adata" value="pbmc68k_reduced.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.rank_genes_groups"/> + <param name="groupby" value="louvain"/> + <param name="use_raw" value="True"/> + <conditional name="ref"> + <param name="rest" value="rest"/> + </conditional> + <param name="n_genes" value="100"/> + <conditional name="tl_rank_genes_groups_method"> + <param name="method" value="logreg"/> + <conditional name="solver"> + <param name="solver" value="newton-cg"/> + <param name="fit_intercept" value="True"/> + <param name="max_iter" value="100"/> + <param name="multi_class" value="auto"/> + </conditional> + <param name="tol" value="1e-4"/> + <param name="c" value="1.0"/> + </conditional> + <param name="only_positive" value="true"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.tl.rank_genes_groups"/> + <has_text_matching expression="groupby='louvain'"/> + <has_text_matching expression="use_raw=True"/> + <has_text_matching expression="reference='rest'"/> + <has_text_matching expression="n_genes=100"/> + <has_text_matching expression="method='logreg'"/> + <has_text_matching expression="solver='newton-cg'"/> + <has_text_matching expression="penalty='l2'"/> + <has_text_matching expression="fit_intercept=True"/> + <has_text_matching expression="max_iter=100"/> + <has_text_matching expression="multi_class='auto'"/> + 
<has_text_matching expression="tol=0.0001"/> + <has_text_matching expression="C=1.0"/> + <has_text_matching expression="only_positive=True"/> + </assert_stdout> + <output name="anndata_out" file="tl.rank_genes_groups.newton-cg.pbmc68k_reduced.h5ad" ftype="h5ad" compare="sim_size"> <assert_contents> - <has_h5_keys keys="X, obs, obsm, uns, var" /> + <has_h5_keys keys="X, obs, obsm, raw.X, raw.var, uns, var" /> </assert_contents> </output> - <output name="obs" file="tl.dpt.diffmap.neighbors_gauss_braycurtis.recipe_weinreb17.paul15_subsample.obs.tabular" compare="sim_size"/> + </test> + <test> + <!-- test 8 --> + <param name="adata" value="pbmc68k_reduced.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.rank_genes_groups"/> + <param name="groupby" value="louvain"/> + <param name="use_raw" value="True"/> + <conditional name="ref"> + <param name="rest" value="rest"/> + </conditional> + <param name="n_genes" value="100"/> + <conditional name="tl_rank_genes_groups_method"> + <param name="method" value="logreg"/> + <conditional name="solver"> + <param name="solver" value="liblinear"/> + <conditional name="penalty"> + <param name="penalty" value="l2"/> + <param name="dual" value="False"/> + <conditional name="intercept_scaling"> + <param name="fit_intercept" value="True"/> + <param name="intercept_scaling" value="1.0" /> + </conditional> + <param name="random_state" value="1"/> + </conditional> + </conditional> + <param name="tol" value="1e-4"/> + <param name="c" value="1.0"/> + </conditional> + <param name="only_positive" value="true"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.tl.rank_genes_groups"/> + <has_text_matching expression="groupby='louvain'"/> + <has_text_matching expression="use_raw=True"/> + <has_text_matching expression="reference='rest'"/> + <has_text_matching expression="n_genes=100"/> + <has_text_matching expression="method='logreg'"/> + <has_text_matching expression="solver='liblinear'"/> + 
<has_text_matching expression="penalty='l2'"/> + <has_text_matching expression="dual=False"/> + <has_text_matching expression="fit_intercept=True"/> + <has_text_matching expression="intercept_scaling=1.0"/> + <has_text_matching expression="tol=0.0001"/> + <has_text_matching expression="C=1.0"/> + <has_text_matching expression="only_positive=True"/> + </assert_stdout> + <output name="anndata_out" file="tl.rank_genes_groups.liblinear.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"> + <assert_contents> + <has_h5_keys keys="X, obs, obsm, raw.X, raw.var, uns, var" /> + </assert_contents> + </output> + </test> + <!--<test> + < test 9 > + <param name="adata" value="tl.rank_genes_groups.louvain.neighbors.pca.pbmc68k_reduced.h5ad" /> + <conditional name="method"> + <param name="method" value="tl.marker_gene_overlap"/> + <repeat name="reference_markers"> + <param name="key" value="CD4 T cells"/> + <param name="value" value="IL7R"/> + </repeat> + <repeat name="reference_markers"> + <param name="key" value="CD14+ Monocytes"/> + <param name="value" value="CD14,LYZ"/> + </repeat> + <repeat name="reference_markers"> + <param name="key" value="B cells"/> + <param name="value" value="MS4A1"/> + </repeat> + <conditional name="overlap"> + <param argument="method" value="overlap_count"/> + <param argument="normalize" value="None"/> + </conditional> + </conditional> + <assert_stdout> + <has_text_matching expression="tl.marker_gene_overlap"/> + <has_text_matching expression="key='rank_genes_groups'"/> + <has_text_matching expression="method='overlap_count'"/> + </assert_stdout> + <output name="anndata_out" file="pp.log1p.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test>--> + <test> + <!-- test 9 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.log1p"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.log1p"/> + </assert_stdout> + <output name="anndata_out" 
file="pp.log1p.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 10 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.scale"/> + <param name="zero_center" value="true"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.scale"/> + <has_text_matching expression="zero_center=True"/> + </assert_stdout> + <output name="anndata_out" file="pp.scale.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 11 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.scale"/> + <param name="zero_center" value="true"/> + <param name="max_value" value="10"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.scale"/> + <has_text_matching expression="zero_center=True"/> + <has_text_matching expression="max_value=10.0"/> + </assert_stdout> + <output name="anndata_out" file="pp.scale_max_value.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> + </test> + <test> + <!-- test 12 --> + <param name="adata" value="krumsiek11.h5ad" /> + <conditional name="method"> + <param name="method" value="pp.sqrt"/> + </conditional> + <assert_stdout> + <has_text_matching expression="sc.pp.sqrt"/> + </assert_stdout> + <output name="anndata_out" file="pp.sqrt.krumsiek11.h5ad" ftype="h5ad" compare="sim_size"/> </test> </tests> <help><![CDATA[ -Generate cellular maps of differentiation manifolds with complex topologies (`tl.paga`) -======================================================================================= +Calculate quality control metrics., using `pp.calculate_qc_metrics` +=================================================================== + +Calculates a number of qc metrics for an AnnData object, largely based on calculateQCMetrics from scater. +Currently is most efficient on a sparse CSR or dense matrix. 
+ +It updates the observation level metrics: + +- total_{var_type}_by_{expr_type} (e.g. "total_genes_by_counts", number of genes with positive counts in a cell) +- total_{expr_type} (e.g. "total_counts", total number of counts for a cell) +- pct_{expr_type}_in_top_{n}_{var_type} (e.g. "pct_counts_in_top_50_genes", cumulative percentage of counts for 50 most expressed genes in a cell) +- total_{expr_type}_{qc_var} (e.g. "total_counts_mito", total number of counts for variables in qc_vars) +- pct_{expr_type}_{qc_var} (e.g. "pct_counts_mito", proportion of total counts for a cell which are mitochondrial) + +And also the variable level metrics: -By quantifying the connectivity of partitions (groups, clusters) of the -single-cell graph, partition-based graph abstraction (PAGA) generates a much -simpler abstracted graph (*PAGA graph*) of partitions, in which edge weights -represent confidence in the presence of connections. By tresholding this -confidence in `paga`, a much simpler representation of data -can be obtained. +- total_{expr_type} (e.g. "total_counts", sum of counts for a gene) +- mean_{expr_type} (e.g. "mean_counts", mean expression over all cells) +- n_cells_by_{expr_type} (e.g. "n_cells_by_counts", number of cells this expression is measured in) +- pct_dropout_by_{expr_type} (e.g. "pct_dropout_by_counts", percentage of cells this feature does not appear in) + +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.calculate_qc_metrics.html>`__ + +Compute a neighborhood graph of observations, using `pp.neighbors` +================================================================== + +The neighbor search efficiency of this method relies heavily on UMAP (McInnes et al, 2018), +which also provides a method for estimating connectivities of data points - +the connectivity of the manifold (`method=='umap'`). 
If `method=='gauss'`, +connectivities are computed according to Coifman et al (2005), in the adaptation of +Haghverdi et al (2016). + +The returned AnnData object contains: + +- Weighted adjacency matrix of the neighborhood graph of data points (connectivities). Weights should be interpreted as connectivities. +- Distances for each pair of neighbors (distances) + +These data are stored in the unstructured annotation (uns) and can be accessed using the inspect tool for AnnData objects -The confidence can be interpreted as the ratio of the actual versus the -expected value of connetions under the null model of randomly connecting -partitions. We do not provide a p-value as this null model does not -precisely capture what one would consider "connected" in real data, hence it -strongly overestimates the expected value. See an extensive discussion of -this in Wolf et al (2017). +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.neighbors.html>`__ + +Score a set of genes, using `tl.score_genes` +============================================ + +The score is the average expression of a set of genes minus the +average expression of a reference set of genes. The reference set is +randomly sampled from the `gene_pool` for each binned expression value. + +This reproduces the approach in Seurat (Satija et al, 2015) and has been implemented +for Scanpy by Davide Cittaro. + +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.tl.score_genes.html>`__ + +Score cell cycle genes, using `tl.score_genes_cell_cycle` +========================================================= -Together with a random walk-based distance measure, this generates a partial -coordinatization of data useful for exploring and explaining its variation. +Given two lists of genes associated with S phase and G2M phase, calculates +scores and assigns a cell cycle phase (G1, S or G2M). 
See +`score_genes` for more explanation. + +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.tl.score_genes_cell_cycle.html>`__ + +Rank genes for characterizing groups, using `tl.rank_genes_groups` +================================================================== -More details on the `tl.paga scanpy documentation -<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.tl.paga.html#scanpy.api.tl.paga>`_ +The returned AnnData object contains: + +- Gene names, ordered according to scores +- Z-score underlying the computation of a p-value for each gene for each group, ordered according to scores +- Log2 fold change for each gene for each group, ordered according to scores. It is only provided if method is ‘t-test’-like. This is an approximation calculated from mean-log values. +- P-values +- Adjusted p-values + +These data are stored in the unstructured annotation (uns) and can be accessed using the inspect tool for AnnData objects + +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.tl.rank_genes_groups.html>`__ -Infer progression of cells through geodesic distance along the graph (`tl.dpt`) -=============================================================================== +Calculate an overlap score between data-derived marker genes and provided markers (`tl.marker_gene_overlap`) +============================================================================================================ -Reconstruct the progression of a biological process from snapshot -data. `Diffusion Pseudotime` has been introduced by Haghverdi et al (2016) and -implemented within Scanpy (Wolf et al, 2017). Here, we use a further developed -version, which is able to deal with disconnected graphs (Wolf et al, 2017) and can -be run in a `hierarchical` mode by setting the parameter -`n_branchings>1`. 
We recommend, however, to only use -`tl.dpt` for computing pseudotime (`n_branchings=0`) and -to detect branchings via `paga`. For pseudotime, you need -to annotate your data with a root cell. - -This requires to run `pp.neighbors`, first. In order to -reproduce the original implementation of DPT, use `method=='gauss'` in -this. Using the default `method=='umap'` only leads to minor quantitative -differences, though. +Marker gene overlap scores can be quoted as overlap counts, overlap coefficients, or Jaccard indices. The method returns a pandas dataframe which can be used to annotate clusters based on marker gene overlaps. -If `n_branchings==0`, no field `dpt_groups` will be written. +Logarithmize the data matrix (`pp.log1p`) +========================================= -- dpt_pseudotime : Array of dim (number of samples) that stores the pseudotime of each cell, that is, the DPT distance with respect to the root cell. -- dpt_groups : Array of dim (number of samples) that stores the subgroup id ('0','1', ...) for each cell. The groups typically correspond to 'progenitor cells', 'undecided cells' or 'branches' of a process. +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.log1p.html>`__ -The tool is similar to the R package `destiny` of Angerer et al (2016). +Scale data to unit variance and zero mean (`pp.scale`) +====================================================== -More details on the `tl.dpt scanpy documentation -<https://scanpy.readthedocs.io/en/latest/api/scanpy.api.tl.dpt.html#scanpy.api.tl.dpt>`_ +More details on the `scanpy documentation +<https://icb-scanpy.readthedocs-hosted.com/en/stable/api/scanpy.pp.scale.html>`__ +Compute the square root of the data matrix (`pp.sqrt`) +====================================================== + +`X = sqrt(X)` ]]></help> <expand macro="citations"/> </tool> \ No newline at end of file