Mercurial > repos > iuc > spapros_evaluation

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/evaluation.xml	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,598 @@
+<tool id="spapros_evaluation" name="Evaluation" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
+    <description>of marker genes with spapros</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements">
+    </expand>
+    <expand macro="version_command"/>
+    <command detect_errors="exit_code"><![CDATA[
+@CMD@
+      ]]></command>
+    <configfiles>
+        <configfile name="script_file"><![CDATA[
+@CMD_imports@
+@CMD_read_inputs@
+
+## Figure resolution and font size from the figure options section
+mpl.rcParams['figure.dpi'] = $figure_options.dpi
+plt.rcParams["font.size"] = $figure_options.fontsize
+
+## Probe set generation
+## header is 'infer' unless the user states that the table has no header row
+#if $header_probeset == 'not_included':
+header_probeset = None
+#else:
+header_probeset = 'infer'
+#end if
+probeset_table = pd.read_csv('$probeset', sep='\t', index_col=0, header=header_probeset)
+## flatten all cells of the table into one list and drop missing values
+probeset = [gene for gene in probeset_table.to_numpy().ravel().tolist() if pd.notna(gene)]
+
+## Reference set generation
+## the selected methods are rendered as one comma-joined string and split again in Python
+methods_pre="#echo ','.join(["%s" % $s for $i, $s in enumerate($methods_reference)])#"
+methods=methods_pre.split(',')
+
+## optional arguments: genes_key falls back to None, obs_key and seeds are only set when given
+reference_options = {'genes_key': None}
+#if $genes_key != '':
+reference_options['genes_key'] = '$genes_key'
+#end if
+#if $obs_key != '':
+reference_options['obs_key'] = '$obs_key'
+#end if
+#if $seeds != '':
+reference_options['seeds'] = [$seeds]
+#end if
+
+reference_sets = sp.se.select_reference_probesets(
+adata,
+n=$nreference,
+methods=methods,
+verbosity=0,
+**reference_options
+)
+
+## Parameter setting
+
+custom_params = {
+'data': {
+'name': '$probeset_name',
+'celltype_key': '$celltype_key'
+},
+'metrics': {
+## only the metrics whose parameters exist for the selected method are emitted
+#if $method.method == 'plot_summary' or $method.method == 'plot_cluster_similarity':
+'cluster_similarity': {
+'ns': [$method.ns_start, $method.ns_end],
+'AUC_borders': [#echo ",".join(["[%s, %s]" % ($s.auc_borders_start, $s.auc_borders_end) for $i, $s in enumerate($method.series_auc_borders)])#]
+},
+#end if
+#if $method.method == 'plot_summary' or $method.method == 'plot_knn_overlap':
+'knn_overlap': {
+'ks': [#echo ",".join([ "%s" % $s.k for $i, $s in enumerate($method.knn) ])#]
+},
+#end if
+## forest_clfs_threshold is only defined for plot_summary, so plot_summary has to
+## enter this section as well; otherwise the user supplied threshold is never used
+#if $method.method == 'plot_summary' or $method.method == 'plot_confusion_matrix':
+'forest_clfs': {
+'ct_key': '$celltype_key',
+#if $method.method == 'plot_summary':
+'threshold': $method.forest_clfs_threshold
+#end if
+},
+#end if
+#if $method.method == 'plot_summary':
+#if $method.select_marker_corr.use_marker_corr == 'True':
+'marker_corr': {
+'per_celltype': $method.select_marker_corr.per_celltype,
+'per_marker': $method.select_marker_corr.per_marker,
+## answering No sets the value to None, exactly as the plot_marker_corr section below does
+#if $method.select_marker_corr.select_per_celltype_min_mean.use_per_celltype_min_mean == 'True'
+'per_celltype_min_mean': $method.select_marker_corr.select_per_celltype_min_mean.per_celltype_min_mean,
+#else:
+'per_celltype_min_mean': None,
+#end if
+#if $method.select_marker_corr.select_per_marker_min_mean.use_per_marker_min_mean == 'True'
+'per_marker_min_mean': $method.select_marker_corr.select_per_marker_min_mean.per_marker_min_mean
+#else:
+'per_marker_min_mean': None
+#end if
+},
+#end if
+#end if
+#if $method.method == 'plot_marker_corr':
+'marker_corr': {
+'per_celltype': $method.per_celltype,
+'per_marker': $method.per_marker,
+#if $method.select_per_celltype_min_mean.use_per_celltype_min_mean == 'True'
+'per_celltype_min_mean': $method.select_per_celltype_min_mean.per_celltype_min_mean,
+#else:
+'per_celltype_min_mean': None,
+#end if
+#if $method.select_per_marker_min_mean.use_per_marker_min_mean == 'True'
+'per_marker_min_mean': $method.select_per_marker_min_mean.per_marker_min_mean
+#else:
+'per_marker_min_mean': None
+#end if
+},
+#end if
+#if $method.method == 'plot_summary':
+'gene_corr': {
+'threshold': $method.gene_corr_threshold
+}
+#end if
+}
+}
+
+## Setup evaluator
+
+#if $method.method == 'plot_summary' or $method.method == 'plot_marker_corr':
+
+header_markerset='infer'
+feature_marker=dict()
+
+## For plot_summary the marker set parameters only exist while the marker correlation
+## is enabled; referencing them with the default choice (No) fails during templating.
+#if $method.method == 'plot_summary' and $method.select_marker_corr.use_marker_corr == 'True':
+#if $method.select_marker_corr.header_markerset == 'not_included' :
+header_markerset=None
+#end if
+feature_marker = {key: [v for v in list(value.values()) if pd.notna(v)] for key, value in pd.read_csv('$method.select_marker_corr.markerset', sep='\t', index_col=0, header=header_markerset).to_dict(orient='index').items()}
+#end if
+#if $method.method == 'plot_marker_corr':
+#if $method.header_markerset == 'not_included':
+header_markerset=None
+#end if
+feature_marker = {key: [v for v in list(value.values()) if pd.notna(v)] for key, value in pd.read_csv('$method.markerset', sep='\t', index_col=0, header=header_markerset).to_dict(orient='index').items()}
+#end if
+
+#end if
+
+evaluator = sp.ev.ProbesetEvaluator(
+adata,
+metrics_params=custom_params,
+scheme="full",
+verbosity=0,
+## the marker list is only handed over when a marker set was read above
+#if $method.method == 'plot_marker_corr' or ($method.method == 'plot_summary' and $method.select_marker_corr.use_marker_corr == 'True'):
+marker_list=feature_marker,
+#end if
+results_dir=None
+)
+
+## Evaluation reference set
+## every reference table marks its chosen genes in the boolean "selection" column
+for reference_id, reference_table in reference_sets.items():
+    is_selected = reference_table["selection"]
+    evaluator.evaluate_probeset(reference_table[is_selected].index.to_list(), set_id=reference_id)
+
+## Evaluation probe set
+evaluator.evaluate_probeset(probeset, set_id='$probeset_name')
+
+## Execution of method ##
+## exactly one plot call is rendered, chosen by the selected method
+
+#if $method.method == 'plot_summary':
+evaluator.plot_summary(
+@CMD_plot@
+)
+#else if $method.method == 'plot_confusion_matrix':
+evaluator.plot_confusion_matrix(
+@CMD_plot@
+)
+#else if $method.method == 'plot_coexpression':
+evaluator.plot_coexpression(
+@CMD_plot@
+)
+#else if $method.method == 'plot_cluster_similarity':
+evaluator.plot_cluster_similarity(
+@CMD_plot@
+)
+#else if $method.method == 'plot_knn_overlap':
+evaluator.plot_knn_overlap(
+@CMD_plot@
+)
+#else if $method.method == 'plot_marker_corr':
+## plot_marker_corr does not allow for the show keyword
+evaluator.plot_marker_corr(
+#if $method.set_ids != 'all' and $method.set_ids != '':
+set_ids=[$method.set_ids],
+#end if
+save='plot.$format'
+)
+#end if
+
+
+]]></configfile>
+    </configfiles>
+    <inputs>
+        <expand macro="inputs_anndata"/>
+        <expand macro="param_plot_format"/>
+        <param name="probeset" type="data" format="tabular" label="Probeset tabular file with rows=conditions (e.g., celltypes) and column=features (e.g., genes)"/>
+        <param name="header_probeset" type="select" optional="false" label="Header in the list of probes?">
+            <option value="included">Header included</option>
+            <option value="not_included">Header not included</option>
+        </param>
+        <param argument="nreference" type="integer" value="10" min="1" optional="false" label="Number of selected genes to generate a reference set"/>
+        <param name="methods_reference" type="select" multiple="True" optional="false" label="Select the methods to generate a reference probeset">
+            <option value="random" selected="true">Random</option>
+            <option value="PCA" selected="false">PCA</option>
+            <option value="DE" selected="false">Differentially expressed features based on Wilcoxon rank genes test (DE)</option>
+            <option value="HVG" selected="false">Highly variable features (HVG)</option>
+        </param>
+        <param name="genes_key" type="text" optional="true" label="adata.var key for subset of preselected genes to run the selections" help="This is typically highly_variable_genes. Leave empty to not subset genes."/>
+        <param name="obs_key" type="text" optional="true" label="Column name of adata.obs for which marker scores are calculated" help="Only required for method DE."/>
+        <param name="seeds" type="text" optional="true" label="List of random integer seeds (comma separated, e.g., 123,999,22)" help="For each seed, one random gene set is selected if random is in methods. Leave it empty for one random set with a random seed."/>
+        <param name="probeset_name" type="text" value="probeset" optional="false" label="Name for your probeset that should be displayed in plots"/>
+        <param name="celltype_key" type="text" value="celltype" optional="false" label="Column name of adata.obs that represents groups of your cells (e.g., celltype)"/>
+        <conditional name="method">
+            <param argument="method" type="select" optional="false" label="Method of spapros">
+                <option value="plot_summary">Plot statistic summary, using 'evaluator.plot_summary'</option>
+                <option value="plot_confusion_matrix">Plot confusion matrix, using 'evaluator.plot_confusion_matrix'</option>
+                <option value="plot_coexpression">Plot coexpression heatmap, using 'evaluator.plot_coexpression'</option>
+                <option value="plot_cluster_similarity">Plot cluster similarity, using 'evaluator.plot_cluster_similarity'</option>
+                <option value="plot_knn_overlap">Plot knn overlap, using 'evaluator.plot_knn_overlap'</option>
+                <option value="plot_marker_corr">Plot feature marker correlation, using 'evaluator.plot_marker_corr'</option>
+            </param>
+            <when value="plot_summary">
+                <expand macro="set_ids"/>
+                <param name="forest_clfs_threshold" type="float" value="0.8" min="0.0" max="1.0" label="Assesses how many cell types (%) can be predicted with an accuracy of at least threshold" help="The metric gives an idea about how many cell types can be identified with high confidence with the given gene set."/>
+                <param name="gene_corr_threshold" type="float" value="0.8" min="0.0" max="1.0" label="Percentage of features (e.g., genes) that have a maximum correlation of less than threshold with all other features" help="This metric gives an idea about how many features show unique expression profiles in the gene set."/>
+                <param name="ns_start" type="integer" value="5" min="1" label="The minimum number of Leiden clusters" help="Clusterings are calculated with different Leiden resolutions to generate clusterings of n = ns_start to ns_end clusters."/>
+                <param name="ns_end" type="integer" value="21" min="1" label="The maximum number of Leiden clusters" help="Clusterings are calculated with different Leiden resolutions to generate clusterings of n = ns_start to ns_end clusters."/>
+                <repeat name="series_auc_borders" title="Calculates nmi AUCs over given borders">
+                    <param name="auc_borders_start" type="integer" value="15" min="1" label="Calculates nmi over n ranges auc_borders_start to auc_borders_end" help="Defined border shouldn't exceed values in nmis."/>
+                    <param name="auc_borders_end" type="integer" value="20" min="1" label="Calculates nmi over n ranges auc_borders_start to auc_borders_end" help="Defined border shouldn't exceed values in nmis."/>
+                </repeat>
+                <repeat name="knn" title="Calculate knn graphs for each k">
+                    <param name="k" type="integer" value="10" min="1" label="Includes nearest neighbors for all k"/>
+                </repeat>
+                <conditional name="select_marker_corr">
+                    <param name="use_marker_corr" type="select" label="Do you want to calculate the correlation between your probeset and marker features?">
+                        <option value="False">No</option>
+                        <option value="True">Yes</option>
+                    </param>
+                    <when value="True">
+                        <expand macro="param_markerset"/>
+                        <param name="per_celltype" type="boolean" truevalue="True" falsevalue="False" label="Whether to return columns with per cell type max correlations" checked="true"/>
+                        <param name="per_marker" type="boolean" truevalue="True" falsevalue="False" label="Whether to return columns with per marker max correlations" checked="true"/>
+                        <conditional name="select_per_celltype_min_mean">
+                            <param name="use_per_celltype_min_mean" type="select" label="Add a column for correlation per cell type that only takes into account markers with mean expression > per_celltype_min_mean">
+                                <option value="False">No</option>
+                                <option value="True">Yes</option>
+                            </param>
+                            <when value="True">
+                                <param name="per_celltype_min_mean" type="float" value="0.0" min="0.0" label="Set per_celltype_min_mean"/>
+                            </when>
+                            <when value="False"/>
+                        </conditional>
+                        <conditional name="select_per_marker_min_mean">
+                            <param name="use_per_marker_min_mean" type="select" label="Add a column for correlation per marker that only takes into account markers with mean expression > per_marker_min_mean">
+                                <option value="False">No</option>
+                                <option value="True">Yes</option>
+                            </param>
+                            <when value="True">
+                                <param name="per_marker_min_mean" type="float" value="0.0" min="0.0" label="Set per_marker_min_mean"/>
+                            </when>
+                            <when value="False"/>
+                        </conditional>
+                    </when>
+                    <when value="False"/>
+                </conditional>
+            </when>
+            <when value="plot_confusion_matrix">
+                <expand macro="set_ids"/>
+            </when>
+            <when value="plot_coexpression">
+                <expand macro="set_ids"/>
+            </when>
+            <when value="plot_cluster_similarity">
+                <expand macro="set_ids"/>
+                <param name="ns_start" type="integer" value="5" min="1" label="The minimum number of Leiden clusters" help="Clusterings are calculated with different Leiden resolutions to generate clusterings of n = ns_start to ns_end clusters."/>
+                <param name="ns_end" type="integer" value="21" min="1" label="The maximum number of Leiden clusters" help="Clusterings are calculated with different Leiden resolutions to generate clusterings of n = ns_start to ns_end clusters."/>
+                <repeat name="series_auc_borders" title="Calculates nmi AUCs over given borders">
+                    <param name="auc_borders_start" type="integer" value="15" min="1" label="Calculates nmi over n ranges auc_borders_start to auc_borders_end" help="Defined border shouldn't exceed values in nmis."/>
+                    <param name="auc_borders_end" type="integer" value="20" min="1" label="Calculates nmi over n ranges auc_borders_start to auc_borders_end" help="Defined border shouldn't exceed values in nmis."/>
+                </repeat>
+            </when>
+            <when value="plot_knn_overlap">
+                <expand macro="set_ids"/>
+                <repeat name="knn" title="Calculate knn graphs for each k">
+                    <param name="k" type="integer" value="10" min="1" label="Includes nearest neighbors for all k"/>
+                </repeat>
+            </when>
+            <when value="plot_marker_corr">
+                <expand macro="param_markerset"/>
+                <expand macro="set_ids"/>
+                <param name="per_celltype" type="boolean" truevalue="True" falsevalue="False" label="Whether to return columns with per cell type max correlations" checked="true"/>
+                <param name="per_marker" type="boolean" truevalue="True" falsevalue="False" label="Whether to return columns with per marker max correlations" checked="true"/>
+                <conditional name="select_per_celltype_min_mean">
+                    <param name="use_per_celltype_min_mean" type="select" label="Add a column for correlation per cell type that only takes into account markers with mean expression > per_celltype_min_mean">
+                        <option value="False">No</option>
+                        <option value="True">Yes</option>
+                    </param>
+                    <when value="True">
+                        <param name="per_celltype_min_mean" type="float" value="0.0" min="0.0" label="Set per_celltype_min_mean"/>
+                    </when>
+                    <when value="False"/>
+                </conditional>
+                <conditional name="select_per_marker_min_mean">
+                    <param name="use_per_marker_min_mean" type="select" label="Add a column for correlation per marker that only takes into account markers with mean expression > per_marker_min_mean">
+                        <option value="False">No</option>
+                        <option value="True">Yes</option>
+                    </param>
+                    <when value="True">
+                        <param name="per_marker_min_mean" type="float" value="0.0" min="0.0" label="Set per_marker_min_mean"/>
+                    </when>
+                    <when value="False"/>
+                </conditional>
+            </when>
+        </conditional>
+        <section name="figure_options" title="Figure Output Options" expanded="false">
+            <param argument="dpi" type="integer" value="300" min="1" label="Dpi of figures"/>
+            <param argument="fontsize" type="integer" value="100" min="1" label="Font size of figures"/>
+        </section>
+        <expand macro="inputs_common_advanced"/>
+    </inputs>
+    <outputs>
+        <data name="out_png" format="png" from_work_dir="*.png" label="PNG plot from ${tool.name} (${method.method}) on ${on_string}">
+            <filter>format == 'png'</filter>
+        </data>
+        <data name="out_pdf" format="pdf" from_work_dir="*.pdf" label="PDF plot from ${tool.name} (${method.method}) on ${on_string}">
+            <filter>format == 'pdf'</filter>
+        </data>
+        <data name="out_svg" format="svg" from_work_dir="*.svg" label="SVG plot from ${tool.name} (${method.method}) on ${on_string}">
+            <filter>format == 'svg'</filter>
+        </data>
+        <expand macro="hidden_outputs"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="30"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <param name="seeds" value="123,124"/>
+            <conditional name="method">
+                <param name="method" value="plot_summary"/>
+                <param name="set_ids" value="all"/>
+                <param name="forest_clfs_threshold" value="0.8"/>
+                <param name="gene_corr_threshold" value="0.8"/>
+                <param name="ns_start" value="5"/>
+                <param name="ns_end" value="21"/>
+                <repeat name="series_auc_borders">
+                    <param name="auc_borders_start" value="7"/>
+                    <param name="auc_borders_end" value="14"/>
+                </repeat>
+                <repeat name="series_auc_borders">
+                    <param name="auc_borders_start" value="15"/>
+                    <param name="auc_borders_end" value="20"/>
+                </repeat>
+                <repeat name="knn">
+                    <param name="k" value="5"/>
+                </repeat>
+                <repeat name="knn">
+                    <param name="k" value="10"/>
+                </repeat>
+                <conditional name="select_marker_corr">
+                    <param name="use_marker_corr" value="True"/>
+                    <param name="markerset" value="marker.tsv"/>
+                    <param name="header_markerset" value="not_included"/>
+                    <param name="per_celltype" value="True"/>
+                    <param name="per_marker" value="True"/>
+                    <conditional name="select_per_celltype_min_mean">
+                        <param name="use_per_celltype_min_mean" value="False"/>
+                    </conditional>
+                    <conditional name="select_per_marker_min_mean">
+                        <param name="use_per_marker_min_mean" value="True"/>
+                        <param name="per_marker_min_mean" value="0.025"/>
+                    </conditional>
+                </conditional>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="'per_celltype': True,"/>
+                    <has_text_matching expression="'per_marker': True,"/>
+                    <has_text_matching expression="'per_marker_min_mean': 0.025"/>
+                    <has_text_matching expression="evaluator.plot_summary"/>
+                    <has_text_matching expression="save='plot.png',"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="3253" delta="2"/>
+                    <has_image_height height="1446" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="30"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <param name="seeds" value="123,124"/>
+            <conditional name="method">
+                <param name="method" value="plot_confusion_matrix"/>
+                <param name="set_ids" value="all"/>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="'ct_key': 'celltype',"/>
+                    <has_text_matching expression="evaluator.plot_confusion_matrix"/>
+                    <has_text_matching expression="save='plot.png',"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="4560" delta="2"/>
+                    <has_image_height height="1859" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="30"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <param name="seeds" value="123,124"/>
+            <conditional name="method">
+                <param name="method" value="plot_coexpression"/>
+                <param name="set_ids" value="all"/>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="evaluator.plot_coexpression"/>
+                    <has_text_matching expression="save='plot.png',"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="5412" delta="2"/>
+                    <has_image_height height="3463" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="30"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <param name="seeds" value="123,124"/>
+            <conditional name="method">
+                <param name="method" value="plot_cluster_similarity"/>
+                <param name="set_ids" value="all"/>
+                <param name="ns_start" value="3"/>
+                <param name="ns_end" value="20"/>
+                <repeat name="series_auc_borders">
+                    <param name="auc_borders_start" value="7"/>
+                    <param name="auc_borders_end" value="14"/>
+                </repeat>
+                <repeat name="series_auc_borders">
+                    <param name="auc_borders_start" value="15"/>
+                    <param name="auc_borders_end" value="20"/>
+                </repeat>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="evaluator.plot_cluster_similarity"/>
+                    <has_text_matching expression="save='plot.png',"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="3223" delta="2"/>
+                    <has_image_height height="1406" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="30"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <param name="seeds" value="123,124"/>
+            <conditional name="method">
+                <param name="method" value="plot_knn_overlap"/>
+                <param name="set_ids" value="all"/>
+                <repeat name="knn">
+                    <param name="k" value="6"/>
+                </repeat>
+                <repeat name="knn">
+                    <param name="k" value="11"/>
+                </repeat>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="evaluator.plot_knn_overlap"/>
+                    <has_text_matching expression="save='plot.png',"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="3223" delta="2"/>
+                    <has_image_height height="1406" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+        <test expect_num_outputs="2">
+            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
+            <param name="format" value="png"/>
+            <param name="probeset" value="tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv"/>
+            <param name="header_probeset" value="included"/>
+            <param name="nreference" value="50"/>
+            <param name="methods_reference" value="random,PCA,DE,HVG"/>
+            <param name="genes_key" value="highly_variable"/>
+            <param name="obs_key" value="celltype"/>
+            <conditional name="method">
+                <param name="method" value="plot_marker_corr"/>
+                <param name="set_ids" value="all"/>
+                <param name="use_marker_corr" value="True"/>
+                <param name="markerset" value="marker.tsv"/>
+                <param name="header_markerset" value="not_included"/>
+                <param name="per_celltype" value="True"/>
+                <param name="per_marker" value="True"/>
+                <conditional name="select_per_celltype_min_mean">
+                    <param name="use_per_celltype_min_mean" value="False"/>
+                </conditional>
+                <conditional name="select_per_marker_min_mean">
+                    <param name="use_per_marker_min_mean" value="True"/>
+                    <param name="per_marker_min_mean" value="0.025"/>
+                </conditional>
+            </conditional>
+            <param name="show_log" value="true" />
+            <output name="hidden_output">
+                <assert_contents>
+                    <has_text_matching expression="'name': 'probeset',"/>
+                    <has_text_matching expression="'per_celltype': True,"/>
+                    <has_text_matching expression="'per_marker': True,"/>
+                    <has_text_matching expression="'per_marker_min_mean': 0.025"/>
+                    <has_text_matching expression="evaluator.plot_marker_corr"/>
+                    <has_text_matching expression="save='plot.png'"/>
+                </assert_contents>
+            </output>
+            <output name="out_png">
+                <assert_contents>
+                    <has_image_width width="5064" delta="2"/>
+                    <has_image_height height="4554" delta="2"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+Probe set evaluation for single-cell sequencing data using spapros.
+============================================================================================================
+
+Spapros is a python package that provides a pipeline for probe set selection and evaluation for targeted spatial transcriptomics data.
+
+Key Features:
+* Select probe sets for spatial transcriptomics which identify cell types of interest, capture general transcriptomic variation, and incorporate prior knowledge
+* Evaluate probe sets with an extensive pipeline
+
+Further documentation can be found here: https://spapros.readthedocs.io/en/latest/index.html.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,96 @@
+<macros>
+    <token name="@TOOL_VERSION@">0.1.5</token> <!-- spapros release wrapped by the tool; also used as the version of the package requirement -->
+    <token name="@VERSION_SUFFIX@">0</token> <!-- wrapper revision, appended to the tool version after +galaxy -->
+    <token name="@profile@">22.05</token> <!-- value of the tool's profile attribute -->
+    <xml name="requirements"> <!-- Package requirements: spapros pinned to the tool version; the yield element is replaced by the body of the expanding tool's expand tag -->
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">spapros</requirement>
+            <yield />
+        </requirements>
+    </xml>
+    <xml name="creators"> <!-- Creator metadata: the organization credited for this wrapper -->
+        <creator>
+            <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/" />
+        </creator>
+    </xml>
+    <xml name="citations"> <!-- DOI citations attached to the tool -->
+        <citations>
+            <citation type="doi">10.1101/2022.08.16.504115</citation>
+            <citation type="doi">10.1093/gigascience/giaa102</citation>
+        </citations>
+    </xml>
+    <xml name="version_command"> <!-- Reports the installed spapros version by printing spapros.__version__ -->
+        <version_command><![CDATA[python -c "import spapros;print('%s' % spapros.__version__ )"]]></version_command>
+    </xml>
+    <token name="@CMD@"><![CDATA[
+cp '$adata' 'anndata.h5ad' &&
+cat '$script_file' > '$hidden_output' &&
+python '$script_file' >> '$hidden_output' &&
+ls . >> '$hidden_output'
+    ]]>
+    </token> <!-- Command token: copies the input dataset to anndata.h5ad in the working directory, writes the rendered script into the log output, runs the script with its stdout appended to the log, then appends a listing of the working directory -->
+    <token name="@CMD_imports@"><![CDATA[
+import spapros as sp
+import os
+import pandas as pd
+import scanpy as sc
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+import random
+    ]]>
+    </token> <!-- Import header placed at the top of the generated Python script; provides the aliases sp, pd, sc, mpl and plt used by the script templates -->
+    <token name="@CMD_plot@"><![CDATA[
+#if $method.set_ids != 'all' and $method.set_ids != '':
+set_ids=[s.strip() for s in '$method.set_ids'.split(',') if s.strip()],
+#end if
+save='plot.$format',
+show=False
+    ]]>
+    </token> <!-- Keyword arguments for the evaluator plot call: set_ids is emitted only when the text field is neither 'all' nor empty, and the comma separated text is split into a list of Python strings (the bare names would otherwise raise NameError); the figure is saved as plot.FORMAT and not shown interactively -->
+    <xml name="set_ids"> <!-- Optional text field naming the probeset ids to plot; the plot token omits set_ids when the value is 'all' or empty -->
+        <param name="set_ids" type="text" value="all" optional="true" label="List of probeset ids (comma separated, e.g., DE,HVG,random)" help="Keep 'all' or leave it empty to select all probeset ids."/>
+    </xml>
+    <xml name="sanitize_query" token_validinitial="string.printable"> <!-- Sanitizer for free-text values: allows the character set named by token_validinitial (default string.printable) except the single quote; presumably because such values are placed inside single-quoted strings in the generated script (verify against the callers) -->
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <remove value="&apos;" />
+            </valid>
+       </sanitizer>
+    </xml>
+    <xml name="sanitize_vectors" token_validinitial="string.digits"> <!-- Sanitizer for comma separated numeric lists: allows the character set named by token_validinitial (default string.digits) plus the comma -->
+        <sanitizer>
+            <valid initial="@VALIDINITIAL@">
+                <add value=","/>
+            </valid>
+        </sanitizer>
+    </xml>
+    <xml name="inputs_anndata"> <!-- Input dataset in h5ad format, available to the command and script templates as adata -->
+        <param name="adata" type="data" format="h5ad" label="Annotated data matrix"/>
+    </xml>
+    <token name="@CMD_read_inputs@"><![CDATA[
+adata = sc.read_h5ad('anndata.h5ad')
+]]>
+    </token> <!-- Script snippet that loads anndata.h5ad (the copy created by the command token) into the variable adata -->
+    <xml name="inputs_common_advanced"> <!-- Boolean toggle show_log, unchecked by default; the log output's filter refers to it -->
+        <param name="show_log" type="boolean" checked="false" label="Output Log?" />
+    </xml>
+    <xml name="param_plot_format"> <!-- Select parameter 'format' offering png, pdf and svg; its value becomes the file extension in the plot token's save argument -->
+        <param name="format" type="select" label="Format for saving figures">
+            <option value="png">png</option>
+            <option value="pdf">pdf</option>
+            <option value="svg">svg</option>
+        </param>
+    </xml>
+    <xml name="param_markerset"> <!-- Marker list input: a tabular dataset plus a select stating whether the file carries a header row -->
+        <param name="markerset" type="data" format="tabular" label="Markerset tabular file with rows=conditions (e.g., celltypes) and columns=features (e.g., genes)" help="This is being used to calculate the correlations between your probeset features and marker features. Marker features are for example genes that you know are important for your condition (e.g., celltypes)."/>
+        <param name="header_markerset" type="select" optional="false" label="Header in the list of markers?">
+            <option value="included">Header included</option>
+            <option value="not_included">Header not included</option>
+        </param>
+    </xml>
+    <xml name="hidden_outputs"> <!-- Log dataset hidden_output (txt); the filter keeps it only when show_log is true -->
+        <data name="hidden_output" format="txt" label="Log file" >
+            <filter>show_log</filter>
+        </data>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,1 @@
+CD14+ Monocyte	PILRA	PSAP	CD68	TMEM176B
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker_out_test1.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,11 @@
+	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30
+CD14+ Monocyte	FTL	FCER1G
+CD19+ B	ISG20	LY86
+CD34+	PRSS57	SNHG7
+CD4+/CD25 T Reg	SIT1	IL32
+CD4+/CD45RO+ Memory	GZMK	IL32	CD3E
+CD56+ NK	CD7	GNLY
+CD8+ Cytotoxic T	CCL5	S100A4	NKG7
+CD8+/CD45RA+ Naive Cytotoxic	CD7	CD8A	CD8B	AES
+Dendritic	CD74	LYZ
+Unkown	LTB	CD247	SERPINB1	RPLP1	TNFRSF13B	BLK	SPON2	TPD52	RNF138	NUCB2	CD27	AMICA1	BTG1	CD63	HOPX	PTPRCAP	CPVL	JUN	RAB3IP	SPOCK2	PRF1	GZMA	STK17A	RPL3	GYPC	SOX4	GZMH	LINC00402	C9orf142	VIMP	DENND2D
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker_out_test2.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,10 @@
+	0	1	2
+CD14+ Monocyte	FTL
+CD19+ B	ISG20
+CD34+	PRSS57
+CD4+/CD25 T Reg	SIT1	IL32
+CD4+/CD45RO+ Memory	GZMK	IL32	CD3E
+CD56+ NK	CD7
+CD8+ Cytotoxic T	CCL5
+CD8+/CD45RA+ Naive Cytotoxic	CD7
+Dendritic	CD74
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker_out_test3.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,4 @@
+	0	1
+CD34+	PRSS57	SNHG7
+CD56+ NK	CD7	CTSW
+Unkown	RPL3
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker_out_test4.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,11 @@
+	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30
+CD14+ Monocyte	FTL	FCER1G
+CD19+ B	ISG20	CD79A
+CD34+	PRSS57	SNHG7
+CD4+/CD25 T Reg	SIT1	IL32
+CD4+/CD45RO+ Memory	GZMK	IL32	CD3E
+CD56+ NK	CD7	GNLY
+CD8+ Cytotoxic T	CCL5	S100A4	NKG7
+CD8+/CD45RA+ Naive Cytotoxic	CD7	CD8A	CD8B	AES
+Dendritic	CD74	CST3
+Unkown	LTB	CD247	SERPINB1	RPLP1	TNFRSF13B	BLK	SPON2	TPD52	RNF138	NUCB2	CD27	AMICA1	BTG1	CD63	HOPX	PTPRCAP	CPVL	JUN	RAB3IP	SPOCK2	PRF1	GZMA	STK17A	RPL3	GYPC	SOX4	GZMH	LINC00402	C9orf142	VIMP	DENND2D
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/marker_out_test5.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,11 @@
+	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30
+CD14+ Monocyte	FTL	FCER1G
+CD19+ B	ISG20	LY86
+CD34+	PRSS57	SNHG7
+CD4+/CD25 T Reg	SIT1	IL32
+CD4+/CD45RO+ Memory	GZMK	IL32	CD3E
+CD56+ NK	CD7	GNLY
+CD8+ Cytotoxic T	CCL5	S100A4	NKG7
+CD8+/CD45RA+ Naive Cytotoxic	CD7	CD8A	CD8B	AES
+Dendritic	CD74	LYZ
+Unkown	LTB	CD247	SERPINB1	RPLP1	TNFRSF13B	BLK	SPON2	TPD52	RNF138	NUCB2	CD27	AMICA1	BTG1	CD63	HOPX	PTPRCAP	CPVL	JUN	RAB3IP	SPOCK2	PRF1	GZMA	STK17A	RPL3	GYPC	SOX4	GZMH	LINC00402	C9orf142	VIMP	DENND2D
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/tl.rank_genes_groups.newton-cg.pbmc68k_highly_reduced_marker_filtered_1.tsv	Mon Sep 16 11:37:34 2024 +0000
@@ -0,0 +1,11 @@
+	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31	32	33	34	35	36	37	38	39	40	41	42	43	44	45	46	47	48	49
+CD14+ Monocyte	PILRA	PSAP	CD68	TMEM176B	FTL	NPC2	LST1	FCGR3A	FCER1G	CEBPB	FCN1	SERPINA1	OAZ1	CFD	FTH1	HCK	AIF1	SAT1	CTSS	S100A11	MS4A7	TYROBP	COTL1	STXBP2	RP11-290F20.3	S100A4	IFITM2	SPI1	DUSP1	SESN2	IFITM3	MPP1	GALE	CORO1B	RP11-390E23.6	VIMP	RSBN1L-AS1	CHD4	CFP	GSTP1	PFN1	FCGRT	ADTRP	ARHGDIB	AMICA1	HLA-DRB5	CST3	GRN	HLA-DPA1	SSR3
+CD19+ B	TNFRSF13B	CD79B	SMARCB1	PNOC	CCDC50	AL928768.3	BANK1	MS4A1	CD79A	ISG20	IGLL5	TNFRSF17	KIAA0125	TPD52	PEBP1	FKBP11	CCDC132	SUB1	POU2AF1	MZB1	PTPRCAP	UBE2J1	BLK	SPIB	DERL3	FAM63B	MPHOSPH9	IGJ	FCRLA	XBP1	NCF1	SSR3	CD52	TSHZ2	PDLIM1	VIMP	SSR4	S1PR4	SELL	HMGA1	NUCB2	JUN	CD27	ARHGDIB	GYPC	CALR	ADTRP	BTG1	EXOG	RARRES3
+CD34+	PRSS57	C19orf77	SPINK2	RP11-620J15.3	SNHG7	CYTL1	EGFL7	NGFRAP1	SOX4	NFE2	EGR1	RP3-467N11.1	H1FX	CDK6	SERPINB1	SPINT2	HMGA1	IL1B	NUCB2	RPLP0	IGFBP7	RPLP1	ATXN7L3B	RPS3	C1orf228	KIAA0125	RPL3	SYPL1	CD63	LDHB	SEPT1	JUN	FAM101B	PRKCQ-AS1	MATK	PEBP1	SELL	ITM2A	SSR3	SPON2	XBP1	UBE2J1	VIMP	GYPC	STK17A	STMN1	VIM	MZB1	HOPX	CD99
+CD4+/CD25 T Reg	IL32	SPOCK2	ACTG1	CD2	CD3D	GPR171	ARHGDIB	ACOX1	MAL	SIT1	GIMAP4	AES	CD52	SEPT1	TMSB10	LAT	STMN1	LINC00402	CD27	TSHZ2	S1PR4	CD3E	PFN1	CD99	AQP3	PTPRCAP	CD3G	LY9	LCK	CD247	S100A4	CCR7	TTC39C	CORO1B	MPHOSPH9	FYB	RPSA	FLT3LG	B2M	GIMAP7	PRKCQ-AS1	SELL	BTG1	CCDC132	GYPC	DENND2D	LDHB	IL7R	ITM2A	RPLP0
+CD4+/CD45RA+/CD25- Naive T	EAF2	GNG7	SSR4	CALR	DERL3	MANF	IGJ	XBP1	ATXN7L3B	SSR3	UBE2J1	CD79A	MZB1	RP3-467N11.1	TNFRSF17	NCF1	CDK6	SUB1	POU2AF1	AL928768.3	FKBP11	VIMP	GYPC	JUN	CD27	PEBP1	SMARCB1	FLT3LG	RPLP1	RPLP0	CCDC50	ISG20	IGLL5	HCST	GSTP1	GPX1	CD52	VIM	PTPRCAP	FCGRT	CD74	B2M	RPL3	CYTL1	SPINK2	PRSS57	C19orf77	RP11-620J15.3	FAM101B	CCDC132
+CD4+/CD45RO+ Memory	RNF138	NOSIP	IFITM1	LCK	RARRES3	ALOX5AP	FAM63B	RAB3IP	GZMK	CD3G	SEPT1	LDHB	SELL	CD3D	EXOG	RPSA	CD247	AES	CD52	TMSB10	NUCB2	DENND2D	RPL3	RPLP1	ACTG1	FYB	GIMAP7	CORO1B	LY9	CD7	PFN1	RPS3	GYPC	CD2	ARHGDIB	IL32	RPLP0	CD99	CD3E	GIMAP4	HCST	B2M	LAT	ISG20	ITM2A	FKBP11	SERPINB1	STK17A	CCR7	PTPRCAP
+CD56+ NK	CST7	SPON2	HOPX	GNLY	NKG7	CTSW	KLRC2	CD7	MATK	PCIF1	CLIC3	FGFBP2	SYPL1	GZMB	C9orf142	PRF1	CD247	HCST	GZMA	GZMH	STMN1	ALOX5AP	CD63	CD99	IGFBP7	GZMM	CCL5	B2M	DENND2D	GIMAP7	RARRES3	SIT1	IFITM1	PFN1	EXOG	XBP1	IFITM2	GIMAP4	VIMP	STK17A	LCK	GZMK	SEPT1	SSR3	CD8A	CD3G	SPOCK2	RPS3	LDHB	IL32
+CD8+ Cytotoxic T	FAM101B	ADTRP	GZMK	HCST	LAT	EGR1	CD8B	CCL5	RPL3	LINC00402	FGFBP2	GZMM	RPS3	CD3E	GYPC	DENND2D	C9orf142	GZMA	SEPT1	JUN	FYB	CD8A	SELL	ALOX5AP	CD3G	STK17A	AQP3	C1orf228	CD3D	HOPX	NKG7	CD2	NGFRAP1	RPLP1	RPSA	CCR7	IL7R	SPON2	PRF1	RARRES3	PRKCQ-AS1	FKBP11	MANF	CTSW	GNLY	CD27	LDHB	MAL	LTB	RPLP0
+CD8+/CD45RA+ Naive Cytotoxic	RP11-291B21.2	CD8A	CD8B	RSBN1L-AS1	GIMAP5	GZMM	GALE	CCR7	STK17A	RAB3IP	GZMH	GIMAP7	CD3E	C1orf228	LCK	CCL5	PEBP1	CD27	GYPC	LDHB	RNF34	CD99	CD3G	PFN1	IL7R	CD2	C9orf142	TMSB10	NGFRAP1	S1PR4	ITM2A	CD7	RPS3	IL32	FYB	IFITM1	CD52	LAT	GIMAP4	MAL	STMN1	NOSIP	RARRES3	SPOCK2	ACTG1	PRF1	CD3D	RPLP1	SELL	GZMA
+Dendritic	HLA-DQB1	CST3	HLA-DRB1	HLA-DQA2	HLA-DQA1	LYZ	HLA-DPB1	HLA-DPA1	HLA-DMA	HLA-DRA	VIM	CD74	ALDH2	FCER1A	GPX1	HLA-DRB5	LGALS2	MNDA	FCGRT	GRN	HLA-DMB	FOS	CPVL	CLEC10A	AMICA1	CFP	LY86	GSTP1	RP11-473M20.7	IL1B	GSN	SPINT2	CCDC163P	IGFBP7	EXOG	DUSP1	CD63	COTL1	FTH1	SPI1	TYROBP	SPIB	S100A11	OAZ1	CTSS	CCDC50	AIF1	SERPINB1	TMSB10	PCIF1
Binary file test-data/tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad has changed