Mercurial > repos > iuc > spapros_selection
view selection.xml @ 0:c7527212b66b draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/spapros/ commit aed7fe13fa0ed09d77a31eeecaf3ec3fba7eed3b
author | iuc |
---|---|
date | Mon, 16 Sep 2024 11:37:46 +0000 |
parents | |
children |
line wrap: on
line source
<tool id="spapros_selection" name="Selection" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@profile@">
    <description>of marker genes with spapros</description>
    <!--
        Galaxy wrapper that builds a spapros ProbesetSelector from the user inputs,
        runs the probe set selection, draws a masked dotplot and a gene overlap plot,
        and writes the selected genes grouped by cell type to marker.tsv.
        Shared tokens and macros are imported from macros.xml.
    -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements">
    </expand>
    <expand macro="version_command"/>
    <command detect_errors="exit_code"><![CDATA[
@CMD@
    ]]></command>
    <configfiles>
        <configfile name="script_file"><![CDATA[
@CMD_imports@
@CMD_read_inputs@

random.seed($seed)

mpl.rcParams['figure.dpi'] = $general_figure_options.dpi
plt.rcParams["font.size"] = $general_figure_options.fontsize

## pandas infers the header row unless the user states the marker table has none.
header_markerset='infer'
#if $cond_markerset.select_markerset == 'True' and $cond_markerset.header_markerset == 'not_included':
header_markerset=None
#end if

## Every optional keyword argument below ends with a comma so that any
## combination of enabled options still renders a valid call.
selector = sp.se.ProbesetSelector(
    adata,
    celltype_key='$celltype_key',
    #if $genes_key != '':
    genes_key='$genes_key',
    #else:
    genes_key=None,
    #end if
    #if $cond_n.select_n == 'True':
    n=$cond_n.n,
    #else:
    n=None,
    #end if
    #if $preselected_genes != ''
    preselected_genes = '$preselected_genes'.split(','),
    #end if
    #if $prior_genes != ''
    prior_genes = '$prior_genes'.split(','),
    #end if
    #if $cond_n_pca_genes.select_n_pca_genes == 'True':
    n_pca_genes=$cond_n_pca_genes.n_pca_genes,
    #end if
    #if $min_mean_difference != 0.0:
    min_mean_difference=$min_mean_difference,
    #end if
    n_min_markers=$n_min_markers,
    #if $celltypes != 'all':
    celltypes='$celltypes'.split(','),
    #else
    celltypes='$celltypes',
    #end if
    #if $cond_markerset.select_markerset == 'True':
    ## One row per cell type (index column); the remaining non-empty cells are its marker genes.
    marker_list = {key: [v for v in list(value.values()) if pd.notna(v)] for key, value in pd.read_csv('$cond_markerset.markerset', sep='\t', index_col=0, header=header_markerset).to_dict(orient='index').items()},
    #end if
    n_list_markers=$n_list_markers,
    marker_corr_th=$marker_corr_th,
    #if $pca_penalties != ''
    pca_penalties = '$pca_penalties'.split(','),
    #end if
    #if $DE_penalties != ''
    DE_penalties = '$DE_penalties'.split(','),
    #end if
    #if $m_penalties_adata_celltypes != ''
    m_penalties_adata_celltypes = '$m_penalties_adata_celltypes'.split(','),
    #end if
    #if $m_penalties_list_celltypes != ''
    m_penalties_list_celltypes = '$m_penalties_list_celltypes'.split(','),
    #end if
    #if $advanced_options.cond_DE_selection_hparams.select_DE_selection_hparams == 'True':
    ## The trailing comma is required: further keyword arguments always follow.
    DE_selection_hparams={"n": $advanced_options.cond_DE_selection_hparams.n_DE_selection_hparams, "per_group": $advanced_options.cond_DE_selection_hparams.per_group},
    #end if
    #if $advanced_options.cond_forest_hparams.select_forest_hparams == 'True':
    forest_hparams={"n_trees": $advanced_options.cond_forest_hparams.n_trees, "subsample": $advanced_options.cond_forest_hparams.subsample, "test_subsample": $advanced_options.cond_forest_hparams.test_subsample},
    #end if
    #if $advanced_options.cond_forest_DE_baseline_hparams.select_forest_DE_baseline_hparams == 'True':
    forest_DE_baseline_hparams={
        "n_DE": $advanced_options.cond_forest_DE_baseline_hparams.n_DE,
        "min_score": $advanced_options.cond_forest_DE_baseline_hparams.min_score,
        "n_stds": $advanced_options.cond_forest_DE_baseline_hparams.n_stds,
        "max_step": $advanced_options.cond_forest_DE_baseline_hparams.max_step,
        "min_outlier_dif": $advanced_options.cond_forest_DE_baseline_hparams.min_outlier_dif,
        "n_terminal_repeats": $advanced_options.cond_forest_DE_baseline_hparams.n_terminal_repeats,
    },
    #end if
    #if $advanced_options.cond_add_forest_genes_hparams.select_add_forest_genes_hparams == 'True':
    add_forest_genes_hparams={"n_max_per_it": $advanced_options.cond_add_forest_genes_hparams.n_max_per_it, "performance_th": $advanced_options.cond_add_forest_genes_hparams.performance_th, "importance_th": $advanced_options.cond_add_forest_genes_hparams.importance_th},
    #end if
    #if $advanced_options.cond_marker_selection_hparams.select_marker_selection_hparams == 'True':
    marker_selection_hparams={"penalty_threshold": $advanced_options.cond_marker_selection_hparams.penalty_threshold},
    #end if
    verbosity=0,
    seed=$seed,
    n_jobs=int(os.environ.get('GALAXY_SLOTS', '4'))
)

selector.select_probeset()

sp.pl.masked_dotplot(
    adata,
    selector,
    ct_key='$figure_options_masked_dotplot.ct_key',
    imp_threshold=$figure_options_masked_dotplot.imp_threshold,
    #if $figure_options_masked_dotplot.celltypes != '':
    ## Passed as a list, consistent with the celltypes handling of the selector above.
    celltypes='$figure_options_masked_dotplot.celltypes'.split(','),
    #end if
    #if $figure_options_masked_dotplot.n_genes != 0:
    n_genes=$figure_options_masked_dotplot.n_genes,
    #end if
    #if $figure_options_masked_dotplot.comb_markers_only:
    comb_markers_only=True,
    #end if
    #if $figure_options_masked_dotplot.markers_only:
    markers_only=True,
    #end if
    cmap='$figure_options_masked_dotplot.cmap',
    comb_marker_color='$figure_options_masked_dotplot.comb_marker_color',
    marker_color='$figure_options_masked_dotplot.marker_color',
    non_adata_celltypes_color='$figure_options_masked_dotplot.non_adata_celltypes_color',
    use_raw=$figure_options_masked_dotplot.use_raw,
    save='masked_dotplot.$format',
)

selector.plot_gene_overlap(
    style='$figure_options_plot_gene_overlap.style',
    save='gene_overlap.$format',
    show=False
)

## Collect, for every selected gene, the cell types it was associated with
## in any of the four comma-separated probeset columns read below.
probe_genes = selector.probeset.index[selector.probeset.selection]
celltypes_DE_1vsall = list(selector.probeset[selector.probeset.selection]['celltypes_DE_1vsall'])
celltypes_DE_specific = list(selector.probeset[selector.probeset.selection]['celltypes_DE_specific'])
celltypes_DE = list(selector.probeset[selector.probeset.selection]['celltypes_DE'])
celltypes_marker = list(selector.probeset[selector.probeset.selection]['celltypes_marker'])

marker_dict = dict()
for i,g in enumerate(probe_genes):
    recognized_celltypes = list(set(
        celltypes_DE_1vsall[i].split(',')
        + celltypes_DE_specific[i].split(',')
        + celltypes_DE[i].split(',')
        + celltypes_marker[i].split(',')
    ))
    ## Drop the empty entry only when the gene has at least one real cell type.
    if (len(recognized_celltypes) > 1 and '' in recognized_celltypes):
        recognized_celltypes.remove('')
    for c in recognized_celltypes:
        if c == '':
            ## NOTE(review): spelling kept as is; expected test outputs may contain this exact row name.
            c = 'Unkown'
        marker_dict.setdefault(c, []).append(g)

# Find the maximum length of lists
## default=0 keeps the script from failing when no gene was selected at all.
max_len = max((len(lst) for lst in marker_dict.values()), default=0)

sorted_marker_dict_by_keys = {key: marker_dict[key] for key in sorted(marker_dict.keys())}

# Fill smaller lists with empty values
for key, value in sorted_marker_dict_by_keys.items():
    sorted_marker_dict_by_keys[key] = value + [''] * (max_len - len(value))

df = pd.DataFrame(sorted_marker_dict_by_keys).T
df.to_csv('marker.tsv', sep='\t', index=True)
        ]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="inputs_anndata"/>
        <expand macro="param_plot_format"/>
        <param name="celltype_key" type="text" value="celltype" optional="false" label="Key in adata.obs with celltype annotations"/>
        <param name="genes_key" type="text" optional="true" label="Key in adata.var for preselected genes" help="This is typically highly_variable_genes. Leave empty to not subset genes."/>
        <conditional name="cond_n">
            <param name="select_n" type="select" label="Do you want to set the number of finally selected genes?" help="Note that when `No` we automatically infer n as the minimal number of recommended genes.">
                <option value="False">No</option>
                <option value="True">Yes</option>
            </param>
            <when value="True">
                <param argument="n" type="integer" value="20" min="1" optional="false" label="Number of finally selected genes" help="Setting n might change the gene ranking since the final added list_markers are added based on the theoretically added genes without list_markers."/>
            </when>
            <when value="False"/>
        </conditional>
        <param name="preselected_genes" type="text" optional="true" label="Pre selected genes (comma separated)" help="These will also have the highest ranking in the final list."/>
        <param name="prior_genes" type="text" optional="true" label="Prioritized genes (comma separated)"/>
        <conditional name="cond_n_pca_genes">
            <param name="select_n_pca_genes" type="select" label="Do you want to set the number of preselected pca genes?" help="If not, then this step will be skipped.">
                <option value="True">Yes</option>
                <option value="False">No</option>
            </param>
            <when value="True">
                <param argument="n_pca_genes" type="integer" value="100" min="1" optional="false" label="Number of preselected pca genes"/>
            </when>
            <when value="False"/>
        </conditional>
        <param argument="min_mean_difference" type="float" value="0.0" optional="false" label="Minimal difference of mean expression between at least one celltype and the background" help="This minimal difference is applied as an additional binary penalty in pca_penalties, DE_penalties and m_penalties_adata_celltypes."/>
        <param argument="n_min_markers" type="integer" value="2" min="1" optional="false" label="The minimal number of identified and added markers"/>
        <param name="celltypes" type="text" value="all" optional="false" label="Cell types for which trees are trained" help="If not `all`, separate the cell types with commas (e.g., Glia,Neuron)"/>
        <conditional name="cond_markerset">
            <param name="select_markerset" type="select" label="Do you want to provide a set of marker genes?">
                <option value="False">No</option>
                <option value="True">Yes</option>
            </param>
            <when value="True">
                <param name="markerset" type="data" format="tabular" label="Markerset tabular file with rows=conditions (e.g., celltypes) and column=features (e.g., genes)"/>
                <param name="header_markerset" type="select" optional="false" label="Header in the list of markers?">
                    <option value="included">Header included</option>
                    <option value="not_included">Header not included</option>
                </param>
            </when>
            <when value="False"/>
        </conditional>
        <param argument="n_list_markers" type="integer" value="2" min="1" optional="false" label="Minimal number of markers per celltype that are at least selected" help="Selected means either selecting genes from the marker list or having correlated genes in the already selected panel."/>
        <param argument="marker_corr_th" type="float" value="0.5" optional="false" label="Minimal correlation to consider a gene as captured"/>
        <param name="pca_penalties" type="text" optional="true" label="List of keys for columns in adata.var containing penalty factors that are multiplied with the scores for PCA based gene selection" help="(comma separated)"/>
        <param name="DE_penalties" type="text" optional="true" label="List of keys for columns in adata.var containing penalty factors that are multiplied with the scores for DE based gene selection" help="(comma separated)"/>
        <param name="m_penalties_adata_celltypes" type="text" optional="true" label="List of keys for columns in adata.var containing penalty factors to filter out marker genes if a gene's penalty &lt; threshold for celltypes in adata" help="(comma separated)"/>
        <param name="m_penalties_list_celltypes" type="text" optional="true" label="List of keys for columns in adata.var containing penalty factors to filter out marker genes if a gene's penalty &lt; threshold for celltypes **not** in adata" help="(comma separated)"/>
        <param argument="seed" type="integer" value="123" min="0" optional="false" label="Random number seed"/>
        <section name="advanced_options" title="Advanced Options" expanded="false">
            <conditional name="cond_DE_selection_hparams">
                <param name="select_DE_selection_hparams" type="select" label="Do you want to tune hyperparameters for the DE based gene selection?">
                    <option value="False">No</option>
                    <option value="True">Yes</option>
                </param>
                <when value="True">
                    <param argument="n_DE_selection_hparams" type="integer" value="3" optional="false" label="n"/>
                    <param name="per_group" type="select" label="per_group">
                        <option value="False">No</option>
                        <option value="True">Yes</option>
                    </param>
                </when>
                <when value="False"/>
            </conditional>
            <conditional name="cond_forest_hparams">
                <param name="select_forest_hparams" type="select" label="Do you want to tune hyperparameters for the forest based gene selection?">
                    <option value="False">No</option>
                    <option value="True">Yes</option>
                </param>
                <when value="True">
                    <param argument="n_trees" type="integer" value="50" optional="false" label="n_trees"/>
                    <param argument="subsample" type="integer" value="1000" optional="false" label="subsample"/>
                    <param argument="test_subsample" type="integer" value="3000" optional="false" label="test_subsample"/>
                </when>
                <when value="False"/>
            </conditional>
            <conditional name="cond_forest_DE_baseline_hparams">
                <!-- Label distinguishes this select from the DE based gene selection select above. -->
                <param name="select_forest_DE_baseline_hparams" type="select" label="Do you want to tune hyperparameters for adding DE genes to decision trees?">
                    <option value="False">No</option>
                    <option value="True">Yes</option>
                </param>
                <when value="True">
                    <param argument="n_DE" type="integer" value="1" optional="false" label="n_DE"/>
                    <param argument="min_score" type="float" value="0.9" optional="false" label="min_score"/>
                    <param argument="n_stds" type="float" value="1.0" optional="false" label="n_stds"/>
                    <param argument="max_step" type="integer" value="3" optional="false" label="max_step"/>
                    <param argument="min_outlier_dif" type="float" value="0.02" optional="false" label="min_outlier_dif"/>
                    <param argument="n_terminal_repeats" type="integer" value="3" optional="false" label="n_terminal_repeats"/>
                </when>
                <when value="False"/>
            </conditional>
            <conditional name="cond_add_forest_genes_hparams">
                <param name="select_add_forest_genes_hparams" type="select" label="Do you want to tune hyperparameters for adding marker genes to decision trees?">
                    <option value="False">No</option>
                    <option value="True">Yes</option>
                </param>
                <when value="True">
                    <param argument="n_max_per_it" type="integer" value="5" optional="false" label="n_max_per_it"/>
                    <param argument="performance_th" type="float" value="0.02" optional="false" label="performance_th"/>
                    <param argument="importance_th" type="integer" value="0" optional="false" label="importance_th"/>
                </when>
                <when value="False"/>
            </conditional>
            <conditional name="cond_marker_selection_hparams">
                <param name="select_marker_selection_hparams" type="select" label="Do you want to tune marker selection hyperparameters?">
                    <option value="False">No</option>
                    <option value="True">Yes</option>
                </param>
                <when value="True">
                    <param argument="penalty_threshold" type="integer" value="1" optional="false" label="penalty_threshold"/>
                </when>
                <when value="False"/>
            </conditional>
        </section>
        <section name="general_figure_options" title="General Figure Output Options" expanded="false">
            <param argument="dpi" type="integer" value="300" min="1" label="Dpi of figures"/>
            <param argument="fontsize" type="integer" value="100" min="1" label="Font size of figures"/>
        </section>
        <section name="figure_options_masked_dotplot" title="Figure Output Options for masked_dotplot" expanded="false">
            <param name="ct_key" type="text" value="celltype" optional="false" label="Key in adata.obs with celltype annotations" help="Column of adata.obs with cell type annotation"/>
            <param argument="imp_threshold" type="float" value="0.05" min="0.0" optional="false" label="Annotate genes as Spapros marker only for those genes with importance &gt; imp_threshold"/>
            <param name="celltypes" type="text" optional="true" label="Subset of celltypes (rows of dotplot)" help="(comma separated)"/>
            <param argument="n_genes" type="integer" value="0" min="0" label="Plot top n_genes genes." help="If 0 then all."/>
            <param name="comb_markers_only" type="boolean" value="false" label="Do you want to plot only genes that are Spapros markers for the plotted cell types?"/>
            <param name="markers_only" type="boolean" value="false" label="Do you want to plot only genes that are markers for the plotted cell types?"/>
            <param name="cmap" type="text" value="Reds" optional="false" label="Colormap of mean expressions"/>
            <param name="comb_marker_color" type="text" value="darkblue" optional="false" label="Color for Spapros markers"/>
            <param name="marker_color" type="text" value="blue" optional="false" label="Color for marker genes"/>
            <param name="non_adata_celltypes_color" type="text" value="grey" optional="false" label="Color for celltypes that don't occur in the data set."/>
            <param name="use_raw" type="select" label="Do you want to use adata.raw for plotting?">
                <option value="False">No</option>
                <option value="True">Yes</option>
            </param>
        </section>
        <section name="figure_options_plot_gene_overlap" title="Figure Output Options for plot_gene_overlap" expanded="false">
            <param name="style" type="select" label="Plot type">
                <option value="upset">Upset plot</option>
                <option value="venn">Venn diagram</option>
            </param>
        </section>
        <expand macro="inputs_common_advanced"/>
    </inputs>
    <outputs>
        <!-- Exactly one masked_dotplot and one gene_overlap output is kept, chosen by the selected plot format. -->
        <data name="out_masked_dotplot_png" format="png" from_work_dir="*masked_dotplot.png" label="PNG masked_dotplot from ${tool.name} on ${on_string}">
            <filter>format == 'png'</filter>
        </data>
        <data name="out_masked_dotplot_pdf" format="pdf" from_work_dir="*masked_dotplot.pdf" label="PDF masked_dotplot from ${tool.name} on ${on_string}">
            <filter>format == 'pdf'</filter>
        </data>
        <data name="out_masked_dotplot_svg" format="svg" from_work_dir="*masked_dotplot.svg" label="SVG masked_dotplot from ${tool.name} on ${on_string}">
            <filter>format == 'svg'</filter>
        </data>
        <data name="out_gene_overlap_png" format="png" from_work_dir="*gene_overlap.png" label="PNG gene_overlap from ${tool.name} on ${on_string}">
            <filter>format == 'png'</filter>
        </data>
        <data name="out_gene_overlap_pdf" format="pdf" from_work_dir="*gene_overlap.pdf" label="PDF gene_overlap from ${tool.name} on ${on_string}">
            <filter>format == 'pdf'</filter>
        </data>
        <data name="out_gene_overlap_svg" format="svg" from_work_dir="*gene_overlap.svg" label="SVG gene_overlap from ${tool.name} on ${on_string}">
            <filter>format == 'svg'</filter>
        </data>
        <data name="marker_out" format="tabular" from_work_dir="marker.tsv" label="${tool.name} on ${on_string}: Markers"/>
        <expand macro="hidden_outputs"/>
    </outputs>
    <tests>
        <!-- Test 1: default settings -->
        <test expect_num_outputs="4">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
            <param name="format" value="png"/>
            <param name="genes_key" value="highly_variable"/>
            <param name="show_log" value="true" />
            <section name="general_figure_options">
                <param name="dpi" value="100"/>
            </section>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="celltype_key='celltype',"/>
                    <has_text_matching expression="genes_key='highly_variable',"/>
                    <has_text_matching expression="n_pca_genes=100,"/>
                    <has_text_matching expression="seed=123,"/>
                    <has_text_matching expression="cmap='Reds',"/>
                    <has_text_matching expression="save='masked_dotplot.png',"/>
                    <has_text_matching expression="style='upset',"/>
                    <has_text_matching expression="save='gene_overlap.png',"/>
                </assert_contents>
            </output>
            <output name="out_masked_dotplot_png">
                <assert_contents>
                    <has_image_width width="4055" delta="2"/>
                    <has_image_height height="1108" delta="2"/>
                </assert_contents>
            </output>
            <output name="out_gene_overlap_png">
                <assert_contents>
                    <has_image_width width="1189" delta="2"/>
                    <has_image_height height="600" delta="2"/>
                </assert_contents>
            </output>
            <output name="marker_out" file="marker_out_test1.tsv" ftype="tabular"/>
        </test>
        <!-- Test 2: fixed number of selected genes -->
        <test expect_num_outputs="4">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
            <param name="format" value="png"/>
            <param name="genes_key" value="highly_variable"/>
            <param name="select_n" value="True"/>
            <param name="n" value="10"/>
            <section name="general_figure_options">
                <param name="dpi" value="100"/>
            </section>
            <param name="show_log" value="true" />
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="celltype_key='celltype',"/>
                    <has_text_matching expression="genes_key='highly_variable',"/>
                    <has_text_matching expression="n_pca_genes=100,"/>
                    <has_text_matching expression="n=10,"/>
                    <has_text_matching expression="seed=123,"/>
                    <has_text_matching expression="cmap='Reds',"/>
                    <has_text_matching expression="save='masked_dotplot.png',"/>
                    <has_text_matching expression="style='upset',"/>
                    <has_text_matching expression="save='gene_overlap.png',"/>
                </assert_contents>
            </output>
            <output name="out_masked_dotplot_png">
                <assert_contents>
                    <has_image_width width="2914" delta="2"/>
                    <has_image_height height="882" delta="2"/>
                </assert_contents>
            </output>
            <output name="out_gene_overlap_png">
                <assert_contents>
                    <has_image_width width="1032" delta="2"/>
                    <has_image_height height="600" delta="2"/>
                </assert_contents>
            </output>
            <output name="marker_out" file="marker_out_test2.tsv" ftype="tabular"/>
        </test>
        <!-- Test 3: subset of cell types -->
        <test expect_num_outputs="4">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
            <param name="format" value="png"/>
            <param name="genes_key" value="highly_variable"/>
            <param name="celltypes" value="CD34+,CD56+ NK"/>
            <section name="general_figure_options">
                <param name="dpi" value="100"/>
            </section>
            <param name="show_log" value="true" />
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="celltype_key='celltype',"/>
                    <has_text_matching expression="genes_key='highly_variable',"/>
                    <has_text_matching expression="n_pca_genes=100,"/>
                    <has_text_matching expression="seed=123,"/>
                    <has_text_matching expression="cmap='Reds',"/>
                    <has_text_matching expression="save='masked_dotplot.png',"/>
                    <has_text_matching expression="style='upset',"/>
                    <has_text_matching expression="save='gene_overlap.png',"/>
                </assert_contents>
            </output>
            <output name="out_masked_dotplot_png">
                <assert_contents>
                    <has_image_width width="2776" delta="2"/>
                    <has_image_height height="882" delta="2"/>
                </assert_contents>
            </output>
            <output name="out_gene_overlap_png">
                <assert_contents>
                    <has_image_width width="929" delta="2"/>
                    <has_image_height height="565" delta="2"/>
                </assert_contents>
            </output>
            <output name="marker_out" file="marker_out_test3.tsv" ftype="tabular"/>
        </test>
        <!-- Test 4: user supplied marker list without header -->
        <test expect_num_outputs="4">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
            <param name="format" value="png"/>
            <param name="genes_key" value="highly_variable"/>
            <param name="select_markerset" value="True"/>
            <param name="markerset" value="marker.tsv"/>
            <param name="header_markerset" value="not_included"/>
            <section name="general_figure_options">
                <param name="dpi" value="100"/>
            </section>
            <param name="show_log" value="true"/>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="celltype_key='celltype',"/>
                    <has_text_matching expression="genes_key='highly_variable',"/>
                    <has_text_matching expression="n_pca_genes=100,"/>
                    <has_text_matching expression="seed=123,"/>
                    <has_text_matching expression="cmap='Reds',"/>
                    <has_text_matching expression="save='masked_dotplot.png',"/>
                    <has_text_matching expression="style='upset',"/>
                    <has_text_matching expression="save='gene_overlap.png',"/>
                </assert_contents>
            </output>
            <output name="out_masked_dotplot_png">
                <assert_contents>
                    <has_image_width width="4055" delta="2"/>
                    <has_image_height height="1108" delta="2"/>
                </assert_contents>
            </output>
            <output name="out_gene_overlap_png">
                <assert_contents>
                    <has_image_width width="1154" delta="2"/>
                    <has_image_height height="600" delta="2"/>
                </assert_contents>
            </output>
            <output name="marker_out" file="marker_out_test4.tsv" ftype="tabular"/>
        </test>
        <!-- Test 5: advanced hyperparameter options -->
        <!-- NOTE(review): the selects below live inside conditionals in the inputs but are addressed directly under the section here; confirm they are actually applied by the test framework. -->
        <test expect_num_outputs="4">
            <param name="adata" value="tl.rank_genes_groups.newton-cg.pbmc68k_reduced_240cells.h5ad"/>
            <param name="format" value="png"/>
            <param name="genes_key" value="highly_variable"/>
            <section name="advanced_options">
                <param name="select_DE_selection_hparams" value="True"/>
                <param name="select_forest_hparams" value="True"/>
                <param name="select_forest_DE_baseline_hparams" value="True"/>
                <param name="select_add_forest_genes_hparams" value="True"/>
                <param name="select_marker_selection_hparams" value="True"/>
            </section>
            <section name="general_figure_options">
                <param name="dpi" value="100"/>
            </section>
            <param name="show_log" value="true"/>
            <output name="hidden_output">
                <assert_contents>
                    <has_text_matching expression="celltype_key='celltype',"/>
                    <has_text_matching expression="genes_key='highly_variable',"/>
                    <has_text_matching expression="n_pca_genes=100,"/>
                    <has_text_matching expression="seed=123,"/>
                    <has_text_matching expression="cmap='Reds',"/>
                    <has_text_matching expression="save='masked_dotplot.png',"/>
                    <has_text_matching expression="style='upset',"/>
                    <has_text_matching expression="save='gene_overlap.png',"/>
                </assert_contents>
            </output>
            <output name="out_masked_dotplot_png">
                <assert_contents>
                    <has_image_width width="4055" delta="2"/>
                    <has_image_height height="1108" delta="2"/>
                </assert_contents>
            </output>
            <output name="out_gene_overlap_png">
                <assert_contents>
                    <has_image_width width="1189" delta="2"/>
                    <has_image_height height="600" delta="2"/>
                </assert_contents>
            </output>
            <output name="marker_out" file="marker_out_test5.tsv" ftype="tabular"/>
        </test>
    </tests>
    <help><![CDATA[
Probe set selection for single-cell sequencing data using spapros.
============================================================================================================

Spapros is a python package that provides a pipeline for probe set selection and evaluation for targeted spatial transcriptomics data.

Key Features:

- Select probe sets for spatial transcriptomics which identify cell types of interest, capture general transcriptomic variation, and incorporate prior knowledge
- Evaluate probe sets with an extensive pipeline

Further documentation can be found here: https://spapros.readthedocs.io/en/latest/index.html.
    ]]></help>
    <expand macro="citations"/>
</tool>