diff halla.xml @ 0:cafea02ae3e0 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/halla commit 5cd01ad3808dff1ce4aae231706cbe2225079a04
author iuc
date Wed, 05 Nov 2025 09:37:19 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/halla.xml	Wed Nov 05 09:37:19 2025 +0000
@@ -0,0 +1,222 @@
+<tool id="halla" name="HAllA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description>Hierarchical All-against-All association</description>
+    <macros>
+        <!-- Token used both in the tool's version attribute and as the version of the halla requirement. -->
+        <token name="@TOOL_VERSION@">0.8.40</token>
+        <!-- Wrapper revision; appended to the tool version after "+galaxy". -->
+        <token name="@VERSION_SUFFIX@">0</token>
+        <!-- Value substituted into the tool's profile attribute. -->
+        <token name="@PROFILE@">24.0</token>
+    </macros>
+    <xrefs>
+        <!-- NOTE(review): the bio.tools identifier below is empty, so this cross-reference points nowhere. Fill in the HAllA bio.tools ID (presumably "halla"; confirm on bio.tools) or remove the element. -->
+        <xref type="bio.tools"></xref>
+    </xrefs>
+    <requirements>
+        <!-- The halla package version is tied to the TOOL_VERSION token, so dependency and advertised tool version stay in sync. -->
+        <requirement type="package" version="@TOOL_VERSION@">halla</requirement>
+    </requirements>
+    <!-- Command whose output is recorded as the version of the underlying halla executable. -->
+    <version_command><![CDATA[halla --version]]></version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Assemble the halla command line. Everything is written below ./output,
+        ## which is where the outputs section of this tool looks for results.
+        halla
+            -x '$x'
+            ## Plot label for X: user-supplied text, else the dataset identifier.
+            --x_dataset_label
+            #if $x_dataset_label
+                '$x_dataset_label'
+            #else
+                '$x.element_identifier'
+            #end if
+            -y '$y'
+            ## Plot label for Y: user-supplied text, else the dataset identifier.
+            --y_dataset_label
+            #if $y_dataset_label
+                '$y_dataset_label'
+            #else
+                '$y.element_identifier'
+            #end if
+            $alla
+            --max_freq_thresh $max_freq_thresh
+            ## Optional multi-select: emit the flag once, followed by every chosen function.
+            #if $transform_data_funcs
+                --transform_data_funcs
+                #for $transform_func in $transform_data_funcs
+                    $transform_func
+                #end for
+            #end if
+            $disable_bypass_discretization_if_possible
+            #if $discretize_func
+                --discretize_func $discretize_func
+            #end if
+            ## Optional integers are compared as strings: a plain truth test would
+            ## treat an explicit 0 like "not set" and silently drop the option.
+            #if str($discretize_num_bins) != ''
+                --discretize_num_bins $discretize_num_bins
+            #end if
+            --pdist_metric $pdist_metric
+            $sim2dist_disable_abs
+            --linkage_method $linkage_method
+            --permute_func $permute_func
+            --permute_iters $permute_iters
+            $disable_permute_speedup
+            --fdr_alpha $fdr_alpha
+            --fdr_method $fdr_method
+            --fnr_thresh $fnr_thresh
+            --rank_cluster $rank_cluster
+            ## Same string comparison as above so that a seed of 0 is honoured.
+            #if str($seed) != ''
+                --seed $seed
+            #end if
+            --block_num $block_num
+            $hallagram
+            $diagnostic_plot
+            -o output
+            ## Use the slots granted by Galaxy; fall back to 4 when the variable is unset.
+            --num_threads "\${GALAXY_SLOTS:-4}"
+    ]]></command>
+    <inputs>
+        <!-- Paired input tables and the switch between HAllA and plain AllA. -->
+        <param argument="-x" type="data" format="tabular" label="Dataset X" help="Tabular dataset with p features/rows and n samples/columns" />
+        <param argument="-y" type="data" format="tabular" label="Dataset Y" help="Tabular dataset with d features/rows and n samples/columns" />
+        <param argument="--alla" type="boolean" truevalue="--alla" falsevalue="" checked="false" label="Use AllA instead of HAllA" help="HAllA uses a hierarchical approach for block association discovery on top of an existing all-against-all (AllA) association matrix. Use this option to skip the block association step." />
+
+        <!-- Feature filtering, transformation and discretization. -->
+        <param argument="--max_freq_thresh" type="float" min="0" max="1" value="1" label="Maximum frequency" help="Features with max frequencies >= the threshold will be removed" />
+        <param argument="--transform_data_funcs" type="select" optional="true" multiple="true" label="Continuous data transformation function">
+            <option value="zscore">zscore</option>
+            <option value="rank">rank</option>
+            <option value="quantile">quantile</option>
+        </param>
+
+        <param argument="--disable_bypass_discretization_if_possible" type="boolean" truevalue="--disable_bypass_discretization_if_possible" falsevalue="" checked="false" label="Discretize even if all features are continuous" help="By default discretization is bypassed if all features are continuous" />
+        <param argument="--discretize_func" type="select" optional="true" label="Discretization function">
+            <option value="quantile">quantile</option>
+            <option value="kmeans">kmeans</option>
+            <option value="uniform">uniform</option>
+            <option value="jenks">jenks</option>
+        </param>
+        <param argument="--discretize_num_bins" type="integer" min="0" value="" optional="true" label="Discretization - number of bins"/>
+
+        <!-- Similarity measure and hierarchical clustering. -->
+        <param argument="--pdist_metric" type="select" label="Distance/similarity metric" help="Default: Spearman for continuous data. If there is at least one categorical variable in either dataset, HAllA will shift to Normalized Mutual Information (NMI) as an alternative similarity measure.">
+            <option value="spearman" selected="true">Spearman</option>
+            <option value="pearson">Pearson</option>
+            <option value="dcor">Distance correlation</option>
+            <option value="mi">mutual information</option>
+            <option value="nmi">normalized mutual information</option>
+            <option value="xicor">xi correlation</option>
+        </param>
+
+        <param argument="--sim2dist_disable_abs" type="boolean" truevalue="--sim2dist_disable_abs" falsevalue="" checked="false" label="Hierarchical clustering - disable setting similarity scores as absolute when computing distance" />
+        <param argument="--linkage_method" type="select" label="Hierarchical clustering linkage method" help="see help below">
+            <option value="single">single</option>
+            <option value="complete">complete</option>
+            <option value="average" selected="true">average</option>
+            <option value="weighted">weighted</option>
+            <option value="centroid">centroid</option>
+            <option value="median">median</option>
+            <option value="ward">ward</option>
+        </param>
+
+        <!-- Permutation test. -->
+        <param argument="--permute_func" type="select" label="P-value approximation function" help="Function used to approximate p-values in the permutation test">
+            <option value="gpd">gpd</option>
+            <option value="ecdf">ecdf</option>
+        </param>
+        <param argument="--permute_iters" type="integer" min="0" value="1000" label="Number of iterations in the p-value permutation test"/>
+        <param argument="--disable_permute_speedup" type="boolean" truevalue="--disable_permute_speedup" falsevalue="" checked="false" label="Do not break early in the permutation test if p-value is insignificant"/>
+        <!-- \-\-force_permutations  If turned on, force permutation testing -->
+
+        <!-- Multiple-testing correction and cluster ranking. -->
+        <param argument="--fdr_alpha" type="float" min="0" max="1" value="0.05" label="FDR threshold"/>
+        <param argument="--fdr_method" type="select" label="FDR method" help="see help below">
+            <option value="bonferroni">bonferroni: one-step correction</option>
+            <option value="sidak">sidak: one-step correction</option>
+            <option value="holm-sidak">holm-sidak: step-down method using Sidak adjustments</option>
+            <option value="holm">holm: step-down method using Bonferroni adjustments</option>
+            <option value="simes-hochberg">simes-hochberg: step-up method (independent)</option>
+            <option value="hommel">hommel: closed method based on Simes tests (non-negative)</option>
+            <option value="fdr_bh" selected="true">fdr_bh: Benjamini/Hochberg (non-negative)</option>
+            <option value="fdr_by">fdr_by: Benjamini/Yekutieli (negative)</option>
+            <option value="fdr_tsbh">fdr_tsbh: two stage fdr correction (non-negative)</option>
+            <option value="fdr_tsbky">fdr_tsbky: two stage fdr correction (non-negative)</option>
+        </param>
+        <param argument="--fnr_thresh" type="float" min="0" max="1" value="0.05" label="FNR threshold"/>
+        <param argument="--rank_cluster" type="select" label="Procedure to rank cluster using the p-values within the cluster">
+            <option value="best" selected="true">best</option>
+            <option value="average">average</option>
+        </param>
+        <param argument="--seed" type="integer" value="" optional="true" label="Randomization seed" />
+
+        <!-- Plot options. Empty dataset labels are replaced by the dataset identifiers in the command section. -->
+        <param argument="--hallagram" type="boolean" truevalue="--hallagram" falsevalue="--no_hallagram" checked="true" label="Generate hallagram" />
+        <param argument="--x_dataset_label" type="text" label="Hallagram/clustermap: label for X dataset" help="By default the dataset identifier is used"/>
+        <param argument="--y_dataset_label" type="text" label="Hallagram/clustermap: label for Y dataset" help="By default the dataset identifier is used"/>
+        <param argument="--block_num" type="integer" min="-1" value="-1" label="Number of top clusters in hallagram" help="-1: show all clusters"/>
+
+        <param argument="--diagnostic_plot" type="boolean" truevalue="--diagnostic_plot" falsevalue="" checked="false" label="Generates diagnostic plot" />
+    </inputs>
+    <outputs>
+        <!-- Result tables, taken from the directory passed to halla via -o (output/). These two have no filter. -->
+        <data name="sig_clusters" format="tabular" from_work_dir="output/sig_clusters.txt" label="${tool.name} on ${on_string}: block associations"/> 
+        <data name="all_associations" format="tabular" from_work_dir="output/all_associations.txt" label="${tool.name} on ${on_string}: all associations"/> 
+        <!-- Hallagram PDF; only created as an output when the hallagram option is enabled. -->
+        <data name="hallagram_out" format="pdf" from_work_dir="output/hallagram.pdf" label="${tool.name} on ${on_string}: hallagram">
+            <filter>hallagram is True</filter>
+        </data>
+        <!-- Every association_*.pdf found in output/diagnostic/ becomes one list element; only created when diagnostic plots are requested. -->
+        <collection name="diagnostic_plot_out" format="pdf" type="list" label="${tool.name} on ${on_string}: diagnostic plots">
+            <discover_datasets pattern="(?P&lt;designation&gt;association_.*)\.pdf" format="pdf" directory="output/diagnostic/" />
+            <filter>diagnostic_plot is True</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- Default options with a fixed seed: two result tables plus the hallagram (3 outputs). Only the tables are compared; the hallagram is counted but its content is not checked. -->
+        <test expect_num_outputs="3">
+            <param name="x" value="X_16_100.txt"/>
+            <param name="y" value="Y_16_100.txt"/>
+            <param name="seed" value="42"/>
+            <output name="sig_clusters" value="sig_clusters.tsv"/>
+            <output name="all_associations" value="all_associations.tsv"/>
+        </test>
+        <!-- Hallagram disabled, diagnostic plots enabled: two result tables plus the plot collection (3 outputs). The collection must hold 18 elements; only association_1 is compared against a reference file. -->
+        <test expect_num_outputs="3">
+            <param name="x" value="X_16_100.txt"/>
+            <param name="y" value="Y_16_100.txt"/>
+            <param name="seed" value="42"/>
+            <param name="hallagram" value="false"/>
+            <param name="diagnostic_plot" value="true"/>
+            <output name="sig_clusters" value="sig_clusters.tsv"/>
+            <output name="all_associations" value="all_associations.tsv"/>
+            <output_collection name="diagnostic_plot_out" type="list" count="18">
+                <element name="association_1" value="association_1.pdf"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+HAllA (Hierarchical All-against-All association) is a method for finding blocks of associated features in high-dimensional datasets
+measured from a common set of samples. HAllA operates by 
+
+1. optionally discretizing mixed continuous and categorical features to a uniform representation,
+2. hierarchically clustering each dataset separately to generate a pair of data hierarchies,
+3. performing all-against-all association testing between features across two datasets using robust measures of correlation,
+4. determining the statistical significance of individual associations by permutation testing, and
+5. iteratively subdividing the space of significant all-against-all correlations into blocks of densely associated features occurring as clusters in the original datasets.
+
+Tutorial https://github.com/biobakery/biobakery/wiki/halla
+
+Usage
+.....
+
+**Input**
+
+Data in scientific studies often come paired in the form of two high-dimensional datasets, where the dataset 
+X (with p features/rows and n samples/columns) are assumed to be p predictor variables (or features) measured
+on n samples that give rise to d response variables contained in the dataset Y (with d features/rows and n samples/columns).
+Note that column i of X is sampled jointly with column i of Y, so that X and Y are aligned.
+
+**Output**
+
+HAllA reports significant associations between clusters of related features ("block associations").
+Each block association is characterized by a cluster from the first dataset, a cluster from the second dataset,
+and measures of statistical significance and effect size (p-value, q-value, and similarity score) for the cluster's
+component pairwise associations.
+
+- **block associations** which reports block associations between the two datasets' features
+- **all associations** which reports the pairwise similarity scores for all features across the two datasets
+- **hallagram** graphical representation of the discovered block associations
+- **diagnostic plots** (optional) lattice plot showing the pairwise associations between microbiome features and metadata for each significant cluster.
+
+**Notes**
+
+Details on the available options:
+
+- Hierarchical clustering linkage methods https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
+- FDR methods https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html
+
+    ]]></help>
+    <citations>
+        <!-- DOI of the publication users are asked to cite for this tool. -->
+        <citation type="doi">10.1093/bioinformatics/btac232</citation>
+    </citations>
+</tool>
\ No newline at end of file