<!-- Mercurial > repos > iuc > halla -->

<tool id="halla" name="HAllA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
    <description>Hierarchical All-against-All association</description>
    <!-- Tokens substituted into the tool version, the profile and the package requirement. -->
    <macros>
        <token name="@TOOL_VERSION@">0.8.40</token>
        <!-- Wrapper revision, appended after "+galaxy" in the tool version attribute. -->
        <token name="@VERSION_SUFFIX@">0</token>
        <token name="@PROFILE@">24.0</token>
    </macros>
    <xrefs>
        <!-- NOTE(review): this bio.tools cross-reference carries no identifier; fill in the registry ID or drop the element. -->
        <xref type="bio.tools"></xref>
    </xrefs>
    <requirements>
        <!-- The package version is tied to @TOOL_VERSION@ so wrapper and package stay in sync. -->
        <requirement type="package" version="@TOOL_VERSION@">halla</requirement>
    </requirements>
    <version_command><![CDATA[halla --version]]></version_command>
    <command detect_errors="exit_code"><![CDATA[
        ## Dataset identifiers are free text chosen by the user and are not
        ## sanitised by Galaxy. Restrict them to word characters, whitespace,
        ## '-' and '.' so the fallback plot labels cannot break out of the
        ## single-quoted shell arguments below.
        #import re
        #set $x_default_label = re.sub('[^\s\w\-\.]', '_', str($x.element_identifier))
        #set $y_default_label = re.sub('[^\s\w\-\.]', '_', str($y.element_identifier))
        halla
            -x '$x'
            --x_dataset_label
            #if $x_dataset_label
                '$x_dataset_label'
            #else
                '$x_default_label'
            #end if
            -y '$y'
            --y_dataset_label
            #if $y_dataset_label
                '$y_dataset_label'
            #else
                '$y_default_label'
            #end if
            $alla
            --max_freq_thresh $max_freq_thresh
            ## Multi-select: emit the flag once, followed by every chosen function.
            #if $transform_data_funcs
                --transform_data_funcs
                #for $transform_func in $transform_data_funcs
                    $transform_func
                #end for
            #end if
            $disable_bypass_discretization_if_possible
            #if $discretize_func
                --discretize_func $discretize_func
            #end if
            ## Optional integers are compared as strings: a plain truth test
            ## would treat an explicit 0 like "not set" and drop the option.
            #if str($discretize_num_bins) != ''
                --discretize_num_bins $discretize_num_bins
            #end if
            --pdist_metric $pdist_metric
            $sim2dist_disable_abs
            --linkage_method $linkage_method
            --permute_func $permute_func
            --permute_iters $permute_iters
            $disable_permute_speedup
            --fdr_alpha $fdr_alpha
            --fdr_method $fdr_method
            --fnr_thresh $fnr_thresh
            --rank_cluster $rank_cluster
            ## Same string comparison as above so that a seed of 0 is passed on.
            #if str($seed) != ''
                --seed $seed
            #end if
            --block_num $block_num
            $hallagram
            $diagnostic_plot
            ## All results are written below ./output; the outputs section
            ## collects them from there.
            -o output
            --num_threads "\${GALAXY_SLOTS:-4}"
    ]]></command>
    <inputs>
        <!-- Paired input tables; both must describe the same samples in the same column order. -->
        <param argument="-x" type="data" format="tabular" label="Dataset X" help="Tabular dataset with p features/rows and n samples/columns" />
        <param argument="-y" type="data" format="tabular" label="Dataset Y" help="Tabular dataset with d features/rows and n samples/columns" />
        <param argument="--alla" type="boolean" truevalue="--alla" falsevalue="" checked="false" label="Use AllA instead of HAllA" help="HAllA uses a hierarchical approach for block association discovery on top of an existing all-against-all (AllA) association matrix. Use this option to skip the block association step." />
        <param argument="--max_freq_thresh" type="float" min="0" max="1" value="1" label="Maximum frequency" help="features with max frequencies >= the threshold will be removed" />
        <param argument="--transform_data_funcs" type="select" optional="true" multiple="true" label="Continuous data transformation function">
            <option value="zscore">zscore</option>
            <option value="rank">rank</option>
            <option value="quantile">quantile</option>
        </param>

        <!-- Discretization settings; the optional ones are only passed to halla when set. -->
        <param argument="--disable_bypass_discretization_if_possible" type="boolean" truevalue="--disable_bypass_discretization_if_possible" falsevalue="" checked="false" label="Discretize even if all features are continuous" help="By default discretization is bypassed if all features are continuous" />
        <param argument="--discretize_func" type="select" optional="true" label="Discretization function">
            <option value="quantile">quantile</option>
            <option value="kmeans">kmeans</option>
            <option value="uniform">uniform</option>
            <option value="jenks">jenks</option>
        </param>
        <param argument="--discretize_num_bins" type="integer" min="0" value="" optional="true" label="Discretization - number of bins"/>

        <param argument="--pdist_metric" type="select" label="Distance/similarity metric" help="default: spearman for continuous data, If there is at least one categorical variable in either dataset, HAllA will shift to Normalized Mutual Information (NMI) as an alternative similarity measure.">
            <option value="spearman" selected="true">Spearman</option>
            <option value="pearson">Pearson</option>
            <option value="dcor">Distance correlation</option>
            <option value="mi">mutual information</option>
            <option value="nmi">normalized mutual information</option>
            <option value="xicor">xi correlation</option>
        </param>

        <!-- Hierarchical clustering settings. -->
        <param argument="--sim2dist_disable_abs" type="boolean" truevalue="--sim2dist_disable_abs" falsevalue="" checked="false" label="Hierarchical clustering - disable setting similarity scores as absolute when computing distance" />
        <param argument="--linkage_method" type="select" label="Hierarchical clustering linkage method" help="see help below">
            <option value="single">single</option>
            <option value="complete">complete</option>
            <option value="average" selected="true">average</option>
            <option value="weighted">weighted</option>
            <option value="centroid">centroid</option>
            <option value="median">median</option>
            <option value="ward">ward</option>
        </param>

        <!-- Permutation test settings. -->
        <param argument="--permute_func" type="select" label="P-value approximation function" help="Function used for p-value approximation in the p-value permutation test">
            <option value="gpd">gpd</option>
            <option value="ecdf">ecdf</option>
        </param>
        <param argument="--permute_iters" type="integer" min="0" value="1000" label="Number of iterations in the p-value permutation test"/>
        <param argument="--disable_permute_speedup" type="boolean" truevalue="--disable_permute_speedup" falsevalue="" checked="false" label="Do not break early in the permutation test if p-value is insignificant"/>
        <!-- \-\-force_permutations  If turned on, force permutation testing -->

        <!-- Multiple testing correction and cluster ranking. -->
        <param argument="--fdr_alpha" type="float" min="0" max="1" value="0.05" label="FDR threshold"/>
        <param argument="--fdr_method" type="select" label="FDR method" help="see help below">
            <option value="bonferroni">bonferroni: one-step correction</option>
            <option value="sidak">sidak: one-step correction</option>
            <option value="holm-sidak">holm-sidak: step down method using Sidak adjustments</option>
            <option value="holm">holm: step-down method using Bonferroni adjustments</option>
            <option value="simes-hochberg">simes-hochberg: step-up method (independent)</option>
            <option value="hommel">hommel: closed method based on Simes tests (non-negative)</option>
            <option value="fdr_bh" selected="true">fdr_bh: Benjamini/Hochberg (non-negative)</option>
            <option value="fdr_by">fdr_by: Benjamini/Yekutieli (negative)</option>
            <option value="fdr_tsbh">fdr_tsbh: two stage fdr correction (non-negative)</option>
            <option value="fdr_tsbky">fdr_tsbky: two stage fdr correction (non-negative)</option>
        </param>
        <param argument="--fnr_thresh" type="float" min="0" max="1" value="0.05" label="FNR threshold"/>
        <param argument="--rank_cluster" type="select" label="Procedure to rank cluster using the p-values within the cluster">
            <option value="best" selected="true">best</option>
            <option value="average">average</option>
        </param>
        <param argument="--seed" type="integer" value="" optional="true" label="Randomization seed" />

        <!-- Plotting. When a label is left empty the command falls back to the dataset identifier. -->
        <param argument="--hallagram" type="boolean" truevalue="--hallagram" falsevalue="--no_hallagram" checked="true" label="Generate hallagram" />
        <param argument="--x_dataset_label" type="text" label="Hallagram/clustermap: label for X dataset" help="By default the dataset identifier is used"/>
        <param argument="--y_dataset_label" type="text" label="Hallagram/clustermap: label for Y dataset" help="By default the dataset identifier is used"/>
        <param argument="--block_num" type="integer" min="-1" value="-1" label="Number of top clusters in hallagram" help="-1: show all clusters"/>

        <param argument="--diagnostic_plot" type="boolean" truevalue="--diagnostic_plot" falsevalue="" checked="false" label="Generates diagnostic plot" />
    </inputs>
    <outputs>
        <!-- The two tabular results are always collected from the "output" working directory. -->
        <data name="sig_clusters" format="tabular" from_work_dir="output/sig_clusters.txt" label="${tool.name} on ${on_string}: block associations"/>
        <data name="all_associations" format="tabular" from_work_dir="output/all_associations.txt" label="${tool.name} on ${on_string}: all associations"/>
        <!-- Only created when the "hallagram" boolean input is enabled. -->
        <data name="hallagram_out" format="pdf" from_work_dir="output/hallagram.pdf" label="${tool.name} on ${on_string}: hallagram">
            <filter>hallagram is True</filter>
        </data>
        <!-- Only created when the "diagnostic_plot" boolean input is enabled. Each file named association_*.pdf
             in output/diagnostic/ becomes one list element, identified by its file name without the extension. -->
        <collection name="diagnostic_plot_out" format="pdf" type="list" label="${tool.name} on ${on_string}: diagnostic plots">
            <discover_datasets pattern="(?P&lt;designation&gt;association_.*)\.pdf" format="pdf" directory="output/diagnostic/" />
            <filter>diagnostic_plot is True</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Defaults with a fixed seed. Three outputs are expected: the two tables plus the hallagram,
             which is enabled by default. Only the tables are compared against reference files. -->
        <test expect_num_outputs="3">
            <param name="x" value="X_16_100.txt"/>
            <param name="y" value="Y_16_100.txt"/>
            <param name="seed" value="42"/>
            <output name="sig_clusters" value="sig_clusters.tsv"/>
            <output name="all_associations" value="all_associations.tsv"/>
        </test>
        <!-- Hallagram switched off and diagnostic plots switched on. Three outputs are expected: the two
             tables plus the diagnostic plot collection, which must hold 18 elements. -->
        <test expect_num_outputs="3">
            <param name="x" value="X_16_100.txt"/>
            <param name="y" value="Y_16_100.txt"/>
            <param name="seed" value="42"/>
            <param name="hallagram" value="false"/>
            <param name="diagnostic_plot" value="true"/>
            <output name="sig_clusters" value="sig_clusters.tsv"/>
            <output name="all_associations" value="all_associations.tsv"/>
            <output_collection name="diagnostic_plot_out" type="list" count="18">
                <element name="association_1" value="association_1.pdf"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**What it does**

HAllA (Hierarchical All-against-All association) is a method for finding blocks of associated features in high-dimensional datasets
measured from a common set of samples. HAllA operates by

1. optionally discretizing mixed continuous and categorical features to a uniform representation,
2. hierarchically clustering each dataset separately to generate a pair of data hierarchies,
3. performing all-against-all association testing between features across two datasets using robust measures of correlation,
4. determining the statistical significance of individual associations by permutation testing, and
5. iteratively subdividing the space of significant all-against-all correlations into blocks of densely associated features occurring as clusters in the original datasets.

Tutorial https://github.com/biobakery/biobakery/wiki/halla

Usage
.....

**Input**

Data in scientific studies often come paired in the form of two high-dimensional datasets, where the dataset
X (with p features/rows and n samples/columns) are assumed to be p predictor variables (or features) measured
on n samples that give rise to d response variables contained in the dataset Y (with d features/rows and n samples/columns).
Note that column i of X is sampled jointly with column i of Y, so that X and Y are aligned.

**Output**

HAllA reports significant associations between clusters of related features ("block associations").
Each block association is characterized by a cluster from the first dataset, a cluster from the second dataset,
and measures of statistical significance and effect size (p-value, q-value, and similarity score) for the cluster's
component pairwise associations.

- **block associations** which reports block associations between the two datasets' features
- **all associations** which reports the pairwise similarity scores for all features across the two datasets
- **hallagram** graphical representation of the discovered block associations
- **diagnostic plots** (optional) lattice plot showing the pairwise associations between microbiome features and metadata for each significant cluster.

**Notes**

Details on the available options:

- Hierarchical clustering linkage methods https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
- FDR methods https://www.statsmodels.org/dev/generated/statsmodels.stats.multitest.multipletests.html

    ]]></help>
    <!-- Publication to cite when results produced by this tool are used. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/btac232</citation>
    </citations>
</tool>