<!-- Mercurial > repos > iuc > graphembed -->

<tool id="graphembed" name="GraphEmbed" version="@VERSION@.0" >
    <!-- One-line summary displayed alongside the tool name. -->
    <description>Compute and plot a 2D embedding of a data matrix given supervised class information</description>
    <!-- @VERSION@ is defined once here and reused by the tool version attribute
         and by the package requirement below. -->
    <macros>
        <token name="@VERSION@">2.4</token>
    </macros>
    <requirements>
        <requirement type="package" version="@VERSION@">graph_embed</requirement>
    </requirements>
    <!-- Reports the second 'v'-delimited field of the program's version output. -->
    <version_command>graphembed --version | cut -d'v' -f 2</version_command>
    <!-- Cheetah command template. The core options are always passed; the
         options of the "advanced" conditional are appended only when the user
         selected "yes". Results are written to the job working directory, from
         where the outputs section collects them by file name. -->
    <command detect_errors="exit_code"><![CDATA[
graphembed
 -i '$i'
 -t '$t'
 -c '$class_confidence'
 -k '$k'
 -d '$d'
 -z '$z'
 -l '$l'
#if str($advanced.option) == 'yes'
 ## Boolean flags stay unquoted: an unchecked box renders as an empty string,
 ## and quoting it would pass an empty positional argument ('') to graphembed.
 $advanced.correlation_transformation
 $advanced.feature_selection
 $advanced.normalization
 --cmap_name '$advanced.cmap_name'
 --min_threshold '$advanced.min_threshold'
 --max_threshold '$advanced.max_threshold'
 --random_state '$advanced.random_state'
#end if
 --do_not_add_timestamp
 --figure_size 15
 -o ./
    ]]></command>

    <inputs>
        <!-- Required datasets: the data matrix and the class label of each observation. -->
        <param argument="-i" type="data" format="tabular" label="Input Matrix"
               help="A feature-observation matrix, with features as rows and observations as columns (e.g. Genes vs Cells)" />
        <param argument="-t" type="data" format="tabular" label="Observation Classes"
               help="A two-column file with observations in the first column, and an integer representing their assigned class in the second column (e.g. 'YFPCD24X3w_20 2')" />

        <!-- Numeric settings that are always passed on the command line. -->
        <param argument="--class_confidence" type="float" value="1.0"
               label="Confidence bias for clustering" />
        <param argument="-k" type="integer" value="5" min="0"
               label="Number of links towards closest neighbour with the same class" />
        <param argument="-d" type="integer" value="1" min="0"
               label="Number of links towards denser neighbours with a different class" />
        <!-- The label is kept on a single line: a line break inside an attribute
             value would turn the source indentation into a run of spaces. -->
        <param argument="-z" type="integer" value="10" min="0"
               label="Number of nearest neighbours to limit the horizon of where to search for denser neighbours of a different class" />
        <param argument="-l" type="integer" value="0"
               label="Number of mutual nearest neighbours that define outlier instances." />

        <!-- Optional settings; they reach the command line only when "yes" is selected. -->
        <conditional name="advanced" >
            <param name="option" type="select" label="Use Advanced Parameters?">
                <option value="yes" >Yes</option>
                <option value="no" selected="true" >No</option>
            </param>
            <when value="no" ></when>
            <when value="yes" >
                <!-- Each boolean renders as its flag when checked and as an empty
                     string when unchecked. -->
                <param argument="--correlation_transformation" type="boolean" optional="true"
                       falsevalue="" truevalue="--correlation_transformation"
                       label="Convert data matrix to corr coeff matrix" />
                <param argument="--normalization" type="boolean" optional="true"
                       falsevalue="" truevalue="--normalization"
                       label="Convert data matrix to normalized matrix" />
                <param argument="--feature_selection" type="boolean" optional="true"
                       falsevalue="" truevalue="--feature_selection"
                       label="Select most discriminative features" />
                <param argument="--cmap_name" type="select"
                       label="Color map" >
                    <option value="cubehelix" />
                    <option value="terrain" />
                    <option value="nipy_spectral" />
                    <option value="gist_stern" />
                    <option value="gist_ncar" selected="true" />
                </param>
                <param argument="--min_threshold" type="integer" value="5"
                       label="Min num instances per class" />
                <param argument="--max_threshold" type="integer" value="400"
                       label="Max num instances per class" />
                <param argument="--random_state" type="integer" value="1"
                       label="Random seed" />
            </when>
        </conditional>

        <!-- Not passed to the program; only decides whether the log output is kept. -->
        <param name="showlog" type="boolean" checked="true" label="Output Log file" />
    </inputs>

    <outputs>
        <!-- Tabular results, collected by file name from the job working directory. -->
        <data name="out_coords"
              format="tabular"
              from_work_dir="2D_coords.txt"
              label="${tool.name} on ${on_string}: 2D coords" />
        <data name="out_target"
              format="tabular"
              from_work_dir="classes.txt"
              label="${tool.name} on ${on_string}: Classes" />

        <!-- PDF figures. -->
        <data name="out_clean"
              format="pdf"
              from_work_dir="img_1_clean.pdf"
              label="${tool.name} on ${on_string}: Embed." />
        <data name="out_links"
              format="pdf"
              from_work_dir="img_2_links.pdf"
              label="${tool.name} on ${on_string}: Embed + 1st shift" />
        <data name="out_hull"
              format="pdf"
              from_work_dir="img_4_hull.pdf"
              label="${tool.name} on ${on_string}: Embed + hulls + edges" />
        <data name="out_hulllink"
              format="pdf"
              from_work_dir="img_5_hull_link.pdf"
              label="${tool.name} on ${on_string}: Embed + hulls + 1st shift" />

        <!-- The log is only kept when the "showlog" input is checked. -->
        <data name="out_log"
              format="txt"
              from_work_dir="log"
              label="${tool.name} on ${on_string}: Log" >
            <filter>showlog</filter>
        </data>
    </outputs>

    <tests>
        <!-- Test 1: default parameters. "showlog" defaults to checked, so all
             seven declared outputs, including the log, are expected. -->
        <test expect_num_outputs="7" >
            <param name="i" value="prot_expression.mini.tsv" ftype="tabular" />
            <param name="t" value="target.tsv" ftype="tabular" />
            <output name="out_coords" >
                <assert_contents>
                    <has_text text="-3.8328 2.0278" />
                    <has_text text="7.0832 -4.1420" />
                </assert_contents>
            </output>
            <output name="out_target" >
                <assert_contents>
                    <has_text text="14" />
                    <has_text text="8" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: advanced parameters with every boolean flag enabled and the
             log output switched off, leaving six outputs. -->
        <test expect_num_outputs="6" >
            <param name="i" value="prot_expression.mini.tsv" ftype="tabular" />
            <param name="t" value="target.tsv" ftype="tabular" />
            <param name="class_confidence" value="1.5" />
            <param name="k" value="5" />
            <param name="z" value="10" />
            <param name="l" value="0" />
            <conditional name="advanced" >
                <param name="option" value="yes" />
                <param name="correlation_transformation" value="true" />
                <param name="feature_selection" value="true" />
                <param name="normalization" value="true" />
                <!-- Declared inside the "advanced" conditional in the inputs, so
                     it is set here rather than at the top level of the test. -->
                <param name="random_state" value="2" />
            </conditional>
            <param name="showlog" value="false" />
            <output name="out_target" >
                <assert_contents>
                    <has_text text="14" />
                    <has_text text="16" />
                </assert_contents>
            </output>
            <output name="out_coords" >
                <assert_contents>
                    <has_text text="25.9260 0.0171" />
                    <has_text text="-6.4521 -24.8940" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
============
GraphEmbed
============
*Compute a 2D embedding of a data matrix given supervised class information.*

Input: A discrete label for each instance is expected.

A graph is built where nodes are instances and there exist two types of edges:

* 'knn' edges:
  an edge to the k-th nearest instance that has the same label.
* 'k_shift' edges:
  an edge to the k-th nearest instance that is denser and has a different label.

Density is defined as the sum of the pairwise cosine similarity between an instance and all the other instances. The desired edge length is the Euclidean distance between the instances. If the endpoints of an edge have the same label then the desired distance is divided by 1 + class_confidence. A k-shift edge is deleted if at least one of the endpoints is an outlier. Outlier nodes are defined as those instances that have no mutual k neighbors.

Finally, the embedding is computed as the 2D coordinates of the corresponding graph embedding using the force layout algorithm from Tomihisa Kamada and Satoru Kawai, "An algorithm for drawing general undirected graphs", Information Processing Letters 31, no. 1 (1989): 7-15.
    </help>
    <!-- DOI reference offered to users for citing this tool. -->
    <citations>
        <citation type="doi">10.5281/zenodo.825832</citation>
    </citations>
</tool>