<?xml version="1.0"?>
<!-- Mercurial > repos > iuc > cwpair2 -->
<tool id="cwpair2" name="CWPair2" version="@WRAPPER_VERSION@.0">
    <!-- Short description displayed next to the tool name. -->
    <description>find matched pairs and unmatched orphans</description>
    <!-- Shared macro definitions are pulled in from cwpair2_macros.xml;
         the "requirements" macro expanded below (and presumably the
         @WRAPPER_VERSION@ token used in the tool version) comes from there.
         TODO confirm against cwpair2_macros.xml. -->
    <macros>
        <import>cwpair2_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="aggressive">
        <![CDATA[
            ## Build the cwpair2.py command line.
            ## File-system paths are single-quoted so that spaces or shell
            ## metacharacters in the tool directory or dataset paths cannot
            ## split or alter the command.
            python '$__tool_directory__/cwpair2.py'
            ## One input option per selected dataset: its path and its history id.
            #for $i in $input:
                 --input '${i}' '${i.hid}'
            #end for
            --up_distance $up_distance
            --down_distance $down_distance
            --method $method
            --binsize $binsize
            --threshold_format $threshold_format_cond.threshold_format
            ## Only the threshold matching the selected format is passed on.
            #if str($threshold_format_cond.threshold_format) == "absolute_threshold":
                --absolute_threshold $threshold_format_cond.absolute_threshold
            #elif str($threshold_format_cond.threshold_format) == "relative_threshold":
                --relative_threshold $threshold_format_cond.relative_threshold
            #end if
            --output_files $output_files
            --statistics_output '$statistics_output'
        ]]>
    </command>
    <inputs>
        <!-- One or more gff datasets; each one is passed to the script together with its history id. -->
        <param name="input" type="data" format="gff" multiple="True" label="Find matched pairs on" />
        <!-- Search window around the primary peak.
             NOTE(review): the help section describes negative distances, but min="0"
             rejects them here; confirm which behavior is intended. -->
        <param name="up_distance" type="integer" value="50" min="0" label="Distance upstream from a peak to allow a pair" help="The maximum distance upstream or 5’ to the primary peak"/>
        <param name="down_distance" type="integer" value="100" min="0" label="Distance downstream from a peak to allow a pair" help="The maximum distance downstream or 3’ to the primary peak"/>
        <!-- Strategy used to pick one candidate when several fall inside the search window. -->
        <param name="method" type="select" label="Method of finding a match">
            <option value="mode" selected="True">Mode</option>
            <option value="closest">Closest</option>
            <option value="largest">Largest</option>
            <option value="all">All</option>
        </param>
        <!-- A bin width of 0 is not a usable width, so the smallest accepted value is 1 (no binning). -->
        <param name="binsize" type="integer" value="1" min="1" label="Width of bins for frequency plots and mode calculation" help="Value 1 implies no bins" />
        <!-- The selected threshold format decides which of the two threshold values is shown and used. -->
        <conditional name="threshold_format_cond">
            <param name="threshold_format" type="select" label="Filter using">
                <option value="relative_threshold" selected="True">Relative threshold</option>
                <option value="absolute_threshold">Absolute threshold</option>
            </param>
            <when value="relative_threshold">
                <param name="relative_threshold" type="float" value="0.0" min="0.0" label="Percentage of the 95 percentile value to filter below" help="Value 0 results in no filtering" />
            </when>
            <when value="absolute_threshold">
                <param name="absolute_threshold" type="float" value="0.0" min="0.0" label="Absolute value to filter below" />
            </when>
        </conditional>
        <!-- Controls which output collections are created (see the filters in the outputs section). -->
        <param name="output_files" type="select" label="Select output" help="Statistics will always be generated." >
            <option value="all" selected="True">everything (C,D,F,O,P,MP)</option>
            <option value="matched_pair">matched pairs only (MP)</option>
            <option value="matched_pair_orphan">matched pairs and orphans only (O,MP)</option>
            <option value="matched_pair_orphan_detail">matched pairs, orphans and details only (D,O,MP)</option>
        </param>
    </inputs>
    <outputs>
        <!-- Statistics table: has no filter, so it is produced on every run. -->
        <data name="statistics_output"
              format="tabular"
              label="Statistics Table: ${tool.name} on ${on_string}" />

        <!-- Histogram PDFs discovered in directory "H"; only kept when everything is requested.
             The whole file name is used as the element designation. -->
        <collection name="H"
                    type="list"
                    label="Statistics Histogram: ${tool.name} on ${on_string}">
            <filter>output_files == "all"</filter>
            <discover_datasets directory="H"
                               pattern="(?P&lt;designation&gt;.*)"
                               ext="pdf"
                               visible="false" />
        </collection>

        <!-- Detail tables discovered in directory "data_D". -->
        <collection name="D"
                    type="list"
                    label="Data D: ${tool.name} on ${on_string}">
            <filter>output_files in ["all", "matched_pair_orphan_detail"]</filter>
            <discover_datasets directory="data_D"
                               pattern="(?P&lt;designation&gt;.*)"
                               ext="tabular"
                               visible="false" />
        </collection>

        <!-- Orphan tables discovered in directory "data_O". -->
        <collection name="O"
                    type="list"
                    label="Data O: ${tool.name} on ${on_string}">
            <filter>output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]</filter>
            <discover_datasets directory="data_O"
                               pattern="(?P&lt;designation&gt;.*)"
                               ext="tabular"
                               visible="false" />
        </collection>

        <!-- Matched-pair gff files discovered in directory "data_MP"; unfiltered, so always collected. -->
        <collection name="MP"
                    type="list"
                    label="Data MP: ${tool.name} on ${on_string}">
            <discover_datasets directory="data_MP"
                               pattern="(?P&lt;designation&gt;.*)"
                               ext="gff"
                               visible="false" />
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: all three methods, matched pairs only, 25 bp upstream window. -->
        <test>
            <param name="input" value="cwpair2_input1.gff" />
            <param name="up_distance" value="25" />
            <param name="down_distance" value="100" />
            <param name="method" value="all" />
            <param name="binsize" value="1" />
            <conditional name="threshold_format_cond">
                <param name="threshold_format" value="relative_threshold" />
                <param name="relative_threshold" value="0.0" />
            </conditional>
            <param name="output_files" value="matched_pair" />
            <output name="statistics_output" file="statistics1.tabular" ftype="tabular" />
            <output_collection name="MP" type="list">
                <element name="data_MP_closest_f0u25d100_on_data_1.gff" file="closest_mp_output1.gff" ftype="gff" compare="contains" />
                <element name="data_MP_largest_f0u25d100_on_data_1.gff" file="largest_mp_output1.gff" ftype="gff" compare="contains" />
                <element name="data_MP_mode_f0u25d100_on_data_1.gff" file="mode_mp_output1.gff" ftype="gff" compare="contains" />
            </output_collection>
        </test>
        <!-- Test 2: all three methods, every output collection, 50 bp upstream window. -->
        <test>
            <param name="input" value="cwpair2_input1.gff" />
            <param name="up_distance" value="50" />
            <param name="down_distance" value="100" />
            <param name="method" value="all" />
            <param name="binsize" value="1" />
            <conditional name="threshold_format_cond">
                <param name="threshold_format" value="relative_threshold" />
                <param name="relative_threshold" value="0.0" />
            </conditional>
            <param name="output_files" value="all" />
            <output name="statistics_output" file="statistics2.tabular" ftype="tabular" />
            <!-- Every histogram is compared against the same reference file, magic.pdf, with compare="contains". -->
            <output_collection name="H" type="list">
                <element name="histogram_C_mode_f0u50d100_on_data_1.pdf" file="magic.pdf" ftype="pdf" compare="contains" />
                <element name="histogram_F_closest_f0u50d100_on_data_1.pdf" file="magic.pdf" ftype="pdf" compare="contains" />
                <element name="histogram_F_largest_f0u50d100_on_data_1.pdf" file="magic.pdf" ftype="pdf" compare="contains" />
                <element name="histogram_F_mode_f0u50d100_on_data_1.pdf" file="magic.pdf" ftype="pdf" compare="contains" />
                <element name="histogram_P_mode_f0u50d100_on_data_1.pdf" file="magic.pdf" ftype="pdf" compare="contains" />
            </output_collection>
            <output_collection name="D" type="list">
                <element name="data_D_closest_f0u50d100_on_data_1.tabular" file="closest_d_output2.tabular" ftype="tabular" compare="contains" />
                <element name="data_D_largest_f0u50d100_on_data_1.tabular" file="largest_d_output2.tabular" ftype="tabular" compare="contains" />
                <element name="data_D_mode_f0u50d100_on_data_1.tabular" file="mode_d_output2.tabular" ftype="tabular" compare="contains" />
            </output_collection>
            <output_collection name="O" type="list">
                <element name="data_O_closest_f0u50d100_on_data_1.tabular" file="closest_o_output2.tabular" ftype="tabular" compare="contains" />
                <element name="data_O_largest_f0u50d100_on_data_1.tabular" file="largest_o_output2.tabular" ftype="tabular" compare="contains" />
                <element name="data_O_mode_f0u50d100_on_data_1.tabular" file="mode_o_output2.tabular" ftype="tabular" compare="contains" />
            </output_collection>
            <output_collection name="MP" type="list">
                <element name="data_MP_closest_f0u50d100_on_data_1.gff" file="closest_mp_output2.gff" ftype="gff" compare="contains" />
                <element name="data_MP_largest_f0u50d100_on_data_1.gff" file="largest_mp_output2.gff" ftype="gff" compare="contains" />
                <element name="data_MP_mode_f0u50d100_on_data_1.gff" file="mode_mp_output2.gff" ftype="gff" compare="contains" />
            </output_collection>
        </test>
    </tests>
    <help>
**What it does**

CWPair accepts one or more gff files as input and takes the peak location to be the midpoint between the
exclusion zone start and end coordinate (columns D and E).  CWPair starts with the highest peak (primary peak)
in the dataset, and then looks on the opposite strand for another peak located within the distance defined by
a combination of the tool's **Distance upstream from a peak to allow a pair** (the distance upstream or 5’ to
the primary peak) and **Distance downstream from a peak to allow a pair** (the distance downstream or 3’ to the
primary peak) parameters.  So "upstream" value 30 "downstream" value 20 makes the tool look 30 bp upstream and
20 bp downstream (inclusive).  Consequently, the search space would be 51 bp, since it includes the primary peak
coordinate.  The use of a negative number changes the direction of the search limits.  So, "upstream" -30 and
"downstream" 20 produces an 11 bp downstream search window (20-30 bp downstream, inclusive).

.. image:: $PATH_TO_IMAGES/cwpair2.png

When encountering multiple candidate peaks within the search window, CWPair uses the resolution method defined by
the tool's **Method of finding a match** parameter as follows:


 * **mode** - This is an iterative process in which all peak-pair distances within the search window are determined, and the mode calculated.  The pair whose distance apart is closest to the mode is then selected.
 * **closest** - Pairs the peak that has the closest absolute distance from the primary peak.
 * **largest** - Pairs the peak that has the highest tag count.
 * **all** -  Runs all three methods, producing separate outputs for each.

When considering the candidate peaks for pairing to a primary peak, a tag-count threshold may also be set using
the tool's **Filter using** parameter (relative or absolute threshold).  A relative threshold determines the tag count
at the 95th percentile of peak occupancy (i.e. top 5% in terms of tag counts), then uses a tag count threshold at
the specified percentage of this 95th percentile.  So if the peak at the 95th percentile has 200 tags, and "relative
threshold" 50 is used, then it will not consider any peak having fewer than 100 tags.

-----

**Options**

 * **Method of finding a match** - Method of finding a matched pair: mode, closest, largest, or all (run with each method).
 * **Distance upstream from a peak to allow a pair** - The maximum distance (inclusive) upstream on the opposite strand from the primary peak to locate another peak, resulting in a pair.
 * **Distance downstream from a peak to allow a pair** - The maximum distance (inclusive) downstream on the opposite strand from the primary peak to locate another peak, resulting in a pair.
 * **Percentage of the 95 percentile value to filter below** - Percentage of the 95th percentile value below which to filter when using a relative threshold.
 * **Absolute value to filter below** - Absolute value below which to filter when using an absolute threshold.
 * **Select output** - Restrict output dataset collections to matched pairs only or one of several combinations of collection types.

-----

**Output Data Files**

 * **closest/largest/mode MP** - gff file containing the Matched Pairs and includes the peak-pair midpoint coordinate (column D) and the coordinate +1 (column E).  The tag count sum is reported in column F, along with the C-W distance in bp in column I.
 * **closest/largest/mode O** - tabular file containing the Orphans (all peaks that are not in pairs).
 * **closest/largest/mode D** - tabular file containing the Details, which lists + and – strand information separately.  The start and end represent the lower and higher coordinates of the exclusion zone from GeneTrack, and “Value” is the tag count sum within the exclusion zone.  The peak pair midpoint is calculated along with the distance between the two paired peaks (midpoint-to-midpoint or C-W distance).

**Output Statistics Files**

 * **closest/largest/mode C** - pdf file that provides the frequency distribution of peak pair distances.
 * **closest/largest/mode P** - pdf file that provides the preview plots graph (the initial iteration of the process for finding the mode).
 * **closest/largest/mode F** - pdf file that provides the final plots graph.
 * **Statistics Table** - provides the number of peaks in pairs (dividing this by 2 provides the number of peak-pairs).

    </help>
    <!-- Expands the "citations" macro; presumably defined in the imported cwpair2_macros.xml (TODO confirm). -->
    <expand macro="citations" />
</tool>
<!--
author	iuc
date	Tue, 13 Dec 2016 09:25:15 -0500
parents	abc464ca7260
children	71188f3f4b76
-->