Mercurial > repos > iuc > repmatch_gff3

<?xml version="1.0"?>
<tool id="repmatch_gff3" name="RepMatch" version="@WRAPPER_VERSION@.1">
    <description>Match paired peaks from two or more replicates</description>
    <macros>
        <import>repmatch_gff3_macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <stdio>
        <!-- Error detection rules for the job: exit codes first, then text patterns. -->
        <!-- Anything other than zero is an error -->
        <exit_code range=":-1" />
        <exit_code range="1:" />
        <!-- Look for error markers in the job output in case the return code has not been set.
             NOTE(review): no source attribute is given on these regex elements, so they
             presumably match stdout as well as stderr; confirm against the Galaxy tool schema. -->
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <!-- Cheetah template that assembles the repmatch_gff3.py command line from the tool form.
         Paths are single-quoted so that directories or files containing whitespace do not
         split into several shell words; CDATA keeps the template free of XML escaping. -->
    <command><![CDATA[
        python '$__tool_directory__/repmatch_gff3.py'
        ## One path/hid pair is emitted per selected input dataset.
        #for $i in $input:
            --input '${i}' '${i.hid}'
        #end for
        --method $method
        --distance $distance
        --replicates $replicates
        --output_files $output_files_cond.output_files
        --output_matched_peaks '$output_matched_peaks'
        ## The conditions below mirror the filter expressions declared in the outputs section.
        #if str($output_files_cond.output_files) in ["all", "matched_peaks_unmatched_peaks"]:
            --output_unmatched_peaks '$output_unmatched_peaks'
        #end if
        #if str($output_files_cond.output_files) == "all":
            --output_detail '$output_detail'
            --output_statistics_table '$output_statistics_table'
            --output_statistics_histogram '$output_statistics_histogram'
        #end if
        ## Advanced values are passed only when the user opted in; otherwise the options are omitted.
        #if str($advanced_options_cond.advanced_options) == "on":
            --step $advanced_options_cond.step
            --low_limit $advanced_options_cond.low_limit
            --up_limit $advanced_options_cond.up_limit
        #end if
    ]]></command>
    <inputs>
        <!-- Replicate datasets to match; at least two GFF files must be selected. -->
        <param name="input" type="data" format="gff" multiple="true" min="2" label="Match paired peaks on" />
        <!-- Strategy used to pick a single candidate when several peaks could match. -->
        <param name="method" type="select" label="Method of finding match">
            <option value="closest" selected="true">Closest</option>
            <option value="largest">Largest</option>
            <option value="all">All</option>
        </param>
        <param name="distance" type="integer" value="50" min="0" label="Maximum distance between peaks in different replicates to allow merging" />
        <param name="replicates" type="integer" value="2" min="2" label="Minimum number of replicates that must be matched for merging to occur" />
        <!-- Selects which output datasets are produced. The when branches are empty because the
             choice only drives the command line and the output filters.
             NOTE(review): the statistics table and histogram are only exposed as outputs when
             "everything" is selected (see the output filters), so the help text below may be
             misleading; confirm the intended wording. -->
        <conditional name="output_files_cond">
            <param name="output_files" type="select" label="Select output" help="Statistics will always be generated.">
                <option value="all" selected="true">everything</option>
                <option value="matched_peaks">matched paired peaks only</option>
                <option value="matched_peaks_unmatched_peaks">matched paired peaks and unmatched paired peaks only</option>
            </param>
            <when value="matched_peaks" />
            <when value="matched_peaks_unmatched_peaks" />
            <when value="all" />
        </conditional>
        <!-- Optional tuning values; only the "on" branch defines parameters, and the command
             passes them to the script only in that case. -->
        <conditional name="advanced_options_cond">
            <param name="advanced_options" type="select" label="Advanced options">
                <option value="off" selected="true">Hide advanced options</option>
                <option value="on">Display advanced options</option>
            </param>
            <when value="on">
                <param name="step" type="integer" value="0" min="0" label="Step size" help="Distance for each iteration" />
                <param name="low_limit" type="integer" value="-1000" label="Lower limit for Crick-Watson distance filter" />
                <param name="up_limit" type="integer" value="1000" label="Upper limit for Crick-Watson distance filter" />
            </when>
            <when value="off" />
        </conditional>
    </inputs>
    <outputs>
        <!-- Statistics and detail datasets: created only when "everything" (value "all") is selected. -->
        <data name="output_statistics_table"
              format="tabular"
              label="Statistics Table: ${tool.name} on ${on_string}">
            <filter>output_files_cond["output_files"] == "all"</filter>
        </data>
        <data name="output_statistics_histogram"
              format="pdf"
              label="Statistics Histogram: ${tool.name} on ${on_string}">
            <filter>output_files_cond["output_files"] == "all"</filter>
        </data>
        <data name="output_detail"
              format="tabular"
              label="Data D: ${tool.name} on ${on_string}">
            <filter>output_files_cond["output_files"] == "all"</filter>
        </data>
        <!-- Unmatched peaks: created for "all" and for the matched plus unmatched selection. -->
        <data name="output_unmatched_peaks"
              format="tabular"
              label="Data UP: ${tool.name} on ${on_string}">
            <filter>output_files_cond["output_files"] in ["all", "matched_peaks_unmatched_peaks"]</filter>
        </data>
        <!-- Matched peaks: no filter, so this dataset is produced for every selection. -->
        <data name="output_matched_peaks"
              format="gff"
              label="Data MP: ${tool.name} on ${on_string}" />
    </outputs>
    <tests>
        <!-- Single functional test: the "closest" method on two GFF inputs with every output
             enabled; each produced dataset is compared with a stored fixture file.
             NOTE(review): step, low_limit and up_limit belong to the "on" branch of
             advanced_options_cond, but advanced_options is not set in this test, so these three
             values may never reach the command line; confirm whether the test should also set
             advanced_options to "on". -->
        <test>
            <param name="input" value="closest_matched_pairs_input1.gff,largest_matched_pairs_input1.gff" ftype="gff" />
            <param name="method" value="closest" />
            <param name="distance" value="50" />
            <param name="replicates" value="2" />
            <param name="output_files" value="all" />
            <param name="step" value="0" />
            <param name="low_limit" value="-1000" />
            <param name="up_limit" value="1000" />
            <output name="output_statistics_table" file="statistics_table_out1.tabular" ftype="tabular" />
            <!-- The histogram uses compare="contains" against magic.pdf instead of a full expected file. -->
            <output name="output_statistics_histogram" file="magic.pdf" ftype="pdf" compare="contains" />
            <output name="output_detail" file="detail_out1.tabular" ftype="tabular" />
            <output name="output_unmatched_peaks" file="unmatched_peaks_out1.tabular" ftype="tabular" />
            <output name="output_matched_peaks" file="matched_peaks_out1.gff" ftype="gff" />
        </test>
    </tests>
    <help>
**What it does**

RepMatch accepts two or more input datasets, and starts by defining peak-pair midpoints in the first dataset.  It then
discovers all peak-pair midpoints in the second dataset that are within the distance, defined by the tool's **Maximum
distance between peaks in different replicates to allow merging** parameter, from the peak-pair midpoint coordinate in
the first dataset.  When encountering multiple candidates to match (one-to-many), RepMatch uses the method defined by
the tool's **Method of finding match** parameter so that there is at most a one-to-one match across the two datasets.
This method provides the following options:

 * **closest** - matches only the closest one in bp distance.
 * **largest** - matches the one that contains the largest number of reads.
 * **all** - both methods are run separately.

RepMatch matching is an iterative process, as it attempts to find the centroid coordinate amongst all replicates. As such,
the centroid is the point of reference for "distance" and "closest".  This process can be sped up by increasing the tool's
**Step size** parameter.

The minimum number of replicates that must be matched for a match to occur is defined by the tool's **Minimum number of
replicates that must be matched for merging to occur** parameter.  Additional filters can be applied using the tool's
**Advanced options**, including a lower and upper limit for the C-W distance.

.. image:: $PATH_TO_IMAGES/repmatch.png

-----

**Options**

 * **Distance** - Maximum distance for discovering all peak-pair midpoints in a second dataset relative to the peak-pair midpoints in the first dataset
 * **Method** - Method to use when encountering multiple candidates to match so that there is at most a one-to-one match across the two datasets.
 * **Step Size** - Distance for each iteration.
 * **Replicates** - Minimum number of replicates that must be matched for a match to occur.  This value must be at least 2.
 * **Lower Limit** - Lower limit for the Crick-Watson distance filter.
 * **Upper Limit** - Upper limit for the Crick-Watson distance filter.

-----

**Output Data Files**

 * **Data MP** - gff file consisting of only peak pairs

  - Columns are **chr**, **script**, **blank**, **peak start**, **peak end**, **blank**, **normalized tag counts**, **blank** and **info**.
  - Peak start and end are separated by one coordinate.
  - Normalized tag is the occupancy averaged across replicates.
  - Attributes include C-W distance, sum total of tag counts, number of replicates merged.

 * **Data D** - tabular file consisting of the list of all matched replicates.
 * **Data UP** - tabular file consisting of all unmatched peak-pairs.

**Output Statistics Files**

 * **Statistics Table** - tabular file providing the description key of **Data D**.
 * **Statistics Histogram** - graph of the number of matched locations having the indicated replicate counts.

**Comments on Replicates**

Three types of replicates may be considered.  Biological replicates represent independently collected biological samples.
At least two biological replicates must be performed for each experiment from which a conclusion is being drawn, and the
conclusion must be evident in both biological replicates when analyzed separately.  Technical replicates represent a re-run
of the assay on the same biological material.  This is usually done when one replicate fails to produce quality data, and is
used to replace that earlier replicate.  Sequencing replicates represent additional sequencing of the same successful library
in order to obtain more reads should the analysis require it.  The reads from individual sequencing replicates are usually
merged without need for separate analysis.

    </help>
    <expand macro="citations" />
</tool>
author	iuc
date	Fri, 13 Jan 2017 10:27:44 -0500
parents	e5c7fffdc078
children	6acaa2c93f47