Mercurial > repos > iuc > chromeister

<tool id="chromeister" name="Chromeister" version="@TOOL_VERSION@">
    <description>ultra-fast pairwise genome comparisons</description>
    <macros>
        <!-- Single place where the version is defined: the token is expanded both in the
             tool's own version attribute and in the package requirement below. -->
        <token name="@TOOL_VERSION@">1.2</token>
    </macros>
    <requirements>
        <!-- Dependency pinned to the same version as the tool. Presumably this package provides
             the executables called in the command section (CHROMEISTER and the compute_score
             R scripts); TODO confirm. -->
        <requirement type="package" version="@TOOL_VERSION@">chromeister</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
    ## Error handling: the job fails when the command chain below exits with a
    ## non-zero status (every step is joined with the shell AND operator, so the
    ## first failing step determines the exit status).
    #import re

    ## Derive link names from the dataset names. Every character that is not a
    ## word character, '-', '_' or '.' is replaced by '_', so the names are safe
    ## to use as file names inside single quotes.
    #set $queryName=re.sub('[^\w\-_.]', '_', str($query.name))
    #set $dbName=re.sub('[^\w\-_.]', '_', str($db.name))

    ## If both datasets end up with the same link name, the second "ln -s" would
    ## fail with "File exists". Only in that case the reference link is renamed;
    ## otherwise the names are left exactly as derived above.
    #if $queryName == $dbName:
      #set $dbName='db_' + $dbName
    #end if

    ## Dataset paths are single-quoted so that unusual characters in the path
    ## cannot be interpreted by the shell.
    ln -s '$query' '$queryName' &&
    ln -s '$db' '$dbName' &&
    CHROMEISTER
        -query '$queryName'
        -db '$dbName'
        -dimension $dimension
        -kmer $kmer
        -diffuse $diffuse
        -out '$queryName'-'$dbName'.mat &&

    ## The boolean parameter "grid" renders as "grid" when checked and as an
    ## empty string otherwise; it selects which scoring script is run.
    #if str($grid) == "grid":
      compute_score.R
    #else:
      compute_score-nogrid.R
    #end if
    '$queryName'-'$dbName'.mat $dimension > '$output_score' &&

    ## Move the result files onto the Galaxy output datasets.
    mv '$queryName'-'$dbName'.mat '$output' &&
    mv '$queryName'-'$dbName'.mat.filt.png '$output_imagen' &&
    mv '$queryName'-'$dbName'.mat.events.txt '$output_events' &&
    mv '$queryName'-'$dbName'.mat.csv '$output_csv'
    ]]></command>
    <inputs>
        <!-- The two FASTA datasets to compare; the command links them into the
             job directory and passes them as -query and -db. -->
        <param name="query"
               type="data"
               format="fasta"
               label="Query sequence"
               help="Query sequence file in fasta format"/>
        <param name="db"
               type="data"
               format="fasta"
               label="Reference sequence"
               help="Reference sequence file in fasta format"/>

        <!-- Passed as -dimension and also handed to the scoring script as its second argument. -->
        <param name="dimension"
               type="integer"
               value="1000"
               min="500"
               max="2000"
               label="Output dotplot size"
               help="Use around 1000 for chromosome-sized sequences and around 2000 for complete genomes"/>

        <!-- Passed as -kmer; only the two listed seed sizes can be chosen. -->
        <param name="kmer"
               type="select"
               label="K-mer seed size"
               help="Use 32 as default, and 16 in case no similarities are found">
            <option value="32" selected="true">32</option>
            <option value="16">16</option>
        </param>

        <!-- Passed as -diffuse. -->
        <param name="diffuse"
               type="integer"
               value="4"
               min="1"
               max="4"
               label="Diffuse value"
               help="Level of the heuristic subsampling employed. Change to 1 or 2 if no similarity is found"/>

        <!-- Checked: the command runs compute_score.R; unchecked: compute_score-nogrid.R. -->
        <param name="grid"
               type="boolean"
               truevalue="grid"
               falsevalue=""
               checked="true"
               label="Add grid to plot for multi-fasta data sets"
               help="Do not use grid if your multi-fasta contains more than a hundred sequences (approximately)"/>
    </inputs>
    <outputs>
        <!-- All five datasets are filled by the command section: four by moving a result
             file onto the dataset, one by redirecting the scoring script's standard output. -->

        <!-- Receives the .mat file that CHROMEISTER writes through its -out option. -->
        <data name="output"
              format="txt"
              label="${tool.name} on ${on_string}: Comparison matrix"/>

        <!-- Receives the .mat.filt.png file. -->
        <data name="output_imagen"
              format="png"
              label="${tool.name} on ${on_string}: Comparison dotplot"/>

        <!-- Receives the .mat.csv file. -->
        <data name="output_csv"
              format="csv"
              label="${tool.name} on ${on_string}: Comparison metainformation"/>

        <!-- Receives the .mat.events.txt file. -->
        <data name="output_events"
              format="txt"
              label="${tool.name} on ${on_string}: Detected events"/>

        <!-- Receives the standard output of the compute_score script. -->
        <data name="output_score"
              format="txt"
              label="${tool.name} on ${on_string}: Comparison score"/>
    </outputs>
    <tests>
        <!-- Test 1: kmer, diffuse and grid are left at their defaults (32, 4, grid on) while
             dimension is lowered from its default of 1000 to 500. The four text outputs are
             compared against the expected files named in the file attributes, the PNG is compared
             by approximate size only (sim_size), and the rendered command line is checked for the
             expected option values and for the grid variant of the scoring script. -->
        <test expect_num_outputs="5">
            <param name="query" value="mycoplasma-232.fasta"/>
            <param name="db" value="mycoplasma-7422.fasta"/>
            <param name="dimension" value="500"/>
            <output name="output" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.mat"/>
            <output name="output_imagen" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.mat.filt.png" compare="sim_size"/>
            <output name="output_csv" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.mat.csv"/>
            <output name="output_events" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.mat.events.txt"/>
            <output name="output_score" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.scr.txt"/>
            <assert_command>
                <has_text text="-dimension 500"/>
                <has_text text="-kmer 32"/>
                <has_text text="-diffuse 4"/>
                <has_text text="compute_score.R"/>
            </assert_command>
        </test>
        <!-- Test 2: kmer (16), diffuse (3) and grid (switched off via the empty value) are set to
             non-default values while dimension stays at its default of 1000. Apart from the CSV,
             which is compared against an expected file, the outputs are checked with content
             assertions instead of full-file comparison. The rendered command line must contain
             the chosen option values and call the no-grid variant of the scoring script. -->
        <test expect_num_outputs="5">
            <param name="query" value="mycoplasma-232.fasta"/>
            <param name="db" value="mycoplasma-7422.fasta"/>
            <param name="kmer" value="16"/>
            <param name="diffuse" value="3"/>
            <param name="grid" value=""/>
            <output name="output">
                <assert_contents>
                    <has_text text="892758"/>
                    <has_text text="898495"/>
                    <has_n_lines n="1002"/>
                </assert_contents>
            </output>
            <output name="output_imagen">
                <assert_contents>
                    <has_text text="PNG"/>
                </assert_contents>
            </output>
            <output name="output_csv" file="mycoplasma-232.fasta-mycoplasma-7422.fasta.mat.csv"/>
            <output name="output_events">
                <assert_contents>
                    <has_text text="892758,898495"/>
                    <has_text text="x1,y1,x2,y2,len,event"/>
                    <has_text text="inverted transposition"/>
                </assert_contents>
            </output>
            <output name="output_score">
                <assert_contents>
                    <has_text text="0.057"/>
                </assert_contents>
            </output>
            <assert_command>
                <has_text text="-dimension 1000"/>
                <has_text text="-kmer 16"/>
                <has_text text="-diffuse 3"/>
                <has_text text="compute_score-nogrid.R"/>
            </assert_command>
        </test>
    </tests>
    <help>
*CHROMEISTER*

CHROMEISTER is a heuristic approach for the ultra-fast previsualization of pairwise genome comparisons. It is able to compare enormous genomes (up to 30 thousand million base pairs, or 10 times the size of the human genome) much faster than other methods while yielding significant, reusable and exploitable information such as synteny blocks, evolutionary events or pairwise genome similarity metrics.

What is CHROMEISTER good for?

It is especially suitable for obtaining a fast visualization of a pairwise genome comparison. Due to its unique-seeds filtering, it is particularly useful for inspecting noisy, repeat-rich genome comparisons. Additionally, since it outputs a scoring metric for each comparison, it can be used for massive all-vs-all comparisons that are processed automatically based on that metric.

What is CHROMEISTER NOT good for?

It should NOT be used to obtain alignments. CHROMEISTER by itself does not produce a set of alignments (although this can be done using the GECKO pipeline; see the GitHub repository). It should also NOT be used to study DNA repeats, since CHROMEISTER filters these out as part of its main signal detection process.

*How to use*

To use Chromeister, upload two FASTA data sets and select them as "Query sequence" and "Reference sequence". Then choose the parameters that best suit your comparison:

**Input parameters**

- Output dotplot size (dimension): This parameter sets the resolution of the comparison. A higher resolution is recommended for large genomes (e.g. use 2000 for more than 3 Gbp), and a lower resolution (e.g. use 1000 for everything else) should be used for comparisons involving chromosomes or partial genomes. A value of 1000 will produce a 1,000 x 1,000 output dotplot png image.
- K-mer seed size: This parameter is the seed size used to find unique hits. The recommended value is 32 for all sequences except for small comparisons such as bacterial genomes, where 16 is recommended.
- Diffuse value: This parameter determines the level of heuristic subsampling employed. A level of 1 will use perfect indexing (no subsampling). The recommended level is 4, which represents a good trade-off between exact and inexact hits. Only use 1 if no similarity is found.

**Output data sets**

- Comparison matrix (plain text), i.e. a scaled matrix containing the number of unique and inexact hits per resolution cell.
- A .png dotplot of the comparison with an automatic scoring distance (useful for classifying) and a grid (if enabled) separating the different sequences (chromosomes for instance) compared.
- A .csv file including the coordinates of each sequence/chromosome contained within the query and reference sequences (useful for multi-fasta inputs).
- Events file. A text file where each row is a "Computational Synteny Block". These events are Large-Scale Genome Rearrangements that are heuristically determined and classified (synteny block, transposition, inversion, ...). This is only an informative labelling that considers coordinates alone and uses neither genes nor other external information.
    </help>

    <citations>
        <!-- Publication to cite for this tool, referenced by its DOI. -->
        <citation type="doi">10.1038/s41598-019-46773-w</citation>
    </citations>
</tool>
author	iuc
date	Fri, 02 Oct 2020 18:23:26 +0000
parents	80f64f829785
children	7e2fe094981e