view sina.xml @ 5:56d7dc39eec8 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sina commit 2a3f764a5f93799ac60689cfe29601604bb68435"
author iuc
date Fri, 01 Jan 2021 22:59:04 +0000
parents 482052b30b78
children
line wrap: on
line source

<tool id="sina" name="SINA" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="18.01">
    <description>reference based multiple sequence alignment</description>
    <!-- Version tokens: both are substituted into the tool's version attribute; @TOOL_VERSION@ also pins the package requirement below. -->
    <macros>
        <token name="@TOOL_VERSION@">1.7.2</token>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- Package dependency, pinned to the same @TOOL_VERSION@ token as the tool version. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">sina</requirement>
    </requirements>
    <!-- Command used to report the version of the installed sina executable. -->
    <version_command>sina --version</version_command>
    <!-- Cheetah template: stages the reference library as 'reference.arb', runs sina with the selected options, then converts the CSV metadata into a tabular file. Failures are detected via the exit code. -->
    <command detect_errors="exit_code"><![CDATA[
## initialize
## Make the selected reference library available as 'reference.arb' in the job working directory.
#if $db.select == 'custom'
    ## parse custom reference from fasta to arb file format
    #if $db.custom.ext == 'fasta'
        sina -i '$db.custom' --prealigned -o 'reference.arb' &&
    #else
        ## any non-fasta dataset is used as-is (assumed to be an ARB file)
        ln -s '$db.custom' 'reference.arb' &&
    #end if
#elif $db.select == 'local'
    ## path column of the selected data table entry
    ln -s '$db.arb_databases.fields.path' 'reference.arb' &&
#end if

## run
sina
--in='$in'
--db='reference.arb'
--threads=\${GALAXY_SLOTS:-4}
--num-pts=\${GALAXY_SLOTS:-4}
--show-conf
## Output options
## One --out per selected format; the file names match the from_work_dir attributes of the outputs.
#if 'arb' in $oo.out
    --out='output.arb'
#end if
#if 'fasta' in $oo.out
    --out='output.fasta'
    --min-idty=$oo.minidty
#end if
#if 'tsv' in $oo.out
    --out='output.csv' ## parsed into tabular in postprocessing step
#end if
#if 'log' in $oo.out
    ## The log output is named 'out_log' and has no from_work_dir, so sina writes directly into the dataset.
    --log-file='$out_log'
    --show-diff
    --show-dist
#end if
## Advanced alignment parameters
--overhang=$ap.overhang
--insertion=$ap.insertion
## 'no' adds nothing to the command line
#if $ap.turn == 'yes'
    -t
#elif $ap.turn == 'all'
    -t all
#end if
## Expert alignment parameters
--fs-min=$eap.fsmin
--fs-max=$eap.fsmax
--fs-msc=$eap.fsmsc
--fs-weight=$eap.fsweight
--fs-req=$eap.fsreq
--fs-req-full=$eap.fsreqfull
--fs-full-len=$eap.fsfulllen
--gene-start=$eap.genestart
--gene-end=$eap.geneend
--fs-cover-gene=$eap.fscovergene
--match-score=$eap.matchscore
--mismatch-score=$eap.mismatchscore
--pen-gap=$eap.pengap
--pen-gapext=$eap.pengapext
--fs-kmer-len=$eap.fskmerlen
--fs-kmer-mm=$eap.fskmermm
## boolean parameters expand to their flag or to an empty string
$eap.fskmernofast
$eap.fskmernorel
## Advanced search and classification parameters
#if $asacp.activate == 'yes'
    --search
    --search-kmer-candidates=$asacp.searchkmercandidates
    --lca-quorum=$asacp.lcaquorum
    --search-kmer-len=$asacp.searchkmerlen
    --search-kmer-mm=$asacp.searchkmermm
    --search-max-result=$asacp.searchmaxresult
    $asacp.searchnofast
    $asacp.searchkmernorel
#end if

## postprocessing
## convert csv to tabular
## The conversion is wrapped in if/else instead of 'a && b || c': with the latter form the shell
## runs the fallback echo (exit status 0) whenever sina itself fails, which hides the failure
## from the exit code based error detection.
#if 'tsv' in $oo.out
    && if test -f 'output.csv'; then sed 's/,/\t/g' 'output.csv' > 'output.tsv'; else echo 'No additional meta data file.'; fi
#end if
    ]]></command>
    <inputs>
        <!-- Included parameters, sections and default values are based on https://www.arb-silva.de/aligner -->
        <!-- Query sequences to align (FASTA). -->
        <param argument="--in" type="data" format="fasta" label="Select sequence file"/>
        <!-- Reference library: either a dataset from the history ("custom") or an entry of the sina_references data table ("local"). -->
        <conditional name="db">
            <param name="select" type="select" label="Select reference library type">
                <option value="custom" selected="true">Custom</option>
                <option value="local">Local cached</option>
            </param>
            <when value="custom">
                <!-- Any datatype is accepted; the command converts datasets with extension 'fasta' and symlinks everything else as 'reference.arb'. -->
                <param name="custom" type="data" format="data" label="Select reference library file" help="FASTA or ARB file format (--db)"/>
            </when>
            <when value="local">
                <!-- The command reads the 'path' field of the selected data table entry. -->
                <param name="arb_databases" type="select" label="Select local cached library">
                    <options from_data_table="sina_references">
                        <validator message="No database is available" type="no_options"/>
                    </options>
                </param>
            </when>
        </conditional>
        <!-- Optional search and classification stage; the parameters of the "yes" branch are only passed to sina when it is activated. -->
        <conditional name="asacp">
            <param name="activate" type="select" label="Enable search stage?" help="(--search)">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"></when>
            <when value="yes">
                <param name="searchkmercandidates" type="integer" value="1000" label="Set number of candidate reference sequences retrieved from the k-mer based search" help="For each candidate, the MSA based similarity is calculated and the search result based on these numbers. A value for n one or two orders larger than --search-max-result is usually quite sufficient. (--search-kmer-candidates)"/>
                <!-- NOTE(review): the help text quotes a quorum of 0.7 while the value preset here is 0.8; confirm which one is intended. -->
                <param name="lcaquorum" type="float" value="0.8" min="0.0" max="1.0" label="Set fraction of the search result that must share the same classification" help="Using the default parameters --search-max-result=10 and --lca-quorum=0.7, this means that the deepest classification shared by 7 out of the top 10 search results is chosen for the query sequence. (--lca-quorum)"/>
                <param name="searchkmerlen" type="integer" value="10" min="0" label="Set k for the kmer based candidate search" help="See --fs-kmer-len. (--search-kmer-len)"/>
                <param name="searchkmermm" type="integer" value="0" min="0" label="Set number of allowed mismatches within each kmer" help="See --fs-kmer-mm.  (--search-kmer-mm)"/>
                <!-- Boolean flags: expand to the flag when checked, to an empty string otherwise. -->
                <param name="searchnofast" type="boolean" truevalue="--search-no-fast" falsevalue="" checked="true" label="Skip fast family search?" help="This option configures the same behavior for the search stage. See --fs-kmer-no-fast. (--search-no-fast)"/>
                <param name="searchkmernorel" type="boolean" truevalue="--search-kmer-norel" falsevalue="" checked="false" label="Configures the candidate search to use absolute rather than length-relative scores for ordering the results?" help="See --fs-kmer-norel. ( --search-kmer-norel)"/>
                <param name="searchmaxresult" type="integer" value="10" min="0" label="Set maximum number of search results to return for each query sequence" help="(--search-max-result)"/>
            </when>
        </conditional>
        <!-- Basic alignment options; overhang and insertion are always passed to sina, turn only adds a flag for "yes" and "all". -->
        <section name="ap" title="Alignment parameters">
            <param argument="--overhang" type="select" label="Bases remaining unaligned at the ends should be">
                <option value="attach">attached to the last aligned base</option>
                <option value="edge">moved to the edge of the alignment</option>
                <!-- maybe broken, see https://sina.readthedocs.io/en/latest/commandline.html#alignment-options -->
                <option value="remove">removed</option>
            </param>
            <param argument="--insertion" type="select" label="Insertions wider than available columns should be">
                <option value="forbid">forbidden during alignment</option>
                <option value="shift">accommodated by pushing out surrounding bases</option>
                <option value="remove">removed</option>
            </param>
            <!-- No option is marked as selected, so the first one ("yes") is the preset. -->
            <param argument="--turn" type="select" label="Enable turn check stage?" help="Sequences not oriented in accordance with the reference database will be reverse complemented as needed. If all is specified, sequences will also be tested for only reversal or only complemented (this should only be necessary if your data was mishandled).">
                <option value="yes">Yes</option>
                <option value="no">No</option>
                <option value="all">All</option>
            </param>
        </section>
        <!-- Expert alignment options; every value of this section is always passed to sina. The option each parameter maps to is given in parentheses at the end of its help text. -->
        <section name="eap" title="Expert alignment parameters">
            <param name="fsmin" type="integer" value="15" min="0" label="Set minimum number of reference sequences used for each query" help="(--fs-min)"/>
            <param name="fsmax" type="integer" value="40" min="0" label="Set maximum number of reference sequences used for each query" help="(--fs-max)"/>
            <param name="fsmsc" type="float" value="0.7" min="0.0" max="1.0" label="Set minimum similarity reference sequences are required to have with the query sequence" help="This affects the range between --fs-min and --fs-max. (--fs-msc)"/>
            <param name="fsweight" type="integer" value="1" min="0" label="Set weight factor for the frequency at which a node was observed in the reference alignment" help="Use 0 to disable weighting. This feature prefers the more common placement for bases with inconsistent alignment in the reference database. (--fs-weight)"/>
            <param name="fsreq" type="integer" value="1" min="0" label="Set minimum number of reference sequences that must be found in order to attempt alignment" help="If fewer sequences than indicated here are found, the respective query sequence will be discarded. (--fs-req)"/>
            <param name="fsreqfull" type="integer" value="1" min="0" label="Set minimum number of full length reference sequences that must be included in the selected reference set" help="The search will proceed regardless of other settings until this setting has been satisfied. If it cannot be satisfied by any sequence in the reference database, the query sequence will be discarded. This setting exists to ensure that the entire length of the query sequence will be covered in the presence of partial sequences contained within your reference database. Note: If you are working with sequences other than 16S, you need to adjust this value or the value of --fs-full-len accordingly. In particular when working with short reference sequences, this setting may prevent any acceptable reference sequences from being found, leading to no sequences being aligned. (--fs-req-full)"/>
            <param name="fsfulllen" type="integer" value="1400" min="0" label="Set minimum length a sequence is required to have to be considered full length" help="(--fs-full-len)"/>
            <param name="genestart" type="integer" value="0" min="0" label="Set beginning of the gene within the reference alignment" help="See --fs-cover-gene. (--gene-start)"/>
            <param name="geneend" type="integer" value="0" min="0" label="Set end of the gene within the reference alignment" help="See --fs-cover-gene. (--gene-end)"/>
            <param name="fscovergene" type="integer" value="0" label="Set total of n sequences to cover each the beginning and the end of the gene within the alignment" help="Similar to --fs-req-full.  This option is more precise than --fs-req-full, but requires that the column numbers for the range in which the full gene is expected be specified via --gene-start and --gene-end. (--fs-cover-gene)"/>
            <param name="matchscore" type="integer" value="2" min="0" label="Set score given for a match" help="(--match-score)"/>
            <param name="mismatchscore" type="integer" value="-1" max="0" label="Set score given for a mismatch" help="(--mismatch-score)"/>
            <param name="pengap" type="integer" value="5" min="0" label="Set penalty subtracted from the score for opening a gap" help="(--pen-gap)"/>
            <param name="pengapext" type="integer" value="2" min="0" label="Set penalty subtracted from the score for extending a gap" help="(--pen-gapext)"/>
            <param name="fskmerlen" type="integer" value="10" min="0" label="Set size of k for the reference search" help="For SSU rRNA sequences, the default of 10 is a good value. For different sequence types, different values may perform better. For 5S, for example, 6 has shown to be more effective. (--fs-kmer-len)"/>
            <param name="fskmermm" type="integer" value="0" min="0" label="Set k-mer matches in the reference database to contain this number of mismatches" help="(--fs-kmer-mm)"/>
            <!-- Boolean flags: expand to the flag when checked, to an empty string otherwise. -->
            <param name="fskmernofast" type="boolean" truevalue="--fs-kmer-no-fast" falsevalue="" label="Use all k-mers occurring in the query sequence in the search?" help="By default, only k-mers starting with an A are used for extra performance. (--fs-kmer-no-fast)"/>
            <param name="fskmernorel" type="boolean" truevalue="--fs-kmer-norel" falsevalue="" label="Use absolute match scores in the kmer search?" help="Absolute (number of shared k-mers) rather than relative (number of shared k-mers divided by length of reference sequence) (--fs-kmer-norel)"/>
        </section>
        <!-- Output selection: at least one format has to be chosen; the chosen values drive both the command line and the output filters. -->
        <section name="oo" title="Output options">
            <param name="out" type="select" multiple="true" optional="false" label="Select output file(s)">
                <option value="arb" selected="true">Alignment (ARB)</option>
                <option value="fasta" selected="true">Alignment (FASTA)</option>
                <option value="tsv">Additional meta data (Tabular)</option>
                <option value="log">Log</option>
            </param>
            <!-- Only added to the command line when the FASTA output is selected. -->
            <param name="minidty" type="float" value="0.7" min="0.0" max="1.0" label="Exclude sequences sharing less than this value fractional identity with any of the alignment reference sequences from the output?" help="Only relevant for FASTA output. (--min-idty)"/>
        </section>
    </inputs>
    <!-- Each output is only created when its format is selected in the "oo" section. -->
    <outputs>
        <!-- FASTA, ARB and tabular results are collected from files in the job working directory. -->
        <data name="out_fasta" format="fasta" from_work_dir="output.fasta" label="${tool.name} on ${on_string}: Alignment (FASTA)">
            <filter>'fasta' in oo['out']</filter>
        </data>
        <data name="out_arb" format="data" from_work_dir="output.arb" label="${tool.name} on ${on_string}: Alignment (ARB)">
            <filter>'arb' in oo['out']</filter>
        </data>
        <data name="out_tsv" format="tabular" from_work_dir="output.tsv" label="${tool.name} on ${on_string}: Additional meta data (Tabular)">
            <filter>'tsv' in oo['out']</filter>
        </data>
        <!-- No from_work_dir here: the command has to write to this dataset's path directly. -->
        <data name="out_log" format="txt" label="${tool.name} on ${on_string}: Log">
            <filter>'log' in oo['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- #1 in: *.fasta; db: *.fasta; out: *.fasta and *.tsv; default parameters except minidty, which is set to 0.0 -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.fasta"/>
            </conditional>
            <section name="oo">
                <param name="out" value="fasta,tsv"/>
                <param name="minidty" value="0.0"/>
            </section>
            <output name="out_tsv">
                <assert_contents>
                    <has_text text="FJ203641.1"/>
                    <has_n_lines n="18"/>
                </assert_contents>
            </output>
            <output name="out_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2 in: *.fasta; db: *.fasta; out: *.arb; default parameters -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.fasta"/>
            </conditional>
            <section name="oo">
                <param name="out" value="arb"/>
            </section>
            <!-- The ARB output is only checked by its exact size. -->
            <output name="out_arb">
                <assert_contents>
                    <has_size value="21106"/>
                </assert_contents>
            </output>
        </test>
        <!-- #3 in: *.fasta; db: *.arb; out: *.fasta and *.tsv; default parameters except minidty, which is set to 0.0 -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <section name="oo">
                <param name="out" value="fasta,tsv"/>
                <param name="minidty" value="0.0"/>
            </section>
            <output name="out_tsv">
                <assert_contents>
                    <has_text text="FJ203641.1"/>
                    <has_n_lines n="18"/>
                </assert_contents>
            </output>
            <output name="out_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #4 in: *.fasta; db: *.arb; out: *.arb; default parameters -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <section name="oo">
                <param name="out" value="arb"/>
            </section>
            <!-- The ARB output is only checked by its exact size. -->
            <output name="out_arb">
                <assert_contents>
                    <has_size value="19736"/>
                </assert_contents>
            </output>
        </test>
        <!-- #5 in: *.fasta; db: *.arb; out: *.fasta; custom parameters, search stage enabled -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <section name="oo">
                <param name="out" value="fasta"/>
                <param name="minidty" value="0.0"/>
            </section>
            <section name="ap">
                <param name="overhang" value="edge"/>
                <param name="insertion" value="shift"/>
                <param name="turn" value="all"/>
            </section>
            <section name="eap">
                <param name="fsmin" value="14"/>
                <param name="fsmax" value="41"/>
                <param name="fsmsc" value="0.8"/>
                <param name="fsweight" value="1"/>
                <param name="fsreq" value="1"/>
                <param name="fsreqfull" value="1"/>
                <param name="fsfulllen" value="1399"/>
                <param name="genestart" value="0"/>
                <param name="geneend" value="0"/>
                <param name="fscovergene" value="0"/>
                <param name="matchscore" value="1"/>
                <param name="mismatchscore" value="-2"/>
                <param name="pengap" value="6"/>
                <param name="pengapext" value="1"/>
                <param name="fskmerlen" value="9"/>
                <param name="fskmermm" value="1"/>
                <!-- Boolean test values are given as true/false rather than as the flag text. -->
                <param name="fskmernofast" value="true"/>
                <param name="fskmernorel" value="false"/>
            </section>
            <!-- asacp is a conditional in the inputs: it has to be addressed as one, and the search stage has to be switched on, otherwise the parameters below never reach the command line. -->
            <conditional name="asacp">
                <param name="activate" value="yes"/>
                <param name="searchkmercandidates" value="1001"/>
                <param name="lcaquorum" value="0.9"/>
                <param name="searchkmerlen" value="9"/>
                <param name="searchkmermm" value="1"/>
                <param name="searchnofast" value="false"/>
                <param name="searchkmernorel" value="true"/>
                <param name="searchmaxresult" value="10"/>
            </conditional>
            <output name="out_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #6 in: *.fasta; db: data table entry "testarb"; out: *.arb; default parameters -->
        <test>
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="select" value="local"/>
                <!-- presumably provided by a test data table for sina_references; verify against the repository's test data -->
                <param name="arb_databases" value="testarb"/>
            </conditional>
            <section name="oo">
                <param name="out" value="arb"/>
            </section>
            <!-- The ARB output is only checked by its exact size. -->
            <output name="out_arb">
                <assert_contents>
                    <has_size value="19736"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**What it does**

SINA aligns nucleotide sequences to match a pre-existing MSA using a graph-based alignment algorithm similar to PoA. The graph approach allows SINA to incorporate information from many reference sequences without blurring highly variable regions. While pure NAST implementations are highly dependent on finding a good match in the reference database, SINA is able to align sequences relatively distant to references with good quality and will yield a robust result for query sequences with many close references.

While adding sequences to an MSA with SINA is usually faster than re-computing the entire MSA from an augmented set of unaligned sequences, the primary benefit lies in protecting investments made into the original MSA such as manual curation of the alignment, compute-intensive phylogenetic tree reconstruction and taxonomic annotation of the resulting phylogeny. Additionally, SINA includes a homology search which uses the previously computed alignment to determine the most similar sequences.  Based on the search results, a LCA-based classification of the query sequence can be computed using taxonomic classifications assigned to the sequences comprising the reference MSA.

SINA is used to compute the large and small subunit ribosomal RNA alignments provided by the `SILVA project <https://www.arb-silva.de/>`_ and is able to use the `ARB format reference databases <https://www.arb-silva.de/download/arb-files/>`_ released by the project.

**Input**

SINA requires query sequences in FASTA file format. Reference libraries can be provided as FASTA or ARB files. Furthermore, reference databases can be added as data tables. See README.rst for more information.

**Output**

Results are provided in FASTA and/or ARB file format. Additional metadata can be provided as a tabular text file, and a log file can be requested as well.

.. class:: infomark

**References**

More information can be found on the `project website <https://sina.readthedocs.io/en/latest>`_ and on `GitHub <https://github.com/epruesse/SINA>`_.  An `online version <https://www.arb-silva.de/aligner>`_ of SINA is provided by the SILVA project.
    ]]></help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts252</citation>
    </citations>
</tool>