view sina.xml @ 0:a21965c8bcf1 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sina commit 776b362db87d508ca44983eae02dcd4f119b95e8"
author iuc
date Wed, 23 Oct 2019 18:28:17 -0400
parents
children 49b937f12aa4
line wrap: on
line source

<tool id="sina" name="SINA" version="@TOOL_VERSION@+@WRAPPER_VERSION@">
    <!-- Galaxy tool wrapper for SINA. The @TOOL_VERSION@ and @WRAPPER_VERSION@ tokens
         and the "requirements" macro are not defined in this file; presumably they
         are provided by the imported macros.xml (verify there). -->
    <description>reference based multiple sequence alignment</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- No stdio rules are declared; error detection is configured on the command
         element via its detect_errors attribute. -->
    <stdio></stdio>
    <command detect_errors="exit_code"><![CDATA[
        ## Stage the reference library as ./reference.arb.
        ## A custom FASTA library is converted to ARB format by SINA itself
        ## (see --prealigned); any other custom dataset is assumed to be an
        ## ARB file already and is only linked. A locally cached library is
        ## linked from the path stored in the data table.

        #if $db.select == 'custom'
            #if $db.custom.ext == 'fasta'
                sina -i '$db.custom' --prealigned -o ./reference.arb &&
            #else
                ln -s '$db.custom' ./reference.arb &&
            #end if
        #elif $db.select == 'local'
            ln -s '$db.arb_databases.fields.path' ./reference.arb &&
        #end if

        ## Run the alignment. GALAXY_SLOTS is expanded by the shell at run
        ## time (hence the escaped dollar sign); 4 is the fallback value.
        sina
            --in='$in'
            --db=./reference.arb
            -p \${GALAXY_SLOTS:-4}
            --num-pts=\${GALAXY_SLOTS:-4}

            ## Output type. 'fasta' and 'fasta_meta' both write FASTA and share
            ## the identity filter; 'fasta_meta' additionally requests the
            ## meta information as CSV.
            #if $os.type == 'arb'
                --outtype='arb'
                --out=output.arb
            #else
                --outtype='fasta'
                --out=output.fasta
                --min-idty=$os.minidty
                #if $os.type == 'fasta_meta'
                    --meta-fmt='csv'
                #end if
            #end if

            ## Optional log file; the path is quoted like all other paths.
            #if $log == 'yes' or $log == 'yes_meta'
                --log-file='$logfile'
                #if $log == 'yes_meta'
                    --show-diff
                #end if
            #end if

            ## Advanced alignment parameters
            --overhang=$ap.overhang
            --insertion=$ap.insertion
            #if $ap.turn == 'yes'
                -t
            #elif $ap.turn == 'all'
                -t all
            #end if

            ## Expert Alignment Parameters
            --fs-min=$eap.fsmin
            --fs-max=$eap.fsmax
            --fs-msc=$eap.fsmsc
            --fs-weight=$eap.fsweight
            --fs-req=$eap.fsreq
            --fs-req-full=$eap.fsreqfull
            --fs-full-len=$eap.fsfulllen
            --gene-start=$eap.genestart
            --gene-end=$eap.geneend
            --fs-cover-gene=$eap.fscovergene
            --match-score=$eap.matchscore
            --mismatch-score=$eap.mismatchscore
            --pen-gap=$eap.pengap
            --pen-gapext=$eap.pengapext
            --fs-kmer-len=$eap.fskmerlen
            --fs-kmer-mm=$eap.fskmermm
            $eap.fskmernofast
            $eap.fskmernorel

            ## Advanced search and classification parameters
            #if $asacp.activate == 'yes'
                --search
                --search-kmer-candidates=$asacp.searchkmercandidates
                --lca-quorum=$asacp.lcaquorum
                --search-kmer-len=$asacp.searchkmerlen
                --search-kmer-mm=$asacp.searchkmermm
                --search-max-result=$asacp.searchmaxresult
                $asacp.searchnofast
                $asacp.searchkmernorel
            #end if

            ## Convert the CSV meta file to a tab-separated file
            ## (plain comma replacement, exactly as before).
            #if $os.type == 'fasta_meta'
                && sed 's/,/\t/g' output.csv > output.tsv
            #end if
    ]]></command>
    <!-- Sections and default parameters are based on https://www.arb-silva.de/aligner -->
    <inputs>
        <!-- Query sequence file handed to the sina call as its input -->
        <param argument="--in" type="data" format="fasta" label="Sequence file" help="FASTA file format"/>
        <!-- Source of the reference library: a dataset from the history ("custom")
             or an entry of the sina_references data table ("local"). -->
        <conditional name="db">
            <param name="select" type="select" label="Reference library type">
                <option value="custom" selected="true">Custom</option>
                <option value="local">Local cached</option>
            </param>
            <when value="custom">
                <!-- Accepts any datatype; the command template converts FASTA
                     datasets and links everything else as an ARB file. -->
                <param name="custom" type="data" format="data" label="Reference library file" help="FASTA or ARB file format (--db)"/>
            </when>
            <when value="local">
                <param name="arb_databases" type="select" label="Local cached libraries">
                    <options from_data_table="sina_references">
                        <validator type="no_options" message="No database is available"/>
                    </options>
                </param>
            </when>
        </conditional>
        <!-- Search and classification stage; its parameters are only offered (and only
             passed to sina) when the stage is enabled. -->
        <conditional name="asacp">
            <param name="activate" type="select" label="Enable search stage" help="(--search)">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes">
                <param name="searchkmercandidates" type="integer" value="1000" label="Set number of candidate reference sequences retrieved from the k-mer based search." help="For each candidate, the MSA based similarity is calculated and the search result is based on these numbers. A value for n one or two orders larger than --search-max-result is usually quite sufficient. (--search-kmer-candidates)"/>
                <!-- The help gives a worked example only; the default offered here is 0.8. -->
                <param name="lcaquorum" type="float" value="0.8" min="0.0" max="1.0" label="Fraction of the search result that must share the same classification" help="For example, with --search-max-result=10 and --lca-quorum=0.7, the deepest classification shared by 7 out of the top 10 search results is chosen for the query sequence. (--lca-quorum)"/>
                <param name="searchkmerlen" type="integer" value="10" min="0" label="Set k for the kmer based candidate search." help="See --fs-kmer-len. (--search-kmer-len)"/>
                <param name="searchkmermm" type="integer" value="0" min="0" label="Set the number of allowed mismatches within each kmer." help="See --fs-kmer-mm. (--search-kmer-mm)"/>
                <param name="searchnofast" type="boolean" truevalue="--search-no-fast" falsevalue="" checked="true" label="Skip fast family search." help="This option configures the same behavior for the search stage. See --fs-kmer-no-fast. (--search-no-fast)"/>
                <param name="searchkmernorel" type="boolean" truevalue="--search-kmer-norel" falsevalue="" checked="false" label="Configures the candidate search to use absolute rather than length-relative scores for ordering the results." help="See --fs-kmer-norel. (--search-kmer-norel)"/>
                <param name="searchmaxresult" type="integer" value="10" min="0" label="The maximum number of search results to return for each query sequence." help="(--search-max-result)"/>
            </when>
        </conditional>
        <!-- Output selection. The identity threshold is offered for both FASTA variants
             and not for ARB output, so it is declared once per FASTA branch. -->
        <conditional name="os">
            <param name="type" type="select" label="Output file type" help="(--outtype)">
                <option value="fasta">FASTA</option>
                <option value="fasta_meta">FASTA with meta-file</option>
                <option value="arb">ARB</option>
            </param>
            <when value="arb"/>
            <when value="fasta">
                <param name="minidty" type="float" min="0.0" max="1.0" value="0.7" label="Exclude sequences sharing less than this value fractional identity with any of the alignment reference sequences from the output." help="(--min-idty)"/>
            </when>
            <when value="fasta_meta">
                <param name="minidty" type="float" min="0.0" max="1.0" value="0.7" label="Exclude sequences sharing less than this value fractional identity with any of the alignment reference sequences from the output." help="(--min-idty)"/>
            </when>
        </conditional>
        <!-- Log file switch: "yes" and "yes_meta" both make the command template request a
             log file, "yes_meta" additionally adds the show-diff option; the logfile
             output is filtered on the same values. -->
        <param name="log" type="select" label="Create log file" help="(--log-file, --show-diff)">
            <option value="no">no</option>
            <option value="yes">yes</option>
            <option value="yes_meta">yes with differences between the inferred alignment and the original alignment</option>
        </param>
        <!-- Basic alignment options; each select value is passed to sina verbatim, except
             "turn", which the command template maps to the turn flag (or omits for "no"). -->
        <section name="ap" title="Alignment parameters">
            <param argument="--overhang" type="select" label="Bases remaining unaligned at the ends should be">
                <option value="attach">attached to the last aligned base.</option>
                <option value="edge">moved to the edge of the alignment.</option>
                <!-- maybe broken, see https://sina.readthedocs.io/en/latest/commandline.html#alignment-options -->
                <option value="remove">removed.</option>
            </param>
            <param argument="--insertion" type="select" label="Insertions wider than available columns should be">
                <option value="forbid">forbidden during alignment.</option>
                <option value="shift">accommodated by pushing out surrounding bases.</option>
                <option value="remove">removed.</option>
            </param>
            <param argument="--turn" type="select" label="Enable turn check stage" help="Sequences not oriented in accordance with the reference database will be reverse complemented as needed. If &quot;All&quot; is specified, sequences will also be tested for only reversal or only complemented (this should only be necessary if your data was mishandled).">
                <option value="yes">Yes</option>
                <option value="no">No</option>
                <option value="all">All</option>
            </param>
        </section>
        <!-- Expert options for the reference (family) selection and the alignment scoring.
             Every value is always passed to sina; the two booleans expand to their flag
             when checked and to nothing otherwise. -->
        <section name="eap" title="Expert alignment parameters">
            <param name="fsmin" type="integer" value="15" min="0" label="Minimum number of reference sequences used for each query" help="(--fs-min)"/>
            <param name="fsmax" type="integer" value="40" min="0" label="Maximum number of reference sequences used for each query" help="(--fs-max)"/>
            <param name="fsmsc" type="float" value="0.7" min="0.0" max="1.0" label="Minimum similarity reference sequences are required to have with the query sequence" help="This affects the range between --fs-min and --fs-max. (--fs-msc)"/>
            <param name="fsweight" type="integer" value="1" min="0" label="Adjust the weight factor for the frequency at which a node was observed in the reference alignment." help="Use 0 to disable weighting. This feature prefers the more common placement for bases with inconsistent alignment in the reference database. (--fs-weight)"/>
            <param name="fsreq" type="integer" value="1" min="0" label="Minimum number of reference sequences that must be found in order to attempt alignment" help="If fewer sequences than indicated here are found, the respective query sequence will be discarded. (--fs-req)"/>
            <param name="fsreqfull" type="integer" value="1" min="0" label="Minimum number of full length reference sequences that must be included in the selected reference set" help="The search will proceed regardless of other settings until this setting has been satisfied. If it cannot be satisfied by any sequence in the reference database, the query sequence will be discarded. This setting exists to ensure that the entire length of the query sequence will be covered in the presence of partial sequences contained within your reference database. Note: If you are working with sequences other than 16S, you need to adjust this value or the value of --fs-full-len accordingly. In particular when working with short reference sequences, this setting may prevent any acceptable reference sequences from being found, leading to no sequences being aligned. (--fs-req-full)"/>
            <param name="fsfulllen" type="integer" value="1400" min="0" label="Minimum length a sequence is required to have to be considered full length" help="(--fs-full-len)"/>
            <param name="genestart" type="integer" value="0" min="0" label="Sets the beginning of the gene within the reference alignment." help="See --fs-cover-gene. (--gene-start)"/>
            <param name="geneend" type="integer" value="0" min="0" label="Sets the end of the gene within the reference alignment." help="See --fs-cover-gene. (--gene-end)"/>
            <param name="fscovergene" type="integer" value="0" label="Require a total of n sequences to cover each of the beginning and the end of the gene within the alignment." help="Similar to --fs-req-full. This option is more precise than --fs-req-full, but requires that the column numbers for the range in which the full gene is expected be specified via --gene-start and --gene-end. (--fs-cover-gene)"/>
            <param name="matchscore" type="integer" value="2" min="0" label="Score given for a match" help="(--match-score)"/>
            <param name="mismatchscore" type="integer" value="-1" max="0" label="Score given for a mismatch" help="(--mismatch-score)"/>
            <param name="pengap" type="integer" value="5" min="0" label="Set the penalty subtracted from the score for opening a gap." help="(--pen-gap)"/>
            <param name="pengapext" type="integer" value="2" min="0" label="Set the penalty subtracted from the score for extending a gap." help="(--pen-gapext)"/>
            <param name="fskmerlen" type="integer" value="10" min="0" label="Size of k for the reference search" help="For SSU rRNA sequences, the default of 10 is a good value. For different sequence types, different values may perform better. For 5S, for example, 6 has shown to be more effective. (--fs-kmer-len)"/>
            <param name="fskmermm" type="integer" value="0" min="0" label="Allow k-mer matches in the reference database to contain this number of mismatches." help="(--fs-kmer-mm)"/>
            <param name="fskmernofast" type="boolean" truevalue="--fs-kmer-no-fast" falsevalue="" checked="false" label="Use all k-mers occurring in the query sequence in the search." help="By default, only k-mers starting with an A are used for extra performance. (--fs-kmer-no-fast)"/>
            <param name="fskmernorel" type="boolean" truevalue="--fs-kmer-norel" falsevalue="" checked="false" label="Use absolute match scores in the kmer search." help="Absolute (number of shared k-mers) rather than relative (number of shared k-mers divided by length of reference sequence) (--fs-kmer-norel)"/>
        </section>
    </inputs>
    <!-- Each output is filtered on the same parameter values that make the command
         template produce the corresponding file. -->
    <outputs>
        <data name="output_fasta" format="fasta" from_work_dir="output.fasta" label="${tool.name} on ${on_string}: alignment">
            <filter>os['type'] == 'fasta' or os['type'] == 'fasta_meta'</filter>
        </data>
        <data name="output_arb" format="data" from_work_dir="output.arb" label="${tool.name} on ${on_string}: alignment">
            <filter>os['type'] == 'arb'</filter>
        </data>
        <!-- output.tsv is the tab-separated conversion of the CSV meta file, so the
             dataset is declared tabular rather than csv. -->
        <data name="output_meta" format="tabular" from_work_dir="output.tsv" label="${tool.name} on ${on_string}: meta information">
            <filter>os['type'] == 'fasta_meta'</filter>
        </data>
        <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log">
            <filter>log == 'yes' or log == 'yes_meta'</filter>
        </data>
    </outputs>
    <tests>
        <!-- name pattern of outputfile: output_<in_filetype>_<db_filetype>.<os.type>" -->
        <!-- #1 in: *.fasta; db: *.fasta; out: *.fasta; standard parameters -->
        <!-- FASTA reference, FASTA output with meta file: two datasets are expected.
             "os" is a conditional in the inputs, so it is addressed as one here. -->
        <test expect_num_outputs="2">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.fasta"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="fasta_meta"/>
                <param name="minidty" value="0.0"/>
            </conditional>
            <output name="output_meta">
                <assert_contents>
                    <has_text text="FJ203641.1"/>
                    <has_n_lines n="18"/>
                </assert_contents>
            </output>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #2 in: *.fasta; db: *.fasta; out: *.arb; standard parameters -->
        <!-- FASTA reference, ARB output: a single dataset is expected.
             "os" is a conditional in the inputs, so it is addressed as one here. -->
        <test expect_num_outputs="1">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.fasta"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="arb"/>
            </conditional>
            <output name="output_arb" file="output_fasta_fasta.arb" compare="sim_size" delta="100"/>
        </test>
        <!-- #3 in: *.fasta; db: *.arb; out: *.fasta; standard parameters -->
        <!-- ARB reference, FASTA output with meta file: two datasets are expected.
             "os" is a conditional in the inputs, so it is addressed as one here. -->
        <test expect_num_outputs="2">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="fasta_meta"/>
                <param name="minidty" value="0.0"/>
            </conditional>
            <output name="output_meta">
                <assert_contents>
                    <has_text text="FJ203641.1"/>
                    <has_n_lines n="18"/>
                </assert_contents>
            </output>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #4 in: *.fasta; db: *.arb; out: *.arb; standard parameters -->
        <!-- ARB reference, ARB output: a single dataset is expected.
             "os" is a conditional in the inputs, so it is addressed as one here. -->
        <test expect_num_outputs="1">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="arb"/>
            </conditional>
            <output name="output_arb" file="output_fasta_arb.arb" compare="sim_size" delta="1000"/>
        </test>
        <!-- #5 in: *.fasta; db: *.arb; out: *.fasta; custom parameters -->
        <!-- Non-default values for every parameter group. "os" and "asacp" are
             conditionals in the inputs and are addressed as such. The search stage is
             switched on explicitly, because its parameters only reach the command line
             in the "yes" branch. Boolean parameters are set with true/false rather than
             with their truevalue strings.
             NOTE(review): this makes the test run the search stage and the k-mer flags
             for real; re-run the test and confirm the assertions still hold. -->
        <test expect_num_outputs="1">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="custom" value="reference.arb"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="fasta"/>
                <param name="minidty" value="0.0"/>
            </conditional>
            <section name="ap">
                <param name="overhang" value="edge"/>
                <param name="insertion" value="shift"/>
                <param name="turn" value="all"/>
            </section>
            <section name="eap">
                <param name="fsmin" value="14"/>
                <param name="fsmax" value="41"/>
                <param name="fsmsc" value="0.8"/>
                <param name="fsweight" value="1"/>
                <param name="fsreq" value="1"/>
                <param name="fsreqfull" value="1"/>
                <param name="fsfulllen" value="1399"/>
                <param name="genestart" value="0"/>
                <param name="geneend" value="0"/>
                <param name="fscovergene" value="0"/>
                <param name="matchscore" value="1"/>
                <param name="mismatchscore" value="-2"/>
                <param name="pengap" value="6"/>
                <param name="pengapext" value="1"/>
                <param name="fskmerlen" value="9"/>
                <param name="fskmermm" value="1"/>
                <param name="fskmernofast" value="true"/>
                <param name="fskmernorel" value="false"/>
            </section>
            <conditional name="asacp">
                <param name="activate" value="yes"/>
                <param name="searchkmercandidates" value="1001"/>
                <param name="lcaquorum" value="0.9"/>
                <param name="searchkmerlen" value="9"/>
                <param name="searchkmermm" value="1"/>
                <param name="searchnofast" value="false"/>
                <param name="searchkmernorel" value="true"/>
                <param name="searchmaxresult" value="10"/>
            </conditional>
            <output name="output_fasta">
                <assert_contents>
                    <has_line line=">FJ203641.1"/>
                    <has_line line=">WH051F03"/>
                    <has_n_lines n="34"/>
                </assert_contents>
            </output>
        </test>
        <!-- #6 in: *.fasta; db: <arb_databases>.arb; out: *.fasta; standard parameters -->
        <!-- Reference taken from the sina_references data table; the "testarb" entry
             must be provided by the test data table configuration.
             "os" is a conditional in the inputs, so it is addressed as one here. -->
        <test expect_num_outputs="1">
            <param name="in" value="sequence.fasta"/>
            <conditional name="db">
                <param name="select" value="local"/>
                <param name="arb_databases" value="testarb"/>
            </conditional>
            <conditional name="os">
                <param name="type" value="arb"/>
            </conditional>
            <output name="output_arb" file="output_fasta_arb.arb" compare="sim_size" delta="1000"/>
        </test>
    </tests>

    <help><![CDATA[
.. class:: infomark

**What it does**

SINA aligns nucleotide sequences to match a pre-existing MSA using a graph-based alignment algorithm similar to PoA. The graph approach allows SINA to incorporate information from many reference sequences without blurring highly variable regions. While pure NAST implementations are highly dependent on finding a good match in the reference database, SINA is able to align sequences relatively distant to references with good quality and will yield a robust result for query sequences with many close references.

While adding sequences to an MSA with SINA is usually faster than re-computing the entire MSA from an augmented set of unaligned sequences, the primary benefit lies in protecting investments made into the original MSA such as manual curation of the alignment, compute-intensive phylogenetic tree reconstruction and taxonomic annotation of the resulting phylogeny. Additionally, SINA includes a homology search which uses the previously computed alignment to determine the most similar sequences.  Based on the search results, a LCA-based classification of the query sequence can be computed using taxonomic classifications assigned to the sequences comprising the reference MSA.

SINA is used to compute the large and small subunit ribosomal RNA alignments provided by the `SILVA project <https://www.arb-silva.de/>`_ and is able to use the `ARB format reference databases <https://www.arb-silva.de/download/arb-files/>`_ released by the project.

**Input**

SINA requires the query sequences in FASTA format. Reference libraries can be provided as FASTA or ARB files; in addition, reference databases can be made available on the server through a data table. See README.md for more information.

**Output**

Alignments are returned in FASTA or ARB format. The optional meta information is written by SINA as CSV and converted to a tab-separated table before it is returned.

.. class:: infomark

**References**

More information can be found on the `project website <https://sina.readthedocs.io/en/latest>`_, and on `github <https://github.com/epruesse/SINA>`_.  An `online version <https://www.arb-silva.de/aligner>`_ of SINA is provided by the SILVA project.
    ]]></help>
    <!-- Publication to cite when using this tool, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts252</citation>
    </citations>
</tool>