Mercurial > repos > iuc > nextalign

<tool id="nextalign" name="NextAlign" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.01">
    <!-- One-line summary of the tool's purpose. -->
    <description>Viral genome sequence alignment</description>
    <!--
        Shared definitions are imported from macros.xml (not part of this file).
        @VERSION_SUFFIX@ is defined here and is appended to @TOOL_VERSION@ in the
        tool's version attribute; @TOOL_VERSION@ itself is not defined in this
        file and is presumably supplied by macros.xml (TODO confirm).
    -->
    <macros>
        <import>macros.xml</import>
        <token name="@VERSION_SUFFIX@">0</token>
    </macros>
    <!-- The nextalign package is required at exactly the version named by @TOOL_VERSION@. -->
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">nextalign</requirement>
    </requirements>
    <!-- Command whose output is recorded as the version of the underlying program. -->
    <version_command>nextalign --version-detailed</version_command>
    <!--
        Command template (Cheetah syntax inside CDATA); a non-zero exit code is
        treated as a job failure (detect_errors="exit_code").

        1. @REF_FASTA@ is a token that is not defined in this file (presumably
           it comes from macros.xml). The nextalign call reads reference.fa, so
           the token is assumed to stage the selected reference under that name
           and to end with a command separator. TODO confirm against macros.xml.
        2. The input sequences are symlinked to the fixed name sequences.fasta.
           NOTE(review): the outputs section discovers translation files named
           sequences.gene.NAME.fasta, so that prefix appears to derive from this
           fixed name; confirm before renaming the symlink.
        3. The insertions CSV option is emitted only when output_insertions is
           true, and the genes / genemap options only when translation_select
           is "yes". All remaining numeric options are always passed.
    -->
    <command detect_errors="exit_code"><![CDATA[
@REF_FASTA@
ln -s '$sequences' sequences.fasta &&
nextalign
    --sequences sequences.fasta
    --reference reference.fa
    --output-fasta '$output_fasta'
#if $output_insertions:
    --output-insertions '$output_csv'
#end if
#if $translation.translation_select == "yes":
    --genes '${translation.genes}'
    --genemap '${translation.genemap}'
#end if
    --min-length '${min_length}'
    --penalty-gap-extend '${penalty_gap_extend}'
    --penalty-gap-open '${penalty_gap_open}'
    --penalty-gap-open-in-frame '${penalty_gap_open_in_frame}'
    --penalty-gap-open-out-of-frame '${penalty_gap_open_out_of_frame}'
    --penalty-mismatch '${penalty_mismatch}'
    --score-match '${score_match}'
    --max-indel '${max_indel}'
    --nuc-seed-length '${nuc_seed_length}'
    --nuc-min-seeds '${nuc_min_seeds}'
    --nuc-seed-spacing '${nuc_seed_spacing}'
    --nuc-mismatches-allowed '${nuc_mismatches_allowed}'
    ]]></command>
    <!--
        Tool form parameters. Each "argument" attribute names the nextalign
        command-line option that the command template forwards the value to.
    -->
    <inputs>
        <!-- Reference selection is provided by the "reference" macro (defined outside this file). -->
        <expand macro="reference"/>
        <param argument="--sequences" type="data" format="fasta" label="FASTA file with input sequences"/>
        <!-- Controls both the insertions option in the command and the filter on the output_csv dataset. -->
        <param argument="--output-insertions" type="boolean" checked="true" label="Output insertion sequences?" help="Outputs stripped insertions relative to reference as CSV"/>
        <!-- Gene translation is optional; the gene list and gene map are only requested when it is enabled. -->
        <conditional name="translation">
            <param name="translation_select" type="select" label="Translate annotated genes based on GFF and gene list?">
                <option value="yes">Translate annotated genes</option>
                <option value="no">Don't translate genes</option>
            </param>
            <when value="yes">
                <param argument="--genes" type="text" label="Comma separated list of genes to translate.">
                    <!-- Only letters, digits, "-", "." and "," survive; any other character is dropped (invalid_char=""). -->
                    <sanitizer invalid_char="">
                        <valid initial="string.ascii_letters,string.digits">
                            <add value="-" />
                            <add value="." />
                            <add value="," />
                        </valid>
                    </sanitizer>
                </param>
                <param argument="--genemap" type="data" format="gtf" label="GTF file containing custom gene map"/> <!-- TODO - make sure they need GFF and not GTF or GFF3 -->
            </when>
            <when value="no"/>
        </conditional>
        <param argument="--min-length" type="integer" value="100" min="0"
            label="Minimum length of nucleotide sequence to consider for alignment"
            help="If a sequence is shorter than that, alignment will not be attempted and a warning will be emitted. When adjusting this parameter, note that alignment of short sequences can be unreliable." />
        <!-- Lower bound of 0 added for consistency with the other numeric parameters: a negative penalty is not meaningful. -->
        <param argument="--penalty-gap-extend" type="integer" value="0" min="0"
            label="Penalty for extending a gap."
            help="If zero, all gaps regardless of length incur the same penalty." />
        <!-- Lower bound of 0 added for consistency with the other numeric parameters: a negative penalty is not meaningful. -->
        <param argument="--penalty-gap-open" type="integer" value="6" min="0"
            label="Penalty for opening of a gap."
            help="A higher penalty results in fewer gaps and more mismatches. Should be less than the penalty value of opening a gap in frame to avoid gaps in genes." />
        <param argument="--penalty-gap-open-in-frame" type="integer" value="7" min="1"
            label="Penalty for opening gaps at the beginning of a codon."
            help="Should be greater than the penalty of opening a gap and less than the penalty of opening a gap out of frame, to avoid gaps in genes, but favor gaps that align with codons." />
        <param argument="--penalty-gap-open-out-of-frame" type="integer" value="8" min="1"
            label="Penalty for opening gaps in the body of a codon."
            help="Should be greater than the penalty for opening gaps in-frame to favor gaps that align with codons." />
        <param argument="--penalty-mismatch" type="integer" value="1" min="1"
            label="Penalty for aligned nucleotides or aminoacids that differ in state during alignment"
            help="Note that this is redundantly parameterized with score match." />
        <param argument="--score-match" type="integer" value="3" min="1"
            label="Score for encouraging aligned nucleotides or aminoacids with matching state."
            help="Note that this is redundantly parameterized with mismatch penalty." />
        <param argument="--max-indel" type="integer" value="400" min="0"
            label="Maximum length of insertions or deletions allowed to proceed with alignment."
            help="Alignments with long indels are slow to compute and require substantial memory in the current implementation. Alignment of sequences with indels that are longer than this value will not be attempted and a warning will be emitted." />
        <!-- Seed-matching parameters for the nucleotide alignment. -->
        <param argument="--nuc-seed-length" type="integer" value="21" min="1"
            label="Seed length for nucleotide alignment."
            help="Seeds should be long enough to be unique, but short enough to match with high probability." />
        <param argument="--nuc-min-seeds" type="integer" value="10" min="1"
            label="Minimum number of seeds to search for during nucleotide alignment."
            help="Relevant for short sequences. In long sequences, the number of seeds is determined by nucleotide seed spacing. This should be a positive integer." />
        <param argument="--nuc-seed-spacing" type="integer" value="100" min="0"
            label="Spacing between seeds during nucleotide alignment." />
        <param argument="--nuc-mismatches-allowed" type="integer" value="3" min="0"
            label="Maximum number of mismatching nucleotides."
            help="Maximum number of mismatching nucleotides allowed for a seed to be considered a match." />
    </inputs>
    <!--
        Datasets produced by a run:
        * output_fasta is always created;
        * output_csv is created only when the output_insertions parameter is true;
        * translations is a list collection, created only when translation is enabled.
    -->
    <outputs>
        <data name="output_fasta" format="fasta" label="${tool.name} on ${on_string} (FASTA)"/>
        <data name="output_csv" format="csv" label="${tool.name} on ${on_string} (CSV)">
            <filter>output_insertions</filter>
        </data>
        <collection name="translations" type="list" label="${tool.name} on ${on_string} - Translations (FASTA)">
            <!--
                One element per file named sequences.gene.NAME.fasta; NAME becomes
                the element identifier. The character class accepts "." (which the
                genes sanitizer explicitly permits) and "_" in addition to letters,
                digits and "-", so translations of genes whose names contain those
                characters are collected as well.
            -->
            <discover_datasets pattern="sequences\.gene\.(?P&lt;designation&gt;[a-zA-Z0-9._\-]+)\.fasta" format="fasta"/>
            <filter>translation['translation_select'] == 'yes'</filter>
        </collection>
    </outputs>
    <tests>
        <!--
            Test 1: default numeric parameters, reference taken from the history,
            insertions CSV and gene translation both enabled. Three outputs are
            expected: the aligned FASTA, the insertions CSV and a collection with
            one translation per requested gene (14 genes). Outputs are compared
            after sorting their lines (sort="true").
        -->
        <test expect_num_outputs="3">
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" value="reference.fasta"/>
            </conditional>
            <param name="output_insertions" value="true"/>
            <param name="sequences" value="subsampled.fasta"/>
            <conditional name="translation">
                <param name="translation_select" value="yes"/>
                <param name="genes" value="E,M,N,ORF10,ORF14,ORF1a,ORF1b,ORF3a,ORF6,ORF7a,ORF7b,ORF8,ORF9b,S"/>
                <param name="genemap" value="genemap.gtf" ftype="gtf"/>
            </conditional>
            <output name="output_fasta" file="output.fasta" sort="true"/>
            <output name="output_csv" file="insertions.csv" sort="true"/>
            <output_collection name="translations" type="list" count="14">
                <element name="E" file="subsampled.gene.E.fasta" ftype="fasta" sort="true"/>
                <element name="M" file="subsampled.gene.M.fasta" ftype="fasta" sort="true"/>
                <element name="N" file="subsampled.gene.N.fasta" ftype="fasta" sort="true"/>
                <element name="ORF10" file="subsampled.gene.ORF10.fasta" ftype="fasta" sort="true"/>
                <element name="ORF14" file="subsampled.gene.ORF14.fasta" ftype="fasta" sort="true"/>
                <element name="ORF1a" file="subsampled.gene.ORF1a.fasta" ftype="fasta" sort="true"/>
                <element name="ORF1b" file="subsampled.gene.ORF1b.fasta" ftype="fasta" sort="true"/>
                <element name="ORF3a" file="subsampled.gene.ORF3a.fasta" ftype="fasta" sort="true"/>
                <element name="ORF6" file="subsampled.gene.ORF6.fasta" ftype="fasta" sort="true"/>
                <element name="ORF7a" file="subsampled.gene.ORF7a.fasta" ftype="fasta" sort="true"/>
                <element name="ORF7b" file="subsampled.gene.ORF7b.fasta" ftype="fasta" sort="true"/>
                <element name="ORF8" file="subsampled.gene.ORF8.fasta" ftype="fasta" sort="true"/>
                <element name="ORF9b" file="subsampled.gene.ORF9b.fasta" ftype="fasta" sort="true"/>
                <element name="S" file="subsampled.gene.S.fasta" ftype="fasta" sort="true"/>
            </output_collection>
        </test>
        <!--
            Test 2: default numeric parameters, reference taken from the history,
            insertions CSV and translation both disabled, so the aligned FASTA is
            the only expected output.
        -->
        <test expect_num_outputs="1">
            <conditional name="reference_source">
                <param name="reference_source_selector" value="history"/>
                <param name="ref_file" value="reference.fasta"/>
            </conditional>
            <param name="output_insertions" value="false"/>
            <param name="sequences" value="subsampled.fasta"/>
            <conditional name="translation">
                <param name="translation_select" value="no"/>
            </conditional>
            <output name="output_fasta" file="output.fasta" sort="true"/>
        </test>
        <!--
            Test 3: same as test 2, but with the reference source selector set to
            "cached". The expected FASTA is the same file as in test 2.
            NOTE(review): the parameter names inside the reference_source
            conditional come from the "reference" macro, which is not visible in
            this file; confirm that ref_file with value reference_fasta is what
            the cached branch expects.
        -->
        <test expect_num_outputs="1">
            <conditional name="reference_source">
                <param name="reference_source_selector" value="cached"/>
                <param name="ref_file" value="reference_fasta"/>
            </conditional>
            <param name="output_insertions" value="false"/>
            <param name="sequences" value="subsampled.fasta"/>
            <conditional name="translation">
                <param name="translation_select" value="no"/>
            </conditional>
            <output name="output_fasta" file="output.fasta" sort="true"/>
        </test>
    </tests>
    <help><![CDATA[

**What it does**

Nextalign is a viral genome sequence alignment algorithm used in Nextclade.

It will perform a pairwise alignment of provided sequences against a given reference sequence using a banded local alignment algorithm with
an affine gap cost. Band width and rough relative positions are determined through seed matching.

Nextalign will strip insertions relative to the reference and output them in a separate CSV file.

Optionally, when provided with a gene map and a list of genes, Nextalign can perform translation of these genes.

Currently Nextalign primarily focuses on the SARS-CoV-2 genome, but it can be used on any virus, given a sufficiently similar
reference sequence (less than 5% divergence).

    ]]></help>
    <expand macro="citations" />
</tool>
author	iuc
date	Fri, 04 Feb 2022 21:25:49 +0000
parents	1c785a6b4d90
children	4d93e708218c