view pangolin.xml @ 21:81804a978fc0 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit 35c4813d132f69036addcc447cc6c4ec7da04d92"
author iuc
date Sat, 07 May 2022 19:52:10 +0000
parents 14ae456b8cc5
children a2099fb98cdb
line wrap: on
line source

<tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy2" profile="20.01">
    <description>Phylogenetic Assignment of Outbreak Lineages</description>
    <macros>
        <!-- Wrapped pangolin release; reused by the tool version attribute and the pangolin requirement -->
        <token name="@TOOL_VERSION@">4.0.5</token>
    </macros>
    <requirements>
        <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement>
        <requirement type="package" version="0.3.17">scorpio</requirement>
        <!-- csvtk turns pangolin's CSV report into the tabular output (csv2tab call in the command section) -->
        <requirement type="package" version="0.23.0">csvtk</requirement>
    </requirements>
    <!-- Command whose output is recorded as the version of the wrapped program -->
    <version_command><![CDATA[pangolin --version]]></version_command>
    <command detect_errors="exit_code"><![CDATA[
      ## Overview of the generated shell command:
      ##   1. fail fast on a parameter combination that cannot work,
      ##   2. stage pangolin-data into ./datadir (fresh download or symlink to cached data),
      ##   3. optionally install the latest UShER assignment cache into ./datadir,
      ##   4. run pangolin and convert its CSV report into the tabular output.
      #if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache and str($db.source) != "download":
        ## This is no good. Better to fail immediately instead of downloading a lot of data first.
        echo "Using the latest assignment cache requires downloading the latest version of pangolin-data." 1>&2; exit 1
      #else:
        ## Sanity check was OK, let's do the real thing ...
        #if str($db.source) == "download"
            ## Pangolin version 4 tries to update from an existing directory
            mkdir datadir &&
            pangolin --update-data --datadir datadir &&
        #else if str($db.source) == "builtin"
            ## Quote the data table path so that spaces or shell metacharacters
            ## in the admin-configured location cannot break the command line.
            ln -s '$db.db_release.fields.path' datadir &&
        #end if
        #if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache:
            ## We need to install also the latest UShER assignment cache data.
            ## Pangolin has functionality to do so, but uses it incorrectly.
            ## We use the pangolin function to install into --datadir here,
            ## then point pangolin to the downloaded file later using
            ## its --assignment-cache parameter

            ## Create a "honeypot" package that will be picked up by pangolin,
            ## but will trigger a download because of missing __version__ info.
            mkdir pangolin_assignment &&
            touch pangolin_assignment/__init__.py &&
            ## Call pangolin's assignment cache install function, but
            ## override pip's install path
            PIP_TARGET="datadir" PIP_UPGRADE=1 python -c "from pangolin.utils import update; update.install_pangolin_assignment()" &&
        #end if
        pangolin
        --threads \${GALAXY_SLOTS:-1}
        --tempdir "\${TMPDIR:-.}"
        ## The "default" data source passes no --datadir at all, so pangolin
        ## falls back to the data shipped with the installed package.
        #if str($db.source) == "download" or str($db.source) == "builtin"
            --datadir datadir
        #end if
        --analysis-mode $engine.analysis_mode
        #if str($engine.analysis_mode) == 'usher':
            $engine.use_assignment_cache
            #if $engine.use_assignment_cache:
                ## Point pangolin to the assignment cache file we've downloaded before
                --assignment-cache datadir/pangolin_assignment/usher_assignments.cache.csv.gz
            #end if
        #end if
        #if $alignment:
            $alignment --alignment-file '$align1'
        #end if
        --outfile report.csv
        --max-ambig $max_ambig
        --min-length $min_length
        $expanded_lineage
        '$input1'
        && csvtk csv2tab report.csv
        ## Drop the header line unless the user asked to keep it
        #if not $include_header:
            | tail -n+2
        #end if
        > '$output1'
      #end if
    ]]></command>
    <inputs>
        <!-- Query sequences to be assigned a lineage -->
        <param name="input1"
               type="data"
               format="fasta"
               label="Input FASTA File(s)" />
        <!-- Inference engine selection; only the UShER branch offers the assignment cache switch -->
        <conditional name="engine">
            <param argument="--analysis-mode"
                   type="select"
                   label="Analysis mode"
                   help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster">
                <option value="usher">UShER</option>
                <option value="pangolearn">pangoLEARN</option>
            </param>
            <when value="usher">
                <!-- The command section refuses this option unless pangolin-data is downloaded as well -->
                <param argument="--use-assignment-cache"
                       type="boolean"
                       truevalue="--use-assignment-cache"
                       falsevalue=""
                       label="Use latest UShER assignment cache"
                       help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples. Also note that using the latest assignment cache will require you to select the 'Download latest from web' option for the pangolin-data source below because assignment cache and pangolin-data need to be synchronized." />
            </when>
            <when value="pangolearn" />
        </conditional>
        <!-- Where pangolin-data comes from: "download" and "builtin" are staged into ./datadir by the command section, "default" uses the data shipped with the package -->
        <conditional name="db">
            <param type="select" name="source" label="pangolin-data source" help="Where to find the pangolin-data to use for the tool run. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below.">
                <option value="download">Download latest from web</option>
                <option value="builtin">Use cached data from Galaxy server</option>
                <option value="default">Use default data shipped with this build of pangolin (not recommended)</option>
            </param>
            <when value="download">
                <!-- these are currently not supported by the pangolin downloader -->
                <!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> -->
                <!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> -->
            </when>
            <when value="builtin">
                <!-- Label uses the "pangolin-data" wording of the source selector and of the validator message below; the data table itself is still named "pangolearn" -->
                <param name="db_release" label="pangolin-data release" type="select">
                    <options from_data_table="pangolearn">
                        <column name="value" index="0" />
                        <column name="name" index="1" />
                        <column name="path" index="3" />
                        <filter type="sort_by" column="0" />
                        <!-- Only offer data table entries whose third column equals "4.0" -->
                        <filter type="static_value" column="2" value="4.0" />
                        <validator type="no_options" message="No cached pangolin-data release available" />
                    </options>
                </param>
            </when>
            <when value="default" />
        </conditional>
        <!-- Enables the second (FASTA alignment) output of the tool -->
        <param argument="--alignment"
               type="boolean"
               truevalue="--alignment"
               falsevalue=""
               label="Output multiple sequence alignment of input sequences" />
        <!-- QC thresholds handed straight to pangolin -->
        <param argument="--max-ambig"
               type="float"
               value="0.3"
               min="0"
               max="1"
               label="Maximum proportion of Ns allowed"
               help="Maximum proportion of Ns allowed for pangolin to attempt assignment" />
        <param argument="--min-length"
               type="integer"
               value="0"
               min="0"
               max="29903"
               label="Minimum query length allowed"
               help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." />
        <!-- Adds one more column to the report; the output column names are switched accordingly in the outputs section -->
        <param argument="--expanded-lineage"
               type="boolean"
               truevalue="--expanded-lineage"
               falsevalue=""
               label="Add expanded lineage column to output"
               help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." />
        <!-- Wrapper-only switch (not a pangolin argument): when false the command section strips the first line of the report -->
        <param name="include_header"
               type="boolean"
               truevalue="true"
               falsevalue="false"
               label="Include header line in output file" />
    </inputs>
    <outputs>
        <!-- Main report; its column_names metadata depends on whether the expanded lineage column was requested -->
        <data name="output1" format="tabular" label="pangolin on ${on_string}">
            <actions>
                <conditional name="expanded_lineage">
                    <when value="">
                        <!-- default columns -->
                        <action name="column_names"
                                type="metadata"
                                default="taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note" />
                    </when>
                    <when value="--expanded-lineage">
                        <!-- default columns plus the trailing expanded_lineage column -->
                        <action name="column_names"
                                type="metadata"
                                default="taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note,expanded_lineage" />
                    </when>
                </conditional>
            </actions>
        </data>
        <!-- Alignment output; only created when the alignment option is switched on -->
        <data name="align1" format="fasta" label="pangolin alignment on ${on_string}">
            <filter>alignment</filter>
        </data>
    </outputs>
    <tests>
        <!-- Basic run with the pangolin-data shipped in the package; include_header is left unset, so a single data line is expected -->
        <test expect_num_outputs="1">
            <param name="input1" value="test1.fasta"/>
            <!-- Test only the default UShER mode for now since the
            pangolearn random forest model uses too much memory
            see https://github.com/cov-lineages/pangolin/issues/395
            <conditional name="engine">
                <param name="analysis_mode" value="pangolearn" />
            </conditional>
            -->
            <conditional name="db">
                <param name="source" value="default" />
            </conditional>
            <output name="output1" ftype="tabular">
                <assert_contents>
                    <has_text_matching expression="B\.1\.1\t\d\.\d" />
                    <has_text text="pass" />
                    <has_n_lines n="1" />
                </assert_contents>
            </output>
        </test>
        <!-- Same input, but with pangolin-data fetched via the "download" source (presumably needs network access during the test run) -->
        <test expect_num_outputs="1">
            <param name="input1" value="test1.fasta"/>
            <conditional name="db">
                <param name="source" value="download" />
            </conditional>
            <output name="output1" ftype="tabular">
                <assert_contents>
                    <!-- The version column is expected to carry the PUSHER prefix described in the help text -->
                    <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" />
                    <has_text text="pass" />
                    <has_n_lines n="1" />
                </assert_contents>
            </output>
        </test>
        <!-- Test the optional alignment output: two outputs are expected once the alignment switch is set -->
        <test expect_num_outputs="2">
            <param name="input1" value="test1.fasta" />
            <!-- Test only UShER, the default analysis mode, for now since the
            pangolearn random forest model uses too much memory
            see https://github.com/cov-lineages/pangolin/issues/395
            <conditional name="engine">
                <param name="analysis_mode" value="pangolearn" />
            </conditional>
            -->
            <conditional name="db">
                <param name="source" value="download" />
            </conditional>
            <param name="alignment" value="--alignment" />
            <output name="output1" ftype="tabular">
                <assert_contents>
                    <has_text_matching expression="B\.1\.1\t\d\.\d" />
                    <has_text text="pass" />
                    <has_n_lines n="1" />
                </assert_contents>
            </output>
            <output name="align1" file="aln1.fasta" ftype="fasta">
                <assert_contents>
                    <has_text text="Consensus_EB232-crude-prep_S297" />
                    <has_n_lines n="2" />
                </assert_contents>
            </output>
        </test>
        <!-- test include-header option: the header line is kept, so column names must show up in the output -->
        <test expect_num_outputs="1">
            <param name="input1" value="multiple_alignment.fasta.gz"/>
            <!-- Test only the default UShER mode for now since the
            pangolearn random forest model uses too much memory
            see https://github.com/cov-lineages/pangolin/issues/395
            <conditional name="engine">
                <param name="analysis_mode" value="pangolearn" />
            </conditional>
            -->
            <conditional name="db">
                <param name="source" value="default" />
            </conditional>
            <param name="include_header" value="true" />
            <output name="output1" ftype="tabular">
                <assert_contents>
                    <has_text text="pangolin_version" />
                    <has_text text="lineage" />
                    <has_text text="Serbia" />
                    <has_text text="Poland" />
                    <has_text text="USA" />
                    <has_n_lines n="35" />
                    <!-- 16 matches the number of default column names declared for output1 -->
                    <has_n_columns n="16" />
                </assert_contents>
            </output>
        </test>
        <!-- Test that use of latest assignment cache requires downloaded other data.
             The analysis mode is set explicitly and stderr is checked, so the test only
             passes when the job fails through the early guard in the command section
             and not because of some unrelated error. -->
        <test expect_failure="true">
            <param name="input1" value="multiple_alignment.fasta.gz"/>
            <conditional name="engine">
                <param name="analysis_mode" value="usher" />
                <param name="use_assignment_cache" value="true" />
            </conditional>
            <conditional name="db">
                <param name="source" value="default" />
            </conditional>
            <assert_stderr>
                <has_text text="Using the latest assignment cache requires downloading the latest version of pangolin-data." />
            </assert_stderr>
        </test>
        <!-- test with extra expanded_lineage column: header kept, one more column than in the default report -->
        <test expect_num_outputs="1">
            <param name="input1" value="multiple_alignment.fasta.gz"/>
            <!-- Test only the default UShER mode for now since the
            pangolearn random forest model uses too much memory
            see https://github.com/cov-lineages/pangolin/issues/395
            <conditional name="engine">
                <param name="analysis_mode" value="pangolearn" />
            </conditional>
            -->
            <conditional name="db">
                <param name="source" value="default" />
            </conditional>
            <param name="expanded_lineage" value="true" />
            <param name="include_header" value="true" />
            <output name="output1" ftype="tabular">
                <assert_contents>
                    <has_text text="pangolin_version" />
                    <has_text text="lineage" />
                    <has_text text="expanded_lineage" />
                    <has_text text="Serbia" />
                    <has_text text="Poland" />
                    <has_text text="USA" />
                    <has_n_lines n="35" />
                    <!-- 17 matches the column names declared for output1 when the expanded lineage column is on -->
                    <has_n_columns n="17" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[

**What it does**

`Pangolin <https://cov-lineages.org/pangolin.html>`_
(Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a
SARS-CoV-2 genome sequence the most likely lineage based on the PANGO
nomenclature system.


**Data sources/versioning**

Pangolin uses the
`pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as
a source of its required model, protobuf, designation hash and alias files, and
the `constellations <https://github.com/cov-lineages/constellations>`_
repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based
assignment of lineages of concern.
The tool ships with a copy of this data, but the data gets updated more
frequently than the tool! In general one should use the most recent model for
lineage assignment, and the default option for this tool is to download the
latest versions of pangolin-data and constellations before embarking on
analysis.
A pangoLEARN data manager exists so that the Galaxy admin can download specific
versions of the pangolin-data/constellations as required. Finally the pangolin
tool can use its default built-in data packages, but this is
**not recommended** as it will almost certainly be out of date.

.. class:: infomark

   The exact combination of pangolin, inference engine (UShER/pangoLEARN),
   scorpio, and data packages used for a particular run of the tool can be
   extracted from the four "version" columns in the output (see below for
   details).

.. class:: warningmark

   The "Download latest from web" option updates the *pangolin-data* and
   *constellations* packages but not the software (pangolin and scorpio) using
   these data packages.
   If the data package format changes upstream, this can cause the tool run to
   fail. Cached data packages (or, in the worst case, the built-in data) can
   serve as a fallback until switching to an updated pangolin tool
   version.


**Output**

The main output of the tool is a tabular file with one line per input sequence
and with columns providing the
`following information <https://cov-lineages.org/resources/pangolin/output.html>`_:

taxon:
    The name of the input sequence

lineage:
    The most likely lineage assigned to a given sequence based on the inference
    engine used and the SARS-CoV-2 diversity designated.
    This assignment is sensitive to missing data at key sites.

conflict:
    In the pangoLEARN model, a given sequence gets assigned to the most likely
    category based on known diversity.
    If a sequence can fit into more than one category, the conflict score will
    be greater than 0 and reflect the number of categories the sequence could
    fit into.
    If the conflict score is 0, this means that within the current decision
    tree there is only one category that the sequence could be assigned to.

ambiguity_score:
    This score is a function of the quantity of missing data in a sequence.
    It represents the proportion of relevant sites in a sequence which were
    imputed to the reference values.
    A score of 1 indicates that no sites were imputed, while a score of 0
    indicates that more sites were imputed than were not imputed.
    This score only includes sites which are used by the decision tree to
    classify a sequence.

scorpio_call:
    If a query is assigned a constellation by scorpio this call is output in
    this column.
    The full set of constellations searched by default can be found at the
    constellations repository.

scorpio_support:
    The support score is the proportion of defining variants which have the
    alternative allele in the sequence.

scorpio_conflict:
    The conflict score is the proportion of defining variants which have the
    reference allele in the sequence. Ambiguous/other non-ref/alt bases at each
    of the variant positions contribute only to the denominators of these
    scores.

scorpio_notes:
    A notes column specific to the scorpio output.

version:
    A version number that represents both the inference method and the
    pangolin-data version number, which as of pangolin 4.0 corresponds to the
    pango-designation version used to prepare the inference files. For example:

    PANGO-1.2 indicates an identical sequence has been previously designated
    this lineage, and has thus gone through manual curation.
    The number 1.2 indicates the version of pango-designation that this
    assignment is based on. These hashes and pango-designation version are
    bundled with the pangoLEARN and UShER models.

    PLEARN-1.2 indicates that this sequence is different from any previously
    designated and that the pangoLEARN model was used as an inference engine to
    predict the most likely lineage based on the given version of
    pango-designation upon which the pangoLEARN model was trained.

    PUSHER-1.2 indicates that this sequence is different from any previously
    designated and that UShER was used as an inference engine with fast tree
    placement and parsimony-based lineage assignment, based on a guide tree
    (protobuf) file built from the data in a given pango-designation release
    version.

pangolin_version:
    The version of pangolin software running.

scorpio_version:
    The version of the scorpio software installed.

constellation_version:
    The version of constellations that scorpio has used to curate the lineage
    assignment.

is_designated:
    A boolean (True/False) column indicating whether that particular sequence
    has been officially designated a lineage.

qc_status:
    Indicates whether the sequence passed the QC thresholds for minimum length
    and maximum N content.

qc_notes:
    Notes specific to the QC checks run on the sequences.

note:
    If there are any conflicts from the decision tree, this field will output the
    alternative assignments. If the sequence failed QC this field will describe
    why.
    If the sequence met the SNP thresholds for scorpio to call a constellation,
    it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative,
    reference and ambiguous) alleles for that call.
    ]]></help>
    <citations>
      <!-- DOI of the publication to cite for this tool -->
      <citation type="doi">10.1093/ve/veab064</citation>
    </citations>
</tool>