pangolin: pangolin.xml comparison

comparison pangolin.xml @ 22:a2099fb98cdb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit d160f73f58eb515a2da4ba76096ed3d8b6c88bdc

author	iuc
date	Fri, 08 Jul 2022 08:33:57 +0000
parents	81804a978fc0
children	77402759b866

comparison

equal deleted inserted replaced

-:81804a978fc0
+:a2099fb98cdb
-<tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy2" profile="20.01">
+<tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy0" profile="20.01">
 <description>Phylogenetic Assignment of Outbreak Lineages</description>
 <macros>
-<token name="@TOOL_VERSION@">4.0.5</token>
+<token name="@TOOL_VERSION@">4.1.1</token>
+<token name="@PANGOLIN_DATA_VERSION@">1.11</token>
+<token name="@CONSTELLATIONS_VERSION@">0.1.10</token>
+<token name="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@">4</token>
+<!-- a regex describing the scorpio versions that this wrapper version
+is backwards-compatible with; can be used with the min_scorpio_version
+column of the constellations data table to offer only compatible
+versions of constellations data. -->
+<token name="@COMPATIBLE_SCORPIO_DATA_FORMAT@"><![CDATA[(^0\.[1-3]$|^0\.[0-2]\.\d+$|^0\.3\.\d$|^0\.3\.1[0-7]$|^0$)]]></token>
+<!-- Macro providing the "download" branch of a data-source conditional:
+it contributes a single boolean switch that emits the
+use-assignment-cache flag when checked and nothing otherwise. -->
+<xml name="usher_download_option">
+<when value="download">
+<param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Download and use also latest UShER assignment cache?"
+help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples." />
+</when>
+</xml>
+<!-- Macro providing an optional select over the pangolin_assignment data
+table. Rows are restricted to those whose column 2 equals the
+MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT token and whose column 0 matches the
+value of the sibling "release" parameter, so it must be expanded next to
+a parameter of that name. -->
+<xml name="cached_usher_assignment_cache">
+<param name="assignment_cache_release" type="select" optional="true" label="Use corresponding UShER assignment cache?"
+help="If the server offers a copy of the UShER assignment cache along with the specified version of pangolin-data, you can select it here to speed up UShER lineage assignment. If no suitable assignment cache is available, it is perfectly fine to proceed without one, and the performance difference will only become obvious with very large numbers of samples.">
+<options from_data_table="pangolin_assignment">
+<column name="value" index="0" />
+<column name="description" index="1" />
+<column name="path" index="4" />
+<filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" />
+<filter type="param_value" ref="release" column="0" />
+</options>
+</param>
+</xml>
+<!-- Macro providing the "cached" branch of the pangolin-data source
+conditional: a select over the pangolin_data data table, sorted by
+column 3 and restricted to rows whose column 2 equals the
+MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT token. The yield lets the caller
+append further parameters to this branch. -->
+<xml name="cached_pangolin_data">
+<when value="cached">
+<param name="release" label="Cached release of pangolin-data" type="select">
+<options from_data_table="pangolin_data">
+<column name="value" index="0" />
+<column name="description" index="1" />
+<column name="date" index="3" />
+<column name="path" index="4" />
+<filter type="sort_by" column="3" />
+<filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" />
+<validator type="no_options" message="No cached pangolin-data release available" />
+</options>
+</param>
+<yield />
+</when>
+</xml>
+<!-- Macro providing the "pangolin_data" conditional with a three-way
+source select (default / cached / download). Only the empty "default"
+branch is defined here; the yield is where the expanding caller has to
+supply the "cached" and "download" when-branches. -->
+<xml name="pangolin_data_sources">
+<conditional name="pangolin_data">
+<param name="source" type="select" label="Version of pangolin-data to use">
+<option value="default">Use pangolin-data version (v@PANGOLIN_DATA_VERSION@) shipped with this version of the tool</option>
+<option value="cached">Use specific pangolin-data version cached on this Galaxy server</option>
+<option value="download">Download latest available pangolin-data version from web</option>
+</param>
+<when value="default" />
+<yield />
+</conditional>
+</xml>
 </macros>
 <requirements>
 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement>
+<!-- Pin also the versions of all core dependencies - the ones
+reported with the all-versions option of pangolin plus ucsc-fatovcf,
+which the command is intended to report but currently cannot for
+technical reasons - to the versions you'd get installed in an unpinned
+conda install of pangolin at the time of release of this wrapper
+version! By turning these dependencies into explicit requirements the
+requirements section of the tool interface becomes the equivalent of
+the all-versions option as long as the user doesn't update the data
+dependencies.
+Wrapper updates are **explicitly encouraged** when new dependency
+versions become available. Also, please check for updated dependencies
+when updating the wrapper for other reasons. -->
 <requirement type="package" version="0.3.17">scorpio</requirement>
+<requirement type="package" version="@PANGOLIN_DATA_VERSION@">pangolin-data</requirement>
+<requirement type="package" version="@CONSTELLATIONS_VERSION@">constellations</requirement>
+<requirement type="package" version="0.5.6">usher</requirement>
+<requirement type="package" version="1.1.0">gofasta</requirement>
+<requirement type="package" version="426">ucsc-fatovcf</requirement>
+<requirement type="package" version="2.24">minimap2</requirement>
+<!-- wrapper-specific requirements to turn pangolin's native
+comma-separated output into tab-separated one and to truncate
+pangolin's all-versions output. -->
 <requirement type="package" version="0.23.0">csvtk</requirement>
+<requirement type="package" version="3.4">grep</requirement>
 </requirements>
 <version_command><![CDATA[pangolin --version]]></version_command>
 <command detect_errors="exit_code"><![CDATA[
-#if $str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache and str($db.source) != "download":
+## Prepare a pangolin datadir if required:
-## This is no good. Better to fail immediately instead of downloading a lot of data first.
+#if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default':
-echo "Using the latest assignment cache requires downloading the latest version of pangolin-data." 1>&2; exit 1
+## for at least one of pangolin-data and constellations we need to
-#else:
+## provide a non-conda env version through a datadir
-## Sanity chceck was ok, lets do the real thing ...
+mkdir datadir &&
-#if str($db.source) == "download"
+#if str($engine.pangolin_data.source) == 'download' or str($constellations.source) == 'download':
-## Pangolin version 4 tries to update from an existing directory
+## If "download latest from web" got requested for any data component,
-mkdir datadir &&
+## we can make use of pangolin --update-data to do the job for us.
-pangolin --update-data --datadir datadir &&
+## However, this would download updated versions of *all* data
-#else if str($db.source) == "builtin"
+## packages into our datadir, while the user may have asked for
-ln -s $db.db_release.fields.path datadir &&
+## just a specific one. To avoid this, we set up a fake package
-#end if
+## with very high version number in the datadir to prevent
-#if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache:
+## unwanted component updates. After updating the rest of the
-## We need to install also the latest UShER assignment cache data.
+## data, we remove the fake package again.
-## Pangolin has functionality to do so, but uses it incorrectly.
+#if str($engine.pangolin_data.source) != 'download':
-## We use the pangolin function to install into --datadir here,
+mkdir datadir/pangolin_data &&
-## then point pangolin to the downloaded file later using
+echo '__version__ = "999"' > datadir/pangolin_data/__init__.py &&
-## its --assignment-cache parameter
+#end if
+#if str($constellations.source) != 'download':
-## Create a "honeypot" package that will be picked up by pangolin,
+mkdir datadir/constellations &&
-## but will trigger a download because of missing __version__ info.
+## constellations versions start with a 'v'!
-mkdir pangolin_assignment &&
+echo '__version__ = "v999"' > datadir/constellations/__init__.py &&
-touch pangolin_assignment/__init__.py &&
+#end if
-## Call pangolin's assignment cache install function, but
+## download updated packages discarding stdout because we
-## override pip's install path
+## output final package versions separately below and because
-PIP_TARGET="datadir" PIP_UPGRADE=1 python -c "from pangolin.utils import update; update.install_pangolin_assignment()" &&
+## it would contain our fake package versions
-#end if
+## Redirect only stdout: the former "2&>" form made bash pass a stray
+## literal "2" argument to pangolin and silenced stderr as well, hiding
+## any download error messages.
+pangolin --update-data --datadir datadir > /dev/null &&
-pangolin
+#if str($engine.pangolin_data.source) != 'download':
---threads \${GALAXY_SLOTS:-1}
+rm -r datadir/pangolin_data &&
---tempdir "\${TMPDIR:-.}"
+#end if
-#if str($db.source) == "download" or str($db.source) == "builtin"
+#if str($constellations.source) != 'download':
---datadir datadir
+rm -r datadir/constellations &&
 #end if
---analysis-mode $engine.analysis_mode
+#end if
-#if str($engine.analysis_mode) == 'usher':
+#if str($engine.analysis_mode) == 'usher' and str($engine.pangolin_data.source) == 'download':
-$engine.use_assignment_cache
+#if $engine.pangolin_data.use_assignment_cache:
-#if $engine.use_assignment_cache:
+## We need to download also the latest UShER assignment cache data.
-## Point pangolin to the assignment cache file we've downloaded before
+## Since v4.1 pangolin's
---assignment-cache datadir/pangolin_assignment/usher_assignments.cache.csv.gz
+## --add-assignment-cache/--use-assignment-cache options respect
-#end if
+## --datadir so we can use them directly.
-#end if
+pangolin --datadir datadir --add-assignment-cache &&
-#if $alignment:
+#end if
-$alignment --alignment-file '$align1'
+#end if
-#end if
+## Handle data components to be taken from data tables
---outfile report.csv
+## The folder structure pointed to by the data tables can be used
---max-ambig $max_ambig
+## as is except that cannot symlink the folders themselves since
---min-length $min_length
+## pangolin inspects them using os.walk with the default
-$expanded_lineage
+## `followlinks=False`.
-'$input1'
+## Since data table versions of data packages can be older than
-&& csvtk csv2tab report.csv
+## the versions installed in the wrapper environment, we need to
-#if not $include_header:
+## use pangolin's --use-old-datadir option to actually have them
-| tail -n+2
+## used.
-#end if
+#set $use_old_datadir = ''
-> '$output1'
+#if str($engine.pangolin_data.source) == 'cached':
-#end if
+#set $use_old_datadir = '--use-old-datadir'
+cp -rs '${engine.pangolin_data.release.fields.path}' datadir/pangolin_data &&
+#if str($engine.analysis_mode) == 'usher' and $engine.pangolin_data.assignment_cache_release:
+cp -rs '${engine.pangolin_data.assignment_cache_release.fields.path}' datadir/pangolin_assignment &&
+#end if
+#end if
+#if str($constellations.source) == 'cached':
+#set $use_old_datadir = '--use-old-datadir'
+cp -rs '${constellations.release.fields.path}' datadir/constellations &&
+#end if
+## Report all data package versions that will be used in this run of the tool
+echo "Running pangolin with the following possibly updated data packages:" &&
+pangolin --datadir datadir $use_old_datadir --all-versions | grep -E "pangolin-data|assignment|constellations" &&
+#end if
+## Finally run the pangolin analysis
+pangolin
+--threads \${GALAXY_SLOTS:-1}
+--tempdir "\${TMPDIR:-.}"
+#if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default':
+--datadir datadir $use_old_datadir
+#end if
+--analysis-mode $engine.analysis_mode
+#if str($engine.analysis_mode) == 'usher':
+#if str($engine.pangolin_data.source) == 'download':
+$engine.pangolin_data.use_assignment_cache
+#else if str($engine.pangolin_data.source) == 'cached':
+#if $engine.pangolin_data.assignment_cache_release:
+--use-assignment-cache
+#end if
+#end if
+#end if
+#if $alignment:
+$alignment --alignment-file '$align1'
+#end if
+--outfile report.csv
+--max-ambig $max_ambig
+--min-length $min_length
+$expanded_lineage
+'$input1'
+&& csvtk csv2tab report.csv
+#if not $include_header:
+| tail -n+2
+#end if
+> '$output1'
 ]]></command>
 <inputs>
 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" />
 <conditional name="engine">
 <param argument="--analysis-mode" type="select" label="Analysis mode"
 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster">
 <option value="usher">UShER</option>
 <option value="pangolearn">pangoLEARN</option>
 </param>
 <when value="usher">
-<param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Use latest UShER assignment cache"
+<expand macro="pangolin_data_sources">
-help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples. Also note that using the latest assignment cache will require you to select the 'Download latest from web' option for the pangolin-data source below because assignment cache and pangolin-data need to be synchronized." />
+<expand macro="cached_pangolin_data">
+<expand macro="cached_usher_assignment_cache" />
+</expand>
+<expand macro="usher_download_option" />
+</expand>
 </when>
-<when value="pangolearn" />
+<when value="pangolearn">
+<expand macro="pangolin_data_sources">
+<expand macro="cached_pangolin_data" />
+<when value="download" />
+</expand>
+</when>
 </conditional>
-<conditional name="db">
+<conditional name="constellations">
-<param type="select" name="source" label="pangolin-data source" help="Where to find the pangolin-data to use for the tool run. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below.">
+<param name="source" type="select" label="Version of constellations to use">
-<option value="download">Download latest from web</option>
+<option value="default">Use constellations version (v@CONSTELLATIONS_VERSION@) shipped with this version of the tool</option>
-<option value="builtin">Use cached data from Galaxy server</option>
+<option value="cached">Use specific constellations version cached on this Galaxy server</option>
-<option value="default">Use default data shipped with this build of pangolin (not recommended)</option>
+<option value="download">Download latest available constellations version from web</option>
 </param>
-<when value="download">
+<when value="default" />
-<!-- these are currently not supported by the pangolin downloader -->
+<when value="cached">
-<!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> -->
+<param name="release" label="Cached constellations release" type="select">
-<!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> -->
+<options from_data_table="pangolin_constellations">
-</when>
-<when value="builtin">
-<param name="db_release" label="pangoLEARN release" type="select">
-<options from_data_table="pangolearn">
 <column name="value" index="0" />
-<column name="name" index="1" />
+<column name="description" index="1" />
-<column name="path" index="3" />
+<column name="date" index="3" />
-<filter type="sort_by" column="0" />
+<column name="path" index="4" />
-<filter type="static_value" column="2" value="4.0" />
+<filter type="sort_by" column="3" />
-<validator type="no_options" message="No cached pangolin-data release available" />
+<filter type="regexp" column="2" value="@COMPATIBLE_SCORPIO_DATA_FORMAT@" />
+<validator type="no_options" message="No cached constellations release available" />
 </options>
 </param>
 </when>
-<when value="default" />
+<when value="download" />
 </conditional>
 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" />
 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" />
 <param argument="--min-length" type="integer" value="0" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." />
 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." />
 </data>
 </outputs>
 <tests>
 <test expect_num_outputs="1">
 <param name="input1" value="test1.fasta"/>
-<!-- Test only the default UShER mode for now since the
-pangolearn random forest model uses too much memory
-see https://github.com/cov-lineages/pangolin/issues/395
 <conditional name="engine">
-<conditional name="engine">
+<!-- Test only the default UShER mode for now since the
+pangolearn random forest model uses too much memory
+see https://github.com/cov-lineages/pangolin/issues/395
 <param name="analysis_mode" value="pangolearn" />
-</conditional>
+-->
--->
+<conditional name="pangolin_data">
-<conditional name="db">
+<param name="source" value="default" />
-<param name="source" value="default" />
+</conditional>
 </conditional>
 <output name="output1" ftype="tabular">
 <assert_contents>
 <has_text_matching expression="B\.1\.1\t\d\.\d" />
 <has_text text="pass" />
 </assert_contents>
 </output>
 </test>
 <test expect_num_outputs="1">
 <param name="input1" value="test1.fasta"/>
-<conditional name="db">
+<conditional name="engine">
-<param name="source" value="download" />
+<conditional name="pangolin_data">
+<param name="source" value="download" />
+</conditional>
 </conditional>
 <output name="output1" ftype="tabular">
 <assert_contents>
 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" />
 <has_text text="pass" />
 </assert_contents>
 </output>
 </test>
 <test expect_num_outputs="2">
 <param name="input1" value="test1.fasta" />
-<!-- Test only the default UShER mode for now since the
-pangolearn random forest model uses too much memory
-see https://github.com/cov-lineages/pangolin/issues/395
 <conditional name="engine">
+<!-- Test only the default UShER mode for now since the
+pangolearn random forest model uses too much memory
+see https://github.com/cov-lineages/pangolin/issues/395
 <param name="analysis_mode" value="pangolearn" />
-</conditional>
+-->
--->
+<conditional name="pangolin_data">
-<conditional name="db">
+<param name="source" value="download" />
-<param name="source" value="download" />
+</conditional>
 </conditional>
 <param name="alignment" value="--alignment" />
 <output name="output1" ftype="tabular">
 <assert_contents>
 <has_text_matching expression="B\.1\.1\t\d\.\d" />
 </output>
 </test>
 <!-- test include-header option -->
 <test expect_num_outputs="1">
 <param name="input1" value="multiple_alignment.fasta.gz"/>
-<!-- Test only the default UShER mode for now since the
-pangolearn random forest model uses too much memory
-see https://github.com/cov-lineages/pangolin/issues/395
 <conditional name="engine">
+<!-- Test only the default UShER mode for now since the
+pangolearn random forest model uses too much memory
+see https://github.com/cov-lineages/pangolin/issues/395
 <param name="analysis_mode" value="pangolearn" />
-</conditional>
+-->
--->
+<conditional name="pangolin_data">
-<conditional name="db">
+<param name="source" value="default" />
-<param name="source" value="default" />
+</conditional>
 </conditional>
 <param name="include_header" value="true" />
 <output name="output1" ftype="tabular">
 <assert_contents>
 <has_text text="pangolin_version" />
 <has_n_lines n="35" />
 <has_n_columns n="16" />
 </assert_contents>
 </output>
 </test>
-<!-- Test that use of latest assignment cache requires downloaded other data -->
-<test expect_failure="true">
-<param name="input1" value="multiple_alignment.fasta.gz"/>
-<conditional name="engine">
-<param name="use_assignment_cache" value="true" />
-</conditional>
-<conditional name="db">
-<param name="source" value="default" />
-</conditional>
-</test>
 <!-- test with extra expanded_lineage column -->
 <test expect_num_outputs="1">
 <param name="input1" value="multiple_alignment.fasta.gz"/>
-<!-- Test only the default UShER mode for now since the
-pangolearn random forest model uses too much memory
-see https://github.com/cov-lineages/pangolin/issues/395
 <conditional name="engine">
+<!-- Test only the default UShER mode for now since the
+pangolearn random forest model uses too much memory
+see https://github.com/cov-lineages/pangolin/issues/395
 <param name="analysis_mode" value="pangolearn" />
-</conditional>
+-->
--->
+<conditional name="pangolin_data">
-<conditional name="db">
+<param name="source" value="default" />
-<param name="source" value="default" />
+</conditional>
 </conditional>
 <param name="expanded_lineage" value="true" />
 <param name="include_header" value="true" />
 <output name="output1" ftype="tabular">
 <assert_contents>
 `Pangolin <https://cov-lineages.org/pangolin.html>`_
 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a
 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO
 nomenclature system.
+**Data sources/versioning and reproducibility**
-**Data sources/versioning**
 Pangolin uses the
 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as
 a source of its required model, protobuf, designation hash and alias files, and
 the `constellations <https://github.com/cov-lineages/constellations>`_
 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based
 assignment of lineages of concern.
-The tool ships with a copy of this data, but the data gets updated more
-frequently than the tool! In general one should use the most recent model for
+The tool ships with copies of these two data packages, and using these shipped
-lineage assignment, and the default option for this tool is to download the
+versions is *recommended* for reproducibility (even across Galaxy servers) and
-latest versions of pangolin-data and constellations before embarking on
+speed of job execution.
-analysis.
-A pangoLEARN data manager exists so that the Galaxy admin can download specific
+If your instance of Galaxy offers cached alternative versions of
-versions of the pangolin-data/constellations as required. Finally the pangolin
+`pangolin-data` and/or `constellations`, you will be able to use them instead
-tool can use its default built-in data packages, but this is
+of the shipped versions, which can be useful to reproduce results obtained
-**not recommended** as it will almost certainly be out of date.
+earlier with previous versions of pangolin.
+Finally, you have the option to *download the latest version* of each data
+package at job runtime.
+.. class:: warningmark
+You can use this option as a workaround to get the most up-to-date lineage
+assignments even before the next Galaxy tool update (or before an admin
+installs new cached data versions on your server), but be aware of the
+following limitations:
+1. Using latest downloaded data package versions renders results hard to
+reproduce (e.g. rerunning a corresponding job will cause also a fresh
+data download, which may yield different data versions than in the initial
+run).
+2. Downloaded latest versions of the data packages may be incompatible
+with the *pangolin* and *scorpio* version run by the tool, which can
+result in failing tool runs, but occasionally also in harder to diagnose
+lineage assignment issues.
 .. class:: infomark
 The exact combination of pangolin, inference engine (UShER/pangoLEARN),
-scorpio, and data packages used for a particular run of the tool can be
+scorpio, and data packages that was used for a particular run of the tool
-extracted from the four "version" columns in the output (see below for
+can be extracted from the four "version" columns in the output (see below
-details).
+for details).
-.. class:: warningmark
+In addition, lineage assignment with pangolin can be affected by the exact
+versions of additional underlying software. The packaged versions of all
-The "Download latest from web" updates the *pangolin-data* and
+relevant dependencies are listed in the *Requirements* section below. This
-*constellations* packages but not the software (pangolin and scorpio) using
+section is the equivalent of running `pangolin --all-versions` on the
-these data packages.
+command line except that the listed versions of *pangolin-data* and
-If the data package format changes upstream, this can cause the tool run to
+*constellations* are the ones installed with pangolin and may have been
-fail. Cached data packages (or, in the worst case, the built-in data) can
+overridden with the versions reported in the corresponding output columns
-serve as a fallback until switching to an updated pangolin tool
+at tool runtime.
-version.
 **Output**
 The main output of the tool is a tabular file with one line per input sequence
 and with columns providing the
 The most likely lineage assigned to a given sequence based on the inference
 engine used and the SARS-CoV-2 diversity designated.
 This assignment is sensitive to missing data at key sites.
 conflict:
-In the pangoLEARN model, a given sequence gets assigned to the most likely
-category based on known diversity.
 If a sequence can fit into more than one category, the conflict score will
 be greater than 0 and reflect the number of categories the sequence could
 fit into.
-If the conflict score is 0, this means that within the current decision
+If the conflict score is 0, this means that within the current assignment
-tree there is only one category that the sequence could be assigned to.
+model / lineage tree there is only one category that the sequence could
+plausibly be assigned to.
 ambiguity_score:
 This score is a function of the quantity of missing data in a sequence.
-It represents the proportion of relevant sites in a sequnece which were
+It represents the proportion of relevant sites in a sequence which were
 imputed to the reference values.
 A score of 1 indicates that no sites were imputed, while a score of 0
 indicates that more sites were imputed than were not imputed.
-This score only includes sites which are used by the decision tree to
+This score only includes sites which are used by the assignment engine to
 classify a sequence.
 scorpio_call:
 If a query is assigned a constellation by scorpio this call is output in
 this column.
 The version of constellations that scorpio has used to curate the lineage
 assignment.
 is_designated:
 A boolean (True/False) column indicating whether that particular sequence
-has been offically designated a lineage.
+has been officially designated a lineage (via pango-designation).
 qc_status:
 Indicates whether the sequence passed the QC thresholds for minimum length
 and maximum N content.
 qc_notes:
 Notes specific to the QC checks run on the sequences.
 note:
-If any conflicts from the decision tree, this field will output the
+If any conflicts arose during assignment, this field will output the
 alternative assignments. If the sequence failed QC this field will describe
 why.
 If the sequence met the SNP thresholds for scorpio to call a constellation,
-it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative,
+it’ll describe the exact SNP counts of Alt, Ref and Amb (alternative,
 reference and ambiguous) alleles for that call.
 ]]></help>
 <citations>
 <citation type="doi">10.1093/ve/veab064</citation>
 </citations>

Mercurial > repos > iuc > pangolin

comparison pangolin.xml @ 22:a2099fb98cdb draft