# HG changeset patch # User iuc # Date 1657269237 0 # Node ID a2099fb98cdb10c544ccc1bbff23eb30680e8ce7 # Parent 81804a978fc04be5c1da8ae45c6e18edd4d3104e planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit d160f73f58eb515a2da4ba76096ed3d8b6c88bdc diff -r 81804a978fc0 -r a2099fb98cdb pangolin.xml --- a/pangolin.xml Sat May 07 19:52:10 2022 +0000 +++ b/pangolin.xml Fri Jul 08 08:33:57 2022 +0000 @@ -1,70 +1,188 @@ - + Phylogenetic Assignment of Outbreak Lineages - 4.0.5 + 4.1.1 + 1.11 + 0.1.10 + 4 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + pangolin + scorpio + pangolin-data + constellations + usher + gofasta + ucsc-fatovcf + minimap2 + csvtk + grep &2; exit 1 - #else: - ## Sanity chceck was ok, lets do the real thing ... - #if str($db.source) == "download" - ## Pangolin version 4 tries to update from an existing directory - mkdir datadir && - pangolin --update-data --datadir datadir && - #else if str($db.source) == "builtin" - ln -s $db.db_release.fields.path datadir && +## Prepare a pangolin datadir if required: +#if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default': + ## for at least one of pangolin-data and constellations we need to + ## provide a non-conda env version through a datadir + mkdir datadir && + #if str($engine.pangolin_data.source) == 'download' or str($constellations.source) == 'download': + ## If "download latest from web" got requested for any data component, + ## we can make use of pangolin --update-data to do the job for us. + ## However, this would download updated versions of *all* data + ## packages into our datadir, while the user may have asked for + ## just a specific one. To avoid this, we set up a fake package + ## with very high version number in the datadir to prevent + ## unwanted component updates. After updating the rest of the + ## data, we remove the fake package again. 
+ #if str($engine.pangolin_data.source) != 'download': + mkdir datadir/pangolin_data && + echo '__version__ = "999"' > datadir/pangolin_data/__init__.py && #end if - #if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache: - ## We need to install also the latest UShER assignment cache data. - ## Pangolin has functionality to do so, but uses it incorrectly. - ## We use the pangolin function to install into --datadir here, - ## then point pangolin to the downloaded file later using - ## its --assignment-cache parameter - - ## Create a "honeypot" package that will be picked up by pangolin, - ## but will trigger a download because of missing __version__ info. - mkdir pangolin_assignment && - touch pangolin_assignment/__init__.py && - ## Call pangolin's assignment cache install function, but - ## override pip's install path - PIP_TARGET="datadir" PIP_UPGRADE=1 python -c "from pangolin.utils import update; update.install_pangolin_assignment()" && + #if str($constellations.source) != 'download': + mkdir datadir/constellations && + ## constellations versions start with a 'v'! + echo '__version__ = "v999"' > datadir/constellations/__init__.py && + #end if + ## download updated packages discarding stdout because we + ## output final package versions separately below and because + ## it would contain our fake package versions + pangolin --update-data --datadir datadir 2&> /dev/null && + #if str($engine.pangolin_data.source) != 'download': + rm -r datadir/pangolin_data && + #end if + #if str($constellations.source) != 'download': + rm -r datadir/constellations && + #end if + #end if + #if str($engine.analysis_mode) == 'usher' and str($engine.pangolin_data.source) == 'download': + #if $engine.pangolin_data.use_assignment_cache: + ## We need to download also the latest UShER assignment cache data. + ## Since v4.1 pangolin's + ## --add-assignment-cache/--use-assignment-cache options respect + ## --datadir so we can use them directly. 
+ pangolin --datadir datadir --add-assignment-cache && #end if - pangolin - --threads \${GALAXY_SLOTS:-1} - --tempdir "\${TMPDIR:-.}" - #if str($db.source) == "download" or str($db.source) == "builtin" - --datadir datadir - #end if - --analysis-mode $engine.analysis_mode - #if str($engine.analysis_mode) == 'usher': - $engine.use_assignment_cache - #if $engine.use_assignment_cache: - ## Point pangolin to the assignment cache file we've downloaded before - --assignment-cache datadir/pangolin_assignment/usher_assignments.cache.csv.gz - #end if + #end if + ## Handle data components to be taken from data tables + ## The folder structure pointed to by the data tables can be used + ## as is except that we cannot symlink the folders themselves since + ## pangolin inspects them using os.walk with the default + ## `followlinks=False`. + ## Since data table versions of data packages can be older than + ## the versions installed in the wrapper environment, we need to + ## use pangolin's --use-old-datadir option to actually have them + ## used. 
+ #set $use_old_datadir = '' + #if str($engine.pangolin_data.source) == 'cached': + #set $use_old_datadir = '--use-old-datadir' + cp -rs '${engine.pangolin_data.release.fields.path}' datadir/pangolin_data && + #if str($engine.analysis_mode) == 'usher' and $engine.pangolin_data.assignment_cache_release: + cp -rs '${engine.pangolin_data.assignment_cache_release.fields.path}' datadir/pangolin_assignment && #end if - #if $alignment: - $alignment --alignment-file '$align1' + #end if + #if str($constellations.source) == 'cached': + #set $use_old_datadir = '--use-old-datadir' + cp -rs '${constellations.release.fields.path}' datadir/constellations && + #end if + ## Report all data package versions that will be used in this run of the tool + echo "Running pangolin with the following possibly updated data packages:" && + pangolin --datadir datadir $use_old_datadir --all-versions | grep -E "pangolin-data|assignment|constellations" && +#end if +## Finally run the pangolin analysis +pangolin +--threads \${GALAXY_SLOTS:-1} +--tempdir "\${TMPDIR:-.}" +#if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default': + --datadir datadir $use_old_datadir +#end if +--analysis-mode $engine.analysis_mode +#if str($engine.analysis_mode) == 'usher': + #if str($engine.pangolin_data.source) == 'download': + $engine.pangolin_data.use_assignment_cache + #else if str($engine.pangolin_data.source) == 'cached': + #if $engine.pangolin_data.assignment_cache_release: + --use-assignment-cache #end if - --outfile report.csv - --max-ambig $max_ambig - --min-length $min_length - $expanded_lineage - '$input1' - && csvtk csv2tab report.csv - #if not $include_header: - | tail -n+2 - #end if - > '$output1' - #end if + #end if +#end if +#if $alignment: + $alignment --alignment-file '$align1' +#end if +--outfile report.csv +--max-ambig $max_ambig +--min-length $min_length +$expanded_lineage +'$input1' +&& csvtk csv2tab report.csv +#if not $include_header: + | tail -n+2 +#end 
if +> '$output1' ]]> @@ -75,35 +193,41 @@ - + + + + + + - + + + + + + - - - - - + + + + + - - - - - - - - + + + + - - - - - + + + + + + - + @@ -132,16 +256,15 @@ - - - + --> + + + @@ -153,8 +276,10 @@ - - + + + + @@ -166,15 +291,15 @@ - - - + --> + + + @@ -194,15 +319,15 @@ - - - + --> + + + @@ -217,28 +342,18 @@ - - - - - - - - - - - - - + --> + + + @@ -265,8 +380,7 @@ SARS-CoV-2 genome sequence the most likely lineage based on the PANGO nomenclature system. - -**Data sources/versioning** +**Data sources/versioning and reproducibility** Pangolin uses the `pangolin-data `_ repository as @@ -274,33 +388,51 @@ the `constellations `_ repository for `scorpio `_ -based assignment of lineages of concern. -The tool ships with a copy of this data, but the data gets updated more -frequently than the tool! In general one should use the most recent model for -lineage assignment, and the default option for this tool is to download the -latest versions of pangolin-data and constellations before embarking on -analysis. -A pangoLEARN data manager exists so that the Galaxy admin can download specific -versions of the pangolin-data/constellations as required. Finally the pangolin -tool can use its default built-in data packages, but this is -**not recommended** as it will almost certainly be out of date. + +The tool ships with copies of these two data packages, and using these shipped +versions is *recommended* for reproducibility (even across Galaxy servers) and +speed of job execution. + +If your instance of Galaxy offers cached alternative versions of +`pangolin-data` and/or `constellations`, you will be able to use them instead +of the shipped versions, which can be useful to reproduce results obtained +earlier with previous versions of pangolin. + +Finally, you have the option to *download the latest version* of each data +package at job runtime. + +.. 
class:: warningmark + + You can use this option as a workaround to get the most up-to-date lineage + assignments even before the next Galaxy tool update (or before an admin + installs new cached data versions on your server), but be aware of the + following limitations: + +1. Using latest downloaded data package versions renders results hard to + reproduce (e.g. rerunning a corresponding job will also trigger a fresh + data download, which may yield different data versions than in the initial + run). + +2. Downloaded latest versions of the data packages may be incompatible + with the *pangolin* and *scorpio* versions run by the tool, which can + result in failing tool runs, but occasionally also in harder to diagnose + lineage assignment issues. .. class:: infomark The exact combination of pangolin, inference engine (UShER/pangoLEARN), - scorpio, and data packages used for a particular run of the tool can be - extracted from the four "version" columns in the output (see below for - details). - -.. class:: warningmark + scorpio, and data packages that were used for a particular run of the tool + can be extracted from the four "version" columns in the output (see below + for details). - The "Download latest from web" updates the *pangolin-data* and - *constellations* packages but not the software (pangolin and scorpio) using - these data packages. - If the data package format changes upstream, this can cause the tool run to - fail. Cached data packages (or, in the worst case, the built-in data) can - serve as a fallback until switching to an updated pangolin tool - version. - + In addition, lineage assignment with pangolin can be affected by the exact + versions of additional underlying software. The packaged versions of all + relevant dependencies are listed in the *Requirements* section below. 
This + section is equivalent to running `pangolin --all-versions` on the + command line except that the listed versions of *pangolin-data* and + *constellations* are the ones installed with pangolin and may have been + overridden with the versions reported in the corresponding output columns + at tool runtime. **Output** @@ -317,21 +449,20 @@ This assignment is sensitive to missing data at key sites. conflict: - In the pangoLEARN model, a given sequence gets assigned to the most likely - category based on known diversity. If a sequence can fit into more than one category, the conflict score will be greater than 0 and reflect the number of categories the sequence could fit into. - If the conflict score is 0, this means that within the current decision - tree there is only one category that the sequence could be assigned to. + If the conflict score is 0, this means that within the current assignment + model / lineage tree there is only one category that the sequence could + plausibly be assigned to. ambiguity_score: This score is a function of the quantity of missing data in a sequence. - It represents the proportion of relevant sites in a sequnece which were + It represents the proportion of relevant sites in a sequence which were imputed to the reference values. A score of 1 indicates that no sites were imputed, while a score of 0 indicates that more sites were imputed than were not imputed. - This score only includes sites which are used by the decision tree to + This score only includes sites which are used by the assignment engine to classify a sequence. scorpio_call: @@ -387,7 +518,7 @@ is_designated: A boolean (True/False) column indicating whether that particular sequence - has been offically designated a lineage. + has been officially designated a lineage (via pango-designation). qc_status: Indicates whether the sequence passed the QC thresholds for minimum length @@ -397,11 +528,11 @@ Notes specific to the QC checks run on the sequences. 
note: - If any conflicts from the decision tree, this field will output the + If any conflicts arose during assignment, this field will output the alternative assignments. If the sequence failed QC this field will describe why. If the sequence met the SNP thresholds for scorpio to call a constellation, - it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative, + it’ll describe the exact SNP counts of Alt, Ref and Amb (alternative, reference and ambiguous) alleles for that call. ]]>