comparison pangolin.xml @ 22:a2099fb98cdb draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit d160f73f58eb515a2da4ba76096ed3d8b6c88bdc
author iuc
date Fri, 08 Jul 2022 08:33:57 +0000
parents 81804a978fc0
children 77402759b866
comparison
equal deleted inserted replaced
21:81804a978fc0 22:a2099fb98cdb
1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy2" profile="20.01"> 1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy0" profile="20.01">
2 <description>Phylogenetic Assignment of Outbreak Lineages</description> 2 <description>Phylogenetic Assignment of Outbreak Lineages</description>
3 <macros> 3 <macros>
4 <token name="@TOOL_VERSION@">4.0.5</token> 4 <token name="@TOOL_VERSION@">4.1.1</token>
5 <token name="@PANGOLIN_DATA_VERSION@">1.11</token>
6 <token name="@CONSTELLATIONS_VERSION@">0.1.10</token>
7 <token name="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@">4</token>
8 <!-- a regex describing the scorpio versions that this wrapper version
9 is backwards-compatible with; can be used with the min_scorpio_version
10 column of the constellations data table to offer only compatible
11 versions of constellations data. -->
12 <token name="@COMPATIBLE_SCORPIO_DATA_FORMAT@"><![CDATA[(^0\.[1-3]$|^0\.[0-2]\.\d+$|^0\.3\.\d$|^0\.3\.1[0-7]$|^0$)]]></token>
13 <xml name="usher_download_option">
14 <when value="download">
15 <param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Download and use also latest UShER assignment cache?"
16 help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples." />
17 </when>
18 </xml>
19 <xml name="cached_usher_assignment_cache">
20 <param name="assignment_cache_release" type="select" optional="true" label="Use corresponding UShER assignment cache?"
21 help="If the server offers a copy of the UShER assignment cache along with the specified version of pangolin-data, you can select it here to speed up UShER lineage assignment. If no suitable assignment cache is available, it is perfectly fine to proceed without one, and the performance difference will only become obvious with very large numbers of samples.">
22 <options from_data_table="pangolin_assignment">
23 <column name="value" index="0" />
24 <column name="description" index="1" />
25 <column name="path" index="4" />
26 <filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" />
27 <filter type="param_value" ref="release" column="0" />
28 </options>
29 </param>
30 </xml>
31 <xml name="cached_pangolin_data">
32 <when value="cached">
33 <param name="release" label="Cached release of pangolin-data" type="select">
34 <options from_data_table="pangolin_data">
35 <column name="value" index="0" />
36 <column name="description" index="1" />
37 <column name="date" index="3" />
38 <column name="path" index="4" />
39 <filter type="sort_by" column="3" />
40 <filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" />
41 <validator type="no_options" message="No cached pangolin-data release available" />
42 </options>
43 </param>
44 <yield />
45 </when>
46 </xml>
47 <xml name="pangolin_data_sources">
48 <conditional name="pangolin_data">
49 <param name="source" type="select" label="Version of pangolin-data to use">
50 <option value="default">Use pangolin-data version (v@PANGOLIN_DATA_VERSION@) shipped with this version of the tool</option>
51 <option value="cached">Use specific pangolin-data version cached on this Galaxy server</option>
52 <option value="download">Download latest available pangolin-data version from web</option>
53 </param>
54 <when value="default" />
55 <yield />
56 </conditional>
57 </xml>
5 </macros> 58 </macros>
6 <requirements> 59 <requirements>
7 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement> 60 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement>
61 <!-- Pin also the versions of all core dependencies - the ones
62 reported with the all-versions option of pangolin plus ucsc-fatovcf,
63 which the command is intended to report but currently cannot for
64 technical reasons - to the versions you'd get installed in an unpinned
65 conda install of pangolin at the time of release of this wrapper
66 version! By turning these dependencies into explicit requirements the
67 requirements section of the tool interface becomes the equivalent of
68 the all-versions option as long as the user doesn't update the data
69 dependencies.
70 Wrapper updates are **explicitly encouraged** when new dependency
71 versions become available. Also, please check for updated dependencies
72 when updating the wrapper for other reasons. -->
8 <requirement type="package" version="0.3.17">scorpio</requirement> 73 <requirement type="package" version="0.3.17">scorpio</requirement>
74 <requirement type="package" version="@PANGOLIN_DATA_VERSION@">pangolin-data</requirement>
75 <requirement type="package" version="@CONSTELLATIONS_VERSION@">constellations</requirement>
76 <requirement type="package" version="0.5.6">usher</requirement>
77 <requirement type="package" version="1.1.0">gofasta</requirement>
78 <requirement type="package" version="426">ucsc-fatovcf</requirement>
79 <requirement type="package" version="2.24">minimap2</requirement>
80 <!-- wrapper-specific requirements to turn pangolin's native
81 comma-separated output into tab-separated one and to truncate
82 pangolin's all-versions output. -->
9 <requirement type="package" version="0.23.0">csvtk</requirement> 83 <requirement type="package" version="0.23.0">csvtk</requirement>
84 <requirement type="package" version="3.4">grep</requirement>
10 </requirements> 85 </requirements>
11 <version_command><![CDATA[pangolin --version]]></version_command> 86 <version_command><![CDATA[pangolin --version]]></version_command>
12 <command detect_errors="exit_code"><![CDATA[ 87 <command detect_errors="exit_code"><![CDATA[
13 #if $str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache and str($db.source) != "download": 88 ## Prepare a pangolin datadir if required:
14 ## This is no good. Better to fail immediately instead of downloading a lot of data first. 89 #if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default':
15 echo "Using the latest assignment cache requires downloading the latest version of pangolin-data." 1>&2; exit 1 90 ## for at least one of pangolin-data and constellations we need to
16 #else: 91 ## provide a non-conda env version through a datadir
17 ## Sanity chceck was ok, lets do the real thing ... 92 mkdir datadir &&
18 #if str($db.source) == "download" 93 #if str($engine.pangolin_data.source) == 'download' or str($constellations.source) == 'download':
19 ## Pangolin version 4 tries to update from an existing directory 94 ## If "download latest from web" got requested for any data component,
20 mkdir datadir && 95 ## we can make use of pangolin --update-data to do the job for us.
21 pangolin --update-data --datadir datadir && 96 ## However, this would download updated versions of *all* data
22 #else if str($db.source) == "builtin" 97 ## packages into our datadir, while the user may have asked for
23 ln -s $db.db_release.fields.path datadir && 98 ## just a specific one. To avoid this, we set up a fake package
24 #end if 99 ## with very high version number in the datadir to prevent
25 #if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache: 100 ## unwanted component updates. After updating the rest of the
26 ## We need to install also the latest UShER assignment cache data. 101 ## data, we remove the fake package again.
27 ## Pangolin has functionality to do so, but uses it incorrectly. 102 #if str($engine.pangolin_data.source) != 'download':
28 ## We use the pangolin function to install into --datadir here, 103 mkdir datadir/pangolin_data &&
29 ## then point pangolin to the downloaded file later using 104 echo '__version__ = "999"' > datadir/pangolin_data/__init__.py &&
30 ## its --assignment-cache parameter 105 #end if
31 106 #if str($constellations.source) != 'download':
32 ## Create a "honeypot" package that will be picked up by pangolin, 107 mkdir datadir/constellations &&
33 ## but will trigger a download because of missing __version__ info. 108 ## constellations versions start with a 'v'!
34 mkdir pangolin_assignment && 109 echo '__version__ = "v999"' > datadir/constellations/__init__.py &&
35 touch pangolin_assignment/__init__.py && 110 #end if
36 ## Call pangolin's assignment cache install function, but 111 ## download updated packages discarding all output because we
37 ## override pip's install path 112 ## output final package versions separately below and because
38 PIP_TARGET="datadir" PIP_UPGRADE=1 python -c "from pangolin.utils import update; update.install_pangolin_assignment()" && 113 ## it would contain our fake package versions
39 #end if 114 pangolin --update-data --datadir datadir > /dev/null 2>&1 &&
40 pangolin 115 #if str($engine.pangolin_data.source) != 'download':
41 --threads \${GALAXY_SLOTS:-1} 116 rm -r datadir/pangolin_data &&
42 --tempdir "\${TMPDIR:-.}" 117 #end if
43 #if str($db.source) == "download" or str($db.source) == "builtin" 118 #if str($constellations.source) != 'download':
44 --datadir datadir 119 rm -r datadir/constellations &&
45 #end if 120 #end if
46 --analysis-mode $engine.analysis_mode 121 #end if
47 #if str($engine.analysis_mode) == 'usher': 122 #if str($engine.analysis_mode) == 'usher' and str($engine.pangolin_data.source) == 'download':
48 $engine.use_assignment_cache 123 #if $engine.pangolin_data.use_assignment_cache:
49 #if $engine.use_assignment_cache: 124 ## We need to download also the latest UShER assignment cache data.
50 ## Point pangolin to the assignment cache file we've downloaded before 125 ## Since v4.1 pangolin's
51 --assignment-cache datadir/pangolin_assignment/usher_assignments.cache.csv.gz 126 ## --add-assignment-cache/--use-assignment-cache options respect
52 #end if 127 ## --datadir so we can use them directly.
53 #end if 128 pangolin --datadir datadir --add-assignment-cache &&
54 #if $alignment: 129 #end if
55 $alignment --alignment-file '$align1' 130 #end if
56 #end if 131 ## Handle data components to be taken from data tables
57 --outfile report.csv 132 ## The folder structure pointed to by the data tables can be used
58 --max-ambig $max_ambig 133 ## as is except that we cannot symlink the folders themselves since
59 --min-length $min_length 134 ## pangolin inspects them using os.walk with the default
60 $expanded_lineage 135 ## `followlinks=False`.
61 '$input1' 136 ## Since data table versions of data packages can be older than
62 && csvtk csv2tab report.csv 137 ## the versions installed in the wrapper environment, we need to
63 #if not $include_header: 138 ## use pangolin's --use-old-datadir option to actually have them
64 | tail -n+2 139 ## used.
65 #end if 140 #set $use_old_datadir = ''
66 > '$output1' 141 #if str($engine.pangolin_data.source) == 'cached':
67 #end if 142 #set $use_old_datadir = '--use-old-datadir'
143 cp -rs '${engine.pangolin_data.release.fields.path}' datadir/pangolin_data &&
144 #if str($engine.analysis_mode) == 'usher' and $engine.pangolin_data.assignment_cache_release:
145 cp -rs '${engine.pangolin_data.assignment_cache_release.fields.path}' datadir/pangolin_assignment &&
146 #end if
147 #end if
148 #if str($constellations.source) == 'cached':
149 #set $use_old_datadir = '--use-old-datadir'
150 cp -rs '${constellations.release.fields.path}' datadir/constellations &&
151 #end if
152 ## Report all data package versions that will be used in this run of the tool
153 echo "Running pangolin with the following possibly updated data packages:" &&
154 pangolin --datadir datadir $use_old_datadir --all-versions | grep -E "pangolin-data|assignment|constellations" &&
155 #end if
156 ## Finally run the pangolin analysis
157 pangolin
158 --threads \${GALAXY_SLOTS:-1}
159 --tempdir "\${TMPDIR:-.}"
160 #if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default':
161 --datadir datadir $use_old_datadir
162 #end if
163 --analysis-mode $engine.analysis_mode
164 #if str($engine.analysis_mode) == 'usher':
165 #if str($engine.pangolin_data.source) == 'download':
166 $engine.pangolin_data.use_assignment_cache
167 #else if str($engine.pangolin_data.source) == 'cached':
168 #if $engine.pangolin_data.assignment_cache_release:
169 --use-assignment-cache
170 #end if
171 #end if
172 #end if
173 #if $alignment:
174 $alignment --alignment-file '$align1'
175 #end if
176 --outfile report.csv
177 --max-ambig $max_ambig
178 --min-length $min_length
179 $expanded_lineage
180 '$input1'
181 && csvtk csv2tab report.csv
182 #if not $include_header:
183 | tail -n+2
184 #end if
185 > '$output1'
68 ]]></command> 186 ]]></command>
69 <inputs> 187 <inputs>
70 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" /> 188 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" />
71 <conditional name="engine"> 189 <conditional name="engine">
72 <param argument="--analysis-mode" type="select" label="Analysis mode" 190 <param argument="--analysis-mode" type="select" label="Analysis mode"
73 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster"> 191 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster">
74 <option value="usher">UShER</option> 192 <option value="usher">UShER</option>
75 <option value="pangolearn">pangoLEARN</option> 193 <option value="pangolearn">pangoLEARN</option>
76 </param> 194 </param>
77 <when value="usher"> 195 <when value="usher">
78 <param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Use latest UShER assignment cache" 196 <expand macro="pangolin_data_sources">
79 help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples. Also note that using the latest assignment cache will require you to select the 'Download latest from web' option for the pangolin-data source below because assignment cache and pangolin-data need to be synchronized." /> 197 <expand macro="cached_pangolin_data">
198 <expand macro="cached_usher_assignment_cache" />
199 </expand>
200 <expand macro="usher_download_option" />
201 </expand>
80 </when> 202 </when>
81 <when value="pangolearn" /> 203 <when value="pangolearn">
204 <expand macro="pangolin_data_sources">
205 <expand macro="cached_pangolin_data" />
206 <when value="download" />
207 </expand>
208 </when>
82 </conditional> 209 </conditional>
83 <conditional name="db"> 210 <conditional name="constellations">
84 <param type="select" name="source" label="pangolin-data source" help="Where to find the pangolin-data to use for the tool run. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below."> 211 <param name="source" type="select" label="Version of constellations to use">
85 <option value="download">Download latest from web</option> 212 <option value="default">Use constellations version (v@CONSTELLATIONS_VERSION@) shipped with this version of the tool</option>
86 <option value="builtin">Use cached data from Galaxy server</option> 213 <option value="cached">Use specific constellations version cached on this Galaxy server</option>
87 <option value="default">Use default data shipped with this build of pangolin (not recommended)</option> 214 <option value="download">Download latest available constellations version from web</option>
88 </param> 215 </param>
89 <when value="download"> 216 <when value="default" />
90 <!-- these are currently not supported by the pangolin downloader --> 217 <when value="cached">
91 <!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> --> 218 <param name="release" label="Cached constellations release" type="select">
92 <!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> --> 219 <options from_data_table="pangolin_constellations">
93 </when>
94 <when value="builtin">
95 <param name="db_release" label="pangoLEARN release" type="select">
96 <options from_data_table="pangolearn">
97 <column name="value" index="0" /> 220 <column name="value" index="0" />
98 <column name="name" index="1" /> 221 <column name="description" index="1" />
99 <column name="path" index="3" /> 222 <column name="date" index="3" />
100 <filter type="sort_by" column="0" /> 223 <column name="path" index="4" />
101 <filter type="static_value" column="2" value="4.0" /> 224 <filter type="sort_by" column="3" />
102 <validator type="no_options" message="No cached pangolin-data release available" /> 225 <filter type="regexp" column="2" value="@COMPATIBLE_SCORPIO_DATA_FORMAT@" />
226 <validator type="no_options" message="No cached constellations release available" />
103 </options> 227 </options>
104 </param> 228 </param>
105 </when> 229 </when>
106 <when value="default" /> 230 <when value="download" />
107 </conditional> 231 </conditional>
108 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" /> 232 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" />
109 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" /> 233 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" />
110 <param argument="--min-length" type="integer" value="0" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." /> 234 <param argument="--min-length" type="integer" value="0" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." />
111 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." /> 235 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." />
130 </data> 254 </data>
131 </outputs> 255 </outputs>
132 <tests> 256 <tests>
133 <test expect_num_outputs="1"> 257 <test expect_num_outputs="1">
134 <param name="input1" value="test1.fasta"/> 258 <param name="input1" value="test1.fasta"/>
135 <!-- Test only the default UShER mode for now since the
136 pangolearn random forest model uses too much memory
137 see https://github.com/cov-lineages/pangolin/issues/395
138 <conditional name="engine"> 259 <conditional name="engine">
139 <conditional name="engine"> 260 <!-- Test only the default UShER mode for now since the
261 pangolearn random forest model uses too much memory
262 see https://github.com/cov-lineages/pangolin/issues/395
140 <param name="analysis_mode" value="pangolearn" /> 263 <param name="analysis_mode" value="pangolearn" />
141 </conditional> 264 -->
142 --> 265 <conditional name="pangolin_data">
143 <conditional name="db"> 266 <param name="source" value="default" />
144 <param name="source" value="default" /> 267 </conditional>
145 </conditional> 268 </conditional>
146 <output name="output1" ftype="tabular"> 269 <output name="output1" ftype="tabular">
147 <assert_contents> 270 <assert_contents>
148 <has_text_matching expression="B\.1\.1\t\d\.\d" /> 271 <has_text_matching expression="B\.1\.1\t\d\.\d" />
149 <has_text text="pass" /> 272 <has_text text="pass" />
151 </assert_contents> 274 </assert_contents>
152 </output> 275 </output>
153 </test> 276 </test>
154 <test expect_num_outputs="1"> 277 <test expect_num_outputs="1">
155 <param name="input1" value="test1.fasta"/> 278 <param name="input1" value="test1.fasta"/>
156 <conditional name="db"> 279 <conditional name="engine">
157 <param name="source" value="download" /> 280 <conditional name="pangolin_data">
281 <param name="source" value="download" />
282 </conditional>
158 </conditional> 283 </conditional>
159 <output name="output1" ftype="tabular"> 284 <output name="output1" ftype="tabular">
160 <assert_contents> 285 <assert_contents>
161 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" /> 286 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" />
162 <has_text text="pass" /> 287 <has_text text="pass" />
164 </assert_contents> 289 </assert_contents>
165 </output> 290 </output>
166 </test> 291 </test>
167 <test expect_num_outputs="2"> 292 <test expect_num_outputs="2">
168 <param name="input1" value="test1.fasta" /> 293 <param name="input1" value="test1.fasta" />
169 <!-- Test only the default UShER mode for now since the
170 pangolearn random forest model uses too much memory
171 see https://github.com/cov-lineages/pangolin/issues/395
172 <conditional name="engine"> 294 <conditional name="engine">
295 <!-- Test only the default UShER mode for now since the
296 pangolearn random forest model uses too much memory
297 see https://github.com/cov-lineages/pangolin/issues/395
173 <param name="analysis_mode" value="pangolearn" /> 298 <param name="analysis_mode" value="pangolearn" />
174 </conditional> 299 -->
175 --> 300 <conditional name="pangolin_data">
176 <conditional name="db"> 301 <param name="source" value="download" />
177 <param name="source" value="download" /> 302 </conditional>
178 </conditional> 303 </conditional>
179 <param name="alignment" value="--alignment" /> 304 <param name="alignment" value="--alignment" />
180 <output name="output1" ftype="tabular"> 305 <output name="output1" ftype="tabular">
181 <assert_contents> 306 <assert_contents>
182 <has_text_matching expression="B\.1\.1\t\d\.\d" /> 307 <has_text_matching expression="B\.1\.1\t\d\.\d" />
192 </output> 317 </output>
193 </test> 318 </test>
194 <!-- test include-header option --> 319 <!-- test include-header option -->
195 <test expect_num_outputs="1"> 320 <test expect_num_outputs="1">
196 <param name="input1" value="multiple_alignment.fasta.gz"/> 321 <param name="input1" value="multiple_alignment.fasta.gz"/>
197 <!-- Test only the default UShER mode for now since the
198 pangolearn random forest model uses too much memory
199 see https://github.com/cov-lineages/pangolin/issues/395
200 <conditional name="engine"> 322 <conditional name="engine">
323 <!-- Test only the default UShER mode for now since the
324 pangolearn random forest model uses too much memory
325 see https://github.com/cov-lineages/pangolin/issues/395
201 <param name="analysis_mode" value="pangolearn" /> 326 <param name="analysis_mode" value="pangolearn" />
202 </conditional> 327 -->
203 --> 328 <conditional name="pangolin_data">
204 <conditional name="db"> 329 <param name="source" value="default" />
205 <param name="source" value="default" /> 330 </conditional>
206 </conditional> 331 </conditional>
207 <param name="include_header" value="true" /> 332 <param name="include_header" value="true" />
208 <output name="output1" ftype="tabular"> 333 <output name="output1" ftype="tabular">
209 <assert_contents> 334 <assert_contents>
210 <has_text text="pangolin_version" /> 335 <has_text text="pangolin_version" />
215 <has_n_lines n="35" /> 340 <has_n_lines n="35" />
216 <has_n_columns n="16" /> 341 <has_n_columns n="16" />
217 </assert_contents> 342 </assert_contents>
218 </output> 343 </output>
219 </test> 344 </test>
220 <!-- Test that use of latest assignment cache requires downloaded other data -->
221 <test expect_failure="true">
222 <param name="input1" value="multiple_alignment.fasta.gz"/>
223 <conditional name="engine">
224 <param name="use_assignment_cache" value="true" />
225 </conditional>
226 <conditional name="db">
227 <param name="source" value="default" />
228 </conditional>
229 </test>
230 <!-- test with extra expanded_lineage column --> 345 <!-- test with extra expanded_lineage column -->
231 <test expect_num_outputs="1"> 346 <test expect_num_outputs="1">
232 <param name="input1" value="multiple_alignment.fasta.gz"/> 347 <param name="input1" value="multiple_alignment.fasta.gz"/>
233 <!-- Test only the default UShER mode for now since the
234 pangolearn random forest model uses too much memory
235 see https://github.com/cov-lineages/pangolin/issues/395
236 <conditional name="engine"> 348 <conditional name="engine">
349 <!-- Test only the default UShER mode for now since the
350 pangolearn random forest model uses too much memory
351 see https://github.com/cov-lineages/pangolin/issues/395
237 <param name="analysis_mode" value="pangolearn" /> 352 <param name="analysis_mode" value="pangolearn" />
238 </conditional> 353 -->
239 --> 354 <conditional name="pangolin_data">
240 <conditional name="db"> 355 <param name="source" value="default" />
241 <param name="source" value="default" /> 356 </conditional>
242 </conditional> 357 </conditional>
243 <param name="expanded_lineage" value="true" /> 358 <param name="expanded_lineage" value="true" />
244 <param name="include_header" value="true" /> 359 <param name="include_header" value="true" />
245 <output name="output1" ftype="tabular"> 360 <output name="output1" ftype="tabular">
246 <assert_contents> 361 <assert_contents>
263 `Pangolin <https://cov-lineages.org/pangolin.html>`_ 378 `Pangolin <https://cov-lineages.org/pangolin.html>`_
264 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a 379 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a
265 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO 380 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO
266 nomenclature system. 381 nomenclature system.
267 382
268 383 **Data sources/versioning and reproducibility**
269 **Data sources/versioning**
270 384
271 Pangolin uses the 385 Pangolin uses the
272 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as 386 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as
273 a source of its required model, protobuf, designation hash and alias files, and 387 a source of its required model, protobuf, designation hash and alias files, and
274 the `constellations <https://github.com/cov-lineages/constellations>`_ 388 the `constellations <https://github.com/cov-lineages/constellations>`_
275 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based 389 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based
276 assignment of lineages of concern. 390 assignment of lineages of concern.
277 The tool ships with a copy of this data, but the data gets updated more 391
278 frequently than the tool! In general one should use the most recent model for 392 The tool ships with copies of these two data packages, and using these shipped
279 lineage assignment, and the default option for this tool is to download the 393 versions is *recommended* for reproducibility (even across Galaxy servers) and
280 latest versions of pangolin-data and constellations before embarking on 394 speed of job execution.
281 analysis. 395
282 A pangoLEARN data manager exists so that the Galaxy admin can download specific 396 If your instance of Galaxy offers cached alternative versions of
283 versions of the pangolin-data/constellations as required. Finally the pangolin 397 `pangolin-data` and/or `constellations`, you will be able to use them instead
284 tool can use its default built-in data packages, but this is 398 of the shipped versions, which can be useful to reproduce results obtained
285 **not recommended** as it will almost certainly be out of date. 399 earlier with previous versions of pangolin.
400
401 Finally, you have the option to *download the latest version* of each data
402 package at job runtime.
403
404 .. class:: warningmark
405
406 You can use this option as a workaround to get the most up-to-date lineage
407 assignments even before the next Galaxy tool update (or before an admin
408 installs new cached data versions on your server), but be aware of the
409 following limitations:
410
411 1. Using latest downloaded data package versions renders results hard to
412 reproduce (e.g. rerunning a corresponding job will also cause a fresh
413 data download, which may yield different data versions than in the initial
414 run).
415
416 2. Downloaded latest versions of the data packages may be incompatible
417 with the *pangolin* and *scorpio* version run by the tool, which can
418 result in failing tool runs, but occasionally also in harder to diagnose
419 lineage assignment issues.
286 420
287 .. class:: infomark 421 .. class:: infomark
288 422
289 The exact combination of pangolin, inference engine (UShER/pangoLEARN), 423 The exact combination of pangolin, inference engine (UShER/pangoLEARN),
290 scorpio, and data packages used for a particular run of the tool can be 424 scorpio, and data packages that was used for a particular run of the tool
291 extracted from the four "version" columns in the output (see below for 425 can be extracted from the four "version" columns in the output (see below
292 details). 426 for details).
293 427
294 .. class:: warningmark 428 In addition, lineage assignment with pangolin can be affected by the exact
295 429 versions of additional underlying software. The packaged versions of all
296 The "Download latest from web" updates the *pangolin-data* and 430 relevant dependencies are listed in the *Requirements* section below. This
297 *constellations* packages but not the software (pangolin and scorpio) using 431 section is the equivalent of running `pangolin --all-versions` on the
298 these data packages. 432 command line except that the listed versions of *pangolin-data* and
299 If the data package format changes upstream, this can cause the tool run to 433 *constellations* are the ones installed with pangolin and may have been
300 fail. Cached data packages (or, in the worst case, the built-in data) can 434 overridden with the versions reported in the corresponding output columns
301 serve as a fallback until switching to an updated pangolin tool 435 at tool runtime.
302 version.
303
304 436
305 **Output** 437 **Output**
306 438
307 The main output of the tool is a tabular file with one line per input sequence 439 The main output of the tool is a tabular file with one line per input sequence
308 and with columns providing the 440 and with columns providing the
315 The most likely lineage assigned to a given sequence based on the inference 447 The most likely lineage assigned to a given sequence based on the inference
316 engine used and the SARS-CoV-2 diversity designated. 448 engine used and the SARS-CoV-2 diversity designated.
317 This assignment is sensitive to missing data at key sites. 449 This assignment is sensitive to missing data at key sites.
318 450
319 conflict: 451 conflict:
320 In the pangoLEARN model, a given sequence gets assigned to the most likely
321 category based on known diversity.
322 If a sequence can fit into more than one category, the conflict score will 452 If a sequence can fit into more than one category, the conflict score will
323 be greater than 0 and reflect the number of categories the sequence could 453 be greater than 0 and reflect the number of categories the sequence could
324 fit into. 454 fit into.
325 If the conflict score is 0, this means that within the current decision 455 If the conflict score is 0, this means that within the current assignment
326 tree there is only one category that the sequence could be assigned to. 456 model / lineage tree there is only one category that the sequence could
457 plausibly be assigned to.
327 458
328 ambiguity_score: 459 ambiguity_score:
329 This score is a function of the quantity of missing data in a sequence. 460 This score is a function of the quantity of missing data in a sequence.
330 It represents the proportion of relevant sites in a sequnece which were 461 It represents the proportion of relevant sites in a sequence which were
331 imputed to the reference values. 462 imputed to the reference values.
332 A score of 1 indicates that no sites were imputed, while a score of 0 463 A score of 1 indicates that no sites were imputed, while a score of 0
333 indicates that more sites were imputed than were not imputed. 464 indicates that more sites were imputed than were not imputed.
334 This score only includes sites which are used by the decision tree to 465 This score only includes sites which are used by the assignment engine to
335 classify a sequence. 466 classify a sequence.
336 467
337 scorpio_call: 468 scorpio_call:
338 If a query is assigned a constellation by scorpio this call is output in 469 If a query is assigned a constellation by scorpio this call is output in
339 this column. 470 this column.
385 The version of constellations that scorpio has used to curate the lineage 516 The version of constellations that scorpio has used to curate the lineage
386 assignment. 517 assignment.
387 518
388 is_designated: 519 is_designated:
389 A boolean (True/False) column indicating whether that particular sequence 520 A boolean (True/False) column indicating whether that particular sequence
390 has been officially designated a lineage. 521 has been officially designated a lineage (via pango-designation).
391 522
392 qc_status: 523 qc_status:
393 Indicates whether the sequence passed the QC thresholds for minimum length 524 Indicates whether the sequence passed the QC thresholds for minimum length
394 and maximum N content. 525 and maximum N content.
395 526
396 qc_notes: 527 qc_notes:
397 Notes specific to the QC checks run on the sequences. 528 Notes specific to the QC checks run on the sequences.
398 529
399 note: 530 note:
400 If any conflicts from the decision tree, this field will output the 531 If any conflicts arose during assignment, this field will output the
401 alternative assignments. If the sequence failed QC this field will describe 532 alternative assignments. If the sequence failed QC this field will describe
402 why. 533 why.
403 If the sequence met the SNP thresholds for scorpio to call a constellation, 534 If the sequence met the SNP thresholds for scorpio to call a constellation,
404 it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative, 535 it’ll describe the exact SNP counts of Alt, Ref and Amb (alternative,
405 reference and ambiguous) alleles for that call. 536 reference and ambiguous) alleles for that call.
406 ]]></help> 537 ]]></help>
407 <citations> 538 <citations>
408 <citation type="doi">10.1093/ve/veab064</citation> 539 <citation type="doi">10.1093/ve/veab064</citation>
409 </citations> 540 </citations>