Mercurial > repos > iuc > pangolin
comparison pangolin.xml @ 22:a2099fb98cdb draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit d160f73f58eb515a2da4ba76096ed3d8b6c88bdc
author | iuc |
---|---|
date | Fri, 08 Jul 2022 08:33:57 +0000 |
parents | 81804a978fc0 |
children | 77402759b866 |
comparison
equal
deleted
inserted
replaced
21:81804a978fc0 | 22:a2099fb98cdb |
---|---|
1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy2" profile="20.01"> | 1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy0" profile="20.01"> |
2 <description>Phylogenetic Assignment of Outbreak Lineages</description> | 2 <description>Phylogenetic Assignment of Outbreak Lineages</description> |
3 <macros> | 3 <macros> |
4 <token name="@TOOL_VERSION@">4.0.5</token> | 4 <token name="@TOOL_VERSION@">4.1.1</token> |
5 <token name="@PANGOLIN_DATA_VERSION@">1.11</token> | |
6 <token name="@CONSTELLATIONS_VERSION@">0.1.10</token> | |
7 <token name="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@">4</token> | |
8 <!-- a regex describing the scorpio versions that this wrapper version | |
9 is backwards-compatible with; can be used with the min_scorpio_version | |
10 column of the constellations data table to offer only compatible | |
11 versions of constellations data. --> | |
12 <token name="@COMPATIBLE_SCORPIO_DATA_FORMAT@"><![CDATA[(^0\.[1-3]$|^0\.[0-2]\.\d+$|^0\.3\.\d$|^0\.3\.1[0-7]$|^0$)]]></token> | |
13 <xml name="usher_download_option"> | |
14 <when value="download"> | |
15 <param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Download and use also latest UShER assignment cache?" | |
16 help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples." /> | |
17 </when> | |
18 </xml> | |
19 <xml name="cached_usher_assignment_cache"> | |
20 <param name="assignment_cache_release" type="select" optional="true" label="Use corresponding UShER assignment cache?" | |
21 help="If the server offers a copy of the UShER assignment cache along with the specified version of pangolin-data, you can select it here to speed up UShER lineage assignment. If no suitable assignment cache is available, it is perfectly fine to proceed without one, and the performance difference will only become obvious with very large numbers of samples."> | |
22 <options from_data_table="pangolin_assignment"> | |
23 <column name="value" index="0" /> | |
24 <column name="description" index="1" /> | |
25 <column name="path" index="4" /> | |
26 <filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" /> | |
27 <filter type="param_value" ref="release" column="0" /> | |
28 </options> | |
29 </param> | |
30 </xml> | |
31 <xml name="cached_pangolin_data"> | |
32 <when value="cached"> | |
33 <param name="release" label="Cached release of pangolin-data" type="select"> | |
34 <options from_data_table="pangolin_data"> | |
35 <column name="value" index="0" /> | |
36 <column name="description" index="1" /> | |
37 <column name="date" index="3" /> | |
38 <column name="path" index="4" /> | |
39 <filter type="sort_by" column="3" /> | |
40 <filter type="static_value" column="2" value="@MIN_COMPATIBLE_PANGOLIN_DATA_FORMAT@" /> | |
41 <validator type="no_options" message="No cached constellations release available" /> | |
42 </options> | |
43 </param> | |
44 <yield /> | |
45 </when> | |
46 </xml> | |
47 <xml name="pangolin_data_sources"> | |
48 <conditional name="pangolin_data"> | |
49 <param name="source" type="select" label="Version of pangolin-data to use"> | |
50 <option value="default">Use pangolin-data version (v@PANGOLIN_DATA_VERSION@) shipped with this version of the tool</option> | |
51 <option value="cached">Use specific pangolin-data version cached on this Galaxy server</option> | |
52 <option value="download">Download latest available pangolin-data version from web</option> | |
53 </param> | |
54 <when value="default" /> | |
55 <yield /> | |
56 </conditional> | |
57 </xml> | |
5 </macros> | 58 </macros> |
6 <requirements> | 59 <requirements> |
7 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement> | 60 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement> |
61 <!-- Pin also the versions of all core dependencies - the ones | |
62 reported with the all-versions option of pangolin plus ucsc-fatovcf, | |
63 which the command is intended to report but currently cannot for | |
64 technical reasons - to the versions you'd get installed in an unpinned | |
65 conda install of pangolin at the time of release of this wrapper | |
66 version! By turning these dependencies into explicit requirements the | |
67 requirements section of the tool interface becomes the equivalent of | |
68 the all-versions option as long as the user doesn't update the data | |
69 dependencies. | |
70 Wrapper updates are **explicitly encouraged** when new dependency | |
71 versions become available. Also, please check for updated dependencies | |
72 when updating the wrapper for other reasons. --> | |
8 <requirement type="package" version="0.3.17">scorpio</requirement> | 73 <requirement type="package" version="0.3.17">scorpio</requirement> |
74 <requirement type="package" version="@PANGOLIN_DATA_VERSION@">pangolin-data</requirement> | |
75 <requirement type="package" version="@CONSTELLATIONS_VERSION@">constellations</requirement> | |
76 <requirement type="package" version="0.5.6">usher</requirement> | |
77 <requirement type="package" version="1.1.0">gofasta</requirement> | |
78 <requirement type="package" version="426">ucsc-fatovcf</requirement> | |
79 <requirement type="package" version="2.24">minimap2</requirement> | |
80 <!-- wrapper-specific requirements to turn pangolin's native | |
81 comma-separated output into tab-separated one and to truncate | |
82 pangolin's all-versions output. --> | |
9 <requirement type="package" version="0.23.0">csvtk</requirement> | 83 <requirement type="package" version="0.23.0">csvtk</requirement> |
84 <requirement type="package" version="3.4">grep</requirement> | |
10 </requirements> | 85 </requirements> |
11 <version_command><![CDATA[pangolin --version]]></version_command> | 86 <version_command><![CDATA[pangolin --version]]></version_command> |
12 <command detect_errors="exit_code"><![CDATA[ | 87 <command detect_errors="exit_code"><![CDATA[ |
13 #if $str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache and str($db.source) != "download": | 88 ## Prepare a pangolin datadir if required: |
14 ## This is no good. Better to fail immediately instead of downloading a lot of data first. | 89 #if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default': |
15 echo "Using the latest assignment cache requires downloading the latest version of pangolin-data." 1>&2; exit 1 | 90 ## for at least one of pangolin-data and constellations we need to |
16 #else: | 91 ## provide a non-conda env version through a datadir |
17 ## Sanity check was ok, let's do the real thing ... | 92 mkdir datadir && |
18 #if str($db.source) == "download" | 93 #if str($engine.pangolin_data.source) == 'download' or str($constellations.source) == 'download': |
19 ## Pangolin version 4 tries to update from an existing directory | 94 ## If "download latest from web" got requested for any data component, |
20 mkdir datadir && | 95 ## we can make use of pangolin --update-data to do the job for us. |
21 pangolin --update-data --datadir datadir && | 96 ## However, this would download updated versions of *all* data |
22 #else if str($db.source) == "builtin" | 97 ## packages into our datadir, while the user may have asked for |
23 ln -s $db.db_release.fields.path datadir && | 98 ## just a specific one. To avoid this, we set up a fake package |
24 #end if | 99 ## with very high version number in the datadir to prevent |
25 #if str($engine.analysis_mode) == 'usher' and $engine.use_assignment_cache: | 100 ## unwanted component updates. After updating the rest of the |
26 ## We need to install also the latest UShER assignment cache data. | 101 ## data, we remove the fake package again. |
27 ## Pangolin has functionality to do so, but uses it incorrectly. | 102 #if str($engine.pangolin_data.source) != 'download': |
28 ## We use the pangolin function to install into --datadir here, | 103 mkdir datadir/pangolin_data && |
29 ## then point pangolin to the downloaded file later using | 104 echo '__version__ = "999"' > datadir/pangolin_data/__init__.py && |
30 ## its --assignment-cache parameter | 105 #end if |
31 | 106 #if str($constellations.source) != 'download': |
32 ## Create a "honeypot" package that will be picked up by pangolin, | 107 mkdir datadir/constellations && |
33 ## but will trigger a download because of missing __version__ info. | 108 ## constellations versions start with a 'v'! |
34 mkdir pangolin_assignment && | 109 echo '__version__ = "v999"' > datadir/constellations/__init__.py && |
35 touch pangolin_assignment/__init__.py && | 110 #end if |
36 ## Call pangolin's assignment cache install function, but | 111 ## download updated packages discarding stdout because we |
37 ## override pip's install path | 112 ## output final package versions separately below and because |
38 PIP_TARGET="datadir" PIP_UPGRADE=1 python -c "from pangolin.utils import update; update.install_pangolin_assignment()" && | 113 ## it would contain our fake package versions |
39 #end if | 114 pangolin --update-data --datadir datadir 2&> /dev/null && |
40 pangolin | 115 #if str($engine.pangolin_data.source) != 'download': |
41 --threads \${GALAXY_SLOTS:-1} | 116 rm -r datadir/pangolin_data && |
42 --tempdir "\${TMPDIR:-.}" | 117 #end if |
43 #if str($db.source) == "download" or str($db.source) == "builtin" | 133 ## as is except that we cannot symlink the folders themselves since |
44 --datadir datadir | 119 rm -r datadir/constellations && |
45 #end if | 120 #end if |
46 --analysis-mode $engine.analysis_mode | 121 #end if |
47 #if str($engine.analysis_mode) == 'usher': | 122 #if str($engine.analysis_mode) == 'usher' and str($engine.pangolin_data.source) == 'download': |
48 $engine.use_assignment_cache | 123 #if $engine.pangolin_data.use_assignment_cache: |
49 #if $engine.use_assignment_cache: | 124 ## We need to download also the latest UShER assignment cache data. |
50 ## Point pangolin to the assignment cache file we've downloaded before | 125 ## Since v4.1 pangolin's |
51 --assignment-cache datadir/pangolin_assignment/usher_assignments.cache.csv.gz | 126 ## --add-assignment-cache/--use-assignment-cache options respect |
52 #end if | 127 ## --datadir so we can use them directly. |
53 #end if | 128 pangolin --datadir datadir --add-assignment-cache && |
54 #if $alignment: | 129 #end if |
55 $alignment --alignment-file '$align1' | 130 #end if |
56 #end if | 131 ## Handle data components to be taken from data tables |
57 --outfile report.csv | 132 ## The folder structure pointed to by the data tables can be used |
58 --max-ambig $max_ambig | 133 ## as is except that cannot symlink the folders themselves since |
59 --min-length $min_length | 134 ## pangolin inspects them using os.walk with the default |
60 $expanded_lineage | 135 ## `followlinks=False`. |
61 '$input1' | 136 ## Since data table versions of data packages can be older than |
62 && csvtk csv2tab report.csv | 137 ## the versions installed in the wrapper environment, we need to |
63 #if not $include_header: | 138 ## use pangolin's --use-old-datadir option to actually have them |
64 | tail -n+2 | 139 ## used. |
65 #end if | 140 #set $use_old_datadir = '' |
66 > '$output1' | 141 #if str($engine.pangolin_data.source) == 'cached': |
67 #end if | 142 #set $use_old_datadir = '--use-old-datadir' |
143 cp -rs '${engine.pangolin_data.release.fields.path}' datadir/pangolin_data && | |
144 #if str($engine.analysis_mode) == 'usher' and $engine.pangolin_data.assignment_cache_release: | |
145 cp -rs '${engine.pangolin_data.assignment_cache_release.fields.path}' datadir/pangolin_assignment && | |
146 #end if | |
147 #end if | |
148 #if str($constellations.source) == 'cached': | |
149 #set $use_old_datadir = '--use-old-datadir' | |
150 cp -rs '${constellations.release.fields.path}' datadir/constellations && | |
151 #end if | |
152 ## Report all data package versions that will be used in this run of the tool | |
153 echo "Running pangolin with the following possibly updated data packages:" && | |
154 pangolin --datadir datadir $use_old_datadir --all-versions | grep -E "pangolin-data|assignment|constellations" && | |
155 #end if | |
156 ## Finally run the pangolin analysis | |
157 pangolin | |
158 --threads \${GALAXY_SLOTS:-1} | |
159 --tempdir "\${TMPDIR:-.}" | |
160 #if str($engine.pangolin_data.source) != 'default' or str($constellations.source) != 'default': | |
161 --datadir datadir $use_old_datadir | |
162 #end if | |
163 --analysis-mode $engine.analysis_mode | |
164 #if str($engine.analysis_mode) == 'usher': | |
165 #if str($engine.pangolin_data.source) == 'download': | |
166 $engine.pangolin_data.use_assignment_cache | |
167 #else if str($engine.pangolin_data.source) == 'cached': | |
168 #if $engine.pangolin_data.assignment_cache_release: | |
169 --use-assignment-cache | |
170 #end if | |
171 #end if | |
172 #end if | |
173 #if $alignment: | |
174 $alignment --alignment-file '$align1' | |
175 #end if | |
176 --outfile report.csv | |
177 --max-ambig $max_ambig | |
178 --min-length $min_length | |
179 $expanded_lineage | |
180 '$input1' | |
181 && csvtk csv2tab report.csv | |
182 #if not $include_header: | |
183 | tail -n+2 | |
184 #end if | |
185 > '$output1' | |
68 ]]></command> | 186 ]]></command> |
69 <inputs> | 187 <inputs> |
70 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" /> | 188 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" /> |
71 <conditional name="engine"> | 189 <conditional name="engine"> |
72 <param argument="--analysis-mode" type="select" label="Analysis mode" | 190 <param argument="--analysis-mode" type="select" label="Analysis mode" |
73 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster"> | 191 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster"> |
74 <option value="usher">UShER</option> | 192 <option value="usher">UShER</option> |
75 <option value="pangolearn">pangoLEARN</option> | 193 <option value="pangolearn">pangoLEARN</option> |
76 </param> | 194 </param> |
77 <when value="usher"> | 195 <when value="usher"> |
78 <param argument="--use-assignment-cache" type="boolean" truevalue="--use-assignment-cache" falsevalue="" label="Use latest UShER assignment cache" | 196 <expand macro="pangolin_data_sources"> |
79 help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples. Also note that using the latest assignment cache will require you to select the 'Download latest from web' option for the pangolin-data source below because assignment cache and pangolin-data need to be synchronized." /> | 197 <expand macro="cached_pangolin_data"> |
198 <expand macro="cached_usher_assignment_cache" /> | |
199 </expand> | |
200 <expand macro="usher_download_option" /> | |
201 </expand> | |
80 </when> | 202 </when> |
81 <when value="pangolearn" /> | 203 <when value="pangolearn"> |
204 <expand macro="pangolin_data_sources"> | |
205 <expand macro="cached_pangolin_data" /> | |
206 <when value="download" /> | |
207 </expand> | |
208 </when> | |
82 </conditional> | 209 </conditional> |
83 <conditional name="db"> | 210 <conditional name="constellations"> |
84 <param type="select" name="source" label="pangolin-data source" help="Where to find the pangolin-data to use for the tool run. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below."> | 211 <param name="source" type="select" label="Version of constellations to use"> |
85 <option value="download">Download latest from web</option> | 212 <option value="default">Use constellations version (v@CONSTELLATIONS_VERSION@) shipped with this version of the tool</option> |
86 <option value="builtin">Use cached data from Galaxy server</option> | 213 <option value="cached">Use specific constellations version cached on this Galaxy server</option> |
87 <option value="default">Use default data shipped with this build of pangolin (not recommended)</option> | 214 <option value="download">Download latest available constellations version from web</option> |
88 </param> | 215 </param> |
89 <when value="download"> | 216 <when value="default" /> |
90 <!-- these are currently not supported by the pangolin downloader --> | 217 <when value="cached"> |
91 <!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> --> | 218 <param name="release" label="Cached constellations release" type="select"> |
92 <!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> --> | 219 <options from_data_table="pangolin_constellations"> |
93 </when> | |
94 <when value="builtin"> | |
95 <param name="db_release" label="pangoLEARN release" type="select"> | |
96 <options from_data_table="pangolearn"> | |
97 <column name="value" index="0" /> | 220 <column name="value" index="0" /> |
98 <column name="name" index="1" /> | 221 <column name="description" index="1" /> |
99 <column name="path" index="3" /> | 222 <column name="date" index="3" /> |
100 <filter type="sort_by" column="0" /> | 223 <column name="path" index="4" /> |
101 <filter type="static_value" column="2" value="4.0" /> | 224 <filter type="sort_by" column="3" /> |
102 <validator type="no_options" message="No cached pangolin-data release available" /> | 225 <filter type="regexp" column="2" value="@COMPATIBLE_SCORPIO_DATA_FORMAT@" /> |
226 <validator type="no_options" message="No cached constellations release available" /> | |
103 </options> | 227 </options> |
104 </param> | 228 </param> |
105 </when> | 229 </when> |
106 <when value="default" /> | 230 <when value="download" /> |
107 </conditional> | 231 </conditional> |
108 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" /> | 232 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" /> |
109 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" /> | 233 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" /> |
110 <param argument="--min-length" type="integer" value="0" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." /> | 234 <param argument="--min-length" type="integer" value="0" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." /> |
111 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." /> | 235 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." /> |
130 </data> | 254 </data> |
131 </outputs> | 255 </outputs> |
132 <tests> | 256 <tests> |
133 <test expect_num_outputs="1"> | 257 <test expect_num_outputs="1"> |
134 <param name="input1" value="test1.fasta"/> | 258 <param name="input1" value="test1.fasta"/> |
135 <!-- Test only the default UShER mode for now since the | |
136 pangolearn random forest model uses too much memory | |
137 see https://github.com/cov-lineages/pangolin/issues/395 | |
138 <conditional name="engine"> | 259 <conditional name="engine"> |
139 <conditional name="engine"> | 260 <!-- Test only the default UShER mode for now since the |
261 pangolearn random forest model uses too much memory | |
262 see https://github.com/cov-lineages/pangolin/issues/395 | |
140 <param name="analysis_mode" value="pangolearn" /> | 263 <param name="analysis_mode" value="pangolearn" /> |
141 </conditional> | 264 --> |
142 --> | 265 <conditional name="pangolin_data"> |
143 <conditional name="db"> | 266 <param name="source" value="default" /> |
144 <param name="source" value="default" /> | 267 </conditional> |
145 </conditional> | 268 </conditional> |
146 <output name="output1" ftype="tabular"> | 269 <output name="output1" ftype="tabular"> |
147 <assert_contents> | 270 <assert_contents> |
148 <has_text_matching expression="B\.1\.1\t\d\.\d" /> | 271 <has_text_matching expression="B\.1\.1\t\d\.\d" /> |
149 <has_text text="pass" /> | 272 <has_text text="pass" /> |
151 </assert_contents> | 274 </assert_contents> |
152 </output> | 275 </output> |
153 </test> | 276 </test> |
154 <test expect_num_outputs="1"> | 277 <test expect_num_outputs="1"> |
155 <param name="input1" value="test1.fasta"/> | 278 <param name="input1" value="test1.fasta"/> |
156 <conditional name="db"> | 279 <conditional name="engine"> |
157 <param name="source" value="download" /> | 280 <conditional name="pangolin_data"> |
281 <param name="source" value="download" /> | |
282 </conditional> | |
158 </conditional> | 283 </conditional> |
159 <output name="output1" ftype="tabular"> | 284 <output name="output1" ftype="tabular"> |
160 <assert_contents> | 285 <assert_contents> |
161 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" /> | 286 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" /> |
162 <has_text text="pass" /> | 287 <has_text text="pass" /> |
164 </assert_contents> | 289 </assert_contents> |
165 </output> | 290 </output> |
166 </test> | 291 </test> |
167 <test expect_num_outputs="2"> | 292 <test expect_num_outputs="2"> |
168 <param name="input1" value="test1.fasta" /> | 293 <param name="input1" value="test1.fasta" /> |
169 <!-- Test only the default UShER mode for now since the | |
170 pangolearn random forest model uses too much memory | |
171 see https://github.com/cov-lineages/pangolin/issues/395 | |
172 <conditional name="engine"> | 294 <conditional name="engine"> |
295 <!-- Test only the default UShER mode for now since the | |
296 pangolearn random forest model uses too much memory | |
297 see https://github.com/cov-lineages/pangolin/issues/395 | |
173 <param name="analysis_mode" value="pangolearn" /> | 298 <param name="analysis_mode" value="pangolearn" /> |
174 </conditional> | 299 --> |
175 --> | 300 <conditional name="pangolin_data"> |
176 <conditional name="db"> | 301 <param name="source" value="download" /> |
177 <param name="source" value="download" /> | 302 </conditional> |
178 </conditional> | 303 </conditional> |
179 <param name="alignment" value="--alignment" /> | 304 <param name="alignment" value="--alignment" /> |
180 <output name="output1" ftype="tabular"> | 305 <output name="output1" ftype="tabular"> |
181 <assert_contents> | 306 <assert_contents> |
182 <has_text_matching expression="B\.1\.1\t\d\.\d" /> | 307 <has_text_matching expression="B\.1\.1\t\d\.\d" /> |
192 </output> | 317 </output> |
193 </test> | 318 </test> |
194 <!-- test include-header option --> | 319 <!-- test include-header option --> |
195 <test expect_num_outputs="1"> | 320 <test expect_num_outputs="1"> |
196 <param name="input1" value="multiple_alignment.fasta.gz"/> | 321 <param name="input1" value="multiple_alignment.fasta.gz"/> |
197 <!-- Test only the default UShER mode for now since the | |
198 pangolearn random forest model uses too much memory | |
199 see https://github.com/cov-lineages/pangolin/issues/395 | |
200 <conditional name="engine"> | 322 <conditional name="engine"> |
323 <!-- Test only the default UShER mode for now since the | |
324 pangolearn random forest model uses too much memory | |
325 see https://github.com/cov-lineages/pangolin/issues/395 | |
201 <param name="analysis_mode" value="pangolearn" /> | 326 <param name="analysis_mode" value="pangolearn" /> |
202 </conditional> | 327 --> |
203 --> | 328 <conditional name="pangolin_data"> |
204 <conditional name="db"> | 329 <param name="source" value="default" /> |
205 <param name="source" value="default" /> | 330 </conditional> |
206 </conditional> | 331 </conditional> |
207 <param name="include_header" value="true" /> | 332 <param name="include_header" value="true" /> |
208 <output name="output1" ftype="tabular"> | 333 <output name="output1" ftype="tabular"> |
209 <assert_contents> | 334 <assert_contents> |
210 <has_text text="pangolin_version" /> | 335 <has_text text="pangolin_version" /> |
215 <has_n_lines n="35" /> | 340 <has_n_lines n="35" /> |
216 <has_n_columns n="16" /> | 341 <has_n_columns n="16" /> |
217 </assert_contents> | 342 </assert_contents> |
218 </output> | 343 </output> |
219 </test> | 344 </test> |
220 <!-- Test that use of latest assignment cache requires downloaded other data --> | |
221 <test expect_failure="true"> | |
222 <param name="input1" value="multiple_alignment.fasta.gz"/> | |
223 <conditional name="engine"> | |
224 <param name="use_assignment_cache" value="true" /> | |
225 </conditional> | |
226 <conditional name="db"> | |
227 <param name="source" value="default" /> | |
228 </conditional> | |
229 </test> | |
230 <!-- test with extra expanded_lineage column --> | 345 <!-- test with extra expanded_lineage column --> |
231 <test expect_num_outputs="1"> | 346 <test expect_num_outputs="1"> |
232 <param name="input1" value="multiple_alignment.fasta.gz"/> | 347 <param name="input1" value="multiple_alignment.fasta.gz"/> |
233 <!-- Test only the default UShER mode for now since the | |
234 pangolearn random forest model uses too much memory | |
235 see https://github.com/cov-lineages/pangolin/issues/395 | |
236 <conditional name="engine"> | 348 <conditional name="engine"> |
349 <!-- Test only the default UShER mode for now since the | |
350 pangolearn random forest model uses too much memory | |
351 see https://github.com/cov-lineages/pangolin/issues/395 | |
237 <param name="analysis_mode" value="pangolearn" /> | 352 <param name="analysis_mode" value="pangolearn" /> |
238 </conditional> | 353 --> |
239 --> | 354 <conditional name="pangolin_data"> |
240 <conditional name="db"> | 355 <param name="source" value="default" /> |
241 <param name="source" value="default" /> | 356 </conditional> |
242 </conditional> | 357 </conditional> |
243 <param name="expanded_lineage" value="true" /> | 358 <param name="expanded_lineage" value="true" /> |
244 <param name="include_header" value="true" /> | 359 <param name="include_header" value="true" /> |
245 <output name="output1" ftype="tabular"> | 360 <output name="output1" ftype="tabular"> |
246 <assert_contents> | 361 <assert_contents> |
263 `Pangolin <https://cov-lineages.org/pangolin.html>`_ | 378 `Pangolin <https://cov-lineages.org/pangolin.html>`_ |
264 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a | 379 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a |
265 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO | 380 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO |
266 nomenclature system. | 381 nomenclature system. |
267 | 382 |
268 | 383 **Data sources/versioning and reproducibility** |
269 **Data sources/versioning** | |
270 | 384 |
271 Pangolin uses the | 385 Pangolin uses the |
272 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as | 386 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as |
273 a source of its required model, protobuf, designation hash and alias files, and | 387 a source of its required model, protobuf, designation hash and alias files, and |
274 the `constellations <https://github.com/cov-lineages/constellations>`_ | 388 the `constellations <https://github.com/cov-lineages/constellations>`_ |
275 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based | 389 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based |
276 assignment of lineages of concern. | 390 assignment of lineages of concern. |
277 The tool ships with a copy of this data, but the data gets updated more | 391 |
278 frequently than the tool! In general one should use the most recent model for | 392 The tool ships with copies of these two data packages, and using these shipped |
279 lineage assignment, and the default option for this tool is to download the | 393 versions is *recommended* for reproducibility (even across Galaxy servers) and |
280 latest versions of pangolin-data and constellations before embarking on | 394 speed of job execution. |
281 analysis. | 395 |
282 A pangoLEARN data manager exists so that the Galaxy admin can download specific | 396 If your instance of Galaxy offers cached alternative versions of |
283 versions of the pangolin-data/constellations as required. Finally the pangolin | 397 `pangolin-data` and/or `constellations`, you will be able to use them instead |
284 tool can use its default built-in data packages, but this is | 398 of the shipped versions, which can be useful to reproduce results obtained |
285 **not recommended** as it will almost certainly be out of date. | 399 earlier with previous versions of pangolin. |
400 | |
401 Finally, you have the option to *download the latest version* of each data | |
402 package at job runtime. | |
403 | |
404 .. class:: warningmark | |
405 | |
406 You can use this option as a workaround to get the most up-to-date lineage | |
407 assignments even before the next Galaxy tool update (or before an admin | |
408 installs new cached data versions on your server), but be aware of the | |
409 following limitations: | |
410 | |
411 1. Using latest downloaded data package versions renders results hard to | |
412 reproduce (e.g. rerunning a corresponding job will also cause a fresh | |
413 data download, which may yield different data versions than in the initial | |
414 run). | |
415 | |
416 2. Downloaded latest versions of the data packages may be incompatible | |
417 with the *pangolin* and *scorpio* version run by the tool, which can | |
418 result in failing tool runs, but occasionally also in harder to diagnose | |
419 lineage assignment issues. | |
286 | 420 |
287 .. class:: infomark | 421 .. class:: infomark |
288 | 422 |
289 The exact combination of pangolin, inference engine (UShER/pangoLEARN), | 423 The exact combination of pangolin, inference engine (UShER/pangoLEARN), |
290 scorpio, and data packages used for a particular run of the tool can be | 424 scorpio, and data packages that was used for a particular run of the tool |
291 extracted from the four "version" columns in the output (see below for | 425 can be extracted from the four "version" columns in the output (see below |
292 details). | 426 for details). |
293 | 427 |
294 .. class:: warningmark | 428 In addition, lineage assignment with pangolin can be affected by the exact |
295 | 429 versions of additional underlying software. The packaged versions of all |
296 The "Download latest from web" updates the *pangolin-data* and | 430 relevant dependencies are listed in the *Requirements* section below. This |
297 *constellations* packages but not the software (pangolin and scorpio) using | 431 section is equivalent to running `pangolin --all-versions` on the |
298 these data packages. | 432 command line except that the listed versions of *pangolin-data* and |
299 If the data package format changes upstream, this can cause the tool run to | 433 *constellations* are the ones installed with pangolin and may have been |
300 fail. Cached data packages (or, in the worst case, the built-in data) can | 434 overridden with the versions reported in the corresponding output columns |
301 serve as a fallback until switching to an updated pangolin tool | 435 at tool runtime. |
302 version. | |
303 | |
304 | 436 |
305 **Output** | 437 **Output** |
306 | 438 |
307 The main output of the tool is a tabular file with one line per input sequence | 439 The main output of the tool is a tabular file with one line per input sequence |
308 and with columns providing the | 440 and with columns providing the |
315 The most likely lineage assigned to a given sequence based on the inference | 447 The most likely lineage assigned to a given sequence based on the inference |
316 engine used and the SARS-CoV-2 diversity designated. | 448 engine used and the SARS-CoV-2 diversity designated. |
317 This assignment is sensitive to missing data at key sites. | 449 This assignment is sensitive to missing data at key sites. |
318 | 450 |
319 conflict: | 451 conflict: |
320 In the pangoLEARN model, a given sequence gets assigned to the most likely | |
321 category based on known diversity. | |
322 If a sequence can fit into more than one category, the conflict score will | 452 If a sequence can fit into more than one category, the conflict score will |
323 be greater than 0 and reflect the number of categories the sequence could | 453 be greater than 0 and reflect the number of categories the sequence could |
324 fit into. | 454 fit into. |
325 If the conflict score is 0, this means that within the current decision | 455 If the conflict score is 0, this means that within the current assignment |
326 tree there is only one category that the sequence could be assigned to. | 456 model / lineage tree there is only one category that the sequence could |
457 plausibly be assigned to. | |
327 | 458 |
328 ambiguity_score: | 459 ambiguity_score: |
329 This score is a function of the quantity of missing data in a sequence. | 460 This score is a function of the quantity of missing data in a sequence. |
330 It represents the proportion of relevant sites in a sequnece which were | 461 It represents the proportion of relevant sites in a sequence which were |
331 imputed to the reference values. | 462 imputed to the reference values. |
332 A score of 1 indicates that no sites were imputed, while a score of 0 | 463 A score of 1 indicates that no sites were imputed, while a score of 0 |
333 indicates that more sites were imputed than were not imputed. | 464 indicates that more sites were imputed than were not imputed. |
334 This score only includes sites which are used by the decision tree to | 465 This score only includes sites which are used by the assignment engine to |
335 classify a sequence. | 466 classify a sequence. |
336 | 467 |
337 scorpio_call: | 468 scorpio_call: |
338 If a query is assigned a constellation by scorpio this call is output in | 469 If a query is assigned a constellation by scorpio this call is output in |
339 this column. | 470 this column. |
385 The version of constellations that scorpio has used to curate the lineage | 516 The version of constellations that scorpio has used to curate the lineage |
386 assignment. | 517 assignment. |
387 | 518 |
388 is_designated: | 519 is_designated: |
389 A boolean (True/False) column indicating whether that particular sequence | 520 A boolean (True/False) column indicating whether that particular sequence |
390 has been officially designated a lineage. | 521 has been officially designated a lineage (via pango-designation). |
391 | 522 |
392 qc_status: | 523 qc_status: |
393 Indicates whether the sequence passed the QC thresholds for minimum length | 524 Indicates whether the sequence passed the QC thresholds for minimum length |
394 and maximum N content. | 525 and maximum N content. |
395 | 526 |
396 qc_notes: | 527 qc_notes: |
397 Notes specific to the QC checks run on the sequences. | 528 Notes specific to the QC checks run on the sequences. |
398 | 529 |
399 note: | 530 note: |
400 If any conflicts from the decision tree, this field will output the | 531 If any conflicts arose during assignment, this field will output the |
401 alternative assignments. If the sequence failed QC this field will describe | 532 alternative assignments. If the sequence failed QC this field will describe |
402 why. | 533 why. |
403 If the sequence met the SNP thresholds for scorpio to call a constellation, | 534 If the sequence met the SNP thresholds for scorpio to call a constellation, |
404 it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative, | 535 it’ll describe the exact SNP counts of Alt, Ref and Amb (alternative, |
405 reference and ambiguous) alleles for that call. | 536 reference and ambiguous) alleles for that call. |
406 ]]></help> | 537 ]]></help> |
407 <citations> | 538 <citations> |
408 <citation type="doi">10.1093/ve/veab064</citation> | 539 <citation type="doi">10.1093/ve/veab064</citation> |
409 </citations> | 540 </citations> |