comparison pangolin.xml @ 19:abf6dbe8c9d7 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/pangolin commit 98c0d716cbd1237ae735ce83e0153ee246abd5d8"
author iuc
date Thu, 21 Apr 2022 11:40:56 +0000
parents 2fa9d4f1b48f
children 14ae456b8cc5
comparison
equal deleted inserted replaced
18:2fa9d4f1b48f 19:abf6dbe8c9d7
1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy0" profile="20.01"> 1 <tool id="pangolin" name="Pangolin" version="@TOOL_VERSION@+galaxy0" profile="20.01">
2 <description>Phylogenetic Assignment of Outbreak Lineages</description> 2 <description>Phylogenetic Assignment of Outbreak Lineages</description>
3 <macros> 3 <macros>
4 <token name="@TOOL_VERSION@">3.1.20</token> 4 <token name="@TOOL_VERSION@">4.0.5</token>
5 </macros> 5 </macros>
6 <requirements> 6 <requirements>
7 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement> 7 <requirement type="package" version="@TOOL_VERSION@">pangolin</requirement>
8 <requirement type="package" version="0.3.16">scorpio</requirement> 8 <requirement type="package" version="0.3.16">scorpio</requirement>
9 <requirement type="package" version="0.23.0">csvtk</requirement> 9 <requirement type="package" version="0.23.0">csvtk</requirement>
10 </requirements> 10 </requirements>
11 <version_command><![CDATA[pangolin --version]]></version_command>
11 <command detect_errors="exit_code"><![CDATA[ 12 <command detect_errors="exit_code"><![CDATA[
12 #if str($db.source) == "download" 13 #if str($db.source) == "download"
14 ## Pangolin version 4 tries to update from an existing directory
15 mkdir datadir &&
13 pangolin --update-data --datadir datadir && 16 pangolin --update-data --datadir datadir &&
14 #else if str($db.source) == "builtin" 17 #else if str($db.source) == "builtin"
15 ln -s $db.db_release.fields.path datadir && 18 ln -s $db.db_release.fields.path datadir &&
16 #end if 19 #end if
17 pangolin 20 pangolin
18 --threads \${GALAXY_SLOTS:-1} 21 --threads \${GALAXY_SLOTS:-1}
22 --tempdir "\${TMPDIR:-.}"
19 #if str($db.source) == "download" or str($db.source) == "builtin" 23 #if str($db.source) == "download" or str($db.source) == "builtin"
20 --datadir 'datadir' 24 --datadir datadir
21 #end if 25 #end if
22 $usher 26 --analysis-mode $engine.analysis_mode
23 $alignment 27 #if str($engine.analysis_mode) == 'usher':
28 $engine.use_assignment_cache
29 #end if
30 #if $alignment:
31 $alignment --alignment-file '$align1'
32 #end if
24 --outfile report.csv 33 --outfile report.csv
25 --max-ambig $max_ambig 34 --max-ambig $max_ambig
26 --min-length $min_length 35 --min-length $min_length
36 $expanded_lineage
27 '$input1' 37 '$input1'
28 && csvtk csv2tab report.csv 38 && csvtk csv2tab report.csv
29 #if not $include_header: 39 #if not $include_header:
30 | tail -n+2 40 | tail -n+2
31 #end if 41 #end if
32 > '$output1' 42 > '$output1'
33 #if $alignment
34 && mv sequences.aln.fasta '$align1'
35 #end if
36 ]]></command> 43 ]]></command>
37 <inputs> 44 <inputs>
38 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" /> 45 <param type="data" name="input1" format="fasta" label="Input FASTA File(s)" />
39 <param argument="--usher" type="boolean" label="Use UShER model" truevalue="--usher" falsevalue="" help="Use UShER model instead of default pangoLEARN model" /> 46 <conditional name="engine">
40 <param argument="--alignment" type="boolean" label="Generate output alignment" 47 <param argument="--analysis-mode" type="select" label="Analysis mode"
41 truevalue="--alignment" falsevalue="" /> 48 help="The analysis engine to use for lineage assignment. UShER is considered more accurate, but pangoLEARN is faster">
42 <param argument="--max-ambig" type="float" label="Maximum proportion of Ns allowed" 49 <option value="usher">UShER</option>
43 value="0.5" min="0" max="1" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" /> 50 <option value="pangolearn">pangoLEARN</option>
44 <param argument="--min-length" type="integer" label="Minimum query length allowed" 51 </param>
45 value="10000" min="0" help="Minimum query length allowed for pangolin to attempt assignment"/> 52 <when value="usher">
46 <param name="include_header" type="boolean" label="Include header line in output file" 53 <param argument="--use-assignment-cache" type="boolean" truevalue="--add-assignment-cache --use-assignment-cache" falsevalue="" label="Use latest UShER assignment cache"
47 truevalue="true" falsevalue="false" /> 54 help="Get the latest UShER assignment cache from the pangolin-assignment online repository and use it to speed up UShER lineage assignment. Note: Downloading the cached assignments will only pay off for large numbers of input samples. Also note that using the latest assignment cache in combination with the built-in or a cached pangolin-data source (see option below), will make your otherwise reproducible results dependent on an external data source." />
55 </when>
56 <when value="pangolearn" />
57 </conditional>
48 <conditional name="db"> 58 <conditional name="db">
49 <param type="select" name="source" label="pangoLEARN source" help="Where to find the pangoLEARN database. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below."> 59 <param type="select" name="source" label="pangolin-data source" help="Where to find the pangolin-data to use for the tool run. While 'Download latest from web' is recommended, if errors occur see the warning in the main help text below.">
50 <option value="download">Download latest from web</option> 60 <option value="download">Download latest from web</option>
51 <option value="builtin">Use database from Galaxy server</option> 61 <option value="builtin">Use cached data from Galaxy server</option>
52 <option value="default">Use default database built in to pangolin (not recommended)</option> 62 <option value="default">Use default data shipped with this build of pangolin (not recommended)</option>
53 </param> 63 </param>
54 <when value="download"> 64 <when value="download">
55 <!-- these are currently not supported by the pangolin downloader --> 65 <!-- these are currently not supported by the pangolin downloader -->
56 <!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> --> 66 <!-- <param name="max_retries" label="Max download retries" help="How many times to retry downloading the pangoLEARN database" type="integer" value="5" /> -->
57 <!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> --> 67 <!-- <param name="timeout" label="Download timeout" help="How many seconds to wait when downloading the pangoLEARN database" type="float" value="60.0" /> -->
61 <options from_data_table="pangolearn"> 71 <options from_data_table="pangolearn">
62 <column name="value" index="0" /> 72 <column name="value" index="0" />
63 <column name="name" index="1" /> 73 <column name="name" index="1" />
64 <column name="path" index="3" /> 74 <column name="path" index="3" />
65 <filter type="sort_by" column="0" /> 75 <filter type="sort_by" column="0" />
66 <filter type="static_value" column="2" value="3.0" /> 76 <filter type="static_value" column="2" value="4.0" />
77 <validator type="no_options" message="No cached pangolin-data release available" />
67 </options> 78 </options>
68 </param> 79 </param>
69 </when> 80 </when>
70 <when value="default"> 81 <when value="default" />
71 </when> 82 </conditional>
72 </conditional> 83 <param argument="--alignment" type="boolean" truevalue="--alignment" falsevalue="" label="Output multiple sequence alignment of input sequences" />
84 <param argument="--max-ambig" type="float" value="0.3" min="0" max="1" label="Maximum proportion of Ns allowed" help="Maximum proportion of Ns allowed for pangolin to attempt assignment" />
85 <param argument="--min-length" type="integer" value="25000" min="0" max="29903" label="Minimum query length allowed" help="Minimum query length allowed for pangolin to attempt assignment. Please note that in the current implementation this parameter is used to calculate an alternate value for the 'Maximum proportion of Ns allowed' parameter as 1-(minlen/reflen). The smaller of the two will be used." />
86 <param argument="--expanded-lineage" type="boolean" truevalue="--expanded-lineage" falsevalue="" label="Add expanded lineage column to output" help="Optional expanded lineage information as defined in the alias.json file in pangolin-data can be appended as an additional column to the output." />
87 <param name="include_header" type="boolean" truevalue="true" falsevalue="false" label="Include header line in output file" />
73 </inputs> 88 </inputs>
74 <outputs> 89 <outputs>
75 <data name="output1" format="tabular" label="pangolin on ${on_string}"> 90 <data name="output1" format="tabular" label="pangolin on ${on_string}">
76 <actions> 91 <actions>
77 <action name="column_names" type="metadata" default="taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,version,pangolin_version,pangoLEARN_version,pango_version,status,note" /> 92 <conditional name="expanded_lineage">
93 <when value="">
94 <!-- default columns -->
95 <action name="column_names" type="metadata" default="taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note" />
96 </when>
97 <when value="--expanded-lineage">
98 <action name="column_names" type="metadata" default="taxon,lineage,conflict,ambiguity_score,scorpio_call,scorpio_support,scorpio_conflict,scorpio_notes,version,pangolin_version,scorpio_version,constellation_version,is_designated,qc_status,qc_notes,note,expanded_lineage" />
99 </when>
100 </conditional>
78 </actions> 101 </actions>
79 </data> 102 </data>
80 <data name="align1" format="fasta" label="pangolin alignment on ${on_string}"> 103 <data name="align1" format="fasta" label="pangolin alignment on ${on_string}">
81 <filter>alignment</filter> 104 <filter>alignment</filter>
82 </data> 105 </data>
83 </outputs> 106 </outputs>
84 <tests> 107 <tests>
85 <test expect_num_outputs="1"> 108 <test expect_num_outputs="1">
86 <param name="input1" value="test1.fasta"/> 109 <param name="input1" value="test1.fasta"/>
87 <conditional name="db"> 110 <!-- Test only the default UShER mode for now since the
88 <param name="source" value="download" /> 111 pangolearn random forest model uses too much memory
112 see https://github.com/cov-lineages/pangolin/issues/395
113 <conditional name="engine">
114 <conditional name="engine">
115 <param name="analysis_mode" value="pangolearn" />
116 </conditional>
117 -->
118 <conditional name="db">
119 <param name="source" value="default" />
89 </conditional> 120 </conditional>
90 <output name="output1" ftype="tabular"> 121 <output name="output1" ftype="tabular">
91 <assert_contents> 122 <assert_contents>
92 <has_text_matching expression="B\.1\.1\t\d\.\d" /> 123 <has_text_matching expression="B\.1\.1\t\d\.\d" />
93 <has_text text="passed_qc" /> 124 <has_text text="pass" />
94 <has_n_lines n="1" /> 125 <has_n_lines n="1" />
95 </assert_contents> 126 </assert_contents>
96 </output> 127 </output>
97 </test> 128 </test>
98 <!-- test UShER mode -->
99 <test expect_num_outputs="1"> 129 <test expect_num_outputs="1">
100 <param name="input1" value="test1.fasta"/> 130 <param name="input1" value="test1.fasta"/>
101 <param name="usher" value="true" />
102 <conditional name="db"> 131 <conditional name="db">
103 <param name="source" value="download" /> 132 <param name="source" value="download" />
104 </conditional> 133 </conditional>
105 <output name="output1" ftype="tabular"> 134 <output name="output1" ftype="tabular">
106 <assert_contents> 135 <assert_contents>
107 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" /> 136 <has_text_matching expression="B\.1\.1.*\t\d\.\d\t*PUSHER" />
108 <has_text text="passed_qc" /> 137 <has_text text="pass" />
109 <has_n_lines n="1" /> 138 <has_n_lines n="1" />
110 </assert_contents> 139 </assert_contents>
111 </output> 140 </output>
112 </test> 141 </test>
113 <test expect_num_outputs="2"> 142 <test expect_num_outputs="2">
143 <param name="input1" value="test1.fasta" />
144 <!-- Test only the default UShER mode for now since the
145 pangolearn random forest model uses too much memory
146 see https://github.com/cov-lineages/pangolin/issues/395
147 <conditional name="engine">
148 <param name="analysis_mode" value="pangolearn" />
149 </conditional>
150 -->
151 <conditional name="db">
152 <param name="source" value="download" />
153 </conditional>
114 <param name="alignment" value="--alignment" /> 154 <param name="alignment" value="--alignment" />
115 <param name="input1" value="test1.fasta" />
116 <conditional name="db">
117 <param name="source" value="download" />
118 </conditional>
119 <output name="output1" ftype="tabular"> 155 <output name="output1" ftype="tabular">
120 <assert_contents> 156 <assert_contents>
121 <has_text_matching expression="B\.1\.1\t\d\.\d" /> 157 <has_text_matching expression="B\.1\.1\t\d\.\d" />
122 <has_text text="passed_qc" /> 158 <has_text text="pass" />
123 <has_n_lines n="1" /> 159 <has_n_lines n="1" />
124 </assert_contents> 160 </assert_contents>
125 </output> 161 </output>
126 <output name="align1" file="aln1.fasta" ftype="fasta"> 162 <output name="align1" file="aln1.fasta" ftype="fasta">
127 <assert_contents> 163 <assert_contents>
131 </output> 167 </output>
132 </test> 168 </test>
133 <!-- test include-header option --> 169 <!-- test include-header option -->
134 <test expect_num_outputs="1"> 170 <test expect_num_outputs="1">
135 <param name="input1" value="multiple_alignment.fasta.gz"/> 171 <param name="input1" value="multiple_alignment.fasta.gz"/>
172 <!-- Test only the default UShER mode for now since the
173 pangolearn random forest model uses too much memory
174 see https://github.com/cov-lineages/pangolin/issues/395
175 <conditional name="engine">
176 <param name="analysis_mode" value="pangolearn" />
177 </conditional>
178 -->
179 <conditional name="db">
180 <param name="source" value="default" />
181 </conditional>
136 <param name="include_header" value="true" /> 182 <param name="include_header" value="true" />
137 <conditional name="db"> 183 <output name="output1" ftype="tabular">
138 <param name="source" value="download" /> 184 <assert_contents>
139 </conditional> 185 <has_text text="pangolin_version" />
140 <output name="output1" ftype="tabular">
141 <assert_contents>
142 <has_text text="pangoLEARN_version" />
143 <has_text text="lineage" /> 186 <has_text text="lineage" />
144 <has_text text="Serbia" /> 187 <has_text text="Serbia" />
145 <has_text text="Poland" /> 188 <has_text text="Poland" />
146 <has_text text="USA" /> 189 <has_text text="USA" />
147 <has_n_lines n="35" /> 190 <has_n_lines n="35" />
191 <has_n_columns n="16" />
192 </assert_contents>
193 </output>
194 </test>
195 <!-- test with extra expanded_lineage column -->
196 <test expect_num_outputs="1">
197 <param name="input1" value="multiple_alignment.fasta.gz"/>
198 <!-- Test only the default UShER mode for now since the
199 pangolearn random forest model uses too much memory
200 see https://github.com/cov-lineages/pangolin/issues/395
201 <conditional name="engine">
202 <param name="analysis_mode" value="pangolearn" />
203 </conditional>
204 -->
205 <conditional name="db">
206 <param name="source" value="default" />
207 </conditional>
208 <param name="expanded_lineage" value="true" />
209 <param name="include_header" value="true" />
210 <output name="output1" ftype="tabular">
211 <assert_contents>
212 <has_text text="pangolin_version" />
213 <has_text text="lineage" />
214 <has_text text="expanded_lineage" />
215 <has_text text="Serbia" />
216 <has_text text="Poland" />
217 <has_text text="USA" />
218 <has_n_lines n="35" />
219 <has_n_columns n="17" />
148 </assert_contents> 220 </assert_contents>
149 </output> 221 </output>
150 </test> 222 </test>
151 </tests> 223 </tests>
152 <help><![CDATA[ 224 <help><![CDATA[
153 225
226 **What it does**
227
228 `Pangolin <https://cov-lineages.org/pangolin.html>`_
229 (Phylogenetic Assignment of Named Global Outbreak LINeages) is used to assign a
230 SARS-CoV-2 genome sequence the most likely lineage based on the PANGO
231 nomenclature system.
232
233
234 **Data sources/versioning**
235
236 Pangolin uses the
237 `pangolin-data <https://github.com/cov-lineages/pangolin-data>`_ repository as
238 a source of its required model, protobuf, designation hash and alias files, and
239 the `constellations <https://github.com/cov-lineages/constellations>`_
240 repository for `scorpio <https://github.com/cov-lineages/scorpio>`_ -based
241 assignment of lineages of concern.
242 The tool ships with a copy of this data, but the data gets updated more
243 frequently than the tool! In general one should use the most recent model for
244 lineage assignment, and the default option for this tool is to download the
245 latest versions of pangolin-data and constellations before embarking on
246 analysis.
247 A pangoLEARN data manager exists so that the Galaxy admin can download specific
248 versions of the pangolin-data/constellations as required. Finally the pangolin
249 tool can use its default built-in data packages, but this is
250 **not recommended** as it will almost certainly be out of date.
251
154 .. class:: infomark 252 .. class:: infomark
155 253
156 `Pangolin <https://cov-lineages.org/pangolin.html>`_ (Phylogenetic Assignment of Named Global Outbreak LINeages) 254 The exact combination of pangolin, inference engine (UShER/pangoLEARN),
157 is used to assign a SARS-CoV-2 genome sequence the most likely lineage based on the PANGO nomenclature system. 255 scorpio, and data packages used for a particular run of the tool can be
158 256 extracted from the four "version" columns in the output (see below for
159 Pangolin uses the `pangoLEARN <https://github.com/cov-lineages/pangoLEARN>`_ stored model for lineage assignment. This 257 details).
160 model is updated more frequently than the pangolin tool is. In general one should use the most recent model for lineage
161 assignment, and the default option for this tool is to download the latest version of the model before the pangolin
162 tool runs. A pangoLEARN data manager exists so that the Galaxy admin can download specific versions of the pangoLEARN
163 model as required. Finally the pangolin tool can use its default built-in model, but this is **not recommended** as the
164 default model rapidly becomes out of date.
165 258
166 .. class:: warningmark 259 .. class:: warningmark
167 260
168 The "Download latest from web" updates the pangolin database but not the pangolin (and scorpio) software. If 261 The "Download latest from web" updates the *pangolin-data* and
169 the database format changes this can cause the pangolin job and the tool to fail. The solution to this kind of 262 *constellations* packages but not the software (pangolin and scorpio) using
170 failure is to update the pangolin tool installed in the Galaxy server. 263 these data packages.
264 If the data package format changes upstream, this can cause the tool run to
265 fail. Cached data packages (or, in the worst case, the built-in data) can
266 serve as a fallback until switching to an updated pangolin tool
267 version.
268
269
270 **Output**
271
272 The main output of the tool is a tabular file with one line per input sequence
273 and with columns providing the
274 `following information <https://cov-lineages.org/resources/pangolin/output.html>`_:
275
276 taxon:
277 The name of the input sequence
278
279 lineage:
280 The most likely lineage assigned to a given sequence based on the inference
281 engine used and the SARS-CoV-2 diversity designated.
282 This assignment is sensitive to missing data at key sites.
283
284 conflict:
285 In the pangoLEARN model, a given sequence gets assigned to the most likely
286 category based on known diversity.
287 If a sequence can fit into more than one category, the conflict score will
288 be greater than 0 and reflect the number of categories the sequence could
289 fit into.
290 If the conflict score is 0, this means that within the current decision
291 tree there is only one category that the sequence could be assigned to.
292
293 ambiguity_score:
294 This score is a function of the quantity of missing data in a sequence.
295 It represents the proportion of relevant sites in a sequence which were
296 imputed to the reference values.
297 A score of 1 indicates that no sites were imputed, while a score of 0
298 indicates that more sites were imputed than were not imputed.
299 This score only includes sites which are used by the decision tree to
300 classify a sequence.
301
302 scorpio_call:
303 If a query is assigned a constellation by scorpio this call is output in
304 this column.
305 The full set of constellations searched by default can be found at the
306 constellations repository.
307
308 scorpio_support:
309 The support score is the proportion of defining variants which have the
310 alternative allele in the sequence.
311
312 scorpio_conflict:
313 The conflict score is the proportion of defining variants which have the
314 reference allele in the sequence. Ambiguous/other non-ref/alt bases at each
315 of the variant positions contribute only to the denominators of these
316 scores.
317
318 scorpio_notes:
319 A notes column specific to the scorpio output.
320
321 version:
322 A version number that represents both the inference method and the
323 pangolin-data version number, which as of pangolin 4.0 corresponds to the
324 pango-designation version used to prepare the inference files. For example:
325
326 PANGO-1.2 indicates an identical sequence has been previously designated
327 this lineage, and has therefore gone through manual curation.
328 The number 1.2 indicates the version of pango-designation that this
329 assignment is based on. These hashes and pango-designation version are
330 bundled with the pangoLEARN and UShER models.
331
332 PLEARN-1.2 indicates that this sequence is different from any previously
333 designated and that the pangoLEARN model was used as an inference engine to
334 predict the most likely lineage based on the given version of
335 pango-designation upon which the pangoLEARN model was trained.
336
337 PUSHER-1.2 indicates that this sequence is different from any previously
338 designated and that UShER was used as an inference engine with fast tree
339 placement and parsimony-based lineage assignment, based on a guide tree
340 (protobuf) file built from the data in a given pango-designation release
341 version.
342
343 pangolin_version:
344 The version of pangolin software running.
345
346 scorpio_version:
347 The version of the scorpio software installed.
348
349 constellation_version:
350 The version of constellations that scorpio has used to curate the lineage
351 assignment.
352
353 is_designated:
354 A boolean (True/False) column indicating whether that particular sequence
355 has been officially designated a lineage.
356
357 qc_status:
358 Indicates whether the sequence passed the QC thresholds for minimum length
359 and maximum N content.
360
361 qc_notes:
362 Notes specific to the QC checks run on the sequences.
363
364 note:
365 If there are any conflicts from the decision tree, this field will output the
366 alternative assignments. If the sequence failed QC this field will describe
367 why.
368 If the sequence met the SNP thresholds for scorpio to call a constellation,
369 it’ll describe the exact SNP counts of Alt, Ref and Amb (Alternative,
370 reference and ambiguous) alleles for that call.
171 ]]></help> 371 ]]></help>
172 <citations> 372 <citations>
173 <citation type="doi">10.1093/ve/veab064</citation> 373 <citation type="doi">10.1093/ve/veab064</citation>
174 </citations> 374 </citations>
175 </tool> 375 </tool>