comparison bakta.xml @ 0:54ca20519f70 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit 6b06711cfba45855d5a992ed1c73c472eaef644f-dirty
author thanhlv
date Fri, 28 Apr 2023 17:18:48 +0000
parents
children 30c7c559d5f6
comparison
equal deleted inserted replaced
-1:000000000000 0:54ca20519f70
1 <tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
2 <description>
3 Genome annotation via alignment-free sequence identification
4 </description>
5 <macros> <!-- Shared definitions are imported from an external macro file. -->
6 <import>macro.xml</import> <!-- NOTE(review): macro.xml is not visible here; it presumably defines the macros expanded below and the @TOOL_VERSION@, @VERSION_SUFFIX@ and @BAKTA_VERSION@ tokens used in this file; confirm. -->
7 </macros>
8 <expand macro='edam'/> <!-- presumably EDAM annotations; verify in macro.xml -->
9 <expand macro='xrefs'/> <!-- presumably external cross references; verify in macro.xml -->
10 <expand macro="requirements"/> <!-- presumably the package requirements of the tool; verify in macro.xml -->
11 <expand macro="version_command"/> <!-- presumably the command that reports the bakta version; verify in macro.xml -->
12
13 <command detect_errors="aggressive"><![CDATA[
14 set -o pipefail && mkdir -p ./database_path/amrfinderplus-db &&
15 ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
16 ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
17 bakta
18 #*======================================
19 CPU option: use the slots Galaxy allocated to the job (GALAXY_SLOTS), or 1 if unset
20 ======================================*#
21 --threads \${GALAXY_SLOTS:-1}
22 #*======================================
23 Bakta database: ./database_path links the selected bakta DB files and the selected AMRFinderPlus DB (as amrfinderplus-db/latest); an explicit minimum contig length wins, otherwise 200 in compliant mode and 1 elsewhere
24 ======================================*#
25 --db ./database_path
26 #if $input_option.min_contig_length
27 --min-contig-length $input_option.min_contig_length
28 #else if $annotation.compliant
29 --min-contig-length 200
30 #else
31 --min-contig-length 1
32 #end if
33 --prefix bakta_output
34 #*======================================
35 Organism options: each one is passed only when the user filled it in
36 genus/species/strain/plasmid
37 ======================================*#
38 #if $organism.genus
39 --genus '$organism.genus'
40 #end if
41 #if $organism.species
42 --species '$organism.species'
43 #end if
44 #if $organism.strain
45 --strain '$organism.strain'
46 #end if
47 #if $organism.plasmid
48 --plasmid '$organism.plasmid'
49 #end if
50 #*======================================
51 Annotation options: boolean parameters expand to their flag or to nothing
52 gram type (always passed as '?'), prodigal/protein file
53 ======================================*#
54 $annotation.complete
55 #if $annotation.prodigal
56 --prodigal-tf '$annotation.prodigal'
57 #end if
58 #if $annotation.translation_table
59 --translation-table '$annotation.translation_table'
60 #end if
61 --gram '?'
62 $annotation.keep_contig_headers
63 #if $annotation.replicons
64 --replicons '$annotation.replicons'
65 #end if
66 $annotation.compliant
67 #if $annotation.proteins
68 --proteins '$annotation.proteins'
69 #end if
70 #*======================================
71 Workflow OPTIONS: the selected option values are the skip flags themselves, joined with spaces
72 skip some step of the bakta analysis
73 ======================================*#
74
75 #echo " ".join($workflow.skip_analysis)
76
77 #*======================================
78 Genome file
79 ======================================*#
80 '$input_option.input_file'
81 #*======================================
82 LOG file: stdout is duplicated into the log output with tee; pipefail (set on the first command line) makes the pipeline report a bakta failure instead of the exit status of tee
83 ======================================*#
84 | tee '$logfile'
85 ]]></command>
86 <inputs>
87 <!-- DB and file INPUT: the two reference databases come from data tables, the genome from the history -->
88 <section name="input_option" title="Input/Output options" expanded="true">
89 <param name="bakta_db_select" type="select" label="The bakta database">
90 <options from_data_table="bakta_database">
91 <filter type="static_value" value="@BAKTA_VERSION@" column="bakta_version"/> <!-- keep only rows whose bakta_version column equals the @BAKTA_VERSION@ token -->
92 <validator message="No bakta database is available" type="no_options"/>
93 </options>
94 </param>
95 <param name="amrfinder_db_select" type="select" label="The amrfinderplus database">
96 <options from_data_table="amrfinderplus_database">
97 <validator message="No amrfinderplus database is available" type="no_options"/>
98 </options>
99 </param>
100
101 <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
102 <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/> <!-- when left empty the command template passes 200 in compliant mode and 1 otherwise -->
103 </section>
104 <!-- Organism INFORMATION OPTIONS: optional free-text labels, each checked by a regex validator and passed single-quoted to bakta only when filled in -->
105 <section name="organism" title="Optional organism options" expanded="false">
106 <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia">
107 <validator type="regex">^[a-zA-Z]+$</validator>
108 </param>
109 <param argument="--species" type="text" optional="true" label="Specify species name" help="ex. 'coli O157:H7'">
110 <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator>
111 </param>
112 <param argument="--strain" type="text" optional="true" label="Specify strain name" help="ex. Sakai">
113 <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator> <!-- same character set as species and plasmid, so strain names containing digits or hyphens are accepted -->
114 </param>
115 <param argument="--plasmid" type="text" optional="true" label="Specify plasmid name" help="ex. pOSAK1">
116 <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator>
117 </param>
118 </section>
119 <!-- ANNOTATION: boolean parameters emit their flag through truevalue; the optional files and the translation table are added to the command only when set -->
120 <section name="annotation" title="Optional annotation">
121 <param argument="--complete" type="boolean" truevalue="--complete" falsevalue="" label="Complete replicons" help="All sequences are complete replicons (chromosome/plasmid[s])"/>
122 <param name="prodigal" argument="--prodigal-tf" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/> <!-- name is kept as prodigal for the command template and tests; argument now matches the flag the command actually emits -->
123 <param name="translation_table" type="select" optional="true" label="Translation table" help="Default is the bacterial table 11">
124 <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
125 <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
126 </param>
127 <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/>
128 <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/>
129 <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDBJ compliance"/>
130 <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
131 </section>
132 <!-- PARAMETER FOR WORKFLOW ANALYSIS: every option value is the literal bakta flag; the command template joins the selected values with spaces -->
133 <section name="workflow" title="Workflow option to skip steps">
134 <param name="skip_analysis" type="select" display="checkboxes" multiple="true" label="Select steps to skip">
135 <option value="--skip-trna"> Skip tRNA detection and annotation </option>
136 <option value="--skip-tmrna"> Skip tmRNA detection and annotation </option>
137 <option value="--skip-rrna"> Skip rRNA detection and annotation </option>
138 <option value="--skip-ncrna"> Skip ncRNA detection and annotation </option>
139 <option value="--skip-ncrna-region"> Skip ncRNA region detection and annotation </option>
140 <option value="--skip-crispr"> Skip CRISPR array detection and annotation </option>
141 <option value="--skip-cds"> Skip CDS detection and annotation </option>
142 <option value="--skip-pseudo"> Skip pseudogene detection and annotation </option>
143 <option value="--skip-sorf"> Skip sORF detection and annotation </option>
144 <option value="--skip-gap"> Skip gap detection and annotation </option>
145 <option value="--skip-ori"> Skip oriC/oriT detection and annotation </option>
146 </param>
147 </section>
148 <section name="output_files" title="Selection of the output files"> <!-- the option values below are the keys tested by the filter of each output dataset; tsv, gff3, ffn and the plot are selected by default -->
149 <param name="output_selection" type="select" display="checkboxes" multiple="true" label="Output files selection">
150 <option value="file_tsv" selected="true"> Annotation file in TSV </option>
151 <option value="file_gff3" selected="true"> Annotation and sequence in GFF3 </option>
152 <option value="file_gbff" selected="false"> Annotations and sequences in GenBank format </option>
153 <option value="file_embl" selected="false"> Annotations and sequences in EMBL format </option>
154 <option value="file_fna" selected="false"> Replicon/contig DNA sequences as FASTA </option>
155 <option value="file_ffn" selected="true"> Feature nucleotide sequences as FASTA </option>
156 <option value="file_faa" selected="false"> CDS/sORF amino acid sequences as FASTA </option>
157 <option value="hypo_tsv" selected="false"> Hypothetical protein CDS in TSV</option>
158 <option value="hypo_fa" selected="false"> Hypothetical protein CDS amino sequences as FASTA</option>
159 <option value="sum_txt" selected="false"> Summary as TXT</option>
160 <option value="file_json" selected="false"> Information on each annotated feature as JSON </option>
161 <option value="file_plot" selected="true"> Plot of the annotation result as SVG </option>
162 <option value="log_txt" selected="false"> Log file as TXT </option>
163 </param>
164 </section>
165
166 </inputs>
167 <outputs> <!-- one dataset per bakta result file (prefix bakta_output); each is created only when its key is ticked in output_files/output_selection -->
168 <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary">
169 <filter> output_files['output_selection'] and "file_tsv" in output_files['output_selection'] </filter>
170 </data>
171 <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences">
172 <filter> output_files['output_selection'] and "file_gff3" in output_files['output_selection'] </filter>
173 </data>
174 <data name="annotation_gbff" format="genbank" from_work_dir="bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff"> <!-- GenBank flat file, so declared as genbank rather than tabular -->
175 <filter> output_files['output_selection'] and "file_gbff" in output_files['output_selection'] </filter>
176 </data>
177 <data name="annotation_embl" format="embl" from_work_dir="bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl"> <!-- EMBL flat file, so declared as embl rather than tabular -->
178 <filter> output_files['output_selection'] and "file_embl" in output_files['output_selection'] </filter>
179 </data>
180 <data name="annotation_fna" format="fasta" from_work_dir="bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences">
181 <filter> output_files['output_selection'] and "file_fna" in output_files['output_selection'] </filter>
182 </data>
183 <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences">
184 <filter> output_files['output_selection'] and "file_ffn" in output_files['output_selection'] </filter>
185 </data>
186 <data name="annotation_faa" format="fasta" from_work_dir="bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences">
187 <filter> output_files['output_selection'] and "file_faa" in output_files['output_selection'] </filter>
188 </data>
189 <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary">
190 <filter> output_files['output_selection'] and "hypo_tsv" in output_files['output_selection'] </filter>
191 </data>
192 <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences">
193 <filter> output_files['output_selection'] and "hypo_fa" in output_files['output_selection'] </filter>
194 </data>
195 <data name="summary_txt" format="txt" from_work_dir="bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary">
196 <filter> output_files['output_selection'] and "sum_txt" in output_files['output_selection'] </filter>
197 </data>
198 <data name="annotation_json" format="json" from_work_dir="bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable">
199 <filter> output_files['output_selection'] and "file_json" in output_files['output_selection'] </filter>
200 </data>
201 <data name="annotation_plot" format="svg" from_work_dir="bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
202 <filter> output_files['output_selection'] and "file_plot" in output_files['output_selection'] </filter>
203 </data>
204 <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"> <!-- no from_work_dir: the command writes this dataset directly through tee -->
205 <filter> output_files['output_selection'] and "log_txt" in output_files['output_selection'] </filter>
206 </data>
207 </outputs>
208 <tests>
209 <test expect_num_outputs="13"> <!-- TEST_1: database + input with min_contig_length 250; ticks all 13 entries of output_selection and checks every resulting dataset -->
210 <section name="input_option" >
211 <param name="bakta_db_select" value="V0.1_2022-08-29"/>
212 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
213 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
214 <param name="min_contig_length" value="250"/>
215 </section>
216 <section name="output_files">
217 <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
218 </section>
219 <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/>
220 <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/>
221 <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/>
222 <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/>
223 <output name="annotation_fna" value="TEST_1/TEST_1.fna"/>
224 <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/>
225 <output name="annotation_faa" value="TEST_1/TEST_1.faa"/>
226 <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/>
227 <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/>
228 <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/>
229 <output name="annotation_plot">
230 <assert_contents>
231 <has_size value="418991" delta="1000"/>
232 </assert_contents>
233 </output>
234 <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/>
235 <output name="logfile" value="TEST_1/TEST_1.log" lines_diff="6"/>
236 </test>
237 <test expect_num_outputs="4"> <!-- TEST_2: same genome file as TEST_1, adds organism info, keeps contig headers and skips 2 steps (tRNA, tmRNA); default output selection, hence 4 outputs -->
238 <section name="input_option" >
239 <param name="bakta_db_select" value="V0.1_2022-08-29"/>
240 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
241 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
242 <param name="min_contig_length" value="250"/>
243 </section>
244 <section name="organism">
245 <param name="genus" value="Escherichia"/>
246 <param name="species" value="coli O157:H7"/>
247 <param name="strain" value="Sakai"/>
248 <param name="plasmid" value="pOSAK1"/>
249 </section>
250 <section name="annotation">
251 <param name="keep_contig_headers" value="true"/>
252 </section>
253 <section name="workflow">
254 <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
255 </section>
256 <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4">
257 <assert_contents>
258 <has_text_matching expression="IHHALP_00005"/>
259 </assert_contents>
260 </output>
261 <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4">
262 <assert_contents>
263 <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
264 </assert_contents>
265 </output>
266 <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/>
267 <output name="annotation_plot">
268 <assert_contents>
269 <has_size value="418991" delta="1000"/>
270 </assert_contents>
271 </output>
272 </test>
273 <test expect_num_outputs="4"> <!-- TEST_3: min_contig_length 350 and every skip option ticked except skip-pseudo; default output selection, hence 4 outputs -->
274 <section name="input_option" >
275 <param name="bakta_db_select" value="V0.1_2022-08-29"/>
276 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
277 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
278 <param name="min_contig_length" value="350"/>
279 </section>
280 <section name="workflow">
281 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/>
282 </section>
283 <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/>
284 <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/>
285 <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/>
286 <output name="annotation_plot">
287 <assert_contents>
288 <has_size value="418399" delta="1000"/>
289 </assert_contents>
290 </output>
291 </test>
292 <test expect_num_outputs="4"> <!-- TEST_4: annotation options (complete, prodigal training file, translation table 4, replicon table, compliant, trusted proteins); min_contig_length is left empty, so the command template passes 200 because compliant is on -->
293 <section name="input_option" >
294 <param name="bakta_db_select" value="V0.1_2022-08-29"/>
295 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
296 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
297 </section>
298 <section name="annotation">
299 <param name="complete" value="true"/>
300 <param name="prodigal" value="prodigal.tf"/>
301 <param name="translation_table" value="4"/>
302 <param name="replicons" value="replicons.tsv" ftype="tabular"/>
303 <param name="compliant" value="true"/>
304 <param name="proteins" value="user-proteins.faa" ftype="fasta"/>
305 </section>
306 <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/>
307 <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/>
308 <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/>
309 <output name="annotation_plot">
310 <assert_contents>
311 <has_size value="418399" delta="1000"/>
312 </assert_contents>
313 </output>
314 </test>
315 <test expect_num_outputs="2"> <!-- TEST_5: every skip option ticked except skip-pseudo, complete replicons, translation table 4; only the log file and the summary are requested as outputs -->
316 <section name="input_option" >
317 <param name="bakta_db_select" value="V0.1_2022-08-29"/>
318 <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
319 <param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
320 </section>
321 <section name="annotation">
322 <param name="complete" value="true"/>
323 <param name="translation_table" value="4"/>
324 </section>
325 <section name="workflow">
326 <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/>
327 </section>
328 <section name="output_files">
329 <param name="output_selection" value="log_txt,sum_txt"/>
330 </section>
331 <output name="logfile" value="TEST_5/TEST_5.log" lines_diff="6"/>
332 <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/>
333 </test>
334 </tests>
335 <help><![CDATA[**What it does**
336 Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
337
338 *Comprehensive & taxonomy-independent database*
339 Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
340
341 *Protein sequence identification*
342 Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
343 allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
344 This is achieved via an alignment-free sequence identification (AFSI) approach
345 using full-length MD5 protein sequence hash digests.
346 *Small proteins/short open reading frames*
347 Bakta detects and annotates small proteins/short open reading frames (sORF).
348
349 *Expert annotation systems*
350 To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
351 Bakta includes & merges different expert annotation systems.
352 Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
353 as well as a generalized protein sequence expert system with distinct
354 coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules.
355
356 *Comprehensive workflow*
357 Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
358 and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
359
360 *GFF3 & INSDC conform annotations*
361 Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
362 (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
363 respectively for representative genomes of all ESKAPE species).
364
365 *Bacteria & plasmids*
366 Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
367
368 **Input options**
369 1. Choose a genome or assembly in fasta format to annotate with Bakta
370 2. Choose a version of the Bakta database and of the AMRFinderPlus database
371
372 **Organism options**
373 You can specify information about the analysed fasta as text input for:
374 - genus
375 - species
376 - strain
377 - plasmid
378
379 **Annotation options**
380 1. You can specify if all sequences (chromosome or plasmids) are complete or not
381 2. You can add your own prodigal training file for CDS prediction
382 3. The translation table could be modified, default is the 11th for bacteria
383 4. The gram type (-/+ or unknown) cannot be chosen in this wrapper; it is always passed to Bakta as unknown ('?')
384 5. You can keep the name of contig present in the input file
385 6. You can specify your own replicon table as a TSV/CSV file
386 7. The compliance option produces annotation files that are ready for submission to public databases
387 such as GenBank, ENA or DDBJ
388 8. You can specify a protein sequence file for annotation in GenBank or fasta formats
389 Using the Fasta format, each reference sequence can be provided in a short or long format:
390
391 # short:
392 >id gene~~~product~~~dbxrefs
393 MAQ...
394
395 # long:
396 >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
397 MAQ...
398
399 **Skip steps**
400 Some steps can be skipped:
401 - skip-trna Skip tRNA detection & annotation
402 - skip-tmrna Skip tmRNA detection & annotation
403 - skip-rrna Skip rRNA detection & annotation
404 - skip-ncrna Skip ncRNA detection & annotation
405 - skip-ncrna-region Skip ncRNA region detection & annotation
406 - skip-crispr Skip CRISPR array detection & annotation
407 - skip-cds Skip CDS detection & annotation
408 - skip-pseudo Skip pseudogene detection & annotation
409 - skip-sorf Skip sORF detection & annotation
410 - skip-gap Skip gap detection & annotation
411 - skip-ori Skip oriC/oriT detection & annotation
412
413 **Output options**
414 Bakta produces a number of output files; you can select which types of file you want:
415 - Summary of the annotation
416 - Annotated files
417 - Sequence files for nucleotide and/or amino acid
418 ]]></help>
419 <expand macro="citations"/>
420 </tool>