bakta: bakta.xml comparison

comparison bakta.xml @ 0:54ca20519f70 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit 6b06711cfba45855d5a992ed1c73c472eaef644f-dirty

author	thanhlv
date	Fri, 28 Apr 2023 17:18:48 +0000
parents
children	30c7c559d5f6

comparison

equal deleted inserted replaced

--1:000000000000
+:54ca20519f70
+<tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
+<description>
+Genome annotation via alignment-free sequence identification
+</description>
+<!-- Every macro expanded in this wrapper is expected to come from macro.xml -->
+<macros>
+    <import>macro.xml</import>
+</macros>
+<expand macro="edam"/>
+<expand macro="xrefs"/>
+<expand macro="requirements"/>
+<expand macro="version_command"/>
+<command detect_errors="aggressive"><![CDATA[
+#*======================================
+Fail the job when bakta itself fails:
+without pipefail the exit status of the
+pipeline is the one of the trailing tee
+======================================*#
+set -o pipefail &&
+#*======================================
+Stage both databases under database_path
+(AMRFinderPlus as amrfinderplus-db/latest)
+======================================*#
+mkdir -p ./database_path/amrfinderplus-db &&
+ln -s '$(input_option.bakta_db_select.fields.path)'/* database_path &&
+ln -s '$(input_option.amrfinder_db_select.fields.path)/' database_path/amrfinderplus-db/latest &&
+bakta
+#*======================================
+CPU option
+======================================*#
+--threads \${GALAXY_SLOTS:-1}
+#*======================================
+Bakta database
+======================================*#
+--db ./database_path
+#*======================================
+Minimum contig length: an explicit value
+wins, otherwise 200 in compliant mode
+and 1 in every other case
+======================================*#
+#if $input_option.min_contig_length
+--min-contig-length $input_option.min_contig_length
+#else if $annotation.compliant
+--min-contig-length 200
+#else
+--min-contig-length 1
+#end if
+--prefix bakta_output
+#*======================================
+Organism options
+genus/species/strain/plasmid
+======================================*#
+#if $organism.genus
+--genus '$organism.genus'
+#end if
+#if $organism.species
+--species '$organism.species'
+#end if
+#if $organism.strain
+--strain '$organism.strain'
+#end if
+#if $organism.plasmid
+--plasmid '$organism.plasmid'
+#end if
+#*======================================
+Annotation options
+gram type, prodigal/protein file
+======================================*#
+$annotation.complete
+#if $annotation.prodigal
+--prodigal-tf '$annotation.prodigal'
+#end if
+#if $annotation.translation_table
+--translation-table '$annotation.translation_table'
+#end if
+#*======================================
+Gram type is always passed as unknown;
+no input parameter exposes it
+======================================*#
+--gram '?'
+$annotation.keep_contig_headers
+#if $annotation.replicons
+--replicons '$annotation.replicons'
+#end if
+$annotation.compliant
+#if $annotation.proteins
+--proteins '$annotation.proteins'
+#end if
+#*======================================
+Workflow OPTIONS
+skip some step of the bakta analysis
+======================================*#
+#echo " ".join($workflow.skip_analysis)
+#*======================================
+Genome file
+======================================*#
+'$input_option.input_file'
+#*======================================
+LOG file
+standard output is copied to the log
+dataset while still reaching the job
+======================================*#
+| tee '$logfile'
+]]></command>
+<inputs>
+<!-- Reference databases, genome to annotate and contig size cut-off -->
+<section name="input_option" title="Input/Output options" expanded="true">
+    <param name="bakta_db_select" type="select" label="The bakta database">
+        <options from_data_table="bakta_database">
+            <filter type="static_value" column="bakta_version" value="@BAKTA_VERSION@"/>
+            <validator type="no_options" message="No bakta database is available"/>
+        </options>
+    </param>
+    <param name="amrfinder_db_select" type="select" label="The amrfinderplus database">
+        <options from_data_table="amrfinderplus_database">
+            <validator type="no_options" message="No amrfinderplus database is available"/>
+        </options>
+    </param>
+    <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/>
+    <param name="min_contig_length" type="integer" min="0" optional="true" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/>
+</section>
+<!-- Organism INFORMATION OPTIONS: free-text values, each restricted by a regex validator -->
+<section name="organism" title="Optional organism options" expanded="false">
+    <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia">
+        <validator type="regex" message="The genus may only contain letters">^[a-zA-Z]+$</validator>
+    </param>
+    <param argument="--species" type="text" optional="true" label="Specify species name" help="ex. 'coli O157:H7'">
+        <validator type="regex" message="The species may only contain letters, digits, spaces and the characters ( ) : - /">^[a-zA-Z0-9\s(:\-/)]+$</validator>
+    </param>
+    <!-- Strain designations commonly hold digits and separators (MG1655, K-12), so the species/plasmid character set is accepted here as well -->
+    <param argument="--strain" type="text" optional="true" label="Specify strain name" help="ex. Sakai">
+        <validator type="regex" message="The strain may only contain letters, digits, spaces and the characters ( ) : - /">^[a-zA-Z0-9\s(:\-/)]+$</validator>
+    </param>
+    <param argument="--plasmid" type="text" optional="true" label="Specify plasmid name" help="ex. pOSAK1">
+        <validator type="regex" message="The plasmid name may only contain letters, digits, spaces and the characters ( ) : - /">^[a-zA-Z0-9\s(:\-/)]+$</validator>
+    </param>
+</section>
+<!-- ANNOTATION: optional switches and helper files for the annotation step -->
+<section name="annotation" title="Optional annotation">
+    <param argument="--complete" type="boolean" truevalue="--complete" falsevalue="" label="Complete replicons" help="All sequences are complete replicons (chromosome/plasmid[s])"/>
+    <!-- The name stays "prodigal" because the command template reads it; the argument shows the flag that is actually passed -->
+    <param name="prodigal" argument="--prodigal-tf" type="data" format="txt" optional="true" label="Prodigal file" help="Prodigal training file for CDS prediction"/>
+    <param name="translation_table" argument="--translation-table" type="select" optional="true" label="Translation table" help="Default is the bacterial table 11">
+        <option value="4">4 Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+        <option value="11" selected="true">11 Bacterial, Archaeal and Plant Plastid Code</option>
+    </param>
+    <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/>
+    <param argument="--replicons" type="data" format="tabular,csv" optional="true" label="Replicon information table (tsv/csv)"/>
+    <!-- When ticked, the command template also falls back to a minimum contig length of 200 if none is given -->
+    <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDBJ compliance"/>
+    <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/>
+</section>
+<!-- Steps of the analysis that can be switched off; each option value is the flag written verbatim on the command line -->
+<section name="workflow" title="Workflow option to skip steps">
+    <param name="skip_analysis" type="select" multiple="true" display="checkboxes" label="Select steps to skip">
+        <option value="--skip-trna"> Skip tRNA detection and annotation </option>
+        <option value="--skip-tmrna"> Skip tmRNA detection and annotation </option>
+        <option value="--skip-rrna"> Skip rRNA detection and annotation </option>
+        <option value="--skip-ncrna"> Skip ncRNA detection and annotation </option>
+        <option value="--skip-ncrna-region"> Skip ncRNA region detection and annotation </option>
+        <option value="--skip-crispr"> Skip CRISPR array detection and annotation </option>
+        <option value="--skip-cds"> Skip CDS detection and annotation </option>
+        <option value="--skip-pseudo"> Skip pseudogene detection and annotation </option>
+        <option value="--skip-sorf"> Skip sORF detection and annotation </option>
+        <option value="--skip-gap"> Skip gap detection and annotation </option>
+        <option value="--skip-ori"> Skip oriC/oriT detection and annotation </option>
+    </param>
+</section>
+<!-- Keys ticked here are the ones tested by the filters of the output datasets -->
+<section name="output_files" title="Selection of the output files">
+    <param name="output_selection" type="select" multiple="true" display="checkboxes" label="Output files selection">
+        <option value="file_tsv" selected="true"> Annotation file in TSV </option>
+        <option value="file_gff3" selected="true"> Annotation and sequence in GFF3 </option>
+        <option value="file_gbff" selected="false"> Annotations and sequences in GenBank format </option>
+        <option value="file_embl" selected="false"> Annotations and sequences in EMBL format </option>
+        <option value="file_fna" selected="false"> Replicon/contig DNA sequences as FASTA  </option>
+        <option value="file_ffn" selected="true"> Feature nucleotide sequences as FASTA </option>
+        <option value="file_faa" selected="false"> CDS/sORF amino acid sequences as FASTA </option>
+        <option value="hypo_tsv" selected="false"> Hypothetical protein CDS in TSV</option>
+        <option value="hypo_fa" selected="false"> Hypothetical protein CDS amino sequences as FASTA</option>
+        <option value="sum_txt" selected="false"> Summary as TXT</option>
+        <option value="file_json" selected="false"> Information on each annotated feature as JSON </option>
+        <option value="file_plot" selected="true"> Plot of the annotation result as SVG </option>
+        <option value="log_txt" selected="false"> Log file as TXT </option>
+    </param>
+</section>
+</inputs>
+<outputs>
+    <!-- Every dataset is kept only when its key is ticked in output_files/output_selection; all but the log are picked up from the bakta_output.* files in the working directory -->
+    <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary">
+        <filter> output_files['output_selection'] and "file_tsv" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences">
+        <filter> output_files['output_selection'] and "file_gff3" in output_files['output_selection'] </filter>
+    </data>
+    <!-- GenBank flat file: not column based, so it is typed as genbank rather than tabular -->
+    <data name="annotation_gbff" format="genbank" from_work_dir="bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff">
+        <filter> output_files['output_selection'] and "file_gbff" in output_files['output_selection'] </filter>
+    </data>
+    <!-- EMBL flat file: not column based, so it is typed as embl rather than tabular -->
+    <data name="annotation_embl" format="embl" from_work_dir="bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl">
+        <filter> output_files['output_selection'] and "file_embl" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_fna" format="fasta" from_work_dir="bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences">
+        <filter> output_files['output_selection'] and "file_fna" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences">
+        <filter> output_files['output_selection'] and "file_ffn" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_faa" format="fasta" from_work_dir="bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences">
+        <filter> output_files['output_selection'] and "file_faa" in output_files['output_selection'] </filter>
+    </data>
+    <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary">
+        <filter> output_files['output_selection'] and "hypo_tsv" in output_files['output_selection'] </filter>
+    </data>
+    <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences">
+        <filter> output_files['output_selection'] and "hypo_fa" in output_files['output_selection'] </filter>
+    </data>
+    <data name="summary_txt" format="txt" from_work_dir="bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary">
+        <filter> output_files['output_selection'] and "sum_txt" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_json" format="json" from_work_dir="bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable">
+        <filter> output_files['output_selection'] and "file_json" in output_files['output_selection'] </filter>
+    </data>
+    <data name="annotation_plot" format="svg" from_work_dir="bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation">
+        <filter> output_files['output_selection'] and "file_plot" in output_files['output_selection'] </filter>
+    </data>
+    <!-- No from_work_dir: the command writes this dataset directly through tee -->
+    <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file">
+        <filter> output_files['output_selection'] and "log_txt" in output_files['output_selection'] </filter>
+    </data>
+</outputs>
+<tests>
+<test expect_num_outputs="13"> <!-- TEST_1: databases + input, minimum contig length 250, every output type selected (13 datasets) -->
+<section name="input_option" >
+<param name="bakta_db_select" value="V0.1_2022-08-29"/>
+<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<param name="min_contig_length" value="250"/>
+</section>
+<section name="output_files">
+<param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/>
+</section>
+<!-- lines_diff tolerates a few differing lines between the produced file and the reference file -->
+<output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/>
+<output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/>
+<output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/>
+<output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/>
+<output name="annotation_fna" value="TEST_1/TEST_1.fna"/>
+<output name="annotation_ffn" value="TEST_1/TEST_1.ffn"/>
+<output name="annotation_faa" value="TEST_1/TEST_1.faa"/>
+<output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/>
+<output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/>
+<output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/>
+<!-- The SVG plot is only checked for its approximate size -->
+<output name="annotation_plot">
+<assert_contents>
+<has_size value="418991" delta="1000"/>
+</assert_contents>
+</output>
+<output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/>
+<output name="logfile" value="TEST_1/TEST_1.log" lines_diff="6"/>
+</test>
+<test expect_num_outputs="4"> <!-- TEST_2: same input genome as TEST_1, with organism information, original contig headers kept and tRNA/tmRNA detection skipped; output selection left untouched -->
+<section name="input_option" >
+<param name="bakta_db_select" value="V0.1_2022-08-29"/>
+<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<param name="min_contig_length" value="250"/>
+</section>
+<section name="organism">
+<param name="genus" value="Escherichia"/>
+<param name="species" value="coli O157:H7"/>
+<param name="strain" value="Sakai"/>
+<param name="plasmid" value="pOSAK1"/>
+</section>
+<section name="annotation">
+<param name="keep_contig_headers" value="true"/>
+</section>
+<section name="workflow">
+<param name="skip_analysis" value="--skip-trna,--skip-tmrna"/>
+</section>
+<output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4">
+<assert_contents>
+<has_text_matching expression="IHHALP_00005"/>
+</assert_contents>
+</output>
+<!-- The original contig identifier must appear in the GFF3 because contig headers are kept -->
+<output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4">
+<assert_contents>
+<has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/>
+</assert_contents>
+</output>
+<output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/>
+<output name="annotation_plot">
+<assert_contents>
+<has_size value="418991" delta="1000"/>
+</assert_contents>
+</output>
+</test>
+<test expect_num_outputs="4"> <!-- TEST_3: every skip option except skip-pseudo, minimum contig length 350 -->
+<section name="input_option" >
+<param name="bakta_db_select" value="V0.1_2022-08-29"/>
+<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+<param name="min_contig_length" value="350"/>
+</section>
+<section name="workflow">
+<param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/>
+</section>
+<output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/>
+<output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/>
+<output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/>
+<output name="annotation_plot">
+<assert_contents>
+<has_size value="418399" delta="1000"/>
+</assert_contents>
+</output>
+</test>
+<test expect_num_outputs="4"> <!-- TEST_4: annotation options (complete replicons, prodigal training file, translation table 4, replicon table, compliant mode, trusted proteins); no minimum contig length given -->
+<section name="input_option" >
+<param name="bakta_db_select" value="V0.1_2022-08-29"/>
+<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+</section>
+<section name="annotation">
+<param name="complete" value="true"/>
+<param name="prodigal" value="prodigal.tf"/>
+<param name="translation_table" value="4"/>
+<param name="replicons" value="replicons.tsv" ftype="tabular"/>
+<param name="compliant" value="true"/>
+<param name="proteins" value="user-proteins.faa" ftype="fasta"/>
+</section>
+<output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/>
+<output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/>
+<output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/>
+<output name="annotation_plot">
+<assert_contents>
+<has_size value="418399" delta="1000"/>
+</assert_contents>
+</output>
+</test>
+<test expect_num_outputs="2"> <!-- TEST_5: every skip option except skip-pseudo; only the log file and the summary are kept as outputs -->
+<section name="input_option" >
+<param name="bakta_db_select" value="V0.1_2022-08-29"/>
+<param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/>
+<param name="input_file" value="NC_002127.1.fna" ftype="fasta"/>
+</section>
+<section name="annotation">
+<param name="complete" value="true"/>
+<param name="translation_table" value="4"/>
+</section>
+<section name="workflow">
+<param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/>
+</section>
+<section name="output_files">
+<param name="output_selection" value="log_txt,sum_txt"/>
+</section>
+<output name="logfile" value="TEST_5/TEST_5.log" lines_diff="6"/>
+<output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/>
+</test>
+</tests>
+<help><![CDATA[**What it does**
+Bakta is a tool for the rapid & standardized annotation of bacterial genomes and plasmids from both isolates and MAGs.
+*Comprehensive & taxonomy-independent database*
+Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe.
+*Protein sequence identification*
+Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt
+allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families.
+This is achieved via an alignment-free sequence identification (AFSI) approach
+using full-length MD5 protein sequence hash digests.
+*Small proteins/short open reading frames*
+Bakta detects and annotates small proteins/short open reading frames (sORF).
+*Expert annotation systems*
+To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes,
+Bakta includes & merges different expert annotation systems.
+Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations
+as well as a generalized protein sequence expert system with distinct
+coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules.
+*Comprehensive workflow*
+Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT
+and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS.
+*GFF3 & INSDC conform annotations*
+Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission
+(checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats,
+respectively for representative genomes of all ESKAPE species).
+*Bacteria & plasmids*
+Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only.
+**Input options**
+1. Choose a genome or assembly in fasta format to annotate with Bakta
+2. Choose a version of the Bakta database
+**Organism options**
+You can specify information about the analysed fasta as text input for:
+- genus
+- species
+- strain
+- plasmid
+**Annotation options**
+1. You can specify if all sequences (chromosome or plasmids) are complete or not
+2. You can add your own prodigal training file for CDS prediction
+3. The translation table could be modified, default is the 11th for bacteria
+4. You can specify if the bacterium is gram -/+ or unknown (default value is unknown)
+5. You can keep the name of contig present in the input file
+6. You can specify your own replicon table as a TSV/CSV file
+7. The compliance option produces annotation files that are ready to submit to public databases
+such as ENA, Genbank and EMBL
+8. You can specify a protein sequence file for annotation in GenBank or fasta formats
+Using the Fasta format, each reference sequence can be provided in a short or long format:
+# short:
+>id gene~~~product~~~dbxrefs
+MAQ...
+# long:
+>id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs
+MAQ...
+**Skip steps**
+Some steps can be skipped:
+- skip-trna           Skip tRNA detection & annotation
+- skip-tmrna          Skip tmRNA detection & annotation
+- skip-rrna           Skip rRNA detection & annotation
+- skip-ncrna          Skip ncRNA detection & annotation
+- skip-ncrna-region   Skip ncRNA region detection & annotation
+- skip-crispr         Skip CRISPR array detection & annotation
+- skip-cds            Skip CDS detection & annotation
+- skip-pseudo         Skip pseudogene detection & annotation
+- skip-sorf           Skip sORF detection & annotation
+- skip-gap            Skip gap detection & annotation
+- skip-ori            Skip oriC/oriT detection & annotation
+**Output options**
+Bakta produces a number of output files; you can select which types of file you want:
+- Summary of the annotation
+- Annotated files
+- Sequence files for nucleotide and/or amino acid
+]]></help>
+<expand macro="citations"/>
+</tool>

Mercurial > repos > thanhlv > bakta

comparison bakta.xml @ 0:54ca20519f70 draft