Mercurial > repos > iuc > gtdbtk_classify_wf
comparison gtdbtk_classify_wf.xml @ 0:c4db8c4de66f draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gtdbtk commit 8487d2c73793be0afa5b34388b122e686ac8a094
author | iuc |
---|---|
date | Tue, 13 Dec 2022 09:48:28 +0000 |
parents | |
children | dbf1798c0dcc |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c4db8c4de66f |
---|---|
1 <tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> | |
2 <description>by placement in GTDB reference tree</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements"/> | |
<command detect_errors="exit_code"><![CDATA[
## Stage every input genome in input_dir under a sanitised name, point GTDB-Tk
## at the selected reference data, run the classify_wf pipeline and, when
## requested, concatenate the GTDB-Tk logs into the process_log output.
#import re

mkdir input_dir &&
mkdir output_dir &&
#for $i in $input:
    ## gtdbtk is given a single --extension for the whole run, so every staged
    ## genome must carry the same suffix. Gzipped datasets are decompressed
    ## while staging; otherwise a mix of fasta and fasta.gz inputs would leave
    ## some files unmatched by the extension of the last input.
    #set input_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) + '.fasta'
    #if str($i.ext).endswith('.gz')
        gzip -dc '${i}' > input_dir/'${input_identifier}' &&
    #else
        ln -s '${i}' input_dir/'${input_identifier}' &&
    #end if
#end for
## Quoted so that a data table path containing spaces survives the shell.
export GTDBTK_DATA_PATH='$gtdbtk_db.fields.path' &&
gtdbtk classify_wf
    --genome_dir input_dir
    --extension '.fasta'
    --out_dir output_dir
    --cpus \${GALAXY_SLOTS:-4}
    --min_perc_aa $advanced.min_perc_aa
    $advanced.force
    --min_af $advanced.min_af
#if str($advanced.output_process_log) == 'yes':
    && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log'
#end if
]]></command>
<inputs>
    <!-- Genome assemblies to classify: one or more plain or gzipped FASTA datasets. -->
    <param name="input"
           type="data"
           format="fasta,fasta.gz"
           multiple="true"
           label="Fasta (Genome) files"/>

    <!-- Reference data, chosen from the gtdbtk_database data table. -->
    <param name="gtdbtk_db" type="select" label="GTDB-Tk database">
        <options from_data_table="gtdbtk_database">
            <validator type="no_options"
                       message="No locally cached GTDB-Tk database is available"/>
        </options>
    </param>

    <!-- Tuning flags forwarded to gtdbtk classify_wf, plus the log output toggle. -->
    <section name="advanced" title="Advanced options">
        <param argument="--min_perc_aa"
               type="integer"
               min="0"
               max="100"
               value="10"
               label="Exclude genomes that do not have at least this percentage of AA in the MSA"
               help="Inclusive bound"/>
        <param argument="--force"
               type="boolean"
               truevalue="--force"
               falsevalue=""
               checked="false"
               label="Continue processing if an error occurs on a single genome?"/>
        <param argument="--min_af"
               type="float"
               min="0"
               max="1"
               value="0.65"
               label="Minimum alignment fraction to consider closest genome"/>
        <param name="output_process_log"
               type="boolean"
               truevalue="yes"
               falsevalue="no"
               checked="false"
               label="Output process log file?"/>
    </section>
</inputs>
<outputs>
    <!-- Optional log, produced only when "Output process log file?" is ticked. -->
    <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)">
        <!-- Output filters receive boolean parameters as Python bools, not as
             their truevalue/falsevalue strings, so test truthiness directly. -->
        <filter>advanced['output_process_log']</filter>
    </data>

    <!-- One list collection per result type, all discovered in output_dir.
         The angle brackets of the named group are escaped because a literal
         less-than sign is not allowed inside an XML attribute value. -->
    <collection name="output_tsv" type="list" format="tsv" label="${tool.name} on ${on_string} (tsv)">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tsv" ext="tsv" directory="output_dir"/>
    </collection>
    <collection name="output_newick" type="list" format="newick" label="${tool.name} on ${on_string} (newick)">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.tree" ext="newick" directory="output_dir"/>
    </collection>
    <collection name="output_fasta" type="list" format="fasta" label="${tool.name} on ${on_string} (fasta)">
        <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta" ext="fasta" directory="output_dir"/>
    </collection>
</outputs>
<tests>
    <!-- Full functional test, kept for reference only: it needs a GTDB-Tk
         reference database, which cannot be stored with the test data. -->
    <!--
    <test expect_num_outputs="3">
        <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/>
        <param name="gtdbtk_db" value="gtdbtk202"/>
        <output_collection name="output_tsv" type="list" count="6">
            <element name="gtdbtk.ar122.filtered" ftype="tsv">
                <assert_contents><has_size value="0"/></assert_contents>
            </element>
            <element name="gtdbtk.ar122.markers_summary" ftype="tsv">
                <assert_contents><has_text text="number_unique_genes"/></assert_contents>
            </element>
            <element name="gtdbtk.ar122.summary" ftype="tsv">
                <assert_contents><has_text text="genome_1_fna_gz"/></assert_contents>
            </element>
            <element name="gtdbtk.bac120.markers_summary" ftype="tsv">
                <assert_contents><has_text text="genome_1_fna_gz"/></assert_contents>
            </element>
            <element name="gtdbtk.failed_genomes" ftype="tsv">
                <assert_contents><has_size value="0"/></assert_contents>
            </element>
            <element name="gtdbtk.translation_table_summary" ftype="tsv">
                <assert_contents><has_text text="genome_1_fna_gz"/></assert_contents>
            </element>
        </output_collection>
        <output_collection name="output_newick" type="list" count="1">
            <element name="gtdbtk.ar122.classify" ftype="newick">
                <assert_contents><has_text text="GB_GCA_"/></assert_contents>
            </element>
        </output_collection>
        <output_collection name="output_fasta" type="list" count="2">
            <element name="gtdbtk.ar122.msa" ftype="fasta">
                <assert_contents><has_text text="GB_GCA_000008085"/></assert_contents>
            </element>
            <element name="gtdbtk.ar122.user_msa" ftype="fasta">
                <assert_contents><has_text text="genome_1_fna_gz"/></assert_contents>
            </element>
        </output_collection>
    </test>
    -->

    <!-- The reference databases are far too large for the test setup, so the
         only runnable test expects the job to fail with exit code 1. -->
    <test expect_failure="true">
        <param name="input" ftype="fasta.gz" value="genome_1.fna.gz"/>
        <param name="gtdbtk_db" value="gtdbtk202"/>
        <assert_stderr>
            <has_text text="Fatal error: Exit code 1"/>
        </assert_stderr>
    </test>
</tests>
130 <help><![CDATA[ | |
131 **What it does** | |
132 | |
133 GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes | |
134 based on the Genome Taxonomy Database (GTDB). It is designed to work with recent advances that allow hundreds or | |
135 thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also | |
136 be applied to isolate and single-cell genomes. | |
137 | |
138 This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by | |
139 maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and | |
140 classify. | |
141 | |
142 The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial | |
143 and 122 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by | |
144 aligning marker genes to their respective HMM model. | |
145 | |
146 The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 | |
147 amino acids. | |
148 | |
149 Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk | |
150 reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary | |
151 divergence, and/or average nucleotide identity (ANI) to reference genomes. | |
152 | |
153 Results can be impacted by a lack of marker genes or contamination. | |
154 ]]></help> | |
155 <expand macro="citations"/> | |
156 </tool> |