comparison blast_annotations_processor.xml @ 2:9ca209477dfd draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author onnodg
date Mon, 15 Dec 2025 16:43:36 +0000
parents 2acf82433aa4
children
comparison
equal deleted inserted replaced
1:2acf82433aa4 2:9ca209477dfd
1 <tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.1"> 1 <tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="2.0.0">
2 <description>Process BLAST annotation results with taxonomic analysis</description> 2 <description>Process BLAST annotation results with taxonomic analysis</description>
3 3
4 <requirements> 4 <requirements>
5 <requirement type="package" version="3.12.3">python</requirement> 5 <requirement type="package" version="3.12.3">python</requirement>
6 <requirement type="package" version="3.10.6">matplotlib</requirement> 6 <requirement type="package" version="3.10.6">matplotlib</requirement>
11 11
12 <command detect_errors="exit_code"><![CDATA[ 12 <command detect_errors="exit_code"><![CDATA[
13 python '$__tool_directory__/blast_annotations_processor.py' 13 python '$__tool_directory__/blast_annotations_processor.py'
14 --input-anno '$input_anno' 14 --input-anno '$input_anno'
15 --input-unanno '$input_unanno' 15 --input-unanno '$input_unanno'
16 16 --filtered-fasta '$filtered_fasta'
17 #if $outputs and 'eval_plot' in $outputs 17 #if $outputs and 'eval_plot' in $outputs
18 --eval-plot '$eval_plot' 18 --eval-plot '$eval_plot'
19 #end if 19 #end if
20 20
21 #if $outputs and 'taxa_output' in $outputs 21 #if $outputs and 'taxa_output' in $outputs
27 #end if 27 #end if
28 28
29 #if $outputs and 'header_anno' in $outputs 29 #if $outputs and 'header_anno' in $outputs
30 --header-anno '$header_anno' 30 --header-anno '$header_anno'
31 #end if 31 #end if
32 32 --log '$log'
33 #if $outputs and 'anno_stats' in $outputs
34 --anno-stats '$anno_stats'
35 #end if
36 33
37 --uncertain-threshold $advanced.uncertain_threshold 34 --uncertain-threshold $advanced.uncertain_threshold
38 --eval-threshold $advanced.eval_threshold 35 --eval-threshold $advanced.blast.eval_threshold
36 --min-identity $advanced.blast.min_identity
37 --min-coverage $advanced.blast.min_coverage
38 --min-bitscore $advanced.blast.min_bitscore
39 --bitscore-perc-cutoff $advanced.blast.bitscore_perc_cutoff
40 --min-support $advanced.fasta.min_support
41 #if $advanced.blast.ignore_seqids != ""
42 --ignore-seqids '$advanced.blast.ignore_seqids'
43 #end if
44 #if $advanced.blast.ignore_rank != ""
45 --ignore-rank '$advanced.blast.ignore_rank'
46 #end if
47 #if $advanced.blast.ignore_taxonomy != ""
48 --ignore-taxonomy '$advanced.blast.ignore_taxonomy'
49 #end if
50 #if $advanced.fasta.ignore_obiclean_type != ""
51 --ignore-obiclean-type '$advanced.fasta.ignore_obiclean_type'
52 #end if
53 #if $advanced.fasta.ignore_illuminapairend_type != ""
54 --ignore-illuminapairend-type '$advanced.fasta.ignore_illuminapairend_type'
55 #end if
39 #if $advanced.use_counts 56 #if $advanced.use_counts
40 --use-counts 57 --use-counts
41 #end if 58 #end if
42 ]]></command> 59 ]]></command>
43 60
44 <inputs> 61 <inputs>
45 <!-- Required Input Files -->
46 <param name="input_anno" type="data" format="tabular" 62 <param name="input_anno" type="data" format="tabular"
47 label="Annotated BLAST output file" 63 label="Annotated BLAST output file"
48 help="Tabular BLAST output with taxonomic annotations"/> 64 help="Tabular BLAST output with taxonomic annotations"/>
49 65
50 <param name="input_unanno" type="data" format="fasta" 66 <param name="input_unanno" type="data" format="fasta"
51 label="Original unannotated sequences" 67 label="Original unannotated sequences"
52 help="FASTA file with original sequences before BLAST annotation"/> 68 help="FASTA file with original sequences before BLAST annotation"/>
53 69
54 <!-- Output Selection -->
55 <param name="outputs" type="select" multiple="true" display="checkboxes" 70 <param name="outputs" type="select" multiple="true" display="checkboxes"
56 label="Select outputs to generate" help="Choose which analysis outputs to create"> 71 label="Select outputs to generate" help="Choose which analysis outputs to create">
57 <option value="eval_plot">E-value distribution plot</option> 72 <option value="eval_plot">E-value distribution plot</option>
58 <option value="taxa_output">Taxonomic report (Kraken2-like format)</option> 73 <option value="taxa_output">Taxonomic report (Kraken2-like format)</option>
59 <option value="circle_data">Circular taxonomic datafile</option> 74 <option value="circle_data">Circular taxonomic datafile</option>
60 <option value="header_anno">Annotations per header (in Excel)</option> 75 <option value="header_anno">Annotations per header (in Excel)</option>
61 <option value="anno_stats">Annotation statistics</option>
62 </param> 76 </param>
63 77
64 <!-- Processing Parameters -->
65 <section name="advanced" title="Advanced Parameters" expanded="false"> 78 <section name="advanced" title="Advanced Parameters" expanded="false">
66 <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0" 79 <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0"
67 label="Uncertain threshold" 80 label="Uncertainty threshold"
68 help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/> 81 help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/>
69
70 <param name="eval_threshold" type="float" value="1e-10" min="0"
71 label="E-value threshold"
72 help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/>
73
74 <param name="use_counts" type="boolean" checked="true" 82 <param name="use_counts" type="boolean" checked="true"
75 label="Use read counts in circular diagrams" 83 label="Use read counts in circular diagrams"
76 help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/> 84 help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/>
85 <section name="fasta" title="Fasta filters" expanded="false">
86 <param name="min_support" type="integer" value="1" min="1" max="1000" label="Minimum support"
87 help="The minimum number of times a read should occur before dereplication"/>
88 <param name="ignore_obiclean_type" type="text" value="singleton" label="Ignore obiclean type"
89 help="The tool skips reads that are flagged as this obiclean type. Options are: singleton,variant,head. Values must be comma separated"/>
90 <param name="ignore_illuminapairend_type" type="text" value="pairend" label="Ignore R1-R2 merge failure"
91 help="The tool skips reads that are flagged as this illuminapairend type. Options are: pairend,consensus. Values must be comma separated"/>
92 </section>
93 <section name="blast" title="Blast filters" expanded="false">
94 <param name="min_identity" type="integer" value="80" min="1" max="100" label="Minimum identity"/>
95 <param name="min_coverage" type="integer" value="70" min="1" max="100" label="Minimum coverage"/>
96 <param name="min_bitscore" type="integer" value="100" min="1" max="1000" label="Minimum bitscore"/>
97 <param name="bitscore_perc_cutoff" type="float" value="8" min="0" max="100" label="Top bitscore percentage cutoff"
98 help="The percentage that the bitscore can be lower than the top bitscore to still be considered. To disable this function put the value as 0"/>
99 <param name="eval_threshold" type="text" value="1e-10" label="E-value threshold"/>
100 <param name="ignore_seqids" type="text" value="" label="Ignore sequence identifiers"
101 help="The tool skips hits that have these sequence identifiers. Values must be comma separated"/>
102 <param name="ignore_rank" type="text" value="unknown" label="Ignore rank when containing:"
103 help="The tool skips hits that have this string in taxonomy ranks. Values must be comma separated"/>
104 <param name="ignore_taxonomy" type="text" value="environmental" label="Ignore taxonomy when containing:"
105 help="The tool skips hits that have this string as taxonomy. Values must be comma separated"/>
106 </section>
77 </section> 107 </section>
78 </inputs> 108 </inputs>
79 109
80 <outputs> 110 <outputs>
81 <!-- E-value Plot -->
82 <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}"> 111 <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}">
83 <filter>outputs and 'eval_plot' in outputs</filter> 112 <filter>outputs and 'eval_plot' in outputs</filter>
84 </data> 113 </data>
85
86 <!-- Taxa Output Report -->
87 <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}"> 114 <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}">
88 <filter>outputs and 'taxa_output' in outputs</filter> 115 <filter>outputs and 'taxa_output' in outputs</filter>
89 </data> 116 </data>
90
91 <!-- Circular Taxonomy Diagram -->
92 <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}"> 117 <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}">
93 <filter>outputs and 'circle_data' in outputs</filter> 118 <filter>outputs and 'circle_data' in outputs</filter>
94 </data> 119 </data>
95
96 <!-- Header Annotations -->
97 <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}"> 120 <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}">
98 <filter>outputs and 'header_anno' in outputs</filter> 121 <filter>outputs and 'header_anno' in outputs</filter>
99 </data> 122 </data>
100 123 <data name="log" format="txt" label="log on ${on_string}"/>
101 <!-- Annotation Statistics --> 124 <data name="filtered_fasta" format="fasta" label="Filtered fasta on ${on_string}"/>
102 <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}">
103 <filter>outputs and 'anno_stats' in outputs</filter>
104 </data>
105 </outputs> 125 </outputs>
106 126
107 <tests> 127 <tests>
108 <test expect_num_outputs="5"> 128 <test expect_num_outputs="6">
129 <param name="input_anno" value="test_curated_nov_blast_headers.tabular"/>
130 <param name="input_unanno" value="test_curated_nov.fasta"/>
131 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/>
132 <output name="eval_plot" file="test_nov_eval.png" compare="sim_size"/>
133 <output name="taxa_output" file="test_curated_nova_taxa_output.txt"/>
134 <output name="circle_data" file="test_curated_nova.txt"/>
135 <output name="header_anno" file="test_curated_nova_header_anno_excel.xlsx" decompress="true"/>
136 <output name="log" file="test_curated_nova_anno_out.txt"/>
137 <output name="filtered_fasta" file="test_curated_nov_filtered.fasta"/>
138 <section name="advanced">
139 <param name="uncertain_threshold" value="0.9"/>
140 <param name="use_counts" value="True"/>
141 <section name="fasta">
142 <param name="min_support" value="1"/>
143 <param name="ignore_obiclean_type" value="singleton"/>
144 <param name="ignore_illuminapairend_type" value="pairend"/>
145 </section>
146 <section name="blast">
147 <param name="min_identity" value="80"/>
148 <param name="min_coverage" value="70"/>
149 <param name="min_bitscore" value="100"/>
150 <param name="bitscore_perc_cutoff" value="8"/>
151 <param name="eval_threshold" value="1e-10"/>
152 <param name="ignore_seqids" value=""/>
153 <param name="ignore_rank" value="unknown"/>
154 <param name="ignore_taxonomy" value="environmental"/>
155 </section>
156 </section>
157 </test>
158
159 <test expect_num_outputs="6">
160 <param name="input_anno" value="test_genbank_nov_blast.tabular"/>
161 <param name="input_unanno" value="test_genbank_nov.fasta"/>
162 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/>
163 <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
164 <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
165 <output name="circle_data" file="output_genbank_circle_data.txt"/>
166 <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
167 <output name="log" file="output_genbank_anno_out.txt"/>
168 <output name="filtered_fasta" file="genbank_filtered.fasta"/>
169 <section name="advanced">
170 <param name="uncertain_threshold" value="0.9"/>
171 <param name="use_counts" value="True"/>
172 <section name="fasta">
173 <param name="min_support" value="1"/>
174 <param name="ignore_obiclean_type" value="singleton"/>
175 <param name="ignore_illuminapairend_type" value="pairend"/>
176 </section>
177 <section name="blast">
178 <param name="min_identity" value="80"/>
179 <param name="min_coverage" value="70"/>
180 <param name="min_bitscore" value="100"/>
181 <param name="bitscore_perc_cutoff" value="8"/>
182 <param name="eval_threshold" value="1e-10"/>
183 <param name="ignore_seqids" value=""/>
184 <param name="ignore_rank" value="unknown"/>
185 <param name="ignore_taxonomy" value="environmental"/>
186 </section>
187 </section>
188 </test>
189 <test expect_num_outputs="4">
190 <param name="input_anno" value="test_genbank_nov_blast.tabular"/>
191 <param name="input_unanno" value="test_genbank_nov.fasta"/>
192 <param name="outputs" value="circle_data,header_anno"/>
193 <output name="circle_data" file="advanced_circle_data.txt"/>
194 <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
195 <output name="log" file="output_advanced_anno_out.txt"/>
196 <output name="filtered_fasta" file="advanced_filtered.fasta"/>
197 <section name="advanced">
198 <param name="uncertain_threshold" value="0.8"/>
199 <param name="use_counts" value="False"/>
200 <section name="fasta">
201 <param name="min_support" value="2"/>
202 <param name="ignore_obiclean_type" value="singleton,variant"/>
203 <param name="ignore_illuminapairend_type" value="pairend"/>
204 </section>
205 <section name="blast">
206 <param name="min_identity" value="70"/>
207 <param name="min_coverage" value="60"/>
208 <param name="min_bitscore" value="80"/>
209 <param name="bitscore_perc_cutoff" value="0"/>
210 <param name="eval_threshold" value="1e-8"/>
211 <param name="ignore_seqids" value="NC_051949"/>
212 <param name="ignore_rank" value="unknown"/>
213 <param name="ignore_taxonomy" value="environmental"/>
214 </section>
215 </section>
216 </test>
217 <test expect_num_outputs="3">
109 <param name="input_anno" value="input_test_curated_labels.tabular"/> 218 <param name="input_anno" value="input_test_curated_labels.tabular"/>
110 <param name="input_unanno" value="input_test_curated.fasta"/> 219 <param name="input_unanno" value="input_test_curated.fasta"/>
111 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/> 220 <param name="outputs" value="header_anno"/>
112 <output name="taxa_output" file="output_taxa_output.txt"/> 221 <output name="header_anno" file="strict_header_anno.xlsx" decompress="true"/>
113 <output name="eval_plot" file="output_eval.png" compare="sim_size"/> 222 <output name="log" file="strict_anno_stats.txt" lines_diff="50"/>
114 <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/> 223 <output name="filtered_fasta" file="strict_filtered.fasta"/>
115 <output name="anno_stats" file="output_anno_out.txt"/> 224
116 <output name="circle_data" file="output_circle_data.txt"/>
117 <section name="advanced"> 225 <section name="advanced">
118 <param name="uncertain_threshold" value="0.9"/> 226 <param name="uncertain_threshold" value="0.95"/>
119 <param name="eval_threshold" value="1e-10"/> 227 <param name="use_counts" value="False"/>
120 <param name="use_counts" value="True"/> 228
229 <section name="fasta">
230 <param name="min_support" value="1"/>
231 <param name="ignore_obiclean_type" value=""/>
232 <param name="ignore_illuminapairend_type" value=""/>
233 </section>
234
235 <section name="blast">
236 <param name="min_identity" value="98"/>
237 <param name="min_coverage" value="95"/>
238 <param name="min_bitscore" value="150"/>
239 <param name="bitscore_perc_cutoff" value="0"/>
240 <param name="eval_threshold" value="1e-20"/>
241 <param name="ignore_seqids" value=""/>
242 <param name="ignore_rank" value=""/>
243 <param name="ignore_taxonomy" value=""/>
244 </section>
121 </section> 245 </section>
122 </test> 246 </test>
123 <test expect_num_outputs="5">
124 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
125 <param name="input_unanno" value="galaxy_input_pre.fasta"/>
126 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
127 <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
128 <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
129 <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
130 <output name="anno_stats" file="output_genbank_anno_out.txt"/>
131 <output name="circle_data" file="output_genbank_circle_data.txt"/>
132 </test>
133 <test expect_num_outputs="3">
134 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
135 <param name="input_unanno" value="galaxy_input_pre.fasta"/>
136 <param name="outputs" value="circle_data,header_anno,anno_stats"/>
137 <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
138 <output name="anno_stats" file="output_advanced_anno_out.txt"/>
139 <output name="circle_data" file="advanced_circle_data.txt"/>
140 <section name="advanced">
141 <param name="uncertain_threshold" value="0.8"/>
142 <param name="eval_threshold" value="1e-8"/>
143 <param name="use_counts" value="True"/>
144 </section>
145 </test>
146
147 </tests> 247 </tests>
148 248
149 <help><![CDATA[ 249 <help><![CDATA[
150 **BLAST Annotation Processor** 250 **BLAST Annotation Processor**
151 251
157 257
158 - **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics. 258 - **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics.
159 259
160 **Outputs:** 260 **Outputs:**
161 261
262 - **Filtered fasta**: FASTA file with all the sequences that passed the FASTA filtering (obiclean, illuminapairend and minimum support filters).
263
162 - **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences. 264 - **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences.
163 265
164 - **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments. 266 - **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments.
165 267
166 - **Circular taxonomic data**: JSON data to generate a circular sunburst-style diagram showing taxonomic composition across all taxonomic levels (Kingdom -> Species). 268 - **Circular taxonomic data**: JSON data to generate a circular sunburst-style diagram showing taxonomic composition across all taxonomic levels (Kingdom -> Species).
174 - **Uncertain threshold**: Threshold for LCA. When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa". 276 - **Uncertain threshold**: Threshold for LCA. When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa".
175 277
176 - **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis. 278 - **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis.
177 279
178 - **Use read counts**: Determines whether circular data reflects the abundance of reads (checked) or just count unique taxonomic assignments (unchecked). 280 - **Use read counts**: Determines whether circular data reflects the abundance of reads (checked) or just count unique taxonomic assignments (unchecked).
281
282 - **E-value threshold**: Maximum allowed E-value.
283
284 - **Minimum identity (%)**
285
286 - **Minimum coverage (%)**
287
288 - **Minimum bitscore**
289
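290 - **Top bitscore percentage cutoff (%)**: Relative cutoff vs. the best hit in a query: the percentage that a hit's bitscore can be lower than the top bitscore to still be considered. Set to 0 to disable this filter.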
291
292 - **Minimum read support (FASTA)**: Only keep headers with at least N counts.
293
294 - **Ignore OBIClean type**: Remove reads with this OBIClean category (singleton / variant / head).
295
296 - **Ignore Illumina pairend type**: Remove reads based on pairend status.
297
298 - **Ignore taxonomy keywords**: Skip hits whose taxonomic annotation contains these strings.
299
300 - **Ignore sequence identifiers**: Remove BLAST hits whose subject/seq IDs match given list.
301
302 - **Use counts**: Circular diagram uses abundance (checked) or uniqueness (unchecked).
303
179 304
180 **Expected Input Format:** 305 **Expected Input Format:**
181 306
182 The annotated BLAST file should be in tabular format with at least 7 columns: 307 The annotated BLAST file should be in tabular format with at least 7 columns:
183 308