comparison blast_annotations_processor.xml @ 0:a3989edf0a4a draft

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit c944fd5685f295acba06679e85b67973c173b137
author onnodg
date Tue, 14 Oct 2025 09:08:30 +0000
parents
children 2acf82433aa4
comparison
equal deleted inserted replaced
-1:000000000000 0:a3989edf0a4a
1 <tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.0">
2 <description>Process BLAST annotation results with taxonomic analysis</description>
3
4 <requirements> <!-- Runtime packages for the wrapped Python script; each one is pinned to an exact version -->
5 <requirement type="package" version="3.12.3">python</requirement>
6 <requirement type="package" version="3.10.6">matplotlib</requirement>
7 <requirement type="package" version="2.3.2">pandas</requirement>
8 <requirement type="package" version="2.3.2">numpy</requirement>
9 <requirement type="package" version="3.1.5">openpyxl</requirement>
10 </requirements>
11
12 <command detect_errors="exit_code"><![CDATA[
13 python '$__tool_directory__/blast_annotations_processor.py'
14 --input-anno '$input_anno'
15 --input-unanno '$input_unanno'
16 ## Each optional output path is passed only when its value is ticked in the "outputs" select.
17 #if $outputs and 'eval_plot' in $outputs
18 --eval-plot '$eval_plot'
19 #end if
20
21 #if $outputs and 'taxa_output' in $outputs
22 --taxa-output '$taxa_output'
23 #end if
24
25 #if $outputs and 'circle_data' in $outputs
26 --circle-data '$circle_data'
27 #end if
28
29 #if $outputs and 'header_anno' in $outputs
30 --header-anno '$header_anno'
31 #end if
32
33 #if $outputs and 'anno_stats' in $outputs
34 --anno-stats '$anno_stats'
35 #end if
36 ## Thresholds are single-quoted like every other substituted value in this command.
37 --uncertain-threshold '$advanced.uncertain_threshold'
38 --eval-threshold '$advanced.eval_threshold'
39 #if $advanced.use_counts
40 --use-counts
41 #end if
42 ]]></command>
43
44 <inputs>
45 <!-- Required input files: the annotated BLAST table and the FASTA it was produced from -->
46 <param name="input_anno" type="data" format="tabular"
47 label="Annotated BLAST output file"
48 help="Tabular BLAST output with taxonomic annotations"/>
49
50 <param name="input_unanno" type="data" format="fasta"
51 label="Original unannotated sequences"
52 help="FASTA file with original sequences before BLAST annotation"/>
53
54 <!-- Output selection: optional="false" forces at least one choice, because a multiple select is optional by default and an empty selection would run the job without producing any dataset -->
55 <param name="outputs" type="select" multiple="true" display="checkboxes" optional="false"
56 label="Select outputs to generate" help="Choose which analysis outputs to create (at least one is required)">
57 <option value="eval_plot">E-value distribution plot</option>
58 <option value="taxa_output">Taxonomic report (Kraken2-like format)</option>
59 <option value="circle_data">Circular taxonomic datafile</option>
60 <option value="header_anno">Header annotations table</option>
61 <option value="anno_stats">Annotation statistics</option>
62 </param>
63
64 <!-- Processing parameters, collapsed by default; referenced in the command as $advanced.NAME -->
65 <section name="advanced" title="Advanced Parameters" expanded="false">
66 <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0"
67 label="Uncertain threshold"
68 help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/>
69
70 <param name="eval_threshold" type="float" value="1e-10" min="0"
71 label="E-value threshold"
72 help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/>
73
74 <param name="use_counts" type="boolean" checked="true"
75 label="Use read counts in circular diagrams"
76 help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/>
77 </section>
78 </inputs>
79
80 <outputs>
81 <!-- E-value plot (PNG); like every dataset below, it is created only when its value is ticked in the "outputs" select -->
82 <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}">
83 <filter>outputs and 'eval_plot' in outputs</filter>
84 </data>
85
86 <!-- Taxonomic report (described to the user as Kraken2-like), stored as plain text -->
87 <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}">
88 <filter>outputs and 'taxa_output' in outputs</filter>
89 </data>
90
91 <!-- Circular taxonomy data: the data used to draw a circular diagram, not a rendered diagram; declared as txt -->
92 <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}">
93 <filter>outputs and 'circle_data' in outputs</filter>
94 </data>
95
96 <!-- Header annotations, written as an Excel workbook (xlsx) -->
97 <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}">
98 <filter>outputs and 'header_anno' in outputs</filter>
99 </data>
100
101 <!-- Annotation statistics, stored as plain text -->
102 <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}">
103 <filter>outputs and 'anno_stats' in outputs</filter>
104 </data>
105 </outputs>
106
107 <tests>
108 <test expect_num_outputs="5"> <!-- Curated input, all five outputs, advanced values given explicitly -->
109 <param name="input_anno" value="input_test_curated_labels.tabular"/>
110 <param name="input_unanno" value="input_test_curated.fasta"/>
111 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
112 <section name="advanced">
113 <param name="uncertain_threshold" value="0.9"/>
114 <param name="eval_threshold" value="1e-10"/>
115 <param name="use_counts" value="True"/>
116 </section>
117 <output name="eval_plot" file="output_eval.png" compare="sim_size"/>
118 <output name="taxa_output" file="output_taxa_output.txt"/>
119 <output name="circle_data" file="output_circle_data.txt"/>
120 <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/>
121 <output name="anno_stats" file="output_anno_out.txt"/>
122 </test>
123 <test expect_num_outputs="5"> <!-- GenBank input, all five outputs, advanced section left untouched -->
124 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
125 <param name="input_unanno" value="galaxy_input_pre.fasta"/>
126 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
127 <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
128 <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
129 <output name="circle_data" file="output_genbank_circle_data.txt"/>
130 <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
131 <output name="anno_stats" file="output_genbank_anno_out.txt"/>
132 </test>
133 <test expect_num_outputs="3"> <!-- GenBank input, three outputs only, non-default thresholds -->
134 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
135 <param name="input_unanno" value="galaxy_input_pre.fasta"/>
136 <param name="outputs" value="circle_data,header_anno,anno_stats"/>
137 <section name="advanced">
138 <param name="uncertain_threshold" value="0.8"/>
139 <param name="eval_threshold" value="1e-8"/>
140 <param name="use_counts" value="True"/>
141 </section>
142 <output name="circle_data" file="advanced_circle_data.txt"/>
143 <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
144 <output name="anno_stats" file="output_advanced_anno_out.txt"/>
145 </test>
146
147 </tests>
148
149 <help><![CDATA[
150 **BLAST Annotation Processor**
151
152 This tool processes BLAST annotation results and generates various quality control and visualization outputs.
153
154 **Inputs:**
155
156 - **Annotated BLAST output**: Tabular BLAST output file with taxonomic annotations. Expected format is standard BLAST tabular output with taxonomic information in the last column.
157
158 - **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics.
159
160 **Outputs:**
161
162 - **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences.
163
164 - **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments.
165
166 - **Circular taxonomic data**: JSON data for generating a circular sunburst-style diagram showing taxonomic composition across all taxonomic levels (Kingdom -> Species).
167
168 - **Header annotations table**: Excel workbook listing each sequence header with its taxonomic assignment and E-value.
169
170 - **Annotation statistics**: Summary statistics about annotation success rates and sequence counts.
171
172 **Parameters:**
173
174 - **Uncertain threshold**: When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa".
175
176 - **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis.
177
178 - **Use read counts**: Determines whether the circular data reflects the abundance of reads (checked) or only counts unique taxonomic assignments (unchecked).
179
180 **Expected Input Format:**
181
182 The annotated BLAST file should be in tabular format with at least 7 columns. The full 10-column layout (column names: ``#Query ID``, ``#Subject``, ``#Subject accession``, ``#Subject Taxonomy ID``, ``#Identity percentage``, ``#Coverage``, ``#evalue``, ``#bitscore``, ``#Source``, ``#Taxonomy``) is:
183
184 1. Query ID
185 2. Subject ID
186 3. Subject accession
187 4. Subject Taxonomy ID
188 5. Identity percentage
189 6. Coverage
190 7. Evalue
191 8. Bitscore
192 9. Source
193 10. Taxonomy
194
195 **Note:** This tool processes files that have been deduplicated and contain read count information in the sequence headers in the format: `sequence_name(count_number)`.
196
197 **Credits**
198
199 Author: Onno de Gorter, 2025. Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter.
200 Developed for the New light on old remedies project, a PhD research project by Anja Fischer.
201 ]]></help>
202 </tool>