Mercurial > repos > onnodg > blast_annotations_processor
comparison blast_annotations_processor.xml @ 0:a3989edf0a4a draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:08:30 +0000 |
| parents | |
| children | 2acf82433aa4 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a3989edf0a4a |
|---|---|
| 1 <tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.0"> | |
| 2 <description>Process BLAST annotation results with taxonomic analysis</description> | |
| 3 | |
| 4 <!-- Package dependencies of the wrapped script, each pinned to an exact version. --> <requirements> | |
| 5 <requirement type="package" version="3.12.3">python</requirement> | |
| 6 <requirement type="package" version="3.10.6">matplotlib</requirement> | |
| 7 <requirement type="package" version="2.3.2">pandas</requirement> | |
| 8 <requirement type="package" version="2.3.2">numpy</requirement> | |
| 9 <requirement type="package" version="3.1.5">openpyxl</requirement> | |
| 10 </requirements> | |
| 11 | |
| 12 <!-- Runs blast_annotations_processor.py from the tool directory; errors are detected from the exit code. Both input files and both thresholds are always passed. Each output flag is emitted only when that output is selected in the 'outputs' parameter, and the use-counts flag only when the boolean is checked. --> <command detect_errors="exit_code"><![CDATA[ | |
| 13 python '$__tool_directory__/blast_annotations_processor.py' | |
| 14 --input-anno '$input_anno' | |
| 15 --input-unanno '$input_unanno' | |
| 16 | |
| 17 #if $outputs and 'eval_plot' in $outputs | |
| 18 --eval-plot '$eval_plot' | |
| 19 #end if | |
| 20 | |
| 21 #if $outputs and 'taxa_output' in $outputs | |
| 22 --taxa-output '$taxa_output' | |
| 23 #end if | |
| 24 | |
| 25 #if $outputs and 'circle_data' in $outputs | |
| 26 --circle-data '$circle_data' | |
| 27 #end if | |
| 28 | |
| 29 #if $outputs and 'header_anno' in $outputs | |
| 30 --header-anno '$header_anno' | |
| 31 #end if | |
| 32 | |
| 33 #if $outputs and 'anno_stats' in $outputs | |
| 34 --anno-stats '$anno_stats' | |
| 35 #end if | |
| 36 | |
| 37 --uncertain-threshold $advanced.uncertain_threshold | |
| 38 --eval-threshold $advanced.eval_threshold | |
| 39 #if $advanced.use_counts | |
| 40 --use-counts | |
| 41 #end if | |
| 42 ]]></command> | |
| 43 | |
| 44 <!-- Tool form: two input datasets (tabular and FASTA), a multi-select that chooses which outputs to produce, and an 'advanced' section (collapsed by default) holding the two thresholds and the read-count switch. --> <inputs> | |
| 45 <!-- Required Input Files --> | |
| 46 <param name="input_anno" type="data" format="tabular" | |
| 47 label="Annotated BLAST output file" | |
| 48 help="Tabular BLAST output with taxonomic annotations"/> | |
| 49 | |
| 50 <param name="input_unanno" type="data" format="fasta" | |
| 51 label="Original unannotated sequences" | |
| 52 help="FASTA file with original sequences before BLAST annotation"/> | |
| 53 | |
| 54 <!-- Output Selection: the option values below are the names tested in the command template and in the output filters --> | |
| 55 <param name="outputs" type="select" multiple="true" display="checkboxes" | |
| 56 label="Select outputs to generate" help="Choose which analysis outputs to create"> | |
| 57 <option value="eval_plot">E-value distribution plot</option> | |
| 58 <option value="taxa_output">Taxonomic report (Kraken2-like format)</option> | |
| 59 <option value="circle_data">Circular taxonomic datafile</option> | |
| 60 <option value="header_anno">Header annotations table</option> | |
| 61 <option value="anno_stats">Annotation statistics</option> | |
| 62 </param> | |
| 63 | |
| 64 <!-- Processing Parameters --> | |
| 65 <section name="advanced" title="Advanced Parameters" expanded="false"> | |
| 66 <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0" | |
| 67 label="Uncertain threshold" | |
| 68 help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/> | |
| 69 | |
| 70 <param name="eval_threshold" type="float" value="1e-10" min="0" | |
| 71 label="E-value threshold" | |
| 72 help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/> | |
| 73 | |
| 74 <param name="use_counts" type="boolean" checked="true" | |
| 75 label="Use read counts in circular diagrams" | |
| 76 help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/> | |
| 77 </section> | |
| 78 </inputs> | |
| 79 | |
| 80 <!-- Every dataset below carries a filter, so it is only created when its name is among the values selected in the 'outputs' parameter. --> <outputs> | |
| 81 <!-- E-value Plot (PNG image) --> | |
| 82 <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}"> | |
| 83 <filter>outputs and 'eval_plot' in outputs</filter> | |
| 84 </data> | |
| 85 | |
| 86 <!-- Taxa Output Report (text) --> | |
| 87 <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}"> | |
| 88 <filter>outputs and 'taxa_output' in outputs</filter> | |
| 89 </data> | |
| 90 | |
| 91 <!-- Circular Taxonomy Data (a text data file, not a rendered diagram) --> | |
| 92 <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}"> | |
| 93 <filter>outputs and 'circle_data' in outputs</filter> | |
| 94 </data> | |
| 95 | |
| 96 <!-- Header Annotations (Excel workbook) --> | |
| 97 <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}"> | |
| 98 <filter>outputs and 'header_anno' in outputs</filter> | |
| 99 </data> | |
| 100 | |
| 101 <!-- Annotation Statistics (text) --> | |
| 102 <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}"> | |
| 103 <filter>outputs and 'anno_stats' in outputs</filter> | |
| 104 </data> | |
| 105 </outputs> | |
| 106 | |
| 107 <!-- Three functional tests; in each one the PNG plot, when requested, is compared with compare="sim_size" and the other outputs against reference files. --> <tests> | |
| 108 <test expect_num_outputs="5"> <!-- Test 1: curated input, all five outputs selected, advanced values set explicitly to 0.9 / 1e-10 / True --> | |
| 109 <param name="input_anno" value="input_test_curated_labels.tabular"/> | |
| 110 <param name="input_unanno" value="input_test_curated.fasta"/> | |
| 111 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/> | |
| 112 <output name="taxa_output" file="output_taxa_output.txt"/> | |
| 113 <output name="eval_plot" file="output_eval.png" compare="sim_size"/> | |
| 114 <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/> | |
| 115 <output name="anno_stats" file="output_anno_out.txt"/> | |
| 116 <output name="circle_data" file="output_circle_data.txt"/> | |
| 117 <section name="advanced"> | |
| 118 <param name="uncertain_threshold" value="0.9"/> | |
| 119 <param name="eval_threshold" value="1e-10"/> | |
| 120 <param name="use_counts" value="True"/> | |
| 121 </section> | |
| 122 </test> | |
| 123 <test expect_num_outputs="5"> <!-- Test 2: GenBank input, all five outputs selected, advanced parameters left unset in the test --> | |
| 124 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/> | |
| 125 <param name="input_unanno" value="galaxy_input_pre.fasta"/> | |
| 126 <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/> | |
| 127 <output name="taxa_output" file="output_genbank_taxa_output.txt"/> | |
| 128 <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/> | |
| 129 <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/> | |
| 130 <output name="anno_stats" file="output_genbank_anno_out.txt"/> | |
| 131 <output name="circle_data" file="output_genbank_circle_data.txt"/> | |
| 132 </test> | |
| 133 <test expect_num_outputs="3"> <!-- Test 3: GenBank input, only three outputs selected, thresholds changed to 0.8 / 1e-8 --> | |
| 134 <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/> | |
| 135 <param name="input_unanno" value="galaxy_input_pre.fasta"/> | |
| 136 <param name="outputs" value="circle_data,header_anno,anno_stats"/> | |
| 137 <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/> | |
| 138 <output name="anno_stats" file="output_advanced_anno_out.txt"/> | |
| 139 <output name="circle_data" file="advanced_circle_data.txt"/> | |
| 140 <section name="advanced"> | |
| 141 <param name="uncertain_threshold" value="0.8"/> | |
| 142 <param name="eval_threshold" value="1e-8"/> | |
| 143 <param name="use_counts" value="True"/> | |
| 144 </section> | |
| 145 </test> | |
| 146 | |
| 147 </tests> | |
| 148 | |
| 149 <help><![CDATA[ | |
| 150 **BLAST Annotation Processor** | |
| 151 | |
| 152 This tool processes BLAST annotation results and generates various quality control and visualization outputs. | |
| 153 | |
| 154 **Inputs:** | |
| 155 | |
| 156 - **Annotated BLAST output**: Tabular BLAST output file with taxonomic annotations. Expected format is standard BLAST tabular output with taxonomic information in the last column. | |
| 157 | |
| 158 - **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics. | |
| 159 | |
| 160 **Outputs:** | |
| 161 | |
| 162 - **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences. | |
| 163 | |
| 164 - **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments. | |
| 165 | |
| 166 - **Circular taxonomic data**: JSON data to generate a circular sunburst-style diagram showing taxonomic composition across all taxonomic levels (Kingdom -> Species). | |
| 167 | |
| 168 - **Header annotations table**: Excel workbook listing each sequence header with its taxonomic assignment and E-value. | |
| 169 | |
| 170 - **Annotation statistics**: Summary statistics about annotation success rates and sequence counts. | |
| 171 | |
| 172 **Parameters:** | |
| 173 | |
| 174 - **Uncertain threshold**: When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa". | |
| 175 | |
| 176 - **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis. | |
| 177 | |
| 178 - **Use read counts**: Determines whether circular data reflects the abundance of reads (checked) or only counts unique taxonomic assignments (unchecked). | |
| 179 #Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage | |
| 180 #Coverage #evalue #bitscore #Source #Taxonomy | |
| 181 **Expected Input Format:** | |
| 182 | |
| 183 The annotated BLAST file should be in tabular format with the following 10 columns: | |
| 184 1. Query ID | |
| 185 2. Subject ID | |
| 186 3. Subject accession | |
| 187 4. Subject Taxonomy ID | |
| 188 5. Identity percentage | |
| 189 6. Coverage | |
| 190 7. Evalue | |
| 191 8. Bitscore | |
| 192 9. Source | |
| 193 10. Taxonomy | |
| 194 | |
| 195 **Note:** This tool processes files that have been deduplicated and contain read count information in the sequence headers in the format: `sequence_name(count_number)`. | |
| 196 | |
| 197 **Credits** | |
| 198 Author: Onno de Gorter, 2025. | |
| 199 Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter. | |
| 200 Developed for the New light on old remedies project, a PhD research project by Anja Fischer. | |
| 201 ]]></help> | |
| 202 </tool> |
