Mercurial > repos > onnodg > blast_annotations_processor
diff blast_annotations_processor.xml @ 0:a3989edf0a4a draft
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit c944fd5685f295acba06679e85b67973c173b137
| author | onnodg |
|---|---|
| date | Tue, 14 Oct 2025 09:08:30 +0000 |
| parents | |
| children | 2acf82433aa4 |
line wrap: on
line diff
<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.0">
    <!--
        Galaxy wrapper around blast_annotations_processor.py (shipped in the tool directory).
        It takes an annotated BLAST tabular file plus the original FASTA file and produces
        up to five optional outputs, chosen by the user through the "outputs" parameter.
    -->
    <description>Process BLAST annotation results with taxonomic analysis</description>

    <requirements>
        <requirement type="package" version="3.12.3">python</requirement>
        <requirement type="package" version="3.10.6">matplotlib</requirement>
        <requirement type="package" version="2.3.2">pandas</requirement>
        <requirement type="package" version="2.3.2">numpy</requirement>
        <requirement type="package" version="3.1.5">openpyxl</requirement>
    </requirements>

    <!-- A non-zero exit code of the script marks the job as failed. -->
    <command detect_errors="exit_code"><![CDATA[
## Each optional output flag is only passed when that output was ticked in the
## "outputs" select; the same selection drives the <filter> of every <data> element.
python '$__tool_directory__/blast_annotations_processor.py'
    --input-anno '$input_anno'
    --input-unanno '$input_unanno'

    #if $outputs and 'eval_plot' in $outputs
        --eval-plot '$eval_plot'
    #end if

    #if $outputs and 'taxa_output' in $outputs
        --taxa-output '$taxa_output'
    #end if

    #if $outputs and 'circle_data' in $outputs
        --circle-data '$circle_data'
    #end if

    #if $outputs and 'header_anno' in $outputs
        --header-anno '$header_anno'
    #end if

    #if $outputs and 'anno_stats' in $outputs
        --anno-stats '$anno_stats'
    #end if

    --uncertain-threshold '$advanced.uncertain_threshold'
    --eval-threshold '$advanced.eval_threshold'
    #if $advanced.use_counts
        --use-counts
    #end if
    ]]></command>

    <inputs>
        <!-- Required input files -->
        <param name="input_anno" type="data" format="tabular"
               label="Annotated BLAST output file"
               help="Tabular BLAST output with taxonomic annotations"/>

        <param name="input_unanno" type="data" format="fasta"
               label="Original unannotated sequences"
               help="FASTA file with original sequences before BLAST annotation"/>

        <!-- Output selection: every ticked value enables one command flag and one output dataset -->
        <param name="outputs" type="select" multiple="true" display="checkboxes"
               label="Select outputs to generate"
               help="Choose which analysis outputs to create">
            <option value="eval_plot">E-value distribution plot</option>
            <option value="taxa_output">Taxonomic report (Kraken2-like format)</option>
            <option value="circle_data">Circular taxonomic datafile</option>
            <option value="header_anno">Header annotations table</option>
            <option value="anno_stats">Annotation statistics</option>
        </param>

        <!-- Processing parameters -->
        <section name="advanced" title="Advanced Parameters" expanded="false">
            <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0"
                   label="Uncertain threshold"
                   help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/>

            <param name="eval_threshold" type="float" value="1e-10" min="0"
                   label="E-value threshold"
                   help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/>

            <param name="use_counts" type="boolean" checked="true"
                   label="Use read counts in circular diagrams"
                   help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/>
        </section>
    </inputs>

    <outputs>
        <!-- E-value plot -->
        <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}">
            <filter>outputs and 'eval_plot' in outputs</filter>
        </data>

        <!-- Taxonomic report -->
        <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}">
            <filter>outputs and 'taxa_output' in outputs</filter>
        </data>

        <!-- Circular taxonomy data (a data file, not a rendered diagram) -->
        <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}">
            <filter>outputs and 'circle_data' in outputs</filter>
        </data>

        <!-- Header annotations workbook -->
        <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}">
            <filter>outputs and 'header_anno' in outputs</filter>
        </data>

        <!-- Annotation statistics -->
        <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}">
            <filter>outputs and 'anno_stats' in outputs</filter>
        </data>
    </outputs>

    <tests>
        <!-- All five outputs, advanced parameters set explicitly to their default values -->
        <test expect_num_outputs="5">
            <param name="input_anno" value="input_test_curated_labels.tabular"/>
            <param name="input_unanno" value="input_test_curated.fasta"/>
            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
            <section name="advanced">
                <param name="uncertain_threshold" value="0.9"/>
                <param name="eval_threshold" value="1e-10"/>
                <param name="use_counts" value="true"/>
            </section>
            <output name="taxa_output" file="output_taxa_output.txt"/>
            <output name="eval_plot" file="output_eval.png" compare="sim_size"/>
            <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_anno_out.txt"/>
            <output name="circle_data" file="output_circle_data.txt"/>
        </test>

        <!-- All five outputs on the GenBank test input, advanced parameters left untouched -->
        <test expect_num_outputs="5">
            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
            <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
            <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
            <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_genbank_anno_out.txt"/>
            <output name="circle_data" file="output_genbank_circle_data.txt"/>
        </test>

        <!-- Subset of three outputs with non-default thresholds -->
        <test expect_num_outputs="3">
            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
            <param name="outputs" value="circle_data,header_anno,anno_stats"/>
            <section name="advanced">
                <param name="uncertain_threshold" value="0.8"/>
                <param name="eval_threshold" value="1e-8"/>
                <param name="use_counts" value="true"/>
            </section>
            <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_advanced_anno_out.txt"/>
            <output name="circle_data" file="advanced_circle_data.txt"/>
        </test>
    </tests>

    <help><![CDATA[
**BLAST Annotation Processor**

This tool processes BLAST annotation results and generates various quality control and visualization outputs.

**Inputs:**

- **Annotated BLAST output**: Tabular BLAST output file with taxonomic annotations. Expected format is standard BLAST tabular output with taxonomic information in the last column.

- **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics.

**Outputs:**

- **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences.

- **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments.

- **Circular taxonomic data**: JSON data to generate a circular sunburst-style diagram showing taxonomic composition across all taxonomic levels (Kingdom -> Species).

- **Header annotations table**: Excel workbook listing each sequence header with its taxonomic assignment and E-value.

- **Annotation statistics**: Summary statistics about annotation success rates and sequence counts.

**Parameters:**

- **Uncertain threshold**: When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa".

- **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis.

- **Use read counts**: Determines whether circular data reflects the abundance of reads (checked) or only counts unique taxonomic assignments (unchecked).

**Expected Input Format:**

The annotated BLAST file should be in tabular format with the following columns, in this order (the taxonomy must be in the last column):

1. Query ID
2. Subject ID
3. Subject accession
4. Subject Taxonomy ID
5. Identity percentage
6. Coverage
7. Evalue
8. Bitscore
9. Source
10. Taxonomy

Column names, written as a header line::

    #Query ID #Subject #Subject accession #Subject Taxonomy ID #Identity percentage #Coverage #evalue #bitscore #Source #Taxonomy

**Note:** This tool processes files that have been deduplicated and contain read count information in the sequence headers in the format: `sequence_name(count_number)`.

**Credits**

Authors = Onno de Gorter, 2025.
Based on a script by Nick Kortleven, translated, modified and wrapped by Onno de Gorter.
Developed for the New light on old remedies project, a PhD research by Anja Fischer.
    ]]></help>
</tool>
