view blast_annotations_processor.xml @ 1:2acf82433aa4 draft default tip

planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit d771f9fbfd42bcdeda1623d954550882a0863847-dirty
author onnodg
date Mon, 20 Oct 2025 12:26:51 +0000
parents a3989edf0a4a
children
line wrap: on
line source

<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.1">
    <description>Process BLAST annotation results with taxonomic analysis</description>

    <!-- Pinned package versions that make up the tool's execution environment.
         NOTE(review): presumably matplotlib renders the E-value plot and openpyxl
         writes the Excel workbook; confirm against blast_annotations_processor.py. -->
    <requirements>
        <requirement type="package" version="3.12.3">python</requirement>
        <requirement type="package" version="3.10.6">matplotlib</requirement>
        <requirement type="package" version="2.3.2">pandas</requirement>
        <requirement type="package" version="2.3.2">numpy</requirement>
        <requirement type="package" version="3.1.5">openpyxl</requirement>
    </requirements>

    <command detect_errors="exit_code"><![CDATA[
        ## Run the processor script that ships in the same directory as this wrapper.
        python '$__tool_directory__/blast_annotations_processor.py'
            --input-anno '$input_anno'
            --input-unanno '$input_unanno'

        ## One optional flag per output: a path is passed only when the matching
        ## value was ticked in the outputs select parameter.
        #if $outputs and 'eval_plot' in $outputs
            --eval-plot '$eval_plot'
        #end if
        #if $outputs and 'taxa_output' in $outputs
            --taxa-output '$taxa_output'
        #end if
        #if $outputs and 'circle_data' in $outputs
            --circle-data '$circle_data'
        #end if
        #if $outputs and 'header_anno' in $outputs
            --header-anno '$header_anno'
        #end if
        #if $outputs and 'anno_stats' in $outputs
            --anno-stats '$anno_stats'
        #end if

        ## Values from the advanced section; the boolean becomes a bare switch.
            --uncertain-threshold $advanced.uncertain_threshold
            --eval-threshold $advanced.eval_threshold
        #if $advanced.use_counts
            --use-counts
        #end if
    ]]></command>

    <inputs>
        <!-- Required input files -->
        <param name="input_anno" type="data" format="tabular"
               label="Annotated BLAST output file"
               help="Tabular BLAST output with taxonomic annotations"/>

        <param name="input_unanno" type="data" format="fasta"
               label="Original unannotated sequences"
               help="FASTA file with original sequences before BLAST annotation"/>

        <!-- Output selection.
             A multiple select is optional by default in Galaxy, which allowed a job
             to be submitted with nothing ticked and therefore no datasets produced.
             optional="false" makes Galaxy reject an empty selection up front. -->
        <param name="outputs" type="select" multiple="true" optional="false" display="checkboxes"
               label="Select outputs to generate"
               help="Choose which analysis outputs to create. At least one output must be selected">
            <option value="eval_plot">E-value distribution plot</option>
            <option value="taxa_output">Taxonomic report (Kraken2-like format)</option>
            <option value="circle_data">Circular taxonomic datafile</option>
            <option value="header_anno">Annotations per header (in Excel)</option>
            <option value="anno_stats">Annotation statistics</option>
        </param>

        <!-- Processing parameters, collapsed by default; each one has a default value -->
        <section name="advanced" title="Advanced Parameters" expanded="false">
            <!-- Fraction in the closed range 0.0 to 1.0 -->
            <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0"
                   label="Uncertain threshold"
                   help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/>

            <!-- Non-negative upper bound on the E-value of a hit -->
            <param name="eval_threshold" type="float" value="1e-10" min="0"
                   label="E-value threshold"
                   help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/>

            <!-- When checked, the command line receives the bare use-counts switch -->
            <param name="use_counts" type="boolean" checked="true"
                   label="Use read counts in circular diagrams"
                   help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/>
        </section>
    </inputs>

    <outputs>
        <!-- Every dataset below is conditional: its filter keeps the dataset only
             when the matching value is ticked in the "outputs" select parameter. -->

        <!-- E-value distribution plot (PNG image) -->
        <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}">
            <filter>outputs and 'eval_plot' in outputs</filter>
        </data>
    
        <!-- Taxonomic report (plain text) -->
        <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}">
            <filter>outputs and 'taxa_output' in outputs</filter>
        </data>

        <!-- Circular taxonomy data file (data for a diagram, not a rendered diagram).
             NOTE(review): the help section describes this content as JSON while the
             declared format is txt; confirm against the script before changing it. -->
        <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}">
            <filter>outputs and 'circle_data' in outputs</filter>
        </data>

        <!-- Per-header annotations (Excel workbook) -->
        <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}">
            <filter>outputs and 'header_anno' in outputs</filter>
        </data>

        <!-- Annotation statistics (plain text) -->
        <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}">
            <filter>outputs and 'anno_stats' in outputs</filter>
        </data>
    </outputs>

    <tests>
        <!-- Curated input, all five outputs, advanced parameters spelled out at their default values -->
        <test expect_num_outputs="5">
            <param name="input_anno" value="input_test_curated_labels.tabular"/>
            <param name="input_unanno" value="input_test_curated.fasta"/>
            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
            <section name="advanced">
                <param name="uncertain_threshold" value="0.9"/>
                <param name="eval_threshold" value="1e-10"/>
                <param name="use_counts" value="True"/>
            </section>
            <output name="eval_plot" file="output_eval.png" compare="sim_size"/>
            <output name="taxa_output" file="output_taxa_output.txt"/>
            <output name="circle_data" file="output_circle_data.txt"/>
            <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_anno_out.txt"/>
        </test>
        <!-- GenBank input, all five outputs, advanced section left untouched -->
        <test expect_num_outputs="5">
            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
            <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
            <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
            <output name="circle_data" file="output_genbank_circle_data.txt"/>
            <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_genbank_anno_out.txt"/>
        </test>
        <!-- GenBank input, three of the five outputs, non-default thresholds -->
        <test expect_num_outputs="3">
            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
            <param name="outputs" value="circle_data,header_anno,anno_stats"/>
            <section name="advanced">
                <param name="uncertain_threshold" value="0.8"/>
                <param name="eval_threshold" value="1e-8"/>
                <param name="use_counts" value="True"/>
            </section>
            <output name="circle_data" file="advanced_circle_data.txt"/>
            <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
            <output name="anno_stats" file="output_advanced_anno_out.txt"/>
        </test>
    </tests>

    <help><![CDATA[
**BLAST Annotation Processor**

This tool processes BLAST annotation results and generates various quality control and visualization outputs.

**Inputs:**

- **Annotated BLAST output**: Tabular BLAST output file with taxonomic annotations. Expected format is standard BLAST tabular output with taxonomic information in the last column.

- **Original unannotated sequences**: FASTA file containing the original sequences that were used for BLAST search. This is used to calculate annotation statistics.

**Outputs:**

- **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences.

- **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments.

- **Circular taxonomic data**: JSON data for generating a circular, sunburst-style diagram of the taxonomic composition across all taxonomic levels (Kingdom -> Species).

- **Annotations per header**: Excel workbook listing each sequence header with its taxonomic assignment and E-value.

- **Annotation statistics**: Summary statistics about annotation success rates and sequence counts.

**Parameters:**

- **Uncertain threshold**: Threshold for the LCA (lowest common ancestor) resolution. When multiple conflicting taxonomic assignments exist for a sequence, this threshold determines whether to use the most common assignment (if it exceeds the threshold) or mark it as "Uncertain taxa".

- **E-value threshold**: Sequences with E-values higher than this threshold are filtered out from the analysis.

- **Use read counts**: Determines whether the circular data reflects the abundance of reads (checked) or just counts unique taxonomic assignments (unchecked).

**Expected Input Format:**

The annotated BLAST file should be in tabular format with the following 10 columns:

- 1. Query ID

- 2. Subject ID

- 3. Subject accession

- 4. Subject Taxonomy ID

- 5. Identity percentage

- 6. Coverage

- 7. Evalue

- 8. Bitscore

- 9. Source

- 10. Taxonomy

**Note:** This tool processes files that have been deduplicated and contain read count information in the sequence headers in the format: `sequence_name(count_number)`.

-------------

.. class:: infomark

**Credits**

Based on a script by Nick Kortleven; translated, modified and wrapped by Onno de Gorter.
Developed for the New Lights on Old Remedies project, a PhD research project by Anja Fischer.

Link to the project website:

* https://ahm.uva.nl/funded-research-projects/new-lights-on-old-remedies/new-lights-on-old-remedies.html

    ]]></help>
    <!-- Attribution metadata: hosting organization and the contributing people -->
    <creator>
        <organization name="Naturalis Biodiversity Center"
                      url="https://www.naturalis.nl/en/science"/>
        <person givenName="Onno" familyName="de Gorter"
                url="https://github.com/Onnodg"/>
        <person givenName="Nick" familyName="Kortleven"
                url="https://github.com/tombkingsts"/>
    </creator>
</tool>