Mercurial > repos > onnodg > blast_annotations_processor

diff blast_annotations_processor.xml @ 2:9ca209477dfd draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
author: onnodg
date: Mon, 15 Dec 2025 16:43:36 +0000
parents: 2acf82433aa4
--- a/blast_annotations_processor.xml	Mon Oct 20 12:26:51 2025 +0000
+++ b/blast_annotations_processor.xml	Mon Dec 15 16:43:36 2025 +0000
@@ -1,4 +1,4 @@
-<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.1">
+<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="2.0.0">
     <description>Process BLAST annotation results with taxonomic analysis</description>
 
     <requirements>
@@ -13,7 +13,7 @@
 python '$__tool_directory__/blast_annotations_processor.py'
             --input-anno '$input_anno'
             --input-unanno '$input_unanno'
-
+            --filtered-fasta '$filtered_fasta'
             #if $outputs and 'eval_plot' in $outputs
                 --eval-plot '$eval_plot'
             #end if
@@ -29,20 +29,36 @@
             #if $outputs and 'header_anno' in $outputs
                 --header-anno '$header_anno'
             #end if
-
-            #if $outputs and 'anno_stats' in $outputs
-                --anno-stats '$anno_stats'
-            #end if
+            --log '$log'
 
             --uncertain-threshold $advanced.uncertain_threshold
-            --eval-threshold $advanced.eval_threshold
+            --eval-threshold $advanced.blast.eval_threshold
+            --min-identity $advanced.blast.min_identity
+            --min-coverage $advanced.blast.min_coverage
+            --min-bitscore $advanced.blast.min_bitscore
+            --bitscore-perc-cutoff $advanced.blast.bitscore_perc_cutoff
+            --min-support $advanced.fasta.min_support
+            #if $advanced.blast.ignore_seqids != ""
+                --ignore-seqids '$advanced.blast.ignore_seqids'
+            #end if
+            #if $advanced.blast.ignore_rank != ""
+                --ignore-rank '$advanced.blast.ignore_rank'
+            #end if
+            #if $advanced.blast.ignore_taxonomy != ""
+                --ignore-taxonomy '$advanced.blast.ignore_taxonomy'
+            #end if
+            #if $advanced.fasta.ignore_obiclean_type != ""
+                --ignore-obiclean-type '$advanced.fasta.ignore_obiclean_type'
+            #end if
+            #if $advanced.fasta.ignore_illuminapairend_type != ""
+                --ignore-illuminapairend-type '$advanced.fasta.ignore_illuminapairend_type'
+            #end if
             #if $advanced.use_counts
                 --use-counts
             #end if
     ]]></command>
 
     <inputs>
-        <!-- Required Input Files -->
         <param name="input_anno" type="data" format="tabular"
                label="Annotated BLAST output file"
                help="Tabular BLAST output with taxonomic annotations"/>
@@ -51,99 +67,183 @@
                label="Original unannotated sequences"
                help="FASTA file with original sequences before BLAST annotation"/>
 
-        <!-- Output Selection -->
         <param name="outputs" type="select" multiple="true" display="checkboxes"
                label="Select outputs to generate" help="Choose which analysis outputs to create">
             <option value="eval_plot">E-value distribution plot</option>
             <option value="taxa_output">Taxonomic report (Kraken2-like format)</option>
             <option value="circle_data">Circular taxonomic datafile</option>
             <option value="header_anno">Annotations per header (in Excel)</option>
-            <option value="anno_stats">Annotation statistics</option>
         </param>
 
-        <!-- Processing Parameters -->
         <section name="advanced" title="Advanced Parameters" expanded="false">
             <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0"
-                   label="Uncertain threshold"
+                   label="Uncertainty threshold"
                    help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/>
-
-            <param name="eval_threshold" type="float" value="1e-10" min="0"
-                   label="E-value threshold"
-                   help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/>
-
             <param name="use_counts" type="boolean" checked="true"
                    label="Use read counts in circular diagrams"
                    help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/>
+            <section name="fasta" title="Fasta filters" expanded="false">
+                <param name="min_support" type="integer" value="1" min="1" max="1000" label="Minimum support"
+                help="The minimum number of times a read should occur before dereplication"/>
+                <param name="ignore_obiclean_type" type="text" value="singleton" label="Ignore obiclean type"
+                help="The tool skips reads that are flagged as this obiclean type, options are: singleton,variant,head. Values must be comma-separated"/>
+                <param name="ignore_illuminapairend_type" type="text" value="pairend" label="Ignore R1-R2 merge failure"
+                help="The tool skips reads that are flagged as this illuminapairend type, options are: pairend,consensus. Values must be comma-separated"/>
+            </section>
+            <section name="blast" title="Blast filters" expanded="false">
+                <param name="min_identity" type="integer" value="80" min="1" max="100" label="Minimum identity"/>
+                <param name="min_coverage" type="integer" value="70" min="1" max="100" label="Minimum coverage"/>
+                <param name="min_bitscore" type="integer" value="100" min="1" max="1000" label="Minimum bitscore"/>
+                <param name="bitscore_perc_cutoff" type="float" value="8" min="0" max="100" label="Top bitscore percentage cutoff"
+                help="The percentage by which a hit's bitscore may be lower than the top bitscore and still be considered. Set the value to 0 to disable this filter"/>
+                <param name="eval_threshold" type="text" value="1e-10" label="E-value threshold"/>
+                <param name="ignore_seqids" type="text" value="" label="Ignore sequence identifiers"
+                help="The tool skips hits that have these sequence identifiers. Values must be comma-separated"/>
+                <param name="ignore_rank" type="text" value="unknown" label="Ignore rank when containing:"
+                help="The tool skips hits that have this string in taxonomy ranks. Values must be comma-separated"/>
+                <param name="ignore_taxonomy" type="text" value="environmental" label="Ignore taxonomy when containing:"
+                help="The tool skips hits that have this string as taxonomy. Values must be comma-separated"/>
+            </section>
         </section>
     </inputs>
 
     <outputs>
-        <!-- E-value Plot -->
         <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}">
             <filter>outputs and 'eval_plot' in outputs</filter>
         </data>
-    
-        <!-- Taxa Output Report -->
         <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}">
             <filter>outputs and 'taxa_output' in outputs</filter>
         </data>
-
-        <!-- Circular Taxonomy Diagram -->
         <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}">
             <filter>outputs and 'circle_data' in outputs</filter>
         </data>
-
-        <!-- Header Annotations -->
         <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}">
             <filter>outputs and 'header_anno' in outputs</filter>
         </data>
-
-        <!-- Annotation Statistics -->
-        <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}">
-            <filter>outputs and 'anno_stats' in outputs</filter>
-        </data>
+        <data name="log" format="txt" label="log on ${on_string}"/>
+        <data name="filtered_fasta" format="fasta" label="Filtered fasta on ${on_string}"/>
     </outputs>
 
     <tests>
-        <test expect_num_outputs="5">
-            <param name="input_anno" value="input_test_curated_labels.tabular"/>
-            <param name="input_unanno" value="input_test_curated.fasta"/>
-            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
-            <output name="taxa_output" file="output_taxa_output.txt"/>
-            <output name="eval_plot" file="output_eval.png" compare="sim_size"/>
-            <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/>
-            <output name="anno_stats" file="output_anno_out.txt"/>
-            <output name="circle_data" file="output_circle_data.txt"/>
+        <test expect_num_outputs="6">
+            <param name="input_anno" value="test_curated_nov_blast_headers.tabular"/>
+            <param name="input_unanno" value="test_curated_nov.fasta"/>
+            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/>
+            <output name="eval_plot" file="test_nov_eval.png" compare="sim_size"/>
+            <output name="taxa_output" file="test_curated_nova_taxa_output.txt"/>
+            <output name="circle_data" file="test_curated_nova.txt"/>
+            <output name="header_anno" file="test_curated_nova_header_anno_excel.xlsx" decompress="true"/>
+            <output name="log" file="test_curated_nova_anno_out.txt"/>
+            <output name="filtered_fasta" file="test_curated_nov_filtered.fasta"/>
             <section name="advanced">
-               <param name="uncertain_threshold" value="0.9"/>
-               <param name="eval_threshold" value="1e-10"/>
-               <param name="use_counts" value="True"/>
-            </section>
-        </test>
-        <test expect_num_outputs="5">
-            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
-            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
-            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/>
-            <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
-            <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
-            <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
-            <output name="anno_stats" file="output_genbank_anno_out.txt"/>
-            <output name="circle_data" file="output_genbank_circle_data.txt"/>
-        </test>
-        <test expect_num_outputs="3">
-            <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/>
-            <param name="input_unanno" value="galaxy_input_pre.fasta"/>
-            <param name="outputs" value="circle_data,header_anno,anno_stats"/>
-            <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
-            <output name="anno_stats" file="output_advanced_anno_out.txt"/>
-            <output name="circle_data" file="advanced_circle_data.txt"/>
-            <section name="advanced">
-               <param name="uncertain_threshold" value="0.8"/>
-               <param name="eval_threshold" value="1e-8"/>
-               <param name="use_counts" value="True"/>
+                <param name="uncertain_threshold" value="0.9"/>
+                <param name="use_counts" value="True"/>
+                <section name="fasta">
+                    <param name="min_support" value="1"/>
+                    <param name="ignore_obiclean_type" value="singleton"/>
+                    <param name="ignore_illuminapairend_type" value="pairend"/>
+                </section>
+                <section name="blast">
+                    <param name="min_identity" value="80"/>
+                    <param name="min_coverage" value="70"/>
+                    <param name="min_bitscore" value="100"/>
+                    <param name="bitscore_perc_cutoff" value="8"/>
+                    <param name="eval_threshold" value="1e-10"/>
+                    <param name="ignore_seqids" value=""/>
+                    <param name="ignore_rank" value="unknown"/>
+                    <param name="ignore_taxonomy" value="environmental"/>
+                </section>
             </section>
         </test>
 
+        <test expect_num_outputs="6">
+            <param name="input_anno" value="test_genbank_nov_blast.tabular"/>
+            <param name="input_unanno" value="test_genbank_nov.fasta"/>
+            <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/>
+            <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/>
+            <output name="taxa_output" file="output_genbank_taxa_output.txt"/>
+            <output name="circle_data" file="output_genbank_circle_data.txt"/>
+            <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/>
+            <output name="log" file="output_genbank_anno_out.txt"/>
+            <output name="filtered_fasta" file="genbank_filtered.fasta"/>
+            <section name="advanced">
+                <param name="uncertain_threshold" value="0.9"/>
+                <param name="use_counts" value="True"/>
+                <section name="fasta">
+                    <param name="min_support" value="1"/>
+                    <param name="ignore_obiclean_type" value="singleton"/>
+                    <param name="ignore_illuminapairend_type" value="pairend"/>
+                </section>
+                <section name="blast">
+                    <param name="min_identity" value="80"/>
+                    <param name="min_coverage" value="70"/>
+                    <param name="min_bitscore" value="100"/>
+                    <param name="bitscore_perc_cutoff" value="8"/>
+                    <param name="eval_threshold" value="1e-10"/>
+                    <param name="ignore_seqids" value=""/>
+                    <param name="ignore_rank" value="unknown"/>
+                    <param name="ignore_taxonomy" value="environmental"/>
+                </section>
+            </section>
+        </test>
+        <test expect_num_outputs="4">
+            <param name="input_anno" value="test_genbank_nov_blast.tabular"/>
+            <param name="input_unanno" value="test_genbank_nov.fasta"/>
+            <param name="outputs" value="circle_data,header_anno"/>
+            <output name="circle_data" file="advanced_circle_data.txt"/>
+            <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/>
+            <output name="log" file="output_advanced_anno_out.txt"/>
+            <output name="filtered_fasta" file="advanced_filtered.fasta"/>
+            <section name="advanced">
+                <param name="uncertain_threshold" value="0.8"/>
+                <param name="use_counts" value="False"/>
+                <section name="fasta">
+                    <param name="min_support" value="2"/>
+                    <param name="ignore_obiclean_type" value="singleton,variant"/>
+                    <param name="ignore_illuminapairend_type" value="pairend"/>
+                </section>
+                <section name="blast">
+                    <param name="min_identity" value="70"/>
+                    <param name="min_coverage" value="60"/>
+                    <param name="min_bitscore" value="80"/>
+                    <param name="bitscore_perc_cutoff" value="0"/>
+                    <param name="eval_threshold" value="1e-8"/>
+                    <param name="ignore_seqids" value="NC_051949"/>
+                    <param name="ignore_rank" value="unknown"/>
+                    <param name="ignore_taxonomy" value="environmental"/>
+                </section>
+            </section>
+        </test>
+        <test expect_num_outputs="3">
+            <param name="input_anno" value="input_test_curated_labels.tabular"/>
+            <param name="input_unanno" value="input_test_curated.fasta"/>
+            <param name="outputs" value="header_anno"/>
+            <output name="header_anno" file="strict_header_anno.xlsx" decompress="true"/>
+            <output name="log" file="strict_anno_stats.txt" lines_diff="50"/>
+            <output name="filtered_fasta" file="strict_filtered.fasta"/>
+
+            <section name="advanced">
+                <param name="uncertain_threshold" value="0.95"/>
+                <param name="use_counts" value="False"/>
+
+                <section name="fasta">
+                    <param name="min_support" value="1"/>
+                    <param name="ignore_obiclean_type" value=""/>
+                    <param name="ignore_illuminapairend_type" value=""/>
+                </section>
+
+                <section name="blast">
+                    <param name="min_identity" value="98"/>
+                    <param name="min_coverage" value="95"/>
+                    <param name="min_bitscore" value="150"/>
+                    <param name="bitscore_perc_cutoff" value="0"/>
+                    <param name="eval_threshold" value="1e-20"/>
+                    <param name="ignore_seqids" value=""/>
+                    <param name="ignore_rank" value=""/>
+                    <param name="ignore_taxonomy" value=""/>
+                </section>
+            </section>
+        </test>
     </tests>
 
     <help><![CDATA[
@@ -159,6 +259,8 @@
 
 **Outputs:**
 
+- **Filtered fasta**: FASTA file containing all sequences that passed the FASTA filtering (obiclean, illuminapairend and minimum support filters).
+
 - **E-value distribution plot**: Visualization showing the distribution of E-values across all annotated sequences.
 
 - **Taxonomic report**: Kraken2-like format report showing taxonomic composition with read counts and percentages. Includes information about uncertain taxonomic assignments.
@@ -177,6 +279,29 @@
 
 - **Use read counts**: Determines whether circular data reflects the abundance of reads (checked) or just count unique taxonomic assignments (unchecked).
 
+- **E-value threshold**:  Maximum allowed E-value.
+
+- **Minimum identity (%)**
+
+- **Minimum coverage (%)**
+
+- **Minimum bitscore**
+
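+- **Top bitscore percentage cutoff (%)**: Relative cutoff vs. the best hit in a query; hits whose bitscore is more than this percentage below the top bitscore are not considered. Set to 0 to disable.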
+
+- **Minimum read support (FASTA)**: Only keep headers with at least N counts.
+
+- **Ignore OBIClean type**: Remove reads with this OBIClean category (singleton / variant / head).
+
+- **Ignore Illumina pairend type**: Remove reads based on pairend status.
+
+- **Ignore taxonomy keywords**: Skip hits whose taxonomic annotation contains these strings.
+
+- **Ignore sequence identifiers**: Remove BLAST hits whose subject/seq IDs match given list.
+
+- **Use counts**: Circular diagram uses abundance (checked) or uniqueness (unchecked).
+
+
 **Expected Input Format:**
 
 The annotated BLAST file should be in tabular format with at least 7 columns:
author	onnodg
date	Mon, 15 Dec 2025 16:43:36 +0000
parents	2acf82433aa4
children