Mercurial > repos > onnodg > blast_annotations_processor
diff blast_annotations_processor.xml @ 2:9ca209477dfd draft default tip
planemo upload for repository https://github.com/Onnodg/Naturalis_NLOOR/tree/main/NLOOR_scripts/process_annotations_tool commit 4017d38cf327c48a6252e488ba792527dae97a70-dirty
| author | onnodg |
|---|---|
| date | Mon, 15 Dec 2025 16:43:36 +0000 |
| parents | 2acf82433aa4 |
| children |
line wrap: on
line diff
--- a/blast_annotations_processor.xml Mon Oct 20 12:26:51 2025 +0000 +++ b/blast_annotations_processor.xml Mon Dec 15 16:43:36 2025 +0000 @@ -1,4 +1,4 @@ -<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="1.0.1"> +<tool id="blast_annotation_processor" name="BLAST Annotation Processor" version="2.0.0"> <description>Process BLAST annotation results with taxonomic analysis</description> <requirements> @@ -13,7 +13,7 @@ python '$__tool_directory__/blast_annotations_processor.py' --input-anno '$input_anno' --input-unanno '$input_unanno' - + --filtered-fasta '$filtered_fasta' #if $outputs and 'eval_plot' in $outputs --eval-plot '$eval_plot' #end if @@ -29,20 +29,36 @@ #if $outputs and 'header_anno' in $outputs --header-anno '$header_anno' #end if - - #if $outputs and 'anno_stats' in $outputs - --anno-stats '$anno_stats' - #end if + --log '$log' --uncertain-threshold $advanced.uncertain_threshold - --eval-threshold $advanced.eval_threshold + --eval-threshold $advanced.blast.eval_threshold + --min-identity $advanced.blast.min_identity + --min-coverage $advanced.blast.min_coverage + --min-bitscore $advanced.blast.min_bitscore + --bitscore-perc-cutoff $advanced.blast.bitscore_perc_cutoff + --min-support $advanced.fasta.min_support + #if $advanced.blast.ignore_seqids != "" + --ignore-seqids '$advanced.blast.ignore_seqids' + #end if + #if $advanced.blast.ignore_rank != "" + --ignore-rank '$advanced.blast.ignore_rank' + #end if + #if $advanced.blast.ignore_taxonomy != "" + --ignore-taxonomy '$advanced.blast.ignore_taxonomy' + #end if + #if $advanced.fasta.ignore_obiclean_type != "" + --ignore-obiclean-type '$advanced.fasta.ignore_obiclean_type' + #end if + #if $advanced.fasta.ignore_illuminapairend_type != "" + --ignore-illuminapairend-type '$advanced.fasta.ignore_illuminapairend_type' + #end if #if $advanced.use_counts --use-counts #end if ]]></command> <inputs> - <!-- Required Input Files --> <param name="input_anno" type="data" format="tabular" label="Annotated BLAST output file" help="Tabular BLAST output with taxonomic annotations"/> @@ -51,99 +67,183 @@ label="Original unannotated sequences" help="FASTA file with original sequences before BLAST annotation"/> - <!-- Output Selection --> <param name="outputs" type="select" multiple="true" display="checkboxes" label="Select outputs to generate" help="Choose which analysis outputs to create"> <option value="eval_plot">E-value distribution plot</option> <option value="taxa_output">Taxonomic report (Kraken2-like format)</option> <option value="circle_data">Circular taxonomic datafile</option> <option value="header_anno">Annotations per header (in Excel)</option> - <option value="anno_stats">Annotation statistics</option> </param> - <!-- Processing Parameters --> <section name="advanced" title="Advanced Parameters" expanded="false"> <param name="uncertain_threshold" type="float" value="0.9" min="0.0" max="1.0" - label="Uncertain threshold" + label="Uncertainty threshold" help="Threshold for resolving taxonomic conflicts (0.0-1.0). If one taxon represents more than this fraction of reads, it will be used instead of 'Uncertain taxa'"/> - - <param name="eval_threshold" type="float" value="1e-10" min="0" - label="E-value threshold" - help="Maximum E-value to consider for annotations. Results with higher E-values will be filtered out"/> - <param name="use_counts" type="boolean" checked="true" label="Use read counts in circular diagrams" help="If checked, circular diagrams will reflect read abundance. If unchecked, only unique taxa are counted"/> + <section name="fasta" title="Fasta filters" expanded="false"> + <param name="min_support" type="integer" value="1" min="1" max="1000" label="Minimum support" + help="The minimum times a read should occur before dereplication"/> + <param name="ignore_obiclean_type" type="text" value="singleton" label="Ignore obiclean type" + help="The tool skips reads that are flagged as this obiclean type, options are: singleton,variant,head. Values must be comma seperated"/> + <param name="ignore_illuminapairend_type" type="text" value="pairend" label="Ignore R1-R2 merge failure" + help="The tool skips reads that are flagged as this illuminapairend type, options are: pairend,consensus. Values must be comma seperated"/> + </section> + <section name="blast" title="Blast filters" expanded="false"> + <param name="min_identity" type="integer" value="80" min="1" max="100" label="Minimum identity"/> + <param name="min_coverage" type="integer" value="70" min="1" max="100" label="Minimum coverage"/> + <param name="min_bitscore" type="integer" value="100" min="1" max="1000" label="Minimum bitscore"/> + <param name="bitscore_perc_cutoff" type="float" value="8" min="0" max="100" label="Top bitscore percentage cutoff" + help="The percentage that the bitscore can be lower than the top bitscore to still be considered. To disable this function put the value as 0"/> + <param name="eval_threshold" type="text" value="1e-10" label="E-value threshold"/> + <param name="ignore_seqids" type="text" value="" label="Ignore sequence identifiers" + help="The tool skips hits that have these sequence identifiers. Values must be comma seperated"/> + <param name="ignore_rank" type="text" value="unknown" label="Ignore rank when containing:" + help="The tool skips hits that have this string in taxonomy ranks. Values must be comma seperated"/> + <param name="ignore_taxonomy" type="text" value="environmental" label="Ignore taxonomy when containing:" + help="The tool skips hits that have this string as taxonomy. Values must be comma seperated"/> + </section> </section> </inputs> <outputs> - <!-- E-value Plot --> <data name="eval_plot" format="png" label="E-value distribution plot on ${on_string}"> <filter>outputs and 'eval_plot' in outputs</filter> </data> - - <!-- Taxa Output Report --> <data name="taxa_output" format="txt" label="Taxonomic report on ${on_string}"> <filter>outputs and 'taxa_output' in outputs</filter> </data> - - <!-- Circular Taxonomy Diagram --> <data name="circle_data" format="txt" label="Circular taxonomic data on ${on_string}"> <filter>outputs and 'circle_data' in outputs</filter> </data> - - <!-- Header Annotations --> <data name="header_anno" format="xlsx" label="Header annotations on ${on_string}"> <filter>outputs and 'header_anno' in outputs</filter> </data> - - <!-- Annotation Statistics --> - <data name="anno_stats" format="txt" label="Annotation statistics on ${on_string}"> - <filter>outputs and 'anno_stats' in outputs</filter> - </data> + <data name="log" format="txt" label="log on ${on_string}"/> + <data name="filtered_fasta" format="fasta" label="Filtered fasta on ${on_string}"/> </outputs> <tests> - <test expect_num_outputs="5"> - <param name="input_anno" value="input_test_curated_labels.tabular"/> - <param name="input_unanno" value="input_test_curated.fasta"/> - <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/> - <output name="taxa_output" file="output_taxa_output.txt"/> - <output name="eval_plot" file="output_eval.png" compare="sim_size"/> - <output name="header_anno" file="header_anno_excel.xlsx" decompress="true"/> - <output name="anno_stats" file="output_anno_out.txt"/> - <output name="circle_data" file="output_circle_data.txt"/> + <test expect_num_outputs="6"> + <param name="input_anno" value="test_curated_nov_blast_headers.tabular"/> + <param name="input_unanno" value="test_curated_nov.fasta"/> + <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/> + <output name="eval_plot" file="test_nov_eval.png" compare="sim_size"/> + <output name="taxa_output" file="test_curated_nova_taxa_output.txt"/> + <output name="circle_data" file="test_curated_nova.txt"/> + <output name="header_anno" file="test_curated_nova_header_anno_excel.xlsx" decompress="true"/> + <output name="log" file="test_curated_nova_anno_out.txt"/> + <output name="filtered_fasta" file="test_curated_nov_filtered.fasta"/> <section name="advanced"> - <param name="uncertain_threshold" value="0.9"/> - <param name="eval_threshold" value="1e-10"/> - <param name="use_counts" value="True"/> - </section> - </test> - <test expect_num_outputs="5"> - <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/> - <param name="input_unanno" value="galaxy_input_pre.fasta"/> - <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno,anno_stats"/> - <output name="taxa_output" file="output_genbank_taxa_output.txt"/> - <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/> - <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/> - <output name="anno_stats" file="output_genbank_anno_out.txt"/> - <output name="circle_data" file="output_genbank_circle_data.txt"/> - </test> - <test expect_num_outputs="3"> - <param name="input_anno" value="galaxy_input_genbank.fa.tabular"/> - <param name="input_unanno" value="galaxy_input_pre.fasta"/> - <param name="outputs" value="circle_data,header_anno,anno_stats"/> - <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/> - <output name="anno_stats" file="output_advanced_anno_out.txt"/> - <output name="circle_data" file="advanced_circle_data.txt"/> - <section name="advanced"> - <param name="uncertain_threshold" value="0.8"/> - <param name="eval_threshold" value="1e-8"/> - <param name="use_counts" value="True"/> + <param name="uncertain_threshold" value="0.9"/> + <param name="use_counts" value="True"/> + <section name="fasta"> + <param name="min_support" value="1"/> + <param name="ignore_obiclean_type" value="singleton"/> + <param name="ignore_illuminapairend_type" value="pairend"/> + </section> + <section name="blast"> + <param name="min_identity" value="80"/> + <param name="min_coverage" value="70"/> + <param name="min_bitscore" value="100"/> + <param name="bitscore_perc_cutoff" value="8"/> + <param name="eval_threshold" value="1e-10"/> + <param name="ignore_seqids" value=""/> + <param name="ignore_rank" value="unknown"/> + <param name="ignore_taxonomy" value="environmental"/> + </section> </section> </test> + <test expect_num_outputs="6"> + <param name="input_anno" value="test_genbank_nov_blast.tabular"/> + <param name="input_unanno" value="test_genbank_nov.fasta"/> + <param name="outputs" value="eval_plot,taxa_output,circle_data,header_anno"/> + <output name="eval_plot" file="output_genbank_eval.png" compare="sim_size"/> + <output name="taxa_output" file="output_genbank_taxa_output.txt"/> + <output name="circle_data" file="output_genbank_circle_data.txt"/> + <output name="header_anno" file="output_genbank_header_anno.xlsx" decompress="true"/> + <output name="log" file="output_genbank_anno_out.txt"/> + <output name="filtered_fasta" file="genbank_filtered.fasta"/> + <section name="advanced"> + <param name="uncertain_threshold" value="0.9"/> + <param name="use_counts" value="True"/> + <section name="fasta"> + <param name="min_support" value="1"/> + <param name="ignore_obiclean_type" value="singleton"/> + <param name="ignore_illuminapairend_type" value="pairend"/> + </section> + <section name="blast"> + <param name="min_identity" value="80"/> + <param name="min_coverage" value="70"/> + <param name="min_bitscore" value="100"/> + <param name="bitscore_perc_cutoff" value="8"/> + <param name="eval_threshold" value="1e-10"/> + <param name="ignore_seqids" value=""/> + <param name="ignore_rank" value="unknown"/> + <param name="ignore_taxonomy" value="environmental"/> + </section> + </section> + </test> + <test expect_num_outputs="4"> + <param name="input_anno" value="test_genbank_nov_blast.tabular"/> + <param name="input_unanno" value="test_genbank_nov.fasta"/> + <param name="outputs" value="circle_data,header_anno"/> + <output name="circle_data" file="advanced_circle_data.txt"/> + <output name="header_anno" file="output_advanced_header_anno.xlsx" decompress="true"/> + <output name="log" file="output_advanced_anno_out.txt"/> + <output name="filtered_fasta" file="advanced_filtered.fasta"/> + <section name="advanced"> + <param name="uncertain_threshold" value="0.8"/> + <param name="use_counts" value="False"/> + <section name="fasta"> + <param name="min_support" value="2"/> + <param name="ignore_obiclean_type" value="singleton,variant"/> + <param name="ignore_illuminapairend_type" value="pairend"/> + </section> + <section name="blast"> + <param name="min_identity" value="70"/> + <param name="min_coverage" value="60"/> + <param name="min_bitscore" value="80"/> + <param name="bitscore_perc_cutoff" value="0"/> + <param name="eval_threshold" value="1e-8"/> + <param name="ignore_seqids" value="NC_051949"/> + <param name="ignore_rank" value="unknown"/> + <param name="ignore_taxonomy" value="environmental"/> + </section> + </section> + </test> + <test expect_num_outputs="3"> + <param name="input_anno" value="input_test_curated_labels.tabular"/> + <param name="input_unanno" value="input_test_curated.fasta"/> + <param name="outputs" value="header_anno"/> + <output name="header_anno" file="strict_header_anno.xlsx" decompress="true"/> + <output name="log" file="strict_anno_stats.txt" lines_diff="50"/> + <output name="filtered_fasta" file="strict_filtered.fasta"/> + + <section name="advanced"> + <param name="uncertain_threshold" value="0.95"/> + <param name="use_counts" value="False"/> + + <section name="fasta"> + <param name="min_support" value="1"/> + <param name="ignore_obiclean_type" value=""/> + <param name="ignore_illuminapairend_type" value=""/> + </section> + + <section name="blast"> + <param name="min_identity" value="98"/> + <param name="min_coverage" value="95"/> + <param name="min_bitscore" value="150"/> + <param name="bitscore_perc_cutoff" value="0"/> + <param name="eval_threshold" value="1e-20"/> + <param name="ignore_seqids" value=""/> + <param name="ignore_rank" value=""/> + <param name="ignore_taxonomy" value=""/> + </section> + </section> + </test> </tests> <help><