diff bmtagger.xml @ 0:49a1cbbe5767 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/bmtagger commit e3d492d96b0ffe79370ca090b3f749b0869e8b60
author iuc
date Wed, 12 Nov 2025 12:03:46 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bmtagger.xml	Wed Nov 12 12:03:46 2025 +0000
@@ -0,0 +1,277 @@
+<tool id="bmtagger" name="bmtagger" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description>remove contaminant reads</description>
+    <macros>
+        <!-- Tokens substituted into the tool tag (version, profile) and the package requirement -->
+        <token name="@TOOL_VERSION@">3.101</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@PROFILE@">25.0</token>
+
+        <!-- Shared assertion: the checked output must have exactly 2668 lines -->
+        <xml name="assert">
+            <assert_contents>
+                <has_n_lines n="2668"/>
+            </assert_contents>
+        </xml>
+        <!-- One element of an output collection, checked with the shared assertion.
+             Takes the element name and ftype; decompress defaults to false. -->
+        <xml token_decompress="false" tokens="name,ftype" name="element_assert">
+            <element decompress="@DECOMPRESS@" ftype="@FTYPE@" name="@NAME@">
+                <expand macro="assert"/>
+            </element>
+        </xml>
+    </macros>
+    <!-- Cross-reference to the bio.tools entry of the wrapped software -->
+    <xrefs>
+        <xref type="bio.tools">bmtagger</xref>
+    </xrefs>
+    <!-- Package dependency, pinned to the @TOOL_VERSION@ token -->
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">bmtagger</requirement>
+    </requirements>
+    <!-- Reports the second space-separated field of the line(s) containing "version"
+         in the stdout of bmtagger.sh -V; stderr is discarded -->
+    <version_command><![CDATA[bmtagger.sh -V 2> /dev/null | grep version | cut -d" " -f2]]></version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Abort on the first failing command, including failures inside a
+        ## pipeline (the extracted reads may be piped through gzip below).
+        set -eo pipefail;
+
+        ## Template flags:
+        ##   gz      - reads are gzip compressed; outputs are compressed again
+        ##   fasta   - reads are FASTA (no qualities) instead of FASTQ
+        ##   testing - the hidden "test" parameter is set (tool tests only)
+        #set gz = False
+        #set fasta = False
+        #set testing = $test == "true"
+
+        ## Stage the reads under the fixed names "forward" / "reverse",
+        ## decompressing them if necessary.
+        #if $sequences.type == "single"
+            #if $sequences.reads.ext.startswith("fasta")
+                #set fasta = True
+            #end if
+            #if $sequences.reads.ext.endswith(".gz")
+                gunzip -c '$sequences.reads' > forward &&
+                #set gz = True
+            #else
+                ln -s '$sequences.reads' forward &&
+            #end if
+        #else
+            ## format and compression of both mates are derived from the forward mate
+            #if $sequences.reads.forward.ext.startswith("fasta")
+                #set fasta = True
+            #end if
+            #if $sequences.reads.forward.ext.endswith(".gz")
+                gunzip -c '$sequences.reads.forward' > forward &&
+                gunzip -c '$sequences.reads.reverse' > reverse &&
+                #set gz = True
+            #else
+                ln -s '$sequences.reads.forward' forward &&
+                ln -s '$sequences.reads.reverse' reverse &&
+            #end if
+        #end if
+
+        ## extract_fullseq has to be told the format of the reads; passing
+        ## -fastq unconditionally is wrong for FASTA input
+        #if $fasta
+            #set seq_format = "-fasta"
+        #else
+            #set seq_format = "-fastq"
+        #end if
+
+        ## Prepare the reference: "reference" is the common prefix of the
+        ## bitmask, srprism index and BLAST database handed to bmtagger.sh.
+        #if $host.source == "cached"
+            #set reference = $host.reference.fields.path
+            ## srprism test data is too large (>100MB) to store at IUC
+            ## hence we generate it on the fly for tool tests using the
+            ## fasta file which we keep in the path referred by the
+            ## data table (not needed otherwise)
+            #if $testing
+                srprism mkindex -i '${host.reference.fields.path}.fa' -o reference.srprism &&
+            #end if
+        #else
+            #if $host.sequence.ext == "fasta.gz"
+                gunzip -c '$host.sequence' > reference.fa &&
+            #else
+                ln -s '$host.sequence' reference.fa &&
+            #end if
+            ## bmtool creates multi GB file if used with default parameters
+            ## -> use much smaller word size for testing
+            bmtool -d reference.fa -o reference.bitmask -w #if $testing then 10 else 18 # &&
+            srprism mkindex -i reference.fa -o reference.srprism &&
+            makeblastdb -in reference.fa -dbtype nucl &&
+            #set reference = "reference"
+        #end if
+
+        ## Tag the reads; the result is written to host_ids.
+        bmtagger.sh
+            -q #if $fasta then 0 else 1#
+            -1 forward
+            #if $sequences.type == "paired"
+                -2 reverse
+            #end if
+            -b '${reference}.bitmask'
+            ## use the index built on the fly exactly when it was built above
+            #if $testing and $host.source == "cached"
+                -x reference.srprism
+            #else
+                -x '${reference}.srprism'
+            #end if
+            -d '${reference}'
+            -o host_ids
+            &&
+
+        ## Write the reads listed in host_ids (-keep), re-compressing them
+        ## when the input was compressed.
+        extract_fullseq host_ids -keep $seq_format
+        #if $sequences.type == "single"
+            -single
+        #else
+            -mate1
+        #end if
+        'forward'
+        #if $gz
+            | gzip -c
+        #end if
+        #if $sequences.type == "single"
+            > '$out_single'
+        #else
+            > '$out_pair.forward'
+            &&
+            extract_fullseq host_ids -keep $seq_format -mate2 'reverse'
+            #if $gz
+                | gzip -c
+            #end if
+            > '$out_pair.reverse'
+        #end if
+    ]]></command>
+    <inputs>
+        <!-- Reads to process: a single dataset or a paired collection -->
+        <conditional name="sequences">
+            <param type="select" name="type" label="Sequence type">
+                <option value="single">Single end data</option>
+                <option value="paired">Paired end data</option>
+            </param>
+            <when value="single">
+                <param name="reads"
+                       type="data"
+                       format="fasta,fasta.gz,fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz"
+                       label="Single end reads"/>
+            </when>
+            <when value="paired">
+                <param name="reads"
+                       type="data_collection"
+                       collection_type="paired"
+                       format="fasta,fasta.gz,fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz"
+                       label="Paired end reads"/>
+            </when>
+        </conditional>
+        <!-- Reference: an entry of the bmtagger data table or a FASTA dataset from the history -->
+        <conditional name="host">
+            <param type="select" name="source" label="Host data source">
+                <option value="cached">Precomputed indices</option>
+                <option value="history">Sequence from History</option>
+            </param>
+            <when value="cached">
+                <param type="select" name="reference" label="Reference">
+                    <options from_data_table="bmtagger">
+                        <filter column="2" type="sort_by"/>
+                        <validator message="No indexes are available for the selected input dataset" type="no_options"/>
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="sequence"
+                       type="data"
+                       format="fasta,fasta.gz"
+                       label="Host sequence"
+                       help="nucleotide sequence"/>
+            </when>
+        </conditional>
+        <!-- Hidden switch read by the command template; the tool tests set it to "true" -->
+        <param type="hidden" name="test"/>
+    </inputs>
+    <outputs>
+        <!-- Created only for single end input -->
+        <data label="${tool.name} on ${on_string}" format_source="reads" name="out_single">
+            <filter>sequences["type"] == "single"</filter>
+        </data>
+        <!-- Created only for paired end input; one dataset per mate -->
+        <collection label="${tool.name} on ${on_string}: pairs" type="paired" name="out_pair">
+            <data format_source="reads" name="forward"/>
+            <data format_source="reads" name="reverse"/>
+            <filter>sequences["type"] == "paired"</filter>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- All tests set the hidden "test" parameter to "true"; the command template
+             reads it to pick a smaller bmtool word size and to build the srprism index
+             for the cached reference on the fly. -->
+        <!-- single end uncompressed input, cached reference (host conditional left at its default) -->
+        <test expect_num_outputs="1">
+            <conditional name="sequences">
+                <param name="type" value="single"/>
+                <param name="reads" value="host_and_contaminant.fq1.fq" ftype="fastqsanger"/>
+            </conditional>
+            <param name="test" value="true"/>
+            <output name="out_single" ftype="fastqsanger">
+                <expand macro="assert"/>
+            </output>
+        </test>
+        <!-- paired uncompressed input, cached reference.
+             NOTE(review): the test collections are declared as paired_or_unpaired while the
+             input parameter declares collection_type="paired"; confirm this is intended. -->
+        <test expect_num_outputs="3">
+            <conditional name="sequences">
+                <param name="type" value="paired"/>
+                <param name="reads">
+                    <collection type="paired_or_unpaired" name="reads">
+                        <element name="forward" value="host_and_contaminant.fq1.fq" ftype="fastqsanger"/>
+                        <element name="reverse" value="host_and_contaminant.fq2.fq" ftype="fastqsanger"/>
+                    </collection>
+                </param>
+            </conditional>
+            <param name="test" value="true"/>
+            <output_collection name="out_pair" count="2">
+                <expand macro="element_assert" name="forward" ftype="fastqsanger"/>
+                <expand macro="element_assert" name="reverse" ftype="fastqsanger"/>
+            </output_collection>
+        </test>
+        <!-- paired gz input, cached reference; outputs are decompressed before the line count check -->
+        <test expect_num_outputs="3">
+            <conditional name="sequences">
+                <param name="type" value="paired"/>
+                <param name="reads">
+                    <collection type="paired_or_unpaired" name="reads">
+                        <element name="forward" value="host_and_contaminant.fq1.fq.gz" ftype="fastqsanger.gz"/>
+                        <element name="reverse" value="host_and_contaminant.fq2.fq.gz" ftype="fastqsanger.gz"/>
+                    </collection>
+                </param>
+            </conditional>
+            <param name="test" value="true"/>
+            <output_collection name="out_pair" count="2">
+                <expand macro="element_assert" name="forward" ftype="fastqsanger.gz" decompress="true"/>
+                <expand macro="element_assert" name="reverse" ftype="fastqsanger.gz" decompress="true"/>
+            </output_collection>
+        </test>
+
+        <!-- single end gz input, uncompressed fasta reference from the history -->
+        <test expect_num_outputs="1">
+            <conditional name="sequences">
+                <param name="type" value="single"/>
+                <param name="reads" value="host_and_contaminant.fq1.fq.gz" ftype="fastqsanger.gz"/>
+            </conditional>
+            <conditional name="host">
+                <param name="source" value="history"/>
+                <param name="sequence" value="host.fa" ftype="fasta"/>
+            </conditional>
+            <param name="test" value="true"/>
+            <output name="out_single" ftype="fastqsanger.gz" decompress="true">
+                <expand macro="assert"/>
+            </output>
+        </test>
+
+        <!-- paired gz input, gzipped fasta reference from the history -->
+        <test expect_num_outputs="3">
+            <conditional name="sequences">
+                <param name="type" value="paired"/>
+                <param name="reads">
+                    <collection type="paired_or_unpaired" name="reads">
+                        <element name="forward" value="host_and_contaminant.fq1.fq.gz" ftype="fastqsanger.gz"/>
+                        <element name="reverse" value="host_and_contaminant.fq2.fq.gz" ftype="fastqsanger.gz"/>
+                    </collection>
+                </param>
+            </conditional>
+            <conditional name="host">
+                <param name="source" value="history"/>
+                <param name="sequence" value="host.fa.gz" ftype="fasta.gz"/>
+            </conditional>
+            <param name="test" value="true"/>
+            <output_collection name="out_pair" count="2">
+                <expand macro="element_assert" name="forward" ftype="fastqsanger.gz" decompress="true"/>
+                <expand macro="element_assert" name="reverse" ftype="fastqsanger.gz" decompress="true"/>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+Filter contaminant sequences from input FASTA or FASTQ sequences.
+
+This is done by iteratively applying
+
+- bmfilter
+- srprism
+- blastn (megablast)
+
+Usage
+.....
+
+**Input**
+
+FASTA/FASTQ sequences and a reference database.
+
+**Output**
+
+FASTA/FASTQ sequences
+
+    ]]></help>
+    <!-- BibTeX entry for the BMTagger reference given below -->
+    <citations>
+        <citation type="bibtex">@article{rotmistrovsky2011bmtagger,
+            title={BMTagger: Best Match Tagger for removing human reads from metagenomics datasets},
+            author={Rotmistrovsky, Kirill and Agarwala, Richa},
+            journal={NCBI/NLM, National Institutes of Health},
+            year={2011}
+         }</citation>
+    </citations>
+</tool>
\ No newline at end of file