<!--
view kodoja_search.xml @ 3:d4111d1de76f draft default tip

v0.0.8, expose kodoja_VRL.tsv output
author peterjc
date Fri, 14 Sep 2018 09:55:56 -0400
parents ee917702dbd8
children
line wrap: on
line source
-->

<tool id="kodoja_search" name="Kodoja database search" version="0.0.8">
    <description>identify viruses from plant RNA sequencing data</description>
    <!-- Software dependency: the kodoja package, pinned to the same version
         number that the tool element declares for this wrapper. -->
    <requirements>
        <requirement type="package" version="0.0.8">kodoja</requirement>
    </requirements>
    <!-- Command used to report the version of the underlying script. -->
    <version_command>kodoja_search.py --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
## Build and run the kodoja_search.py command line, then move its fixed-name
## result files onto the Galaxy output datasets.
##
## Step 1: resolve the Kraken database directory.
## This if statement is for backward compatibility as early versions of the Kraken
## wrapper assumed the UI facing field name was also part of the directory path
if [ -d '${kraken_db.fields.path}/${kraken_db.fields.name}' ]; then export KRAKEN_DEFAULT_DB='${kraken_db.fields.path}/${kraken_db.fields.name}'; else export KRAKEN_DEFAULT_DB='${kraken_db.fields.path}'; fi &&
kodoja_search.py
## Thread count follows the job's GALAXY_SLOTS allocation, defaulting to 4 if unset.
-t="\${GALAXY_SLOTS:-4}"
--kraken_db "\$KRAKEN_DEFAULT_DB"
--kaiju_db '${kaiju_db.fields.path}'

## Step 2: describe the reads. The data format is derived from the datatype of
## the forward (or only) read dataset. Parameters that live inside the
## single_paired conditional are always addressed through it, rather than by
## their bare names.
#if $single_paired.single_paired_selector == 'yes':
    #if $single_paired.forward_input.is_of_type('fastq'):
        --data_format fastq
    #else:
        --data_format fasta
    #end if
    --read1 '${single_paired.forward_input}'
    --read2 '${single_paired.reverse_input}'
#else:
    #if $single_paired.input_sequences.is_of_type('fastq'):
        --data_format fastq
    #else:
        --data_format fasta
    #end if
    --read1 '${single_paired.input_sequences}'
#end if

## TODO:
## -m min_trim
## -a trim_adapt
## -q kraken_quick
## -p kraken_preload
## -c kaiju_score
## -l kaiju_minlen
## -i kaiju_mismatch

## Step 3: write into the job working directory.
## We'll capture predictably named output files from here:
-o .
&&
mv ./virus_table.txt '$combined_table'
## The per-read table is only kept when the user asked for it.
#if $capture_reads_table:
&&
mv ./kodoja_VRL.txt '$reads_table'
#end if
]]></command>
    <inputs>
        <!-- Reference databases are picked from the kraken_databases and
             kaiju_databases data tables; each validator supplies the message
             shown when its table offers no options. -->
        <param name="kraken_db" type="select" label="Select a Kraken database">
            <options from_data_table="kraken_databases">
                <validator type="no_options" message="No Kraken database is available" />
            </options>
        </param>
        <param name="kaiju_db" type="select" label="Select a Kaiju database">
            <options from_data_table="kaiju_databases">
                <validator type="no_options" message="No Kaiju database is available" />
            </options>
        </param>
        <!-- Read layout: the selector value decides which dataset parameters
             are shown. Single reads is the preselected choice. -->
        <conditional name="single_paired">
            <param name="single_paired_selector" type="select" label="Single or paired reads">
                <!-- TODO?
                <option value="collection">Collection</option>
                -->
                <option value="yes">Paired</option>
                <option value="no" selected="True">Single</option>
            </param>
            <when value="yes">
                <param name="forward_input" type="data" format="fasta,fastq" label="Forward strand" help="FASTA or FASTQ dataset"/>
                <param name="reverse_input" type="data" format="fasta,fastq" label="Reverse strand" help="FASTA or FASTQ dataset"/>
            </when>
            <when value="no">
                <param name="input_sequences" type="data" format="fasta,fastq" label="Input sequences" help="FASTA or FASTQ datasets"/>
            </when>
        </conditional>
        <!-- Unchecked by default. The default state of a boolean parameter is
             declared with the checked attribute. -->
        <param name="capture_reads_table" type="boolean" checked="false" label="Capture read assignment table" help="This table can be used to filter out reads matched to (individual) viruses"/>
    </inputs>
    <outputs>
        <!-- Species report; receives virus_table.txt from the command. -->
        <data format="tabular" name="combined_table" label="Kodoja species report for ${on_string}"/>
        <!-- Read assignment table; only created when capture_reads_table is set. -->
        <data format="tabular" name="reads_table" label="Kodoja read assignment for ${on_string}">
            <filter>capture_reads_table</filter>
        </data>
    </outputs>
    <tests>
        <!-- Single reads, FASTQ input. -->
        <test>
            <param name="kraken_db" value="kraken3viruses"/>
            <param name="kaiju_db" value="kaiju3viruses"/>
            <param name="single_paired_selector" value="no"/>
            <param name="input_sequences" ftype="fastq" value="testData_1.fastq"/>
            <output name="combined_table" ftype="tabular" file="virus_table_SE_fastq.tabular"/>
        </test>
        <!-- Paired reads, FASTQ input. -->
        <test>
            <param name="kraken_db" value="kraken3viruses"/>
            <param name="kaiju_db" value="kaiju3viruses"/>
            <param name="single_paired_selector" value="yes"/>
            <param name="forward_input" ftype="fastq" value="testData_1.fastq"/>
            <param name="reverse_input" ftype="fastq" value="testData_2.fastq"/>
            <output name="combined_table" ftype="tabular" file="virus_table_PE_fastq.tabular"/>
        </test>
        <!-- Paired reads, FASTQ input, with the read assignment table captured as well. -->
        <test>
            <param name="kraken_db" value="kraken3viruses"/>
            <param name="kaiju_db" value="kaiju3viruses"/>
            <param name="single_paired_selector" value="yes"/>
            <param name="forward_input" ftype="fastq" value="testData_1.fastq"/>
            <param name="reverse_input" ftype="fastq" value="testData_2.fastq"/>
            <param name="capture_reads_table" value="true"/>
            <output name="combined_table" ftype="tabular" file="virus_table_PE_fastq.tabular"/>
            <output name="reads_table" ftype="tabular" file="read_table_PE_fastq.tabular"/>
        </test>
        <!-- Paired reads, FASTA input. -->
        <test>
            <param name="kraken_db" value="kraken3viruses"/>
            <param name="kaiju_db" value="kaiju3viruses"/>
            <param name="single_paired_selector" value="yes"/>
            <param name="forward_input" ftype="fasta" value="testData_1.fasta"/>
            <param name="reverse_input" ftype="fasta" value="testData_2.fasta"/>
            <output name="combined_table" ftype="tabular" file="virus_table_PE_fasta.tabular"/>
        </test>
    </tests>
    <help><![CDATA[
Kodoja is a tool intended to identify viral sequences in a
FASTQ/FASTA sequencing run by matching them against both
Kraken and Kaiju databases.

The main output is a tab-separated table (tabular format in Galaxy)
with the following columns:

1. Species name
2. Species NCBI taxonomy identifier (TaxID)
3. Number of reads assigned by *either* Kraken or Kaiju to this species
4. Number of reads assigned by *both* Kraken and Kaiju to this species
5. Genus name
6. Number of reads assigned by *either* Kraken or Kaiju to this genus
7. Number of reads assigned by *both* Kraken and Kaiju to this genus

The counts in columns 6 and 7 are for reads assigned to that genus, but not
to any species within it.

For example,

================================== ============= ================= ============================= ========== =============== ===========================
Species                            Species TaxID Species sequences Species sequences (stringent) Genus      Genus sequences Genus sequences (stringent)
---------------------------------- ------------- ----------------- ----------------------------- ---------- --------------- ---------------------------
Cassava brown streak virus                137758                45                            45 Ipomovirus               0                           0
Ugandan cassava brown streak virus        946046                28                            28 Ipomovirus               0                           0
Tobacco etch virus                         12227                21                            19 Potyvirus                0                           0
================================== ============= ================= ============================= ========== =============== ===========================

The second most important output, which you can optionally capture
for use within Galaxy, is a per-read table summarising matches found
with Kraken and/or Kaiju. The Kodoja Retrieve tool is not currently
available within Galaxy, but you can instead use this file directly
within Galaxy to filter out just the virus reads, or even reads
matched to a specific taxid. See for example ``seq_filter_by_id``
which is available via the Galaxy Tool Shed:

http://toolshed.g2.bx.psu.edu/view/peterjc/seq_filter_by_id
https://github.com/peterjc/pico_galaxy/tree/master/tools/seq_filter_by_id

The number of threads is set automatically by Galaxy. The Kodoja Search
command line tool offers additional options not currently exposed in
Galaxy, including::

      -s, --host_subset     Subset host sequences before Kaiju
      -m TRIM_MINLEN, --trim_minlen TRIM_MINLEN
                            Trimmomatic minimum length
      -a TRIM_ADAPT, --trim_adapt TRIM_ADAPT
                            Illumina adapter sequence file
      -q KRAKEN_QUICK, --kraken_quick KRAKEN_QUICK
                            Number of minimum hits by Kraken
      -p, --kraken_preload  Kraken preload database
      -c KAIJU_SCORE, --kaiju_score KAIJU_SCORE
                            Kaiju alignment score
      -l KAIJU_MINLEN, --kaiju_minlen KAIJU_MINLEN
                            Kaiju minimum length
      -i KAIJU_MISMATCH, --kaiju_mismatch KAIJU_MISMATCH
                            Kaiju allowed mismatches

For more information, please see the Kodoja manual
https://github.com/abaizan/kodoja/wiki/Kodoja-Manual
    ]]></help>
    <citations>
        <!-- BibTeX entry pointing at the Kodoja GitHub repository. -->
        <citation type="bibtex">
@misc{githubkodoja,
  author = {Baizan Edge, Amanda},
  year = {2018},
  title = {Kodoja},
  publisher = {GitHub},
  journal = {GitHub repository},
  url = {https://github.com/abaizan/kodoja},
}</citation>
    </citations>
</tool>