view vsnp_build_tables.xml @ 6:8a7fae0cccfc draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/vsnp commit 91a484f6994b0da7858876b17013d9547f080e2a"
author iuc
date Fri, 26 Feb 2021 20:18:27 +0000
parents b08cc87b2888
children e54b96acea98
line wrap: on
line source

<tool id="vsnp_build_tables" name="vSNP: build tables" version="@WRAPPER_VERSION@.2" profile="@PROFILE@">
    <description></description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <requirements>
        <requirement type="package" version="1.76">biopython</requirement>
        <requirement type="package" version="0.25.3">pandas</requirement>
        <requirement type="package" version="1.2.8">xlsxwriter</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
#import re

## Directory scanned by the "excel" output collection's
## discover_datasets element for the resulting xlsx files.
mkdir 'output_excel_dir' &&

## Each input is symlinked into the working directory under a
## name derived from its element identifier, with every character
## other than whitespace, word characters and hyphens replaced by
## an underscore.
## The input_snps_json and input_avg_mq_json identifiers
## are typically the same string, so we append a unique
## extension to each of them so that the two symlinks
## get distinct names.
#set input_snps_json_identifier = re.sub('[^\s\w\-]', '_', str($input_snps_json.element_identifier)) + '.snps'
ln -s '${input_snps_json}' '${input_snps_json_identifier}' &&
#set input_avg_mq_json_identifier = re.sub('[^\s\w\-]', '_', str($input_avg_mq_json.element_identifier)) + '.avg_mq'
ln -s '${input_avg_mq_json}' '${input_avg_mq_json_identifier}' &&
## No extension is appended to the newick link name.
#set input_newick_identifier = re.sub('[^\s\w\-]', '_', str($input_newick.element_identifier))
ln -s '${input_newick}' '${input_newick_identifier}' &&

python '$__tool_directory__/vsnp_build_tables.py'
--input_snps_json '${input_snps_json_identifier}'
--input_avg_mq_json '${input_avg_mq_json_identifier}'
--input_newick '${input_newick_identifier}'
## A Genbank file is passed only when annotation was requested:
## a cached selection supplies the path field of the chosen data
## table entry, a history selection supplies the dataset itself.
#if str($gbk_cond.gbk_param) == 'yes':
    #if str($gbk_cond.gbk_source_cond.gbk_source) == 'cached':
        --gbk_file '$gbk_cond.gbk_source_cond.gbk_file.fields.path'
    #else:
        --gbk_file '$gbk_cond.gbk_source_cond.gbk_file'
    #end if
#end if
]]></command>
    <inputs>
        <!-- Primary inputs; the command block symlinks each one into the working directory before running the script. -->
        <param name="input_snps_json" type="data" format="json" label="SNPs json file"/>
        <param name="input_avg_mq_json" type="data" format="json" label="Average MQ json file"/>
        <param name="input_newick" type="data" format="newick" label="Best-scoring ML tree file"/>
        <!-- Optional Genbank annotation: no Genbank argument is passed to the script when "no" is selected. -->
        <conditional name="gbk_cond">
            <param name="gbk_param" type="select" label="Use Genbank file?">
                <!-- Labels are capitalized consistently; the values stay lowercase because the command block compares against 'yes'. -->
                <option value="yes" selected="true">Yes</option>
                <option value="no">No</option>
            </param>
            <when value="yes">
                <!-- The Genbank file comes either from the vsnp_genbank data table or from the current history. -->
                <conditional name="gbk_source_cond">
                    <param name="gbk_source" type="select" label="Choose the source for the Genbank file">
                        <option value="cached" selected="true">locally cached</option>
                        <option value="history">from history</option>
                    </param>
                    <when value="cached">
                        <param name="gbk_file" type="select" label="Genbank file">
                            <options from_data_table="vsnp_genbank">
                                <!-- Keep only rows whose column 0 matches the dbkey of the selected average MQ json dataset. -->
                                <filter type="data_meta" column="0" key="dbkey" ref="input_avg_mq_json"/>
                                <validator type="no_options" message="A cached Genbank file is not available for the build associated with the selected average MQ json file"/>
                            </options>
                        </param>
                    </when>
                    <when value="history">
                        <param name="gbk_file" type="data" format="genbank" label="Genbank file">
                            <validator type="no_options" message="The current history does not include a genbank dataset"/>
                        </param>
                    </when>
                </conditional>
            </when>
            <when value="no"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Files in output_excel_dir (created by the command block) named NAME.xlsx are collected into a list; NAME becomes the element identifier and xlsx the datatype. -->
        <collection name="excel" type="list" label="${tool.name} on ${on_string}">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.(?P&lt;ext&gt;xlsx)" directory="output_excel_dir"/>
        </collection>
    </outputs>
    <tests>
        <!--
            All four tests set gbk_param to "no", so neither Genbank branch (cached or history) is exercised here.
            Each test expects exactly two list elements and compares them with sim_size rather than exact content.
            The expected element names start with the newick element identifier after the command block's
            sanitization (for example input_newick.newick becomes input_newick_newick); the _cascade_table and
            _sort_table suffixes are presumably appended by vsnp_build_tables.py (not visible in this file).
        -->
        <test>
            <param name="input_snps_json" value="input_snps_json.json" ftype="json" dbkey="89"/>
            <param name="input_newick" value="input_newick.newick" ftype="newick" dbkey="89"/>
            <param name="input_avg_mq_json" value="input_avg_mq_json.json" ftype="json" dbkey="89"/>
            <param name="gbk_param" value="no"/>
            <output_collection name="excel" type="list" count="2">
                <element name="input_newick_newick_cascade_table" file="cascade_table.xlsx" ftype="xlsx" compare="sim_size"/>
                <element name="input_newick_newick_sort_table" file="sort_table.xlsx" ftype="xlsx" compare="sim_size"/>
            </output_collection>
        </test>
        <test>
            <param name="input_snps_json" value="Mbovis-01_snps.json" ftype="json" dbkey="89"/>
            <param name="input_newick" value="Mbovis-01_snps.newick" ftype="newick" dbkey="89"/>
            <param name="input_avg_mq_json" value="Mbovis-01_avg_mq.json" ftype="json" dbkey="89"/>
            <param name="gbk_param" value="no"/>
            <output_collection name="excel" type="list" count="2">
                <element name="Mbovis-01_snps_newick_cascade_table" file="Mbovis-01_cascade_table.xlsx" ftype="xlsx" compare="sim_size"/>
                <element name="Mbovis-01_snps_newick_sort_table" file="Mbovis-01_sort_table.xlsx" ftype="xlsx" compare="sim_size"/>
            </output_collection>
        </test>
        <test>
            <param name="input_snps_json" value="Mbovis-01D_snps.json" ftype="json" dbkey="89"/>
            <param name="input_newick" value="Mbovis-01D_snps.newick" ftype="newick" dbkey="89"/>
            <param name="input_avg_mq_json" value="Mbovis-01D_avg_mq.json" ftype="json" dbkey="89"/>
            <param name="gbk_param" value="no"/>
            <output_collection name="excel" type="list" count="2">
                <element name="Mbovis-01D_snps_newick_cascade_table" file="Mbovis-01D_cascade_table.xlsx" ftype="xlsx" compare="sim_size"/>
                <element name="Mbovis-01D_snps_newick_sort_table" file="Mbovis-01D_sort_table.xlsx" ftype="xlsx" compare="sim_size"/>
            </output_collection>
        </test>
        <test>
            <param name="input_snps_json" value="Mbovis-01D6_snps.json" ftype="json" dbkey="89"/>
            <param name="input_newick" value="Mbovis-01D6_snps.newick" ftype="newick" dbkey="89"/>
            <param name="input_avg_mq_json" value="Mbovis-01D6_avg_mq.json" ftype="json" dbkey="89"/>
            <param name="gbk_param" value="no"/>
            <output_collection name="excel" type="list" count="2">
                <element name="Mbovis-01D6_snps_newick_cascade_table" file="Mbovis-01D6_cascade_table.xlsx" ftype="xlsx" compare="sim_size"/>
                <element name="Mbovis-01D6_snps_newick_sort_table" file="Mbovis-01D6_sort_table.xlsx" ftype="xlsx" compare="sim_size"/>
            </output_collection>
        </test>
    </tests>
    <help>
**What it does**

Accepts a combination of single SNPs json, average MQ json and newick files (or associated collections of
each) to produce annotated SNPs tables in the form of Excel spreadsheets.  The SNPs json and average MQ json
files are typically produced by the **vSNP: get SNPs** tool and the newick files are typically produced by
the **Phyogenetic reconstruction with RaXML** tool.

The SNPs tables display closely related isolates and enable identification of mixed SNPs when multiple
bacterial strains are infecting an organism.  The table structure is shown below.  The columns identify the
genome location of the SNP calls and the isolates are contained within the rows.  The reference (or ancestral
strain if the reference is an outgroup) is listed across the top, identified as the "reference call".  SNPs
that are not highlighted will match the reference.  The map-quality row values are the average of the map
quality scores of each isolate in that position.  These scores measure the confidence that the read has been
mapped to the correct location on the genome.  The maximum score possible is 60, and lower scores lessen the
confidence that the SNP was correctly identified.  The annotation of the position is provided at the bottom
of the table.

.. image:: table_description.png

SNPs are sorted according to their evolutionary age within the table.  The oldest SNPs (encompassing the most
isolates) are furthest to the left.  This sorting is somewhat crude - the intent is to improve readability or
more easily match a related tree.

For a more detailed discussion, see the **Validating and correcting SNP calls** section of
the `vSNP documentation`_.

.. _vSNP documentation: https://github.com/USDA-VS/vSNP/blob/master/docs/detailed_usage.md

**Required Options**

 * **Use Genbank file** - Select "yes" to annotate the tables using the information in the Genbank file.  Locally cached files, if available, provide the most widely used annotations, but custom Genbank files can also be chosen from the current history.
    </help>
    <expand macro="citations"/>
</tool>