view hyphy_bgm.xml @ 5:823a5afee916 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hyphy/ commit 8d5ae1d04c43988fdcc458f4f08376a15e72db8e"
author iuc
date Thu, 20 Feb 2020 18:09:03 -0500
parents 1df765cc6bcb
children 26ad9a6b1293
line wrap: on
line source

<?xml version="1.0"?>
<tool id="hyphy_bgm" name="HyPhy-BGM" version="@VERSION@+galaxy0">
    <description>- Detecting coevolving sites via Bayesian graphical models</description>
    <!-- Shared definitions are imported from macros.xml. The macros expanded in this
         file (requirements, substitution, gencode, branches, citations) are not defined
         here, so they are presumably provided by that import (confirm in macros.xml). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Link the inputs to fixed names in the working directory: the JSON result
        ## is collected from bgm_input.fa.BGM.json (from_work_dir of bgm_output).
        ln -s '$input_file' bgm_input.fa &&
        ln -s '$input_nhx' bgm_input.nhx &&
        hyphy bgm
            --alignment ./bgm_input.fa
            --tree ./bgm_input.nhx
            --run_type $datatype.value
            ## A genetic code is only supplied for codon data.
            #if str($datatype.value) == "codon":
                --code '$datatype.gencodeid'
            #end if
            ## The select option for protein data is "amino-acid" (there is no
            ## "protein" option), so that is the value that must be tested here.
            #if str($datatype.value) == "amino-acid":
                --baseline_model '$datatype.model'
            #end if
            ## NOTE(review): the help text of this tool documents --steps, --burn-in,
            ## --max-parents and --min-subs, while the flags below are spelled
            ## differently; confirm the spellings against the pinned HyPhy version.
            --branches '$branches'
            --chain '$chain_length'
            --burn_in '$burn_in'
            --samples '$samples'
            --parents '$parents'
            --min_subs '$min_subs'
            > '$bgm_log'
    ]]></command>
    <inputs>
        <!-- Alignment and tree datasets; the command links both to fixed file names. -->
        <param name="input_file" type="data" format="fasta" label="Input FASTA file"/>
        <param name="input_nhx" type="data" format="nhx" label="Input newick file"/>
        <!-- Data type selector. The selected value is passed to hyphy and decides which
             extra, type-specific parameter (if any) is shown and used by the command. -->
        <conditional name="datatype">
            <param name="value" type="select" label="Type of data">
                <option value="nucleotide">Nucleotide</option>
                <option value="amino-acid">Amino acid</option>
                <option value="codon">Codon</option>
            </param>
            <when value="nucleotide"/>
            <when value="amino-acid">
                <expand macro="substitution" />
            </when>
            <when value="codon">
                <expand macro="gencode" />
            </when>
        </conditional>
        <expand macro="branches"/>
        <!-- MCMC settings. -->
        <param name="chain_length" type="integer" value="100000" min="0" max="1000000000" label="Length of MCMC chain"/>
        <param name="burn_in" type="integer" value="10000" min="0" max="1000000000" label="Number of samples to discard for burn-in"/>
        <param name="samples" type="integer" value="100" min="0" max="100" label="Number of steps to extract from chain sample"/>
        <!-- Network and site-filter settings. -->
        <param argument="--max-parents" name="parents" type="integer" value="1" min="1" max="3" label="Maximum number of parents allowed per node" />
        <param argument="--min-subs" name="min_subs" type="integer" value="1" min="1" max="100000" label="Minimum number of substitutions per site to be included in the analysis" />
    </inputs>
    <outputs>
        <!-- Standard output of the hyphy run; the command redirects stdout to this dataset. -->
        <data format="txt" name="bgm_log"/>
        <!-- JSON results, picked up by file name from the job working directory. -->
        <data format="hyphy_results.json"
              name="bgm_output"
              from_work_dir="bgm_input.fa.BGM.json"/>
    </outputs>
    <tests>
        <!-- Only the two input datasets are set; every other parameter keeps the default
             declared in the inputs section. The JSON result is checked with
             compare="sim_size" against the reference file. -->
        <test>
            <param name="input_file" value="bgm-in1.fa" ftype="fasta"/>
            <param name="input_nhx" value="bgm-in1.nhx" ftype="nhx"/>
            <output name="bgm_output" compare="sim_size" file="bgm-out1.json"/>
        </test>
    </tests>
    <help><![CDATA[

BGM : Bayesian Graphical Models
===============================

What does this do?
------------------

This tool identifies groups of sites in the alignment that experience substitutions along the same branches,
i.e. *co-evolve*.

Brief description
-----------------

BGM (Bayesian Graphical Model) uses a maximum likelihood ancestral state
reconstruction to map substitution (non-synonymous only for coding data)
events to branches in the phylogeny and then analyzes the joint
distribution of the substitution map using a Bayesian graphical model
(network). Next, a Markov chain Monte Carlo analysis is used to generate
a random sample of network structures from the posterior distribution
given the data. Each node in the network represents a site in the
alignment, and links (edges) between nodes indicate high posterior
support for correlated substitutions at the two sites over time, which
implies coevolution.


Input
-----

1. A *FASTA* sequence alignment.
2. A phylogenetic tree in the *Newick* format

Note: the names of sequences in the alignment must match the names of the sequences in the tree.

Output
------

A JSON file with analysis results (http://hyphy.org/resources/json-fields.pdf).

A custom visualization module for viewing these results is available (see http://vision.hyphy.org/BGM for an example)

Further reading
---------------

http://hyphy.org/methods/selection-methods/#BGM


Tool options
------------
::

    --branches          Which branches should be tested for selection?
                            All [default] : test all branches

                            Internal : test only internal branches (suitable for
                            intra-host pathogen evolution for example, where terminal branches
                            may contain polymorphism data)

                            Leaves: test only terminal (leaf) branches

                            Unlabeled: if the Newick string is labeled using the {} notation,
                            test only branches without explicit labels
                            (see http://hyphy.org/tutorials/phylotree/)

    --max-parents      The maximum number of parents allowed per node, i.e. how many sites
                       can directly influence substitution patterns at another site
                       Increasing this number scales complexity nonlinearly
                            default value: 1

    --min-subs         The minimum number of substitutions per site to include it in the analysis
                       Filter low complexity (too few substitution) sites
                            default value: 1

    --chains           How many MCMC chains to run (does not apply to Variational-Bayes)
                            default value: 5

    --steps            MCMC chain length (does not apply to Variational-Bayes)
                            default value: 100,000

    --burn-in          MCMC chain burn in (does not apply to Variational-Bayes)
                            default value: 10,000

    --samples          MCMC samples to draw (does not apply to Variational-Bayes)
                            default value: 100


    ]]></help>
    <!-- The tool-specific DOI below is passed as content to the citations macro,
         which is not defined in this file (presumably in macros.xml). -->
    <expand macro="citations">
        <citation type="doi">10.1371/journal.pcbi.0030231</citation>
    </expand>
</tool>