view phylogenetic_tree.xml @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 248b06e86022
children a631c2f6d913
line wrap: on
line source

<tool id="gd_phylogenetic_tree" name="Phylogenetic Tree" version="1.1.0">
  <description>: Show genetic relationships among individuals</description>

  <command interpreter="python">
    ## Encode the individual-name to column mapping from the input metadata as a
    ## single argument: compact JSON, then zlib-compressed, then base64-encoded.
    #import json
    #import base64
    #import zlib
    #set $ind_names = $input.dataset.metadata.individual_names
    #set $ind_colms = $input.dataset.metadata.individual_columns
    #set $ind_dict = dict(zip($ind_names, $ind_colms))
    #set $ind_json = json.dumps($ind_dict, separators=(',',':'))
    ## zlib.compress requires bytes on Python 3. json.dumps emits ASCII by
    ## default, so encoding here is lossless and changes nothing on Python 2.
    #set $ind_comp = zlib.compress($ind_json.encode('utf-8'), 9)
    ## b64encode returns bytes on Python 3; decode so the argument is emitted as
    ## plain text and not as a bytes repr.
    #set $ind_arg = base64.b64encode($ind_comp).decode('ascii')
    ## Positional arguments: input dataset, output dataset, output files directory.
    phylogenetic_tree.py '$input' '$output' '$output.files_path'
    ## Input type, distance metric, minimum coverage, minimum quality.
    ## The two thresholds are user-supplied only for the sequence coverage metric;
    ## every other combination passes 0 for both.
    #if $input_type.choice == '0'
      'gd_snp'
      #if $input_type.data_source.choice == '0'
        'sequence_coverage'
        '$input_type.data_source.minimum_coverage'
        '$input_type.data_source.minimum_quality'
      #else if $input_type.data_source.choice == '1'
        'estimated_genotype' '0' '0'
      #end if
    #else if $input_type.choice == '1'
      'gd_genotype' 'estimated_genotype' '0' '0'
    #end if
    ## Either the literal all_individuals or the selected population dataset.
    #if $individuals.choice == '0'
      'all_individuals'
    #else if $individuals.choice == '1'
      '$individuals.p1_input'
    #end if
    ## Reference argument: none when the scaffold and pos metadata columns equal
    ## the ref and rPos columns (presumably meaning the dataset has no separate
    ## reference columns; confirm), or when the user declined the reference.
    ## Otherwise the dataset dbkey is passed.
    #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0')
      'none'
    #else
      '$input.metadata.dbkey'
    #end if
    ## Concatenate the four style values into one dash-prefixed option string;
    ## an empty argument is passed when every style is left at its default.
    #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style])
    #if $draw_tree_options == ''
      ''
    #else
      '-$draw_tree_options'
    #end if
    '$ind_arg'
  </command>

  <inputs>
    <!-- Input dataset. Only the gd_snp case offers a choice of distance metric;
         the gd_genotype case has no further options. -->
    <conditional name="input_type">
      <param name="choice" type="select" label="Input format">
        <option value="0" selected="true">gd_snp</option>
        <option value="1">gd_genotype</option>
      </param>
      <when value="0">
        <param name="input" type="data" format="gd_snp" label="SNP dataset" />

        <!-- The coverage and quality thresholds exist only for the
             sequence coverage metric. -->
        <conditional name="data_source">
          <param name="choice" type="select" label="Distance metric">
            <option value="0">sequence coverage</option>
            <option value="1" selected="true">estimated genotype</option>
          </param>
          <when value="0">
            <param name="minimum_coverage" type="integer" min="0" value="0" label="Minimum SNP coverage" />
            <param name="minimum_quality" type="integer" min="0" value="0" label="Minimum SNP quality"
                   help="Note: minimum coverage and minimum quality cannot both be 0" />
          </when>
          <when value="1" />
        </conditional>
      </when>
      <when value="1">
        <param name="input" type="data" format="gd_genotype" label="Genotype dataset" />
      </when>
    </conditional>

    <!-- Restrict the analysis to a population, or use every individual. -->
    <conditional name="individuals">
      <param name="choice" type="select" label="Compute for">
        <option value="0" selected="true">All individuals</option>
        <option value="1">Individuals in a population</option>
      </param>
      <when value="0" />
      <when value="1">
        <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
      </when>
    </conditional>

    <param name="include_reference" type="select" label="Include reference sequence">
      <option value="1" selected="true">Yes</option>
      <option value="0">No</option>
    </param>

    <!-- Drawing options. Each default has an empty value and each alternative a
         single letter; the command template concatenates the four values into
         one dash-prefixed option string. -->
    <param name="branch_style" type="select" display="radio" label="Branch type">
      <option value="" selected="true">square</option>
      <option value="d">diagonal</option>
    </param>

    <param name="scale_style" type="select" display="radio" label="Draw branches to scale">
      <option value="" selected="true">yes</option>
      <option value="s">no</option>
    </param>

    <param name="length_style" type="select" display="radio" label="Show branch lengths">
      <option value="" selected="true">yes</option>
      <option value="b">no</option>
    </param>

    <param name="layout_style" type="select" display="radio" label="Tree layout">
      <option value="" selected="true">horizontal</option>
      <option value="v">vertical</option>
    </param>
  </inputs>

  <outputs>
    <!-- HTML dataset; the command is also given this dataset's files_path
         directory as an argument. -->
    <data format="html" name="output" />
  </outputs>

  <tests>
    <!-- gd_snp input, sequence coverage metric with explicit thresholds, all
         individuals, default drawing options.
         Parameters inside conditionals use pipe-qualified names because three
         conditionals each have a selector called "choice", so the bare name
         is ambiguous. -->
    <test>
      <param name="input_type|choice" value="0" />
      <param name="input_type|input" value="test_in/sample.gd_snp" ftype="gd_snp" />
      <param name="input_type|data_source|choice" value="0" />
      <param name="input_type|data_source|minimum_coverage" value="3" />
      <param name="input_type|data_source|minimum_quality" value="30" />
      <param name="individuals|choice" value="0" />
      <param name="branch_style" value="" />
      <param name="scale_style" value="" />
      <param name="length_style" value="" />
      <param name="layout_style" value="" />
      <output name="output" file="test_out/phylogenetic_tree/phylogenetic_tree.html" ftype="html" compare="diff" lines_diff="2">
        <extra_files type="file" name="distance_matrix.phylip" value="test_out/phylogenetic_tree/distance_matrix.phylip" />
        <extra_files type="file" name="informative_snps.txt" value="test_out/phylogenetic_tree/informative_snps.txt" />
        <extra_files type="file" name="mega_distance_matrix.txt" value="test_out/phylogenetic_tree/mega_distance_matrix.txt" />
        <extra_files type="file" name="phylogenetic_tree.newick" value="test_out/phylogenetic_tree/phylogenetic_tree.newick" />
        <!-- The PDF is compared by size only, within 1000 bytes. -->
        <extra_files type="file" name="tree.pdf" value="test_out/phylogenetic_tree/tree.pdf" compare="sim_size" delta="1000" />
      </output>
    </test>
  </tests>

  <help>

**Dataset formats**

The input dataset is in gd_snp_ or gd_genotype_ format.
The output is a composite dataset, containing the tree in both text (Newick_)
and PostScript formats, as well as supplemental text information.
(`Dataset missing?`_)

.. _gd_snp: ./static/formatHelp.html#gd_snp
.. _gd_genotype: ./static/formatHelp.html#gd_genotype
.. _Newick: http://evolution.genetics.washington.edu/phylip/newicktree.html
.. _Dataset missing?: ./static/formatHelp.html

-----

**What it does**

This tool uses a gd_snp or gd_genotype dataset to determine a kind of "genetic distance"
between each pair of individuals.  That information is used to
produce a tree-shaped figure that depicts how the individuals are related,
both as text files and as a diagram.
The text files include a common tree format, Newick, as well as distance
matrices and counts of informative SNPs for each pairwise comparison.
The informative SNPs can be used as a guide to how reliable the tree is.

The input parameters are:

SNP dataset
   A table of SNPs for various individuals, in gd_snp format.

Individuals
   By default all individuals are included in the analysis, but this can
   optionally be restricted to a subset that has been defined using the
   Specify Individuals tool.

Minimum SNP coverage
   For each pair of individuals, the tool looks for informative SNPs, i.e.,
   where the sequence data for both individuals is adequate.  Specifying,
   say, 7 for this option instructs the tool to consider only SNPs with
   at least 7 reads in each of the two individuals (regardless of the
   alleles) when estimating their genetic distance.

Minimum SNP quality
   Specifying, say, 37 for this option instructs the tool to consider
   only SNPs with a quality score of at least 37 in both individuals
   when estimating their genetic distance.

Include reference sequence
   For gd_snp datasets containing columns for a reference sequence, the
   user can ask that the reference be indicated in the tree, to help with
   rooting it.  If the dataset has no reference columns, this option has
   no effect.

Distance metric
   The genetic distance between two individuals at a given SNP can
   be estimated two ways.  One method is to use the absolute value of the
   difference in the frequency of the first allele (or equivalently, the
   second allele).  For instance, if the first individual has 5 reads of
   each allele and the second individual has respectively 3 and 6 reads,
   then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that
   SNP.  The other approach is to use the genotype calls to estimate
   the difference in the number of occurrences of the first allele.
   For instance, if the two genotypes are 2 and 1, i.e., the individuals
   are estimated to have respectively 2 and 1 occurrences of the first
   allele at this location, then the distance is 1 (the absolute value
   of the difference of the two numbers).

Output options
   The final four options apply mostly to the graphical drawing of the
   tree, except that the branch lengths are also added to the Newick text
   file.

-----

**Acknowledgments**

To convert the distance matrix to a Newick-formatted tree, we use the
QuickTree program from
http://www.sanger.ac.uk/resources/software/quicktree/ .

To make the diagram we use draw_tree, available at
http://compgen.bscb.cornell.edu/phast/ .

  </help>
</tool>