Galaxy |

Changeset 12:4b6590dd7250 (2012-09-12)

Previous changeset 11:d4ec09e8079f (2012-09-12) Next changeset 13:fdb4240fb565 (2012-09-28)

Commit message:
Uploaded

removed:
test-data/test_in/a.gd_indivs
test-data/test_in/b.gd_indivs
test-data/test_in/c.gd_indivs
test-data/test_in/sample.gd_sap
test-data/test_in/sample.gd_snp
test-data/test_out/add_fst_column/add_fst_column.gd_snp
test-data/test_out/modify_snp_table/modify.gd_snp
test-data/test_out/pca/admix.gd_indivs
test-data/test_out/pca/admix.gd_snp
test-data/test_out/select_snps/select_snps.gd_snp
test-data/test_out/specify_restriction_enzymes/specify_restriction_enzymes.gd_snp

diff -r d4ec09e8079f -r 4b6590dd7250 BeautifulSoup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/BeautifulSoup.py Wed Sep 12 17:10:26 2012 -0400

b'@@ -0,0 +1,2014 @@\n+"""Beautiful Soup\n+Elixir and Tonic\n+"The Screen-Scraper\'s Friend"\n+http://www.crummy.com/software/BeautifulSoup/\n+\n+Beautiful Soup parses a (possibly invalid) XML or HTML document into a\n+tree representation. It provides methods and Pythonic idioms that make\n+it easy to navigate, search, and modify the tree.\n+\n+A well-formed XML/HTML document yields a well-formed data\n+structure. An ill-formed XML/HTML document yields a correspondingly\n+ill-formed data structure. If your document is only locally\n+well-formed, you can use this library to find and process the\n+well-formed part of it.\n+\n+Beautiful Soup works with Python 2.2 and up. It has no external\n+dependencies, but you\'ll have more success at converting data to UTF-8\n+if you also install these three packages:\n+\n+* chardet, for auto-detecting character encodings\n+ http://chardet.feedparser.org/\n+* cjkcodecs and iconv_codec, which add more encodings to the ones supported\n+ by stock Python.\n+ http://cjkpython.i18n.org/\n+\n+Beautiful Soup defines classes for two main parsing strategies:\n+\n+ * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific\n+ language that kind of looks like XML.\n+\n+ * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid\n+ or invalid. This class has web browser-like heuristics for\n+ obtaining a sensible parse tree in the face of common HTML errors.\n+\n+Beautiful Soup also defines a class (UnicodeDammit) for autodetecting\n+the encoding of an HTML or XML document, and converting it to\n+Unicode. 
Much of this code is taken from Mark Pilgrim\'s Universal Feed Parser.\n+\n+For more than you ever wanted to know about Beautiful Soup, see the\n+documentation:\n+http://www.crummy.com/software/BeautifulSoup/documentation.html\n+\n+Here, have some legalese:\n+\n+Copyright (c) 2004-2010, Leonard Richardson\n+\n+All rights reserved.\n+\n+Redistribution and use in source and binary forms, with or without\n+modification, are permitted provided that the following conditions are\n+met:\n+\n+ * Redistributions of source code must retain the above copyright\n+ notice, this list of conditions and the following disclaimer.\n+\n+ * Redistributions in binary form must reproduce the above\n+ copyright notice, this list of conditions and the following\n+ disclaimer in the documentation and/or other materials provided\n+ with the distribution.\n+\n+ * Neither the name of the the Beautiful Soup Consortium and All\n+ Night Kosher Bakery nor the names of its contributors may be\n+ used to endorse or promote products derived from this software\n+ without specific prior written permission.\n+\n+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS\n+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT\n+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR\n+A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\n+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\n+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\n+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.\n+\n+"""\n+from __future__ import generators\n+\n+__author__ = "Leonard Richardson (leonardr@segfault.org)"\n+__version__ = "3.2.0"\n+__copyright__ = "Copyright (c) 2004-2010 Leonard Richardson"\n+__license__ = "New-style BSD"\n+\n+from sgmllib import SGMLParser, SGMLParseError\n+import codecs\n+import markupbase\n+import types\n+import re\n+import sgmllib\n+try:\n+ from htmlentitydefs import name2codepoint\n+except ImportError:\n+ name2codepoint = {}\n+try:\n+ set\n+except NameError:\n+ from sets import Set as set\n+\n+#These hacks make Beautiful Soup able to parse XML with namespaces\n+sgmllib.tagfind = re.co'..b' \'utf-32\', \'utf_16\', \'utf_32\',\n+ \'utf16\', \'u16\')):\n+ xml_encoding = sniffed_xml_encoding\n+ return xml_data, xml_encoding, sniffed_xml_encoding\n+\n+\n+ def find_codec(self, charset):\n+ return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \\\n+ or (charset and self._codec(charset.replace("-", ""))) \\\n+ or (charset and self._codec(charset.replace("-", "_"))) \\\n+ or charset\n+\n+ def _codec(self, charset):\n+ if not charset: return charset\n+ codec = None\n+ try:\n+ codecs.lookup(charset)\n+ codec = charset\n+ except (LookupError, ValueError):\n+ pass\n+ return codec\n+\n+ EBCDIC_TO_ASCII_MAP = None\n+ def _ebcdic_to_ascii(self, s):\n+ c = self.__class__\n+ if not c.EBCDIC_TO_ASCII_MAP:\n+ emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,\n+ 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,\n+ 
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,\n+ 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,\n+ 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,\n+ 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,\n+ 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,\n+ 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,\n+ 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,\n+ 201,202,106,107,108,109,110,111,112,113,114,203,204,205,\n+ 206,207,208,209,126,115,116,117,118,119,120,121,122,210,\n+ 211,212,213,214,215,216,217,218,219,220,221,222,223,224,\n+ 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,\n+ 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,\n+ 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,\n+ 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,\n+ 250,251,252,253,254,255)\n+ import string\n+ c.EBCDIC_TO_ASCII_MAP = string.maketrans( \\\n+ \'\'.join(map(chr, range(256))), \'\'.join(map(chr, emap)))\n+ return s.translate(c.EBCDIC_TO_ASCII_MAP)\n+\n+ MS_CHARS = { \'\\x80\' : (\'euro\', \'20AC\'),\n+ \'\\x81\' : \' \',\n+ \'\\x82\' : (\'sbquo\', \'201A\'),\n+ \'\\x83\' : (\'fnof\', \'192\'),\n+ \'\\x84\' : (\'bdquo\', \'201E\'),\n+ \'\\x85\' : (\'hellip\', \'2026\'),\n+ \'\\x86\' : (\'dagger\', \'2020\'),\n+ \'\\x87\' : (\'Dagger\', \'2021\'),\n+ \'\\x88\' : (\'circ\', \'2C6\'),\n+ \'\\x89\' : (\'permil\', \'2030\'),\n+ \'\\x8A\' : (\'Scaron\', \'160\'),\n+ \'\\x8B\' : (\'lsaquo\', \'2039\'),\n+ \'\\x8C\' : (\'OElig\', \'152\'),\n+ \'\\x8D\' : \'?\',\n+ \'\\x8E\' : (\'#x17D\', \'17D\'),\n+ \'\\x8F\' : \'?\',\n+ \'\\x90\' : \'?\',\n+ \'\\x91\' : (\'lsquo\', \'2018\'),\n+ \'\\x92\' : (\'rsquo\', \'2019\'),\n+ \'\\x93\' : (\'ldquo\', \'201C\'),\n+ \'\\x94\' : (\'rdquo\', \'201D\'),\n+ \'\\x95\' : (\'bull\', \'2022\'),\n+ \'\\x96\' : (\'ndash\', \'2013\'),\n+ \'\\x97\' : (\'mdash\', \'2014\'),\n+ \'\\x98\' : (\'tilde\', \'2DC\'),\n+ \'\\x99\' : (\'trade\', \'2122\'),\n+ \'\\x9a\' : 
(\'scaron\', \'161\'),\n+ \'\\x9b\' : (\'rsaquo\', \'203A\'),\n+ \'\\x9c\' : (\'oelig\', \'153\'),\n+ \'\\x9d\' : \'?\',\n+ \'\\x9e\' : (\'#x17E\', \'17E\'),\n+ \'\\x9f\' : (\'Yuml\', \'\'),}\n+\n+#######################################################################\n+\n+\n+#By default, act as an HTML pretty-printer.\n+if __name__ == \'__main__\':\n+ import sys\n+ soup = BeautifulSoup(sys.stdin)\n+ print soup.prettify()\n'

diff -r d4ec09e8079f -r 4b6590dd7250 LocationFile.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LocationFile.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import sys
+
+def die( message ):
+    print >> sys.stderr, message
+    sys.exit(1)
+
+def open_or_die( filename, mode='r', message=None ):
+    if message is None:
+        message = 'Error opening {0}'.format( filename )
+    try:
+        fh = open( filename, mode )
+    except IOError, err:
+        die( '{0}: {1}'.format( message, err.strerror ) )
+    return fh
+
+class LocationFile( object ):
+    def __init__( self, filename, comment_chars=None, delimiter='\t', key_column=0 ):
+        self.filename = filename
+        if comment_chars is None:
+            self.comment_chars = ( '#' )
+        else:
+            self.comment_chars = tuple( comment_chars )
+        self.delimiter = delimiter
+        self.key_column = key_column
+        self._map = {}
+        self._populate_map()
+
+    def _populate_map( self ):
+        try:
+            with open( self.filename ) as fh:
+                line_number = 0
+                for line in fh:
+                    line_number += 1
+                    line = line.rstrip( '\r\n' )
+                    if not line.startswith( self.comment_chars ):
+                        elems = line.split( self.delimiter )
+                        if len( elems ) <= self.key_column:
+                            die( 'Location file {0} line {1}: less than {2} columns'.format( self.filename, line_number, self.key_column + 1 ) )
+                        else:
+                            key = elems.pop( self.key_column )
+                            if key in self._map:
+                                if self._map[key] != elems:
+                                    die( 'Location file {0} line {1}: duplicate key "{2}"'.format( self.filename, line_number, key ) )
+                            else:
+                                self._map[key] = elems
+        except IOError, err:
+            die( 'Error opening location file {0}: {1}'.format( self.filename, err.strerror ) )
+
+    def get_values( self, key ):
+        if key in self._map:
+            rval = self._map[key]
+            if len( rval ) == 1:
+                return rval[0]
+            else:
+                return rval
+        else:
+            die( 'key "{0}" not found in location file {1}'.format( key, self.filename ) )
+
+    def get_values_if_exists( self, key ):
+        if key in self._map:
+            rval = self._map[key]
+            if len( rval ) == 1:
+                return rval[0]
+            else:
+                return rval
+        else:
+            return None

diff -r d4ec09e8079f -r 4b6590dd7250 OrderedDict.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/OrderedDict.py Wed Sep 12 17:10:26 2012 -0400

[

b"@@ -0,0 +1,259 @@\n+# http://code.activestate.com/recipes/576693/\n+# Backport of OrderedDict() class that runs on Python 2.4, 2.5, 2.6, 2.7 and pypy.\n+# Passes Python2.7's test suite and incorporates all the latest updates.\n+\n+try:\n+ from thread import get_ident as _get_ident\n+except ImportError:\n+ from dummy_thread import get_ident as _get_ident\n+\n+try:\n+ from _abcoll import KeysView, ValuesView, ItemsView\n+except ImportError:\n+ pass\n+\n+\n+class OrderedDict(dict):\n+ 'Dictionary that remembers insertion order'\n+ # An inherited dict maps keys to values.\n+ # The inherited dict provides __getitem__, __len__, __contains__, and get.\n+ # The remaining methods are order-aware.\n+ # Big-O running times for all methods are the same as for regular dictionaries.\n+\n+ # The internal self.__map dictionary maps keys to links in a doubly linked list.\n+ # The circular doubly linked list starts and ends with a sentinel element.\n+ # The sentinel element never gets deleted (this simplifies the algorithm).\n+ # Each link is stored as a list of length three: [PREV, NEXT, KEY].\n+\n+ def __init__(self, *args, **kwds):\n+ '''Initialize an ordered dictionary. 
Signature is the same as for\n+ regular dictionaries, but keyword arguments are not recommended\n+ because their insertion order is arbitrary.\n+\n+ '''\n+ if len(args) > 1:\n+ raise TypeError('expected at most 1 arguments, got %d' % len(args))\n+ try:\n+ self.__root\n+ except AttributeError:\n+ self.__root = root = [] # sentinel node\n+ root[:] = [root, root, None]\n+ self.__map = {}\n+ self.__update(*args, **kwds)\n+\n+ def __setitem__(self, key, value, dict_setitem=dict.__setitem__):\n+ 'od.__setitem__(i, y) <==> od[i]=y'\n+ # Setting a new item creates a new link which goes at the end of the linked\n+ # list, and the inherited dictionary is updated with the new key/value pair.\n+ if key not in self:\n+ root = self.__root\n+ last = root[0]\n+ last[1] = root[0] = self.__map[key] = [last, root, key]\n+ dict_setitem(self, key, value)\n+\n+ def __delitem__(self, key, dict_delitem=dict.__delitem__):\n+ 'od.__delitem__(y) <==> del od[y]'\n+ # Deleting an existing item uses self.__map to find the link which is\n+ # then removed by updating the links in the predecessor and successor nodes.\n+ dict_delitem(self, key)\n+ link_prev, link_next, key = self.__map.pop(key)\n+ link_prev[1] = link_next\n+ link_next[0] = link_prev\n+\n+ def __iter__(self):\n+ 'od.__iter__() <==> iter(od)'\n+ root = self.__root\n+ curr = root[1]\n+ while curr is not root:\n+ yield curr[2]\n+ curr = curr[1]\n+\n+ def __reversed__(self):\n+ 'od.__reversed__() <==> reversed(od)'\n+ root = self.__root\n+ curr = root[0]\n+ while curr is not root:\n+ yield curr[2]\n+ curr = curr[0]\n+\n+ def clear(self):\n+ 'od.clear() -> None. 
Remove all items from od.'\n+ try:\n+ for node in self.__map.itervalues():\n+ del node[:]\n+ root = self.__root\n+ root[:] = [root, root, None]\n+ self.__map.clear()\n+ except AttributeError:\n+ pass\n+ dict.clear(self)\n+\n+ def popitem(self, last=True):\n+ '''od.popitem() -> (k, v), return and remove a (key, value) pair.\n+ Pairs are returned in LIFO order if last is true or FIFO order if false.\n+\n+ '''\n+ if not self:\n+ raise KeyError('dictionary is empty')\n+ root = self.__root\n+ if last:\n+ link = root[0]\n+ link_prev = link[0]\n+ link_prev[1] = root\n+ root[0] = link_prev\n+ else:\n+ link = root[1]\n+ link_next = link[1]\n+ root[1] = link_next\n+ "..b' E: od[k] = E[k]\n+ If E has a .keys() method, does: for k in E.keys(): od[k] = E[k]\n+ Or if E is an iterable of items, does: for k, v in E: od[k] = v\n+ In either case, this is followed by: for k, v in F.items(): od[k] = v\n+\n+ \'\'\'\n+ if len(args) > 2:\n+ raise TypeError(\'update() takes at most 2 positional \'\n+ \'arguments (%d given)\' % (len(args),))\n+ elif not args:\n+ raise TypeError(\'update() takes at least 1 argument (0 given)\')\n+ self = args[0]\n+ # Make progressively weaker assumptions about "other"\n+ other = ()\n+ if len(args) == 2:\n+ other = args[1]\n+ if isinstance(other, dict):\n+ for key in other:\n+ self[key] = other[key]\n+ elif hasattr(other, \'keys\'):\n+ for key in other.keys():\n+ self[key] = other[key]\n+ else:\n+ for key, value in other:\n+ self[key] = value\n+ for key, value in kwds.items():\n+ self[key] = value\n+\n+ __update = update # let subclasses override update without breaking __init__\n+\n+ __marker = object()\n+\n+ def pop(self, key, default=__marker):\n+ \'\'\'od.pop(k[,d]) -> v, remove specified key and return the corresponding value.\n+ If key is not found, d is returned if given, otherwise KeyError is raised.\n+\n+ \'\'\'\n+ if key in self:\n+ result = self[key]\n+ del self[key]\n+ return result\n+ if default is self.__marker:\n+ raise KeyError(key)\n+ return 
default\n+\n+ def setdefault(self, key, default=None):\n+ \'od.setdefault(k[,d]) -> od.get(k,d), also set od[k]=d if k not in od\'\n+ if key in self:\n+ return self[key]\n+ self[key] = default\n+ return default\n+\n+ def __repr__(self, _repr_running={}):\n+ \'od.__repr__() <==> repr(od)\'\n+ call_key = id(self), _get_ident()\n+ if call_key in _repr_running:\n+ return \'...\'\n+ _repr_running[call_key] = 1\n+ try:\n+ if not self:\n+ return \'%s()\' % (self.__class__.__name__,)\n+ return \'%s(%r)\' % (self.__class__.__name__, self.items())\n+ finally:\n+ del _repr_running[call_key]\n+\n+ def __reduce__(self):\n+ \'Return state information for pickling\'\n+ items = [[k, self[k]] for k in self]\n+ inst_dict = vars(self).copy()\n+ for k in vars(OrderedDict()):\n+ inst_dict.pop(k, None)\n+ if inst_dict:\n+ return (self.__class__, (items,), inst_dict)\n+ return self.__class__, (items,)\n+\n+ def copy(self):\n+ \'od.copy() -> a shallow copy of od\'\n+ return self.__class__(self)\n+\n+ @classmethod\n+ def fromkeys(cls, iterable, value=None):\n+ \'\'\'OD.fromkeys(S[, v]) -> New ordered dictionary with keys from S\n+ and values equal to v (which defaults to None).\n+\n+ \'\'\'\n+ d = cls()\n+ for key in iterable:\n+ d[key] = value\n+ return d\n+\n+ def __eq__(self, other):\n+ \'\'\'od.__eq__(y) <==> od==y. 
Comparison to another OD is order-sensitive\n+ while comparison to a regular mapping is order-insensitive.\n+\n+ \'\'\'\n+ if isinstance(other, OrderedDict):\n+ return len(self)==len(other) and self.items() == other.items()\n+ return dict.__eq__(self, other)\n+\n+ def __ne__(self, other):\n+ return not self == other\n+\n+ # -- the following methods are only used in Python 2.7 --\n+\n+ def viewkeys(self):\n+ "od.viewkeys() -> a set-like object providing a view on od\'s keys"\n+ return KeysView(self)\n+\n+ def viewvalues(self):\n+ "od.viewvalues() -> an object providing a view on od\'s values"\n+ return ValuesView(self)\n+\n+ def viewitems(self):\n+ "od.viewitems() -> a set-like object providing a view on od\'s items"\n+ return ItemsView(self)\n'

diff -r d4ec09e8079f -r 4b6590dd7250 Population.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Population.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+from OrderedDict import OrderedDict
+
+class Individual(object):
+    __slots__ = ['_column', '_name', '_alias']
+
+    def __init__(self, column, name, alias=None):
+        self._column = column
+        self._name = name
+        self._alias = alias
+
+    @property
+    def column(self):
+        return self._column
+
+    @property
+    def name(self):
+        return self._name if self._alias is None else self._alias
+
+    @property
+    def alias(self):
+        return self._alias
+
+    @alias.setter
+    def alias(self, alias):
+        self._alias = alias
+
+    @property
+    def real_name(self):
+        return self._name
+
+    def __eq__(self, other):
+        return self._column == other._column and self._name == other._name
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __repr__(self):
+        return 'Individual: column={0} name={1} alias={2}'.format(self._column, self._name, self._alias)
+
+
+class Population(object):
+    def __init__(self, name=None):
+        self._columns = OrderedDict()
+        self._name = name
+
+    @property
+    def name(self):
+        return self._name
+
+    @name.setter
+    def name(self, name):
+        self._name = name
+
+    def add_individual(self, individual, alias=None):
+        if individual.column not in self._columns:
+            self._columns[individual.column] = individual
+        elif self._columns[individual.column] == individual:
+            # should should this be an error?
+            # should we replace the alias using this entry?
+            pass
+        else:
+            raise 'Duplicate column: {0}'.format(individual)
+
+    def is_superset(self, other):
+        for column, other_individual in other._columns.items():
+            our_individual = self._columns.get(column)
+            if our_individual is None or our_individual != other_individual:
+                return False
+        return True
+
+    def is_disjoint(self, other):
+        for column, our_individual in self._columns.items():
+            other_individual = other._columns.get(column)
+            if other_individual is not None and other_individual == our_individual:
+                return False
+        return True
+
+    def column_list(self):
+        return self._columns.keys()
+
+    def individual_with_column(self, column):
+        if column in self._columns:
+            return self._columns[column]
+        return None
+
+    def tag_list(self, delimiter=':'):
+        entries = []
+        for column, individual in self._columns.items():
+            entry = '{0}{1}{2}'.format(column, delimiter, individual.name)
+            entries.append(entry)
+        return entries
+
+    def to_string(self, delimiter=':', separator=' ', replace_names_with=None):
+        entries = []
+        for column, individual in self._columns.items():
+            value = individual.name
+            if replace_names_with is not None:
+                value = replace_names_with
+            entry = '{0}{1}{2}'.format(column, delimiter, value)
+            entries.append(entry)
+        return separator.join(entries)
+
+    def __str__(self):
+        return self.to_string()
+
+    def from_population_file(self, filename):
+        with open(filename) as fh:
+            for line in fh:
+                line = line.rstrip('\r\n')
+                column, name, alias = line.split('\t')
+                alias = alias.strip()
+                individual = Individual(column, name)
+                if alias:
+                    individual.alias = alias
+                self.add_individual(individual)
+
+    def from_tag_list(self, tag_list):
+        for tag in tag_list:
+            column, name = tag.split(':')
+            individual = Individual(column, name)
+            self.add_individual(individual)
+
+    def individual_names(self):
+        for column, individual in self._columns.items():
+            yield individual.name
+

diff -r d4ec09e8079f -r 4b6590dd7250 README
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,13 @@
+Source code for the executables needed by these tools can be found in
+the genome_diversity directory.
+
+Additionally, you'll need the following python modules:
+    matplotlib (we used version 1.1.0) http://pypi.python.org/packages/source/m/matplotlib/
+    mechanize  (we used version 0.2.5) http://pypi.python.org/packages/source/m/mechanize/
+    networkx   (we used version 1.6)   http://pypi.python.org/packages/source/n/networkx/
+
+And the following software:
+    ADMIXTURE  (we used version 1.22)  http://www.genetics.ucla.edu/software/admixture/
+    EIGENSOFT  (we used version 3.0)   http://genepath.med.harvard.edu/~reich/Software.htm
+    PHAST      (we used version 1.2.1) http://compgen.bscb.cornell.edu/phast/
+    QuickTree  (we used version 1.1)   http://www.sanger.ac.uk/resources/software/quicktree/

diff -r d4ec09e8079f -r 4b6590dd7250 add_fst_column.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/add_fst_column.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+#  <command interpreter="python">
+#    add_fst_column.py "$input" "$p1_input" "$p2_input" "$data_source" "$min_reads" "$min_qual" "$retain" "$discard_fixed" "$biased" "$output"
+#    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+#        #set $arg = '%s:%s' % ($individual_col, $individual)
+#        "$arg"
+#    #end for
+#  </command>
+
+import sys
+import subprocess
+from Population import Population
+
+################################################################################
+
+if len(sys.argv) < 12:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input, p1_input, p2_input, genotypes, min_reads, min_qual, retain, discard_fixed, biased, output = sys.argv[1:11]
+individual_metadata = sys.argv[11:]
+
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+p1 = Population()
+p1.from_population_file(p1_input)
+if not p_total.is_superset(p1):
+    print >> sys.stderr, 'There is an individual in population 1 that is not in the SNP table'
+    sys.exit(1)
+
+p2 = Population()
+p2.from_population_file(p2_input)
+if not p_total.is_superset(p2):
+    print >> sys.stderr, 'There is an individual in population 2 that is not in the SNP table'
+    sys.exit(1)
+
+################################################################################
+
+prog = 'Fst_column'
+
+args = []
+args.append(prog)
+args.append(input)
+args.append(genotypes)
+args.append(min_reads)
+args.append(min_qual)
+args.append(retain)
+args.append(discard_fixed)
+args.append(biased)
+
+columns = p1.column_list()
+for column in columns:
+    args.append('{0}:1'.format(column))
+
+columns = p2.column_list()
+for column in columns:
+    args.append('{0}:2'.format(column))
+
+fh = open(output, 'w')
+
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr)
+rc = p.wait()
+fh.close()
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 add_fst_column.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/add_fst_column.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,90 @@
+<tool id="gd_add_fst_column" name="Add an FST column" version="1.0.0">
+  <description>to a table</description>
+
+  <command interpreter="python">
+    add_fst_column.py "$input" "$p1_input" "$p2_input" "$data_source" "$min_reads" "$min_qual" "$retain" "$discard_fixed" "$biased" "$output"
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+        #set $arg = '%s:%s' % ($individual_col, $individual)
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="SNP table" />
+    <param name="p1_input" type="data" format="gd_indivs" label="Population 1 individuals" />
+    <param name="p2_input" type="data" format="gd_indivs" label="Population 2 individuals" />
+
+    <param name="data_source" type="select" format="integer" label="Data source">
+      <option value="0" selected="true">sequence coverage</option>
+      <option value="1">estimated genotype</option>
+    </param>
+
+    <param name="min_reads" type="integer" min="0" value="0" label="Minimum total read count for a population" />
+    <param name="min_qual" type="integer" min="0" value="0" label="Minimum individual genotype quality" />
+
+    <param name="retain" type="select" label="Special treatment">
+      <option value="0" selected="true">Skip row</option>
+      <option value="1">Set FST = -1</option>
+    </param>
+
+    <param name="discard_fixed" type="select" label="Apparently fixed SNPs">
+      <option value="0">Retain SNPs that appear fixed in the two populations</option>
+      <option value="1" selected="true">Delete SNPs that appear fixed in the two populations</option>
+    </param>
+
+    <param name="biased" type="select" label="FST estimator">
+      <option value="0" selected="true">Wright's original definition</option>
+      <option value="1">Weir's unbiased estimator</option>
+    </param>
+
+  </inputs>
+
+  <outputs>
+    <data name="output" format="gd_snp" metadata_source="input" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="p1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" />
+      <param name="p2_input" value="test_in/b.gd_indivs" ftype="gd_indivs" />
+      <param name="data_source" value="0" />
+      <param name="min_reads" value="3" />
+      <param name="min_qual" value="0" />
+      <param name="retain" value="0" />
+      <param name="discard_fixed" value="1" />
+      <param name="biased" value="0" />
+      <output name="output" file="test_out/add_fst_column/add_fst_column.gd_snp" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+The user specifies a SNP table and two "populations" of individuals,
+both previously defined using the Galaxy tool to select individuals from
+a SNP table.  No individual can be in both populations.  Other choices are
+as follows.
+
+Data source.  The allele frequencies of a SNP in the two populations can be
+estimated either by the total number of reads of each allele, or by adding
+the frequencies inferred from genotypes of individuals in the populations.
+
+After specifying the data source, the user sets a lower bound on the amount
+of data required at a SNP.  For estimating the Fst using read counts,
+the bound is the minimum total count of reads of the two alleles in a population.
+For estimations based on genotype, the bound is the minimum reported genotype
+quality per individual.
+
+The user specifies whether the SNPs that violate the lower bound should be
+ignored or the Fst set to -1.
+
+The user specifies whether SNPs where both populations appear to be fixed
+for the same allele should be retained or discarded.
+
+Finally, the user chooses which definition of Fst to use:  Wright's original
+definition or Weir's unbiased estimator.
+
+A column is appended to the SNP table giving the Fst for each retained SNP.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 average_fst.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/average_fst.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+
+import sys
+import subprocess
+from Population import Population
+
+################################################################################
+
+if len(sys.argv) < 12:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input, p1_input, p2_input, data_source, min_total_count, discard_fixed, biased, output, shuffles, p0_input = sys.argv[1:11]
+individual_metadata = sys.argv[11:]
+
+try:
+    shuffle_count = int(shuffles)
+except:
+    shuffle_count = 0
+
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+p1 = Population()
+p1.from_population_file(p1_input)
+if not p_total.is_superset(p1):
+    print >> sys.stderr, 'There is an individual in population 1 that is not in the SNP table'
+    sys.exit(1)
+
+p2 = Population()
+p2.from_population_file(p2_input)
+if not p_total.is_superset(p2):
+    print >> sys.stderr, 'There is an individual in population 2 that is not in the SNP table'
+    sys.exit(1)
+
+p0 = None
+if shuffle_count > 0:
+    p0 = Population()
+    p0.from_population_file(p0_input)
+    if not p_total.is_superset(p0):
+        print >> sys.stderr, 'There is an individual in population 0 that is not in the SNP table'
+        sys.exit(1)
+
+################################################################################
+
+prog = 'Fst_ave'
+
+args = []
+args.append(prog)
+args.append(input)
+args.append(data_source)
+args.append(min_total_count)
+args.append(discard_fixed)
+args.append(biased)
+args.append(shuffles)
+
+columns = p1.column_list()
+for column in columns:
+    args.append('{0}:1'.format(column))
+
+columns = p2.column_list()
+for column in columns:
+    args.append('{0}:2'.format(column))
+
+if p0 is not None:
+    columns = p0.column_list()
+    for column in columns:
+        args.append('{0}:0'.format(column))
+
+fh = open(output, 'w')
+
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr)
+rc = p.wait()
+fh.close()
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 average_fst.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/average_fst.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,112 @@
+<tool id="gd_average_fst" name="Average FST" version="1.0.0">
+  <description>of two populations</description>
+
+  <command interpreter="python">
+    average_fst.py "$input" "$p1_input" "$p2_input" "$data_source.ds_choice" "$data_source.min_value" "$discard_fixed" "$biased" "$output"
+    #if $use_randomization.ur_choice == '1'
+      "$use_randomization.shuffles" "$use_randomization.p0_input"
+    #else
+      "0" "/dev/null"
+    #end if
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+        #set $arg = '%s:%s' % ($individual_col, $individual)
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="SNP table" />
+    <param name="p1_input" type="data" format="gd_indivs" label="Population 1 individuals" />
+    <param name="p2_input" type="data" format="gd_indivs" label="Population 2 individuals" />
+
+    <conditional name="data_source">
+      <param name="ds_choice" type="select" format="integer" label="Data source">
+          <option value="0" selected="true">sequence coverage and ..</option>
+          <option value="1">estimated genotype and ..</option>
+      </param>
+      <when value="0">
+        <param name="min_value" type="integer" min="1" value="1" label="Minimum total read count for a population" />
+      </when>
+      <when value="1">
+        <param name="min_value" type="integer" min="1" value="1" label="Minimum individual genotype quality" />
+      </when>
+    </conditional>
+
+    <param name="discard_fixed" type="select" label="Apparently fixed SNPs">
+      <option value="0">Retain SNPs that appear fixed in the two populations</option>
+      <option value="1" selected="true">Delete SNPs that appear fixed in the two populations</option>
+    </param>
+
+    <param name="biased" type="select" label="FST estimator">
+      <option value="0" selected="true">Wright's original definition</option>
+      <option value="1">Weir's unbiased estimator</option>
+    </param>
+
+    <conditional name="use_randomization">
+      <param name="ur_choice" type="select" format="integer" label="Use randomization">
+        <option value="0" selected="true">No</option>
+        <option value="1">Yes</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="shuffles" type="integer" min="0" value="0" label="Shuffles" />
+        <param name="p0_input" type="data" format="gd_indivs" label="Individuals for randomization" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="txt" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="p1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" />
+      <param name="p2_input" value="test_in/b.gd_indivs" ftype="gd_indivs" />
+      <param name="ds_choice" value="0" />
+      <param name="min_value" value="3" />
+      <param name="discard_fixed" value="1" />
+      <param name="biased" value="0" />
+      <param name="ur_choice" value="0" />
+      <output name="output" file="test_out/average_fst/average_fst.txt" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+The user specifies a SNP table and two "populations" of individuals,
+both previously defined using the Galaxy tool to select individuals from
+a SNP table.  No individual can be in both populations.  Other choices are
+as follows.
+
+Data source.  The allele frequencies of a SNP in the two populations can be
+estimated either by the total number of reads of each allele, or by adding
+the frequencies inferred from genotypes of individuals in the populations.
+
+After specifying the data source, the user sets lower bounds on the amount
+of data required at a SNP.  For estimating the Fst using read counts,
+the bound is the minimum count of reads of the two alleles in a population.
+For estimations based on genotype, the bound is the minimum reported genotype
+quality per individual.  SNPs not meeting these lower bounds are ignored.
+
+The user specifies whether SNPs where both populations appear to be fixed
+for the same allele should be retained or discarded.
+
+The user chooses which definition of Fst to use: Wright's original definition
+or Weir's unbiased estimator.
+
+Finally, the user decides whether to use randomizations.  If so, then the
+user specifies how many randomly generated population pairs (retaining
+the numbers of individuals of the originals) to generate, as well as the
+"population" of additional individuals (not in the first two populations)
+that can be used in the randomization process.
+
+The program prints the average Fst for the original populations and the
+number of SNPs used to compute it.  If randomizations were requested,
+it prints the average Fst for each randomly generated population pair,
+ending with a summary that includes the maximum and average value, and the
+highest-scoring population pair.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 calclenchange.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/calclenchange.py Wed Sep 12 17:10:26 2012 -0400

[

b"@@ -0,0 +1,280 @@\n+#!/usr/bin/env python\n+# -*- coding: utf-8 -*-\n+#\n+# calclenchange.py\n+# \n+# Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>\n+# \n+# This program is free software; you can redistribute it and/or modify\n+# it under the terms of the GNU General Public License as published by\n+# the Free Software Foundation; either version 2 of the License, or\n+# (at your option) any later version.\n+# \n+# This program is distributed in the hope that it will be useful,\n+# but WITHOUT ANY WARRANTY; without even the implied warranty of\n+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n+# GNU General Public License for more details.\n+# \n+# You should have received a copy of the GNU General Public License\n+# along with this program; if not, write to the Free Software\n+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,\n+# MA 02110-1301, USA.\n+\n+import argparse,mechanize,os,sys\n+from decimal import Decimal,getcontext\n+from xml.etree.ElementTree import ElementTree,tostring\n+import networkx as nx\n+from copy import copy\n+\n+#method to rank the the pthways by mut. freq.\n+def rankdN(ltfreqs):\n+\tordvals=sorted(ltfreqs)#sort and reverse freqs.\n+\t#~ \n+\toutrnk=[]\n+\ttmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value\n+\tif tmpOri=='C':\n+\t\tif tmpMut!='C':\n+\t\t\ttmpChng0='C-%s'%tmpMut\n+\t\telse:\n+\t\t\ttmpChng0=Decimal('0')\n+\tcrank=1\n+\toutrnk.append([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank),tmpPthw])\n+\ttotalnvals=len(ordvals)\n+\tcnt=0\n+\twhile totalnvals>cnt:\n+\t\tcnt+=1\n+\t\ttmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()\n+\t\tif tmpOri=='C':\n+\t\t\tif tmpMut!='C':\n+\t\t\t\ttmpChng='C-%s'%tmpMut\n+\t\t\telse:\n+\t\t\t\ttmpChng=Decimal('0')\n+\t\tif tmpChng!=tmpChng0:\n+\t\t\tcrank=len(outrnk)+1\n+\t\t\ttmpChng0=tmpChng\n+\t\toutrnk.append([str(tmpChng),str(tmpOri),str(tmpMut),str(crank),tmpPthw])\n+\treturn outrnk\n+\n+#method to rank the the pthways by mut. 
freq.\n+def rankdAvr(ltfreqs):\n+\tordvals=sorted(ltfreqs)#sort and reverse freqs.\n+\t#~ \n+\toutrnk={}\n+\ttmpChng0,tmpOri,tmpMut,tmpPthw=ordvals.pop()#the highest possible value\n+\tif tmpOri=='I':\n+\t\tif tmpMut!='I':\n+\t\t\ttmpChng0='I-%s'%tmpMut\n+\t\telse:\n+\t\t\ttmpChng0=Decimal('0')\n+\tcrank=1\n+\toutrnk[tmpPthw]='\\t'.join([str(tmpChng0),str(tmpOri),str(tmpMut),str(crank)])\n+\ttotalnvals=len(ordvals)\n+\tcnt=0\n+\twhile totalnvals>cnt:\n+\t\tcnt+=1\n+\t\ttmpChng,tmpOri,tmpMut,tmpPthw=ordvals.pop()\n+\t\tif tmpOri=='I':\n+\t\t\tif tmpMut!='I':\n+\t\t\t\ttmpChng='I-%s'%tmpMut\n+\t\t\telse:\n+\t\t\t\ttmpChng=Decimal('0')\n+\t\tif tmpChng!=tmpChng0:\n+\t\t\tcrank=len(outrnk)+1\n+\t\t\ttmpChng0=tmpChng\n+\t\toutrnk[tmpPthw]='\\t'.join([str(tmpChng),str(tmpOri),str(tmpMut),str(crank)])\n+\treturn outrnk\n+\n+#this method takes as input a list of pairs of edges(beginNod,endNod) and returns a list of nodes with indegree 0 and outdegree 0\n+def returnstartanendnodes(edges):\n+\tlistID0st=set()#starts\n+\tlistOD0en=set()#end\n+\tfor beginNod,endNod in edges:# O(n)\n+\t\tlistID0st.add(beginNod)\n+\t\tlistOD0en.add(endNod)\n+\tstartNdsID0=listID0st.difference(listOD0en)\n+\tendNdsOD0=listOD0en.difference(listID0st)\n+\treturn startNdsID0,endNdsOD0\n+\n+#~ Method to return nodes and edges\n+def returnNodesNEdgesfKXML(fpthwKGXML):\n+\t#~ \n+\ttree = ElementTree()\n+\tptree=tree.parse(fpthwKGXML)\n+\t#~ \n+\ttitle=ptree.get('title')\n+\tprots=ptree.findall('entry')\n+\treactns=ptree.findall('reaction')\n+\t#~ \n+\tedges,ndstmp=set(),set()\n+\tnreactns=len(reactns)\n+\tcr=0#count reacts\n+\twhile nreactns>cr:\n+\t\tcr+=1\n+\t\treactn=reactns.pop()\n+\t\tmainid=reactn.get('id')\n+\t\tndstmp.add(mainid)#add node\n+\t\treacttyp=reactn.get('type')\n+\t\tsbstrts=reactn.findall('substrate')\n+\t\twhile len(sbstrts)>0:\n+\t\t\tcsbstrt=sbstrts.pop()\n+\t\t\tcsbtsid=csbstrt.get('id')\n+\t\t\tndstmp.add(csbtsid)#add node\n+\t\t\tif 
reacttyp=='irreversible':\n+\t\t\t\tedges.add((csbtsid,mainid))#add edges\n+\t\t\telif reacttyp=='reversible':\n+\t\t\t\tedges.add((mainid,csbtsid))#add edges\n+\t\t\t\tedges.add((csbtsid,mainid))#add edges\n+\t\t#~ \n+\t\tprdcts=reactn.findall('"..b" I the pathway name' )\n+\tparser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')\n+\tparser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')\n+\tparser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')\n+\t#~ \n+\t#~Open arguments \n+\tclass C(object):\n+\t\tpass\n+\tfulargs=C()\n+\tparser.parse_args(sys.argv[1:],namespace=fulargs)\n+\t#test input vars\n+\tinputf,loc_file,species,output,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.loc_file,fulargs.species,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn\n+\tposKEGGclmn-=1#correct pos\n+\tKgeneposcolmn-=1\n+\t#~ Get the extra variables\n+\tcrDB=[x.split() for x in open(loc_file).read().splitlines() if x.split()[0]==species][0]\n+\tsppPrefx,dinput=crDB[1],crDB[2]\n+\t#~ set decimal positions\n+\tgetcontext().prec = 3\n+\t#make a dictionary of valid genes\n+\tdKEGGcPthws=dict([(x.split('\\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()])\n+\tsdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1])\n+\twhile True:#to crrect names with more than one gene\n+\t\ttry:\n+\t\t\tmgenes=sdGenes.pop()\n+\t\t\tpthwsAssotd=dKEGGcPthws.pop(mgenes)\n+\t\t\tmgenes=mgenes.split('.')\n+\t\t\tfor eachg in mgenes:\n+\t\t\t\tdKEGGcPthws[eachg]=pthwsAssotd\n+\t\texcept:\n+\t\t\tbreak\n+\t#~ \n+\tlPthwsF=[x for x in os.listdir(dinput) if x.find('.xml')>-1 if x not in ['cfa04070.xml']]\n+\tnPthws=len(lPthwsF)\n+\tcPthw=0\n+\tlPthwPthN=[]#the output list for number of paths\n+\tlPthwPthAvr=[]#the 
output list for the length of paths\n+\t#~ \n+\twhile cPthw<nPthws:\n+\t\tcPthw+=1\n+\t\tKEGGpathw=lPthwsF.pop()\n+\t\tcomdKEGGpathw=KEGGpathw.split('.')[0]\n+\t\ttmpddGenrcgenPresent=set()\n+\t\tsKEGGc=dKEGGcPthws.keys()\n+\t\tlsKEGGc=len(sKEGGc)\n+\t\tctPthw=0\n+\t\twhile ctPthw < lsKEGGc:#to save memory\n+\t\t\teachK=sKEGGc.pop()\n+\t\t\talPthws=dKEGGcPthws[eachK]\n+\t\t\tif comdKEGGpathw in alPthws:\n+\t\t\t\ttmpddGenrcgenPresent.add(':'.join([sppPrefx,eachK]))\n+\t\t\tctPthw+=1\n+\t\t#~ Make graph calculations\t\n+\t\tdnodes,edges,title=returnNodesNEdgesfKXML(open(os.path.join(dinput,KEGGpathw)))\n+\t\tstartNdsID0,endNdsOD0=returnstartanendnodes(edges)\n+\t\tstartNdsOri=copy(startNdsID0)\n+\t\t#~ \n+\t\tnPaths='C'#stands for circuit\n+\t\tAvrgPthLen='I'#stand for infinite\n+\t\tif len(startNdsID0)>0 and len(endNdsOD0)>0:\n+\t\t\tnPaths,AvrgPthLen=rtrnAvrgLen(edges,startNdsID0,endNdsOD0)\n+\t\t#~ work with the genes in the list\n+\t\tgenestodel=set()\n+\t\tlnodes=len(dnodes)\n+\t\tsNds=set(dnodes)\n+\t\tctPthw=0\n+\t\twhile ctPthw<lnodes:\n+\t\t\tctPthw+=1\n+\t\t\tcNod=sNds.pop()\n+\t\t\tsgenes=dnodes.pop(cNod)\n+\t\t\tif len(sgenes.intersection(tmpddGenrcgenPresent))==len(sgenes):\n+\t\t\t\tgenestodel.add(cNod)\n+\t\t#~ del nodes from graph edges\n+\t\twnPaths,wAvrgPthLen=copy(nPaths),copy(AvrgPthLen)\n+\t\tif len(genestodel)>0:\n+\t\t\twedges=set([x for x in edges if len(set(x).intersection(genestodel))==0])\n+\t\t\twstartNds,wendNds=returnstartanendnodes(wedges)\n+\t\t\tif nPaths!='C':\n+\t\t\t\twstartNds=[x for x in wstartNds if x in startNdsOri]\n+\t\t\t\twendNds=[x for x in wendNds if x in endNdsOD0]\n+\t\t\tif len(wstartNds)>0 and len(wendNds)>0:\n+\t\t\t\twnPaths,wAvrgPthLen=rtrnAvrgLen(wedges,wstartNds,wendNds)\n+\t\t#~ Calculate the differences\n+\t\torNP,mutNP,oriLen,mutLen=nPaths,wnPaths,AvrgPthLen,wAvrgPthLen\n+\t\tif nPaths=='C':\n+\t\t\torNP=Decimal('1000')\n+\t\t\toriLen=Decimal('1000')\n+\t\tif 
wnPaths=='C':\n+\t\t\tmutNP=Decimal('1000')\n+\t\t\tmutLen=Decimal('1000')\n+\t\tlPthwPthN.append([orNP-mutNP,nPaths,wnPaths,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen\n+\t\tlPthwPthAvr.append([oriLen-mutLen,AvrgPthLen,wAvrgPthLen,'='.join([comdKEGGpathw,title])])#print nPaths,AvrgPthLen\n+\tdoutrnkPthN=rankdN(lPthwPthN)\n+\tdoutrnkPthAvr=rankdAvr(lPthwPthAvr)\n+\t#~ \n+\tsall=['\\t'.join([doutrnkPthAvr[x[4]],'\\t'.join(x)]) for x in doutrnkPthN]\n+\tsalef=open(output,'w')\n+\tsalef.write('\\n'.join(sall))\n+\tsalef.close()\n+\treturn 0\n+\t\n+\n+if __name__ == '__main__':\n+\tmain()\n+\n"

diff -r d4ec09e8079f -r 4b6590dd7250 calctfreq.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/calctfreq.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       calcfreq.py
+#
+#       Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse,os,sys
+from decimal import Decimal,getcontext
+from LocationFile import LocationFile
+
+#method to rank the the pthways by mut. freq.
+def rankd(ltfreqs):
+ ordvals=sorted(ltfreqs)#sort and reverse freqs.
+ #~
+ outrnk=[]
+ tmpFreq0,tmpCount,tmpPthw=ordvals.pop()#the highest possible value
+ crank=1
+ outrnk.append('\t'.join([str(tmpCount),str(tmpFreq0),str(crank),tmpPthw]))
+ totalnvals=len(ordvals)
+ cnt=0
+ while totalnvals>cnt:
+ cnt+=1
+ tmpFreq,tmpCount,tmpPthw=ordvals.pop()
+ if tmpFreq!=tmpFreq0:
+ crank=len(outrnk)+1
+ tmpFreq0=tmpFreq
+ outrnk.append('\t'.join([str(tmpCount),str(tmpFreq),str(crank),tmpPthw]))
+ return outrnk
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.')
+ parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
+ parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. Column 1 is the count of genes in the list, Column 2 is the percentage of the pathway genes present on the list. Column 3 is the rank based on column 2')
+ parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')
+ parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')
+ parser.add_argument('--loc_file',metavar='location file',type=str,help='location file')
+ parser.add_argument('--species',metavar='species',type=str,help='species')
+ #~Open arguments
+ class C(object):
+ pass
+ fulargs=C()
+ parser.parse_args(sys.argv[1:],namespace=fulargs)
+ #test input vars
+ inputf,outputf,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn
+ locf,species=fulargs.loc_file,fulargs.species
+ #make a dictionary of valid genes
+ posKEGGclmn-=1
+ Kgeneposcolmn-=1
+ dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set(x.split('\t')[posKEGGclmn].split('.'))) for x in open(inputf).read().splitlines()[1:] if x.strip()])
+ sdGenes=set([x for x in dKEGGcPthws.keys() if x.find('.')>-1])
+ while True:#to correct names with more than one gene
+ try:
+ mgenes=sdGenes.pop()
+ pthwsAssotd=dKEGGcPthws.pop(mgenes)
+ mgenes=mgenes.split('.')
+ for eachg in mgenes:
+ dKEGGcPthws[eachg]=pthwsAssotd
+ except:
+ break
+ #~ Count genes
+ getcontext().prec=2#set 2 decimal places
+
+ location_file = LocationFile(locf)
+ prefix, kxml_dir_path, dict_file = location_file.get_values(species)
+ dPthContsTotls = {}
+ try:
+     with open(dict_file) as fh:
+         for line in fh:
+             line = line.rstrip('\r\n')
+             value, key = line.split('\t')
+             dPthContsTotls[key] = int(value)
+ except IOError, err:
+     print >> sys.stderr, 'Error opening dict file {0}: {1}'.format(dict_file, err.strerror)
+     sys.exit(1)
+
+ dPthContsTmp=dict([(x,0) for x in dPthContsTotls.keys()])#create a list of genes
+ sdGenes=set([x for x in dKEGGcPthws.keys()])#list of all genes
+ cntGens=0
+ ltGens=len(sdGenes)
+ while cntGens<ltGens:
+ cGen=sdGenes.pop()
+ sKEGGcPthws=dKEGGcPthws.pop(cGen)
+ for eachP in sKEGGcPthws:
+ if eachP!='N':
+ dPthContsTmp[eachP]+=1
+ cntGens+=1
+ #~ Calculate Freqs.
+ ltfreqs=[((Decimal(dPthContsTmp[x])/Decimal(dPthContsTotls[x])),Decimal(dPthContsTmp[x]),x) for x in dPthContsTotls]
+ tabllfreqs='\n'.join(rankd(ltfreqs))
+ salef=open(outputf,'w')
+ salef.write(tabllfreqs)
+ salef.close()
+ return 0
+
+
+if __name__ == '__main__':
+ main()

diff -r d4ec09e8079f -r 4b6590dd7250 cdblib.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/cdblib.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+
+'''
+Manipulate DJB's Constant Databases. These are 2 level disk-based hash tables
+that efficiently handle many keys, while remaining space-efficient.
+
+    http://cr.yp.to/cdb.html
+
+When generated databases are only used with Python code, consider using hash()
+rather than djb_hash() for a tidy speedup.
+'''
+
+from _struct import Struct
+from itertools import chain
+
+
+def py_djb_hash(s):
+    '''Return the value of DJB's hash function for the given 8-bit string.'''
+    h = 5381
+    for c in s:
+        h = (((h << 5) + h) ^ ord(c)) & 0xffffffff
+    return h
+
+# Use djb_hash from the optional _cdblib module when it can be imported
+# (presumably a compiled accelerator -- confirm); otherwise fall back to the
+# pure-Python implementation above.
+try:
+    from _cdblib import djb_hash
+except ImportError:
+    djb_hash = py_djb_hash
+
+# Unpack / pack a pair of little-endian unsigned 32-bit integers (8 bytes).
+read_2_le4 = Struct('<LL').unpack
+write_2_le4 = Struct('<LL').pack
+
+
+class Reader(object):
+    '''A dictionary-like object for reading a Constant Database accessed
+    through a string or string-like sequence, such as mmap.mmap().'''
+
+    def __init__(self, data, hashfn=djb_hash):
+        '''Create an instance reading from a sequence and using hashfn to hash
+        keys.'''
+        if len(data) < 2048:
+            raise IOError('CDB too small')
+
+        self.data = data
+        self.hashfn = hashfn
+
+        self.index = [read_2_le4(data[i:i+8]) for i in xrange(0, 2048, 8)]
+        self.table_start = min(p[0] for p in self.index)
+        # Assume load load factor is 0.5 like official CDB.
+        self.length = sum(p[1] >> 1 for p in self.index)
+
+    def iteritems(self):
+        '''Like dict.iteritems(). Items are returned in insertion order.'''
+        pos = 2048
+        while pos < self.table_start:
+            klen, dlen = read_2_le4(self.data[pos:pos+8])
+            pos += 8
+
+            key = self.data[pos:pos+klen]
+            pos += klen
+
+            data = self.data[pos:pos+dlen]
+            pos += dlen
+
+            yield key, data
+
+    def items(self):
+        '''Like dict.items().'''
+        return list(self.iteritems())
+
+    def iterkeys(self):
+        '''Like dict.iterkeys().'''
+        return (p[0] for p in self.iteritems())
+    __iter__ = iterkeys
+
+    def itervalues(self):
+        '''Like dict.itervalues().'''
+        return (p[1] for p in self.iteritems())
+
+    def keys(self):
+        '''Like dict.keys().'''
+        return [p[0] for p in self.iteritems()]
+
+    def values(self):
+        '''Like dict.values().'''
+        return [p[1] for p in self.iteritems()]
+
+    def __getitem__(self, key):
+        '''Like dict.__getitem__().'''
+        value = self.get(key)
+        if value is None:
+            raise KeyError(key)
+        return value
+
+    def has_key(self, key):
+        '''Return True if key exists in the database.'''
+        return self.get(key) is not None
+    __contains__ = has_key
+
+    def __len__(self):
+        '''Return the number of records in the database.'''
+        return self.length
+
+    def gets(self, key):
+        '''Yield values for key in insertion order.'''
+        # Truncate to 32 bits and remove sign.
+        h = self.hashfn(key) & 0xffffffff
+        start, nslots = self.index[h & 0xff]
+
+        if nslots:
+            end = start + (nslots << 3)
+            slot_off = start + (((h >> 8) % nslots) << 3)
+
+            for pos in chain(xrange(slot_off, end, 8),
+                             xrange(start, slot_off, 8)):
+                rec_h, rec_pos = read_2_le4(self.data[pos:pos+8])
+
+                if not rec_h:
+                    break
+                elif rec_h == h:
+                    klen, dlen = read_2_le4(self.data[rec_pos:rec_pos+8])
+                    rec_pos += 8
+
+                    if self.data[rec_pos:rec_pos+klen] == key:
+                        rec_pos += klen
+                        yield self.data[rec_pos:rec_pos+dlen]
+
+    def get(self, key, default=None):
+        '''Get the first value for key, returning default if missing.'''
+        # Avoid exception catch when handling default case; much faster.
+        return chain(self.gets(key), (default,)).next()
+
+    def getint(self, key, default=None, base=0):
+        '''Get the first value for key converted it to an int, returning
+        default if missing.'''
+        value = self.get(key, default)
+        if value is not default:
+            return int(value, base)
+        return value
+
+    def getints(self, key, base=0):
+        '''Yield values for key in insertion order after converting to int.'''
+        return (int(v, base) for v in self.gets(key))
+
+    def getstring(self, key, default=None, encoding='utf-8'):
+        '''Get the first value for key decoded as unicode, returning default if
+        not found.'''
+        value = self.get(key, default)
+        if value is not default:
+            return value.decode(encoding)
+        return value
+
+    def getstrings(self, key, encoding='utf-8'):
+        '''Yield values for key in insertion order after decoding as
+        unicode.'''
+        return (v.decode(encoding) for v in self.gets(key))
+
+
+class Writer(object):
+    '''Object for building new Constant Databases, and writing them to a
+    seekable file-like object.'''
+
+    def __init__(self, fp, hashfn=djb_hash):
+        '''Create an instance writing to a file-like object, using hashfn to
+        hash keys.'''
+        self.fp = fp
+        self.hashfn = hashfn
+
+        fp.write('\x00' * 2048)
+        self._unordered = [[] for i in xrange(256)]
+
+    def put(self, key, value=''):
+        '''Write a string key/value pair to the output file.'''
+        assert type(key) is str and type(value) is str
+
+        pos = self.fp.tell()
+        self.fp.write(write_2_le4(len(key), len(value)))
+        self.fp.write(key)
+        self.fp.write(value)
+
+        h = self.hashfn(key) & 0xffffffff
+        self._unordered[h & 0xff].append((h, pos))
+
+    def puts(self, key, values):
+        '''Write more than one value for the same key to the output file.
+        Equivalent to calling put() in a loop.'''
+        for value in values:
+            self.put(key, value)
+
+    def putint(self, key, value):
+        '''Write an integer as a base-10 string associated with the given key
+        to the output file.'''
+        self.put(key, str(value))
+
+    def putints(self, key, values):
+        '''Write zero or more integers for the same key to the output file.
+        Equivalent to calling putint() in a loop.'''
+        self.puts(key, (str(value) for value in values))
+
+    def putstring(self, key, value, encoding='utf-8'):
+        '''Write a unicode string associated with the given key to the output
+        file after encoding it as UTF-8 or the given encoding.'''
+        self.put(key, unicode.encode(value, encoding))
+
+    def putstrings(self, key, values, encoding='utf-8'):
+        '''Write zero or more unicode strings to the output file. Equivalent to
+        calling putstring() in a loop.'''
+        self.puts(key, (unicode.encode(value, encoding) for value in values))
+
+    def finalize(self):
+        '''Write the final hash tables to the output file, and write out its
+        index. The output file remains open upon return.'''
+        index = []
+        for tbl in self._unordered:
+            length = len(tbl) << 1
+            ordered = [(0, 0)] * length
+            for pair in tbl:
+                where = (pair[0] >> 8) % length
+                for i in chain(xrange(where, length), xrange(0, where)):
+                    if not ordered[i][0]:
+                        ordered[i] = pair
+                        break
+
+            index.append((self.fp.tell(), length))
+            for pair in ordered:
+                self.fp.write(write_2_le4(*pair))
+
+        self.fp.seek(0)
+        for pair in index:
+            self.fp.write(write_2_le4(*pair))
+        self.fp = None # prevent double finalize()

diff -r d4ec09e8079f -r 4b6590dd7250 coverage_distributions.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coverage_distributions.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,187 @@
+#!/usr/bin/env python
+
+import os
+import errno
+import sys
+import shutil
+import subprocess
+from Population import Population
+import gd_composite
+
+################################################################################
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+################################################################################
+
+if len(sys.argv) < 7:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input, data_source, output, extra_files_path = sys.argv[1:5]
+
+individual_metadata = []
+population_info = []
+p1_input = None
+all_individuals = False
+
+for arg in sys.argv[5:]:
+    if arg == 'all_individuals':
+        all_individuals = True
+    elif len(arg) > 12 and arg[:12] == 'individuals:':
+        p1_input = arg[12:]
+    elif len(arg) > 11:
+        if arg[:11] == 'population:':
+            file, name = arg[11:].split(':', 1)
+            population_info.append((file, name))
+        elif arg[:11] == 'individual:':
+            individual_metadata.append(arg[11:])
+
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+################################################################################
+
+mkdir_p(extra_files_path)
+
+################################################################################
+
+prog = 'coverage'
+
+args = []
+args.append(prog)
+args.append(input)
+args.append(data_source)
+
+user_coverage_file = os.path.join(extra_files_path, 'coverage.txt')
+args.append(user_coverage_file)
+
+population_list = []
+
+if all_individuals:
+    tags = p_total.tag_list()
+elif p1_input is not None:
+    p1 = Population()
+    this_pop = Population()
+    this_pop.from_population_file(p1_input)
+    population_list.append(this_pop)
+    p1.from_population_file(p1_input)
+    if not p_total.is_superset(p1):
+        print >> sys.stderr, 'There is an individual in the population that is not in the SNP table'
+        sys.exit(1)
+    tags = p1.tag_list()
+else:
+    tags = []
+    for population_file, population_name in population_info:
+        population = Population()
+        this_pop = Population()
+        this_pop.from_population_file(population_file)
+        population_list.append(this_pop)
+        population.from_population_file(population_file)
+        if not p_total.is_superset(population):
+            print >> sys.stderr, 'There is an individual in the {} population that is not in the SNP table'.format(population_name)
+            sys.exit(1)
+        columns = population.column_list()
+        for column in columns:
+            tags.append('{0}:{1}'.format(column, population_name))
+
+for tag in tags:
+    args.append(tag)
+
+## text output
+coverage_file = 'coverage.txt'
+fh = open(coverage_file, 'w')
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr)
+rc = p.wait()
+fh.close()
+
+## graphical output
+fh = open(coverage_file)
+coverage2_file = 'coverage2.txt'
+ofh = open(coverage2_file, 'w')
+
+for line in fh:
+    line = line.rstrip('\r\n')
+    elems = line.split('\t')
+    name = elems.pop(0)
+    values = [ elems[0] ]
+    for idx in range(1, len(elems)):
+        val = str(float(elems[idx]) - float(elems[idx-1]))
+        values.append(val)
+    print >> ofh, '{0}\t{1}'.format(name, '\t'.join(values))
+
+fh.close()
+ofh.close()
+
+################################################################################
+
+prog = 'R'
+
+args = []
+args.append(prog)
+args.append('--vanilla')
+args.append('--quiet')
+
+_realpath = os.path.realpath(__file__)
+_script_dir = os.path.dirname(_realpath)
+r_script_file = os.path.join(_script_dir, 'coverage_plot.r')
+
+ifh = open(r_script_file)
+ofh = open('/dev/null', 'w')
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=ifh, stdout=ofh, stderr=None)
+rc = p.wait()
+ifh.close()
+ofh.close()
+
+pdf_file = os.path.join(extra_files_path, 'coverage.pdf')
+shutil.copy2('coverage.pdf', pdf_file)
+os.remove('coverage.pdf')
+os.remove(coverage2_file)
+
+################################################################################
+
+info_page = gd_composite.InfoPage()
+info_page.set_title('Coverage distributions Galaxy Composite Dataset')
+
+display_file = gd_composite.DisplayFile()
+display_value = gd_composite.DisplayValue()
+
+out_pdf = gd_composite.Parameter(name='coverage.pdf', value='coverage.pdf', display_type=display_file)
+out_txt = gd_composite.Parameter(name='coverage.txt', value='coverage.txt', display_type=display_file)
+
+info_page.add_output_parameter(out_pdf)
+info_page.add_output_parameter(out_txt)
+
+
+if data_source == '0':
+    data_source_value = 'sequence coverage'
+elif data_source == '1':
+    data_source_value = 'estimated genotype'
+
+in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value)
+
+info_page.add_input_parameter(in_data_source)
+
+if population_list:
+    misc_populations =  gd_composite.Parameter(name='Populations', value=population_list, display_type=gd_composite.DisplayPopulationList())
+    info_page.add_misc(misc_populations)
+else:
+    misc_individuals = gd_composite.Parameter(name='Individuals', value=tags, display_type=gd_composite.DisplayTagList())
+    info_page.add_misc(misc_individuals)
+
+
+
+
+with open (output, 'w') as ofh:
+    print >> ofh, info_page.render()
+
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 coverage_distributions.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coverage_distributions.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,75 @@
+<tool id="gd_coverage_distributions" name="Coverage" version="1.0.0">
+  <description>distributions</description>
+
+  <command interpreter="python">
+    coverage_distributions.py "$input" "0" "$output" "$output.files_path"
+    #if $individuals.choice == '0'
+      "all_individuals"
+    #else if $individuals.choice == '1'
+      #set $arg = 'individuals:%s' % str($individuals.p1_input)
+        "$arg"
+    #else if $individuals.choice == '2'
+      #for $population in $individuals.populations
+        #set $arg = 'population:%s:%s' % (str($population.p_input), str($population.p_input.name))
+        "$arg"
+      #end for
+    #end if
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+      #set $individual_arg = 'individual:%s:%s' % ($individual_col, $individual)
+      "$individual_arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="SNP table" />
+
+    <conditional name="individuals">
+      <param name="choice" type="select" label="Individuals">
+        <option value="0" selected="true">All</option>
+        <option value="1">Individuals in a population</option>
+        <option value="2">Population totals</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+      </when>
+      <when value="2">
+        <repeat name="populations" title="Population" min="1">
+          <param name="p_input" type="data" format="gd_indivs" label="individuals" />
+        </repeat>
+      </when>
+    </conditional>
+
+    
+  </inputs>
+
+  <outputs>
+    <data name="output" format="html" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="choice" value="0" />
+      <output name="output" file="test_out/coverage_distributions/coverage.html" ftype="html" compare="diff" lines_diff="2">
+        <extra_files type="file" name="coverage.pdf" value="test_out/coverage_distributions/coverage.pdf" compare="sim_size" delta = "1000"/>
+        <extra_files type="file" name="coverage.txt" value="test_out/coverage_distributions/coverage.txt" />
+      </output>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool reports distributions of SNP reliability indicators for
+individuals or populations.  The reliability is measured by the sequence
+coverage.  Textual and graphical reports are generated, where the text
+output gives the cumulative distributions.
+  </help>
+</tool>
+

diff -r d4ec09e8079f -r 4b6590dd7250 coverage_plot.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/coverage_plot.r Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,31 @@
+# Plot one coverage curve per row of coverage2.txt into coverage.pdf.
+# The first line of the file is skipped; the rest is read as tab-separated.
+x <- read.table('coverage2.txt', skip=1, sep='\t')
+
+# One curve per row.  Column 1 supplies the legend label and the remaining
+# columns the plotted values, which are divided by 100 before plotting
+# (presumably percentages -- confirm against the script that writes the file).
+individuals <- dim(x)[1]
+max_cov <- dim(x)[2] - 2
+max_val <- max(x[-1]) / 100
+colors <- rainbow(individuals)
+
+line_width = 3
+# Transposed so that column i of xt holds row i of x.
+xt = t(x)
+
+# x axis runs 0..max_cov, one position per value column.
+xvals <- c(0:max_cov)
+# First row of x without its label column.
+values <- as.numeric(as.vector(xt[,1][-1]))/100
+
+pdf(file='coverage.pdf', onefile=TRUE, width=10, height=6);
+
+# The first curve sets up the axes; the remaining rows are overlaid on it.
+plot(xvals, values, type='l', ylim=c(0, max_val), xlim=c(0, max_cov), col=colors[1], lwd=line_width, xlab="Coverage", ylab="Proportion")
+
+# Guarded because 2:individuals would count downwards (2, 1) for a single row.
+if (individuals > 1) {
+    for (i in 2:individuals) {
+        values <- as.numeric(as.vector(xt[,i][-1]))/100;
+        lines(xvals, values, col=colors[i], lwd=line_width);
+    }
+}
+
+
+# Legend labels come from the first column, in the same order as the colors.
+names <- as.vector(t(x[1]))
+legend(x='topright', legend=names, fill=colors, bty='n')
+
+dev.off()
+
+

diff -r d4ec09e8079f -r 4b6590dd7250 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,13 @@
+<?xml version="1.0"?>
+<datatypes>
+  <datatype_files>
+    <datatype_file name="wsf.py"/>
+  </datatype_files>
+  <registration>
+    <datatype extension="gd_indivs" type="galaxy.datatypes.wsf:Individuals" display_in_upload="true"/>
+    <datatype extension="gd_ped" type="galaxy.datatypes.wsf:Wped" display_in_upload="true"/>
+    <datatype extension="gd_snp" type="galaxy.datatypes.wsf:GDSnp" display_in_upload="true"/>
+    <datatype extension="gd_sap" type="galaxy.datatypes.wsf:GDSap" display_in_upload="true"/>
+  </registration>
+  <sniffers/>
+</datatypes>

diff -r d4ec09e8079f -r 4b6590dd7250 dpmix.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dpmix.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,162 @@
+#!/usr/bin/env python
+
+import errno
+import sys
+import os
+import subprocess
+from Population import Population
+import gd_composite
+from dpmix_plot import make_dpmix_plot
+from LocationFile import LocationFile
+
+################################################################################
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+def run_program(prog, args, stdout_file=None, space_to_tab=False):
+    #print "args: ", ' '.join(args)
+    p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    (stdoutdata, stderrdata) = p.communicate()
+    rc = p.returncode
+
+    if stdout_file is not None:
+        with open(stdout_file, 'w') as ofh:
+            lines = stdoutdata.split('\n')
+            for line in lines:
+                line = line.strip()
+                if line:
+                    if space_to_tab:
+                        line = line.replace(' ', '\t')
+                    print >> ofh, line
+
+    if rc != 0:
+        print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
+        print >> sys.stderr, stderrdata
+        sys.exit(1)
+
+################################################################################
+
+# Command line: 13 fixed positional arguments followed by at least one
+# individual tag, hence a minimum argv length of 15 including the script name.
+if len(sys.argv) < 15:
+    print "usage"
+    sys.exit(1)
+
+input, data_source, switch_penalty, ap1_input, ap2_input, p_input, output, output2, output2_dir, dbkey, ref_column, galaxy_data_index_dir, heterochromatin_loc_file = sys.argv[1:14]
+individual_metadata = sys.argv[14:]
+
+# Fixed values; both are passed to the dpmix program further down.
+chrom = 'all'
+add_logs = '0'
+
+# Look up a value for this dbkey in the .loc file; /dev/null is substituted
+# when get_values_if_exists() returns None.
+loc_path = os.path.join(galaxy_data_index_dir, heterochromatin_loc_file)
+location_file = LocationFile(loc_path)
+heterochrom_path = location_file.get_values_if_exists(dbkey)
+if heterochrom_path is None:
+    heterochrom_path = '/dev/null'
+
+# Populations in the order they are reported on the HTML page.
+population_list = []
+
+# Built from the individual tags given on the command line; used below as
+# the reference set every input population is checked against
+# (presumably all individuals of the SNP table -- confirm against the caller).
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+# Each input population is loaded from its file, recorded, and the script
+# exits with status 1 when p_total.is_superset() rejects it.
+ap1 = Population(name='Ancestral population 1')
+ap1.from_population_file(ap1_input)
+population_list.append(ap1)
+if not p_total.is_superset(ap1):
+    print >> sys.stderr, 'There is an individual in ancestral population 1 that is not in the SNP table'
+    sys.exit(1)
+
+ap2 = Population(name='Ancestral population 2')
+ap2.from_population_file(ap2_input)
+population_list.append(ap2)
+if not p_total.is_superset(ap2):
+    print >> sys.stderr, 'There is an individual in ancestral population 2 that is not in the SNP table'
+    sys.exit(1)
+
+p = Population(name='Potentially admixed')
+p.from_population_file(p_input)
+population_list.append(p)
+if not p_total.is_superset(p):
+    print >> sys.stderr, 'There is an individual in the population that is not in the SNP table'
+    sys.exit(1)
+
+# Directory that receives misc.txt and dpmix.pdf.
+mkdir_p(output2_dir)
+
+################################################################################
+# Create tabular file
+################################################################################
+
+misc_file = os.path.join(output2_dir, 'misc.txt')
+
+prog = 'dpmix'
+args = [ prog ]
+args.append(input)
+args.append(ref_column)
+args.append(chrom)
+args.append(data_source)
+args.append(add_logs)
+args.append(switch_penalty)
+args.append(heterochrom_path)
+args.append(misc_file)
+
+columns = ap1.column_list()
+for column in columns:
+    args.append('{0}:1:{1}'.format(column, ap1.individual_with_column(column).name))
+
+columns = ap2.column_list()
+for column in columns:
+    args.append('{0}:2:{1}'.format(column, ap2.individual_with_column(column).name))
+
+columns = p.column_list()
+for column in columns:
+    args.append('{0}:0:{1}'.format(column, p.individual_with_column(column).name))
+
+run_program(None, args, stdout_file=output, space_to_tab=True)
+
+################################################################################
+# Create pdf file
+################################################################################
+
+pdf_file = os.path.join(output2_dir, 'dpmix.pdf')
+make_dpmix_plot(dbkey, output, pdf_file, galaxy_data_index_dir)
+
+################################################################################
+# Create html
+################################################################################
+
+info_page = gd_composite.InfoPage()
+info_page.set_title('dpmix Galaxy Composite Dataset')
+
+display_file = gd_composite.DisplayFile()
+display_value = gd_composite.DisplayValue()
+
+out_pdf = gd_composite.Parameter(name='dpmix.pdf', value='dpmix.pdf', display_type=display_file)
+out_misc = gd_composite.Parameter(name='misc.txt', value='misc.txt', display_type=display_file)
+
+info_page.add_output_parameter(out_pdf)
+info_page.add_output_parameter(out_misc)
+
+if data_source == '0':
+    data_source_value = 'sequence coverage'
+elif data_source == '1':
+    data_source_value = 'estimated genotype'
+
+in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value)
+in_switch_penalty = gd_composite.Parameter(description='Switch penalty', value=switch_penalty, display_type=display_value)
+
+info_page.add_input_parameter(in_data_source)
+info_page.add_input_parameter(in_switch_penalty)
+
+misc_populations =  gd_composite.Parameter(name='Populations', value=population_list, display_type=gd_composite.DisplayPopulationList())
+
+info_page.add_misc(misc_populations)
+
+with open(output2, 'w') as ofh:
+    print >> ofh, info_page.render()
+
+sys.exit(0)
+
+

diff -r d4ec09e8079f -r 4b6590dd7250 dpmix.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dpmix.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,68 @@
+<tool id="gd_dpmix" name="Admixture" version="1.0.0">
+  <description>using dynamic programming</description>
+
+  <command interpreter="python">
+    dpmix.py "$input" "$data_source" "$switch_penalty" "$ap1_input" "$ap2_input" "$p_input" "$output" "$output2" "$output2.files_path" "$input.dataset.metadata.dbkey" "$input.dataset.metadata.ref" "$GALAXY_DATA_INDEX_DIR" "gd.heterochromatic.loc"
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+      #set $arg = '%s:%s' % ($individual_col, $individual)
+      "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="Dataset">
+      <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" />
+    </param>
+    <param name="ap1_input" type="data" format="gd_indivs" label="Ancestral population 1 individuals" />
+    <param name="ap2_input" type="data" format="gd_indivs" label="Ancestral population 2 individuals" />
+    <param name="p_input" type="data" format="gd_indivs" label="Potentially admixed individuals" />
+
+    <param name="data_source" type="select" format="integer" label="Data source">
+      <option value="0" selected="true">sequence coverage</option>
+      <option value="1">estimated genotype</option>
+    </param>
+
+    <param name="switch_penalty" type="integer" min="0" value="10" label="Switch penalty" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="tabular" />
+    <data name="output2" format="html" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="ap1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" />
+      <param name="ap2_input" value="test_in/b.gd_indivs" ftype="gd_indivs" />
+      <param name="p_input" value="test_in/c.gd_indivs" ftype="gd_indivs" />
+      <param name="data_source" value="0" />
+      <param name="switch_penalty" value="10" />
+
+      <output name="output" file="test_out/dpmix/dpmix.tabular" />
+
+      <output name="output2" file="test_out/dpmix/dpmix.html" ftype="html" compare="diff" lines_diff="2">
+        <extra_files type="file" name="dpmix.pdf" value="test_out/dpmix/dpmix.pdf" compare="sim_size" delta = "10000" />
+        <extra_files type="file" name="misc.txt" value="test_out/dpmix/misc.txt" />
+      </output>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+The user specifies two "ancestral" populations (i.e., sources for
+chromosomes) and a set of potentially admixed individuals, and chooses
+between the sequence coverage or the estimated genotypes to measure
+the similarity of genomic intervals in admixed individuals to the two
+classes of ancestral chromosomes.  The user also picks a "switch penalty",
+typically between 10 and 100.  For each potentially admixed individual,
+the program divides the genome into three "genotypes": (0) homozygous
+for the second ancestral population (i.e., both chromosomes from that
+population), (1) heterozygous, or (2) homozygous for the first ancestral
+population.  Parts of a chromosome that are labeled as "heterochromatic"
+are given the non-genotype, 3.  Smaller values of the switch penalty
+(corresponding to more ancient admixture events) generally lead to the
+reconstruction of more frequent changes between genotypes.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 dpmix_plot.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/dpmix_plot.py Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,297 @@\n+#!/usr/bin/env python\n+\n+import os\n+import sys\n+import math\n+import matplotlib as mpl\n+mpl.use(\'PDF\')\n+import matplotlib.pyplot as plt\n+from matplotlib.path import Path\n+import matplotlib.patches as patches\n+\n+################################################################################\n+\n+def build_chrom_len_dict(dbkey, galaxy_data_index_dir):\n+ chrom_len_root = os.path.join(galaxy_data_index_dir, \'shared/ucsc/chrom\')\n+ chrom_len_file = \'{0}.len\'.format(dbkey)\n+ chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)\n+\n+ chrom_len = {}\n+\n+ try:\n+ with open(chrom_len_path) as fh:\n+ for line in fh:\n+ line = line.rstrip(\'\\r\\n\')\n+ elems = line.split()\n+ if len(elems) == 2:\n+ chrom = elems[0]\n+ length = int(elems[1])\n+ chrom_len[chrom] = length\n+ except:\n+ pass\n+\n+ return chrom_len\n+\n+def parse_input_file(input_file):\n+ chroms = []\n+ individuals = []\n+ data = {}\n+ chrom_len = {}\n+\n+ with open(input_file) as fh:\n+ for line in fh:\n+ line = line.strip()\n+ if line:\n+ elems = line.split()\n+ chrom = elems[0]\n+ p1, p2, state = map(int, elems[1:4])\n+ id = elems[4]\n+\n+ if chrom not in chroms:\n+ chroms.append(chrom)\n+\n+ if id not in individuals:\n+ individuals.append(id)\n+\n+ data.setdefault(chrom, {})\n+ data[chrom].setdefault(id, [])\n+ data[chrom][id].append((p1, p2, state))\n+\n+ if p2 > chrom_len.setdefault(chrom, 0):\n+ chrom_len[chrom] = p2\n+\n+ return chroms, individuals, data, chrom_len\n+\n+def check_chroms(chroms, chrom_len, dbkey):\n+ error = 0\n+ for chrom in chroms:\n+ if chrom not in chrom_len:\n+ print >> sys.stderr, "Can\'t find length for {0} chromosome {1}".format(dbkey, chrom)\n+ error = 1\n+ if error:\n+ sys.exit(1)\n+\n+def check_data(data, chrom_len, dbkey):\n+ error = 0\n+ for chrom in data:\n+ chrom_beg = 0\n+ chrom_end = chrom_len[chrom]\n+ for individual in data[chrom]:\n+ for p1, p2, state in data[chrom][individual]:\n+ if p1 >= p2:\n+ print >> sys.stderr, 
"Bad data line: begin >= end: {0} {1} {2} {3}".format(chrom, p1, p2, state, individual)\n+ error = 1\n+ if p1 < chrom_beg or p2 > chrom_end:\n+ print >> sys.stderr, "Bad data line: outside {0} boundaries[{1} - {2}]: {3} {4} {5} {6}".format(dbkey, chrom_beg, chrom_end, chrom, p1, p2, state, individual)\n+ error = 1\n+ if error:\n+ sys.exit(1)\n+\n+def make_rectangle(p1, p2, color, bottom=0.0, top=1.0):\n+ verts = [\n+ (p1, bottom), # left, bottom\n+ (p1, top), # left, top\n+ (p2, top), # right, top\n+ (p2, bottom), # right, bottom\n+ (0.0, 0.0) # ignored\n+ ]\n+\n+ codes = [\n+ Path.MOVETO,\n+ Path.LINETO,\n+ Path.LINETO,\n+ Path.LINETO,\n+ Path.CLOSEPOLY\n+ ]\n+\n+ path = Path(verts, codes)\n+ return patches.PathPatch(path, facecolor=color, lw=0)\n+\n+def make_split_rectangle(p1, p2, top_color, bottom_color):\n+ patch1 = make_rectangle(p1, p2, bottom_color, top=0.5)\n+ patch2 = make_rectangle(p1, p2, top_color, bottom=0.5)\n+ return [patch1, patch2]\n+\n+def make_state_rectangle(p1, p2, state, chrom, individual):\n+ if state == 0:\n+ return [ make_rectangle(p1, p2, \'r\') ]\n+ elif state == 1:\n+ return make_split_rectangle(p1, p2, \'r\', \'g\')\n+ elif state == 2:\n+ return [ make_rectangle(p1, p2, \'g\') ]\n+ elif state == 3:\n+ return [ make_rectangle(p1, p2, \'#c7c7c7\') ]\n+ else:\n+ print >> sys.stderr, "Unknown state: {0}: {1} {2} {3} {4}".format(state,'..b" vals.append(int(x))\n+ x += d\n+\n+ vals = vals[1:]\n+\n+# if not loose:\n+# if vals[-1] < graph_max:\n+# vals.append(int(graph_max))\n+\n+ labels = []\n+ for val in vals:\n+ labels.append('{0}'.format(int(val/math.pow(10, digits))))\n+\n+# labels.append('{0:.1f}'.format(vals[-1]/math.pow(10, digits)))\n+\n+ return vals, labels\n+\n+################################################################################\n+\n+def make_dpmix_plot(input_dbkey, input_file, output_file, galaxy_data_index_dir):\n+ fs_chrom_len = build_chrom_len_dict(input_dbkey, galaxy_data_index_dir)\n+ chroms, individuals, data, 
chrom_len = parse_input_file(input_file)\n+\n+ for chrom in chrom_len.keys():\n+ if chrom in fs_chrom_len:\n+ chrom_len[chrom] = fs_chrom_len[chrom]\n+\n+ #check_chroms(chroms, chrom_len, input_dbkey)\n+ check_data(data, chrom_len, input_dbkey)\n+\n+ ## units below are inches\n+ top_space = 0.10\n+ chrom_space = 0.25\n+ chrom_height = 0.25\n+ ind_space = 0.10\n+ ind_height = 0.25\n+\n+ total_height = 0.0\n+ at_top = True\n+ for chrom in chroms:\n+ if at_top:\n+ total_height += (top_space + chrom_height)\n+ at_top = False\n+ else:\n+ total_height += (top_space + chrom_space + chrom_height)\n+ \n+ individual_count = 0\n+ for individual in individuals:\n+ if individual in data[chrom]:\n+ individual_count += 1\n+ total_height += individual_count * (ind_space + ind_height)\n+\n+ width = 7.5\n+ height = math.ceil(total_height)\n+\n+ bottom = 1.0\n+\n+ fig = plt.figure(figsize=(width, height))\n+\n+ at_top = True\n+ for_webb = False\n+\n+ for chrom in chroms:\n+ length = chrom_len[chrom]\n+ vals, labels = tick_foo(0, length)\n+\n+ if at_top:\n+ bottom -= (top_space + chrom_height)/height\n+ at_top = False\n+ else:\n+ bottom -= (top_space + chrom_space + chrom_height)/height\n+\n+ if not for_webb:\n+ ax = fig.add_axes([0.0, bottom, 1.0, chrom_height/height])\n+ plt.axis('off')\n+ plt.text(0.5, 0.5, chrom, fontsize=14, ha='center')\n+\n+ individual_count = 0\n+ for individual in individuals:\n+ if individual in data[chrom]:\n+ individual_count += 1\n+\n+ i = 0\n+ for individual in individuals:\n+ if individual in data[chrom]:\n+ i += 1\n+\n+ bottom -= (ind_space + ind_height)/height\n+ if not for_webb:\n+ # [left, bottom, width, height]\n+ ax1 = fig.add_axes([0.0, bottom, 0.09, ind_height/height])\n+ plt.axis('off')\n+ plt.text(1.0, 0.5, individual, fontsize=10, ha='right', va='center')\n+ # [left, bottom, width, height]\n+ ax2 = fig.add_axes([0.10, bottom, 0.88, ind_height/height], frame_on=False)\n+ ax2.set_xlim(0, length)\n+ ax2.set_ylim(0, 1)\n+ if i != 
individual_count:\n+ plt.axis('off')\n+ else:\n+ if not for_webb:\n+ ax2.tick_params(top=False, left=False, right=False, labelleft=False)\n+ ax2.set_xticks(vals)\n+ ax2.set_xticklabels(labels)\n+ else:\n+ plt.axis('off')\n+ for p1, p2, state in sorted(data[chrom][individual]):\n+ for patch in make_state_rectangle(p1, p2, state, chrom, individual):\n+ ax2.add_patch(patch)\n+\n+ plt.savefig(output_file)\n+\n+################################################################################\n+\n+if __name__ == '__main__':\n+ input_dbkey, input_file, output_file, galaxy_data_index_dir = sys.argv[1:5]\n+ make_dpmix_plot(input_dbkey, input_file, output_file, galaxy_data_index_dir)\n+ sys.exit(0)\n+\n"

diff -r d4ec09e8079f -r 4b6590dd7250 echo.bash
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/echo.bash Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Append every individual given on the command line to the output file,
+# one per line.
+#
+# usage: echo.bash INPUT OUTPUT INDIVIDUAL...
+
+if [ $# -lt 3 ]; then
+    echo "usage"
+    exit 1
+fi
+
+# The first argument is accepted but not otherwise used by this script.
+input="$1"
+output="$2"
+shift 2
+
+for individual in "$@"; do
+    # printf instead of echo: echo would treat a value such as "-n" or "-e"
+    # as an option and drop it from the output.
+    printf '%s\n' "$individual" >> "$output"
+done
+
+exit 0
+

diff -r d4ec09e8079f -r 4b6590dd7250 evaluate_population_numbers.bash
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/evaluate_population_numbers.bash Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# Run admixture with --cv once for every population count from 1 up to
+# MAX_POPULATIONS and collect the output lines containing "CV".
+#
+# usage: evaluate_population_numbers.bash INPUT_PED OUTPUT MAX_POPULATIONS
+
+# Exactly three arguments are required.
+if [ $# -ne 3 ]; then
+    echo "usage"
+    exit 1
+fi
+
+input_ped_file="$1"
+output_file="$2"
+max_populations="$3"
+
+# Resolved through PATH.
+ADMIXTURE=admixture
+
+for (( i=1; $i <= $max_populations; i++ )); do
+    # stderr is merged into stdout so matching lines from either stream are
+    # kept; results are appended, so the output accumulates across iterations.
+    $ADMIXTURE --cv "$input_ped_file" $i 2>&1 | grep CV >> "$output_file"
+done
+

diff -r d4ec09e8079f -r 4b6590dd7250 evaluate_population_numbers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/evaluate_population_numbers.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,56 @@
+<tool id="gd_evaluate_population_numbers" name="Evaluate" version="1.0.0">
+  <description>possible numbers of populations</description>
+
+  <command interpreter="bash">
+    evaluate_population_numbers.bash "${input.extra_files_path}/admix.ped" "$output" "$max_populations"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_ped" label="Dataset" />
+    <param name="max_populations" type="integer" min="1" value="5" label="Maximum number of populations" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="txt" />
+  </outputs>
+
+  
+
+  <help>
+**What it does**
+
+The user selects a set of data generated by the Galaxy tool to "prepare
+to look for population structure".  For all possible numbers K of ancestral
+populations, from 1 up to a user-specified maximum, this tool produces values
+that indicate how well the data can be explained as genotypes from individuals
+derived from K ancestral populations.  These values are computed by a 5-fold
+cross-validation procedure, so that a good choice for K will exhibit a low
+cross-validation error compared with other potential settings for K.
+
+**Acknowledgments**
+
+We use the program "Admixture", downloaded from
+
+http://www.genetics.ucla.edu/software/admixture/
+
+and described in the paper "Fast model-based estimation of ancestry in
+unrelated individuals" by David H. Alexander, John Novembre and Kenneth Lange,
+Genome Research 19 (2009), pp. 1655-1664. Admixture is called with the "--cv"
+flag to produce these values.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 extract_flanking_dna.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_flanking_dna.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception, err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    parser = OptionParser()
+    parser.add_option('--input',
+                        type='string', dest='input',
+                        help='file of selected SNPs')
+    parser.add_option('--output',
+                        type='string', dest='output',
+                        help='output file')
+    parser.add_option('--snps_loc',
+                        type='string', dest='snps_loc',
+                        help='snps .loc file')
+    parser.add_option('--scaffold_col',
+                        type="int", dest='scaffold_col',
+                        help='scaffold column in the input file')
+    parser.add_option('--pos_col',
+                        type="int", dest='pos_col',
+                        help='position column in the input file')
+    parser.add_option('--output_format',
+                        type="string", dest='output_format',
+                        help='output format, fasta or primer3')
+    parser.add_option('--species',
+                        type="string", dest='species',
+                        help='species')
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.snps_loc:
+        raise RuntimeError( 'missing --snps_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.output_format:
+        raise RuntimeError( 'missing --output_format option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    snpcalls_file = gd.get_filename_from_loc( options.species, options.snps_loc )
+    file_root, file_ext = os.path.splitext( snpcalls_file )
+    snpcalls_index_file = file_root + ".cdb"
+    snpcalls = gd.SnpcallsFile( data_file=snpcalls_file, index_file=snpcalls_index_file )
+
+    while snps.next():
+        seq, pos = snps.get_seq_pos()
+        flanking_dna = snpcalls.get_flanking_dna( sequence=seq, position=pos, format=options.output_format )
+        if flanking_dna:
+            out_fh.write( flanking_dna )
+
+    out_fh.close()
+
+if __name__ == "__main__":
+    main()
+

diff -r d4ec09e8079f -r 4b6590dd7250 extract_flanking_dna.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_flanking_dna.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,93 @@
+<tool id="gd_extract_flanking_dna" name="Extract" version="1.0.0">
+  <description>DNA flanking chosen SNPs</description>
+
+  <command interpreter="python">
+    extract_flanking_dna.py "--input=$input" "--output=$output" "--snps_loc=${GALAXY_DATA_INDEX_DIR}/gd.snps.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species"
+    #end if
+    "--output_format=$output_format"
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/>
+    <param name="output_format" type="select" format="integer" label="output format">
+        <option value="fasta" selected="true">FastA format</option>
+        <option value="primer3">Primer3 input</option>
+    </param>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="txt" name="output"/>
+  </outputs>
+
+  
+
+  <help>
+**What it does**
+
+  It reports a DNA segment containing each SNP, with up to 200 nucleotides on
+  either side of the SNP position, which is indicated by "n". Fewer nucleotides
+  are reported if the SNP is near an end of the assembled genome fragment.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    > chr2_75111355_75112576 314 A C
+    TATCTTCATTTTTATTATAGACTCTCTGAACCAATTTGCCCTGAGGCAGACTTTTTAAAGTACTGTGTAATGTATGAAGTCCTTCTGCTCAAGCAAATCATTGGCATGAAAACAGTTGCAAACTTATTGTGAGAGAAGAGTCCAAGAGTTTTAACAGTCTGTAAGTATATAGCCTGTGAGTTTGATTTCCTTCTTGTTTTTnTTCCAGAAACATGATCAGGGGCAAGTTCTATTGGATATAGTCTTCAAGCATCTTGATTTGACTGAGCGTGACTATTTTGGTTTGCAGTTGACTGACGATTCCACTGATAACCCAGTAAGTTTAAGCTGTTGTCTTTCATTGTCATTGCAATTTTTCTGTCTTTATACTAGGTCCTTTCTGATTTACATTGTTCACTGATT
+    > chr8_93901796_93905612 2471 A C
+    GCTGCCGCTGGATTTACTTCTGCTTGGGTCGAGAGCGGGCTGGATGGGTGAAGAGTGGGCTCCCCGGCCCCTGACCAGGCAGGTGCAGACAAGTCGGAAGAAGGCCCGCCGCATCTCCTTGCTGGCCAGCGTGTAGATGACGGGGTTCATGGCAGAGTTGAGCACGGCCAGCACGATGAACCACTGGGCCTTGAACAGGATnGCGCACTCCTTCACCTTGCAGGCCACATCCACAAGGAAAAGGATGAAGAGTGGGGACCAGCAGGCGATGAACACGCTCACCACGATCACCACGGTCCGCAGCAGGGCCATGGACCGCTCTGAGTTGTGCGGGCTGGCCACCCTGCGGCTGCTGGACTTCACCAGGAAGTAGATGCGTGCGTACAGGATCACGATGGTCAC
+    > chr10_7434473_7435447 524 T C
+    ATTATTAACAGAAACATTTCTTTTTCATTACCCAGGGGTTACACTGGTCGTTGATGTTAATCAGTTTTTGGAGAAGGAGAAGCAAAGTGATATTTTGTCTGTTCTGAAGCCTGCCGTTGGTAATACAAATGACGTAATCCCTGAATGTGCTGACAGGTACCATGACGCCCTGGCAAAAGCAAAAGAGCAAAAATCTAGAAGnGGTAAGCATCTTCACTGTTTAGCACAAATTAAATAGCACTTTGAATATGATGATTTCTGTGGTATTGTGTTATCTTACTTTTGAGACAAATAATCGCTTTCAAATGAATATTTCTGAATGTTTGTCATCTCTGGCAAGGAAATTTTTTAGTGTTTCTTTTCCTTTTTTGTCTTTTGGAAATCTGTGATTAACTTGGTGGC
+    > chr14_80021455_80022064 138 G A
+    ACCCAGGGATCAAACCCAGGTCTCCCGCATTGCAGGCGGATTCTTTACTGTCTGAGCCTCCAGGGAAGCCCTCGGGGCTGAAGGGATGGTTATGAAGGTGAGAAACAGGGGCCACCTGTCCCCAAGGTACCTTGCGACnTGCCATCTGCGCTCCACCAGTAAATGGACGTCTTCGATCCTTCTGTTGTTGGCGTAGTGCAAACGTTTGGGAAGGTGCTGTTTCAAGTAAGGCTTAAAGTGCTGGTCTGGTTTTTTACACTGAAATATAAATGGACATTGGATTTTGCAATGGAGAGTCTTCTAGAAGAGTCCAAGACATTCTCTCCAGAAAGCTGAAGG
+    > chr15_64470252_64471048 89 G A
+    TGTGTGTGTGTGTGTGTGTGTGTGCCTGTGTCTGTACATGCACACCACGTGGCCTCACCCAGTGCCCTCAGCTCCATGGTGATGTCCACnTAGCCGTGCTCCGCGCTGTAGTACATGGCCTCCTGGAGGGCCTTGGTGCGCGTCCGGCTCAGGCGCATGGGCCCCTCGCTGCCGCTGCCCTGGCTGGATGCATCGCTCTCTTCCACGCCCTCAGCCAGGATCTCCTCCAGGGACAGCACATCTGCTTTGGCCTGCTGTGGCTGAGTCAGGAGCTTCCTCAGGACGTTCCT
+    etc.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 extract_primers.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_primers.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception, err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    parser = OptionParser()
+    parser.add_option('--input',
+                        type='string', dest='input',
+                        help='file of selected SNPs')
+    parser.add_option('--output',
+                        type='string', dest='output',
+                        help='output file')
+    parser.add_option('--primers_loc',
+                        type='string', dest='primers_loc',
+                        help='primers .loc file')
+    parser.add_option('--scaffold_col',
+                        type="int", dest='scaffold_col',
+                        help='scaffold column in the input file')
+    parser.add_option('--pos_col',
+                        type="int", dest='pos_col',
+                        help='position column in the input file')
+    parser.add_option('--species',
+                        type="string", dest='species',
+                        help='species')
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.primers_loc:
+        raise RuntimeError( 'missing --primers_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc )
+
+    file_root, file_ext = os.path.splitext( primer_data_file )
+    primer_index_file = file_root + ".cdb"
+    primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file )
+
+    while snps.next():
+        seq, pos = snps.get_seq_pos()
+        primer = primers.get_entry( seq, pos )
+        if primer:
+            out_fh.write( primer )
+
+    out_fh.close()
+
+if __name__ == "__main__":
+    main()
+

diff -r d4ec09e8079f -r 4b6590dd7250 extract_primers.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/extract_primers.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,88 @@
+<tool id="gd_extract_primers" name="Extract primers" version="1.0.0">
+  <description>for selected SNPs</description>
+
+  <!-- Parameters declared inside the override_metadata conditional are
+       referenced through it, the same way find_intervals.xml does. -->
+  <command interpreter="python">
+    extract_primers.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=${override_metadata.scaf_col}" "--pos_col=${override_metadata.pos_col}" "--species=${override_metadata.species}"
+    #end if
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="txt" name="output"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_out/select_snps/select_snps.gd_snp" ftype="gd_snp" />
+      <param name="choice" value="0"/>
+      <output name="output" file="test_out/extract_primers/extract_primers.txt" />
+    </test>
+  </tests>
+
+
+  <help>
+**What it does**
+
+  This tool extracts primers for SNPs in the dataset using the Primer3 program.
+  The first line of output for a given SNP reports the name of the assembled
+  contig, the SNP's position in the contig, the two variant nucleotides, and
+  Primer3's "pair penalty".  The next line, if not blank, names restriction
+  enzymes (from the user-adjustable list) that differentially cut at that
+  site, but do not cut at any other position between and including the
+  primer positions.  The next lines show the SNP's flanking regions, with
+  the SNP position indicated by "n", including the primer positions and an
+  additional 3 nucleotides.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr5_30800874_30802049    734   G  A  chr5   30801606   A  24  0  99   4  11  97   Y  496  0.502  0.033  0.215  6
+    chr8_55117827_55119487    994   A  G  chr8   55118815   G  25  0  102  4  11  96   Y  22   0.502  0.025  2.365  1
+    chr9_100484836_100485311  355   C  T  chr9   100485200  T  27  0  108  6  17  100  Y  190  0.512  0.880  2.733  4
+    chr12_3635530_3637738     2101  T  C  chr12  3637630    T  25  0  102  4  13  93   Y  169  0.554  0.024  0.366  4
+
+- output file::
+
+    chr5_30800874_30802049 734 G A 0.352964
+     BglII,MboI,Sau3AI,Tru9I,XhoII
+      1 CTGAAGGTGAGCAGGATTCAGGAGACAGAAAACAAAGCCCAGGCCTGCCCAAGGTGGAAA
+           >>>>>>>>>>>>>>>>>>>>
+
+     61 AGTCTAACAACTCGCCCTCTGCTTAnATCTGAGACTCACAGGGATAATAACACACTTGGT
+
+
+     21 CAAGGAATAAACTAGATATTATTCACTCCTCTAGAAGGCTGCCAGGAAAATTGCCTGACT
+                                                             <<<<<<<
+
+    181 TGAACCTTGGCTCTGA
+        <<<<<<<<<<<<<
+    etc.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 find_intervals.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_intervals.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+
+import errno
+import os
+import subprocess
+import sys
+
+################################################################################
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+def run_program(prog, args, stdout_file=None):
+    #print "args:", ' '.join(args)
+    p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    (stdoutdata, stderrdata) = p.communicate()
+    rc = p.returncode
+
+    if stdout_file is not None:
+        with open(stdout_file, 'w') as ofh:
+            print >> ofh, stdoutdata.rstrip('\r\n')
+
+    if rc != 0:
+        print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
+        print >> sys.stderr, stderrdata
+        sys.exit(1)
+
+################################################################################
+
+if len(sys.argv) != 11:
+    print "usage"
+    sys.exit(1)
+
+input, dbkey, output, output_files_path, chrom_col, pos_col, score_col, shuffles, cutoff, report_snps = sys.argv[1:11]
+
+prog = 'sweep'
+
+args = [ prog ]
+args.append(input)
+args.append(chrom_col)
+args.append(pos_col)
+args.append(score_col)
+args.append(cutoff)
+args.append(shuffles)
+args.append(report_snps)
+
+run_program(None, args, stdout_file=output)
+
+if report_snps == "0":
+    sys.exit(0)
+
+################################################################################
+
+mkdir_p(output_files_path)
+
+bedgraph_filename = 'bedgraph.txt'
+links_filename = os.path.join(output_files_path, 'links.txt')
+
+data = []
+links_data = []
+
+with open(output) as fh:
+    chrom = None
+    for line in fh:
+        line = line.rstrip('\r\n')
+        if not line:
+            continue
+        if line[0] != ' ':
+            # chrom line, add a link
+            chrom, interval_begin, interval_end, interval_value = line.split('\t')
+            links_data.append((chrom, int(interval_begin), int(interval_end)))
+        else:
+            # data line, add a bedgraph line
+            begin, value = line.split()
+            data.append((chrom, int(begin), value))
+
+with open(bedgraph_filename, 'w') as ofh:
+    print >> ofh, 'track type=bedGraph'
+    for chrom, begin, value in sorted(data):
+        print >> ofh, chrom, begin, begin+1, value
+
+with open(links_filename, 'w') as ofh:
+    for chrom, begin, end in sorted(links_data):
+        print >> ofh, chrom, begin, end
+
+################################################################################
+
+chrom_sizes_filename = '{0}.chrom.sizes'.format(dbkey)
+
+prog = 'fetchChromSizes'
+
+args = [ prog ]
+args.append(dbkey)
+
+run_program(None, args, stdout_file=chrom_sizes_filename)
+
+################################################################################
+
+prog = 'bedGraphToBigWig'
+
+args = [ prog ]
+args.append(bedgraph_filename)
+args.append(chrom_sizes_filename)
+args.append(output)
+
+run_program(None, args)
+
+################################################################################
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 find_intervals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/find_intervals.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,100 @@
+<tool id="gd_find_intervals" name="Find remarkable" version="1.0.0">
+  <description>genomic intervals</description>
+
+  <command interpreter="python">
+    find_intervals.py "$input" "$input.metadata.dbkey" "$output" "$output.files_path"
+
+    #if $override_metadata.choice == "0"
+      "$input.metadata.ref" "$input.metadata.rPos"
+    #else
+      "$override_metadata.ref_col" "$override_metadata.rpos_col"
+    #end if
+
+    "$score_col" "$shuffles"
+
+    #if $cutoff.type == 'percentage'
+      "$cutoff.cutoff_pct"
+    #else
+      "=$cutoff.cutoff_val"
+    #end if
+
+    "$out_format"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Input">
+      <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" />
+    </param>
+
+    <param name="score_col" type="data_column" data_ref="input" numerical="true" label="Column with score"/>
+
+    <conditional name="cutoff">
+      <param name="type" type="select" label="Cutoff type">
+        <option value="percentage">percentage</option>
+        <option value="value">value</option>
+      </param>
+      <when value="percentage">
+        <param name="cutoff_pct" type="float" value="95" min="0" max="100" label="Percentage cutoff"/>
+      </when>
+      <when value="value">
+        <param name="cutoff_val" type="float" value="0.0" label="Value cutoff"/>
+      </when>
+    </conditional>
+
+    <param name="shuffles" type="integer" min="0" value="0" label="Number of randomizations"/>
+
+    <param name="out_format" type="select" format="integer" label="Report SNPs">
+      <option value="0" selected="true">No</option>
+      <option value="1">Yes</option>
+    </param>
+
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="Choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="ref_col" type="data_column" data_ref="input" numerical="false" label="Column with reference chromosome"/>
+        <param name="rpos_col" type="data_column" data_ref="input" numerical="true" label="Column with reference position"/>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="interval">
+        <change_format>
+            <when input="out_format" value="1" format="bigwigpos" />
+        </change_format>
+    </data>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="score_col" value="5" />
+      <param name="type" value="value" />
+      <param name="cutoff_val" value="700.0" />
+      <param name="shuffles" value="10" />
+      <param name="out_format" value="0" />
+      <param name="choice" value="0" />
+
+      <output name="output" file="test_out/find_intervals/find_intervals.interval" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+The user selects a SNP table and specifies the columns containing (1)
+chromosome, (2) position, (3) scores (such as an Fst-value for the SNP), (4)
+a percentage or raw score for the "cutoff" and (5) the number of times the
+data should be randomized (only intervals with score exceeding the maximum for
+the randomized data are reported).  If a percentage (e.g. 95%) is specified
+for #4, then that percentile of the scores is used as the cutoff; this may
+not work well if many SNPs have the same score.  The program subtracts the
+cutoff from every score, then finds genomic intervals (i.e., consecutive runs
+of SNPs) whose total score cannot be increased by adding or subtracting one
+or more SNPs at the ends of the interval.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 gd_composite.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gd_composite.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,128 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import pkg_resources
+pkg_resources.require( "Cheetah" )
+from Cheetah.Template import Template
+
+import errno
+import os
+from datetime import datetime
+
+################################################################################
+
+def die(message):
+    print >> sys.stderr, message
+    sys.exit(1)
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+################################################################################
+
+class Display(object):
+    """Default display strategy, used by Parameter when no display_type is given."""
+    def display(self, parameter):
+        # NOTE(review): unlike the subclasses, which return a string, this
+        # prints the parameter to stdout and returns None - confirm intended.
+        print parameter
+
+class DisplayFile(Display):
+    def display(self, parameter):
+        return '<a href="{0}">{1}</a>'.format(parameter.value, parameter.name)
+
+class DisplayValue(Display):
+    def display(self, parameter):
+        if parameter.value is not None:
+            return '{0}: {1}'.format(parameter.description, parameter.value)
+        else:
+            return '{0}'.format(parameter.description)
+
+class DisplayTagList(Display):
+    def display(self, parameter):
+        rv = []
+        if parameter.name:
+            rv.append(parameter.name)
+        rv.append('<ol>')
+        for tag in parameter.value:
+            col, individual_name = tag.split(':')
+            rv.append('<li>{0}</li>'.format(individual_name))
+        rv.append('</ol>')
+        return '\n'.join(rv)
+
+class DisplayPopulationList(Display):
+    def display(self, parameter):
+        rv = []
+        rv.append('Populations')
+        rv.append('<ul>')
+        for population in parameter.value:
+            rv.append('<li>')
+            if population.name is not None:
+                rv.append(population.name)
+            rv.append('<ol>')
+            for name in population.individual_names():
+                rv.append('<li>{0}</li>'.format(name))
+            rv.append('</ol>')
+            rv.append('</li>')
+        rv.append('</ul>')
+        return '\n'.join(rv)
+
+#    def display(self, parameter, name=''):
+#        print '<ul> {0}'.format(name)
+#        for individual_name in parameter.individual_names():
+#            print '<li>{0}>/li>'.format(individual_name)
+#        print '</ul>'
+
+
+class Parameter(object):
+    def __init__(self, name=None, value=None, description=None, display_type=None):
+        self.name = name
+        self.value = value
+        self.description = description
+        if display_type is None:
+            self.display_type = Display()
+        else:
+            self.display_type = display_type
+
+    def display(self):
+        return self.display_type.display(self)
+
+class InfoPage(object):
+    """Collects the title, inputs, outputs and misc item of a report page.
+
+    The page is rendered with the Cheetah template gd_composite_template.html,
+    looked up in the directory that holds this script.
+    """
+    # Resolve symlinks first so the template is found next to the real file.
+    _realpath = os.path.realpath(__file__)
+    _script_dir = os.path.dirname(_realpath)
+    template_file = os.path.join(_script_dir, 'gd_composite_template.html')
+    def __init__(self):
+        """Record the creation time and load the template text."""
+        # 12-hour clock with AM/PM marker, e.g. '2012-09-12 05:10:26 PM'
+        self.timestamp = datetime.now().strftime('%Y-%m-%d %I:%M:%S %p')
+        self.title = 'Genome Diversity Composite Dataset'
+        self.inputs = []
+        self.outputs = []
+        self.misc = ''
+        self.template = self.load_template()
+
+    def load_template(self):
+        """Return the template file's text with trailing CR/LF removed."""
+        with open(self.template_file) as f:
+            return f.read().rstrip('\r\n')
+
+    def set_title(self, title):
+        """Replace the page title."""
+        self.title = title
+
+    def add_input_parameter(self, parameter):
+        """Append `parameter` to the list of inputs."""
+        self.inputs.append(parameter)
+
+    def add_output_parameter(self, parameter):
+        """Append `parameter` to the list of outputs."""
+        self.outputs.append(parameter)
+
+    def add_misc(self, misc):
+        """Set the misc item; any previously set value is replaced."""
+        self.misc = misc
+
+    def render(self):
+        """Return a Cheetah Template built from the loaded text, with this page bound to `tool`."""
+        return Template(self.template, searchList=[{'tool': self}])
+
+
+
+
+
+
+
+

diff -r d4ec09e8079f -r 4b6590dd7250 gd_composite_template.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gd_composite_template.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,40 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>${tool.title}</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: $tool.timestamp
+      <p/>
+      #if $tool.outputs
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+          #for output in $tool.outputs
+            <li>${output.display()}</li>
+          #end for
+        </ul>
+      </div>
+      #end if
+      #if $tool.inputs
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+          #for input in $tool.inputs
+            <li>${input.display()}</li>
+          #end for
+        </ul>
+      </div>
+      #end if
+      #if $tool.misc
+      <div id="gd_misc">
+        $tool.misc.display()
+      </div>
+      #end if
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity.py Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,266 @@\n+#!/usr/bin/env python\n+\n+import sys\n+import cdblib\n+\n+def _openfile( filename=None, mode=\'r\' ):\n+ try:\n+ fh = open( filename, mode )\n+ except IOError, err:\n+ raise RuntimeError( "can\'t open file: %s\\n" % str( err ) )\n+ return fh\n+\n+def get_filename_from_loc( species=None, filename=None ):\n+ fh = _openfile( filename )\n+ for line in fh:\n+ if line and not line.startswith( \'#\' ):\n+ line = line.rstrip( \'\\r\\n\' )\n+ if line:\n+ elems = line.split( \'\\t\' )\n+ if len( elems ) >= 2 and elems[0] == species:\n+ return elems[1]\n+\n+ raise RuntimeError( "can\'t find \'%s\' in location file: %s\\n" % ( species, filename ) )\n+\n+\n+class SnpFile( object ):\n+ def __init__( self, filename=None, seq_col=1, pos_col=2, ref_seq_col=7, ref_pos_col=8 ):\n+ self.filename = filename\n+ self.fh = _openfile( filename )\n+ self.seq_col = seq_col\n+ self.pos_col = pos_col\n+ self.ref_seq_col = ref_seq_col\n+ self.ref_pos_col = ref_pos_col\n+ self.elems = None\n+ self.line = None\n+ self.comments = []\n+\n+ def next( self ):\n+ while self.fh:\n+ try:\n+ self.line = self.fh.next()\n+ except StopIteration:\n+ self.line = None\n+ self.elems = None\n+ return None\n+ if self.line:\n+ self.line = self.line.rstrip( \'\\r\\n\' )\n+ if self.line:\n+ if self.line.startswith( \'#\' ):\n+ self.comments.append( self.line )\n+ else:\n+ self.elems = self.line.split( \'\\t\' )\n+ return 1\n+\n+ def get_seq_pos( self ):\n+ if self.elems:\n+ return self.elems[ self.seq_col - 1 ], self.elems[ self.pos_col - 1 ]\n+ else:\n+ return None, None\n+\n+ def get_ref_seq_pos( self ):\n+ if self.elems:\n+ return self.elems[ self.ref_seq_seq - 1 ], self.elems[ self.ref_pos_col - 1 ]\n+ else:\n+ return None, None\n+\n+\n+class IndexedFile( object ):\n+\n+ def __init__( self, data_file=None, index_file=None ):\n+ self.data_file = data_file\n+ self.index_file = index_file\n+ self.data_fh = _openfile( data_file )\n+ self.index_fh = _openfile( index_file )\n+ self._reader = 
cdblib.Reader( self.index_fh.read(), hash )\n+\n+ def get_indexed_line( self, key=None ):\n+ line = None\n+ if key in self._reader:\n+ offset = self._reader.getint( key )\n+ self.data_fh.seek( offset )\n+ try:\n+ line = self.data_fh.next()\n+ except StopIteration:\n+ raise RuntimeError( \'index file out of sync for %s\' % key )\n+ return line\n+\n+class PrimersFile( IndexedFile ):\n+ def get_primer_header( self, sequence=None, position=None ):\n+ key = "%s %s" % ( str( sequence ), str( position ) )\n+ header = self.get_indexed_line( key )\n+ if header:\n+ if header.startswith( \'>\' ):\n+ elems = header.split()\n+ if len( elems ) < 3:\n+ raise RuntimeError( \'short primers header for %s\' % key )\n+ if sequence != elems[1] or str( position ) != elems[2]:\n+ raise RuntimeError( \'primers index for %s finds %s %s\' % ( key, elems[1], elems[2] ) )\n+ else:\n+ raise RuntimeError( \'primers index out of sync for %s\' % key )\n+ return header\n+\n+ def get_entry( self, sequence=None, position=None ):\n+ entry = self.get_primer_header( sequence, position )\n+ if entry:\n+ while self.data_fh:\n+ try:\n+ line = self.data_fh.next()\n+ except StopIteration:\n+ break\n+ if line.startswith( \'>\' ):\n+ break\n+ entry += lin'..b'\n+ return None\n+\n+ def get_flanking_dna( self, sequence=None, position=None, format=\'fasta\' ):\n+ if format != \'fasta\' and format != \'primer3\':\n+ raise RuntimeError( \'invalid format for flanking dna: %s\' % str( format ) )\n+ seq = self.get_snp_seq( sequence, position )\n+ if seq:\n+ p = seq.find(\'[\')\n+ if p == -1:\n+ raise RuntimeError( \'snpcalls entry for %s %s missing left bracket: %s\' % ( str( sequence ), str( position ), seq ) )\n+ q = seq.find(\']\', p + 1)\n+ if q == -1:\n+ raise RuntimeError( \'snpcalls entry for %s %s missing right bracket: %s\' % ( str( sequence ), str( position ), seq ) )\n+ q += 1\n+\n+ if format == \'fasta\':\n+ flanking_seq = \'> \'\n+ else:\n+ flanking_seq = \'SEQUENCE_ID=\'\n+\n+ flanking_seq += "%s %s %s 
%s\\n" % ( str( sequence ), str( position ), seq[p+1], seq[p+3] )\n+\n+ if format == \'primer3\':\n+ flanking_seq += \'SEQUENCE_TEMPLATE=\'\n+\n+ flanking_seq += "%sn%s\\n" % ( seq[0:p], seq[q:] )\n+\n+ if format == \'primer3\':\n+ flanking_seq += "SEQUENCE_TARGET=%d,11\\n=\\n" % ( p - 5 )\n+\n+ return flanking_seq\n+ else:\n+ return None\n+\n+\n+\n+class LocationFile( object ):\n+ def __init__(self, filename):\n+ self.build_map(filename)\n+\n+ def build_map(self, filename):\n+ self.map = {}\n+ self.open_file(filename)\n+ for line in self.read_lines():\n+ elems = line.split(\'\\t\', 1)\n+ if len(elems) == 2:\n+ self.map[ elems[0].strip() ] = elems[1].strip()\n+ self.close_file()\n+\n+ def read_lines(self):\n+ for line in self.fh:\n+ if not line.startswith(\'#\'):\n+ line = line.rstrip(\'\\r\\n\')\n+ yield line\n+\n+ def open_file(self, filename):\n+ self.filename = filename\n+ try:\n+ self.fh = open(filename, \'r\')\n+ except IOError, err:\n+ print >> sys.stderr, "Error opening location file \'%s\': %s" % (filename, str(err))\n+ sys.exit(1)\n+\n+ def close_file(self):\n+ self.fh.close()\n+\n+ def loc_file( self, key ):\n+ if key in self.map:\n+ return self.map[key]\n+ else:\n+ print >> sys.stderr, "\'%s\' does not appear in location file \'%s\'" % (key, self.filename)\n+ sys.exit(1)\n+ \n+class ChrLens( object ):\n+ def __init__( self, chrlen_filename ):\n+ self.chrlen_filename = chrlen_filename\n+ self.build_map()\n+\n+ def build_map(self):\n+ self.map = {}\n+ self.open_file(self.chrlen_filename)\n+ for line in self.read_lines():\n+ elems = line.split(\'\\t\', 1)\n+ if len(elems) == 2:\n+ chrom = elems[0].strip()\n+ chrom_len_text = elems[1].strip()\n+ try:\n+ chrom_len = int( chrom_len_text )\n+ except ValueError:\n+ print >> sys.stderr, "Bad length \'%s\' for chromosome \'%s\' in \'%s\'" % (chrom_len_text, chrom, self.chrlen_filename)\n+ self.map[ chrom ] = chrom_len\n+ self.close_file()\n+\n+ def read_lines(self):\n+ for line in self.fh:\n+ if not 
line.startswith(\'#\'):\n+ line = line.rstrip(\'\\r\\n\')\n+ yield line\n+\n+ def open_file(self, filename):\n+ self.filename = filename\n+ try:\n+ self.fh = open(filename, \'r\')\n+ except IOError, err:\n+ print >> sys.stderr, "Error opening chromosome length file \'%s\': %s" % (filename, str(err))\n+ sys.exit(1)\n+\n+ def close_file(self):\n+ self.fh.close()\n+\n+ def length( self, key ):\n+ if key in self.map:\n+ return self.map[key]\n+ else:\n+ return None\n+\n+ def __iter__( self ):\n+ for chrom in self.map:\n+ yield chrom\n+\n'

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/Makefile Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,8 @@
+# Every target simply delegates to the makefile in src/.  $(MAKE) is used
+# instead of a literal "make" so command-line flags and the jobserver are
+# passed on to the sub-make.
+.PHONY: all clean install
+
+all:
+	cd src && $(MAKE)
+
+clean:
+	cd src && $(MAKE) clean
+
+install:
+	cd src && $(MAKE) install

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/bin/gd_ploteig
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/bin/gd_ploteig Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,172 @@
+#!/usr/bin/env perl
+
+### ploteig -i eigfile -p pops -c a:b [-t title] [-s stem] [-o outfile] [-x] [-k]  [-y] [-z sep]
+use Getopt::Std ;
+use File::Basename ;
+use warnings ;
+
+## pops : separated  -x = make postscript and pdf  -z use another separator
+##  -k keep intermediate files
+## NEW if pops is a file names are read one per line
+
+## Parse the command line: i o p c s d z t take a value, x k y are flags.
+getopts('i:o:p:c:s:d:z:t:xky',\%opts) ;
+$postscmode = $opts{"x"} ;
+$oldkeystyle =  $opts{"y"} ;
+$kflag = $opts{"k"} ;
+## Intermediate files are kept with -k, and whenever -x is not given.
+$keepflag = 1 if ($kflag) ;
+$keepflag = 1 unless ($postscmode) ;
+
+## $zsep is used by setpops as the split pattern for the -p population list.
+$zsep = ":" ;
+if (defined $opts{"z"}) {
+ $zsep = $opts{"z"} ;
+ ## Single quotes keep the backslash; "\+" in double quotes is just "+",
+ ## which is not a valid pattern for split.
+ $zsep = '\+' if ($zsep eq "+") ;
+}
+
+## Optional plot title (-t); empty by default.
+$title = "" ;
+if (defined $opts{"t"}) {
+ $title = $opts{"t"} ;
+}
+## The input file (-i) is required; without it the usage text is printed.
+if (defined $opts{"i"}) {
+ $infile = $opts{"i"} ;
+}
+else {
+ usage() ;
+ exit 0 ;
+}
+## Read the whole input file into @L, one chomped line per element.
+open (FF, $infile) || die "can't open $infile\n" ;
+@L = (<FF>) ;
+chomp @L ;
+## $nf becomes the largest number of whitespace-separated fields on any
+## line, ignoring lines whose first non-blank character (after leading
+## whitespace) is '#'.
+$nf = 0 ;
+foreach $line (@L) {
+ next if ($line =~ /^\s+#/) ;
+ @Z = split " ", $line ;
+ $x = @Z ;
+ $nf = $x if ($nf < $x) ;
+}
+printf "## number of fields: %d\n", $nf ;
+## 0-based index of the last field, which holds the population label.
+$popcol = $nf-1 ;
+
+
+## The population list (-p) is required.
+if (defined $opts{"p"}) {
+ $pops = $opts{"p"} ;
+}
+else {
+ die "p parameter compulsory\n" ;
+}
+
+## setpops fills the global @P and returns the name used in file names.
+$popsname = setpops ($pops) ;
+print "$popsname\n" ;
+
+## Eigenvectors to plot; -c a:b overrides the default 1:2.
+$c1 = 1; $c2 =2 ;
+if (defined $opts{"c"}) {
+ $cols = $opts{"c"} ;
+ ($c1, $c2) = split ":", $cols ;
+ ## Both halves must be present; $cols itself is always defined here, so
+ ## testing it could never catch a malformed value such as "3".
+ die "bad c param: $cols\n" unless (defined $c1 && defined $c2) ;
+}
+
+## Names of the generated files: $stem prefixes the per-population data
+## files, $gnfile is the gnuplot control file.  -s and -o override them.
+$stem = "$infile.$c1:$c2" ;
+if (defined $opts{"s"}) {
+ $stem = $opts{"s"} ;
+}
+$gnfile = "$stem.$popsname.xtxt" ;
+
+if (defined $opts{"o"}) {
+ $gnfile = $opts{"o"} ;
+}
+
+@T = () ; ## trash
+## Write the gnuplot control file.  Without -x the postscript terminal line
+## is emitted commented out (prefixed with "## ").
+open (GG, ">$gnfile") || die "can't open $gnfile\n" ;
+print GG "## " unless ($postscmode) ;
+print GG "set terminal postscript color\n" ;
+print GG "set style line  2 lc rgbcolor \"#376600\"\n";
+print GG "set style line 11 lc rgbcolor \"#376600\"\n";
+print GG "set style line 20 lc rgbcolor \"#376600\"\n";
+print GG "set style line 29 lc rgbcolor \"#376600\"\n";
+print GG "set style line  6 lc rgbcolor \"#FFCC00\"\n";
+print GG "set style line 15 lc rgbcolor \"#FFCC00\"\n";
+print GG "set style line 24 lc rgbcolor \"#FFCC00\"\n";
+print GG "set style increment user\n";
+print GG "set title  \"$title\" \n" ;
+print GG "set key outside\n" unless ($oldkeystyle) ;
+print GG "set xlabel  \"eigenvector $c1\" \n" ;
+print GG "set ylabel  \"eigenvector $c2\" \n" ;
+print GG "plot " ;
+$np = @P ;
+$lastpop = $P[$np-1] ;
+## The first column of the input is the individual id, so eigenvector N is
+## found in (1-based) column N+1.
+$d1 = $c1+1 ;
+$d2 = $c2+1 ;
+## One plot clause and one data file per population; the data file holds
+## the input lines whose last field equals the population name.
+foreach $pop (@P)  {
+ $dfile = "$stem:$pop" ;
+ push @T, $dfile ;
+ print GG " \"$dfile\" using $d1:$d2 title \"$pop\" " ;
+ print GG ", \\\n" unless ($pop eq $lastpop) ;
+ open (YY, ">$dfile") || die "can't open $dfile\n" ;
+ foreach $line (@L) {
+  next if ($line =~ /^\s+#/) ;
+  @Z = split " ", $line ;
+  next unless (defined $Z[$popcol]) ;
+  next unless ($Z[$popcol] eq $pop) ;
+  print YY "$line\n" ;
+ }
+ close YY ;
+}
+print GG "\n" ;
+## With -x the final pause is emitted commented out.
+print GG "## "  if ($postscmode) ;
+print GG "pause 9999\n"  ;
+close GG ;
+
+## With -x, run gnuplot to produce the postscript file and ps2pdf on it.
+if ($postscmode) {
+$psfile = "$stem.ps" ;
+
+ if ($gnfile =~ /xtxt/) {
+  $psfile = $gnfile ;
+  $psfile  =~ s/xtxt/ps/ ;
+ }
+system "gnuplot < $gnfile > $psfile" ;
+#system "fixgreen  $psfile" ;
+system "ps2pdf  $psfile " ;
+}
+## Remove the per-population data files unless they are to be kept.
+unlink (@T) unless $keepflag ;
+
+## usage: print the command-line synopsis and option descriptions to stdout.
+sub usage {
+
+print "ploteig -i eigfile -p pops -c a:b [-t title] [-s stem] [-o outfile] [-x] [-k]\n" ;
+print "-i eigfile     input file first col indiv-id last col population\n" ;
+print "## as output by smartpca in outputvecs \n" ;
+print "-c a:b         a, b columns to plot.  1:2 would be common and leading 2 eigenvectors\n" ;
+print "-p pops        Populations to plot.  : delimited.   eg  -p Bantu:San:French\n" ;
+print "## pops can also be a filename.  List populations 1 per line\n" ;
+print "[-s stem]      stem will start various output files\n"  ;
+print "[-o ofile]     ofile will be gnuplot control file.  Should have xtxt suffix\n";
+print "[-x]           make ps and pdf files\n" ;
+print "[-k]           keep various intermediate files although  -x set\n" ;
+print "## necessary if .xtxt file is to be hand edited\n" ;
+print "[-y]           put key at top right inside box (old mode)\n" ;
+print "[-t]           title (legend)\n" ;
+
+print "The xtxt file is a gnuplot file and can be easily hand edited.  Intermediate files
+needed if you want to make your own plot\n" ;
+
+}
+## setpops: fill the global @P with the populations to plot.
+##   $pops - either the name of an existing file listing one population per
+##           line, or a list of populations delimited by $zsep.
+## Returns the name used in generated file names: the base name of the file
+## in the first case, $pops itself in the second.
+sub setpops {
+ my ($pops) = @_  ;
+ local (@a, $d, $b, $e) ;
+
+ if (-e $pops) {
+  open (FF1, $pops) || die "can't open $pops\n" ;
+  @P = () ;
+  foreach $line (<FF1>) {
+  ## keep the first whitespace-separated word; skip blank lines and words
+  ## containing '#'
+  ($a) = split " ", $line ;
+  next unless (defined $a) ;
+  next if ($a =~ /\#/) ;
+  push  @P, $a ;
+  }
+  $out = join ":", @P ;
+  print "## pops: $out\n" ;
+  ($b, $d , $e) = fileparse($pops) ;
+  return $b ;
+ }
+ ## not a file: treat $pops as a $zsep-delimited list
+ @P = split $zsep, $pops ;
+ return $pops ;
+
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Fst_ave.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Fst_ave.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,228 @@
+/* Fst_ave -- determine the average Fst values between two specified populations
+*  and between two random populations
+*
+*    argv[1] = a Galaxy SNP table. For each of several individuals, the table
+*              has four columns (#A, #B, genotype, quality).
+*    argv[2] = 1 if Fst is estimated from SAMtools genotypes; 0 means use
+*         read-coverage data.
+*    argv[3] = lower bound, for individual quality value if argv[2] = 1
+*        or for total number of reads per population if argv[2] = 0.
+*        SNPs not satisfying these lower bounds are ignored.
+*    argv[4] = 1 to discard SNPs that appear fixed in the two populations
+*    argv[5] = 1 for unbiased estimator, else 0 for the original Wright form.
+*    argv[6] = k => 0 says report the average Fst and the largest average over k
+*              randomly chosen splits into two populations of those sizes
+*    argv[7], argv[8], ...,  have the form "13:1", "13:2" or "13:0", meaning
+*             that the 13th and 14th columns (base 1) give the allele counts
+*             for an individual that is in population 1, in population 2,
+*             or in neither population.
+
+What it does on Galaxy
+
+The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to select individuals from a SNP table. No individual can be in both populations. Other choices are as follows.
+
+Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations.
+
+After specifying the data source, the user sets lower bounds on the amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual. SNPs not meeting these lower bounds are ignored.
+
+The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded.
+
+The user chooses which definition of Fst to use: Wright's original definition or Weir's unbiased estimator.
+
+Finally, the user decides whether to use randomizations. If so, then the user specifies how many randomly generated population pairs (retaining the numbers of individuals of the originals) to generate, as well as the "population" of additional individuals (not in the first two populations) that can be used in the randomization process.
+
+The program prints the average Fst for the original populations and the number of SNPs used to compute it. If randomizations were requested, it prints the average Fst for each randomly generated population pair, ending with a summary that includes the maximum and average value, and the highest-scoring population pair.
+*/
+
+#include "lib.h"
+#include "Fst_lib.h"
+
+// maximum length of a line from the table
+#define MOST 5000
+
+// information about the specified individuals
+// x is an array of nI values 0, 1, or 2;
+// shuffling x creates random "populations"
+int col[MOST], x[MOST], best_x[MOST];
+int nI, lower_bound, unbiased, discard, genotypes, nsnp;
+
+// each SNP has an array of counts
+struct count {
+ int A, B;
+};
+
+// linked list summarizes the Galaxy table
+struct snp {
+ struct count *c;
+ struct snp *next;
+} *start, *last;
+
+// Return the average Fst over the SNPs of the linked list, for the two
+// populations currently described by x[]: individual i belongs to
+// population 1 if x[i] == 1, to population 2 if x[i] == 2, and is left out
+// otherwise.
+//
+// A SNP contributes only if both populations have enough data: at least one
+// allele call each when genotypes are used, at least lower_bound reads each
+// otherwise.  When `discard` is set, SNPs where both populations show only
+// one and the same allele are skipped as well.
+//
+// Side effect: the global nsnp is set to the number of contributing SNPs.
+// Returns 0.0 when no SNP contributes (instead of dividing by zero).
+double ave_Fst() {
+    double tot_Fst;
+    struct snp *s;
+    int i, A1, B1, A2, B2, too_few;
+
+    // scan the SNPs
+    tot_Fst = 0.0;
+    nsnp = 0;
+    for (s = start; s != NULL; s = s->next) {
+        // get counts for the two populations at this SNP
+        for (A1 = B1 = A2 = B2 = i = 0; i < nI; ++i) {
+            if (s->c[i].A < 0) // no genotypes
+                continue;
+            if (x[i] == 1) {
+                A1 += s->c[i].A;
+                B1 += s->c[i].B;
+            } else if (x[i] == 2) {
+                A2 += s->c[i].A;
+                B2 += s->c[i].B;
+            }
+        }
+        if (discard && ((A1 == 0 && A2 == 0) || (B1 == 0 && B2 == 0)))
+            continue; // fixed in these two populations
+        // with genotypes, low-quality calls were already dropped when the
+        // table was read, so one surviving call per population suffices
+        too_few = (genotypes ? 1 : lower_bound);
+        if (A1+B1 >= too_few && A2+B2 >= too_few) {
+            ++nsnp;
+            tot_Fst += Fst(A1, B1, A2, B2, unbiased);
+        }
+    }
+    // every SNP may have been filtered out; avoid 0.0/0 (NaN)
+    return (nsnp > 0 ? tot_Fst/nsnp : 0.0);
+}
+
+/* Randomly permute x[0], x[1], ... , x[nI-1] in place.
+*  This is Algorithm P, page 125 of "The Art of Computer Programming (Vol II)
+*  Seminumerical Programming", by Donald Knuth, Addison-Wesley, 1971:
+*  walk from the last slot down to slot 1 and exchange each slot with a
+*  randomly chosen slot at or below it.
+*/
+void shuffle() {
+    int hi = nI - 1;
+
+    while (hi > 0) {
+        // pick a slot in 0..hi and exchange its value with slot hi
+        int pick = random() % (hi + 1);
+        int held = x[hi];
+
+        x[hi] = x[pick];
+        x[pick] = held;
+        --hi;
+    }
+}
+
+// Fst_ave entry point: parse the command line, load the allele counts of the
+// selected individuals from the SNP table, print the average Fst for the two
+// specified populations and, when argv[6] > 0, for that many random
+// re-assignments of the same individuals.
+//
+// Column numbers are validated here because each table row is parsed into
+// X[] and later indexed at column .. column+3.
+int main(int argc, char **argv) {
+    FILE *fp;
+    char *p, *z = "\t\n", buf[MOST];
+    int X[MOST], nshuff, n, i, j, k, saw[3], larger, all = 1;
+    struct snp *new;
+    double F, F1, largest_F, tot_F;
+
+    if (argc < 7)
+        fatal("args: table data-source lower_bound discard? unbiased? #shuffles n:1 m:2 ...");
+
+    // handle command-line arguments
+    genotypes = atoi(argv[2]);
+    lower_bound = atoi(argv[3]);
+    if (!genotypes && lower_bound <= 0)
+        fatal("minimum coverage should exceed 0");
+    discard = atoi(argv[4]);
+    unbiased = atoi(argv[5]);
+    nshuff = atoi(argv[6]);
+    saw[0] = saw[1] = saw[2] = 0;
+    // populations 1 and 2 must be disjoint
+    // population 0 can be replaced by population 1 or 2
+    for (i = 7; i < argc; ++i) {
+        if (sscanf(argv[i], "%d:%d", &j, &k) != 2)
+            fatalf("not like 13:2 : %s", argv[i]);
+        if (k < 0 || k > 2)
+            fatalf("not population 0, 1 or 2: %s", argv[i]);
+        // X[j] .. X[j+3] are read below, so they must lie inside X[]
+        if (j < 1 || j + 3 >= MOST)
+            fatalf("column out of range: %s", argv[i]);
+        saw[k] = 1;
+        // seen this individual (i.e., column) before??
+        for (n = 0; n < nI && col[n] != j; ++n)
+            ;
+        if (n < nI) { // OK if one of the populations is 0
+            if (k > 0) {
+                if (x[n] > 0 && x[n] != k)
+                    fatalf("column %d is in both populations", j);
+                x[n] = k;
+            }
+        } else {
+            // col[] and x[] hold at most MOST individuals
+            if (nI >= MOST)
+                fatal("Too many individuals");
+            col[nI] = j;
+            x[nI] = k;
+            ++nI;
+        }
+    }
+    if (saw[1] == 0)
+        fatal("population 1 is empty");
+    if (saw[2] == 0)
+        fatal("population 2 is empty");
+
+    // read the table of SNPs and store the essential allele counts
+    fp = ckopen(argv[1], "r");
+    while (fgets(buf, MOST, fp)) {
+        if (buf[0] == '#')
+            continue;
+        new = ckalloc(sizeof(*new));
+        new->next = NULL;
+        new->c = ckalloc(nI*sizeof(struct count));
+        // set X[i] = atoi(i-th word of buf), i is base 1
+        for (i = 1, p = strtok(buf, z); p != NULL;
+             ++i, p = strtok(NULL, z))
+            X[i] = atoi(p);
+        for (i = 0; i < nI; ++i) {
+            n = col[i];
+            if (genotypes) {
+                // genotype k counts copies of the first allele; a missing
+                // (-1) or low-quality call is stored as A = B = -1
+                k = X[n+2];
+                if (k == -1 || X[n+3] < lower_bound)
+                    new->c[i].A = new->c[i].B = -1;
+                else {
+                    new->c[i].A = k;
+                    new->c[i].B = 2 - k;
+                }
+            } else {
+                new->c[i].A = X[n];
+                new->c[i].B = X[n+1];
+            }
+        }
+        // append to the linked list
+        if (start == NULL)
+            start = new;
+        else
+            last->next = new;
+        last = new;
+    }
+    fclose(fp);
+
+    F1 = ave_Fst();
+    printf("average Fst is %5.5f, using %d SNPs\n", F1, nsnp);
+    for (j = 0; j < nI; ++j)
+        best_x[j] = x[j];
+    // evaluate nshuff random groupings, remembering the highest-scoring one
+    for (tot_F = largest_F = 0.0, larger = i = 0; i < nshuff; ++i) {
+        shuffle();
+        if ((F = ave_Fst()) > F1)
+            ++larger;
+        if (F > largest_F) {
+            largest_F = F;
+            for (j = 0; j < nI; ++j)
+                best_x[j] = x[j];
+        }
+        tot_F += F;
+        if (all) // make this optional?
+            printf("%d: %f\n", i+1, F);
+    }
+    if (nshuff > 0) {
+        printf("%d of %d random groupings had a larger average Fst\n",
+          larger, nshuff);
+        printf("largest = %5.5f, mean = %5.5f\n", largest_F,
+          tot_F/nshuff);
+        if (largest_F > F1) {
+            printf("first columns for the best two populations:\n");
+            for (i = 0; i < nI; ++i)
+                if (best_x[i] == 1)
+                    printf("%d ", col[i]);
+            printf("and\n");
+            for (i = 0; i < nI; ++i)
+                if (best_x[i] == 2)
+                    printf("%d ", col[i]);
+            putchar('\n');
+        }
+    }
+
+    return 0;
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Fst_column.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Fst_column.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,132 @@
+/* Fst_column -- add an Fst column to a Galaxy table
+*
+*    argv[1] = a Galaxy SNP table. For each of several individuals, the table
+*              has four columns (#A, #B, genotype, quality).
+*    argv[2] = 1 if Fst is estimated from SAMtools genotypes; 0 means use
+*         read-coverage data.
+*    argv[3] = lower bound for total number of reads per population
+*    argv[4] = lower bound for individual quality value
+*    argv[5] = 1 to retain SNPs that fail to satisfy the lower bound and set
+*        Fst = -1; delete them if argv[5] = 0.
+*    argv[6] = 1 to discard SNPs that appear fixed in the two populations
+*    argv[7] = 1 for unbiased estimator, else 0 for the original Wright form.
+*    argv[8], argv[9], ...,  have the form "13:1" or "13:2", meaning that
+*             the 13th, 14th, and 15th columns (base 1) give the allele counts
+*             and genotype for an individual that is in population 1 or
+*       population 2, respectively.
+
+What It Does on Galaxy
+
+The user specifies a SNP table and two "populations" of individuals, both previously defined using the Galaxy tool to select individuals from a SNP table. No individual can be in both populations. Other choices are as follows.
+
+Data source. The allele frequencies of a SNP in the two populations can be estimated either by the total number of reads of each allele, or by adding the frequencies inferred from genotypes of individuals in the populations.
+
+After specifying the data source, the user sets lower bounds on amount of data required at a SNP. For estimating the Fst using read counts, the bound is the minimum count of reads of the two alleles in a population. For estimations based on genotype, the bound is the minimum reported genotype quality per individual.
+
+The user specifies whether the SNPs that violate the lower bound should be ignored or the Fst set to -1.
+
+The user specifies whether SNPs where both populations appear to be fixed for the same allele should be retained or discarded.
+
+Finally, the user chooses which definition of Fst to use: Wright's original definition or Weir's unbiased estimator.
+
+A column is appended to the SNP table giving the Fst for each retained SNP.
+
+*/
+
+#include "lib.h"
+#include "Fst_lib.h"
+
+// most characters allowed in a row of the table
+#define MOST 5000
+
+// column and population for the relevant individuals/groups
+int col[MOST], pop[MOST];
+int nI;
+
+// Fst_column entry point: read the SNP table named by argv[1] and print each
+// retained row to stdout with an extra tab-separated Fst column.
+//
+// argv[2..7] are read unconditionally (data source, minimum coverage,
+// minimum quality, retain flag, discard flag, unbiased flag), so at least
+// eight arguments are required; argv[8] onward are "column:population".
+int main(int argc, char **argv) {
+    FILE *fp;
+    char *p, *z = "\t\n", buf[MOST], trash[MOST];
+    int X[MOST], min_cov, min_qual, retain, discard, unbiased, genotypes,
+      n, i, g, A1, B1, A2, B2, saw[3], x1, y1, x2, y2;
+    double F;
+
+    // argv[7] is read below, hence 8 rather than 7
+    if (argc < 8)
+        fatal("args: table data-source min-cov min-qual retain? discard? unbiased? n:1 m:2 ...");
+    genotypes = atoi(argv[2]);
+    min_cov = atoi(argv[3]);
+    min_qual = atoi(argv[4]);
+    retain = atoi(argv[5]);
+    discard = atoi(argv[6]);
+    unbiased = atoi(argv[7]);
+    saw[0] = saw[1] = saw[2] = 0;
+    for (i = 8; i < argc; ++i, ++nI) {
+        // col[] and pop[] hold at most MOST individuals
+        if (nI >= MOST)
+            fatal("Too many individuals");
+        if (sscanf(argv[i], "%d:%d", &(col[nI]), &(pop[nI])) != 2)
+            fatalf("not like 13:2 : %s", argv[i]);
+        if (pop[nI] < 1 || pop[nI] > 2)
+            fatalf("not population 1 or 2: %s", argv[i]);
+        // X[col] .. X[col+3] are read below, so they must lie inside X[]
+        if (col[nI] < 1 || col[nI] + 3 >= MOST)
+            fatalf("column out of range: %s", argv[i]);
+        saw[pop[nI]] = 1;
+        // seen this individual before?
+        for (n = 0; n < nI && col[n] != col[nI]; ++n)
+            ;
+        if (n < nI)
+            fatalf("individual at column %d is mentioned twice",
+              col[n]);
+    }
+    if (saw[1] == 0)
+        fatal("population 1 is empty");
+    if (saw[2] == 0)
+        fatal("population 2 is empty");
+
+    fp = ckopen(argv[1], "r");
+    while (fgets(buf, MOST, fp)) {
+        if (buf[0] == '#')
+            continue;
+        // strtok() modifies its argument; parse a copy so that buf can be
+        // echoed unchanged
+        strcpy(trash, buf);
+        // set X[i] = atoi(i-th word of the row), i is base 1
+        for (i = 1, p = strtok(trash, z); p != NULL;
+             ++i, p = strtok(NULL, z))
+            X[i] = atoi(p);
+        // x/y accumulate read counts (used for the coverage test);
+        // A/B accumulate the allele counts handed to Fst()
+        for (i = A1 = B1 = A2 = B2 = x1 = y1 = x2 = y2 = 0;
+             i < nI; ++i) {
+            n = col[i];
+            g = X[n+2]; // save genotype
+            if ((genotypes && g == -1) || X[n+3] < min_qual)
+                continue;
+            if (pop[i] == 1) {
+                // column n (base 1) corresponds to entry X[n]
+                x1 += X[n];
+                y1 += X[n+1];
+                if (genotypes) {
+                    A1 += g;
+                    B1 += (2 - g);
+                } else {
+                    A1 += X[n];
+                    B1 += X[n+1];
+                }
+            } else if (pop[i] == 2) {
+                x2 += X[n];
+                y2 += X[n+1];
+                if (genotypes) {
+                    A2 += g;
+                    B2 += (2 - g);
+                } else {
+                    A2 += X[n];
+                    B2 += X[n+1];
+                }
+            }
+        }
+        if (discard && ((A1 == 0 && A2 == 0) || (B1 == 0 && B2 == 0)))
+            continue; // not variable in the two populations
+        if (x1+y1 < min_cov || x2+y2 < min_cov)
+            F = -1.0; // marks a SNP with too little data
+        else
+            F = Fst(A1, B1, A2, B2, unbiased);
+        if (F == -1.0 && !retain)
+            continue;
+        // drop the newline so the new column lands on the same row
+        if ((p = strchr(buf, '\n')) != NULL)
+            *p = '\0';
+        printf("%s\t%5.4f\n", buf, F);
+    }
+    fclose(fp);
+
+    return 0;
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Fst_lib.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Fst_lib.c Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,49 @@
+// procedure to compute either Wright's Fst or an unbiased estimator of it
+
+#include "lib.h"
+// Wright's Fst for two populations whose first-allele frequencies are
+// f1 and f2.  Returns 0.0 when the pooled frequency is exactly 0 or 1.
+static double Wright(double f1, double f2) {
+    double within, // f1*(1-f1) + f2*(1-f2): HWE heterozygosity terms of the two populations
+      pooled,      // allele frequency when the two populations are pooled
+      total;       // 2*pooled*(1-pooled): HWE heterozygosity of the pooled population
+
+    within = f1*(1.0 - f1) + f2*(1.0 - f2);
+    pooled = (f1 + f2)/2.0;
+    if (pooled == 0.0 || pooled == 1.0)
+        return 0.0;
+    total = 2.0*pooled*(1.0 - pooled);
+    return (total - within) / total;
+}
+
+/* Unbiased estimator of Fst from:
+  Weir, B.S. and Cockerham, C.C. 1984. Estimating F-statistics for the
+  analysis of population structure. Evolution 38: 1358-1370.
+as interpreted by:
+  Akey, J.M., Zhang, G., Zhang, K., Jin, L., and Shriver, M.D. 2002.
+  Interrogating a high-density SNP map for signatures of natural
+  selection. Genome Res. 12: 1805-1814.
+
+  n1, n2 are the allele totals of the two populations and p1, p2 the
+  corresponding first-allele frequencies.  Equal frequencies give 0.0, and
+  negative estimates are reported as 0.0.
+*/
+static double Weir(int n1, double p1, int n2, double p2) {
+    double total = n1 + n2, within, mean_p, between, nc, est;
+
+    if (p1 == p2)
+        return 0.0;
+    within = (n1*p1*(1.0-p1) + n2*p2*(1.0-p2))/(total-1.0);
+    mean_p = (n1*p1 + n2*p2)/total;
+    between = n1*(p1-mean_p)*(p1-mean_p) + n2*(p2-mean_p)*(p2-mean_p);
+    nc = total - (double)(n1*n1 + n2*n2)/total;
+    est = (between - within) / (between + (nc-1)*within);
+    return (est < 0.0 ? 0.0 : est);
+}
+
+// Fst from allele counts: nA1/na1 are the counts of the two alleles in
+// population 1 and nA2/na2 those in population 2.  A nonzero `unbiased`
+// selects the Weir estimator, zero selects Wright's form.
+double Fst(int nA1, int na1, int nA2, int na2, int unbiased) {
+    int tot1 = nA1+na1, tot2 = nA2+na2;
+    double p1 = (double)nA1 / (double)tot1;
+    double p2 = (double)nA2 / (double)tot2;
+
+    if (unbiased)
+        return Weir(tot1, p1, tot2, p2);
+    return Wright(p1, p2);
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Fst_lib.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Fst_lib.h Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,8 @@
+/* return either Sewall Wright's Fst or its Weir unbiased estimator
+*  parameters are as follows
+*  1, 2 : counts of the two alleles in population 1
+*  3, 4 : counts of the two alleles in population 2
+*  5 : 0 = return Wright's formulation, 1 = return unbiased estimator
+*/
+
+double Fst(int, int, int, int, int);

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Huang.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Huang.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,44 @@
+// Find highest scoring intervals, as discussed in Huang.h.
+
+#include "lib.h"
+#include "Huang.h"
+
+// Scan the scores x[0..n-1] once and leave the resulting intervals in the
+// global array R[1..top] (R and top come from Huang.h).  R is used as a
+// stack of candidate intervals: each entry records the running total just
+// before its first position (Lscore, at index Lpos) and at its last
+// position (Rscore, at index Rpos), so the interval covers
+// x[Lpos+1..Rpos].  On return R[i].Score = R[i].Rscore - R[i].Lscore.
+// Calls fatal() if more than MAX_R-1 entries would be needed.
+void Huang(double x[], int n) {
+ double Score, oldScore;
+ int v, L, i;
+
+ top = 0; // don't use location 0, so as to follow Fig. 6
+ // Score is the running total x[0] + ... + x[v]; oldScore is the total
+ // just before x[v] was added
+ for (Score = 0.0, v = 0; v < n; ++v) {
+ oldScore = Score;
+ Score += x[v];
+ // a negative score never starts or extends an interval
+ if (x[v] < 0)
+ continue;
+ if (top > 0 && R[top].Rpos == v-1) {
+ // add edge to top subpath
+ R[top].Rpos = v;
+ R[top].Rscore = Score;
+ } else {
+ // create a one-edge subpath
+ ++top;
+ if (top >= MAX_R)
+ fatal("In Haung(), top is too big");
+ R[top].Lpos = v-1;
+ R[top].Lscore = oldScore;
+ R[top].Rpos = v;
+ R[top].Rscore = Score;
+ // follow the Lower links past entries whose left total exceeds
+ // this one's; stop at the first that does not, or at 0
+ R[top].Lower = top-1;
+ while ((L = R[top].Lower) > 0 &&
+ R[L].Lscore > R[top].Lscore)
+ R[top].Lower = R[L].Lower;
+ }
+ // merge subpaths
+ // while the entry reached via Lower ends at a total no higher than
+ // the top entry's, stretch it to the top entry's right end and make
+ // it the new top (entries in between are dropped)
+ while (top > 1 && (L = R[top].Lower) > 0 &&
+ R[L].Rscore <= R[top].Rscore) {
+ R[L].Rpos = R[top].Rpos;
+ R[L].Rscore = R[top].Rscore;
+ top = L;
+ }
+ }
+ // total score of each surviving interval
+ for (i = 1; i <= top; ++i)
+ R[i].Score = R[i].Rscore - R[i].Lscore;
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Huang.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Huang.h Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,20 @@
+/* Find intervals of highest total score, i.e., such that adding positions to
+*  either end will decrease the total. We use the method of Fig. 6 of the paper:
+*  Xiaoqiu Huang, Pavel Pevzner, Webb Miller (1994) Parametric recomputing in
+*  alignment graphs. Combinatorial Pattern Matching (Springer Lecture Notes in
+*  Computer Science, 807), 87-101.
+*
+*  The input scores are in x[0], x[1], ..., x[n-1], but the output regions
+*  are in R[1], R[2], ..., R[top]. R[i].Score is the total score of the i-th
+*  (in order of position) positive-scoring interval of x, which consists of
+*  x[R[i].Lpos + 1] to x[R[i].Rpos].
+*/
+#define MAX_R 5000000
+
+struct region { // a consecutive (relative to the reference) run of SNPs
+ double Lscore, Rscore, Score;
+ int Lpos, Rpos, Lower;
+} R[MAX_R];
+int top;
+
+void Huang(double *x, int n);

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/Makefile
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/Makefile Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,49 @@
+# Build and install the genome_diversity command-line tools.
+#   make          builds every program listed in TARGETS
+#   make install  builds them and copies them into INSTALL_DIR
+#   make clean    removes the built programs
+CC = gcc
+COPT = -O2
+CWARN = -W -Wall
+CFLAGS = $(COPT) $(CWARN)
+INSTALL_DIR = ../bin
+
+TARGETS = admix_prep coords2admix coverage dist_mat dpmix eval2pct \
+ Fst_ave Fst_column pop sweep
+
+all: $(TARGETS)
+
+# create the install directory if it is missing, then copy the programs
+install: $(TARGETS)
+ if [ ! -d "$(INSTALL_DIR)" ]; then mkdir -p "$(INSTALL_DIR)"; fi
+ cp $(TARGETS) $(INSTALL_DIR)
+
+# Each program is compiled and linked in a single step from the sources
+# listed as its prerequisites ($^), producing the target name ($@).
+admix_prep: admix_prep.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+coords2admix: coords2admix.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+coverage: coverage.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+dist_mat: dist_mat.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+dpmix: dpmix.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+eval2pct: eval2pct.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+# the two Fst programs share Fst_lib.c
+Fst_ave: Fst_ave.c Fst_lib.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+Fst_column: Fst_column.c Fst_lib.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+pop: pop.c lib.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+sweep: sweep.c lib.c Huang.c
+ $(CC) $(CFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+ rm -f $(TARGETS)

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/admix_prep.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/admix_prep.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,119 @@
+/* admix_prep -- prepare the ".ped" and ".map" files (PLINK format) for input to
+*  the "admixture" program.
+*
+*  argv[1] -- a Galaxy SNP table
+*  argv[2] -- required number of reads for each individual to use a SNP
+*  argv[3] -- required genotype quality for each individual to use a SNP
+*  argv[4] -- minimum spacing between SNPs on the same scaffold
+*  argv[k] for k > 4 have the form "13:fred", meaning that the 13th and 14th
+*    columns (base 1) give the allele counts for the individual or group named
+*    "fred".
+
+What it does on Galaxy
+The tool converts a SNP table into two tables, called "admix.map" and "admix.ped", needed for estimating the population structure. The user can read or download those files, or simply pass this tool's output on to other programs. The user imposes conditions on which SNPs to consider, such as the minimum coverage and/or quality value for every individual, or the distance to the closest SNP in the same contig (as named in the first column of the SNP table). A useful piece of information produced by the tool is the number of SNPs meeting those conditions, which can be found by clicking on the "eye" after the program runs.
+
+*/
+
+#include "lib.h"
+
+// bounds line length for a line of the Galaxy table
+#define MOST 5000
+struct individual {
+ int column;
+ char *name;
+} I[MOST/8]; // each individual has 4 columns and 4 tab characters
+int nI; // number of individuals
+int X[MOST]; // integer values in a row of the SNP table
+
+// bounds the number of SNPs that can be kept
+#define MAX_KEEP 10000000
+char *S[MAX_KEEP]; // S[i] is a row of 2*nI alleles
+int nK;
+
+// admix_prep entry point: filter the SNP table named by argv[1] and write
+// the surviving SNPs to "admix.map" and "admix.ped" in the current
+// directory, then report on stdout how many SNPs were used.
+//
+// A SNP is kept when it is at least min_space positions after the previously
+// kept SNP of the same scaffold and every selected individual has at least
+// min_coverage reads and a quality of at least min_quality.
+int main(int argc, char **argv) {
+    FILE *fp, *ped, *map;
+    char *p, *z = " \t\n", buf[MOST], trash[MOST], name[100], *s,
+      scaf[100], prev_scaf[100];
+    int i, j, m, min_coverage, min_quality, min_space, nsnp, genotype,
+      pos, prev_pos;
+
+    if (argc < 5)
+        fatal("args: Galaxy-table min-cov min-qual min-space 13:fred 16:mary ...");
+    min_coverage = atoi(argv[2]);
+    min_quality = atoi(argv[3]);
+    min_space = atoi(argv[4]);
+
+    for (i = 5; i < argc; ++i, ++nI) {
+        if (nI >= MOST/8)
+            fatal("Too many individuals");
+        // %99s: name[] holds 100 bytes
+        if (sscanf(argv[i], "%d:%99s", &(I[nI].column), name) != 2)
+            fatalf("bad arg: %s", argv[i]);
+        // X[column] .. X[column+3] are read below
+        if (I[nI].column < 1 || I[nI].column + 3 >= MOST)
+            fatalf("bad arg: %s", argv[i]);
+        I[nI].name = copy_string(name);
+    }
+
+    map = ckopen("admix.map", "w");
+
+    fp = ckopen(argv[1], "r");
+    prev_scaf[0] = '\0';
+    prev_pos = 0;
+    for (nsnp = 0; fgets(buf, MOST, fp); ) {
+        if (buf[0] == '#')
+            continue;
+        ++nsnp;
+        // %99s: scaf[] and prev_scaf[] hold 100 bytes each
+        if (sscanf(buf, "%99s %d", scaf, &pos) != 2)
+            fatalf("choke: %s", buf);
+        if (same_string(scaf, prev_scaf)) {
+            if (pos < prev_pos + min_space)
+                continue;
+        } else {
+            // new scaffold: no spacing constraint for its first kept SNP
+            strcpy(prev_scaf, scaf);
+            prev_pos = -min_space;
+        }
+
+        // X[i] = atoi(i-th word base-1)
+        strcpy(trash, buf);
+        for (i = 1, p = strtok(trash, z); p != NULL;
+             ++i, p = strtok(NULL, z))
+            X[i] = atoi(p);
+        // every individual must pass the coverage and quality bounds
+        for (i = 0; i < nI; ++i) {
+            m = I[i].column;
+            if (X[m] + X[m+1] < min_coverage || X[m+3] < min_quality)
+                break;
+        }
+        if (i < nI)
+            continue;
+        prev_pos = pos;
+
+        if (nK >= MAX_KEEP)
+            fatal("Too many SNPs");
+        fprintf(map, "1 snp%d 0 %d\n", nsnp, nsnp+1);
+        // two allele characters per individual for this SNP:
+        // genotype 2 -> "11", 0 -> "22", 1 -> "12", anything else -> "00"
+        s = S[nK++] = ckalloc(2*nI*sizeof(char));
+        for (i = j = 0; i < nI; ++i, j += 2) {
+            genotype = X[I[i].column+2];
+            if (genotype == 2)
+                s[j] = s[j+1] = '1';
+            else if (genotype == 0)
+                s[j] = s[j+1] = '2';
+            else if (genotype == 1) {
+                s[j] = '1';
+                s[j+1] = '2';
+            } else // undefined genotype
+                s[j] = s[j+1] = '0';
+        }
+    }
+
+    fclose(fp);
+    fclose(map);
+
+    // the .ped file has one row per individual, so the per-SNP rows kept in
+    // S[] are written out transposed
+    ped = ckopen("admix.ped", "w");
+    for (i = 0; i < nI; ++i) {
+        fprintf(ped, "%s 1 0 0 1 1", I[i].name);
+        for (j = 0; j < nK; ++j)
+            fprintf(ped, " %c %c", S[j][2*i], S[j][2*i+1]);
+        putc('\n', ped);
+    }
+
+    printf("Using %d of %d SNPs\n", nK, nsnp);
+    fclose(ped);
+
+    return 0;
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/coords2admix.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/coords2admix.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,91 @@
+// coords2admix -- add projections onto chords to information about
+// coordinates in PCA plots
+
+#include "lib.h"
+
+#define MAX_POP 1000
+struct pop {
+ char *name;
+ float x, y;
+} P[MAX_POP];
+int nP;
+
+// Append a population whose n members have coordinate sums (tot_x, tot_y);
+// its centroid is stored in P[].  Does nothing when n is not positive.
+static void add_pop(char *name, float tot_x, float tot_y, int n) {
+    if (n <= 0)
+        return;
+    if (nP >= MAX_POP)
+        fatal("Too many populations");
+    P[nP].name = copy_string(name);
+    P[nP].x = tot_x/n;
+    P[nP].y = tot_y/n;
+    ++nP;
+}
+
+// coords2admix entry point: read smartpca coordinates from argv[1] (or from
+// stdin when no argument is given), echo every input line to stdout, then,
+// for each pair of populations, print where every other population's
+// centroid projects onto the chord joining the pair (0 = first population,
+// 1 = second population).  Coordinates are scaled by the first two
+// eigenvalues read from the "#eigvals:" header line.
+int main(int argc, char **argv) {
+    FILE *fp;
+    char buf[500], x[100], y[100], z[100], cur_pop[100];
+    int ncur, i, j, k;
+    float eig1, eig2, tot_x = 0.0, tot_y = 0.0, x1, y1, x2, y2, a, b, c, d;
+
+    if (argc == 1)
+        fp = stdin;
+    else if (argc == 2)
+        fp = ckopen(argv[1], "r");
+    else
+        fatal("optional arg: smartpca coordinates");
+
+    if (!fgets(buf, 500, fp))
+        fatal("empty set of coordinates");
+    // %99s: x, y and z hold 100 bytes each
+    if (sscanf(buf, "%99s %99s %99s", x, y, z) != 3 ||
+        !same_string(x, "#eigvals:"))
+        fatalf("cannot find eigenvalues: %s", buf);
+    printf("%s", buf);
+    eig1 = atof(y);
+    eig2 = atof(z);
+
+    // rows of one population are expected to be consecutive; a change in the
+    // population name closes the current group
+    strcpy(cur_pop, "");
+    ncur = 0;
+    while (fgets(buf, 500, fp)) {
+        if (sscanf(buf, "%*s %99s %99s %99s", x, y, z) != 3)
+            fatalf("gag: %s", buf);
+        printf("%s", buf);
+        if (!same_string(cur_pop, z)) {
+            add_pop(cur_pop, tot_x, tot_y, ncur);
+            ncur = 1;
+            strcpy(cur_pop, z);
+            tot_x = atof(x);
+            tot_y = atof(y);
+        } else {
+            ++ncur;
+            tot_x += atof(x);
+            tot_y += atof(y);
+        }
+    }
+    // close the last group (skipped when there were no data rows)
+    add_pop(cur_pop, tot_x, tot_y, ncur);
+
+    // loop over pairs of populations
+    for (i = 0; i < nP; ++i) {
+        x1 = eig1*P[i].x;
+        y1 = eig2*P[i].y;
+        for (j = i+1; j < nP; ++j) {
+            printf("\nprojection along chord %s -> %s\n",
+              P[i].name, P[j].name);
+            x2 = eig1*P[j].x;
+            y2 = eig2*P[j].y;
+            // c = squared length of the chord
+            c = (x1-x2)*(x1-x2) + (y1-y2)*(y1-y2);
+            for (k = 0; k < nP; ++k)
+                if (k != i && k != j) {
+                    a = eig1*P[k].x;
+                    b = eig2*P[k].y;
+                    // d = dot product of the chord with the vector
+                    // from population i to population k
+                    d = (x2-x1)*(a-x1) + (y2-y1)*(b-y1);
+                    printf("  %s: %f\n", P[k].name, d/c);
+                }
+        }
+    }
+
+    return 0;
+}
+

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/coverage.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/coverage.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,155 @@
+/* coverage -- report distributions of SNP coverage or quality for individuals,
+*  or coverage for populations
+*
+*    argv[1] -- a Galaxy SNP table. For each individual, the table has four
+* columns (count of each allele, genotype, quality).
+*    argv[2] -- 0 = sequence coverage, 1 = genotype quality
+*    argv[3] -- file name for the text version of output (input for producing
+* the graphical summary goes to stdout)
+*    argv[4], argv[5], ...,  have the form "13:fred",  meaning that the 13th
+* 14th, and 16th columns (base 1) give the two allele counts
+* and the quality for "fred", where "fred" can be the name of
+* a population with several individuals (all named "fred")
+What it does on Galaxy
+The tool reports distributions of SNP reliability indicators for individuals or populations. The reliability can be measured by either the sequence coverage or the SAMtools quality value, though the notion of a population-level quality is not supported. Textual and graphical reports are generated, where the text output gives the cumulative distributions.
+*/
+
+#include "lib.h"
+
+// maximum length of a line from the table
+#define MOST 5000
+
+// the largest coverage or quality value being considered
+#define MAX_VAL 1000
+
+FILE *gp; // for text output
+
+// a population is the set of all individuals with the same name
+// (perhaps just a single individual)
+struct pop {
+ int cov, n[MAX_VAL+1];
+ long long sum, tot;
+ char *name;
+} P[MOST/4];
+int nP; // number of populations
+
+// maps column to population
+struct individual {
+ int col, pop;
+} I[MOST/4];
+int nI;
+
+/* Report the distribution for each population. P[i].n[k] is the number of SNPs
+*  of value (coverage or quality) k in population i, for k < MAX_VAL;
+*  P[i].n[MAX_VAL] is the number of SNPs of value k >= MAX_VAL.
+*  We print the percentages, p, of SNPs with value <= k, ending when all
+*  populations have reached a p >= 98%.
+*
+*  Two reports are written: a tab-separated table on stdout (input for the
+*  graphical summary) and a text table, at most 20 values per row, on gp.
+*  A population with no SNPs at all is reported as 0% throughout instead of
+*  dividing by zero.
+*  Side effects: sets P[i].tot and accumulates into P[i].sum, which is
+*  never reset here, so the function is meant to be called once.
+*/
+void print_cov() {
+    int i, j, k, last_j;
+    long long sum, pct;
+
+    // find where to stop printing
+    for (last_j = i = 0; i < nP; ++i) {
+        for (sum = j = 0; j <= MAX_VAL; ++j)
+            sum += P[i].n[j];
+        P[i].tot = sum;
+        for (sum = j = 0; j <= MAX_VAL; ++j) {
+            sum += P[i].n[j];
+            if (sum >= 0.98*P[i].tot)
+                break;
+        }
+        last_j = MAX(last_j, j);
+    }
+
+    ++last_j;
+    // print to stdout the output for graphing; not broken into short lines
+    for (j = 0; j < last_j; ++j)
+        printf("\t%3d", j);
+    putchar('\n');
+    for (i = 0; i < nP; ++i) {
+        printf("%s", P[i].name);
+        for (sum = j = 0; j < last_j; ++j) {
+            sum += P[i].n[j];
+            if (P[i].tot > 0)
+                printf("\t%4.2f", 100.0*(float)sum/(float)P[i].tot);
+            else
+                printf("\t%4.2f", 0.0);
+        }
+        putchar('\n');
+    }
+
+    // print a user-friendly version to the named file
+    // <= 20 numbers per row
+    for (j = 0; j < last_j; j += 20) {
+        fprintf(gp, "\n          ");
+        for (k = j; k < MIN(j+20, last_j); ++k)
+            fprintf(gp, "%3d", k);
+        for (i = 0; i < nP; ++i) {
+            fprintf(gp, "\n%10s", P[i].name);
+            for (k = j; k < MIN(j+20, last_j); ++k) {
+                // P[i].sum carries the cumulative count across rows
+                P[i].sum += P[i].n[k];
+                pct = (P[i].tot > 0 ? 100*P[i].sum/P[i].tot : 0);
+                // capped at 99 so that each entry fits the 3-wide column
+                fprintf(gp, "%3lld", MIN(99, pct));
+            }
+        }
+        fprintf(gp,"\n\n");
+    }
+}
+
+// coverage entry point: tally, for every population named on the command
+// line, how many SNPs of the table argv[1] have each coverage (argv[2] = 0)
+// or quality (argv[2] = 1) value, then print the distributions with
+// print_cov() (text report to the file argv[3], graph input to stdout).
+//
+// Individuals given the same name form one population whose per-SNP values
+// are added together; quality mode is refused as soon as a name is shared by
+// more than one individual.
+int main(int argc, char **argv) {
+    FILE *fp;
+    char buf[MOST], *z = " \t\n", *p;
+    int X[MOST], i, j, cov, m, quality, is_pop = 0;
+
+    if (argc < 5)
+        fatal("args: SNP-file quality-value? out-name 13:fred ... ");
+    quality = atoi(argv[2]);
+    gp = ckopen(argv[3], "w");
+    // record the individuals and populations
+    for (nI = 0, i = 4; i < argc; ++i, ++nI) {
+        // I[] and P[] each hold MOST/4 entries
+        if (nI >= MOST/4)
+            fatal("Too many individuals");
+        // allow spaces in names
+        if ((p = strchr(argv[i], ':')) == NULL)
+            fatalf("no colon: %s", argv[i]);
+        I[nI].col = atoi(argv[i]);
+        // X[col] .. X[col+3] are read below
+        if (I[nI].col < 1 || I[nI].col + 3 >= MOST)
+            fatalf("column out of range: %s", argv[i]);
+        for (j = 0; j < nP && !same_string(p+1, P[j].name); ++j)
+            ;
+        if (j == nP) // new population
+            P[nP++].name = copy_string(p+1);
+        else // a repeated name: this population has several individuals
+            is_pop = 1;
+        I[nI].pop = j;
+    }
+    if (is_pop && quality)
+        fatal("quality values for a population are not supported.");
+
+    // Record the number of SNPs with coverage 0, 1, ..., MAX_VAL-1,
+    // or >= MAX_VAL for each population.
+    fp = ckopen(argv[1], "r");
+    while (fgets(buf, MOST, fp)) {
+        if (buf[0] == '#')
+            continue;
+        // P[i].cov is the total coverage for all individuals in pop i
+        for (i = 0; i < nP; ++i)
+            P[i].cov = 0;
+        // X[i] = atoi(i-th word base-1)
+        for (i = 1, p = strtok(buf, z); p != NULL;
+             ++i, p = strtok(NULL, z))
+            X[i] = atoi(p);
+        for (i = 0; i < nI; ++i) {
+            m = I[i].col;
+            if (quality)
+                cov = X[m+3];
+            else
+                cov = X[m] + X[m+1];
+            P[I[i].pop].cov += cov;
+        }
+        // clamp into 0..MAX_VAL so that a negative value in the table
+        // cannot index in front of n[]
+        for (i = 0; i < nP; ++i)
+            P[i].n[MAX(0, MIN(P[i].cov, MAX_VAL))]++;
+    }
+    fclose(fp);
+
+    // Print the distributions.
+    print_cov();
+    fclose(gp);
+
+    return 0;
+}

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/dist_mat.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/dist_mat.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,196 @@
+/* dist_mat -- create a distance matrix in PHYLIP format for pairs of
+*  specified individuals, including by default the reference sequence
+*
+*  argv[1] -- a Galaxy SNP table
+*  argv[2] -- min coverage
+*  argv[3] -- min quality
+*  argv[4] -- name of reference species (or "none")
+*  argv[5] -- 0 = distance from coverage; 1 = distance from genotype
+*  argv[6] -- name of file for the numbers of informative SNPs
+*  argv[7] -- name of file to write the Mega-format distance matrix
+*  argv[k] for k > 7 have the form "13:fred", meaning that the 13th and 14th
+*    columns (base 1) give the allele counts for the individual or group named
+*    "fred".
+
+What it does on Galaxy
+This tool uses the selected SNP table to determine a "genetic distance" between each pair of selected individuals; the table of pairwise distances can be used by the Neighbor-Joining methods to construct a tree that depicts how the individuals are related. For a given pair of individuals, we find all SNP positions where both individuals have at least a minimum number of sequence "reads"; the individuals' distance at that SNP is defined as the absolute value of the difference in the frequency of the first allele (equivalently: the second allele). For instance, if the first individual has 5 reads of each allele and the second individual has respectively 3 and 6 reads, then the frequencies are 1/2 and 1/3, giving a distance of 1/6 at that SNP (provided that the minimum read total is at most 9). The output includes a report of the numbers of SNPs passing that threshold for each pair of individuals.
+
+*/
+
+#include "lib.h"
+
+ // bounds line length for a line of the Galaxy table
+
+ #define MOST 5000
+ // fewest informative SNPs a pair of individuals must share
+ #define MIN_SNPS 3
+
+ // one command-line "column:name" entry; A[0] is used for the reference
+ // species when one is named
+ struct argument {
+     int column;
+     char *name;
+ } A[MOST];
+ int nA; // number of individuals or groups + 1 (for the reference species)
+
+ #define MOST_INDIVIDUALS 100
+ // parenthesized so that SIZ expands safely inside larger expressions
+ #define SIZ (1+MOST_INDIVIDUALS) // includes the reference
+
+ // tot_diff[i][j] = sum of per-SNP distances for the pair i < j;
+ // ndiff[i][j] = number of SNPs that contributed to that sum
+ double tot_diff[SIZ][SIZ];
+ int ndiff[SIZ][SIZ], X[MOST];
+
+ /* main -- accumulate, over all rows of the SNP table, the difference in
+ *  allele frequency (or genotype) for every pair of selected individuals,
+ *  then write the averaged distance matrix to stdout and to the Mega file
+ *  (argv[7]) and the numbers of informative SNPs to argv[6].
+ *
+ *  Dies via fatal()/fatalf() on bad arguments, on malformed rows, or when
+ *  some pair has fewer than MIN_SNPS informative SNPs.
+ */
+ int main(int argc, char **argv) {
+     FILE *fp, *gp, *mega;
+     char *p, *z = "\t\n", buf[MOST], name[100], B[100], C[100], D[100],
+       *nucs = "ACGT";
+     int i, j, m, n, min_coverage, too_few, ref_allele = -1, has_ref,
+       min_quality, genotype;
+     double fi, fj, dist;
+
+     if (argc < 8)
+         fatal("args: Galaxy-table min-cov min-qual ref-name genotype count-out mega-out 13:fred 16:mary ...");
+     min_coverage = atoi(argv[2]);
+     min_quality = atoi(argv[3]);
+     if (min_coverage <= 0 && min_quality <= 0)
+         fatal("coverage and/or quality of SNPs should be constrained");
+
+     if (same_string(argv[4], "none"))
+         has_ref = 0;
+     else {
+         has_ref = 1;
+         A[0].name = copy_string(argv[4]);
+     }
+     genotype = atoi(argv[5]);
+     gp = ckopen(argv[6], "w");
+     mega = ckopen(argv[7], "w");
+     fprintf(mega, "#mega\n!Title: Galaxy;\n");
+
+     // slot 0 is reserved for the reference species when there is one
+     for (nA = has_ref, i = 8; i < argc; ++i, ++nA) {
+         if (nA >= SIZ)
+             fatal("Too many individuals");
+         // %99s keeps the name inside name[100]
+         if (sscanf(argv[i], "%d:%99s", &(A[nA].column), name) != 2)
+             fatalf("bad arg: %s", argv[i]);
+         // X[column] ... X[column+3] are read for every row
+         if (A[nA].column < 1 || A[nA].column + 3 >= MOST)
+             fatalf("bad column: %s", argv[i]);
+         A[nA].name = copy_string(name);
+     }
+     fprintf(mega,
+       "!Format DataType=Distance DataFormat=LowerLeft NTaxa=%d;\n\n",
+       nA);
+     for (i = 0; i < nA; ++i)
+         fprintf(mega, "[%d] #%s\n", i+1, A[i].name);
+     fprintf(mega, "\n\n\n[");
+     for (i = 1; i <= nA; ++i)
+         fprintf(mega, "%4d", i);
+     fprintf(mega, " ]\n");
+     fp = ckopen(argv[1], "r");
+     while (fgets(buf, MOST, fp)) {
+         if (buf[0] == '#')
+             continue;
+         if (has_ref) {
+             // get the reference allele; this must happen before strtok()
+             // below overwrites buf.  Widths keep B, C, D within 100 bytes.
+             if (sscanf(buf, "%*s %*s %99s %99s %*s %*s %*s %99s", B, C, D)
+                 != 3)
+                 fatalf("3 fields: %s", buf);
+             if (strchr(nucs, B[0]) == NULL ||
+                 strchr(nucs, C[0]) == NULL)
+                 fatalf("not nucs : %s %s", B, C);
+             if (D[0] == B[0])
+                 ref_allele = 1;
+             else if (D[0] == C[0])
+                 ref_allele = 2;
+             else if (strchr(nucs, D[0]) != NULL)
+                 ref_allele = 3;
+             else {
+                 if (D[0] != '-' && D[0] != 'N')
+                     fatalf("what is this: %s", D);
+                 ref_allele = -1;
+             }
+         }
+
+         // X[i] = atoi(i-th word base-1)
+         for (i = 1, p = strtok(buf, z); p != NULL;
+              ++i, p = strtok(NULL, z))
+             X[i] = atoi(p);
+         for (i = has_ref; i < nA; ++i) {
+             m = A[i].column;
+             if (X[m] + X[m+1] < min_coverage ||
+                 X[m+3] < min_quality)
+                 continue;
+
+             // frequency of the second allele
+             if (genotype) {
+                 if (X[m+2] == -1)
+                     continue; // no genotype
+                 fi = (double)X[m+2];
+             } else {
+                 // a non-positive min-cov lets zero-coverage entries
+                 // through; skip them rather than compute 0/0
+                 if (X[m] + X[m+1] == 0)
+                     continue;
+                 fi = (double)X[m+1] / (double)(X[m]+X[m+1]);
+             }
+             if (has_ref && ref_allele > 0) {
+                 ndiff[0][i]++;
+                 // reference allele might be different from both
+                 if (ref_allele == 1)
+                     tot_diff[0][i] += fi;
+                 else if (ref_allele == 2)
+                     tot_diff[0][i] += (1.0 - fi);
+                 else
+                     tot_diff[0][i] += 1.0;
+             }
+             for (j = i+1; j < nA; ++j) {
+                 n = A[j].column;
+                 if (X[n] + X[n+1] < min_coverage ||
+                     X[n+3] < min_quality)
+                     continue;
+                 if (genotype && X[n+2] == -1)
+                     continue;
+                 // same zero-coverage guard as for individual i
+                 if (!genotype && X[n] + X[n+1] == 0)
+                     continue;
+                 ndiff[i][j]++;
+                 if (genotype)
+                     fj = (double)X[n+2];
+                 else
+                     fj = (double)X[n+1] /
+                          (double)(X[n] + X[n+1]);
+                 fj -= fi;
+                 // add abs. value of difference in frequencies
+                 tot_diff[i][j] += (fj >= 0.0 ? fj : -fj);
+             }
+
+         }
+     }
+     fclose(fp);
+
+     // every pair needs at least MIN_SNPS informative SNPs; this also
+     // guarantees that the divisions below are by a non-zero count
+     for (i = too_few = 0; i < nA; ++i)
+         for (j = i+1; j < nA; ++j)
+             if (ndiff[i][j] < MIN_SNPS) {
+                 too_few = 1;
+                 fprintf(stderr,
+                   "%s and %s have only %d informative SNPs\n",
+                   A[i].name, A[j].name, ndiff[i][j]);
+             }
+     if (too_few)
+         fatal("remove individuals or relax constraints");
+
+     // print distances: the full square matrix on stdout, the lower-left
+     // triangle in the Mega file
+     printf("%d\n", nA);
+     for (i = 0; i < nA; ++i) {
+         printf("%9s", A[i].name);
+         fprintf(mega, "[%d] ", i+1);
+         for (j = 0; j < i; ++j) {
+             dist = tot_diff[j][i]/(double)ndiff[j][i];
+             printf(" %6.4f", dist);
+             fprintf(mega, " %6.4f", dist);
+         }
+         fprintf(mega, "  \n");
+         printf(" 0.0000");
+         for (j = i+1; j < nA; ++j)
+             printf(" %6.4f",
+               tot_diff[i][j]/(double)ndiff[i][j]);
+         putchar('\n');
+     }
+     fprintf(mega, "\n\n\n\n\n");
+     fclose(mega);
+
+     // print numbers of SNPs
+     for (i = 0; i < nA; ++i) {
+         fprintf(gp, "%9s", A[i].name);
+         for (j = 0; j < i; ++j)
+             fprintf(gp, " %8d", ndiff[j][i]);
+         fprintf(gp, "        0");
+         for (j = i+1; j < nA; ++j)
+             fprintf(gp," %8d", ndiff[i][j]);
+         putc('\n', gp);
+     }
+     fclose(gp);
+
+     return 0;
+ }

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/dpmix.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/dpmix.c Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,510 @@\n+/* dpmix -- admixture using dynamic programming\n+*\n+* argv{1] = a Galaxy SNP table. For each of several individuals, the table\n+* has four columns (#A, #B, genotype, quality) -- SNPs on the same\n+*\t chromosome must appear together, and in order of position\n+* argv[2] = column with the chromosome name (position is the next column)\n+* argv[3] = "all" or e.g., "chr20"\n+* argv[4] = 1 if ancestral allele frequencies are estimated from SAMtools\n+*\t\tgenotypes; 0 means use read-coverage data.\n+* argv[5] = 1 to add logarithms of probabilities, allowing unobserve alleles,\n+*\t 0 to simply add probabilities\n+* argv[6] = switch penalty (>= 0)\n+* argv[7] = file giving heterochromatic intervals (\'-\' means that no file is\n+*\t given)\n+* argv[8] = file name for additional output\n+* argv[9], argv[10], ..., have the form "13:1:Peter", "13:2:Paul" or\n+*\t "13:0:Mary", meaning that the 13th and 14th columns (base 1)\n+*\t give the allele counts for an individual that is in ancestral\n+*\t population 1, ancestral population 2, or is a potentially admixed\n+*\t individual, resp.\n+\n+What it does on Galaxy\n+The user specifies two "ancestral" populations (i.e., sources for chromosomes) and a set of potentially admixed individuals, and chooses between the sequence coverage or the estimated genotypes to measure the similarity of genomic intervals in admixed individuals to the two classes of ancestral chromosomes. The user also picks a "switch penalty", typically between 10 and 100. For each potentially admixed individual, the program divides the genome into three "genotypes": (0) homozygous for the second ancestral population (i.e., both chromosomes from that population), (1) heterozygous, or (2) homozygous for the second ancestral population. Parts of a reference chromosome that are labeled as "heterochromatic" are given the non-genotype, 3. 
Smaller values of the switch penalty (corresponding to more ancient admixture events) generally lead to the reconstruction of more frequent changes between genotypes.\n+*/\n+\n+#include "lib.h"\n+//#include <math.h>\n+\n+// maximum length of a line from the table\n+#define MOST 5000\n+\n+// we create a linked list of "events" on a chromosome -- mostly SNPs, but\n+// also ends of hetorochomatic intervals\n+struct snp {\n+\tdouble F1, F2;\t// reference allele frequencies in the two populations\n+\tint pos, *g,\t// position and an array of admixed genotypes\n+\t type;\t\t// 0 = SNP, 1 = start of het. interval, 2 = end\n+\tstruct snp *prev;\t// we keep the list in order of decreasing pos\n+} *last;\n+\n+// array of potentially admixed individuals\n+struct admixed {\n+\tchar *name;\n+\tint gcol, ge20, gt02;\n+\tlong long x[4];\t\t// number of reference bp in each state\n+} A[MOST];\n+\n+// information about "ancestral" individuals, namely column and population\n+struct ances {\n+\tint col, pop;\n+\tchar *name;\n+} C[MOST];\n+\n+// heterochromatic intervals\n+struct het {\n+\tchar *chr;\n+\tint b, e;\n+} H[MOST];\n+\n+// global variables\n+int *B[4],\t// backpointer to state at the previous SNP (or event)\n+ *P;\t\t// chromosome position\n+int nH, nI, nG, genotypes, nsnp, debug, chr_col, logs;\n+char this_chr[100];\n+double switch_penalty;\n+char buf[MOST], *status;\n+FILE *fp, *out;\n+\n+// probability of producing genotype g in admixture state s\n+// given reference allele frequencies f1 and f2 in the ancestral populations\n+double score (double f1, double f2, int g, int s) {\n+\tdouble p;\n+\n+\tif (s == 2) { // homozygous for the first ancestral population\n+\t\tif (g == 2)\n+\t\t\tp = f1*f1;\n+\t\telse if (g == 0)\n+\t\t\tp = (1.0-f1)*(1.0-f1);\n+\t\telse\n+\t\t\tp = 2.0*f1*(1.0-f1);\n+\t} else if (s == 0) { // homozygous for the second ancestral population\n+\t\tif (g == 2)\n+\t\t\tp = f2*f2;\n+\t\telse if (g == 0)\n+\t\t\tp = 
(1.0-f2)*(1.0-f2);\n+\t\telse\n+\t\t\tp = 2.0*f2*(1.0-f2);\n+\t} else { // one chromosome from each ancestral population\n+\t\tif (s != 1)\n+\t\t\tfatalf("bad state %d", s);\n+\t\tif (g == 2)\n+\t\t\tp = f1*f2;\n+\t\telse if (g == 0)\n'..b' {\t// space for back-pointers\n+\t\tB[i] = ckalloc((nsnp+1)*sizeof(int));\n+\t\tB[i][nsnp] = 0;\n+\t}\n+\t\n+\t// loop over possibly admixed individuals\n+\tfor (a = 0; a < nG; ++a)\n+\t\tone_admix(a);\n+\n+\t// free the allocated storage\n+\twhile (last != NULL) {\n+\t\tnew = last;\n+\t\tlast = last->prev;\n+\t\tfree(new->g);\n+\t\tfree(new);\n+\t}\n+\tfree(P);\n+\tfor (i = 0; i < 4; ++i)\n+\t\tfree(B[i]);\n+}\n+\n+int main(int argc, char **argv) {\n+\tint n, i, j, k, saw[3];\n+\tlong long het_len, ref_len;\n+\tfloat N;\n+\tchar nam[100], *chr;\n+\n+\tif (argc < 9)\n+\t\tfatal("args: table chr-col chr data-source logs switch heterochrom outfile n:1:name1 m:2:name2 ...");\n+\tif (same_string(argv[argc-1], "debug")) {\n+\t\tdebug = 1;\n+\t\t--argc;\n+\t}\n+\n+\t// handle command-line arguments\n+\tchr_col = atoi(argv[2]);\n+\tchr = argv[3];\n+\tgenotypes = atoi(argv[4]);\n+\n+\tlogs = atoi(argv[5]);\n+\tif (logs)\n+\t\tfatal("logarithms of probabilities -- under development");\n+\t//if (logs) switch_penalty = log(switch_penalty);\n+\n+\tswitch_penalty = atof(argv[6]);\n+\tif (switch_penalty < 0.0)\n+\t\tfatal("negative switch penalty");\n+\tout = ckopen(argv[8], "w");\n+\n+\thet_len = ref_len = 0;\n+\tif (!same_string(argv[7], "-")) {\n+\t\tfp = ckopen(argv[7], "r");\n+\t\twhile (fgets(buf, MOST, fp)) {\n+\t\t\tif (nH >= MOST)\n+\t\t\t\tfatal("Too many heterochromatic intervals");\n+\t\t\tif (sscanf(buf, "%s %d %d", nam, &i, &j) != 3)\n+\t\t\t\tfatalf("gagging: %s", buf);\n+\t\t\tH[nH].chr = copy_string(nam);\n+\t\t\tH[nH].b = i;\n+\t\t\tH[nH].e = j;\n+\t\t\t// assumes last event per chrom. is a het. 
interval\n+\t\t\tif (nH > 0 && !same_string(nam, H[nH-1].chr))\n+\t\t\t\tref_len += j;\n+\t\t\thet_len += (j - i);\n+\t\t\t++nH;\n+\t\t}\n+\t\tfclose(fp);\n+\t}\n+\tref_len += H[nH-1].e;\n+\n+\t// populations must be disjoint\n+\tsaw[1] = saw[2] = 0;\n+\tfor (i = 9; i < argc; ++i) {\n+\t\tif (sscanf(argv[i], "%d:%d:%s", &j, &k, nam) != 3)\n+\t\t\tfatalf("not like 13:2:fred : %s", argv[i]);\n+\t\tif (k < 0 || k > 2)\n+\t\t\tfatalf("not population 0, 1 or 2: %s", argv[i]);\n+\t\tsaw[k] = 1;\n+\n+\t\t// seen this individual (i.e., column) before??\n+\t\tfor (n = 0; n < nI && C[n].col != j; ++n)\n+\t\t\t;\n+\t\tif (n < nI)\n+\t\t\tfatal("populations are not disjoint");\n+\t\tif (k == 0) {\t// admixed individual\n+\t\t\tif (nG >= MOST)\n+\t\t\t\tfatal("Too many admixed individuals");\n+\t\t\tA[nG].name = copy_string(nam);\n+\t\t\tA[nG++].gcol = j+2;\n+\t\t} else {\t// in an ancestral population\n+\t\t\tif (nI >= MOST)\n+\t\t\t\tfatal("Too many ancestral individuals");\n+\t\t\tC[nI].col = j;\n+\t\t\tC[nI].pop = k;\n+\t\t\tC[nI++].name = copy_string(nam);\n+\t\t}\n+\t}\n+\tif (saw[0] == 0)\n+\t\tfatal("no admixed individual is specified");\n+\tif (saw[1] == 0)\n+\t\tfatal("first reference population is empty");\n+\tif (saw[2] == 0)\n+\t\tfatal("second reference population is empty");\n+\n+\t// start the output file of text\n+\tfor (k = 1; k <= 2; ++k) {\n+\t\tfprintf(out, "state %d agrees with:", k == 1 ? 
2 : 0);\n+\t\tfor (i = 0; i < nI; ++i)\n+\t\t\tif (C[i].pop == k)\n+\t\t\t\tfprintf(out, " %s", C[i].name);\n+\t\tputc(\'\\n\', out);\n+\t}\n+\tputc(\'\\n\', out);\n+\n+\tfp = ckopen(argv[1], "r");\n+\twhile ((status = fgets(buf, MOST, fp)) != NULL && buf[0] == \'#\')\n+\t\t;\n+\tif (same_string(chr, "all"))\n+\t\twhile (status != NULL)\n+\t\t\tone_chr();\n+\telse {\t// skip to the specified chromosome\n+\t\twhile (!same_string(chr, get_chr_name()) &&\n+\t\t (status = fgets(buf, MOST, fp)) != NULL)\n+\t\t\t;\n+\t\tif (status != NULL)\n+\t\t\tone_chr();\n+\t}\n+\tfor (i = 0; i < nG; ++i) {\n+\t\tfprintf(out,\n+\t\t "%s: %d SNPs where state 2 is at least as likely as state 0\\n",\n+\t\t A[i].name, A[i].ge20);\n+\t\tfprintf(out,\n+\t\t "%s: %d SNPs where state 0 is more likely than state 2\\n\\n",\n+\t\t A[i].name, A[i].gt02);\n+\t}\n+\t// write fractions in each state to the output text file\n+\n+\tif (ref_len)\n+\t\tfprintf(out,\n+\t\t "%lld of %lld reference bp (%1.1f%%) are heterochromatin\\n\\n",\n+\t\t het_len, ref_len, 100.0*(float)het_len/(float)ref_len);\n+\n+\tfor (i = 0; i < nG; ++i) {\n+\t\tN = (float)(A[i].x[0] + A[i].x[1] + A[i].x[2])/100.0;\n+\t\tfprintf(out, "%s: 0 = %1.1f%%, 1 = %1.1f%%, 2 = %1.1f%%\\n",\n+\t\t A[i].name, (float)A[i].x[0]/N, (float)A[i].x[1]/N,\n+\t\t (float)A[i].x[2]/N); \n+\t}\n+\n+\treturn 0;\n+}\n'

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/eval2pct.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/eval2pct.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,26 @@
+#include "lib.h"
+
+ // most eigenvalues that can be read
+ #define MAX_EVAL 1000
+
+ float E[MAX_EVAL];   // eigenvalues, in input order
+ int nE;              // number of entries of E[] in use
+
+ /* main -- read one eigenvalue per line from argv[1] (or from stdin when no
+ *  argument is given) and print each leading positive eigenvalue as a
+ *  percentage of the sum of all eigenvalues read.
+ */
+ int main (int argc, char **argv) {
+     char line[500];
+     float total = 0.0;
+     int k;
+     FILE *in;
+
+     if (argc == 1)
+         in = stdin;
+     else
+         in = ckopen(argv[1], "r");
+
+     while (fgets(line, 500, in) != NULL) {
+         if (nE >= MAX_EVAL)
+             fatal("Too many eigenvalues");
+         E[nE] = atof(line);
+         ++nE;
+     }
+
+     for (k = 0; k < nE; ++k)
+         total += E[k];
+
+     printf("Percentage explained by eigenvectors:\n");
+     // the report stops at the first eigenvalue that is not positive
+     k = 0;
+     while (k < nE && E[k] > 0.0) {
+         printf("%d: %1.1f%%\n", k+1, 100.0*(float)E[k]/total);
+         ++k;
+     }
+     return 0;
+ }

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/lib.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/lib.c Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,71 @@
+// lib.c -- a little library of C procedures
+
+#include "lib.h"
+
+char *argv0;
+
+ /* print_argv0 -- write "<program>: " on stderr, where <program> is the part
+ *  of argv0 after its last '/'; does nothing when argv0 is not set */
+ void print_argv0(void)
+ {
+     char *slash;
+     const char *base;
+
+     if (argv0 == NULL)
+         return;
+     slash = strrchr(argv0, '/');
+     base = (slash != NULL) ? slash + 1 : argv0;
+     (void)fprintf(stderr, "%s: ", base);
+ }
+
+ /* fatal -- print msg, followed by a newline, on stderr by way of fatalf()
+ *  and terminate the program; does not return */
+ void fatal(const char *msg)
+ {
+ fatalf("%s", msg); /* "%s" so that any '%' inside msg is printed literally */
+ }
+
+/* fatalf --------------------------------- format message, print it, and die */
+void fatalf(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ fflush(stdout);
+ print_argv0();
+ (void)vfprintf(stderr, fmt, ap);
+ (void)fputc('\n', stderr);
+ va_end(ap);
+ exit(1);
+}
+
+ /* ckopen -- fopen() wrapper that reports the file name and exits through
+ *  fatalf() instead of returning NULL */
+ FILE *ckopen(const char *name, const char *mode)
+ {
+     FILE *stream = fopen(name, mode);
+
+     if (stream == NULL)
+         fatalf("Cannot open %s.", name);
+     return stream;
+ }
+
+ /* ckalloc -- malloc() wrapper that exits through fatal()/fatalf() instead
+ *  of returning NULL; a request for 0 bytes is served as a 1-byte request */
+ void *ckalloc(size_t amount)
+ {
+     size_t request = amount;
+     void *block;
+
+     /* a size that is negative when viewed as a long is rejected */
+     if ((long)amount < 0) /* was "<= 0" -CR */
+         fatal("ckalloc: request for negative space.");
+     if (request == 0)
+         request = 1; /* ANSI portability hack */
+
+     block = malloc(request);
+     if (block == NULL)
+         fatalf("Ran out of memory trying to allocate %lu.",
+           (unsigned long)request);
+     return block;
+ }
+
+ /* same_string -- return 1 when s and t hold the same characters, else 0 */
+ bool same_string(const char *s, const char *t)
+ {
+     int order = strcmp(s, t);
+
+     return order == 0;
+ }
+
+ /* copy_string -- return a copy of s held in storage obtained from ckalloc() */
+ char *copy_string(const char *s)
+ {
+     size_t size = strlen(s) + 1; /* +1 to hold '\0' */
+     char *dup = ckalloc(size);
+
+     memcpy(dup, s, size);
+     return dup;
+ }

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/lib.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/lib.h Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,33 @@
+// lib.h -- header file for some useful procedures
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <limits.h> /* INT_MAX, INT_MIN, LONG_MAX, LONG_MIN, etc. */
+#include <stdarg.h>
+
+ typedef unsigned char uchar;
+ typedef int bool; /* NOTE(review): would conflict with <stdbool.h> if that header is ever included alongside -- confirm */
+
+ extern char *argv0; /* program name printed by print_argv0(); the variable itself is defined in lib.c */
+
+ void print_argv0(void);
+ #ifdef __GNUC__ /* avoid some "foo might be used uninitialized" warnings */
+ void fatal(const char *msg) __attribute__ ((noreturn));
+ void fatalf(const char *fmt, ...) __attribute__ ((noreturn));
+ void fatalfr(const char *fmt, ...) __attribute__ ((noreturn)); /* NOTE(review): no definition of fatalfr appears in lib.c */
+ #else
+ void fatal(const char *msg);
+ void fatalf(const char *fmt, ...);
+ void fatalfr(const char *fmt, ...);
+ #endif
+ FILE *ckopen(const char *name, const char *mode); /* fopen() that exits on failure */
+ void *ckalloc(size_t amount); /* malloc() that exits on failure */
+ bool same_string(const char *s, const char *t); /* strcmp(s, t) == 0 */
+ char *copy_string(const char *s); /* copy of s in ckalloc()ed storage */
+
+ #undef MAX
+ #define MAX(x,y) ((x) > (y) ? (x) : (y))
+ #undef MIN
+ #define MIN(x,y) ((x) < (y) ? (x) : (y))

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/pop.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/pop.c Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,76 @@
+/* pop -- add four columns (allele counts, genotype, maximum quality) for a
+*  specified population to a Galaxy SNP table, or enforce bounds
+*
+*  argv[1] = file containing a Galaxy table
+*  argv[2] = lower bound on total coverage (-1 = no lower bound)
+*  argv[3] = upper bound on total coverage (-1 if no bound)
+*  argv[4] = lower bound on individual coverage (-1 = no bound)
+*  argv[5] = lower bound on individual quality value (-1 = no bound)
+*  argv[6] ... are the starting columns (base-1) for the chosen individuals
+
+What it does on Galaxy
+The user specifies that some of the individuals in the selected SNP table form a "population" that has been previously defined using the Galaxy tool to select individuals from a SNP table. One option is for the program to append four columns to the table, giving the total counts for the two alleles, the "genotype" for the population and the maximum quality value, taken over all individuals in the population. If all defined genotypes in the population are 2 (agree with the reference), the population's genotype is 2; similarly for 0; otherwise the genotype is 1 (unless all individuals have undefined genotype, in which case it is -1).  The other option is to remove rows from the table for which the total coverage for the population is either too low or too high, and/or if the individual coverage or quality value is too low.
+*/
+
+#include "lib.h"
+
+ // most characters allowed in a row of the table
+ #define MOST 50000
+
+ // column for the relevant individuals/groups
+ int col[MOST];
+ int nI;
+
+ /* main -- read the SNP table row by row and, for the individuals whose
+ *  starting columns are given, either append four summary columns (summed
+ *  allele counts, combined genotype, maximum quality) when all four bounds
+ *  are -1, or print only the rows that satisfy the bounds.  Rows in which
+ *  any chosen individual fails the per-individual coverage or quality bound
+ *  are dropped in both modes; lines starting with '#' are not copied.
+ */
+ int main(int argc, char **argv) {
+     FILE *fp;
+     char *p, *z = "\t\n", buf[MOST], trash[MOST];
+     int X[MOST], m, i, A, B, G, Q, lo, hi, indiv, qual, g, q;
+
+     // argv[1] ... argv[5] are all read below
+     if (argc < 6)
+         fatal("args: SNP-table low high indiv-cov indiv-qual col1 col2 ...");
+
+     lo = atoi(argv[2]);
+     hi = atoi(argv[3]);
+     indiv = atoi(argv[4]);
+     qual = atoi(argv[5]);
+     for (i = 6, nI = 0; i < argc; ++i, ++nI) {
+         if (nI >= MOST)
+             fatal("Too many individuals");
+         m = atoi(argv[i]);
+         // X[m] ... X[m+3] are read for every row, so all must lie inside X[]
+         if (m < 1 || m + 3 >= MOST)
+             fatalf("bad column: %s", argv[i]);
+         col[nI] = m;
+     }
+
+     fp = ckopen(argv[1], "r");
+     while (fgets(buf, MOST, fp)) {
+         if (buf[0] == '#')
+             continue;
+         // strtok() overwrites its input, so tokenize a scratch copy and
+         // keep buf intact for printing
+         strcpy(trash, buf);
+         // set X[i] = atoi(i-th word of the row), i is base 1
+         for (i = 1, p = strtok(trash, z); p != NULL;
+           ++i, p = strtok(NULL, z))
+             X[i] = atoi(p);
+         for (i = A = B = Q = 0, G = -1; i < nI; ++i) {
+             m = col[i];
+             if (X[m]+X[m+1] < indiv || (q = X[m+3]) < qual)
+                 break;
+             A += X[m];
+             B += X[m+1];
+             g = X[m+2];
+             // G stays -1 until a defined genotype is seen, keeps that
+             // value while all defined genotypes agree, and becomes 1 once
+             // two of them differ
+             if (g != -1) {
+                 if (G == -1) // first time
+                     G = g;
+                 else if (G != g)
+                     G = 1;
+             }
+             Q = MAX(Q, q);
+         }
+         if (i < nI) // check bounds on the population's individuals
+             continue;
+         if (lo == -1 && hi == -1 && indiv == -1 && qual == -1) {
+             // add columns
+             if ((p = strchr(buf, '\n')) != NULL)
+                 *p = '\0';
+             printf("%s\t%d\t%d\t%d\t%d\n", buf, A, B, G, Q);
+         } else if (A+B >= lo && (hi == -1 || A+B <= hi))
+             // coverage meets the population-level restrictions
+             printf("%s", buf);
+     }
+     fclose(fp);
+
+     return 0;
+ }

diff -r d4ec09e8079f -r 4b6590dd7250 genome_diversity/src/sweep.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genome_diversity/src/sweep.c Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,279 @@\n+/* sweep -- find regions of the genome with high scores (e.g., Fst scores).\n+*\n+* argv[1] -- file containing a Galaxy table\n+* argv[2] -- column number (base-1) for the chromosome name\n+* argv[3] -- column number for the (base-0) chromosomal position\n+* argv[4] -- column number for a score for the position\n+* argv[5] -- a percentage, such as "95", or a raw score, such as "=0.9".\n+* argv[6] -- the number of randomizations (shuffles) of the scores\n+* argv[7] -- [optional] if present and non-zero, report SNPs\n+*\n+* The program first determines a threshold such that the stated percentage\n+* of the scores are below that threshold (or uses the provided number if\n+* argv[5] starts with "="). The program subtracts the threshold\n+* from each score, then looks for maximal-scoring runs of SNPs, i.e., where\n+* adding or subtracting SNPs from an end of then run always decreases the\n+* total score. These regions are printed in order of descreasing total score.\n+* To determine a cutoff for the printed regions, the programs takes the maximum\n+* score over all regions observed in a specified number of shuffles of the\n+* list of scores. If argv[6] = 0, then all maximal-scoring runs of at least\n+* 4 table entries are printed.\n+\n+What it does on Galaxy\n+The user selects a SNP table and specifies the columns containing (1) chromosome, (2) position, (3) scores (such as an Fst-value for the SNP), (4) a percentage or raw score for the "cutoff" and (5) the number of times the data should be radomized (only intervals with score exceeding the maximum for the randomized data are reported). If a percentage (e.g. 95%) is specified for #3, then that percentile of the scores is used as the cutoff; this may not work well if many SNPs have the same score. 
The program subtracts the cutoff from every score, then finds genomic intervals (i.e., consecutive runs of SNPs) whose total score cannot be increased by adding or subtracting one or more SNPs at the ends of the interval.\n+*/\n+\n+#include "lib.h"\n+#include "Huang.h"\n+\n+// maximum number of rows in any processed table\n+#define MANY 20000000\n+#define BUF_SIZE 5000\n+#define MAX_WINDOW 1000000\n+\n+double X[MANY];\t// holds all scores\n+int nX;\n+\n+// position-score pairs for a single chromosome\n+struct score {\n+\tint pos;\n+\tdouble x; // original score, then shifted score\n+} S[MANY];\n+int nS;\n+\n+struct snp {\n+\tint pos;\n+\tdouble x;\n+\tstruct snp *next;\n+};\n+\n+// structure to hold the maximum-scoring chromosomal intervals\n+struct sweep {\n+\tfloat score;\n+\tchar *chr;\n+\tint b, e;\n+\tstruct snp *snps;\n+} W[MAX_WINDOW];\n+int nW;\n+\n+// return the linked list of SNPs in positions b to e\n+struct snp *add_snps(int b, int e) {\n+\tstruct snp *first = NULL, *last = NULL, *new;\n+\tint i;\n+\tfor (i = b; i <= e; ++i)\n+\t\tif (S[i].pos >= 0) {\n+\t\t\tnew = ckalloc(sizeof(*new));\n+\t\t\tnew->pos = S[i].pos;\n+\t\t\tnew->x = S[i].x;\n+\t\t\tnew->next = NULL;\n+\t\t\tif (first == NULL)\n+\t\t\t\tfirst = new;\n+\t\t\telse\n+\t\t\t\tlast->next = new;\n+\t\t\tlast = new;\n+\t\t}\n+\treturn first;\n+}\n+\n+// given a table row, return a pointer to the item in a particular column\n+char *get_col(char *buf, int col) {\n+\tstatic char temp[BUF_SIZE], *p;\n+\tint i;\n+\tchar *z = " \\t\\n";\n+\n+\tstrcpy(temp, buf);\n+\tfor (p = strtok(temp, z), i = 1; *p && i < col;\n+\t p = strtok(NULL, z), ++i)\n+\t\t;\n+\tif (p == NULL)\n+\t\tfatalf("no column %d in %s", col, buf);\n+\treturn p;\n+}\n+\n+// fill S[] with position-score pairs for the next chromosome\n+// return 0 for EOF\n+int get_chr(FILE *fp, int chr_col, int pos_col, int score_col, char *chr) {\n+\tstatic char buf[BUF_SIZE];\n+\tstatic int init = 1;\n+\tchar *status;\n+\n+\tif (init) {\n+\t\twhile 
((status = fgets(buf, BUF_SIZE, fp)) != NULL &&\n+\t\t buf[0] == \'#\')\n+\t\t\t;\n+\t\tif (status == NULL)\n+\t\t\tfatal("empty table");\n+\t\tinit = 0;\n+\t}\n+\tif (buf[0] == \'\\0\')\n+\t\treturn 0;\n+\t\n+\tif (buf[0] == \'#\')\n+\t\tfatal("cannot happen");\n+\tstrcpy(chr, get_col(buf, chr_col));\n+\tS[0].pos = atoi(get_col(buf, pos_col));\n+\tS[0].x = atof(get_col(buf, score_col));\n+\tfo'..b'uf, BUF_SIZE, fp)) {\n+\t\t\tbuf[0] = \'\\0\';\n+\t\t\treturn 1;\n+\t\t}\n+\t\tif (!same_string(chr, get_col(buf, chr_col)))\n+\t\t\tbreak;\n+\t\tS[nS].pos = atoi(get_col(buf, pos_col));\n+\t\tS[nS].x = atof(get_col(buf, score_col));\n+\t}\n+\treturn 1;\n+}\n+\n+// for sorting genomic intervals by *decreasing* score\n+int Wcompar(struct sweep *a, struct sweep *b) {\n+\tfloat y = a->score, z = b->score;\n+\n+\tif (y > z)\n+\t\treturn -1;\n+\tif (y < z)\n+\t\treturn 1;\n+\treturn 0;\n+}\n+\n+// for sorting an array of scores into increasing order\n+int fcompar(double *a, double *b) {\n+\tif (*a < *b)\n+\t\treturn -1;\n+\tif (*a > *b)\n+\t\treturn 1;\n+\treturn 0;\n+}\n+\n+/* shuffle the values S[0], S[1], ... 
, S[nscores-1];\n+* Uses Algorithm P in page 125 of "The Art of Computer Programming (Vol II)\n+* Seminumerical Programming", by Donald Knuth, Addison-Wesley, 1971.\n+*/\n+void shuffle_scores() {\n+\tint i, j;\n+\tdouble temp;\n+\n+\tfor (i = nX-1; i > 0; --i) {\n+\t\t// swap what\'s in location i with location j, where 0 <= j <= i\n+\t\tj = random() % (i+1);\n+\t\ttemp = X[i];\n+\t\tX[i] = X[j];\n+\t\tX[j] = temp;\n+\t}\n+}\n+\n+// return the best interval score (R[i] is the struct operated by Huang())\n+double best() {\n+\tint i;\n+\tdouble bestScore;\n+\n+\tHuang(X, nX);\n+\n+\tfor (bestScore = 0.0, i = 1; i <= top; ++i) \n+\t\tbestScore = MAX(R[i].Score, bestScore);\n+\treturn bestScore;\n+}\n+\n+int main(int argc, char **argv) {\n+\tFILE *fp;\n+\tchar buf[BUF_SIZE], chr[100], *a;\n+\tdouble shift = 0.0, cutoff;\n+\tint i, b, e, chr_col, pos_col, score_col, nshuffle, snps = 0;\n+\tstruct snp *s;\n+\n+\tif (argc != 7 && argc != 8)\n+\t\tfatal("args: table chr_col pos_col score_col threhold randomizations [SNPs]");\n+\n+\t// process command-line arguments\n+\tchr_col = atoi(argv[2]);\n+\tpos_col = atoi(argv[3]);\n+\tscore_col = atoi(argv[4]);\n+\ta = argv[5];\n+\tfp = ckopen(argv[1], "r");\n+\tif (argc == 8)\n+\t\tsnps = atoi(argv[7]);\n+\tif (isdigit(a[0])) {\n+\t\tfor (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) {\n+\t\t\tif (buf[0] == \'#\') \n+\t\t\t\tcontinue;\n+\t\t\tX[nX++] = atof(get_col(buf, score_col));\n+\t\t}\n+\t\tif (nX == MANY)\n+\t\t\tfatal("Too many rows");\n+\t\tqsort((void *)X, (size_t)nX, sizeof(double),\n+\t\t (const void *)fcompar);\n+\t\tshift = X[atoi(a)*nX/100];\n+\t\trewind(fp);\n+\t} else if (a[0] == \'=\')\n+\t\tshift = atof(a+1);\n+\n+//fprintf(stderr, "shift = %4.3f\\n", shift);\n+\tnshuffle = atoi(argv[6]);\n+\tif (nshuffle == 0)\n+\t\tcutoff = 0;\n+\telse {\n+\t\tfor (nX = 0; nX < MANY && fgets(buf, BUF_SIZE, fp); ) { \n+\t\t\tif (buf[0] == \'#\')\n+\t\t\t\tcontinue;\n+\t\t\tX[nX++] = atof(get_col(buf, score_col)) - 
shift;\n+\t\t}\n+\t\tif (nX == MANY)\n+\t\t\tfatal("Too many rows");\n+\t\tfor (cutoff = 0.0, i = 0; i < nshuffle; ++i) {\n+\t\t\tshuffle_scores();\n+\t\t\tcutoff = MAX(cutoff, best());\n+\t\t}\n+\t\trewind(fp);\n+\t}\n+//fprintf(stderr, "cutoff = %4.3f\\n", cutoff);\n+\n+\t// loop over chromosomes;\n+\t// start by getting the chromosome\'s scores\n+\twhile (get_chr(fp, chr_col, pos_col, score_col, chr)) {\n+\t\t// subtract shift from the scores\n+\t\tfor (i = 0; i < nS; ++i)\n+\t\t\tX[i] = S[i].x - shift;\n+\n+\t\t// find the maximum=scoring regions\n+\t\tHuang(X, nS);\n+\t\n+\t\t// save any regions with >= 4 points and score >= cutoff\n+\t\tfor (i = 0; i <= top; ++i) {\n+\t\t\tif (nW >= MAX_WINDOW)\n+\t\t\t\tfatalf("too many windows");\n+\n+\t\t\t// get indices of the first and last SNP in the interval\n+\t\t\tb = R[i].Lpos + 1;\n+\t\t\te = R[i].Rpos;\n+\n+\t\t\t// remove unmapped SNP position from intervals\' ends\n+\t\t\twhile (b < e && S[b].pos == -1)\n+\t\t\t\t++b;\n+\t\t\twhile (e > b && S[e].pos == -1)\n+\t\t\t\t--e;\n+\n+\t\t\t// record intervals\n+\t\t\tif (e - b < 3 || R[i].Score < cutoff)\n+\t\t\t\tcontinue;\n+\t\t\tW[nW].score = R[i].Score;\n+\t\t\tW[nW].chr = copy_string(chr);\n+\t\t\tW[nW].b = S[b].pos;\n+\t\t\tW[nW].e = S[e].pos+1;\t// Ws are half-open\n+\t\t\tif (snps)\n+\t\t\t\tW[nW].snps = add_snps(b, e);\n+\t\t\t++nW;\n+\t\t}\n+\t}\n+\n+\t// sort by decreasing score\n+\tqsort((void *)W, (size_t)nW, sizeof(W[0]), (const void *)Wcompar);\n+\n+\tfor (i = 0; i < nW; ++i) {\n+\t\tprintf("%s\\t%d\\t%d\\t%4.4f\\n", \n+\t\t\tW[i].chr, W[i].b, W[i].e, W[i].score);\n+\t\tfor (s = W[i].snps; s; s = s->next)\n+\t\t\tprintf(" %d %3.2f\\n", s->pos, s->x);\n+\t}\n+\treturn 0;\n+}\n'

diff -r d4ec09e8079f -r 4b6590dd7250 lib/galaxy/datatypes/wsf.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/galaxy/datatypes/wsf.py Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,185 @@\n+"""\n+SnpFile datatype\n+"""\n+\n+import galaxy.datatypes.data\n+import tempfile\n+import os\n+import simplejson\n+from galaxy import util\n+from galaxy.datatypes.sniff import *\n+from galaxy.datatypes.tabular import Tabular\n+from galaxy.datatypes.images import Html\n+from galaxy.datatypes import metadata\n+from galaxy.datatypes.metadata import MetadataElement\n+\n+class Wped( Html ):\n+ allow_datatype_change = False\n+ composite_type = \'basic\'\n+ file_ext = \'gd_ped\'\n+\n+ MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default=\'WpedData\', readonly=True, set_in_upload=True )\n+\n+ def __init__( self, **kwd ):\n+ Html.__init__( self, **kwd )\n+ self.add_composite_file( \'%s.ped\', description = \'Pedigree File\', substitute_name_with_metadata = \'base_name\', is_binary = False )\n+ self.add_composite_file( \'%s.map\', description = \'Map File\', substitute_name_with_metadata = \'base_name\', is_binary = False )\n+\n+class Individuals( Tabular ):\n+ file_ext = \'gd_indivs\'\n+ def __init__(self, **kwd):\n+ Tabular.__init__( self, **kwd )\n+ self.column_names = [ \'Column\', \'Name\', \'Alias\' ]\n+\n+ def display_peek( self, dataset ):\n+ return Tabular.make_html_table( self, dataset, column_names=self.column_names )\n+\n+class DatasetComments( object ):\n+ def __init__( self, dataset, comment_string=\'#\' ):\n+ self.dataset = dataset\n+ self.comment_string = comment_string\n+ self.comment_string_len = len(comment_string)\n+ self._comments = []\n+ self._read_comments()\n+\n+ def _read_comments( self ):\n+ if self.dataset.has_data():\n+ try:\n+ for line in open(self.dataset.file_name, \'rU\'):\n+ if line.startswith(self.comment_string):\n+ comment = line[self.comment_string_len:]\n+ self._comments.append(comment)\n+ else:\n+ break\n+ except:\n+ pass\n+\n+ def __str__( self ):\n+ return "".join(self._comments)\n+\n+ @property\n+ def comments( self ):\n+ return 
self._comments\n+\n+class DatasetCommentMetadata( object ):\n+ def __init__( self, dataset, comment_string=\'#\' ):\n+ self.dataset_comments = DatasetComments( dataset, comment_string )\n+ self._comment_metadata = {}\n+ self._decode_dataset_comments()\n+\n+ def _decode_dataset_comments( self ):\n+ dataset_comment_string = str( self.dataset_comments )\n+ try:\n+ self._comment_metadata = simplejson.loads( dataset_comment_string )\n+ except simplejson.JSONDecodeError as e:\n+ pass\n+\n+ @property\n+ def comment_metadata( self ):\n+ return self._comment_metadata\n+\n+class AnnotatedTabular( Tabular ):\n+ """ Tabular file with optional comment block containing JSON to be imported into metadata """\n+ MetadataElement( name="comment_metadata", desc="comment metadata", param=metadata.DictParameter, visible=False, readonly=True )\n+\n+ def set_meta( self, dataset, overwrite = True, **kwd ):\n+ Tabular.set_meta( self, dataset, overwrite=overwrite, max_data_lines=None, max_guess_type_data_lines=1000, **kwd )\n+ if dataset.metadata.comment_metadata is None:\n+ dataset_comment_metadata = DatasetCommentMetadata( dataset )\n+ dataset.metadata.comment_metadata = dataset_comment_metadata.comment_metadata.copy()\n+ self.set_dataset_metadata_from_comments( dataset )\n+\n+ def set_dataset_metadata_from_comments( self, dataset ):\n+ pass\n+\n+ def set_peek( self, dataset, line_count=None, is_multi_byte=False ):\n+ super(Tabular, self).set_peek( dataset, line_count=line_count, is_multi_byte=is_multi_byte, WIDTH=\'unlimited\', skipchars=[\'#\'] )\n+\n+ def display_peek( self, dataset ):\n+ """Returns formated html of peek"""\n+ return Tabular.make_html_table( self, dataset, skipchars'..b'ter, default=0 )\n+ MetadataElement( name="pos", desc="pos column", param=metadata.ColumnParameter, default=0 )\n+ MetadataElement( name="ref", desc="ref column", param=metadata.ColumnParameter, default=0 )\n+ MetadataElement( name="rPos", desc="rPos column", param=metadata.ColumnParameter, default=0 
)\n+ MetadataElement( name="species", desc="species", default=\'\', no_value=\'\', visible=False, readonly=True )\n+\n+ def set_dataset_metadata_from_comments( self, dataset ):\n+ self.set_dataset_column_names_metadata( dataset )\n+ self.set_dataset_columnParameter_metadata( dataset )\n+ self.set_dataset_species_metadata( dataset )\n+ self.set_dataset_dbkey_metadata( dataset )\n+\n+ def set_dataset_column_names_metadata( self, dataset ):\n+ value_from_comment_metadata = dataset.metadata.comment_metadata.get( \'column_names\', None )\n+ if isinstance( value_from_comment_metadata, list ):\n+ dataset.metadata.column_names = value_from_comment_metadata[:]\n+\n+ def set_dataset_columnParameter_metadata( self, dataset ):\n+ for name, spec in dataset.metadata.spec.items():\n+ if isinstance( spec.param, metadata.ColumnParameter ):\n+ value_from_comment_metadata = dataset.metadata.comment_metadata.get( name, None )\n+ if value_from_comment_metadata is not None:\n+ try:\n+ i = int( value_from_comment_metadata )\n+ except:\n+ i = 0\n+ if 0 <= i <= dataset.metadata.columns:\n+ setattr( dataset.metadata, name, i )\n+\n+ def set_dataset_species_metadata( self, dataset ):\n+ value_from_comment_metadata = dataset.metadata.comment_metadata.get( \'species\', None )\n+ if isinstance( value_from_comment_metadata, basestring ):\n+ dataset.metadata.species = value_from_comment_metadata\n+\n+ def set_dataset_dbkey_metadata( self, dataset ):\n+ value_from_comment_metadata = dataset.metadata.comment_metadata.get( \'dbkey\', \'?\' )\n+ if isinstance( value_from_comment_metadata, basestring ):\n+ dataset.metadata.dbkey = value_from_comment_metadata\n+\n+class GDSnp( Fake ):\n+ """ Webb\'s SNP file format """\n+ file_ext = \'gd_snp\'\n+\n+ MetadataElement( name="individual_names", desc="individual names", visible=False, readonly=True )\n+ MetadataElement( name="individual_columns", desc="individual columns", visible=False, readonly=True )\n+\n+ def set_dataset_metadata_from_comments( self, 
dataset ):\n+ Fake.set_dataset_metadata_from_comments( self, dataset )\n+ self.set_dataset_individual_metadata( dataset )\n+\n+ def set_dataset_individual_metadata( self, dataset ):\n+ individual_list = dataset.metadata.comment_metadata.get( \'individuals\', None )\n+ if not isinstance( individual_list, list ):\n+ individual_list = []\n+\n+ individual_names = []\n+ individual_columns = []\n+\n+ for individual in individual_list:\n+ if not isinstance( individual, list ) or len( individual ) != 2:\n+ continue\n+ name, col = individual\n+ if not isinstance( name, basestring ):\n+ name = \'\'\n+ try:\n+ c = int( col )\n+ except:\n+ c = 0\n+ if 0 < c <= dataset.metadata.columns:\n+ individual_names.append( name )\n+ individual_columns.append( c )\n+\n+ if individual_names:\n+ dataset.metadata.individual_names = individual_names[:]\n+ dataset.metadata.individual_columns = individual_columns[:]\n+\n+class GDSap( Fake ):\n+ """ Webb\'s SAP file format """\n+ file_ext = \'gd_sap\'\n+\n+ MetadataElement( name="kegg_gene", desc="KEGG gene code column", param=metadata.ColumnParameter, default=0 )\n+ MetadataElement( name="kegg_path", desc="KEGG pathway code/name column", param=metadata.ColumnParameter, default=0 )\n+\n'

diff -r d4ec09e8079f -r 4b6590dd7250 map_ensembl_transcripts.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/map_ensembl_transcripts.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,40 @@
+<tool id="gd_new_oscar" name="Map" version="1.0.0">
+  <description>Ensembl transcripts to KEGG pathways</description>
+
+  <command interpreter="python">
+    rtrnKEGGpthwfENSEMBLTc.py
+      "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.oscar.loc"
+      "--species=${input.metadata.dbkey}"
+      "--input=${input}"
+      "--posENSEMBLclmn=${ensembl_col}"
+      "--output=${output}"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Table" />
+    <param name="ensembl_col" type="data_column" data_ref="input" label="Column with ENSEMBL transcript code" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+
+  
+
+  <help>
+**What it does**
+
+Adds the fields KEGG gene codes and KEGG pathways to an input table of ENSEMBL transcript codes.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 mkpthwpng.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mkpthwpng.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       mkpthwpng.py
+#
+#       Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse,mechanize,os,sys
+
+#download the KEGG pathway image highlighting a list of genes; returns a status message
+def rtnHTMLformat(tmpddGenrcgenPresent,sppPrefx,pthwcod,ouPthwpng):
+ inpx='\n'.join(tmpddGenrcgenPresent)#inpx="ALDH2 color \nALDH3A1 color"
+ request=mechanize.Request("http://www.genome.jp/kegg/tool/map_pathway2.html")
+ response = mechanize.urlopen(request)
+ forms = mechanize.ParseResponse(response, backwards_compat=False)
+ form=forms[0]
+ form["unclassified"]=inpx
+ form["org_name"]=[sppPrefx]
+ request2 = form.click()
+ response2 = mechanize.urlopen(request2)
+ a=str(response2.read()).split('href="/kegg-bin/show_pathway?')[1]
+ code=a.split('/')[0]#response2.read()
+ request=mechanize.Request("http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args"%(code,pthwcod))#request=mechanize.Request("http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args"%('13171478854246','hsa00410'))
+ response = mechanize.urlopen(request)
+ forms = mechanize.ParseResponse(response, backwards_compat=False)
+ form=forms[1]
+ status=' NOT '
+ try:
+ imgf=str(forms[1]).split('/mark_pathway')[1].split('/')[0]
+ os.system("wget --quiet http://www.genome.jp/tmp/mark_pathway%s/%s.png -O %s"%(imgf,pthwcod,ouPthwpng))
+ status=' '
+ except:
+ pass
+ return 'A pathway image was%ssuccefully produced...'%status
+
+
+def main():
+ parser = argparse.ArgumentParser(description='Obtain KEGG images from a list of genes.')
+ parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
+ parser.add_argument('--output',metavar='output PNG image',type=str,help='the output image file in png format')
+ parser.add_argument('--KEGGpath',metavar='KEGG pathway code (i.e. cfa00230)',type=str,help='the code of the pathway of interest')
+ parser.add_argument('--posKEGGclmn',metavar='column number',type=int,help='the column with the KEGG pathway code/name')
+ parser.add_argument('--KEGGgeneposcolmn',metavar='column number',type=int,help='column with the KEGG gene code')
+ #~Open arguments
+ class C(object):
+ pass
+ fulargs=C()
+ parser.parse_args(sys.argv[1:],namespace=fulargs)
+ #test input vars
+ inputf,outputf,KEGGpathw,posKEGGclmn,Kgeneposcolmn=fulargs.input,fulargs.output,fulargs.KEGGpath,fulargs.posKEGGclmn,fulargs.KEGGgeneposcolmn
+    # make posKEGGclmn, Kgeneposcolmn 0-based
+ sppPrefx= KEGGpathw[:3]
+ posKEGGclmn -= 1
+ Kgeneposcolmn -= 1
+ #make a dictionary of valid genes
+ dKEGGcPthws=dict([(x.split('\t')[Kgeneposcolmn],set([y.split('=')[0] for y in x.split('\t')[posKEGGclmn].split('.')])) for x in open(inputf).read().splitlines()[1:] if x.strip()])
+ for mt1gene in [x for x in dKEGGcPthws.keys() if x.find('.')>-1]:#to crrect names with more than one gene
+ pthwsAssotd=dKEGGcPthws.pop(mt1gene)
+ for eachg in mt1gene.split('.'):
+ dKEGGcPthws[eachg]=pthwsAssotd
+ tmpddGenrcgenPresent=set()
+ sKEGGc=dKEGGcPthws.keys()
+ lsKEGGc=len(sKEGGc)
+ ctPthw=0
+ while ctPthw < lsKEGGc:#to save memory
+ eachK=sKEGGc.pop()
+ alPthws=dKEGGcPthws[eachK]
+ if KEGGpathw in alPthws:
+ tmpddGenrcgenPresent.add('\t'.join([eachK,'red']))
+ ctPthw+=1
+ #run the program
+ rtnHTMLformat(tmpddGenrcgenPresent,sppPrefx,KEGGpathw,outputf)
+ return 0
+
+
+if __name__ == '__main__':
+ main()

diff -r d4ec09e8079f -r 4b6590dd7250 modify_snp_table.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/modify_snp_table.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import sys
+import subprocess
+from Population import Population
+
+################################################################################
+
+if len(sys.argv) < 9:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input, p1_input, output, lo, hi, lo_ind, lo_ind_qual = sys.argv[1:8]
+individual_metadata = sys.argv[8:]
+
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+p1 = Population()
+p1.from_population_file(p1_input)
+
+if not p_total.is_superset(p1):
+    print >> sys.stderr, 'There is an individual in the population that is not in the SNP table'
+    sys.exit(1)
+
+################################################################################
+
+prog = 'pop'
+
+args = []
+args.append(prog)
+args.append(input)
+args.append(lo)
+args.append(hi)
+args.append(lo_ind)
+args.append(lo_ind_qual)
+
+columns = p1.column_list()
+
+for column in sorted(columns):
+    args.append(column)
+
+fh = open(output, 'w')
+
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=fh, stderr=sys.stderr)
+rc = p.wait()
+fh.close()
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 modify_snp_table.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/modify_snp_table.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,107 @@
+<tool id="gd_modify_gd_snp" name="Modify gd_snp" version="1.0.0">
+  <description>modify a gd_snp dataset</description>
+
+  <command interpreter="python">
+    modify_snp_table.py "$input" "$p1_input" "$output"
+    #if $limit_coverage.choice == "0"
+        "-1" "-1" "-1" "-1"
+    #else
+        "${limit_coverage.lo_coverage}" "${limit_coverage.hi_coverage}" "${limit_coverage.low_ind_cov}" "${limit_coverage.lo_quality}"
+    #end if
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+        #set $arg = '%s:%s' % ($individual_col, $individual)
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="gd_snp dataset" />
+    <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+    <conditional name="limit_coverage">
+      <param name="choice" type="select" format="integer" label="Option">
+        <option value="0" selected="true">add columns to the gd_snp table</option>
+        <option value="1">discard some SNPs</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="lo_coverage" type="integer" min="0" value="0" label="Lower bound on total coverage" />
+        <param name="hi_coverage" type="integer" min="0" value="1000" label="Upper bound on total coverage" />
+        <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" />
+        <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="gd_snp" metadata_source="input" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="p1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" />
+      <param name="choice" value="1" />
+      <param name="lo_coverage" value="0" />
+      <param name="hi_coverage" value="1000" />
+      <param name="low_ind_cov" value="3" />
+      <param name="lo_quality" value="30" />
+      <output name="output" file="test_out/modify_snp_table/modify.gd_snp" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input datasets are gd_snp_ and gd_indivs_ formats.
+The output dataset is in gd_snp_ format.  (`Dataset missing?`_)
+
+.. _Dataset missing?: ./static/formatHelp.html
+.. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_indivs: ./static/formatHelp.html#gd_indivs
+
+**What it does**
+
+The user specifies that some of the individuals in the selected gd_snp_ table
+form a "population" that has been previously defined using the Galaxy tool to
+select individuals from a gd_snp dataset.  One option is for the program to append
+four columns to the table, giving the total counts for the two alleles, the
+"genotype" for the population and the maximum quality value, taken over all
+individuals in the population.  If all defined genotypes in the population
+are 2 (agree with the reference), the population's genotype is 2; similarly
+for 0; otherwise the genotype is 1 (unless all individuals have undefined
+genotype, in which case it is -1).  The other option is to remove rows from
+the table for which the total coverage for the population is either too low
+or too high, and/or if the individual coverage or quality value is too low.
+
+.. _gd_snp: ./static/formatHelp.html#gd_snp
+
+**Examples**
+
+- input gd_snp::
+
+    Contig161_chr1_4641264_4641879  115     C       T       73.5    chr1    4641382     C       6       0       2       45      8       0       2       51      15      0       2       72      5       0       2       42      6       0       2       45      10      0       2       57      Y       54      0.323   0
+    Contig48_chr1_10150253_10151311 11      A       G       94.3    chr1    10150264        A       1       0       2       30      1       0       2       30      1       0       2       30      3       0       2       36      1       0       2       30      1       0       2       30      Y       22      +99.    0
+    Contig20_chr1_21313469_21313570 66      C       T       54.0    chr1    21313534        C       4       0       2       39      4       0       2       39      5       0       2       42      4       0       2       39      4       0       2       39      5       0       2       42      N       1       +99.    0
+    etc.
+
+- input individuals::
+
+    9 PB1
+    13 PB2
+    17 PB3
+
+- output from appending columns::
+
+    Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382         C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 29 0 2 72
+    Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0 3 0 2 30
+    Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 13 0 2 42
+    etc.
+
+- output from filtering SNPs with a minimum count of 3 for the individuals::
+
+    Contig161_chr1_4641264_4641879  115     C       T       73.5    chr1    4641382     C       6       0       2       45      8       0       2       51      15      0       2       72      5       0       2       42      6       0       2       45      10      0       2       57      Y       54      0.323   0
+    Contig20_chr1_21313469_21313570 66      C       T       54.0    chr1    21313534        C       4       0       2       39      4       0       2       39      5       0       2       42      4       0       2       39      4       0       2       39      5       0       2       42      N       1       +99.    0
+    etc.
+
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 pathway_image.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pathway_image.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,46 @@
+<tool id="gd_pathway_image" name="Generate" version="1.0.0">
+  <description>KEGG pathway images</description>
+
+  <command interpreter="python">
+    mkpthwpng.py
+      "--input=${input}"
+      "--output=${output}"
+      "--KEGGpath=${pathway}"
+      "--posKEGGclmn=${input.metadata.kegg_path}"
+      "--KEGGgeneposcolmn=${input.metadata.kegg_gene}"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_sap" label="Table">
+      <validator type="metadata" check="kegg_gene,kegg_path" message="Missing KEGG gene code column and/or KEGG pathway code/name column metadata.  Click the pencil icon in the history item to edit/save the metadata attributes" />
+    </param>
+    <param name="pathway" type="select">
+      <options from_file="gd.pathways.txt">
+        <column name="value" index="1"/>
+        <column name="name" index="2"/>
+        <filter type="data_meta" ref="input" key="dbkey" column="0" separator="\t" />
+      </options>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="png" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_sap" ftype="gd_sap" />
+      <param name="pathway" value="cfa05214" />
+      <output name="output" file="test_out/pathway_image/pathway_image.png" compare="sim_size" delta = "10000" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool produces an image of an input KEGG pathway, highlighting the
+modules representing genes in an input list.  NOTE:  a given gene can
+be assigned to multiple modules, and different genes can be assigned to
+the same module.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 pca.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pca.py Wed Sep 12 17:10:26 2012 -0400

[

b'@@ -0,0 +1,258 @@\n+#!/usr/bin/env python\n+\n+import errno\n+import os\n+import shutil\n+import subprocess\n+import sys\n+from BeautifulSoup import BeautifulSoup\n+import gd_composite\n+\n+################################################################################\n+\n+def mkdir_p(path):\n+ try:\n+ os.makedirs(path)\n+ except OSError, e:\n+ if e.errno <> errno.EEXIST:\n+ raise\n+\n+################################################################################\n+\n+def run_program(prog, args, stdout_file=None):\n+ #print "args: ", \' \'.join(args)\n+ p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n+ (stdoutdata, stderrdata) = p.communicate()\n+ rc = p.returncode\n+\n+ if stdout_file is not None:\n+ with open(stdout_file, \'w\') as ofh:\n+ print >> ofh, stdoutdata\n+\n+ if rc != 0:\n+ print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, \' \'.join(args))\n+ print >> sys.stderr, stderrdata\n+ sys.exit(1)\n+\n+################################################################################\n+\n+def do_ped2geno(input, output):\n+ lines = []\n+ with open(input) as fh:\n+ for line in fh:\n+ line = line.rstrip(\'\\r\\n\')\n+ lines.append(line.split())\n+\n+ pair_map = {\n+ \'0\':{ \'0\':\'9\', \'1\':\'9\', \'2\':\'9\' },\n+ \'1\':{ \'0\':\'1\', \'1\':\'2\', \'2\':\'1\' },\n+ \'2\':{ \'0\':\'1\', \'1\':\'1\', \'2\':\'0\' }\n+ }\n+ with open(output, \'w\') as ofh:\n+ for a_idx in xrange(6, len(lines[0]), 2):\n+ b_idx = a_idx + 1\n+ print >> ofh, \'\'.join(map(lambda line: pair_map[line[a_idx]][line[b_idx]], lines))\n+\n+def do_map2snp(input, output):\n+ with open(output, \'w\') as ofh:\n+ with open(input) as fh:\n+ for line in fh:\n+ elems = line.split()\n+ print >> ofh, \' {0} 11 0.002 2000 A T\'.format(elems[1])\n+\n+def make_ind_file(ind_file, input):\n+ pops = []\n+\n+ ofh = open(ind_file, \'w\')\n+\n+ with open(input) as fh:\n+ soup = BeautifulSoup(fh)\n+ misc = soup.find(\'div\', {\'id\': 
\'gd_misc\'})\n+ populations = misc(\'ul\')[0]\n+\n+ i = 0\n+ for entry in populations:\n+ if i % 2 == 1:\n+ population_name = entry.contents[0].encode(\'utf8\').strip().replace(\' \', \'_\')\n+ pops.append(population_name)\n+ individuals = entry.ol(\'li\')\n+ for individual in individuals:\n+ individual_name = individual.string.encode(\'utf8\').strip()\n+ print >> ofh, individual_name, \'M\', population_name\n+ i += 1\n+\n+ ofh.close()\n+ return pops\n+\n+def make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file):\n+ with open(par_file, \'w\') as fh:\n+ print >> fh, \'genotypename: {0}\'.format(geno_file)\n+ print >> fh, \'snpname: {0}\'.format(snp_file)\n+ print >> fh, \'indivname: {0}\'.format(ind_file)\n+ print >> fh, \'evecoutname: {0}\'.format(evec_file)\n+ print >> fh, \'evaloutname: {0}\'.format(eval_file)\n+ print >> fh, \'altnormstyle: NO\'\n+ print >> fh, \'numoutevec: 2\'\n+\n+def do_smartpca(par_file):\n+ prog = \'smartpca\'\n+\n+ args = [ prog ]\n+ args.append(\'-p\')\n+ args.append(par_file)\n+\n+ #print "args: ", \' \'.join(args)\n+ p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr)\n+ (stdoutdata, stderrdata) = p.communicate()\n+ rc = p.returncode\n+\n+ if rc != 0:\n+ print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, \' \'.join(args))\n+ print >> sys.stderr, stderrdata\n+ sys.exit(1)\n+\n+ stats = []\n+\n+ save_line = False\n+ for line in stdoutdata.split(\'\\n\'):\n+ if line.startswith((\'## Average divergence\', \'## Anova statistics\', \'## Statistical significance\')):\n+ stats.append(\'\')\n+ save_line = True\n+ if line.strip() == \'\':\n+ save_line = False\n+ '..b'derr=subprocess.PIPE)\n+ (stdoutdata, stderrdata) = p.communicate()\n+ rc = p.returncode\n+\n+ if rc != 0:\n+ print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, \' \'.join(args))\n+ print >> sys.stderr, stderrdata\n+ sys.exit(1)\n+\n+def do_coords2admix(coords_file):\n+ prog = \'coords2admix\'\n+\n+ args = [ 
prog ]\n+ args.append(coords_file)\n+\n+ with open(\'fake\', \'w\') as ofh:\n+ #print "args:", \' \'.join(args)\n+ p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=subprocess.PIPE)\n+ (stdoutdata, stderrdata) = p.communicate()\n+ rc = p.returncode\n+\n+ if rc != 0:\n+ print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, \' \'.join(args))\n+ print >> sys.stderr, stderrdata\n+ sys.exit(1)\n+\n+ shutil.copy2(\'fake\', coords_file)\n+\n+################################################################################\n+\n+if len(sys.argv) != 5:\n+ print "usage"\n+ sys.exit(1)\n+\n+input, input_files_path, output, output_files_path = sys.argv[1:5]\n+\n+mkdir_p(output_files_path)\n+\n+ped_file = os.path.join(input_files_path, \'admix.ped\')\n+geno_file = os.path.join(output_files_path, \'admix.geno\')\n+do_ped2geno(ped_file, geno_file)\n+\n+map_file = os.path.join(input_files_path, \'admix.map\')\n+snp_file = os.path.join(output_files_path, \'admix.snp\')\n+do_map2snp(map_file, snp_file)\n+\n+ind_file = os.path.join(output_files_path, \'admix.ind\')\n+population_names = make_ind_file(ind_file, input)\n+\n+par_file = os.path.join(output_files_path, \'par.admix\')\n+evec_file = os.path.join(output_files_path, \'coordinates.txt\')\n+eval_file = os.path.join(output_files_path, \'admix.eval\')\n+make_par_file(par_file, geno_file, snp_file, ind_file, evec_file, eval_file)\n+\n+smartpca_stats = do_smartpca(par_file)\n+\n+do_ploteig(evec_file, population_names)\n+plot_file = \'coordinates.txt.1:2.{0}.pdf\'.format(\':\'.join(population_names))\n+output_plot_file = os.path.join(output_files_path, \'PCA.pdf\')\n+shutil.copy2(plot_file, output_plot_file)\n+os.unlink(plot_file)\n+\n+do_eval2pct(eval_file, os.path.join(output_files_path, \'explained.txt\'))\n+os.unlink(eval_file)\n+\n+do_coords2admix(evec_file)\n+\n+################################################################################\n+\n+info_page = gd_composite.InfoPage()\n+info_page.set_title(\'PCA 
Galaxy Composite Dataset\')\n+\n+display_file = gd_composite.DisplayFile()\n+display_value = gd_composite.DisplayValue()\n+\n+out_pdf = gd_composite.Parameter(name=\'PCA.pdf\', value=\'PCA.pdf\', display_type=display_file)\n+out_evec = gd_composite.Parameter(name=\'coordinates.txt\', value=\'coordinates.txt\', display_type=display_file)\n+out_explained = gd_composite.Parameter(name=\'explained.txt\', value=\'explained.txt\', display_type=display_file)\n+\n+evec_prefix = \'coordinates.txt.1:2.{0}\'.format(\':\'.join(population_names))\n+ps_file = \'{0}.ps\'.format(evec_prefix)\n+xtxt_file = \'{0}.xtxt\'.format(evec_prefix)\n+\n+os.unlink(os.path.join(output_files_path, ps_file))\n+os.unlink(os.path.join(output_files_path, xtxt_file))\n+\n+info_page.add_output_parameter(out_pdf)\n+info_page.add_output_parameter(out_evec)\n+info_page.add_output_parameter(out_explained)\n+\n+in_admix = gd_composite.Parameter(name=\'par.admix\', value=\'par.admix\', display_type=display_file)\n+in_geno = gd_composite.Parameter(name=\'admix.geno\', value=\'admix.geno\', display_type=display_file)\n+in_snp = gd_composite.Parameter(name=\'admix.snp\', value=\'admix.snp\', display_type=display_file)\n+in_ind = gd_composite.Parameter(name=\'admix.ind\', value=\'admix.ind\', display_type=display_file)\n+\n+info_page.add_input_parameter(in_admix)\n+info_page.add_input_parameter(in_geno)\n+info_page.add_input_parameter(in_snp)\n+info_page.add_input_parameter(in_ind)\n+\n+misc_stats = gd_composite.Parameter(description=\'Stats<p/><pre>\\n{0}\\n</pre>\'.format(smartpca_stats), display_type=display_value)\n+\n+info_page.add_misc(misc_stats)\n+\n+with open (output, \'w\') as ofh:\n+ print >> ofh, info_page.render()\n+\n+sys.exit(0)\n+\n'

diff -r d4ec09e8079f -r 4b6590dd7250 pca.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pca.xml Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,66 @@
+<tool id="gd_pca" name="PCA" version="1.0.0">
+
+  <command interpreter="python">
+    pca.py "$input" "$input.extra_files_path" "$output" "$output.files_path"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_ped" label="Dataset" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="html" />
+  </outputs>
+
+  
+
+  <help>
+**What it does**
+
+The user selects a set of data generated by the Galaxy tool to "prepare to look for population structure". The PCA tool runs a Principal Component Analysis on the input genotype data and constructs a plot of the top two principal components. It also reports the following estimates of the statistical significance of the analysis.
+
+1. Average divergence between each pair of populations.  Specifically, from the covariance matrix X whose eigenvectors were computed, we can compute a "distance", d, for each pair of individuals (i,j): d(i,j) = X(i,i) + X(j,j) - 2X(i,j).  For each pair of populations (a,b) now define an average distance: D(a,b) = \sum d(i,j) (in pop a, in pop b) / (\|pop a\| * \|pop b\|).  We then normalize D so that the diagonal has mean 1 and report it.
+
+2. Anova statistics for population differences along each eigenvector. For each eigenvector, a P-value for statistical significance of differences between each pair of populations along that eigenvector is printed.  +++ is used to highlight P-values less than 1e-06.  \*\*\* is used to highlight P-values between 1e-06 and 1e-03.  If there are more than 2 populations, then an overall P-value is also printed for that eigenvector, as are the populations with minimum (minv) and maximum (maxv) eigenvector coordinate. [If there is only 1 population, no Anova statistics are printed.]
+
+3. Statistical significance of differences between populations. For each pair of populations, the above Anova statistics are summed across eigenvectors. The result is approximately chisq with d.o.f. equal to the number of eigenvectors. The chisq statistic and its p-value are printed. [If there is only 1 population, no statistics are printed.]
+
+We post-process the output of the PCA tool to estimate "admixture fractions".  For this, we take three populations at a time and determine each one's average point in the PCA plot (by separately averaging first and second coordinates).  For each combination of two center points, modeling two ancestral populations, we try to model the third central point as having a certain fraction, r, of its SNP genotypes from the second ancestral population and the remainder from the first ancestral population, where we estimate r.  The output file "coordinates.txt" then contains pairs of lines like
+
+projection along chord Population1 -> Population2
+  Population3: 0.12345
+
+where the number (in this case 0.12345) is the estimate of r.  Computations with simulated data suggest that the true r is systematically underestimated, perhaps giving roughly 0.6 times r.
+
+**Acknowledgments**
+
+We use the programs "smartpca" and "ploteig" downloaded from
+
+http://genepath.med.harvard.edu/~reich/Software.htm
+
+and described in the paper "Population structure and eigenanalysis" by Nick Patterson, Alkes L. Price and David Reich, PLoS Genetics, 2 (2006), e190.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 phylogenetic_tree.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phylogenetic_tree.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,219 @@
+#!/usr/bin/env python
+
+import os
+import errno
+import sys
+import subprocess
+import shutil
+from Population import Population
+import gd_composite
+
+################################################################################
+
+def mkdir_p(path):
+  try:
+    os.makedirs(path)
+  except OSError, e:
+    if e.errno <> errno.EEXIST:
+      raise
+
+################################################################################
+
+# Command line: nine fixed positional arguments followed by at least one
+# individual tag (everything from sys.argv[10] onward).
+if len(sys.argv) < 11:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input, p1_input, output, extra_files_path, minimum_coverage, minimum_quality, dbkey, data_source, draw_tree_options = sys.argv[1:10]
+
+individual_metadata = sys.argv[10:]
+
+# note: TEST THIS
+# Normalise the "no dbkey" spellings to the single value 'none'.
+if dbkey in ['', '?', 'None']:
+    dbkey = 'none'
+
+# Population built from every individual tag given on the command line.
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+
+################################################################################
+
+mkdir_p(extra_files_path)
+
+################################################################################
+
+def run_program(prog, args, ofh):
+    #print "args: ", ' '.join(args)
+    p = subprocess.Popen(args, bufsize=-1, executable=prog, stdin=None, stdout=ofh, stderr=subprocess.PIPE)
+    (stdoutdata, stderrdata) = p.communicate()
+    rc = p.returncode
+    ofh.close()
+
+    if rc != 0:
+        #print >> sys.stderr, "FAILED: rc={0}: {1}".format(rc, ' '.join(args))
+        print >> sys.stderr, stderrdata
+        sys.exit(1)
+
+################################################################################
+
+phylip_outfile = os.path.join(extra_files_path, 'distance_matrix.phylip')
+newick_outfile = os.path.join(extra_files_path, 'phylogenetic_tree.newick')
+ps_outfile = 'tree.ps'
+pdf_outfile = os.path.join(extra_files_path, 'tree.pdf')
+
+################################################################################
+
+informative_snp_file = os.path.join(extra_files_path, 'informative_snps.txt')
+mega_distance_matrix_file = os.path.join(extra_files_path, 'mega_distance_matrix.txt')
+
+prog = 'dist_mat'
+
+args = []
+args.append(prog)
+args.append(input)
+args.append(minimum_coverage)
+args.append(minimum_quality)
+args.append(dbkey)
+args.append(data_source)
+args.append(informative_snp_file)
+args.append(mega_distance_matrix_file)
+
+if p1_input == "all_individuals":
+    tags = p_total.tag_list()
+else:
+    p1 = Population()
+    p1.from_population_file(p1_input)
+    if not p_total.is_superset(p1):
+        print >> sys.stderr, 'There is an individual in the population that is not in the SNP table'
+        sys.exit(1)
+    tags = p1.tag_list()
+
+for tag in tags:
+    args.append(tag)
+
+fh = open(phylip_outfile, 'w')
+run_program(None, args, fh)
+
+################################################################################
+
+prog = 'quicktree'
+
+args = []
+args.append(prog)
+args.append('-in')
+args.append('m')
+args.append('-out')
+args.append('t')
+args.append(phylip_outfile)
+
+fh = open(newick_outfile, 'w')
+run_program(None, args, fh)
+
+################################################################################
+
+prog = 'draw_tree'
+
+args = []
+args.append(prog)
+if draw_tree_options:
+    args.append(draw_tree_options)
+args.append(newick_outfile)
+
+fh = open(ps_outfile, 'w')
+run_program(None, args, fh)
+
+################################################################################
+
+prog = 'ps2pdf'
+
+args = []
+args.append(prog)
+args.append('-dPDFSETTINGS=/prepress')
+args.append(ps_outfile)
+args.append('-')
+
+fh = open(pdf_outfile, 'w')
+run_program(None, args, fh)
+
+shutil.copyfile(pdf_outfile, output)
+
+################################################################################
+
+info_page = gd_composite.InfoPage()
+info_page.set_title('Phylogenetic tree Galaxy Composite Dataset')
+
+display_file = gd_composite.DisplayFile()
+display_value = gd_composite.DisplayValue()
+
+out_pdf = gd_composite.Parameter(name='tree.pdf', value='tree.pdf', display_type=display_file)
+out_newick = gd_composite.Parameter(value='phylogenetic_tree.newick', name='phylogenetic tree (newick)', display_type=display_file)
+out_phylip = gd_composite.Parameter(value='distance_matrix.phylip', name='Phylip distance matrix', display_type=display_file)
+out_mega = gd_composite.Parameter(value='mega_distance_matrix.txt', name='Mega distance matrix', display_type=display_file)
+out_snps = gd_composite.Parameter(value='informative_snps.txt', name='informative SNPs', display_type=display_file)
+
+info_page.add_output_parameter(out_pdf)
+info_page.add_output_parameter(out_newick)
+info_page.add_output_parameter(out_phylip)
+info_page.add_output_parameter(out_mega)
+info_page.add_output_parameter(out_snps)
+
+in_min_cov = gd_composite.Parameter(description='Minimum coverage', value=minimum_coverage, display_type=display_value)
+in_min_qual = gd_composite.Parameter(description='Minimum quality', value=minimum_quality, display_type=display_value)
+
+include_ref_value = 'no'
+if dbkey != 'none':
+    include_ref_value = 'yes'
+
+in_include_ref = gd_composite.Parameter(description='Include reference sequence', value=include_ref_value, display_type=display_value)
+
+if data_source == '0':
+    data_source_value = 'sequence coverage'
+elif data_source == '1':
+    data_source_value = 'estimated genotype'
+
+in_data_source = gd_composite.Parameter(description='Data source', value=data_source_value, display_type=display_value)
+
+branch_type_value = 'square'
+if 'd' in draw_tree_options:
+    branch_type_value = 'diagonal'
+
+in_branch_type = gd_composite.Parameter(description='Branch type', value=branch_type_value, display_type=display_value)
+
+branch_scale_value = 'yes'
+if 's' in draw_tree_options:
+    branch_scale_value = 'no'
+
+in_branch_scale = gd_composite.Parameter(description='Draw branches to scale', value=branch_scale_value, display_type=display_value)
+
+branch_length_value = 'yes'
+if 'b' in draw_tree_options:
+    branch_length_value = 'no'
+
+in_branch_length = gd_composite.Parameter(description='Show branch lengths', value=branch_length_value, display_type=display_value)
+
+tree_layout_value = 'horizontal'
+if 'v' in draw_tree_options:
+    tree_layout_value = 'vertical'
+
+in_tree_layout = gd_composite.Parameter(description='Tree layout', value=tree_layout_value, display_type=display_value)
+
+info_page.add_input_parameter(in_min_cov)
+info_page.add_input_parameter(in_min_qual)
+info_page.add_input_parameter(in_include_ref)
+info_page.add_input_parameter(in_data_source)
+info_page.add_input_parameter(in_branch_type)
+info_page.add_input_parameter(in_branch_scale)
+info_page.add_input_parameter(in_branch_length)
+info_page.add_input_parameter(in_tree_layout)
+
+misc_individuals = gd_composite.Parameter(name='Individuals', value=tags, display_type=gd_composite.DisplayTagList())
+
+info_page.add_misc(misc_individuals)
+
+
+with open(output, 'w') as ofh:
+    print >> ofh, info_page.render()
+
+################################################################################
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 phylogenetic_tree.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/phylogenetic_tree.xml Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,168 @@
+<tool id="gd_phylogenetic_tree" name="Phylogenetic" version="1.0.0">
+  <description>tree</description>
+
+  <command interpreter="python">
+    phylogenetic_tree.py "$input"
+    #if $individuals.choice == '0'
+      "all_individuals"
+    #else if $individuals.choice == '1'
+      "$p1_input"
+    #end if
+    "$output" "$output.files_path" "$minimum_coverage" "$minimum_quality"
+ #if ((str($input.metadata.scaffold) == str($input.metadata.ref)) and (str($input.metadata.pos) == str($input.metadata.rPos))) or (str($include_reference) == '0')
+        "none"
+    #else
+        "$input.metadata.dbkey"
+    #end if
+    "$data_source"
+    #set $draw_tree_options = ''.join(str(x) for x in [$branch_style, $scale_style, $length_style, $layout_style])
+    #if $draw_tree_options == ''
+        ""
+    #else
+        "-$draw_tree_options"
+    #end if
+    #for $individual_name, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+        #set $arg = '%s:%s' % ($individual_col, $individual_name)
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="SNP table" />
+
+    <conditional name="individuals">
+      <param name="choice" type="select" label="Individuals">
+        <option value="0" selected="true">All</option>
+        <option value="1">Individuals in a population</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+      </when>
+    </conditional>
+
+    <param name="minimum_coverage" type="integer" min="0" value="0" label="Minimum coverage" help="Note: Minimum coverage and Minimum quality cannot both be 0" />
+
+    <param name="minimum_quality" type="integer" min="0" value="0" label="Minimum quality" help="Note: Minimum coverage and Minimum quality cannot both be 0" />
+
+    <param name="include_reference" type="select" format="integer" label="Include reference sequence">
+      <option value="1" selected="true">Yes</option>
+      <option value="0">No</option>
+    </param>
+
+    <param name="data_source" type="select" format="integer" label="Data source">
+      <option value="0" selected="true">sequence coverage</option>
+      <option value="1">estimated genotype</option>
+    </param>
+
+    <param name="branch_style" type="select" display="radio">
+      <label>Branch type</label>
+      <option value="" selected="true">square</option>
+      <option value="d">diagonal</option>
+    </param>
+
+    <param name="scale_style" type="select" display="radio">
+      <label>Draw branches to scale</label>
+      <option value="" selected="true">yes</option>
+      <option value="s">no</option>
+    </param>
+
+    <param name="length_style" type="select" display="radio">
+      <label>Show branch lengths</label>
+      <option value="" selected="true">yes</option>
+      <option value="b">no</option>
+    </param>
+
+    <param name="layout_style" type="select" display="radio">
+      <label>Tree layout</label>
+      <option value="" selected="true">horizontal</option>
+      <option value="v">vertical</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="html" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="choice" value="0" />
+      <param name="minimum_coverage" value="3" />
+      <param name="minimum_quality" value="30" />
+      <param name="data_source" value="0" />
+      <param name="branch_style" value="" />
+      <param name="scale_style" value="" />
+      <param name="length_style" value="" />
+      <param name="layout_style" value="" />
+      <output name="output" file="test_out/phylogenetic_tree/phylogenetic_tree.html" ftype="html" compare="diff" lines_diff="2">
+        <extra_files type="file" name="distance_matrix.phylip" value="test_out/phylogenetic_tree/distance_matrix.phylip" />
+        <extra_files type="file" name="informative_snps.txt" value="test_out/phylogenetic_tree/informative_snps.txt" />
+        <extra_files type="file" name="mega_distance_matrix.txt" value="test_out/phylogenetic_tree/mega_distance_matrix.txt" />
+        <extra_files type="file" name="phylogenetic_tree.newick" value="test_out/phylogenetic_tree/phylogenetic_tree.newick" />
+        <extra_files type="file" name="tree.pdf" value="test_out/phylogenetic_tree/tree.pdf" compare="sim_size" delta = "1000"/>
+      </output>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool uses a SNP table to determine a kind of "genetic distance" between
+each pair of individuals.  Optionally, that information can be used to
+produce a tree-shaped figure that depicts how the individuals are related,
+either as a text file in a common format, called NEWICK, or as a picture.
+The user specifies the following inputs to the tool.
+
+SNP table
+
+Individuals
+  By default, all individuals are included in the analysis; an option
+  is to analyze only a subset of individuals that has been specified
+  using the tool to "Select individuals from a SNP table".
+
+Minimum coverage
+  For each pair of individuals, the tool looks for informative SNPs, i.e.,
+  where the sequence data for both individuals is adequate according to
+  some criterion.  Specifying, say, 7 for this option instructs the tool
+  to consider only SNPs with coverage at least 7 in both individuals
+  when estimating their "genetic distance".
+
+Minimum quality
+  Specifying, say, 37 for this option instructs the tool to consider
+  only SNPs with SAMtools quality value at least 37 in both individuals
+  when estimating their "genetic distance".
+
+Minimum number of informative SNPs
+  This option instructs the tool to terminate execution if at least one
+  pair of individuals does not have a required number of informative SNPs.
+
+Include reference sequence
+  For SNP tables with a reference sequence, the user can ask that the
+  reference be indicated in the tree, to help with rooting it.  If the
+  SNP table has no reference sequence, this option has no effect.
+
+Data source
+  The genetic distance between two individuals at a given SNP can
+  be estimated two ways.  One method is to use the absolute value of
+  difference in the frequency of the first allele (equivalently: the
+  second allele).  For instance, if the first individual has 5 reads of
+  each allele and the second individual has respectively 3 and 6 reads,
+  then the frequencies are 1/2 and 1/3, giving a distance 1/6 at that
+  SNP.  The other approach is to use the SAMtools genotypes to estimate
+  the difference in the number of occurrences of the first allele.
+  For instance, if the two genotypes are 2 and 1, i.e., the individuals
+  are estimated to have respectively 2 and 1 occurrences of the first
+  allele at this location, then the distance is 1 (the absolute value
+  of the difference of the two numbers).
+
+Output format
+  There are three options, as described above.
+
+**Acknowledgments**
+
+To convert the distance matrix to a NEWICK-formatted tree, we use the QuickTree program, downloaded from: http://www.sanger.ac.uk/resources/software/quicktree/
+
+To draw the tree, we use the program draw_tree, downloaded from: http://compgen.bscb.cornell.edu/phast/
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 population_structure.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/population_structure.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+import errno
+import os
+import shutil
+import subprocess
+import sys
+from BeautifulSoup import BeautifulSoup
+import gd_composite
+
+################################################################################
+
+def run_admixture(ped_file, populations):
+    prog = 'admixture'
+
+    args = []
+    args.append(prog)
+    args.append(input_ped_file)
+    args.append(populations)
+
+    #print "args:", ' '.join(args)
+    ofh = open('/dev/null', 'w')
+    p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=ofh, stderr=sys.stderr)
+    rc = p.wait()
+    ofh.close()
+
+def run_r(input_file, output_file, populations):
+    prog = 'R'
+
+    args = []
+    args.append(prog)
+    args.append('--vanilla')
+    args.append('--quiet')
+    args.append('--args')
+    args.append(input_file)
+    args.append(output_file)
+    args.append(populations)
+
+    _realpath = os.path.realpath(__file__)
+    _script_dir = os.path.dirname(_realpath)
+    r_script_file = os.path.join(_script_dir, 'population_structure.r')
+
+    ifh = open(r_script_file)
+    ofh = open('/dev/null', 'w')
+    p = subprocess.Popen(args, bufsize=-1, stdin=ifh, stdout=ofh, stderr=None)
+    rc = p.wait()
+    ifh.close()
+    ofh.close()
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+def get_populations(input):
+    pops = []
+    pop_names = {}
+
+    with open(input) as fh:
+        soup = BeautifulSoup(fh)
+        misc = soup.find('div', {'id': 'gd_misc'})
+
+        return 'Populations\n{0}'.format(misc('ul')[0])
+
+################################################################################
+
+# Command line: exactly five positional arguments.
+if len(sys.argv) != 6:
+    print >> sys.stderr, "Usage"
+    sys.exit(1)
+
+input_html_file, input_ped_file, output_file, extra_files_path, populations = sys.argv[1:6]
+populations_html = get_populations(input_html_file)
+
+run_admixture(input_ped_file, populations)
+
+# The result files are looked up in the current directory under the names
+# <ped basename without .ped>.<populations>.P and .Q
+# NOTE(review): presumably this is where admixture writes them -- confirm.
+ped_base = os.path.basename(input_ped_file)
+if ped_base.endswith('.ped'):
+    ped_base = ped_base[:-4]
+
+p_file = '%s.%s.P' % (ped_base, populations)
+q_file = '%s.%s.Q' % (ped_base, populations)
+
+# Only the .Q file is kept (as numeric.txt); both originals are removed.
+mkdir_p(extra_files_path)
+numeric_output_file = os.path.join(extra_files_path, 'numeric.txt')
+shutil.copy2(q_file, numeric_output_file)
+os.remove(p_file)
+os.remove(q_file)
+
+# Plot the kept copy of the .Q file into graphical.pdf.
+graphical_output_file = os.path.join(extra_files_path, 'graphical.pdf')
+run_r(numeric_output_file, graphical_output_file, populations)
+
+################################################################################
+
+info_page = gd_composite.InfoPage()
+info_page.set_title('Population structure Galaxy Composite Dataset')
+
+display_file = gd_composite.DisplayFile()
+display_value = gd_composite.DisplayValue()
+
+out_pdf = gd_composite.Parameter(name='graphical.pdf', value='graphical.pdf', display_type=display_file)
+out_txt = gd_composite.Parameter(name='numeric.txt', value='numeric.txt', display_type=display_file)
+
+info_page.add_output_parameter(out_pdf)
+info_page.add_output_parameter(out_txt)
+
+in_pops = gd_composite.Parameter(description='Number of populations', value=populations, display_type=display_value)
+
+info_page.add_input_parameter(in_pops)
+
+misc_pops = gd_composite.Parameter(description=populations_html, display_type=display_value)
+
+info_page.add_misc(misc_pops)
+
+
+with open (output_file, 'w') as ofh:
+    print >> ofh, info_page.render()
+
+
+sys.exit(0)

diff -r d4ec09e8079f -r 4b6590dd7250 population_structure.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/population_structure.r Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,19 @@
+library(RColorBrewer)
+
+# Arguments after --args: Q-file path, output PDF path, number of populations
+args = commandArgs(trailingOnly=TRUE)
+q_file = args[[1]]
+output_file = args[[2]]
+# Command-line arguments are character strings; convert to integer so the
+# range test below compares numbers instead of comparing strings.
+populations = as.integer(args[[3]])
+
+tbl <- read.table(q_file)
+
+# The 'Paired' palette only provides 3 to 12 colors; otherwise use rainbow()
+if ( populations >= 3 && populations <= 12 ) {
+ colors = brewer.pal(populations, 'Paired')
+} else {
+ colors = rainbow(populations)
+}
+
+# One stacked bar per row of the table
+pdf(file=output_file, onefile=TRUE, width=7, height=3)
+barplot(t(as.matrix(tbl)), col=colors, xlab="Individual #", ylab="Ancestry", border=NA)
+
+dev.off()

diff -r d4ec09e8079f -r 4b6590dd7250 population_structure.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/population_structure.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,57 @@
+<tool id="gd_population_structure" name="Population" version="1.0.0">
+  <description>structure</description>
+
+  <command interpreter="python">
+    population_structure.py "$input" "${input.extra_files_path}/admix.ped" "$output" "$output.files_path" "$populations"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_ped" label="Dataset" />
+    <param name="populations" type="integer" min="1" value="2" label="Number of populations" />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="html" />
+  </outputs>
+
+  
+
+
+  <help>
+**What it does**
+
+The user selects a set of data generated by the Galaxy tool to "prepare
+to look for population structure", and specifies a number, K, of ancestral
+populations.  The tool estimates the proportion of each individual's ancestry
+coming from each ancestral population.  The proportions are shown both as
+numbers and graphically.
+
+**Acknowledgments**
+
+We use the program "Admixture", downloaded from
+
+http://www.genetics.ucla.edu/software/admixture/
+
+and described in the paper "Fast model-based estimation of ancestry in
+unrelated individuals" by David H. Alexander, John Novembre and Kenneth Lange,
+Genome Research 19 (2009), pp. 1655-1664.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 prepare_population_structure.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prepare_population_structure.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,144 @@
+#!/usr/bin/env python
+
+import errno
+import os
+import shutil
+import subprocess
+import sys
+from Population import Population
+import gd_composite
+
+################################################################################
+
+def do_import(filename, files_path, min_reads, min_qual, min_spacing, tags, using_info, population_list):
+    info_page = gd_composite.InfoPage()
+    info_page.set_title('Prepare to look for population structure Galaxy Composite Dataset')
+
+    display_file = gd_composite.DisplayFile()
+    display_value = gd_composite.DisplayValue()
+
+    out_ped = gd_composite.Parameter(name='admix.ped', value='admix.ped', display_type=display_file)
+    out_map = gd_composite.Parameter(name='admix.map', value='admix.map', display_type=display_file)
+    out_use = gd_composite.Parameter(description=using_info, display_type=display_value)
+
+    info_page.add_output_parameter(out_ped)
+    info_page.add_output_parameter(out_map)
+    info_page.add_output_parameter(out_use)
+
+    in_min_reads = gd_composite.Parameter(description='Minimum reads covering a SNP, per individual', value=min_reads, display_type=display_value)
+    in_min_qual = gd_composite.Parameter(description='Minimum quality value, per individual', value=min_qual, display_type=display_value)
+    in_min_spacing = gd_composite.Parameter(description='Minimum spacing between SNPs on the same scaffold', value=min_spacing, display_type=display_value)
+
+    info_page.add_input_parameter(in_min_reads)
+    info_page.add_input_parameter(in_min_qual)
+    info_page.add_input_parameter(in_min_spacing)
+
+    misc_populations = gd_composite.Parameter(name='Populations', value=population_list, display_type=gd_composite.DisplayPopulationList())
+    info_page.add_misc(misc_populations)
+
+    with open(filename, 'w') as ofh:
+        print >> ofh, info_page.render()
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError, e:
+        if e.errno <> errno.EEXIST:
+            raise
+
+def die(message, exit=True):
+    print >> sys.stderr, message
+    if exit:
+        sys.exit(1)
+
+################################################################################
+
+if len(sys.argv) < 9:
+    die("Usage")
+
+# parse command line
+input_snp_filename, min_reads, min_qual, min_spacing, output_filename, output_files_path = sys.argv[1:7]
+args = sys.argv[7:]
+
+individual_metadata = []
+population_files = []
+population_names = []
+all_individuals = False
+
+for arg in args:
+    if arg == 'all_individuals':
+        all_individuals = True
+    elif len(arg) > 11:
+        tag = arg[:11]
+        value = arg[11:]
+        if tag == 'individual:':
+            individual_metadata.append(value)
+        elif tag == 'population:':
+            filename, name = value.split(':', 1)
+            population_files.append(filename)
+            population_names.append(name)
+
+p_total = Population()
+p_total.from_tag_list(individual_metadata)
+
+individual_population = {}
+
+population_list = []
+
+if all_individuals:
+    p1 = p_total
+    p1.name = 'All Individuals'
+    population_list.append(p1)
+else:
+    p1 = Population()
+    for idx in range(len(population_files)):
+        population_file = population_files[idx]
+        population_name = population_names[idx]
+        this_pop = Population(population_name)
+        this_pop.from_population_file(population_file)
+        population_list.append(this_pop)
+        p1.from_population_file(population_file)
+        tags = p1.tag_list()
+        for tag in tags:
+            if tag not in individual_population:
+                individual_population[tag] = population_name
+
+if not p_total.is_superset(p1):
+    print >> sys.stderr, 'There is an individual in the population that is not in the SNP table'
+    sys.exit(1)
+
+# run tool
+prog = 'admix_prep'
+
+args = []
+args.append(prog)
+args.append(input_snp_filename)
+args.append(min_reads)
+args.append(min_qual)
+args.append(min_spacing)
+
+tags = p1.tag_list()
+for tag in tags:
+    args.append(tag)
+
+#print "args:", ' '.join(args)
+p = subprocess.Popen(args, bufsize=-1, stdin=None, stdout=subprocess.PIPE, stderr=sys.stderr)
+(stdoutdata, stderrdata) = p.communicate()
+rc = p.returncode
+
+if rc != 0:
+    die('admix_prep failed: rc={0}'.format(rc))
+
+using_info = stdoutdata.rstrip('\r\n')
+mkdir_p(output_files_path)
+output_ped_filename = os.path.join(output_files_path, 'admix.ped')
+output_map_filename = os.path.join(output_files_path, 'admix.map')
+shutil.copy2('admix.ped', output_ped_filename)
+shutil.copy2('admix.map', output_map_filename)
+do_import(output_filename, output_files_path, min_reads, min_qual, min_spacing, tags, using_info, population_list)
+
+os.unlink('admix.ped')
+os.unlink('admix.map')
+
+sys.exit(0)
+

diff -r d4ec09e8079f -r 4b6590dd7250 prepare_population_structure.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/prepare_population_structure.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,74 @@
+<tool id="gd_prepare_population_structure" name="Prepare" version="1.0.0">
+  <description>to look for population structure</description>
+
+  <command interpreter="python">
+    prepare_population_structure.py "$input" "$min_reads" "$min_qual" "$min_spacing" "$output" "$output.files_path"
+    #if $individuals.choice == '0'
+        "all_individuals"
+    #else if $individuals.choice == '1'
+        #for $population in $individuals.populations
+          #set $pop_arg = 'population:%s:%s' % (str($population.p_input), str($population.p_input.name))
+          "$pop_arg"
+        #end for
+    #end if
+    #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+        #set $arg = 'individual:%s:%s' % ($individual_col, $individual)
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="SNP table" />
+    <param name="min_reads" type="integer" min="0" value="0" label="Minimum reads covering a SNP, per individual" />
+    <param name="min_qual" type="integer" min="0" value="0" label="Minimum quality value, per individual" />
+    <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs on the same scaffold" />
+    <conditional name="individuals">
+      <param name="choice" type="select" label="Individuals">
+        <option value="0" selected="true">All</option>
+        <option value="1">Choose</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <repeat name="populations" title="Population" min="1">
+          <param name="p_input" type="data" format="gd_indivs" label="Individuals" />
+        </repeat>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="gd_ped">
+      <actions>
+        <action type="metadata" name="base_name" default="admix" />
+      </actions>
+    </data>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="min_reads" value="3" />
+      <param name="min_qual" value="30" />
+      <param name="min_spacing" value="0" />
+      <param name="choice" value="0" />
+      <output name="output" file="test_out/prepare_population_structure/prepare_population_structure.html" ftype="html" compare="diff" lines_diff="2">
+        <extra_files type="file" name="admix.map" value="test_out/prepare_population_structure/admix.map" />
+        <extra_files type="file" name="admix.ped" value="test_out/prepare_population_structure/admix.ped" />
+      </output>
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+The tool converts a SNP table into two tables, called "admix.map" and
+"admix.ped", needed for estimating the population structure.  The user
+can read or download those files, or simply pass this tool's output on to
+other programs.  The user imposes conditions on which SNPs to consider,
+such as the minimum coverage and/or quality value for every individual,
+or the distance to the closest SNP in the same contig (as named in the
+first column of the SNP table).  A useful piece of information produced
+by the tool is the number of SNPs meeting those conditions, which can
+be found by clicking on the "eye" after the program runs.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 rank_pathways.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rank_pathways.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,74 @@
+<tool id="gd_calc_freq" name="Rank" version="1.0.0">
+  <description>affected KEGG pathways</description>
+
+  <command interpreter="python">
+    #if str($output_format) == 'a'
+      calctfreq.py
+    #else if str($output_format) == 'b'
+      calclenchange.py
+    #end if
+        "--loc_file=${GALAXY_DATA_INDEX_DIR}/gd.rank.loc"
+        "--species=${input.metadata.dbkey}"
+        "--input=${input}"
+        "--output=${output}"
+        "--posKEGGclmn=${input.metadata.kegg_path}"
+        "--KEGGgeneposcolmn=${input.metadata.kegg_gene}"
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_sap" label="Table">
+        <validator type="metadata" check="kegg_gene,kegg_path" message="Missing KEGG gene code column and/or KEGG pathway code/name column metadata.  Click the pencil icon in the history item to edit/save the metadata attributes" />
+    </param>
+    <param name="output_format" type="select" label="Output format">
+      <option value="a" selected="true">ranked by percentage of genes affected</option>
+      <option value="b">ranked by change in length and number of paths</option>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="tabular" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_sap" ftype="gd_sap" />
+      <param name="output_format" value="a" />
+      <output name="output" file="test_out/rank_pathways/rank_pathways.tabular" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool produces a table ranking the pathways based on the percentage
+of genes in an input dataset, out of the total in each pathway.
+Alternatively, the tool ranks the pathways based on the change in
+length and number of paths connecting sources and sinks.  This change is
+calculated between graphs representing pathways with and without excluding
+the nodes that represent the genes in an input list.  Sources are all
+the nodes representing the initial reactants/products in the pathway.
+Sinks are all the nodes representing the final reactants/products in
+the pathway.
+
+If pathways are ranked by percentage of genes affected, the output is
+a tabular dataset with the following columns:
+
+   1. number of genes in the pathway present in the input dataset
+   2. percentage of the total genes in the pathway included in the input dataset
+   3. rank of the frequency (from high freq to low freq)
+   4. name of the pathway
+
+If pathways are ranked by change in length and number of paths, the
+output is a tabular dataset with the following columns:
+
+   1. change in the mean length of paths between sources and sinks
+   2. mean length of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway does not have sources/sinks, the length is assumed to be infinite (I)
+   3. mean length of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway does not have sources/sinks, the length is assumed to be infinite (I)
+   4. rank of the change in the mean length of paths between sources and sinks (from high change to low change)
+   5. change in the number of paths between sources and sinks
+   6. number of paths between sources and sinks in the pathway including the genes in the input dataset.  If the pathway does not have sources/sinks, it is assumed to be a circuit (C)
+   7. number of paths between sources and sinks in the pathway excluding the genes in the input dataset.  If the pathway does not have sources/sinks, it is assumed to be a circuit (C)
+   8. rank of the change in the number of paths between sources and sinks (from high change to low change)
+   9. name of the pathway
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 rtrnKEGGpthwfENSEMBLTc.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rtrnKEGGpthwfENSEMBLTc.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,78 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+#       rtrnKEGGpthwfENSEMBLTc.py
+#
+#       Copyright 2011 Oscar Bedoya-Reina <oscar@niska.bx.psu.edu>
+#
+#       This program is free software; you can redistribute it and/or modify
+#       it under the terms of the GNU General Public License as published by
+#       the Free Software Foundation; either version 2 of the License, or
+#       (at your option) any later version.
+#
+#       This program is distributed in the hope that it will be useful,
+#       but WITHOUT ANY WARRANTY; without even the implied warranty of
+#       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#       GNU General Public License for more details.
+#
+#       You should have received a copy of the GNU General Public License
+#       along with this program; if not, write to the Free Software
+#       Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+#       MA 02110-1301, USA.
+
+import argparse,os,sys
+
+
+def main():
+    """Append KEGG gene codes and KEGG pathways to a table of ENSEMBL transcripts.
+
+    Options are read from sys.argv.  The --loc_file is scanned for the first
+    line whose first whitespace-separated field equals --species; the second
+    field of that line is the path of the conversion dictionary file (CDF).
+    The CDF is a tab-separated table keyed by its first column.  Every row of
+    --input that does not start with '#' is written to --output followed by
+    the remaining CDF columns for the code found in column --posENSEMBLclmn
+    (1-based), or by 'U' and 'N' when the code is not in the CDF.
+
+    Returns 0 on success.  Exits with an error message when the column number
+    is smaller than 1 or the species is not listed in the loc file.
+    """
+    parser = argparse.ArgumentParser(description='Adds the fields KEGG gene codes and KEGG pathways to an input table of ENSEMBL transcript codes.')
+    parser.add_argument('--loc_file',metavar='correlational database',type=str,help='correlational database')
+    parser.add_argument('--species',metavar='species name',type=str,help='the species of interest in loc_file')
+    parser.add_argument('--output',metavar='output TXT file',type=str,help='the output file with the table in txt format. The output will have two more fields: KEGG gene codes and KEGG pathways of each ENSEMBL code' )
+    parser.add_argument('--posENSEMBLclmn',metavar='column number',type=int,help='the column with the ENSEMBLE transcript code')
+    parser.add_argument('--input',metavar='input TXT file',type=str,help='the input file with the table in txt format')
+    fulargs = parser.parse_args(sys.argv[1:])
+    inputf, loc_file, species, output = fulargs.input, fulargs.loc_file, fulargs.species, fulargs.output
+    # The option is 1-based; list indexing below is 0-based.
+    posENSEMBLclmn = fulargs.posENSEMBLclmn - 1
+    if posENSEMBLclmn < 0:
+        # A negative index would silently read columns from the end of the row.
+        sys.exit('--posENSEMBLclmn must be 1 or greater')
+    #~ Locate the conversion dictionary file (CDF) for the species
+    dinput = None
+    with open(loc_file) as loc_fh:
+        for loc_line in loc_fh:
+            fields = loc_line.split()
+            # Blank lines and lines without a path field cannot match.
+            if len(fields) > 1 and fields[0] == species:
+                dinput = fields[1]
+                break
+    if dinput is None:
+        sys.exit("species '%s' was not found in %s" % (species, loc_file))
+    #~ Map the first CDF column to the rest of its line
+    with open(dinput) as cdf_fh:
+        cdf_lines = cdf_fh.read().splitlines()
+    dKEGGcPthws = {}
+    for cdf_line in cdf_lines:
+        if cdf_line.strip():
+            code, _, rest = cdf_line.partition('\t')
+            dKEGGcPthws[code] = rest
+    #~ Add the two new columns
+    unknown = '\t'.join(['U', 'N'])  # placeholder fields for unmatched codes
+    sall = []  # the output rows with the additional fields
+    with open(inputf) as fh:
+        for line in fh:
+            if line.startswith('#'):
+                continue
+            row = line.rstrip('\r\n')
+            columns = row.split('\t')
+            # Rows too short to hold the code column are reported as unmatched.
+            if posENSEMBLclmn < len(columns):
+                KEGGgKEGGpth = dKEGGcPthws.get(columns[posENSEMBLclmn], unknown)
+            else:
+                KEGGgKEGGpth = unknown
+            sall.append('\t'.join([row, KEGGgKEGGpth]))
+    #~ Rows are newline-separated; no newline follows the last row
+    with open(output, 'w') as salef:
+        salef.write('\n'.join(sall))
+    return 0
+
+
+if __name__ == '__main__':
+ main()
+

diff -r d4ec09e8079f -r 4b6590dd7250 select_individuals.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_individuals.xml Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,72 @@
+<tool id="gd_select_individuals" name="Select individuals" version="1.0.0">
+  <description>from a gd_snp dataset</description>
+
+  <command interpreter="bash">
+    echo.bash "$input" "$output"
+    #for $individual in str($individuals).split(',')
+        #set $individual_idx = $input.dataset.metadata.individual_names.index($individual)
+        #set $individual_col = str( $input.dataset.metadata.individual_columns[$individual_idx] )
+        #set $arg = '\t'.join([$individual_col, $individual, ''])
+        "$arg"
+    #end for
+  </command>
+
+  <inputs>
+    <param name="input" type="data" format="gd_snp" label="gd_snp dataset"/>
+    <param name="individuals" type="select" display="checkboxes" multiple="true" label="Individuals to include">
+      <options>
+        <filter type="data_meta" ref="input" key="individual_names" />
+      </options>
+      <validator type="no_options" message="You must select at least one individual"/>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="gd_indivs" label="Individuals from ${input.hid}" />
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
+      <param name="individuals" value="PB1,PB2" />
+      <output name="output" file="test_in/a.gd_indivs" />
+    </test>
+  </tests>
+
+  <help>
+**Dataset formats**
+
+The input dataset is in the gd_snp_ format.
+The output dataset is in gd_indivs_ format.  (`Dataset missing?`_)
+
+.. _Dataset missing?: ./static/formatHelp.html
+.. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_indivs: ./static/formatHelp.html#gd_indivs
+
+**What it does**
+
+This selects a set of individuals or groups from a gd_snp dataset.   This set
+can then be used by other tools to work on just part of the gd_snp dataset.
+
+**Examples**
+
+- input::
+
+   Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382         C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0
+   Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0
+   Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0
+   etc.
+
+- input metadata::
+
+   #{"column_names":["scaf","pos","A","B","qual","ref","rpos","rnuc","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","pair","dist",
+   #"prim","rflp"],"dbkey":"canFam2","individuals":[["PB1",9],["PB2",13],["PB3",17],["PB4",21],["PB6",25],["PB8",29]],"pos":2,"rPos":7,"ref":6,"scaffold":1,"species":"bear"}
+
+- output when individuals PB1, PB2, and PB3 are selected::
+
+   9 PB1
+   13 PB2
+   17 PB3
+
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 select_snps.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_snps.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,153 @@
+#!/usr/bin/env python
+
+import os
+import sys
+import math
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function(parse_arguments=None):
+    """Build a decorator that turns a function into a script entry point.
+
+    parse_arguments, when given, is called with the argument list and must
+    return an (options, arguments) pair.  Without it the options are None
+    and the argument list is passed through untouched.  The decorated
+    function is called as to_decorate(options, arguments) and its return
+    value is handed to sys.exit().
+    """
+    def _no_parsing(arguments):
+        return (None, arguments)
+
+    parse = _no_parsing if parse_arguments is None else parse_arguments
+
+    def main_decorator(to_decorate):
+        def decorated_main(arguments=None):
+            # Fall back to the process command line when no list is supplied.
+            argv = sys.argv if arguments is None else arguments
+            options, remaining = parse(argv)
+            sys.exit(to_decorate(options, remaining))
+        return decorated_main
+
+    return main_decorator
+
+def parse_arguments(arguments):
+    """Parse the command line of the SNP selection script.
+
+    arguments is a full argv-style list; its first element (the program
+    name) is skipped.  Returns the (options, positional arguments) pair
+    produced by OptionParser.  All option values are kept as strings.
+    """
+    option_names = (
+        'input',
+        'output',
+        'index_dir',
+        'num_snps',
+        'ref_chrom_col',
+        'ref_pos_col',
+        'ref_species',
+    )
+    parser = OptionParser()
+    for option_name in option_names:
+        parser.add_option('--' + option_name, dest=option_name)
+    return parser.parse_args(arguments[1:])
+
+@main_function(parse_arguments)
+def main(options, arguments):
+    """Select roughly evenly spaced SNPs from the input and write them out.
+
+    options carries the parsed command line (input, output, index_dir,
+    num_snps, ref_chrom_col, ref_pos_col, ref_species); arguments is unused.
+    Chromosome lengths are read from
+    <index_dir>/shared/ucsc/chrom/<ref_species>.len and their sum sets the
+    spacing between selected SNPs.  Exits with status 1 when the reference
+    columns cannot be determined or the requested count is not a positive
+    integer.
+    """
+
+    # Column options are 1-based.  to_int() maps unusable values to 0, so an
+    # index of -1 marks a column that could not be determined.
+    ref_chrom_idx = to_int( options.ref_chrom_col ) - 1
+    ref_pos_idx = to_int( options.ref_pos_col ) - 1
+
+    if (ref_chrom_idx < 0) or (ref_pos_idx < 0) or (ref_chrom_idx == ref_pos_idx):
+        sys.stderr.write( "Cannot locate reference genome sequence (ref) or reference genome position (rPos) column for this dataset.\n" )
+        sys.exit(1)
+
+    # A count below 1 would make the spacing computation divide by zero.
+    total_requested = to_int( options.num_snps )
+    if total_requested < 1:
+        sys.stderr.write( "The number of SNPs to select must be a positive integer.\n" )
+        sys.exit(1)
+
+    chrom_len_root = os.path.join( options.index_dir, 'shared/ucsc/chrom')
+    chrom_len_file = '%s.len' % options.ref_species
+    chrom_len_path = os.path.join(chrom_len_root, chrom_len_file)
+
+    chrlens = gd.ChrLens( chrom_len_path )
+
+    total_len = 0
+    for chrom in chrlens:
+        total_len += chrlens.length(chrom)
+
+    lines, data, comments = get_snp_lines_data_and_comments( options.input, ref_chrom_idx, ref_pos_idx )
+    selected = select_snps( data, total_len, total_requested )
+    out_data = fix_selection_and_order_like_input(data, selected, total_requested)
+    write_selected_snps( options.output, out_data, lines, comments )
+
+def to_int( value ):
+    """Convert value to an int, returning 0 when it cannot be converted.
+
+    Non-numeric strings (ValueError) and values such as None (TypeError),
+    which is what an omitted command-line option yields, both map to 0.
+    """
+    try:
+        return int( value )
+    except (TypeError, ValueError):
+        return 0
+
+def get_snp_lines_data_and_comments( filename, chrom_idx, pos_idx ):
+    """Read a tab-separated SNP table and index its usable rows.
+
+    chrom_idx and pos_idx are 0-based column indexes.  Empty lines and rows
+    with too few columns are skipped; lines starting with '#' are collected
+    as comments.
+
+    Returns (lines, data, comments):
+      lines    -- the kept rows, in file order, without line terminators
+      data     -- one [chrom_sort, chrom, pos, line_num, line_idx] list per
+                  kept row, sorted by (chrom_sort, pos); chrom_sort is the
+                  chromosome name without a leading 'chr' prefix, line_num
+                  is the 1-based line number in the file and line_idx the
+                  row's index in lines
+      comments -- the comment lines, in file order
+
+    Exits with status 1 when a position is not an integer.
+    """
+    needed = max( chrom_idx, pos_idx ) + 1
+    lines = []
+    data = []
+    comments = []
+    # The with block closes the file even when sys.exit() is raised below.
+    with open( filename, 'r' ) as fh:
+        for line_num, line in enumerate( fh, 1 ):
+            line = line.rstrip('\r\n')
+            if not line:
+                continue
+            if line.startswith('#'):
+                comments.append(line)
+                continue
+            elems = line.split('\t')
+            if len(elems) < needed:
+                continue
+            chrom = elems[chrom_idx]
+            try:
+                pos = int(elems[pos_idx])
+            except ValueError:
+                sys.stderr.write( "bad reference position in line %d column %d: %s\n" % ( line_num, pos_idx+1, elems[pos_idx] ) )
+                sys.exit(1)
+            # Remove only a literal 'chr' prefix.  str.lstrip('chr') strips any
+            # leading 'c', 'h' or 'r' characters and would mangle other names.
+            if chrom.startswith('chr'):
+                chrom_sort = chrom[3:]
+            else:
+                chrom_sort = chrom
+            data.append( [chrom_sort, chrom, pos, line_num, len(lines)] )
+            lines.append(line)
+    data.sort( key=lambda x: (x[0], x[2]) )
+    return lines, data, comments
+
+def select_snps( data, total_len, requested ):
+    """Return the indexes into data of the entries picked by even spacing.
+
+    data is walked in order; element 1 of each entry is the chromosome name
+    and element 2 its position.  The spacing is total_len / requested.  A
+    threshold starts at 0 for every new chromosome name; an entry is picked
+    when its position is at or beyond the threshold, and each pick advances
+    the threshold by one spacing.
+    """
+    spacing = total_len / requested
+    chosen = []
+    current_chrom = None
+    threshold = 0
+    for entry_idx in range( len( data ) ):
+        entry = data[entry_idx]
+        if entry[1] != current_chrom:
+            # New chromosome: restart the grid at its beginning.
+            current_chrom = entry[1]
+            threshold = 0
+        if entry[2] < threshold:
+            continue
+        chosen.append( entry_idx )
+        threshold += spacing
+    return chosen
+
+def fix_selection_and_order_like_input(data, selected, requested):
+    """Thin the selection to the requested count and restore input order.
+
+    selected holds indexes into data.  When more entries were selected than
+    requested, only the selection positions ceil(i * a + a / 2) - 1 for
+    i in range(requested) are kept, where a = len(selected) / requested.
+    Otherwise every selected entry is kept.  The kept entries are returned
+    sorted by element 3, the line number in the input file.
+
+    requested is expected to be a positive integer.
+    """
+    total_selected = len( selected )
+
+    if total_selected > requested:
+        a = float( total_selected ) / requested
+        b = a / 2
+        # A set keeps the membership test below constant-time.
+        keep = set( int( math.ceil( i * a + b ) - 1 ) for i in range( requested ) )
+        out_data = [ data[data_idx] for i, data_idx in enumerate( selected ) if i in keep ]
+    else:
+        out_data = [ data[data_idx] for data_idx in selected ]
+
+    out_data.sort( key=lambda x: x[3] )
+
+    return out_data
+
+def write_selected_snps( filename, data, lines, comments ):
+    """Write the comment lines followed by the selected SNP lines.
+
+    Each entry of data carries, as element 4, the index into lines of the
+    row to write.  Every written line ends with a newline.  The file is
+    closed even when writing fails part-way.
+    """
+    with open( filename, 'w' ) as fh:
+        for comment in comments:
+            fh.write("%s\n" % comment )
+
+        for datum in data:
+            fh.write("%s\n" % lines[datum[4]])
+
+if __name__ == "__main__":
+    main()
+
+

diff -r d4ec09e8079f -r 4b6590dd7250 select_snps.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_snps.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,87 @@
+<tool id="gd_select_snps" name="Select" version="1.0.0">
+  <description>a specified number of SNPs</description>
+
+  <command interpreter="python">
+    select_snps.py "--input=$input" "--output=$output" "--index_dir=$GALAXY_DATA_INDEX_DIR" "--num_snps=$num_snps"
+    #if $override_metadata.choice == "0":
+      "--ref_chrom_col=${input.metadata.ref}" "--ref_pos_col=${input.metadata.rPos}" "--ref_species=${input.metadata.dbkey}"
+    #else
+      "--ref_chrom_col=$ref_col" "--ref_pos_col=$rpos_col" "--ref_species=$ref_species"
+    #end if
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset">
+      <validator type="unspecified_build" message="This dataset does not have a reference species and cannot be used with this tool" />
+    </param>
+    <param name="num_snps" type="integer" value="10" optional="false" min="1" label="Number of SNPs"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="ref_col" type="data_column" data_ref="input" numerical="false" label="Column with reference chromosome"/>
+        <param name="rpos_col" type="data_column" data_ref="input" numerical="true" label="Column with reference position"/>
+        <param name="ref_species" type="select" label="Choose reference species">
+          <options from_file="gd.ref_species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data format="gd_snp" name="output" metadata_source="input"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp"/>
+      <param name="num_snps" value="100"/>
+      <param name="choice" value="0"/>
+      <output name="output" file="test_out/select_snps/select_snps.gd_snp" />
+    </test>
+  </tests>
+
+
+  <help>
+**What it does**
+
+  It attempts to select a specified number of SNPs from the dataset, making them
+  approximately uniformly spaced relative to the reference genome. The number
+  actually selected may be slightly more than the specified number.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    etc.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 specify_restriction_enzymes.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/specify_restriction_enzymes.py Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+import os
+import sys
+from optparse import OptionParser
+import genome_diversity as gd
+
+def main_function( parse_arguments=None ):
+    """Build a decorator that turns a function into a script entry point.
+
+    parse_arguments, when given, is called with the argument list and must
+    return an (options, arguments) pair; without it the options are None and
+    the argument list is passed through.  The decorated function is called
+    as to_decorate(options, arguments) and the process always ends through
+    sys.exit(): with the function's return value on success, or with 1 after
+    an error message and traceback have been written to stderr.
+    """
+    # Imported locally: the handler below needs it and the module's import
+    # block does not provide it.
+    import traceback
+
+    if parse_arguments is None:
+        parse_arguments = lambda arguments: ( None, arguments )
+    def main_decorator( to_decorate ):
+        def decorated_main( arguments=None ):
+            if arguments is None:
+                arguments = sys.argv
+            options, arguments = parse_arguments( arguments )
+            rc = 1
+            try:
+                rc = to_decorate( options, arguments )
+            except Exception as err:
+                sys.stderr.write( 'ERROR: %s\n' % str( err ) )
+                traceback.print_exc()
+            finally:
+                # Runs on every path, so rc stays 1 unless to_decorate returned.
+                sys.exit( rc )
+        return decorated_main
+    return main_decorator
+
+def parse_arguments( arguments ):
+    """Parse the command line of the restriction enzyme script.
+
+    arguments is a full argv-style list; its first element (the program
+    name) is skipped.  Returns the (options, positional arguments) pair
+    produced by OptionParser.  The two column options are parsed as ints,
+    all others as strings.
+    """
+    # (flag, value type, destination attribute, help text)
+    option_specs = (
+        ( '--input', 'string', 'input', 'file of selected SNPs' ),
+        ( '--output', 'string', 'output', 'output file' ),
+        ( '--primers_loc', 'string', 'primers_loc', 'primers .loc file' ),
+        ( '--scaffold_col', 'int', 'scaffold_col', 'scaffold column in the input file' ),
+        ( '--pos_col', 'int', 'pos_col', 'position column in the input file' ),
+        ( '--enzyme_list', 'string', 'enzyme_list_string', 'comma separated list of enzymes' ),
+        ( '--species', 'string', 'species', 'species' ),
+    )
+    parser = OptionParser()
+    for flag, value_type, destination, help_text in option_specs:
+        parser.add_option( flag, type=value_type, dest=destination, help=help_text )
+    return parser.parse_args( arguments[1:] )
+
+
+@main_function( parse_arguments )
+def main( options, arguments ):
+    """Write the SNPs that have at least one of the requested enzymes.
+
+    options carries the parsed command line; every option is required and a
+    RuntimeError naming the missing one is raised otherwise.  arguments is
+    unused.  For each SNP of the input, the enzymes reported by the primers
+    file for its sequence and position are compared with --enzyme_list; the
+    SNP line is written when any of them is in the list.  The input's
+    comment lines are written once, just before the first written SNP, so
+    nothing at all is written when no SNP qualifies.
+    """
+    if not options.input:
+        raise RuntimeError( 'missing --input option' )
+    if not options.output:
+        raise RuntimeError( 'missing --output option' )
+    if not options.primers_loc:
+        raise RuntimeError( 'missing --primers_loc option' )
+    if not options.scaffold_col:
+        raise RuntimeError( 'missing --scaffold_col option' )
+    if not options.pos_col:
+        raise RuntimeError( 'missing --pos_col option' )
+    if not options.enzyme_list_string:
+        raise RuntimeError( 'missing --enzyme_list option' )
+    if not options.species:
+        raise RuntimeError( 'missing --species option' )
+
+    snps = gd.SnpFile( filename=options.input, seq_col=int( options.scaffold_col ), pos_col=int( options.pos_col ) )
+
+    out_fh = gd._openfile( options.output, 'w' )
+
+    # From here on the output handle is closed even when a later step raises.
+    try:
+        wanted_enzymes = set()
+        for enzyme in options.enzyme_list_string.split( ',' ):
+            enzyme = enzyme.strip()
+            if enzyme:
+                wanted_enzymes.add( enzyme )
+
+        primer_data_file = gd.get_filename_from_loc( options.species, options.primers_loc )
+        # The index file has the data file's name with a .cdb extension.
+        file_root, file_ext = os.path.splitext( primer_data_file )
+        primer_index_file = file_root + ".cdb"
+        primers = gd.PrimersFile( data_file=primer_data_file, index_file=primer_index_file )
+
+        comments_printed = False
+
+        while snps.next():
+            seq, pos = snps.get_seq_pos()
+            enzyme_list = primers.get_enzymes( seq, pos )
+            for enzyme in enzyme_list:
+                if enzyme in wanted_enzymes:
+                    if not comments_printed:
+                        for comment in snps.comments:
+                            out_fh.write( "%s\n" % comment )
+                        comments_printed = True
+                    out_fh.write( "%s\n" % snps.line )
+                    # One matching enzyme is enough; write the SNP only once.
+                    break
+    finally:
+        out_fh.close()
+
+if __name__ == "__main__":
+    main()
+

diff -r d4ec09e8079f -r 4b6590dd7250 specify_restriction_enzymes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/specify_restriction_enzymes.xml Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,88 @@
+<tool id="gd_specify_restriction_enzymes" name="Specify" version="1.0.0">
+  <description>a set of restriction enzymes</description>
+
+  <command interpreter="python">
+    specify_restriction_enzymes.py "--input=$input" "--output=$output" "--primers_loc=${GALAXY_DATA_INDEX_DIR}/gd.primers.loc"
+    #if $override_metadata.choice == "0":
+      "--scaffold_col=${input.metadata.scaffold}" "--pos_col=${input.metadata.pos}" "--species=${input.metadata.species}"
+    #else
+      "--scaffold_col=$scaf_col" "--pos_col=$pos_col" "--species=$species"
+    #end if
+    "--enzyme_list=$enzymes"
+  </command>
+
+  <inputs>
+    <param format="tabular" name="input" type="data" label="Selected SNPS dataset"/>
+    <conditional name="override_metadata">
+      <param name="choice" type="select" format="integer" label="choose columns">
+        <option value="0" selected="true">No, get columns from metadata</option>
+        <option value="1" >Yes, choose columns</option>
+      </param>
+      <when value="0" />
+      <when value="1">
+        <param name="scaf_col" type="data_column" data_ref="input" numerical="false" label="Column with scaffold"/>
+        <param name="pos_col" type="data_column" data_ref="input" numerical="true" label="Column with position"/>
+        <param name="species" type="select" label="Choose species">
+          <options from_file="gd.species.txt">
+            <column name="name" index="1"/>
+            <column name="value" index="0"/>
+          </options>
+        </param>
+      </when>
+    </conditional>
+
+    <param name="enzymes" type="select" display="checkboxes" multiple="true" label="Choose enzymes">
+        <options from_file="gd.restriction_enzymes.txt">
+            <column name="name" index="0"/>
+            <column name="value" index="1"/>
+        </options>
+    </param>
+  </inputs>
+
+  <outputs>
+    <data format="gd_snp" name="output" metadata_source="input"/>
+  </outputs>
+
+  <tests>
+    <test>
+      <param name="input" value="test_out/select_snps/select_snps.gd_snp" ftype="gd_snp" />
+      <param name="choice" value="0" />
+      <param name="enzymes" value="Bsp1286I,HaeII,RsaI" />
+      <output name="output" file="test_out/specify_restriction_enzymes/specify_restriction_enzymes.gd_snp" />
+    </test>
+  </tests>
+
+  <help>
+**What it does**
+
+  It selects the SNPs that are differentially cut by at least one of the
+  specified restriction enzymes. The enzymes are required to cut the amplified
+  segment (for the specified PCR primers) only at the SNP.
+
+-----
+
+**Example**
+
+- input file::
+
+    chr2_75111355_75112576    314  A  C  L  F  chr2   75111676  C  F  15  4  53   2   9  48   Y  96   0.369  0.355  0.396  0
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr10_7434473_7435447    524   T  C  S  S  chr10  7435005   T  S  11  5  90   14  0  69   Y  626  0.066  0.406  0.727  0
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr15_64470252_64471048  89    G  A  Y  Y  chr15  64470341  G  Y  5   6  109  14  0  69   Y  312  0.247  0.998  0.393  0
+    chr18_48070585_48071386  514   C  T  E  K  chr18  48071100  T  K  7   7  46   14  0  69   Y  2    0.200  0.032  0.163  0
+    chr18_50154905_50155664  304   A  G  Y  C  chr18  50155208  A  Y  4   2  17   5   1  22   Y  8    0.022  0.996  0.128  0
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_14240610_14242055  232   C  T  A  V  chr19  14240840  C  A  18  8  56   15  5  42   Y  73   0.003  0.153  0.835  0
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+
+- output file::
+
+    chr8_93901796_93905612   2471  A  C  A  A  chr8   93904264  A  A  8   0  51   10  2  14   Y  961  0.016  0.534  0.114  2
+    chr14_80021455_80022064  138   G  A  H  H  chr14  80021593  G  H  14  0  69   9   6  124  Y  377  0.118  0.997  0.195  1
+    chr18_57379354_57380496  315   C  T  V  V  chr18  57379669  G  V  11  0  60   9   6  62   Y  726  0.118  0.048  0.014  1
+    chr19_39866997_39874915  3117  C  T  P  P  chr19  39870110  C  P  3   7  65   14  2  32   Y  6    0.321  0.911  0.462  4
+    etc.
+  </help>
+</tool>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/a.gd_indivs
--- a/test-data/test_in/a.gd_indivs Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,2 +0,0 @@
-9 PB1
-13 PB2

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/b.gd_indivs
--- a/test-data/test_in/b.gd_indivs Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,2 +0,0 @@
-17 PB3
-21 PB4

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/c.gd_indivs
--- a/test-data/test_in/c.gd_indivs Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,2 +0,0 @@
-25 PB6
-29 PB8

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/ensembl.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_in/ensembl.tabular Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,150 @@
+ENSCAFT00000000001
+ENSCAFT00000000144
+ENSCAFT00000000160
+ENSCAFT00000000215
+ENSCAFT00000000233
+ENSCAFT00000000365
+ENSCAFT00000000507
+ENSCAFT00000000517
+ENSCAFT00000000674
+ENSCAFT00000000724
+ENSCAFT00000000760
+ENSCAFT00000000762
+ENSCAFT00000001047
+ENSCAFT00000001052
+ENSCAFT00000001063
+ENSCAFT00000001076
+ENSCAFT00000001104
+ENSCAFT00000001141
+ENSCAFT00000001146
+ENSCAFT00000001204
+ENSCAFT00000001219
+ENSCAFT00000001250
+ENSCAFT00000001352
+ENSCAFT00000001363
+ENSCAFT00000001421
+ENSCAFT00000001523
+ENSCAFT00000001575
+ENSCAFT00000001587
+ENSCAFT00000001597
+ENSCAFT00000002056
+ENSCAFT00000002100
+ENSCAFT00000002110
+ENSCAFT00000002175
+ENSCAFT00000002259
+ENSCAFT00000002460
+ENSCAFT00000002537
+ENSCAFT00000002577
+ENSCAFT00000002578
+ENSCAFT00000002660
+ENSCAFT00000002792
+ENSCAFT00000002849
+ENSCAFT00000002999
+ENSCAFT00000003163
+ENSCAFT00000003223
+ENSCAFT00000003307
+ENSCAFT00000003515
+ENSCAFT00000003560
+ENSCAFT00000003644
+ENSCAFT00000003824
+ENSCAFT00000003840
+ENSCAFT00000004092
+ENSCAFT00000004103
+ENSCAFT00000004208
+ENSCAFT00000004253
+ENSCAFT00000004311
+ENSCAFT00000004464
+ENSCAFT00000004511
+ENSCAFT00000004609
+ENSCAFT00000004673
+ENSCAFT00000004726
+ENSCAFT00000004799
+ENSCAFT00000004933
+ENSCAFT00000004993
+ENSCAFT00000005126
+ENSCAFT00000005142
+ENSCAFT00000005225
+ENSCAFT00000005323
+ENSCAFT00000005467
+ENSCAFT00000005496
+ENSCAFT00000005518
+ENSCAFT00000005653
+ENSCAFT00000005746
+ENSCAFT00000005749
+ENSCAFT00000005832
+ENSCAFT00000005972
+ENSCAFT00000006025
+ENSCAFT00000006114
+ENSCAFT00000006157
+ENSCAFT00000006219
+ENSCAFT00000006272
+ENSCAFT00000006453
+ENSCAFT00000006479
+ENSCAFT00000006507
+ENSCAFT00000006669
+ENSCAFT00000006689
+ENSCAFT00000006827
+ENSCAFT00000006891
+ENSCAFT00000007130
+ENSCAFT00000007145
+ENSCAFT00000007244
+ENSCAFT00000007375
+ENSCAFT00000007440
+ENSCAFT00000007467
+ENSCAFT00000007484
+ENSCAFT00000007527
+ENSCAFT00000007553
+ENSCAFT00000007697
+ENSCAFT00000007703
+ENSCAFT00000007747
+ENSCAFT00000007774
+ENSCAFT00000007776
+ENSCAFT00000007779
+ENSCAFT00000007859
+ENSCAFT00000007951
+ENSCAFT00000007959
+ENSCAFT00000008012
+ENSCAFT00000008063
+ENSCAFT00000008142
+ENSCAFT00000008198
+ENSCAFT00000008413
+ENSCAFT00000008540
+ENSCAFT00000008586
+ENSCAFT00000008588
+ENSCAFT00000008673
+ENSCAFT00000008678
+ENSCAFT00000008728
+ENSCAFT00000008769
+ENSCAFT00000008831
+ENSCAFT00000009074
+ENSCAFT00000009114
+ENSCAFT00000009614
+ENSCAFT00000009698
+ENSCAFT00000009710
+ENSCAFT00000010094
+ENSCAFT00000010141
+ENSCAFT00000010439
+ENSCAFT00000010496
+ENSCAFT00000010516
+ENSCAFT00000010531
+ENSCAFT00000010559
+ENSCAFT00000010593
+ENSCAFT00000010616
+ENSCAFT00000010630
+ENSCAFT00000010829
+ENSCAFT00000010865
+ENSCAFT00000010931
+ENSCAFT00000010977
+ENSCAFT00000010988
+ENSCAFT00000011187
+ENSCAFT00000011380
+ENSCAFT00000011397
+ENSCAFT00000011721
+ENSCAFT00000011730
+ENSCAFT00000011771
+ENSCAFT00000011789
+ENSCAFT00000011968
+ENSCAFT00000012081
+ENSCAFT00000012133
+ENSCAFT00000012159
+ENSCAFT00000012254

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/sample.gd_sap
--- a/test-data/test_in/sample.gd_sap Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,401 +0,0 @@\n-#{"column_names":["contig","pos","ref","rPos","trns","pep","AA1","loc","AA2","KEGG","pred","path"],"pos":2,"rPos":4,"ref":3,"dbkey":"canFam2","scaffold":1,"species":"bear","kegg_gene":10,"kegg_path":12}\n-Contig39_chr1_3261104_3261850\t414\tchr1\t3261546\tENSCAFT00000000001\tENSCAFP00000000001\tS\t667\tF\t476153\tprobably damaging\tcfa00230=Purine metabolism.cfa00500=Starch and sucrose metabolism.cfa00740=Riboflavin metabolism.cfa00760=Nicotinate and nicotinamide metabolism.cfa00770=Pantothenate and CoA biosynthesis.cfa01100=Metabolic pathways\n-Contig62_chr1_19011969_19012646\t265\tchr1\t19012240\tENSCAFT00000000144\tENSCAFP00000000125\t*\t161\tR\t483960\tprobably damaging\tN\n-Contig36_chr1_20102654_20103213\t365\tchr1\t20103029\tENSCAFT00000000160\tENSCAFP00000000140\tR\t407\tQ\t610160\tpossibly damaging\tN\n-Contig136_chr10_3710404_3714591\t3079\tchr10\t3713499\tENSCAFT00000000215\tENSCAFP00000000194\tT\t103\tP\tU\tbenign\tN\n-Contig36_chr1_23682012_23682647\t374\tchr1\t23682388\tENSCAFT00000000233\tENSCAFP00000000210\tN\t234\tS\t483973\tbenign\tN\n-Contig163_chr10_4573526_4574494\t487\tchr10\t4574010\tENSCAFT00000000365\tENSCAFP00000000332\tR\t186\tK\t474414\tbenign\tcfa00450=Selenocompound metabolism.cfa00970=Aminoacyl-tRNA biosynthesis\n-Contig55_chr1_40056604_40059808\t2081\tchr1\t40058686\tENSCAFT00000000507\tENSCAFP00000000458\tI\t247\tK\t484023\tpossibly 
damaging\tN\n-Contig17_chr1_40203628_40205630\t1417\tchr1\t40205044\tENSCAFT00000000517\tENSCAFP00000000468\tN\t109\tS\t476233\tbenign\tN\n-Contig97_chr1_44847984_44848380\t285\tchr1\t44848272\tENSCAFT00000000674\tENSCAFP00000000618\tQ\t27\tR\t611986\tbenign\tN\n-Contig214_chr10_16106753_16106969\t121\tchr10\t16106873\tENSCAFT00000000724\tENSCAFP00000000668\tA\t301\tT\t609478\tbenign\tN\n-Contig75_chr1_45731970_45732932\t436\tchr1\t45732397\tENSCAFT00000000760\tENSCAFP00000000701\tI\t490\tV\tU\tbenign\tN\n-Contig33_chr1_45614845_45617413\t1835\tchr1\t45616685\tENSCAFT00000000760\tENSCAFP00000000701\tA\t4390\tV\tU\tbenign\tN\n-Contig95_chr10_18829724_18831056\t914\tchr10\t18830645\tENSCAFT00000000762\tENSCAFP00000000703\tA\t512\tV\tU\tpossibly damaging\tN\n-Contig197_chr13_8622062_8623071\t606\tchr13\t8622665\tENSCAFT00000001047\tENSCAFP00000000959\tT\t406\tI\t475067\tpossibly damaging\tcfa00240=Pyrimidine metabolism.cfa00410=beta-Alanine metabolism.cfa00770=Pantothenate and CoA biosynthesis.cfa00983=Drug metabolism - other enzymes.cfa01100=Metabolic pathways\n-Contig243_chr10_19959210_19960069\t701\tchr10\t19959858\tENSCAFT00000001052\tENSCAFP00000000964\tE\t1345\tK\tU\tbenign\tN\n-Contig137_chr13_10622950_10624043\t1039\tchr13\t10623979\tENSCAFT00000001063\tENSCAFP00000000975\tE\t10\tK\t481999\tbenign\tN\n-Contig137_chr13_10622950_10624043\t1006\tchr13\t10623946\tENSCAFT00000001063\tENSCAFP00000000975\tR\t21\tC\t481999\tprobably damaging\tN\n-Contig115_chr12_4411478_4412322\t124\tchr12\t4411614\tENSCAFT00000001076\tENSCAFP00000000986\tR\t177\tH\tU\tbenign\tN\n-Contig150_chr12_4438230_4439944\t385\tchr12\t4438614\tENSCAFT00000001104\tENSCAFP00000001014\tY\t277\tD\t607591\tbenign\tN\n-Contig84_chr1_52076858_52077103\t80\tchr1\t52076943\tENSCAFT00000001141\tENSCAFP00000001046\tC\t147\tY\t484064\tbenign\tN\n-Contig29_chr13_13215547_13217183\t793\tchr13\t13216352\tENSCAFT00000001146\tENSCAFP00000001050\tP\t1\tR\t475076\tprobably 
damaging\tN\n-Contig251_chr10_22876556_22877097\t152\tchr10\t22876714\tENSCAFT00000001204\tENSCAFP00000001103\tE\t1162\tD\t481203\tbenign\tN\n-Contig21_chr10_22964856_22965302\t202\tchr10\t22965058\tENSCAFT00000001219\tENSCAFP00000001115\tP\t6\tQ\t474465\tbenign\tN\n-Contig199_chr12_5083018_5084534\t453\tchr12\t5083472\tENSCAFT00000001250\tENSCAFP00000001144\tI\t185\tT\t481729.481731\tbenign\tN.cfa04145=Phagosome.cfa04514=Cell adhesion molecules (CAMs).cfa04612=Antigen processing and presentation.cfa04672=Intestinal immune network for IgA production.cfa04940=Type I diabetes mellitus.cfa05140=Leishmaniasis.cfa05145=Toxoplasmosis.cfa05150=Staphylococcus aureus infection.cfa05152=Tuberculosis.cfa05164=Influenza A.cfa05166=HTLV-I infection.cfa05168=Herpes simplex infection.cfa05310=Asthma.cfa05320=Autoimmune thyroid disease.cfa05322=Systemic lupus erythematosus.cfa05323=Rheumatoid arthritis.cfa05330=Allograft re'..b'1\t701\tchr6\t25357665\tENSCAFT00000035750\tENSCAFP00000031044\tP\t479\tS\t608555\tbenign\tcfa04142=Lysosome\n-Contig18_chr9_58576258_58576773\t215\tchr9\t58576474\tENSCAFT00000035914\tENSCAFP00000031224\tK\t118\tE\t480706\tbenign\tN\n-Contig8_chr15_38734005_38734403\t242\tchr15\t38734244\tENSCAFT00000035916\tENSCAFP00000031226\tA\t237\tV\t611996\tpossibly damaging\tN\n-Contig76_chr3_30625909_30626247\t159\tchr3\t30626069\tENSCAFT00000036198\tENSCAFP00000031549\tT\t135\tS\t479171\tbenign\tcfa00260=Glycine, serine and threonine metabolism.cfa00270=Cysteine and methionine metabolism.cfa01100=Metabolic pathways\n-Contig86_chr37_14528768_14530343\t873\tchr37\t14529628\tENSCAFT00000036570\tENSCAFP00000031969\tV\t738\tD\t478875.609202\tpossibly damaging\tcfa04060=Cytokine-cytokine receptor interaction.cfa04350=TGF-beta signaling 
pathway\n-Contig9_chr5_54124181_54125739\t1134\tchr5\t54125291\tENSCAFT00000036640\tENSCAFP00000032043\tA\t187\tT\t610286\tbenign\tN\n-Contig107_chr9_8990420_8991676\t1178\tchr9\t8991591\tENSCAFT00000036774\tENSCAFP00000032186\tT\t55\tM\t483288\tbenign\tN\n-Contig47_chr12_20319418_20320775\t1212\tchr12\t20320622\tENSCAFT00000036825\tENSCAFP00000032241\tK\t606\tT\t474930\tbenign\tcfa00280=Valine, leucine and isoleucine degradation.cfa00630=Glyoxylate and dicarboxylate metabolism.cfa00640=Propanoate metabolism.cfa01100=Metabolic pathways\n-Contig4_chr2_45195542_45196115\t233\tchr2\t45195785\tENSCAFT00000037022\tENSCAFP00000032463\tD\t833\tN\t478055\tpossibly damaging\tN\n-Contig8_chr8_77227029_77227651\t339\tchr8\t77227366\tENSCAFT00000037096\tENSCAFP00000032544\tT\t61\tA\t490895.612602\tbenign\tcfa04020=Calcium signaling pathway.cfa04145=Phagosome.cfa04640=Hematopoietic cell lineage.cfa04650=Natural killer cell mediated cytotoxicity.cfa04662=B cell receptor signaling pathway.cfa04664=Fc epsilon RI signaling pathway.cfa04666=Fc gamma R-mediated phagocytosis.cfa04672=Intestinal immune network for IgA production.cfa05140=Leishmaniasis.cfa05143=African trypanosomiasis.cfa05146=Amoebiasis.cfa05150=Staphylococcus aureus infection.cfa05152=Tuberculosis.cfa05162=Measles.cfa05310=Asthma.cfa05320=Autoimmune thyroid disease.cfa05322=Systemic lupus erythematosus.cfa05323=Rheumatoid arthritis.cfa05330=Allograft rejection.cfa05340=Primary immunodeficiency.cfa05414=Dilated cardiomyopathy.cfa05416=Viral myocarditis\n-Contig2_chr7_60049092_60051693\t266\tchr7\t60049361\tENSCAFT00000038176\tENSCAFP00000033857\tT\t195\tM\tU\tprobably damaging\tN\n-Contig31_chr30_24179816_24187402\t4867\tchr30\t24184686\tENSCAFT00000038211\tENSCAFP00000033897\tG\t103\tS\tU\tbenign\tN\n-Contig9_chr27_48250956_48251793\t192\tchr27\t48251161\tENSCAFT00000038256\tENSCAFP00000033944\tT\t166\tM\t477739\tprobably 
damaging\tN\n-Contig45_chr27_43537046_43537944\t568\tchr27\t43537599\tENSCAFT00000038301\tENSCAFP00000033996\tM\t69\tI\t611773\tbenign\tcfa04010=MAPK signaling pathway.cfa04810=Regulation of actin cytoskeleton.cfa05200=Pathways in cancer.cfa05218=Melanoma\n-Contig133_chr18_28371600_28372547\t83\tchr18\t28371695\tENSCAFT00000038383\tENSCAFP00000034090\tL\t102\tQ\t475933\tprobably damaging\tN\n-Contig11_chr28_8532951_8533892\t511\tchr28\t8533462\tENSCAFT00000038937\tENSCAFP00000034728\tR\t19\tC\t477763\tprobably damaging\tcfa03008=Ribosome biogenesis in eukaryotes.cfa03013=RNA transport\n-Contig1_chr14_5733966_5735336\t783\tchr14\t5734754\tENSCAFT00000039094\tENSCAFP00000034905\tA\t166\tT\tU\tbenign\tN\n-Contig48_chr27_6001075_6001818\t392\tchr27\t6001478\tENSCAFT00000039109\tENSCAFP00000034919\tR\t103\tH\tU\tprobably damaging\tN\n-Contig40_chr11_43589173_43590288\t973\tchr11\t43590138\tENSCAFT00000039148\tENSCAFP00000034962\tR\t1617\tP\t481557\tbenign\tN\n-Contig1_chr14_30424688_30425258\t179\tchr14\t30424861\tENSCAFT00000039390\tENSCAFP00000035239\tT\t648\tI\t475245\tbenign\tcfa04666=Fc gamma R-mediated phagocytosis.cfa04810=Regulation of actin cytoskeleton\n-Contig58_chr8_7461111_7462065\t323\tchr8\t7461423\tENSCAFT00000039451\tENSCAFP00000035309\tL\t112\tF\tU\tbenign\tN\n-Contig1_chr25_43094809_43095852\t908\tchr25\t43095708\tENSCAFT00000039609\tENSCAFP00000035483\tW\t18\tG\tU\tunknown\tN\n-Contig114_chr25_43076436_43076800\t141\tchr25\t43076581\tENSCAFT00000039609\tENSCAFP00000035483\tS\t45\tC\tU\tunknown\tN\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_in/sample.gd_snp
--- a/test-data/test_in/sample.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,402 +0,0 @@\n-#{"column_names":["scaf","pos","A","B","qual","ref","rpos","rnuc","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","pair","dist",\n-#"prim","rflp"],"dbkey":"canFam2","individuals":[["PB1",9],["PB2",13],["PB3",17],["PB4",21],["PB6",25],["PB8",29]],"pos":2,"rPos":7,"ref":6,"scaffold":1,"species":"bear"}\n-Contig161_chr1_4641264_4641879\t115\tC\tT\t73.5\tchr1\t4641382\tC\t6\t0\t2\t45\t8\t0\t2\t51\t15\t0\t2\t72\t5\t0\t2\t42\t6\t0\t2\t45\t10\t0\t2\t57\tY\t54\t0.323\t0\n-Contig48_chr1_10150253_10151311\t11\tA\tG\t94.3\tchr1\t10150264\tA\t1\t0\t2\t30\t1\t0\t2\t30\t1\t0\t2\t30\t3\t0\t2\t36\t1\t0\t2\t30\t1\t0\t2\t30\tY\t22\t+99.\t0\n-Contig20_chr1_21313469_21313570\t66\tC\tT\t54.0\tchr1\t21313534\tC\t4\t0\t2\t39\t4\t0\t2\t39\t5\t0\t2\t42\t4\t0\t2\t39\t4\t0\t2\t39\t5\t0\t2\t42\tN\t1\t+99.\t0\n-Contig86_chr1_30984450_30985684\t670\tC\tT\t365.0\tchr1\t30985133\tC\t9\t0\t2\t54\t10\t0\t2\t57\t13\t0\t2\t66\t3\t0\t2\t36\t9\t0\t2\t54\t7\t0\t2\t48\tY\t145\t0.031\t0\n-Contig5_chr1_32562160_32563940\t1215\tG\tT\t163.0\tchr1\t32563356\tG\t17\t0\t2\t78\t19\t0\t2\t84\t20\t0\t2\t87\t14\t0\t2\t69\t12\t0\t2\t63\t10\t0\t2\t57\tY\t17\t0.251\t0\n-Contig110_chr1_33385093_33386888\t510\tC\tT\t270.0\tchr1\t33385587\tA\t14\t0\t2\t69\t11\t0\t2\t60\t19\t0\t2\t84\t11\t0\t2\t60\t10\t0\t2\t57\t13\t0\t2\t66\tY\t13\t0.126\t0\n-Contig100_chr1_33562920_33564288\t743\tC\tT\t178.0\tchr1\t33563655\tC\t6\t0\t2\t45\t10\t0\t2\t57\t8\t0\t2\t51\t5\t0\t2\t42\t13\t0\t2\t66\t7\t0\t2\t48\tY\t13\t0.090\t3\n-Contig7_chr1_37302355_37302489\t97\tA\tG\t59.2\tchr1\t37302452\tG\t3\t0\t2\t36\t8\t0\t2\t51\t5\t0\t2\t42\t8\t0\t2\t51\t7\t0\t2\t48\t6\t0\t2\t45\tN\t56\t2.812\t0\n-Contig62_chr1_41880715_41882180\t1078\tT\tG\t57.6\tchr1\t41881785\tT\t14\t0\t2\t69\t15\t0\t2\t72\t16\t0\t2\t75\t13\t0\t2\t66\t8\t0\t2\t51\t10\t0\t2\t57\tY\t21\t0.477\t0\n-Contig47_chr1_48409178_48409384\t37\tC\tT\t134.0\tchr1\t48409215\tT\t5\t0\t2\t42\t6\t0\t2\t45\t
8\t0\t2\t51\t9\t0\t2\t54\t4\t0\t2\t39\t6\t0\t2\t45\tN\t66\t+99.\t0\n-Contig119_chr1_49647683_49650077\t1618\tC\tA\t99.7\tchr1\t49649276\tA\t8\t0\t2\t51\t11\t0\t2\t60\t10\t0\t2\t57\t9\t0\t2\t54\t10\t0\t2\t57\t14\t0\t2\t69\tY\t16\t0.166\t0\n-Contig21_chr1_60697952_60699446\t307\tG\tA\t51.9\tchr1\t60698265\tG\t12\t0\t2\t63\t9\t0\t2\t54\t4\t0\t2\t39\t6\t0\t2\t45\t9\t0\t2\t54\t4\t0\t2\t39\tY\t98\t0.507\t0\n-Contig131_chr1_62319542_62320564\t169\tC\tG\t103.0\tchr1\t62319709\tC\t12\t0\t2\t63\t12\t0\t2\t66\t14\t0\t2\t69\t12\t0\t2\t63\t9\t0\t2\t54\t9\t0\t2\t54\tY\t73\t0.307\t1\n-Contig14_chr1_63450425_63450680\t101\tT\tA\t102.0\tchr1\t63450530\tT\t8\t0\t2\t51\t10\t0\t2\t57\t18\t0\t2\t81\t8\t0\t2\t51\t8\t0\t2\t34\t8\t0\t2\t51\tN\t99\t1.085\t0\n-Contig83_chr1_63869778_63869942\t40\tT\tC\t23.7\tchr1\t63869819\tC\t5\t0\t2\t42\t7\t0\t2\t48\t2\t0\t2\t33\t4\t0\t2\t39\t6\t0\t2\t48\t4\t0\t2\t39\tN\t654\t1.364\t0\n-Contig30_chr1_64702572_64703138\t178\tA\tT\t117.0\tchr1\t64702750\tT\t10\t0\t2\t57\t10\t0\t2\t57\t20\t0\t2\t87\t21\t0\t2\t90\t6\t0\t2\t45\t12\t0\t2\t63\tY\t50\t3.872\t0\n-Contig101_chr1_69868406_69868872\t287\tG\tA\t14.6\tchr1\t69868689\tG\t13\t0\t2\t66\t17\t0\t2\t78\t10\t0\t2\t57\t8\t0\t2\t51\t7\t0\t2\t48\t8\t0\t2\t51\tN\t137\t0.305\t0\n-Contig35_chr1_74482577_74482791\t170\tG\tA\t45.4\tchr1\t74482751\tA\t3\t0\t2\t36\t4\t0\t2\t39\t13\t0\t2\t66\t2\t0\t2\t33\t5\t0\t2\t42\t2\t0\t2\t33\tN\t20\t+99.\t3\n-Contig49_chr1_83865731_83865944\t85\tG\tA\t34.1\tchr1\t-1\tN\t4\t0\t2\t39\t4\t0\t2\t39\t8\t0\t2\t51\t2\t0\t2\t33\t5\t0\t2\t42\t4\t0\t2\t39\tN\t-1\t1.485\t0\n-Contig64_chr1_87343284_87345672\t163\tT\tA\t3.76\tchr1\t87343443\tC\t0\t2\t2\t1\t0\t0\t-1\t0\t5\t0\t2\t42\t2\t0\t2\t33\t0\t1\t2\t14\t0\t0\t-1\t0\tN\t3\t0.039\t2\n-Contig20_chr1_110679280_110679687\t181\tC\tT\t87.4\tchr1\t110679454\t-\t1\t0\t2\t30\t7\t0\t2\t48\t4\t0\t2\t39\t2\t0\t2\t33\t2\t0\t2\t33\t0\t0\t-1\t0\tN\t31\t0.660\t2\n-Contig129_chr1_117547123_117548666\t926\tG\tA\t126.0\tchr1\t117548059\tG\t19\t0\t2\t84\t9\t0\t2
\t54\t11\t0\t2\t60\t10\t0\t2\t57\t12\t0\t2\t63\t11\t0\t2\t60\tY\t64\t0.049\t0\n-Contig7_chr1_125154638_125154844\t190\tG\tT\t130.0\tchr1\t125154818\tA\t5\t0\t2\t42\t4\t0\t2\t39\t7\t0\t2\t48\t2\t0\t2\t33\t7\t0\t2\t48\t4\t0\t2\t39\tN\t33\t+99.\t0\n-Contig222_chr2_9817738_9818143\t220\tC\tT\t888.0\tchr2\t9817960\tC\t17\t0\t2\t78\t12\t0\t2\t63\t20\t0\t2\t87\t8\t0\t2\t51\t11\t0\t2\t60\t12\t0\t2\t63\tY\t76\t0.093\t1\n-Contig47_chr2_25470778_25471576\t126\tG\tA\t888.0\tchr2\t25470896\tG\t12\t0\t2\t63\t14\t0\t2\t69\t14\t0\t2\t69\t10\t0\t2\t57\t18\t0\t2\t81\t13\t0\t2\t66\tN\t11\t0.289\t1\n-Contig10_chr2_40859744_40860534\t637\tG\tA\t888.0\tchr2\t40860397\tA\t3\t0\t2\t36\t3\t0\t2\t36\t2\t0\t2\t33\t7\t0\t2\t48\t6\t0\t2\t45\t8\t0\t2\t51\tY\t42\t1.435\t0\n-Contig52_chr2_41421981_41422725\t604\tC\tA\t888.0\tchr2\t41422583\tA\t17\t0\t2\t78\t18\t0\t2\t81\t14\t0\t2\t69\t17\t0\t2\t78\t12\t0\t2\t63\t14\t0\t2\t69\tY\t44\t0.882\t0\n-Contig94_chr2_43869105_43870358\t220\tG\tA\t888.0\tchr2'..b'\t0\n-Contig5_chr36_4562983_4563634\t343\tC\tT\t151.0\tchr36\t4563324\tT\t20\t0\t2\t87\t20\t0\t2\t87\t23\t0\t2\t96\t24\t0\t2\t99\t9\t0\t2\t54\t8\t0\t2\t51\tY\t40\t1.169\t0\n-Contig75_chr36_7885319_7885588\t53\tG\tA\t25.7\tchr36\t7885372\tG\t10\t0\t2\t57\t8\t0\t2\t51\t13\t0\t2\t66\t7\t0\t2\t48\t4\t0\t2\t39\t7\t0\t2\t48\tN\t7\t2.653\t0\n-Contig184_chr36_18956191_18958552\t187\tA\tG\t11.5\tchr36\t18956371\tG\t10\t0\t2\t57\t11\t0\t2\t60\t21\t0\t2\t90\t14\t0\t2\t69\t7\t0\t2\t48\t4\t0\t2\t39\tN\t278\t1.434\t2\n-Contig12_chr36_21557176_21557828\t513\tT\tA\t159.0\tchr36\t21557695\tA\t11\t0\t2\t60\t14\t0\t2\t69\t21\t0\t2\t90\t12\t0\t2\t63\t15\t0\t2\t72\t11\t0\t2\t60\tY\t55\t0.222\t0\n-Contig2_chr36_22436067_22436794\t653\tC\tT\t73.0\tchr36\t22436730\tC\t11\t0\t2\t60\t16\t0\t2\t75\t13\t0\t2\t66\t11\t0\t2\t60\t21\t0\t2\t90\t21\t0\t2\t90\tY\t9\t0.534\t0\n-Contig133_chr36_32954045_32955409\t136\tA\tG\t116.0\tchr36\t32954182\tA\t16\t0\t2\t75\t15\t0\t2\t72\t20\t0\t2\t87\t11\t0\t2\t60\t18\t0\t2\t81\t13\t0\t2\t66\tY
\t74\t3.772\t1\n-Contig53_chr37_6665763_6665919\t116\tC\tT\t111.0\tchr37\t6665875\tC\t9\t0\t2\t54\t9\t0\t2\t54\t5\t0\t2\t42\t9\t0\t2\t54\t8\t0\t2\t51\t10\t0\t2\t57\tN\t15\t10.875\t1\n-Contig42_chr37_9589176_9591269\t252\tG\tA\t25.1\tchr37\t9589430\tG\t10\t0\t2\t40\t13\t0\t2\t66\t18\t0\t2\t81\t21\t0\t2\t90\t9\t0\t2\t54\t17\t0\t2\t78\tN\t67\t1.170\t2\n-Contig2_chr37_17134963_17136513\t1140\tA\tC\t158.0\tchr37\t17136092\tA\t14\t0\t2\t69\t24\t0\t2\t99\t17\t0\t2\t78\t16\t0\t2\t75\t15\t0\t2\t75\t13\t0\t2\t66\tY\t12\t0.053\t1\n-Contig18_chr37_17147806_17149851\t291\tT\tG\t112.0\tchr37\t17148084\tT\t4\t6\t1\t45\t16\t0\t2\t75\t17\t0\t2\t78\t14\t0\t2\t69\t22\t0\t2\t93\t13\t0\t2\t66\tY\t41\t4.442\t0\n-Contig64_chr37_17606895_17607534\t565\tC\tT\t30.2\tchr37\t17607439\tA\t9\t0\t2\t54\t16\t0\t2\t75\t20\t0\t2\t87\t14\t0\t2\t69\t16\t0\t2\t75\t10\t0\t2\t57\tN\t20\t1.622\t0\n-Contig126_chr37_21587881_21590621\t373\tG\tT\t132.0\tchr37\t21588256\tG\t11\t0\t2\t60\t11\t0\t2\t60\t23\t0\t2\t96\t12\t0\t2\t63\t8\t0\t2\t51\t18\t0\t2\t81\tY\t12\t0.549\t0\n-Contig2_chr37_31197993_31198256\t182\tC\tT\t39.6\tchr37\t31198171\tT\t6\t0\t2\t45\t10\t0\t2\t57\t7\t0\t2\t48\t9\t0\t2\t54\t10\t0\t2\t57\t12\t0\t2\t63\tN\t2\t0.595\t0\n-Contig46_chr37_31852376_31853555\t825\tA\tG\t111.0\tchr37\t31853191\tG\t19\t0\t2\t84\t14\t0\t2\t69\t15\t0\t2\t72\t7\t0\t2\t48\t8\t0\t2\t51\t16\t0\t2\t75\tY\t17\t0.128\t1\n-Contig7_chr38_12217200_12218387\t1163\tA\tT\t44.4\tchr38\t12218353\tA\t11\t0\t2\t60\t13\t0\t2\t66\t17\t0\t2\t78\t10\t0\t2\t57\t11\t0\t2\t60\t11\t0\t2\t60\tY\t67\t+99.\t0\n-Contig15_chr38_12282020_12282253\t150\tC\tT\t156.0\tchr38\t12282164\tA\t17\t0\t2\t78\t11\t0\t2\t60\t19\t0\t2\t84\t14\t0\t2\t69\t5\t0\t2\t42\t14\t0\t2\t69\tY\t26\t2.952\t1\n-Contig4_chr38_14807432_14807747\t275\tA\tG\t36.5\tchr38\t14807715\tG\t1\t0\t2\t30\t2\t0\t2\t33\t2\t0\t2\t33\t4\t0\t2\t39\t1\t0\t2\t30\t0\t0\t-1\t0\tY\t28\t+99.\t1\n-Contig6_chr38_16185744_16186110\t325\tA\tG\t74.9\tchr38\t16186061\tA\t5\t0\t2\t42\t3\t0\t2\t36\t9\t0\t2\
t54\t7\t0\t2\t48\t1\t0\t2\t30\t12\t0\t2\t63\tY\t40\t+99.\t0\n-Contig265_chrX_2689247_2689484\t114\tC\tG\t103.0\tchrX\t2689356\tC\t11\t0\t2\t60\t9\t0\t2\t54\t13\t0\t2\t66\t16\t0\t2\t75\t14\t0\t2\t69\t10\t0\t2\t57\tN\t2\t9.232\t1\n-Contig122_chrX_6026976_6027327\t330\tC\tT\t79.4\tchrX\t6027303\tC\t3\t0\t2\t36\t3\t0\t2\t36\t3\t0\t2\t36\t4\t0\t2\t39\t3\t0\t2\t36\t6\t0\t2\t45\tY\t30\t+99.\t0\n-Contig15_chrX_15659909_15660340\t15\tA\tC\t14.9\tchrX\t15659924\tC\t1\t0\t2\t30\t1\t0\t2\t30\t3\t0\t2\t36\t6\t0\t2\t45\t2\t0\t2\t33\t0\t0\t-1\t0\tY\t216\t+99.\t1\n-Contig12_chrX_23243561_23244412\t479\tC\tG\t67.7\tchrX\t23244037\tC\t2\t0\t2\t33\t4\t2\t2\t8\t2\t6\t1\t43\t7\t0\t2\t48\t6\t0\t2\t45\t4\t0\t2\t39\tY\t208\t1.620\t0\n-Contig113_chrX_26287829_26288398\t385\tC\tT\t59.6\tchrX\t26288213\tC\t9\t0\t2\t54\t9\t0\t2\t54\t17\t0\t2\t78\t11\t0\t2\t60\t3\t8\t1\t44\t4\t0\t2\t39\tN\t13\t0.077\t0\n-Contig186_chrX_29118735_29118939\t192\tG\tA\t7.01\tchrX\t29118931\tG\t1\t0\t2\t30\t7\t0\t2\t48\t4\t0\t2\t39\t5\t0\t2\t42\t8\t0\t2\t51\t4\t0\t2\t39\tN\t50\t+99.\t0\n-Contig237_chrX_31256648_31257654\t165\tT\tA\t246.0\tchrX\t31256814\tT\t7\t0\t2\t48\t23\t0\t2\t96\t19\t0\t2\t84\t17\t0\t2\t78\t14\t0\t2\t69\t8\t0\t2\t51\tY\t37\t1.481\t0\n-Contig25_chrX_40729418_40730089\t332\tC\tT\t31.2\tchrX\t40729745\tC\t0\t0\t-1\t0\t2\t0\t2\t33\t4\t0\t2\t39\t5\t0\t2\t42\t3\t0\t2\t36\t3\t0\t2\t36\tY\t34\t0.212\t0\n-Contig90_chrX_57430715_57431566\t548\tC\tT\t116.0\tchrX\t57431266\tT\t9\t0\t2\t54\t18\t0\t2\t81\t13\t0\t2\t66\t14\t0\t2\t69\t8\t0\t2\t54\t7\t0\t2\t48\tY\t261\t0.154\t1\n-Contig133_chrX_84833782_84834125\t182\tG\tA\t69.7\tchrX\t84833962\tG\t5\t0\t2\t42\t18\t0\t2\t81\t12\t0\t2\t63\t19\t0\t2\t84\t6\t3\t1\t27\t7\t0\t2\t48\tN\t619\t0.278\t0\n-Contig129_chrX_90586053_90586467\t135\tA\tT\t120.0\tchrX\t90586195\tA\t1\t0\t2\t30\t6\t0\t2\t45\t8\t0\t2\t51\t5\t0\t2\t42\t1\t0\t2\t30\t2\t0\t2\t33\tN\t637\t0.245\t0\n-Contig125_chrX_93319363_93320877\t349\tA\tC\t145.0\tchrX\t93319721\tA\t4\t0\t2\t39\t6\t0\t2\t45\t11\t0
\t2\t60\t10\t0\t2\t57\t13\t0\t2\t66\t6\t0\t2\t45\tY\t59\t1.686\t0\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/add_fst_column/add_fst_column.gd_snp
--- a/test-data/test_out/add_fst_column/add_fst_column.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,21 +0,0 @@
-Contig113_chr5_11052263_11052603 28 C T 38.2 chr5 11052280 C 1 2 1 12 3 2 1 10 5 0 2 42 2 1 2 13 3 0 2 36 8 0 2 51 Y 161 +99. 0 0.1636
-Contig215_chr5_70946445_70947428 363 T G 28.2 chr5 70946809 C 4 0 2 39 0 5 0 12 9 0 2 54 6 0 2 45 3 3 2 1 9 0 2 54 N 43 0.153 0 0.3846
-Contig132_chr7_20426224_20428145 1815 A G 28.3 chr7 20428041 A 11 1 2 43 12 0 2 63 19 0 2 84 23 0 2 96 14 0 2 69 10 0 2 57 N 11 0.264 0 0.0213
-Contig30_chr8_17147743_17147923 13 G A 105.0 chr8 17147756 A 1 3 1 19 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 3 0 2 36 N 6 +99. 0 0.4286
-Contig44_chr8_71186368_71188207 1455 G T 147.0 chr8 71187818 G 4 10 1 74 3 0 2 36 20 0 2 87 12 0 2 63 8 0 2 51 10 0 2 57 Y 88 0.036 0 0.4167
-Contig103_chr11_8844784_8845095 214 T G 135.0 chr11 8844993 T 1 1 2 12 10 0 2 57 5 4 1 26 2 3 1 13 2 7 1 34 1 1 2 13 Y 75 0.731 0 0.2101
-Contig37_chr13_15910164_15910426 245 G A 32.9 chr13 -1 N 3 4 1 41 4 0 2 39 3 0 2 36 4 0 2 39 3 0 2 36 10 0 2 57 N -1 2.159 1 0.2222
-Contig50_chr17_12247973_12249183 889 G T 47.6 chr17 12248878 G 0 1 2 9 8 0 2 51 9 2 2 21 7 2 2 21 15 0 2 72 0 3 0 9 Y 1 1.181 0 0.0150
-Contig159_chr22_7896450_7896974 109 G C 151.0 chr22 7896570 G 16 0 2 75 5 7 1 62 14 0 2 69 16 0 2 75 13 0 2 66 13 0 2 66 Y 16 0.465 0 0.1429
-Contig77_chr22_49764414_49764875 353 C A 148.0 chr22 49764777 C 7 4 1 65 18 0 2 81 16 0 2 75 20 0 2 87 4 3 1 52 9 4 1 67 Y 12 0.941 0 0.0741
-Contig61_chr24_30465488_30465834 149 G T 68.2 chr24 30465637 G 13 0 2 66 4 2 2 11 18 0 2 81 11 0 2 60 11 0 2 60 9 0 2 54 N 99 0.105 2 0.0556
-Contig59_chr25_18196776_18197707 785 G A 112.0 chr25 18197551 G 8 10 1 42 27 0 2 108 21 0 2 90 18 0 2 81 10 0 2 57 14 0 2 69 N 36 3.625 0 0.1250
-Contig85_chr27_45471750_45472022 211 G A 53.1 chr27 45471964 G 18 0 2 81 10 0 2 57 15 0 2 72 0 13 0 36 16 0 2 75 14 0 2 69 N 75 2.502 1 0.3023
-Contig175_chr28_36441165_36441915 68 T C 3.83 chr28 36441234 T 4 4 1 15 6 0 2 45 12 0 2 63 15 0 2 72 6 0 2 45 9 0 2 54 N 4 1.610 2 0.1667
-Contig114_chr30_33636712_33637208 34 C T 142.0 chr30 33636744 C 7 0 2 48 4 1 2 20 6 0 2 45 6 0 2 45 3 4 1 29 5 0 2 42 Y 14 8.028 0 0.0435
-Contig42_chr32_38900713_38901320 320 A G 134.0 chr32 38901021 T 12 0 2 63 10 0 2 57 9 11 1 104 5 0 2 42 19 0 2 84 7 6 1 56 Y 71 0.165 0 0.2821
-Contig41_chr34_16544482_16545449 46 T C 102.0 chr34 16544523 T 5 0 2 42 11 0 2 60 6 0 2 45 0 2 0 3 7 0 2 48 8 0 2 51 Y 215 1.156 0 0.1429
-Contig19_chr35_23887144_23888282 90 C A 10.1 chr35 23887242 - 3 3 1 12 4 4 1 19 8 6 1 37 4 3 1 11 8 3 2 7 9 3 2 11 Y 105 0.199 0 0.0051
-Contig74_chr35_25394343_25394813 303 A T 221.0 chr35 25394646 G 23 0 2 96 15 0 2 72 25 0 2 105 7 7 1 49 18 0 2 81 16 0 2 75 Y 58 4.298 0 0.0986
-Contig18_chr37_17147806_17149851 291 T G 112.0 chr37 17148084 T 4 6 1 45 16 0 2 75 17 0 2 78 14 0 2 69 22 0 2 93 13 0 2 66 Y 41 4.442 0 0.1304
-Contig12_chrX_23243561_23244412 479 C G 67.7 chrX 23244037 C 2 0 2 33 4 2 2 8 2 6 1 43 7 0 2 48 6 0 2 45 4 0 2 39 Y 208 1.620 0 0.0256

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/average_fst/average_fst.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/average_fst/average_fst.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,1 @@
+average Fst is 0.16461, using 21 SNPs

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/coverage_distributions/coverage.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/coverage_distributions/coverage.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,39 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>Coverage distributions Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 01:57:24 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="coverage.pdf">coverage.pdf</a></li>
+            <li><a href="coverage.txt">coverage.txt</a></li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li>Data source: sequence coverage</li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Individuals
+<ol>
+<li>PB1</li>
+<li>PB2</li>
+<li>PB3</li>
+<li>PB4</li>
+<li>PB6</li>
+<li>PB8</li>
+</ol>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/coverage_distributions/coverage.pdf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/coverage_distributions/coverage.pdf Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,363 @@
+%PDF-1.4
+%��ρ�\r
+1 0 obj
+<<
+/CreationDate (D:20120403135724)
+/ModDate (D:20120403135724)
+/Title (R Graphics Output)
+/Producer (R 2.11.0)
+/Creator (R)
+>>
+endobj
+2 0 obj
+<<
+/Type /Catalog
+/Pages 3 0 R
+>>
+endobj
+5 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Contents 6 0 R
+/Resources 4 0 R
+>>
+endobj
+6 0 obj
+<<
+/Length 7 0 R
+>>
+stream
+1 J 1 j q
+Q q 59.04 73.44 630.72 299.52 re W n
+1.000 0.000 0.000 RG
+2.25 w
+[] 0 d
+1 J
+1 j
+10.00 M
+82.40 174.26 m
+106.73 206.89 l
+131.07 206.89 l
+155.40 263.98 l
+179.73 263.98 l
+204.07 223.20 l
+228.40 312.93 l
+252.73 304.77 l
+277.07 255.83 l
+301.40 280.30 l
+325.73 312.93 l
+350.07 321.08 l
+374.40 255.83 l
+398.73 263.98 l
+423.07 231.36 l
+447.40 231.36 l
+471.73 174.26 l
+496.07 215.04 l
+520.40 174.26 l
+544.73 133.47 l
+569.07 157.95 l
+593.40 109.00 l
+617.73 109.00 l
+642.07 92.69 l
+666.40 84.53 l
+S
+Q q
+0.000 0.000 0.000 RG
+0.75 w
+[] 0 d
+1 J
+1 j
+10.00 M
+82.40 73.44 m 569.07 73.44 l S
+82.40 73.44 m 82.40 66.24 l S
+204.07 73.44 m 204.07 66.24 l S
+325.73 73.44 m 325.73 66.24 l S
+447.40 73.44 m 447.40 66.24 l S
+569.07 73.44 m 569.07 66.24 l S
+BT
+0.000 0.000 0.000 rg
+/F2 1 Tf 12.00 0.00 -0.00 12.00 79.06 47.52 Tm (0) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 200.73 47.52 Tm (5) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 319.06 47.52 Tm (10) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 440.73 47.52 Tm (15) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 562.39 47.52 Tm (20) Tj
+ET
+59.04 84.53 m 59.04 345.55 l S
+59.04 84.53 m 51.84 84.53 l S
+59.04 149.79 m 51.84 149.79 l S
+59.04 215.04 m 51.84 215.04 l S
+59.04 280.30 m 51.84 280.30 l S
+59.04 345.55 m 51.84 345.55 l S
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 72.86 Tm (0.00) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 138.11 Tm (0.02) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 203.37 Tm (0.04) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 268.62 Tm (0.06) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 333.88 Tm (0.08) Tj
+ET
+59.04 73.44 m
+689.76 73.44 l
+689.76 372.96 l
+59.04 372.96 l
+59.04 73.44 l
+S
+Q q
+BT
+0.000 0.000 0.000 rg
+/F2 1 Tf 12.00 0.00 -0.00 12.00 348.69 18.72 Tm [(Co) 15 (v) 25 (er) 10 (age)] TJ
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 12.96 195.28 Tm [(Propor) -40 (tion)] TJ
+ET
+Q q 59.04 73.44 630.72 299.52 re W n
+1.000 1.000 0.000 RG
+2.25 w
+[] 0 d
+1 J
+1 j
+10.00 M
+82.40 157.95 m
+106.73 166.10 l
+131.07 231.36 l
+155.40 215.04 l
+179.73 280.30 l
+204.07 263.98 l
+228.40 272.14 l
+252.73 231.36 l
+277.07 345.55 l
+301.40 321.08 l
+325.73 288.45 l
+350.07 329.24 l
+374.40 255.83 l
+398.73 280.30 l
+423.07 247.67 l
+447.40 239.51 l
+471.73 215.04 l
+496.07 157.95 l
+520.40 174.26 l
+544.73 166.10 l
+569.07 133.47 l
+593.40 92.69 l
+617.73 100.85 l
+642.07 100.85 l
+666.40 100.85 l
+S
+0.000 1.000 0.000 RG
+82.40 141.63 m
+106.73 166.10 l
+131.07 182.42 l
+155.40 182.42 l
+179.73 231.36 l
+204.07 198.73 l
+228.40 206.89 l
+252.73 263.98 l
+277.07 263.98 l
+301.40 263.98 l
+325.73 239.51 l
+350.07 280.30 l
+374.40 198.73 l
+398.73 304.77 l
+423.07 231.36 l
+447.40 247.67 l
+471.73 239.51 l
+496.07 239.51 l
+520.40 215.04 l
+544.73 198.73 l
+569.07 231.36 l
+593.40 149.79 l
+617.73 166.10 l
+642.07 166.10 l
+666.40 100.85 l
+S
+0.000 1.000 1.000 RG
+82.40 133.47 m
+106.73 133.47 l
+131.07 255.83 l
+155.40 231.36 l
+179.73 272.14 l
+204.07 272.14 l
+228.40 337.40 l
+252.73 280.30 l
+277.07 280.30 l
+301.40 280.30 l
+325.73 337.40 l
+350.07 288.45 l
+374.40 296.61 l
+398.73 223.20 l
+423.07 272.14 l
+447.40 255.83 l
+471.73 239.51 l
+496.07 190.57 l
+520.40 117.16 l
+544.73 125.32 l
+569.07 149.79 l
+593.40 109.00 l
+617.73 109.00 l
+642.07 92.69 l
+666.40 92.69 l
+S
+0.000 0.000 1.000 RG
+82.40 157.95 m
+106.73 190.57 l
+131.07 215.04 l
+155.40 288.45 l
+179.73 231.36 l
+204.07 272.14 l
+228.40 272.14 l
+252.73 280.30 l
+277.07 296.61 l
+301.40 361.87 l
+325.73 329.24 l
+350.07 329.24 l
+374.40 296.61 l
+398.73 272.14 l
+423.07 215.04 l
+447.40 239.51 l
+471.73 190.57 l
+496.07 157.95 l
+520.40 166.10 l
+544.73 125.32 l
+569.07 100.85 l
+593.40 92.69 l
+617.73 109.00 l
+642.07 84.53 l
+666.40 92.69 l
+S
+1.000 0.000 1.000 RG
+82.40 198.73 m
+106.73 157.95 l
+131.07 215.04 l
+155.40 215.04 l
+179.73 304.77 l
+204.07 223.20 l
+228.40 321.08 l
+252.73 361.87 l
+277.07 280.30 l
+301.40 280.30 l
+325.73 329.24 l
+350.07 280.30 l
+374.40 337.40 l
+398.73 231.36 l
+423.07 272.14 l
+447.40 223.20 l
+471.73 174.26 l
+496.07 198.73 l
+520.40 149.79 l
+544.73 117.16 l
+569.07 100.85 l
+593.40 109.00 l
+617.73 100.85 l
+642.07 84.53 l
+666.40 100.85 l
+S
+1.000 0.000 0.000 rg
+0.000 0.000 0.000 RG
+0.75 w
+[] 0 d
+642.24 362.16 8.64 -7.20 re B
+1.000 1.000 0.000 rg
+642.24 347.76 8.64 -7.20 re B
+0.000 1.000 0.000 rg
+642.24 333.36 8.64 -7.20 re B
+0.000 1.000 1.000 rg
+642.24 318.96 8.64 -7.20 re B
+0.000 0.000 1.000 rg
+642.24 304.56 8.64 -7.20 re B
+1.000 0.000 1.000 rg
+642.24 290.16 8.64 -7.20 re B
+BT
+0.000 0.000 0.000 rg
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 354.25 Tm (PB1) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 339.85 Tm (PB2) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 325.45 Tm (PB3) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 311.05 Tm (PB4) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 296.65 Tm (PB6) Tj
+ET
+BT
+/F2 1 Tf 12.00 0.00 -0.00 12.00 661.68 282.25 Tm (PB8) Tj
+ET
+Q
+endstream
+endobj
+7 0 obj
+4763
+endobj
+3 0 obj
+<<
+/Type /Pages
+/Kids [
+5 0 R
+]
+/Count 1
+/MediaBox [0 0 720 432]
+>>
+endobj
+4 0 obj
+<<
+/ProcSet [/PDF /Text]
+/Font <</F2 9 0 R >>
+/ExtGState << >>
+>>
+endobj
+8 0 obj
+<<
+/Type /Encoding
+/BaseEncoding /WinAnsiEncoding
+/Differences [ 45/minus 96/quoteleft
+144/dotlessi /grave /acute /circumflex /tilde /macron /breve /dotaccent
+/dieresis /.notdef /ring /cedilla /.notdef /hungarumlaut /ogonek /caron /space]
+>>
+endobj
+9 0 obj <<
+/Type /Font
+/Subtype /Type1
+/Name /F2
+/BaseFont /Helvetica
+/Encoding 8 0 R
+>> endobj
+xref
+0 10
+0000000000 65535 f
+0000000021 00000 n
+0000000164 00000 n
+0000005129 00000 n
+0000005212 00000 n
+0000000213 00000 n
+0000000293 00000 n
+0000005109 00000 n
+0000005293 00000 n
+0000005550 00000 n
+trailer
+<<
+/Size 10
+/Info 1 0 R
+/Root 2 0 R
+>>
+startxref
+5646
+%%EOF

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/coverage_distributions/coverage.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/coverage_distributions/coverage.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,18 @@
+
+            0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19
+       PB1  2  6 10 15 21 25 32 39 44 50 57 64 70 75 80 84 87 91 94 95
+       PB2  2  4  9 13 19 24 30 35 43 50 56 64 69 75 80 85 89 91 94 96
+       PB3  1  4  7 10 14 18 22 27 33 38 43 49 52 59 64 69 73 78 82 86
+       PB4  1  3  8 12 18 24 32 38 44 50 57 64 70 74 80 85 90 93 94 96
+       PB6  2  5  9 15 20 26 31 37 44 52 60 67 74 80 84 88 92 94 96 98
+       PB8  3  5  9 13 20 24 32 40 46 52 60 66 73 78 84 88 91 94 96 97
+
+
+           20 21 22 23 24
+       PB1 97 98 99 99 99
+       PB2 98 98 98 99 99
+       PB3 90 92 95 97 98
+       PB4 98 98 99 99 99
+       PB6 98 98 99 99 99
+       PB8 98 98 99 99 99
+

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/dpmix/dpmix.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/dpmix/dpmix.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,56 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>dpmix Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 02:22:23 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="dpmix.pdf">dpmix.pdf</a></li>
+            <li><a href="misc.txt">misc.txt</a></li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li>Data source: sequence coverage</li>
+            <li>Switch penalty: 10</li>
+            <li>Also analyze random chromosome: no</li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Populations
+<ul>
+<li>
+Ancestral population 1
+<ol>
+<li>PB1</li>
+<li>PB2</li>
+</ol>
+</li>
+<li>
+Ancestral population 2
+<ol>
+<li>PB3</li>
+<li>PB4</li>
+</ol>
+</li>
+<li>
+Potentially admixed
+<ol>
+<li>PB6</li>
+<li>PB8</li>
+</ol>
+</li>
+</ul>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/dpmix/dpmix.pdf

Binary file test-data/test_out/dpmix/dpmix.pdf has changed

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/dpmix/dpmix.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/dpmix/dpmix.tabular Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,78 @@
+chr1 0 125154818 0 PB6
+chr1 0 125154818 0 PB8
+chr2 0 85243509 0 PB6
+chr2 0 85243509 0 PB8
+chr3 0 92410450 0 PB6
+chr3 0 92410450 0 PB8
+chr4 0 75619257 0 PB6
+chr4 0 75619257 0 PB8
+chr5 0 90203461 0 PB6
+chr5 0 90203461 0 PB8
+chr6 0 74848993 0 PB6
+chr6 0 74848993 0 PB8
+chr7 0 55833450 0 PB6
+chr7 0 55833450 0 PB8
+chr8 0 71187818 0 PB6
+chr8 0 71187818 0 PB8
+chr9 0 39008708 0 PB6
+chr9 0 39008708 0 PB8
+chr10 0 59511126 0 PB6
+chr10 0 59511126 0 PB8
+chr11 0 53408638 0 PB6
+chr11 0 53408638 2 PB8
+chr12 0 71364712 0 PB6
+chr12 0 71364712 0 PB8
+chr13 0 66022136 0 PB6
+chr13 0 66022136 0 PB8
+chr14 0 56768832 0 PB6
+chr14 0 56768832 0 PB8
+chr15 0 45107015 0 PB6
+chr15 0 45107015 0 PB8
+chr16 0 49888550 0 PB6
+chr16 0 49888550 0 PB8
+chr17 0 61714821 2 PB6
+chr17 0 61714821 0 PB8
+chr18 0 58130413 0 PB6
+chr18 0 58130413 0 PB8
+chr19 0 56559549 0 PB6
+chr19 0 56559549 0 PB8
+chr20 0 46551277 0 PB6
+chr20 0 46551277 0 PB8
+chr21 0 43475551 0 PB6
+chr21 0 43475551 0 PB8
+chr22 0 62406302 0 PB6
+chr22 0 62406302 0 PB8
+chr23 0 48285470 0 PB6
+chr23 0 48285470 0 PB8
+chr24 0 46598214 0 PB6
+chr24 0 46598214 0 PB8
+chr25 0 51074589 0 PB6
+chr25 0 51074589 0 PB8
+chr26 0 36606979 0 PB6
+chr26 0 36606979 0 PB8
+chr27 0 45471964 2 PB6
+chr27 0 45471964 2 PB8
+chr28 0 36441234 0 PB6
+chr28 0 36441234 0 PB8
+chr29 0 21150118 0 PB6
+chr29 0 21150118 0 PB8
+chr30 0 33636744 2 PB6
+chr30 0 33636744 0 PB8
+chr31 0 26434322 0 PB6
+chr31 0 26434322 0 PB8
+chr32 0 38901021 2 PB6
+chr32 0 38901021 0 PB8
+chr33 0 26189703 0 PB6
+chr33 0 26189703 0 PB8
+chr34 0 42800126 2 PB6
+chr34 0 42800126 2 PB8
+chr35 0 25394646 2 PB6
+chr35 0 25394646 2 PB8
+chr36 0 32954182 0 PB6
+chr36 0 32954182 0 PB8
+chr37 0 31853191 0 PB6
+chr37 0 31853191 0 PB8
+chr38 0 16186061 0 PB6
+chr38 0 16186061 0 PB8
+chrX 0 93319721 2 PB6
+chrX 0 93319721 2 PB8

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/dpmix/misc.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/dpmix/misc.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,11 @@
+state 2 agrees with: PB1 PB2
+state 0 agrees with: PB3 PB4
+
+PB6: 360 SNPs where state 2 is as likely as state 0
+PB6: 12 SNPs where state 0 is more likely than state 2
+
+PB8: 358 SNPs where state 2 is as likely as state 0
+PB8: 14 SNPs where state 0 is more likely than state 2
+
+PB6: 0 = 83.7%, 1 = 0.0%, 2 = 16.3%
+PB8: 0 = 87.6%, 1 = 0.0%, 2 = 12.4%

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/evaluate_population_numbers/evaluate_population_numbers.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/evaluate_population_numbers/evaluate_population_numbers.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,2 @@
+CV error (K=1): 0.07423
+CV error (K=2): 0.07708

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/extract_primers/extract_primers.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/extract_primers/extract_primers.txt Wed Sep 12 17:10:26 2012 -0400

b'@@ -0,0 +1,1265 @@\n+> Contig161_chr1_4641264_4641879 115 C T 0.323016\n+\n+ 1 TCCGAACCGCTAAATCCTGACGACTGTTCAGTGAGAACGGGnTTCCAGCTCAGTGGAGAC\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ACTCAGAGCTTATGTGATGCACCGTCGTGCCCGTGTCTGACTAAATGTGTTGCCAGAGAA\n+ <<<<\n+\n+121 CAAAACGAAAGCCCCTATT\n+ <<<<<<<<<<<<<<<< \n+\n+> Contig86_chr1_30984450_30985684 670 C T 0.031427\n+\n+ 1 TAATTCATGACGACTGCAGAAGGGCACTCAGAGGCAATTCTACTTGAGGATATTGTCTGG\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 TATACTCTGTCCTTGCTCAGGACATCAGTGAGAACATAGAAACATTCACnTCCCCACACC\n+ \n+\n+121 GAAAGCGTCTGTAGACCGGCCCACGGGCCGAAGTCTTTGCATTTCCTCTTGCCATGCACG\n+ \n+\n+181 AGCATTCCCAGTGGCAATCAGGGGCCAGCCCTTCTGTTTGGCCTCTGCAAGCTTGTATCC\n+ <<<<<<<<<<<<<<<<<<<<\n+\n+241 TTG\n+ \n+\n+> Contig21_chr1_60697952_60699446 307 G A 0.507396\n+\n+ 1 TCTGGGGCCATGTTTCTGAAGTAAGGCTGTTTCTGCAGCCTTGCGGGCTGTGTCTTGCTC\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 nCACCCCTTAATTCTTACCTGTAGGTGGTATTTGGTAGAGTGGAGTAAAACTGGAAACTG\n+ <<\n+\n+121 GTTCTCTGTGTTCCTGCATCT\n+ <<<<<<<<<<<<<<<<<< \n+\n+> Contig64_chr1_87343284_87345672 163 T A 0.038702\n+ VspI\n+ 1 ATGGCCAATTCTGGTTTAcGCATCATTGTTAACAACTCTTCCATTCATTCTCAGAATTTT\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 CCCAATTCACATGATAAATTGTATGGTCACCTACcTACAACTAAACACTTAGTTTATTTC\n+ \n+\n+121 TATTATTATTATTATTATTATTATTATTATTAnTAtTATTATTGAAATACATTTTTTTTT\n+ \n+\n+181 CATAAACCGTTCACcCTTGTGAGAAC\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig20_chr1_110679280_110679687 181 C T 0.659726\n+\n+ 1 GAGCACTCAATGAGGGGTTCGACCCTTTGCAGACACAGCATGTAGGAGGAAGAAATGCAA\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 cGGGGCACCCCTGCGGGGGCAGGCTTCCAGTTCAAACTGATCnGGTCTGGTCCTGGGGCC\n+ \n+\n+121 GGGCCAAAGTTGTGGTTTCcCGCACTCAAGTCTCCAC\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig222_chr2_9817738_9818143 220 C T 0.092668\n+ SpeI\n+ 1 AGATTTAGCTGGAGCATGCCTTTGCCCTTTTTAGCCTTTCCCTTTTACCTTTATCCTTCT\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 TATTCTTGAAATGTTGAAATAGATGGAAGTATAGCAGCTATCTTGTCCCATAATGATGAA\n+ \n+\n+121 AACCAGGTACAAAGTTGGTGAAAACTAAAAGAGAGGAGGAGCCTGGGTTCTTGGTGGCAT\n+ \n+\n+181 
CATGAACACCTGCACnAGTCTAGCATGGTCTGTGCAAAATCTCCTGATCCAAGAAAAATA\n+ \n+\n+241 TAAACATCCTTCTGTAGGGTTTTATTgCCTGAAGCAAAA\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig47_chr2_25470778_25471576 126 G A 0.289103\n+ Bsp1286I\n+ 1 GCCAGGCGTCCCTCTTTTTGAGTTCtAATTGTGTACATCCAATCCCCATCTCAACAAATA\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 GCTGAACCAGCTTCCTaTTTATTTGGTAGGTnAGCACTCTAGAAATTTGCTACACTGAAC\n+ \n+\n+121 TCACCAAATTTATAATGTaAATTATGACCATTCTTTGCCATAATAATTTGGGGTAGGTCA\n+ \n+\n+181 GATTTGGTTTTGGGGGCAGAAGAAATCATCATATCACAAGCATGTGACAGCTTCCAGCCC\n+ <<<\n+\n+241 CATCTCAACTCCAAGAAATT\n+ <<<<<<<<<<<<<<<<< \n+\n+> Contig6_chr2_56859179_56859956 671 T C 5.308026\n+ MspA1I\n+ 1 TATCCCAAAGACGTGTGTCTCAAAGCCCTGAGGTTTACAGCCAAACATGATGGACTGCCC\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ATGACAAcGGATACAAATGCTAGCgTGGGTTTAATTATGCTAGAATTTTTATGATAATTA\n+ \n+\n+121 TAATGATA'..b' <<<<<<<<<<<<<<<<<<<\n+\n+301 GGaAGT\n+ <<< \n+\n+> Contig5_chr36_4562983_4563634 343 C T 1.168507\n+\n+ 1 ATATGAATGGTGGTGATGGATTCAGCATCTTGACTCTTTTTCAACTATGTCAAGATTTGC\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ACTGGATCTTGTCTAAAGTCACTCTTCTAGGGGAAGTCAAAGAGACTGGGTCaGTCCtCA\n+ \n+\n+121 AGATAcGATGTAAGCAGGTAAGATAGCACTATAGTAGGTCTTCTTGTCATGGTGAGTCAA\n+ \n+\n+181 TAACCATTCAATATTCTTTCnACCTACTCTTTACCTGCTCAATCAAGGTAGGGGTC\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig133_chr36_32954045_32955409 136 A G 3.772017\n+ TaqI\n+ 1 ATTAAATGAAAACAGTGTCAGGCAATAAGATGTATTAAGTACAGTATGCCTGAGGATATA\n+ >>>>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ATATTAAACACAGATTCTGCTGTTACTATCnAAGTGGATATTAAAATAACAGTGCTACTT\n+ \n+\n+121 TGAGGGTAATGCTACTTTGGAGAATATTTTCTAATAAGCTCACCaTAAAATGACggATAA\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig53_chr37_6665763_6665919 116 C T 10.874746\n+ BstOI\n+ 1 AGTCCTCATGTTGTACTTTACCTCACCTGAATTTACTCATCtGATAGTTGGAAATTTGTA\n+ >>>>>>>>>>>>>>>>>>>>>>>> \n+\n+ 61 TCCATTGCCCATCtTCACCACCCCATGTCnCTGGAAACCAACAAtCTGTTCTCTGTATGa\n+ <<<<<<<<<<<<<<<<<<<<<<<<<\n+\n+121 CTT\n+ \n+\n+> Contig2_chr37_31197993_31198256 182 C T 0.594606\n+\n+ 1 CTCTCACCACATGGAGAATCCTGTATGTTCAGCTGTATGACGTGGGGGGAACGTCAGAGC\n+ 
>>>>>>>>>>>>>>>>>>>> \n+\n+ 61 TCAGTTTCATAGCAGTCAGCTCCATGTTATGGGTTCAAgAnGAAAACAGGTGGCAGGCtT\n+ \n+\n+121 GCCACAGCCTCCCTCAGGGGTGgCCTTGACAGATAAAcGT\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig7_chr38_12217200_12218387 1163 A T\n+\n+> Contig265_chrX_2689247_2689484 114 C G 9.232233\n+\n+ 1 CTTAGAGAATTCCCTGATTCACTGAGTTAAATTATTACCAAATCTGATAATAATAAAAGA\n+ >>>>>>>>>>>>>>>>>>>>>>> \n+\n+ 61 AGTAATTACAGATCAATAATTAATCTATATGTCTGAATACATTTTAATAAGTCCnAcTCA\n+ \n+\n+121 ACAATATGCTGACAAAACAATACATCTTGTCT\n+ <<<<<<<<<<<<<<<<<<<<<<< \n+\n+> Contig113_chrX_26287829_26288398 385 C T 0.077485\n+\n+ 1 AAAGCCGTAACAGTCGCTAGGAGAATCATAATTTTAAGCTTTGTGTGTCCCGGGcTTGAG\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 TCCCTCAGGAGTAGTTAGATGCGGCCTTAAATTCTCcCAGTAAATTCACnTTGACGGCCT\n+ \n+\n+121 ATTTTTGACCTGGGGGCACACGCTGCTATACACTCTAGCCACCTCTGATCCTCTGGCCTC\n+ \n+\n+181 CTCTGTTACAATGACAGAAACGACAGAAGCATTTCTTTAAAATAAGTCCCAGTACGTGCA\n+ \n+\n+241 CACAAACGTTCAGGGCAGCCTTCTCCATAAACGGCACGAAATGGC\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig90_chrX_57430715_57431566 548 C T 0.153995\n+ EcoRV\n+ 1 CTCATTCCCAGCTACCTCCACCTCTATACCAACCCCTAGTTCCTGTACATCCCTGCTTCT\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ATAGGAAATCTTCCTGGTGTTGATATnATTCCCAAGGTCAGGCTGTCCTCCTAGCTCCCT\n+ \n+\n+121 CTCCTCATCTGCATCAAGTCCTCCAAACTGGGCAGTAGAC\n+ <<<<<<<<<<<<<<<<<<<< \n+\n+> Contig133_chrX_84833782_84834125 182 G A 0.277794\n+\n+ 1 CACCAGAGTGCAATCGAGAACCATCTGATCACAGAACCATAGAAAAGATTGCTGTACAAG\n+ >>>>>>>>>>>>>>>>>>>> \n+\n+ 61 ACTTAGGAACTCATTCTGTTCAGGATGGAGAAGCTGATGCCCAAAAAGGGAAAGGAACTT\n+ \n+\n+121 AACCAAAGTCCATACAnTATCAACTCTACACATAAAGGAAGGGAGTGGAGGGAGCAGTAA\n+ \n+\n+181 GACCAGAGATATAGACCCCAGTGAGGAGGCTGTGAGCTCCTG\n+ <<<<<<<<<<<<<<<<<<<< \n+\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/find_intervals/find_intervals.interval
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/find_intervals/find_intervals.interval Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,1 @@
+chr2 9817960 67331624 1272.2000

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/map_ensembl_transcripts/map_ensembl_transcripts.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/map_ensembl_transcripts/map_ensembl_transcripts.tabular Wed Sep 12 17:10:26 2012 -0400

b'@@ -0,0 +1,150 @@\n+ENSCAFT00000000001\t476153\tcfa00230=Purine metabolism.cfa00500=Starch and sucrose metabolism.cfa00740=Riboflavin metabolism.cfa00760=Nicotinate and nicotinamide metabolism.cfa00770=Pantothenate and CoA biosynthesis.cfa01100=Metabolic pathways\n+ENSCAFT00000000144\t483960\tN\n+ENSCAFT00000000160\t610160\tN\n+ENSCAFT00000000215\tU\tN\n+ENSCAFT00000000233\t483973\tN\n+ENSCAFT00000000365\t474414\tcfa00450=Selenocompound metabolism.cfa00970=Aminoacyl-tRNA biosynthesis\n+ENSCAFT00000000507\t484023\tN\n+ENSCAFT00000000517\t476233\tN\n+ENSCAFT00000000674\t611986\tN\n+ENSCAFT00000000724\t609478\tN\n+ENSCAFT00000000760\tU\tN\n+ENSCAFT00000000762\tU\tN\n+ENSCAFT00000001047\t475067\tcfa00240=Pyrimidine metabolism.cfa00410=beta-Alanine metabolism.cfa00770=Pantothenate and CoA biosynthesis.cfa00983=Drug metabolism - other enzymes.cfa01100=Metabolic pathways\n+ENSCAFT00000001052\tU\tN\n+ENSCAFT00000001063\t481999\tN\n+ENSCAFT00000001076\tU\tN\n+ENSCAFT00000001104\t607591\tN\n+ENSCAFT00000001141\t484064\tN\n+ENSCAFT00000001146\t475076\tN\n+ENSCAFT00000001204\t481203\tN\n+ENSCAFT00000001219\t474465\tN\n+ENSCAFT00000001250\t481729.481731\tcfa04145=Phagosome.cfa04514=Cell adhesion molecules (CAMs).cfa04612=Antigen processing and presentation.cfa04672=Intestinal immune network for IgA production.cfa04940=Type I diabetes mellitus.cfa05140=Leishmaniasis.cfa05145=Toxoplasmosis.cfa05150=Staphylococcus aureus infection.cfa05152=Tuberculosis.cfa05164=Influenza A.cfa05166=HTLV-I infection.cfa05168=Herpes simplex infection.cfa05310=Asthma.cfa05320=Autoimmune thyroid disease.cfa05322=Systemic lupus erythematosus.cfa05323=Rheumatoid arthritis.cfa05330=Allograft rejection.cfa05332=Graft-versus-host disease.cfa05416=Viral myocarditis\n+ENSCAFT00000001352\t482026\tcfa00565=Ether lipid metabolism\n+ENSCAFT00000001363\t475084\tcfa03022=Basal transcription factors\n+ENSCAFT00000001421\t484096\tN\n+ENSCAFT00000001523\t475088\tN\n+ENSCAFT00000001575\t481744\tcfa04141=Protein 
processing in endoplasmic reticulum\n+ENSCAFT00000001587\t482035\tN\n+ENSCAFT00000001597\t609411\tN\n+ENSCAFT00000002056\t610014\tN\n+ENSCAFT00000002100\tU\tN\n+ENSCAFT00000002110\t481249\tN\n+ENSCAFT00000002175\t476310\tN\n+ENSCAFT00000002259\t484151\tN\n+ENSCAFT00000002460\t481785\tN\n+ENSCAFT00000002537\tU\tN\n+ENSCAFT00000002577\t484157\tN\n+ENSCAFT00000002578\t608906\tN\n+ENSCAFT00000002660\tU\tN\n+ENSCAFT00000002792\t474523\tN\n+ENSCAFT00000002849\t475216\tN\n+ENSCAFT00000002999\tU\tN\n+ENSCAFT00000003163\t474921\tcfa03040=Spliceosome\n+ENSCAFT00000003223\t474925\tN\n+ENSCAFT00000003307\t609995\tN\n+ENSCAFT00000003515\t482316\tN\n+ENSCAFT00000003560\tU\tN\n+ENSCAFT00000003644\t484216\tcfa00970=Aminoacyl-tRNA biosynthesis\n+ENSCAFT00000003824\t475249\tN\n+ENSCAFT00000003840\t482333\tN\n+ENSCAFT00000004092\t474960\tN\n+ENSCAFT00000004103\t484298\tN\n+ENSCAFT00000004208\t481637\tN\n+ENSCAFT00000004253\t100534006.100534007.474588\tN\n+ENSCAFT00000004311\t482346\tN\n+ENSCAFT00000004464\t481892\tN\n+ENSCAFT00000004511\t481893\tN\n+ENSCAFT00000004609\t611755\tN\n+ENSCAFT00000004673\t611817\tN\n+ENSCAFT00000004726\t610047\tcfa00230=Purine metabolism.cfa00240=Pyrimidine metabolism.cfa01100=Metabolic pathways.cfa03030=DNA replication.cfa03410=Base excision repair.cfa03420=Nucleotide excision repair.cfa03430=Mismatch repair.cfa03440=Homologous recombination.cfa05166=HTLV-I infection\n+ENSCAFT00000004799\tU\tN\n+ENSCAFT00000004933\t482382\tcfa04621=NOD-like receptor signaling pathway.cfa05133=Pertussis\n+ENSCAFT00000004993\t474995\tcfa03008=Ribosome biogenesis in eukaryotes\n+ENSCAFT00000005126\tU\tN\n+ENSCAFT00000005142\t606804\tN\n+ENSCAFT00000005225\t475647\tN\n+ENSCAFT00000005323\tU\tN\n+ENSCAFT00000005467\tU\tN\n+ENSCAFT00000005496\t481925\tN\n+ENSCAFT00000005518\t492302\tcfa02010=ABC transporters.cfa04971=Gastric acid secretion.cfa04972=Pancreatic secretion.cfa04976=Bile secretion\n+ENSCAFT00000005653\t403417\tcfa04145=Phagosome.cfa04620=Toll-like receptor 
signaling pathway.cfa05132=Salmonella infection.cfa05133=Pertussis.cfa05134=Legionellosis.cfa05140=Leishmaniasis.cfa05142=Chagas disease (American trypanosomiasis).cfa05144=Malaria.cfa05145=Toxoplasmosis.cfa05146=Amoebiasis.cfa0'..b'72=Intestinal immune network for IgA production.cfa05166=HTLV-I infection.cfa05168=Herpes simplex infection.cfa05323=Rheumatoid arthritis\n+ENSCAFT00000005972\t475012\tN\n+ENSCAFT00000006025\t482980\tN\n+ENSCAFT00000006114\t483829\tN\n+ENSCAFT00000006157\t475021\tN\n+ENSCAFT00000006219\t483261\tcfa04972=Pancreatic secretion.cfa04978=Mineral absorption\n+ENSCAFT00000006272\t484394\tcfa00280=Valine, leucine and isoleucine degradation.cfa00290=Valine, leucine and isoleucine biosynthesis.cfa00770=Pantothenate and CoA biosynthesis.cfa01100=Metabolic pathways\n+ENSCAFT00000006453\t475893\tN\n+ENSCAFT00000006479\tU\tN\n+ENSCAFT00000006507\t484622\tcfa03030=DNA replication.cfa04110=Cell cycle\n+ENSCAFT00000006669\t476094\tN\n+ENSCAFT00000006689\t475897\tN\n+ENSCAFT00000006827\tU\tN\n+ENSCAFT00000006891\t610021\tN\n+ENSCAFT00000007130\t485445\tcfa04020=Calcium signaling pathway.cfa04080=Neuroactive ligand-receptor interaction\n+ENSCAFT00000007145\t607961\tN\n+ENSCAFT00000007244\t476781\tN\n+ENSCAFT00000007375\t403767\tcfa04977=Vitamin digestion and absorption\n+ENSCAFT00000007440\t482516\tN\n+ENSCAFT00000007467\t485576\tN\n+ENSCAFT00000007484\t609336\tN\n+ENSCAFT00000007527\t607108\tN\n+ENSCAFT00000007553\t487123\tcfa03450=Non-homologous end-joining.cfa05340=Primary immunodeficiency\n+ENSCAFT00000007697\t475382\tN\n+ENSCAFT00000007703\t477019\tcfa03430=Mismatch repair.cfa03460=Fanconi anemia pathway.cfa05200=Pathways in cancer.cfa05210=Colorectal cancer.cfa05213=Endometrial cancer\n+ENSCAFT00000007747\tU\tN\n+ENSCAFT00000007774\t477021\tcfa04510=Focal adhesion.cfa04512=ECM-receptor interaction.cfa04514=Cell adhesion molecules (CAMs).cfa04810=Regulation of actin cytoskeleton.cfa05410=Hypertrophic cardiomyopathy (HCM).cfa05412=Arrhythmogenic right 
ventricular cardiomyopathy (ARVC).cfa05414=Dilated cardiomyopathy\n+ENSCAFT00000007776\tU\tN\n+ENSCAFT00000007779\t478007.478008\tcfa03060=Protein export.cfa04141=Protein processing in endoplasmic reticulum.cfa04145=Phagosome\n+ENSCAFT00000007859\t483010\tN\n+ENSCAFT00000007951\tU\tN\n+ENSCAFT00000007959\t482810.611087\tN\n+ENSCAFT00000008012\t485173\tN\n+ENSCAFT00000008063\t484489\tN\n+ENSCAFT00000008142\t476128\tN\n+ENSCAFT00000008198\t612489\tN\n+ENSCAFT00000008413\tU\tN\n+ENSCAFT00000008540\t483021\tN\n+ENSCAFT00000008586\t484499\tN\n+ENSCAFT00000008588\tU\tN\n+ENSCAFT00000008673\t478018\tN\n+ENSCAFT00000008678\t485188\tN\n+ENSCAFT00000008728\tU\tN\n+ENSCAFT00000008769\t485523\tcfa02010=ABC transporters.cfa04976=Bile secretion\n+ENSCAFT00000008831\t475398\tN\n+ENSCAFT00000009074\t485769\tcfa04330=Notch signaling pathway\n+ENSCAFT00000009114\t483354\tN\n+ENSCAFT00000009614\t475416\tN\n+ENSCAFT00000009698\t486001\tN\n+ENSCAFT00000009710\t486002\tN\n+ENSCAFT00000010094\t486223\tcfa00230=Purine metabolism.cfa00240=Pyrimidine metabolism.cfa01100=Metabolic pathways.cfa03030=DNA replication.cfa03410=Base excision repair.cfa03420=Nucleotide excision repair.cfa05166=HTLV-I infection\n+ENSCAFT00000010141\t482857\tcfa04360=Axon guidance\n+ENSCAFT00000010439\t610992\tN\n+ENSCAFT00000010496\t415126\tcfa04380=Osteoclast differentiation.cfa04916=Melanogenesis.cfa05200=Pathways in cancer.cfa05218=Melanoma\n+ENSCAFT00000010516\tU\tN\n+ENSCAFT00000010531\t484693\tN\n+ENSCAFT00000010559\t483405\tN\n+ENSCAFT00000010593\tU\tN\n+ENSCAFT00000010616\t474176\tcfa03450=Non-homologous end-joining.cfa04110=Cell cycle\n+ENSCAFT00000010630\t486770\tN\n+ENSCAFT00000010829\t486944\tN\n+ENSCAFT00000010865\tU\tN\n+ENSCAFT00000010931\t485368\tN\n+ENSCAFT00000010977\tU\tN\n+ENSCAFT00000010988\t482891\tcfa04145=Phagosome\n+ENSCAFT00000011187\t475441\tN\n+ENSCAFT00000011380\tU\tN\n+ENSCAFT00000011397\t475750\tcfa04110=Cell cycle.cfa04114=Oocyte meiosis.cfa04120=Ubiquitin mediated 
proteolysis.cfa04914=Progesterone-mediated oocyte maturation.cfa05166=HTLV-I infection\n+ENSCAFT00000011721\t475621\tN\n+ENSCAFT00000011730\t486534\tN\n+ENSCAFT00000011771\t477193\tN\n+ENSCAFT00000011789\t609978\tN\n+ENSCAFT00000011968\t488881\tcfa00760=Nicotinate and nicotinamide metabolism.cfa04146=Peroxisome\n+ENSCAFT00000012081\t478082\tcfa04621=NOD-like receptor signaling pathway\n+ENSCAFT00000012133\t611998\tN\n+ENSCAFT00000012159\t484609\tN\n+ENSCAFT00000012254\tU\tN\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/modify_snp_table/modify.gd_snp
--- a/test-data/test_out/modify_snp_table/modify.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,338 +0,0 @@\n-Contig161_chr1_4641264_4641879\t115\tC\tT\t73.5\tchr1\t4641382\tC\t6\t0\t2\t45\t8\t0\t2\t51\t15\t0\t2\t72\t5\t0\t2\t42\t6\t0\t2\t45\t10\t0\t2\t57\tY\t54\t0.323\t0\n-Contig20_chr1_21313469_21313570\t66\tC\tT\t54.0\tchr1\t21313534\tC\t4\t0\t2\t39\t4\t0\t2\t39\t5\t0\t2\t42\t4\t0\t2\t39\t4\t0\t2\t39\t5\t0\t2\t42\tN\t1\t+99.\t0\n-Contig86_chr1_30984450_30985684\t670\tC\tT\t365.0\tchr1\t30985133\tC\t9\t0\t2\t54\t10\t0\t2\t57\t13\t0\t2\t66\t3\t0\t2\t36\t9\t0\t2\t54\t7\t0\t2\t48\tY\t145\t0.031\t0\n-Contig5_chr1_32562160_32563940\t1215\tG\tT\t163.0\tchr1\t32563356\tG\t17\t0\t2\t78\t19\t0\t2\t84\t20\t0\t2\t87\t14\t0\t2\t69\t12\t0\t2\t63\t10\t0\t2\t57\tY\t17\t0.251\t0\n-Contig110_chr1_33385093_33386888\t510\tC\tT\t270.0\tchr1\t33385587\tA\t14\t0\t2\t69\t11\t0\t2\t60\t19\t0\t2\t84\t11\t0\t2\t60\t10\t0\t2\t57\t13\t0\t2\t66\tY\t13\t0.126\t0\n-Contig100_chr1_33562920_33564288\t743\tC\tT\t178.0\tchr1\t33563655\tC\t6\t0\t2\t45\t10\t0\t2\t57\t8\t0\t2\t51\t5\t0\t2\t42\t13\t0\t2\t66\t7\t0\t2\t48\tY\t13\t0.090\t3\n-Contig7_chr1_37302355_37302489\t97\tA\tG\t59.2\tchr1\t37302452\tG\t3\t0\t2\t36\t8\t0\t2\t51\t5\t0\t2\t42\t8\t0\t2\t51\t7\t0\t2\t48\t6\t0\t2\t45\tN\t56\t2.812\t0\n-Contig62_chr1_41880715_41882180\t1078\tT\tG\t57.6\tchr1\t41881785\tT\t14\t0\t2\t69\t15\t0\t2\t72\t16\t0\t2\t75\t13\t0\t2\t66\t8\t0\t2\t51\t10\t0\t2\t57\tY\t21\t0.477\t0\n-Contig47_chr1_48409178_48409384\t37\tC\tT\t134.0\tchr1\t48409215\tT\t5\t0\t2\t42\t6\t0\t2\t45\t8\t0\t2\t51\t9\t0\t2\t54\t4\t0\t2\t39\t6\t0\t2\t45\tN\t66\t+99.\t0\n-Contig119_chr1_49647683_49650077\t1618\tC\tA\t99.7\tchr1\t49649276\tA\t8\t0\t2\t51\t11\t0\t2\t60\t10\t0\t2\t57\t9\t0\t2\t54\t10\t0\t2\t57\t14\t0\t2\t69\tY\t16\t0.166\t0\n-Contig21_chr1_60697952_60699446\t307\tG\tA\t51.9\tchr1\t60698265\tG\t12\t0\t2\t63\t9\t0\t2\t54\t4\t0\t2\t39\t6\t0\t2\t45\t9\t0\t2\t54\t4\t0\t2\t39\tY\t98\t0.507\t0\n-Contig131_chr1_62319542_62320564\t169\tC\tG\t103.0\tchr1\t62319709\tC\t12\t0\t2\t63\t12\t0\t2\t66\t14\t0\t2\t69\t12\t0\t2\t63\t9\t0\t
2\t54\t9\t0\t2\t54\tY\t73\t0.307\t1\n-Contig14_chr1_63450425_63450680\t101\tT\tA\t102.0\tchr1\t63450530\tT\t8\t0\t2\t51\t10\t0\t2\t57\t18\t0\t2\t81\t8\t0\t2\t51\t8\t0\t2\t34\t8\t0\t2\t51\tN\t99\t1.085\t0\n-Contig83_chr1_63869778_63869942\t40\tT\tC\t23.7\tchr1\t63869819\tC\t5\t0\t2\t42\t7\t0\t2\t48\t2\t0\t2\t33\t4\t0\t2\t39\t6\t0\t2\t48\t4\t0\t2\t39\tN\t654\t1.364\t0\n-Contig30_chr1_64702572_64703138\t178\tA\tT\t117.0\tchr1\t64702750\tT\t10\t0\t2\t57\t10\t0\t2\t57\t20\t0\t2\t87\t21\t0\t2\t90\t6\t0\t2\t45\t12\t0\t2\t63\tY\t50\t3.872\t0\n-Contig101_chr1_69868406_69868872\t287\tG\tA\t14.6\tchr1\t69868689\tG\t13\t0\t2\t66\t17\t0\t2\t78\t10\t0\t2\t57\t8\t0\t2\t51\t7\t0\t2\t48\t8\t0\t2\t51\tN\t137\t0.305\t0\n-Contig35_chr1_74482577_74482791\t170\tG\tA\t45.4\tchr1\t74482751\tA\t3\t0\t2\t36\t4\t0\t2\t39\t13\t0\t2\t66\t2\t0\t2\t33\t5\t0\t2\t42\t2\t0\t2\t33\tN\t20\t+99.\t3\n-Contig49_chr1_83865731_83865944\t85\tG\tA\t34.1\tchr1\t-1\tN\t4\t0\t2\t39\t4\t0\t2\t39\t8\t0\t2\t51\t2\t0\t2\t33\t5\t0\t2\t42\t4\t0\t2\t39\tN\t-1\t1.485\t0\n-Contig129_chr1_117547123_117548666\t926\tG\tA\t126.0\tchr1\t117548059\tG\t19\t0\t2\t84\t9\t0\t2\t54\t11\t0\t2\t60\t10\t0\t2\t57\t12\t0\t2\t63\t11\t0\t2\t60\tY\t64\t0.049\t0\n-Contig7_chr1_125154638_125154844\t190\tG\tT\t130.0\tchr1\t125154818\tA\t5\t0\t2\t42\t4\t0\t2\t39\t7\t0\t2\t48\t2\t0\t2\t33\t7\t0\t2\t48\t4\t0\t2\t39\tN\t33\t+99.\t0\n-Contig222_chr2_9817738_9818143\t220\tC\tT\t888.0\tchr2\t9817960\tC\t17\t0\t2\t78\t12\t0\t2\t63\t20\t0\t2\t87\t8\t0\t2\t51\t11\t0\t2\t60\t12\t0\t2\t63\tY\t76\t0.093\t1\n-Contig47_chr2_25470778_25471576\t126\tG\tA\t888.0\tchr2\t25470896\tG\t12\t0\t2\t63\t14\t0\t2\t69\t14\t0\t2\t69\t10\t0\t2\t57\t18\t0\t2\t81\t13\t0\t2\t66\tN\t11\t0.289\t1\n-Contig10_chr2_40859744_40860534\t637\tG\tA\t888.0\tchr2\t40860397\tA\t3\t0\t2\t36\t3\t0\t2\t36\t2\t0\t2\t33\t7\t0\t2\t48\t6\t0\t2\t45\t8\t0\t2\t51\tY\t42\t1.435\t0\n-Contig52_chr2_41421981_41422725\t604\tC\tA\t888.0\tchr2\t41422583\tA\t17\t0\t2\t78\t18\t0\t2\t81\t14\t0\t2\t69\t17\t
0\t2\t78\t12\t0\t2\t63\t14\t0\t2\t69\tY\t44\t0.882\t0\n-Contig94_chr2_43869105_43870358\t220\tG\tA\t888.0\tchr2\t43869333\tG\t12\t0\t2\t63\t18\t0\t2\t81\t11\t0\t2\t60\t15\t0\t2\t72\t12\t0\t2\t63\t13\t0\t2\t66\tY\t1\t0.156\t0\n-Contig34_chr2_48444129_48444939\t695\tC\tT\t134.0\tchr2\t48444828\tC\t14\t0\t2\t69\t8\t0\t2\t51\t16\t0\t2\t75\t17\t0\t2\t78\t9\t0\t2\t54\t15\t0\t2\t72\tY\t161\t0.375\t0\n-Contig6_chr2_56859179_56859956\t671\tT\tC\t999.9\tchr2\t56859851\tT\t15\t0\t2\t72\t18\t0\t2\t81\t20\t0\t2\t90\t19\t0\t2\t84\t19\t0\t2\t84\t24\t0\t2\t99\tN\t28\t5.308\t1\n-Contig115_chr2_61631913_61632510\t310\tG\tT\t999.3\tchr2\t61632216\tG\t7\t0\t2\t48\t9\t0\t2\t54\t7\t0\t2\t48\t11\t0\t2\t60\t10\t0\t2\t57\t10\t0\t2\t57\tN\t13\t0.184\t0\n-Contig31_chr2_67331584_67331785\t39\tC\tT\t999.0\tchr2\t67331623\tC\t11\t0\t2\t60\t10\t0\t2\t57\t7\t0\t2\t48\t9\t0\t2\t54\t2\t0\t2\t33\t4\t0\t2\t39\tN\t110\t0.647\t1\n-Contig92_chr2_75906683_75907774\t773\tT\tC\t85.4\tchr2\t75907438\tC\t12\t0\t2\t63\t12\t0\t2\t63\t17\t0\t2\t78\t8\t0\t2\t51\t8\t0\t2\t51\t13\t0\t2\t66\tY\t93\t0.166\t0\n-Contig'..b'73_3667898\t348\tG\tT\t124.0\tchr35\t3667121\tG\t9\t0\t2\t54\t20\t0\t2\t87\t18\t0\t2\t81\t15\t0\t2\t72\t12\t0\t2\t63\t14\t0\t2\t69\tY\t285\t0.235\t0\n-Contig195_chr35_15722500_15722741\t205\tG\tA\t4.08\tchr35\t15722718\tG\t3\t0\t2\t36\t5\t0\t2\t42\t1\t0\t2\t30\t6\t0\t2\t45\t1\t0\t2\t30\t1\t0\t2\t30\tN\t43\t+99.\t0\n-Contig101_chr35_19513178_19513697\t62\tC\tT\t112.0\tchr35\t19513238\tC\t12\t0\t2\t63\t7\t0\t2\t48\t13\t0\t2\t66\t7\t0\t2\t48\t5\t0\t2\t42\t8\t0\t2\t51\tN\t115\t3.135\t0\n-Contig47_chr35_24382042_24382526\t33\tG\tA\t87.0\tchr35\t24382076\tG\t5\t0\t2\t42\t4\t0\t2\t39\t6\t0\t2\t45\t7\t0\t2\t48\t4\t0\t2\t39\t2\t0\t2\t33\tY\t71\t+99.\t0\n-Contig77_chr35_24796947_24797172\t65\tA\tG\t52.1\tchr35\t24797009\tA\t7\t0\t2\t48\t5\t0\t2\t42\t8\t0\t2\t51\t6\t0\t2\t45\t12\t0\t2\t63\t10\t0\t2\t57\tN\t11\t1.401\t3\n-Contig74_chr35_25394343_25394813\t303\tA\tT\t221.0\tchr35\t25394646\tG\t23\t0\t2\t96\t15\t0
\t2\t72\t25\t0\t2\t105\t7\t7\t1\t49\t18\t0\t2\t81\t16\t0\t2\t75\tY\t58\t4.298\t0\n-Contig5_chr36_4562983_4563634\t343\tC\tT\t151.0\tchr36\t4563324\tT\t20\t0\t2\t87\t20\t0\t2\t87\t23\t0\t2\t96\t24\t0\t2\t99\t9\t0\t2\t54\t8\t0\t2\t51\tY\t40\t1.169\t0\n-Contig75_chr36_7885319_7885588\t53\tG\tA\t25.7\tchr36\t7885372\tG\t10\t0\t2\t57\t8\t0\t2\t51\t13\t0\t2\t66\t7\t0\t2\t48\t4\t0\t2\t39\t7\t0\t2\t48\tN\t7\t2.653\t0\n-Contig184_chr36_18956191_18958552\t187\tA\tG\t11.5\tchr36\t18956371\tG\t10\t0\t2\t57\t11\t0\t2\t60\t21\t0\t2\t90\t14\t0\t2\t69\t7\t0\t2\t48\t4\t0\t2\t39\tN\t278\t1.434\t2\n-Contig12_chr36_21557176_21557828\t513\tT\tA\t159.0\tchr36\t21557695\tA\t11\t0\t2\t60\t14\t0\t2\t69\t21\t0\t2\t90\t12\t0\t2\t63\t15\t0\t2\t72\t11\t0\t2\t60\tY\t55\t0.222\t0\n-Contig2_chr36_22436067_22436794\t653\tC\tT\t73.0\tchr36\t22436730\tC\t11\t0\t2\t60\t16\t0\t2\t75\t13\t0\t2\t66\t11\t0\t2\t60\t21\t0\t2\t90\t21\t0\t2\t90\tY\t9\t0.534\t0\n-Contig133_chr36_32954045_32955409\t136\tA\tG\t116.0\tchr36\t32954182\tA\t16\t0\t2\t75\t15\t0\t2\t72\t20\t0\t2\t87\t11\t0\t2\t60\t18\t0\t2\t81\t13\t0\t2\t66\tY\t74\t3.772\t1\n-Contig53_chr37_6665763_6665919\t116\tC\tT\t111.0\tchr37\t6665875\tC\t9\t0\t2\t54\t9\t0\t2\t54\t5\t0\t2\t42\t9\t0\t2\t54\t8\t0\t2\t51\t10\t0\t2\t57\tN\t15\t10.875\t1\n-Contig42_chr37_9589176_9591269\t252\tG\tA\t25.1\tchr37\t9589430\tG\t10\t0\t2\t40\t13\t0\t2\t66\t18\t0\t2\t81\t21\t0\t2\t90\t9\t0\t2\t54\t17\t0\t2\t78\tN\t67\t1.170\t2\n-Contig2_chr37_17134963_17136513\t1140\tA\tC\t158.0\tchr37\t17136092\tA\t14\t0\t2\t69\t24\t0\t2\t99\t17\t0\t2\t78\t16\t0\t2\t75\t15\t0\t2\t75\t13\t0\t2\t66\tY\t12\t0.053\t1\n-Contig18_chr37_17147806_17149851\t291\tT\tG\t112.0\tchr37\t17148084\tT\t4\t6\t1\t45\t16\t0\t2\t75\t17\t0\t2\t78\t14\t0\t2\t69\t22\t0\t2\t93\t13\t0\t2\t66\tY\t41\t4.442\t0\n-Contig64_chr37_17606895_17607534\t565\tC\tT\t30.2\tchr37\t17607439\tA\t9\t0\t2\t54\t16\t0\t2\t75\t20\t0\t2\t87\t14\t0\t2\t69\t16\t0\t2\t75\t10\t0\t2\t57\tN\t20\t1.622\t0\n-Contig126_chr37_21587881_21590621\t37
3\tG\tT\t132.0\tchr37\t21588256\tG\t11\t0\t2\t60\t11\t0\t2\t60\t23\t0\t2\t96\t12\t0\t2\t63\t8\t0\t2\t51\t18\t0\t2\t81\tY\t12\t0.549\t0\n-Contig2_chr37_31197993_31198256\t182\tC\tT\t39.6\tchr37\t31198171\tT\t6\t0\t2\t45\t10\t0\t2\t57\t7\t0\t2\t48\t9\t0\t2\t54\t10\t0\t2\t57\t12\t0\t2\t63\tN\t2\t0.595\t0\n-Contig46_chr37_31852376_31853555\t825\tA\tG\t111.0\tchr37\t31853191\tG\t19\t0\t2\t84\t14\t0\t2\t69\t15\t0\t2\t72\t7\t0\t2\t48\t8\t0\t2\t51\t16\t0\t2\t75\tY\t17\t0.128\t1\n-Contig7_chr38_12217200_12218387\t1163\tA\tT\t44.4\tchr38\t12218353\tA\t11\t0\t2\t60\t13\t0\t2\t66\t17\t0\t2\t78\t10\t0\t2\t57\t11\t0\t2\t60\t11\t0\t2\t60\tY\t67\t+99.\t0\n-Contig15_chr38_12282020_12282253\t150\tC\tT\t156.0\tchr38\t12282164\tA\t17\t0\t2\t78\t11\t0\t2\t60\t19\t0\t2\t84\t14\t0\t2\t69\t5\t0\t2\t42\t14\t0\t2\t69\tY\t26\t2.952\t1\n-Contig6_chr38_16185744_16186110\t325\tA\tG\t74.9\tchr38\t16186061\tA\t5\t0\t2\t42\t3\t0\t2\t36\t9\t0\t2\t54\t7\t0\t2\t48\t1\t0\t2\t30\t12\t0\t2\t63\tY\t40\t+99.\t0\n-Contig265_chrX_2689247_2689484\t114\tC\tG\t103.0\tchrX\t2689356\tC\t11\t0\t2\t60\t9\t0\t2\t54\t13\t0\t2\t66\t16\t0\t2\t75\t14\t0\t2\t69\t10\t0\t2\t57\tN\t2\t9.232\t1\n-Contig122_chrX_6026976_6027327\t330\tC\tT\t79.4\tchrX\t6027303\tC\t3\t0\t2\t36\t3\t0\t2\t36\t3\t0\t2\t36\t4\t0\t2\t39\t3\t0\t2\t36\t6\t0\t2\t45\tY\t30\t+99.\t0\n-Contig113_chrX_26287829_26288398\t385\tC\tT\t59.6\tchrX\t26288213\tC\t9\t0\t2\t54\t9\t0\t2\t54\t17\t0\t2\t78\t11\t0\t2\t60\t3\t8\t1\t44\t4\t0\t2\t39\tN\t13\t0.077\t0\n-Contig237_chrX_31256648_31257654\t165\tT\tA\t246.0\tchrX\t31256814\tT\t7\t0\t2\t48\t23\t0\t2\t96\t19\t0\t2\t84\t17\t0\t2\t78\t14\t0\t2\t69\t8\t0\t2\t51\tY\t37\t1.481\t0\n-Contig90_chrX_57430715_57431566\t548\tC\tT\t116.0\tchrX\t57431266\tT\t9\t0\t2\t54\t18\t0\t2\t81\t13\t0\t2\t66\t14\t0\t2\t69\t8\t0\t2\t54\t7\t0\t2\t48\tY\t261\t0.154\t1\n-Contig133_chrX_84833782_84834125\t182\tG\tA\t69.7\tchrX\t84833962\tG\t5\t0\t2\t42\t18\t0\t2\t81\t12\t0\t2\t63\t19\t0\t2\t84\t6\t3\t1\t27\t7\t0\t2\t48\tN\t619\t0.278\t0\n-Con
tig125_chrX_93319363_93320877\t349\tA\tC\t145.0\tchrX\t93319721\tA\t4\t0\t2\t39\t6\t0\t2\t45\t11\t0\t2\t60\t10\t0\t2\t57\t13\t0\t2\t66\t6\t0\t2\t45\tY\t59\t1.686\t0\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pathway_image/pathway_image.png

Binary file test-data/test_out/pathway_image/pathway_image.png has changed

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/PCA.pdf

Binary file test-data/test_out/pca/PCA.pdf has changed

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/admix.gd_indivs
--- a/test-data/test_out/pca/admix.gd_indivs Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

@@ -1,6 +0,0 @@
-PB1 M All_Individuals
-PB2 M All_Individuals
-PB3 M All_Individuals
-PB4 M All_Individuals
-PB6 M All_Individuals
-PB8 M All_Individuals

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/admix.gd_snp
--- a/test-data/test_out/pca/admix.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

b'@@ -1,303 +0,0 @@\n- snp1 11 0.002 2000 A T\n- snp3 11 0.002 2000 A T\n- snp4 11 0.002 2000 A T\n- snp5 11 0.002 2000 A T\n- snp6 11 0.002 2000 A T\n- snp7 11 0.002 2000 A T\n- snp8 11 0.002 2000 A T\n- snp9 11 0.002 2000 A T\n- snp10 11 0.002 2000 A T\n- snp11 11 0.002 2000 A T\n- snp12 11 0.002 2000 A T\n- snp13 11 0.002 2000 A T\n- snp14 11 0.002 2000 A T\n- snp16 11 0.002 2000 A T\n- snp17 11 0.002 2000 A T\n- snp22 11 0.002 2000 A T\n- snp24 11 0.002 2000 A T\n- snp25 11 0.002 2000 A T\n- snp27 11 0.002 2000 A T\n- snp28 11 0.002 2000 A T\n- snp29 11 0.002 2000 A T\n- snp30 11 0.002 2000 A T\n- snp31 11 0.002 2000 A T\n- snp33 11 0.002 2000 A T\n- snp34 11 0.002 2000 A T\n- snp37 11 0.002 2000 A T\n- snp38 11 0.002 2000 A T\n- snp39 11 0.002 2000 A T\n- snp40 11 0.002 2000 A T\n- snp41 11 0.002 2000 A T\n- snp42 11 0.002 2000 A T\n- snp43 11 0.002 2000 A T\n- snp45 11 0.002 2000 A T\n- snp46 11 0.002 2000 A T\n- snp47 11 0.002 2000 A T\n- snp48 11 0.002 2000 A T\n- snp49 11 0.002 2000 A T\n- snp50 11 0.002 2000 A T\n- snp51 11 0.002 2000 A T\n- snp52 11 0.002 2000 A T\n- snp53 11 0.002 2000 A T\n- snp54 11 0.002 2000 A T\n- snp56 11 0.002 2000 A T\n- snp58 11 0.002 2000 A T\n- snp59 11 0.002 2000 A T\n- snp60 11 0.002 2000 A T\n- snp61 11 0.002 2000 A T\n- snp62 11 0.002 2000 A T\n- snp63 11 0.002 2000 A T\n- snp64 11 0.002 2000 A T\n- snp65 11 0.002 2000 A T\n- snp67 11 0.002 2000 A T\n- snp68 11 0.002 2000 A T\n- snp70 11 0.002 2000 A T\n- snp71 11 0.002 2000 A T\n- snp72 11 0.002 2000 A T\n- snp73 11 0.002 2000 A T\n- snp74 11 0.002 2000 A T\n- snp75 11 0.002 2000 A T\n- snp76 11 0.002 2000 A T\n- snp77 11 0.002 2000 A T\n- snp78 11 0.002 2000 A T\n- snp80 11 0.002 2000 A T\n- snp81 11 0.002 2000 A T\n- snp83 11 0.002 2000 A T\n- snp84 11 0.002 2000 A T\n- snp87 11 0.002 2000 A T\n- snp89 11 0.002 2000 A T\n- snp90 11 0.002 2000 A T\n- snp91 11 0.002 2000 A T\n- snp92 11 0.002 2000 A T\n- snp93 11 0.002 2000 A T\n- snp94 11 0.002 2000 A T\n- snp98 11 
0.002 2000 A T\n- snp100 11 0.002 2000 A T\n- snp101 11 0.002 2000 A T\n- snp102 11 0.002 2000 A T\n- snp103 11 0.002 2000 A T\n- snp104 11 0.002 2000 A T\n- snp105 11 0.002 2000 A T\n- snp106 11 0.002 2000 A T\n- snp107 11 0.002 2000 A T\n- snp108 11 0.002 2000 A T\n- snp110 11 0.002 2000 A T\n- snp111 11 0.002 2000 A T\n- snp112 11 0.002 2000 A T\n- snp113 11 0.002 2000 A T\n- snp116 11 0.002 2000 A T\n- snp117 11 0.002 2000 A T\n- snp118 11 0.002 2000 A T\n- snp119 11 0.002 2000 A T\n- snp121 11 0.002 2000 A T\n- snp122 11 0.002 2000 A T\n- snp123 11 0.002 2000 A T\n- snp124 11 0.002 2000 A T\n- snp125 11 0.002 2000 A T\n- snp126 11 0.002 2000 A T\n- snp128 11 0.002 2000 A T\n- snp129 11 0.002 2000 A T\n- snp131 11 0.002 2000 A T\n- snp133 11 0.002 2000 A T\n- snp134 11 0.002 2000 A T\n- snp135 11 0.002 2000 A T\n- snp137 11 0.002 2000 A T\n- snp138 11 0.002 2000 A T\n- snp139 11 0.002 2000 A T\n- snp140 11 0.002 2000 A T\n- snp141 11 0.002 2000 A T\n- snp143 11 0.002 2000 A T\n- snp145 11 0.002 2000 A T\n- snp146 11 0.002 2000 A T\n- snp148 11 0.002 2000 A T\n- snp149 11 0.002 2000 A T\n- snp150 11 0.002 2000 A T\n- snp151 11 0.002 2000 A T\n- snp152 11 0.002 2000 A T\n- snp153 11 0.002 2000 A T\n- snp154 11 0.002 2000 A T\n- snp156 11 0.002 2000 A T\n- snp157 11 0.002 2000 A T\n- snp158 11 0.002 2000 A T\n- snp159 11 0.002 2000 A T\n- snp160 11 0.002 2000 A T\n- snp161 11 0.002 2000 A T\n- snp162 11 0.002 2000 A T\n- snp164 11 0.002 2000 A T\n- snp165 11 0.002 2000 A T\n- snp167 11 0.002 2000 A T\n- snp168 11 0.002 2000 A T\n- snp169 11 0.002 2000 A T\n- snp170 11 0.002 2000 A T\n- snp171 11 0.002 2000 A T\n- snp172 11 0.002 2000 A T\n- snp174 11 0.002 2000 A T\n- snp175 11 0.002 2000 A T\n- snp176 11 0.002 2000 A T\n- snp177 11 0.002 2000 A T\n- snp178 11 0.002 2000 A T\n- snp179 11 0.002 2000 A T\n- snp181 11 0.002 2000 A T\n- snp182 11 0.002 2000 A T\n- snp183 11 0.002 2000 A T\n- snp184 11 0.002 2000 A T\n- snp185 11 0.002 2000 A T\n- snp186 11 0.002 2000 
A T\n- '..b'p211 11 0.002 2000 A T\n- snp212 11 0.002 2000 A T\n- snp213 11 0.002 2000 A T\n- snp214 11 0.002 2000 A T\n- snp215 11 0.002 2000 A T\n- snp216 11 0.002 2000 A T\n- snp217 11 0.002 2000 A T\n- snp218 11 0.002 2000 A T\n- snp219 11 0.002 2000 A T\n- snp220 11 0.002 2000 A T\n- snp221 11 0.002 2000 A T\n- snp223 11 0.002 2000 A T\n- snp224 11 0.002 2000 A T\n- snp225 11 0.002 2000 A T\n- snp226 11 0.002 2000 A T\n- snp227 11 0.002 2000 A T\n- snp228 11 0.002 2000 A T\n- snp229 11 0.002 2000 A T\n- snp230 11 0.002 2000 A T\n- snp231 11 0.002 2000 A T\n- snp232 11 0.002 2000 A T\n- snp235 11 0.002 2000 A T\n- snp236 11 0.002 2000 A T\n- snp237 11 0.002 2000 A T\n- snp239 11 0.002 2000 A T\n- snp240 11 0.002 2000 A T\n- snp241 11 0.002 2000 A T\n- snp242 11 0.002 2000 A T\n- snp243 11 0.002 2000 A T\n- snp244 11 0.002 2000 A T\n- snp246 11 0.002 2000 A T\n- snp247 11 0.002 2000 A T\n- snp248 11 0.002 2000 A T\n- snp249 11 0.002 2000 A T\n- snp250 11 0.002 2000 A T\n- snp251 11 0.002 2000 A T\n- snp252 11 0.002 2000 A T\n- snp253 11 0.002 2000 A T\n- snp254 11 0.002 2000 A T\n- snp255 11 0.002 2000 A T\n- snp256 11 0.002 2000 A T\n- snp257 11 0.002 2000 A T\n- snp258 11 0.002 2000 A T\n- snp260 11 0.002 2000 A T\n- snp261 11 0.002 2000 A T\n- snp262 11 0.002 2000 A T\n- snp263 11 0.002 2000 A T\n- snp264 11 0.002 2000 A T\n- snp265 11 0.002 2000 A T\n- snp266 11 0.002 2000 A T\n- snp267 11 0.002 2000 A T\n- snp268 11 0.002 2000 A T\n- snp269 11 0.002 2000 A T\n- snp270 11 0.002 2000 A T\n- snp271 11 0.002 2000 A T\n- snp273 11 0.002 2000 A T\n- snp274 11 0.002 2000 A T\n- snp275 11 0.002 2000 A T\n- snp276 11 0.002 2000 A T\n- snp277 11 0.002 2000 A T\n- snp278 11 0.002 2000 A T\n- snp281 11 0.002 2000 A T\n- snp282 11 0.002 2000 A T\n- snp284 11 0.002 2000 A T\n- snp287 11 0.002 2000 A T\n- snp288 11 0.002 2000 A T\n- snp289 11 0.002 2000 A T\n- snp290 11 0.002 2000 A T\n- snp291 11 0.002 2000 A T\n- snp292 11 0.002 2000 A T\n- snp293 11 0.002 2000 A T\n- 
snp294 11 0.002 2000 A T\n- snp297 11 0.002 2000 A T\n- snp298 11 0.002 2000 A T\n- snp299 11 0.002 2000 A T\n- snp300 11 0.002 2000 A T\n- snp301 11 0.002 2000 A T\n- snp302 11 0.002 2000 A T\n- snp303 11 0.002 2000 A T\n- snp304 11 0.002 2000 A T\n- snp307 11 0.002 2000 A T\n- snp308 11 0.002 2000 A T\n- snp309 11 0.002 2000 A T\n- snp310 11 0.002 2000 A T\n- snp312 11 0.002 2000 A T\n- snp313 11 0.002 2000 A T\n- snp316 11 0.002 2000 A T\n- snp317 11 0.002 2000 A T\n- snp320 11 0.002 2000 A T\n- snp321 11 0.002 2000 A T\n- snp322 11 0.002 2000 A T\n- snp323 11 0.002 2000 A T\n- snp324 11 0.002 2000 A T\n- snp325 11 0.002 2000 A T\n- snp328 11 0.002 2000 A T\n- snp329 11 0.002 2000 A T\n- snp331 11 0.002 2000 A T\n- snp332 11 0.002 2000 A T\n- snp333 11 0.002 2000 A T\n- snp334 11 0.002 2000 A T\n- snp335 11 0.002 2000 A T\n- snp336 11 0.002 2000 A T\n- snp338 11 0.002 2000 A T\n- snp339 11 0.002 2000 A T\n- snp341 11 0.002 2000 A T\n- snp342 11 0.002 2000 A T\n- snp344 11 0.002 2000 A T\n- snp345 11 0.002 2000 A T\n- snp348 11 0.002 2000 A T\n- snp350 11 0.002 2000 A T\n- snp352 11 0.002 2000 A T\n- snp353 11 0.002 2000 A T\n- snp354 11 0.002 2000 A T\n- snp355 11 0.002 2000 A T\n- snp360 11 0.002 2000 A T\n- snp361 11 0.002 2000 A T\n- snp362 11 0.002 2000 A T\n- snp364 11 0.002 2000 A T\n- snp366 11 0.002 2000 A T\n- snp369 11 0.002 2000 A T\n- snp370 11 0.002 2000 A T\n- snp371 11 0.002 2000 A T\n- snp372 11 0.002 2000 A T\n- snp373 11 0.002 2000 A T\n- snp374 11 0.002 2000 A T\n- snp375 11 0.002 2000 A T\n- snp376 11 0.002 2000 A T\n- snp377 11 0.002 2000 A T\n- snp378 11 0.002 2000 A T\n- snp379 11 0.002 2000 A T\n- snp380 11 0.002 2000 A T\n- snp381 11 0.002 2000 A T\n- snp382 11 0.002 2000 A T\n- snp383 11 0.002 2000 A T\n- snp384 11 0.002 2000 A T\n- snp385 11 0.002 2000 A T\n- snp386 11 0.002 2000 A T\n- snp389 11 0.002 2000 A T\n- snp390 11 0.002 2000 A T\n- snp393 11 0.002 2000 A T\n- snp395 11 0.002 2000 A T\n- snp397 11 0.002 2000 A T\n- snp400 11 
0.002 2000 A T\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/admix.geno
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/pca/admix.geno Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,303 @@
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+122222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+122222
+222222
+222222
+222222
+222222
+222222
+222222
+222212
+222222
+222222
+222221
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+212222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+122211
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+122222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222022
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+221221
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222122
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+122222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222222
+222212
+222222
+222222
+222222

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/coordinates.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/pca/coordinates.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,7 @@
+           #eigvals:     3.243     1.103
+                 PB1     0.1887      0.4703  All_Individuals
+                 PB2     0.0398      0.0455  All_Individuals
+                 PB3     0.1647     -0.6945  All_Individuals
+                 PB4    -0.8954     -0.0220  All_Individuals
+                 PB6     0.1887      0.4703  All_Individuals
+                 PB8     0.3135     -0.2696  All_Individuals

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/explained.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/pca/explained.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,4 @@
+Percentage explained by eigenvectors:
+1: 64.9%
+2: 22.1%
+3: 13.1%

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/par.admix
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/pca/par.admix Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,7 @@
+genotypename: /scratch/galaxy/home/oocyte/galaxy_oocyte/database/files/000/dataset_260_files/admix.geno
+snpname: /scratch/galaxy/home/oocyte/galaxy_oocyte/database/files/000/dataset_260_files/admix.snp
+indivname: /scratch/galaxy/home/oocyte/galaxy_oocyte/database/files/000/dataset_260_files/admix.ind
+evecoutname: /scratch/galaxy/home/oocyte/galaxy_oocyte/database/files/000/dataset_260_files/coordinates.txt
+evaloutname: /scratch/galaxy/home/oocyte/galaxy_oocyte/database/files/000/dataset_260_files/admix.eval
+altnormstyle: NO
+numoutevec: 2

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/pca/pca.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/pca/pca.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,37 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>PCA Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 02:19:05 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="PCA.pdf">PCA.pdf</a></li>
+            <li><a href="coordinates.txt">coordinates.txt</a></li>
+            <li><a href="explained.txt">explained.txt</a></li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li><a href="par.admix">par.admix</a></li>
+            <li><a href="admix.geno">admix.geno</a></li>
+            <li><a href="admix.snp">admix.snp</a></li>
+            <li><a href="admix.ind">admix.ind</a></li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Stats<p/><pre>
+
+</pre>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/distance_matrix.phylip
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/phylogenetic_tree/distance_matrix.phylip Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,8 @@
+7
+  canFam2 0.0000 0.3205 0.3085 0.3193 0.3101 0.3138 0.3170
+      PB1 0.3205 0.0000 0.0103 0.0100 0.0130 0.0119 0.0112
+      PB2 0.3085 0.0103 0.0000 0.0033 0.0062 0.0094 0.0062
+      PB3 0.3193 0.0100 0.0033 0.0000 0.0081 0.0091 0.0054
+      PB4 0.3101 0.0130 0.0062 0.0081 0.0000 0.0099 0.0088
+      PB6 0.3138 0.0119 0.0094 0.0091 0.0099 0.0000 0.0079
+      PB8 0.3170 0.0112 0.0062 0.0054 0.0088 0.0079 0.0000

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/informative_snps.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/phylogenetic_tree/informative_snps.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,7 @@
+  canFam2        0      338      339      350      345      342      344
+      PB1      338        0      338      344      338      336      339
+      PB2      339      338        0      345      338      339      338
+      PB3      350      344      345        0      347      342      347
+      PB4      345      338      338      347        0      337      341
+      PB6      342      336      339      342      337        0      343
+      PB8      344      339      338      347      341      343        0

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/mega_distance_matrix.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/phylogenetic_tree/mega_distance_matrix.txt Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,27 @@
+#mega
+!Title: Galaxy;
+!Format DataType=Distance DataFormat=LowerLeft NTaxa=7;
+
+[1] #canFam2
+[2] #PB1
+[3] #PB2
+[4] #PB3
+[5] #PB4
+[6] #PB6
+[7] #PB8
+
+
+
+[   1   2   3   4   5   6   7 ]
+[1]
+[2]  0.3205
+[3]  0.3085 0.0103
+[4]  0.3193 0.0100 0.0033
+[5]  0.3101 0.0130 0.0062 0.0081
+[6]  0.3138 0.0119 0.0094 0.0091 0.0099
+[7]  0.3170 0.0112 0.0062 0.0054 0.0088 0.0079
+
+
+
+
+

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/phylogenetic_tree.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/phylogenetic_tree/phylogenetic_tree.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,49 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>Phylogenetic tree Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 01:57:44 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="tree.pdf">tree.pdf</a></li>
+            <li><a href="phylogenetic_tree.newick">phylogenetic tree (newick)</a></li>
+            <li><a href="distance_matrix.phylip">Phylip distance matrix</a></li>
+            <li><a href="mega_distance_matrix.txt">Mega distance matrix</a></li>
+            <li><a href="informative_snps.txt">informative SNPs</a></li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li>Minimum coverage: 3</li>
+            <li>Minimum quality: 30</li>
+            <li>Include reference sequence: yes</li>
+            <li>Data source: sequence coverage</li>
+            <li>Branch type: square</li>
+            <li>Draw branches to scale: yes</li>
+            <li>Show branch lengths: yes</li>
+            <li>Tree layout: horizontal</li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Individuals
+<ol>
+<li>PB1</li>
+<li>PB2</li>
+<li>PB3</li>
+<li>PB4</li>
+<li>PB6</li>
+<li>PB8</li>
+</ol>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/phylogenetic_tree.newick
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/phylogenetic_tree/phylogenetic_tree.newick Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,16 @@
+(
+(
+(
+PB4:0.00174,
+canFam2:0.30836)
+:0.00188,
+PB2:0.00042)
+:0.00210,
+(
+PB6:0.00470,
+PB1:0.00720)
+:0.00035,
+(
+PB8:0.00288,
+PB3:0.00252)
+:0.00055);

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/phylogenetic_tree/tree.pdf

Binary file test-data/test_out/phylogenetic_tree/tree.pdf has changed

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/population_structure/graphical.pdf
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/population_structure/graphical.pdf Wed Sep 12 17:10:26 2012 -0400

[

@@ -0,0 +1,147 @@
+%PDF-1.4
+%��ρ�\r
+1 0 obj
+<<
+/CreationDate (D:20120403142055)
+/ModDate (D:20120403142055)
+/Title (R Graphics Output)
+/Producer (R 2.11.0)
+/Creator (R)
+>>
+endobj
+2 0 obj
+<<
+/Type /Catalog
+/Pages 3 0 R
+>>
+endobj
+5 0 obj
+<<
+/Type /Page
+/Parent 3 0 R
+/Contents 6 0 R
+/Resources 4 0 R
+>>
+endobj
+6 0 obj
+<<
+/Length 7 0 R
+>>
+stream
+1 J 1 j q
+Q q
+1.000 0.000 0.000 rg
+74.40 74.27 54.86 0.00 re f
+0.000 1.000 1.000 rg
+74.40 74.27 54.86 82.69 re f
+1.000 0.000 0.000 rg
+140.23 74.27 54.86 82.69 re f
+0.000 1.000 1.000 rg
+140.23 156.96 54.86 0.00 re f
+1.000 0.000 0.000 rg
+206.06 74.27 54.86 82.69 re f
+0.000 1.000 1.000 rg
+206.06 156.96 54.86 0.00 re f
+1.000 0.000 0.000 rg
+271.89 74.27 54.86 0.00 re f
+0.000 1.000 1.000 rg
+271.89 74.27 54.86 82.69 re f
+1.000 0.000 0.000 rg
+337.71 74.27 54.86 82.69 re f
+0.000 1.000 1.000 rg
+337.71 156.96 54.86 0.00 re f
+1.000 0.000 0.000 rg
+403.54 74.27 54.86 82.69 re f
+0.000 1.000 1.000 rg
+403.54 156.96 54.86 0.00 re f
+BT
+0.000 0.000 0.000 rg
+/F2 1 Tf 12.00 0.00 -0.00 12.00 236.05 18.72 Tm (Individual #) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 12.96 91.68 Tm [(Ancestr) -30 (y)] TJ
+ET
+Q q
+0.000 0.000 0.000 RG
+0.75 w
+[] 0 d
+1 J
+1 j
+10.00 M
+59.04 74.27 m 59.04 156.96 l S
+59.04 74.27 m 51.84 74.27 l S
+59.04 90.81 m 51.84 90.81 l S
+59.04 107.34 m 51.84 107.34 l S
+59.04 123.88 m 51.84 123.88 l S
+59.04 140.42 m 51.84 140.42 l S
+59.04 156.96 m 51.84 156.96 l S
+BT
+0.000 0.000 0.000 rg
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 65.93 Tm (0.0) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 99.00 Tm (0.4) Tj
+ET
+BT
+/F2 1 Tf 0.00 12.00 -12.00 0.00 41.76 132.08 Tm (0.8) Tj
+ET
+Q
+endstream
+endobj
+7 0 obj
+1275
+endobj
+3 0 obj
+<<
+/Type /Pages
+/Kids [
+5 0 R
+]
+/Count 1
+/MediaBox [0 0 504 216]
+>>
+endobj
+4 0 obj
+<<
+/ProcSet [/PDF /Text]
+/Font <</F2 9 0 R >>
+/ExtGState << >>
+>>
+endobj
+8 0 obj
+<<
+/Type /Encoding
+/BaseEncoding /WinAnsiEncoding
+/Differences [ 45/minus 96/quoteleft
+144/dotlessi /grave /acute /circumflex /tilde /macron /breve /dotaccent
+/dieresis /.notdef /ring /cedilla /.notdef /hungarumlaut /ogonek /caron /space]
+>>
+endobj
+9 0 obj <<
+/Type /Font
+/Subtype /Type1
+/Name /F2
+/BaseFont /Helvetica
+/Encoding 8 0 R
+>> endobj
+xref
+0 10
+0000000000 65535 f
+0000000021 00000 n
+0000000164 00000 n
+0000001641 00000 n
+0000001724 00000 n
+0000000213 00000 n
+0000000293 00000 n
+0000001621 00000 n
+0000001805 00000 n
+0000002062 00000 n
+trailer
+<<
+/Size 10
+/Info 1 0 R
+/Root 2 0 R
+>>
+startxref
+2158
+%%EOF

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/population_structure/numeric.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/population_structure/numeric.txt Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,6 @@
+0.000010 0.999990
+0.999990 0.000010
+0.999990 0.000010
+0.000010 0.999990
+0.999990 0.000010
+0.999990 0.000010

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/population_structure/population_structure.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/population_structure/population_structure.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,44 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>Population structure Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 02:20:55 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="graphical.pdf">graphical.pdf</a></li>
+            <li><a href="numeric.txt">numeric.txt</a></li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li>Number of populations: 2</li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Populations
+<ul>
+<li>
+All Individuals
+<ol>
+<li>PB1</li>
+<li>PB2</li>
+<li>PB3</li>
+<li>PB4</li>
+<li>PB6</li>
+<li>PB8</li>
+</ol>
+</li>
+</ul>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/prepare_population_structure/admix.map
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/prepare_population_structure/admix.map Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,303 @@
+1 snp1 0 2
+1 snp3 0 4
+1 snp4 0 5
+1 snp5 0 6
+1 snp6 0 7
+1 snp7 0 8
+1 snp8 0 9
+1 snp9 0 10
+1 snp10 0 11
+1 snp11 0 12
+1 snp12 0 13
+1 snp13 0 14
+1 snp14 0 15
+1 snp16 0 17
+1 snp17 0 18
+1 snp22 0 23
+1 snp24 0 25
+1 snp25 0 26
+1 snp27 0 28
+1 snp28 0 29
+1 snp29 0 30
+1 snp30 0 31
+1 snp31 0 32
+1 snp33 0 34
+1 snp34 0 35
+1 snp37 0 38
+1 snp38 0 39
+1 snp39 0 40
+1 snp40 0 41
+1 snp41 0 42
+1 snp42 0 43
+1 snp43 0 44
+1 snp45 0 46
+1 snp46 0 47
+1 snp47 0 48
+1 snp48 0 49
+1 snp49 0 50
+1 snp50 0 51
+1 snp51 0 52
+1 snp52 0 53
+1 snp53 0 54
+1 snp54 0 55
+1 snp56 0 57
+1 snp58 0 59
+1 snp59 0 60
+1 snp60 0 61
+1 snp61 0 62
+1 snp62 0 63
+1 snp63 0 64
+1 snp64 0 65
+1 snp65 0 66
+1 snp67 0 68
+1 snp68 0 69
+1 snp70 0 71
+1 snp71 0 72
+1 snp72 0 73
+1 snp73 0 74
+1 snp74 0 75
+1 snp75 0 76
+1 snp76 0 77
+1 snp77 0 78
+1 snp78 0 79
+1 snp80 0 81
+1 snp81 0 82
+1 snp83 0 84
+1 snp84 0 85
+1 snp87 0 88
+1 snp89 0 90
+1 snp90 0 91
+1 snp91 0 92
+1 snp92 0 93
+1 snp93 0 94
+1 snp94 0 95
+1 snp98 0 99
+1 snp100 0 101
+1 snp101 0 102
+1 snp102 0 103
+1 snp103 0 104
+1 snp104 0 105
+1 snp105 0 106
+1 snp106 0 107
+1 snp107 0 108
+1 snp108 0 109
+1 snp110 0 111
+1 snp111 0 112
+1 snp112 0 113
+1 snp113 0 114
+1 snp116 0 117
+1 snp117 0 118
+1 snp118 0 119
+1 snp119 0 120
+1 snp121 0 122
+1 snp122 0 123
+1 snp123 0 124
+1 snp124 0 125
+1 snp125 0 126
+1 snp126 0 127
+1 snp128 0 129
+1 snp129 0 130
+1 snp131 0 132
+1 snp133 0 134
+1 snp134 0 135
+1 snp135 0 136
+1 snp137 0 138
+1 snp138 0 139
+1 snp139 0 140
+1 snp140 0 141
+1 snp141 0 142
+1 snp143 0 144
+1 snp145 0 146
+1 snp146 0 147
+1 snp148 0 149
+1 snp149 0 150
+1 snp150 0 151
+1 snp151 0 152
+1 snp152 0 153
+1 snp153 0 154
+1 snp154 0 155
+1 snp156 0 157
+1 snp157 0 158
+1 snp158 0 159
+1 snp159 0 160
+1 snp160 0 161
+1 snp161 0 162
+1 snp162 0 163
+1 snp164 0 165
+1 snp165 0 166
+1 snp167 0 168
+1 snp168 0 169
+1 snp169 0 170
+1 snp170 0 171
+1 snp171 0 172
+1 snp172 0 173
+1 snp174 0 175
+1 snp175 0 176
+1 snp176 0 177
+1 snp177 0 178
+1 snp178 0 179
+1 snp179 0 180
+1 snp181 0 182
+1 snp182 0 183
+1 snp183 0 184
+1 snp184 0 185
+1 snp185 0 186
+1 snp186 0 187
+1 snp188 0 189
+1 snp191 0 192
+1 snp192 0 193
+1 snp193 0 194
+1 snp195 0 196
+1 snp196 0 197
+1 snp197 0 198
+1 snp199 0 200
+1 snp200 0 201
+1 snp201 0 202
+1 snp202 0 203
+1 snp203 0 204
+1 snp205 0 206
+1 snp207 0 208
+1 snp210 0 211
+1 snp211 0 212
+1 snp212 0 213
+1 snp213 0 214
+1 snp214 0 215
+1 snp215 0 216
+1 snp216 0 217
+1 snp217 0 218
+1 snp218 0 219
+1 snp219 0 220
+1 snp220 0 221
+1 snp221 0 222
+1 snp223 0 224
+1 snp224 0 225
+1 snp225 0 226
+1 snp226 0 227
+1 snp227 0 228
+1 snp228 0 229
+1 snp229 0 230
+1 snp230 0 231
+1 snp231 0 232
+1 snp232 0 233
+1 snp235 0 236
+1 snp236 0 237
+1 snp237 0 238
+1 snp239 0 240
+1 snp240 0 241
+1 snp241 0 242
+1 snp242 0 243
+1 snp243 0 244
+1 snp244 0 245
+1 snp246 0 247
+1 snp247 0 248
+1 snp248 0 249
+1 snp249 0 250
+1 snp250 0 251
+1 snp251 0 252
+1 snp252 0 253
+1 snp253 0 254
+1 snp254 0 255
+1 snp255 0 256
+1 snp256 0 257
+1 snp257 0 258
+1 snp258 0 259
+1 snp260 0 261
+1 snp261 0 262
+1 snp262 0 263
+1 snp263 0 264
+1 snp264 0 265
+1 snp265 0 266
+1 snp266 0 267
+1 snp267 0 268
+1 snp268 0 269
+1 snp269 0 270
+1 snp270 0 271
+1 snp271 0 272
+1 snp273 0 274
+1 snp274 0 275
+1 snp275 0 276
+1 snp276 0 277
+1 snp277 0 278
+1 snp278 0 279
+1 snp281 0 282
+1 snp282 0 283
+1 snp284 0 285
+1 snp287 0 288
+1 snp288 0 289
+1 snp289 0 290
+1 snp290 0 291
+1 snp291 0 292
+1 snp292 0 293
+1 snp293 0 294
+1 snp294 0 295
+1 snp297 0 298
+1 snp298 0 299
+1 snp299 0 300
+1 snp300 0 301
+1 snp301 0 302
+1 snp302 0 303
+1 snp303 0 304
+1 snp304 0 305
+1 snp307 0 308
+1 snp308 0 309
+1 snp309 0 310
+1 snp310 0 311
+1 snp312 0 313
+1 snp313 0 314
+1 snp316 0 317
+1 snp317 0 318
+1 snp320 0 321
+1 snp321 0 322
+1 snp322 0 323
+1 snp323 0 324
+1 snp324 0 325
+1 snp325 0 326
+1 snp328 0 329
+1 snp329 0 330
+1 snp331 0 332
+1 snp332 0 333
+1 snp333 0 334
+1 snp334 0 335
+1 snp335 0 336
+1 snp336 0 337
+1 snp338 0 339
+1 snp339 0 340
+1 snp341 0 342
+1 snp342 0 343
+1 snp344 0 345
+1 snp345 0 346
+1 snp348 0 349
+1 snp350 0 351
+1 snp352 0 353
+1 snp353 0 354
+1 snp354 0 355
+1 snp355 0 356
+1 snp360 0 361
+1 snp361 0 362
+1 snp362 0 363
+1 snp364 0 365
+1 snp366 0 367
+1 snp369 0 370
+1 snp370 0 371
+1 snp371 0 372
+1 snp372 0 373
+1 snp373 0 374
+1 snp374 0 375
+1 snp375 0 376
+1 snp376 0 377
+1 snp377 0 378
+1 snp378 0 379
+1 snp379 0 380
+1 snp380 0 381
+1 snp381 0 382
+1 snp382 0 383
+1 snp383 0 384
+1 snp384 0 385
+1 snp385 0 386
+1 snp386 0 387
+1 snp389 0 390
+1 snp390 0 391
+1 snp393 0 394
+1 snp395 0 396
+1 snp397 0 398
+1 snp400 0 401

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/prepare_population_structure/admix.ped
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/prepare_population_structure/admix.ped Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,6 @@
+PB1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+PB2 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+PB3 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+PB4 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+PB6 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
+PB8 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/prepare_population_structure/prepare_population_structure.html
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/prepare_population_structure/prepare_population_structure.html Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,47 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+  <head>
+    <meta http-equiv="Content-type" content="text/html; charset=UTF-8" />
+    <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
+    <title>Prepare to look for population structure Galaxy Composite Dataset</title>
+  </head>
+  <body>
+    <div class="document">
+      Output completed: 2012-04-03 02:17:44 PM
+      <p/>
+      <div id="gd_outputs">
+        Outputs
+        <ul>
+            <li><a href="admix.ped">admix.ped</a></li>
+            <li><a href="admix.map">admix.map</a></li>
+            <li>Using 303 of 400 SNPs</li>
+        </ul>
+      </div>
+      <div id="gd_inputs">
+        Inputs
+        <ul>
+            <li>Minimum reads covering a SNP, per individual: 3</li>
+            <li>Minimum quality value, per individual: 30</li>
+            <li>Minimum spacing between SNPs on the same scaffold: 0</li>
+        </ul>
+      </div>
+      <div id="gd_misc">
+        Populations
+<ul>
+<li>
+All Individuals
+<ol>
+<li>PB1</li>
+<li>PB2</li>
+<li>PB3</li>
+<li>PB4</li>
+<li>PB6</li>
+<li>PB8</li>
+</ol>
+</li>
+</ul>
+      </div>
+    </div>
+  </body>
+</html>

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/rank_pathways/rank_pathways.tabular
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/test_out/rank_pathways/rank_pathways.tabular Wed Sep 12 17:10:26 2012 -0400

b'@@ -0,0 +1,240 @@\n+3\t0.25\t1\tcfa03450=Non-homologous end-joining\n+1\t0.25\t1\tcfa00750=Vitamin B6 metabolism\n+2\t0.2\t3\tcfa00290=Valine, leucine and isoleucine biosynthesis\n+3\t0.18\t4\tcfa00770=Pantothenate and CoA biosynthesis\n+5\t0.17\t5\tcfa05310=Asthma\n+4\t0.16\t6\tcfa00760=Nicotinate and nicotinamide metabolism\n+2\t0.12\t7\tcfa00450=Selenocompound metabolism\n+4\t0.11\t8\tcfa05330=Allograft rejection\n+5\t0.098\t9\tcfa04672=Intestinal immune network for IgA production\n+4\t0.098\t9\tcfa02010=ABC transporters\n+2\t0.095\t11\tcfa03430=Mismatch repair\n+4\t0.089\t12\tcfa05320=Autoimmune thyroid disease\n+4\t0.089\t12\tcfa00280=Valine, leucine and isoleucine degradation\n+3\t0.088\t14\tcfa03410=Base excision repair\n+3\t0.088\t14\tcfa03030=DNA replication\n+3\t0.088\t14\tcfa00565=Ether lipid metabolism\n+6\t0.087\t17\tcfa05140=Leishmaniasis\n+2\t0.087\t17\tcfa04977=Vitamin digestion and absorption\n+1\t0.083\t19\tcfa00740=Riboflavin metabolism\n+4\t0.08\t20\tcfa05150=Staphylococcus aureus infection\n+2\t0.08\t20\tcfa03060=Protein export\n+3\t0.079\t22\tcfa05340=Primary immunodeficiency\n+3\t0.079\t22\tcfa05143=African trypanosomiasis\n+6\t0.078\t24\tcfa00564=Glycerophospholipid metabolism\n+2\t0.077\t25\tcfa00410=beta-Alanine metabolism\n+2\t0.071\t26\tcfa05332=Graft-versus-host disease\n+5\t0.069\t27\tcfa03320=PPAR signaling pathway\n+6\t0.066\t28\tcfa05323=Rheumatoid arthritis\n+5\t0.063\t29\tcfa04664=Fc epsilon RI signaling pathway\n+3\t0.062\t30\tcfa00561=Glycerolipid metabolism\n+2\t0.062\t30\tcfa00350=Tyrosine metabolism\n+2\t0.062\t30\tcfa00020=Citrate cycle (TCA cycle)\n+2\t0.059\t33\tcfa00260=Glycine, serine and threonine metabolism\n+1\t0.059\t33\tcfa04614=Renin-angiotensin system\n+1\t0.059\t33\tcfa00360=Phenylalanine metabolism\n+9\t0.058\t36\tcfa04145=Phagosome\n+3\t0.058\t36\tcfa05213=Endometrial cancer\n+4\t0.057\t38\tcfa05416=Viral myocarditis\n+2\t0.057\t38\tcfa00500=Starch and sucrose metabolism\n+2\t0.056\t40\tcfa04130=SNARE 
interactions in vesicular transport\n+1\t0.056\t40\tcfa00592=alpha-Linolenic acid metabolism\n+1\t0.053\t42\tcfa04964=Proximal tubule bicarbonate reclamation\n+1\t0.053\t42\tcfa00630=Glyoxylate and dicarboxylate metabolism\n+3\t0.052\t44\tcfa04621=NOD-like receptor signaling pathway\n+2\t0.05\t45\tcfa05219=Bladder cancer\n+2\t0.05\t45\tcfa04940=Type I diabetes mellitus\n+2\t0.05\t45\tcfa00380=Tryptophan metabolism\n+2\t0.047\t48\tcfa03420=Nucleotide excision repair\n+3\t0.045\t49\tcfa04920=Adipocytokine signaling pathway\n+3\t0.045\t49\tcfa00970=Aminoacyl-tRNA biosynthesis\n+2\t0.045\t49\tcfa00071=Fatty acid metabolism\n+1\t0.045\t49\tcfa00591=Linoleic acid metabolism\n+1\t0.045\t49\tcfa00340=Histidine metabolism\n+4\t0.043\t54\tcfa04972=Pancreatic secretion\n+2\t0.043\t54\tcfa03022=Basal transcription factors\n+2\t0.043\t54\tcfa00982=Drug metabolism - cytochrome P450\n+3\t0.042\t57\tcfa05218=Melanoma\n+3\t0.042\t57\tcfa05211=Renal cell carcinoma\n+4\t0.041\t59\tcfa05414=Dilated cardiomyopathy\n+2\t0.04\t60\tcfa00590=Arachidonic acid metabolism\n+1\t0.04\t60\tcfa04320=Dorso-ventral axis formation\n+3\t0.039\t62\tcfa04662=B cell receptor signaling pathway\n+2\t0.039\t62\tcfa00310=Lysine degradation\n+3\t0.038\t64\tcfa04512=ECM-receptor interaction\n+2\t0.038\t64\tcfa05144=Malaria\n+2\t0.038\t64\tcfa00270=Cysteine and methionine metabolism\n+1\t0.038\t64\tcfa03440=Homologous recombination\n+1\t0.038\t64\tcfa00052=Galactose metabolism\n+8\t0.037\t69\tcfa04810=Regulation of actin cytoskeleton\n+4\t0.037\t69\tcfa05146=Amoebiasis\n+4\t0.037\t69\tcfa04666=Fc gamma R-mediated phagocytosis\n+2\t0.037\t69\tcfa05223=Non-small cell lung cancer\n+6\t0.036\t73\tcfa05168=Herpes simplex infection\n+6\t0.036\t73\tcfa05152=Tuberculosis\n+3\t0.036\t73\tcfa04640=Hematopoietic cell lineage\n+7\t0.034\t76\tcfa04510=Focal adhesion\n+3\t0.034\t76\tcfa00240=Pyrimidine metabolism\n+3\t0.033\t78\tcfa03008=Ribosome biogenesis in eukaryotes\n+1\t0.033\t78\tcfa00983=Drug metabolism - other 
enzymes\n+2\t0.032\t80\tcfa04976=Bile secretion\n+6\t0.031\t81\tcfa04060=Cytokine-cytokine receptor interaction\n+4\t0.031\t81\tcfa04110=Cell cycle\n+1\t0.031\t81\tcfa00250=Alanine, aspartate and glutamate metabolism\n+4\t0.029\t84\tcfa05145=Toxoplasmosis\n+3\t0.029\t84\tcfa04650=Natural killer cell mediated cytotoxicity\n+2\t0.029\t84\tcfa05214=Glioma\n+4\t'..b"active ligand-receptor interaction\n+1\t0.0079\t159\tcfa04728=Dopaminergic synapse\n+2\t0.0074\t160\tcfa05010=Alzheimer's disease\n+1\t0.0074\t160\tcfa04722=Neurotrophin signaling pathway\n+1\t0.0074\t160\tcfa04120=Ubiquitin mediated proteolysis\n+1\t0.0068\t163\tcfa00190=Oxidative phosphorylation\n+1\t0.0067\t164\tcfa05012=Parkinson's disease\n+1\t0.0057\t165\tcfa03013=RNA transport\n+1\t0.0056\t166\tcfa03040=Spliceosome\n+1\t0.0049\t167\tcfa05016=Huntington's disease\n+1\t0.0023\t168\tcfa04740=Olfactory transduction\n+0\t0\t169\tcfa05222=Small cell lung cancer\n+0\t0\t169\tcfa05217=Basal cell carcinoma\n+0\t0\t169\tcfa05216=Thyroid cancer\n+0\t0\t169\tcfa05100=Bacterial invasion of epithelial cells\n+0\t0\t169\tcfa05020=Prion diseases\n+0\t0\t169\tcfa05014=Amyotrophic lateral sclerosis (ALS)\n+0\t0\t169\tcfa04973=Carbohydrate digestion and absorption\n+0\t0\t169\tcfa04966=Collecting duct acid secretion\n+0\t0\t169\tcfa04962=Vasopressin-regulated water reabsorption\n+0\t0\t169\tcfa04961=Endocrine and other factor-regulated calcium reabsorption\n+0\t0\t169\tcfa04960=Aldosterone-regulated sodium reabsorption\n+0\t0\t169\tcfa04950=Maturity onset diabetes of the young\n+0\t0\t169\tcfa04930=Type II diabetes mellitus\n+0\t0\t169\tcfa04744=Phototransduction\n+0\t0\t169\tcfa04742=Taste transduction\n+0\t0\t169\tcfa04725=Cholinergic synapse\n+0\t0\t169\tcfa04721=Synaptic vesicle cycle\n+0\t0\t169\tcfa04710=Circadian rhythm - mammal\n+0\t0\t169\tcfa04623=Cytosolic DNA-sensing pathway\n+0\t0\t169\tcfa04622=RIG-I-like receptor signaling pathway\n+0\t0\t169\tcfa04530=Tight junction\n+0\t0\t169\tcfa04340=Hedgehog 
signaling pathway\n+0\t0\t169\tcfa04310=Wnt signaling pathway\n+0\t0\t169\tcfa04140=Regulation of autophagy\n+0\t0\t169\tcfa04122=Sulfur relay system\n+0\t0\t169\tcfa03050=Proteasome\n+0\t0\t169\tcfa03020=RNA polymerase\n+0\t0\t169\tcfa03010=Ribosome\n+0\t0\t169\tcfa01040=Biosynthesis of unsaturated fatty acids\n+0\t0\t169\tcfa00920=Sulfur metabolism\n+0\t0\t169\tcfa00910=Nitrogen metabolism\n+0\t0\t169\tcfa00900=Terpenoid backbone biosynthesis\n+0\t0\t169\tcfa00860=Porphyrin and chlorophyll metabolism\n+0\t0\t169\tcfa00790=Folate biosynthesis\n+0\t0\t169\tcfa00785=Lipoic acid metabolism\n+0\t0\t169\tcfa00780=Biotin metabolism\n+0\t0\t169\tcfa00730=Thiamine metabolism\n+0\t0\t169\tcfa00670=One carbon pool by folate\n+0\t0\t169\tcfa00650=Butanoate metabolism\n+0\t0\t169\tcfa00604=Glycosphingolipid biosynthesis - ganglio series\n+0\t0\t169\tcfa00603=Glycosphingolipid biosynthesis - globo series\n+0\t0\t169\tcfa00601=Glycosphingolipid biosynthesis - lacto and neolacto series\n+0\t0\t169\tcfa00600=Sphingolipid metabolism\n+0\t0\t169\tcfa00563=Glycosylphosphatidylinositol(GPI)-anchor biosynthesis\n+0\t0\t169\tcfa00562=Inositol phosphate metabolism\n+0\t0\t169\tcfa00534=Glycosaminoglycan biosynthesis - heparan sulfate\n+0\t0\t169\tcfa00533=Glycosaminoglycan biosynthesis - keratan sulfate\n+0\t0\t169\tcfa00532=Glycosaminoglycan biosynthesis - chondroitin sulfate\n+0\t0\t169\tcfa00531=Glycosaminoglycan degradation\n+0\t0\t169\tcfa00520=Amino sugar and nucleotide sugar metabolism\n+0\t0\t169\tcfa00514=Other types of O-glycan biosynthesis\n+0\t0\t169\tcfa00512=Mucin type O-Glycan biosynthesis\n+0\t0\t169\tcfa00511=Other glycan degradation\n+0\t0\t169\tcfa00510=N-Glycan biosynthesis\n+0\t0\t169\tcfa00472=D-Arginine and D-ornithine metabolism\n+0\t0\t169\tcfa00471=D-Glutamine and D-glutamate metabolism\n+0\t0\t169\tcfa00460=Cyanoamino acid metabolism\n+0\t0\t169\tcfa00430=Taurine and hypotaurine metabolism\n+0\t0\t169\tcfa00400=Phenylalanine, tyrosine and tryptophan 
biosynthesis\n+0\t0\t169\tcfa00300=Lysine biosynthesis\n+0\t0\t169\tcfa00232=Caffeine metabolism\n+0\t0\t169\tcfa00140=Steroid hormone biosynthesis\n+0\t0\t169\tcfa00130=Ubiquinone and other terpenoid-quinone biosynthesis\n+0\t0\t169\tcfa00120=Primary bile acid biosynthesis\n+0\t0\t169\tcfa00100=Steroid biosynthesis\n+0\t0\t169\tcfa00072=Synthesis and degradation of ketone bodies\n+0\t0\t169\tcfa00062=Fatty acid elongation in mitochondria\n+0\t0\t169\tcfa00061=Fatty acid biosynthesis\n+0\t0\t169\tcfa00053=Ascorbate and aldarate metabolism\n+0\t0\t169\tcfa00051=Fructose and mannose metabolism\n+0\t0\t169\tcfa00040=Pentose and glucuronate interconversions\n+0\t0\t169\tcfa00030=Pentose phosphate pathway\n"

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/select_snps/select_snps.gd_snp
--- a/test-data/test_out/select_snps/select_snps.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

b'@@ -1,102 +0,0 @@\n-#{"column_names":["scaf","pos","A","B","qual","ref","rpos","rnuc","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","pair","dist",\n-#"prim","rflp"],"dbkey":"canFam2","individuals":[["PB1",9],["PB2",13],["PB3",17],["PB4",21],["PB6",25],["PB8",29]],"pos":2,"rPos":7,"ref":6,"scaffold":1,"species":"bear"}\n-Contig161_chr1_4641264_4641879\t115\tC\tT\t73.5\tchr1\t4641382\tC\t6\t0\t2\t45\t8\t0\t2\t51\t15\t0\t2\t72\t5\t0\t2\t42\t6\t0\t2\t45\t10\t0\t2\t57\tY\t54\t0.323\t0\n-Contig86_chr1_30984450_30985684\t670\tC\tT\t365.0\tchr1\t30985133\tC\t9\t0\t2\t54\t10\t0\t2\t57\t13\t0\t2\t66\t3\t0\t2\t36\t9\t0\t2\t54\t7\t0\t2\t48\tY\t145\t0.031\t0\n-Contig21_chr1_60697952_60699446\t307\tG\tA\t51.9\tchr1\t60698265\tG\t12\t0\t2\t63\t9\t0\t2\t54\t4\t0\t2\t39\t6\t0\t2\t45\t9\t0\t2\t54\t4\t0\t2\t39\tY\t98\t0.507\t0\n-Contig64_chr1_87343284_87345672\t163\tT\tA\t3.76\tchr1\t87343443\tC\t0\t2\t2\t1\t0\t0\t-1\t0\t5\t0\t2\t42\t2\t0\t2\t33\t0\t1\t2\t14\t0\t0\t-1\t0\tN\t3\t0.039\t2\n-Contig20_chr1_110679280_110679687\t181\tC\tT\t87.4\tchr1\t110679454\t-\t1\t0\t2\t30\t7\t0\t2\t48\t4\t0\t2\t39\t2\t0\t2\t33\t2\t0\t2\t33\t0\t0\t-1\t0\tN\t31\t0.660\t2\n-Contig222_chr2_9817738_9818143\t220\tC\tT\t888.0\tchr2\t9817960\tC\t17\t0\t2\t78\t12\t0\t2\t63\t20\t0\t2\t87\t8\t0\t2\t51\t11\t0\t2\t60\t12\t0\t2\t63\tY\t76\t0.093\t1\n-Contig47_chr2_25470778_25471576\t126\tG\tA\t888.0\tchr2\t25470896\tG\t12\t0\t2\t63\t14\t0\t2\t69\t14\t0\t2\t69\t10\t0\t2\t57\t18\t0\t2\t81\t13\t0\t2\t66\tN\t11\t0.289\t1\n-Contig6_chr2_56859179_56859956\t671\tT\tC\t999.9\tchr2\t56859851\tT\t15\t0\t2\t72\t18\t0\t2\t81\t20\t0\t2\t90\t19\t0\t2\t84\t19\t0\t2\t84\t24\t0\t2\t99\tN\t28\t5.308\t1\n-Contig163_chr2_76402959_76404830\t221\tC\tT\t127.0\tchr2\t76403181\tC\t4\t0\t2\t42\t10\t0\t2\t57\t9\t0\t2\t54\t11\t0\t2\t60\t7\t0\t2\t48\t9\t0\t2\t54\tY\t54\t0.178\t1\n-Contig56_chr3_17326225_17327548\t387\tG\tC\t91.2\tchr3\t17326591\tG\t14\t0\t2\t69\t13\t0\t
2\t66\t15\t0\t2\t72\t15\t0\t2\t72\t13\t0\t2\t66\t12\t0\t2\t63\tY\t20\t0.225\t3\n-Contig108_chr3_46210055_46210874\t367\tA\tG\t21.0\tchr3\t46210423\tA\t19\t0\t2\t84\t10\t0\t2\t57\t16\t0\t2\t75\t14\t0\t2\t69\t20\t0\t2\t87\t11\t0\t2\t60\tN\t236\t0.028\t1\n-Contig1_chr3_51588422_51589409\t926\tA\tG\t51.0\tchr3\t51589353\tG\t2\t0\t2\t33\t2\t0\t2\t33\t6\t0\t2\t45\t4\t0\t2\t39\t9\t0\t2\t54\t11\t0\t2\t60\tN\t21\t1.147\t0\n-Contig65_chr3_80727952_80728283\t39\tT\tC\t71.2\tchr3\t80727990\tT\t7\t0\t2\t48\t3\t0\t2\t36\t8\t0\t2\t51\t6\t0\t2\t45\t8\t0\t2\t51\t11\t0\t2\t60\tN\t22\t7.078\t0\n-Contig134_chr4_12145648_12148225\t1326\tC\tT\t164.0\tchr4\t12146961\tC\t9\t0\t2\t54\t8\t0\t2\t51\t7\t0\t2\t48\t3\t0\t2\t36\t5\t0\t2\t42\t5\t0\t2\t42\tY\t4\t0.080\t1\n-Contig19_chr4_26233601_26233991\t146\tG\tC\t51.6\tchr4\t26233744\tG\t10\t0\t2\t57\t8\t0\t2\t51\t9\t0\t2\t54\t5\t0\t2\t42\t9\t0\t2\t54\t4\t0\t2\t39\tN\t41\t0.163\t3\n-Contig17_chr4_61310346_61311158\t267\tC\tT\t49.9\tchr4\t61310604\tT\t10\t0\t2\t57\t7\t0\t2\t48\t9\t0\t2\t54\t10\t0\t2\t57\t14\t0\t2\t69\t7\t0\t2\t48\tY\t219\t0.098\t0\n-Contig31_chr5_4734956_4736547\t1166\tC\tT\t133.0\tchr5\t4736132\tC\t14\t0\t2\t69\t8\t0\t2\t51\t17\t0\t2\t78\t4\t0\t2\t39\t9\t0\t2\t54\t12\t0\t2\t63\tY\t1\t0.021\t0\n-Contig6_chr5_26899813_26900498\t97\tA\tC\t88.6\tchr5\t26899910\tA\t15\t0\t2\t72\t14\t0\t2\t69\t27\t0\t2\t108\t15\t0\t2\t72\t13\t0\t2\t69\t12\t0\t2\t63\tY\t92\t7.370\t3\n-Contig45_chr5_50892738_50892968\t169\tC\tA\t25.8\tchr5\t50892911\tC\t10\t0\t2\t57\t7\t0\t2\t48\t10\t0\t2\t60\t6\t0\t2\t45\t6\t0\t2\t45\t13\t0\t2\t66\tN\t244\t0.497\t1\n-Contig45_chr5_76133561_76134403\t388\tA\tG\t103.0\tchr5\t76133941\tG\t3\t0\t2\t36\t8\t0\t2\t51\t8\t0\t2\t51\t5\t0\t2\t42\t6\t0\t2\t45\t7\t0\t2\t48\tY\t57\t0.038\t0\n-Contig111_chr6_5821219_5822519\t1060\tA\tG\t68.1\tchr6\t5822321\tT\t7\t0\t2\t48\t6\t0\t2\t45\t11\t0\t2\t60\t9\t0\t2\t54\t3\t0\t2\t36\t12\t0\t2\t63\tY\t7\t0.231\t1\n-Contig102_chr6_30271329_30271577\t39\tT\tG\t139.0\tchr6\t30271371\tG\t3\t0\t2\
t36\t4\t0\t2\t39\t6\t0\t2\t45\t1\t0\t2\t30\t4\t0\t2\t39\t4\t0\t2\t39\tN\t15\t1.159\t0\n-Contig112_chr6_51024554_51024851\t100\tA\tG\t121.0\tchr6\t51024654\tA\t10\t0\t2\t57\t12\t0\t2\t63\t9\t0\t2\t54\t13\t0\t2\t66\t14\t0\t2\t69\t17\t0\t2\t78\tN\t75\t4.287\t0\n-Contig84_chr7_6648683_6650255\t1297\tG\tA\t110.0\tchr7\t6649988\tG\t18\t0\t2\t81\t9\t0\t2\t54\t22\t0\t2\t77\t16\t0\t2\t75\t20\t0\t2\t87\t6\t0\t2\t45\tY\t83\t0.166\t0\n-Contig206_chr7_26281823_26282074\t103\tC\tA\t101.0\tchr7\t26281925\tT\t11\t0\t2\t60\t16\t0\t2\t61\t19\t0\t2\t84\t6\t0\t2\t45\t19\t0\t2\t84\t16\t0\t2\t75\tN\t-1\t0.947\t1\n-Contig38_chr7_50681997_50682600\t42\tT\tC\t92.4\tchr7\t50682037\tG\t6\t0\t2\t45\t2\t0\t2\t33\t10\t0\t2\t57\t12\t0\t2\t63\t5\t0\t2\t42\t6\t0\t2\t45\tY\t94\t0.146\t0\n-Contig91_chr8_12804505_12805470\t409\tC\tA\t111.0\tchr8\t12804906\tC\t8\t0\t2\t51\t10\t0\t2\t57\t15\t0\t2\t72\t12\t0\t2\t63\t14\t0\t2\t69\t15\t0\t2\t72\tN\t145\t0.175\t0\n-Contig8_chr8_27811135_27812620\t333\tC\tT'..b'0\tchr25\t4011690\tA\t12\t0\t2\t63\t17\t0\t2\t78\t13\t0\t2\t66\t13\t0\t2\t66\t13\t0\t2\t66\t13\t0\t2\t66\tY\t5\t0.087\t0\n-Contig103_chr25_38891221_38892140\t407\tG\tA\t131.0\tchr25\t38891644\tG\t8\t0\t2\t51\t14\t0\t2\t69\t18\t0\t2\t81\t8\t0\t2\t51\t8\t0\t2\t51\t11\t0\t2\t60\tY\t149\t0.167\t4\n-Contig204_chr26_4311195_4311778\t170\tC\tT\t16.9\tchr26\t4311363\tT\t20\t0\t2\t87\t8\t0\t2\t51\t13\t0\t2\t66\t18\t0\t2\t81\t11\t0\t2\t60\t14\t0\t2\t69\tN\t35\t0.085\t0\n-Contig146_chr26_26622638_26623906\t574\tG\tA\t186.0\tchr26\t26623219\tA\t11\t0\t2\t60\t12\t0\t2\t63\t9\t0\t2\t54\t11\t0\t2\t60\t9\t0\t2\t54\t12\t0\t2\t63\tY\t1\t0.318\t0\n-Contig135_chr27_6853874_6854079\t158\tC\tT\t116.0\tchr27\t6854032\tT\t18\t0\t2\t81\t19\t0\t2\t84\t13\t0\t2\t66\t7\t0\t2\t48\t8\t0\t2\t51\t11\t0\t2\t60\tN\t4\t0.060\t1\n-Contig64_chr27_34654435_34654621\t132\tC\tA\t115.0\tchr27\t34654567\tT\t2\t0\t2\t33\t2\t0\t2\t33\t5\t0\t2\t42\t3\t0\t2\t36\t3\t0\t2\t36\t8\t0\t2\t51\tN\t12\t0.297\t1\n-Contig131_chr28_6481806_6483783\t138\tC\t
T\t36.2\tchr28\t6481953\tC\t12\t0\t2\t63\t12\t0\t2\t63\t20\t0\t2\t87\t11\t0\t2\t60\t10\t0\t2\t57\t12\t0\t2\t63\tY\t10\t0.387\t0\n-Contig60_chr28_30197166_30197364\t92\tT\tC\t164.0\tchr28\t30197258\tT\t10\t0\t2\t57\t13\t0\t2\t66\t15\t0\t2\t72\t16\t0\t2\t75\t12\t0\t2\t63\t11\t0\t2\t60\tN\t369\t1.139\t0\n-Contig29_chr29_4726399_4727143\t559\tA\tT\t163.0\tchr29\t4726955\tA\t15\t0\t2\t72\t18\t0\t2\t81\t18\t0\t2\t81\t16\t0\t2\t75\t11\t0\t2\t60\t14\t0\t2\t72\tY\t161\t3.114\t0\n-Contig1_chr30_5992217_5993068\t106\tC\tT\t129.0\tchr30\t5992319\tC\t10\t0\t2\t57\t11\t0\t2\t60\t7\t0\t2\t48\t11\t0\t2\t60\t10\t0\t2\t57\t12\t0\t2\t63\tY\t76\t1.079\t0\n-Contig165_chr30_25804389_25804926\t190\tT\tC\t126.0\tchr30\t25804592\tC\t3\t0\t2\t36\t8\t0\t2\t51\t7\t0\t2\t48\t10\t0\t2\t57\t7\t0\t2\t48\t4\t0\t2\t39\tY\t113\t0.329\t0\n-Contig38_chr31_5164423_5166573\t2074\tC\tT\t134.0\tchr31\t5166501\tT\t13\t0\t2\t66\t10\t0\t2\t57\t17\t0\t2\t78\t11\t0\t2\t60\t17\t0\t2\t78\t10\t0\t2\t57\tY\t58\t+99.\t0\n-Contig17_chr31_26433828_26434459\t498\tT\tC\t9.79\tchr31\t26434322\tT\t18\t0\t2\t81\t10\t0\t2\t57\t15\t0\t2\t72\t13\t0\t2\t66\t16\t0\t2\t75\t15\t0\t2\t72\tY\t137\t4.814\t0\n-Contig9_chr32_19479532_19479735\t12\tA\tG\t20.7\tchr32\t19479544\tA\t1\t0\t2\t30\t2\t0\t2\t33\t1\t0\t2\t30\t5\t0\t2\t42\t3\t0\t2\t36\t3\t0\t2\t36\tN\t17\t+99.\t0\n-Contig30_chr32_25902721_25905783\t208\tC\tG\t162.0\tchr32\t25902927\tG\t11\t0\t2\t60\t13\t0\t2\t66\t11\t0\t2\t60\t12\t0\t2\t63\t7\t0\t2\t48\t11\t0\t2\t60\tY\t145\t0.322\t2\n-Contig18_chr33_22207246_22209159\t1363\tG\tT\t51.5\tchr33\t22208619\t-\t16\t0\t2\t75\t8\t0\t2\t51\t11\t0\t2\t60\t10\t0\t2\t57\t15\t0\t2\t72\t12\t0\t2\t63\tY\t59\t2.560\t0\n-Contig170_chr33_26189421_26189940\t292\tT\tC\t98.4\tchr33\t26189703\tT\t21\t0\t2\t90\t13\t0\t2\t66\t15\t0\t2\t72\t13\t0\t2\t66\t19\t0\t2\t84\t13\t0\t2\t66\tY\t23\t0.307\t0\n-Contig113_chr34_13341080_13341643\t236\tC\tT\t90.7\tchr34\t13341316\tC\t4\t0\t2\t39\t2\t0\t2\t33\t8\t0\t2\t51\t4\t0\t2\t39\t8\t0\t2\t51\t3\t0\t2\t36\tY\t4
7\t0.412\t3\n-Contig152_chr34_31794848_31795540\t242\tG\tA\t93.2\tchr34\t31795093\tG\t11\t0\t2\t60\t24\t0\t2\t99\t17\t0\t2\t78\t15\t0\t2\t72\t18\t0\t2\t81\t17\t0\t2\t78\tY\t123\t2.780\t0\n-Contig47_chr35_3666773_3667898\t348\tG\tT\t124.0\tchr35\t3667121\tG\t9\t0\t2\t54\t20\t0\t2\t87\t18\t0\t2\t81\t15\t0\t2\t72\t12\t0\t2\t63\t14\t0\t2\t69\tY\t285\t0.235\t0\n-Contig74_chr35_25394343_25394813\t303\tA\tT\t221.0\tchr35\t25394646\tG\t23\t0\t2\t96\t15\t0\t2\t72\t25\t0\t2\t105\t7\t7\t1\t49\t18\t0\t2\t81\t16\t0\t2\t75\tY\t58\t4.298\t0\n-Contig5_chr36_4562983_4563634\t343\tC\tT\t151.0\tchr36\t4563324\tT\t20\t0\t2\t87\t20\t0\t2\t87\t23\t0\t2\t96\t24\t0\t2\t99\t9\t0\t2\t54\t8\t0\t2\t51\tY\t40\t1.169\t0\n-Contig133_chr36_32954045_32955409\t136\tA\tG\t116.0\tchr36\t32954182\tA\t16\t0\t2\t75\t15\t0\t2\t72\t20\t0\t2\t87\t11\t0\t2\t60\t18\t0\t2\t81\t13\t0\t2\t66\tY\t74\t3.772\t1\n-Contig53_chr37_6665763_6665919\t116\tC\tT\t111.0\tchr37\t6665875\tC\t9\t0\t2\t54\t9\t0\t2\t54\t5\t0\t2\t42\t9\t0\t2\t54\t8\t0\t2\t51\t10\t0\t2\t57\tN\t15\t10.875\t1\n-Contig2_chr37_31197993_31198256\t182\tC\tT\t39.6\tchr37\t31198171\tT\t6\t0\t2\t45\t10\t0\t2\t57\t7\t0\t2\t48\t9\t0\t2\t54\t10\t0\t2\t57\t12\t0\t2\t63\tN\t2\t0.595\t0\n-Contig7_chr38_12217200_12218387\t1163\tA\tT\t44.4\tchr38\t12218353\tA\t11\t0\t2\t60\t13\t0\t2\t66\t17\t0\t2\t78\t10\t0\t2\t57\t11\t0\t2\t60\t11\t0\t2\t60\tY\t67\t+99.\t0\n-Contig265_chrX_2689247_2689484\t114\tC\tG\t103.0\tchrX\t2689356\tC\t11\t0\t2\t60\t9\t0\t2\t54\t13\t0\t2\t66\t16\t0\t2\t75\t14\t0\t2\t69\t10\t0\t2\t57\tN\t2\t9.232\t1\n-Contig113_chrX_26287829_26288398\t385\tC\tT\t59.6\tchrX\t26288213\tC\t9\t0\t2\t54\t9\t0\t2\t54\t17\t0\t2\t78\t11\t0\t2\t60\t3\t8\t1\t44\t4\t0\t2\t39\tN\t13\t0.077\t0\n-Contig90_chrX_57430715_57431566\t548\tC\tT\t116.0\tchrX\t57431266\tT\t9\t0\t2\t54\t18\t0\t2\t81\t13\t0\t2\t66\t14\t0\t2\t69\t8\t0\t2\t54\t7\t0\t2\t48\tY\t261\t0.154\t1\n-Contig133_chrX_84833782_84834125\t182\tG\tA\t69.7\tchrX\t84833962\tG\t5\t0\t2\t42\t18\t0\t2\t81\t12\t0\t2\t63\
t19\t0\t2\t84\t6\t3\t1\t27\t7\t0\t2\t48\tN\t619\t0.278\t0\n'

diff -r d4ec09e8079f -r 4b6590dd7250 test-data/test_out/specify_restriction_enzymes/specify_restriction_enzymes.gd_snp
--- a/test-data/test_out/specify_restriction_enzymes/specify_restriction_enzymes.gd_snp Wed Sep 12 14:27:40 2012 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000

[

@@ -1,10 +0,0 @@
-#{"column_names":["scaf","pos","A","B","qual","ref","rpos","rnuc","1A","1B","1G","1Q","2A","2B","2G","2Q","3A","3B","3G","3Q","4A","4B","4G","4Q","5A","5B","5G","5Q","6A","6B","6G","6Q","pair","dist",
-#"prim","rflp"],"dbkey":"canFam2","individuals":[["PB1",9],["PB2",13],["PB3",17],["PB4",21],["PB6",25],["PB8",29]],"pos":2,"rPos":7,"ref":6,"scaffold":1,"species":"bear"}
-Contig47_chr2_25470778_25471576 126 G A 888.0 chr2 25470896 G 12 0 2 63 14 0 2 69 14 0 2 69 10 0 2 57 18 0 2 81 13 0 2 66 N 11 0.289 1
-Contig73_chr9_29451535_29452248 616 A G 24.7 chr9 29452127 G 4 0 2 39 7 0 2 48 1 0 2 30 4 0 2 39 7 0 2 48 6 0 2 45 N 49 0.448 4
-Contig69_chr10_40547265_40548153 371 G A 58.1 chr10 40547649 A 9 0 2 54 8 0 2 51 8 0 2 51 9 0 2 54 4 0 2 39 5 0 2 42 Y 20 0.138 4
-Contig99_chr17_26021506_26022200 505 C T 88.8 chr17 26022017 T 15 0 2 72 13 0 2 66 19 0 2 84 9 0 2 54 10 0 2 57 11 0 2 60 Y 1 0.172 1
-Contig27_chr17_61713766_61716585 1056 G C 40.0 chr17 61714821 G 4 0 2 39 8 0 2 51 10 0 2 57 6 0 2 45 6 0 2 45 3 0 2 36 N 6 2.200 4
-Contig26_chr22_57817664_57819633 1453 A G 150.0 chr22 57819121 G 9 0 2 54 9 0 2 54 13 0 2 66 15 0 2 72 11 0 2 60 14 0 2 69 N 15 0.471 1
-Contig103_chr25_38891221_38892140 407 G A 131.0 chr25 38891644 G 8 0 2 51 14 0 2 69 18 0 2 81 8 0 2 51 8 0 2 51 11 0 2 60 Y 149 0.167 4
-Contig64_chr27_34654435_34654621 132 C A 115.0 chr27 34654567 T 2 0 2 33 2 0 2 33 5 0 2 42 3 0 2 36 3 0 2 36 8 0 2 51 N 12 0.297 1

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.heterochromatic.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.heterochromatic.loc.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,2 @@
+# ref_species heterochromatic_file
+#canFam2 /galaxy/local_data/genome_diversity/dpmix/canFam2_heterochrom.txt

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.oscar.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.oscar.loc.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,4 @@
+#<species> <data_file>
+#hg19 /galaxy/local_data/genome_diversity/oscar/hsa_ENSEMBLcKEGGctpthw.tsv
+#bosTau4 /galaxy/local_data/genome_diversity/oscar/bta_ENSEMBLcKEGGctpthw.tsv
+#canFam2 /galaxy/local_data/genome_diversity/oscar/cfa_ENSEMBLcKEGGctpthw.tsv

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.pathways.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.pathways.txt.sample Wed Sep 12 17:10:26 2012 -0400

b'@@ -0,0 +1,721 @@\n+hg19\thsa00010\thsa00010 - Glycolysis/ Gluconeogenesis\n+hg19\thsa00020\thsa00020 - Citratecycle (TCA cycle)\n+hg19\thsa00030\thsa00030 - Pentosephosphate pathway\n+hg19\thsa00040\thsa00040 - Pentoseand glucuronate interconversions\n+hg19\thsa00051\thsa00051 - Fructoseand mannose metabolism\n+hg19\thsa00052\thsa00052 - Galactosemetabolism\n+hg19\thsa00053\thsa00053 - Ascorbateand aldarate metabolism\n+hg19\thsa00061\thsa00061 - Fattyacid biosynthesis\n+hg19\thsa00062\thsa00062 - Fattyacid elongation\n+hg19\thsa00071\thsa00071 - Fattyacid metabolism\n+hg19\thsa00072\thsa00072 - Synthesisand degradation of ketone bodies\n+hg19\thsa00100\thsa00100 - Steroidbiosynthesis\n+hg19\thsa00120\thsa00120 - Primarybile acid biosynthesis\n+hg19\thsa00130\thsa00130 - Ubiquinoneand other terpenoid-quinone biosynthesis\n+hg19\thsa00140\thsa00140 - Steroidhormone biosynthesis\n+hg19\thsa00190\thsa00190 - Oxidativephosphorylation\n+hg19\thsa00230\thsa00230 - Purinemetabolism\n+hg19\thsa00232\thsa00232 - Caffeinemetabolism\n+hg19\thsa00240\thsa00240 - Pyrimidinemetabolism\n+hg19\thsa00250\thsa00250 - Alanine,aspartate and glutamate metabolism\n+hg19\thsa00260\thsa00260 - Glycine,serine and threonine metabolism\n+hg19\thsa00270\thsa00270 - Cysteineand methionine metabolism\n+hg19\thsa00280\thsa00280 - Valine,leucine and isoleucine degradation\n+hg19\thsa00290\thsa00290 - Valine,leucine and isoleucine biosynthesis\n+hg19\thsa00300\thsa00300 - Lysinebiosynthesis\n+hg19\thsa00310\thsa00310 - Lysinedegradation\n+hg19\thsa00330\thsa00330 - Arginineand proline metabolism\n+hg19\thsa00340\thsa00340 - Histidinemetabolism\n+hg19\thsa00350\thsa00350 - Tyrosinemetabolism\n+hg19\thsa00360\thsa00360 - Phenylalaninemetabolism\n+hg19\thsa00380\thsa00380 - Tryptophanmetabolism\n+hg19\thsa00400\thsa00400 - Phenylalanine,tyrosine and tryptophan biosynthesis\n+hg19\thsa00410\thsa00410 - beta-Alaninemetabolism\n+hg19\thsa00430\thsa00430 - Taurineand hypotaurine 
metabolism\n+hg19\thsa00450\thsa00450 - Selenocompoundmetabolism\n+hg19\thsa00460\thsa00460 - Cyanoaminoacid metabolism\n+hg19\thsa00480\thsa00480 - Glutathionemetabolism\n+hg19\thsa00500\thsa00500 - Starchand sucrose metabolism\n+hg19\thsa00510\thsa00510 - N-Glycanbiosynthesis\n+hg19\thsa00511\thsa00511 - Otherglycan degradation\n+hg19\thsa00512\thsa00512 - Mucintype O-Glycan biosynthesis\n+hg19\thsa00514\thsa00514 - Othertypes of O-glycan biosynthesis\n+hg19\thsa00520\thsa00520 - Aminosugar and nucleotide sugar metabolism\n+hg19\thsa00524\thsa00524 - Butirosinand neomycin biosynthesis\n+hg19\thsa00531\thsa00531 - Glycosaminoglycandegradation\n+hg19\thsa00532\thsa00532 - Glycosaminoglycanbiosynthesis - chondroitin sulfate\n+hg19\thsa00533\thsa00533 - Glycosaminoglycanbiosynthesis - keratan sulfate\n+hg19\thsa00534\thsa00534 - Glycosaminoglycanbiosynthesis - heparan sulfate\n+hg19\thsa00561\thsa00561 - Glycerolipidmetabolism\n+hg19\thsa00562\thsa00562 - Inositolphosphate metabolism\n+hg19\thsa00563\thsa00563 - Glycosylphosphatidylinositol(GPI)-anchorbiosynthesis\n+hg19\thsa00564\thsa00564 - Glycerophospholipidmetabolism\n+hg19\thsa00565\thsa00565 - Etherlipid metabolism\n+hg19\thsa00590\thsa00590 - Arachidonicacid metabolism\n+hg19\thsa00591\thsa00591 - Linoleicacid metabolism\n+hg19\thsa00592\thsa00592 - alpha-Linolenicacid metabolism\n+hg19\thsa00600\thsa00600 - Sphingolipidmetabolism\n+hg19\thsa00601\thsa00601 - Glycosphingolipidbiosynthesis - lacto and neolacto series\n+hg19\thsa00603\thsa00603 - Glycosphingolipidbiosynthesis - globo series\n+hg19\thsa00604\thsa00604 - Glycosphingolipidbiosynthesis - ganglio series\n+hg19\thsa00620\thsa00620 - Pyruvatemetabolism\n+hg19\thsa00630\thsa00630 - Glyoxylateand dicarboxylate metabolism\n+hg19\thsa00640\thsa00640 - Propanoatemetabolism\n+hg19\thsa00650\thsa00650 - Butanoatemetabolism\n+hg19\thsa00670\thsa00670 - Onecarbon pool by folate\n+hg19\thsa00730\thsa00730 - Thiaminemetabolism\n+hg19\thsa00740\thsa00740 - 
Riboflavinmetabolism\n+hg19\thsa00750\thsa00750 - VitaminB6 metabolism\n+hg19\thsa00760\thsa00760 - Nicotinateand nicotinamide metabolism\n+hg19\thsa00770\thsa00770 - Pantothenateand CoA biosynthesis\n+hg19\thsa00780\thsa00780 - Biotinmetabolis'..b"+bosTau4\tbta04728\tbta04728 - Dopaminergicsynapse\n+bosTau4\tbta04730\tbta04730 - Long-termdepression\n+bosTau4\tbta04740\tbta04740 - Olfactorytransduction\n+bosTau4\tbta04742\tbta04742 - Tastetransduction\n+bosTau4\tbta04744\tbta04744 - Phototransduction\n+bosTau4\tbta04810\tbta04810 - Regulationof actin cytoskeleton\n+bosTau4\tbta04910\tbta04910 - Insulinsignaling pathway\n+bosTau4\tbta04912\tbta04912 - GnRHsignaling pathway\n+bosTau4\tbta04914\tbta04914 - Progesterone-mediatedoocyte maturation\n+bosTau4\tbta04916\tbta04916 - Melanogenesis\n+bosTau4\tbta04920\tbta04920 - Adipocytokinesignaling pathway\n+bosTau4\tbta04930\tbta04930 - TypeII diabetes mellitus\n+bosTau4\tbta04940\tbta04940 - TypeI diabetes mellitus\n+bosTau4\tbta04950\tbta04950 - Maturityonset diabetes of the young\n+bosTau4\tbta04960\tbta04960 - Aldosterone-regulatedsodium reabsorption\n+bosTau4\tbta04961\tbta04961 - Endocrineand other factor-regulated calcium reabsorption\n+bosTau4\tbta04962\tbta04962 - Vasopressin-regulatedwater reabsorption\n+bosTau4\tbta04964\tbta04964 - Proximaltubule bicarbonate reclamation\n+bosTau4\tbta04966\tbta04966 - Collectingduct acid secretion\n+bosTau4\tbta04970\tbta04970 - Salivarysecretion\n+bosTau4\tbta04971\tbta04971 - Gastricacid secretion\n+bosTau4\tbta04972\tbta04972 - Pancreaticsecretion\n+bosTau4\tbta04973\tbta04973 - Carbohydratedigestion and absorption\n+bosTau4\tbta04974\tbta04974 - Proteindigestion and absorption\n+bosTau4\tbta04975\tbta04975 - Fatdigestion and absorption\n+bosTau4\tbta04976\tbta04976 - Bilesecretion\n+bosTau4\tbta04977\tbta04977 - Vitamindigestion and absorption\n+bosTau4\tbta04978\tbta04978 - Mineralabsorption\n+bosTau4\tbta05010\tbta05010 - Alzheimer'sdisease\n+bosTau4\tbta05012\tbta05012 
- Parkinson'sdisease\n+bosTau4\tbta05014\tbta05014 - Amyotrophiclateral sclerosis (ALS)\n+bosTau4\tbta05016\tbta05016 - Huntington'sdisease\n+bosTau4\tbta05020\tbta05020 - Priondiseases\n+bosTau4\tbta05100\tbta05100 - Bacterialinvasion of epithelial cells\n+bosTau4\tbta05132\tbta05132 - Salmonellainfection\n+bosTau4\tbta05133\tbta05133 - Pertussis\n+bosTau4\tbta05134\tbta05134 - Legionellosis\n+bosTau4\tbta05140\tbta05140 - Leishmaniasis\n+bosTau4\tbta05142\tbta05142 - Chagasdisease (American trypanosomiasis)\n+bosTau4\tbta05143\tbta05143 - Africantrypanosomiasis\n+bosTau4\tbta05144\tbta05144 - Malaria\n+bosTau4\tbta05145\tbta05145 - Toxoplasmosis\n+bosTau4\tbta05146\tbta05146 - Amoebiasis\n+bosTau4\tbta05150\tbta05150 - Staphylococcusaureus infection\n+bosTau4\tbta05152\tbta05152 - Tuberculosis\n+bosTau4\tbta05160\tbta05160 - HepatitisC\n+bosTau4\tbta05162\tbta05162 - Measles\n+bosTau4\tbta05164\tbta05164 - InfluenzaA\n+bosTau4\tbta05166\tbta05166 - HTLV-Iinfection\n+bosTau4\tbta05168\tbta05168 - Herpessimplex infection\n+bosTau4\tbta05200\tbta05200 - Pathwaysin cancer\n+bosTau4\tbta05202\tbta05202 - Transcriptionalmisregulation in cancers\n+bosTau4\tbta05210\tbta05210 - Colorectalcancer\n+bosTau4\tbta05211\tbta05211 - Renalcell carcinoma\n+bosTau4\tbta05212\tbta05212 - Pancreaticcancer\n+bosTau4\tbta05213\tbta05213 - Endometrialcancer\n+bosTau4\tbta05214\tbta05214 - Glioma\n+bosTau4\tbta05215\tbta05215 - Prostatecancer\n+bosTau4\tbta05216\tbta05216 - Thyroidcancer\n+bosTau4\tbta05217\tbta05217 - Basalcell carcinoma\n+bosTau4\tbta05218\tbta05218 - Melanoma\n+bosTau4\tbta05219\tbta05219 - Bladdercancer\n+bosTau4\tbta05220\tbta05220 - Chronicmyeloid leukemia\n+bosTau4\tbta05221\tbta05221 - Acutemyeloid leukemia\n+bosTau4\tbta05222\tbta05222 - Smallcell lung cancer\n+bosTau4\tbta05223\tbta05223 - Non-smallcell lung cancer\n+bosTau4\tbta05310\tbta05310 - Asthma\n+bosTau4\tbta05320\tbta05320 - Autoimmunethyroid disease\n+bosTau4\tbta05322\tbta05322 - Systemiclupus 
erythematosus\n+bosTau4\tbta05323\tbta05323 - Rheumatoidarthritis\n+bosTau4\tbta05330\tbta05330 - Allograftrejection\n+bosTau4\tbta05332\tbta05332 - Graft-versus-hostdisease\n+bosTau4\tbta05340\tbta05340 - Primaryimmunodeficiency\n+bosTau4\tbta05410\tbta05410 - Hypertrophiccardiomyopathy (HCM)\n+bosTau4\tbta05412\tbta05412 - Arrhythmogenicright ventricular cardiomyopathy (ARVC)\n+bosTau4\tbta05414\tbta05414 - Dilatedcardiomyopathy\n+bosTau4\tbta05416\tbta05416 - Viralmyocarditis\n"

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.primers.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.primers.loc.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,6 @@
+#<species> <primers_file_path>
+#aye-aye /galaxy/local_data/genome_diversity/primers/aye-aye_Galaxy_primers.txt
+#bear /galaxy/local_data/genome_diversity/primers/bear_Galaxy_primers.txt
+#bighorn /galaxy/local_data/genome_diversity/primers/bighorn_Galaxy_primers.txt
+#tasmanian_devil /galaxy/local_data/genome_diversity/primers/devil_Galaxy_primers.txt
+#tick /galaxy/local_data/genome_diversity/primers/tick_Galaxy_primers.txt

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.rank.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.rank.loc.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,4 @@
+#<species> <prefix> <kxml_dir_path> <path_to_dict_file>
+#hg19 hsa /galaxy/local_data/genome_diversity/rank/KXML_hsa.d /galaxy/local_data/genome_diversity/rank/hsa_dict.txt
+#canFam2 cfa /galaxy/local_data/genome_diversity/rank/KXML_cfa.d /galaxy/local_data/genome_diversity/rank/cfa_dict.txt
+#bosTau4 bta /galaxy/local_data/genome_diversity/rank/KXML_bta.d /galaxy/local_data/genome_diversity/rank/bta_dict.txt

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.ref_species.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.ref_species.txt.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,4 @@
+# genome diversity species
+cow cow
+hg19 hg19
+dog dog

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.restriction_enzymes.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.restriction_enzymes.txt.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,99 @@
+Acc65I - GGTACC Acc65I
+AccB7I - CCANNNNNTGG AccB7I
+AccI - GT(A/C)(G/T)AC AccI
+AccIII - TCCGGA AccIII
+AcyI - G(A/G)CG(C/T)C AcyI
+AgeI - ACCGGT AgeI
+AluI - AGCT AluI
+Alw44I - GTGCAC Alw44I
+ApaI - GGGCCC ApaI
+AvaI - C(C/T)CG(A/G)G AvaI
+AvaII - GG(A/T)CC AvaII
+BalI - TGGCCA BalI
+BamHI - GGATCC BamHI
+BanI - GG(C/T)(A/G)CC BanI
+BanII - G(A/G)GC(C/T)C BanII
+BbuI - GCATGC BbuI
+BclI - TGATCA BclI
+BglI - GCCNNNNNGGC BglI
+BglII - AGATCT BglII
+BsaMI - GAATGC BsaMI
+BsaOI - CG(A/G)(C/T)CG BsaOI
+Bsp1286I - G(A/G/T)GC(A/C/T)C Bsp1286I
+BsrBRI - GATNNNNATC BsrBRI
+BsrSI - ACTGG BsrSI
+BssHII - GCGCGC BssHII
+Bst98I - CTTAAG Bst98I
+BstEII - GGTNACC BstEII
+BstOI - CC(A/T)GG BstOI
+BstXI - CCANNNNNNTGG BstXI
+BstZI - CGGCCG BstZI
+Bsu36I - CCTNAGG Bsu36I
+CfoI - GCGC CfoI
+ClaI - ATCGAT ClaI
+Csp45I - TTCGAA Csp45I
+CspI - CGG(A/T)CCG CspI
+DdeI - CTNAG DdeI
+DpnI - GATC DpnI
+DraI - TTTAAA DraI
+EclHKI - GACNNNNNGTC EclHKI
+Eco47III - AGCGCT Eco47III
+Eco52I - CGGCCG Eco52I
+Eco72I - CACGTG Eco72I
+EcoRI - GAATTC EcoRI
+EcoRV - GATATC EcoRV
+HaeII - (A/G)GCGC(C/T) HaeII
+HaeIII - GGCC HaeIII
+HhaI - GCGC HhaI
+HincII - GT(C/T)(A/G)AC HincII
+HindIII - AAGCTT HindIII
+HinfI - GANTC HinfI
+HpaI - GTTAAC HpaI
+HpaII - CCGG HpaII
+Hsp92I - G(A/G)CG(C/T)C Hsp92I
+Hsp92II - CATG Hsp92II
+I-PpoI - TAACTATGACTCTCTTAAGGTAGCCAAAT I-PpoI
+KpnI - GGTACC KpnI
+MboI - GATC MboI
+MluI - ACGCGT MluI
+MspA1I - C(A/C)GC(G/T)G MspA1I
+MspI - CCGG MspI
+NaeI - GCCGGC NaeI
+NarI - GGCGCC NarI
+NciI - CC(C/G)GG NciI
+NcoI - CCATGG NcoI
+NdeI - CATATG NdeI
+NgoMIV - GCCGGC NgoMIV
+NheI - GCTAGC NheI
+NotI - GCGGCCGC NotI
+NruI - TCGCGA NruI
+NsiI - ATGCAT NsiI
+PstI - CTGCAG PstI
+PvuI - CGATCG PvuI
+PvuII - CAGCTG PvuII
+RsaI - GTAC RsaI
+SacI - GAGCTC SacI
+SacII - CCGCGG SacII
+SalI - GTCGAC SalI
+Sau3AI - GATC Sau3AI
+Sau96I - GGNCC Sau96I
+ScaI - AGTACT ScaI
+SfiI - GGCCNNNNNGGCC SfiI
+SgfI - GCGATCGC SgfI
+SinI - GG(A/T)CC SinI
+SmaI - CCCGGG SmaI
+SnaBI - TACGTA SnaBI
+SpeI - ACTAGT SpeI
+SphI - GCATGC SphI
+SspI - AATATT SspI
+StuI - AGGCCT StuI
+StyI - CC(A/T)(A/T)GG StyI
+TaqI - TCGA TaqI
+Tru9I - TTAA Tru9I
+Tth111I - GACNNNGTC Tth111I
+VspI - ATTAAT VspI
+XbaI - TCTAGA XbaI
+XhoI - CTCGAG XhoI
+XhoII - (A/G)GATC(C/T) XhoII
+XmaI - CCCGGG XmaI
+XmnI - GAANNNNTTC XmnI

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.snps.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.snps.loc.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,3 @@
+#<species> <SNP_call_file_path>
+#bighorn /galaxy/local_data/genome_diversity/snps/bighorn_snps.txt
+#tasmanian_devil /galaxy/local_data/genome_diversity/snps/devil_snps.txt

diff -r d4ec09e8079f -r 4b6590dd7250 tool-data/gd.species.txt.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/gd.species.txt.sample Wed Sep 12 17:10:26 2012 -0400

@@ -0,0 +1,6 @@
+# genome diversity species
+aye-aye aye-aye
+bear bear
+bighorn bighorn
+tasmanian_devil Tasmanian devil
+tick tick