Mercurial > repos > crs4 > seal_galaxy

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/README.md	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,73 @@
+
+Galaxy wrapper for the Seal toolkit
+====================================
+
+These are the Galaxy wrappers for the Seal toolkit for Hadoop-based processing
+of sequencing data (http://biodoop-seal.sf.net).
+
+
+Installation
+-------------------
+
+You can install the Seal-Galaxy wrappers through the Galaxy toolshed or like
+any other Galaxy tool.  The installation process will try to fetch and build
+Seal and some of its dependencies.  However, you'll need to make sure that
+the build process can find any required headers, libraries and executables,
+such as:
+
+* javac
+* protobuf
+* maven
+* ant
+* zlib
+* git
+* hadoop
+
+For details on Seal's installation process refer directly to [its
+documentation](http://biodoop-seal.sourceforge.net/installation.html).
+
+Hadoop-Galaxy integration
+----------------------------
+
+These wrappers use the [Hadoop-Galaxy](https://github.com/crs4/hadoop-galaxy)
+tool to implement the integration between Hadoop and Galaxy.  You should have a
+look at its documentation.
+
+An important issue
+-----------------------
+
+An implication of the integration provided by Hadoop-Galaxy is that Galaxy
+knows nothing about your actual data. Because of this, removing the Galaxy
+datasets does not delete the files produced by your Hadoop runs, potentially
+resulting in the waste of a lot of space.  Also, be careful with situations
+where you may end up with multiple pathsets pointing to the same data, or where
+they point to data that you want to access from Hadoop but would not want to
+delete (e.g., your run directories).
+
+Have a look at the Hadoop-Galaxy README for more details.
+
+
+Authors
+-------------
+
+Luca Pireddu <pireddu@crs4.it>
+
+
+Support
+-------------
+
+No support is provided.
+
+
+
+License
+--------------
+
+This code is released under the GPLv3.
+
+
+
+Copyright
+--------------
+
+Copyright CRS4, 2011-2014.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/make_release.sh	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,93 @@
+#!/bin/bash
+
+#set -x
+set -o errexit
+set -o nounset
+set -o pipefail
+
+PackageName="seal-galaxy"
+
+
+# Print an optional error message to stderr and abort the script.
+# Arguments: $* - words of the message (optional; nothing is printed if absent)
+# Exits:     always, with status 1
+function error() {
+    if [ $# -ge 1 ]; then
+        # Diagnostics go to stderr so they are not mixed with regular output.
+        printf '%s\n' "$*" >&2
+    fi
+    exit 1
+}
+
+function usage_error() {
+    echo "Usage: $0 version"
+    echo "Specify version as a git revid (id or tag) for the Seal repository, and " >&2
+    echo "optionally a '-n' suffix for the wrapper version; e.g., 0.4.1, 0.4.1-1, 0.4.1-2" >&2
+    error
+}
+
+# Ask the user a yes/no question on the terminal.
+# Arguments: $1 - question text; " [Y/n]" is appended when it is displayed
+# Returns:   0 when the answer is empty, 'y' or 'Y'
+# Exits:     with status 0 on 'n'/'N'; through usage_error on any other answer
+function confirm() {
+    local prompt="${1}"
+    # Keep the answer local so it does not leak into the script's globals.
+    local yn
+    echo "${prompt} [Y/n]"
+    # -r: do not treat backslashes in the answer as escape characters
+    read -r -p "Answer: " yn
+    case "${yn}" in
+        ''|[Yy]) # do nothing and keep going
+            ;;
+        [Nn]) echo "Aborting"; exit 0
+            ;;
+        *) usage_error "Unrecognized answer. Please specify Y or n"
+            ;;
+    esac
+    return 0
+}
+
+# Rewrite the Seal version recorded in tool_dependencies.xml and in the tool
+# definitions under seal/*.xml (paths are relative to the current directory).
+# Arguments: $1 - Seal version to write; falls back to the global
+#                 seal_version when omitted, for backward compatibility
+# Outputs:   progress messages on stderr
+# Exits:     through error() when the expected package line is not found
+function rewrite_seal_version() {
+  # Use the argument the caller passes rather than silently relying on a global.
+  local version="${1:-${seal_version}}"
+  # Targets of 'printf -v' below; declared local so they do not leak.
+  local sed_expr1 sed_expr2 sed_expr3 sed_expr4
+  local grep_expr='<package name="seal" version=".*">'
+  if ! grep  "${grep_expr}" tool_dependencies.xml >/dev/null ; then
+    error "Couldn't find expected package line in tool_dependencies.xml"
+  fi
+
+  # sed_expr1 updates the package version attribute; sed_expr2 updates the
+  # revision given to 'git reset --hard' in the shell_command action.
+  printf -v sed_expr1  '/<package name="seal"/s/version="[^"]*"/version="%s"/' "${version}"
+  printf -v sed_expr2  '/<action type="shell_command">/s/git reset --hard \([^<]\+\)\s*/git reset --hard %s/' "${version}"
+  sed -i -e "${sed_expr1}" -e "${sed_expr2}" tool_dependencies.xml
+  echo "Edited tool_dependencies.xml" >&2
+
+  # edit the tools as well: the seal <requirement> version and the <tool> version
+  printf -v sed_expr3 '/<requirement type="package" version=.*>\s*seal\s*</s/version="[^"]\+"/version="%s"/' "${version}"
+  printf -v sed_expr4 '/<tool id=/s/version="[^"]\+"/version="%s"/' "${version}"
+  sed -i -e "${sed_expr3}" -e "${sed_expr4}" seal/*.xml
+
+  echo "Edited tool definitions" >&2
+}
+
+############# main ###############
+
+# Release procedure: rewrite the versions in the wrapper files, commit, tag
+# the commit with the wrapper version and build a tar.gz archive of HEAD.
+if [ $# -eq 1 ]; then
+    wrapper_version="${1}"
+else
+    usage_error
+fi
+
+echo "Will rewrite tool_dependencies.xml setting the package version to '${wrapper_version}'."
+# confirm() appends " [Y/n]" itself, so it is not repeated here.
+confirm "Are you sure you want to proceed?"
+
+# ensure the tag doesn't already exist
+# An exact ref lookup is used: a word or regex match on the tag list would
+# report '0.4.1' as existing when only '0.4.1-1' has been tagged.
+if git show-ref --verify --quiet "refs/tags/${wrapper_version}" ; then
+    error "A release tag called '${wrapper_version}' already exists"
+fi
+
+# remove the wrapper suffix, if it's there
+seal_version=$(printf '%s\n' "${wrapper_version}" | sed -e 's/-[^-]\+$//')
+echo "Using seal version ${seal_version}"
+
+rewrite_seal_version "${seal_version}"
+
+# --allow-empty: still create the release commit if the rewrite changed nothing
+git commit -a --allow-empty -m "Wrappers release for Seal '${seal_version}'"
+git tag "${wrapper_version}"
+
+revid=$(git rev-parse HEAD)
+
+echo "Tagged new commit ${revid} with tag '${wrapper_version}'"
+
+# The archive and its top-level directory are named after the first 8
+# characters of the commit id.
+short_revid=${revid::8}
+archive_name="${PackageName}-${short_revid}.tar.gz"
+
+git archive --format tar.gz --prefix "${PackageName}-${short_revid}/" HEAD -o "${archive_name}"
+
+echo "Don't forget to upload the archive to the toolshed!"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/bcl2qseq.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,112 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="dist_bcl2qseq" name="Dist Bcl2Qseq" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Convert Illumina bcl files to qseq on Hadoop</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+
+  <!--
+    Command template: runs "seal bcl2qseq" through hadoop_galaxy, reading the
+    input pathset and writing the output pathset.  The optional flags are
+    emitted only when the "advanced" conditional is set to "show", and each
+    one only when its parameter evaluates as true / non-empty.
+  -->
+  <command>
+    hadoop_galaxy
+    --executable seal
+    --input $input_data
+    --output $output1
+    bcl2qseq
+    #if $advanced.control == 'show'
+      #if $advanced.bcl2qseq_bin:
+      --bclToQseq-path $advanced.bcl2qseq_bin
+      #end if
+
+      #if $advanced.additional_ld_path
+      --append-ld-library-path $advanced.additional_ld_path
+      #end if
+
+      #if $advanced.ignore_missing_bcl
+      --ignore-missing-bcl
+      #end if
+
+      #if $advanced.ignore_missing_control
+      --ignore-missing-control
+      #end if
+
+      #if $advanced.exclude_controls
+      --exclude-controls
+      #end if
+
+      #if $advanced.no_eamss
+      --no-eamss
+      #end if
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Source data set"/>
+    <conditional name="advanced">
+      <param name="control" type="select" label="Advanced controls" default="hide">
+        <option value="hide">Hide</option>
+        <option value="show">Show</option>
+      </param>
+      <when value="show">
+        <param name="ignore_missing_bcl"
+          type="boolean" default="false"
+          label="Interpret missing *.bcl files as a base calling of '.'"
+          />
+        <param name="ignore_missing_control"
+          type="boolean" default="false"
+          label="Don't throw an error when *.control files are missing"
+          />
+        <param name="exclude_controls"
+          type="boolean" default="false"
+          label="Do not include clusters that are used as controls"
+          />
+        <param name="no_eamss"
+          type="boolean" default="false"
+          label="Do not apply the EAMSS masking on the quality values"
+          />
+        <param name="bcl2qseq_bin"
+          type="text"
+          default=""
+          size="80"
+          label="Full path to bclToQseq binary (needed only if the executable isn't in the PATH)"
+          />
+        <param name="additional_ld_path"
+          type="text" default="" size="80"
+          label="paths to append to the value of LD_LIBRARY_PATH"
+          />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="pathset" label="Qseq" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+    This is a Pydoop-based distributed version of Illumina's bclToQseq tool.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/demux.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,118 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_demux" name="Demux" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4" force_history_refresh="True">
+  <description>Demultiplex Illumina runs on Hadoop</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+
+  <!--
+    Command template for demux_galaxy.py.  The arguments are positional and
+    their order must match the sys.argv parsing in that script:
+    INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID
+    SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT
+    SEPARATE_READS.
+    The literal "null" is passed when no reducer count is given.
+  -->
+  <command interpreter="python">
+    demux_galaxy.py
+    $input_data
+    $mismatches
+    $__new_file_path__
+    #if $num_reducers
+      $num_reducers
+    #else
+      null
+    #end if
+    $output1
+    $output1.id
+    $sample_sheet
+    $input_format
+    $output_format
+    $output_compression
+    #if $index.specify_index == 'present'
+      true
+    #else if $index.specify_index == 'not_present'
+      false
+    #else if $index.specify_index == 'dynamic'
+      ## index_present is declared inside the "index" conditional, so it has
+      ## to be referenced through it.
+      $index.index_present
+    #else
+      #raise ValueError('Invalid index value!')
+    #end if
+    $separate_reads
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Source data set"/>
+    <param name="sample_sheet" type="data" format="csv" label="Sample sheet" />
+
+    <conditional name="index">
+      <param name="specify_index" type="select" label="Index read" default="present">
+        <option value="present">Present</option>
+        <option value="not_present">Not present</option>
+        <option value="dynamic">Determine at runtime</option>
+      </param>
+      <when value="dynamic">
+        <param name="index_present" type="data" />
+      </when>
+    </conditional>
+
+    <param name="mismatches"
+      label="Barcode base mismatch limit"
+      type="integer"
+      value="0"
+      min="0"
+      max="3" />
+    <param name="num_reducers"
+      label="Number of reduce tasks"
+      type="integer"
+      value="90"
+      min="1"
+      optional="true"
+      />
+    <param name="input_format" type="select" label="Input data format" default="qseq">
+      <option value="qseq">Qseq</option>
+      <option value="fastq">Fastq</option>
+    </param>
+    <param name="output_format" type="select" label="Output data format" default="qseq">
+      <option value="qseq">Qseq</option>
+      <option value="fastq">Fastq</option>
+    </param>
+    <param name="output_compression" type="select" label="Output compression" default="none">
+      <option value="none">None</option>
+      <option value="gzip">Gzip</option>
+      <option value="bzip2">Bzip2</option>
+    </param>
+
+    <param name="separate_reads" type="boolean"
+      label="Separate reads by read number"
+      default="false"
+      truevalue="separate-reads"
+      />
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="pathset" label="Demuxed" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+    Demux is a Hadoop utility to demultiplex data from multiplexed Illumina runs.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/demux_galaxy.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+"""
+Calls the Seal Demux tool.  Then, it calls the custom galaxy integration script
+split_demux_output.py to generate one Galaxy dataset per each sample extracted
+by Demux.
+"""
+
+# parameters:
+#    INPUT_DATA
+#    MISMATCHES
+#    NEW_FILE_PATH
+#    NUM_REDUCERS
+#    OUTPUT1
+#    OUTPUT_ID
+#    SAMPLE_SHEET
+#    INPUT_FORMAT
+#    OUTPUT_FORMAT
+#    OUTPUT_COMPRESSION
+#    SEPARATE_READS
+
+import os
+import re
+import subprocess
+import sys
+
+# XXX: add --append-python-path to the possible arguments?
+
+def parse_indexed(s):
+  if s is not None:
+    normalized = s.lower().strip()
+    if normalized == 'notindexed':
+      return False
+    elif normalized == 'indexed':
+      return True
+  return None # failed to parse
+
+def parse_index_present(param):
+  is_indexed = parse_indexed(param)
+  if is_indexed is None:
+    # try to read it as a file
+    if os.path.isfile(param):
+      with open(param) as f:
+        contents = f.readline(10000)
+        uri, value = contents.split("\t", 1)
+        is_indexed = parse_indexed(value)
+        if is_indexed is None:
+          raise RuntimeError("Error determining whether run has an index read. " + \
+              "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents)
+  return is_indexed
+
+def usage_error(msg=None):
+  print >> sys.stderr, "Usage error"
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, "Usage:", os.path.basename(sys.argv[0]),\
+    "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"
+  sys.exit(1)
+
+
+if __name__ == "__main__":
+  if len(sys.argv) != 13:
+    usage_error()
+
+  input_data         = sys.argv[1]
+  mismatches         = sys.argv[2]
+  new_file_path      = sys.argv[3]
+  num_reducers       = sys.argv[4]
+  output1            = sys.argv[5]
+  output_id          = sys.argv[6]
+  sample_sheet       = sys.argv[7]
+  input_format       = sys.argv[8]
+  output_format      = sys.argv[9]
+  output_compression = sys.argv[10]
+  index_present      = sys.argv[11]
+  separate_reads     = sys.argv[12]
+
+  mydir = os.path.abspath(os.path.dirname(__file__))
+
+  # Run the demux program
+  cmd = [
+      'hadoop_galaxy',
+      '--input', input_data,
+      '--input-format', input_format, # --input-format for hadoop-galaxy
+      '--output', output1,
+      '--executable', 'seal',
+      'demux',
+      '--sample-sheet', sample_sheet,
+      '--input-format', input_format, # --input-format for seal demux
+      '--output-format', output_format
+    ]
+  if re.match(r'\s*\d+\s*', num_reducers):
+    cmd.extend( ('--num-reducers', num_reducers) )
+
+  if output_compression.lower() != 'none':
+    cmd.extend( ('--compress-output', output_compression) )
+
+  if mismatches != '0':
+    cmd.extend( ('--mismatches', mismatches) )
+
+  is_indexed = parse_index_present(index_present)
+  if is_indexed is False:
+    cmd.append("--no-index")
+
+  norm_separate_reads = separate_reads.lower().strip()
+  if norm_separate_reads == 'separate-reads':
+    cmd.append("--separate-reads")
+  elif norm_separate_reads.startswith('f'):
+    pass
+  else:
+    raise RuntimeError("Unrecognized value for separate-reads parameter:  '%s'" % separate_reads)
+
+  print >> sys.stderr, ' '.join(cmd)
+  subprocess.check_call(cmd)
+
+  ###
+  # now the second phase: split_demux_output.py
+  cmd = [
+      os.path.join(mydir, 'split_demux_output.py'),
+      output_id, output1, new_file_path ]
+  print >> sys.stderr, ' '.join(cmd)
+  subprocess.check_call(cmd)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/generate_sam_header.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+# A really really thin wrapper.  We only seem to need it because Galaxy won't
+# search for the command in the PATH
+
+import os
+import subprocess
+import sys
+
+if __name__ == '__main__':
+  output_path = sys.argv[-1]
+  try:
+    # seal merge_alignments won't overwrite an existing file, so we first remove
+    # the file Galaxy creates for us.
+    os.remove(output_path)
+  except IOError:
+    pass
+  hadoopized_output_path = 'file://' + os.path.abspath(output_path)
+  cmd = [ 'seal', 'merge_alignments' ] + sys.argv[1:-1]
+  cmd.append(hadoopized_output_path)
+  print "running command:", str(cmd)
+  subprocess.check_call(cmd)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/generate_sam_header.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,100 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="generate_sam_header" name="Generate SAM header" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Generate a SAM header for the given reference</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+
+  <!--
+    Command template for generate_sam_header.py.  ref_path is the path of
+    the selected reference, prefixed with "file://" when it is an absolute
+    path; the annotations file is ref_path + ".ann".  The read group options
+    are emitted only when "Set a Read Group line" is enabled.  The output
+    dataset path is the last argument.
+  -->
+  <command interpreter="python">
+    #set $ref_path = 'file://' + $reference.fields.path if $reference.fields.path.startswith('/') else $reference.fields.path
+    generate_sam_header.py
+    --header-only
+    --annotations ${ref_path}.ann
+    --sort-order $sort_order
+
+    #if $compute_md5:
+    --md5
+    #end if
+
+    #if $assembly:
+    --sq-assembly "$assembly"
+    #end if
+
+    #if $rg.set_rg == 'true':
+      --rg_cn "$rg.rg_cn"
+      --rg_dt "$rg.rg_dt"
+      --rg_id "$rg.rg_id"
+      --rg_lb "$rg.rg_lb"
+      --rg_pl "$rg.rg_pl"
+      --rg_pu "$rg.rg_pu"
+      --rg_sm "$rg.rg_sm"
+    #end if
+
+    ${output}
+  </command>
+
+  <inputs>
+    <param name="reference" type="select" label="Reference (should be the same one used for alignment)">
+      <options from_data_table="bwa_0510_indexes" />
+    </param>
+
+    <param name="sort_order" type="select" default="coordinate">
+      <option value="coordinate">Coordinate</option>
+      <option value="read_id">Read ID</option>
+      <option value="unsorted">Unsorted</option>
+    </param>
+
+    <param name="compute_md5" type="boolean" checked="false" label="Whether to compute the MD5 checksums of the reference contigs" />
+    <param name="assembly" type="text" label="Genome assembly identifier (@SQ AS:XXX tag)" />
+
+    <conditional name="rg">
+      <param name="set_rg" type="boolean" checked="false" label="Set a Read Group line" truevalue="true" falsevalue="false" />
+
+      <when value="true">
+        <param name="rg_cn" type="text" label="Read group center" />
+        <param name="rg_dt" type="text" label="Read group date" />
+        <param name="rg_id" type="text" label="Read group id" />
+        <param name="rg_lb" type="text" label="Read group library" />
+        <param name="rg_pl" type="text" label="Read group platform" />
+        <param name="rg_pu" type="text" label="Read group platform unit" />
+        <param name="rg_sm" type="text" label="Read group sample" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="sam" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Generates a SAM header for the selected reference using Seal's merge_alignments tool.
+For the full help see the `Seal manual &lt;http://biodoop-seal.sourceforge.net/&gt;`_.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/merge_alignments.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import os
+import subprocess
+import sys
+import tempfile
+
+import hadoop_galaxy.pathset as pathset
+import hadoop_galaxy.cat_paths as cat_paths
+
+def usage_error(msg=None):
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, os.path.basename(__file__), "INPUT_PATHSET OUTPUT [args...]"
+  sys.exit(1)
+
+def main(args):
+  """
+  Concatenate a pathset of alignment part-files into one output file that
+  starts with a header.
+
+  args: [INPUT_PATHSET, OUTPUT, extra args...].  The first two are made
+  absolute; the remaining ones are passed to
+  "seal merge_alignments --header-only".
+
+  Calls usage_error (which exits) when fewer than two arguments are given.
+  """
+  if len(args) < 2:
+    usage_error()
+
+  # We generate the header with seal_merge_alignments, insert it at the
+  # top of a copy of the input pathset, and then use cat_paths to
+  # join everything into a single file.
+
+  input_pathset, output_path = map(os.path.abspath, args[0:2])
+
+  # The header lives in a temporary file that is removed when the block ends.
+  with tempfile.NamedTemporaryFile() as header_file:
+    print "generating header"
+    gen_header_cmd = [ 'seal', 'merge_alignments', '--header-only' ]
+    gen_header_cmd.extend(args[2:])
+    # the header text is whatever the command writes to stdout
+    header_text = subprocess.check_output(gen_header_cmd)
+
+    header_file.write(header_text)
+    # flush so the content is on disk before the file is read back by name
+    header_file.flush()
+    print "header ready"
+    print "generating new pathset"
+
+    # New pathset: the header file first, then every path of the input pathset.
+    original_pathset = pathset.FilePathset.from_file(input_pathset)
+    new_pathset = pathset.FilePathset()
+    new_pathset.append(header_file.name)
+    for p in original_pathset:
+      new_pathset.append(p)
+
+    with tempfile.NamedTemporaryFile() as temp_pathset:
+      new_pathset.write(temp_pathset)
+      temp_pathset.flush()
+
+      print "concatenating pathset"
+      # TODO:  Add ability to use dist_cat_paths
+      cat_paths.main([temp_pathset.name, output_path])
+      print "operation complete"
+
+# Run main with the command line arguments when executed as a script.
+if __name__ == '__main__':
+  main(sys.argv[1:])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/merge_alignments.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,100 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_merge_alignments" name="Merge Alignments" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Merge a pathset of part-files of alignments into a single well-formatted SAM file</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+  <!--
+    Command template for merge_alignments.py.  The input pathset and the
+    output dataset are the first two arguments; the remaining options are
+    the header-related ones.  ref_path is the path of the selected
+    reference, prefixed with "file://" when it is an absolute path.  The
+    read group options are emitted only when "Set a Read Group line" is
+    enabled.
+  -->
+  <command interpreter="python">
+    #set $ref_path = 'file://' + $reference.fields.path if $reference.fields.path.startswith('/') else $reference.fields.path
+    merge_alignments.py
+    $input_data
+    $output
+
+    --annotations ${ref_path}.ann
+    --sort-order $sort_order
+
+    #if $compute_md5:
+    --md5
+    #end if
+
+    #if $assembly:
+    --sq-assembly "$assembly"
+    #end if
+
+    #if $rg.set_rg == 'true':
+      --rg_cn "$rg.rg_cn"
+      --rg_dt "$rg.rg_dt"
+      --rg_id "$rg.rg_id"
+      --rg_lb "$rg.rg_lb"
+      --rg_pl "$rg.rg_pl"
+      --rg_pu "$rg.rg_pu"
+      --rg_sm "$rg.rg_sm"
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Input data" />
+
+    <param name="reference" type="select" label="Reference (should be the same one used for alignment)">
+      <options from_data_table="bwa_0510_indexes" />
+    </param>
+
+    <param name="sort_order" type="select" default="coordinate">
+      <option value="coordinate">Coordinate</option>
+      <option value="read_id">Read ID</option>
+      <option value="unsorted">Unsorted</option>
+    </param>
+    <param name="compute_md5" type="boolean" checked="false" label="Whether to compute the MD5 checksums of the reference contigs" />
+    <param name="assembly" type="text" label="Genome assembly identifier (@SQ AS:XXX tag)" />
+
+    <conditional name="rg">
+      <param name="set_rg" type="boolean" checked="false" label="Set a Read Group line" truevalue="true" falsevalue="false" />
+
+      <when value="true">
+        <param name="rg_cn" type="text" label="Read group center" />
+        <param name="rg_dt" type="text" label="Read group date" />
+        <param name="rg_id" type="text" label="Read group id" />
+        <param name="rg_lb" type="text" label="Read group library" />
+        <param name="rg_pl" type="text" label="Read group platform" />
+        <param name="rg_pu" type="text" label="Read group platform unit" />
+        <param name="rg_sm" type="text" label="Read group sample" />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output" format="sam" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+Merge Alignments joins a pathset of alignment part-files into a single SAM file, placing a header generated by Seal at the top.
+For the full help see the `Seal manual &lt;http://biodoop-seal.sourceforge.net/&gt;`_.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/prq.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,118 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_prq" name="Prq" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Convert qseq or fastq files to prq on Hadoop</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+  <!--
+    Command template: runs "seal prq" through hadoop_galaxy.  The selected
+    input format is given to both hadoop_galaxy and seal prq, and the
+    selected base-quality encoding is set for both the qseq and the fastq
+    input properties.  Each seal.prq.* property is passed only when its
+    parameter evaluates as true.
+  -->
+  <command>
+    hadoop_galaxy
+    --input $input_data
+    --input-format $input_format.type
+    --output $output1
+    --executable seal
+    prq
+    --input-format $input_format.type
+    --num-reducers $num_reducers
+    -D hbam.qseq-input.base-quality-encoding=$input_format.bq_encoding
+    -D hbam.fastq-input.base-quality-encoding=$input_format.bq_encoding
+
+    #if $bpr
+      -D seal.prq.min-bases-per-read=$bpr
+    #end if
+    #if $drop_failed
+      -D seal.prq.drop-failed-filter=$drop_failed
+    #end if
+    #if $warn_unpaired
+      -D seal.prq.warning-only-if-unpaired=$warn_unpaired
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Input data" />
+    <conditional name="input_format">
+      <!-- We use a conditional for the input_format since we want a different
+           default base quality encoding value for each of the respective
+           supported formats, qseq and fastq.-->
+      <param name="type" type="select" label="Input format" default="qseq">
+        <option value="qseq" />
+        <option value="fastq" />
+      </param>
+      <when value="qseq">
+        <param name="bq_encoding" type="select" label="BQ encoding" default="illumina">
+          <option value="illumina">Illumina</option>
+          <option value="sanger">Sanger</option>
+        </param>
+      </when>
+      <when value="fastq">
+        <param name="bq_encoding" type="select" label="BQ encoding" default="sanger">
+          <option value="sanger">Sanger</option>
+          <option value="illumina">Illumina</option>
+        </param>
+      </when>
+    </conditional>
+
+    <param name="num_reducers"
+      label="Number of reduce tasks"
+      type="integer"
+      value="90"
+      min="1"
+      />
+
+    <!-- prq-specific parameters -->
+    <param name="bpr"
+      label="Min bases per read"
+      type="integer"
+      help="If neither read in a pair has at least this many known bases the pair is dropped (prop: seal.prq.min-bases-per-read)."
+      value="30"
+      min="0"
+      />
+    <param name="drop_failed"
+      label="Filter by machine quality check"
+      type="boolean"
+      help="Drop pairs if both reads failed machine quality checks (prop: seal.prq.drop-failed-filter)."
+      checked="true"
+      />
+    <param name="warn_unpaired"
+      label="Warn only on unpaired reads"
+      type="boolean"
+      help="PRQ normally gives an error if it finds an unpaired read. If this setting is checked it will instead emit a warning, drop the unpaired read and keep going (prop: seal.prq.warning-only-if-unpaired)."
+      checked="false"
+      />
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="pathset" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+PairReadsQSeq (PRQ) is a Hadoop utility to convert  Illumina qseq files into
+prq file format.  For the full help see the `manual &lt;http://biodoop-seal.sourceforge.net/prq_index.html&gt;`_.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/read_sort.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,68 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_read_sort" name="ReadSort" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Sort reads with Hadoop</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+  <!--
+    Runs "seal read_sort" through the hadoop_galaxy adapter.  A reference path
+    that starts with '/' is turned into a file:// URI; any other value is used
+    as given.  The annotations file is the reference path plus ".ann".
+  -->
+  <command>
+    #set $ref_path = 'file://' + $reference.fields.path if $reference.fields.path.startswith('/') else $reference.fields.path
+    hadoop_galaxy
+    --input $input_data
+    --output $output
+    --executable seal
+    read_sort
+    --annotations ${ref_path}.ann
+    --num-reducers $num_reducers
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Input data" />
+
+    <!-- The table name must match the one declared in tool_data_table_conf.xml.sample (bwa_0510_indices). -->
+    <param name="reference" type="select" label="Reference (should be the same one used for alignment)">
+      <options from_data_table="bwa_0510_indices" />
+    </param>
+
+    <param name="num_reducers"
+      label="Number of reduce tasks"
+      type="integer"
+      value="90"
+      min="1"
+      />
+  </inputs>
+
+  <outputs>
+    <data name="output" format="pathset" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+ReadSort is a Hadoop-based program for sorting reads by alignment position.
+For the full help see the `manual &lt;http://biodoop-seal.sourceforge.net/read_sort_index.html&gt;`_.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/recab_table.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,122 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_recab_table" name="Recab Table" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Calculate a base quality recalibration table on Hadoop.</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+
+  <!--
+    Positional arguments expected by recab_table_galaxy.py:
+      INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER]
+    The optional trailing "-D" properties are forwarded to "seal recab_table".
+    Parameter names referenced from the template must be valid identifiers
+    (no hyphens), hence "builtinFile".
+  -->
+  <command interpreter="python">
+    recab_table_galaxy.py
+    $input_data
+    $output1
+
+    #if $dbsnp.db_source == "history":
+        $dbsnp.ownFile
+    #else:
+        ${dbsnp.builtinFile.fields.path}
+    #end if
+
+    $num_reducers
+
+    #if $default_rg:
+    -D seal.recab.rg-covariate.default-rg=$default_rg
+    #end if
+
+    #if $smoothing:
+    -D seal.recab.smoothing=$smoothing
+    #end if
+
+    #if $max_qscore:
+    -D seal.recab.max-qscore=$max_qscore
+    #end if
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Input data" />
+    <!-- NOTE(review): input_format is not referenced by the command template above; confirm whether it should be forwarded to seal. -->
+    <param name="input_format" type="select" label="Input format">
+      <option value="sam" selected="true">SAM</option>
+      <option value="bam">BAM</option>
+    </param>
+
+    <conditional name="dbsnp">
+      <param name="db_source" type="select" label="Select database of known variation sites">
+        <option value="built-in">Select a different built-in database</option>
+        <option value="history">Use a database (vcf format) from my history</option>
+      </param>
+
+      <when value="built-in">
+        <param name="builtinFile" type="select" label="Select a built-in database">
+          <options from_data_table="variant_tables"/>
+        </param>
+      </when>
+
+      <when value="history">
+        <param name="ownFile" type="data" format="vcf" label="Select a database from history"/>
+      </when>
+    </conditional>
+
+
+    <param name="num_reducers"
+      label="Number of reduce tasks"
+      type="integer"
+      value="90"
+      min="1"
+      />
+
+    <!-- recab-specific parameters -->
+    <param name="default_rg"
+      label="Default read group"
+      type="text"
+      help="Read group to assign to mappings without an RG tag. This value is mandatory if your data includes mappings that do not have a read group tag (RG) Seal RecabTable property: seal.recab.rg-covariate.default-rg."
+      />
+    <param name="smoothing"
+      label="Smoothing"
+      type="integer"
+      value="0"
+      help="Smoothing parameter for empirical quality calculation. Seal RecabTable property: seal.recab.smoothing."
+      min="0"
+      />
+    <param name="max_qscore"
+      label="Max quality score"
+      type="integer"
+      value="40"
+      min="1"
+      help="Upper limit for the empirical quality scores. Seal RecabTable property: seal.recab.max-qscore."
+      />
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="csv" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+RecabTable is a Hadoop program to calculate a table of base qualities for all values of a given set of factors. It computes a result equivalent to the GATK CountCovariatesWalker.
+For the full help see the `manual &lt;http://biodoop-seal.sourceforge.net/recab_table_index.html&gt;`_.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/recab_table_galaxy.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+"""
+Calls the Seal RecabTable tool.  Then, it calls recab_table_fetch to
+concatenate all the partial tables and create a single csv file.
+"""
+
+
+# parameters:
+#    INPUT_DATA
+#    OUTPUT
+#    VCF
+#    NUM_REDUCERS
+#    [OTHER]
+
+import os
+import sys
+
+import hadoop_galaxy.pathset as pathset
+import subprocess
+import tempfile
+import pydoop.hdfs as phdfs
+
+# XXX: add --append-python-path to the possible arguments?
+
+def usage_error(msg=None):
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, os.path.basename(sys.argv[0]), "INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER]"
+  sys.exit(1)
+
+
+def run_recab(input_path, output_path, vcf, num_red, other_args):
+  mydir = os.path.abspath(os.path.dirname(__file__))
+  cmd = [
+    'hadoop_galaxy',
+    '--input', input_path,
+    '--output', output_path,
+    '--executable', 'seal',
+    'recab_table',
+    '--vcf-file', vcf,
+    '--num-reducers', num_red
+  ]
+
+  if other_args:
+    cmd.extend(other_args)
+
+  # now execute the hadoop job
+  subprocess.check_call(cmd)
+
+def collect_table(pset, output_path):
+  # finally, fetch the result into the final output file
+  cmd = ['seal', 'recab_table_fetch']
+  cmd.extend(pset.get_paths())
+  cmd.append(output_path)
+  try:
+    # remove the file that galaxy creates.  recab_table_fetch refuses to
+    # overwrite it
+    os.unlink(output_path)
+  except IOError:
+    pass
+  subprocess.check_call(cmd)
+
+def cleanup(out_pathset):
+  # clean-up job output
+  for path in out_pathset:
+    try:
+      print >> sys.stderr, "Deleting output path", path
+      phdfs.rmr(path)
+    except StandardError as e:
+      print >> sys.stderr, "Error!", str(e)
+
+def main(args):
+  if len(args) < 5:
+    usage_error()
+
+  input_data            = args[0]
+  final_output          = args[1]
+  vcf                   = args[2]
+  num_reducers          = args[3]
+  other                 = args[4:]
+
+  # Create a temporary pathset to reference the recab_table
+  # output directory
+  with tempfile.NamedTemporaryFile(mode='rwb') as tmp_pathset_file:
+    try:
+      run_recab(input_data, tmp_pathset_file.name, vcf, num_reducers, other)
+      tmp_pathset_file.seek(0)
+      out_paths = pathset.FilePathset.from_file(tmp_pathset_file)
+      collect_table(out_paths, final_output)
+    finally:
+      cleanup(out_paths)
+
+if __name__ == "__main__":
+  main(sys.argv[1:])
+
+# vim: et ai ts=2 sw=2
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/seqal.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,83 @@
+
+<!--
+  Copyright (C) 2011-2014 CRS4.
+
+  This file is part of Seal.
+
+  Seal is free software: you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by the Free
+  Software Foundation, either version 3 of the License, or (at your option)
+  any later version.
+
+  Seal is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+  for more details.
+
+  You should have received a copy of the GNU General Public License along
+  with Seal.  If not, see <http://www.gnu.org/licenses/>.
+-->
+
+
+<tool id="seal_seqal" name="Seqal" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+  <description>Map reads on Hadoop</description>
+  <requirements>
+    <requirement type="package" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">seal</requirement>
+    <requirement type="package" version="0.11">pydoop</requirement>
+    <requirement type="package" version="0.1.3">hadoop-galaxy</requirement>
+  </requirements>
+
+  <!--
+    Runs "seal seqal" through the hadoop_galaxy adapter.  In align-only mode
+    no reducers are used; otherwise the user-selected number of reduce tasks
+    is passed along.
+  -->
+  <command>
+    hadoop_galaxy
+    --input $input_data
+    --output $output1
+    --executable seal
+    seqal
+    #if $align_only.value:
+      --align-only --num-reducers 0
+    #else
+      --num-reducers $align_only.num_reducers
+    #end if
+    --trimq $trimq
+    ${reference.fields.path}
+  </command>
+
+  <inputs>
+    <param name="input_data" type="data" format="pathset" label="Input data" />
+
+    <param name="reference" type="select" label="Select a built-in reference index archive">
+      <options from_data_table="seqal_indexes">
+      </options>
+    </param>
+
+    <param name="trimq" type="integer" min="0" value="0" label="trim quality, like BWA’s -q argument" />
+
+    <conditional name="align_only">
+      <!-- Boolean parameters take their initial state from "checked". -->
+      <param name="value" type="boolean" checked="false" label="Align only (don't identify duplicates)" />
+      <when value="true" />
+      <when value="false">
+        <param name="num_reducers"
+          label="Number of reduce tasks"
+          type="integer"
+          value="90"
+          min="1"
+          />
+      </when>
+    </conditional>
+  </inputs>
+
+  <outputs>
+    <data name="output1" format="pathset" />
+  </outputs>
+
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+  </stdio>
+
+  <help>
+    Seqal is a distributed short read mapping and duplicate removal tool. It
+    implements a distributed version of the BWA aligner, and adds a duplicate
+    read identification feature using the same criteria as the Picard
+    MarkDuplicates command.  For a full description see the `manual
+    &lt;http://biodoop-seal.sourceforge.net/seqal_index.html&gt;`_.
+  </help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/split_demux_output.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import logging
+import os
+import sys
+
+import pydoop.hdfs as phdfs
+
+from hadoop_galaxy.pathset import FilePathset
+
+Debug = os.environ.get('DEBUG', None)
+logging.basicConfig(level=logging.DEBUG if Debug else logging.INFO)
+
+def usage_error(msg=None):
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, "Usage:  %s OUTPUT_ID DEMUX_OUTPUT_PATHSET NEW_FILE_DIR" % os.path.basename(sys.argv[0])
+  sys.exit(1)
+
+
+class PathsetWriter(object):
+  # The format is dictated by the Galaxy documentation for tools that produce a variable
+  # number of output files:  http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple%20Output%20Files
+  # We fix the file_type to 'pathset'.
+  Galaxy_output_name_template = "primary_%s_%s_visible_pathset"
+
+  def __init__(self, output_dir, output_id, data_type):
+    self.output_dir = output_dir
+    self.output_id = output_id
+    self.data_type = data_type
+
+  def write_pathset(self, dataset_path, name):
+    """
+    dataset_path: the path of the dataset to which the new pathset needs to refer
+    name:  name of dataset to appear in Galaxy
+    """
+    if not name:
+      raise RuntimeError("Blank dataset name")
+    sanitized_name = name.replace('_', '-') # replace _ with - or galaxy won't like the name
+    opathset = FilePathset(dataset_path)
+    opathset.set_datatype(self.data_type)
+    opath = os.path.join(self.output_dir, self.Galaxy_output_name_template % (self.output_id, sanitized_name))
+    logging.debug("writing dataset path %s to pathset file %s", dataset_path, opath)
+    with open(opath, 'w') as f:
+      opathset.write(f)
+    return self # to allow chaining
+
+
+
+def main():
+  if len(sys.argv) != 4:
+    usage_error("Wrong number of arguments")
+
+  output_id, demux_data, dest_dir = sys.argv[1:]
+  logging.debug("input args: output_id, demux_data, dest_dir = %s", sys.argv[1:])
+
+  ipathset = FilePathset.from_file(demux_data)
+  logging.debug("input path set: %s", ipathset)
+
+  writer = PathsetWriter(dest_dir, output_id, ipathset.datatype)
+
+  # ipathset points to the output directory given to demux.  Inside it
+  # we should find all the project/sample subdirectories, plus 'unknown' (if there
+  # were any reads not attributable to a sample).  So, we list the output
+  # dir and collect sample names and their paths.  In theory, the pathset
+  # we receive as input should only contains the output from one demux; thus
+  # a sample should only occur once.
+  if len(ipathset) != 1:
+    raise RuntimeError("Unexpected demux output pathset size of %d.  Expected 1 (the demux output path)" % len(ipathset))
+
+  project_paths = \
+    filter(lambda p: os.path.basename(p)[0] not in ('_', '.'), # filter hadoop and regular hidden files
+      phdfs.ls(iter(ipathset).next()) # List the contents of the pathset. ls produces absolute paths
+    )
+  # Each project_path points to a directory containing the data from one project.
+  # There may also be a directory 'unknown'
+  for project_path in project_paths:
+    if os.path.basename(project_path).lower() == 'unknown':
+      writer.write_pathset(project_path, 'unknown')
+    else:
+      for project_sample_path in phdfs.ls(project_path):
+        # take the last two elements of the path -- should be project, sample
+        complete_sample_name = "%s.%s" % tuple(project_sample_path.split(os.path.sep)[-2:])
+        writer.write_pathset(project_sample_path, complete_sample_name)
+
+if __name__ == '__main__':
+  main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal_tool_conf.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,14 @@
+<?xml version="1.0"?>
+
+<!-- Tool panel section that lists the Seal tool wrappers. -->
+<toolbox>
+  <section name="Seal" id="seal">
+    <tool file="seal_galaxy/seal/bcl2qseq.xml" />
+    <tool file="seal_galaxy/seal/demux.xml" />
+    <tool file="seal_galaxy/seal/prq.xml" />
+    <tool file="seal_galaxy/seal/seqal.xml" />
+    <tool file="seal_galaxy/seal/read_sort.xml" />
+    <tool file="seal_galaxy/seal/merge_alignments.xml" />
+    <tool file="seal_galaxy/seal/recab_table.xml" />
+    <tool file="seal_galaxy/seal/generate_sam_header.xml" />
+  </section>
+</toolbox>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/tool_data_table_conf.xml.sample	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,16 @@
+<tables>
+    <!--
+      Sample data table declarations.  Every table has the columns
+      name, value and path.
+      NOTE(review): the file elements below are commented out and all three
+      point to bwa_0510_indices.loc, which looks like a copy/paste for the
+      seqal_indexes and variant_tables entries.  Confirm the intended .loc
+      file for each table before enabling them.
+    -->
+    <table name="bwa_0510_indices" comment_char="#">
+        <columns>name, value, path</columns>
+        <!--<file path="tool-data/bwa_0510_indices.loc" />-->
+    </table>
+
+    <table name="seqal_indexes" comment_char="#">
+        <columns>name, value, path</columns>
+        <!--<file path="tool-data/bwa_0510_indices.loc" />-->
+    </table>
+
+    <table name="variant_tables" comment_char="#">
+        <columns>name, value, path</columns>
+        <!--<file path="tool-data/bwa_0510_indices.loc" />-->
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/tool_dependencies.xml	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,39 @@
+<?xml version="1.0"?>
+<tool_dependency>
+  <package name="hadoop-galaxy" version="0.1.3">
+    <repository changeset_revision="30bd2584b6a0" name="hadoop_galaxy" owner="crs4" prior_installation_required="True" toolshed="https://toolshed.g2.bx.psu.edu" />
+  </package>
+
+  <package name="seal" version="13986416aa79561bd0102cb7ccc1e0668ac9f0a4">
+    <install version="1.0">
+      <actions>
+        <action type="shell_command">git clone https://github.com/crs4/seal.git</action>
+        <action type="shell_command">git checkout master</action>
+        <action type="shell_command">git reset --hard 13986416aa79561bd0102cb7ccc1e0668ac9f0a4</action>
+        <!--<action type="download_by_url">https://github.com/crs4/seal/archive/0.4.0-rc2.tar.gz</action>-->
+        <action type="set_environment_for_install">
+          <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable>
+        </action>
+        <action type="make_directory">$INSTALL_DIR/lib/python</action>
+        <action type="shell_command">python setup.py build_hadoop_bam</action>
+        <action type="shell_command">python setup.py install --prefix=$INSTALL_DIR --install-lib=$INSTALL_DIR/lib/python</action>
+        <action type="set_environment">
+          <environment_variable action="prepend_to" name="PATH">$INSTALL_DIR/bin</environment_variable>
+          <environment_variable action="prepend_to" name="PYTHONPATH">$INSTALL_DIR/lib/python</environment_variable>
+        </action>
+      </actions>
+    </install>
+    <readme>
+This package has a number of dependencies that need to be installed before it:
+
+* Pydoop needs to be installed (it will be pulled down as a dependency; see
+that package's instructions for its own installation pointers)
+
+* protobuf-python
+
+* JDK and Ant (Ant version 1.7 or later)
+
+Please see http://biodoop-seal.sourceforge.net/installation_dependencies.html for more details.
+    </readme>
+  </package>
+</tool_dependency>