diff seal-galaxy-cc1b1911/seal/demux_galaxy.py @ 0:244073d9abc1 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:41:10 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/demux_galaxy.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,141 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+"""
+Calls the Seal Demux tool.  Then, it calls the custom galaxy integration script
+split_demux_output.py to generate one Galaxy dataset per each sample extracted
+by Demux.
+"""
+
+# parameters:
+#    INPUT_DATA
+#    MISMATCHES
+#    NEW_FILE_PATH
+#    NUM_REDUCERS
+#    OUTPUT1
+#    OUTPUT_ID
+#    SAMPLE_SHEET
+#    INPUT_FORMAT
+#    OUTPUT_FORMAT
+#    OUTPUT_COMPRESSION
+#    SEPARATE_READS
+
+import os
+import re
+import subprocess
+import sys
+
+# XXX: add --append-python-path to the possible arguments?
+
+def parse_indexed(s):
+  if s is not None:
+    normalized = s.lower().strip()
+    if normalized == 'notindexed':
+      return False
+    elif normalized == 'indexed':
+      return True
+  return None # failed to parse
+
+def parse_index_present(param):
+  is_indexed = parse_indexed(param)
+  if is_indexed is None:
+    # try to read it as a file
+    if os.path.isfile(param):
+      with open(param) as f:
+        contents = f.readline(10000)
+        uri, value = contents.split("\t", 1)
+        is_indexed = parse_indexed(value)
+        if is_indexed is None:
+          raise RuntimeError("Error determining whether run has an index read. " + \
+              "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents)
+  return is_indexed
+
+def usage_error(msg=None):
+  print >> sys.stderr, "Usage error"
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, "Usage:", os.path.basename(sys.argv[0]),\
+    "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"
+  sys.exit(1)
+
+
+if __name__ == "__main__":
+  if len(sys.argv) != 13:
+    usage_error()
+
+  input_data         = sys.argv[1]
+  mismatches         = sys.argv[2]
+  new_file_path      = sys.argv[3]
+  num_reducers       = sys.argv[4]
+  output1            = sys.argv[5]
+  output_id          = sys.argv[6]
+  sample_sheet       = sys.argv[7]
+  input_format       = sys.argv[8]
+  output_format      = sys.argv[9]
+  output_compression = sys.argv[10]
+  index_present      = sys.argv[11]
+  separate_reads     = sys.argv[12]
+
+  mydir = os.path.abspath(os.path.dirname(__file__))
+
+  # Run the demux program
+  cmd = [
+      'hadoop_galaxy',
+      '--input', input_data,
+      '--input-format', input_format, # --input-format for hadoop-galaxy
+      '--output', output1,
+      '--executable', 'seal',
+      'demux',
+      '--sample-sheet', sample_sheet,
+      '--input-format', input_format, # --input-format for seal demux
+      '--output-format', output_format
+    ]
+  if re.match(r'\s*\d+\s*', num_reducers):
+    cmd.extend( ('--num-reducers', num_reducers) )
+
+  if output_compression.lower() != 'none':
+    cmd.extend( ('--compress-output', output_compression) )
+
+  if mismatches != '0':
+    cmd.extend( ('--mismatches', mismatches) )
+
+  is_indexed = parse_index_present(index_present)
+  if is_indexed is False:
+    cmd.append("--no-index")
+
+  norm_separate_reads = separate_reads.lower().strip()
+  if norm_separate_reads == 'separate-reads':
+    cmd.append("--separate-reads")
+  elif norm_separate_reads.startswith('f'):
+    pass
+  else:
+    raise RuntimeError("Unrecognized value for separate-reads parameter:  '%s'" % separate_reads)
+
+  print >> sys.stderr, ' '.join(cmd)
+  subprocess.check_call(cmd)
+
+  ###
+  # now the second phase: split_demux_output.py
+  cmd = [
+      os.path.join(mydir, 'split_demux_output.py'),
+      output_id, output1, new_file_path ]
+  print >> sys.stderr, ' '.join(cmd)
+  subprocess.check_call(cmd)