diff seal-galaxy-cc1b1911/seal/split_demux_output.py @ 0:244073d9abc1 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:41:10 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/seal-galaxy-cc1b1911/seal/split_demux_output.py	Wed Oct 15 09:41:10 2014 -0400
@@ -0,0 +1,106 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2011-2014 CRS4.
+#
+# This file is part of Seal.
+#
+# Seal is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option)
+# any later version.
+#
+# Seal is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with Seal.  If not, see <http://www.gnu.org/licenses/>.
+
+
+
+import logging
+import os
+import sys
+
+import pydoop.hdfs as phdfs
+
+from hadoop_galaxy.pathset import FilePathset
+
+Debug = os.environ.get('DEBUG', None)
+logging.basicConfig(level=logging.DEBUG if Debug else logging.INFO)
+
+def usage_error(msg=None):
+  if msg:
+    print >> sys.stderr, msg
+  print >> sys.stderr, "Usage:  %s OUTPUT_ID DEMUX_OUTPUT_PATHSET NEW_FILE_DIR" % os.path.basename(sys.argv[0])
+  sys.exit(1)
+
+
+class PathsetWriter(object):
+  # The format is dictated by the Galaxy documentation for tools that produce a variable
+  # number of output files:  http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple%20Output%20Files
+  # We fix the file_type to 'pathset'.
+  Galaxy_output_name_template = "primary_%s_%s_visible_pathset"
+
+  def __init__(self, output_dir, output_id, data_type):
+    self.output_dir = output_dir
+    self.output_id = output_id
+    self.data_type = data_type
+
+  def write_pathset(self, dataset_path, name):
+    """
+    dataset_path: the path of the dataset to which the new pathset needs to refer
+    name:  name of dataset to appear in Galaxy
+    """
+    if not name:
+      raise RuntimeError("Blank dataset name")
+    sanitized_name = name.replace('_', '-') # replace _ with - or galaxy won't like the name
+    opathset = FilePathset(dataset_path)
+    opathset.set_datatype(self.data_type)
+    opath = os.path.join(self.output_dir, self.Galaxy_output_name_template % (self.output_id, sanitized_name))
+    logging.debug("writing dataset path %s to pathset file %s", dataset_path, opath)
+    with open(opath, 'w') as f:
+      opathset.write(f)
+    return self # to allow chaining
+
+
+
+def main():
+  if len(sys.argv) != 4:
+    usage_error("Wrong number of arguments")
+
+  output_id, demux_data, dest_dir = sys.argv[1:]
+  logging.debug("input args: output_id, demux_data, dest_dir = %s", sys.argv[1:])
+
+  ipathset = FilePathset.from_file(demux_data)
+  logging.debug("input path set: %s", ipathset)
+
+  writer = PathsetWriter(dest_dir, output_id, ipathset.datatype)
+
+  # ipathset points to the output directory given to demux.  Inside it
+  # we should find all the project/sample subdirectories, plus 'unknown' (if there
+  # were any reads not attributable to a sample).  So, we list the output
+  # dir and collect sample names and their paths.  In theory, the pathset
+  # we receive as input should only contains the output from one demux; thus
+  # a sample should only occur once.
+  if len(ipathset) != 1:
+    raise RuntimeError("Unexpected demux output pathset size of %d.  Expected 1 (the demux output path)" % len(ipathset))
+
+  project_paths = \
+    filter(lambda p: os.path.basename(p)[0] not in ('_', '.'), # filter hadoop and regular hidden files
+      phdfs.ls(iter(ipathset).next()) # List the contents of the pathset. ls produces absolute paths
+    )
+  # Each project_path points to a directory containing the data from one project.
+  # There may also be a directory 'unknown'
+  for project_path in project_paths:
+    if os.path.basename(project_path).lower() == 'unknown':
+      writer.write_pathset(project_path, 'unknown')
+    else:
+      for project_sample_path in phdfs.ls(project_path):
+        # take the last two elements of the path -- should be project, sample
+        complete_sample_name = "%s.%s" % tuple(project_sample_path.split(os.path.sep)[-2:])
+        writer.write_pathset(project_sample_path, complete_sample_name)
+
+if __name__ == '__main__':
+  main()