Mercurial > repos > crs4 > seal_galaxy
comparison seal-galaxy-cc1b1911/seal/split_demux_output.py @ 0:244073d9abc1 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 15 Oct 2014 09:41:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:244073d9abc1 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (C) 2011-2014 CRS4. | |
4 # | |
5 # This file is part of Seal. | |
6 # | |
7 # Seal is free software: you can redistribute it and/or modify it | |
8 # under the terms of the GNU General Public License as published by the Free | |
9 # Software Foundation, either version 3 of the License, or (at your option) | |
10 # any later version. | |
11 # | |
12 # Seal is distributed in the hope that it will be useful, but | |
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 # for more details. | |
16 # | |
17 # You should have received a copy of the GNU General Public License along | |
18 # with Seal. If not, see <http://www.gnu.org/licenses/>. | |
19 | |
20 | |
21 | |
22 import logging | |
23 import os | |
24 import sys | |
25 | |
26 import pydoop.hdfs as phdfs | |
27 | |
28 from hadoop_galaxy.pathset import FilePathset | |
29 | |
# Verbose (DEBUG-level) logging is switched on by setting the DEBUG
# environment variable to any non-empty value; otherwise INFO is used.
Debug = os.environ.get('DEBUG')
if Debug:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)
32 | |
def usage_error(msg=None):
    """
    Write an optional error message and the usage line to stderr, then exit.

    msg: optional text written on its own line before the usage line; it is
         skipped when empty or None.

    This function never returns: it always calls sys.exit(1), which raises
    SystemExit.
    """
    # sys.stderr.write emits the same text as the Python 2 ``print >> f``
    # statement, but it also works under Python 3, where ``print >> f`` is
    # parsed as an expression and fails with TypeError at run time.
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    program_name = os.path.basename(sys.argv[0])
    sys.stderr.write("Usage: %s OUTPUT_ID DEMUX_OUTPUT_PATHSET NEW_FILE_DIR\n" % program_name)
    sys.exit(1)
38 | |
39 | |
class PathsetWriter(object):
    """
    Writes pathset files, one per dataset, into a single output directory.

    Each file is named after Galaxy_output_name_template, filled in with the
    output id given at construction time and the (sanitized) dataset name.
    """

    # The format is dictated by the Galaxy documentation for tools that produce a variable
    # number of output files: http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple%20Output%20Files
    # The file_type component is fixed to 'pathset'.
    Galaxy_output_name_template = "primary_%s_%s_visible_pathset"

    def __init__(self, output_dir, output_id, data_type):
        """
        output_dir: directory in which the pathset files are created
        output_id: id inserted into every generated file name
        data_type: datatype set on every pathset that gets written
        """
        self.output_dir = output_dir
        self.output_id = output_id
        self.data_type = data_type

    def write_pathset(self, dataset_path, name):
        """
        Create a pathset file that refers to dataset_path.

        dataset_path: the path of the dataset to which the new pathset needs to refer
        name: name of dataset to appear in Galaxy; must not be blank

        Raises RuntimeError if name is empty.
        Returns self so that calls can be chained.
        """
        if not name:
            raise RuntimeError("Blank dataset name")

        # Underscores are swapped for dashes or galaxy won't like the name.
        galaxy_name = name.replace('_', '-')

        pathset = FilePathset(dataset_path)
        pathset.set_datatype(self.data_type)

        file_name = self.Galaxy_output_name_template % (self.output_id, galaxy_name)
        target = os.path.join(self.output_dir, file_name)
        logging.debug("writing dataset path %s to pathset file %s", dataset_path, target)
        with open(target, 'w') as out:
            pathset.write(out)
        return self
66 | |
67 | |
68 | |
def main():
    """
    Split the output of a demux run into one Galaxy pathset file per sample.

    Command line arguments: OUTPUT_ID DEMUX_OUTPUT_PATHSET NEW_FILE_DIR

    Reads the pathset file DEMUX_OUTPUT_PATHSET, lists the directory it
    points to and writes, through PathsetWriter, one pathset file into
    NEW_FILE_DIR for each project/sample directory found, plus one for the
    'unknown' directory if it is present.

    Exits through usage_error() when the argument count is wrong.
    Raises RuntimeError if the input pathset does not contain exactly one path.
    """
    if len(sys.argv) != 4:
        usage_error("Wrong number of arguments")

    output_id, demux_data, dest_dir = sys.argv[1:]
    logging.debug("input args: output_id, demux_data, dest_dir = %s", sys.argv[1:])

    ipathset = FilePathset.from_file(demux_data)
    logging.debug("input path set: %s", ipathset)

    writer = PathsetWriter(dest_dir, output_id, ipathset.datatype)

    # ipathset points to the output directory given to demux.  Inside it
    # we should find all the project/sample subdirectories, plus 'unknown' (if there
    # were any reads not attributable to a sample).  So, we list the output
    # dir and collect sample names and their paths.  In theory, the pathset
    # we receive as input should only contain the output from one demux; thus
    # a sample should only occur once.
    if len(ipathset) != 1:
        raise RuntimeError("Unexpected demux output pathset size of %d.  Expected 1 (the demux output path)" % len(ipathset))

    # next(iter(...)) works on both Python 2.6+ and Python 3, unlike the
    # Python 2-only iterator method .next().
    demux_output_path = next(iter(ipathset))

    # List the contents of the demux output directory, dropping hadoop ('_...')
    # and regular ('.…') hidden entries.  startswith() with a tuple also copes
    # with an empty basename, where indexing [0] would raise IndexError.
    project_paths = [
        p for p in phdfs.ls(demux_output_path)
        if not os.path.basename(p).startswith(('_', '.'))
    ]

    # Each project_path points to a directory containing the data from one project.
    # There may also be a directory 'unknown'.
    for project_path in project_paths:
        if os.path.basename(project_path).lower() == 'unknown':
            writer.write_pathset(project_path, 'unknown')
        else:
            for project_sample_path in phdfs.ls(project_path):
                # take the last two elements of the path -- should be project, sample
                complete_sample_name = "%s.%s" % tuple(project_sample_path.split(os.path.sep)[-2:])
                writer.write_pathset(project_sample_path, complete_sample_name)


if __name__ == '__main__':
    main()