#!/usr/bin/env python

# Copyright (C) 2011-2014 CRS4.
#
# This file is part of Seal.
#
# Seal is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# Seal is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with Seal.  If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
|
20
|
|
21
|
|
22 import logging
|
|
23 import os
|
|
24 import sys
|
|
25
|
|
26 import pydoop.hdfs as phdfs
|
|
27
|
|
28 from hadoop_galaxy.pathset import FilePathset
|
|
29
|
|
# Logging verbosity is driven by the DEBUG environment variable: any
# non-empty value selects DEBUG level, otherwise INFO is used.
Debug = os.environ.get('DEBUG')
if Debug:
    logging.basicConfig(level=logging.DEBUG)
else:
    logging.basicConfig(level=logging.INFO)
|
|
32
|
|
def usage_error(msg=None):
    """
    Print an optional error message and the usage string to stderr, then exit.

    msg: optional message written on its own line before the usage string;
         nothing extra is written when it is empty or None.

    This function never returns: it calls sys.exit(1), which raises
    SystemExit with exit status 1.
    """
    # Write to the stream directly instead of using the Python 2-only
    # "print >> stream" statement, so the output is the same under both
    # Python 2 and Python 3.
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    program_name = os.path.basename(sys.argv[0])
    sys.stderr.write("Usage: %s OUTPUT_ID DEMUX_OUTPUT_PATHSET NEW_FILE_DIR\n" % program_name)
    sys.exit(1)
|
|
38
|
|
39
|
|
class PathsetWriter(object):
    """
    Writes pathset files into a single output directory.

    Every file written by an instance shares the same output id and the
    same pathset data type; the file name additionally encodes the dataset
    name passed to write_pathset.
    """
    # File naming scheme for tools producing a variable number of outputs,
    # as described by the Galaxy documentation:
    #   http://wiki.g2.bx.psu.edu/Admin/Tools/Multiple%20Output%20Files
    # The trailing file_type field is always 'pathset'.
    Galaxy_output_name_template = "primary_%s_%s_visible_pathset"

    def __init__(self, output_dir, output_id, data_type):
        """
        output_dir: directory in which the pathset files are created
        output_id: id inserted into the name of every pathset file
        data_type: data type set on every pathset that gets written
        """
        self.data_type = data_type
        self.output_id = output_id
        self.output_dir = output_dir

    def write_pathset(self, dataset_path, name):
        """
        Create a pathset file referring to dataset_path.

        dataset_path: the path of the dataset the new pathset must refer to
        name: name under which the dataset is to appear in Galaxy

        Returns this writer, so that calls can be chained.
        Raises RuntimeError if name is blank.
        """
        if not name:
            raise RuntimeError("Blank dataset name")
        # Galaxy won't like a name containing '_', so use '-' instead.
        galaxy_name = name.replace('_', '-')
        pathset = FilePathset(dataset_path)
        pathset.set_datatype(self.data_type)
        file_name = self.Galaxy_output_name_template % (self.output_id, galaxy_name)
        destination = os.path.join(self.output_dir, file_name)
        logging.debug("writing dataset path %s to pathset file %s", dataset_path, destination)
        with open(destination, 'w') as out:
            pathset.write(out)
        return self
|
|
66
|
|
67
|
|
68
|
|
def main():
    """
    Write one pathset file per sample found in a demux output directory.

    Command line arguments (sys.argv[1:]):
      OUTPUT_ID: id inserted into the name of every pathset file written
      DEMUX_OUTPUT_PATHSET: pathset file referring to the demux output directory
      NEW_FILE_DIR: directory in which the new pathset files are created

    Exits through usage_error if the number of arguments is wrong.
    Raises RuntimeError if the input pathset does not contain exactly one path.
    """
    if len(sys.argv) != 4:
        usage_error("Wrong number of arguments")

    output_id, demux_data, dest_dir = sys.argv[1:]
    logging.debug("input args: output_id, demux_data, dest_dir = %s", sys.argv[1:])

    ipathset = FilePathset.from_file(demux_data)
    logging.debug("input path set: %s", ipathset)

    writer = PathsetWriter(dest_dir, output_id, ipathset.datatype)

    # ipathset points to the output directory given to demux.  Inside it
    # we should find all the project/sample subdirectories, plus 'unknown' (if there
    # were any reads not attributable to a sample).  So, we list the output
    # dir and collect sample names and their paths.  In theory, the pathset
    # we receive as input should only contain the output from one demux; thus
    # a sample should only occur once.
    if len(ipathset) != 1:
        raise RuntimeError("Unexpected demux output pathset size of %d.  Expected 1 (the demux output path)" % len(ipathset))

    # The built-in next() works on both Python 2 and 3 iterators, unlike
    # calling the Python 2-only .next() method.
    demux_output_path = next(iter(ipathset))

    # List the contents of the pathset (ls produces absolute paths), leaving
    # out hadoop and regular hidden files.  startswith() is used rather than
    # indexing the first character so that an empty basename can't raise
    # IndexError.
    project_paths = [
        path for path in phdfs.ls(demux_output_path)
        if not os.path.basename(path).startswith(('_', '.'))
    ]

    # Each project_path points to a directory containing the data from one project.
    # There may also be a directory 'unknown'
    for project_path in project_paths:
        if os.path.basename(project_path).lower() == 'unknown':
            writer.write_pathset(project_path, 'unknown')
        else:
            for project_sample_path in phdfs.ls(project_path):
                # take the last two elements of the path -- should be project, sample
                complete_sample_name = "%s.%s" % tuple(project_sample_path.split(os.path.sep)[-2:])
                writer.write_pathset(project_sample_path, complete_sample_name)
|
|
104
|
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|