Mercurial > repos > crs4 > seal_galaxy
comparison seal-galaxy-cc1b1911/seal/recab_table_galaxy.py @ 0:244073d9abc1 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 15 Oct 2014 09:41:10 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:244073d9abc1 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (C) 2011-2014 CRS4. | |
4 # | |
5 # This file is part of Seal. | |
6 # | |
7 # Seal is free software: you can redistribute it and/or modify it | |
8 # under the terms of the GNU General Public License as published by the Free | |
9 # Software Foundation, either version 3 of the License, or (at your option) | |
10 # any later version. | |
11 # | |
12 # Seal is distributed in the hope that it will be useful, but | |
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY | |
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
15 # for more details. | |
16 # | |
17 # You should have received a copy of the GNU General Public License along | |
18 # with Seal. If not, see <http://www.gnu.org/licenses/>. | |
19 | |
20 | |
21 | |
22 """ | |
23 Calls the Seal RecabTable tool. Then, it calls recab_table_fetch to | |
24 concatenate all the partial tables and create a single csv file. | |
25 """ | |
26 | |
27 | |
28 # parameters: | |
29 # INPUT_DATA | |
30 # OUTPUT | |
31 # VCF | |
32 # NUM_REDUCERS | |
33 # [OTHER] | |
34 | |
35 import os | |
36 import sys | |
37 | |
38 import hadoop_galaxy.pathset as pathset | |
39 import subprocess | |
40 import tempfile | |
41 import pydoop.hdfs as phdfs | |
42 | |
43 # XXX: add --append-python-path to the possible arguments? | |
44 | |
45 def usage_error(msg=None): | |
46 if msg: | |
47 print >> sys.stderr, msg | |
48 print >> sys.stderr, os.path.basename(sys.argv[0]), "INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER]" | |
49 sys.exit(1) | |
50 | |
51 | |
52 def run_recab(input_path, output_path, vcf, num_red, other_args): | |
53 mydir = os.path.abspath(os.path.dirname(__file__)) | |
54 cmd = [ | |
55 'hadoop_galaxy', | |
56 '--input', input_path, | |
57 '--output', output_path, | |
58 '--executable', 'seal', | |
59 'recab_table', | |
60 '--vcf-file', vcf, | |
61 '--num-reducers', num_red | |
62 ] | |
63 | |
64 if other_args: | |
65 cmd.extend(other_args) | |
66 | |
67 # now execute the hadoop job | |
68 subprocess.check_call(cmd) | |
69 | |
70 def collect_table(pset, output_path): | |
71 # finally, fetch the result into the final output file | |
72 cmd = ['seal', 'recab_table_fetch'] | |
73 cmd.extend(pset.get_paths()) | |
74 cmd.append(output_path) | |
75 try: | |
76 # remove the file that galaxy creates. recab_table_fetch refuses to | |
77 # overwrite it | |
78 os.unlink(output_path) | |
79 except IOError: | |
80 pass | |
81 subprocess.check_call(cmd) | |
82 | |
83 def cleanup(out_pathset): | |
84 # clean-up job output | |
85 for path in out_pathset: | |
86 try: | |
87 print >> sys.stderr, "Deleting output path", path | |
88 phdfs.rmr(path) | |
89 except StandardError as e: | |
90 print >> sys.stderr, "Error!", str(e) | |
91 | |
92 def main(args): | |
93 if len(args) < 5: | |
94 usage_error() | |
95 | |
96 input_data = args[0] | |
97 final_output = args[1] | |
98 vcf = args[2] | |
99 num_reducers = args[3] | |
100 other = args[4:] | |
101 | |
102 # Create a temporary pathset to reference the recab_table | |
103 # output directory | |
104 with tempfile.NamedTemporaryFile(mode='rwb') as tmp_pathset_file: | |
105 try: | |
106 run_recab(input_data, tmp_pathset_file.name, vcf, num_reducers, other) | |
107 tmp_pathset_file.seek(0) | |
108 out_paths = pathset.FilePathset.from_file(tmp_pathset_file) | |
109 collect_table(out_paths, final_output) | |
110 finally: | |
111 cleanup(out_paths) | |
112 | |
113 if __name__ == "__main__": | |
114 main(sys.argv[1:]) | |
115 | |
116 # vim: et ai ts=2 sw=2 |