Mercurial > repos > crs4 > seal_galaxy
view seal-galaxy-cc1b1911/seal/recab_table_galaxy.py @ 0:244073d9abc1 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 15 Oct 2014 09:41:10 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python

# Copyright (C) 2011-2014 CRS4.
#
# This file is part of Seal.
#
# Seal is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# Seal is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with Seal.  If not, see <http://www.gnu.org/licenses/>.

"""
Calls the Seal RecabTable tool.  Then, it calls recab_table_fetch to
concatenate all the partial tables and create a single csv file.
"""

# parameters:
#   INPUT_DATA
#   OUTPUT
#   VCF
#   NUM_REDUCERS
#   [OTHER]

import errno
import os
import subprocess
import sys
import tempfile

import hadoop_galaxy.pathset as pathset
import pydoop.hdfs as phdfs

# XXX: add --append-python-path to the possible arguments?


def usage_error(msg=None):
  """
  Print an optional message and the usage line to stderr, then exit.

  :param msg: optional explanation printed before the usage line.
  :raises SystemExit: always, with exit status 1.
  """
  if msg:
    sys.stderr.write("%s\n" % (msg,))
  sys.stderr.write(
    "%s INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER]\n"
    % os.path.basename(sys.argv[0]))
  sys.exit(1)


def run_recab(input_path, output_path, vcf, num_red, other_args):
  """
  Run `seal recab_table` through the `hadoop_galaxy` launcher.

  :param input_path: value passed to hadoop_galaxy's --input.
  :param output_path: value passed to hadoop_galaxy's --output.
  :param vcf: value passed to recab_table's --vcf-file.
  :param num_red: value passed to --num-reducers; it must be a string since
    it is placed directly on the command line.
  :param other_args: extra command line arguments appended verbatim; may be
    empty or None.
  :raises subprocess.CalledProcessError: if the command exits non-zero.
  """
  cmd = [
    'hadoop_galaxy',
    '--input', input_path,
    '--output', output_path,
    '--executable', 'seal',
    'recab_table',
    '--vcf-file', vcf,
    '--num-reducers', num_red,
  ]

  if other_args:
    cmd.extend(other_args)

  # now execute the hadoop job
  subprocess.check_call(cmd)


def collect_table(pset, output_path):
  """
  Merge the partial tables into `output_path` with `seal recab_table_fetch`.

  :param pset: pathset whose get_paths() yields the partial table locations.
  :param output_path: local path of the final table.  Any existing file at
    this path is removed first.
  :raises OSError: if an existing output file cannot be removed.
  :raises subprocess.CalledProcessError: if recab_table_fetch exits non-zero.
  """
  # finally, fetch the result into the final output file
  cmd = ['seal', 'recab_table_fetch']
  cmd.extend(pset.get_paths())
  cmd.append(output_path)
  try:
    # remove the file that galaxy creates.  recab_table_fetch refuses to
    # overwrite it
    os.unlink(output_path)
  except OSError as e:
    # os.unlink reports failures as OSError.  A missing file is fine (there
    # is nothing to overwrite); anything else would make the fetch fail
    # anyway, so surface the real cause here.
    if e.errno != errno.ENOENT:
      raise
  subprocess.check_call(cmd)


def cleanup(out_pathset):
  """
  Delete every path in `out_pathset` with pydoop's hdfs.rmr.

  This is best-effort: a failure to delete one path is reported on stderr and
  the remaining paths are still processed.

  :param out_pathset: iterable of paths to delete.
  """
  # clean-up job output
  for path in out_pathset:
    try:
      sys.stderr.write("Deleting output path %s\n" % (path,))
      phdfs.rmr(path)
    except Exception as e:
      sys.stderr.write("Error! %s\n" % str(e))


def main(args):
  """
  Run recab_table, collect its partial tables and clean up the job output.

  :param args: command line arguments without the program name:
    INPUT_DATA OUTPUT VCF NUM_REDUCERS [OTHER...].
  """
  # OTHER is optional, so only the first four arguments are mandatory.
  if len(args) < 4:
    usage_error()

  input_data, final_output, vcf, num_reducers = args[:4]
  other = args[4:]

  # Stays None until the pathset has been loaded, so that a failure of the
  # job itself is not masked by an error in the finally clause.
  out_paths = None

  # Create a temporary pathset to reference the recab_table
  # output directory
  with tempfile.NamedTemporaryFile(mode='r+b') as tmp_pathset_file:
    try:
      run_recab(input_data, tmp_pathset_file.name, vcf, num_reducers, other)
      # The file is handed to the job by name (presumably the launcher writes
      # the output pathset there); rewind our own handle before reading it.
      tmp_pathset_file.seek(0)
      out_paths = pathset.FilePathset.from_file(tmp_pathset_file)
      collect_table(out_paths, final_output)
    finally:
      if out_paths is not None:
        cleanup(out_paths)


if __name__ == "__main__":
  main(sys.argv[1:])

# vim: et ai ts=2 sw=2