Mercurial > repos > crs4 > seal_galaxy
view seal-galaxy-cc1b1911/seal/demux_galaxy.py @ 0:244073d9abc1 draft default tip
Uploaded
author | crs4 |
---|---|
date | Wed, 15 Oct 2014 09:41:10 -0400 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python

# Copyright (C) 2011-2014 CRS4.
#
# This file is part of Seal.
#
# Seal is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# Seal is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License along
# with Seal.  If not, see <http://www.gnu.org/licenses/>.

"""
Calls the Seal Demux tool.  Then, it calls the custom galaxy integration script
split_demux_output.py to generate one Galaxy dataset per each sample extracted
by Demux.
"""

# parameters (positional, in this order):
#     INPUT_DATA
#     MISMATCHES
#     NEW_FILE_PATH
#     NUM_REDUCERS
#     OUTPUT1
#     OUTPUT_ID
#     SAMPLE_SHEET
#     INPUT_FORMAT
#     OUTPUT_FORMAT
#     OUTPUT_COMPRESSION
#     INDEX_PRESENT
#     SEPARATE_READS

import os
import re
import subprocess
import sys

# XXX: add --append-python-path to the possible arguments?


def parse_indexed(s):
    """Interpret the 'indexed' / 'notindexed' keyword.

    The comparison ignores case and surrounding whitespace.

    Returns True for 'indexed', False for 'notindexed', and None when `s` is
    None or any other string (i.e. it could not be parsed).
    """
    if s is not None:
        normalized = s.lower().strip()
        if normalized == 'notindexed':
            return False
        elif normalized == 'indexed':
            return True
    return None  # failed to parse


def parse_index_present(param):
    """Determine whether the run has an index read.

    `param` is either the keyword itself (see parse_indexed) or the path of a
    file whose first line has the form "<uri>TAB<keyword>".

    Returns True or False when the keyword could be determined.  Returns None
    when `param` is neither a recognized keyword nor an existing file; the
    caller decides what to do in that case.

    Raises RuntimeError when `param` is a file but its first line does not
    provide a recognizable keyword (including when it has no tab separator).
    """
    is_indexed = parse_indexed(param)
    if is_indexed is None and os.path.isfile(param):
        # try to read it as a file
        with open(param) as f:
            contents = f.readline(10000)
        # partition() never fails: with no tab, `value` is empty and we fall
        # through to the descriptive RuntimeError below instead of dying with
        # an unpacking ValueError.
        _, _, value = contents.partition("\t")
        is_indexed = parse_indexed(value)
        if is_indexed is None:
            # Only echo the first 1000 characters, as the message states.
            raise RuntimeError(
                "Error determining whether run has an index read. " +
                "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents[:1000])
    return is_indexed


def usage_error(msg=None):
    """Write a usage message (preceded by `msg`, if given) to stderr and exit
    with status 1."""
    # sys.stderr.write is used rather than the print statement so that the
    # script behaves the same under Python 2 and Python 3.
    sys.stderr.write("Usage error\n")
    if msg:
        sys.stderr.write("%s\n" % msg)
    sys.stderr.write("Usage: %s %s\n" % (
        os.path.basename(sys.argv[0]),
        "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"))
    sys.exit(1)


def _build_demux_command(input_data, mismatches, num_reducers, output1,
                         sample_sheet, input_format, output_format,
                         output_compression, index_present, separate_reads):
    """Assemble the hadoop_galaxy / seal demux argument list.

    Raises RuntimeError for an unrecognized `separate_reads` value, or when
    `index_present` names a file that cannot be parsed.
    """
    cmd = [
        'hadoop_galaxy',
        '--input', input_data,
        '--input-format', input_format,  # --input-format for hadoop-galaxy
        '--output', output1,
        '--executable', 'seal',
        'demux',
        '--sample-sheet', sample_sheet,
        '--input-format', input_format,  # --input-format for seal demux
        '--output-format', output_format,
    ]
    # The pattern is anchored at the end so that only a plain non-negative
    # integer (optionally surrounded by whitespace) is forwarded; anything
    # else leaves the option out, as before.
    if re.match(r'\s*\d+\s*$', num_reducers):
        cmd.extend(('--num-reducers', num_reducers.strip()))
    if output_compression.lower() != 'none':
        cmd.extend(('--compress-output', output_compression))
    if mismatches != '0':
        cmd.extend(('--mismatches', mismatches))

    # Only an explicit "not indexed" adds the flag; an undetermined value
    # (None) is treated like "indexed".
    is_indexed = parse_index_present(index_present)
    if is_indexed is False:
        cmd.append("--no-index")

    norm_separate_reads = separate_reads.lower().strip()
    if norm_separate_reads == 'separate-reads':
        cmd.append("--separate-reads")
    elif norm_separate_reads.startswith('f'):
        pass  # any value starting with 'f' (e.g. "false") disables the option
    else:
        raise RuntimeError("Unrecognized value for separate-reads parameter:  '%s'" % separate_reads)
    return cmd


def _log_and_run(cmd):
    """Echo `cmd` to stderr, then run it; raises CalledProcessError on a
    non-zero exit status."""
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)


def main(argv=None):
    """Run seal demux through hadoop_galaxy, then split_demux_output.py.

    `argv` is the full argument vector (program name first); it defaults to
    sys.argv.  Exits with status 1 via usage_error() when the number of
    arguments is wrong.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 13:
        usage_error()

    (input_data, mismatches, new_file_path, num_reducers, output1, output_id,
     sample_sheet, input_format, output_format, output_compression,
     index_present, separate_reads) = argv[1:13]

    mydir = os.path.abspath(os.path.dirname(__file__))

    # Run the demux program
    _log_and_run(_build_demux_command(
        input_data, mismatches, num_reducers, output1, sample_sheet,
        input_format, output_format, output_compression, index_present,
        separate_reads))

    ###
    # now the second phase: split_demux_output.py
    _log_and_run([
        os.path.join(mydir, 'split_demux_output.py'),
        output_id, output1, new_file_path,
    ])


if __name__ == "__main__":
    main(sys.argv)