comparison seal-galaxy-cc1b1911/seal/demux_galaxy.py @ 0:244073d9abc1 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:41:10 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:244073d9abc1
1 #!/usr/bin/env python
2
3 # Copyright (C) 2011-2014 CRS4.
4 #
5 # This file is part of Seal.
6 #
7 # Seal is free software: you can redistribute it and/or modify it
8 # under the terms of the GNU General Public License as published by the Free
9 # Software Foundation, either version 3 of the License, or (at your option)
10 # any later version.
11 #
12 # Seal is distributed in the hope that it will be useful, but
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 # for more details.
16 #
17 # You should have received a copy of the GNU General Public License along
18 # with Seal. If not, see <http://www.gnu.org/licenses/>.
19
20
21
22 """
23 Calls the Seal Demux tool. Then, it calls the custom galaxy integration script
24 split_demux_output.py to generate one Galaxy dataset per each sample extracted
25 by Demux.
26 """
27
# parameters (positional, in this order):
# INPUT_DATA
# MISMATCHES
# NEW_FILE_PATH
# NUM_REDUCERS
# OUTPUT1
# OUTPUT_ID
# SAMPLE_SHEET
# INPUT_FORMAT
# OUTPUT_FORMAT
# OUTPUT_COMPRESSION
# INDEX_PRESENT
# SEPARATE_READS
40
41 import os
42 import re
43 import subprocess
44 import sys
45
46 # XXX: add --append-python-path to the possible arguments?
47
def parse_indexed(s):
    """Translate an index-presence keyword into a boolean.

    The comparison ignores letter case and surrounding whitespace.

    Returns True for 'indexed', False for 'notindexed', and None when s is
    None or contains anything else (i.e. the value could not be parsed).
    """
    if s is None:
        return None
    keyword = s.lower().strip()
    # Unknown keywords fall through to None, signalling a failed parse.
    outcomes = {'indexed': True, 'notindexed': False}
    return outcomes.get(keyword)
56
def parse_index_present(param):
    """Determine whether the run has an index read.

    param is first interpreted directly as a keyword via parse_indexed().
    If that fails and param names an existing file, the first line of that
    file (at most 10000 characters) is read and expected to have the form
    <uri><TAB><keyword>.

    Returns True or False as parsed.  Returns None when param is neither a
    recognized keyword nor the path of an existing file.

    Raises RuntimeError when param names a file whose first line does not
    contain a tab-separated, recognizable keyword.
    """
    is_indexed = parse_indexed(param)
    if is_indexed is not None:
        return is_indexed

    # Not a keyword: try to read it as a file.
    if not os.path.isfile(param):
        return None

    with open(param) as f:
        contents = f.readline(10000)

    # partition() never raises on a missing tab, so a malformed line is
    # reported by the RuntimeError below rather than by a bare ValueError
    # from tuple unpacking.
    _, tab, value = contents.partition("\t")
    is_indexed = parse_indexed(value) if tab else None
    if is_indexed is None:
        # Show only the first 1000 characters, as the message states.
        raise RuntimeError("Error determining whether run has an index read. " +
            "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents[:1000])
    return is_indexed
70
def usage_error(msg=None):
    """Report a usage error on standard error and exit with status 1.

    msg, when given and non-empty, is printed on its own line between the
    "Usage error" banner and the usage synopsis.

    The text is emitted with sys.stderr.write() instead of the Python 2
    ``print >> sys.stderr`` statement, which raises TypeError under
    Python 3; the bytes written are the same as before.
    """
    sys.stderr.write("Usage error\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.stderr.write("%s %s %s\n" % (
        "Usage:",
        os.path.basename(sys.argv[0]),
        "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"))
    sys.exit(1)
78
79
def main(argv=None):
    """Run Seal Demux through hadoop_galaxy, then split its output.

    argv is the full argument vector (program name first) and defaults to
    sys.argv.  Exactly twelve positional arguments are expected, in the
    order printed by usage_error(); any other count prints the usage text
    and exits with status 1.

    Raises RuntimeError for an unrecognized SEPARATE_READS value or an
    unparseable index-present dataset, and subprocess.CalledProcessError
    when either external command exits with a non-zero status.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 13:
        usage_error()

    input_data = argv[1]
    mismatches = argv[2]
    new_file_path = argv[3]
    num_reducers = argv[4]
    output1 = argv[5]
    output_id = argv[6]
    sample_sheet = argv[7]
    input_format = argv[8]
    output_format = argv[9]
    output_compression = argv[10]
    index_present = argv[11]
    separate_reads = argv[12]

    # split_demux_output.py is looked up in the same directory as this file.
    mydir = os.path.abspath(os.path.dirname(__file__))

    # Run the demux program
    cmd = [
        'hadoop_galaxy',
        '--input', input_data,
        '--input-format', input_format,  # --input-format for hadoop-galaxy
        '--output', output1,
        '--executable', 'seal',
        'demux',
        '--sample-sheet', sample_sheet,
        '--input-format', input_format,  # --input-format for seal demux
        '--output-format', output_format,
    ]

    # Forward the reducer count only when it starts with a number.
    # NOTE(review): the pattern is not anchored at the end, so a value such
    # as '4x' is forwarded too -- confirm whether that is intended.
    if re.match(r'\s*\d+\s*', num_reducers):
        cmd.extend(('--num-reducers', num_reducers))

    if output_compression.lower() != 'none':
        cmd.extend(('--compress-output', output_compression))

    if mismatches != '0':
        cmd.extend(('--mismatches', mismatches))

    # Only an explicit "not indexed" answer adds --no-index; True and None
    # (undetermined) both leave the command unchanged.
    is_indexed = parse_index_present(index_present)
    if is_indexed is False:
        cmd.append("--no-index")

    norm_separate_reads = separate_reads.lower().strip()
    if norm_separate_reads == 'separate-reads':
        cmd.append("--separate-reads")
    elif not norm_separate_reads.startswith('f'):
        # Values starting with 'f' (e.g. 'false') mean "do not separate".
        raise RuntimeError("Unrecognized value for separate-reads parameter: '%s'" % separate_reads)

    # Echo the command line to stderr.  sys.stderr.write() replaces the
    # Python 2 ``print >> sys.stderr`` statement, which raises TypeError
    # under Python 3; the text written is unchanged.
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)

    ###
    # now the second phase: split_demux_output.py
    cmd = [
        os.path.join(mydir, 'split_demux_output.py'),
        output_id, output1, new_file_path,
    ]
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)


if __name__ == "__main__":
    main()