0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright (C) 2011-2014 CRS4.
|
|
4 #
|
|
5 # This file is part of Seal.
|
|
6 #
|
|
7 # Seal is free software: you can redistribute it and/or modify it
|
|
8 # under the terms of the GNU General Public License as published by the Free
|
|
9 # Software Foundation, either version 3 of the License, or (at your option)
|
|
10 # any later version.
|
|
11 #
|
|
12 # Seal is distributed in the hope that it will be useful, but
|
|
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
|
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
15 # for more details.
|
|
16 #
|
|
17 # You should have received a copy of the GNU General Public License along
|
|
18 # with Seal. If not, see <http://www.gnu.org/licenses/>.
|
|
19
|
|
20
|
|
21
|
|
22 """
|
|
23 Calls the Seal Demux tool. Then, it calls the custom galaxy integration script
|
|
24 split_demux_output.py to generate one Galaxy dataset per each sample extracted
|
|
25 by Demux.
|
|
26 """
|
|
27
|
|
28 # parameters:
|
|
29 # INPUT_DATA
|
|
30 # MISMATCHES
|
|
31 # NEW_FILE_PATH
|
|
32 # NUM_REDUCERS
|
|
33 # OUTPUT1
|
|
34 # OUTPUT_ID
|
|
35 # SAMPLE_SHEET
|
|
36 # INPUT_FORMAT
|
|
37 # OUTPUT_FORMAT
|
|
38 # OUTPUT_COMPRESSION
|
|
# INDEX_PRESENT
# SEPARATE_READS
|
|
40
|
|
41 import os
|
|
42 import re
|
|
43 import subprocess
|
|
44 import sys
|
|
45
|
|
46 # XXX: add --append-python-path to the possible arguments?
|
|
47
|
|
def parse_indexed(s):
    """
    Interpret a textual indexed/notindexed flag.

    Matching ignores letter case and surrounding whitespace.

    Returns True for 'indexed', False for 'notindexed', and None when `s`
    is None or holds any other text (i.e. the flag could not be parsed).
    """
    if s is None:
        return None
    token = s.lower().strip()
    # unknown tokens fall through to None: failed to parse
    return {'indexed': True, 'notindexed': False}.get(token)
|
|
56
|
|
def parse_index_present(param):
    """
    Determine whether the run has an index read.

    `param` is either the flag itself ('indexed' / 'notindexed', as understood
    by parse_indexed) or the path of a file whose first line has the form
    "<uri><TAB><flag>".

    Returns True or False when the flag can be determined, or None when
    `param` is neither a recognized flag nor the path of an existing file.

    Raises RuntimeError when the file exists but its first line does not
    yield a recognizable flag (including when the line contains no tab).
    """
    is_indexed = parse_indexed(param)
    if is_indexed is not None:
        return is_indexed

    # Not a literal flag: try to read it as a file.
    if not os.path.isfile(param):
        return None

    with open(param) as f:
        # Only the first line matters; cap the read at 10000 characters.
        contents = f.readline(10000)

    # partition() never raises: a line without a tab leaves `value` empty, so
    # a malformed file ends in the descriptive RuntimeError below rather than
    # an opaque ValueError from tuple unpacking.
    _uri, _sep, value = contents.partition("\t")
    is_indexed = parse_indexed(value)
    if is_indexed is None:
        # Truncate the echoed text so that it matches what the message claims.
        raise RuntimeError("Error determining whether run has an index read. " +
                           "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents[:1000])
    return is_indexed
|
|
70
|
|
def usage_error(msg=None):
    """
    Report a command line usage error on stderr and terminate the program.

    Writes "Usage error", then `msg` if one is given, then the usage line
    listing the expected positional arguments. Never returns: it exits the
    process with status 1 (raises SystemExit).
    """
    # sys.stderr.write is used instead of the Python-2-only `print >>`
    # statement so that the function behaves the same under Python 2 and 3.
    sys.stderr.write("Usage error\n")
    if msg:
        sys.stderr.write("%s\n" % (msg,))
    sys.stderr.write("Usage: %s %s\n" % (
        os.path.basename(sys.argv[0]),
        "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"))
    sys.exit(1)
|
|
78
|
|
79
|
|
if __name__ == "__main__":
    # The script name plus exactly 12 positional arguments are required.
    if len(sys.argv) != 13:
        usage_error()

    input_data = sys.argv[1]
    mismatches = sys.argv[2]
    new_file_path = sys.argv[3]
    num_reducers = sys.argv[4]
    output1 = sys.argv[5]
    output_id = sys.argv[6]
    sample_sheet = sys.argv[7]
    input_format = sys.argv[8]
    output_format = sys.argv[9]
    output_compression = sys.argv[10]
    index_present = sys.argv[11]
    separate_reads = sys.argv[12]

    # split_demux_output.py is looked up in the same directory as this script.
    mydir = os.path.abspath(os.path.dirname(__file__))

    # Run the demux program
    cmd = [
        'hadoop_galaxy',
        '--input', input_data,
        '--input-format', input_format,  # --input-format for hadoop-galaxy
        '--output', output1,
        '--executable', 'seal',
        'demux',
        '--sample-sheet', sample_sheet,
        '--input-format', input_format,  # --input-format for seal demux
        '--output-format', output_format,
    ]

    # Only forward the reducer count when the whole value is a non-negative
    # integer. The pattern is anchored at the end so that something like
    # "5abc" is not passed through to the tool.
    if re.match(r'\s*\d+\s*$', num_reducers):
        cmd.extend(('--num-reducers', num_reducers.strip()))

    if output_compression.lower() != 'none':
        cmd.extend(('--compress-output', output_compression))

    if mismatches != '0':
        cmd.extend(('--mismatches', mismatches))

    # parse_index_present may also return None (undetermined); in that case,
    # as for True, no option is added.
    is_indexed = parse_index_present(index_present)
    if is_indexed is False:
        cmd.append("--no-index")

    norm_separate_reads = separate_reads.lower().strip()
    if norm_separate_reads == 'separate-reads':
        cmd.append("--separate-reads")
    elif norm_separate_reads.startswith('f'):
        # any value starting with 'f' (e.g. "false") means: don't separate
        pass
    else:
        raise RuntimeError("Unrecognized value for separate-reads parameter: '%s'" % separate_reads)

    # Log the command line; sys.stderr.write works under both Python 2 and 3,
    # unlike the `print >>` statement.
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)

    ###
    # now the second phase: split_demux_output.py
    cmd = [
        os.path.join(mydir, 'split_demux_output.py'),
        output_id, output1, new_file_path,
    ]
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)
|