annotate seal-galaxy-cc1b1911/seal/demux_galaxy.py @ 0:244073d9abc1 draft default tip

Uploaded
author crs4
date Wed, 15 Oct 2014 09:41:10 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
244073d9abc1 Uploaded
crs4
parents:
diff changeset
1 #!/usr/bin/env python
244073d9abc1 Uploaded
crs4
parents:
diff changeset
2
244073d9abc1 Uploaded
crs4
parents:
diff changeset
3 # Copyright (C) 2011-2014 CRS4.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
4 #
244073d9abc1 Uploaded
crs4
parents:
diff changeset
5 # This file is part of Seal.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
6 #
244073d9abc1 Uploaded
crs4
parents:
diff changeset
7 # Seal is free software: you can redistribute it and/or modify it
244073d9abc1 Uploaded
crs4
parents:
diff changeset
8 # under the terms of the GNU General Public License as published by the Free
244073d9abc1 Uploaded
crs4
parents:
diff changeset
9 # Software Foundation, either version 3 of the License, or (at your option)
244073d9abc1 Uploaded
crs4
parents:
diff changeset
10 # any later version.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
11 #
244073d9abc1 Uploaded
crs4
parents:
diff changeset
12 # Seal is distributed in the hope that it will be useful, but
244073d9abc1 Uploaded
crs4
parents:
diff changeset
13 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
244073d9abc1 Uploaded
crs4
parents:
diff changeset
14 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
244073d9abc1 Uploaded
crs4
parents:
diff changeset
15 # for more details.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
16 #
244073d9abc1 Uploaded
crs4
parents:
diff changeset
17 # You should have received a copy of the GNU General Public License along
244073d9abc1 Uploaded
crs4
parents:
diff changeset
18 # with Seal. If not, see <http://www.gnu.org/licenses/>.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
19
244073d9abc1 Uploaded
crs4
parents:
diff changeset
20
244073d9abc1 Uploaded
crs4
parents:
diff changeset
21
244073d9abc1 Uploaded
crs4
parents:
diff changeset
22 """
244073d9abc1 Uploaded
crs4
parents:
diff changeset
23 Calls the Seal Demux tool. Then, it calls the custom galaxy integration script
244073d9abc1 Uploaded
crs4
parents:
diff changeset
24 split_demux_output.py to generate one Galaxy dataset per each sample extracted
244073d9abc1 Uploaded
crs4
parents:
diff changeset
25 by Demux.
244073d9abc1 Uploaded
crs4
parents:
diff changeset
26 """
244073d9abc1 Uploaded
crs4
parents:
diff changeset
27
244073d9abc1 Uploaded
crs4
parents:
diff changeset
28 # parameters:
244073d9abc1 Uploaded
crs4
parents:
diff changeset
29 # INPUT_DATA
244073d9abc1 Uploaded
crs4
parents:
diff changeset
30 # MISMATCHES
244073d9abc1 Uploaded
crs4
parents:
diff changeset
31 # NEW_FILE_PATH
244073d9abc1 Uploaded
crs4
parents:
diff changeset
32 # NUM_REDUCERS
244073d9abc1 Uploaded
crs4
parents:
diff changeset
33 # OUTPUT1
244073d9abc1 Uploaded
crs4
parents:
diff changeset
34 # OUTPUT_ID
244073d9abc1 Uploaded
crs4
parents:
diff changeset
35 # SAMPLE_SHEET
244073d9abc1 Uploaded
crs4
parents:
diff changeset
36 # INPUT_FORMAT
244073d9abc1 Uploaded
crs4
parents:
diff changeset
37 # OUTPUT_FORMAT
244073d9abc1 Uploaded
crs4
parents:
diff changeset
# OUTPUT_COMPRESSION
# INDEX_PRESENT
# SEPARATE_READS
244073d9abc1 Uploaded
crs4
parents:
diff changeset
40
244073d9abc1 Uploaded
crs4
parents:
diff changeset
41 import os
244073d9abc1 Uploaded
crs4
parents:
diff changeset
42 import re
244073d9abc1 Uploaded
crs4
parents:
diff changeset
43 import subprocess
244073d9abc1 Uploaded
crs4
parents:
diff changeset
44 import sys
244073d9abc1 Uploaded
crs4
parents:
diff changeset
45
244073d9abc1 Uploaded
crs4
parents:
diff changeset
46 # XXX: add --append-python-path to the possible arguments?
244073d9abc1 Uploaded
crs4
parents:
diff changeset
47
244073d9abc1 Uploaded
crs4
parents:
diff changeset
def parse_indexed(s):
    """Translate an 'indexed' / 'notindexed' keyword into a boolean.

    The comparison ignores case and surrounding whitespace.

    Returns True for 'indexed', False for 'notindexed', and None when `s`
    is None or is not one of the two recognized keywords.
    """
    if s is None:
        return None
    keyword = s.lower().strip()
    # Anything that is not a known keyword falls through to None
    # (i.e. "failed to parse").
    return {'indexed': True, 'notindexed': False}.get(keyword)
244073d9abc1 Uploaded
crs4
parents:
diff changeset
56
244073d9abc1 Uploaded
crs4
parents:
diff changeset
def parse_index_present(param):
    """Determine whether the run has an index read.

    `param` is first interpreted directly through parse_indexed().  If that
    fails and `param` names an existing file, the first line of the file
    (at most 10000 characters) is read and expected to have the form
    ``<uri>TAB<keyword>``; the keyword is then given to parse_indexed().

    Returns whatever parse_indexed() returned for the keyword (not None).
    Returns None when `param` is neither a recognized keyword nor an
    existing file.
    NOTE(review): the None return for a non-file value is reconstructed from
    a copy of the source whose indentation was lost -- confirm against the
    repository.

    Raises RuntimeError when the file exists but its first line has no tab
    separator or carries an unrecognized keyword.
    """
    is_indexed = parse_indexed(param)
    if is_indexed is not None:
        return is_indexed

    # Not a literal keyword: try to read it as a file.
    if param is None or not os.path.isfile(param):
        return None

    # Only the read needs the file open; parse after it has been closed.
    with open(param) as f:
        contents = f.readline(10000)

    # partition() never raises, so a line without a tab ends up in the
    # RuntimeError below instead of failing with a bare unpacking ValueError.
    _uri, separator, value = contents.partition("\t")
    is_indexed = parse_indexed(value) if separator else None
    if is_indexed is None:
        # Truncate so the message really shows what it claims to show.
        raise RuntimeError("Error determining whether run has an index read. " +
            "Couldn't parse the dataset that was supposed to specify it (first 1000 chars): %s" % contents[:1000])
    return is_indexed
244073d9abc1 Uploaded
crs4
parents:
diff changeset
70
244073d9abc1 Uploaded
crs4
parents:
diff changeset
def usage_error(msg=None):
    """Write a usage message to standard error and exit with status 1.

    msg -- optional explanation, written on its own line between the
           "Usage error" banner and the usage synopsis; skipped when it is
           None or empty.

    Never returns: always terminates through sys.exit(1).
    """
    # sys.stderr.write() is used instead of the Python 2-only
    # "print >> sys.stderr" statement; the emitted text is identical, and the
    # function also works when the script is run by a Python 3 interpreter.
    lines = ["Usage error"]
    if msg:
        lines.append("%s" % msg)
    lines.append("Usage: %s %s" % (
        os.path.basename(sys.argv[0]),
        "INPUT_DATA MISMATCHES NEW_FILE_PATH NUM_REDUCERS OUTPUT1 OUTPUT_ID SAMPLE_SHEET INPUT_FORMAT OUTPUT_FORMAT OUTPUT_COMPRESSION INDEX_PRESENT SEPARATE_READS"))
    sys.stderr.write("\n".join(lines) + "\n")
    sys.exit(1)
244073d9abc1 Uploaded
crs4
parents:
diff changeset
78
244073d9abc1 Uploaded
crs4
parents:
diff changeset
79
244073d9abc1 Uploaded
crs4
parents:
diff changeset
if __name__ == "__main__":
    # The program name plus exactly 12 positional arguments are required
    # (see the synopsis printed by usage_error).
    if len(sys.argv) != 13:
        usage_error()

    (input_data, mismatches, new_file_path, num_reducers, output1, output_id,
     sample_sheet, input_format, output_format, output_compression,
     index_present, separate_reads) = sys.argv[1:13]

    # split_demux_output.py is looked up next to this script.
    mydir = os.path.abspath(os.path.dirname(__file__))

    # Run the demux program
    cmd = [
        'hadoop_galaxy',
        '--input', input_data,
        '--input-format', input_format,  # --input-format for hadoop-galaxy
        '--output', output1,
        '--executable', 'seal',
        'demux',
        '--sample-sheet', sample_sheet,
        '--input-format', input_format,  # --input-format for seal demux
        '--output-format', output_format,
    ]

    # Forward the reducer count only when it is a plain non-negative integer.
    # The pattern is anchored at the end: re.match() only anchors the start,
    # so without '$' a value such as "4abc" would be accepted and forwarded.
    if re.match(r'\s*\d+\s*$', num_reducers):
        cmd.extend(('--num-reducers', num_reducers))

    if output_compression.lower() != 'none':
        cmd.extend(('--compress-output', output_compression))

    # '0' means the option is simply not passed on.
    if mismatches != '0':
        cmd.extend(('--mismatches', mismatches))

    # Only an explicit False adds --no-index; True and None (undetermined)
    # both leave the command line untouched.
    is_indexed = parse_index_present(index_present)
    if is_indexed is False:
        cmd.append("--no-index")

    norm_separate_reads = separate_reads.lower().strip()
    if norm_separate_reads == 'separate-reads':
        cmd.append("--separate-reads")
    elif not norm_separate_reads.startswith('f'):
        # Values starting with 'f' (e.g. "false") mean "don't separate";
        # anything else is rejected.
        raise RuntimeError("Unrecognized value for separate-reads parameter: '%s'" % separate_reads)

    # Echo the command on stderr before running it.  sys.stderr.write() emits
    # the same text as the Python 2-only "print >> sys.stderr" statement and
    # also works under Python 3.
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)

    ###
    # now the second phase: split_demux_output.py
    cmd = [
        os.path.join(mydir, 'split_demux_output.py'),
        output_id, output1, new_file_path,
    ]
    sys.stderr.write(' '.join(cmd) + "\n")
    subprocess.check_call(cmd)