0
|
1 import gzip
|
|
2 import os
|
|
3
|
|
4 import datetime
|
|
5 import re
|
|
6 import shlex
|
|
7 import shutil
|
|
8 import string
|
|
9 import subprocess
|
|
10 import types
|
|
11 #from itertools import izip
|
|
12
|
|
13 import marshal
|
|
14 import sys
|
|
15
|
|
16
|
|
17
|
|
18 _rc_trans = string.maketrans('ACGT', 'TGCA')
|
|
19 def reverse_compl_seq(strseq):
|
|
20 return strseq.translate(_rc_trans)[::-1]
|
|
21
|
|
22
|
|
23
|
|
24 def show_version() :
|
|
25 print ""
|
|
26 print " BS-Seeker2 v2.0.3 - May 19, 2013 "
|
|
27 print ""
|
|
28
|
|
29
|
|
30
|
|
31 """
|
|
32 IUPAC nucleotide code Base
|
|
33 A Adenine
|
|
34 C Cytosine
|
|
35 G Guanine
|
|
36 T (or U) Thymine (or Uracil)
|
|
37 R A or G
|
|
38 Y C or T
|
|
39 S G or C
|
|
40 W A or T
|
|
41 K G or T
|
|
42 M A or C
|
|
43 B C or G or T
|
|
44 D A or G or T
|
|
45 H A or C or T
|
|
46 V A or C or G
|
|
47 N any base
|
|
48 . or - gap
|
|
49 """
|
|
50
|
|
51
|
|
52 def IUPAC ( nuc ) :
|
|
53 if nuc == 'R' :
|
|
54 return ('A','G')
|
|
55 elif nuc == 'Y' :
|
|
56 return ('C', 'T')
|
|
57 elif nuc == 'S' :
|
|
58 return ('G', 'C')
|
|
59 elif nuc == 'W' :
|
|
60 return ('A', 'T')
|
|
61 elif nuc == 'K' :
|
|
62 return ('G','T')
|
|
63 elif nuc == 'M' :
|
|
64 return ('A','C')
|
|
65 elif nuc == 'B' :
|
|
66 return ('C', 'G', 'T')
|
|
67 elif nuc == 'D' :
|
|
68 return ('A', 'G', 'T')
|
|
69 elif nuc == 'H' :
|
|
70 return ('A', 'C', 'T')
|
|
71 elif nuc == 'V' :
|
|
72 return ('A', 'C', 'G')
|
|
73 elif nuc == 'N' :
|
|
74 return ('A', 'C', 'G', 'T')
|
|
75 else :
|
|
76 return (nuc)
|
|
77
|
|
78
|
|
79 def uniq(inlist):
|
|
80 # order preserving
|
|
81 uniques = []
|
|
82 for item in inlist:
|
|
83 if item not in uniques:
|
|
84 uniques.append(item)
|
|
85 return uniques
|
|
86
|
|
87 from itertools import product
|
|
88
|
|
89 def EnumerateIUPAC ( context_lst ) :
|
|
90 tag_list = []
|
|
91 # context_lst = [context]
|
|
92 for one_context in context_lst :
|
|
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) :
|
|
94 tag_list.append(''.join(m))
|
|
95 return uniq(tag_list)
|
|
96
|
|
97 from itertools import product
|
|
98
|
|
99 # example: cut3_context="CGG"
|
|
100 # return generator for : ["CGG","TGG"]
|
|
101 # wild-card C to both C and T
|
|
102 def Enumerate_C_to_CT ( cut3_context_lst ) :
|
|
103 tag_list = []
|
|
104 for context in cut3_context_lst :
|
|
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) :
|
|
106 tag_list.append(''.join(m))
|
|
107 return uniq(tag_list)
|
|
108
|
|
109 #-------------------------------------------------------------------------------------
|
|
110
|
|
111 # set a reasonable defaults
|
|
112 def find_location(program):
|
|
113 def is_exe(fpath):
|
|
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
|
|
115 for path in os.environ["PATH"].split(os.pathsep):
|
|
116 if is_exe(os.path.join(path, program)):
|
|
117 return path
|
|
118
|
|
119 return None
|
|
120
|
|
121 BOWTIE = 'bowtie'
|
|
122 BOWTIE2 = 'bowtie2'
|
|
123 SOAP = 'soap'
|
|
124 RMAP = 'rmap'
|
|
125
|
|
126 supported_aligners = [
|
|
127 BOWTIE,
|
|
128 BOWTIE2,
|
|
129 SOAP,
|
|
130 RMAP
|
|
131 ]
|
|
132
|
|
133 aligner_options_prefixes = { BOWTIE : '--bt-',
|
|
134 BOWTIE2 : '--bt2-',
|
|
135 SOAP : '--soap-',
|
|
136 RMAP : '--rmap-' }
|
|
137
|
|
138 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
|
|
139 for aligner, default_path in
|
|
140 [(BOWTIE,'~/bowtie-0.12.7/'),
|
|
141 (BOWTIE2, '~/bowtie-0.12.7/'),
|
|
142 (SOAP, '~/soap2.21release/'),
|
|
143 (RMAP, '~/rmap_v2.05/bin')
|
|
144 ])
|
|
145
|
|
146
|
|
147 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes')
|
|
148
|
|
149
|
|
150
|
|
151 def error(msg):
|
|
152 print >> sys.stderr, 'ERROR: %s' % msg
|
|
153 exit(1)
|
|
154
|
|
155
|
|
156 global_stime = datetime.datetime.now()
|
|
157 def elapsed(msg = None):
|
|
158 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
|
|
159
|
|
160 elapsed.stime = datetime.datetime.now()
|
|
161
|
|
162 elapsed.stime = datetime.datetime.now()
|
|
163
|
|
164
|
|
165 def open_log(fname):
|
|
166 open_log.logfile = open(fname, 'w', 1)
|
|
167
|
|
168 def logm(message):
|
|
169 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
|
|
170 print log_message,
|
|
171 open_log.logfile.write(log_message)
|
|
172
|
|
173 def close_log():
|
|
174 open_log.logfile.close()
|
|
175
|
|
176
|
|
177
|
|
178
|
|
179 def clear_dir(path):
|
|
180 """ If path does not exist, it creates a new directory.
|
|
181 If path points to a directory, it deletes all of its content.
|
|
182 If path points to a file, it raises an exception."""
|
|
183
|
|
184 if os.path.exists(path):
|
|
185 if not os.path.isdir(path):
|
|
186 error("%s is a file. Please, delete it manually!" % path)
|
|
187 else:
|
|
188 for the_file in os.listdir(path):
|
|
189 file_path = os.path.join(path, the_file)
|
|
190 try:
|
|
191 if os.path.isfile(file_path):
|
|
192 os.unlink(file_path)
|
|
193 elif os.path.isdir(file_path):
|
|
194 shutil.rmtree(file_path)
|
|
195 except Exception, e:
|
|
196 print e
|
|
197 else:
|
|
198 os.mkdir(path)
|
|
199
|
|
200
|
|
201 def delete_files(*filenames):
|
|
202 # return
|
|
203 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too"""
|
|
204
|
|
205 for fname in filenames:
|
|
206 if type(fname) in [list, types.GeneratorType]:
|
|
207 delete_files(*list(fname))
|
|
208 elif os.path.isdir(fname):
|
|
209 shutil.rmtree(fname)
|
|
210 else:
|
|
211 os.remove(fname)
|
|
212
|
|
213 def split_file(filename, output_prefix, nlines):
|
|
214 """ Splits a file (equivalend to UNIX split -l ) """
|
|
215 fno = 0
|
|
216 lno = 0
|
|
217 INPUT = open(filename, 'r')
|
|
218 output = None
|
|
219 for l in INPUT:
|
|
220 if lno == 0:
|
|
221 fno += 1
|
|
222 if output is not None: output.close()
|
|
223 output = open('%s%d' % (output_prefix, fno), 'w')
|
|
224 lno = nlines
|
|
225 output.write(l)
|
|
226 lno -= 1
|
|
227 output.close()
|
|
228 INPUT.close()
|
|
229
|
|
230 def isplit_file(filename, output_prefix, nlines):
|
|
231 """ Splits a file (equivalend to UNIX split -l ) """
|
|
232 fno = 0
|
|
233 lno = 0
|
|
234 try :
|
|
235 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
|
|
236 except IOError :
|
|
237 print "[Error] Cannot find file : %s !" % filename
|
|
238 exit(-1)
|
|
239 output = None
|
|
240 output_fname = None
|
|
241 for l in input:
|
|
242 if lno == 0:
|
|
243
|
|
244 if output is not None:
|
|
245 output.close()
|
|
246 yield output_fname
|
|
247
|
|
248 fno += 1
|
|
249 output_fname = '%s%d' % (output_prefix, fno)
|
|
250 # output_fname = '%s_0' % output_prefix
|
|
251 output = open(output_fname, 'w')
|
|
252 lno = nlines
|
|
253 output.write(l)
|
|
254 lno -= 1
|
|
255 if output is not None:
|
|
256 output.close()
|
|
257 yield output_fname
|
|
258
|
|
259 input.close()
|
|
260
|
|
261 def read_fasta(fasta_file):
|
|
262 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
|
|
263 """
|
|
264
|
|
265 try :
|
|
266 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
|
|
267 except IOError:
|
|
268 print "[Error] Cannot find fasta file : %s !" % fasta_file
|
|
269 exit(-1)
|
|
270 sanitize = re.compile(r'[^ACTGN]')
|
|
271 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
|
|
272
|
|
273 chrom_seq = []
|
|
274 chrom_id = None
|
|
275 seen_ids = set()
|
|
276
|
|
277 for line in input:
|
|
278 if line[0] == '>':
|
|
279 if chrom_id is not None:
|
|
280 yield chrom_id, ''.join(chrom_seq)
|
|
281
|
|
282 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
|
|
283
|
|
284 if chrom_id in seen_ids:
|
|
285 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
|
|
286 seen_ids.add(chrom_id)
|
|
287
|
|
288 chrom_seq = []
|
|
289
|
|
290 else:
|
|
291 chrom_seq.append(sanitize.sub('N', line.strip().upper()))
|
|
292
|
|
293 yield chrom_id, ''.join(chrom_seq)
|
|
294
|
|
295 input.close()
|
|
296
|
|
297
|
|
298 def serialize(obj, filename):
|
|
299 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
|
|
300 """
|
|
301 output = open(filename+'.data', 'w')
|
|
302 marshal.dump(obj, output)
|
|
303 output.close()
|
|
304
|
|
305 def deserialize(filename):
|
|
306 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
|
|
307 """
|
|
308 try:
|
|
309 input = open(filename+'.data')
|
|
310 except IOError:
|
|
311 print "\n[Error]:\n\t Cannot find file: %s.data" % filename
|
|
312 exit(-1)
|
|
313
|
|
314 obj = marshal.load(input)
|
|
315 input.close()
|
|
316 return obj
|
|
317
|
|
318
|
|
319
|
|
320 def run_in_parallel(commands):
|
|
321
|
|
322 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
|
|
323
|
|
324 logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
|
|
325 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
|
|
326 return_code = proc.wait()
|
|
327 logm('Finished: ' + commands[i][0])
|
|
328 if return_code != 0:
|
|
329 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code))
|
|
330 for _, stdout in commands:
|
|
331 if stdout is not None:
|
|
332 stdout.close()
|