0
|
1 import gzip
|
|
2 import os
|
|
3
|
|
4 import datetime
|
|
5 import re
|
|
6 import shlex
|
|
7 import shutil
|
|
8 import string
|
|
9 import subprocess
|
|
10 import types
|
|
11 #from itertools import izip
|
|
12
|
|
13 import marshal
|
|
14 import sys
|
|
15
|
|
16
|
|
17
|
|
18 _rc_trans = string.maketrans('ACGT', 'TGCA')
|
|
19 def reverse_compl_seq(strseq):
|
|
20 return strseq.translate(_rc_trans)[::-1]
|
|
21
|
|
22
|
|
23
|
|
24 def show_version() :
|
|
25 print ""
|
1
|
26 print " BS-Seeker2 v2.0.5 - Nov 5, 2013 "
|
0
|
27 print ""
|
|
28
|
|
29
|
|
30
|
|
31 """
|
|
32 IUPAC nucleotide code Base
|
|
33 A Adenine
|
|
34 C Cytosine
|
|
35 G Guanine
|
|
36 T (or U) Thymine (or Uracil)
|
|
37 R A or G
|
|
38 Y C or T
|
|
39 S G or C
|
|
40 W A or T
|
|
41 K G or T
|
|
42 M A or C
|
|
43 B C or G or T
|
|
44 D A or G or T
|
|
45 H A or C or T
|
|
46 V A or C or G
|
|
47 N any base
|
|
48 . or - gap
|
|
49 """
|
|
50
|
|
51
|
|
52 def IUPAC ( nuc ) :
|
|
53 if nuc == 'R' :
|
|
54 return ('A','G')
|
|
55 elif nuc == 'Y' :
|
|
56 return ('C', 'T')
|
|
57 elif nuc == 'S' :
|
|
58 return ('G', 'C')
|
|
59 elif nuc == 'W' :
|
|
60 return ('A', 'T')
|
|
61 elif nuc == 'K' :
|
|
62 return ('G','T')
|
|
63 elif nuc == 'M' :
|
|
64 return ('A','C')
|
|
65 elif nuc == 'B' :
|
|
66 return ('C', 'G', 'T')
|
|
67 elif nuc == 'D' :
|
|
68 return ('A', 'G', 'T')
|
|
69 elif nuc == 'H' :
|
|
70 return ('A', 'C', 'T')
|
|
71 elif nuc == 'V' :
|
|
72 return ('A', 'C', 'G')
|
|
73 elif nuc == 'N' :
|
|
74 return ('A', 'C', 'G', 'T')
|
|
75 else :
|
|
76 return (nuc)
|
|
77
|
|
78
|
|
79 def uniq(inlist):
|
|
80 # order preserving
|
|
81 uniques = []
|
|
82 for item in inlist:
|
|
83 if item not in uniques:
|
|
84 uniques.append(item)
|
|
85 return uniques
|
|
86
|
|
87 from itertools import product
|
|
88
|
|
89 def EnumerateIUPAC ( context_lst ) :
|
|
90 tag_list = []
|
|
91 # context_lst = [context]
|
|
92 for one_context in context_lst :
|
|
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) :
|
|
94 tag_list.append(''.join(m))
|
|
95 return uniq(tag_list)
|
|
96
|
|
97 from itertools import product
|
|
98
|
|
99 # example: cut3_context="CGG"
|
|
100 # return generator for : ["CGG","TGG"]
|
|
101 # wild-card C to both C and T
|
|
102 def Enumerate_C_to_CT ( cut3_context_lst ) :
|
|
103 tag_list = []
|
|
104 for context in cut3_context_lst :
|
|
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) :
|
|
106 tag_list.append(''.join(m))
|
|
107 return uniq(tag_list)
|
|
108
|
|
109 #-------------------------------------------------------------------------------------
|
|
110
|
|
111 # set a reasonable defaults
|
|
112 def find_location(program):
|
|
113 def is_exe(fpath):
|
|
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
|
|
115 for path in os.environ["PATH"].split(os.pathsep):
|
|
116 if is_exe(os.path.join(path, program)):
|
|
117 return path
|
|
118
|
|
119 return None
|
|
120
|
|
121 BOWTIE = 'bowtie'
|
|
122 BOWTIE2 = 'bowtie2'
|
|
123 SOAP = 'soap'
|
|
124 RMAP = 'rmap'
|
|
125
|
|
126 supported_aligners = [
|
|
127 BOWTIE,
|
|
128 BOWTIE2,
|
|
129 SOAP,
|
|
130 RMAP
|
|
131 ]
|
|
132
|
|
133 aligner_options_prefixes = { BOWTIE : '--bt-',
|
|
134 BOWTIE2 : '--bt2-',
|
|
135 SOAP : '--soap-',
|
|
136 RMAP : '--rmap-' }
|
|
137
|
1
|
138 #aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
|
|
139 # for aligner, default_path in
|
|
140 # [(BOWTIE,'~/bowtie/'),
|
|
141 # (BOWTIE2, '~/bowtie2/'),
|
|
142 # (SOAP, '~/soap/'),
|
|
143 # (RMAP, '~/rmap/bin')
|
|
144 # ])
|
|
145 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or "None"))
|
|
146 for aligner in [(BOWTIE), (BOWTIE2), (SOAP), (RMAP)])
|
0
|
147
|
|
148 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes')
|
|
149
|
|
150
|
|
151
|
|
152 def error(msg):
|
|
153 print >> sys.stderr, 'ERROR: %s' % msg
|
|
154 exit(1)
|
|
155
|
|
156
|
|
157 global_stime = datetime.datetime.now()
|
|
158 def elapsed(msg = None):
|
|
159 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
|
|
160
|
|
161 elapsed.stime = datetime.datetime.now()
|
|
162
|
|
163 elapsed.stime = datetime.datetime.now()
|
|
164
|
|
165
|
|
166 def open_log(fname):
|
|
167 open_log.logfile = open(fname, 'w', 1)
|
|
168
|
|
169 def logm(message):
|
|
170 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
|
|
171 print log_message,
|
|
172 open_log.logfile.write(log_message)
|
|
173
|
|
174 def close_log():
|
|
175 open_log.logfile.close()
|
|
176
|
|
177
|
|
178
|
|
179
|
|
180 def clear_dir(path):
|
|
181 """ If path does not exist, it creates a new directory.
|
|
182 If path points to a directory, it deletes all of its content.
|
|
183 If path points to a file, it raises an exception."""
|
|
184
|
|
185 if os.path.exists(path):
|
|
186 if not os.path.isdir(path):
|
|
187 error("%s is a file. Please, delete it manually!" % path)
|
|
188 else:
|
|
189 for the_file in os.listdir(path):
|
|
190 file_path = os.path.join(path, the_file)
|
|
191 try:
|
|
192 if os.path.isfile(file_path):
|
|
193 os.unlink(file_path)
|
|
194 elif os.path.isdir(file_path):
|
|
195 shutil.rmtree(file_path)
|
|
196 except Exception, e:
|
|
197 print e
|
|
198 else:
|
|
199 os.mkdir(path)
|
|
200
|
|
201
|
|
202 def delete_files(*filenames):
|
|
203 # return
|
|
204 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too"""
|
|
205
|
|
206 for fname in filenames:
|
|
207 if type(fname) in [list, types.GeneratorType]:
|
|
208 delete_files(*list(fname))
|
|
209 elif os.path.isdir(fname):
|
|
210 shutil.rmtree(fname)
|
|
211 else:
|
|
212 os.remove(fname)
|
|
213
|
|
214 def split_file(filename, output_prefix, nlines):
|
|
215 """ Splits a file (equivalend to UNIX split -l ) """
|
|
216 fno = 0
|
|
217 lno = 0
|
1
|
218 if filename.endswith(".gz") :
|
|
219 INPUT = gzip.open(filename, 'rb')
|
|
220 else :
|
|
221 INPUT = open(filename, 'r')
|
0
|
222 output = None
|
|
223 for l in INPUT:
|
|
224 if lno == 0:
|
|
225 fno += 1
|
|
226 if output is not None: output.close()
|
|
227 output = open('%s%d' % (output_prefix, fno), 'w')
|
|
228 lno = nlines
|
|
229 output.write(l)
|
|
230 lno -= 1
|
|
231 output.close()
|
|
232 INPUT.close()
|
|
233
|
|
234 def isplit_file(filename, output_prefix, nlines):
|
|
235 """ Splits a file (equivalend to UNIX split -l ) """
|
|
236 fno = 0
|
|
237 lno = 0
|
|
238 try :
|
|
239 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
|
|
240 except IOError :
|
|
241 print "[Error] Cannot find file : %s !" % filename
|
|
242 exit(-1)
|
|
243 output = None
|
|
244 output_fname = None
|
|
245 for l in input:
|
|
246 if lno == 0:
|
|
247
|
|
248 if output is not None:
|
|
249 output.close()
|
|
250 yield output_fname
|
|
251
|
|
252 fno += 1
|
|
253 output_fname = '%s%d' % (output_prefix, fno)
|
|
254 # output_fname = '%s_0' % output_prefix
|
|
255 output = open(output_fname, 'w')
|
|
256 lno = nlines
|
|
257 output.write(l)
|
|
258 lno -= 1
|
|
259 if output is not None:
|
|
260 output.close()
|
|
261 yield output_fname
|
|
262
|
|
263 input.close()
|
|
264
|
|
265 def read_fasta(fasta_file):
|
|
266 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
|
|
267 """
|
|
268
|
|
269 try :
|
|
270 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
|
|
271 except IOError:
|
|
272 print "[Error] Cannot find fasta file : %s !" % fasta_file
|
|
273 exit(-1)
|
|
274 sanitize = re.compile(r'[^ACTGN]')
|
|
275 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
|
|
276
|
|
277 chrom_seq = []
|
|
278 chrom_id = None
|
|
279 seen_ids = set()
|
|
280
|
|
281 for line in input:
|
|
282 if line[0] == '>':
|
|
283 if chrom_id is not None:
|
|
284 yield chrom_id, ''.join(chrom_seq)
|
|
285
|
|
286 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
|
|
287
|
|
288 if chrom_id in seen_ids:
|
|
289 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
|
|
290 seen_ids.add(chrom_id)
|
|
291
|
|
292 chrom_seq = []
|
|
293
|
|
294 else:
|
|
295 chrom_seq.append(sanitize.sub('N', line.strip().upper()))
|
|
296
|
|
297 yield chrom_id, ''.join(chrom_seq)
|
|
298
|
|
299 input.close()
|
|
300
|
|
301
|
|
302 def serialize(obj, filename):
|
|
303 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
|
|
304 """
|
|
305 output = open(filename+'.data', 'w')
|
|
306 marshal.dump(obj, output)
|
|
307 output.close()
|
|
308
|
|
309 def deserialize(filename):
|
|
310 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
|
|
311 """
|
|
312 try:
|
|
313 input = open(filename+'.data')
|
|
314 except IOError:
|
|
315 print "\n[Error]:\n\t Cannot find file: %s.data" % filename
|
|
316 exit(-1)
|
|
317
|
|
318 obj = marshal.load(input)
|
|
319 input.close()
|
|
320 return obj
|
|
321
|
|
322
|
|
323
|
|
324 def run_in_parallel(commands):
|
|
325
|
|
326 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
|
|
327
|
1
|
328 #logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
|
|
329 logm('Starting commands:')
|
|
330 for cmd, stdout in commands :
|
|
331 logm("Launched: "+cmd)
|
0
|
332 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
|
|
333 return_code = proc.wait()
|
|
334 logm('Finished: ' + commands[i][0])
|
|
335 if return_code != 0:
|
|
336 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code))
|
|
337 for _, stdout in commands:
|
|
338 if stdout is not None:
|
|
339 stdout.close()
|