annotate BSseeker2/bs_utils/utils.py @ 0:e6df770c0e58 draft

Initial upload
author weilong-guo
date Fri, 12 Jul 2013 18:47:28 -0400
parents
children 8b26adf64adc
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import gzip
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 import os
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4 import datetime
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 import re
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 import shlex
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 import shutil
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 import string
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 import subprocess
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 import types
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11 #from itertools import izip
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13 import marshal
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 import sys
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 _rc_trans = string.maketrans('ACGT', 'TGCA')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 def reverse_compl_seq(strseq):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 return strseq.translate(_rc_trans)[::-1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 def show_version() :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25 print ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
26 print " BS-Seeker2 v2.0.3 - May 19, 2013 "
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 print ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 IUPAC nucleotide code Base
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 A Adenine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 C Cytosine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35 G Guanine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 T (or U) Thymine (or Uracil)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 R A or G
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 Y C or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 S G or C
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 W A or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 K G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 M A or C
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 B C or G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 D A or G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 H A or C or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 V A or C or G
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 N any base
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 . or - gap
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 def IUPAC ( nuc ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 if nuc == 'R' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 return ('A','G')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 elif nuc == 'Y' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 return ('C', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 elif nuc == 'S' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 return ('G', 'C')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 elif nuc == 'W' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 return ('A', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 elif nuc == 'K' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 return ('G','T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 elif nuc == 'M' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 return ('A','C')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 elif nuc == 'B' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 return ('C', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 elif nuc == 'D' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 return ('A', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 elif nuc == 'H' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 return ('A', 'C', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 elif nuc == 'V' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 return ('A', 'C', 'G')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 elif nuc == 'N' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 return ('A', 'C', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 return (nuc)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 def uniq(inlist):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 # order preserving
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 uniques = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 for item in inlist:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 if item not in uniques:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 uniques.append(item)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 return uniques
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 from itertools import product
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89 def EnumerateIUPAC ( context_lst ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 tag_list = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 # context_lst = [context]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92 for one_context in context_lst :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 tag_list.append(''.join(m))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95 return uniq(tag_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 from itertools import product
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 # example: cut3_context="CGG"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 # return generator for : ["CGG","TGG"]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 # wild-card C to both C and T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 def Enumerate_C_to_CT ( cut3_context_lst ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 tag_list = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 for context in cut3_context_lst :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 tag_list.append(''.join(m))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 return uniq(tag_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109 #-------------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 # set a reasonable defaults
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 def find_location(program):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 def is_exe(fpath):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 for path in os.environ["PATH"].split(os.pathsep):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 if is_exe(os.path.join(path, program)):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 return path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 return None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121 BOWTIE = 'bowtie'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122 BOWTIE2 = 'bowtie2'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 SOAP = 'soap'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 RMAP = 'rmap'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 supported_aligners = [
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 BOWTIE,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 BOWTIE2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 SOAP,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130 RMAP
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131 ]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133 aligner_options_prefixes = { BOWTIE : '--bt-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 BOWTIE2 : '--bt2-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 SOAP : '--soap-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 RMAP : '--rmap-' }
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
138 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
139 for aligner, default_path in
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
140 [(BOWTIE,'~/bowtie-0.12.7/'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
141 (BOWTIE2, '~/bowtie-0.12.7/'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
142 (SOAP, '~/soap2.21release/'),
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
143 (RMAP, '~/rmap_v2.05/bin')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
144 ])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
145
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
146
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151 def error(msg):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 print >> sys.stderr, 'ERROR: %s' % msg
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 exit(1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156 global_stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 def elapsed(msg = None):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160 elapsed.stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162 elapsed.stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165 def open_log(fname):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166 open_log.logfile = open(fname, 'w', 1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168 def logm(message):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 print log_message,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 open_log.logfile.write(log_message)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173 def close_log():
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 open_log.logfile.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179 def clear_dir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 """ If path does not exist, it creates a new directory.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181 If path points to a directory, it deletes all of its content.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182 If path points to a file, it raises an exception."""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184 if os.path.exists(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 if not os.path.isdir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 error("%s is a file. Please, delete it manually!" % path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188 for the_file in os.listdir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189 file_path = os.path.join(path, the_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190 try:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 if os.path.isfile(file_path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 os.unlink(file_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193 elif os.path.isdir(file_path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 shutil.rmtree(file_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
195 except Exception, e:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196 print e
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198 os.mkdir(path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
201 def delete_files(*filenames):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
202 # return
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too"""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205 for fname in filenames:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206 if type(fname) in [list, types.GeneratorType]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 delete_files(*list(fname))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208 elif os.path.isdir(fname):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209 shutil.rmtree(fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211 os.remove(fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
213 def split_file(filename, output_prefix, nlines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 """ Splits a file (equivalend to UNIX split -l ) """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 fno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216 lno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217 INPUT = open(filename, 'r')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
218 output = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
219 for l in INPUT:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
220 if lno == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
221 fno += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 if output is not None: output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223 output = open('%s%d' % (output_prefix, fno), 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 lno = nlines
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225 output.write(l)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 lno -= 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
227 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 INPUT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 def isplit_file(filename, output_prefix, nlines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231 """ Splits a file (equivalend to UNIX split -l ) """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 fno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233 lno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237 print "[Error] Cannot find file : %s !" % filename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 output = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240 output_fname = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 for l in input:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242 if lno == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 if output is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 yield output_fname
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 fno += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249 output_fname = '%s%d' % (output_prefix, fno)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 # output_fname = '%s_0' % output_prefix
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251 output = open(output_fname, 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252 lno = nlines
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 output.write(l)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 lno -= 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 if output is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 yield output_fname
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 def read_fasta(fasta_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268 print "[Error] Cannot find fasta file : %s !" % fasta_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270 sanitize = re.compile(r'[^ACTGN]')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
272
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
273 chrom_seq = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
274 chrom_id = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
275 seen_ids = set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
276
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
277 for line in input:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278 if line[0] == '>':
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
279 if chrom_id is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
280 yield chrom_id, ''.join(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284 if chrom_id in seen_ids:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286 seen_ids.add(chrom_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
287
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288 chrom_seq = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291 chrom_seq.append(sanitize.sub('N', line.strip().upper()))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293 yield chrom_id, ''.join(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
296
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
297
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298 def serialize(obj, filename):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301 output = open(filename+'.data', 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 marshal.dump(obj, output)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305 def deserialize(filename):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308 try:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 input = open(filename+'.data')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 print "\n[Error]:\n\t Cannot find file: %s.data" % filename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 obj = marshal.load(input)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 return obj
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 def run_in_parallel(commands):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 return_code = proc.wait()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327 logm('Finished: ' + commands[i][0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
328 if return_code != 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
329 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
330 for _, stdout in commands:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
331 if stdout is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332 stdout.close()