annotate BSseeker2/bs_utils/utils.py @ 1:8b26adf64adc draft default tip

V2.0.5
author weilong-guo
date Tue, 05 Nov 2013 01:55:39 -0500
parents e6df770c0e58
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
1 import gzip
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
2 import os
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
3
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
4 import datetime
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
5 import re
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
6 import shlex
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
7 import shutil
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
8 import string
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
9 import subprocess
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
10 import types
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
11 #from itertools import izip
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
12
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
13 import marshal
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
14 import sys
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
15
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
16
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
17
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
18 _rc_trans = string.maketrans('ACGT', 'TGCA')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
19 def reverse_compl_seq(strseq):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
20 return strseq.translate(_rc_trans)[::-1]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
21
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
22
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
23
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
24 def show_version() :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
25 print ""
1
weilong-guo
parents: 0
diff changeset
26 print " BS-Seeker2 v2.0.5 - Nov 5, 2013 "
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
27 print ""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
28
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
29
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
30
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
31 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
32 IUPAC nucleotide code Base
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
33 A Adenine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
34 C Cytosine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
35 G Guanine
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
36 T (or U) Thymine (or Uracil)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
37 R A or G
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
38 Y C or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
39 S G or C
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
40 W A or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
41 K G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
42 M A or C
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
43 B C or G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
44 D A or G or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
45 H A or C or T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
46 V A or C or G
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
47 N any base
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
48 . or - gap
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
49 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
50
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
51
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
52 def IUPAC ( nuc ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
53 if nuc == 'R' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
54 return ('A','G')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
55 elif nuc == 'Y' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
56 return ('C', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
57 elif nuc == 'S' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
58 return ('G', 'C')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
59 elif nuc == 'W' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
60 return ('A', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
61 elif nuc == 'K' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
62 return ('G','T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
63 elif nuc == 'M' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
64 return ('A','C')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
65 elif nuc == 'B' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
66 return ('C', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
67 elif nuc == 'D' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
68 return ('A', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
69 elif nuc == 'H' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
70 return ('A', 'C', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
71 elif nuc == 'V' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
72 return ('A', 'C', 'G')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
73 elif nuc == 'N' :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
74 return ('A', 'C', 'G', 'T')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
75 else :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
76 return (nuc)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
77
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
78
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
79 def uniq(inlist):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
80 # order preserving
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
81 uniques = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
82 for item in inlist:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
83 if item not in uniques:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
84 uniques.append(item)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
85 return uniques
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
86
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
87 from itertools import product
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
88
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
89 def EnumerateIUPAC ( context_lst ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
90 tag_list = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
91 # context_lst = [context]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
92 for one_context in context_lst :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
93 for m in product(*[ IUPAC(i) for i in list(one_context)]) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
94 tag_list.append(''.join(m))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
95 return uniq(tag_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
96
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
97 from itertools import product
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
98
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
99 # example: cut3_context="CGG"
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
100 # return generator for : ["CGG","TGG"]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
101 # wild-card C to both C and T
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
102 def Enumerate_C_to_CT ( cut3_context_lst ) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
103 tag_list = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
104 for context in cut3_context_lst :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
105 for m in product(*[i if (i is not 'C') else ('C','T') for i in context]) :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
106 tag_list.append(''.join(m))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
107 return uniq(tag_list)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
108
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
109 #-------------------------------------------------------------------------------------
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
110
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
111 # set a reasonable defaults
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
112 def find_location(program):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
113 def is_exe(fpath):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
114 return os.path.exists(fpath) and os.access(fpath, os.X_OK)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
115 for path in os.environ["PATH"].split(os.pathsep):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
116 if is_exe(os.path.join(path, program)):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
117 return path
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
118
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
119 return None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
120
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
121 BOWTIE = 'bowtie'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
122 BOWTIE2 = 'bowtie2'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
123 SOAP = 'soap'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
124 RMAP = 'rmap'
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
125
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
126 supported_aligners = [
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
127 BOWTIE,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
128 BOWTIE2,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
129 SOAP,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
130 RMAP
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
131 ]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
132
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
133 aligner_options_prefixes = { BOWTIE : '--bt-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
134 BOWTIE2 : '--bt2-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
135 SOAP : '--soap-',
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
136 RMAP : '--rmap-' }
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
137
1
weilong-guo
parents: 0
diff changeset
138 #aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or default_path))
weilong-guo
parents: 0
diff changeset
139 # for aligner, default_path in
weilong-guo
parents: 0
diff changeset
140 # [(BOWTIE,'~/bowtie/'),
weilong-guo
parents: 0
diff changeset
141 # (BOWTIE2, '~/bowtie2/'),
weilong-guo
parents: 0
diff changeset
142 # (SOAP, '~/soap/'),
weilong-guo
parents: 0
diff changeset
143 # (RMAP, '~/rmap/bin')
weilong-guo
parents: 0
diff changeset
144 # ])
weilong-guo
parents: 0
diff changeset
145 aligner_path = dict((aligner, os.path.expanduser(find_location(aligner) or "None"))
weilong-guo
parents: 0
diff changeset
146 for aligner in [(BOWTIE), (BOWTIE2), (SOAP), (RMAP)])
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
147
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
148 reference_genome_path = os.path.join(os.path.split(globals()['__file__'])[0],'reference_genomes')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
149
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
150
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
151
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
152 def error(msg):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
153 print >> sys.stderr, 'ERROR: %s' % msg
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
154 exit(1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
155
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
156
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
157 global_stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
158 def elapsed(msg = None):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
159 print "[%s]" % msg if msg is not None else "+", "Last:" , datetime.datetime.now() - elapsed.stime, '\tTotal:', datetime.datetime.now() - global_stime
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
160
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
161 elapsed.stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
162
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
163 elapsed.stime = datetime.datetime.now()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
164
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
165
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
166 def open_log(fname):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
167 open_log.logfile = open(fname, 'w', 1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
168
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
169 def logm(message):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
170 log_message = "[%s] %s\n" % (datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), message)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
171 print log_message,
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
172 open_log.logfile.write(log_message)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
173
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
174 def close_log():
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
175 open_log.logfile.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
176
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
177
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
178
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
179
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
180 def clear_dir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
181 """ If path does not exist, it creates a new directory.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
182 If path points to a directory, it deletes all of its content.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
183 If path points to a file, it raises an exception."""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
184
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
185 if os.path.exists(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
186 if not os.path.isdir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
187 error("%s is a file. Please, delete it manually!" % path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
188 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
189 for the_file in os.listdir(path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
190 file_path = os.path.join(path, the_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
191 try:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
192 if os.path.isfile(file_path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
193 os.unlink(file_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
194 elif os.path.isdir(file_path):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
195 shutil.rmtree(file_path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
196 except Exception, e:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
197 print e
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
198 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
199 os.mkdir(path)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
200
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
201
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
202 def delete_files(*filenames):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
203 # return
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
204 """ Deletes a number of files. filenames can contain generator expressions and/or lists, too"""
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
205
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
206 for fname in filenames:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
207 if type(fname) in [list, types.GeneratorType]:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
208 delete_files(*list(fname))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
209 elif os.path.isdir(fname):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
210 shutil.rmtree(fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
211 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
212 os.remove(fname)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
213
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
214 def split_file(filename, output_prefix, nlines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
215 """ Splits a file (equivalend to UNIX split -l ) """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
216 fno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
217 lno = 0
1
weilong-guo
parents: 0
diff changeset
218 if filename.endswith(".gz") :
weilong-guo
parents: 0
diff changeset
219 INPUT = gzip.open(filename, 'rb')
weilong-guo
parents: 0
diff changeset
220 else :
weilong-guo
parents: 0
diff changeset
221 INPUT = open(filename, 'r')
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
222 output = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
223 for l in INPUT:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
224 if lno == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
225 fno += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
226 if output is not None: output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
227 output = open('%s%d' % (output_prefix, fno), 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
228 lno = nlines
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
229 output.write(l)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
230 lno -= 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
231 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
232 INPUT.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
233
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
234 def isplit_file(filename, output_prefix, nlines):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
235 """ Splits a file (equivalend to UNIX split -l ) """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
236 fno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
237 lno = 0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
238 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
239 input = (gzip.open if filename.endswith('.gz') else open)(filename, 'r')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
240 except IOError :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
241 print "[Error] Cannot find file : %s !" % filename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
242 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
243 output = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
244 output_fname = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
245 for l in input:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
246 if lno == 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
247
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
248 if output is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
249 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
250 yield output_fname
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
251
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
252 fno += 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
253 output_fname = '%s%d' % (output_prefix, fno)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
254 # output_fname = '%s_0' % output_prefix
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
255 output = open(output_fname, 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
256 lno = nlines
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
257 output.write(l)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
258 lno -= 1
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
259 if output is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
260 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
261 yield output_fname
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
262
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
263 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
264
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
265 def read_fasta(fasta_file):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
266 """ Iterates over all sequences in a fasta file. One at a time, without reading the whole file into the main memory.
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
267 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
268
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
269 try :
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
270 input = (gzip.open if fasta_file.endswith('.gz') else open)(fasta_file)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
271 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
272 print "[Error] Cannot find fasta file : %s !" % fasta_file
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
273 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
274 sanitize = re.compile(r'[^ACTGN]')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
275 sanitize_seq_id = re.compile(r'[^A-Za-z0-9]')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
276
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
277 chrom_seq = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
278 chrom_id = None
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
279 seen_ids = set()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
280
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
281 for line in input:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
282 if line[0] == '>':
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
283 if chrom_id is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
284 yield chrom_id, ''.join(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
285
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
286 chrom_id = sanitize_seq_id.sub('_', line.split()[0][1:])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
287
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
288 if chrom_id in seen_ids:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
289 error('BS Seeker found identical sequence ids (id: %s) in the fasta file: %s. Please, make sure that all sequence ids are unique and contain only alphanumeric characters: A-Za-z0-9_' % (chrom_id, fasta_file))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
290 seen_ids.add(chrom_id)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
291
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
292 chrom_seq = []
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
293
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
294 else:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
295 chrom_seq.append(sanitize.sub('N', line.strip().upper()))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
296
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
297 yield chrom_id, ''.join(chrom_seq)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
298
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
299 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
300
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
301
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
302 def serialize(obj, filename):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
303 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
304 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
305 output = open(filename+'.data', 'w')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
306 marshal.dump(obj, output)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
307 output.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
308
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
309 def deserialize(filename):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
310 """ Be careful what you serialize and deseriazlize! marshal.load is not secure!
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
311 """
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
312 try:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
313 input = open(filename+'.data')
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
314 except IOError:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
315 print "\n[Error]:\n\t Cannot find file: %s.data" % filename
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
316 exit(-1)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
317
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
318 obj = marshal.load(input)
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
319 input.close()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
320 return obj
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
321
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
322
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
323
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
324 def run_in_parallel(commands):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
325
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
326 commands = [(cmd[0], open(cmd[1], 'w')) if type(cmd) is tuple else (cmd, None) for cmd in commands]
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
327
1
weilong-guo
parents: 0
diff changeset
328 #logm('Starting commands:\n' + '\n'.join([cmd for cmd, stdout in commands]))
weilong-guo
parents: 0
diff changeset
329 logm('Starting commands:')
weilong-guo
parents: 0
diff changeset
330 for cmd, stdout in commands :
weilong-guo
parents: 0
diff changeset
331 logm("Launched: "+cmd)
0
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
332 for i, proc in enumerate([subprocess.Popen(args = shlex.split(cmd), stdout = stdout) for cmd, stdout in commands]):
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
333 return_code = proc.wait()
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
334 logm('Finished: ' + commands[i][0])
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
335 if return_code != 0:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
336 error('%s \nexit with an error code: %d. Please, check the log files.' % (commands[i], return_code))
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
337 for _, stdout in commands:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
338 if stdout is not None:
e6df770c0e58 Initial upload
weilong-guo
parents:
diff changeset
339 stdout.close()