Mercurial > repos > cafletezbrant > kmersvm
annotate kmersvm/scripts/nullseq_generate.py @ 7:fd740d515502 draft default tip
Uploaded revised kmer-SVM to include modules from kmer-visual.
author | cafletezbrant |
---|---|
date | Sun, 16 Jun 2013 18:06:14 -0400 |
parents | 7fe1103032f7 |
children |
rev | line source |
---|---|
0 | 1 import os |
2 import os.path | |
3 import sys | |
4 import random | |
5 import optparse | |
6 | |
7 from bitarray import bitarray | |
8 | |
9 def read_bed_file(filename): | |
10 """ | |
11 """ | |
12 f = open(filename, 'r') | |
13 lines = f.readlines() | |
14 f.close() | |
15 | |
16 positions = [] | |
17 | |
18 for line in lines: | |
19 if line[0] == '#': | |
20 continue | |
21 | |
22 l = line.split() | |
23 | |
24 positions.append((l[0], int(l[1]), int(l[2]))) | |
25 | |
26 return positions | |
27 | |
28 | |
29 def bitarray_fromfile(filename): | |
30 """ | |
31 """ | |
32 fh = open(filename, 'rb') | |
33 bits = bitarray() | |
34 bits.fromfile(fh) | |
35 | |
36 return bits, fh | |
37 | |
38 | |
39 def make_profile(positions, buildname, basedir): | |
40 """ | |
41 """ | |
42 chrnames = sorted(set(map(lambda p: p[0], positions))) | |
43 | |
44 profiles = [] | |
45 for chrom in chrnames: | |
46 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) | |
47 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) | |
48 | |
49 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False: | |
50 # continue | |
51 bits_gc, gcf = bitarray_fromfile(idxf_gc) | |
52 bits_rpt, rptf = bitarray_fromfile(idxf_rpt) | |
53 | |
54 for pos in positions: | |
55 if pos[0] != chrom: | |
56 continue | |
57 | |
58 seqid = pos[0] + ':' + str(pos[1]+1) + '-' + str(pos[2]) | |
59 slen = pos[2]-pos[1] | |
60 gc = bits_gc[pos[1]:pos[2]].count(True) | |
61 rpt = bits_rpt[pos[1]:pos[2]].count(True) | |
62 | |
63 profiles.append((seqid, slen, gc, rpt)) | |
64 | |
65 gcf.close() | |
66 rptf.close() | |
67 | |
68 return profiles | |
69 | |
70 | |
71 def sample_sequences(positions, buildname, basedir, options): | |
72 """ | |
73 """ | |
7
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
74 max_fails = 20 |
0 | 75 max_trys = options.max_trys |
76 norpt = options.norpt | |
77 nogc = options.nogc | |
78 | |
79 chrnames = sorted(set(map(lambda p: p[0], positions))) | |
80 profiles = make_profile(positions, buildname, basedir) | |
81 | |
82 excluded = [] | |
83 if options.excludefile: | |
84 excluded = read_bed_file(options.excludefile) | |
85 | |
86 #truncate (clear) file | |
87 f = open(options.output, 'w') | |
88 f.close() | |
89 | |
90 for chrom in chrnames: | |
91 if options.quiet == False: | |
92 sys.stderr.write("sampling from " + chrom + "\n") | |
93 | |
94 idxf_na = os.path.join(basedir, '.'.join([buildname, chrom, 'na', 'out'])) | |
95 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out'])) | |
96 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out'])) | |
97 | |
98 #this bit array is used to mark positions that are excluded from sampling | |
99 #this will be updated as we sample more sequences in order to prevent sampled sequences from overlapping | |
100 bits_na, naf = bitarray_fromfile(idxf_na) | |
101 bits_gc, gcf = bitarray_fromfile(idxf_gc) | |
102 bits_rpt, rptf = bitarray_fromfile(idxf_rpt) | |
103 | |
104 #mark excluded regions | |
105 for pos in excluded: | |
106 if pos[0] != chrom: | |
107 continue | |
108 bits_na[pos[1]:pos[2]] = True | |
109 | |
110 npos = 0 | |
111 #mark positive regions | |
112 for pos in positions: | |
113 if pos[0] != chrom: | |
114 continue | |
115 bits_na[pos[1]:pos[2]] = True | |
116 npos+=1 | |
117 | |
118 if options.count == 0: | |
119 count = options.fold*npos | |
120 else: | |
121 count = options.count | |
122 | |
7
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
123 #initialize paramter |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
124 #added by dlee 2/17/13 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
125 ncfails = 0 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
126 rpt_err = options.rpt_err |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
127 gc_err = options.gc_err |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
128 |
0 | 129 sampled_positions = [] |
130 while len(sampled_positions) < count: | |
131 sampled_prof = random.choice(profiles) | |
132 sampled_len = sampled_prof[1] | |
133 sampled_gc = sampled_prof[2] | |
134 sampled_rpt = sampled_prof[3] | |
135 | |
7
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
136 #relax rpt_err and gc_err if it keep fail to sample a region |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
137 #added by dlee 2/17/13 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
138 if ncfails >= max_fails: |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
139 if options.quiet == False: |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
140 sys.stderr.write("reached max_fail. relax gc and rpt err criteria\n") |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
141 ncfails = 0 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
142 rpt_err += 0.01 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
143 gc_err += 0.01 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
144 |
0 | 145 rpt_err_allowed = int(rpt_err*sampled_len) |
146 gc_err_allowed = int(gc_err*sampled_len) | |
147 trys = 0 | |
148 while trys < max_trys: | |
149 trys += 1 | |
150 | |
151 pos = random.randint(1, bits_na.length() - sampled_len) | |
152 pos_e = pos+sampled_len | |
153 | |
154 if bits_na[pos:pos_e].any(): | |
155 continue | |
156 | |
157 if not norpt: | |
158 pos_rpt = bits_rpt[pos:pos_e].count(True) | |
159 if abs(sampled_rpt - pos_rpt) > rpt_err_allowed: | |
160 continue | |
161 | |
162 if not nogc: | |
163 pos_gc = bits_gc[pos:pos_e].count(True) | |
164 if abs(sampled_gc - pos_gc) > gc_err_allowed: | |
165 continue | |
166 | |
167 #accept the sampled position | |
168 #mark the sampled regions | |
169 bits_na[pos:pos_e] = True | |
170 | |
171 sampled_positions.append((chrom, pos, pos_e)) | |
172 | |
7
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
173 #reset the counter of consecutive fails |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
174 #added by dlee 2/17/13 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
175 ncfails = 0 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
176 |
0 | 177 #print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc |
178 break | |
179 else: | |
7
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
180 #increase the counter of consecutive fails |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
181 #added by dlee 2/17/13 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
182 ncfails += 1 |
fd740d515502
Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents:
0
diff
changeset
|
183 |
0 | 184 if options.quiet == False: |
185 sys.stderr.write(' '.join(["fail to sample from", \ | |
186 "len=", str(sampled_len), \ | |
187 "rpt=", str(sampled_rpt), \ | |
188 "gc=", str(sampled_gc)]) + '\n') | |
189 | |
190 naf.close() | |
191 gcf.close() | |
192 rptf.close() | |
193 | |
194 f = open(options.output, 'a') | |
195 for spos in sorted(sampled_positions, key=lambda s: s[1]): | |
196 f.write('\t'.join([spos[0], str(spos[1]), str(spos[2])]) + '\n') | |
197 f.close() | |
198 | |
199 | |
200 def main(argv=sys.argv): | |
201 usage = "usage: %prog [options] <Input Bed File> <Genome Build Name> <Base Directory>" | |
202 | |
203 desc = "generate null sequences" | |
204 parser = optparse.OptionParser(usage=usage, description=desc) | |
205 | |
206 parser.add_option("-x", dest="fold", type="int", \ | |
207 default = 1, help="number of sequence to sample, FOLD times of given dataset (default=1)") | |
208 | |
209 parser.add_option("-c", dest="count", type="int", \ | |
210 default=0, help="number of sequences to sample, override -x option (default=NA, obsolete)") | |
211 | |
212 parser.add_option("-r", dest="rseed", type="int", \ | |
213 default=1, help="random number seed (default=1)") | |
214 | |
215 parser.add_option("-g", dest="gc_err", type="float", \ | |
216 default=0.02, help="GC errors allowed (default=0.02)") | |
217 | |
218 parser.add_option("-t", dest="rpt_err", type="float", \ | |
219 default=0.02, help="RPT errors allowed (default=0.02)") | |
220 | |
221 parser.add_option("-e", dest="excludefile", \ | |
222 default="", help="filename that contains regions to be excluded (default=NA)") | |
223 | |
224 parser.add_option("-G", dest="nogc", action="store_true", \ | |
225 default=False, help="do not match gc-contents") | |
226 | |
227 parser.add_option("-R", dest="norpt", action="store_true", \ | |
228 default=False, help="do not match repeats") | |
229 | |
230 parser.add_option("-m", dest="max_trys", type="int", \ | |
231 default=10000, help="number of maximum trys to sample of one sequence (default=10000)") | |
232 | |
233 parser.add_option("-o", dest="output", default="nullseq_output.bed", \ | |
234 help="set the name of output file (default=nullseq_output.bed)") | |
235 | |
236 parser.add_option("-q", dest="quiet", default=False, action="store_true", \ | |
237 help="supress messages (default=false)") | |
238 | |
239 (options, args) = parser.parse_args() | |
240 | |
241 if len(args) == 0: | |
242 parser.print_help() | |
243 sys.exit(0) | |
244 | |
245 if len(args) != 3: | |
246 parser.error("incorrect number of arguments") | |
247 parser.print_help() | |
248 sys.exit(0) | |
249 | |
250 | |
251 posfile = args[0] | |
252 buildname = args[1] | |
253 basedir = args[2] | |
254 | |
255 random.seed(options.rseed) | |
256 | |
257 if options.quiet == False: | |
258 sys.stderr.write('Options:\n') | |
259 sys.stderr.write(' N fold: ' + str(options.fold) + '\n') | |
260 sys.stderr.write(' GC match: ' + str(not options.nogc) + '\n') | |
261 sys.stderr.write(' repeat match: ' + str(not options.norpt) + '\n') | |
262 sys.stderr.write(' GC error allowed: ' + str(options.gc_err) + '\n') | |
263 sys.stderr.write(' repeat error allowed: ' + str(options.rpt_err) + '\n') | |
264 sys.stderr.write(' random seed: ' + str(options.rseed) + '\n') | |
265 sys.stderr.write(' max trys: ' + str(options.max_trys) + '\n') | |
266 sys.stderr.write(' excluded regions: ' + options.excludefile+ '\n') | |
267 sys.stderr.write(' output: ' + options.output + '\n') | |
268 sys.stderr.write('\n') | |
269 | |
270 sys.stderr.write('Input args:\n') | |
271 sys.stderr.write(' input bed file: ' + posfile+ '\n') | |
272 sys.stderr.write(' genome build name: ' + buildname+ '\n') | |
273 sys.stderr.write(' basedir: ' + basedir + '\n') | |
274 sys.stderr.write('\n') | |
275 | |
276 positions = read_bed_file(posfile) | |
277 | |
278 sample_sequences(positions, buildname, basedir, options) | |
279 | |
280 if __name__ == "__main__": main() |