0
|
1 import os
|
|
2 import os.path
|
|
3 import sys
|
|
4 import random
|
|
5 import optparse
|
|
6
|
|
7 from bitarray import bitarray
|
|
8
|
|
9 def read_bed_file(filename):
|
|
10 """
|
|
11 """
|
|
12 f = open(filename, 'r')
|
|
13 lines = f.readlines()
|
|
14 f.close()
|
|
15
|
|
16 positions = []
|
|
17
|
|
18 for line in lines:
|
|
19 if line[0] == '#':
|
|
20 continue
|
|
21
|
|
22 l = line.split()
|
|
23
|
|
24 positions.append((l[0], int(l[1]), int(l[2])))
|
|
25
|
|
26 return positions
|
|
27
|
|
28
|
|
29 def bitarray_fromfile(filename):
|
|
30 """
|
|
31 """
|
|
32 fh = open(filename, 'rb')
|
|
33 bits = bitarray()
|
|
34 bits.fromfile(fh)
|
|
35
|
|
36 return bits, fh
|
|
37
|
|
38
|
|
39 def make_profile(positions, buildname, basedir):
|
|
40 """
|
|
41 """
|
|
42 chrnames = sorted(set(map(lambda p: p[0], positions)))
|
|
43
|
|
44 profiles = []
|
|
45 for chrom in chrnames:
|
|
46 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
|
|
47 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
|
|
48
|
|
49 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
|
|
50 # continue
|
|
51 bits_gc, gcf = bitarray_fromfile(idxf_gc)
|
|
52 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
|
|
53
|
|
54 for pos in positions:
|
|
55 if pos[0] != chrom:
|
|
56 continue
|
|
57
|
|
58 seqid = pos[0] + ':' + str(pos[1]+1) + '-' + str(pos[2])
|
|
59 slen = pos[2]-pos[1]
|
|
60 gc = bits_gc[pos[1]:pos[2]].count(True)
|
|
61 rpt = bits_rpt[pos[1]:pos[2]].count(True)
|
|
62
|
|
63 profiles.append((seqid, slen, gc, rpt))
|
|
64
|
|
65 gcf.close()
|
|
66 rptf.close()
|
|
67
|
|
68 return profiles
|
|
69
|
|
70
|
|
71 def sample_sequences(positions, buildname, basedir, options):
|
|
72 """
|
|
73 """
|
|
74 rpt_err = options.rpt_err
|
|
75 gc_err = options.gc_err
|
|
76 max_trys = options.max_trys
|
|
77 norpt = options.norpt
|
|
78 nogc = options.nogc
|
|
79
|
|
80 chrnames = sorted(set(map(lambda p: p[0], positions)))
|
|
81 profiles = make_profile(positions, buildname, basedir)
|
|
82
|
|
83 excluded = []
|
|
84 if options.excludefile:
|
|
85 excluded = read_bed_file(options.excludefile)
|
|
86
|
|
87 #truncate (clear) file
|
|
88 f = open(options.output, 'w')
|
|
89 f.close()
|
|
90
|
|
91 for chrom in chrnames:
|
|
92 if options.quiet == False:
|
|
93 sys.stderr.write("sampling from " + chrom + "\n")
|
|
94
|
|
95 idxf_na = os.path.join(basedir, '.'.join([buildname, chrom, 'na', 'out']))
|
|
96 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
|
|
97 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
|
|
98
|
|
99 #this bit array is used to mark positions that are excluded from sampling
|
|
100 #this will be updated as we sample more sequences in order to prevent sampled sequences from overlapping
|
|
101 bits_na, naf = bitarray_fromfile(idxf_na)
|
|
102 bits_gc, gcf = bitarray_fromfile(idxf_gc)
|
|
103 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
|
|
104
|
|
105 #mark excluded regions
|
|
106 for pos in excluded:
|
|
107 if pos[0] != chrom:
|
|
108 continue
|
|
109 bits_na[pos[1]:pos[2]] = True
|
|
110
|
|
111 npos = 0
|
|
112 #mark positive regions
|
|
113 for pos in positions:
|
|
114 if pos[0] != chrom:
|
|
115 continue
|
|
116 bits_na[pos[1]:pos[2]] = True
|
|
117 npos+=1
|
|
118
|
|
119 if options.count == 0:
|
|
120 count = options.fold*npos
|
|
121 else:
|
|
122 count = options.count
|
|
123
|
|
124 sampled_positions = []
|
|
125 while len(sampled_positions) < count:
|
|
126 sampled_prof = random.choice(profiles)
|
|
127 sampled_len = sampled_prof[1]
|
|
128 sampled_gc = sampled_prof[2]
|
|
129 sampled_rpt = sampled_prof[3]
|
|
130
|
|
131 rpt_err_allowed = int(rpt_err*sampled_len)
|
|
132 gc_err_allowed = int(gc_err*sampled_len)
|
|
133 trys = 0
|
|
134 while trys < max_trys:
|
|
135 trys += 1
|
|
136
|
|
137 pos = random.randint(1, bits_na.length() - sampled_len)
|
|
138 pos_e = pos+sampled_len
|
|
139
|
|
140 if bits_na[pos:pos_e].any():
|
|
141 continue
|
|
142
|
|
143 if not norpt:
|
|
144 pos_rpt = bits_rpt[pos:pos_e].count(True)
|
|
145 if abs(sampled_rpt - pos_rpt) > rpt_err_allowed:
|
|
146 continue
|
|
147
|
|
148 if not nogc:
|
|
149 pos_gc = bits_gc[pos:pos_e].count(True)
|
|
150 if abs(sampled_gc - pos_gc) > gc_err_allowed:
|
|
151 continue
|
|
152
|
|
153 #accept the sampled position
|
|
154 #mark the sampled regions
|
|
155 bits_na[pos:pos_e] = True
|
|
156
|
|
157 sampled_positions.append((chrom, pos, pos_e))
|
|
158
|
|
159 #print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc
|
|
160 break
|
|
161 else:
|
|
162 if options.quiet == False:
|
|
163 sys.stderr.write(' '.join(["fail to sample from", \
|
|
164 "len=", str(sampled_len), \
|
|
165 "rpt=", str(sampled_rpt), \
|
|
166 "gc=", str(sampled_gc)]) + '\n')
|
|
167
|
|
168 naf.close()
|
|
169 gcf.close()
|
|
170 rptf.close()
|
|
171
|
|
172 f = open(options.output, 'a')
|
|
173 for spos in sorted(sampled_positions, key=lambda s: s[1]):
|
|
174 f.write('\t'.join([spos[0], str(spos[1]), str(spos[2])]) + '\n')
|
|
175 f.close()
|
|
176
|
|
177
|
|
178 def main(argv=sys.argv):
|
|
179 usage = "usage: %prog [options] <Input Bed File> <Genome Build Name> <Base Directory>"
|
|
180
|
|
181 desc = "generate null sequences"
|
|
182 parser = optparse.OptionParser(usage=usage, description=desc)
|
|
183
|
|
184 parser.add_option("-x", dest="fold", type="int", \
|
|
185 default = 1, help="number of sequence to sample, FOLD times of given dataset (default=1)")
|
|
186
|
|
187 parser.add_option("-c", dest="count", type="int", \
|
|
188 default=0, help="number of sequences to sample, override -x option (default=NA, obsolete)")
|
|
189
|
|
190 parser.add_option("-r", dest="rseed", type="int", \
|
|
191 default=1, help="random number seed (default=1)")
|
|
192
|
|
193 parser.add_option("-g", dest="gc_err", type="float", \
|
|
194 default=0.02, help="GC errors allowed (default=0.02)")
|
|
195
|
|
196 parser.add_option("-t", dest="rpt_err", type="float", \
|
|
197 default=0.02, help="RPT errors allowed (default=0.02)")
|
|
198
|
|
199 parser.add_option("-e", dest="excludefile", \
|
|
200 default="", help="filename that contains regions to be excluded (default=NA)")
|
|
201
|
|
202 parser.add_option("-G", dest="nogc", action="store_true", \
|
|
203 default=False, help="do not match gc-contents")
|
|
204
|
|
205 parser.add_option("-R", dest="norpt", action="store_true", \
|
|
206 default=False, help="do not match repeats")
|
|
207
|
|
208 parser.add_option("-m", dest="max_trys", type="int", \
|
|
209 default=10000, help="number of maximum trys to sample of one sequence (default=10000)")
|
|
210
|
|
211 parser.add_option("-o", dest="output", default="nullseq_output.bed", \
|
|
212 help="set the name of output file (default=nullseq_output.bed)")
|
|
213
|
|
214 parser.add_option("-q", dest="quiet", default=False, action="store_true", \
|
|
215 help="supress messages (default=false)")
|
|
216
|
|
217 (options, args) = parser.parse_args()
|
|
218
|
|
219 if len(args) == 0:
|
|
220 parser.print_help()
|
|
221 sys.exit(0)
|
|
222
|
|
223 if len(args) != 3:
|
|
224 parser.error("incorrect number of arguments")
|
|
225 parser.print_help()
|
|
226 sys.exit(0)
|
|
227
|
|
228
|
|
229 posfile = args[0]
|
|
230 buildname = args[1]
|
|
231 basedir = args[2]
|
|
232
|
|
233 random.seed(options.rseed)
|
|
234
|
|
235 if options.quiet == False:
|
|
236 sys.stderr.write('Options:\n')
|
|
237 sys.stderr.write(' N fold: ' + str(options.fold) + '\n')
|
|
238 sys.stderr.write(' GC match: ' + str(not options.nogc) + '\n')
|
|
239 sys.stderr.write(' repeat match: ' + str(not options.norpt) + '\n')
|
|
240 sys.stderr.write(' GC error allowed: ' + str(options.gc_err) + '\n')
|
|
241 sys.stderr.write(' repeat error allowed: ' + str(options.rpt_err) + '\n')
|
|
242 sys.stderr.write(' random seed: ' + str(options.rseed) + '\n')
|
|
243 sys.stderr.write(' max trys: ' + str(options.max_trys) + '\n')
|
|
244 sys.stderr.write(' excluded regions: ' + options.excludefile+ '\n')
|
|
245 sys.stderr.write(' output: ' + options.output + '\n')
|
|
246 sys.stderr.write('\n')
|
|
247
|
|
248 sys.stderr.write('Input args:\n')
|
|
249 sys.stderr.write(' input bed file: ' + posfile+ '\n')
|
|
250 sys.stderr.write(' genome build name: ' + buildname+ '\n')
|
|
251 sys.stderr.write(' basedir: ' + basedir + '\n')
|
|
252 sys.stderr.write('\n')
|
|
253
|
|
254 positions = read_bed_file(posfile)
|
|
255
|
|
256 sample_sequences(positions, buildname, basedir, options)
|
|
257
|
|
258 if __name__ == "__main__": main()
|