annotate kmersvm/scripts/nullseq_generate.py @ 7:fd740d515502 draft default tip

Uploaded revised kmer-SVM to include modules from kmer-visual.
author cafletezbrant
date Sun, 16 Jun 2013 18:06:14 -0400
parents 7fe1103032f7
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
1 import os
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
2 import os.path
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
3 import sys
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
4 import random
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
5 import optparse
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
6
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
7 from bitarray import bitarray
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
8
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
9 def read_bed_file(filename):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
10 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
11 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
12 f = open(filename, 'r')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
13 lines = f.readlines()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
14 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
15
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
16 positions = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
17
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
18 for line in lines:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
19 if line[0] == '#':
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
20 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
21
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
22 l = line.split()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
23
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
24 positions.append((l[0], int(l[1]), int(l[2])))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
25
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
26 return positions
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
27
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
28
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
29 def bitarray_fromfile(filename):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
30 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
31 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
32 fh = open(filename, 'rb')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
33 bits = bitarray()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
34 bits.fromfile(fh)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
35
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
36 return bits, fh
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
37
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
38
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
39 def make_profile(positions, buildname, basedir):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
40 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
41 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
42 chrnames = sorted(set(map(lambda p: p[0], positions)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
43
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
44 profiles = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
45 for chrom in chrnames:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
46 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
47 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
48
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
49 #if os.path.exists(idxf_gc) == False or os.path.exists(idxf_rpt) == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
50 # continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
51 bits_gc, gcf = bitarray_fromfile(idxf_gc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
52 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
53
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
54 for pos in positions:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
55 if pos[0] != chrom:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
56 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
57
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
58 seqid = pos[0] + ':' + str(pos[1]+1) + '-' + str(pos[2])
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
59 slen = pos[2]-pos[1]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
60 gc = bits_gc[pos[1]:pos[2]].count(True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
61 rpt = bits_rpt[pos[1]:pos[2]].count(True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
62
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
63 profiles.append((seqid, slen, gc, rpt))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
64
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
65 gcf.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
66 rptf.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
67
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
68 return profiles
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
69
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
70
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
71 def sample_sequences(positions, buildname, basedir, options):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
72 """
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
73 """
7
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
74 max_fails = 20
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
75 max_trys = options.max_trys
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
76 norpt = options.norpt
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
77 nogc = options.nogc
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
78
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
79 chrnames = sorted(set(map(lambda p: p[0], positions)))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
80 profiles = make_profile(positions, buildname, basedir)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
81
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
82 excluded = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
83 if options.excludefile:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
84 excluded = read_bed_file(options.excludefile)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
85
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
86 #truncate (clear) file
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
87 f = open(options.output, 'w')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
88 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
89
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
90 for chrom in chrnames:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
91 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
92 sys.stderr.write("sampling from " + chrom + "\n")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
93
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
94 idxf_na = os.path.join(basedir, '.'.join([buildname, chrom, 'na', 'out']))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
95 idxf_gc = os.path.join(basedir, '.'.join([buildname, chrom, 'gc', 'out']))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
96 idxf_rpt = os.path.join(basedir, '.'.join([buildname, chrom, 'rpt', 'out']))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
97
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
98 #this bit array is used to mark positions that are excluded from sampling
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
99 #this will be updated as we sample more sequences in order to prevent sampled sequences from overlapping
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
100 bits_na, naf = bitarray_fromfile(idxf_na)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
101 bits_gc, gcf = bitarray_fromfile(idxf_gc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
102 bits_rpt, rptf = bitarray_fromfile(idxf_rpt)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
103
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
104 #mark excluded regions
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
105 for pos in excluded:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
106 if pos[0] != chrom:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
107 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
108 bits_na[pos[1]:pos[2]] = True
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
109
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
110 npos = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
111 #mark positive regions
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
112 for pos in positions:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
113 if pos[0] != chrom:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
114 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
115 bits_na[pos[1]:pos[2]] = True
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
116 npos+=1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
117
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
118 if options.count == 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
119 count = options.fold*npos
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
120 else:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
121 count = options.count
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
122
7
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
123 #initialize paramter
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
124 #added by dlee 2/17/13
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
125 ncfails = 0
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
126 rpt_err = options.rpt_err
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
127 gc_err = options.gc_err
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
128
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
129 sampled_positions = []
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
130 while len(sampled_positions) < count:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
131 sampled_prof = random.choice(profiles)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
132 sampled_len = sampled_prof[1]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
133 sampled_gc = sampled_prof[2]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
134 sampled_rpt = sampled_prof[3]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
135
7
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
136 #relax rpt_err and gc_err if it keep fail to sample a region
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
137 #added by dlee 2/17/13
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
138 if ncfails >= max_fails:
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
139 if options.quiet == False:
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
140 sys.stderr.write("reached max_fail. relax gc and rpt err criteria\n")
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
141 ncfails = 0
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
142 rpt_err += 0.01
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
143 gc_err += 0.01
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
144
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
145 rpt_err_allowed = int(rpt_err*sampled_len)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
146 gc_err_allowed = int(gc_err*sampled_len)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
147 trys = 0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
148 while trys < max_trys:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
149 trys += 1
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
150
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
151 pos = random.randint(1, bits_na.length() - sampled_len)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
152 pos_e = pos+sampled_len
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
153
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
154 if bits_na[pos:pos_e].any():
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
155 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
156
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
157 if not norpt:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
158 pos_rpt = bits_rpt[pos:pos_e].count(True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
159 if abs(sampled_rpt - pos_rpt) > rpt_err_allowed:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
160 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
161
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
162 if not nogc:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
163 pos_gc = bits_gc[pos:pos_e].count(True)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
164 if abs(sampled_gc - pos_gc) > gc_err_allowed:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
165 continue
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
166
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
167 #accept the sampled position
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
168 #mark the sampled regions
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
169 bits_na[pos:pos_e] = True
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
170
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
171 sampled_positions.append((chrom, pos, pos_e))
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
172
7
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
173 #reset the counter of consecutive fails
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
174 #added by dlee 2/17/13
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
175 ncfails = 0
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
176
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
177 #print trys, chrom, pos, pos_e, sampled_len, pos_rpt, sampled_rpt, pos_gc, sampled_gc
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
178 break
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
179 else:
7
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
180 #increase the counter of consecutive fails
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
181 #added by dlee 2/17/13
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
182 ncfails += 1
fd740d515502 Uploaded revised kmer-SVM to include modules from kmer-visual.
cafletezbrant
parents: 0
diff changeset
183
0
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
184 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
185 sys.stderr.write(' '.join(["fail to sample from", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
186 "len=", str(sampled_len), \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
187 "rpt=", str(sampled_rpt), \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
188 "gc=", str(sampled_gc)]) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
189
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
190 naf.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
191 gcf.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
192 rptf.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
193
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
194 f = open(options.output, 'a')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
195 for spos in sorted(sampled_positions, key=lambda s: s[1]):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
196 f.write('\t'.join([spos[0], str(spos[1]), str(spos[2])]) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
197 f.close()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
198
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
199
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
200 def main(argv=sys.argv):
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
201 usage = "usage: %prog [options] <Input Bed File> <Genome Build Name> <Base Directory>"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
202
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
203 desc = "generate null sequences"
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
204 parser = optparse.OptionParser(usage=usage, description=desc)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
205
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
206 parser.add_option("-x", dest="fold", type="int", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
207 default = 1, help="number of sequence to sample, FOLD times of given dataset (default=1)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
208
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
209 parser.add_option("-c", dest="count", type="int", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
210 default=0, help="number of sequences to sample, override -x option (default=NA, obsolete)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
211
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
212 parser.add_option("-r", dest="rseed", type="int", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
213 default=1, help="random number seed (default=1)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
214
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
215 parser.add_option("-g", dest="gc_err", type="float", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
216 default=0.02, help="GC errors allowed (default=0.02)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
217
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
218 parser.add_option("-t", dest="rpt_err", type="float", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
219 default=0.02, help="RPT errors allowed (default=0.02)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
220
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
221 parser.add_option("-e", dest="excludefile", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
222 default="", help="filename that contains regions to be excluded (default=NA)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
223
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
224 parser.add_option("-G", dest="nogc", action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
225 default=False, help="do not match gc-contents")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
226
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
227 parser.add_option("-R", dest="norpt", action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
228 default=False, help="do not match repeats")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
229
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
230 parser.add_option("-m", dest="max_trys", type="int", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
231 default=10000, help="number of maximum trys to sample of one sequence (default=10000)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
232
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
233 parser.add_option("-o", dest="output", default="nullseq_output.bed", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
234 help="set the name of output file (default=nullseq_output.bed)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
235
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
236 parser.add_option("-q", dest="quiet", default=False, action="store_true", \
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
237 help="supress messages (default=false)")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
238
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
239 (options, args) = parser.parse_args()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
240
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
241 if len(args) == 0:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
242 parser.print_help()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
243 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
244
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
245 if len(args) != 3:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
246 parser.error("incorrect number of arguments")
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
247 parser.print_help()
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
248 sys.exit(0)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
249
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
250
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
251 posfile = args[0]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
252 buildname = args[1]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
253 basedir = args[2]
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
254
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
255 random.seed(options.rseed)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
256
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
257 if options.quiet == False:
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
258 sys.stderr.write('Options:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
259 sys.stderr.write(' N fold: ' + str(options.fold) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
260 sys.stderr.write(' GC match: ' + str(not options.nogc) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
261 sys.stderr.write(' repeat match: ' + str(not options.norpt) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
262 sys.stderr.write(' GC error allowed: ' + str(options.gc_err) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
263 sys.stderr.write(' repeat error allowed: ' + str(options.rpt_err) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
264 sys.stderr.write(' random seed: ' + str(options.rseed) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
265 sys.stderr.write(' max trys: ' + str(options.max_trys) + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
266 sys.stderr.write(' excluded regions: ' + options.excludefile+ '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
267 sys.stderr.write(' output: ' + options.output + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
268 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
269
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
270 sys.stderr.write('Input args:\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
271 sys.stderr.write(' input bed file: ' + posfile+ '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
272 sys.stderr.write(' genome build name: ' + buildname+ '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
273 sys.stderr.write(' basedir: ' + basedir + '\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
274 sys.stderr.write('\n')
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
275
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
276 positions = read_bed_file(posfile)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
277
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
278 sample_sequences(positions, buildname, basedir, options)
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
279
7fe1103032f7 Uploaded
cafletezbrant
parents:
diff changeset
280 if __name__ == "__main__": main()