0
|
1 #!/usr/bin/env python
|
|
2 from optparse import OptionParser, SUPPRESS_HELP
|
|
3 import os, random, quake
|
|
4
|
|
5 ############################################################
|
|
6 # cov_model.py
|
|
7 #
|
|
8 # Given a file of kmer counts, reports the cutoff to use
|
|
9 # to separate trusted/untrusted kmers.
|
|
10 ############################################################
|
|
11
|
|
12 ############################################################
|
|
13 # main
|
|
14 ############################################################
|
|
15 def main():
|
|
16 usage = 'usage: %prog [options] <counts file>'
|
|
17 parser = OptionParser(usage)
|
|
18 parser.add_option('--int', dest='count_kmers', action='store_true', default=False, help='Kmers were counted as integers w/o the use of quality values [default: %default]')
|
|
19 parser.add_option('--ratio', dest='ratio', type='int', default=200, help='Likelihood ratio to set trusted/untrusted cutoff [default: %default]')
|
|
20 parser.add_option('--no_sample', dest='no_sample', action='store_true', default=False, help='Do not sample kmer coverages into kmers.txt because its already done [default: %default]')
|
|
21 # help='Model kmer coverage as a function of GC content of kmers [default: %default]'
|
|
22 parser.add_option('--gc', dest='model_gc', action='store_true', default=False, help=SUPPRESS_HELP)
|
|
23 (options, args) = parser.parse_args()
|
|
24
|
|
25 if len(args) != 1:
|
|
26 parser.error('Must provide kmers counts file')
|
|
27 else:
|
|
28 ctsf = args[0]
|
|
29
|
|
30 if options.count_kmers:
|
|
31 model_cutoff(ctsf, options.ratio)
|
|
32 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
|
|
33
|
|
34 else:
|
|
35 if options.model_gc:
|
|
36 model_q_gc_cutoffs(ctsf, 25000, options.ratio)
|
|
37 else:
|
|
38 model_q_cutoff(ctsf, 50000, options.ratio, options.no_sample)
|
|
39 print 'Cutoff: %s' % open('cutoff.txt').readline().rstrip()
|
|
40
|
|
41
|
|
42 ############################################################
|
|
43 # model_cutoff
|
|
44 #
|
|
45 # Make a histogram of kmers to give to R to learn the cutoff
|
|
46 ############################################################
|
|
47 def model_cutoff(ctsf, ratio):
|
|
48 # make kmer histogram
|
|
49 cov_max = 0
|
|
50 for line in open(ctsf):
|
|
51 cov = int(line.split()[1])
|
|
52 if cov > cov_max:
|
|
53 cov_max = cov
|
|
54
|
|
55 kmer_hist = [0]*cov_max
|
|
56 for line in open(ctsf):
|
|
57 cov = int(line.split()[1])
|
|
58 kmer_hist[cov-1] += 1
|
|
59
|
|
60 cov_out = open('kmers.hist', 'w')
|
|
61 for cov in range(0,cov_max):
|
|
62 if kmer_hist[cov]:
|
|
63 print >> cov_out, '%d\t%d' % (cov+1,kmer_hist[cov])
|
|
64 cov_out.close()
|
|
65
|
|
66 os.system('R --slave --args %d < %s/cov_model.r 2> r.log' % (ratio,quake.quake_dir))
|
|
67
|
|
68
|
|
69 ############################################################
|
|
70 # model_q_cutoff
|
|
71 #
|
|
72 # Sample kmers to give to R to learn the cutoff
|
|
73 # 'div100' is necessary when the number of kmers is too
|
|
74 # large for random.sample, so we only consider every 100th
|
|
75 # kmer.
|
|
76 ############################################################
|
|
77 def model_q_cutoff(ctsf, sample, ratio, no_sample=False):
|
|
78 if not no_sample:
|
|
79 # count number of kmer coverages
|
|
80 num_covs = 0
|
|
81 for line in open(ctsf):
|
|
82 num_covs += 1
|
|
83
|
|
84 # choose random kmer coverages
|
|
85 div100 = False
|
|
86 if sample >= num_covs:
|
|
87 rand_covs = range(num_covs)
|
|
88 else:
|
|
89 if num_covs > 1000000000:
|
|
90 div100 = True
|
|
91 rand_covs = random.sample(xrange(num_covs/100), sample)
|
|
92 else:
|
|
93 rand_covs = random.sample(xrange(num_covs), sample)
|
|
94 rand_covs.sort()
|
|
95
|
|
96 # print to file
|
|
97 out = open('kmers.txt', 'w')
|
|
98 kmer_i = 0
|
|
99 rand_i = 0
|
|
100 for line in open(ctsf):
|
|
101 if div100:
|
|
102 if kmer_i % 100 == 0 and kmer_i/100 == rand_covs[rand_i]:
|
|
103 print >> out, line.split()[1]
|
|
104 rand_i += 1
|
|
105 if rand_i >= sample:
|
|
106 break
|
|
107 else:
|
|
108 if kmer_i == rand_covs[rand_i]:
|
|
109 print >> out, line.split()[1]
|
|
110 rand_i += 1
|
|
111 if rand_i >= sample:
|
|
112 break
|
|
113 kmer_i += 1
|
|
114 out.close()
|
|
115
|
|
116 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r.log' % (ratio,quake.quake_dir))
|
|
117
|
|
118
|
|
119 ############################################################
|
|
120 # model_q_gc_cutoffs
|
|
121 #
|
|
122 # Sample kmers to give to R to learn the cutoff for each
|
|
123 # GC value
|
|
124 ############################################################
|
|
125 def model_q_gc_cutoffs(ctsf, sample, ratio):
|
|
126 # count number of kmer coverages at each at
|
|
127 k = len(open(ctsf).readline().split()[0])
|
|
128 num_covs_at = [0]*(k+1)
|
|
129 for line in open(ctsf):
|
|
130 kmer = line.split()[0]
|
|
131 num_covs_at[count_at(kmer)] += 1
|
|
132
|
|
133 # for each AT bin
|
|
134 at_cutoffs = []
|
|
135 for at in range(1,k):
|
|
136 # sample covs
|
|
137 if sample >= num_covs_at[at]:
|
|
138 rand_covs = range(num_covs_at[at])
|
|
139 else:
|
|
140 rand_covs = random.sample(xrange(num_covs_at[at]), sample)
|
|
141 rand_covs.sort()
|
|
142
|
|
143 # print to file
|
|
144 out = open('kmers.txt', 'w')
|
|
145 kmer_i = 0
|
|
146 rand_i = 0
|
|
147 for line in open(ctsf):
|
|
148 (kmer,cov) = line.split()
|
|
149 if count_at(kmer) == at:
|
|
150 if kmer_i == rand_covs[rand_i]:
|
|
151 print >> out, cov
|
|
152 rand_i += 1
|
|
153 if rand_i >= sample:
|
|
154 break
|
|
155 kmer_i += 1
|
|
156 out.close()
|
|
157
|
|
158 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
|
|
159
|
|
160 at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
|
|
161 if at in [1,k-1]: # setting extremes to next closests
|
|
162 at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
|
|
163
|
|
164 os.system('mv kmers.txt kmers.at%d.txt' % at)
|
|
165 os.system('mv cutoff.txt cutoff.at%d.txt' % at)
|
|
166
|
|
167 out = open('cutoffs.gc.txt','w')
|
|
168 print >> out, '\n'.join(at_cutoffs)
|
|
169 out.close()
|
|
170
|
|
171
|
|
172 ############################################################
|
|
173 # model_q_gc_cutoffs_bigmem
|
|
174 #
|
|
175 # Sample kmers to give to R to learn the cutoff for each
|
|
176 # GC value
|
|
177 ############################################################
|
|
178 def model_q_gc_cutoffs_bigmem(ctsf, sample, ratio):
|
|
179 # input coverages
|
|
180 k = 0
|
|
181 for line in open(ctsf):
|
|
182 (kmer,cov) = line.split()
|
|
183 if k == 0:
|
|
184 k = len(kmer)
|
|
185 at_covs = ['']*(k+1)
|
|
186 else:
|
|
187 at = count_at(kmer)
|
|
188 if at_covs[at]:
|
|
189 at_covs[at].append(cov)
|
|
190 else:
|
|
191 at_covs[at] = [cov]
|
|
192
|
|
193 for at in range(1,k):
|
|
194 print '%d %d' % (at,len(at_covs[at]))
|
|
195
|
|
196 # for each AT bin
|
|
197 at_cutoffs = []
|
|
198 for at in range(1,k):
|
|
199 # sample covs
|
|
200 if sample >= len(at_covs[at]):
|
|
201 rand_covs = at_covs[at]
|
|
202 else:
|
|
203 rand_covs = random.sample(at_covs[at], sample)
|
|
204
|
|
205 # print to file
|
|
206 out = open('kmers.txt', 'w')
|
|
207 for rc in rand_covs:
|
|
208 print >> out, rc
|
|
209 out.close()
|
|
210
|
|
211 os.system('R --slave --args %d < %s/cov_model_qmer.r 2> r%d.log' % (ratio,quake.quake_dir,at))
|
|
212
|
|
213 at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
|
|
214 if at in [1,k-1]: # setting extremes to next closests
|
|
215 at_cutoffs.append( open('cutoff.txt').readline().rstrip() )
|
|
216
|
|
217 os.system('mv kmers.txt kmers.at%d.txt' % at)
|
|
218 os.system('mv cutoff.txt cutoff.at%d.txt' % at)
|
|
219
|
|
220 out = open('cutoffs.gc.txt','w')
|
|
221 print >> out, '\n'.join(at_cutoffs)
|
|
222 out.close()
|
|
223
|
|
224
|
|
225 ############################################################
|
|
226 # count_at
|
|
227 #
|
|
228 # Count A's and T's in the given sequence
|
|
229 ############################################################
|
|
230 def count_at(seq):
|
|
231 return len([nt for nt in seq if nt in ['A','T']])
|
|
232
|
|
233
|
|
234 ############################################################
|
|
235 # __main__
|
|
236 ############################################################
|
|
237 if __name__ == '__main__':
|
|
238 main()
|