comparison utilities/genSeqErrorModel.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 #!/usr/bin/env python
2
3 #
4 #
5 # genSeqErrorModel.py
6 # Computes sequencing error model for genReads.py
7 #
8 #
9 # Usage: python genSeqErrorModel.py -i input_reads.fq -o path/to/output_name.p
10 #
11 #
12
13
14 import os
15 import sys
16 import gzip
17 import random
18 import numpy as np
19 import argparse
20 import cPickle as pickle
21
22 # absolute path to this script
23 SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])+'/py/'
24 sys.path.append(SIM_PATH)
25
26 from probability import DiscreteDistribution
27
28 def parseFQ(inf):
29 print 'reading '+inf+'...'
30 if inf[-3:] == '.gz':
31 print 'detected gzip suffix...'
32 f = gzip.open(inf,'r')
33 else:
34 f = open(inf,'r')
35
36 IS_SAM = False
37 if inf[-4:] == '.sam':
38 print 'detected sam input...'
39 IS_SAM = True
40
41 rRead = 0
42 actual_readlen = 0
43 qDict = {}
44 while True:
45
46 if IS_SAM:
47 data4 = f.readline()
48 if not len(data4):
49 break
50 try:
51 data4 = data4.split('\t')[10]
52 except IndexError:
53 break
54 # need to add some input checking here? Yup, probably.
55 else:
56 data1 = f.readline()
57 data2 = f.readline()
58 data3 = f.readline()
59 data4 = f.readline()
60 if not all([data1,data2,data3,data4]):
61 break
62
63 if actual_readlen == 0:
64 if inf[-3:] != '.gz' and not IS_SAM:
65 totalSize = os.path.getsize(inf)
66 entrySize = sum([len(n) for n in [data1,data2,data3,data4]])
67 print 'estimated number of reads in file:',int(float(totalSize)/entrySize)
68 actual_readlen = len(data4)-1
69 print 'assuming read length is uniform...'
70 print 'detected read length (from first read found):',actual_readlen
71 priorQ = np.zeros([actual_readlen,RQ])
72 totalQ = [None] + [np.zeros([RQ,RQ]) for n in xrange(actual_readlen-1)]
73
74 # sanity-check readlengths
75 if len(data4)-1 != actual_readlen:
76 print 'skipping read with unexpected length...'
77 continue
78
79 for i in range(len(data4)-1):
80 q = ord(data4[i])-offQ
81 qDict[q] = True
82 if i == 0:
83 priorQ[i][q] += 1
84 else:
85 totalQ[i][prevQ,q] += 1
86 priorQ[i][q] += 1
87 prevQ = q
88
89 rRead += 1
90 if rRead%PRINT_EVERY == 0:
91 print rRead
92 if MAX_READS > 0 and rRead >= MAX_READS:
93 break
94 f.close()
95
96 # some sanity checking again...
97 QRANGE = [min(qDict.keys()),max(qDict.keys())]
98 if QRANGE[0] < 0:
99 print '\nError: Read in Q-scores below 0\n'
100 exit(1)
101 if QRANGE[1] > RQ:
102 print '\nError: Read in Q-scores above specified maximum:',QRANGE[1],'>',RQ,'\n'
103 exit(1)
104
105 print 'computing probabilities...'
106 probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)] for p in xrange(actual_readlen-1)]
107 for p in xrange(1,actual_readlen):
108 for i in xrange(RQ):
109 rowSum = float(np.sum(totalQ[p][i,:]))+PROB_SMOOTH*RQ
110 if rowSum <= 0.:
111 continue
112 for j in xrange(RQ):
113 probQ[p][i][j] = (totalQ[p][i][j]+PROB_SMOOTH)/rowSum
114
115 initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
116 for i in xrange(actual_readlen):
117 rowSum = float(np.sum(priorQ[i,:]))+INIT_SMOOTH*RQ
118 if rowSum <= 0.:
119 continue
120 for j in xrange(RQ):
121 initQ[i][j] = (priorQ[i][j]+INIT_SMOOTH)/rowSum
122
123 if PLOT_STUFF:
124 mpl.rcParams.update({'font.size': 14, 'font.weight':'bold', 'lines.linewidth': 3})
125
126 mpl.figure(1)
127 Z = np.array(initQ).T
128 X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
129 mpl.pcolormesh(X,Y,Z,vmin=0.,vmax=0.25)
130 mpl.axis([0,len(Z[0]),0,len(Z)])
131 mpl.yticks(range(0,len(Z),10),range(0,len(Z),10))
132 mpl.xticks(range(0,len(Z[0]),10),range(0,len(Z[0]),10))
133 mpl.xlabel('Read Position')
134 mpl.ylabel('Quality Score')
135 mpl.title('Q-Score Prior Probabilities')
136 mpl.colorbar()
137
138 mpl.show()
139
140 VMIN_LOG = [-4,0]
141 minVal = 10**VMIN_LOG[0]
142 qLabels = [str(n) for n in range(QRANGE[0],QRANGE[1]+1) if n%5==0]
143 print qLabels
144 qTicksx = [int(n)+0.5 for n in qLabels]
145 qTicksy = [(RQ-int(n))-0.5 for n in qLabels]
146
147 for p in xrange(1,actual_readlen,10):
148 currentDat = np.array(probQ[p])
149 for i in xrange(len(currentDat)):
150 for j in xrange(len(currentDat[i])):
151 currentDat[i][j] = max(minVal,currentDat[i][j])
152
153 # matrix indices: pcolormesh plotting: plot labels and axes:
154 #
155 # y ^ ^
156 # --> x | y |
157 # x | --> -->
158 # v y x
159 #
160 # to plot a MxN matrix 'Z' with rowNames and colNames we need to:
161 #
162 # pcolormesh(X,Y,Z[::-1,:]) # invert x-axis
163 # # swap x/y axis parameters and labels, remember x is still inverted:
164 # xlim([yMin,yMax])
165 # ylim([M-xMax,M-xMin])
166 # xticks()
167 #
168
169 mpl.figure(p+1)
170 Z = np.log10(currentDat)
171 X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
172 mpl.pcolormesh(X,Y,Z[::-1,:],vmin=VMIN_LOG[0],vmax=VMIN_LOG[1],cmap='jet')
173 mpl.xlim([QRANGE[0],QRANGE[1]+1])
174 mpl.ylim([RQ-QRANGE[1]-1,RQ-QRANGE[0]])
175 mpl.yticks(qTicksy,qLabels)
176 mpl.xticks(qTicksx,qLabels)
177 mpl.xlabel('\n' + r'$Q_{i+1}$')
178 mpl.ylabel(r'$Q_i$')
179 mpl.title('Q-Score Transition Frequencies [Read Pos:'+str(p)+']')
180 cb = mpl.colorbar()
181 cb.set_ticks([-4,-3,-2,-1,0])
182 cb.set_ticklabels([r'$10^{-4}$',r'$10^{-3}$',r'$10^{-2}$',r'$10^{-1}$',r'$10^{0}$'])
183
184 #mpl.tight_layout()
185 mpl.show()
186
187 print 'estimating average error rate via simulation...'
188 Qscores = range(RQ)
189 #print (len(initQ), len(initQ[0]))
190 #print (len(probQ), len(probQ[1]), len(probQ[1][0]))
191
192 initDistByPos = [DiscreteDistribution(initQ[i],Qscores) for i in xrange(len(initQ))]
193 probDistByPosByPrevQ = [None]
194 for i in xrange(1,len(initQ)):
195 probDistByPosByPrevQ.append([])
196 for j in xrange(len(initQ[0])):
197 if np.sum(probQ[i][j]) <= 0.: # if we don't have sufficient data for a transition, use the previous qscore
198 probDistByPosByPrevQ[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
199 else:
200 probDistByPosByPrevQ[-1].append(DiscreteDistribution(probQ[i][j],Qscores))
201
202 countDict = {}
203 for q in Qscores:
204 countDict[q] = 0
205 for samp in xrange(1,N_SAMP+1):
206 if samp%PRINT_EVERY == 0:
207 print samp
208 myQ = initDistByPos[0].sample()
209 countDict[myQ] += 1
210 for i in xrange(1,len(initQ)):
211 myQ = probDistByPosByPrevQ[i][myQ].sample()
212 countDict[myQ] += 1
213
214 totBases = float(sum(countDict.values()))
215 avgError = 0.
216 for k in sorted(countDict.keys()):
217 eVal = 10.**(-k/10.)
218 #print k, eVal, countDict[k]
219 avgError += eVal * (countDict[k]/totBases)
220 print 'AVG ERROR RATE:',avgError
221
222 return (initQ, probQ, avgError)
223
224 parser = argparse.ArgumentParser(description='genSeqErrorModel.py')
225 parser.add_argument('-i', type=str, required=True, metavar='<str>', help="* input_read1.fq (.gz) / input_read1.sam")
226 parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output.p")
227 parser.add_argument('-i2', type=str, required=False, metavar='<str>', default=None, help="input_read2.fq (.gz) / input_read2.sam")
228 parser.add_argument('-p', type=str, required=False, metavar='<str>', default=None, help="input_alignment.pileup")
229 parser.add_argument('-q', type=int, required=False, metavar='<int>', default=33, help="quality score offset [33]")
230 parser.add_argument('-Q', type=int, required=False, metavar='<int>', default=41, help="maximum quality score [41]")
231 parser.add_argument('-n', type=int, required=False, metavar='<int>', default=-1, help="maximum number of reads to process [all]")
232 parser.add_argument('-s', type=int, required=False, metavar='<int>', default=1000000, help="number of simulation iterations [1000000]")
233 parser.add_argument('--plot', required=False, action='store_true', default=False, help='perform some optional plotting')
234 args = parser.parse_args()
235
236 (INF, OUF, offQ, maxQ, MAX_READS, N_SAMP) = (args.i, args.o, args.q, args.Q, args.n, args.s)
237 (INF2, PILEUP) = (args.i2, args.p)
238
239 RQ = maxQ+1
240
241 INIT_SMOOTH = 0.
242 PROB_SMOOTH = 0.
243 PRINT_EVERY = 10000
244 PLOT_STUFF = args.plot
245 if PLOT_STUFF:
246 print 'plotting is desired, lets import matplotlib...'
247 import matplotlib.pyplot as mpl
248
249 def main():
250
251 Qscores = range(RQ)
252 if INF2 == None:
253 (initQ, probQ, avgError) = parseFQ(INF)
254 else:
255 (initQ, probQ, avgError1) = parseFQ(INF)
256 (initQ2, probQ2, avgError2) = parseFQ(INF2)
257 avgError = (avgError1+avgError2)/2.
258
259 #
260 # embed some default sequencing error parameters if no pileup is provided
261 #
262 if PILEUP == None:
263
264 print 'Using default sequencing error parameters...'
265
266 # sequencing substitution transition probabilities
267 SSE_PROB = [[0., 0.4918, 0.3377, 0.1705 ],
268 [0.5238, 0., 0.2661, 0.2101 ],
269 [0.3754, 0.2355, 0., 0.3890 ],
270 [0.2505, 0.2552, 0.4942, 0. ]]
271 # if a sequencing error occurs, what are the odds it's an indel?
272 SIE_RATE = 0.01
273 # sequencing indel error length distribution
274 SIE_PROB = [0.999,0.001]
275 SIE_VAL = [1,2]
276 # if a sequencing indel error occurs, what are the odds it's an insertion as opposed to a deletion?
277 SIE_INS_FREQ = 0.4
278 # if a sequencing insertion error occurs, what's the probability of it being an A, C, G, T...
279 SIE_INS_NUCL = [0.25, 0.25, 0.25, 0.25]
280
281 #
282 # otherwise we need to parse a pileup and compute statistics!
283 #
284 else:
285 print '\nPileup parsing coming soon!\n'
286 exit(1)
287
288 errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
289
290 #
291 # finally, let's save our output model
292 #
293 print 'saving model...'
294 if INF2 == None:
295 pickle.dump([initQ,probQ,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))
296 else:
297 pickle.dump([initQ,probQ,initQ2,probQ2,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))
298
299 if __name__ == '__main__':
300 main()