neat_genreads: comparison of utilities/genSeqErrorModel.py @ 0:6e75a84e9338 (draft)
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children | |
comparison: -1:000000000000 (null revision; file added) -> 0:6e75a84e9338
```python
#!/usr/bin/env python

#
#
#    genSeqErrorModel.py
#    Computes sequencing error model for genReads.py
#
#
#    Usage: python genSeqErrorModel.py -i input_reads.fq -o path/to/output_name.p
#
#


import os
import sys
import gzip
import random
import numpy as np
import argparse
import cPickle as pickle

# absolute path to this script
SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])+'/py/'
sys.path.append(SIM_PATH)

from probability import DiscreteDistribution

def parseFQ(inf):
    print 'reading '+inf+'...'
    if inf[-3:] == '.gz':
        print 'detected gzip suffix...'
        f = gzip.open(inf,'r')
    else:
        f = open(inf,'r')

    IS_SAM = False
    if inf[-4:] == '.sam':
        print 'detected sam input...'
        IS_SAM = True

    rRead = 0
    actual_readlen = 0
    qDict = {}
    while True:

        if IS_SAM:
            data4 = f.readline()
            if not len(data4):
                break
            try:
                data4 = data4.split('\t')[10]
            except IndexError:
                break
            # need to add some input checking here? Yup, probably.
        else:
            data1 = f.readline()
            data2 = f.readline()
            data3 = f.readline()
            data4 = f.readline()
            if not all([data1,data2,data3,data4]):
                break

        if actual_readlen == 0:
            if inf[-3:] != '.gz' and not IS_SAM:
                totalSize = os.path.getsize(inf)
                entrySize = sum([len(n) for n in [data1,data2,data3,data4]])
                print 'estimated number of reads in file:',int(float(totalSize)/entrySize)
            actual_readlen = len(data4)-1
            print 'assuming read length is uniform...'
            print 'detected read length (from first read found):',actual_readlen
            priorQ = np.zeros([actual_readlen,RQ])
            totalQ = [None] + [np.zeros([RQ,RQ]) for n in xrange(actual_readlen-1)]

        # sanity-check readlengths
        if len(data4)-1 != actual_readlen:
            print 'skipping read with unexpected length...'
            continue

        for i in range(len(data4)-1):
            q = ord(data4[i])-offQ
            qDict[q] = True
            if i == 0:
                priorQ[i][q] += 1
            else:
                totalQ[i][prevQ,q] += 1
                priorQ[i][q] += 1
            prevQ = q

        rRead += 1
        if rRead%PRINT_EVERY == 0:
            print rRead
        if MAX_READS > 0 and rRead >= MAX_READS:
            break
    f.close()

    # some sanity checking again...
    QRANGE = [min(qDict.keys()),max(qDict.keys())]
    if QRANGE[0] < 0:
        print '\nError: Read in Q-scores below 0\n'
        exit(1)
    if QRANGE[1] > RQ:
        print '\nError: Read in Q-scores above specified maximum:',QRANGE[1],'>',RQ,'\n'
        exit(1)

    print 'computing probabilities...'
    probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)] for p in xrange(actual_readlen-1)]
    for p in xrange(1,actual_readlen):
        for i in xrange(RQ):
            rowSum = float(np.sum(totalQ[p][i,:]))+PROB_SMOOTH*RQ
            if rowSum <= 0.:
                continue
            for j in xrange(RQ):
                probQ[p][i][j] = (totalQ[p][i][j]+PROB_SMOOTH)/rowSum

    initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
    for i in xrange(actual_readlen):
        rowSum = float(np.sum(priorQ[i,:]))+INIT_SMOOTH*RQ
        if rowSum <= 0.:
            continue
        for j in xrange(RQ):
            initQ[i][j] = (priorQ[i][j]+INIT_SMOOTH)/rowSum

    if PLOT_STUFF:
        mpl.rcParams.update({'font.size': 14, 'font.weight':'bold', 'lines.linewidth': 3})

        mpl.figure(1)
        Z = np.array(initQ).T
        X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
        mpl.pcolormesh(X,Y,Z,vmin=0.,vmax=0.25)
        mpl.axis([0,len(Z[0]),0,len(Z)])
        mpl.yticks(range(0,len(Z),10),range(0,len(Z),10))
        mpl.xticks(range(0,len(Z[0]),10),range(0,len(Z[0]),10))
        mpl.xlabel('Read Position')
        mpl.ylabel('Quality Score')
        mpl.title('Q-Score Prior Probabilities')
        mpl.colorbar()

        mpl.show()

        VMIN_LOG = [-4,0]
        minVal = 10**VMIN_LOG[0]
        qLabels = [str(n) for n in range(QRANGE[0],QRANGE[1]+1) if n%5==0]
        print qLabels
        qTicksx = [int(n)+0.5 for n in qLabels]
        qTicksy = [(RQ-int(n))-0.5 for n in qLabels]

        for p in xrange(1,actual_readlen,10):
            currentDat = np.array(probQ[p])
            for i in xrange(len(currentDat)):
                for j in xrange(len(currentDat[i])):
                    currentDat[i][j] = max(minVal,currentDat[i][j])

            # matrix indices:        pcolormesh plotting:     plot labels and axes:
            #
            #       y                        ^                        ^
            #      -->                    x  |                     y  |
            #   x |                          -->                      -->
            #     v                           y                        x
            #
            # to plot a MxN matrix 'Z' with rowNames and colNames we need to:
            #
            #    pcolormesh(X,Y,Z[::-1,:])      # invert x-axis
            #    # swap x/y axis parameters and labels, remember x is still inverted:
            #    xlim([yMin,yMax])
            #    ylim([M-xMax,M-xMin])
            #    xticks()
            #

            mpl.figure(p+1)
            Z = np.log10(currentDat)
            X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
            mpl.pcolormesh(X,Y,Z[::-1,:],vmin=VMIN_LOG[0],vmax=VMIN_LOG[1],cmap='jet')
            mpl.xlim([QRANGE[0],QRANGE[1]+1])
            mpl.ylim([RQ-QRANGE[1]-1,RQ-QRANGE[0]])
            mpl.yticks(qTicksy,qLabels)
            mpl.xticks(qTicksx,qLabels)
            mpl.xlabel('\n' + r'$Q_{i+1}$')
            mpl.ylabel(r'$Q_i$')
            mpl.title('Q-Score Transition Frequencies [Read Pos:'+str(p)+']')
            cb = mpl.colorbar()
            cb.set_ticks([-4,-3,-2,-1,0])
            cb.set_ticklabels([r'$10^{-4}$',r'$10^{-3}$',r'$10^{-2}$',r'$10^{-1}$',r'$10^{0}$'])

        #mpl.tight_layout()
        mpl.show()

    print 'estimating average error rate via simulation...'
    Qscores = range(RQ)
    #print (len(initQ), len(initQ[0]))
    #print (len(probQ), len(probQ[1]), len(probQ[1][0]))

    initDistByPos = [DiscreteDistribution(initQ[i],Qscores) for i in xrange(len(initQ))]
    probDistByPosByPrevQ = [None]
    for i in xrange(1,len(initQ)):
        probDistByPosByPrevQ.append([])
        for j in xrange(len(initQ[0])):
            if np.sum(probQ[i][j]) <= 0.:    # if we don't have sufficient data for a transition, use the previous qscore
                probDistByPosByPrevQ[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
            else:
                probDistByPosByPrevQ[-1].append(DiscreteDistribution(probQ[i][j],Qscores))

    countDict = {}
    for q in Qscores:
        countDict[q] = 0
    for samp in xrange(1,N_SAMP+1):
        if samp%PRINT_EVERY == 0:
            print samp
        myQ = initDistByPos[0].sample()
        countDict[myQ] += 1
        for i in xrange(1,len(initQ)):
            myQ = probDistByPosByPrevQ[i][myQ].sample()
            countDict[myQ] += 1

    totBases = float(sum(countDict.values()))
    avgError = 0.
    for k in sorted(countDict.keys()):
        eVal = 10.**(-k/10.)
        #print k, eVal, countDict[k]
        avgError += eVal * (countDict[k]/totBases)
    print 'AVG ERROR RATE:',avgError

    return (initQ, probQ, avgError)

parser = argparse.ArgumentParser(description='genSeqErrorModel.py')
parser.add_argument('-i', type=str, required=True, metavar='<str>', help="* input_read1.fq (.gz) / input_read1.sam")
parser.add_argument('-o', type=str, required=True, metavar='<str>', help="* output.p")
parser.add_argument('-i2', type=str, required=False, metavar='<str>', default=None, help="input_read2.fq (.gz) / input_read2.sam")
parser.add_argument('-p', type=str, required=False, metavar='<str>', default=None, help="input_alignment.pileup")
parser.add_argument('-q', type=int, required=False, metavar='<int>', default=33, help="quality score offset [33]")
parser.add_argument('-Q', type=int, required=False, metavar='<int>', default=41, help="maximum quality score [41]")
parser.add_argument('-n', type=int, required=False, metavar='<int>', default=-1, help="maximum number of reads to process [all]")
parser.add_argument('-s', type=int, required=False, metavar='<int>', default=1000000, help="number of simulation iterations [1000000]")
parser.add_argument('--plot', required=False, action='store_true', default=False, help='perform some optional plotting')
args = parser.parse_args()

(INF, OUF, offQ, maxQ, MAX_READS, N_SAMP) = (args.i, args.o, args.q, args.Q, args.n, args.s)
(INF2, PILEUP) = (args.i2, args.p)

RQ = maxQ+1

INIT_SMOOTH = 0.
PROB_SMOOTH = 0.
PRINT_EVERY = 10000
PLOT_STUFF = args.plot
if PLOT_STUFF:
    print 'plotting is desired, lets import matplotlib...'
    import matplotlib.pyplot as mpl

def main():

    Qscores = range(RQ)
    if INF2 == None:
        (initQ, probQ, avgError) = parseFQ(INF)
    else:
        (initQ, probQ, avgError1) = parseFQ(INF)
        (initQ2, probQ2, avgError2) = parseFQ(INF2)
        avgError = (avgError1+avgError2)/2.

    #
    #    embed some default sequencing error parameters if no pileup is provided
    #
    if PILEUP == None:

        print 'Using default sequencing error parameters...'

        # sequencing substitution transition probabilities
        SSE_PROB = [[0.,     0.4918, 0.3377, 0.1705],
                    [0.5238, 0.,     0.2661, 0.2101],
                    [0.3754, 0.2355, 0.,     0.3890],
                    [0.2505, 0.2552, 0.4942, 0.    ]]
        # if a sequencing error occurs, what are the odds it's an indel?
        SIE_RATE = 0.01
        # sequencing indel error length distribution
        SIE_PROB = [0.999,0.001]
        SIE_VAL = [1,2]
        # if a sequencing indel error occurs, what are the odds it's an insertion as opposed to a deletion?
        SIE_INS_FREQ = 0.4
        # if a sequencing insertion error occurs, what's the probability of it being an A, C, G, T...
        SIE_INS_NUCL = [0.25, 0.25, 0.25, 0.25]

    #
    #    otherwise we need to parse a pileup and compute statistics!
    #
    else:
        print '\nPileup parsing coming soon!\n'
        exit(1)

    errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]

    #
    #    finally, let's save our output model
    #
    print 'saving model...'
    if INF2 == None:
        pickle.dump([initQ,probQ,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))
    else:
        pickle.dump([initQ,probQ,initQ2,probQ2,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))

if __name__ == '__main__':
    main()
```
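For a quick sanity check of a generated model file, the pickled list can be read back and unpacked in the same order `pickle.dump()` writes it above. This is a minimal sketch, not part of the repository: it assumes the single-end layout (the paired-end dump additionally stores `initQ2`/`probQ2`) and uses a placeholder output path.

```python
# Minimal sketch (assumptions: single-end model layout, placeholder file path).
import cPickle as pickle

with open('path/to/output_name.p', 'rb') as f:
    model = pickle.load(f)

# Fields in the order genSeqErrorModel.py saves them for single-end input.
initQ, probQ, Qscores, offQ, avgError, errorParams = model

print 'read positions modeled:   ', len(initQ)
print 'quality score offset:     ', offQ
print 'simulated avg error rate: ', avgError
```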
