diff utilities/genSeqErrorModel.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children | |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utilities/genSeqErrorModel.py	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,300 @@
+#!/usr/bin/env python
+
+#
+#
+#    genSeqErrorModel.py
+#    Computes sequencing error model for genReads.py
+#
+#
+#    Usage: python genSeqErrorModel.py -i input_reads.fq -o path/to/output_name.p
+#
+#
+
+
+import os
+import sys
+import gzip
+import random
+import numpy as np
+import argparse
+import cPickle as pickle
+
+# absolute path to this script
+SIM_PATH = '/'.join(os.path.realpath(__file__).split('/')[:-2])+'/py/'
+sys.path.append(SIM_PATH)
+
+from probability import DiscreteDistribution
+
+def parseFQ(inf):
+    print 'reading '+inf+'...'
+    if inf[-3:] == '.gz':
+        print 'detected gzip suffix...'
+        f = gzip.open(inf,'r')
+    else:
+        f = open(inf,'r')
+
+    IS_SAM = False
+    if inf[-4:] == '.sam':
+        print 'detected sam input...'
+        IS_SAM = True
+
+    rRead = 0
+    actual_readlen = 0
+    qDict = {}
+    while True:
+
+        if IS_SAM:
+            data4 = f.readline()
+            if not len(data4):
+                break
+            try:
+                data4 = data4.split('\t')[10]
+            except IndexError:
+                break
+            # need to add some input checking here? Yup, probably.
+        else:
+            data1 = f.readline()
+            data2 = f.readline()
+            data3 = f.readline()
+            data4 = f.readline()
+            if not all([data1,data2,data3,data4]):
+                break
+
+        if actual_readlen == 0:
+            if inf[-3:] != '.gz' and not IS_SAM:
+                totalSize = os.path.getsize(inf)
+                entrySize = sum([len(n) for n in [data1,data2,data3,data4]])
+                print 'estimated number of reads in file:',int(float(totalSize)/entrySize)
+            actual_readlen = len(data4)-1
+            print 'assuming read length is uniform...'
+            print 'detected read length (from first read found):',actual_readlen
+            priorQ = np.zeros([actual_readlen,RQ])
+            totalQ = [None] + [np.zeros([RQ,RQ]) for n in xrange(actual_readlen-1)]
+
+        # sanity-check readlengths
+        if len(data4)-1 != actual_readlen:
+            print 'skipping read with unexpected length...'
+            continue
+
+        for i in range(len(data4)-1):
+            q = ord(data4[i])-offQ
+            qDict[q] = True
+            if i == 0:
+                priorQ[i][q] += 1
+            else:
+                totalQ[i][prevQ,q] += 1
+                priorQ[i][q] += 1
+            prevQ = q
+
+        rRead += 1
+        if rRead%PRINT_EVERY == 0:
+            print rRead
+        if MAX_READS > 0 and rRead >= MAX_READS:
+            break
+    f.close()
+
+    # some sanity checking again...
+    QRANGE = [min(qDict.keys()),max(qDict.keys())]
+    if QRANGE[0] < 0:
+        print '\nError: Read in Q-scores below 0\n'
+        exit(1)
+    if QRANGE[1] > RQ:
+        print '\nError: Read in Q-scores above specified maximum:',QRANGE[1],'>',RQ,'\n'
+        exit(1)
+
+    print 'computing probabilities...'
+    probQ = [None] + [[[0. for m in xrange(RQ)] for n in xrange(RQ)] for p in xrange(actual_readlen-1)]
+    for p in xrange(1,actual_readlen):
+        for i in xrange(RQ):
+            rowSum = float(np.sum(totalQ[p][i,:]))+PROB_SMOOTH*RQ
+            if rowSum <= 0.:
+                continue
+            for j in xrange(RQ):
+                probQ[p][i][j] = (totalQ[p][i][j]+PROB_SMOOTH)/rowSum
+
+    initQ = [[0. for m in xrange(RQ)] for n in xrange(actual_readlen)]
+    for i in xrange(actual_readlen):
+        rowSum = float(np.sum(priorQ[i,:]))+INIT_SMOOTH*RQ
+        if rowSum <= 0.:
+            continue
+        for j in xrange(RQ):
+            initQ[i][j] = (priorQ[i][j]+INIT_SMOOTH)/rowSum
+
+    if PLOT_STUFF:
+        mpl.rcParams.update({'font.size': 14, 'font.weight':'bold', 'lines.linewidth': 3})
+
+        mpl.figure(1)
+        Z = np.array(initQ).T
+        X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
+        mpl.pcolormesh(X,Y,Z,vmin=0.,vmax=0.25)
+        mpl.axis([0,len(Z[0]),0,len(Z)])
+        mpl.yticks(range(0,len(Z),10),range(0,len(Z),10))
+        mpl.xticks(range(0,len(Z[0]),10),range(0,len(Z[0]),10))
+        mpl.xlabel('Read Position')
+        mpl.ylabel('Quality Score')
+        mpl.title('Q-Score Prior Probabilities')
+        mpl.colorbar()
+
+        mpl.show()
+
+        VMIN_LOG = [-4,0]
+        minVal   = 10**VMIN_LOG[0]
+        qLabels  = [str(n) for n in range(QRANGE[0],QRANGE[1]+1) if n%5==0]
+        print qLabels
+        qTicksx  = [int(n)+0.5 for n in qLabels]
+        qTicksy  = [(RQ-int(n))-0.5 for n in qLabels]
+
+        for p in xrange(1,actual_readlen,10):
+            currentDat = np.array(probQ[p])
+            for i in xrange(len(currentDat)):
+                for j in xrange(len(currentDat[i])):
+                    currentDat[i][j] = max(minVal,currentDat[i][j])
+
+            # matrix indices:        pcolormesh plotting:    plot labels and axes:
+            #
+            #      y                     ^                       ^
+            #   -->                    x |                     y |
+            #  x |                        -->                     -->
+            #  v                           y                       x
+            #
+            # to plot a MxN matrix 'Z' with rowNames and colNames we need to:
+            #
+            #    pcolormesh(X,Y,Z[::-1,:])    # invert x-axis
+            #    # swap x/y axis parameters and labels, remember x is still inverted:
+            #    xlim([yMin,yMax])
+            #    ylim([M-xMax,M-xMin])
+            #    xticks()
+            #
+
+            mpl.figure(p+1)
+            Z = np.log10(currentDat)
+            X, Y = np.meshgrid( range(0,len(Z[0])+1), range(0,len(Z)+1) )
+            mpl.pcolormesh(X,Y,Z[::-1,:],vmin=VMIN_LOG[0],vmax=VMIN_LOG[1],cmap='jet')
+            mpl.xlim([QRANGE[0],QRANGE[1]+1])
+            mpl.ylim([RQ-QRANGE[1]-1,RQ-QRANGE[0]])
+            mpl.yticks(qTicksy,qLabels)
+            mpl.xticks(qTicksx,qLabels)
+            mpl.xlabel('\n' + r'$Q_{i+1}$')
+            mpl.ylabel(r'$Q_i$')
+            mpl.title('Q-Score Transition Frequencies [Read Pos:'+str(p)+']')
+            cb = mpl.colorbar()
+            cb.set_ticks([-4,-3,-2,-1,0])
+            cb.set_ticklabels([r'$10^{-4}$',r'$10^{-3}$',r'$10^{-2}$',r'$10^{-1}$',r'$10^{0}$'])
+
+            #mpl.tight_layout()
+            mpl.show()
+
+    print 'estimating average error rate via simulation...'
+    Qscores = range(RQ)
+    #print (len(initQ), len(initQ[0]))
+    #print (len(probQ), len(probQ[1]), len(probQ[1][0]))
+
+    initDistByPos        = [DiscreteDistribution(initQ[i],Qscores) for i in xrange(len(initQ))]
+    probDistByPosByPrevQ = [None]
+    for i in xrange(1,len(initQ)):
+        probDistByPosByPrevQ.append([])
+        for j in xrange(len(initQ[0])):
+            if np.sum(probQ[i][j]) <= 0.:    # if we don't have sufficient data for a transition, use the previous qscore
+                probDistByPosByPrevQ[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
+            else:
+                probDistByPosByPrevQ[-1].append(DiscreteDistribution(probQ[i][j],Qscores))
+
+    countDict = {}
+    for q in Qscores:
+        countDict[q] = 0
+    for samp in xrange(1,N_SAMP+1):
+        if samp%PRINT_EVERY == 0:
+            print samp
+        myQ = initDistByPos[0].sample()
+        countDict[myQ] += 1
+        for i in xrange(1,len(initQ)):
+            myQ = probDistByPosByPrevQ[i][myQ].sample()
+            countDict[myQ] += 1
+
+    totBases = float(sum(countDict.values()))
+    avgError = 0.
+    for k in sorted(countDict.keys()):
+        eVal = 10.**(-k/10.)
+        #print k, eVal, countDict[k]
+        avgError += eVal * (countDict[k]/totBases)
+    print 'AVG ERROR RATE:',avgError
+
+    return (initQ, probQ, avgError)
+
+parser = argparse.ArgumentParser(description='genSeqErrorModel.py')
+parser.add_argument('-i',  type=str, required=True,  metavar='<str>',                  help="* input_read1.fq (.gz) / input_read1.sam")
+parser.add_argument('-o',  type=str, required=True,  metavar='<str>',                  help="* output.p")
+parser.add_argument('-i2', type=str, required=False, metavar='<str>', default=None,    help="input_read2.fq (.gz) / input_read2.sam")
+parser.add_argument('-p',  type=str, required=False, metavar='<str>', default=None,    help="input_alignment.pileup")
+parser.add_argument('-q',  type=int, required=False, metavar='<int>', default=33,      help="quality score offset [33]")
+parser.add_argument('-Q',  type=int, required=False, metavar='<int>', default=41,      help="maximum quality score [41]")
+parser.add_argument('-n',  type=int, required=False, metavar='<int>', default=-1,      help="maximum number of reads to process [all]")
+parser.add_argument('-s',  type=int, required=False, metavar='<int>', default=1000000, help="number of simulation iterations [1000000]")
+parser.add_argument('--plot', required=False, action='store_true', default=False, help='perform some optional plotting')
+args = parser.parse_args()
+
+(INF, OUF, offQ, maxQ, MAX_READS, N_SAMP) = (args.i, args.o, args.q, args.Q, args.n, args.s)
+(INF2, PILEUP) = (args.i2, args.p)
+
+RQ = maxQ+1
+
+INIT_SMOOTH = 0.
+PROB_SMOOTH = 0.
+PRINT_EVERY = 10000
+PLOT_STUFF  = args.plot
+if PLOT_STUFF:
+    print 'plotting is desired, lets import matplotlib...'
+    import matplotlib.pyplot as mpl
+
+def main():
+
+    Qscores = range(RQ)
+    if INF2 == None:
+        (initQ, probQ, avgError) = parseFQ(INF)
+    else:
+        (initQ,  probQ,  avgError1) = parseFQ(INF)
+        (initQ2, probQ2, avgError2) = parseFQ(INF2)
+        avgError = (avgError1+avgError2)/2.
+
+    #
+    #    embed some default sequencing error parameters if no pileup is provided
+    #
+    if PILEUP == None:
+
+        print 'Using default sequencing error parameters...'
+
+        # sequencing substitution transition probabilities
+        SSE_PROB = [[0.,     0.4918, 0.3377, 0.1705 ],
+                    [0.5238, 0.,     0.2661, 0.2101 ],
+                    [0.3754, 0.2355, 0.,     0.3890 ],
+                    [0.2505, 0.2552, 0.4942, 0.     ]]
+        # if a sequencing error occurs, what are the odds it's an indel?
+        SIE_RATE = 0.01
+        # sequencing indel error length distribution
+        SIE_PROB = [0.999,0.001]
+        SIE_VAL  = [1,2]
+        # if a sequencing indel error occurs, what are the odds it's an insertion as opposed to a deletion?
+        SIE_INS_FREQ = 0.4
+        # if a sequencing insertion error occurs, what's the probability of it being an A, C, G, T...
+        SIE_INS_NUCL = [0.25, 0.25, 0.25, 0.25]
+
+    #
+    #    otherwise we need to parse a pileup and compute statistics!
+    #
+    else:
+        print '\nPileup parsing coming soon!\n'
+        exit(1)
+
+    errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
+
+    #
+    #    finally, let's save our output model
+    #
+    print 'saving model...'
+    if INF2 == None:
+        pickle.dump([initQ,probQ,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))
+    else:
+        pickle.dump([initQ,probQ,initQ2,probQ2,Qscores,offQ,avgError,errorParams],open(OUF,'wb'))
+
+if __name__ == '__main__':
+    main()
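
For reference, a minimal sketch (not part of the changeset) of how the saved model could be inspected after running the script. It assumes a single-end model, i.e. the script was run without `-i2`, so `pickle.dump` wrote the six-element list shown above; `model.p` is a hypothetical output path. A paired-end model adds `initQ2` and `probQ2` to the list.

```python
# Hypothetical example, not part of the changeset: load a single-end error model
# produced by genSeqErrorModel.py and print its summary values.
# 'model.p' is a placeholder output path.
import cPickle as pickle

with open('model.p', 'rb') as f:
    initQ, probQ, Qscores, offQ, avgError, errorParams = pickle.load(f)

print 'modeled read length:      ', len(initQ)
print 'quality score range:      ', Qscores[0], '-', Qscores[-1]
print 'quality score offset:     ', offQ
print 'simulated avg error rate: ', avgError
# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
print 'indel fraction of errors: ', errorParams[1]
```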
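A second sketch, also not part of the changeset, of what the two matrices encode: `initQ[i]` is the quality-score prior at read position `i` and `probQ[p]` is the transition matrix conditioned on the previous score, so a quality string can be drawn position by position, mirroring the Markov-chain sampling the script uses for its average-error estimate. `numpy.random.choice` stands in for the repository's `DiscreteDistribution` helper, and the hypothetical `model.p` from the example above is reused.

```python
# Hypothetical example, not part of the changeset: draw one quality string from a
# single-end model, position by position, using numpy.random.choice as a stand-in
# for the repo's DiscreteDistribution helper.
import cPickle as pickle
import numpy as np

with open('model.p', 'rb') as f:
    initQ, probQ, Qscores, offQ, avgError, errorParams = pickle.load(f)

def sample_quality_string(initQ, probQ, offQ):
    scores = range(len(initQ[0]))
    p0 = np.array(initQ[0])
    q = int(np.random.choice(scores, p=p0/p0.sum()))   # position 0: prior distribution
    quals = [q]
    for pos in range(1, len(initQ)):
        row = np.array(probQ[pos][q])
        if row.sum() <= 0.:
            # no observed transitions from this score: keep the previous one,
            # as the script does with its degenerate DiscreteDistribution
            quals.append(q)
            continue
        q = int(np.random.choice(scores, p=row/row.sum()))
        quals.append(q)
    return ''.join([chr(v+offQ) for v in quals])

print sample_quality_string(initQ, probQ, offQ)
```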
