comparison py/SequenceContainer.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
import random
import copy
import re
import os
import bisect
import cPickle as pickle
import numpy as np

from probability import DiscreteDistribution, poisson_list, quantize_list
from cigar import CigarString

MAX_ATTEMPTS = 100    # max attempts to insert a mutation into a valid position
MAX_MUTFRAC = 0.3     # the maximum fraction of a window that can contain mutations

NUCL = ['A', 'C', 'G', 'T']
TRI_IND = {'AA': 0, 'AC': 1, 'AG': 2, 'AT': 3, 'CA': 4, 'CC': 5, 'CG': 6, 'CT': 7,
           'GA': 8, 'GC': 9, 'GG': 10, 'GT': 11, 'TA': 12, 'TC': 13, 'TG': 14, 'TT': 15}
NUC_IND = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
ALL_TRI = [NUCL[i]+NUCL[j]+NUCL[k] for i in xrange(len(NUCL)) for j in xrange(len(NUCL)) for k in xrange(len(NUCL))]
ALL_IND = {ALL_TRI[i]: i for i in xrange(len(ALL_TRI))}
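# e.g. ALL_IND['AAA'] == 0 and ALL_IND['TTT'] == 63; TRI_IND is keyed by the
# flanking (left, right) bases of a trinucleotide, e.g. TRI_IND['AT'] == 3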

# DEBUG
IGNORE_TRINUC = False

# percentile resolution used for fraglen quantizing
COV_FRAGLEN_PERCENTILE = 10.
LARGE_NUMBER = 9999999999

#
# Container for reference sequences, applies mutations
#
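# Illustrative usage (hypothetical variable names; the caller normally prepares
# the window sequence, mutation models and coverage data elsewhere):
#
#   sc = SequenceContainer(0, windowSeq, 2, 10, 101)
#   sc.insert_mutations(userVariants)    # optional user-supplied variants
#   vcfRecords = sc.random_mutations()   # mutate sequences, collect variants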
class SequenceContainer:
    def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, onlyVCF=False):
        # initialize basic variables
        self.onlyVCF = onlyVCF
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
        # initialize mutation models
        self.init_mutModels(mutationModels, mutRate)
        # sample the number of variants that will be inserted into each ploid
        self.init_poisson()
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        # initialize trinuc snp bias
        self.init_trinucBias()

    def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen):
        self.x = xOffset
        self.ploidy = ploidy
        self.readLen = readLen
        self.sequences = [bytearray(sequence) for n in xrange(self.ploidy)]
        self.seqLen = len(sequence)
        self.indelList = [[] for n in xrange(self.ploidy)]
        self.snpList = [[] for n in xrange(self.ploidy)]
        self.allCigar = [[] for n in xrange(self.ploidy)]
        self.FM_pos = [[] for n in xrange(self.ploidy)]
        self.FM_span = [[] for n in xrange(self.ploidy)]
        self.adj = [None for n in xrange(self.ploidy)]
        # blackList[ploid][pos] = 0   safe to insert variant here
        # blackList[ploid][pos] = 1   indel inserted here
        # blackList[ploid][pos] = 2   snp inserted here
        # blackList[ploid][pos] = 3   invalid position for various processing reasons
        self.blackList = [np.zeros(self.seqLen, dtype='<i4') for n in xrange(self.ploidy)]

        # disallow mutations to occur on window overlap points
        self.winBuffer = windowOverlap
        for p in xrange(self.ploidy):
            self.blackList[p][-self.winBuffer] = 3
            self.blackList[p][-self.winBuffer-1] = 3

    def init_coverage(self, coverageDat, fragDist=None):
        # if we're only creating a vcf, skip some expensive initialization related to coverage depth
        if not self.onlyVCF:
            (self.windowSize, gc_scalars, targetCov_vals) = coverageDat
            gcCov_vals = [[] for n in self.sequences]
            trCov_vals = [[] for n in self.sequences]
            self.coverage_distribution = []
            avg_out = []
            for i in xrange(len(self.sequences)):
                # compute gc-bias
                j = 0
                while j+self.windowSize < len(self.sequences[i]):
                    gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
                    gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
                    j += self.windowSize
                gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
                gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
                #
                trCov_vals[i].append(targetCov_vals[0])
                prevVal = self.FM_pos[i][0]
                for j in xrange(1, len(self.sequences[i])-self.readLen):
                    if self.FM_pos[i][j] is None:
                        trCov_vals[i].append(targetCov_vals[prevVal])
                    else:
                        trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
                        prevVal = self.FM_pos[i][j]
                    #print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
                # shift by half of read length
                trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
                # fill in missing indices
                trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))

                #
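                # prefix sums of per-position coverage weight: the total weight
                # of any window is then covvec[b] - covvec[a], computed in O(1)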
                covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
                coverage_vals = []
                for j in xrange(0, len(self.sequences[i])-self.readLen):
                    coverage_vals.append(covvec[j+self.readLen] - covvec[j])
                avg_out.append(np.mean(coverage_vals)/float(self.readLen))

                if fragDist is None:
                    self.coverage_distribution.append(DiscreteDistribution(coverage_vals, range(len(coverage_vals))))

                # fragment length nightmare
                else:
                    currentThresh = 0.
                    index_list = [0]
                    for j in xrange(len(fragDist.cumP)):
                        if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
                            currentThresh = fragDist.cumP[j]
                            index_list.append(j)
                    flq = [fragDist.values[nnn] for nnn in index_list]
                    if fragDist.values[-1] not in flq:
                        flq.append(fragDist.values[-1])
                    flq.append(LARGE_NUMBER)

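                    # map every observed fragment length to its nearest quantized
                    # value, so we only need one coverage distribution per bin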
                    self.fraglens_indMap = {}
                    for j in fragDist.values:
                        bInd = bisect.bisect(flq, j)
                        if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
                            self.fraglens_indMap[j] = flq[bInd-1]
                        else:
                            self.fraglens_indMap[j] = flq[bInd]

                    self.coverage_distribution.append({})
                    for flv in sorted(list(set(self.fraglens_indMap.values()))):
                        buffer_val = self.readLen
                        for j in fragDist.values:
                            if self.fraglens_indMap[j] == flv and j > buffer_val:
                                buffer_val = j
                        coverage_vals = []
                        for j in xrange(len(self.sequences[i])-buffer_val):
                            coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])

                        # EXPERIMENTAL
                        #quantized_covVals = quantize_list(coverage_vals)
                        #self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])

                        # TESTING
                        #import matplotlib.pyplot as mpl
                        #print len(coverage_vals),'-->',len(quantized_covVals)
                        #mpl.figure(0)
                        #mpl.plot(range(len(coverage_vals)),coverage_vals)
                        #for qcv in quantized_covVals:
                        #    mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
                        #mpl.show()
                        #exit(1)

                        self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))

            return np.mean(avg_out)

    def init_mutModels(self, mutationModels, mutRate):
        if mutationModels == []:
            ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
            self.modelData = ml[:self.ploidy]
        else:
            if len(mutationModels) != self.ploidy:
                print '\nError: Number of mutation models received is not equal to specified ploidy\n'
                exit(1)
            self.modelData = copy.deepcopy(mutationModels)

        # do we need to rescale mutation frequencies?
        mutRateSum = sum([n[0] for n in self.modelData])
        self.mutRescale = mutRate
        if self.mutRescale is None:
            self.mutScalar = 1.0
        else:
            self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))

        # how are mutations spread to each ploid, based on their specified mut rates?
        self.ploidMutFrac = [float(n[0])/mutRateSum for n in self.modelData]
        self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac, range(self.ploidy))

        # init mutation models
        #
        # self.models[ploid][0] = average mutation rate
        # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
        # self.models[ploid][2] = p(mut is indel | mut occurs)
        # self.models[ploid][3] = p(insertion | indel occurs)
        # self.models[ploid][4] = distribution of insertion lengths
        # self.models[ploid][5] = distribution of deletion lengths
        # self.models[ploid][6] = distribution of trinucleotide SNP transitions
        # self.models[ploid][7] = p(trinuc mutates)
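        #
        # (models[ploid][6] is a 16x4 grid of DiscreteDistributions: one per
        #  flanking-context / ref-base pair; models[ploid][7] is indexed by
        #  ALL_IND over all 64 trinucleotides)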
        self.models = []
        for n in self.modelData:
            self.models.append([self.mutScalar*n[0], n[1], n[2], n[3], DiscreteDistribution(n[5], n[4]), DiscreteDistribution(n[7], n[6]), []])
            for m in n[8]:
                self.models[-1][6].append([DiscreteDistribution(m[0], NUCL),
                                           DiscreteDistribution(m[1], NUCL),
                                           DiscreteDistribution(m[2], NUCL),
                                           DiscreteDistribution(m[3], NUCL)])
            self.models[-1].append([m for m in n[9]])

    def init_poisson(self):
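        # expected indel/snp counts per ploid follow a Poisson with
        # lambda = seqLen * mutRate * p(indel) * ploidMutFrac (1-p(indel) for snps)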
        ind_l_list = [self.seqLen*self.models[i][0]*self.models[i][2]*self.ploidMutFrac[i] for i in xrange(len(self.models))]
        snp_l_list = [self.seqLen*self.models[i][0]*(1.-self.models[i][2])*self.ploidMutFrac[i] for i in xrange(len(self.models))]
        k_range = range(int(self.seqLen*MAX_MUTFRAC))
        self.ind_pois = [poisson_list(k_range, ind_l_list[n]) for n in xrange(len(self.models))]
        self.snp_pois = [poisson_list(k_range, snp_l_list[n]) for n in xrange(len(self.models))]

    def init_trinucBias(self):
        # compute mutation positional bias given trinucleotide strings of the sequence (ONLY AFFECTS SNPs)
        #
        # note: since indels are added before snps, it's possible these positional biases aren't correctly
        #       utilized at positions affected by indels. At the moment I'm going to consider this negligible.
        trinuc_snp_bias = [[0. for n in xrange(self.seqLen)] for m in xrange(self.ploidy)]
        self.trinuc_bias = [None for n in xrange(self.ploidy)]
        for p in xrange(self.ploidy):
            for i in xrange(self.winBuffer+1, self.seqLen-1):
                trinuc_snp_bias[p][i] = self.models[p][7][ALL_IND[str(self.sequences[p][i-1:i+2])]]
            self.trinuc_bias[p] = DiscreteDistribution(trinuc_snp_bias[p][self.winBuffer+1:self.seqLen-1], range(self.winBuffer+1, self.seqLen-1))

    def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None):
        # if mutation model is changed, we have to reinitialize it...
        if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
            self.ploidy = ploidy
            self.mutRescale = mutRate
            self.init_mutModels(mutationModels, mutRate)
        # if sequence length is different than previous window, we have to redo snp/indel poissons
        if len(sequence) != self.seqLen:
            self.seqLen = len(sequence)
            self.init_poisson()
        # basic vars
        self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
        self.indelsToAdd = [n.sample() for n in self.ind_pois]
        self.snpsToAdd = [n.sample() for n in self.snp_pois]
        #print (self.indelsToAdd,self.snpsToAdd)
        # initialize trinuc snp bias
        if not IGNORE_TRINUC:
            self.init_trinucBias()

    def insert_mutations(self, inputList):
        #
        # TODO!!!!!! user-input variants, determine which ploid to put it on, etc..
        #
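        # each inpV is expected to look like (pos, ref, [alt1, alt2, ...], ..., (genotypeString, ...));
        # only fields 0-2 and 4 are used here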
        for inpV in inputList:
            whichPloid = []
            wps = inpV[4][0]
            if wps is None:     # if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
                whichPloid.append(self.ploidMutPrior.sample())
                whichAlt = [0]
            else:
                #if 'WP=' in wps:
                #    whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
                #    print 'WHICH:', whichPloid
                #    whichAlt = [0]*len(whichPloid)
                #elif '/' in wps or '|' in wps:
                if '/' in wps or '|' in wps:
                    if '/' in wps:
                        splt = wps.split('/')
                    else:
                        splt = wps.split('|')
                    whichPloid = []
                    whichAlt = []
                    for i in xrange(len(splt)):
                        if splt[i] == '1':
                            whichPloid.append(i)
                            #whichAlt.append(int(splt[i])-1)
                    # assume we're just using first alt for inserted variants?
                    whichAlt = [0 for n in whichPloid]
                else:   # otherwise assume monoploidy
                    whichPloid = [0]
                    whichAlt = [0]

            # ignore invalid ploids
            for i in xrange(len(whichPloid)-1, -1, -1):
                if whichPloid[i] >= self.ploidy:
                    del whichPloid[i]

            for i in xrange(len(whichPloid)):
                p = whichPloid[i]
                myAlt = inpV[2][whichAlt[i]]
                myVar = (inpV[0]-self.x, inpV[1], myAlt)
                inLen = max([len(inpV[1]), len(myAlt)])
                #print myVar, chr(self.sequences[p][myVar[0]])
                if myVar[0] < 0 or myVar[0] >= len(self.blackList[p]):
                    print '\nError: Attempting to insert variant out of window bounds:'
                    print myVar, '--> blackList[0:'+str(len(self.blackList[p]))+']\n'
                    exit(1)
                if len(inpV[1]) == 1 and len(myAlt) == 1:
                    if self.blackList[p][myVar[0]]:
                        continue
                    self.snpList[p].append(myVar)
                    self.blackList[p][myVar[0]] = 2
                else:
                    # skip the whole indel if any position it touches is already occupied
                    indelConflict = False
                    for k in xrange(myVar[0], myVar[0]+inLen+1):
                        if self.blackList[p][k]:
                            indelConflict = True
                    if indelConflict:
                        continue
                    for k in xrange(myVar[0], myVar[0]+inLen+1):
                        self.blackList[p][k] = 1
                    self.indelList[p].append(myVar)

    def random_mutations(self):

        # add random indels
        all_indels = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.indelsToAdd[i]):
                if random.random() <= self.models[i][1]:    # insert homozygous indel
                    whichPloid = range(self.ploidy)
                else:                                       # insert heterozygous indel
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert indels
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    eventPos = random.randint(self.winBuffer, self.seqLen-1)
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                if random.random() <= self.models[i][3]:    # insertion
                    inLen = self.models[i][4].sample()
                    # sequence content of random insertions is uniformly random (change this later)
                    inSeq = ''.join([random.choice(NUCL) for n in xrange(inLen)])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl, refNucl+inSeq)
                else:                                       # deletion
                    inLen = self.models[i][5].sample()
                    if eventPos+inLen+1 >= len(self.sequences[i]):  # skip if deletion too close to boundary
                        continue
                    if inLen == 1:
                        inSeq = chr(self.sequences[i][eventPos+1])
                    else:
                        inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
                    refNucl = chr(self.sequences[i][eventPos])
                    myIndel = (eventPos, refNucl+inSeq, refNucl)

                # if event too close to boundary, skip. if event conflicts with other indel, skip.
                skipEvent = False
                if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
                    skipEvent = True
                if skipEvent:
                    continue
                for p in whichPloid:
                    for k in xrange(eventPos, eventPos+inLen+1):
                        if self.blackList[p][k]:
                            skipEvent = True
                if skipEvent:
                    continue

                for p in whichPloid:
                    for k in xrange(eventPos, eventPos+inLen+1):
                        self.blackList[p][k] = 1
                    all_indels[p].append(myIndel)

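        # merge in user-supplied indels; descending position order lets the code
        # below apply each indel without shifting the coordinates of the rest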
        for i in xrange(len(all_indels)):
            all_indels[i].extend(self.indelList[i])
        all_indels = [sorted(n, reverse=True) for n in all_indels]
        #print all_indels

        # add random snps
        all_snps = [[] for n in self.sequences]
        for i in xrange(self.ploidy):
            for j in xrange(self.snpsToAdd[i]):
                if random.random() <= self.models[i][1]:    # insert homozygous SNP
                    whichPloid = range(self.ploidy)
                else:                                       # insert heterozygous SNP
                    whichPloid = [self.ploidMutPrior.sample()]

                # try to find suitable places to insert snps
                eventPos = -1
                for attempt in xrange(MAX_ATTEMPTS):
                    # based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
                    # (if there are multiple ploids, choose one at random)
                    if IGNORE_TRINUC:
                        eventPos = random.randint(self.winBuffer+1, self.seqLen-2)
                    else:
                        ploid_to_use = whichPloid[random.randint(0, len(whichPloid)-1)]
                        eventPos = self.trinuc_bias[ploid_to_use].sample()
                    for p in whichPloid:
                        if self.blackList[p][eventPos]:
                            eventPos = -1
                    if eventPos != -1:
                        break
                if eventPos == -1:
                    continue

                refNucl = chr(self.sequences[i][eventPos])
                context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
                # sample from tri-nucleotide substitution matrices to get SNP alt allele
                newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
                mySNP = (eventPos, refNucl, newNucl)

                for p in whichPloid:
                    all_snps[p].append(mySNP)
                    self.blackList[p][mySNP[0]] = 2

        # combine random snps with inserted snps, remove any snps that overlap indels
        for p in xrange(len(all_snps)):
            all_snps[p].extend(self.snpList[p])
            all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]

        # modify reference sequences
        for i in xrange(len(all_snps)):
            for j in xrange(len(all_snps[i])):
                # sanity checking (for debugging purposes)
                vPos = all_snps[i][j][0]
                if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
                    print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]), '\n'
                    exit(1)
                else:
                    self.sequences[i][vPos] = all_snps[i][j][2]

        adjToAdd = [[] for n in xrange(self.ploidy)]
        for i in xrange(len(all_indels)):
            for j in xrange(len(all_indels[i])):
                # sanity checking (for debugging purposes)
                vPos = all_indels[i][j][0]
                vPos2 = vPos + len(all_indels[i][j][1])
                #print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
                #print len(self.sequences[i]),'-->',
                if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
                    print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]), '\n'
                    exit(1)
                else:
                    self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
                    adjToAdd[i].append((all_indels[i][j][0], len(all_indels[i][j][2])-len(all_indels[i][j][1])))
                #print len(self.sequences[i])
            adjToAdd[i].sort()
            #print adjToAdd[i]

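            # adj[i][j] = cumulative indel length offset at position j of the
            # mutated sequence; subtracting it maps a mutated-sequence coordinate
            # back to its reference coordinate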
            self.adj[i] = np.zeros(len(self.sequences[i]), dtype='<i4')
            indSoFar = 0
            valSoFar = 0
            for j in xrange(len(self.adj[i])):
                if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
                    valSoFar += adjToAdd[i][indSoFar][1]
                    indSoFar += 1
                self.adj[i][j] = valSoFar

            # precompute cigar strings (we can skip this if going for only vcf output)
            if not self.onlyVCF:
                tempSymbolString = ['M']
                prevVal = self.adj[i][0]
                j = 1
                while j < len(self.adj[i]):
                    diff = self.adj[i][j] - prevVal
                    prevVal = self.adj[i][j]
                    if diff > 0:    # insertion
                        tempSymbolString.extend(['I']*abs(diff))
                        j += abs(diff)
                    elif diff < 0:  # deletion
                        tempSymbolString.append('D'*abs(diff)+'M')
                        j += 1
                    else:
                        tempSymbolString.append('M')
                        j += 1

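                # tempSymbolString holds one symbol per position of the mutated
                # sequence; a read's cigar is the readLen-long slice starting at
                # its position, collapsed by CigarString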
                for j in xrange(len(tempSymbolString)-self.readLen):
                    self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())
                    # pre-compute reference position of first matching base
                    my_fm_pos = None
                    for k in xrange(self.readLen):
                        if 'M' in tempSymbolString[j+k]:
                            my_fm_pos = j+k
                            break
                    if my_fm_pos is None:
                        self.FM_pos[i].append(None)
                        self.FM_span[i].append(None)
                    else:
                        self.FM_pos[i].append(my_fm_pos-self.adj[i][my_fm_pos])
                        span_dif = len([nnn for nnn in tempSymbolString[j:j+self.readLen] if 'M' in nnn])
                        self.FM_span[i].append(self.FM_pos[i][-1] + span_dif)

        # tally up variants implemented
        countDict = {}
        all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
        for i in xrange(len(all_variants)):
            for j in xrange(len(all_variants[i])):
                all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
                t = tuple(all_variants[i][j])
                if t not in countDict:
                    countDict[t] = []
                countDict[t].append(i)

        #
        # TODO: combine multiple variants that happened to occur at same position into single vcf entry
        #

        output_variants = []
        for k in sorted(countDict.keys()):
            output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
            ploid_string = ['0' for n in xrange(self.ploidy)]
            for k2 in [n for n in countDict[k]]:
                ploid_string[k2] = '1'
            output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
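        # each record: (pos, ref, alt, alleleFreq, 'WP=' + '/'-joined per-ploid 0/1 flags)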
        return output_variants


    def sample_read(self, sequencingModel, fragLen=None):

        # choose a ploid
        myPloid = random.randint(0, self.ploidy-1)

        # stop attempting to find a valid position if we fail enough times
        MAX_READPOS_ATTEMPTS = 100
        attempts_thus_far = 0

        # choose a random position within the ploid, and generate quality scores / sequencing errors
        readsToSample = []
        if fragLen is None:
            rPos = self.coverage_distribution[myPloid].sample()
            #####rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)  # uniform random
            ####
            ##### decide which subsection of the sequence to sample from using coverage probabilities
            ####coords_bad = True
            ####while coords_bad:
            ####    attempts_thus_far += 1
            ####    if attempts_thus_far > MAX_READPOS_ATTEMPTS:
            ####        return None
            ####    myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
            ####    coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
            ####    if coords_to_select_from[0] >= len(self.adj[myPloid]):  # prevent going beyond region boundaries
            ####        continue
            ####    coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
            ####    coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]
            ####    if max(coords_to_select_from) <= 0:  # prevent invalid negative coords due to adj
            ####        continue
            ####    if coords_to_select_from[1] - coords_to_select_from[0] <= 2:  # we don't span enough coords to sample
            ####        continue
            ####    if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
            ####        coords_bad = False
            ####rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)

            # sample read position and call function to compute quality scores / sequencing errors
            rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
            (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
            readsToSample.append([rPos, myQual, myErrors, rDat])

        else:
            rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()

            # EXPERIMENTAL
            #coords_to_select_from = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
            #rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1])

            #####rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)  # uniform random
            ####
            ##### decide which subsection of the sequence to sample from using coverage probabilities
            ####coords_bad = True
            ####while coords_bad:
            ####    attempts_thus_far += 1
            ####    if attempts_thus_far > MAX_READPOS_ATTEMPTS:
            ####        #print coords_to_select_from
            ####        return None
            ####    myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
            ####    coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
            ####    if coords_to_select_from[0] >= len(self.adj[myPloid]):  # prevent going beyond region boundaries
            ####        continue
            ####    coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
            ####    coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]  # both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
            ####    if max(coords_to_select_from) <= 0:  # prevent invalid negative coords due to adj
            ####        continue
            ####    if coords_to_select_from[1] - coords_to_select_from[0] <= 2:  # we don't span enough coords to sample
            ####        continue
            ####    rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
            ####    # for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
            ####    if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
            ####        rPos1 -= fragLen - self.readLen
            ####    if rPos1 < len(self.sequences[myPloid])-fragLen:
            ####        coords_bad = False

            rPos2 = rPos1 + fragLen - self.readLen
            rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
            rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
            #print len(rDat1), rPos1, len(self.sequences[myPloid])
            (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
            (myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2, isReverseStrand=True)
            readsToSample.append([rPos1, myQual1, myErrors1, rDat1])
            readsToSample.append([rPos2, myQual2, myErrors2, rDat2])

        # error format:
        #   myError[i] = (type, len, pos, ref, alt)
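        #   e.g. ('S', 1, 12, 'A', 'G') substitutes G at read position 12;
        #        ('D', 2, 37, 'ACG', 'A') deletes the 2 bases after position 37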

        # examine sequencing errors to-be-inserted.
        #   - remove deletions that don't have enough bordering sequence content to "fill in"
        # if error is valid, make the changes to the read data
        rOut = []
        for r in readsToSample:
            try:
                myCigar = self.allCigar[myPloid][r[0]]
            except IndexError:
                print 'Index error when attempting to find cigar string.'
                print len(self.allCigar[myPloid]), r[0]
                if fragLen is not None:
                    print (rPos1, rPos2)
                    print myPloid, fragLen, self.fraglens_indMap[fragLen]
                exit(1)
            totalD = sum([error[1] for error in r[2] if error[0] == 'D'])
            totalI = sum([error[1] for error in r[2] if error[0] == 'I'])
            availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
            # add buffer sequence to fill in positions that get deleted
            r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
            expandedCigar = []
            extraCigar = []
            adj = 0
            sse_adj = [0 for n in xrange(self.readLen + max(sequencingModel.errP[3]))]
            anyIndelErr = False

            # sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
            # secondarily, sort by index
            arrangedErrors = {'D': [], 'I': [], 'S': []}
            for error in r[2]:
                arrangedErrors[error[0]].append((error[2], error))
            sortedErrors = []
            for k in sorted(arrangedErrors.keys()):
                sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])

            skipIndels = False

            for error in sortedErrors:
                #print '-se-', r[0], error
                #print sse_adj
                eLen = error[1]
                ePos = error[2]
                if error[0] == 'D' or error[0] == 'I':
                    anyIndelErr = True
                    extraCigarVal = []
                    if totalD > availB:     # if not enough bases to fill-in deletions, skip all indel errors
                        continue
                    if expandedCigar == []:
                        expandedCigar = CigarString(stringIn=myCigar).getList()
                        fillToGo = totalD - totalI + 1
                        if fillToGo > 0:
                            try:
                                extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]
                            except IndexError:  # applying the deletions we want requires going beyond region boundaries. skip all indel errors
                                skipIndels = True

                    if skipIndels:
                        continue

                    # insert deletion error into read and update cigar string accordingly
                    if error[0] == 'D':
                        myadj = sse_adj[ePos]
                        pi = ePos+myadj
                        pf = ePos+myadj+eLen+1
                        if str(r[3][pi:pf]) == str(error[3]):
                            r[3] = r[3][:pi+1] + r[3][pf:]
                            expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
                            if pi+1 == len(expandedCigar):  # weird edge case with del at very end of region. Make a guess and add an 'M'
                                expandedCigar.append('M')
                            expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
                        else:
                            print '\nError, ref does not match alt while attempting to insert deletion error!\n'
                            exit(1)
                        adj -= eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] -= eLen

                    # insert insertion error into read and update cigar string accordingly
                    else:
                        myadj = sse_adj[ePos]
                        if chr(r[3][ePos+myadj]) == error[3]:
                            r[3] = r[3][:ePos+myadj] + error[4] + r[3][ePos+myadj+1:]
                            expandedCigar = expandedCigar[:ePos+myadj] + ['I']*eLen + expandedCigar[ePos+myadj:]
                        else:
                            print '\nError, ref does not match alt while attempting to insert insertion error!\n'
                            print '---', chr(r[3][ePos+myadj]), '!=', error[3]
                            exit(1)
                        adj += eLen
                        for i in xrange(ePos, len(sse_adj)):
                            sse_adj[i] += eLen

                else:   # substitution errors, much easier by comparison...
                    if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
                        r[3][ePos+sse_adj[ePos]] = error[4]
                    else:
                        print '\nError, ref does not match alt while attempting to insert substitution error!\n'
                        exit(1)

            if anyIndelErr:
                if len(expandedCigar):
                    relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
                    myCigar = CigarString(listIn=relevantCigar).getString()

                r[3] = r[3][:self.readLen]

            rOut.append([self.FM_pos[myPloid][r[0]], myCigar, str(r[3]), str(r[1])])

        # rOut[i] = (pos, cigar, read_string, qual_string)
        return rOut


#
# Container for read data, computes quality scores and positions to insert errors
#
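# Illustrative usage (hypothetical model path and read length):
#
#   rc = ReadContainer(101, 'models/errorModel_pe.p', None)
#   (qualString, errorList) = rc.getSequencingErrors(readData)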
class ReadContainer:
    def __init__(self, readLen, errorModel, reScaledError):

        self.readLen = readLen

        errorDat = pickle.load(open(errorModel, 'rb'))
        self.UNIFORM = False
        if len(errorDat) == 4:      # uniform-error SE reads (e.g. PacBio)
            self.UNIFORM = True
            [Qscores, offQ, avgError, errorParams] = errorDat
            self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
            print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
        if len(errorDat) == 6:      # only 1 q-score model present, use same model for both strands
            [initQ1, probQ1, Qscores, offQ, avgError, errorParams] = errorDat
            self.PE_MODELS = False
        elif len(errorDat) == 8:    # found a q-score model for both forward and reverse strands
            #print 'Using paired-read quality score profiles...'
            [initQ1, probQ1, initQ2, probQ2, Qscores, offQ, avgError, errorParams] = errorDat
            self.PE_MODELS = True
            if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
                print '\nError: R1 and R2 quality score models are of different length.\n'
                exit(1)

        self.qErrRate = [0.]*(max(Qscores)+1)
        for q in Qscores:
            self.qErrRate[q] = 10.**(-q/10.)
        self.offQ = offQ

        # errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
        self.errP = errorParams
        self.errSSE = [DiscreteDistribution(n, NUCL) for n in self.errP[0]]
        self.errSIE = DiscreteDistribution(self.errP[2], self.errP[3])
        self.errSIN = DiscreteDistribution(self.errP[5], NUCL)

        # adjust sequencing error frequency to match desired rate
        if reScaledError is None:
            self.errorScale = 1.0
        else:
            self.errorScale = reScaledError/avgError
            print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)

        if not self.UNIFORM:
            # adjust length to match desired read length
            if self.readLen == len(initQ1):
                self.qIndRemap = range(self.readLen)
            else:
                print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
                self.qIndRemap = [max([1, len(initQ1)*n/readLen]) for n in xrange(readLen)]

            # initialize probability distributions
            self.initDistByPos1 = [DiscreteDistribution(initQ1[i], Qscores) for i in xrange(len(initQ1))]
            self.probDistByPosByPrevQ1 = [None]
            for i in xrange(1, len(initQ1)):
                self.probDistByPosByPrevQ1.append([])
                for j in xrange(len(initQ1[0])):
                    if np.sum(probQ1[i][j]) <= 0.:  # if we don't have sufficient data for a transition, use the previous qscore
                        self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1], [Qscores[j]], degenerateVal=Qscores[j]))
                    else:
                        self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j], Qscores))

            if self.PE_MODELS:
                self.initDistByPos2 = [DiscreteDistribution(initQ2[i], Qscores) for i in xrange(len(initQ2))]
                self.probDistByPosByPrevQ2 = [None]
                for i in xrange(1, len(initQ2)):
                    self.probDistByPosByPrevQ2.append([])
                    for j in xrange(len(initQ2[0])):
                        if np.sum(probQ2[i][j]) <= 0.:  # if we don't have sufficient data for a transition, use the previous qscore
                            self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1], [Qscores[j]], degenerateVal=Qscores[j]))
                        else:
                            self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j], Qscores))

    def getSequencingErrors(self, readData, isReverseStrand=False):

        qOut = [0]*self.readLen
        sErr = []

        if self.UNIFORM:
            myQ = [self.uniform_qscore + self.offQ for n in xrange(self.readLen)]
            qOut = ''.join([chr(n) for n in myQ])
            for i in xrange(self.readLen):
                if random.random() < self.errorScale*self.qErrRate[self.uniform_qscore]:
                    sErr.append(i)
        else:

            if self.PE_MODELS and isReverseStrand:
                myQ = self.initDistByPos2[0].sample()
            else:
                myQ = self.initDistByPos1[0].sample()
            qOut[0] = myQ

            for i in xrange(1, self.readLen):
                if self.PE_MODELS and isReverseStrand:
                    myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
                else:
                    myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()
                qOut[i] = myQ

            if isReverseStrand:
                qOut = qOut[::-1]

            for i in xrange(self.readLen):
                if random.random() < self.errorScale * self.qErrRate[qOut[i]]:
                    sErr.append(i)

            qOut = ''.join([chr(n + self.offQ) for n in qOut])

        if self.errorScale == 0.0:
            return (qOut, [])

        sOut = []
        nDelSoFar = 0
        # don't allow indel errors to occur on subsequent positions
        prevIndel = -2
        # don't allow other sequencing errors to occur on bases removed by deletion errors
        delBlacklist = []

        for ind in sErr[::-1]:  # for each error that we're going to insert...

            # determine error type
            isSub = True
            if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and abs(ind-prevIndel) > 1:
                if random.random() < self.errP[1]:
                    isSub = False

            # errorOut = (type, len, pos, ref, alt)

            if isSub:   # insert substitution error
                myNucl = chr(readData[ind])
                newNucl = self.errSSE[NUC_IND[myNucl]].sample()
                sOut.append(('S', 1, ind, myNucl, newNucl))
            else:       # insert indel error
                indelLen = self.errSIE.sample()
                if random.random() < self.errP[4]:      # insertion error
                    myNucl = chr(readData[ind])
                    newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
                    sOut.append(('I', len(newNucl)-1, ind, myNucl, newNucl))
                elif ind < self.readLen-2-nDelSoFar:    # deletion error (prevent too many of them from stacking up)
                    myNucl = str(readData[ind:ind+indelLen+1])
                    newNucl = chr(readData[ind])
                    nDelSoFar += len(myNucl)-1
                    sOut.append(('D', len(myNucl)-1, ind, myNucl, newNucl))
                    for i in xrange(ind+1, ind+indelLen+1):
                        delBlacklist.append(i)
                prevIndel = ind

        # remove blacklisted errors
        for i in xrange(len(sOut)-1, -1, -1):
            if sOut[i][2] in delBlacklist:
                del sOut[i]

        return (qOut, sOut)


857 """************************************************
858 **** DEFAULT MUTATION MODELS
859 ************************************************"""


# parse mutation model pickle file
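# e.g. (hypothetical path): myModel = parseInputMutationModel('models/mutModel.p'),
# then pass mutationModels=[myModel]*ploidy into SequenceContainer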
def parseInputMutationModel(model=None, whichDefault=1):
    if whichDefault == 1:
        outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_1]
    elif whichDefault == 2:
        outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_2]
    else:
        print '\nError: Unknown default mutation model specified\n'
        exit(1)

    if model is not None:
        pickle_dict = pickle.load(open(model, "rb"))
        outModel[0] = pickle_dict['AVG_MUT_RATE']
        outModel[2] = 1. - pickle_dict['SNP_FREQ']

        insList = pickle_dict['INDEL_FREQ']
        if len(insList):
            insCount = sum([insList[k] for k in insList.keys() if k >= 1])
            delCount = sum([insList[k] for k in insList.keys() if k <= -1])
            insVals = [k for k in sorted(insList.keys()) if k >= 1]
            insWght = [insList[k]/float(insCount) for k in insVals]
            delVals = [k for k in sorted([abs(k) for k in insList.keys() if k <= -1])]
            delWght = [insList[-k]/float(delCount) for k in delVals]
        else:   # degenerate case where no indel stats are provided
            insCount = 1
            delCount = 1
            insVals = [1]
            insWght = [1.0]
            delVals = [1]
            delWght = [1.0]
        outModel[3] = insCount/float(insCount + delCount)
        outModel[4] = insVals
        outModel[5] = insWght
        outModel[6] = delVals
        outModel[7] = delWght

        trinuc_trans_prob = pickle_dict['TRINUC_TRANS_PROBS']
        for k in sorted(trinuc_trans_prob.keys()):
            myInd = TRI_IND[k[0][0]+k[0][2]]
            (k1, k2) = (NUC_IND[k[0][1]], NUC_IND[k[1][1]])
            outModel[8][myInd][k1][k2] = trinuc_trans_prob[k]
        for i in xrange(len(outModel[8])):
            for j in xrange(len(outModel[8][i])):
                # if trinuc not present in input mutation model, assign it uniform probability
                rowSum = float(sum(outModel[8][i][j]))
                if rowSum < 1e-12:
                    outModel[8][i][j] = [0.25, 0.25, 0.25, 0.25]
                else:
                    # normalize using the row sum computed before any division,
                    # so the resulting probabilities actually sum to 1
                    for l in xrange(len(outModel[8][i][j])):
                        outModel[8][i][j][l] /= rowSum

        trinuc_mut_prob = pickle_dict['TRINUC_MUT_PROB']
        which_have_we_seen = {n: False for n in ALL_TRI}
        trinuc_mean = np.mean(trinuc_mut_prob.values())
        for trinuc in trinuc_mut_prob.keys():
            outModel[9][ALL_IND[trinuc]] = trinuc_mut_prob[trinuc]
            which_have_we_seen[trinuc] = True
        for trinuc in which_have_we_seen.keys():
            if not which_have_we_seen[trinuc]:
                outModel[9][ALL_IND[trinuc]] = trinuc_mean

    return outModel


# parse mutation model files, returns default model if no model directory is specified
#
# OLD FUNCTION THAT PROCESSED OUTDATED TEXTFILE MUTATION MODELS
def parseInputMutationModel_deprecated(prefix=None, whichDefault=1):
    if whichDefault == 1:
        outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_1]
    elif whichDefault == 2:
        outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_2]
    else:
        print '\nError: Unknown default mutation model specified\n'
        exit(1)

    if prefix is not None:
        if prefix[-1] != '/':
            prefix += '/'
        if not os.path.isdir(prefix):
            print '\nError: Input mutation model directory not found:', prefix, '\n'
            exit(1)

        print 'Reading in mutation model...'
        listing1 = [n for n in os.listdir(prefix) if n[-5:] == '.prob']
        listing2 = [n for n in os.listdir(prefix) if n[-7:] == '.trinuc']
        listing = sorted(listing1) + sorted(listing2)
        for l in listing:
            f = open(prefix+l, 'r')
            fr = [n.split('\t') for n in f.read().split('\n')]
            f.close()

            if '_overall.prob' in l:
                myIns = None
                myDel = None
                for dat in fr[1:]:
                    if len(dat) == 2:
                        if dat[0] == 'insertion':
                            myIns = float(dat[1])
                        elif dat[0] == 'deletion':
                            myDel = float(dat[1])
                if myIns is not None and myDel is not None:
                    outModel[2] = myIns + myDel
                    outModel[3] = myIns / (myIns + myDel)
                    print '-', l

            if '_insLength.prob' in l:
                insVals = {}
                for dat in fr[1:]:
                    if len(dat) == 2:
                        insVals[int(dat[0])] = float(dat[1])
                if len(insVals):
                    outModel[4] = sorted(insVals.keys())
                    outModel[5] = [insVals[n] for n in outModel[4]]
                    print '-', l

            if '_delLength.prob' in l:
                delVals = {}
                for dat in fr[1:]:
                    if len(dat) == 2:
                        delVals[int(dat[0])] = float(dat[1])
                if len(delVals):
                    outModel[6] = sorted(delVals.keys())
                    outModel[7] = [delVals[n] for n in outModel[6]]
                    print '-', l

            if '.trinuc' == l[-7:]:
                context_ind = TRI_IND[l[-10]+l[-8]]
                p_matrix = [[-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1], [-1, -1, -1, -1]]
                for i in xrange(len(p_matrix)):
                    for j in xrange(len(fr[i])):
                        p_matrix[i][j] = float(fr[i][j])
                anyNone = False
                for i in xrange(len(p_matrix)):
                    for j in xrange(len(p_matrix[i])):
                        if p_matrix[i][j] == -1:
                            anyNone = True
                if not anyNone:
                    outModel[8][context_ind] = copy.deepcopy(p_matrix)
                    print '-', l

    return outModel

######################
#   DEFAULT VALUES   #
######################

DEFAULT_1_OVERALL_MUT_RATE = 0.001
DEFAULT_1_HOMOZYGOUS_FREQ = 0.010
DEFAULT_1_INDEL_FRACTION = 0.05
DEFAULT_1_INS_VS_DEL = 0.6
DEFAULT_1_INS_LENGTH_VALUES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
DEFAULT_1_INS_LENGTH_WEIGHTS = [0.4, 0.2, 0.1, 0.05, 0.05, 0.05, 0.05, 0.034, 0.033, 0.033]
DEFAULT_1_DEL_LENGTH_VALUES = [1, 2, 3, 4, 5]
DEFAULT_1_DEL_LENGTH_WEIGHTS = [0.3, 0.2, 0.2, 0.2, 0.1]
example_matrix_1 = [[0.0, 0.15, 0.7, 0.15],
                    [0.15, 0.0, 0.15, 0.7],
                    [0.7, 0.15, 0.0, 0.15],
                    [0.15, 0.7, 0.15, 0.0]]
DEFAULT_1_TRI_FREQS = [copy.deepcopy(example_matrix_1) for n in xrange(16)]
DEFAULT_1_TRINUC_BIAS = [1./float(len(ALL_TRI)) for n in ALL_TRI]
DEFAULT_MODEL_1 = [DEFAULT_1_OVERALL_MUT_RATE,
                   DEFAULT_1_HOMOZYGOUS_FREQ,
                   DEFAULT_1_INDEL_FRACTION,
                   DEFAULT_1_INS_VS_DEL,
                   DEFAULT_1_INS_LENGTH_VALUES,
                   DEFAULT_1_INS_LENGTH_WEIGHTS,
                   DEFAULT_1_DEL_LENGTH_VALUES,
                   DEFAULT_1_DEL_LENGTH_WEIGHTS,
                   DEFAULT_1_TRI_FREQS,
                   DEFAULT_1_TRINUC_BIAS]
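# (list indices 0-9 correspond to the n[0]..n[9] fields consumed by
#  SequenceContainer.init_mutModels above)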

DEFAULT_2_OVERALL_MUT_RATE = 0.002
DEFAULT_2_HOMOZYGOUS_FREQ = 0.200
DEFAULT_2_INDEL_FRACTION = 0.1
DEFAULT_2_INS_VS_DEL = 0.3
DEFAULT_2_INS_LENGTH_VALUES = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
DEFAULT_2_INS_LENGTH_WEIGHTS = [0.1, 0.1, 0.2, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
DEFAULT_2_DEL_LENGTH_VALUES = [1, 2, 3, 4, 5]
DEFAULT_2_DEL_LENGTH_WEIGHTS = [0.3, 0.2, 0.2, 0.2, 0.1]
example_matrix_2 = [[0.0, 0.15, 0.7, 0.15],
                    [0.15, 0.0, 0.15, 0.7],
                    [0.7, 0.15, 0.0, 0.15],
                    [0.15, 0.7, 0.15, 0.0]]
DEFAULT_2_TRI_FREQS = [copy.deepcopy(example_matrix_2) for n in xrange(16)]
DEFAULT_2_TRINUC_BIAS = [1./float(len(ALL_TRI)) for n in ALL_TRI]
DEFAULT_MODEL_2 = [DEFAULT_2_OVERALL_MUT_RATE,
                   DEFAULT_2_HOMOZYGOUS_FREQ,
                   DEFAULT_2_INDEL_FRACTION,
                   DEFAULT_2_INS_VS_DEL,
                   DEFAULT_2_INS_LENGTH_VALUES,
                   DEFAULT_2_INS_LENGTH_WEIGHTS,
                   DEFAULT_2_DEL_LENGTH_VALUES,
                   DEFAULT_2_DEL_LENGTH_WEIGHTS,
                   DEFAULT_2_TRI_FREQS,
                   DEFAULT_2_TRINUC_BIAS]