diff py/SequenceContainer.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/py/SequenceContainer.py	Tue May 15 02:39:53 2018 -0400
@@ -0,0 +1,1058 @@
+import random
+import copy
+import re
+import os
+import bisect
+import cPickle as pickle
+import numpy as np
+
+from probability import DiscreteDistribution, poisson_list, quantize_list
+from cigar import CigarString
+
+MAX_ATTEMPTS = 100	# max attempts to insert a mutation into a valid position
+MAX_MUTFRAC  = 0.3	# the maximum fraction of a window that can contain mutations
+
+NUCL    = ['A','C','G','T']
+TRI_IND = {'AA':0,  'AC':1,  'AG':2,   'AT':3,  'CA':4,  'CC':5,  'CG':6,  'CT':7,
+           'GA':8,  'GC':9,  'GG':10,  'GT':11, 'TA':12, 'TC':13, 'TG':14, 'TT':15}
+NUC_IND = {'A':0, 'C':1, 'G':2, 'T':3}
+ALL_TRI = [NUCL[i]+NUCL[j]+NUCL[k] for i in xrange(len(NUCL)) for j in xrange(len(NUCL)) for k in xrange(len(NUCL))]
+ALL_IND = {ALL_TRI[i]:i for i in xrange(len(ALL_TRI))}
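+# illustrative indexing examples (derived from the tables above):
+#   TRI_IND['AC']  == 1    (flanking-base context A_C)
+#   NUC_IND['G']   == 2
+#   ALL_IND['ACG'] == 6    (16*NUC_IND['A'] + 4*NUC_IND['C'] + NUC_IND['G'])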
+
+# DEBUG
+IGNORE_TRINUC = False
+
+# percentile resolution used for fraglen quantizing
+COV_FRAGLEN_PERCENTILE = 10.
+LARGE_NUMBER = 9999999999
+
+#
+#	Container for reference sequences, applies mutations
+#
+class SequenceContainer:
+	def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None, onlyVCF=False):
+		# initialize basic variables
+		self.onlyVCF = onlyVCF
+		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
+		# initialize mutation models
+		self.init_mutModels(mutationModels, mutRate)
+		# sample the number of variants that will be inserted into each ploid
+		self.init_poisson()
+		self.indelsToAdd = [n.sample() for n in self.ind_pois]
+		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
+		# initialize trinuc snp bias
+		self.init_trinucBias()
+
+	def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen):
+		self.x         = xOffset
+		self.ploidy    = ploidy
+		self.readLen   = readLen
+		self.sequences = [bytearray(sequence) for n in xrange(self.ploidy)]
+		self.seqLen    = len(sequence)
+		self.indelList = [[] for n in xrange(self.ploidy)]
+		self.snpList   = [[] for n in xrange(self.ploidy)]
+		self.allCigar  = [[] for n in xrange(self.ploidy)]
+		self.FM_pos    = [[] for n in xrange(self.ploidy)]
+		self.FM_span   = [[] for n in xrange(self.ploidy)]
+		self.adj       = [None for n in xrange(self.ploidy)]
+		# blackList[ploid][pos] = 0		safe to insert variant here
+		# blackList[ploid][pos] = 1		indel inserted here
+		# blackList[ploid][pos] = 2		snp inserted here
+		# blackList[ploid][pos] = 3		invalid position for various processing reasons
+		self.blackList = [np.zeros(self.seqLen,dtype='<i4') for n in xrange(self.ploidy)]
+
+		# disallow mutations to occur on window overlap points
+		self.winBuffer = windowOverlap
+		for p in xrange(self.ploidy):
+			self.blackList[p][-self.winBuffer]   = 3
+			self.blackList[p][-self.winBuffer-1] = 3
+
+	def init_coverage(self,coverageDat,fragDist=None):
+		# if we're only creating a vcf, skip some expensive initialization related to coverage depth
+		if not self.onlyVCF:
+			(self.windowSize, gc_scalars, targetCov_vals) = coverageDat
+			gcCov_vals = [[] for n in self.sequences]
+			trCov_vals = [[] for n in self.sequences]
+			self.coverage_distribution = []
+			avg_out = []
+			for i in xrange(len(self.sequences)):
+				# compute gc-bias
+				j = 0
+				while j+self.windowSize < len(self.sequences[i]):
+					gc_c = self.sequences[i][j:j+self.windowSize].count('G') + self.sequences[i][j:j+self.windowSize].count('C')
+					gcCov_vals[i].extend([gc_scalars[gc_c]]*self.windowSize)
+					j += self.windowSize
+				gc_c = self.sequences[i][-self.windowSize:].count('G') + self.sequences[i][-self.windowSize:].count('C')
+				gcCov_vals[i].extend([gc_scalars[gc_c]]*(len(self.sequences[i])-len(gcCov_vals[i])))
+				#
+				trCov_vals[i].append(targetCov_vals[0])
+				prevVal = self.FM_pos[i][0]
+				for j in xrange(1,len(self.sequences[i])-self.readLen):
+					if self.FM_pos[i][j] == None:
+						trCov_vals[i].append(targetCov_vals[prevVal])
+					else:
+						trCov_vals[i].append(sum(targetCov_vals[self.FM_pos[i][j]:self.FM_span[i][j]])/float(self.FM_span[i][j]-self.FM_pos[i][j]))
+						prevVal = self.FM_pos[i][j]
+					#print (i,j), self.adj[i][j], self.allCigar[i][j], self.FM_pos[i][j], self.FM_span[i][j]
+				# shift by half of read length
+				trCov_vals[i] = [0.0]*int(self.readLen/2) + trCov_vals[i][:-int(self.readLen/2.)]
+				# fill in missing indices
+				trCov_vals[i].extend([0.0]*(len(self.sequences[i])-len(trCov_vals[i])))
+
+				#
+				covvec = np.cumsum([trCov_vals[i][nnn]*gcCov_vals[i][nnn] for nnn in xrange(len(trCov_vals[i]))])
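+				# note: covvec[j+readLen]-covvec[j] below is the windowed sum of the
+				# readLen per-base weights w[j+1..j+readLen], computed in O(1) per position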
+				coverage_vals = []
+				for j in xrange(0,len(self.sequences[i])-self.readLen):
+					coverage_vals.append(covvec[j+self.readLen] - covvec[j])
+				avg_out.append(np.mean(coverage_vals)/float(self.readLen))
+
+				if fragDist == None:
+					self.coverage_distribution.append(DiscreteDistribution(coverage_vals,range(len(coverage_vals))))
+			
+				# fragment length nightmare
+				else:
+					currentThresh = 0.
+					index_list    = [0]
+					for j in xrange(len(fragDist.cumP)):
+						if fragDist.cumP[j] >= currentThresh + COV_FRAGLEN_PERCENTILE/100.0:
+							currentThresh = fragDist.cumP[j]
+							index_list.append(j)
+					flq = [fragDist.values[nnn] for nnn in index_list]
+					if fragDist.values[-1] not in flq:
+						flq.append(fragDist.values[-1])
+					flq.append(LARGE_NUMBER)
+
+					self.fraglens_indMap = {}
+					for j in fragDist.values:
+						bInd = bisect.bisect(flq,j)
+						if abs(flq[bInd-1] - j) <= abs(flq[bInd] - j):
+							self.fraglens_indMap[j] = flq[bInd-1]
+						else:
+							self.fraglens_indMap[j] = flq[bInd]
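+					# e.g. with COV_FRAGLEN_PERCENTILE = 10 this keeps roughly one
+					# representative fragment length per cumulative-probability decile,
+					# and fraglens_indMap snaps each observed length to its nearest
+					# representative (illustrative note)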
+
+					self.coverage_distribution.append({})
+					for flv in sorted(list(set(self.fraglens_indMap.values()))):
+						buffer_val = self.readLen
+						for j in fragDist.values:
+							if self.fraglens_indMap[j] == flv and j > buffer_val:
+								buffer_val = j
+						coverage_vals = []
+						for j in xrange(len(self.sequences[i])-buffer_val):
+							coverage_vals.append(covvec[j+self.readLen] - covvec[j] + covvec[j+flv] - covvec[j+flv-self.readLen])
+
+						# EXPERIMENTAL
+						#quantized_covVals = quantize_list(coverage_vals)
+						#self.coverage_distribution[i][flv] = DiscreteDistribution([n[2] for n in quantized_covVals],[(n[0],n[1]) for n in quantized_covVals])
+
+						# TESTING
+						#import matplotlib.pyplot as mpl
+						#print len(coverage_vals),'-->',len(quantized_covVals)
+						#mpl.figure(0)
+						#mpl.plot(range(len(coverage_vals)),coverage_vals)
+						#for qcv in quantized_covVals:
+						#	mpl.plot([qcv[0],qcv[1]+1],[qcv[2],qcv[2]],'r')
+						#mpl.show()
+						#exit(1)
+
+						self.coverage_distribution[i][flv] = DiscreteDistribution(coverage_vals,range(len(coverage_vals)))
+
+			return np.mean(avg_out)
+
+	def init_mutModels(self,mutationModels,mutRate):
+		if mutationModels == []:
+			ml = [copy.deepcopy(DEFAULT_MODEL_1) for n in xrange(self.ploidy)]
+			self.modelData = ml[:self.ploidy]
+		else:
+			if len(mutationModels) != self.ploidy:
+				print '\nError: Number of mutation models received is not equal to specified ploidy\n'
+				exit(1)
+			self.modelData = copy.deepcopy(mutationModels)
+
+		# do we need to rescale mutation frequencies?
+		mutRateSum = sum([n[0] for n in self.modelData])
+		self.mutRescale = mutRate
+		if self.mutRescale == None:
+			self.mutScalar = 1.0
+		else:
+			self.mutScalar = float(self.mutRescale)/(mutRateSum/float(len(self.modelData)))
+
+		# how are mutations spread to each ploid, based on their specified mut rates?
+		self.ploidMutFrac  = [float(n[0])/mutRateSum for n in self.modelData]
+		self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac,range(self.ploidy))
+
+		# init mutation models
+		#
+		# self.models[ploid][0] = average mutation rate
+		# self.models[ploid][1] = p(mut is homozygous | mutation occurs)
+		# self.models[ploid][2] = p(mut is indel | mut occurs)
+		# self.models[ploid][3] = p(insertion | indel occurs)
+		# self.models[ploid][4] = distribution of insertion lengths
+		# self.models[ploid][5] = distribution of deletion lengths
+		# self.models[ploid][6] = distribution of trinucleotide SNP transitions
+		# self.models[ploid][7] = p(trinuc mutates)
+		self.models = []
+		for n in self.modelData:
+			self.models.append([self.mutScalar*n[0],n[1],n[2],n[3],DiscreteDistribution(n[5],n[4]),DiscreteDistribution(n[7],n[6]),[]])
+			for m in n[8]:
+				self.models[-1][6].append([DiscreteDistribution(m[0],NUCL),
+					                       DiscreteDistribution(m[1],NUCL),
+					                       DiscreteDistribution(m[2],NUCL),
+					                       DiscreteDistribution(m[3],NUCL)])
+			self.models[-1].append([m for m in n[9]])
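+		# e.g. self.models[p][6][TRI_IND['AC']][NUC_IND['G']].sample() draws the alt
+		# allele for a G observed in an A_C context; this is how random_mutations()
+		# consumes the substitution matrices (illustrative note)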
+
+	def init_poisson(self):
+		ind_l_list = [self.seqLen*self.models[i][0]*self.models[i][2]*self.ploidMutFrac[i] for i in xrange(len(self.models))]
+		snp_l_list = [self.seqLen*self.models[i][0]*(1.-self.models[i][2])*self.ploidMutFrac[i] for i in xrange(len(self.models))]
+		k_range    = range(int(self.seqLen*MAX_MUTFRAC))
+		self.ind_pois = [poisson_list(k_range,ind_l_list[n]) for n in xrange(len(self.models))]
+		self.snp_pois = [poisson_list(k_range,snp_l_list[n]) for n in xrange(len(self.models))]
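+		# worked example (illustrative): a haploid 100kb window under DEFAULT_MODEL_1
+		# (mu=0.001, indel fraction 0.05, ploidMutFrac=[1.0]) gives
+		#   lambda_indel = 100000 * 0.001 * 0.05 = 5
+		#   lambda_snp   = 100000 * 0.001 * 0.95 = 95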
+
+	def init_trinucBias(self):
+		# compute mutation positional bias given trinucleotide strings of the sequence (ONLY AFFECTS SNPs)
+		#
+		# note: since indels are added before snps, it's possible these positional biases aren't correctly utilized
+		#       at positions affected by indels. At the moment I'm going to consider this negligible.
+		trinuc_snp_bias  = [[0. for n in xrange(self.seqLen)] for m in xrange(self.ploidy)]
+		self.trinuc_bias = [None for n in xrange(self.ploidy)]
+		for p in xrange(self.ploidy):
+			for i in xrange(self.winBuffer+1,self.seqLen-1):
+				trinuc_snp_bias[p][i] = self.models[p][7][ALL_IND[str(self.sequences[p][i-1:i+2])]]
+			self.trinuc_bias[p] = DiscreteDistribution(trinuc_snp_bias[p][self.winBuffer+1:self.seqLen-1],range(self.winBuffer+1,self.seqLen-1))
+
+	def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=[], mutRate=None):
+		# if mutation model is changed, we have to reinitialize it...
+		if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
+			self.ploidy = ploidy
+			self.mutRescale = mutRate
+			self.init_mutModels(mutationModels, mutRate)
+		# if sequence length is different than previous window, we have to redo snp/indel poissons
+		if len(sequence) != self.seqLen:
+			self.seqLen = len(sequence)
+			self.init_poisson()
+		# basic vars
+		self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
+		self.indelsToAdd = [n.sample() for n in self.ind_pois]
+		self.snpsToAdd   = [n.sample() for n in self.snp_pois]
+		#print (self.indelsToAdd,self.snpsToAdd)
+		# initialize trinuc snp bias
+		if not IGNORE_TRINUC:
+			self.init_trinucBias()
+
+	def insert_mutations(self, inputList):
+		#
+	#	TODO: user-input variants; determine which ploid to put each one on, etc.
+		#
+		for inpV in inputList:
+			whichPloid = []
+			wps = inpV[4][0]
+			if wps == None:	# if no genotype given, assume heterozygous and choose a single ploid based on their mut rates
+				whichPloid.append(self.ploidMutPrior.sample())
+				whichAlt = [0]
+			else:
+				#if 'WP=' in wps:
+				#	whichPloid = [int(n) for n in inpV[-1][3:].split(',') if n == '1']
+				#	print 'WHICH:', whichPloid
+				#	whichAlt   = [0]*len(whichPloid)
+				#elif '/' in wps or '|' in wps:
+				if '/' in wps or '|' in wps:
+					if '/' in wps:
+						splt = wps.split('/')
+					else:
+						splt = wps.split('|')
+					whichPloid = []
+					whichAlt   = []
+					for i in xrange(len(splt)):
+						if splt[i] == '1':
+							whichPloid.append(i)
+						#whichAlt.append(int(splt[i])-1)
+					# assume we're just using first alt for inserted variants?
+					whichAlt = [0 for n in whichPloid]
+				else:	# otherwise assume monoploidy
+					whichPloid = [0]
+					whichAlt   = [0]
+
+			# ignore invalid ploids
+			for i in xrange(len(whichPloid)-1,-1,-1):
+				if whichPloid[i] >= self.ploidy:
+					del whichPloid[i]
+						
+			for i in xrange(len(whichPloid)):
+				p = whichPloid[i]
+				myAlt = inpV[2][whichAlt[i]]
+				myVar = (inpV[0]-self.x,inpV[1],myAlt)
+				inLen = max([len(inpV[1]),len(myAlt)])
+				#print myVar, chr(self.sequences[p][myVar[0]])
+				if myVar[0] < 0 or myVar[0] >= len(self.blackList[p]):
+					print '\nError: Attempting to insert variant out of window bounds:'
+					print myVar, '--> blackList[0:'+str(len(self.blackList[p]))+']\n'
+					exit(1)
+				if len(inpV[1]) == 1 and len(myAlt) == 1:
+					if self.blackList[p][myVar[0]]:
+						continue
+					self.snpList[p].append(myVar)
+					self.blackList[p][myVar[0]] = 2
+				else:
+					# skip this variant entirely if any affected position is already occupied
+					anyBlack = False
+					for k in xrange(myVar[0],myVar[0]+inLen+1):
+						if self.blackList[p][k]:
+							anyBlack = True
+							break
+					if anyBlack:
+						continue
+					for k in xrange(myVar[0],myVar[0]+inLen+1):
+						self.blackList[p][k] = 1
+					self.indelList[p].append(myVar)
+
+	def random_mutations(self):
+		
+		#	add random indels
+		all_indels  = [[] for n in self.sequences]
+		for i in xrange(self.ploidy):
+			for j in xrange(self.indelsToAdd[i]):
+				if random.random() <= self.models[i][1]:	# insert homozygous indel
+					whichPloid = range(self.ploidy)
+				else:								# insert heterozygous indel
+					whichPloid = [self.ploidMutPrior.sample()]
+
+				# try to find suitable places to insert indels
+				eventPos = -1
+				for attempt in xrange(MAX_ATTEMPTS):
+					eventPos = random.randint(self.winBuffer,self.seqLen-1)
+					for p in whichPloid:
+						if self.blackList[p][eventPos]:
+							eventPos = -1
+					if eventPos != -1:
+						break
+				if eventPos == -1:
+					continue
+
+				if random.random() <= self.models[i][3]:	# insertion
+					inLen   = self.models[i][4].sample()
+					# sequence content of random insertions is uniformly random (change this later)
+					inSeq   = ''.join([random.choice(NUCL) for n in xrange(inLen)])
+					refNucl = chr(self.sequences[i][eventPos])
+					myIndel = (eventPos,refNucl,refNucl+inSeq)
+				else:										# deletion
+					inLen   = self.models[i][5].sample()
+					if eventPos+inLen+1 >= len(self.sequences[i]):	# skip if deletion too close to boundary
+						continue
+					if inLen == 1:
+						inSeq = chr(self.sequences[i][eventPos+1])
+					else:
+						inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1])
+					refNucl = chr(self.sequences[i][eventPos])
+					myIndel = (eventPos,refNucl+inSeq,refNucl)
+
+				# if event too close to boundary, skip. if event conflicts with other indel, skip.
+				skipEvent = False
+				if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1:
+					skipEvent = True
+				if skipEvent:
+					continue
+				for p in whichPloid:
+					for k in xrange(eventPos,eventPos+inLen+1):
+						if self.blackList[p][k]:
+							skipEvent = True
+				if skipEvent:
+					continue
+
+				for p in whichPloid:
+					for k in xrange(eventPos,eventPos+inLen+1):
+						self.blackList[p][k] = 1
+					all_indels[p].append(myIndel)
+
+		for i in xrange(len(all_indels)):
+			all_indels[i].extend(self.indelList[i])
+		all_indels = [sorted(n,reverse=True) for n in all_indels]
+		#print all_indels
+
+		#	add random snps
+		all_snps  = [[] for n in self.sequences]
+		for i in xrange(self.ploidy):
+			for j in xrange(self.snpsToAdd[i]):
+				if random.random() <= self.models[i][1]:	# insert homozygous SNP
+					whichPloid = range(self.ploidy)
+				else:								# insert heterozygous SNP
+					whichPloid = [self.ploidMutPrior.sample()]
+
+				# try to find suitable places to insert snps
+				eventPos = -1
+				for attempt in xrange(MAX_ATTEMPTS):
+					# based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias
+					# (if there are multiple ploids, choose one at random)
+					if IGNORE_TRINUC:
+						eventPos = random.randint(self.winBuffer+1,self.seqLen-2)
+					else:
+						ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)]
+						eventPos     = self.trinuc_bias[ploid_to_use].sample()
+					for p in whichPloid:
+						if self.blackList[p][eventPos]:
+							eventPos = -1
+					if eventPos != -1:
+						break
+				if eventPos == -1:
+					continue
+
+				refNucl = chr(self.sequences[i][eventPos])
+				context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1]))
+				# sample from tri-nucleotide substitution matrices to get SNP alt allele
+				newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample()
+				mySNP   = (eventPos,refNucl,newNucl)
+
+				for p in whichPloid:
+					all_snps[p].append(mySNP)
+					self.blackList[p][mySNP[0]] = 2
+
+		# combine random snps with inserted snps, remove any snps that overlap indels
+		for p in xrange(len(all_snps)):
+			all_snps[p].extend(self.snpList[p])
+			all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1]
+
+		# modify reference sequences
+		for i in xrange(len(all_snps)):
+			for j in xrange(len(all_snps[i])):
+				# sanity checking (for debugging purposes)
+				vPos = all_snps[i][j][0]
+				if all_snps[i][j][1] != chr(self.sequences[i][vPos]):
+					print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n'
+					exit(1)
+				else:
+					self.sequences[i][vPos] = all_snps[i][j][2]
+
+		adjToAdd = [[] for n in xrange(self.ploidy)]
+		for i in xrange(len(all_indels)):
+			for j in xrange(len(all_indels[i])):
+				# sanity checking (for debugging purposes)
+				vPos  = all_indels[i][j][0]
+				vPos2 = vPos + len(all_indels[i][j][1])
+				#print all_indels[i][j], str(self.sequences[i][vPos:vPos2])
+				#print len(self.sequences[i]),'-->',
+				if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]):
+					print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n'
+					exit(1)
+				else:
+					self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:]
+					adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1])))
+				#print len(self.sequences[i])
+			adjToAdd[i].sort()
+			#print adjToAdd[i]
+
+			self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4')
+			indSoFar = 0
+			valSoFar = 0
+			for j in xrange(len(self.adj[i])):
+				if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1:
+					valSoFar += adjToAdd[i][indSoFar][1]
+					indSoFar += 1
+				self.adj[i][j] = valSoFar
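+			# e.g. a single 2bp insertion recorded at (pos=100, +2) yields adj[j] == 0
+			# for j <= 100 and adj[j] == 2 for j > 100; adj[j] is subtracted from a
+			# mutated-sequence index to recover the reference index (illustrative)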
+
+			# precompute cigar strings (we can skip this if we're only producing vcf output)
+			if not self.onlyVCF:
+				tempSymbolString = ['M']
+				prevVal = self.adj[i][0]
+				j = 1
+				while j < len(self.adj[i]):
+					diff = self.adj[i][j] - prevVal
+					prevVal = self.adj[i][j]
+					if diff > 0:	# insertion
+						tempSymbolString.extend(['I']*abs(diff))
+						j += abs(diff)
+					elif diff < 0:	# deletion
+						tempSymbolString.append('D'*abs(diff)+'M')
+						j += 1
+					else:
+						tempSymbolString.append('M')
+						j += 1
+
+				for j in xrange(len(tempSymbolString)-self.readLen):
+					self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString())
+					# pre-compute reference position of first matching base
+					my_fm_pos = None
+					for k in xrange(self.readLen):
+						if 'M' in tempSymbolString[j+k]:
+							my_fm_pos = j+k
+							break
+					if my_fm_pos == None:
+						self.FM_pos[i].append(None)
+						self.FM_span[i].append(None)
+					else:
+						self.FM_pos[i].append(my_fm_pos-self.adj[i][my_fm_pos])
+						span_dif = len([nnn for nnn in tempSymbolString[j:j+self.readLen] if 'M' in nnn])
+						self.FM_span[i].append(self.FM_pos[i][-1] + span_dif)
+
+		# tally up variants implemented
+		countDict = {}
+		all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)]
+		for i in xrange(len(all_variants)):
+			for j in xrange(len(all_variants[i])):
+				all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:]
+				t = tuple(all_variants[i][j])
+				if t not in countDict:
+					countDict[t] = []
+				countDict[t].append(i)
+
+		#
+		#	TODO: combine multiple variants that happened to occur at same position into single vcf entry
+		#
+
+		output_variants = []
+		for k in sorted(countDict.keys()):
+			output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)]))
+			ploid_string = ['0' for n in xrange(self.ploidy)]
+			for k2 in [n for n in countDict[k]]:
+				ploid_string[k2] = '1'
+			output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)])
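+		# each entry is (pos, ref, alt, ploid_fraction, 'WP=g0/g1/...'), e.g.
+		# (1024, 'A', 'G', 0.5, 'WP=1/0') for a het variant on ploid 0 (illustrative)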
+		return output_variants
+
+
+	def sample_read(self, sequencingModel, fragLen=None):
+		
+		# choose a ploid
+		myPloid = random.randint(0,self.ploidy-1)
+
+		# stop attempting to find a valid position if we fail enough times
+		MAX_READPOS_ATTEMPTS = 100
+		attempts_thus_far    = 0
+
+		# choose a random position within the ploid, and generate quality scores / sequencing errors
+		readsToSample = []
+		if fragLen == None:
+			rPos = self.coverage_distribution[myPloid].sample()
+			#####rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1)	# uniform random
+			####
+			##### decide which subsection of the sequence to sample from using coverage probabilities
+			####coords_bad = True
+			####while coords_bad:
+			####	attempts_thus_far += 1
+			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
+			####		return None
+			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
+			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
+			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
+			####		continue
+			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
+			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]
+			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
+			####		continue
+			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
+			####		continue
+			####	if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen:
+			####		coords_bad = False
+			####rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
+
+			# sample read position and call function to compute quality scores / sequencing errors
+			rDat = self.sequences[myPloid][rPos:rPos+self.readLen]
+			(myQual, myErrors) = sequencingModel.getSequencingErrors(rDat)
+			readsToSample.append([rPos,myQual,myErrors,rDat])
+
+		else:
+			rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
+			
+			# EXPERIMENTAL
+			#coords_to_select_from = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample()
+			#rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1])
+
+			#####rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1)		# uniform random
+			####
+			##### decide which subsection of the sequence to sample from using coverage probabilities
+			####coords_bad = True
+			####while coords_bad:
+			####	attempts_thus_far += 1
+			####	if attempts_thus_far > MAX_READPOS_ATTEMPTS:
+			####		#print coords_to_select_from
+			####		return None
+			####	myBucket = max([self.which_bucket.sample() - self.win_per_read, 0])
+			####	coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize]
+			####	if coords_to_select_from[0] >= len(self.adj[myPloid]):	# prevent going beyond region boundaries
+			####		continue
+			####	coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]]
+			####	coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]]	# both ends use index of starting position to avoid issues with reads spanning breakpoints of large events
+			####	if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj
+			####		continue
+			####	if coords_to_select_from[1] - coords_to_select_from[0] <= 2:	# we don't span enough coords to sample
+			####		continue
+			####	rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1)
+			####	# for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read
+			####	if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen:
+			####		rPos1 -= fragLen - self.readLen
+			####	if rPos1 < len(self.sequences[myPloid])-fragLen:
+			####		coords_bad = False
+
+			rPos2 = rPos1 + fragLen - self.readLen
+			rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen]
+			rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen]
+			#print len(rDat1), rPos1, len(self.sequences[myPloid])
+			(myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1)
+			(myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True)
+			readsToSample.append([rPos1,myQual1,myErrors1,rDat1])
+			readsToSample.append([rPos2,myQual2,myErrors2,rDat2])
+
+		# error format:
+		# myError[i] = (type, len, pos, ref, alt)
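+		# e.g. ('S',1,37,'A','T') substitutes T for the A at read position 37, and
+		# ('D',2,50,'ACG','A') deletes the 2 bases after position 50 (illustrative)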
+
+		# examine sequencing errors to-be-inserted.
+		#	- remove deletions that don't have enough bordering sequence content to "fill in"
+		# if error is valid, make the changes to the read data
+		rOut = []
+		for r in readsToSample:
+			try:
+				myCigar = self.allCigar[myPloid][r[0]]
+			except IndexError:
+				print 'Index error when attempting to find cigar string.'
+				print len(self.allCigar[myPloid]), r[0]
+				if fragLen != None:
+					print (rPos1, rPos2)
+					print myPloid, fragLen, self.fraglens_indMap[fragLen]
+				else:
+					print myPloid, fragLen
+				exit(1)
+			totalD  = sum([error[1] for error in r[2] if error[0] == 'D'])
+			totalI  = sum([error[1] for error in r[2] if error[0] == 'I'])
+			availB  = len(self.sequences[myPloid]) - r[0] - self.readLen - 1
+			# add buffer sequence to fill in positions that get deleted
+			r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD]
+			expandedCigar = []
+			extraCigar    = []
+			adj           = 0
+			sse_adj       = [0 for n in xrange(self.readLen + max(sequencingModel.errP[3]))]
+			anyIndelErr   = False
+
+			# sort by letter (D > I > S) such that we introduce all indel errors before substitution errors
+			# secondarily, sort by index
+			arrangedErrors = {'D':[],'I':[],'S':[]}
+			for error in r[2]:
+				arrangedErrors[error[0]].append((error[2],error))
+			sortedErrors = []
+			for k in sorted(arrangedErrors.keys()):
+				sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])])
+
+			skipIndels = False
+
+			for error in sortedErrors:
+				#print '-se-',r[0], error
+				#print sse_adj
+				eLen = error[1]
+				ePos = error[2]
+				if error[0] == 'D' or error[0] == 'I':
+					anyIndelErr   = True
+					extraCigarVal = []
+					if totalD > availB:	# if not enough bases to fill-in deletions, skip all indel errors
+						continue
+					if expandedCigar == []:
+						expandedCigar = CigarString(stringIn=myCigar).getList()
+						fillToGo = totalD - totalI + 1
+						if fillToGo > 0:
+							try:
+								extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:]
+							except IndexError:	# applying the deletions we want requires going beyond region boundaries. skip all indel errors
+								skipIndels = True
+
+					if skipIndels:
+						continue
+
+					# insert deletion error into read and update cigar string accordingly
+					if error[0] == 'D':
+						myadj = sse_adj[ePos]
+						pi = ePos+myadj
+						pf = ePos+myadj+eLen+1
+						if str(r[3][pi:pf]) == str(error[3]):
+							r[3] = r[3][:pi+1] + r[3][pf:]
+							expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:]
+							if pi+1 == len(expandedCigar):	# weird edge case with del at very end of region. Make a guess and add an "M"
+								expandedCigar.append('M')
+							expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1]
+						else:
+							print '\nError, ref does not match alt while attempting to insert deletion error!\n'
+							exit(1)
+						adj -= eLen
+						for i in xrange(ePos,len(sse_adj)):
+							sse_adj[i] -= eLen
+
+					# insert insertion error into read and update cigar string accordingly
+					else:
+						myadj = sse_adj[ePos]
+						if chr(r[3][ePos+myadj]) == error[3]:
+							r[3] = r[3][:ePos+myadj] + error[4] + r[3][ePos+myadj+1:]
+							expandedCigar = expandedCigar[:ePos+myadj] + ['I']*eLen + expandedCigar[ePos+myadj:]
+						else:
+							print '\nError, ref does not match alt while attempting to insert insertion error!\n'
+							print '---',chr(r[3][ePos+myadj]), '!=', error[3]
+							exit(1)
+						adj += eLen
+						for i in xrange(ePos,len(sse_adj)):
+							sse_adj[i] += eLen
+
+				else:	# substitution errors, much easier by comparison...
+					if chr(r[3][ePos+sse_adj[ePos]]) == error[3]:
+						r[3][ePos+sse_adj[ePos]] = error[4]
+					else:
+						print '\nError, ref does not match alt while attempting to insert substitution error!\n'
+						exit(1)
+
+			if anyIndelErr:
+				if len(expandedCigar):
+					relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen]
+					myCigar = CigarString(listIn=relevantCigar).getString()
+
+				r[3] = r[3][:self.readLen]
+
+			rOut.append([self.FM_pos[myPloid][r[0]],myCigar,str(r[3]),str(r[1])])
+
+		# rOut[i] = (pos, cigar, read_string, qual_string)
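+		# e.g. [9831, '100M', 'ACGT...', 'HHHH...'] for an error-free 100bp read
+		# (values illustrative)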
+		return rOut
+
+
+#
+#	Container for read data, computes quality scores and positions to insert errors
+#
+class ReadContainer:
+	def __init__(self, readLen, errorModel, reScaledError):
+
+		self.readLen = readLen
+
+		errorDat = pickle.load(open(errorModel,'rb'))
+		self.UNIFORM = False
+		if len(errorDat) == 4:		# uniform-error SE reads (e.g. PacBio)
+			self.UNIFORM = True
+			[Qscores,offQ,avgError,errorParams] = errorDat
+			self.uniform_qscore = int(-10.*np.log10(avgError)+0.5)
+			print 'Using uniform sequencing error model. (q='+str(self.uniform_qscore)+'+'+str(offQ)+', p(err)={0:0.2f}%)'.format(100.*avgError)
+		if len(errorDat) == 6:		# only 1 q-score model present, use same model for both strands
+			[initQ1,probQ1,Qscores,offQ,avgError,errorParams] = errorDat
+			self.PE_MODELS = False
+		elif len(errorDat) == 8:	# found a q-score model for both forward and reverse strands
+			#print 'Using paired-read quality score profiles...'
+			[initQ1,probQ1,initQ2,probQ2,Qscores,offQ,avgError,errorParams] = errorDat
+			self.PE_MODELS = True
+			if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
+				print '\nError: R1 and R2 quality score models are of different length.\n'
+				exit(1)
+
+
+		self.qErrRate = [0.]*(max(Qscores)+1)
+		for q in Qscores:
+			self.qErrRate[q] = 10.**(-q/10.)
+		self.offQ = offQ
+
+		# errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
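+		# e.g. self.errSSE[NUC_IND['A']].sample() draws the substituted base for a
+		# ref 'A', and self.errSIE.sample() draws an indel-error length (illustrative)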
+		self.errP   = errorParams
+		self.errSSE = [DiscreteDistribution(n,NUCL) for n in self.errP[0]]
+		self.errSIE = DiscreteDistribution(self.errP[2],self.errP[3])
+		self.errSIN = DiscreteDistribution(self.errP[5],NUCL)
+
+		# adjust sequencing error frequency to match desired rate
+		if reScaledError == None:
+			self.errorScale = 1.0
+		else:
+			self.errorScale = reScaledError/avgError
+			print 'Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale)
+
+		if self.UNIFORM == False:
+			# adjust length to match desired read length
+			if self.readLen == len(initQ1):
+				self.qIndRemap = range(self.readLen)
+			else:
+				print 'Warning: Read length of error model ('+str(len(initQ1))+') does not match -R value ('+str(self.readLen)+'), rescaling model...'
+				self.qIndRemap = [max([1,len(initQ1)*n/readLen]) for n in xrange(readLen)]
+
+			# initialize probability distributions
+			self.initDistByPos1        = [DiscreteDistribution(initQ1[i],Qscores) for i in xrange(len(initQ1))]
+			self.probDistByPosByPrevQ1 = [None]
+			for i in xrange(1,len(initQ1)):
+				self.probDistByPosByPrevQ1.append([])
+				for j in xrange(len(initQ1[0])):
+					if np.sum(probQ1[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
+						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
+					else:
+						self.probDistByPosByPrevQ1[-1].append(DiscreteDistribution(probQ1[i][j],Qscores))
+
+			if self.PE_MODELS:
+				self.initDistByPos2        = [DiscreteDistribution(initQ2[i],Qscores) for i in xrange(len(initQ2))]
+				self.probDistByPosByPrevQ2 = [None]
+				for i in xrange(1,len(initQ2)):
+					self.probDistByPosByPrevQ2.append([])
+					for j in xrange(len(initQ2[0])):
+						if np.sum(probQ2[i][j]) <= 0.:	# if we don't have sufficient data for a transition, use the previous qscore
+							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution([1],[Qscores[j]],degenerateVal=Qscores[j]))
+						else:
+							self.probDistByPosByPrevQ2[-1].append(DiscreteDistribution(probQ2[i][j],Qscores))
+
+	def getSequencingErrors(self, readData, isReverseStrand=False):
+
+		qOut = [0]*self.readLen
+		sErr = []
+
+		if self.UNIFORM:
+			myQ  = [self.uniform_qscore + self.offQ for n in xrange(self.readLen)]
+			qOut = ''.join([chr(n) for n in myQ])
+			for i in xrange(self.readLen):
+				if random.random() < self.errorScale*self.qErrRate[self.uniform_qscore]:
+					sErr.append(i)
+		else:
+
+			if self.PE_MODELS and isReverseStrand:
+				myQ = self.initDistByPos2[0].sample()
+			else:
+				myQ = self.initDistByPos1[0].sample()
+			qOut[0] = myQ
+
+			for i in xrange(1,self.readLen):
+				if self.PE_MODELS and isReverseStrand:
+					myQ = self.probDistByPosByPrevQ2[self.qIndRemap[i]][myQ].sample()
+				else:
+					myQ = self.probDistByPosByPrevQ1[self.qIndRemap[i]][myQ].sample()
+				qOut[i] = myQ
+
+			if isReverseStrand:
+				qOut = qOut[::-1]
+
+			for i in xrange(self.readLen):
+				if random.random() < self.errorScale * self.qErrRate[qOut[i]]:
+					sErr.append(i)
+
+			qOut = ''.join([chr(n + self.offQ) for n in qOut])
+
+		if self.errorScale == 0.0:
+			return (qOut,[])
+
+		sOut = []
+		nDelSoFar = 0
+		# don't allow indel errors to occur on subsequent positions
+		prevIndel = -2
+		# don't allow other sequencing errors to occur on bases removed by deletion errors
+		delBlacklist = []
+
+		for ind in sErr[::-1]:	# for each error that we're going to insert...
+
+			# determine error type
+			isSub = True
+			if ind != 0 and ind != self.readLen-1-max(self.errP[3]) and abs(ind-prevIndel) > 1:
+				if random.random() < self.errP[1]:
+					isSub = False
+
+			# errorOut = (type, len, pos, ref, alt)
+
+			if isSub:								# insert substitution error
+				myNucl  = chr(readData[ind])
+				newNucl = self.errSSE[NUC_IND[myNucl]].sample()
+				sOut.append(('S',1,ind,myNucl,newNucl))
+			else:									# insert indel error
+				indelLen = self.errSIE.sample()
+				if random.random() < self.errP[4]:		# insertion error
+					myNucl  = chr(readData[ind])
+					newNucl = myNucl + ''.join([self.errSIN.sample() for n in xrange(indelLen)])
+					sOut.append(('I',len(newNucl)-1,ind,myNucl,newNucl))
+				elif ind < self.readLen-2-nDelSoFar:	# deletion error (prevent too many of them from stacking up)
+					myNucl  = str(readData[ind:ind+indelLen+1])
+					newNucl = chr(readData[ind])
+					nDelSoFar += len(myNucl)-1
+					sOut.append(('D',len(myNucl)-1,ind,myNucl,newNucl))
+					for i in xrange(ind+1,ind+indelLen+1):
+						delBlacklist.append(i)
+				prevIndel = ind
+
+		# remove blacklisted errors
+		for i in xrange(len(sOut)-1,-1,-1):
+			if sOut[i][2] in delBlacklist:
+				del sOut[i]
+
+		return (qOut,sOut)
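+
+	# illustrative usage sketch (the error-model path is a placeholder, not a file
+	# shipped with this module):
+	#   rc = ReadContainer(readLen=100, errorModel='models/my_errorModel.p', reScaledError=None)
+	#   (qualStr, sErrs) = rc.getSequencingErrors(bytearray('ACGT'*25))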
+
+
+
+"""************************************************
+****          DEFAULT MUTATION MODELS
+************************************************"""
+
+
+# parse mutation model pickle file
+def parseInputMutationModel(model=None, whichDefault=1):
+	if whichDefault == 1:
+		outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_1]
+	elif whichDefault == 2:
+		outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_2]
+	else:
+		print '\nError: Unknown default mutation model specified\n'
+		exit(1)
+
+	if model != None:
+		pickle_dict = pickle.load(open(model,"rb"))
+		outModel[0] = pickle_dict['AVG_MUT_RATE']
+		outModel[2] = 1. - pickle_dict['SNP_FREQ']
+
+		insList     = pickle_dict['INDEL_FREQ']
+		if len(insList):
+			insCount = sum([insList[k] for k in insList.keys() if k >= 1])
+			delCount = sum([insList[k] for k in insList.keys() if k <= -1])
+			insVals  = [k for k in sorted(insList.keys()) if k >= 1]
+			insWght  = [insList[k]/float(insCount) for k in insVals]
+			delVals  = [k for k in sorted([abs(k) for k in insList.keys() if k <= -1])]
+			delWght  = [insList[-k]/float(delCount) for k in delVals]
+		else:	# degenerate case where no indel stats are provided
+			insCount = 1
+			delCount = 1
+			insVals  = [1]
+			insWght  = [1.0]
+			delVals  = [1]
+			delWght  = [1.0]
+		outModel[3] = insCount/float(insCount + delCount)
+		outModel[4] = insVals
+		outModel[5] = insWght
+		outModel[6] = delVals
+		outModel[7] = delWght
+
+		trinuc_trans_prob = pickle_dict['TRINUC_TRANS_PROBS']
+		for k in sorted(trinuc_trans_prob.keys()):
+			myInd   = TRI_IND[k[0][0]+k[0][2]]
+			(k1,k2) = (NUC_IND[k[0][1]],NUC_IND[k[1][1]])
+			outModel[8][myInd][k1][k2] = trinuc_trans_prob[k]
+		for i in xrange(len(outModel[8])):
+			for j in xrange(len(outModel[8][i])):
+				# if trinuc not present in input mutation model, assign it uniform probability
+				rowSum = float(sum(outModel[8][i][j]))
+				if rowSum < 1e-12:
+					outModel[8][i][j] = [0.25,0.25,0.25,0.25]
+				else:
+					# normalize against the fixed pre-division row sum
+					outModel[8][i][j] = [v/rowSum for v in outModel[8][i][j]]
+
+		trinuc_mut_prob    = pickle_dict['TRINUC_MUT_PROB']
+		which_have_we_seen = {n:False for n in ALL_TRI}
+		trinuc_mean        = np.mean(trinuc_mut_prob.values())
+		for trinuc in trinuc_mut_prob.keys():
+			outModel[9][ALL_IND[trinuc]] = trinuc_mut_prob[trinuc]
+			which_have_we_seen[trinuc]   = True
+		for trinuc in which_have_we_seen.keys():
+			if which_have_we_seen[trinuc] == False:
+				outModel[9][ALL_IND[trinuc]] = trinuc_mean
+
+	return outModel
+
+
+# parse mutation model files, returns default model if no model directory is specified
+#
+# OLD FUNCTION THAT PROCESSED OUTDATED TEXTFILE MUTATION MODELS
+def parseInputMutationModel_deprecated(prefix=None, whichDefault=1):
+	if whichDefault == 1:
+		outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_1]
+	elif whichDefault == 2:
+		outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_2]
+	else:
+		print '\nError: Unknown default mutation model specified\n'
+		exit(1)
+
+	if prefix != None:
+		if prefix[-1] != '/':
+			prefix += '/'
+		if not os.path.isdir(prefix):
+			print '\nError: Input mutation model directory not found:',prefix,'\n'
+			exit(1)
+
+		print 'Reading in mutation model...'
+		listing1 = [n for n in os.listdir(prefix) if n[-5:] == '.prob']
+		listing2 = [n for n in os.listdir(prefix) if n[-7:] == '.trinuc']
+		listing  = sorted(listing1) + sorted(listing2)
+		for l in listing:
+			f = open(prefix+l,'r')
+			fr = [n.split('\t') for n in f.read().split('\n')]
+			f.close()
+
+			if '_overall.prob' in l:
+				myIns = None
+				myDel = None
+				for dat in fr[1:]:
+					if len(dat) == 2:
+						if dat[0] == 'insertion':
+							myIns = float(dat[1])
+						elif dat[0] == 'deletion':
+							myDel = float(dat[1])
+				if myIns != None and myDel != None:
+					outModel[2] = myIns + myDel
+					outModel[3] = myIns / (myIns + myDel)
+					print '-',l
+
+			if '_insLength.prob' in l:
+				insVals = {}
+				for dat in fr[1:]:
+					if len(dat) == 2:
+						insVals[int(dat[0])] = float(dat[1])
+				if len(insVals):
+					outModel[4] = sorted(insVals.keys())
+					outModel[5] = [insVals[n] for n in outModel[4]]
+					print '-',l
+
+			if '_delLength.prob' in l:
+				delVals = {}
+				for dat in fr[1:]:
+					if len(dat) == 2:
+						delVals[int(dat[0])] = float(dat[1])
+				if len(delVals):
+					outModel[6] = sorted(delVals.keys())
+					outModel[7] = [delVals[n] for n in outModel[6]]
+					print '-',l
+
+			if '.trinuc' == l[-7:]:
+				context_ind = TRI_IND[l[-10]+l[-8]]
+				p_matrix    = [[-1,-1,-1,-1],[-1,-1,-1,-1],[-1,-1,-1,-1],[-1,-1,-1,-1]]
+				for i in xrange(len(p_matrix)):
+					for j in xrange(len(fr[i])):
+						p_matrix[i][j] = float(fr[i][j])
+				anyNone = False
+				for i in xrange(len(p_matrix)):
+					for j in xrange(len(p_matrix[i])):
+						if p_matrix[i][j] == -1:
+							anyNone = True
+				if not anyNone:
+					outModel[8][context_ind] = copy.deepcopy(p_matrix)
+					print '-',l
+
+	return outModel
+
+######################
+#	DEFAULT VALUES   #
+######################
+
+DEFAULT_1_OVERALL_MUT_RATE   = 0.001
+DEFAULT_1_HOMOZYGOUS_FREQ    = 0.010
+DEFAULT_1_INDEL_FRACTION     = 0.05
+DEFAULT_1_INS_VS_DEL         = 0.6
+DEFAULT_1_INS_LENGTH_VALUES  = [1,2,3,4,5,6,7,8,9,10]
+DEFAULT_1_INS_LENGTH_WEIGHTS = [0.4, 0.2, 0.1, 0.05, 0.05, 0.05, 0.05, 0.034, 0.033, 0.033]
+DEFAULT_1_DEL_LENGTH_VALUES  = [1,2,3,4,5]
+DEFAULT_1_DEL_LENGTH_WEIGHTS = [0.3,0.2,0.2,0.2,0.1]
+example_matrix_1             = [[0.0, 0.15, 0.7, 0.15],
+						        [0.15, 0.0, 0.15, 0.7],
+						        [0.7, 0.15, 0.0, 0.15],
+						        [0.15, 0.7, 0.15, 0.0]]
+DEFAULT_1_TRI_FREQS          = [copy.deepcopy(example_matrix_1) for n in xrange(16)]
+DEFAULT_1_TRINUC_BIAS        = [1./float(len(ALL_TRI)) for n in ALL_TRI]
+DEFAULT_MODEL_1              = [DEFAULT_1_OVERALL_MUT_RATE,
+							    DEFAULT_1_HOMOZYGOUS_FREQ,
+							    DEFAULT_1_INDEL_FRACTION,
+							    DEFAULT_1_INS_VS_DEL,
+							    DEFAULT_1_INS_LENGTH_VALUES,
+							    DEFAULT_1_INS_LENGTH_WEIGHTS,
+							    DEFAULT_1_DEL_LENGTH_VALUES,
+							    DEFAULT_1_DEL_LENGTH_WEIGHTS,
+							    DEFAULT_1_TRI_FREQS,
+							    DEFAULT_1_TRINUC_BIAS]
+
+DEFAULT_2_OVERALL_MUT_RATE   = 0.002
+DEFAULT_2_HOMOZYGOUS_FREQ    = 0.200
+DEFAULT_2_INDEL_FRACTION     = 0.1
+DEFAULT_2_INS_VS_DEL         = 0.3
+DEFAULT_2_INS_LENGTH_VALUES  = [1,2,3,4,5,6,7,8,9,10]
+DEFAULT_2_INS_LENGTH_WEIGHTS = [0.1, 0.1, 0.2, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
+DEFAULT_2_DEL_LENGTH_VALUES  = [1,2,3,4,5]
+DEFAULT_2_DEL_LENGTH_WEIGHTS = [0.3,0.2,0.2,0.2,0.1]
+example_matrix_2             = [[0.0, 0.15, 0.7, 0.15],
+						        [0.15, 0.0, 0.15, 0.7],
+						        [0.7, 0.15, 0.0, 0.15],
+						        [0.15, 0.7, 0.15, 0.0]]
+DEFAULT_2_TRI_FREQS          = [copy.deepcopy(example_matrix_2) for n in xrange(16)]
+DEFAULT_2_TRINUC_BIAS        = [1./float(len(ALL_TRI)) for n in ALL_TRI]
+DEFAULT_MODEL_2              = [DEFAULT_2_OVERALL_MUT_RATE,
+							    DEFAULT_2_HOMOZYGOUS_FREQ,
+							    DEFAULT_2_INDEL_FRACTION,
+							    DEFAULT_2_INS_VS_DEL,
+							    DEFAULT_2_INS_LENGTH_VALUES,
+							    DEFAULT_2_INS_LENGTH_WEIGHTS,
+							    DEFAULT_2_DEL_LENGTH_VALUES,
+							    DEFAULT_2_DEL_LENGTH_WEIGHTS,
+							    DEFAULT_2_TRI_FREQS,
+							    DEFAULT_2_TRINUC_BIAS]
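+
+# illustrative usage sketch (toy sequence; real callers pass reference windows and
+# may supply parsed mutation models):
+#   sc = SequenceContainer(0, 'ACGT'*25000, ploidy=2, windowOverlap=10, readLen=100)
+#   vcf_entries = sc.random_mutations()	# (pos, ref, alt, freq, 'WP=...') tuples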
+
+