Mercurial > repos > thondeboer > neat_genreads
annotate py/SequenceContainer.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer | 
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 0 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1 import random | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 2 import copy | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 3 import re | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 4 import os | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 5 import bisect | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 6 import cPickle as pickle | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 7 import numpy as np | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 8 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 9 from probability import DiscreteDistribution, poisson_list, quantize_list | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 10 from cigar import CigarString | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 11 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
# upper bound on tries when looking for a valid position for a mutation
MAX_ATTEMPTS = 100
# largest share of a window that may contain mutations (0.3 == 30%)
MAX_MUTFRAC = 0.3

# nucleotide alphabet; its order defines every index table below
NUCL = ['A', 'C', 'G', 'T']
# index of each ordered nucleotide pair (note: keys are 2-mers despite the name)
TRI_IND = {first + second: 4 * row + col
           for row, first in enumerate(NUCL)
           for col, second in enumerate(NUCL)}
# index of each single nucleotide
NUC_IND = {nucl: idx for idx, nucl in enumerate(NUCL)}
# all 64 trinucleotides, enumerated in NUCL order, and their indices
ALL_TRI = [first + second + third for first in NUCL for second in NUCL for third in NUCL]
ALL_IND = {trinuc: idx for idx, trinuc in enumerate(ALL_TRI)}

# DEBUG
IGNORE_TRINUC = False

# percentile step used when quantizing fragment lengths
COV_FRAGLEN_PERCENTILE = 10.
# sentinel larger than any fragment length
LARGE_NUMBER = 9999999999
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 28 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 29 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 30 # Container for reference sequences, applies mutations | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 31 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 32 class SequenceContainer: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def __init__(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=None, mutRate=None, onlyVCF=False):
    """Create a container for one reference window and sample its variant counts.

    xOffset        -- stored as self.x by init_basicVars
    sequence       -- reference sequence; copied once per ploid
    ploidy         -- number of sequence copies to maintain
    windowOverlap  -- overlap with the neighbouring window; stored as self.winBuffer
    readLen        -- read length
    mutationModels -- one mutation model per ploid; None or [] selects the defaults
    mutRate        -- optional mutation rate override (None keeps the model rates)
    onlyVCF        -- when True, init_coverage skips coverage initialisation
    """
    # A literal [] default would be a single list object shared by every call,
    # so None is used as the "not supplied" marker and normalised here.
    if mutationModels is None:
        mutationModels = []

    # initialize basic variables
    self.onlyVCF = onlyVCF
    self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
    # initialize mutation models
    self.init_mutModels(mutationModels, mutRate)
    # sample the number of variants that will be inserted into each ploid
    # (init_poisson is not visible here; it presumably sets ind_pois / snp_pois)
    self.init_poisson()
    self.indelsToAdd = [n.sample() for n in self.ind_pois]
    self.snpsToAdd = [n.sample() for n in self.snp_pois]
    # initialize trinuc snp bias
    self.init_trinucBias()
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 45 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def init_basicVars(self, xOffset, sequence, ploidy, windowOverlap, readLen):
    """Store the window parameters and create empty per-ploid bookkeeping.

    Every per-ploid attribute is a list with one independent entry per ploid.
    """
    self.x = xOffset
    self.ploidy = ploidy
    self.readLen = readLen
    self.seqLen = len(sequence)
    self.winBuffer = windowOverlap

    ploids = range(self.ploidy)
    # each ploid gets its own mutable copy of the reference
    self.sequences = [bytearray(sequence) for _ in ploids]
    self.indelList = [[] for _ in ploids]
    self.snpList = [[] for _ in ploids]
    self.allCigar = [[] for _ in ploids]
    self.FM_pos = [[] for _ in ploids]
    self.FM_span = [[] for _ in ploids]
    self.adj = [None] * self.ploidy

    # blackList[ploid][pos] codes:
    #   0 = safe to insert a variant here
    #   1 = indel inserted here
    #   2 = snp inserted here
    #   3 = invalid position for various processing reasons
    self.blackList = [np.zeros(self.seqLen, dtype='<i4') for _ in ploids]

    # disallow mutations to occur on window overlap points
    for ploid_mask in self.blackList:
        ploid_mask[-self.winBuffer] = 3
        ploid_mask[-self.winBuffer - 1] = 3
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 69 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def init_coverage(self, coverageDat, fragDist=None):
    """Build the read-position sampling distributions from GC and target coverage.

    coverageDat -- 3-tuple (windowSize, gc_scalars, targetCov_vals);
                   gc_scalars is indexed by the G+C count of a window and
                   targetCov_vals by the positions stored in self.FM_pos
    fragDist    -- optional fragment-length distribution exposing .values and
                   .cumP; None builds a single distribution per ploid

    Sets self.windowSize and self.coverage_distribution, plus
    self.fraglens_indMap when fragDist is given.  self.FM_pos / self.FM_span
    must already be populated (that happens outside the visible code).

    Returns the average coverage over all ploids, or None in VCF-only mode.
    """
    # if we're only creating a vcf, skip the expensive coverage initialization
    if self.onlyVCF:
        return None

    (self.windowSize, gc_scalars, targetCov_vals) = coverageDat
    win = self.windowSize
    read_len = self.readLen
    # NOTE(review): for readLen < 2 this is 0 and the [:-0] slice below is empty
    half_read = int(read_len / 2.)

    # The fragment-length quantization depends only on fragDist, so it is
    # computed once here rather than once per ploid.
    frag_bins = None
    if fragDist is not None:
        # keep one representative length every COV_FRAGLEN_PERCENTILE percent
        step = COV_FRAGLEN_PERCENTILE / 100.0
        currentThresh = 0.
        index_list = [0]
        for idx, cum_p in enumerate(fragDist.cumP):
            if cum_p >= currentThresh + step:
                currentThresh = cum_p
                index_list.append(idx)
        flq = [fragDist.values[idx] for idx in index_list]
        if fragDist.values[-1] not in flq:
            flq.append(fragDist.values[-1])
        # sentinel so that flq[bisect(...)] is always a valid index
        flq.append(LARGE_NUMBER)

        # map every fragment length onto its nearest representative
        self.fraglens_indMap = {}
        for frag_len in fragDist.values:
            bInd = bisect.bisect(flq, frag_len)
            if abs(flq[bInd - 1] - frag_len) <= abs(flq[bInd] - frag_len):
                self.fraglens_indMap[frag_len] = flq[bInd - 1]
            else:
                self.fraglens_indMap[frag_len] = flq[bInd]

        # for each representative: the longest fragment mapped to it
        # (never less than the read length)
        frag_bins = []
        for flv in sorted(set(self.fraglens_indMap.values())):
            buffer_val = read_len
            for frag_len in fragDist.values:
                if self.fraglens_indMap[frag_len] == flv and frag_len > buffer_val:
                    buffer_val = frag_len
            frag_bins.append((flv, buffer_val))

    self.coverage_distribution = []
    avg_out = []
    for i in xrange(len(self.sequences)):
        seq = self.sequences[i]
        seq_len = len(seq)
        fm_pos = self.FM_pos[i]
        fm_span = self.FM_span[i]

        # compute gc-bias: one scalar per window, repeated for every base in it
        gc_cov = []
        j = 0
        while j + win < seq_len:
            window = seq[j:j + win]
            gc_c = window.count('G') + window.count('C')
            gc_cov.extend([gc_scalars[gc_c]] * win)
            j += win
        # remaining bases take the scalar of the final windowSize bases
        tail = seq[-win:]
        gc_c = tail.count('G') + tail.count('C')
        gc_cov.extend([gc_scalars[gc_c]] * (seq_len - len(gc_cov)))

        # target coverage per position; positions with no FM_pos entry reuse
        # the value at the most recent position that had one
        tr_cov = [targetCov_vals[0]]
        prevVal = fm_pos[0]
        for j in xrange(1, seq_len - read_len):
            if fm_pos[j] is None:
                tr_cov.append(targetCov_vals[prevVal])
            else:
                tr_cov.append(sum(targetCov_vals[fm_pos[j]:fm_span[j]]) / float(fm_span[j] - fm_pos[j]))
                prevVal = fm_pos[j]
        # shift by half of read length
        tr_cov = [0.0] * half_read + tr_cov[:-half_read]
        # fill in missing indices
        tr_cov.extend([0.0] * (seq_len - len(tr_cov)))

        # cumulative combined coverage makes any window sum a difference of two entries
        covvec = np.cumsum([tr_cov[k] * gc_cov[k] for k in xrange(len(tr_cov))])
        coverage_vals = [covvec[j + read_len] - covvec[j] for j in xrange(seq_len - read_len)]
        avg_out.append(np.mean(coverage_vals) / float(read_len))

        if fragDist is None:
            self.coverage_distribution.append(DiscreteDistribution(coverage_vals, range(len(coverage_vals))))
        else:
            # one distribution per quantized fragment length
            per_fraglen = {}
            self.coverage_distribution.append(per_fraglen)
            for flv, buffer_val in frag_bins:
                # combined coverage of the two read-length windows at either end of the fragment
                coverage_vals = [covvec[j + read_len] - covvec[j] + covvec[j + flv] - covvec[j + flv - read_len]
                                 for j in xrange(seq_len - buffer_val)]
                per_fraglen[flv] = DiscreteDistribution(coverage_vals, range(len(coverage_vals)))

    return np.mean(avg_out)
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 160 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def init_mutModels(self, mutationModels, mutRate):
    """Load the per-ploid mutation models and build their sampling distributions.

    mutationModels -- one raw model per ploid; an empty value (or None)
                      selects a deep copy of DEFAULT_MODEL_1 for every ploid
    mutRate        -- if not None, the average rate the model mutation rates
                      are rescaled to

    Sets self.modelData, self.mutRescale, self.mutScalar, self.ploidMutFrac,
    self.ploidMutPrior and self.models.  Prints an error and raises
    SystemExit(1) when the number of models differs from self.ploidy.
    """
    if not mutationModels:
        # NOTE(review): DEFAULT_MODEL_1 is not defined in the visible part of this file
        self.modelData = [copy.deepcopy(DEFAULT_MODEL_1) for _ in range(self.ploidy)]
    else:
        if len(mutationModels) != self.ploidy:
            # single parenthesised string: prints identically on Python 2 and 3
            print('\nError: Number of mutation models recieved is not equal to specified ploidy\n')
            # raise directly instead of relying on the site-provided exit() helper
            raise SystemExit(1)
        self.modelData = copy.deepcopy(mutationModels)

    # do we need to rescale mutation frequencies?
    mutRateSum = sum([model[0] for model in self.modelData])
    self.mutRescale = mutRate
    if self.mutRescale is None:
        self.mutScalar = 1.0
    else:
        # scale so the average model rate equals the requested rate
        self.mutScalar = float(self.mutRescale) / (mutRateSum / float(len(self.modelData)))

    # how are mutations spread to each ploid, based on their specified mut rates?
    self.ploidMutFrac = [float(model[0]) / mutRateSum for model in self.modelData]
    self.ploidMutPrior = DiscreteDistribution(self.ploidMutFrac, range(self.ploidy))

    # init mutation models
    #
    # self.models[ploid][0] = average mutation rate
    # self.models[ploid][1] = p(mut is homozygous | mutation occurs)
    # self.models[ploid][2] = p(mut is indel | mut occurs)
    # self.models[ploid][3] = p(insertion | indel occurs)
    # self.models[ploid][4] = distribution of insertion lengths
    # self.models[ploid][5] = distribution of deletion lengths
    # self.models[ploid][6] = distribution of trinucleotide SNP transitions
    # self.models[ploid][7] = p(trinuc mutates)
    self.models = []
    for raw in self.modelData:
        model = [self.mutScalar * raw[0], raw[1], raw[2], raw[3],
                 DiscreteDistribution(raw[5], raw[4]),
                 DiscreteDistribution(raw[7], raw[6]),
                 []]
        self.models.append(model)
        # each entry of raw[8] supplies four weight lists, one distribution over NUCL each
        for weights in raw[8]:
            model[6].append([DiscreteDistribution(weights[k], NUCL) for k in range(4)])
        model.append(list(raw[9]))
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 202 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def init_poisson(self):
    """Build the per-ploid tables used to draw how many indels / SNPs to add.

    For every ploid model the expected event count is
    seqLen * (average mutation rate) * p(indel or SNP) * (ploid's share of
    mutations), using models[i][0], models[i][2] and ploidMutFrac[i].
    The results are stored in self.ind_pois and self.snp_pois (one entry
    per model); update() later calls .sample() on each entry.
    """
    indelLambdas = []
    snpLambdas = []
    for idx, model in enumerate(self.models):
        # expected number of mutations of any kind before the ploid split
        expectedMuts = self.seqLen*model[0]
        indelLambdas.append(expectedMuts*model[2]*self.ploidMutFrac[idx])
        snpLambdas.append(expectedMuts*(1.-model[2])*self.ploidMutFrac[idx])

    # candidate event counts: 0 .. int(seqLen*MAX_MUTFRAC)-1
    k_range = range(int(self.seqLen*MAX_MUTFRAC))
    self.ind_pois = [poisson_list(k_range, lam) for lam in indelLambdas]
    self.snp_pois = [poisson_list(k_range, lam) for lam in snpLambdas]
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 209 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def init_trinucBias(self):
    """Compute the positional SNP bias of every ploid from its trinucleotides.

    For each position strictly inside (winBuffer, seqLen-1) the weight is
    models[p][7][...] ("p(trinuc mutates)") looked up through ALL_IND with
    the 3-base string centred on that position. The weights and their
    positions are wrapped in a DiscreteDistribution stored in
    self.trinuc_bias[p]. ONLY AFFECTS SNPs.

    note: since indels are added before snps, it's possible these
    positional biases aren't correctly utilized at positions affected by
    indels. At the moment this is considered negligible.
    """
    firstPos = self.winBuffer+1
    endPos = self.seqLen-1
    self.trinuc_bias = [None]*self.ploidy
    for p in xrange(self.ploidy):
        trinucProb = self.models[p][7]
        seq = self.sequences[p]
        # full-length, zero-initialised weight vector for this ploid
        weights = [0.]*self.seqLen
        for pos in xrange(firstPos, endPos):
            weights[pos] = trinucProb[ALL_IND[str(seq[pos-1:pos+2])]]
        self.trinuc_bias[p] = DiscreteDistribution(weights[firstPos:endPos],
                                                   range(firstPos, endPos))
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 221 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def update(self, xOffset, sequence, ploidy, windowOverlap, readLen, mutationModels=None, mutRate=None):
    """Re-point this container at a new sequence window.

    xOffset, sequence, ploidy, windowOverlap and readLen are forwarded to
    init_basicVars(). mutationModels (default: none given) and mutRate are
    forwarded to init_mutModels() whenever the ploidy or the mutation rate
    differs from the stored one, or a non-empty model list is supplied.

    After the call self.indelsToAdd / self.snpsToAdd hold freshly sampled
    per-ploid event counts and, unless IGNORE_TRINUC is set, the
    trinucleotide SNP bias has been rebuilt for the new window.
    """
    # use a fresh list per call instead of a shared mutable default; the
    # "no models supplied" value seen by init_mutModels() is still []
    if mutationModels is None:
        mutationModels = []

    def poissonInputs():
        # the per-ploid values init_poisson() reads besides self.seqLen
        return ([(m[0], m[2]) for m in self.models], list(self.ploidMutFrac))

    ratesChanged = False
    # if mutation model is changed, we have to reinitialize it...
    if ploidy != self.ploidy or mutRate != self.mutRescale or mutationModels != []:
        before = poissonInputs()
        self.ploidy = ploidy
        self.mutRescale = mutRate
        self.init_mutModels(mutationModels, mutRate)
        ratesChanged = poissonInputs() != before

    # the snp/indel poissons depend on the sequence length AND on the model
    # rates / ploid fractions, so redo them when either changed (previously
    # a model change on a same-length window left stale poissons behind)
    if len(sequence) != self.seqLen or ratesChanged:
        self.seqLen = len(sequence)
        self.init_poisson()

    # basic vars
    self.init_basicVars(xOffset, sequence, ploidy, windowOverlap, readLen)
    self.indelsToAdd = [n.sample() for n in self.ind_pois]
    self.snpsToAdd = [n.sample() for n in self.snp_pois]

    # initialize trinuc snp bias
    if not IGNORE_TRINUC:
        self.init_trinucBias()
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 240 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def insert_mutations(self, inputList):
    """Register user-supplied variants on the ploids named by their genotype.

    Each entry of inputList is indexed as:
      [0]    position (self.x is subtracted to get the window offset)
      [1]    ref allele string
      [2]    list of alt allele strings (only the first alt is used)
      [4][0] genotype string such as '0/1' or '1|0', or None

    Ploid selection:
      * genotype None      -> one ploid drawn from self.ploidMutPrior
      * contains '/' or '|' -> every field equal to '1' selects that ploid
      * anything else      -> ploid 0 (monoploidy assumed)
    Ploid indices >= self.ploidy are ignored.

    Single-base ref/alt pairs are appended to self.snpList[p] and marked 2
    in self.blackList[p]; all other variants are appended to
    self.indelList[p] and mark positions pos..pos+inLen as 1. A variant is
    silently skipped for a ploid when any position it would mark is already
    blacklisted, or when an indel's footprint would run past the window end.

    Raises SystemExit(1) when the variant start lies outside the window.

    TODO: user-input variants, determine which ploid to put it on, etc.
    """
    for inpV in inputList:
        wps = inpV[4][0]
        if wps is None:
            # if no genotype given, assume heterozygous and choose a single
            # ploid based on their mut rates
            whichPloid = [self.ploidMutPrior.sample()]
        elif '/' in wps or '|' in wps:
            splt = wps.split('/') if '/' in wps else wps.split('|')
            whichPloid = [i for i, allele in enumerate(splt) if allele == '1']
        else:
            # otherwise assume monoploidy
            whichPloid = [0]

        # ignore invalid ploids
        whichPloid = [p for p in whichPloid if p < self.ploidy]

        for p in whichPloid:
            # assume we're just using first alt for inserted variants
            myAlt = inpV[2][0]
            myVar = (inpV[0]-self.x, inpV[1], myAlt)
            inLen = max(len(inpV[1]), len(myAlt))
            winLen = len(self.blackList[p])

            if myVar[0] < 0 or myVar[0] >= winLen:
                print('\nError: Attempting to insert variant out of window bounds:')
                print(str(myVar) + ' --> blackList[0:' + str(winLen) + ']\n')
                raise SystemExit(1)

            if len(inpV[1]) == 1 and len(myAlt) == 1:
                # SNP: a single position must be free
                if self.blackList[p][myVar[0]]:
                    continue
                self.snpList[p].append(myVar)
                self.blackList[p][myVar[0]] = 2
            else:
                # indel: the whole footprint pos..pos+inLen must lie inside
                # the window and be free, otherwise skip it on this ploid
                footprint = range(myVar[0], myVar[0]+inLen+1)
                if myVar[0]+inLen >= winLen or any(self.blackList[p][k] for k in footprint):
                    continue
                for k in footprint:
                    self.blackList[p][k] = 1
                self.indelList[p].append(myVar)
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 301 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 302 def random_mutations(self): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 303 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 304 # add random indels | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 305 all_indels = [[] for n in self.sequences] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 306 for i in xrange(self.ploidy): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 307 for j in xrange(self.indelsToAdd[i]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 308 if random.random() <= self.models[i][1]: # insert homozygous indel | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 309 whichPloid = range(self.ploidy) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 310 else: # insert heterozygous indel | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 311 whichPloid = [self.ploidMutPrior.sample()] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 312 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 313 # try to find suitable places to insert indels | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 314 eventPos = -1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 315 for attempt in xrange(MAX_ATTEMPTS): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 316 eventPos = random.randint(self.winBuffer,self.seqLen-1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 317 for p in whichPloid: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 318 if self.blackList[p][eventPos]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 319 eventPos = -1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 320 if eventPos != -1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 321 break | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 322 if eventPos == -1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 323 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 324 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 325 if random.random() <= self.models[i][3]: # insertion | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 326 inLen = self.models[i][4].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 327 # sequence content of random insertions is uniformly random (change this later) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 328 inSeq = ''.join([random.choice(NUCL) for n in xrange(inLen)]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 329 refNucl = chr(self.sequences[i][eventPos]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 330 myIndel = (eventPos,refNucl,refNucl+inSeq) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 331 else: # deletion | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 332 inLen = self.models[i][5].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 333 if eventPos+inLen+1 >= len(self.sequences[i]): # skip if deletion too close to boundary | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 334 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 335 if inLen == 1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 336 inSeq = chr(self.sequences[i][eventPos+1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 337 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 338 inSeq = str(self.sequences[i][eventPos+1:eventPos+inLen+1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 339 refNucl = chr(self.sequences[i][eventPos]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 340 myIndel = (eventPos,refNucl+inSeq,refNucl) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 341 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 342 # if event too close to boundary, skip. if event conflicts with other indel, skip. | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 343 skipEvent = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 344 if eventPos+len(myIndel[1]) >= self.seqLen-self.winBuffer-1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 345 skipEvent = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 346 if skipEvent: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 347 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 348 for p in whichPloid: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 349 for k in xrange(eventPos,eventPos+inLen+1): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 350 if self.blackList[p][k]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 351 skipEvent = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 352 if skipEvent: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 353 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 354 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 355 for p in whichPloid: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 356 for k in xrange(eventPos,eventPos+inLen+1): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 357 self.blackList[p][k] = 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 358 all_indels[p].append(myIndel) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 359 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 360 for i in xrange(len(all_indels)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 361 all_indels[i].extend(self.indelList[i]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 362 all_indels = [sorted(n,reverse=True) for n in all_indels] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 363 #print all_indels | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 364 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 365 # add random snps | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 366 all_snps = [[] for n in self.sequences] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 367 for i in xrange(self.ploidy): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 368 for j in xrange(self.snpsToAdd[i]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 369 if random.random() <= self.models[i][1]: # insert homozygous SNP | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 370 whichPloid = range(self.ploidy) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 371 else: # insert heterozygous SNP | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 372 whichPloid = [self.ploidMutPrior.sample()] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 373 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 374 # try to find suitable places to insert snps | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 375 eventPos = -1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 376 for attempt in xrange(MAX_ATTEMPTS): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 377 # based on the mutation model for the specified ploid, choose a SNP location based on trinuc bias | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 378 # (if there are multiple ploids, choose one at random) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 379 if IGNORE_TRINUC: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 380 eventPos = random.randint(self.winBuffer+1,self.seqLen-2) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 381 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 382 ploid_to_use = whichPloid[random.randint(0,len(whichPloid)-1)] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 383 eventPos = self.trinuc_bias[ploid_to_use].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 384 for p in whichPloid: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 385 if self.blackList[p][eventPos]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 386 eventPos = -1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 387 if eventPos != -1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 388 break | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 389 if eventPos == -1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 390 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 391 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 392 refNucl = chr(self.sequences[i][eventPos]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 393 context = str(chr(self.sequences[i][eventPos-1])+chr(self.sequences[i][eventPos+1])) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 394 # sample from tri-nucleotide substitution matrices to get SNP alt allele | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 395 newNucl = self.models[i][6][TRI_IND[context]][NUC_IND[refNucl]].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 396 mySNP = (eventPos,refNucl,newNucl) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 397 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 398 for p in whichPloid: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 399 all_snps[p].append(mySNP) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 400 self.blackList[p][mySNP[0]] = 2 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 401 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 402 # combine random snps with inserted snps, remove any snps that overlap indels | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 403 for p in xrange(len(all_snps)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 404 all_snps[p].extend(self.snpList[p]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 405 all_snps[p] = [n for n in all_snps[p] if self.blackList[p][n[0]] != 1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 406 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 407 # modify reference sequences | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 408 for i in xrange(len(all_snps)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 409 for j in xrange(len(all_snps[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 410 # sanity checking (for debugging purposes) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 411 vPos = all_snps[i][j][0] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 412 if all_snps[i][j][1] != chr(self.sequences[i][vPos]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 413 print '\nError: Something went wrong!\n', all_snps[i][j], chr(self.sequences[i][vPos]),'\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 414 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 415 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 416 self.sequences[i][vPos] = all_snps[i][j][2] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 417 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 418 adjToAdd = [[] for n in xrange(self.ploidy)] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 419 for i in xrange(len(all_indels)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 420 for j in xrange(len(all_indels[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 421 # sanity checking (for debugging purposes) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 422 vPos = all_indels[i][j][0] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 423 vPos2 = vPos + len(all_indels[i][j][1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 424 #print all_indels[i][j], str(self.sequences[i][vPos:vPos2]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 425 #print len(self.sequences[i]),'-->', | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 426 if all_indels[i][j][1] != str(self.sequences[i][vPos:vPos2]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 427 print '\nError: Something went wrong!\n', all_indels[i][j], str(self.sequences[i][vPos:vPos2]),'\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 428 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 429 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 430 self.sequences[i] = self.sequences[i][:vPos] + bytearray(all_indels[i][j][2]) + self.sequences[i][vPos2:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 431 adjToAdd[i].append((all_indels[i][j][0],len(all_indels[i][j][2])-len(all_indels[i][j][1]))) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 432 #print len(self.sequences[i]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 433 adjToAdd[i].sort() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 434 #print adjToAdd[i] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 435 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 436 self.adj[i] = np.zeros(len(self.sequences[i]),dtype='<i4') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 437 indSoFar = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 438 valSoFar = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 439 for j in xrange(len(self.adj[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 440 if indSoFar < len(adjToAdd[i]) and j >= adjToAdd[i][indSoFar][0]+1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 441 valSoFar += adjToAdd[i][indSoFar][1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 442 indSoFar += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 443 self.adj[i][j] = valSoFar | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 444 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 445 # precompute cigar strings (we can skip this if going for only vcf output) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 446 if not self.onlyVCF: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 447 tempSymbolString = ['M'] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 448 prevVal = self.adj[i][0] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 449 j = 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 450 while j < len(self.adj[i]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 451 diff = self.adj[i][j] - prevVal | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 452 prevVal = self.adj[i][j] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 453 if diff > 0: # insertion | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 454 tempSymbolString.extend(['I']*abs(diff)) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 455 j += abs(diff) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 456 elif diff < 0: # deletion | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 457 tempSymbolString.append('D'*abs(diff)+'M') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 458 j += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 459 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 460 tempSymbolString.append('M') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 461 j += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 462 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 463 for j in xrange(len(tempSymbolString)-self.readLen): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 464 self.allCigar[i].append(CigarString(listIn=tempSymbolString[j:j+self.readLen]).getString()) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 465 # pre-compute reference position of first matching base | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 466 my_fm_pos = None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 467 for k in xrange(self.readLen): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 468 if 'M' in tempSymbolString[j+k]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 469 my_fm_pos = j+k | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 470 break | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 471 if my_fm_pos == None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 472 self.FM_pos[i].append(None) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 473 self.FM_span[i].append(None) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 474 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 475 self.FM_pos[i].append(my_fm_pos-self.adj[i][my_fm_pos]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 476 span_dif = len([nnn for nnn in tempSymbolString[j:j+self.readLen] if 'M' in nnn]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 477 self.FM_span[i].append(self.FM_pos[i][-1] + span_dif) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 478 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 479 # tally up variants implemented | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 480 countDict = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 481 all_variants = [sorted(all_snps[i]+all_indels[i]) for i in xrange(self.ploidy)] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 482 for i in xrange(len(all_variants)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 483 for j in xrange(len(all_variants[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 484 all_variants[i][j] = tuple([all_variants[i][j][0]+self.x])+all_variants[i][j][1:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 485 t = tuple(all_variants[i][j]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 486 if t not in countDict: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 487 countDict[t] = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 488 countDict[t].append(i) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 489 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 490 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 491 # TODO: combine multiple variants that happened to occur at same position into single vcf entry | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 492 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 493 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 494 output_variants = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 495 for k in sorted(countDict.keys()): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 496 output_variants.append(k+tuple([len(countDict[k])/float(self.ploidy)])) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 497 ploid_string = ['0' for n in xrange(self.ploidy)] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 498 for k2 in [n for n in countDict[k]]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 499 ploid_string[k2] = '1' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 500 output_variants[-1] += tuple(['WP='+'/'.join(ploid_string)]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 501 return output_variants | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 502 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 503 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 504 def sample_read(self, sequencingModel, fragLen=None): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 505 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 506 # choose a ploid | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 507 myPloid = random.randint(0,self.ploidy-1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 508 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 509 # stop attempting to find a valid position if we fail enough times | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 510 MAX_READPOS_ATTEMPTS = 100 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 511 attempts_thus_far = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 512 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 513 # choose a random position within the ploid, and generate quality scores / sequencing errors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 514 readsToSample = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 515 if fragLen == None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 516 rPos = self.coverage_distribution[myPloid].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 517 #####rPos = random.randint(0,len(self.sequences[myPloid])-self.readLen-1) # uniform random | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 518 #### | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 519 ##### decide which subsection of the sequence to sample from using coverage probabilities | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 520 ####coords_bad = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 521 ####while coords_bad: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 522 #### attempts_thus_far += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 523 #### if attempts_thus_far > MAX_READPOS_ATTEMPTS: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 524 #### return None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 525 #### myBucket = max([self.which_bucket.sample() - self.win_per_read, 0]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 526 #### coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 527 #### if coords_to_select_from[0] >= len(self.adj[myPloid]): # prevent going beyond region boundaries | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 528 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 529 #### coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 530 #### coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 531 #### if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 532 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 533 #### if coords_to_select_from[1] - coords_to_select_from[0] <= 2: # we don't span enough coords to sample | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 534 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 535 #### if coords_to_select_from[1] < len(self.sequences[myPloid])-self.readLen: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 536 #### coords_bad = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 537 ####rPos = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 538 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 539 # sample read position and call function to compute quality scores / sequencing errors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 540 rDat = self.sequences[myPloid][rPos:rPos+self.readLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 541 (myQual, myErrors) = sequencingModel.getSequencingErrors(rDat) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 542 readsToSample.append([rPos,myQual,myErrors,rDat]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 543 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 544 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 545 rPos1 = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 546 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 547 # EXPERIMENTAL | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 548 #coords_to_select_from = self.coverage_distribution[myPloid][self.fraglens_indMap[fragLen]].sample() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 549 #rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 550 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 551 #####rPos1 = random.randint(0,len(self.sequences[myPloid])-fragLen-1) # uniform random | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 552 #### | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 553 ##### decide which subsection of the sequence to sample from using coverage probabilities | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 554 ####coords_bad = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 555 ####while coords_bad: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 556 #### attempts_thus_far += 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 557 #### if attempts_thus_far > MAX_READPOS_ATTEMPTS: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 558 #### #print coords_to_select_from | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 559 #### return None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 560 #### myBucket = max([self.which_bucket.sample() - self.win_per_read, 0]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 561 #### coords_to_select_from = [myBucket*self.windowSize,(myBucket+1)*self.windowSize] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 562 #### if coords_to_select_from[0] >= len(self.adj[myPloid]): # prevent going beyond region boundaries | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 563 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 564 #### coords_to_select_from[0] += self.adj[myPloid][coords_to_select_from[0]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 565 #### coords_to_select_from[1] += self.adj[myPloid][coords_to_select_from[0]] # both ends use index of starting position to avoid issues with reads spanning breakpoints of large events | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 566 #### if max(coords_to_select_from) <= 0: # prevent invalid negative coords due to adj | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 567 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 568 #### if coords_to_select_from[1] - coords_to_select_from[0] <= 2: # we don't span enough coords to sample | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 569 #### continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 570 #### rPos1 = random.randint(coords_to_select_from[0],coords_to_select_from[1]-1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 571 #### # for PE-reads, flip a coin to decide if R1 or R2 will be the "covering" read | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 572 #### if random.randint(1,2) == 1 and rPos1 > fragLen - self.readLen: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 573 #### rPos1 -= fragLen - self.readLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 574 #### if rPos1 < len(self.sequences[myPloid])-fragLen: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 575 #### coords_bad = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 576 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 577 rPos2 = rPos1 + fragLen - self.readLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 578 rDat1 = self.sequences[myPloid][rPos1:rPos1+self.readLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 579 rDat2 = self.sequences[myPloid][rPos2:rPos2+self.readLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 580 #print len(rDat1), rPos1, len(self.sequences[myPloid]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 581 (myQual1, myErrors1) = sequencingModel.getSequencingErrors(rDat1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 582 (myQual2, myErrors2) = sequencingModel.getSequencingErrors(rDat2,isReverseStrand=True) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 583 readsToSample.append([rPos1,myQual1,myErrors1,rDat1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 584 readsToSample.append([rPos2,myQual2,myErrors2,rDat2]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 585 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 586 # error format: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 587 # myError[i] = (type, len, pos, ref, alt) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 588 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 589 # examine sequencing errors to-be-inserted. | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 590 # - remove deletions that don't have enough bordering sequence content to "fill in" | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 591 # if error is valid, make the changes to the read data | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 592 rOut = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 593 for r in readsToSample: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 594 try: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 595 myCigar = self.allCigar[myPloid][r[0]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 596 except IndexError: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 597 print 'Index error when attempting to find cigar string.' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 598 print len(self.allCigar[myPloid]), r[0] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 599 if fragLen != None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 600 print (rPos1, rPos2) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 601 print myPloid, fragLen, self.fraglens_indMap[fragLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 602 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 603 totalD = sum([error[1] for error in r[2] if error[0] == 'D']) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 604 totalI = sum([error[1] for error in r[2] if error[0] == 'I']) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 605 availB = len(self.sequences[myPloid]) - r[0] - self.readLen - 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 606 # add buffer sequence to fill in positions that get deleted | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 607 r[3] += self.sequences[myPloid][r[0]+self.readLen:r[0]+self.readLen+totalD] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 608 expandedCigar = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 609 extraCigar = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 610 adj = 0 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 611 sse_adj = [0 for n in xrange(self.readLen + max(sequencingModel.errP[3]))] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 612 anyIndelErr = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 613 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 614 # sort by letter (D > I > S) such that we introduce all indel errors before substitution errors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 615 # secondarily, sort by index | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 616 arrangedErrors = {'D':[],'I':[],'S':[]} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 617 for error in r[2]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 618 arrangedErrors[error[0]].append((error[2],error)) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 619 sortedErrors = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 620 for k in sorted(arrangedErrors.keys()): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 621 sortedErrors.extend([n[1] for n in sorted(arrangedErrors[k])]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 622 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 623 skipIndels = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 624 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 625 for error in sortedErrors: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 626 #print '-se-',r[0], error | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 627 #print sse_adj | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 628 eLen = error[1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 629 ePos = error[2] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 630 if error[0] == 'D' or error[0] == 'I': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 631 anyIndelErr = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 632 extraCigarVal = [] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 633 if totalD > availB: # if not enough bases to fill-in deletions, skip all indel erors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 634 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 635 if expandedCigar == []: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 636 expandedCigar = CigarString(stringIn=myCigar).getList() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 637 fillToGo = totalD - totalI + 1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 638 if fillToGo > 0: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 639 try: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 640 extraCigarVal = CigarString(stringIn=self.allCigar[myPloid][r[0]+fillToGo]).getList()[-fillToGo:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 641 except IndexError: # applying the deletions we want requires going beyond region boundaries. skip all indel errors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 642 skipIndels = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 643 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 644 if skipIndels: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 645 continue | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 646 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 647 # insert deletion error into read and update cigar string accordingly | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 648 if error[0] == 'D': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 649 myadj = sse_adj[ePos] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 650 pi = ePos+myadj | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 651 pf = ePos+myadj+eLen+1 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 652 if str(r[3][pi:pf]) == str(error[3]): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 653 r[3] = r[3][:pi+1] + r[3][pf:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 654 expandedCigar = expandedCigar[:pi+1] + expandedCigar[pf:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 655 if pi+1 == len(expandedCigar): # weird edge case with del at very end of region. Make a guess and add a "M" | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 656 expandedCigar.append('M') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 657 expandedCigar[pi+1] = 'D'*eLen + expandedCigar[pi+1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 658 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 659 print '\nError, ref does not match alt while attempting to insert deletion error!\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 660 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 661 adj -= eLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 662 for i in xrange(ePos,len(sse_adj)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 663 sse_adj[i] -= eLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 664 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 665 # insert insertion error into read and update cigar string accordingly | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 666 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 667 myadj = sse_adj[ePos] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 668 if chr(r[3][ePos+myadj]) == error[3]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 669 r[3] = r[3][:ePos+myadj] + error[4] + r[3][ePos+myadj+1:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 670 expandedCigar = expandedCigar[:ePos+myadj] + ['I']*eLen + expandedCigar[ePos+myadj:] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 671 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 672 print '\nError, ref does not match alt while attempting to insert insertion error!\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 673 print '---',chr(r[3][ePos+myadj]), '!=', error[3] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 674 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 675 adj += eLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 676 for i in xrange(ePos,len(sse_adj)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 677 sse_adj[i] += eLen | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 678 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 679 else: # substitution errors, much easier by comparison... | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 680 if chr(r[3][ePos+sse_adj[ePos]]) == error[3]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 681 r[3][ePos+sse_adj[ePos]] = error[4] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 682 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 683 print '\nError, ref does not match alt while attempting to insert substitution error!\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 684 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 685 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 686 if anyIndelErr: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 687 if len(expandedCigar): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 688 relevantCigar = (expandedCigar+extraCigarVal)[:self.readLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 689 myCigar = CigarString(listIn=relevantCigar).getString() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 690 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 691 r[3] = r[3][:self.readLen] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 692 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 693 rOut.append([self.FM_pos[myPloid][r[0]],myCigar,str(r[3]),str(r[1])]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 694 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 695 # rOut[i] = (pos, cigar, read_string, qual_string) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 696 return rOut | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 697 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 698 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 699 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 700 # Container for read data, computes quality scores and positions to insert errors | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 701 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 702 class ReadContainer: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def __init__(self, readLen, errorModel, reScaledError):
    """Load a pickled sequencing error model and build its sampling tables.

    readLen       -- length of the reads to be simulated
    errorModel    -- path to a pickled error model containing 4 elements
                     (uniform-error model), 6 elements (one q-score model
                     used for both strands) or 8 elements (separate q-score
                     models for forward and reverse strands)
    reScaledError -- desired average error rate, or None to leave the
                     model's own error rate unscaled

    Prints a message and exits with status 1 when the model has an
    unsupported number of elements, or when the R1 and R2 quality score
    models are of different length.
    """

    self.readLen = readLen

    # NOTE(review): unpickling can execute arbitrary code; only load error
    # models that come from a trusted source.
    # The context manager closes the file handle, which was previously leaked.
    with open(errorModel, 'rb') as modelFile:
        errorDat = pickle.load(modelFile)

    # print(...) with one parenthesized argument is used throughout: it emits
    # exactly what the Python 2 print statement would.
    self.UNIFORM = False
    # Default so the attribute always exists, including for uniform models.
    self.PE_MODELS = False
    if len(errorDat) == 4:      # uniform-error SE reads (e.g. PacBio)
        self.UNIFORM = True
        [Qscores, offQ, avgError, errorParams] = errorDat
        self.uniform_qscore = int(-10. * np.log10(avgError) + 0.5)
        print('Using uniform sequencing error model. (q=' + str(self.uniform_qscore) + '+' + str(offQ) + ', p(err)={0:0.2f}%)'.format(100. * avgError))
    elif len(errorDat) == 6:    # only 1 q-score model present, use same model for both strands
        [initQ1, probQ1, Qscores, offQ, avgError, errorParams] = errorDat
    elif len(errorDat) == 8:    # found a q-score model for both forward and reverse strands
        [initQ1, probQ1, initQ2, probQ2, Qscores, offQ, avgError, errorParams] = errorDat
        self.PE_MODELS = True
        if len(initQ1) != len(initQ2) or len(probQ1) != len(probQ2):
            print('\nError: R1 and R2 quality score models are of different length.\n')
            exit(1)
    else:
        # Previously this fell through and failed later with a NameError.
        print('\nError: Unrecognized sequencing error model (' + str(len(errorDat)) + ' elements, expected 4, 6 or 8).\n')
        exit(1)

    # error probability implied by each quality score: p = 10^(-q/10)
    self.qErrRate = [0.] * (max(Qscores) + 1)
    for q in Qscores:
        self.qErrRate[q] = 10. ** (-q / 10.)
    self.offQ = offQ

    # errorParams = [SSE_PROB, SIE_RATE, SIE_PROB, SIE_VAL, SIE_INS_FREQ, SIE_INS_NUCL]
    self.errP = errorParams
    self.errSSE = [DiscreteDistribution(n, NUCL) for n in self.errP[0]]
    self.errSIE = DiscreteDistribution(self.errP[2], self.errP[3])
    self.errSIN = DiscreteDistribution(self.errP[5], NUCL)

    # adjust sequencing error frequency to match desired rate
    if reScaledError is None:
        self.errorScale = 1.0
    else:
        self.errorScale = reScaledError / avgError
        print('Warning: Quality scores no longer exactly representative of error probability. Error model scaled by {0:.3f} to match desired rate...'.format(self.errorScale))

    if self.UNIFORM:
        # uniform models carry no per-position q-score tables
        return

    # adjust length to match desired read length
    if self.readLen == len(initQ1):
        self.qIndRemap = list(range(self.readLen))
    else:
        print('Warning: Read length of error model (' + str(len(initQ1)) + ') does not match -R value (' + str(self.readLen) + '), rescaling model...')
        # '//' keeps the integer division the original relied on; the floor
        # of 1 keeps remapped indices off slot 0 of the transition tables,
        # which holds None.
        self.qIndRemap = [max(1, len(initQ1) * n // readLen) for n in range(readLen)]

    def buildTransitionDists(initQ, probQ):
        # Build one distribution per (read position i >= 1, index j into
        # Qscores) from probQ[i][j]. Slot 0 is a None placeholder.
        # presumably j indexes the previous position's q-score, going by the
        # attribute name; verify against getSequencingErrors.
        dists = [None]
        for i in range(1, len(initQ)):
            row = []
            for j in range(len(initQ[0])):
                if np.sum(probQ[i][j]) <= 0.:
                    # if we don't have sufficient data for a transition, use the previous qscore
                    row.append(DiscreteDistribution([1], [Qscores[j]], degenerateVal=Qscores[j]))
                else:
                    row.append(DiscreteDistribution(probQ[i][j], Qscores))
            dists.append(row)
        return dists

    # initialize probability distributions
    self.initDistByPos1 = [DiscreteDistribution(weights, Qscores) for weights in initQ1]
    self.probDistByPosByPrevQ1 = buildTransitionDists(initQ1, probQ1)

    if self.PE_MODELS:
        self.initDistByPos2 = [DiscreteDistribution(weights, Qscores) for weights in initQ2]
        self.probDistByPosByPrevQ2 = buildTransitionDists(initQ2, probQ2)
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 773 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def getSequencingErrors(self, readData, isReverseStrand=False):
    """Sample a quality string and a set of sequencing errors for one read.

    readData        -- read bases; each element is passed to chr() and the
                       sequence is sliced when a deletion is generated.
    isReverseStrand -- when True the sampled quality scores are reversed,
                       and the second-read model is used if self.PE_MODELS
                       is set.

    Returns (qOut, sOut): qOut is the quality string (score + self.offQ per
    base) and sOut is a list of (type, len, pos, ref, alt) tuples with type
    'S', 'I' or 'D', ordered from the highest read position to the lowest.
    sOut is always empty when self.errorScale == 0.0.
    """
    positions = range(self.readLen)

    if self.UNIFORM:
        # one fixed quality score (and hence one error rate) for every base
        qOut = ''.join(chr(self.uniform_qscore + self.offQ) for _ in positions)
        sErr = [pos for pos in positions
                if random.random() < self.errorScale*self.qErrRate[self.uniform_qscore]]
    else:
        # pick the read-2 model only for reverse-strand reads of a paired model
        if self.PE_MODELS and isReverseStrand:
            firstDist = self.initDistByPos2[0]
            nextDists = self.probDistByPosByPrevQ2
        else:
            firstDist = self.initDistByPos1[0]
            nextDists = self.probDistByPosByPrevQ1

        # each score is drawn conditioned on the score sampled just before it
        prevQ = firstDist.sample()
        qScores = [0]*self.readLen
        qScores[0] = prevQ
        for pos in range(1, self.readLen):
            prevQ = nextDists[self.qIndRemap[pos]][prevQ].sample()
            qScores[pos] = prevQ

        if isReverseStrand:
            qScores.reverse()

        sErr = [pos for pos in positions
                if random.random() < self.errorScale * self.qErrRate[qScores[pos]]]

        qOut = ''.join([chr(q + self.offQ) for q in qScores])

    if self.errorScale == 0.0:
        return (qOut, [])

    sOut = []
    nDelSoFar = 0
    # don't allow indel errors to occur on subsequent positions
    prevIndel = -2
    # don't allow other sequencing errors to occur on bases removed by deletion errors
    delBlacklist = set()

    # walk the chosen error positions from the end of the read to the start
    for ind in reversed(sErr):

        indelAllowed = (ind != 0
                        and ind != self.readLen-1-max(self.errP[3])
                        and abs(ind-prevIndel) > 1)

        # errorOut = (type, len, pos, ref, alt)

        if not (indelAllowed and random.random() < self.errP[1]):
            # substitution error
            refNucl = chr(readData[ind])
            altNucl = self.errSSE[NUC_IND[refNucl]].sample()
            sOut.append(('S', 1, ind, refNucl, altNucl))
            continue

        # indel error
        indelLen = self.errSIE.sample()
        if random.random() < self.errP[4]:
            # insertion error
            refNucl = chr(readData[ind])
            altNucl = refNucl + ''.join([self.errSIN.sample() for _ in range(indelLen)])
            sOut.append(('I', len(altNucl)-1, ind, refNucl, altNucl))
        elif ind < self.readLen-2-nDelSoFar:
            # deletion error (prevent too many of them from stacking up)
            refNucl = str(readData[ind:ind+indelLen+1])
            altNucl = chr(readData[ind])
            nDeleted = len(refNucl)-1
            nDelSoFar += nDeleted
            sOut.append(('D', nDeleted, ind, refNucl, altNucl))
            delBlacklist.update(range(ind+1, ind+indelLen+1))
        # recorded even when neither an insertion nor a deletion was emitted
        prevIndel = ind

    # remove blacklisted errors
    sOut = [err for err in sOut if err[2] not in delBlacklist]

    return (qOut, sOut)
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 854 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 855 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 856 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 857 """************************************************ | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 858 **** DEFAULT MUTATION MODELS | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 859 ************************************************""" | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 860 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 861 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 862 # parse mutation model pickle file | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
def parseInputMutationModel(model=None, whichDefault=1):
    """Return a mutation model, optionally overridden from a pickle file.

    model        -- path to a pickled dict with the keys 'AVG_MUT_RATE',
                    'SNP_FREQ', 'INDEL_FREQ', 'TRINUC_TRANS_PROBS' and
                    'TRINUC_MUT_PROB'. When None, a copy of the selected
                    default model is returned unchanged.
    whichDefault -- 1 or 2, selecting DEFAULT_MODEL_1 or DEFAULT_MODEL_2 as
                    the starting point. Any other value prints an error and
                    raises SystemExit(1).

    Returns a list (a deep copy of the default) in which, when a model file
    is given, these slots are replaced:
      [0] average mutation rate, [2] 1 - SNP frequency,
      [3] insertions / (insertions + deletions),
      [4],[5] insertion lengths and their weights,
      [6],[7] deletion lengths and their weights,
      [8] trinucleotide transition rows, each normalised to sum to 1,
      [9] per-trinucleotide mutation values.
    """
    if whichDefault == 1:
        defaults = DEFAULT_MODEL_1
    elif whichDefault == 2:
        defaults = DEFAULT_MODEL_2
    else:
        print('\nError: Unknown default mutation model specified\n')
        # same exception the interactive exit() helper raises, without
        # depending on the site module
        raise SystemExit(1)

    # deep copy so the module-level defaults are never modified
    outModel = [copy.deepcopy(n) for n in defaults]

    if model is None:
        return outModel

    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model, "rb") as modelFile:
        pickle_dict = pickle.load(modelFile)

    outModel[0] = pickle_dict['AVG_MUT_RATE']
    outModel[2] = 1. - pickle_dict['SNP_FREQ']

    # INDEL_FREQ is keyed by signed length: >= 1 insertion, <= -1 deletion
    indelFreq = pickle_dict['INDEL_FREQ']
    if len(indelFreq):
        insVals = sorted(k for k in indelFreq if k >= 1)
        delVals = sorted(-k for k in indelFreq if k <= -1)
        insCount = sum(indelFreq[k] for k in insVals)
        delCount = sum(indelFreq[-k] for k in delVals)
        insWght = [indelFreq[k]/float(insCount) for k in insVals]
        delWght = [indelFreq[-k]/float(delCount) for k in delVals]
    else:
        # degenerate case where no indel stats are provided
        insCount = 1
        delCount = 1
        insVals = [1]
        insWght = [1.0]
        delVals = [1]
        delWght = [1.0]
    outModel[3] = insCount/float(insCount + delCount)
    outModel[4] = insVals
    outModel[5] = insWght
    outModel[6] = delVals
    outModel[7] = delWght

    # keys are (trinuc_from, trinuc_to); the flanking bases pick the matrix,
    # the middle bases pick the row and column
    trinuc_trans_prob = pickle_dict['TRINUC_TRANS_PROBS']
    for k in sorted(trinuc_trans_prob.keys()):
        myInd = TRI_IND[k[0][0]+k[0][2]]
        (k1, k2) = (NUC_IND[k[0][1]], NUC_IND[k[1][1]])
        outModel[8][myInd][k1][k2] = trinuc_trans_prob[k]

    for matrix in outModel[8]:
        for j, row in enumerate(matrix):
            # the total is taken once per row; recomputing it after each
            # element has been divided would skew the remaining entries
            rowTotal = float(sum(row))
            if rowTotal < 1e-12:
                # if trinuc not present in input mutation model, assign it uniform probability
                matrix[j] = [0.25, 0.25, 0.25, 0.25]
            else:
                matrix[j] = [w/rowTotal for w in row]

    trinuc_mut_prob = pickle_dict['TRINUC_MUT_PROB']
    if len(trinuc_mut_prob):
        # list() keeps np.mean working when values() is a view object
        trinuc_mean = np.mean(list(trinuc_mut_prob.values()))
        for trinuc, prob in trinuc_mut_prob.items():
            outModel[9][ALL_IND[trinuc]] = prob
        # trinucleotides missing from the input get the mean of those present
        for trinuc in ALL_TRI:
            if trinuc not in trinuc_mut_prob:
                outModel[9][ALL_IND[trinuc]] = trinuc_mean
    # with no trinucleotide stats at all the default values are kept; taking
    # the mean of an empty list would turn every entry into NaN

    return outModel
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 923 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 924 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 925 # parse mutation model files, returns default model if no model directory is specified | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 926 # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 927 # OLD FUNCTION THAT PROCESSED OUTDATED TEXTFILE MUTATION MODELS | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 928 def parseInputMutationModel_deprecated(prefix=None, whichDefault=1): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 929 if whichDefault == 1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 930 outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_1] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 931 elif whichDefault == 2: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 932 outModel = [copy.deepcopy(n) for n in DEFAULT_MODEL_2] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 933 else: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 934 print '\nError: Unknown default mutation model specified\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 935 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 936 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 937 if prefix != None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 938 if prefix[-1] != '/': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 939 prefix += '/' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 940 if not os.path.isdir(prefix): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 941 '\nError: Input mutation model directory not found:',prefix,'\n' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 942 exit(1) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 943 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 944 print 'Reading in mutation model...' | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 945 listing1 = [n for n in os.listdir(prefix) if n[-5:] == '.prob'] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 946 listing2 = [n for n in os.listdir(prefix) if n[-7:] == '.trinuc'] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 947 listing = sorted(listing1) + sorted(listing2) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 948 for l in listing: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 949 f = open(prefix+l,'r') | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 950 fr = [n.split('\t') for n in f.read().split('\n')] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 951 f.close() | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 952 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 953 if '_overall.prob' in l: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 954 myIns = None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 955 myDel = None | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 956 for dat in fr[1:]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 957 if len(dat) == 2: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 958 if dat[0] == 'insertion': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 959 myIns = float(dat[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 960 elif dat[0] == 'deletion': | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 961 myDel = float(dat[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 962 if myIns != None and myDel != None: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 963 outModel[2] = myIns + myDel | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 964 outModel[3] = myIns / (myIns + myDel) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 965 print '-',l | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 966 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 967 if '_insLength.prob' in l: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 968 insVals = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 969 for dat in fr[1:]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 970 if len(dat) == 2: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 971 insVals[int(dat[0])] = float(dat[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 972 if len(insVals): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 973 outModel[4] = sorted(insVals.keys()) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 974 outModel[5] = [insVals[n] for n in outModel[4]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 975 print '-',l | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 976 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 977 if '_delLength.prob' in l: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 978 delVals = {} | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 979 for dat in fr[1:]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 980 if len(dat) == 2: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 981 delVals[int(dat[0])] = float(dat[1]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 982 if len(delVals): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 983 outModel[6] = sorted(delVals.keys()) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 984 outModel[7] = [delVals[n] for n in outModel[6]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 985 print '-',l | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 986 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 987 if '.trinuc' == l[-7:]: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 988 context_ind = TRI_IND[l[-10]+l[-8]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 989 p_matrix = [[-1,-1,-1,-1],[-1,-1,-1,-1],[-1,-1,-1,-1],[-1,-1,-1,-1]] | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 990 for i in xrange(len(p_matrix)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 991 for j in xrange(len(fr[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 992 p_matrix[i][j] = float(fr[i][j]) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 993 anyNone = False | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 994 for i in xrange(len(p_matrix)): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 995 for j in xrange(len(p_matrix[i])): | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 996 if p_matrix[i][j] == -1: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 997 anyNone = True | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 998 if not anyNone: | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 999 outModel[8][context_ind] = copy.deepcopy(p_matrix) | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1000 print '-',l | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1001 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1002 return outModel | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1003 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1004 ###################### | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1005 # DEFAULT VALUES # | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1006 ###################### | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1007 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
# ---- Default mutation model 1 ------------------------------------------
# Scalar parameters; they occupy positions 0-3 of DEFAULT_MODEL_1.
DEFAULT_1_OVERALL_MUT_RATE = 0.001
DEFAULT_1_HOMOZYGOUS_FREQ = 0.010
DEFAULT_1_INDEL_FRACTION = 0.05
DEFAULT_1_INS_VS_DEL = 0.6

# Indel length distributions, stored as parallel lists: the i-th weight
# belongs to the i-th length.
DEFAULT_1_INS_LENGTH_VALUES = list(range(1, 11))
DEFAULT_1_INS_LENGTH_WEIGHTS = [0.4, 0.2, 0.1] + [0.05] * 4 + [0.034, 0.033, 0.033]
DEFAULT_1_DEL_LENGTH_VALUES = list(range(1, 6))
DEFAULT_1_DEL_LENGTH_WEIGHTS = [0.3] + [0.2] * 3 + [0.1]

# 4x4 matrix with a zero diagonal; each row adds up to 1.0 (up to float
# rounding).
# NOTE(review): presumably indexed [from][to] over A,C,G,T -- confirm
# against the code that consumes position 8 of the model.
example_matrix_1 = [
    [0.0, 0.15, 0.7, 0.15],
    [0.15, 0.0, 0.15, 0.7],
    [0.7, 0.15, 0.0, 0.15],
    [0.15, 0.7, 0.15, 0.0],
]

# Sixteen independent deep copies, so editing one matrix never affects
# the others or example_matrix_1 itself.
DEFAULT_1_TRI_FREQS = [copy.deepcopy(example_matrix_1) for n in xrange(16)]

# One equal weight per entry of ALL_TRI.
DEFAULT_1_TRINUC_BIAS = [1.0 / float(len(ALL_TRI)) for n in ALL_TRI]

# Positional model record; the index of each field is significant.
DEFAULT_MODEL_1 = [
    DEFAULT_1_OVERALL_MUT_RATE,    # 0
    DEFAULT_1_HOMOZYGOUS_FREQ,     # 1
    DEFAULT_1_INDEL_FRACTION,      # 2
    DEFAULT_1_INS_VS_DEL,          # 3
    DEFAULT_1_INS_LENGTH_VALUES,   # 4
    DEFAULT_1_INS_LENGTH_WEIGHTS,  # 5
    DEFAULT_1_DEL_LENGTH_VALUES,   # 6
    DEFAULT_1_DEL_LENGTH_WEIGHTS,  # 7
    DEFAULT_1_TRI_FREQS,           # 8
    DEFAULT_1_TRINUC_BIAS,         # 9
]
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1032 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
# ---- Default mutation model 2 ------------------------------------------
# Scalar parameters; they occupy positions 0-3 of DEFAULT_MODEL_2.
DEFAULT_2_OVERALL_MUT_RATE = 0.002
DEFAULT_2_HOMOZYGOUS_FREQ = 0.200
DEFAULT_2_INDEL_FRACTION = 0.1
DEFAULT_2_INS_VS_DEL = 0.3

# Indel length distributions, stored as parallel lists: the i-th weight
# belongs to the i-th length.
DEFAULT_2_INS_LENGTH_VALUES = list(range(1, 11))
# NOTE(review): these ten weights add up to 0.75, not 1.0 -- presumably
# the consumer normalises them; confirm before relying on raw values.
DEFAULT_2_INS_LENGTH_WEIGHTS = [0.1, 0.1, 0.2] + [0.05] * 7
DEFAULT_2_DEL_LENGTH_VALUES = list(range(1, 6))
DEFAULT_2_DEL_LENGTH_WEIGHTS = [0.3] + [0.2] * 3 + [0.1]

# 4x4 matrix with a zero diagonal; each row adds up to 1.0 (up to float
# rounding).
# NOTE(review): presumably indexed [from][to] over A,C,G,T -- confirm
# against the code that consumes position 8 of the model.
example_matrix_2 = [
    [0.0, 0.15, 0.7, 0.15],
    [0.15, 0.0, 0.15, 0.7],
    [0.7, 0.15, 0.0, 0.15],
    [0.15, 0.7, 0.15, 0.0],
]

# Sixteen independent deep copies, so editing one matrix never affects
# the others or example_matrix_2 itself.
DEFAULT_2_TRI_FREQS = [copy.deepcopy(example_matrix_2) for n in xrange(16)]

# One equal weight per entry of ALL_TRI.
DEFAULT_2_TRINUC_BIAS = [1.0 / float(len(ALL_TRI)) for n in ALL_TRI]

# Positional model record; the index of each field is significant.
DEFAULT_MODEL_2 = [
    DEFAULT_2_OVERALL_MUT_RATE,    # 0
    DEFAULT_2_HOMOZYGOUS_FREQ,     # 1
    DEFAULT_2_INDEL_FRACTION,      # 2
    DEFAULT_2_INS_VS_DEL,          # 3
    DEFAULT_2_INS_LENGTH_VALUES,   # 4
    DEFAULT_2_INS_LENGTH_WEIGHTS,  # 5
    DEFAULT_2_DEL_LENGTH_VALUES,   # 6
    DEFAULT_2_DEL_LENGTH_WEIGHTS,  # 7
    DEFAULT_2_TRI_FREQS,           # 8
    DEFAULT_2_TRINUC_BIAS,         # 9
]
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1057 | 
| 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 thondeboer parents: diff
changeset | 1058 | 
