neat_genreads: comparison of py/refFunc.py @ 0:6e75a84e9338 (draft)
Commit message: planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer |
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 |
| parents | |
| children | |
The file is compared against the null revision -1:000000000000, i.e. py/refFunc.py is newly added in changeset 0:6e75a84e9338.
```python
import sys
import time
import os
import random

# ordinals of the unambiguous nucleotide characters, for fast membership tests
OK_CHR_ORD   = {ord('A'): True, ord('C'): True, ord('G'): True, ord('T'): True, ord('U'): True}
# bases used when replacing short runs of Ns with random nucleotides
ALLOWED_NUCL = ['A', 'C', 'G', 'T']


#
# Index reference fasta
#
# Returns a list of (contig name, byte offset of the first base, byte offset just
# past the contig's sequence lines, sequence length) tuples, one per contig.
#
def indexRef(refPath):

    tt = time.time()

    # reuse an existing fasta index if one sits next to the reference
    fn = None
    if os.path.isfile(refPath+'i'):
        print 'found index '+refPath+'i'
        fn = refPath+'i'
    if os.path.isfile(refPath+'.fai'):
        print 'found index '+refPath+'.fai'
        fn = refPath+'.fai'

    ref_inds = []
    if fn != None:
        fai = open(fn,'r')
        for line in fai:
            splt = line[:-1].split('\t')
            seqLen = int(splt[1])
            offset = int(splt[2])
            lineLn = int(splt[3])
            nLines = seqLen/lineLn
            if seqLen%lineLn != 0:
                nLines += 1
            # end offset = start offset + sequence bytes + one newline per line
            ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen))
        fai.close()
        return ref_inds

    # no index found: scan the fasta once, recording contig boundaries as we go
    sys.stdout.write('index not found, creating one... ')
    sys.stdout.flush()
    refFile = open(refPath,'r')
    prevR = None
    prevP = None
    seqLen = 0
    while 1:
        data = refFile.readline()
        if not data:
            ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) )
            break
        if data[0] == '>':
            if prevP != None:
                ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) )
            seqLen = 0
            prevP = refFile.tell()
            prevR = data[1:-1]
        else:
            seqLen += len(data)-1
    refFile.close()

    print '{0:.3f} (sec)'.format(time.time()-tt)
    return ref_inds

```
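For orientation, a minimal usage sketch of `indexRef` follows. It assumes the module is importable as `refFunc` and that a reference fasta exists at the hypothetical path `ref.fa`; like the module itself, it targets Python 2.

```python
# Hypothetical illustration only: build (or load) the fasta index and print the
# per-contig layout recorded in each (name, start offset, end offset, length) tuple.
from refFunc import indexRef

ref_inds = indexRef('ref.fa')   # 'ref.fa' is an assumed example path
for (name, byte_start, byte_end, seq_len) in ref_inds:
    print('%s: %d bp, sequence bytes %d-%d' % (name, seq_len, byte_start, byte_end))
```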
```python
#
# Read in sequence data from reference fasta
#
#   N_unknowns = True --> all ambiguous characters will be treated as Ns
#   N_handling = (mode, params)
#       - ('random', read/frag len)      --> all regions of Ns smaller than the read or
#                                            fragment length (whichever is bigger) will be
#                                            replaced with uniformly random nucleotides
#       - ('allChr', read/frag len, chr) --> same as above, but replaced instead with a
#                                            string of 'chr's
#       - ('ignore')                     --> do not alter nucleotides in N regions
#
def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False):

    tt = time.time()
    if not quiet:
        sys.stdout.write('reading '+ref_inds_i[0]+'... ')
        sys.stdout.flush()

    # slurp the contig's sequence lines and strip the newlines
    refFile = open(refPath,'r')
    refFile.seek(ref_inds_i[1])
    myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n'))
    myDat = bytearray(myDat.upper())

    # find N regions
    # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns
    prevNI  = 0
    nCount  = 0
    N_atlas = []
    for i in xrange(len(myDat)):
        if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD):
            if nCount == 0:
                prevNI = i
            nCount += 1
            if i == len(myDat)-1:
                N_atlas.append((prevNI,prevNI+nCount))
        else:
            if nCount > 0:
                N_atlas.append((prevNI,prevNI+nCount))
            nCount = 0

    # handle N base-calls as desired
    N_info = {}
    N_info['all']   = []
    N_info['big']   = []
    N_info['non_N'] = []
    if N_handling[0] == 'random':
        for region in N_atlas:
            N_info['all'].extend(region)
            if region[1]-region[0] <= N_handling[1]:
                for i in xrange(region[0],region[1]):
                    myDat[i] = random.choice(ALLOWED_NUCL)
            else:
                N_info['big'].extend(region)
    # note: this branch only fires when N_handling[2] is a key of OK_CHR_ORD,
    # i.e. the ordinal value of an unambiguous nucleotide character
    elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD:
        for region in N_atlas:
            N_info['all'].extend(region)
            if region[1]-region[0] <= N_handling[1]:
                for i in xrange(region[0],region[1]):
                    myDat[i] = N_handling[2]
            else:
                N_info['big'].extend(region)
    elif N_handling[0] == 'ignore':
        for region in N_atlas:
            N_info['all'].extend(region)
            N_info['big'].extend(region)
    else:
        print '\nERROR: UNKNOWN N_HANDLING MODE\n'
        exit(1)

    # N_info['big'] is a flat list [start1, end1, start2, end2, ...]; the gaps
    # between the big-N blocks (plus the leading/trailing stretches) are habitable
    habitableRegions = []
    if N_info['big'] == []:
        N_info['non_N'] = [(0,len(myDat))]
    else:
        for i in xrange(0,len(N_info['big']),2):
            if i == 0:
                habitableRegions.append((0,N_info['big'][0]))
            else:
                habitableRegions.append((N_info['big'][i-1],N_info['big'][i]))
        habitableRegions.append((N_info['big'][-1],len(myDat)))
        for n in habitableRegions:
            if n[0] != n[1]:
                N_info['non_N'].append(n)

    if not quiet:
        print '{0:.3f} (sec)'.format(time.time()-tt)
    return (myDat,N_info)

```
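The `N_handling` tuple described in the header comment selects how runs of Ns are treated. Below is a hedged sketch of two call patterns; the path `ref.fa` and the read length of 101 are assumptions made for illustration. The ignore mode is passed as a one-element tuple so that `N_handling[0]` is the mode string, and, as written, the `'allChr'` branch only triggers when `N_handling[2]` is a key of `OK_CHR_ORD` (an ordinal value rather than a character).

```python
# Hypothetical illustration only: read the first contig under two N_handling modes.
from refFunc import indexRef, readRef

READLEN  = 101                      # assumed read length
ref_inds = indexRef('ref.fa')       # 'ref.fa' is an assumed example path

# replace N runs shorter than READLEN with random A/C/G/T; longer runs are reported as 'big'
seq_rand, n_info = readRef('ref.fa', ref_inds[0], ('random', READLEN))

# leave N regions untouched and only report where they are
seq_raw, n_info_raw = readRef('ref.fa', ref_inds[0], ('ignore',), quiet=True)

print('first non-N regions: %s' % str(n_info['non_N'][:3]))
```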
```python
#
# find all non-N regions in reference sequence ahead of time, for computing jobs in parallel
#
def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False):
    outRegions = {}
    fn = refPath+'.nnr'
    if os.path.isfile(fn) and not(saveOutput):
        # reuse the cached region list: one "contig<TAB>start<TAB>end" line per region
        print 'found list of preidentified non-N regions...'
        f = open(fn,'r')
        for line in f:
            splt = line.strip().split('\t')
            if splt[0] not in outRegions:
                outRegions[splt[0]] = []
            outRegions[splt[0]].append((int(splt[1]),int(splt[2])))
        f.close()
        return outRegions
    else:
        print 'enumerating all non-N regions in reference sequence...'
        for RI in xrange(len(ref_inds)):
            (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True)
            refName = ref_inds[RI][0]
            outRegions[refName] = [n for n in N_regions['non_N']]
        if saveOutput:
            f = open(fn,'w')
            for k in outRegions.keys():
                for n in outRegions[k]:
                    f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n')
            f.close()
        return outRegions

```
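A sketch of precomputing and caching the non-N regions follows. With `saveOutput=True` the regions are written to `<ref>.nnr` beside the fasta so later runs can load them instead of rescanning. The path and fragment length are assumptions.

```python
# Hypothetical illustration only: enumerate non-N regions once and cache them in 'ref.fa.nnr'.
from refFunc import indexRef, getAllRefRegions

ref_inds   = indexRef('ref.fa')     # assumed example path
N_handling = ('random', 300)        # assumed fragment length

all_regions = getAllRefRegions('ref.fa', ref_inds, N_handling, saveOutput=True)
for contig in all_regions:
    print('%s: %d non-N regions' % (contig, len(all_regions[contig])))
```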
```python
#
# find which of the non-N regions are going to be used for this job
#
def partitionRefRegions(inRegions,ref_inds,myjob,njobs):

    # total amount of non-N sequence, used to compute an even per-job share
    totSize = 0
    for RI in xrange(len(ref_inds)):
        refName = ref_inds[RI][0]
        for region in inRegions[refName]:
            totSize += region[1] - region[0]
    sizePerJob = int(totSize/float(njobs)-0.5)

    # greedily fill each job's bucket with regions until it reaches its share
    regionsPerJob = [[] for n in xrange(njobs)]
    refsPerJob    = [{} for n in xrange(njobs)]
    currentInd    = 0
    currentCount  = 0
    for RI in xrange(len(ref_inds)):
        refName = ref_inds[RI][0]
        for region in inRegions[refName]:
            regionsPerJob[currentInd].append((refName,region[0],region[1]))
            refsPerJob[currentInd][refName] = True
            currentCount += region[1] - region[0]
            if currentCount >= sizePerJob:
                currentCount = 0
                currentInd   = min([currentInd+1,njobs-1])

    # myjob is 1-based
    relevantRefs = refsPerJob[myjob-1].keys()
    relevantRegs = regionsPerJob[myjob-1]
    return (relevantRefs,relevantRegs)
```
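Lastly, a sketch of splitting the precomputed regions across parallel jobs: each 1-based job index receives roughly `totSize/njobs` bases of non-N sequence, and the final job absorbs any remainder. The job count and paths are again assumptions.

```python
# Hypothetical illustration only: divide the non-N regions across 4 jobs and take job 2's share.
from refFunc import indexRef, getAllRefRegions, partitionRefRegions

ref_inds    = indexRef('ref.fa')                                  # assumed example path
all_regions = getAllRefRegions('ref.fa', ref_inds, ('random', 300))

my_job, n_jobs = 2, 4                                             # assumed job layout
(my_refs, my_regions) = partitionRefRegions(all_regions, ref_inds, my_job, n_jobs)
print('job %d: %d regions on contigs %s' % (my_job, len(my_regions), list(my_refs)))
```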
