Mercurial > repos > thondeboer > neat_genreads
view py/refFunc.py @ 10:7d10b55965c9 draft default tip
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Wed, 16 May 2018 17:02:51 -0400 |
parents | 6e75a84e9338 |
children |
line wrap: on
line source
import sys import time import os import random OK_CHR_ORD = {ord('A'):True,ord('C'):True,ord('G'):True,ord('T'):True,ord('U'):True} ALLOWED_NUCL = ['A','C','G','T'] # # Index reference fasta # def indexRef(refPath): tt = time.time() fn = None if os.path.isfile(refPath+'i'): print 'found index '+refPath+'i' fn = refPath+'i' if os.path.isfile(refPath+'.fai'): print 'found index '+refPath+'.fai' fn = refPath+'.fai' ref_inds = [] if fn != None: fai = open(fn,'r') for line in fai: splt = line[:-1].split('\t') seqLen = int(splt[1]) offset = int(splt[2]) lineLn = int(splt[3]) nLines = seqLen/lineLn if seqLen%lineLn != 0: nLines += 1 ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen)) fai.close() return ref_inds sys.stdout.write('index not found, creating one... ') sys.stdout.flush() refFile = open(refPath,'r') prevR = None prevP = None seqLen = 0 while 1: data = refFile.readline() if not data: ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) break if data[0] == '>': if prevP != None: ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) seqLen = 0 prevP = refFile.tell() prevR = data[1:-1] else: seqLen += len(data)-1 refFile.close() print '{0:.3f} (sec)'.format(time.time()-tt) return ref_inds # # Read in sequence data from reference fasta # # N_unknowns = True --> all ambiguous characters will be treated as Ns # N_handling = (mode,params) # - ('random',read/frag len) --> all regions of Ns smaller than read or fragment # length (whichever is bigger) will be replaced # with uniformly random nucleotides # - ('allChr',read/frag len, chr) --> same as above, but replaced instead with a string # of 'chr's # - ('ignore') --> do not alter nucleotides in N regions # def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False): tt = time.time() if not quiet: sys.stdout.write('reading '+ref_inds_i[0]+'... ') sys.stdout.flush() refFile = open(refPath,'r') refFile.seek(ref_inds_i[1]) myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n')) myDat = bytearray(myDat.upper()) # find N regions # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns prevNI = 0 nCount = 0 N_atlas = [] for i in xrange(len(myDat)): if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD): if nCount == 0: prevNI = i nCount += 1 if i == len(myDat)-1: N_atlas.append((prevNI,prevNI+nCount)) else: if nCount > 0: N_atlas.append((prevNI,prevNI+nCount)) nCount = 0 # handle N base-calls as desired N_info = {} N_info['all'] = [] N_info['big'] = [] N_info['non_N'] = [] if N_handling[0] == 'random': for region in N_atlas: N_info['all'].extend(region) if region[1]-region[0] <= N_handling[1]: for i in xrange(region[0],region[1]): myDat[i] = random.choice(ALLOWED_NUCL) else: N_info['big'].extend(region) elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD: for region in N_atlas: N_info['all'].extend(region) if region[1]-region[0] <= N_handling[1]: for i in xrange(region[0],region[1]): myDat[i] = N_handling[2] else: N_info['big'].extend(region) elif N_handling[0] == 'ignore': for region in N_atlas: N_info['all'].extend(region) N_info['big'].extend(region) else: print '\nERROR: UNKNOWN N_HANDLING MODE\n' exit(1) habitableRegions = [] if N_info['big'] == []: N_info['non_N'] = [(0,len(myDat))] else: for i in xrange(0,len(N_info['big']),2): if i == 0: habitableRegions.append((0,N_info['big'][0])) else: habitableRegions.append((N_info['big'][i-1],N_info['big'][i])) habitableRegions.append((N_info['big'][-1],len(myDat))) for n in habitableRegions: if n[0] != n[1]: N_info['non_N'].append(n) if not quiet: print '{0:.3f} (sec)'.format(time.time()-tt) return (myDat,N_info) # # find all non-N regions in reference sequence ahead of time, for computing jobs in parallel # def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False): outRegions = {} fn = refPath+'.nnr' if os.path.isfile(fn) and not(saveOutput): print 'found list of preidentified non-N regions...' f = open(fn,'r') for line in f: splt = line.strip().split('\t') if splt[0] not in outRegions: outRegions[splt[0]] = [] outRegions[splt[0]].append((int(splt[1]),int(splt[2]))) f.close() return outRegions else: print 'enumerating all non-N regions in reference sequence...' for RI in xrange(len(ref_inds)): (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True) refName = ref_inds[RI][0] outRegions[refName] = [n for n in N_regions['non_N']] if saveOutput: f = open(fn,'w') for k in outRegions.keys(): for n in outRegions[k]: f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n') f.close() return outRegions # # find which of the non-N regions are going to be used for this job # def partitionRefRegions(inRegions,ref_inds,myjob,njobs): totSize = 0 for RI in xrange(len(ref_inds)): refName = ref_inds[RI][0] for region in inRegions[refName]: totSize += region[1] - region[0] sizePerJob = int(totSize/float(njobs)-0.5) regionsPerJob = [[] for n in xrange(njobs)] refsPerJob = [{} for n in xrange(njobs)] currentInd = 0 currentCount = 0 for RI in xrange(len(ref_inds)): refName = ref_inds[RI][0] for region in inRegions[refName]: regionsPerJob[currentInd].append((refName,region[0],region[1])) refsPerJob[currentInd][refName] = True currentCount += region[1] - region[0] if currentCount >= sizePerJob: currentCount = 0 currentInd = min([currentInd+1,njobs-1]) relevantRefs = refsPerJob[myjob-1].keys() relevantRegs = regionsPerJob[myjob-1] return (relevantRefs,relevantRegs)