Mercurial > repos > thondeboer > neat_genreads
diff py/refFunc.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Tue, 15 May 2018 02:39:53 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/py/refFunc.py Tue May 15 02:39:53 2018 -0400 @@ -0,0 +1,210 @@ +import sys +import time +import os +import random + +OK_CHR_ORD = {ord('A'):True,ord('C'):True,ord('G'):True,ord('T'):True,ord('U'):True} +ALLOWED_NUCL = ['A','C','G','T'] + +# +# Index reference fasta +# +def indexRef(refPath): + + tt = time.time() + + fn = None + if os.path.isfile(refPath+'i'): + print 'found index '+refPath+'i' + fn = refPath+'i' + if os.path.isfile(refPath+'.fai'): + print 'found index '+refPath+'.fai' + fn = refPath+'.fai' + + ref_inds = [] + if fn != None: + fai = open(fn,'r') + for line in fai: + splt = line[:-1].split('\t') + seqLen = int(splt[1]) + offset = int(splt[2]) + lineLn = int(splt[3]) + nLines = seqLen/lineLn + if seqLen%lineLn != 0: + nLines += 1 + ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen)) + fai.close() + return ref_inds + + sys.stdout.write('index not found, creating one... ') + sys.stdout.flush() + refFile = open(refPath,'r') + prevR = None + prevP = None + seqLen = 0 + while 1: + data = refFile.readline() + if not data: + ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) + break + if data[0] == '>': + if prevP != None: + ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) + seqLen = 0 + prevP = refFile.tell() + prevR = data[1:-1] + else: + seqLen += len(data)-1 + refFile.close() + + print '{0:.3f} (sec)'.format(time.time()-tt) + return ref_inds + + +# +# Read in sequence data from reference fasta +# +# N_unknowns = True --> all ambiguous characters will be treated as Ns +# N_handling = (mode,params) +# - ('random',read/frag len) --> all regions of Ns smaller than read or fragment +# length (whichever is bigger) will be replaced +# with uniformly random nucleotides +# - ('allChr',read/frag len, chr) --> same as above, but replaced instead with a string +# of 'chr's +# - ('ignore') --> do not alter nucleotides in N regions +# +def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False): + + tt = time.time() + if not quiet: + sys.stdout.write('reading '+ref_inds_i[0]+'... ') + sys.stdout.flush() + + refFile = open(refPath,'r') + refFile.seek(ref_inds_i[1]) + myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n')) + myDat = bytearray(myDat.upper()) + + # find N regions + # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns + prevNI = 0 + nCount = 0 + N_atlas = [] + for i in xrange(len(myDat)): + if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD): + if nCount == 0: + prevNI = i + nCount += 1 + if i == len(myDat)-1: + N_atlas.append((prevNI,prevNI+nCount)) + else: + if nCount > 0: + N_atlas.append((prevNI,prevNI+nCount)) + nCount = 0 + + # handle N base-calls as desired + N_info = {} + N_info['all'] = [] + N_info['big'] = [] + N_info['non_N'] = [] + if N_handling[0] == 'random': + for region in N_atlas: + N_info['all'].extend(region) + if region[1]-region[0] <= N_handling[1]: + for i in xrange(region[0],region[1]): + myDat[i] = random.choice(ALLOWED_NUCL) + else: + N_info['big'].extend(region) + elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD: + for region in N_atlas: + N_info['all'].extend(region) + if region[1]-region[0] <= N_handling[1]: + for i in xrange(region[0],region[1]): + myDat[i] = N_handling[2] + else: + N_info['big'].extend(region) + elif N_handling[0] == 'ignore': + for region in N_atlas: + N_info['all'].extend(region) + N_info['big'].extend(region) + else: + print '\nERROR: UNKNOWN N_HANDLING MODE\n' + exit(1) + + habitableRegions = [] + if N_info['big'] == []: + N_info['non_N'] = [(0,len(myDat))] + else: + for i in xrange(0,len(N_info['big']),2): + if i == 0: + habitableRegions.append((0,N_info['big'][0])) + else: + habitableRegions.append((N_info['big'][i-1],N_info['big'][i])) + habitableRegions.append((N_info['big'][-1],len(myDat))) + for n in habitableRegions: + if n[0] != n[1]: + N_info['non_N'].append(n) + + if not quiet: + print '{0:.3f} (sec)'.format(time.time()-tt) + return (myDat,N_info) + +# +# find all non-N regions in reference sequence ahead of time, for computing jobs in parallel +# +def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False): + outRegions = {} + fn = refPath+'.nnr' + if os.path.isfile(fn) and not(saveOutput): + print 'found list of preidentified non-N regions...' + f = open(fn,'r') + for line in f: + splt = line.strip().split('\t') + if splt[0] not in outRegions: + outRegions[splt[0]] = [] + outRegions[splt[0]].append((int(splt[1]),int(splt[2]))) + f.close() + return outRegions + else: + print 'enumerating all non-N regions in reference sequence...' + for RI in xrange(len(ref_inds)): + (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True) + refName = ref_inds[RI][0] + outRegions[refName] = [n for n in N_regions['non_N']] + if saveOutput: + f = open(fn,'w') + for k in outRegions.keys(): + for n in outRegions[k]: + f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n') + f.close() + return outRegions + +# +# find which of the non-N regions are going to be used for this job +# +def partitionRefRegions(inRegions,ref_inds,myjob,njobs): + + totSize = 0 + for RI in xrange(len(ref_inds)): + refName = ref_inds[RI][0] + for region in inRegions[refName]: + totSize += region[1] - region[0] + sizePerJob = int(totSize/float(njobs)-0.5) + + regionsPerJob = [[] for n in xrange(njobs)] + refsPerJob = [{} for n in xrange(njobs)] + currentInd = 0 + currentCount = 0 + for RI in xrange(len(ref_inds)): + refName = ref_inds[RI][0] + for region in inRegions[refName]: + regionsPerJob[currentInd].append((refName,region[0],region[1])) + refsPerJob[currentInd][refName] = True + currentCount += region[1] - region[0] + if currentCount >= sizePerJob: + currentCount = 0 + currentInd = min([currentInd+1,njobs-1]) + + relevantRefs = refsPerJob[myjob-1].keys() + relevantRegs = regionsPerJob[myjob-1] + return (relevantRefs,relevantRegs)