comparison py/refFunc.py @ 0:6e75a84e9338 draft

planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author thondeboer
date Tue, 15 May 2018 02:39:53 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:6e75a84e9338
1 import sys
2 import time
3 import os
4 import random
5
6 OK_CHR_ORD = {ord('A'):True,ord('C'):True,ord('G'):True,ord('T'):True,ord('U'):True}
7 ALLOWED_NUCL = ['A','C','G','T']
8
9 #
10 # Index reference fasta
11 #
12 def indexRef(refPath):
13
14 tt = time.time()
15
16 fn = None
17 if os.path.isfile(refPath+'i'):
18 print 'found index '+refPath+'i'
19 fn = refPath+'i'
20 if os.path.isfile(refPath+'.fai'):
21 print 'found index '+refPath+'.fai'
22 fn = refPath+'.fai'
23
24 ref_inds = []
25 if fn != None:
26 fai = open(fn,'r')
27 for line in fai:
28 splt = line[:-1].split('\t')
29 seqLen = int(splt[1])
30 offset = int(splt[2])
31 lineLn = int(splt[3])
32 nLines = seqLen/lineLn
33 if seqLen%lineLn != 0:
34 nLines += 1
35 ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen))
36 fai.close()
37 return ref_inds
38
39 sys.stdout.write('index not found, creating one... ')
40 sys.stdout.flush()
41 refFile = open(refPath,'r')
42 prevR = None
43 prevP = None
44 seqLen = 0
45 while 1:
46 data = refFile.readline()
47 if not data:
48 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) )
49 break
50 if data[0] == '>':
51 if prevP != None:
52 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) )
53 seqLen = 0
54 prevP = refFile.tell()
55 prevR = data[1:-1]
56 else:
57 seqLen += len(data)-1
58 refFile.close()
59
60 print '{0:.3f} (sec)'.format(time.time()-tt)
61 return ref_inds
62
63
64 #
65 # Read in sequence data from reference fasta
66 #
67 # N_unknowns = True --> all ambiguous characters will be treated as Ns
68 # N_handling = (mode,params)
69 # - ('random',read/frag len) --> all regions of Ns smaller than read or fragment
70 # length (whichever is bigger) will be replaced
71 # with uniformly random nucleotides
72 # - ('allChr',read/frag len, chr) --> same as above, but replaced instead with a string
73 # of 'chr's
74 # - ('ignore') --> do not alter nucleotides in N regions
75 #
76 def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False):
77
78 tt = time.time()
79 if not quiet:
80 sys.stdout.write('reading '+ref_inds_i[0]+'... ')
81 sys.stdout.flush()
82
83 refFile = open(refPath,'r')
84 refFile.seek(ref_inds_i[1])
85 myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n'))
86 myDat = bytearray(myDat.upper())
87
88 # find N regions
89 # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns
90 prevNI = 0
91 nCount = 0
92 N_atlas = []
93 for i in xrange(len(myDat)):
94 if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD):
95 if nCount == 0:
96 prevNI = i
97 nCount += 1
98 if i == len(myDat)-1:
99 N_atlas.append((prevNI,prevNI+nCount))
100 else:
101 if nCount > 0:
102 N_atlas.append((prevNI,prevNI+nCount))
103 nCount = 0
104
105 # handle N base-calls as desired
106 N_info = {}
107 N_info['all'] = []
108 N_info['big'] = []
109 N_info['non_N'] = []
110 if N_handling[0] == 'random':
111 for region in N_atlas:
112 N_info['all'].extend(region)
113 if region[1]-region[0] <= N_handling[1]:
114 for i in xrange(region[0],region[1]):
115 myDat[i] = random.choice(ALLOWED_NUCL)
116 else:
117 N_info['big'].extend(region)
118 elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD:
119 for region in N_atlas:
120 N_info['all'].extend(region)
121 if region[1]-region[0] <= N_handling[1]:
122 for i in xrange(region[0],region[1]):
123 myDat[i] = N_handling[2]
124 else:
125 N_info['big'].extend(region)
126 elif N_handling[0] == 'ignore':
127 for region in N_atlas:
128 N_info['all'].extend(region)
129 N_info['big'].extend(region)
130 else:
131 print '\nERROR: UNKNOWN N_HANDLING MODE\n'
132 exit(1)
133
134 habitableRegions = []
135 if N_info['big'] == []:
136 N_info['non_N'] = [(0,len(myDat))]
137 else:
138 for i in xrange(0,len(N_info['big']),2):
139 if i == 0:
140 habitableRegions.append((0,N_info['big'][0]))
141 else:
142 habitableRegions.append((N_info['big'][i-1],N_info['big'][i]))
143 habitableRegions.append((N_info['big'][-1],len(myDat)))
144 for n in habitableRegions:
145 if n[0] != n[1]:
146 N_info['non_N'].append(n)
147
148 if not quiet:
149 print '{0:.3f} (sec)'.format(time.time()-tt)
150 return (myDat,N_info)
151
152 #
153 # find all non-N regions in reference sequence ahead of time, for computing jobs in parallel
154 #
155 def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False):
156 outRegions = {}
157 fn = refPath+'.nnr'
158 if os.path.isfile(fn) and not(saveOutput):
159 print 'found list of preidentified non-N regions...'
160 f = open(fn,'r')
161 for line in f:
162 splt = line.strip().split('\t')
163 if splt[0] not in outRegions:
164 outRegions[splt[0]] = []
165 outRegions[splt[0]].append((int(splt[1]),int(splt[2])))
166 f.close()
167 return outRegions
168 else:
169 print 'enumerating all non-N regions in reference sequence...'
170 for RI in xrange(len(ref_inds)):
171 (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True)
172 refName = ref_inds[RI][0]
173 outRegions[refName] = [n for n in N_regions['non_N']]
174 if saveOutput:
175 f = open(fn,'w')
176 for k in outRegions.keys():
177 for n in outRegions[k]:
178 f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n')
179 f.close()
180 return outRegions
181
182 #
183 # find which of the non-N regions are going to be used for this job
184 #
185 def partitionRefRegions(inRegions,ref_inds,myjob,njobs):
186
187 totSize = 0
188 for RI in xrange(len(ref_inds)):
189 refName = ref_inds[RI][0]
190 for region in inRegions[refName]:
191 totSize += region[1] - region[0]
192 sizePerJob = int(totSize/float(njobs)-0.5)
193
194 regionsPerJob = [[] for n in xrange(njobs)]
195 refsPerJob = [{} for n in xrange(njobs)]
196 currentInd = 0
197 currentCount = 0
198 for RI in xrange(len(ref_inds)):
199 refName = ref_inds[RI][0]
200 for region in inRegions[refName]:
201 regionsPerJob[currentInd].append((refName,region[0],region[1]))
202 refsPerJob[currentInd][refName] = True
203 currentCount += region[1] - region[0]
204 if currentCount >= sizePerJob:
205 currentCount = 0
206 currentInd = min([currentInd+1,njobs-1])
207
208 relevantRefs = refsPerJob[myjob-1].keys()
209 relevantRegs = regionsPerJob[myjob-1]
210 return (relevantRefs,relevantRegs)