Mercurial > repos > thondeboer > neat_genreads
annotate py/refFunc.py @ 0:6e75a84e9338 draft
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
| author | thondeboer | 
|---|---|
| date | Tue, 15 May 2018 02:39:53 -0400 | 
| parents | |
| children | 
| rev | line source | 
|---|---|
| 
0
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
1 import sys | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
2 import time | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
3 import os | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
4 import random | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
5 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
6 OK_CHR_ORD = {ord('A'):True,ord('C'):True,ord('G'):True,ord('T'):True,ord('U'):True} | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
7 ALLOWED_NUCL = ['A','C','G','T'] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
8 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
9 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
10 # Index reference fasta | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
11 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
12 def indexRef(refPath): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
13 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
14 tt = time.time() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
15 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
16 fn = None | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
17 if os.path.isfile(refPath+'i'): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
18 print 'found index '+refPath+'i' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
19 fn = refPath+'i' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
20 if os.path.isfile(refPath+'.fai'): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
21 print 'found index '+refPath+'.fai' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
22 fn = refPath+'.fai' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
23 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
24 ref_inds = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
25 if fn != None: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
26 fai = open(fn,'r') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
27 for line in fai: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
28 splt = line[:-1].split('\t') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
29 seqLen = int(splt[1]) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
30 offset = int(splt[2]) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
31 lineLn = int(splt[3]) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
32 nLines = seqLen/lineLn | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
33 if seqLen%lineLn != 0: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
34 nLines += 1 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
35 ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen)) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
36 fai.close() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
37 return ref_inds | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
38 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
39 sys.stdout.write('index not found, creating one... ') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
40 sys.stdout.flush() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
41 refFile = open(refPath,'r') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
42 prevR = None | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
43 prevP = None | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
44 seqLen = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
45 while 1: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
46 data = refFile.readline() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
47 if not data: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
48 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
49 break | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
50 if data[0] == '>': | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
51 if prevP != None: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
52 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
53 seqLen = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
54 prevP = refFile.tell() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
55 prevR = data[1:-1] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
56 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
57 seqLen += len(data)-1 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
58 refFile.close() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
59 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
60 print '{0:.3f} (sec)'.format(time.time()-tt) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
61 return ref_inds | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
62 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
63 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
64 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
65 # Read in sequence data from reference fasta | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
66 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
67 # N_unknowns = True --> all ambiguous characters will be treated as Ns | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
68 # N_handling = (mode,params) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
69 # - ('random',read/frag len) --> all regions of Ns smaller than read or fragment | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
70 # length (whichever is bigger) will be replaced | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
71 # with uniformly random nucleotides | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
72 # - ('allChr',read/frag len, chr) --> same as above, but replaced instead with a string | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
73 # of 'chr's | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
74 # - ('ignore') --> do not alter nucleotides in N regions | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
75 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
76 def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
77 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
78 tt = time.time() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
79 if not quiet: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
80 sys.stdout.write('reading '+ref_inds_i[0]+'... ') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
81 sys.stdout.flush() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
82 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
83 refFile = open(refPath,'r') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
84 refFile.seek(ref_inds_i[1]) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
85 myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n')) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
86 myDat = bytearray(myDat.upper()) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
87 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
88 # find N regions | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
89 # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
90 prevNI = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
91 nCount = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
92 N_atlas = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
93 for i in xrange(len(myDat)): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
94 if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
95 if nCount == 0: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
96 prevNI = i | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
97 nCount += 1 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
98 if i == len(myDat)-1: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
99 N_atlas.append((prevNI,prevNI+nCount)) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
100 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
101 if nCount > 0: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
102 N_atlas.append((prevNI,prevNI+nCount)) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
103 nCount = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
104 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
105 # handle N base-calls as desired | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
106 N_info = {} | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
107 N_info['all'] = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
108 N_info['big'] = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
109 N_info['non_N'] = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
110 if N_handling[0] == 'random': | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
111 for region in N_atlas: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
112 N_info['all'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
113 if region[1]-region[0] <= N_handling[1]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
114 for i in xrange(region[0],region[1]): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
115 myDat[i] = random.choice(ALLOWED_NUCL) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
116 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
117 N_info['big'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
118 elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
119 for region in N_atlas: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
120 N_info['all'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
121 if region[1]-region[0] <= N_handling[1]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
122 for i in xrange(region[0],region[1]): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
123 myDat[i] = N_handling[2] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
124 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
125 N_info['big'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
126 elif N_handling[0] == 'ignore': | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
127 for region in N_atlas: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
128 N_info['all'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
129 N_info['big'].extend(region) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
130 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
131 print '\nERROR: UNKNOWN N_HANDLING MODE\n' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
132 exit(1) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
133 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
134 habitableRegions = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
135 if N_info['big'] == []: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
136 N_info['non_N'] = [(0,len(myDat))] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
137 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
138 for i in xrange(0,len(N_info['big']),2): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
139 if i == 0: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
140 habitableRegions.append((0,N_info['big'][0])) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
141 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
142 habitableRegions.append((N_info['big'][i-1],N_info['big'][i])) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
143 habitableRegions.append((N_info['big'][-1],len(myDat))) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
144 for n in habitableRegions: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
145 if n[0] != n[1]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
146 N_info['non_N'].append(n) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
147 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
148 if not quiet: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
149 print '{0:.3f} (sec)'.format(time.time()-tt) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
150 return (myDat,N_info) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
151 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
152 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
153 # find all non-N regions in reference sequence ahead of time, for computing jobs in parallel | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
154 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
155 def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
156 outRegions = {} | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
157 fn = refPath+'.nnr' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
158 if os.path.isfile(fn) and not(saveOutput): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
159 print 'found list of preidentified non-N regions...' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
160 f = open(fn,'r') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
161 for line in f: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
162 splt = line.strip().split('\t') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
163 if splt[0] not in outRegions: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
164 outRegions[splt[0]] = [] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
165 outRegions[splt[0]].append((int(splt[1]),int(splt[2]))) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
166 f.close() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
167 return outRegions | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
168 else: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
169 print 'enumerating all non-N regions in reference sequence...' | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
170 for RI in xrange(len(ref_inds)): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
171 (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
172 refName = ref_inds[RI][0] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
173 outRegions[refName] = [n for n in N_regions['non_N']] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
174 if saveOutput: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
175 f = open(fn,'w') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
176 for k in outRegions.keys(): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
177 for n in outRegions[k]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
178 f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n') | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
179 f.close() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
180 return outRegions | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
181 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
182 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
183 # find which of the non-N regions are going to be used for this job | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
184 # | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
185 def partitionRefRegions(inRegions,ref_inds,myjob,njobs): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
186 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
187 totSize = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
188 for RI in xrange(len(ref_inds)): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
189 refName = ref_inds[RI][0] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
190 for region in inRegions[refName]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
191 totSize += region[1] - region[0] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
192 sizePerJob = int(totSize/float(njobs)-0.5) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
193 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
194 regionsPerJob = [[] for n in xrange(njobs)] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
195 refsPerJob = [{} for n in xrange(njobs)] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
196 currentInd = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
197 currentCount = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
198 for RI in xrange(len(ref_inds)): | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
199 refName = ref_inds[RI][0] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
200 for region in inRegions[refName]: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
201 regionsPerJob[currentInd].append((refName,region[0],region[1])) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
202 refsPerJob[currentInd][refName] = True | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
203 currentCount += region[1] - region[0] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
204 if currentCount >= sizePerJob: | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
205 currentCount = 0 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
206 currentInd = min([currentInd+1,njobs-1]) | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
207 | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
208 relevantRefs = refsPerJob[myjob-1].keys() | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
209 relevantRegs = regionsPerJob[myjob-1] | 
| 
 
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
 
thondeboer 
parents:  
diff
changeset
 | 
210 return (relevantRefs,relevantRegs) | 
