Mercurial > repos > thondeboer > neat_genreads
annotate py/refFunc.py @ 10:7d10b55965c9 draft default tip
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
author | thondeboer |
---|---|
date | Wed, 16 May 2018 17:02:51 -0400 |
parents | 6e75a84e9338 |
children |
rev | line source |
---|---|
0
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
1 import sys |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
2 import time |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
3 import os |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
4 import random |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
5 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
6 OK_CHR_ORD = {ord('A'):True,ord('C'):True,ord('G'):True,ord('T'):True,ord('U'):True} |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
7 ALLOWED_NUCL = ['A','C','G','T'] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
8 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
9 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
10 # Index reference fasta |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
11 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
12 def indexRef(refPath): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
13 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
14 tt = time.time() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
15 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
16 fn = None |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
17 if os.path.isfile(refPath+'i'): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
18 print 'found index '+refPath+'i' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
19 fn = refPath+'i' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
20 if os.path.isfile(refPath+'.fai'): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
21 print 'found index '+refPath+'.fai' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
22 fn = refPath+'.fai' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
23 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
24 ref_inds = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
25 if fn != None: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
26 fai = open(fn,'r') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
27 for line in fai: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
28 splt = line[:-1].split('\t') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
29 seqLen = int(splt[1]) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
30 offset = int(splt[2]) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
31 lineLn = int(splt[3]) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
32 nLines = seqLen/lineLn |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
33 if seqLen%lineLn != 0: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
34 nLines += 1 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
35 ref_inds.append((splt[0],offset,offset+seqLen+nLines,seqLen)) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
36 fai.close() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
37 return ref_inds |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
38 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
39 sys.stdout.write('index not found, creating one... ') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
40 sys.stdout.flush() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
41 refFile = open(refPath,'r') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
42 prevR = None |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
43 prevP = None |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
44 seqLen = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
45 while 1: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
46 data = refFile.readline() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
47 if not data: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
48 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
49 break |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
50 if data[0] == '>': |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
51 if prevP != None: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
52 ref_inds.append( (prevR, prevP, refFile.tell()-len(data), seqLen) ) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
53 seqLen = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
54 prevP = refFile.tell() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
55 prevR = data[1:-1] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
56 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
57 seqLen += len(data)-1 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
58 refFile.close() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
59 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
60 print '{0:.3f} (sec)'.format(time.time()-tt) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
61 return ref_inds |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
62 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
63 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
64 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
65 # Read in sequence data from reference fasta |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
66 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
67 # N_unknowns = True --> all ambiguous characters will be treated as Ns |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
68 # N_handling = (mode,params) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
69 # - ('random',read/frag len) --> all regions of Ns smaller than read or fragment |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
70 # length (whichever is bigger) will be replaced |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
71 # with uniformly random nucleotides |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
72 # - ('allChr',read/frag len, chr) --> same as above, but replaced instead with a string |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
73 # of 'chr's |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
74 # - ('ignore') --> do not alter nucleotides in N regions |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
75 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
76 def readRef(refPath,ref_inds_i,N_handling,N_unknowns=True,quiet=False): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
77 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
78 tt = time.time() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
79 if not quiet: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
80 sys.stdout.write('reading '+ref_inds_i[0]+'... ') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
81 sys.stdout.flush() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
82 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
83 refFile = open(refPath,'r') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
84 refFile.seek(ref_inds_i[1]) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
85 myDat = ''.join(refFile.read(ref_inds_i[2]-ref_inds_i[1]).split('\n')) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
86 myDat = bytearray(myDat.upper()) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
87 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
88 # find N regions |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
89 # data explanation: myDat[N_atlas[0][0]:N_atlas[0][1]] = solid block of Ns |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
90 prevNI = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
91 nCount = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
92 N_atlas = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
93 for i in xrange(len(myDat)): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
94 if myDat[i] == ord('N') or (N_unknowns and myDat[i] not in OK_CHR_ORD): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
95 if nCount == 0: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
96 prevNI = i |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
97 nCount += 1 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
98 if i == len(myDat)-1: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
99 N_atlas.append((prevNI,prevNI+nCount)) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
100 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
101 if nCount > 0: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
102 N_atlas.append((prevNI,prevNI+nCount)) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
103 nCount = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
104 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
105 # handle N base-calls as desired |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
106 N_info = {} |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
107 N_info['all'] = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
108 N_info['big'] = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
109 N_info['non_N'] = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
110 if N_handling[0] == 'random': |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
111 for region in N_atlas: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
112 N_info['all'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
113 if region[1]-region[0] <= N_handling[1]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
114 for i in xrange(region[0],region[1]): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
115 myDat[i] = random.choice(ALLOWED_NUCL) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
116 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
117 N_info['big'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
118 elif N_handling[0] == 'allChr' and N_handling[2] in OK_CHR_ORD: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
119 for region in N_atlas: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
120 N_info['all'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
121 if region[1]-region[0] <= N_handling[1]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
122 for i in xrange(region[0],region[1]): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
123 myDat[i] = N_handling[2] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
124 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
125 N_info['big'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
126 elif N_handling[0] == 'ignore': |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
127 for region in N_atlas: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
128 N_info['all'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
129 N_info['big'].extend(region) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
130 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
131 print '\nERROR: UNKNOWN N_HANDLING MODE\n' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
132 exit(1) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
133 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
134 habitableRegions = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
135 if N_info['big'] == []: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
136 N_info['non_N'] = [(0,len(myDat))] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
137 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
138 for i in xrange(0,len(N_info['big']),2): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
139 if i == 0: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
140 habitableRegions.append((0,N_info['big'][0])) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
141 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
142 habitableRegions.append((N_info['big'][i-1],N_info['big'][i])) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
143 habitableRegions.append((N_info['big'][-1],len(myDat))) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
144 for n in habitableRegions: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
145 if n[0] != n[1]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
146 N_info['non_N'].append(n) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
147 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
148 if not quiet: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
149 print '{0:.3f} (sec)'.format(time.time()-tt) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
150 return (myDat,N_info) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
151 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
152 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
153 # find all non-N regions in reference sequence ahead of time, for computing jobs in parallel |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
154 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
155 def getAllRefRegions(refPath,ref_inds,N_handling,saveOutput=False): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
156 outRegions = {} |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
157 fn = refPath+'.nnr' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
158 if os.path.isfile(fn) and not(saveOutput): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
159 print 'found list of preidentified non-N regions...' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
160 f = open(fn,'r') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
161 for line in f: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
162 splt = line.strip().split('\t') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
163 if splt[0] not in outRegions: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
164 outRegions[splt[0]] = [] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
165 outRegions[splt[0]].append((int(splt[1]),int(splt[2]))) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
166 f.close() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
167 return outRegions |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
168 else: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
169 print 'enumerating all non-N regions in reference sequence...' |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
170 for RI in xrange(len(ref_inds)): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
171 (refSequence,N_regions) = readRef(refPath,ref_inds[RI],N_handling,quiet=True) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
172 refName = ref_inds[RI][0] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
173 outRegions[refName] = [n for n in N_regions['non_N']] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
174 if saveOutput: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
175 f = open(fn,'w') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
176 for k in outRegions.keys(): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
177 for n in outRegions[k]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
178 f.write(k+'\t'+str(n[0])+'\t'+str(n[1])+'\n') |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
179 f.close() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
180 return outRegions |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
181 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
182 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
183 # find which of the non-N regions are going to be used for this job |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
184 # |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
185 def partitionRefRegions(inRegions,ref_inds,myjob,njobs): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
186 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
187 totSize = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
188 for RI in xrange(len(ref_inds)): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
189 refName = ref_inds[RI][0] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
190 for region in inRegions[refName]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
191 totSize += region[1] - region[0] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
192 sizePerJob = int(totSize/float(njobs)-0.5) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
193 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
194 regionsPerJob = [[] for n in xrange(njobs)] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
195 refsPerJob = [{} for n in xrange(njobs)] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
196 currentInd = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
197 currentCount = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
198 for RI in xrange(len(ref_inds)): |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
199 refName = ref_inds[RI][0] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
200 for region in inRegions[refName]: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
201 regionsPerJob[currentInd].append((refName,region[0],region[1])) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
202 refsPerJob[currentInd][refName] = True |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
203 currentCount += region[1] - region[0] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
204 if currentCount >= sizePerJob: |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
205 currentCount = 0 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
206 currentInd = min([currentInd+1,njobs-1]) |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
207 |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
208 relevantRefs = refsPerJob[myjob-1].keys() |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
209 relevantRegs = regionsPerJob[myjob-1] |
6e75a84e9338
planemo upload commit e96b43f96afce6a7b7dfd4499933aad7d05c955e-dirty
thondeboer
parents:
diff
changeset
|
210 return (relevantRefs,relevantRegs) |