0
|
1 ## @file IData_handling.py
|
|
2 #
|
|
3 # VL: Gather here the classes and functions useful for handling input data.
|
|
4 from __future__ import print_function
|
|
5
|
|
6 import gzip
|
|
7 from _modules.utilities import reverseComplement,changeCase
|
|
8 from time import gmtime, strftime
|
|
9 import datetime
|
|
10
|
|
11 try:
|
|
12 import cPickle as pickle
|
|
13 except ImportError: # python 3.x
|
|
14 import pickle
|
|
15
|
|
16
|
|
17 ## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences.
|
|
18 #
|
|
19 # It is used both for searching the read extracts in the sequences and for exploiting the results
|
|
20 class refData:
|
|
21 def __init__(self,refseq_list,seed,hostseq):
|
|
22 self.refseq_list=refseq_list
|
|
23 self.seed=seed
|
|
24 self.hostseq=hostseq
|
|
25 if hostseq!="":
|
|
26 self.refseq_list.insert(0,hostseq)
|
|
27 self.nb_sequences=len(refseq_list)
|
|
28
|
|
29 def getIdxSeq(self,refseq):
|
|
30 idx=-1
|
|
31 found=False
|
|
32 for s in self.refseq_list:
|
|
33 idx += 1
|
|
34 if s==refseq:
|
|
35 found=True
|
|
36 break
|
|
37 if not found:
|
|
38 raise RuntimeError("Couldn't find sequence in list of ref sequences.")
|
|
39 return idx
|
|
40
|
|
41
|
|
42 def IdxIsHostseq(self,idx_seq):
|
|
43 if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or (
|
|
44 (self.hostseq != "") and (idx_seq >0))):
|
|
45 return False
|
|
46 return True
|
|
47
|
|
48 def getSeqSizesList(self):
|
|
49 seq_sizes_list = []
|
|
50 for seq in self.refseq_list:
|
|
51 seq_sizes_list.append(len(seq))
|
|
52 return seq_sizes_list
|
|
53
|
|
54
|
|
55 ## Base class for handling read extracts.
|
|
56 #
|
|
57 # This class should not be used directly.
|
|
58 class ReadExtracts:
|
|
59 def __init__(self,seed):
|
|
60 self.seed = seed
|
|
61 self.r_extracts_list = []
|
|
62 self.nb_reads = 0
|
|
63 self.nb_extracts=0
|
|
64
|
|
65 ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts
|
|
66 def getRExtracts(self):
|
|
67 return self.r_extracts_list,self.nb_reads,self.nb_extracts
|
|
68
|
|
69 ## Class containing all the read extracts (PE reads) that must be mapped against a sequence.
|
|
70 class readExtractsPE(ReadExtracts):
|
|
71 def __init__(self,seed):
|
|
72 self.__init__(seed)
|
|
73
|
|
74
|
|
75 def addRead(self, whole_PE1,whole_PE2):
|
|
76 self.r_extracts_list.append(whole_PE1[:self.seed])
|
|
77 self.r_extracts_list.append(whole_PE1[-self.seed:])
|
|
78 self.r_extracts_list.append(whole_PE2[:self.seed])
|
|
79 self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed])
|
|
80 self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:])
|
|
81 self.nb_reads += 1
|
|
82 self.nb_extracts += 5 # Number of extracts per read: 2 extracts for PE1 and 3 for PE2.
|
|
83
|
|
84
|
|
85
|
|
86 ## Class containing all the read extracts (single reads) that must be mapped against a sequence.
|
|
87 class readsExtractsS(ReadExtracts):
|
|
88 def __init__(self,seed):
|
|
89 ReadExtracts.__init__(self,seed)
|
|
90
|
|
91 ## Adds a read to the list of extracts
|
|
92 #
|
|
93 # @param whole_read The read as extracted from the fastq file
|
|
94 # @param no_pair This paramenter s only used to make the distinction between Single and paired.
|
|
95 # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance.
|
|
96 # TODO: test it when all the rest works.
|
|
97 def addRead(self,whole_read,no_pair=""):
|
|
98 read_part = whole_read[:self.seed]
|
|
99 self.r_extracts_list.append(read_part)
|
|
100 self.r_extracts_list.append(whole_read[-self.seed:])
|
|
101 self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed])
|
|
102 self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:])
|
|
103 self.nb_reads+=1
|
|
104 self.nb_extracts += 4
|
|
105
|
|
106 ## use objects of this class to store read offset (PE1 and PE2) in files.
|
|
107 class ReadInfo:
|
|
108 def __init__(self, off_PE1,whole_read,seed,off_PE2=None):
|
|
109 self.offset1=off_PE1
|
|
110 self.offset2=off_PE2
|
|
111 self.corlen = len(whole_read) - seed
|
|
112
|
|
113 ## Gets the number of reads in the fastq file
|
|
114 # def getNbReads(fastq):
|
|
115 # with open(fastq) as f:
|
|
116 # for i, l in enumerate(f):
|
|
117 # pass
|
|
118 # nb_r=i+1
|
|
119 # nb_r=nb_r/4
|
|
120 # return nb_r
|
|
121
|
|
122
|
|
123
|
|
124 ## loads a chunk of reads for mapping on GPU.
|
|
125 # Yields a ReadExtracts object plus a dictionnary of ReadInfo.
|
|
126 # keeps in memory the parsing state of the file.
|
|
127 # @param ch_size is in number of reads
|
|
128 # @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk.
|
|
129 def getChunk(fastq,seed,paired,ch_siz,reset_ids=True):
|
|
130 new_chunk = False
|
|
131 d_rinfo=dict()
|
|
132 idx_read=0
|
|
133 off2=None
|
|
134 filin = open(fastq)
|
|
135 line = filin.readline()
|
|
136 read_paired=""
|
|
137 if paired != "":
|
|
138 RE=readExtractsPE(seed)
|
|
139 filin_paired = open(paired)
|
|
140 line_paired = filin_paired.readline()
|
|
141 else:
|
|
142 RE=readsExtractsS(seed)
|
|
143
|
|
144 start = False
|
|
145 num_line=0
|
|
146 while line:
|
|
147 # Read sequence
|
|
148 read = line.split("\n")[0].split("\r")[0]
|
|
149 if paired != "":
|
|
150 read_paired = line_paired.split("\n")[0].split("\r")[0]
|
|
151 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
|
|
152 start = True
|
|
153 off1=filin.tell()
|
|
154 line = filin.readline()
|
|
155 if paired != "":
|
|
156 off2=filin_paired.tell()
|
|
157 line_paired = filin_paired.readline()
|
|
158 continue
|
|
159 if (start == True):
|
|
160 start = False
|
|
161 readlen = len(read)
|
|
162 if readlen < seed:
|
|
163 line = filin.readline()
|
|
164 if paired !="":
|
|
165 line_paired = filin_paired.readline() # also skip PE2 in that case
|
|
166 continue
|
|
167 RE.addRead(read,read_paired)
|
|
168 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
|
|
169 if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)):
|
|
170 yield RE,d_rinfo
|
|
171 if (reset_ids):
|
|
172 idx_read=0
|
|
173 new_chunk=True
|
|
174 if paired != "":
|
|
175 RE = readExtractsPE(seed)
|
|
176 else:
|
|
177 RE = readsExtractsS(seed)
|
|
178 d_rinfo = dict()
|
|
179 if not new_chunk:
|
|
180 idx_read+=1
|
|
181 else:
|
|
182 new_chunk=False
|
|
183
|
|
184 line = filin.readline()
|
|
185 if paired!="":
|
|
186 line_paired = filin_paired.readline()
|
|
187 filin.close()
|
|
188 if paired !="":
|
|
189 filin_paired.close()
|
|
190 yield RE, d_rinfo
|
|
191
|
|
192 ## dumps a dictionnary of ReadInfo objects indexed on read index.
|
|
193 #
|
|
194 # @param d_rinfo dictionnary to dump
|
|
195 # @param fic filename (incl. full path) where to dump
|
|
196 def dump_d_rinfo(d_rinfo,fic):
|
|
197 with open(fic, 'wb') as fp:
|
|
198 pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL)
|
|
199
|
|
200 ## Loads a dictionnary of ReadInfo objects.
|
|
201 def load_d_rinfo(fic):
|
|
202 with open(fic, 'rb') as fp:
|
|
203 d_rinfo = pickle.load(fp)
|
|
204 return d_rinfo
|
|
205
|
|
206
|
|
207 ## loads all extracts of reads into a list for processing on GPU.
|
|
208 #
|
|
209 # returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo.
|
|
210 def getAllReads(fastq,seed,paired):
|
|
211 d_rinfo=dict()
|
|
212 idx_read=0
|
|
213 off2=None
|
|
214 filin = open(fastq)
|
|
215 line = filin.readline()
|
|
216 read_paired=""
|
|
217
|
|
218 if paired != "":
|
|
219 RE=readExtractsPE(seed)
|
|
220 filin_paired = open(paired)
|
|
221 line_paired = filin_paired.readline()
|
|
222 else:
|
|
223 RE=readsExtractsS(seed)
|
|
224
|
|
225 start = False
|
|
226 num_line=0
|
|
227 while line:
|
|
228 # Read sequence
|
|
229 read = line.split("\n")[0].split("\r")[0]
|
|
230 if paired != "":
|
|
231 read_paired = line_paired.split("\n")[0].split("\r")[0]
|
|
232 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read.
|
|
233 start = True
|
|
234 off1=filin.tell()
|
|
235 line = filin.readline()
|
|
236 if paired != "":
|
|
237 off2=filin_paired.tell()
|
|
238 line_paired = filin_paired.readline()
|
|
239 continue
|
|
240 if (start == True):
|
|
241 start = False
|
|
242 readlen = len(read)
|
|
243 if readlen < seed:
|
|
244 line = filin.readline()
|
|
245 if paired !="":
|
|
246 line_paired = filin_paired.readline() # also skip PE2 in that case
|
|
247 continue
|
|
248 RE.addRead(read,read_paired)
|
|
249 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2)
|
|
250 idx_read+=1
|
|
251
|
|
252 line = filin.readline()
|
|
253 if paired!="":
|
|
254 line_paired = filin_paired.readline()
|
|
255 filin.close()
|
|
256 if paired !="":
|
|
257 filin_paired.close()
|
|
258 return RE,d_rinfo
|
|
259
|
|
260 ## use this class to retrieve reads from fastq file.
|
|
261 class ReadGetter:
|
|
262 ## constructor
|
|
263 #
|
|
264 # @param fastq Name of the fastq file that contains the read
|
|
265 # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file.
|
|
266 # @param paired The name of the file containing the PE2 (defaults to "").
|
|
267 def __init__(self,fastq,d_rinfo,paired=""):
|
|
268 self.filin=open(fastq)
|
|
269 self.d_rinfo=d_rinfo
|
|
270 self.paired=paired
|
|
271 if paired!="":
|
|
272 self.filinp=open(fastq)
|
|
273
|
|
274 def getOneRead(self,idx_read):
|
|
275 read_paired=""
|
|
276 self.filin.seek(self.d_rinfo[idx_read].offset1)
|
|
277 read=self.filin.readline()
|
|
278 if self.paired!="":
|
|
279 self.filinp.seek(self.d_rinfo[idx_read].offset2)
|
|
280 read_paired = self.filinp.readline()
|
|
281 return read,read_paired
|
|
282
|
|
283
|
|
284 ### READS Functions
|
|
285 def totReads(filin):
|
|
286 """Verify and retrieve the number of reads in the fastq file before alignment"""
|
|
287 if filin.endswith('.gz'):
|
|
288 filein = gzip.open(filin, 'rb')
|
|
289 else:
|
|
290 filein = open(filin, 'r')
|
|
291
|
|
292 line = 0
|
|
293 while filein.readline():
|
|
294 line += 1
|
|
295 seq = float(round(line / 4))
|
|
296 filein.close()
|
|
297 return seq
|
|
298
|
|
299 ### SEQUENCE parsing function
|
|
300 def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0):
|
|
301 """Get genome sequence from fasta file"""
|
|
302 print("recovering genome from: ",filin)
|
|
303 print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime()))
|
|
304 if filin == "":
|
|
305 return "", "", ""
|
|
306
|
|
307 infile = open(filin, 'r')
|
|
308 name = []
|
|
309 genome = []
|
|
310 genome_line = ""
|
|
311 genome_rejected = 0
|
|
312 for line in infile:
|
|
313 if line[0] == ">":
|
|
314 if name != []:
|
|
315 if len(genome_line) >= limit_reference:
|
|
316 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
|
|
317 else:
|
|
318 genome_rejected += 1
|
|
319 name = name[:-1]
|
|
320 genome_line = ""
|
|
321 name.append(line[1:].split('\r')[0].split('\n')[0])
|
|
322 else:
|
|
323 genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0]
|
|
324
|
|
325 if len(genome_line) >= limit_reference:
|
|
326 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge])
|
|
327 genome_line = ""
|
|
328 else:
|
|
329 genome_rejected += 1
|
|
330 name = name[:-1]
|
|
331
|
|
332 infile.close()
|
|
333
|
|
334 if host_test:
|
|
335 return "".join(genome)
|
|
336 else:
|
|
337 return genome, name, genome_rejected
|
|
338 close(filin)
|
|
339
|