Mercurial > repos > bioit_sciensano > phagetermvirome
comparison _modules/IData_handling.py @ 0:69e8f12c8b31 draft
"planemo upload"
author | bioit_sciensano |
---|---|
date | Fri, 11 Mar 2022 15:06:20 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:69e8f12c8b31 |
---|---|
1 ## @file IData_handling.py | |
2 # | |
3 # VL: Gather here the classes and functions useful for handling input data. | |
4 from __future__ import print_function | |
5 | |
6 import gzip | |
7 from _modules.utilities import reverseComplement,changeCase | |
8 from time import gmtime, strftime | |
9 import datetime | |
10 | |
11 try: | |
12 import cPickle as pickle | |
13 except ImportError: # python 3.x | |
14 import pickle | |
15 | |
16 | |
17 ## This class encapsulates the reference sequences, the host sequence if any and all useful information about the sequences. | |
18 # | |
19 # It is used both for searching the read extracts in the sequences and for exploiting the results | |
20 class refData: | |
21 def __init__(self,refseq_list,seed,hostseq): | |
22 self.refseq_list=refseq_list | |
23 self.seed=seed | |
24 self.hostseq=hostseq | |
25 if hostseq!="": | |
26 self.refseq_list.insert(0,hostseq) | |
27 self.nb_sequences=len(refseq_list) | |
28 | |
29 def getIdxSeq(self,refseq): | |
30 idx=-1 | |
31 found=False | |
32 for s in self.refseq_list: | |
33 idx += 1 | |
34 if s==refseq: | |
35 found=True | |
36 break | |
37 if not found: | |
38 raise RuntimeError("Couldn't find sequence in list of ref sequences.") | |
39 return idx | |
40 | |
41 | |
42 def IdxIsHostseq(self,idx_seq): | |
43 if (((self.hostseq == "") and (idx_seq <= self.nb_sequences - 1)) or ( | |
44 (self.hostseq != "") and (idx_seq >0))): | |
45 return False | |
46 return True | |
47 | |
48 def getSeqSizesList(self): | |
49 seq_sizes_list = [] | |
50 for seq in self.refseq_list: | |
51 seq_sizes_list.append(len(seq)) | |
52 return seq_sizes_list | |
53 | |
54 | |
55 ## Base class for handling read extracts. | |
56 # | |
57 # This class should not be used directly. | |
58 class ReadExtracts: | |
59 def __init__(self,seed): | |
60 self.seed = seed | |
61 self.r_extracts_list = [] | |
62 self.nb_reads = 0 | |
63 self.nb_extracts=0 | |
64 | |
65 ## Returns the list of read extracts from the loaded dataset, the number of reads and the total number of extracts | |
66 def getRExtracts(self): | |
67 return self.r_extracts_list,self.nb_reads,self.nb_extracts | |
68 | |
69 ## Class containing all the read extracts (PE reads) that must be mapped against a sequence. | |
70 class readExtractsPE(ReadExtracts): | |
71 def __init__(self,seed): | |
72 self.__init__(seed) | |
73 | |
74 | |
75 def addRead(self, whole_PE1,whole_PE2): | |
76 self.r_extracts_list.append(whole_PE1[:self.seed]) | |
77 self.r_extracts_list.append(whole_PE1[-self.seed:]) | |
78 self.r_extracts_list.append(whole_PE2[:self.seed]) | |
79 self.r_extracts_list.append(reverseComplement(whole_PE2)[:self.seed]) | |
80 self.r_extracts_list.append(reverseComplement(whole_PE2)[-self.seed:]) | |
81 self.nb_reads += 1 | |
82 self.nb_extracts += 5 # Number of extracts per read: 2 extracts for PE1 and 3 for PE2. | |
83 | |
84 | |
85 | |
86 ## Class containing all the read extracts (single reads) that must be mapped against a sequence. | |
87 class readsExtractsS(ReadExtracts): | |
88 def __init__(self,seed): | |
89 ReadExtracts.__init__(self,seed) | |
90 | |
91 ## Adds a read to the list of extracts | |
92 # | |
93 # @param whole_read The read as extracted from the fastq file | |
94 # @param no_pair This paramenter s only used to make the distinction between Single and paired. | |
95 # Note VL: I didn't use meta programming here because I thought that it would have a negative impact on performance. | |
96 # TODO: test it when all the rest works. | |
97 def addRead(self,whole_read,no_pair=""): | |
98 read_part = whole_read[:self.seed] | |
99 self.r_extracts_list.append(read_part) | |
100 self.r_extracts_list.append(whole_read[-self.seed:]) | |
101 self.r_extracts_list.append(reverseComplement(whole_read)[:self.seed]) | |
102 self.r_extracts_list.append(reverseComplement(whole_read)[-self.seed:]) | |
103 self.nb_reads+=1 | |
104 self.nb_extracts += 4 | |
105 | |
106 ## use objects of this class to store read offset (PE1 and PE2) in files. | |
107 class ReadInfo: | |
108 def __init__(self, off_PE1,whole_read,seed,off_PE2=None): | |
109 self.offset1=off_PE1 | |
110 self.offset2=off_PE2 | |
111 self.corlen = len(whole_read) - seed | |
112 | |
113 ## Gets the number of reads in the fastq file | |
114 # def getNbReads(fastq): | |
115 # with open(fastq) as f: | |
116 # for i, l in enumerate(f): | |
117 # pass | |
118 # nb_r=i+1 | |
119 # nb_r=nb_r/4 | |
120 # return nb_r | |
121 | |
122 | |
123 | |
124 ## loads a chunk of reads for mapping on GPU. | |
125 # Yields a ReadExtracts object plus a dictionnary of ReadInfo. | |
126 # keeps in memory the parsing state of the file. | |
127 # @param ch_size is in number of reads | |
128 # @reset_ids indicates whether or not we want read index to be reset to 0 at the beginning of each chunk. | |
129 def getChunk(fastq,seed,paired,ch_siz,reset_ids=True): | |
130 new_chunk = False | |
131 d_rinfo=dict() | |
132 idx_read=0 | |
133 off2=None | |
134 filin = open(fastq) | |
135 line = filin.readline() | |
136 read_paired="" | |
137 if paired != "": | |
138 RE=readExtractsPE(seed) | |
139 filin_paired = open(paired) | |
140 line_paired = filin_paired.readline() | |
141 else: | |
142 RE=readsExtractsS(seed) | |
143 | |
144 start = False | |
145 num_line=0 | |
146 while line: | |
147 # Read sequence | |
148 read = line.split("\n")[0].split("\r")[0] | |
149 if paired != "": | |
150 read_paired = line_paired.split("\n")[0].split("\r")[0] | |
151 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read. | |
152 start = True | |
153 off1=filin.tell() | |
154 line = filin.readline() | |
155 if paired != "": | |
156 off2=filin_paired.tell() | |
157 line_paired = filin_paired.readline() | |
158 continue | |
159 if (start == True): | |
160 start = False | |
161 readlen = len(read) | |
162 if readlen < seed: | |
163 line = filin.readline() | |
164 if paired !="": | |
165 line_paired = filin_paired.readline() # also skip PE2 in that case | |
166 continue | |
167 RE.addRead(read,read_paired) | |
168 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2) | |
169 if (idx_read>0 and ((idx_read+1)%(ch_siz)==0)): | |
170 yield RE,d_rinfo | |
171 if (reset_ids): | |
172 idx_read=0 | |
173 new_chunk=True | |
174 if paired != "": | |
175 RE = readExtractsPE(seed) | |
176 else: | |
177 RE = readsExtractsS(seed) | |
178 d_rinfo = dict() | |
179 if not new_chunk: | |
180 idx_read+=1 | |
181 else: | |
182 new_chunk=False | |
183 | |
184 line = filin.readline() | |
185 if paired!="": | |
186 line_paired = filin_paired.readline() | |
187 filin.close() | |
188 if paired !="": | |
189 filin_paired.close() | |
190 yield RE, d_rinfo | |
191 | |
192 ## dumps a dictionnary of ReadInfo objects indexed on read index. | |
193 # | |
194 # @param d_rinfo dictionnary to dump | |
195 # @param fic filename (incl. full path) where to dump | |
196 def dump_d_rinfo(d_rinfo,fic): | |
197 with open(fic, 'wb') as fp: | |
198 pickle.dump(d_rinfo, fp, protocol=pickle.HIGHEST_PROTOCOL) | |
199 | |
200 ## Loads a dictionnary of ReadInfo objects. | |
201 def load_d_rinfo(fic): | |
202 with open(fic, 'rb') as fp: | |
203 d_rinfo = pickle.load(fp) | |
204 return d_rinfo | |
205 | |
206 | |
207 ## loads all extracts of reads into a list for processing on GPU. | |
208 # | |
209 # returns 1 or 2 readExtracts objects plus a dictionnary of ReadInfo. | |
210 def getAllReads(fastq,seed,paired): | |
211 d_rinfo=dict() | |
212 idx_read=0 | |
213 off2=None | |
214 filin = open(fastq) | |
215 line = filin.readline() | |
216 read_paired="" | |
217 | |
218 if paired != "": | |
219 RE=readExtractsPE(seed) | |
220 filin_paired = open(paired) | |
221 line_paired = filin_paired.readline() | |
222 else: | |
223 RE=readsExtractsS(seed) | |
224 | |
225 start = False | |
226 num_line=0 | |
227 while line: | |
228 # Read sequence | |
229 read = line.split("\n")[0].split("\r")[0] | |
230 if paired != "": | |
231 read_paired = line_paired.split("\n")[0].split("\r")[0] | |
232 if (read[0] == '@' and num_line%4 == 0): # make sure we don't take into account a quality score instead of a read. | |
233 start = True | |
234 off1=filin.tell() | |
235 line = filin.readline() | |
236 if paired != "": | |
237 off2=filin_paired.tell() | |
238 line_paired = filin_paired.readline() | |
239 continue | |
240 if (start == True): | |
241 start = False | |
242 readlen = len(read) | |
243 if readlen < seed: | |
244 line = filin.readline() | |
245 if paired !="": | |
246 line_paired = filin_paired.readline() # also skip PE2 in that case | |
247 continue | |
248 RE.addRead(read,read_paired) | |
249 d_rinfo[idx_read]=ReadInfo(off1,read,seed,off2) | |
250 idx_read+=1 | |
251 | |
252 line = filin.readline() | |
253 if paired!="": | |
254 line_paired = filin_paired.readline() | |
255 filin.close() | |
256 if paired !="": | |
257 filin_paired.close() | |
258 return RE,d_rinfo | |
259 | |
260 ## use this class to retrieve reads from fastq file. | |
261 class ReadGetter: | |
262 ## constructor | |
263 # | |
264 # @param fastq Name of the fastq file that contains the read | |
265 # @param d_rinfo A dictionnary of ReadInfo objects that contains the offset indicating where the read starts in the file. | |
266 # @param paired The name of the file containing the PE2 (defaults to ""). | |
267 def __init__(self,fastq,d_rinfo,paired=""): | |
268 self.filin=open(fastq) | |
269 self.d_rinfo=d_rinfo | |
270 self.paired=paired | |
271 if paired!="": | |
272 self.filinp=open(fastq) | |
273 | |
274 def getOneRead(self,idx_read): | |
275 read_paired="" | |
276 self.filin.seek(self.d_rinfo[idx_read].offset1) | |
277 read=self.filin.readline() | |
278 if self.paired!="": | |
279 self.filinp.seek(self.d_rinfo[idx_read].offset2) | |
280 read_paired = self.filinp.readline() | |
281 return read,read_paired | |
282 | |
283 | |
284 ### READS Functions | |
285 def totReads(filin): | |
286 """Verify and retrieve the number of reads in the fastq file before alignment""" | |
287 if filin.endswith('.gz'): | |
288 filein = gzip.open(filin, 'rb') | |
289 else: | |
290 filein = open(filin, 'r') | |
291 | |
292 line = 0 | |
293 while filein.readline(): | |
294 line += 1 | |
295 seq = float(round(line / 4)) | |
296 filein.close() | |
297 return seq | |
298 | |
299 ### SEQUENCE parsing function | |
300 def genomeFastaRecovery(filin, limit_reference, edge, host_test = 0): | |
301 """Get genome sequence from fasta file""" | |
302 print("recovering genome from: ",filin) | |
303 print(strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())) | |
304 if filin == "": | |
305 return "", "", "" | |
306 | |
307 infile = open(filin, 'r') | |
308 name = [] | |
309 genome = [] | |
310 genome_line = "" | |
311 genome_rejected = 0 | |
312 for line in infile: | |
313 if line[0] == ">": | |
314 if name != []: | |
315 if len(genome_line) >= limit_reference: | |
316 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge]) | |
317 else: | |
318 genome_rejected += 1 | |
319 name = name[:-1] | |
320 genome_line = "" | |
321 name.append(line[1:].split('\r')[0].split('\n')[0]) | |
322 else: | |
323 genome_line += changeCase(line).replace(' ', '').split('\r')[0].split('\n')[0] | |
324 | |
325 if len(genome_line) >= limit_reference: | |
326 genome.append(genome_line[-edge:] + genome_line + genome_line[:edge]) | |
327 genome_line = "" | |
328 else: | |
329 genome_rejected += 1 | |
330 name = name[:-1] | |
331 | |
332 infile.close() | |
333 | |
334 if host_test: | |
335 return "".join(genome) | |
336 else: | |
337 return genome, name, genome_rejected | |
338 close(filin) | |
339 |