comparison detect_putative_ltr_wrapper.py @ 13:559940c04c44 draft

"planemo upload commit 139c041f671459192beb10ae45a8b371367c23b6"
author petr-novak
date Thu, 11 Aug 2022 07:29:06 +0000
parents ff01d4263391
children
12:ff01d4263391 13:559940c04c44
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """This wrapper is intended to be used on large genomes and large DANTE input to 2 """This wrapper is intended to be used on large genomes and large DANTE input to
3 minimize memory usage. It splits the input files into pieces and analyzes them one 3 minimize memory usage. It splits the input files into pieces and analyzes them one
4 by one with detect_putative_ltr.R. 4 by one with detect_putative_ltr.R.
5 If the input does not exceed the memory limit, it runs detect_putative_ltr.R directly. 5 If the input does not exceed the specified max_chunk_size, it runs
6 detect_putative_ltr.R directly.
6 """ 7 """
7 8
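# Illustrative invocation (a sketch, not taken from the tool's help: only the
# -S/--max_chunk_size option is visible in this excerpt, the remaining long
# option names are inferred from their argparse dest names and may differ):
#
#   detect_putative_ltr_wrapper.py --gff3 dante.gff3 \
#       --reference_sequence genome.fasta --output out_prefix \
#       --cpu 4 --max_chunk_size 100000000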
8 import argparse 9 import argparse
9 import os 10 import os
10 import sys 11 import sys
11 import tempfile 12 import tempfile
12 from itertools import cycle 13 from itertools import cycle
13 import subprocess 14 import subprocess
14 15
15 16
17 class Gff3Feature:
18 """
19 Class for gff3 feature
20 """
21
22 def __init__(self, line):
23 self.line = line
24 self.items = line.strip().split('\t')
25 self.header = self.items[0]
26 self.source = self.items[1]
27 self.type = self.items[2]
28 self.start = int(self.items[3])
29 self.end = int(self.items[4])
30 self.score = self.items[5]
31 self.strand = self.items[6]
32 self.frame = self.items[7]
33 self.attributes = self.items[8]
34 self.attributes_dict = {}
35 for item in self.attributes.split(';'):
36 if item != '':
37 key, value = item.split('=')
38 self.attributes_dict[key] = value
39
40 self.attributes_str = ';'.join(
41 ['{}={}'.format(key, value) for key, value in self.attributes_dict.items()]
42 )
43
44 def __str__(self):
45 return '\t'.join(
46 [self.header, self.source, self.type, str(self.start), str(self.end),
47 self.score, self.strand, self.frame, self.attributes_str]
48 ) + '\n'
49
50 def __repr__(self):
51 return '\t'.join(
52 [self.header, self.source, self.type, str(self.start), str(self.end),
53 self.score, self.strand, self.frame, self.attributes_str]
54 ) + '\n'
55
56 def __eq__(self, other):
57 return self.line_recalculated() == other.line_recalculated()
58
59 def __hash__(self):
60 return hash(self.line_recalculated())
61
62 def get_line(self):
63 """returns original line"""
64 return self.line
65
66 def overlap(self, other):
67 """
68 Check if two features overlap
69 :param other:
70 :return:
71 """
72 if self.start <= other.end and self.end >= other.start:
73 return True
74 else:
75 return False
76
77 def line_recalculated(self):
78 """
79 :return:
80 string with recalculated line
81 """
82 return '\t'.join(
83 [self.header, self.source, self.type, str(self.start), str(self.end),
84 self.score, self.strand, self.frame, self.attributes_str]
85 ) + '\n'
86
87 def __lt__(self, other):
88 width = self.end - self.start
89 other_width = other.end - other.start
90 return width < other_width
91
92 def __gt__(self, other):
93 width = self.end - self.start
94 other_width = other.end - other.start
95 return width > other_width
96
97 def identical_region(self, other):
98 """
99 Check if two features are identical
100 :param other:
101 :return:
102 """
103 if self.start == other.start and self.end == other.end and self.header == \
104 other.header:
105 return True
106 else:
107 return False
108
109
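A minimal sketch of how the Gff3Feature class above behaves (the GFF3 record and IDs are made up):

line = 'chr1\tdante_ltr\ttransposable_element\t1000\t5000\t.\t+\t.\tID=TE_1\n'
feature = Gff3Feature(line)
other = Gff3Feature('chr1\tdante_ltr\ttransposable_element\t4000\t6000\t.\t+\t.\tID=TE_2\n')
print(feature.start, feature.end, feature.attributes_dict['ID'])  # 1000 5000 TE_1
print(feature.overlap(other), feature > other)                    # True True (feature is wider)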
16 def get_arguments(): 110 def get_arguments():
17 """ 111 """
18 Get arguments from command line 112 Get arguments from command line
19 :return: 113 :return:
20 args 114 args
48 "considered for retrotransposon detection" 142 "considered for retrotransposon detection"
49 ) 143 )
50 parser.add_argument( 144 parser.add_argument(
51 '-S', '--max_chunk_size', default=100000000, required=False, type=int, 145 '-S', '--max_chunk_size', default=100000000, required=False, type=int,
52 help='If size of reference sequence is greater than this value, reference is ' 146 help='If size of reference sequence is greater than this value, reference is '
53 'analyzed in chunks of this size. This is just an approximate value - ' 147 'analyzed in chunks of this size. Default is %(default)s. '
54 'sequences ' 148 'Setting this value too small will slow down the analysis.'
55 'which are longer are not split, default is %(default)s'
56 ) 149 )
57 args = parser.parse_args() 150 args = parser.parse_args()
58 return args 151 return args
59 152
60 153
79 filepaths 172 filepaths
80 """ 173 """
81 temp_files = [] 174 temp_files = []
82 for i in range(number_of_files): 175 for i in range(number_of_files):
83 temp_files.append(tempfile.NamedTemporaryFile(delete=False).name) 176 temp_files.append(tempfile.NamedTemporaryFile(delete=False).name)
177 os.remove(temp_files[-1])
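# (NamedTemporaryFile(delete=False) creates the file on disk; removing it right away
# keeps only the unique path, which the caller re-creates when it opens the file for
# writing - presumably to avoid leaving stray empty files for names that are never used)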
84 return temp_files 178 return temp_files
85 179
86 180
87 def sum_up_stats_files(files): 181 def sum_up_stats_files(files):
88 """ 182 """
110 statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts])) 204 statistics_str.append(classification + '\t' + '\t'.join([str(x) for x in counts]))
111 sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str) 205 sorted_stat_with_header = ['\t'.join(header)] + sorted(statistics_str)
112 return sorted_stat_with_header 206 return sorted_stat_with_header
113 207
114 208
209 def read_single_fasta_to_dictionary(fh):
210 """
211 Read fasta file into dictionary
212 :param fh:
213 :return:
214 fasta_dict
215 """
216 fasta_dict = {}
217 for line in fh:
218 if line[0] == '>':
219 header = line.strip().split(' ')[0][1:] # remove part of name after space
220 fasta_dict[header] = []
221 else:
222 fasta_dict[header] += [line.strip()]
223 fasta_dict = {k: ''.join(v) for k, v in fasta_dict.items()}
224 return fasta_dict
225
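For example (a sketch with made-up sequences), multi-line records are collapsed into single strings keyed by the first word of each header:

from io import StringIO
fasta = StringIO('>chr1 assembly v1\nACGT\nACGT\n>chr2\nTTTT\n')
read_single_fasta_to_dictionary(fasta)
# -> {'chr1': 'ACGTACGT', 'chr2': 'TTTT'}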
226
227 def split_fasta_to_chunks(fasta_file, chunk_size=100000000, overlap=100000):
228 """
229 Split fasta file into chunks. Sequences longer than the chunk size are split into
230 overlapping pieces; if sequences are shorter, chunks containing multiple sequences
231 are created.
232
233 :param fasta_file:
234 :param chunk_size:
235 :param overlap:
236 :return:
237 fasta_file_split
238 matching_table
239 """
240 min_chunk_size = chunk_size * 2
241 fasta_dict = read_fasta_sequence_size(fasta_file)
242 # calculates ranges for splitting of fasta files and store them in list
243 matching_table = []
244 fasta_file_split = tempfile.NamedTemporaryFile(delete=False).name
245 for header, size in fasta_dict.items():
246 if size > min_chunk_size:
247 number_of_chunks = int(size / chunk_size)
248 adjusted_chunk_size = int(size / number_of_chunks)
249 for i in range(number_of_chunks):
250 start = i * adjusted_chunk_size
251 end = ((i + 1) *
252 adjusted_chunk_size
253 + overlap) if i + 1 < number_of_chunks else size
254 new_header = header + '_' + str(i)
255 matching_table.append([header, i, start, end, new_header])
256 else:
257 new_header = header + '_0'
258 matching_table.append([header, 0, 0, size, new_header])
259 # read sequences from fasta files and split them to chunks according to matching table
260 # open output and input files, use with statement to close files
261 fasta_dict = read_single_fasta_to_dictionary(open(fasta_file, 'r'))
262 with open(fasta_file_split, 'w') as fh_out:
263 for header in fasta_dict:
264 matching_table_part = [x for x in matching_table if x[0] == header]
265 for header2, i, start, end, new_header in matching_table_part:
266 fh_out.write('>' + new_header + '\n')
267 fh_out.write(fasta_dict[header][start:end] + '\n')
268 return fasta_file_split, matching_table
269
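A worked example of the splitting rule above (illustrative sizes): with chunk_size = 100 Mb and overlap = 0.1 Mb, only sequences longer than min_chunk_size = 200 Mb are split; a 250 Mb sequence gives number_of_chunks = int(250/100) = 2 and adjusted_chunk_size = 125 Mb, while a 150 Mb sequence stays whole.

# matching_table rows for the hypothetical 250 Mb 'chr1' and 150 Mb 'chr2':
# [header, chunk index, start,        end,          new header]
# ['chr1', 0,           0,            125_100_000,  'chr1_0']   # 125 Mb + overlap
# ['chr1', 1,           125_000_000,  250_000_000,  'chr1_1']   # last chunk, no overlap added
# ['chr2', 0,           0,            150_000_000,  'chr2_0']   # below min_chunk_size, kept whole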
270
271 def get_new_header_and_coordinates(header, start, end, matching_table):
272 """
273 Get new header and coordinates for sequence
274 :param header:
275 :param start:
276 :param end:
277 :param matching_table:
278 :return:
279 new_header
280 new_start
281 new_end
282 """
283 matching_table_part = [x for x in matching_table if x[0] == header]
284 new_coords = []
285 for chunk in matching_table_part:
286 if chunk[2] <= start < chunk[3]:
287 new_header = chunk[4]
288 new_start = start - chunk[2]
289 new_end = end - chunk[2]
290 new_sequence_length = chunk[3] - chunk[2]
291 new_coords.append([new_header, new_start, new_end, new_sequence_length])
292 return new_coords
293
294
295 def get_original_header_and_coordinates(new_header, new_start, new_end, matching_table):
296 """
297 Get original header and coordinates for sequence
298 :param new_header:
299 :param new_start:
300 :param new_end:
301 :param matching_table:
302 :return:
303 original_header
304 original_start
305 original_end
306 """
307 matching_table_part = [x for x in matching_table if x[4] == new_header]
308 ori_header = matching_table_part[0][0]
309 start = matching_table_part[0][2]
310 ori_start = new_start + start
311 ori_end = new_end + start
312 return ori_header, ori_start, ori_end
313
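Continuing the hypothetical matching_table above, the two helpers invert each other for a feature that falls entirely inside one chunk:

matching_table = [['chr1', 0, 0, 125_100_000, 'chr1_0'],
                  ['chr1', 1, 125_000_000, 250_000_000, 'chr1_1']]
get_new_header_and_coordinates('chr1', 130_000_000, 130_005_000, matching_table)
# -> [['chr1_1', 5_000_000, 5_005_000, 125_000_000]]
get_original_header_and_coordinates('chr1_1', 5_000_000, 5_005_000, matching_table)
# -> ('chr1', 130_000_000, 130_005_000)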
314
315 # recalculate gff3 coordinates, use gff3_feature class
316 def recalculate_gff3_coordinates(gff3_file, matching_table):
317 """
318 Recalculate gff3 coordinates, use gff3_feature class
319 :param gff3_file:
320 :param matching_table:
321 :return:
322 gff3_file_recalculated
323 """
324 gff3_file_recalculated = tempfile.NamedTemporaryFile(delete=False).name
325
326 with open(gff3_file, 'r') as fh_in:
327 with open(gff3_file_recalculated, 'w') as fh_out:
328 for line in fh_in:
329 if line[0] == '#':
330 fh_out.write(line)
331 else:
332 feature = Gff3Feature(line)
333 new_coords = get_new_header_and_coordinates(
334 feature.header, feature.start, feature.end, matching_table
335 )
336 for new_header, new_start, new_end, sequence_length in new_coords:
337 if new_start >= 1 and new_end <= sequence_length:
338 feature.header = new_header
339 feature.start = new_start
340 feature.end = new_end
341 fh_out.write(str(feature))
342 return gff3_file_recalculated
343
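Note that the new_start >= 1 and new_end <= sequence_length test above silently drops features whose recalculated coordinates do not fit entirely inside a chunk; the overlap added by split_fasta_to_chunks is presumably there so that such boundary-spanning features (up to the overlap length) still fall fully inside a neighbouring chunk.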
344
345 # recalculate gff3 back to original coordinates, use gff3_feature class
346 def recalculate_gff3_back_to_original_coordinates(gff3_file, matching_table):
347 """
348 Recalculate gff3 back to original coordinates, use gff3_feature class
349 :param gff3_file:
350 :param matching_table:
351 :return:
352 gff3_file_recalculated
353 """
354 gff3_file_recalculated = tempfile.NamedTemporaryFile(delete=False).name
355 with open(gff3_file, 'r') as fh_in:
356 with open(gff3_file_recalculated, 'w') as fh_out:
357 for line in fh_in:
358 if line[0] == '#':
359 fh_out.write(line)
360 else:
361 feature = Gff3Feature(line)
362 ori_header, ori_start, ori_end = get_original_header_and_coordinates(
363 feature.header, feature.start, feature.end, matching_table
364 )
365 feature.header = ori_header
366 feature.start = ori_start
367 feature.end = ori_end
368 fh_out.write(str(feature))
369 return gff3_file_recalculated
370
371
372 def get_feature_attributes(line):
373 """
374 Get attributes as dictionary from gff3 list
375 :param line:
376 :return:
377 attributes_dict
378 """
379 attributes_dict = {}
380 for item in line[8].split(';'):
381 if item.strip():
382 key, value = item.strip().split('=')
383 attributes_dict[key] = value
384 return attributes_dict
385
386
387 def get_unique_features(gff3_file):
388 """
389 Return a list of IDs of non-overlapping features.
390
391 :param gff3_file:
392 :return:
393 good_id
394 """
395 good_id = []
396 feature_list = []
397 with open(gff3_file, 'r') as fh:
398 for line in fh:
399 if line[0] == '#':
400 continue
401 feature = Gff3Feature(line)
402 if feature.type != 'transposable_element':
403 continue
404 feature_list.append(feature)
405 feature_list.sort(key=lambda x: (x.start, x.header))  # sort by start position and header
406 i = 0
407 while i < len(feature_list) - 1:
408 ch = feature_list[i].header == feature_list[i + 1].header
409 if not ch:
410 good_id.append(feature_list[i].attributes_dict['ID'])
411 i += 1
412 continue
413 if feature_list[i].identical_region(feature_list[i + 1]):
414 # identical position
415 good_id.append(feature_list[i].attributes_dict['ID'])
416 i += 2
417 continue
418 if feature_list[i].overlap(feature_list[i + 1]):
419 # overlap
420 if feature_list[i] > feature_list[i + 1]:
421 good_id.append(feature_list[i].attributes_dict['ID'])
422 i += 2
423 continue
424 else:
425 good_id.append(feature_list[i + 1].attributes_dict['ID'])
426 i += 2
427 continue
428 else:
429 good_id.append(feature_list[i].attributes_dict['ID'])
430 i += 1
431 if i == len(feature_list) - 1:
432 good_id.append(feature_list[i].attributes_dict['ID'])
433 return good_id
434
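The selection policy above, shown on three hypothetical transposable_element features of one sequence: identical regions are reported once, a partial overlap keeps only the wider feature, and everything else is kept.

# TE_a: chr1 1000-5000, TE_b: chr1 4000-6000 (overlaps TE_a, narrower), TE_c: chr1 9000-9500
# get_unique_features(...) -> ['TE_a', 'TE_c']   # TE_b dropped in favour of the wider TE_a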
435
436 def filter_gff3_file(gff3_file, good_id, output_file):
437 """
438 Filter gff3 file by good_id
439 :param gff3_file:
440 :param good_id:
441 :param output_file:
442 :return:
443 filtered_gff3_file
444 """
445
446 with open(gff3_file, 'r') as fh, open(output_file, 'w') as fout:
447 for line in fh:
448 if line[0] == '#':
449 fout.write(line)
450 else:
451 feature = Gff3Feature(line)
452 if ('ID' in feature.attributes_dict and
453 feature.attributes_dict['ID'] in good_id):
454 fout.write(line)
455 continue
456 if 'Parent' in feature.attributes_dict:
457 if feature.attributes_dict['Parent'] in good_id:
458 fout.write(line)
459 continue
460
461
115 def main(): 462 def main():
116 """ 463 """
117 Main function 464 Main function
118 """ 465 """
119 args = get_arguments() 466 args = get_arguments()
120 # locate directory of current script 467 # locate directory of current script
121 tool_path = os.path.dirname(os.path.realpath(__file__)) 468 tool_path = os.path.dirname(os.path.realpath(__file__))
122 fasta_seq_size = read_fasta_sequence_size(args.reference_sequence) 469 # split fasta file to chunks
470 fasta_file_split, matching_table = split_fasta_to_chunks(
471 args.reference_sequence, chunk_size=args.max_chunk_size
472 )
473 # recalculate gff3 coordinates
474 gff3_file_recalculated = recalculate_gff3_coordinates(args.gff3, matching_table)
475
476 fasta_seq_size = read_fasta_sequence_size(fasta_file_split)
123 total_size = sum(fasta_seq_size.values()) 477 total_size = sum(fasta_seq_size.values())
124 number_of_sequences = len(fasta_seq_size) 478 number_of_sequences = len(fasta_seq_size)
125 if total_size > args.max_chunk_size and number_of_sequences > 1: 479 if total_size > args.max_chunk_size and number_of_sequences > 1:
480 print('running analysis on chunks of ~ {} Mb'.format(args.max_chunk_size // 1000000))
126 # sort dictionary by values 481 # sort dictionary by values
127 seq_id_size_sorted = [i[0] for i in sorted( 482 seq_id_size_sorted = [i[0] for i in sorted(
128 fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True 483 fasta_seq_size.items(), key=lambda x: int(x[1]), reverse=True
129 )] 484 )]
130 number_of_temp_files = int(total_size / args.max_chunk_size) + 1 485 number_of_temp_files = int(total_size / args.max_chunk_size) + 1
135 file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta] 490 file_handles = [open(temp_file, 'w') for temp_file in temp_files_fasta]
136 # make dictionary seq_id_sorted as keys and values as file handles 491 # make dictionary seq_id_sorted as keys and values as file handles
137 seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles))) 492 seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
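# zip() with cycle() deals the size-sorted sequence IDs out to the temp files
# round-robin: e.g. five sequences s1..s5 (largest first, hypothetical names) over
# two files f0 and f1 give {s1: f0, s2: f1, s3: f0, s4: f1, s5: f0}, which keeps
# the total size per chunk file roughly balanced.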
138 493
139 # write sequences to temporary files 494 # write sequences to temporary files
140 with open(args.reference_sequence, 'r') as f: 495 with open(fasta_file_split, 'r') as f:
141 for line in f: 496 for line in f:
142 if line[0] == '>': 497 if line[0] == '>':
143 header = line.strip().split(' ')[0][1:] 498 header = line.strip().split(' ')[0][1:]
144 print(header)
145 seq_id_file_handle_dict[header].write(line) 499 seq_id_file_handle_dict[header].write(line)
146 else: 500 else:
147 seq_id_file_handle_dict[header].write(line) 501 seq_id_file_handle_dict[header].write(line)
502 os.remove(fasta_file_split)
148 # close file handles 503 # close file handles
149 for file_handle in file_handles: 504 for file_handle in file_handles:
150 file_handle.close() 505 file_handle.close()
151 506
152 # split gff3 file to temporary files - 507 # split gff3 file to temporary files -
154 temp_files_gff = make_temp_files(number_of_temp_files) 509 temp_files_gff = make_temp_files(number_of_temp_files)
155 file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff] 510 file_handles = [open(temp_file, 'w') for temp_file in temp_files_gff]
156 # make dictionary seq_id_sorted as keys and values as file handles 511 # make dictionary seq_id_sorted as keys and values as file handles
157 seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles))) 512 seq_id_file_handle_dict = dict(zip(seq_id_size_sorted, cycle(file_handles)))
158 # write gff lines to chunks 513 # write gff lines to chunks
159 with open(args.gff3, 'r') as f: 514 with open(gff3_file_recalculated, 'r') as f:
160 for line in f: 515 for line in f:
161 if line[0] == '#': 516 if line[0] == '#':
162 continue 517 continue
163 else: 518 else:
164 header = line.strip().split('\t')[0] 519 header = line.strip().split('\t')[0]
165 seq_id_file_handle_dict[header].write(line) 520 seq_id_file_handle_dict[header].write(line)
166 # close file handles 521 # close file handles
167 for file_handle in file_handles: 522 for file_handle in file_handles:
168 file_handle.close() 523 file_handle.close()
169 524 os.remove(gff3_file_recalculated)
170 # run retrotransposon detection on each temporary file 525 # run retrotransposon detection on each temporary file
171 output_files = make_temp_files(number_of_temp_files) 526 output_files = make_temp_files(number_of_temp_files)
172 for i in range(number_of_temp_files): 527 for i in range(number_of_temp_files):
173 print('Running retrotransposon detection on file ' + str(i)) 528 print('Running retrotransposon detection on file ' + str(i))
174 subprocess.check_call( 529 subprocess.check_call(
175 [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g', 530 [f'{tool_path}/detect_putative_ltr.R', '-s', temp_files_fasta[i], '-g',
176 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M', 531 temp_files_gff[i], '-o', output_files[i], '-c', str(args.cpu), '-M',
177 str(args.max_missing_domains), '-L', str(args.min_relative_length)] 532 str(args.max_missing_domains), '-L', str(args.min_relative_length)]
178 ) 533 )
179 534
180 # remove all temporary input files 535 #remove all temporary input files
181 for temp_file in temp_files_fasta + temp_files_gff: 536 for temp_file in temp_files_fasta + temp_files_gff:
182 os.remove(temp_file) 537 os.remove(temp_file)
183 538
184 # concatenate output files 539 # concatenate output files
185 output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta', 540 output_file_suffixes = ['_D.fasta', '_DL.fasta', '_DLT.fasta', '_DLTP.fasta',
193 with open(args.output + suffix, 'w') as f: 548 with open(args.output + suffix, 'w') as f:
194 f.write("\n".join(new_statistics)) 549 f.write("\n".join(new_statistics))
195 # remove parsed temporary statistics files 550 # remove parsed temporary statistics files
196 for file in stat_files: 551 for file in stat_files:
197 os.remove(file) 552 os.remove(file)
553 elif suffix == '.gff3':
554 tmp_gff_unfiltered = tempfile.NamedTemporaryFile(delete=False).name
555 with open(tmp_gff_unfiltered, 'w') as f:
556 for i in range(number_of_temp_files):
557 tmp_gff = recalculate_gff3_back_to_original_coordinates(
558 output_files[i] + suffix, matching_table
559 )
560 # remove temporary gff3 file
561 os.remove(output_files[i] + suffix)
562 with open(tmp_gff, 'r') as f_tmp:
563 for line in f_tmp:
564 f.write(line)
565 os.remove(tmp_gff)
566 # filter overlapping features
567 good_id = get_unique_features(tmp_gff_unfiltered)
568 filter_gff3_file(
569 tmp_gff_unfiltered, good_id, args.output + suffix
570 )
571 # remove temporary gff3 file
572 os.remove(tmp_gff_unfiltered)
198 else: 573 else:
199 with open(args.output + suffix, 'w') as f: 574 with open(args.output + suffix, 'w') as f:
200 for i in range(number_of_temp_files): 575 for i in range(number_of_temp_files):
201 # some file may not exist, so we need to check 576 # some file may not exist, so we need to check
202 try: 577 try:
203 with open(output_files[i] + suffix, 'r') as g: 578 with open(output_files[i] + suffix, 'r') as g:
204 for line in g: 579 for line in g:
205 f.write(line) 580 f.write(line)
206 # remove parsed temporary output files 581 # remove parsed temporary output files
207 os.remove(output_files[i]) 582 os.remove(output_files[i] + suffix)
208 except FileNotFoundError: 583 except FileNotFoundError:
209 pass 584 pass
210 else: 585 else:
586 print('running analysis on whole input')
211 # no need to split sequences into chunks 587 # no need to split sequences into chunks
212 subprocess.check_call( 588 subprocess.check_call(
213 [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g', 589 [f'{tool_path}/detect_putative_ltr.R', '-s', args.reference_sequence, '-g',
214 args.gff3, '-o', args.output, '-c', str(args.cpu), '-M', 590 args.gff3, '-o', args.output, '-c', str(args.cpu), '-M',
215 str(args.max_missing_domains), '-L', str(args.min_relative_length)] 591 str(args.max_missing_domains), '-L', str(args.min_relative_length)]