0
|
1 #!/usr/bin/env python3
|
|
2 """ sequence repetitive profile conversion to GFF3 format """
|
|
3
|
|
4 import time
|
|
5 import configuration
|
|
6 from operator import itemgetter
|
|
7 from itertools import groupby
|
|
8 from tempfile import NamedTemporaryFile
|
|
9 import os
|
|
10
|
|
11
|
|
12 def create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict, headers):
|
|
13 seq_id = None
|
|
14 exclude = set(['ALL'])
|
|
15 gff_tmp_list = []
|
|
16 for repeat in sorted(set(files_dict.keys()).difference(exclude)):
|
|
17 gff_tmp = NamedTemporaryFile(delete=False)
|
|
18 pos_seq_dict = {}
|
|
19 with open(files_dict[repeat][0], "r") as repeat_f, open(
|
|
20 files_dict[repeat][2], "r") as quality_f:
|
|
21 for line_r, line_q in zip(repeat_f, quality_f):
|
|
22 if "chrom" in line_r:
|
|
23 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
|
|
24 configuration.REPEATS_FEATURE, repeat,
|
|
25 pos_seq_dict)
|
|
26 seq_id = line_r.rstrip().split("chrom=")[1]
|
|
27 pos_seq_dict = {}
|
|
28 else:
|
|
29 hits = int(line_r.rstrip().split("\t")[1])
|
|
30 if hits >= THRESHOLD:
|
|
31 position = int(line_r.rstrip().split("\t")[0])
|
|
32 pos_seq_dict[position] = line_q.rstrip().split("\t")[1]
|
|
33 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
|
|
34 configuration.REPEATS_FEATURE, repeat, pos_seq_dict)
|
|
35 gff_tmp_list.append(gff_tmp.name)
|
|
36 gff_tmp.close()
|
|
37 sort_records(gff_tmp_list, headers, OUTPUT_GFF)
|
|
38 for tmp in gff_tmp_list:
|
|
39 os.unlink(tmp)
|
|
40
|
|
41
|
|
42 def idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_file, feature, repeat,
|
|
43 pos_seq_dict):
|
|
44 indices = sorted(pos_seq_dict.keys(), key=int)
|
|
45 for key, group in groupby(
|
|
46 enumerate(indices),
|
|
47 lambda index_item: index_item[0] - index_item[1]):
|
|
48 group = list(map(itemgetter(1), group))
|
|
49 if len(group) > THRESHOLD_SEGMENT:
|
|
50 sum_qual = 0
|
|
51 for position in group:
|
|
52 sum_qual += int(pos_seq_dict[position])
|
|
53 qual_per_reg = sum_qual / len(group)
|
|
54 # Take boundaries of the group vectors
|
|
55 gff_file.write(
|
|
56 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
|
|
57 seq_id, configuration.SOURCE_PROFREP, feature, group[
|
|
58 0], group[-1], configuration.GFF_EMPTY,
|
|
59 configuration.GFF_EMPTY, configuration.GFF_EMPTY, repeat,
|
|
60 round(qual_per_reg)).encode("utf-8"))
|
|
61
|
|
62
|
|
63 def idx_ranges_N(indices, THRESHOLD_SEGMENT, seq_id, gff_file, feature,
|
|
64 att_name):
|
|
65 for key, group in groupby(
|
|
66 enumerate(indices),
|
|
67 lambda index_item: index_item[0] - index_item[1]):
|
|
68 group = list(map(itemgetter(1), group))
|
|
69 if len(group) > THRESHOLD_SEGMENT:
|
|
70 # Take boundaries of the group vectors
|
|
71 gff_file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
|
|
72 seq_id, configuration.SOURCE_PROFREP, feature, group[0], group[
|
|
73 -1], configuration.GFF_EMPTY, configuration.GFF_EMPTY,
|
|
74 configuration.GFF_EMPTY, att_name))
|
|
75
|
|
76
|
|
77 def sort_records(gff_tmp_list, headers, OUTPUT_GFF):
|
|
78 opened_files = [open(i, "r") for i in gff_tmp_list]
|
|
79 files_lines = dict((key, "") for key in opened_files)
|
|
80 count_without_line = 0
|
|
81 count_seq = 0
|
|
82 ####################################################################
|
|
83 present_seqs = headers
|
|
84 ####################################################################
|
|
85 with open(OUTPUT_GFF, "w") as final_gff:
|
|
86 final_gff.write("{}\n".format(configuration.HEADER_GFF))
|
|
87 while True:
|
|
88 for file_name in opened_files:
|
|
89 if not files_lines[file_name]:
|
|
90 line = file_name.readline()
|
|
91 if line:
|
|
92 files_lines[file_name] = line
|
|
93 else:
|
|
94 count_without_line += 1
|
|
95 if count_without_line == len(opened_files):
|
|
96 break
|
|
97 count_without_line = 0
|
|
98 count = 0
|
|
99 lowest_pos = float("inf")
|
|
100 for file_key in files_lines.keys():
|
|
101 if files_lines[file_key].split("\t")[0] == present_seqs[
|
|
102 count_seq]:
|
|
103 count += 1
|
|
104 start_pos = int(files_lines[file_key].split("\t")[3])
|
|
105 if start_pos < lowest_pos:
|
|
106 lowest_pos = start_pos
|
|
107 record_to_write = files_lines[file_key]
|
|
108 lowest_file_key = file_key
|
|
109 if count == 0:
|
|
110 count_seq += 1
|
|
111 else:
|
|
112 final_gff.write(record_to_write)
|
|
113 files_lines[lowest_file_key] = ""
|
|
114
|
|
115
|
|
116 def main():
|
|
117 pass
|
|
118
|
|
119
|
|
120 if __name__ == "__main__":
|
|
121 main()
|