annotate gff.py @ 5:ad3bbf392135 draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 11:14:05 -0400
parents a5f1638b73be
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
1 #!/usr/bin/env python3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
2 """ sequence repetitive profile conversion to GFF3 format """
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
3
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
4 import time
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
5 import configuration
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
6 from operator import itemgetter
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
7 from itertools import groupby
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
8 from tempfile import NamedTemporaryFile
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
9 import os
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
10
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
11
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
12 def create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict, headers):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
13 seq_id = None
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
14 exclude = set(['ALL'])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
15 gff_tmp_list = []
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
16 for repeat in sorted(set(files_dict.keys()).difference(exclude)):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
17 gff_tmp = NamedTemporaryFile(delete=False)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
18 pos_seq_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
19 with open(files_dict[repeat][0], "r") as repeat_f, open(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
20 files_dict[repeat][2], "r") as quality_f:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
21 for line_r, line_q in zip(repeat_f, quality_f):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
22 if "chrom" in line_r:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
23 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
24 configuration.REPEATS_FEATURE, repeat,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
25 pos_seq_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
26 seq_id = line_r.rstrip().split("chrom=")[1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
27 pos_seq_dict = {}
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
28 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
29 hits = int(line_r.rstrip().split("\t")[1])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
30 if hits >= THRESHOLD:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
31 position = int(line_r.rstrip().split("\t")[0])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
32 pos_seq_dict[position] = line_q.rstrip().split("\t")[1]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
33 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
34 configuration.REPEATS_FEATURE, repeat, pos_seq_dict)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
35 gff_tmp_list.append(gff_tmp.name)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
36 gff_tmp.close()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
37 sort_records(gff_tmp_list, headers, OUTPUT_GFF)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
38 for tmp in gff_tmp_list:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
39 os.unlink(tmp)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
40
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
41
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
42 def idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_file, feature, repeat,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
43 pos_seq_dict):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
44 indices = sorted(pos_seq_dict.keys(), key=int)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
45 for key, group in groupby(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
46 enumerate(indices),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
47 lambda index_item: index_item[0] - index_item[1]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
48 group = list(map(itemgetter(1), group))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
49 if len(group) > THRESHOLD_SEGMENT:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
50 sum_qual = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
51 for position in group:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
52 sum_qual += int(pos_seq_dict[position])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
53 qual_per_reg = sum_qual / len(group)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
54 # Take boundaries of the group vectors
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
55 gff_file.write(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
56 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
57 seq_id, configuration.SOURCE_PROFREP, feature, group[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
58 0], group[-1], configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
59 configuration.GFF_EMPTY, configuration.GFF_EMPTY, repeat,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
60 round(qual_per_reg)).encode("utf-8"))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
61
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
62
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
63 def idx_ranges_N(indices, THRESHOLD_SEGMENT, seq_id, gff_file, feature,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
64 att_name):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
65 for key, group in groupby(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
66 enumerate(indices),
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
67 lambda index_item: index_item[0] - index_item[1]):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
68 group = list(map(itemgetter(1), group))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
69 if len(group) > THRESHOLD_SEGMENT:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
70 # Take boundaries of the group vectors
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
71 gff_file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
72 seq_id, configuration.SOURCE_PROFREP, feature, group[0], group[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
73 -1], configuration.GFF_EMPTY, configuration.GFF_EMPTY,
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
74 configuration.GFF_EMPTY, att_name))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
75
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
76
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
77 def sort_records(gff_tmp_list, headers, OUTPUT_GFF):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
78 opened_files = [open(i, "r") for i in gff_tmp_list]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
79 files_lines = dict((key, "") for key in opened_files)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
80 count_without_line = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
81 count_seq = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
82 ####################################################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
83 present_seqs = headers
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
84 ####################################################################
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
85 with open(OUTPUT_GFF, "w") as final_gff:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
86 final_gff.write("{}\n".format(configuration.HEADER_GFF))
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
87 while True:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
88 for file_name in opened_files:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
89 if not files_lines[file_name]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
90 line = file_name.readline()
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
91 if line:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
92 files_lines[file_name] = line
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
93 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
94 count_without_line += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
95 if count_without_line == len(opened_files):
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
96 break
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
97 count_without_line = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
98 count = 0
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
99 lowest_pos = float("inf")
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
100 for file_key in files_lines.keys():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
101 if files_lines[file_key].split("\t")[0] == present_seqs[
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
102 count_seq]:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
103 count += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
104 start_pos = int(files_lines[file_key].split("\t")[3])
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
105 if start_pos < lowest_pos:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
106 lowest_pos = start_pos
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
107 record_to_write = files_lines[file_key]
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
108 lowest_file_key = file_key
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
109 if count == 0:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
110 count_seq += 1
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
111 else:
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
112 final_gff.write(record_to_write)
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
113 files_lines[lowest_file_key] = ""
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
114
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
115
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
116 def main():
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
117 pass
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
118
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
119
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
120 if __name__ == "__main__":
a5f1638b73be Uploaded
petr-novak
parents:
diff changeset
121 main()