comparison gff.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 #!/usr/bin/env python3
2 """ sequence repetitive profile conversion to GFF3 format """
3
4 import time
5 import configuration
6 from operator import itemgetter
7 from itertools import groupby
8 from tempfile import NamedTemporaryFile
9 import os
10
11
12 def create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict, headers):
13 seq_id = None
14 exclude = set(['ALL'])
15 gff_tmp_list = []
16 for repeat in sorted(set(files_dict.keys()).difference(exclude)):
17 gff_tmp = NamedTemporaryFile(delete=False)
18 pos_seq_dict = {}
19 with open(files_dict[repeat][0], "r") as repeat_f, open(
20 files_dict[repeat][2], "r") as quality_f:
21 for line_r, line_q in zip(repeat_f, quality_f):
22 if "chrom" in line_r:
23 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
24 configuration.REPEATS_FEATURE, repeat,
25 pos_seq_dict)
26 seq_id = line_r.rstrip().split("chrom=")[1]
27 pos_seq_dict = {}
28 else:
29 hits = int(line_r.rstrip().split("\t")[1])
30 if hits >= THRESHOLD:
31 position = int(line_r.rstrip().split("\t")[0])
32 pos_seq_dict[position] = line_q.rstrip().split("\t")[1]
33 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp,
34 configuration.REPEATS_FEATURE, repeat, pos_seq_dict)
35 gff_tmp_list.append(gff_tmp.name)
36 gff_tmp.close()
37 sort_records(gff_tmp_list, headers, OUTPUT_GFF)
38 for tmp in gff_tmp_list:
39 os.unlink(tmp)
40
41
42 def idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_file, feature, repeat,
43 pos_seq_dict):
44 indices = sorted(pos_seq_dict.keys(), key=int)
45 for key, group in groupby(
46 enumerate(indices),
47 lambda index_item: index_item[0] - index_item[1]):
48 group = list(map(itemgetter(1), group))
49 if len(group) > THRESHOLD_SEGMENT:
50 sum_qual = 0
51 for position in group:
52 sum_qual += int(pos_seq_dict[position])
53 qual_per_reg = sum_qual / len(group)
54 # Take boundaries of the group vectors
55 gff_file.write(
56 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
57 seq_id, configuration.SOURCE_PROFREP, feature, group[
58 0], group[-1], configuration.GFF_EMPTY,
59 configuration.GFF_EMPTY, configuration.GFF_EMPTY, repeat,
60 round(qual_per_reg)).encode("utf-8"))
61
62
63 def idx_ranges_N(indices, THRESHOLD_SEGMENT, seq_id, gff_file, feature,
64 att_name):
65 for key, group in groupby(
66 enumerate(indices),
67 lambda index_item: index_item[0] - index_item[1]):
68 group = list(map(itemgetter(1), group))
69 if len(group) > THRESHOLD_SEGMENT:
70 # Take boundaries of the group vectors
71 gff_file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
72 seq_id, configuration.SOURCE_PROFREP, feature, group[0], group[
73 -1], configuration.GFF_EMPTY, configuration.GFF_EMPTY,
74 configuration.GFF_EMPTY, att_name))
75
76
77 def sort_records(gff_tmp_list, headers, OUTPUT_GFF):
78 opened_files = [open(i, "r") for i in gff_tmp_list]
79 files_lines = dict((key, "") for key in opened_files)
80 count_without_line = 0
81 count_seq = 0
82 ####################################################################
83 present_seqs = headers
84 ####################################################################
85 with open(OUTPUT_GFF, "w") as final_gff:
86 final_gff.write("{}\n".format(configuration.HEADER_GFF))
87 while True:
88 for file_name in opened_files:
89 if not files_lines[file_name]:
90 line = file_name.readline()
91 if line:
92 files_lines[file_name] = line
93 else:
94 count_without_line += 1
95 if count_without_line == len(opened_files):
96 break
97 count_without_line = 0
98 count = 0
99 lowest_pos = float("inf")
100 for file_key in files_lines.keys():
101 if files_lines[file_key].split("\t")[0] == present_seqs[
102 count_seq]:
103 count += 1
104 start_pos = int(files_lines[file_key].split("\t")[3])
105 if start_pos < lowest_pos:
106 lowest_pos = start_pos
107 record_to_write = files_lines[file_key]
108 lowest_file_key = file_key
109 if count == 0:
110 count_seq += 1
111 else:
112 final_gff.write(record_to_write)
113 files_lines[lowest_file_key] = ""
114
115
116 def main():
117 pass
118
119
120 if __name__ == "__main__":
121 main()