Mercurial > repos > petr-novak > profrep
comparison gff.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a5f1638b73be |
---|---|
1 #!/usr/bin/env python3 | |
2 """ sequence repetitive profile conversion to GFF3 format """ | |
3 | |
4 import time | |
5 import configuration | |
6 from operator import itemgetter | |
7 from itertools import groupby | |
8 from tempfile import NamedTemporaryFile | |
9 import os | |
10 | |
11 | |
12 def create_gff(THRESHOLD, THRESHOLD_SEGMENT, OUTPUT_GFF, files_dict, headers): | |
13 seq_id = None | |
14 exclude = set(['ALL']) | |
15 gff_tmp_list = [] | |
16 for repeat in sorted(set(files_dict.keys()).difference(exclude)): | |
17 gff_tmp = NamedTemporaryFile(delete=False) | |
18 pos_seq_dict = {} | |
19 with open(files_dict[repeat][0], "r") as repeat_f, open( | |
20 files_dict[repeat][2], "r") as quality_f: | |
21 for line_r, line_q in zip(repeat_f, quality_f): | |
22 if "chrom" in line_r: | |
23 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp, | |
24 configuration.REPEATS_FEATURE, repeat, | |
25 pos_seq_dict) | |
26 seq_id = line_r.rstrip().split("chrom=")[1] | |
27 pos_seq_dict = {} | |
28 else: | |
29 hits = int(line_r.rstrip().split("\t")[1]) | |
30 if hits >= THRESHOLD: | |
31 position = int(line_r.rstrip().split("\t")[0]) | |
32 pos_seq_dict[position] = line_q.rstrip().split("\t")[1] | |
33 idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_tmp, | |
34 configuration.REPEATS_FEATURE, repeat, pos_seq_dict) | |
35 gff_tmp_list.append(gff_tmp.name) | |
36 gff_tmp.close() | |
37 sort_records(gff_tmp_list, headers, OUTPUT_GFF) | |
38 for tmp in gff_tmp_list: | |
39 os.unlink(tmp) | |
40 | |
41 | |
42 def idx_ranges(THRESHOLD_SEGMENT, seq_id, gff_file, feature, repeat, | |
43 pos_seq_dict): | |
44 indices = sorted(pos_seq_dict.keys(), key=int) | |
45 for key, group in groupby( | |
46 enumerate(indices), | |
47 lambda index_item: index_item[0] - index_item[1]): | |
48 group = list(map(itemgetter(1), group)) | |
49 if len(group) > THRESHOLD_SEGMENT: | |
50 sum_qual = 0 | |
51 for position in group: | |
52 sum_qual += int(pos_seq_dict[position]) | |
53 qual_per_reg = sum_qual / len(group) | |
54 # Take boundaries of the group vectors | |
55 gff_file.write( | |
56 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format( | |
57 seq_id, configuration.SOURCE_PROFREP, feature, group[ | |
58 0], group[-1], configuration.GFF_EMPTY, | |
59 configuration.GFF_EMPTY, configuration.GFF_EMPTY, repeat, | |
60 round(qual_per_reg)).encode("utf-8")) | |
61 | |
62 | |
63 def idx_ranges_N(indices, THRESHOLD_SEGMENT, seq_id, gff_file, feature, | |
64 att_name): | |
65 for key, group in groupby( | |
66 enumerate(indices), | |
67 lambda index_item: index_item[0] - index_item[1]): | |
68 group = list(map(itemgetter(1), group)) | |
69 if len(group) > THRESHOLD_SEGMENT: | |
70 # Take boundaries of the group vectors | |
71 gff_file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format( | |
72 seq_id, configuration.SOURCE_PROFREP, feature, group[0], group[ | |
73 -1], configuration.GFF_EMPTY, configuration.GFF_EMPTY, | |
74 configuration.GFF_EMPTY, att_name)) | |
75 | |
76 | |
77 def sort_records(gff_tmp_list, headers, OUTPUT_GFF): | |
78 opened_files = [open(i, "r") for i in gff_tmp_list] | |
79 files_lines = dict((key, "") for key in opened_files) | |
80 count_without_line = 0 | |
81 count_seq = 0 | |
82 #################################################################### | |
83 present_seqs = headers | |
84 #################################################################### | |
85 with open(OUTPUT_GFF, "w") as final_gff: | |
86 final_gff.write("{}\n".format(configuration.HEADER_GFF)) | |
87 while True: | |
88 for file_name in opened_files: | |
89 if not files_lines[file_name]: | |
90 line = file_name.readline() | |
91 if line: | |
92 files_lines[file_name] = line | |
93 else: | |
94 count_without_line += 1 | |
95 if count_without_line == len(opened_files): | |
96 break | |
97 count_without_line = 0 | |
98 count = 0 | |
99 lowest_pos = float("inf") | |
100 for file_key in files_lines.keys(): | |
101 if files_lines[file_key].split("\t")[0] == present_seqs[ | |
102 count_seq]: | |
103 count += 1 | |
104 start_pos = int(files_lines[file_key].split("\t")[3]) | |
105 if start_pos < lowest_pos: | |
106 lowest_pos = start_pos | |
107 record_to_write = files_lines[file_key] | |
108 lowest_file_key = file_key | |
109 if count == 0: | |
110 count_seq += 1 | |
111 else: | |
112 final_gff.write(record_to_write) | |
113 files_lines[lowest_file_key] = "" | |
114 | |
115 | |
116 def main(): | |
117 pass | |
118 | |
119 | |
120 if __name__ == "__main__": | |
121 main() |