profrep_refining.py @ 0:a5f1638b73be draft (Mercurial repository petr-novak/profrep)

commit message: Uploaded
author:         petr-novak
date:           Wed, 26 Jun 2019 08:01:42 -0400
#!/usr/bin/env python3

import argparse
import numpy as np
import os
from tempfile import NamedTemporaryFile
from collections import defaultdict
import configuration
import sys


def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected')


def check_dom_gff(DOM_GFF):
    ''' Check that the given GFF file contains protein domains '''
    with open(DOM_GFF) as domains_f:
        line = domains_f.readline()
        while line.startswith("#"):
            line = domains_f.readline()
        if not (len(line.split("\t")) == 9 and "Final_Classification" in line):
            raise IOError(
                "The input GFF file does not appear to contain protein domains. "
                "Please check it and provide the protein domains GFF file")


def create_dom_dict(DOM_GFF):
    ''' Create a hash table of protein domains for individual sequences '''
    check_dom_gff(DOM_GFF)
    dict_domains = defaultdict(list)
    count_comment = check_file_start(DOM_GFF)
    with open(DOM_GFF, "r") as domains:
        for comment_idx in range(count_comment):
            next(domains)
        for line in domains:
            # parse one GFF line: column 1 sequence ID, columns 4-5 coordinates,
            # column 7 strand, column 9 attributes with the classification lineage
            fields = line.rstrip().split("\t")
            seq_id = fields[0]
            ann_dom_lineage = fields[8].split(";")[1].split("=")[-1]
            start_dom = int(fields[3])
            end_dom = int(fields[4])
            strand_dom = fields[6]
            dict_domains[seq_id].append((start_dom, end_dom, ann_dom_lineage,
                                         strand_dom))
    for seq_id in dict_domains.keys():
        dict_domains[seq_id] = sorted(dict_domains[seq_id], key=lambda x: x[0])
    return dict_domains
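
# Illustrative note (a sketch, not part of the original tool): create_dom_dict()
# expects GFF-like domain lines whose 9th column holds ";"-separated attributes
# with the final classification as the second attribute, e.g. (hypothetical values):
#
#   seq1  db  protein_domain  1200  1890  .  +  .  Name=RT;Final_Classification=Class_I|LTR|Ty1/copia
#
# For such a line the dictionary would store, under key "seq1", the tuple
# (1200, 1890, "Class_I|LTR|Ty1/copia", "+"); the entries of each sequence are
# kept sorted by their start coordinate.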


def check_file_start(gff_file):
    count_comment = 0
    with open(gff_file, "r") as gff_all:
        line = gff_all.readline()
        while line.startswith("#"):
            line = gff_all.readline()
            count_comment += 1
    return count_comment


def interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM,
                           dict_domains, domains_classes):
    ''' Second step of refining - INTERVAL INTERCONNECTING:
    Gradually check the regions of the GFF, which are sorted by starting position.
    A region is added to the interval being interconnected if the gap between it and the previous region is below the threshold and its classification matches the one set by the first region.
    If these conditions are not fulfilled, the adding stops and the whole expanded region is further evaluated:
    1. if domains are not included in the joining (A. chosen as a parameter, or B. the element does not belong to mobile elements based on its classification):
        -> the fragments are joined and reported as one region in the refined GFF
    2. if domains should be included:
        -> the fragments are joined if the domains within the expanded region meet the criteria:
            1. there is at least a certain number of domains
            2. they have equal strand orientation
            3. they are classified to the last classification level (checked against the classification table of the database) and this matches the classification of the expanded region
        -> otherwise the region is refragmented into the previous parts, which are reported as they were in the original repeats GFF
    '''
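    # Worked sketch of the joining rule above (coordinates are hypothetical):
    # with GAP_TH = 250, two consecutive regions with the same annotation, e.g.
    # seq1:1000-1500 and seq1:1700-2300 (gap 1700 - 1500 = 200 <= 250), are
    # merged into a candidate interval seq1:1000-2300; a third region starting
    # at 2900 (gap 600 > 250) closes the candidate, which is then either written
    # as one joined region or, when domain evidence is required but missing,
    # refragmented back into the original pieces.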
    with open(OUT_REFINED, "w") as joined_intervals:
        start_line = check_file_start(gff_removed)
        with open(gff_removed, "r") as repeats:
            for comment_idx in range(start_line):
                joined_intervals.write(repeats.readline())
            joined = False
            initial = repeats.readline()
            ## if there are repeats in the GFF, initialize
            if initial != "":
                seq_id_ini = initial.rstrip().split("\t")[0]
                start_ini = int(initial.rstrip().split("\t")[3])
                end_ini = int(initial.rstrip().split("\t")[4])
                ann_ini = initial.rstrip().split("\t")[8].split(";")[0].split(
                    "=")[-1]
                starts_part = [start_ini]
                ends_part = [end_ini]
                for line in repeats:
                    seq_id = line.rstrip().split("\t")[0]
                    start = int(line.rstrip().split("\t")[3])
                    end = int(line.rstrip().split("\t")[4])
                    ann = line.rstrip().split("\t")[8].split(";")[0].split(
                        "=")[-1]
                    ## if a new sequence is detected, the joining process is reset
                    if seq_id != seq_id_ini:
                        ## handle the last expanded region of the previous sequence
                        if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                            # check the domains
                            dict_domains = dom_refining(
                                joined_intervals, dict_domains, seq_id_ini,
                                start_ini, end_ini, ann_ini, starts_part,
                                ends_part, DOM_NUM, domains_classes)
                        else:
                            # write the joined region if domains should not be considered
                            joined_intervals.write(
                                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                                    seq_id_ini, configuration.SOURCE_PROFREP,
                                    configuration.REPEATS_FEATURE, start_ini,
                                    end_ini, configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY, ann_ini))
                        # initialize the search for expanded intervals
                        starts_part = [start]
                        ends_part = [end]
                        joined = False
                        seq_id_ini = seq_id
                        start_ini = start
                        end_ini = end
                        ann_ini = ann
                    ## prolong the potential region to be expanded
                    elif start - end_ini <= GAP_TH and ann == ann_ini:
                        starts_part.append(start)
                        ends_part.append(end)
                        end_ini = end
                        joined = True
                    ## end of expanding the interval
                    else:
                        if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                            dict_domains = dom_refining(
                                joined_intervals, dict_domains, seq_id,
                                start_ini, end_ini, ann_ini, starts_part,
                                ends_part, DOM_NUM, domains_classes)
                        else:
                            joined_intervals.write(
                                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                                    seq_id, configuration.SOURCE_PROFREP,
                                    configuration.REPEATS_FEATURE, start_ini,
                                    end_ini, configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY,
                                    configuration.GFF_EMPTY, ann_ini))
                        starts_part = [start]
                        ends_part = [end]
                        # after handling the expanded interval, set the "joined" flag back to False so the next round of expanding and potential joining can begin
                        joined = False
                        start_ini = start
                        end_ini = end
                        ann_ini = ann
                ## handle the last potentially expanded region
                if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
                    # check the domains
                    dict_domains = dom_refining(
                        joined_intervals, dict_domains, seq_id_ini, start_ini,
                        end_ini, ann_ini, starts_part, ends_part, DOM_NUM,
                        domains_classes)
                else:
                    # write the joined region
                    joined_intervals.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                            seq_id_ini, configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, start_ini, end_ini,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, ann_ini))
                del starts_part
                del ends_part


def cluster_regions_for_quality_check(REPEATS_GFF, BORDERS):
    ''' First step of refining - REMOVING LOW-CONFIDENCE REPEAT REGIONS
    Create clusters of overlapping regions from REPEATS_GFF.
    These clusters subsequently serve for removing nested regions of a different classification that have a significantly lower quality than the region they are nested in.
    An overhang of a couple of bases on both sides is tolerated when evaluating the nested regions.
    '''
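    # Illustrative sketch (hypothetical coordinates): regions seq1:100-900 and
    # seq1:300-700 overlap, so they fall into one cluster and remove_low_qual()
    # decides whether the nested 300-700 region is kept; a following region
    # seq1:1500-2000 does not overlap the cluster and therefore starts a new one.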
    start_line = check_file_start(REPEATS_GFF)
    cluster = []
    end_cluster = 0
    seq_id_ini = None
    gff_removed_parts = NamedTemporaryFile(delete=False)
    with open(REPEATS_GFF, "r") as repeats:
        for comment_idx in range(start_line):
            gff_removed_parts.write(repeats.readline().encode("utf-8"))
        for interval in repeats:
            seq_id = interval.split("\t")[0]
            start = int(interval.split("\t")[3])
            end = int(interval.split("\t")[4])
            ann = interval.split("\t")[8].split(";")[0].split("=")[-1]
            pid = int(interval.rstrip().split("\t")[8].split(";")[1].split(
                "=")[-1])
            if seq_id != seq_id_ini:
                ## a new sequence starts - evaluate or write out the cluster collected so far
                if len(cluster) > 1:
                    remove_low_qual(cluster, BORDERS, gff_removed_parts)
                elif cluster:
                    gff_removed_parts.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                            cluster[0][0], configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, cluster[0][1],
                            cluster[0][2], configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            cluster[0][3], cluster[0][4]).encode("utf-8"))
                seq_id_ini = seq_id
                cluster = []
            ## expanding the cluster
            if start <= end_cluster or end_cluster == 0:
                cluster.append([seq_id, start, end, ann, pid])
                if end > end_cluster:
                    end_cluster = end
            ## if the next region does not overlap, evaluate the regions in the cluster or write down the single region
            else:
                if len(cluster) > 1:
                    remove_low_qual(cluster, BORDERS, gff_removed_parts)
                elif cluster:
                    gff_removed_parts.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                            cluster[0][0], configuration.SOURCE_PROFREP,
                            configuration.REPEATS_FEATURE, cluster[0][1],
                            cluster[0][2], configuration.GFF_EMPTY,
                            configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                            cluster[0][3], cluster[0][4]).encode("utf-8"))
                end_cluster = end
                cluster = [[seq_id, start, end, ann, pid]]
    if cluster:
        remove_low_qual(cluster, BORDERS, gff_removed_parts)
    gff_removed_parts.close()
    return gff_removed_parts.name


def remove_low_qual(cluster, BORDERS, gff_removed_parts):
    ''' Loop over the regions in the cluster, sorted by descending quality (PID).
    Regions nested in the currently checked region (allowing its borders to be exceeded by a small tolerance) are tested for quality.
    If the quality difference between the currently best region and the nested one exceeds a threshold, the nested region is removed.
    The procedure then continues with the next best-quality region remaining in the cluster.
    '''
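    # Worked sketch with the 5% tolerance applied below (PID values are
    # hypothetical): if the best region of a cluster has Average_PID 95, a
    # region nested within its borders (+/- BORDERS bp) is removed only when
    # its PID is <= 0.95 * 95 = 90.25; a nested region with PID 92 is kept,
    # one with PID 88 is dropped.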
    score_sorted = sorted(cluster, key=lambda x: int(x[4]), reverse=True)
    for interval in score_sorted:
        start_toler = interval[1] - BORDERS
        end_toler = interval[2] + BORDERS
        score_best = interval[4]
        ## remove nested regions whose quality differs too much from the best one
        for item in score_sorted[:]:
            if item[1] >= start_toler and item[2] <= end_toler:
                if item[4] <= (1 - configuration.QUALITY_DIFF_TO_REMOVE
                               ) * score_best:  ### 5% difference tolerance
                    score_sorted.remove(item)
    position_sorted = sorted(score_sorted, key=lambda x: int(x[1]))
    for interval_refined in position_sorted:
        gff_removed_parts.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
                interval_refined[0], configuration.SOURCE_PROFREP,
                configuration.REPEATS_FEATURE, interval_refined[1],
                interval_refined[2], configuration.GFF_EMPTY,
                configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                interval_refined[3], interval_refined[4]).encode("utf-8"))


def dom_refining(joined_intervals, dict_domains, seq_id, start_ini, end_ini,
                 ann_ini, starts_part, ends_part, DOM_NUM, domains_classes):
    ''' Check the protein domains within the potential interval to be joined:
    1. a suitable domain has the same classification as the repeat interval to be joined and is classified to the last level
    2. the strands of the suitable domains are uniform
    3. the count of such suitable domains reaches the DOM_NUM threshold
    '''
    count_dom = 0
    index_dom = 0
    strands = []
    for dom_attributes in dict_domains[seq_id]:
        ann_dom = dom_attributes[2]
        if dom_attributes[0] >= start_ini and dom_attributes[1] <= end_ini:
            # the domain lies within the evaluated interval
            repeat_class = "|".join(ann_ini.split("|")[2:])
            if ann_dom in domains_classes and ann_dom == repeat_class:
                strands.append(dom_attributes[3])
                count_dom += 1
            index_dom += 1
    ## if the criteria for domains are met, write the expanded interval
    if len(set(strands)) <= 1 and count_dom >= DOM_NUM:
        joined_intervals.write(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                seq_id, configuration.SOURCE_PROFREP,
                configuration.REPEATS_FEATURE, start_ini, end_ini,
                configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                configuration.GFF_EMPTY, ann_ini))
    ## if the criteria are not met, refragment the expanded interval into the original regions
    else:
        for part_start, part_end in zip(starts_part, ends_part):
            joined_intervals.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
                    seq_id, configuration.SOURCE_PROFREP,
                    configuration.REPEATS_FEATURE, part_start, part_end,
                    configuration.GFF_EMPTY, configuration.GFF_EMPTY,
                    configuration.GFF_EMPTY, ann_ini))
    ## delete the already checked domains from the dict
    del dict_domains[seq_id][0:index_dom]
    return dict_domains


def get_unique_classes(CLASS_TBL):
    ''' Get all lineages from the classification table of the current domain database, to subsequently check whether the protein domains are classified down to the last level.
    Only such domains are considered valid for interval joining.
    If a domain's classification is not complete (based on comparison with this list of unique classes), it is not counted towards the minimum-number-of-domains criterion within the segment to be joined.
    '''
    unique_classes = []
    with open(CLASS_TBL, "r") as class_tbl:
        for line in class_tbl:
            line_class = "|".join(line.rstrip().split("\t")[1:])
            if line_class not in unique_classes:
                unique_classes.append(line_class)
    return unique_classes
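
# Illustrative sketch (hypothetical table content): a classification table row
# such as "RT\tClass_I\tLTR\tTy1/copia" is turned by get_unique_classes() into
# the lineage string "Class_I|LTR|Ty1/copia" (the first column is dropped), and
# only domains annotated with such a complete lineage are counted in
# dom_refining().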


def main(args):
    # Command line arguments
    REPEATS_GFF = args.repeats_gff
    DOM_GFF = args.domains_gff
    GAP_TH = args.gap_threshold
    DOM_NUM = args.dom_number
    OUT_REFINED = args.out_refined
    INCLUDE_DOM = args.include_dom
    CLASS_TBL = args.class_tbl
    BORDERS = args.borders

    # first step of refining - removing low-confidence repeat regions
    gff_removed = cluster_regions_for_quality_check(REPEATS_GFF, BORDERS)

    # second step of refining - interconnecting repeat regions
    if INCLUDE_DOM:
        unique_classes = get_unique_classes(CLASS_TBL)
        dict_domains = create_dom_dict(DOM_GFF)
        interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM,
                               dict_domains, unique_classes)
    else:
        interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM, None,
                               None)
    os.unlink(gff_removed)


if __name__ == "__main__":

    # Command line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-rep_gff',
                        '--repeats_gff',
                        type=str,
                        required=True,
                        help='original repeat regions GFF from PROFREP')
    parser.add_argument(
        '-dom_gff',
        '--domains_gff',
        type=str,
        help='protein domains GFF, if you want to support repeat joining with domain information')
    parser.add_argument(
        '-gth',
        '--gap_threshold',
        type=int,
        default=250,
        help='gap tolerance between consecutive repeats to be interconnected')
    parser.add_argument('-our',
                        '--out_refined',
                        type=str,
                        default="output_refined.gff",
                        help='path of the output refined GFF')
    parser.add_argument(
        '-dn',
        '--dom_number',
        type=int,
        default=2,
        help='minimum number of domains present to confirm the region joining')
    parser.add_argument(
        '-id',
        '--include_dom',
        type=str2bool,
        default=False,
        help='include domain information to refine the repeat regions')
    parser.add_argument(
        '-ct',
        '--class_tbl',
        help='classification table of the protein domain database, used to check the level of classification')
    parser.add_argument(
        '-br',
        '--borders',
        type=int,
        default=10,
        help='number of bp tolerated on either side of two overlapping regions when evaluating quality')
    args = parser.parse_args()
    main(args)
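
# Example invocation (a sketch; the file names are placeholders, the options are
# those defined above):
#
#   python3 profrep_refining.py -rep_gff profrep_repeats.gff \
#       -dom_gff protein_domains.gff -ct classification_table.txt \
#       -id True -gth 250 -dn 2 -br 10 -our output_refined.gff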