comparison profrep_refining.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a5f1638b73be
1 #!/usr/bin/env python3
2
3 import argparse
4 import numpy as np
5 import os
6 from tempfile import NamedTemporaryFile
7 from collections import defaultdict
8 import configuration
9 import sys
10
11
12 def str2bool(v):
13 if v.lower() in ('yes', 'true', 't', 'y', '1'):
14 return True
15 elif v.lower() in ('no', 'false', 'f', 'n', '0'):
16 return False
17 else:
18 raise argparse.ArgumentTypeError('Boolean value expected')
19
20
21 def check_dom_gff(DOM_GFF):
22 ''' Check if the GFF file of protein domains was given '''
23 with open(DOM_GFF) as domains_f:
24 line = domains_f.readline()
25 while line.startswith("#"):
26 line = domains_f.readline()
27 if len(line.split("\t")) == 9 and "Final_Classification" in line:
28 pass
29 else:
30 raise IOError(
31 "There was detected an input GFF file that does not contain domains. Please check it and choose the domains GFF file")
32
33
34 def create_dom_dict(DOM_GFF):
35 ''' Create hash table of protein domains for individual sequences '''
36 check_dom_gff(DOM_GFF)
37 dict_domains = defaultdict(list)
38 count_comment = check_file_start(DOM_GFF)
39 with open(DOM_GFF, "r") as domains:
40 for comment_idx in range(count_comment):
41 next(domains)
42 for line in domains:
43 seq_id = line.split("\t")[0]
44 ann_dom_lineage = line.split("\t")[8].split(";")[1].split("=")[-1]
45 start_dom = int(line.split("\t")[3])
46 end_dom = int(line.split("\t")[4])
47 strand_dom = line.split("\t")[6]
48 dict_domains[seq_id].append((start_dom, end_dom, ann_dom_lineage,
49 strand_dom))
50 for seq_id in dict_domains.keys():
51 dict_domains[seq_id] = sorted(dict_domains[seq_id], key=lambda x: x[0])
52 return dict_domains
53
54
55 def check_file_start(gff_file):
56 count_comment = 0
57 with open(gff_file, "r") as gff_all:
58 line = gff_all.readline()
59 while line.startswith("#"):
60 line = gff_all.readline()
61 count_comment += 1
62 return count_comment
63
64
65 def interconnect_intervals(OUT_REFINED, gff_removed, GAP_TH, DOM_NUM,
66 dict_domains, domains_classes):
67 ''' Second step of refining - INTERVALS INTERCONNECTING:
68 Gradually checking regions of GFF which are sorted by starting point.
69 Adding regions to be interconnected if the gap between the new and previous one is lower than threshold and theirclassification is the same as the one set by the first region.
70 If the conditions are not fullfilled the adding is stopped and this whole expanded region is further evaluated:
71 1. if domains are not included in joining (A. choosed as parameter or B. based on the classification the element does not belong to mobile elements):
72 -> the fragments are joined reported as one region in refined gff
73 2. if domains should be included:
74 -> the fragments are joined if domains within the expanded region meets the criteria:
75 1. they are at least certain number of domains
76 2. they have equal strand orientation
77 3. they are classified to the last classification level (checked from the class. table of the database) and this matches the classification of the expanded region
78 -> otherwise the region is refragmented to previous parts, which are reported as they were in the original repeats gff
79 '''
80 with open(OUT_REFINED, "w") as joined_intervals:
81 start_line = check_file_start(gff_removed)
82 with open(gff_removed, "r") as repeats:
83 for comment_idx in range(start_line):
84 joined_intervals.write(repeats.readline())
85 joined = False
86 initial = repeats.readline()
87 ## if there are repeats in GFF, initialize
88 if initial is not "":
89 seq_id_ini = initial.rstrip().split("\t")[0]
90 start_ini = int(initial.rstrip().split("\t")[3])
91 end_ini = int(initial.rstrip().split("\t")[4])
92 ann_ini = initial.rstrip().split("\t")[8].split(";")[0].split(
93 "=")[-1]
94 starts_part = [start_ini]
95 ends_part = [end_ini]
96 for line in repeats:
97 seq_id = line.rstrip().split("\t")[0]
98 start = int(line.rstrip().split("\t")[3])
99 end = int(line.rstrip().split("\t")[4])
100 ann = line.rstrip().split("\t")[8].split(";")[0].split(
101 "=")[-1]
102 ## if new sequence is detected the joining process is reset
103 if seq_id != seq_id_ini:
104 ## handle the last expanded region from the previous sequence
105 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
106 # check the domains
107 dict_domains = dom_refining(
108 joined_intervals, dict_domains, seq_id_ini,
109 start_ini, end_ini, ann_ini, starts_part,
110 ends_part, DOM_NUM, domains_classes)
111 else:
112 # write the joined region if domains should not be considered
113 joined_intervals.write(
114 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
115 seq_id_ini, configuration.SOURCE_PROFREP,
116 configuration.REPEATS_FEATURE, start_ini,
117 end_ini, configuration.GFF_EMPTY,
118 configuration.GFF_EMPTY,
119 configuration.GFF_EMPTY, ann_ini))
120 # initialize the search for expanded intervals
121 starts_part = [start]
122 ends_part = [end]
123 joined = False
124 seq_id_ini = seq_id
125 start_ini = start
126 end_ini = end
127 ann_ini = ann
128 ## prolonging the potential region to be expanded
129 elif start - end_ini <= GAP_TH and ann == ann_ini:
130 starts_part.append(start)
131 ends_part.append(end)
132 end_ini = end
133 joined = True
134 ## end of expanding the interval
135 else:
136 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
137 dict_domains = dom_refining(
138 joined_intervals, dict_domains, seq_id,
139 start_ini, end_ini, ann_ini, starts_part,
140 ends_part, DOM_NUM, domains_classes)
141 else:
142 joined_intervals.write(
143 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
144 seq_id, configuration.SOURCE_PROFREP,
145 configuration.REPEATS_FEATURE, start_ini,
146 end_ini, configuration.GFF_EMPTY,
147 configuration.GFF_EMPTY,
148 configuration.GFF_EMPTY, ann_ini))
149 starts_part = [start]
150 ends_part = [end]
151 # after handling the expanded interval, set the "joined" indicator to be false, so the next region expanding and potential joining can begin
152 joined = False
153 start_ini = start
154 end_ini = end
155 ann_ini = ann
156 ## handle the last potentially expanded region
157 if dict_domains and joined is True and configuration.WITH_DOMAINS in ann_ini:
158 # check the domains
159 dict_domains = dom_refining(
160 joined_intervals, dict_domains, seq_id, start_ini,
161 end_ini, ann_ini, starts_part, ends_part, DOM_NUM,
162 domains_classes)
163 else:
164 # write the joined region
165 joined_intervals.write(
166 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
167 seq_id, configuration.SOURCE_PROFREP,
168 configuration.REPEATS_FEATURE, start_ini, end_ini,
169 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
170 configuration.GFF_EMPTY, ann_ini))
171 del (starts_part)
172 del (ends_part)
173
174
175 def cluster_regions_for_quality_check(REPEATS_GFF, BORDERS):
176 ''' First step of refining - REMOVING LOW CONFIDENCE REPEATS REGIONS
177 Create clusters of overlapping regions from REPEATS_GFF.
178 These clusters subsequently serves for removing nested regions of different classification which have significantly lower quality comparing to the region they are nested in.
179 Overhang tolerance of couple of bases on the both sides is tolerated when evaluting the nested regions.
180 '''
181 start_line = check_file_start(REPEATS_GFF)
182 cluster = []
183 end_cluster = 0
184 seq_id_ini = None
185 gff_removed_parts = NamedTemporaryFile(delete=False)
186 with open(REPEATS_GFF, "r") as repeats:
187 for comment_idx in range(start_line):
188 gff_removed_parts.write(repeats.readline().encode("utf-8"))
189 for interval in repeats:
190 seq_id = interval.split("\t")[0]
191 start = int(interval.split("\t")[3])
192 end = int(interval.split("\t")[4])
193 ann = interval.split("\t")[8].split(";")[0].split("=")[-1]
194 pid = int(interval.rstrip().split("\t")[8].split(";")[1].split(
195 "=")[-1])
196 if seq_id != seq_id_ini:
197 if len(cluster) > 1:
198 remove_low_qual(cluster, BORDERS, gff_removed_parts)
199 elif cluster:
200 gff_removed_parts.write(
201 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
202 cluster[0][0], configuration.SOURCE_PROFREP,
203 configuration.REPEATS_FEATURE, cluster[0][
204 1], cluster[0][2], configuration.GFF_EMPTY,
205 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
206 cluster[0][3], cluster[0][4]).encode("utf-8"))
207 seq_id_ini = seq_id
208 cluster = []
209 ## expanding the cluster
210 if start <= end_cluster or end_cluster == 0:
211 cluster.append([seq_id, start, end, ann, pid])
212 if end > end_cluster:
213 end_cluster = end
214 ## if the next region is not overlapping, evaluate the regions in cluster or write down the single region
215 else:
216 if len(cluster) > 1:
217 remove_low_qual(cluster, BORDERS, gff_removed_parts)
218 elif cluster:
219 gff_removed_parts.write(
220 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
221 cluster[0][0], configuration.SOURCE_PROFREP,
222 configuration.REPEATS_FEATURE, cluster[0][
223 1], cluster[0][2], configuration.GFF_EMPTY,
224 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
225 cluster[0][3], cluster[0][4]).encode("utf-8"))
226 end_cluster = end
227 cluster = [[seq_id, start, end, ann, pid]]
228 if cluster:
229 remove_low_qual(cluster, BORDERS, gff_removed_parts)
230 gff_removed_parts.close()
231 return gff_removed_parts.name
232
233
234 def remove_low_qual(cluster, BORDERS, gff_removed_parts):
235 ''' Loop over regions in the cluster which are sorted based on descending quality (PID).
236 Regions nested in the currently checked region (with some exceeding borders tolerance) are tested for quality.
237 If the quality difference between the currently highest quality and the nested region is more than a threshold nested one will be removed.
238 Already checked region is removed from the list and procedure continues with the next best quality region in the cluster.
239 '''
240 score_sorted = sorted(cluster, key=lambda x: int(x[4]), reverse=True)
241 for interval in score_sorted:
242 start_toler = interval[1] - BORDERS
243 end_toler = interval[2] + BORDERS
244 score_best = interval[4]
245 ## difference
246 for item in score_sorted[:]:
247 if item[1] >= start_toler and item[2] <= end_toler:
248 if item[4] <= (1 - configuration.QUALITY_DIFF_TO_REMOVE
249 ) * score_best: ### 5% difference tolerance
250 score_sorted.remove(item)
251 position_sorted = sorted(score_sorted, key=lambda x: int(x[1]))
252 for interval_refined in position_sorted:
253 gff_removed_parts.write(
254 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={};Average_PID={}\n".format(
255 interval_refined[0], configuration.SOURCE_PROFREP,
256 configuration.REPEATS_FEATURE, interval_refined[
257 1], interval_refined[2], configuration.GFF_EMPTY,
258 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
259 interval_refined[3], interval_refined[4]).encode("utf-8"))
260
261
262 def dom_refining(joined_intervals, dict_domains, seq_id, start_ini, end_ini,
263 ann_ini, starts_part, ends_part, DOM_NUM, domains_classes):
264 ''' Check the protein domains within the potential interval to be joined:
265 1. appropriate domain has the same classification as interval of repeats to be joined and is classified to the last level
266 2. strands of appropriate domains are uniform
267 3. the count of such appropriate domains is more than a threshold
268 '''
269 count_dom = 0
270 index_dom = 0
271 strands = []
272 indices_del = []
273 for dom_attributes in dict_domains[seq_id]:
274 ann_dom = dom_attributes[2]
275 if dom_attributes[0] >= start_ini and dom_attributes[1] <= end_ini:
276 repeat_class = "|".join(ann_ini.split("|")[2:])
277 if ann_dom in domains_classes and ann_dom == repeat_class:
278 strands.append(dom_attributes[3])
279 count_dom += 1
280 index_dom += 1
281 ## if criteria for domains are met, write the expanded interval
282 if len(set(strands)) <= 1 and count_dom >= DOM_NUM:
283 joined_intervals.write(
284 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
285 seq_id, configuration.SOURCE_PROFREP, configuration.
286 REPEATS_FEATURE, start_ini, end_ini, configuration.GFF_EMPTY,
287 configuration.GFF_EMPTY, configuration.GFF_EMPTY, ann_ini))
288 ## if criteria are not met, refragment the expanded interval on original regions
289 else:
290 for part_start, part_end in zip(starts_part, ends_part):
291 joined_intervals.write(
292 "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\tName={}\n".format(
293 seq_id, configuration.SOURCE_PROFREP,
294 configuration.REPEATS_FEATURE, part_start, part_end,
295 configuration.GFF_EMPTY, configuration.GFF_EMPTY,
296 configuration.GFF_EMPTY, ann_ini))
297 ## delete already checked domains from the dict
298 del (dict_domains[seq_id][0:index_dom])
299 return dict_domains
300
301
302 def get_unique_classes(CLASS_TBL):
303 ''' Get all the lineages of current domains database classification table to subsequently check the protein domains if they are classified up to the last level.
304 Only these domains will be considered as valid for interval joining.
305 If their classification is be finite (based on comparing to this list of unique classes) they will not be counted for minimum number of domains criterion within the segment to be joined
306 '''
307 unique_classes = []
308 with open(CLASS_TBL, "r") as class_tbl:
309 for line in class_tbl:
310 line_class = "|".join(line.rstrip().split("\t")[1:])
311 if line_class not in unique_classes:
312 unique_classes.append(line_class)
313 return unique_classes
314
315
316 def main(args):
317 # Command line arguments
318 REPEATS_GFF = args.repeats_gff
319 DOM_GFF = args.domains_gff
320 GAP_TH = args.gap_threshold
321 DOM_NUM = args.dom_number
322 OUT_REFINED = args.out_refined
323 INCLUDE_DOM = args.include_dom
324 CLASS_TBL = args.class_tbl
325 BORDERS = args.borders
326
327 # first step of refining - removing low confidence repeats regions
328 gff_removed = cluster_regions_for_quality_check(REPEATS_GFF, BORDERS)
329
330 # second step of refining - interconnecting repeats regions
331 if INCLUDE_DOM:
332 unique_classes = get_unique_classes(CLASS_TBL)
333 dict_domains = create_dom_dict(DOM_GFF)
334 joined_intervals = interconnect_intervals(
335 OUT_REFINED, gff_removed, GAP_TH, DOM_NUM, dict_domains,
336 unique_classes)
337 else:
338 joined_intervals = interconnect_intervals(OUT_REFINED, gff_removed,
339 GAP_TH, DOM_NUM, None, None)
340 os.unlink(gff_removed)
341
342
343 if __name__ == "__main__":
344
345 # Command line arguments
346 parser = argparse.ArgumentParser()
347 parser.add_argument('-rep_gff',
348 '--repeats_gff',
349 type=str,
350 required=True,
351 help='original repeats regions GFF from PROFREP')
352 parser.add_argument(
353 '-dom_gff',
354 '--domains_gff',
355 type=str,
356 help=
357 'protein domains GFF if you want to support repeats joining by domains information')
358 parser.add_argument(
359 '-gth',
360 '--gap_threshold',
361 type=int,
362 default=250,
363 help='gap tolerance between consecutive repeats to be interconnected')
364 parser.add_argument('-our',
365 '--out_refined',
366 type=str,
367 default="output_refined.gff",
368 help='query sequence to be processed')
369 parser.add_argument(
370 '-dn',
371 '--dom_number',
372 type=int,
373 default=2,
374 help='min number of domains present to confirm the region joining')
375 parser.add_argument(
376 '-id',
377 '--include_dom',
378 type=str2bool,
379 default=False,
380 help='include domains information to refine the repeats regions')
381 parser.add_argument(
382 '-ct',
383 '--class_tbl',
384 help=
385 'classification table of protein domain database to check the level of classification')
386 parser.add_argument(
387 '-br',
388 '--borders',
389 type=int,
390 default=10,
391 help=
392 'number of bp tolerated from one or the other side of two overlaping regions when evaluating quality')
393 args = parser.parse_args()
394 main(args)