comparison RaGOO/ragoo_utilities/ContigAlignment.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 import operator
2 from collections import defaultdict
3
4 from ragoo_utilities.utilities import summarize_planesweep, binary_search
5
6
7 class ContigAlignment:
8 """
9 This object will represent a contig's alignment to a reference chromosome.
10 Specifically, this class will organize alignments in a one-to-many fashion, where
11 we will only have one object per contig, and each object will store all of the alignments
12 of that contig.
13
14
15 """
16
17 def __init__(self, in_contig):
18 """
19 Each object will refer to a single contig via its header name.
20
21 All other alignment metrics will be stored in lists, where the list offsets
22 correspond to the alignment number.
23
24 Don't actually add any alignments through object instantiation. Only use the add_alignment method.
25 """
26 self.contig = in_contig
27
28 self.query_lens = []
29 self.query_starts = []
30 self.query_ends = []
31 self.strands = []
32 self.ref_headers = []
33 self.ref_lens = []
34 self.ref_starts = []
35 self.ref_ends = []
36 self.num_matches = []
37 self.aln_lens = []
38 self.mapqs = []
39
40 self._is_unique_anchor_filtered = False
41 self._is_merged = False
42
43 def __repr__(self):
44 return '<ContigAlignment' + self.contig + '>'
45
46 def __str__(self):
47 """ Return the alignments in sorted PAF format. """
48 self.sort_by_query()
49 all_alns = []
50 for i in range(len(self.ref_headers)):
51 all_alns.append(
52 "\t".join([
53 self.contig,
54 str(self.query_lens[i]),
55 str(self.query_starts[i]),
56 str(self.query_ends[i]),
57 self.strands[i],
58 self.ref_headers[i],
59 str(self.ref_lens[i]),
60 str(self.ref_starts[i]),
61 str(self.ref_ends[i]),
62 str(self.num_matches[i]),
63 str(self.aln_lens[i]),
64 str(self.mapqs[i])
65 ])
66 )
67 return "\n".join(all_alns)
68
69 def get_attr_lens(self):
70 all_lens = [
71 len(self.query_lens),
72 len(self.query_starts),
73 len(self.query_ends),
74 len(self.strands),
75 len(self.ref_headers),
76 len(self.ref_lens),
77 len(self.ref_starts),
78 len(self.ref_ends),
79 len(self.num_matches),
80 len(self.aln_lens),
81 len(self.mapqs)
82 ]
83 return all_lens
84
85 def add_alignment(self, paf_line):
86 """
87 The only way to add alignments for this contig. Only accepts a PAFLine type object which represents
88 as single line of a paf file.
89 """
90 if not paf_line.contig == self.contig:
91 raise ValueError('Only can add alignments with query header %s' % self.contig)
92
93 self.query_lens.append(paf_line.query_len)
94 self.query_starts.append(paf_line.query_start)
95 self.query_ends.append(paf_line.query_end)
96 self.strands.append(paf_line.strand)
97 self.ref_headers.append(paf_line.ref_header)
98 self.ref_lens.append(paf_line.ref_len)
99 self.ref_starts.append(paf_line.ref_start)
100 self.ref_ends.append(paf_line.ref_end)
101 self.num_matches.append(paf_line.num_match)
102 self.aln_lens.append(paf_line.aln_len)
103 self.mapqs.append(paf_line.mapq)
104
105 # Check that all attribute lists have same length
106 all_lens = self.get_attr_lens()
107 assert len(set(all_lens)) == 1
108
109 def has_unique_chr_match(self):
110 s1 = set(self.ref_headers)
111 if len(s1) == 1:
112 return True
113 return False
114
115 def count_chr_matches(self):
116 s1 = set(self.ref_headers)
117 return len(s1)
118
119 def rearrange_alns(self, hits):
120 """ Generally re structure the alignments according to 'hits', an ordered list of indices. """
121 self.query_lens = [self.query_lens[i] for i in hits]
122 self.query_starts = [self.query_starts[i] for i in hits]
123 self.query_ends = [self.query_ends[i] for i in hits]
124 self.strands = [self.strands[i] for i in hits]
125 self.ref_headers = [self.ref_headers[i] for i in hits]
126 self.ref_lens = [self.ref_lens[i] for i in hits]
127 self.ref_starts = [self.ref_starts[i] for i in hits]
128 self.ref_ends = [self.ref_ends[i] for i in hits]
129 self.num_matches = [self.num_matches[i] for i in hits]
130 self.aln_lens = [self.aln_lens[i] for i in hits]
131 self.mapqs = [self.mapqs[i] for i in hits]
132
133 def filter_ref_chroms(self, in_chroms):
134 """
135 Given a list of chromosomes, mutate this object so as to only include alignments to those chromosomes.
136 """
137 hits = []
138 for i in range(len(self.ref_headers)):
139 if self.ref_headers[i] in in_chroms:
140 hits.append(i)
141
142 self.rearrange_alns(hits)
143
144 def sort_by_ref(self):
145 ref_pos = []
146 for i in range(len(self.ref_headers)):
147 ref_pos.append((self.ref_headers[i], self.ref_starts[i], self.ref_ends[i], i))
148 hits = [i[3] for i in sorted(ref_pos)]
149
150 self.rearrange_alns(hits)
151
152 def sort_by_query(self):
153 q_pos = []
154 for i in range(len(self.ref_headers)):
155 q_pos.append((self.query_starts[i], self.query_ends[i], i))
156 hits = [i[2] for i in sorted(q_pos)]
157
158 self.rearrange_alns(hits)
159
160 def exclude_ref_chroms(self, exclude_list):
161 hits = []
162 for i in range(len(self.ref_headers)):
163 if self.ref_headers[i] not in exclude_list:
164 hits.append(i)
165
166 self.rearrange_alns(hits)
167
168 def filter_lengths(self, l):
169 hits = []
170 for i in range(len(self.ref_headers)):
171 if self.aln_lens[i] > l:
172 hits.append(i)
173
174 self.rearrange_alns(hits)
175
176 def unique_anchor_filter(self):
177 """
178 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
179 written by Maria Nattestad. The original script can be found here:
180
181 https://github.com/MariaNattestad/Assemblytics
182
183 And the publication associated with Maria's work is here:
184
185 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
186 web analytics tool for the detection of variants from an
187 assembly." Bioinformatics 32.19 (2016): 3021-3023.
188 """
189
190 if not self._is_unique_anchor_filtered:
191 lines_by_query = []
192 for i, j in zip(self.query_starts, self.query_ends):
193 lines_by_query.append((i, j))
194
195 hits = summarize_planesweep(lines_by_query, 10000)
196 self.rearrange_alns(hits)
197 self._is_unique_anchor_filtered = True
198
199 def merge_alns(self, merge_dist=100000):
200 """
201 Merge chains of alignments that are to the same target, have the same orientation, and are less than
202 merge_dist away from each other.
203 :param merge_dist:
204 :return:
205 """
206 # Sort the alignments
207 self.sort_by_ref()
208
209 # Might also want to filter out low identity alignments
210
211 # Keep track of which alignments we are comparing
212 i = 0
213 j = 1
214 while j < len(self.ref_headers):
215 if all([
216 self.ref_headers[i] == self.ref_headers[j],
217 self.strands[i] == self.strands[j],
218 self.ref_starts[j] - self.ref_ends[i] <= merge_dist
219 ]):
220 # Merge the alignments in place of the first alignment
221 self.ref_starts[i] = min(self.ref_starts[i], self.ref_starts[j])
222 self.ref_ends[i] = max(self.ref_ends[i], self.ref_ends[j])
223 self.query_starts[i] = min(self.query_starts[i], self.query_starts[j])
224 self.query_ends[i] = max(self.query_ends[i], self.query_ends[j])
225
226 self.num_matches[i] += self.num_matches[j]
227 self.aln_lens[i] = self.ref_ends[i] - self.ref_starts[i]
228 self.mapqs[i] = (self.mapqs[i] + self.mapqs[j])//2
229
230 # Remove the redundant alignment
231 self.query_lens.pop(j)
232 self.query_starts.pop(j)
233 self.query_ends.pop(j)
234 self.strands.pop(j)
235 self.ref_headers.pop(j)
236 self.ref_lens.pop(j)
237 self.ref_starts.pop(j)
238 self.ref_ends.pop(j)
239 self.num_matches.pop(j)
240 self.aln_lens.pop(j)
241 self.mapqs.pop(j)
242 else:
243 i += 1
244 j += 1
245
246 # remove contained alignments. contained w.r.t the contig
247 cont_inds = []
248 for i in range(len(self.ref_headers)):
249 for j in range(len(self.ref_headers)):
250 # Check if j is contained by i
251 if i == j:
252 continue
253 if self.query_starts[i] <= self.query_starts[j] and self.query_ends[i] >= self.query_ends[j]:
254 cont_inds.append(j)
255
256 hits = [i for i in range(len(self.ref_headers)) if i not in cont_inds]
257 self.rearrange_alns(hits)
258
259 self._is_merged = True
260
261 def get_break_candidates(self):
262 if not self._is_merged:
263 raise ValueError("Alignments must first be merged.")
264
265 self.sort_by_query()
266 all_candidates = []
267 for i in range(1, len(self.ref_headers)):
268 all_candidates.append(min(self.query_ends[i-1], self.query_starts[i]))
269 all_candidates.append(max(self.query_ends[i-1], self.query_starts[i]))
270
271 return all_candidates
272
273
274
275 class UniqueContigAlignment:
276 """
277 A class representing the reference chromosome to which a particular contig will be assigned to.
278 At first, the incoming alignments may be to multiple chromosomes, but this class will assign a single
279 reference chromosome to an input contig.
280 """
281
282 def __init__(self, in_contig_aln):
283 if not isinstance(in_contig_aln, ContigAlignment):
284 message = 'Can only make a unique alignment from a ContigAlignment object. Got type %s instead.' % str(type(in_contig_aln))
285 raise ValueError(message)
286
287 self.contig = in_contig_aln.contig
288
289 # Find the chromosome which was covered the most in these alignments.
290 self.ref_chrom = None
291 self.confidence = 0.0
292 self._get_best_ref_cov(in_contig_aln)
293
294 def __str__(self):
295 return '<UniqueContigAlignment:' + self.contig + ' - ' + self.ref_chrom + '>'
296
297 def _get_best_ref_cov(self, alns):
298 # Get list of all references chromosomes
299 all_chroms = set(alns.ref_headers)
300 if len(all_chroms) == 1:
301 self.ref_chrom = alns.ref_headers[0]
302 self.confidence = 1
303 return
304
305 # Initialize coverage counts for each chromosome
306 ranges = dict()
307 for i in all_chroms:
308 ranges[i] = 0
309
310 # Get all the ranges in reference to all chromosomes
311 all_intervals = defaultdict(list)
312 for i in range(len(alns.ref_headers)):
313 this_range = (alns.ref_starts[i], alns.ref_ends[i])
314 this_chrom = alns.ref_headers[i]
315 all_intervals[this_chrom].append(this_range)
316
317 # For each chromosome, sort the range and get the union interval length.
318 # Algorithm and pseudocode given by Michael Kirsche
319 for i in all_intervals.keys():
320 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
321 max_end = -1
322 for j in sorted_intervals:
323 start_new_terr = max(j[0], max_end)
324 ranges[i] += max(0, j[1] - start_new_terr)
325 max_end = max(max_end, j[1])
326
327 assert ranges
328
329 # I convert to a list and sort the ranges.items() in order to have ties broken in a deterministic way.
330 max_chrom = max(sorted(list(ranges.items())), key=operator.itemgetter(1))[0]
331 self.ref_chrom = max_chrom
332
333 # Now get the confidence of this chromosome assignment
334 # Equal to the max range over all ranges
335 self.confidence = ranges[max_chrom]/sum(ranges.values())
336 assert self.confidence >= 0
337 assert self.confidence <= 1
338
339
340 class LongestContigAlignment:
341 """
342 Given a ContigAlignment object, find the longest alignment with respect to a given references chromosome.
343
344 1. Filter out any alignments that are not aligned to specified chromosome.
345 2. Find longest alignment remaining.
346 """
347
348 def __init__(self, in_contig_aln):
349 self.best_index = self._get_best_index(in_contig_aln)
350 self.contig = in_contig_aln.contig
351 self.ref_start = in_contig_aln.ref_starts[self.best_index]
352 self.ref_end = in_contig_aln.ref_ends[self.best_index]
353 self.aln_len = in_contig_aln.aln_lens[self.best_index]
354 self.strand = in_contig_aln.strands[self.best_index]
355 self.interval = (self.ref_start, self.ref_end)
356
357 def _get_best_index(self, aln):
358 max_index = -1
359 max_len = -1
360 for i in range(len(aln.aln_lens)):
361 if aln.aln_lens[i] > max_len:
362 max_len = aln.aln_lens[i]
363 max_index = i
364 return max_index