13
|
1 import operator
|
|
2 from collections import defaultdict
|
|
3
|
|
4 from ragoo_utilities.utilities import summarize_planesweep, binary_search
|
|
5
|
|
6
|
|
7 class ContigAlignment:
|
|
8 """
|
|
9 This object will represent a contig's alignment to a reference chromosome.
|
|
10 Specifically, this class will organize alignments in a one-to-many fashion, where
|
|
11 we will only have one object per contig, and each object will store all of the alignments
|
|
12 of that contig.
|
|
13
|
|
14
|
|
15 """
|
|
16
|
|
17 def __init__(self, in_contig):
|
|
18 """
|
|
19 Each object will refer to a single contig via its header name.
|
|
20
|
|
21 All other alignment metrics will be stored in lists, where the list offsets
|
|
22 correspond to the alignment number.
|
|
23
|
|
24 Don't actually add any alignments through object instantiation. Only use the add_alignment method.
|
|
25 """
|
|
26 self.contig = in_contig
|
|
27
|
|
28 self.query_lens = []
|
|
29 self.query_starts = []
|
|
30 self.query_ends = []
|
|
31 self.strands = []
|
|
32 self.ref_headers = []
|
|
33 self.ref_lens = []
|
|
34 self.ref_starts = []
|
|
35 self.ref_ends = []
|
|
36 self.num_matches = []
|
|
37 self.aln_lens = []
|
|
38 self.mapqs = []
|
|
39
|
|
40 self._is_unique_anchor_filtered = False
|
|
41 self._is_merged = False
|
|
42
|
|
43 def __repr__(self):
|
|
44 return '<ContigAlignment' + self.contig + '>'
|
|
45
|
|
46 def __str__(self):
|
|
47 """ Return the alignments in sorted PAF format. """
|
|
48 self.sort_by_query()
|
|
49 all_alns = []
|
|
50 for i in range(len(self.ref_headers)):
|
|
51 all_alns.append(
|
|
52 "\t".join([
|
|
53 self.contig,
|
|
54 str(self.query_lens[i]),
|
|
55 str(self.query_starts[i]),
|
|
56 str(self.query_ends[i]),
|
|
57 self.strands[i],
|
|
58 self.ref_headers[i],
|
|
59 str(self.ref_lens[i]),
|
|
60 str(self.ref_starts[i]),
|
|
61 str(self.ref_ends[i]),
|
|
62 str(self.num_matches[i]),
|
|
63 str(self.aln_lens[i]),
|
|
64 str(self.mapqs[i])
|
|
65 ])
|
|
66 )
|
|
67 return "\n".join(all_alns)
|
|
68
|
|
69 def get_attr_lens(self):
|
|
70 all_lens = [
|
|
71 len(self.query_lens),
|
|
72 len(self.query_starts),
|
|
73 len(self.query_ends),
|
|
74 len(self.strands),
|
|
75 len(self.ref_headers),
|
|
76 len(self.ref_lens),
|
|
77 len(self.ref_starts),
|
|
78 len(self.ref_ends),
|
|
79 len(self.num_matches),
|
|
80 len(self.aln_lens),
|
|
81 len(self.mapqs)
|
|
82 ]
|
|
83 return all_lens
|
|
84
|
|
85 def add_alignment(self, paf_line):
|
|
86 """
|
|
87 The only way to add alignments for this contig. Only accepts a PAFLine type object which represents
|
|
88 as single line of a paf file.
|
|
89 """
|
|
90 if not paf_line.contig == self.contig:
|
|
91 raise ValueError('Only can add alignments with query header %s' % self.contig)
|
|
92
|
|
93 self.query_lens.append(paf_line.query_len)
|
|
94 self.query_starts.append(paf_line.query_start)
|
|
95 self.query_ends.append(paf_line.query_end)
|
|
96 self.strands.append(paf_line.strand)
|
|
97 self.ref_headers.append(paf_line.ref_header)
|
|
98 self.ref_lens.append(paf_line.ref_len)
|
|
99 self.ref_starts.append(paf_line.ref_start)
|
|
100 self.ref_ends.append(paf_line.ref_end)
|
|
101 self.num_matches.append(paf_line.num_match)
|
|
102 self.aln_lens.append(paf_line.aln_len)
|
|
103 self.mapqs.append(paf_line.mapq)
|
|
104
|
|
105 # Check that all attribute lists have same length
|
|
106 all_lens = self.get_attr_lens()
|
|
107 assert len(set(all_lens)) == 1
|
|
108
|
|
109 def has_unique_chr_match(self):
|
|
110 s1 = set(self.ref_headers)
|
|
111 if len(s1) == 1:
|
|
112 return True
|
|
113 return False
|
|
114
|
|
115 def count_chr_matches(self):
|
|
116 s1 = set(self.ref_headers)
|
|
117 return len(s1)
|
|
118
|
|
119 def rearrange_alns(self, hits):
|
|
120 """ Generally re structure the alignments according to 'hits', an ordered list of indices. """
|
|
121 self.query_lens = [self.query_lens[i] for i in hits]
|
|
122 self.query_starts = [self.query_starts[i] for i in hits]
|
|
123 self.query_ends = [self.query_ends[i] for i in hits]
|
|
124 self.strands = [self.strands[i] for i in hits]
|
|
125 self.ref_headers = [self.ref_headers[i] for i in hits]
|
|
126 self.ref_lens = [self.ref_lens[i] for i in hits]
|
|
127 self.ref_starts = [self.ref_starts[i] for i in hits]
|
|
128 self.ref_ends = [self.ref_ends[i] for i in hits]
|
|
129 self.num_matches = [self.num_matches[i] for i in hits]
|
|
130 self.aln_lens = [self.aln_lens[i] for i in hits]
|
|
131 self.mapqs = [self.mapqs[i] for i in hits]
|
|
132
|
|
133 def filter_ref_chroms(self, in_chroms):
|
|
134 """
|
|
135 Given a list of chromosomes, mutate this object so as to only include alignments to those chromosomes.
|
|
136 """
|
|
137 hits = []
|
|
138 for i in range(len(self.ref_headers)):
|
|
139 if self.ref_headers[i] in in_chroms:
|
|
140 hits.append(i)
|
|
141
|
|
142 self.rearrange_alns(hits)
|
|
143
|
|
144 def sort_by_ref(self):
|
|
145 ref_pos = []
|
|
146 for i in range(len(self.ref_headers)):
|
|
147 ref_pos.append((self.ref_headers[i], self.ref_starts[i], self.ref_ends[i], i))
|
|
148 hits = [i[3] for i in sorted(ref_pos)]
|
|
149
|
|
150 self.rearrange_alns(hits)
|
|
151
|
|
152 def sort_by_query(self):
|
|
153 q_pos = []
|
|
154 for i in range(len(self.ref_headers)):
|
|
155 q_pos.append((self.query_starts[i], self.query_ends[i], i))
|
|
156 hits = [i[2] for i in sorted(q_pos)]
|
|
157
|
|
158 self.rearrange_alns(hits)
|
|
159
|
|
160 def exclude_ref_chroms(self, exclude_list):
|
|
161 hits = []
|
|
162 for i in range(len(self.ref_headers)):
|
|
163 if self.ref_headers[i] not in exclude_list:
|
|
164 hits.append(i)
|
|
165
|
|
166 self.rearrange_alns(hits)
|
|
167
|
|
168 def filter_lengths(self, l):
|
|
169 hits = []
|
|
170 for i in range(len(self.ref_headers)):
|
|
171 if self.aln_lens[i] > l:
|
|
172 hits.append(i)
|
|
173
|
|
174 self.rearrange_alns(hits)
|
|
175
|
|
176 def unique_anchor_filter(self):
|
|
177 """
|
|
178 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
|
|
179 written by Maria Nattestad. The original script can be found here:
|
|
180
|
|
181 https://github.com/MariaNattestad/Assemblytics
|
|
182
|
|
183 And the publication associated with Maria's work is here:
|
|
184
|
|
185 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
|
|
186 web analytics tool for the detection of variants from an
|
|
187 assembly." Bioinformatics 32.19 (2016): 3021-3023.
|
|
188 """
|
|
189
|
|
190 if not self._is_unique_anchor_filtered:
|
|
191 lines_by_query = []
|
|
192 for i, j in zip(self.query_starts, self.query_ends):
|
|
193 lines_by_query.append((i, j))
|
|
194
|
|
195 hits = summarize_planesweep(lines_by_query, 10000)
|
|
196 self.rearrange_alns(hits)
|
|
197 self._is_unique_anchor_filtered = True
|
|
198
|
|
199 def merge_alns(self, merge_dist=100000):
|
|
200 """
|
|
201 Merge chains of alignments that are to the same target, have the same orientation, and are less than
|
|
202 merge_dist away from each other.
|
|
203 :param merge_dist:
|
|
204 :return:
|
|
205 """
|
|
206 # Sort the alignments
|
|
207 self.sort_by_ref()
|
|
208
|
|
209 # Might also want to filter out low identity alignments
|
|
210
|
|
211 # Keep track of which alignments we are comparing
|
|
212 i = 0
|
|
213 j = 1
|
|
214 while j < len(self.ref_headers):
|
|
215 if all([
|
|
216 self.ref_headers[i] == self.ref_headers[j],
|
|
217 self.strands[i] == self.strands[j],
|
|
218 self.ref_starts[j] - self.ref_ends[i] <= merge_dist
|
|
219 ]):
|
|
220 # Merge the alignments in place of the first alignment
|
|
221 self.ref_starts[i] = min(self.ref_starts[i], self.ref_starts[j])
|
|
222 self.ref_ends[i] = max(self.ref_ends[i], self.ref_ends[j])
|
|
223 self.query_starts[i] = min(self.query_starts[i], self.query_starts[j])
|
|
224 self.query_ends[i] = max(self.query_ends[i], self.query_ends[j])
|
|
225
|
|
226 self.num_matches[i] += self.num_matches[j]
|
|
227 self.aln_lens[i] = self.ref_ends[i] - self.ref_starts[i]
|
|
228 self.mapqs[i] = (self.mapqs[i] + self.mapqs[j])//2
|
|
229
|
|
230 # Remove the redundant alignment
|
|
231 self.query_lens.pop(j)
|
|
232 self.query_starts.pop(j)
|
|
233 self.query_ends.pop(j)
|
|
234 self.strands.pop(j)
|
|
235 self.ref_headers.pop(j)
|
|
236 self.ref_lens.pop(j)
|
|
237 self.ref_starts.pop(j)
|
|
238 self.ref_ends.pop(j)
|
|
239 self.num_matches.pop(j)
|
|
240 self.aln_lens.pop(j)
|
|
241 self.mapqs.pop(j)
|
|
242 else:
|
|
243 i += 1
|
|
244 j += 1
|
|
245
|
|
246 # remove contained alignments. contained w.r.t the contig
|
|
247 cont_inds = []
|
|
248 for i in range(len(self.ref_headers)):
|
|
249 for j in range(len(self.ref_headers)):
|
|
250 # Check if j is contained by i
|
|
251 if i == j:
|
|
252 continue
|
|
253 if self.query_starts[i] <= self.query_starts[j] and self.query_ends[i] >= self.query_ends[j]:
|
|
254 cont_inds.append(j)
|
|
255
|
|
256 hits = [i for i in range(len(self.ref_headers)) if i not in cont_inds]
|
|
257 self.rearrange_alns(hits)
|
|
258
|
|
259 self._is_merged = True
|
|
260
|
|
261 def get_break_candidates(self):
|
|
262 if not self._is_merged:
|
|
263 raise ValueError("Alignments must first be merged.")
|
|
264
|
|
265 self.sort_by_query()
|
|
266 all_candidates = []
|
|
267 for i in range(1, len(self.ref_headers)):
|
|
268 all_candidates.append(min(self.query_ends[i-1], self.query_starts[i]))
|
|
269 all_candidates.append(max(self.query_ends[i-1], self.query_starts[i]))
|
|
270
|
|
271 return all_candidates
|
|
272
|
|
273
|
|
274
|
|
275 class UniqueContigAlignment:
|
|
276 """
|
|
277 A class representing the reference chromosome to which a particular contig will be assigned to.
|
|
278 At first, the incoming alignments may be to multiple chromosomes, but this class will assign a single
|
|
279 reference chromosome to an input contig.
|
|
280 """
|
|
281
|
|
282 def __init__(self, in_contig_aln):
|
|
283 if not isinstance(in_contig_aln, ContigAlignment):
|
|
284 message = 'Can only make a unique alignment from a ContigAlignment object. Got type %s instead.' % str(type(in_contig_aln))
|
|
285 raise ValueError(message)
|
|
286
|
|
287 self.contig = in_contig_aln.contig
|
|
288
|
|
289 # Find the chromosome which was covered the most in these alignments.
|
|
290 self.ref_chrom = None
|
|
291 self.confidence = 0.0
|
|
292 self._get_best_ref_cov(in_contig_aln)
|
|
293
|
|
294 def __str__(self):
|
|
295 return '<UniqueContigAlignment:' + self.contig + ' - ' + self.ref_chrom + '>'
|
|
296
|
|
297 def _get_best_ref_cov(self, alns):
|
|
298 # Get list of all references chromosomes
|
|
299 all_chroms = set(alns.ref_headers)
|
|
300 if len(all_chroms) == 1:
|
|
301 self.ref_chrom = alns.ref_headers[0]
|
|
302 self.confidence = 1
|
|
303 return
|
|
304
|
|
305 # Initialize coverage counts for each chromosome
|
|
306 ranges = dict()
|
|
307 for i in all_chroms:
|
|
308 ranges[i] = 0
|
|
309
|
|
310 # Get all the ranges in reference to all chromosomes
|
|
311 all_intervals = defaultdict(list)
|
|
312 for i in range(len(alns.ref_headers)):
|
|
313 this_range = (alns.ref_starts[i], alns.ref_ends[i])
|
|
314 this_chrom = alns.ref_headers[i]
|
|
315 all_intervals[this_chrom].append(this_range)
|
|
316
|
|
317 # For each chromosome, sort the range and get the union interval length.
|
|
318 # Algorithm and pseudocode given by Michael Kirsche
|
|
319 for i in all_intervals.keys():
|
|
320 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
|
|
321 max_end = -1
|
|
322 for j in sorted_intervals:
|
|
323 start_new_terr = max(j[0], max_end)
|
|
324 ranges[i] += max(0, j[1] - start_new_terr)
|
|
325 max_end = max(max_end, j[1])
|
|
326
|
|
327 assert ranges
|
|
328
|
|
329 # I convert to a list and sort the ranges.items() in order to have ties broken in a deterministic way.
|
|
330 max_chrom = max(sorted(list(ranges.items())), key=operator.itemgetter(1))[0]
|
|
331 self.ref_chrom = max_chrom
|
|
332
|
|
333 # Now get the confidence of this chromosome assignment
|
|
334 # Equal to the max range over all ranges
|
|
335 self.confidence = ranges[max_chrom]/sum(ranges.values())
|
|
336 assert self.confidence >= 0
|
|
337 assert self.confidence <= 1
|
|
338
|
|
339
|
|
340 class LongestContigAlignment:
|
|
341 """
|
|
342 Given a ContigAlignment object, find the longest alignment with respect to a given references chromosome.
|
|
343
|
|
344 1. Filter out any alignments that are not aligned to specified chromosome.
|
|
345 2. Find longest alignment remaining.
|
|
346 """
|
|
347
|
|
348 def __init__(self, in_contig_aln):
|
|
349 self.best_index = self._get_best_index(in_contig_aln)
|
|
350 self.contig = in_contig_aln.contig
|
|
351 self.ref_start = in_contig_aln.ref_starts[self.best_index]
|
|
352 self.ref_end = in_contig_aln.ref_ends[self.best_index]
|
|
353 self.aln_len = in_contig_aln.aln_lens[self.best_index]
|
|
354 self.strand = in_contig_aln.strands[self.best_index]
|
|
355 self.interval = (self.ref_start, self.ref_end)
|
|
356
|
|
357 def _get_best_index(self, aln):
|
|
358 max_index = -1
|
|
359 max_len = -1
|
|
360 for i in range(len(aln.aln_lens)):
|
|
361 if aln.aln_lens[i] > max_len:
|
|
362 max_len = aln.aln_lens[i]
|
|
363 max_index = i
|
|
364 return max_index |