annotate RaGOO/ragoo_utilities/ContigAlignment.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 import operator
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2 from collections import defaultdict
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4 from ragoo_utilities.utilities import summarize_planesweep, binary_search
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7 class ContigAlignment:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9 This object will represent a contig's alignment to a reference chromosome.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10 Specifically, this class will organize alignments in a one-to-many fashion, where
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11 we will only have one object per contig, and each object will store all of the alignments
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 of that contig.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17 def __init__(self, in_contig):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19 Each object will refer to a single contig via its header name.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21 All other alignment metrics will be stored in lists, where the list offsets
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22 correspond to the alignment number.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24 Don't actually add any alignments through object instantiation. Only use the add_alignment method.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26 self.contig = in_contig
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28 self.query_lens = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29 self.query_starts = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30 self.query_ends = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31 self.strands = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32 self.ref_headers = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33 self.ref_lens = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 self.ref_starts = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 self.ref_ends = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36 self.num_matches = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 self.aln_lens = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 self.mapqs = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40 self._is_unique_anchor_filtered = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 self._is_merged = False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 def __repr__(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44 return '<ContigAlignment' + self.contig + '>'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46 def __str__(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47 """ Return the alignments in sorted PAF format. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48 self.sort_by_query()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49 all_alns = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 all_alns.append(
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52 "\t".join([
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53 self.contig,
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 str(self.query_lens[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55 str(self.query_starts[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56 str(self.query_ends[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57 self.strands[i],
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 self.ref_headers[i],
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 str(self.ref_lens[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60 str(self.ref_starts[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61 str(self.ref_ends[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 str(self.num_matches[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63 str(self.aln_lens[i]),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64 str(self.mapqs[i])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 ])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66 )
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67 return "\n".join(all_alns)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 def get_attr_lens(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 all_lens = [
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71 len(self.query_lens),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 len(self.query_starts),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73 len(self.query_ends),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74 len(self.strands),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75 len(self.ref_headers),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76 len(self.ref_lens),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77 len(self.ref_starts),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78 len(self.ref_ends),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 len(self.num_matches),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80 len(self.aln_lens),
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81 len(self.mapqs)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82 ]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 return all_lens
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85 def add_alignment(self, paf_line):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87 The only way to add alignments for this contig. Only accepts a PAFLine type object which represents
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88 as single line of a paf file.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 if not paf_line.contig == self.contig:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91 raise ValueError('Only can add alignments with query header %s' % self.contig)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93 self.query_lens.append(paf_line.query_len)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94 self.query_starts.append(paf_line.query_start)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95 self.query_ends.append(paf_line.query_end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96 self.strands.append(paf_line.strand)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97 self.ref_headers.append(paf_line.ref_header)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 self.ref_lens.append(paf_line.ref_len)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 self.ref_starts.append(paf_line.ref_start)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 self.ref_ends.append(paf_line.ref_end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 self.num_matches.append(paf_line.num_match)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102 self.aln_lens.append(paf_line.aln_len)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103 self.mapqs.append(paf_line.mapq)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105 # Check that all attribute lists have same length
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
106 all_lens = self.get_attr_lens()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
107 assert len(set(all_lens)) == 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
108
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
109 def has_unique_chr_match(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
110 s1 = set(self.ref_headers)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
111 if len(s1) == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
112 return True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
113 return False
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
114
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
115 def count_chr_matches(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
116 s1 = set(self.ref_headers)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
117 return len(s1)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
118
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
119 def rearrange_alns(self, hits):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
120 """ Generally re structure the alignments according to 'hits', an ordered list of indices. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
121 self.query_lens = [self.query_lens[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
122 self.query_starts = [self.query_starts[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
123 self.query_ends = [self.query_ends[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
124 self.strands = [self.strands[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
125 self.ref_headers = [self.ref_headers[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
126 self.ref_lens = [self.ref_lens[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
127 self.ref_starts = [self.ref_starts[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
128 self.ref_ends = [self.ref_ends[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
129 self.num_matches = [self.num_matches[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
130 self.aln_lens = [self.aln_lens[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
131 self.mapqs = [self.mapqs[i] for i in hits]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
132
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
133 def filter_ref_chroms(self, in_chroms):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
134 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
135 Given a list of chromosomes, mutate this object so as to only include alignments to those chromosomes.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
136 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
137 hits = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
138 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
139 if self.ref_headers[i] in in_chroms:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
140 hits.append(i)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
141
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
142 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
143
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
144 def sort_by_ref(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
145 ref_pos = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
146 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
147 ref_pos.append((self.ref_headers[i], self.ref_starts[i], self.ref_ends[i], i))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
148 hits = [i[3] for i in sorted(ref_pos)]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
149
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
150 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
151
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
152 def sort_by_query(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
153 q_pos = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
154 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
155 q_pos.append((self.query_starts[i], self.query_ends[i], i))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
156 hits = [i[2] for i in sorted(q_pos)]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
157
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
158 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
159
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
160 def exclude_ref_chroms(self, exclude_list):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
161 hits = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
162 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
163 if self.ref_headers[i] not in exclude_list:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
164 hits.append(i)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
165
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
166 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
167
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
168 def filter_lengths(self, l):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
169 hits = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
170 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
171 if self.aln_lens[i] > l:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
172 hits.append(i)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
173
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
174 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
175
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
176 def unique_anchor_filter(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
177 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
178 The contents of this method are either influenced by or directly copied from "Assemblytics_uniq_anchor.py"
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
179 written by Maria Nattestad. The original script can be found here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
180
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
181 https://github.com/MariaNattestad/Assemblytics
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
182
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
183 And the publication associated with Maria's work is here:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
184
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
185 Nattestad, Maria, and Michael C. Schatz. "Assemblytics: a
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
186 web analytics tool for the detection of variants from an
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
187 assembly." Bioinformatics 32.19 (2016): 3021-3023.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
188 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
189
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
190 if not self._is_unique_anchor_filtered:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
191 lines_by_query = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
192 for i, j in zip(self.query_starts, self.query_ends):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
193 lines_by_query.append((i, j))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
194
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
195 hits = summarize_planesweep(lines_by_query, 10000)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
196 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
197 self._is_unique_anchor_filtered = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
198
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
199 def merge_alns(self, merge_dist=100000):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
200 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
201 Merge chains of alignments that are to the same target, have the same orientation, and are less than
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
202 merge_dist away from each other.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
203 :param merge_dist:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
204 :return:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
205 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
206 # Sort the alignments
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
207 self.sort_by_ref()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
208
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
209 # Might also want to filter out low identity alignments
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
210
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
211 # Keep track of which alignments we are comparing
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
212 i = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
213 j = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
214 while j < len(self.ref_headers):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
215 if all([
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
216 self.ref_headers[i] == self.ref_headers[j],
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
217 self.strands[i] == self.strands[j],
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
218 self.ref_starts[j] - self.ref_ends[i] <= merge_dist
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
219 ]):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
220 # Merge the alignments in place of the first alignment
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
221 self.ref_starts[i] = min(self.ref_starts[i], self.ref_starts[j])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
222 self.ref_ends[i] = max(self.ref_ends[i], self.ref_ends[j])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
223 self.query_starts[i] = min(self.query_starts[i], self.query_starts[j])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
224 self.query_ends[i] = max(self.query_ends[i], self.query_ends[j])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
225
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
226 self.num_matches[i] += self.num_matches[j]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
227 self.aln_lens[i] = self.ref_ends[i] - self.ref_starts[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
228 self.mapqs[i] = (self.mapqs[i] + self.mapqs[j])//2
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
229
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
230 # Remove the redundant alignment
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
231 self.query_lens.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
232 self.query_starts.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
233 self.query_ends.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
234 self.strands.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
235 self.ref_headers.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
236 self.ref_lens.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
237 self.ref_starts.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
238 self.ref_ends.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
239 self.num_matches.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
240 self.aln_lens.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
241 self.mapqs.pop(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
242 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
243 i += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
244 j += 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
245
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
246 # remove contained alignments. contained w.r.t the contig
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
247 cont_inds = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
248 for i in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
249 for j in range(len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
250 # Check if j is contained by i
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
251 if i == j:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
252 continue
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
253 if self.query_starts[i] <= self.query_starts[j] and self.query_ends[i] >= self.query_ends[j]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
254 cont_inds.append(j)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
255
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
256 hits = [i for i in range(len(self.ref_headers)) if i not in cont_inds]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
257 self.rearrange_alns(hits)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
258
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
259 self._is_merged = True
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
260
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
261 def get_break_candidates(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
262 if not self._is_merged:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
263 raise ValueError("Alignments must first be merged.")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
264
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
265 self.sort_by_query()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
266 all_candidates = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
267 for i in range(1, len(self.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
268 all_candidates.append(min(self.query_ends[i-1], self.query_starts[i]))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
269 all_candidates.append(max(self.query_ends[i-1], self.query_starts[i]))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
270
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
271 return all_candidates
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
272
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
273
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
274
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
275 class UniqueContigAlignment:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
276 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
277 A class representing the reference chromosome to which a particular contig will be assigned to.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
278 At first, the incoming alignments may be to multiple chromosomes, but this class will assign a single
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
279 reference chromosome to an input contig.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
280 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
281
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
282 def __init__(self, in_contig_aln):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
283 if not isinstance(in_contig_aln, ContigAlignment):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
284 message = 'Can only make a unique alignment from a ContigAlignment object. Got type %s instead.' % str(type(in_contig_aln))
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
285 raise ValueError(message)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
286
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
287 self.contig = in_contig_aln.contig
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
288
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
289 # Find the chromosome which was covered the most in these alignments.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
290 self.ref_chrom = None
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
291 self.confidence = 0.0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
292 self._get_best_ref_cov(in_contig_aln)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
293
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
294 def __str__(self):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
295 return '<UniqueContigAlignment:' + self.contig + ' - ' + self.ref_chrom + '>'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
296
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
297 def _get_best_ref_cov(self, alns):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
298 # Get list of all references chromosomes
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
299 all_chroms = set(alns.ref_headers)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
300 if len(all_chroms) == 1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
301 self.ref_chrom = alns.ref_headers[0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
302 self.confidence = 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
303 return
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
304
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
305 # Initialize coverage counts for each chromosome
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
306 ranges = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
307 for i in all_chroms:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
308 ranges[i] = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
309
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
310 # Get all the ranges in reference to all chromosomes
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
311 all_intervals = defaultdict(list)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
312 for i in range(len(alns.ref_headers)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
313 this_range = (alns.ref_starts[i], alns.ref_ends[i])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
314 this_chrom = alns.ref_headers[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
315 all_intervals[this_chrom].append(this_range)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
316
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
317 # For each chromosome, sort the range and get the union interval length.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
318 # Algorithm and pseudocode given by Michael Kirsche
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
319 for i in all_intervals.keys():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
320 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
321 max_end = -1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
322 for j in sorted_intervals:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
323 start_new_terr = max(j[0], max_end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
324 ranges[i] += max(0, j[1] - start_new_terr)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
325 max_end = max(max_end, j[1])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
326
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
327 assert ranges
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
328
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
329 # I convert to a list and sort the ranges.items() in order to have ties broken in a deterministic way.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
330 max_chrom = max(sorted(list(ranges.items())), key=operator.itemgetter(1))[0]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
331 self.ref_chrom = max_chrom
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
332
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
333 # Now get the confidence of this chromosome assignment
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
334 # Equal to the max range over all ranges
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
335 self.confidence = ranges[max_chrom]/sum(ranges.values())
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
336 assert self.confidence >= 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
337 assert self.confidence <= 1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
338
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
339
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
340 class LongestContigAlignment:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
341 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
342 Given a ContigAlignment object, find the longest alignment with respect to a given references chromosome.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
343
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
344 1. Filter out any alignments that are not aligned to specified chromosome.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
345 2. Find longest alignment remaining.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
346 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
347
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
348 def __init__(self, in_contig_aln):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
349 self.best_index = self._get_best_index(in_contig_aln)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
350 self.contig = in_contig_aln.contig
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
351 self.ref_start = in_contig_aln.ref_starts[self.best_index]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
352 self.ref_end = in_contig_aln.ref_ends[self.best_index]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
353 self.aln_len = in_contig_aln.aln_lens[self.best_index]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
354 self.strand = in_contig_aln.strands[self.best_index]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
355 self.interval = (self.ref_start, self.ref_end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
356
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
357 def _get_best_index(self, aln):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
358 max_index = -1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
359 max_len = -1
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
360 for i in range(len(aln.aln_lens)):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
361 if aln.aln_lens[i] > max_len:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
362 max_len = aln.aln_lens[i]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
363 max_index = i
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
364 return max_index