13
|
1 from collections import defaultdict
|
|
2 import copy
|
|
3
|
|
4 from intervaltree import IntervalTree
|
|
5
|
|
6 from ragoo_utilities.ContigAlignment import UniqueContigAlignment
|
|
7
|
|
8
|
|
9 def get_ref_parts(alns, l, p, r):
|
|
10 """
|
|
11
|
|
12 :param alns: A ContigAlignment object for the contig in question
|
|
13 :param l: Minimum alignment length to consider
|
|
14 :param p: Minimum percentage of a reference chromosome that must be covered to be considered significant
|
|
15 :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant
|
|
16 :return: A list of reference chromosomes to which the input contig significantly aligns to
|
|
17 """
|
|
18 all_intervals = defaultdict(list)
|
|
19
|
|
20 # Iterate over all alignments
|
|
21 for i in range(len(alns.ref_headers)):
|
|
22 if alns.aln_lens[i] < l:
|
|
23 continue
|
|
24
|
|
25 all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i]))
|
|
26
|
|
27 ranges = dict()
|
|
28 for i in all_intervals.keys():
|
|
29 ranges[i] = 0
|
|
30
|
|
31 # For each chromosome, sort the range and get the union interval length.
|
|
32 # Algorithm and pseudocode given by Michael Kirsche
|
|
33 for i in all_intervals.keys():
|
|
34 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
|
|
35 max_end = -1
|
|
36 for j in sorted_intervals:
|
|
37 start_new_terr = max(j[0], max_end)
|
|
38 ranges[i] += max(0, j[1] - start_new_terr)
|
|
39 max_end = max(max_end, j[1])
|
|
40
|
|
41 total_sum = sum(ranges.values())
|
|
42 chimeric_refs = []
|
|
43 for i in ranges.keys():
|
|
44 if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r:
|
|
45 chimeric_refs.append(i)
|
|
46
|
|
47 return chimeric_refs
|
|
48
|
|
49
|
|
50 def cluster_contig_alns(contig, alns, chroms, l):
|
|
51 ctg_alns = alns[contig]
|
|
52
|
|
53 # Filter contig alignments so as to only include alignments to desired chromosomes
|
|
54 ctg_alns.filter_ref_chroms(chroms)
|
|
55
|
|
56 # Intialize the list of start and end positions w.r.t the query
|
|
57 query_pos = []
|
|
58
|
|
59 for i in range(len(ctg_alns.ref_headers)):
|
|
60 query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i))
|
|
61
|
|
62 final_order = [i[2] for i in sorted(query_pos)]
|
|
63 final_refs = []
|
|
64 for i in final_order:
|
|
65 final_refs.append(ctg_alns.ref_headers[i])
|
|
66
|
|
67 return get_borders(final_refs, ctg_alns, final_order)
|
|
68
|
|
69
|
|
70 def get_borders(ordered_refs, alns, order):
|
|
71 borders = [0]
|
|
72 current_ref = ordered_refs[0]
|
|
73 for i in range(1, len(ordered_refs)-1):
|
|
74 if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref:
|
|
75 current_ref = ordered_refs[i]
|
|
76 borders.append(alns.query_ends[order[i-1]])
|
|
77
|
|
78 borders.append(alns.query_lens[0])
|
|
79 intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)]
|
|
80 return intervals
|
|
81
|
|
82
|
|
83 def avoid_gff_intervals(borders, gff_ins):
|
|
84 """
|
|
85 :param borders:
|
|
86 :param gff_ins: all features
|
|
87 :return:
|
|
88 """
|
|
89
|
|
90 # Make an interval tree from the intervals of features associated with this contig
|
|
91 t = IntervalTree()
|
|
92 for line in gff_ins:
|
|
93 # If the interval is one bp long, skip
|
|
94 if line.start == line.end:
|
|
95 continue
|
|
96 t[line.start:line.end] = (line.start, line.end)
|
|
97
|
|
98 new_borders = []
|
|
99 for i in borders:
|
|
100 # Returns an empty set if a border does not fall in any interval.
|
|
101 # Otherwise, returns a list of Interval objects
|
|
102 interval_query = t[i[0]]
|
|
103 if interval_query:
|
|
104 while interval_query:
|
|
105 # Make new border larger than the max end position of all intervals
|
|
106 i = (i[0] + 1, i[1])
|
|
107 interval_query = t[i[0]]
|
|
108
|
|
109 interval_query = t[i[1]]
|
|
110 if interval_query:
|
|
111 while interval_query:
|
|
112 # Make new border larger than the max end position of all intervals
|
|
113 i = (i[0], i[1] + 1)
|
|
114 interval_query = t[i[1]]
|
|
115
|
|
116 new_borders.append(i)
|
|
117
|
|
118 new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])]
|
|
119 return new_borders
|
|
120
|
|
121
|
|
122 def update_gff(features, borders, contig):
|
|
123 # Pop the features to be updated
|
|
124 contig_feats = features.pop(contig)
|
|
125
|
|
126 # Initialize lists for new broken contig headers
|
|
127 for i in range(len(borders)):
|
|
128 features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = []
|
|
129
|
|
130 # Could just use binary search instead of this tree since intervals are non-overlapping.
|
|
131 # but input so small both are trivial
|
|
132 t = IntervalTree()
|
|
133 for i in borders:
|
|
134 t[i[0]:i[1]] = i
|
|
135
|
|
136 for i in contig_feats:
|
|
137 query = t[i.start]
|
|
138 assert len(query) == 1
|
|
139 break_start = list(query)[0].begin
|
|
140 break_end = list(query)[0].end
|
|
141 query_border = (break_start, break_end)
|
|
142 break_number = borders.index(query_border)
|
|
143 i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])
|
|
144 i.start = i.start - break_start
|
|
145 i.end = i.end - break_start
|
|
146 features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i)
|
|
147
|
|
148 return features
|
|
149
|
|
150
|
|
151 def break_contig(contigs_dict, header, break_points):
|
|
152 seq = contigs_dict.pop(header)
|
|
153 test_seq = ''
|
|
154 for i in range(len(break_points)):
|
|
155 contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]]
|
|
156 test_seq += seq[break_points[i][0]: break_points[i][1]]
|
|
157
|
|
158 assert test_seq == seq
|
|
159 return contigs_dict
|
|
160
|
|
161
|
|
162 def get_intra_contigs(alns, l, d, c):
|
|
163 """
|
|
164 Flag contigs as being intrachromosomal chimeras
|
|
165 :param alns:
|
|
166 :param l: Minimum alignment length to consider
|
|
167 :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag
|
|
168 :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag
|
|
169 :return: dict of contigs and break points.
|
|
170 """
|
|
171
|
|
172 # Get only the header to which this contig mostly aligns to and filter out smaller alignments.
|
|
173 uniq_aln = UniqueContigAlignment(alns)
|
|
174 best_header = uniq_aln.ref_chrom
|
|
175 ctg_alns = copy.deepcopy(alns)
|
|
176 ctg_alns.filter_ref_chroms([best_header])
|
|
177 ctg_alns.filter_lengths(l)
|
|
178
|
|
179 # If there are no longer any alignments after length filtering, give up
|
|
180 if not len(ctg_alns.ref_headers):
|
|
181 return
|
|
182
|
|
183 # Sort the alignments with respect to the reference start and end positions.
|
|
184 ctg_alns.sort_by_ref()
|
|
185
|
|
186 # Make a list of distance between alignments
|
|
187 # first with respect to (wrt) the reference.
|
|
188 distances_wrt_ref = []
|
|
189 for i in range(len(ctg_alns.ref_headers)-1):
|
|
190 distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i])
|
|
191
|
|
192 # next, with respect to (wrt) the contig.
|
|
193 distances_wrt_ctg = []
|
|
194 for i in range(len(ctg_alns.ref_headers) - 1):
|
|
195 distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i]))
|
|
196
|
|
197 # Next, assign the following two identities.
|
|
198 # 1. When ordered by the reference, the alignments start at the beginning or the end of the query
|
|
199 # 2. For the alignment which will be broken on, is it on the forward or reverse strand.
|
|
200
|
|
201 is_query_start = True
|
|
202
|
|
203 if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5:
|
|
204 is_query_start = False
|
|
205
|
|
206 # This conditional essentially checks if there are any break points for this contig.
|
|
207 # Returns None otherwise (no return statement)
|
|
208 if distances_wrt_ref:
|
|
209 if max(distances_wrt_ref) > d:
|
|
210 gap_index = distances_wrt_ref.index(max(distances_wrt_ref))
|
|
211 a_alns_strands = ctg_alns.strands[:gap_index]
|
|
212 if is_query_start:
|
|
213 if a_alns_strands.count('-') > a_alns_strands.count('+'):
|
|
214 # The first subcontig is on the reverse strand
|
|
215 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
|
|
216 else:
|
|
217 # The first subcontig is on the forward strand.
|
|
218 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
|
|
219 else:
|
|
220 # The first subcontig starts at the end of the contig
|
|
221 if a_alns_strands.count('-') > a_alns_strands.count('+'):
|
|
222 # The first subcontig is on the reverse strand
|
|
223 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])])
|
|
224 else:
|
|
225 # The first subcontig is on the forward strand.
|
|
226 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
|
|
227
|
|
228 if max(distances_wrt_ctg) > c:
|
|
229 gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1
|
|
230 a_alns_strands = ctg_alns.strands[:gap_index]
|
|
231 if is_query_start:
|
|
232 if a_alns_strands.count('-') > a_alns_strands.count('+'):
|
|
233 # The first subcontig is on the reverse strand
|
|
234 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
|
|
235 else:
|
|
236 # The first subcontig is on the forward strand.
|
|
237 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
|
|
238 else:
|
|
239 # The first subcontig starts at the end of the contig
|
|
240 if a_alns_strands.count('-') > a_alns_strands.count('+'):
|
|
241 # The first subcontig is on the reverse strand
|
|
242 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])])
|
|
243 else:
|
|
244 # The first subcontig is on the forward strand.
|
|
245 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
|
|
246
|