comparison RaGOO/ragoo_utilities/break_chimera.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 from collections import defaultdict
2 import copy
3
4 from intervaltree import IntervalTree
5
6 from ragoo_utilities.ContigAlignment import UniqueContigAlignment
7
8
9 def get_ref_parts(alns, l, p, r):
10 """
11
12 :param alns: A ContigAlignment object for the contig in question
13 :param l: Minimum alignment length to consider
14 :param p: Minimum percentage of a reference chromosome that must be covered to be considered significant
15 :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant
16 :return: A list of reference chromosomes to which the input contig significantly aligns to
17 """
18 all_intervals = defaultdict(list)
19
20 # Iterate over all alignments
21 for i in range(len(alns.ref_headers)):
22 if alns.aln_lens[i] < l:
23 continue
24
25 all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i]))
26
27 ranges = dict()
28 for i in all_intervals.keys():
29 ranges[i] = 0
30
31 # For each chromosome, sort the range and get the union interval length.
32 # Algorithm and pseudocode given by Michael Kirsche
33 for i in all_intervals.keys():
34 sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0])
35 max_end = -1
36 for j in sorted_intervals:
37 start_new_terr = max(j[0], max_end)
38 ranges[i] += max(0, j[1] - start_new_terr)
39 max_end = max(max_end, j[1])
40
41 total_sum = sum(ranges.values())
42 chimeric_refs = []
43 for i in ranges.keys():
44 if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r:
45 chimeric_refs.append(i)
46
47 return chimeric_refs
48
49
50 def cluster_contig_alns(contig, alns, chroms, l):
51 ctg_alns = alns[contig]
52
53 # Filter contig alignments so as to only include alignments to desired chromosomes
54 ctg_alns.filter_ref_chroms(chroms)
55
56 # Intialize the list of start and end positions w.r.t the query
57 query_pos = []
58
59 for i in range(len(ctg_alns.ref_headers)):
60 query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i))
61
62 final_order = [i[2] for i in sorted(query_pos)]
63 final_refs = []
64 for i in final_order:
65 final_refs.append(ctg_alns.ref_headers[i])
66
67 return get_borders(final_refs, ctg_alns, final_order)
68
69
70 def get_borders(ordered_refs, alns, order):
71 borders = [0]
72 current_ref = ordered_refs[0]
73 for i in range(1, len(ordered_refs)-1):
74 if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref:
75 current_ref = ordered_refs[i]
76 borders.append(alns.query_ends[order[i-1]])
77
78 borders.append(alns.query_lens[0])
79 intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)]
80 return intervals
81
82
83 def avoid_gff_intervals(borders, gff_ins):
84 """
85 :param borders:
86 :param gff_ins: all features
87 :return:
88 """
89
90 # Make an interval tree from the intervals of features associated with this contig
91 t = IntervalTree()
92 for line in gff_ins:
93 # If the interval is one bp long, skip
94 if line.start == line.end:
95 continue
96 t[line.start:line.end] = (line.start, line.end)
97
98 new_borders = []
99 for i in borders:
100 # Returns an empty set if a border does not fall in any interval.
101 # Otherwise, returns a list of Interval objects
102 interval_query = t[i[0]]
103 if interval_query:
104 while interval_query:
105 # Make new border larger than the max end position of all intervals
106 i = (i[0] + 1, i[1])
107 interval_query = t[i[0]]
108
109 interval_query = t[i[1]]
110 if interval_query:
111 while interval_query:
112 # Make new border larger than the max end position of all intervals
113 i = (i[0], i[1] + 1)
114 interval_query = t[i[1]]
115
116 new_borders.append(i)
117
118 new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])]
119 return new_borders
120
121
122 def update_gff(features, borders, contig):
123 # Pop the features to be updated
124 contig_feats = features.pop(contig)
125
126 # Initialize lists for new broken contig headers
127 for i in range(len(borders)):
128 features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = []
129
130 # Could just use binary search instead of this tree since intervals are non-overlapping.
131 # but input so small both are trivial
132 t = IntervalTree()
133 for i in borders:
134 t[i[0]:i[1]] = i
135
136 for i in contig_feats:
137 query = t[i.start]
138 assert len(query) == 1
139 break_start = list(query)[0].begin
140 break_end = list(query)[0].end
141 query_border = (break_start, break_end)
142 break_number = borders.index(query_border)
143 i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])
144 i.start = i.start - break_start
145 i.end = i.end - break_start
146 features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i)
147
148 return features
149
150
151 def break_contig(contigs_dict, header, break_points):
152 seq = contigs_dict.pop(header)
153 test_seq = ''
154 for i in range(len(break_points)):
155 contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]]
156 test_seq += seq[break_points[i][0]: break_points[i][1]]
157
158 assert test_seq == seq
159 return contigs_dict
160
161
162 def get_intra_contigs(alns, l, d, c):
163 """
164 Flag contigs as being intrachromosomal chimeras
165 :param alns:
166 :param l: Minimum alignment length to consider
167 :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag
168 :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag
169 :return: dict of contigs and break points.
170 """
171
172 # Get only the header to which this contig mostly aligns to and filter out smaller alignments.
173 uniq_aln = UniqueContigAlignment(alns)
174 best_header = uniq_aln.ref_chrom
175 ctg_alns = copy.deepcopy(alns)
176 ctg_alns.filter_ref_chroms([best_header])
177 ctg_alns.filter_lengths(l)
178
179 # If there are no longer any alignments after length filtering, give up
180 if not len(ctg_alns.ref_headers):
181 return
182
183 # Sort the alignments with respect to the reference start and end positions.
184 ctg_alns.sort_by_ref()
185
186 # Make a list of distance between alignments
187 # first with respect to (wrt) the reference.
188 distances_wrt_ref = []
189 for i in range(len(ctg_alns.ref_headers)-1):
190 distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i])
191
192 # next, with respect to (wrt) the contig.
193 distances_wrt_ctg = []
194 for i in range(len(ctg_alns.ref_headers) - 1):
195 distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i]))
196
197 # Next, assign the following two identities.
198 # 1. When ordered by the reference, the alignments start at the beginning or the end of the query
199 # 2. For the alignment which will be broken on, is it on the forward or reverse strand.
200
201 is_query_start = True
202
203 if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5:
204 is_query_start = False
205
206 # This conditional essentially checks if there are any break points for this contig.
207 # Returns None otherwise (no return statement)
208 if distances_wrt_ref:
209 if max(distances_wrt_ref) > d:
210 gap_index = distances_wrt_ref.index(max(distances_wrt_ref))
211 a_alns_strands = ctg_alns.strands[:gap_index]
212 if is_query_start:
213 if a_alns_strands.count('-') > a_alns_strands.count('+'):
214 # The first subcontig is on the reverse strand
215 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
216 else:
217 # The first subcontig is on the forward strand.
218 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
219 else:
220 # The first subcontig starts at the end of the contig
221 if a_alns_strands.count('-') > a_alns_strands.count('+'):
222 # The first subcontig is on the reverse strand
223 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])])
224 else:
225 # The first subcontig is on the forward strand.
226 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
227
228 if max(distances_wrt_ctg) > c:
229 gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1
230 a_alns_strands = ctg_alns.strands[:gap_index]
231 if is_query_start:
232 if a_alns_strands.count('-') > a_alns_strands.count('+'):
233 # The first subcontig is on the reverse strand
234 return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])])
235 else:
236 # The first subcontig is on the forward strand.
237 return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])])
238 else:
239 # The first subcontig starts at the end of the contig
240 if a_alns_strands.count('-') > a_alns_strands.count('+'):
241 # The first subcontig is on the reverse strand
242 return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])])
243 else:
244 # The first subcontig is on the forward strand.
245 return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])])
246