Mercurial > repos > dereeper > ragoo
diff RaGOO/ragoo_utilities/break_chimera.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/ragoo_utilities/break_chimera.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,246 @@ +from collections import defaultdict +import copy + +from intervaltree import IntervalTree + +from ragoo_utilities.ContigAlignment import UniqueContigAlignment + + +def get_ref_parts(alns, l, p, r): + """ + + :param alns: A ContigAlignment object for the contig in question + :param l: Minimum alignment length to consider + :param p: Minimum percentage of a reference chromosome that must be covered to be considered significant + :param r: Minimum number of reference chromosome nucleotides needed to be covered to be considered significant + :return: A list of reference chromosomes to which the input contig significantly aligns to + """ + all_intervals = defaultdict(list) + + # Iterate over all alignments + for i in range(len(alns.ref_headers)): + if alns.aln_lens[i] < l: + continue + + all_intervals[alns.ref_headers[i]].append((alns.ref_starts[i], alns.ref_ends[i])) + + ranges = dict() + for i in all_intervals.keys(): + ranges[i] = 0 + + # For each chromosome, sort the range and get the union interval length. + # Algorithm and pseudocode given by Michael Kirsche + for i in all_intervals.keys(): + sorted_intervals = sorted(all_intervals[i], key=lambda tup: tup[0]) + max_end = -1 + for j in sorted_intervals: + start_new_terr = max(j[0], max_end) + ranges[i] += max(0, j[1] - start_new_terr) + max_end = max(max_end, j[1]) + + total_sum = sum(ranges.values()) + chimeric_refs = [] + for i in ranges.keys(): + if (ranges[i]/total_sum)*100 > float(p) and ranges[i] > r: + chimeric_refs.append(i) + + return chimeric_refs + + +def cluster_contig_alns(contig, alns, chroms, l): + ctg_alns = alns[contig] + + # Filter contig alignments so as to only include alignments to desired chromosomes + ctg_alns.filter_ref_chroms(chroms) + + # Intialize the list of start and end positions w.r.t the query + query_pos = [] + + for i in range(len(ctg_alns.ref_headers)): + query_pos.append((ctg_alns.query_starts[i], ctg_alns.query_ends[i], i)) + + final_order = [i[2] for i in sorted(query_pos)] + final_refs = [] + for i in final_order: + final_refs.append(ctg_alns.ref_headers[i]) + + return get_borders(final_refs, ctg_alns, final_order) + + +def get_borders(ordered_refs, alns, order): + borders = [0] + current_ref = ordered_refs[0] + for i in range(1, len(ordered_refs)-1): + if ordered_refs[i] != current_ref and ordered_refs[i+1] != current_ref: + current_ref = ordered_refs[i] + borders.append(alns.query_ends[order[i-1]]) + + borders.append(alns.query_lens[0]) + intervals = [(borders[i], borders[i+1]) for i in range(len(borders)-1)] + return intervals + + +def avoid_gff_intervals(borders, gff_ins): + """ + :param borders: + :param gff_ins: all features + :return: + """ + + # Make an interval tree from the intervals of features associated with this contig + t = IntervalTree() + for line in gff_ins: + # If the interval is one bp long, skip + if line.start == line.end: + continue + t[line.start:line.end] = (line.start, line.end) + + new_borders = [] + for i in borders: + # Returns an empty set if a border does not fall in any interval. + # Otherwise, returns a list of Interval objects + interval_query = t[i[0]] + if interval_query: + while interval_query: + # Make new border larger than the max end position of all intervals + i = (i[0] + 1, i[1]) + interval_query = t[i[0]] + + interval_query = t[i[1]] + if interval_query: + while interval_query: + # Make new border larger than the max end position of all intervals + i = (i[0], i[1] + 1) + interval_query = t[i[1]] + + new_borders.append(i) + + new_borders = new_borders[:-1] + [(new_borders[-1][0], borders[-1][1])] + return new_borders + + +def update_gff(features, borders, contig): + # Pop the features to be updated + contig_feats = features.pop(contig) + + # Initialize lists for new broken contig headers + for i in range(len(borders)): + features[contig + '_chimera_broken:' + str(borders[i][0]) + '-' + str(borders[i][1])] = [] + + # Could just use binary search instead of this tree since intervals are non-overlapping. + # but input so small both are trivial + t = IntervalTree() + for i in borders: + t[i[0]:i[1]] = i + + for i in contig_feats: + query = t[i.start] + assert len(query) == 1 + break_start = list(query)[0].begin + break_end = list(query)[0].end + query_border = (break_start, break_end) + break_number = borders.index(query_border) + i.seqname = contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1]) + i.start = i.start - break_start + i.end = i.end - break_start + features[contig + '_chimera_broken:' + str(borders[break_number][0]) + '-' + str(borders[break_number][1])].append(i) + + return features + + +def break_contig(contigs_dict, header, break_points): + seq = contigs_dict.pop(header) + test_seq = '' + for i in range(len(break_points)): + contigs_dict[header + '_chimera_broken:' + str(break_points[i][0]) + '-' + str(break_points[i][1])] = seq[break_points[i][0]: break_points[i][1]] + test_seq += seq[break_points[i][0]: break_points[i][1]] + + assert test_seq == seq + return contigs_dict + + +def get_intra_contigs(alns, l, d, c): + """ + Flag contigs as being intrachromosomal chimeras + :param alns: + :param l: Minimum alignment length to consider + :param d: Distance between consecutive adjacent alignments with respect to the reference. If larger than this, flag + :param c: Distance between consecutive adjacent alignments with respect to the query. If larger than this, flag + :return: dict of contigs and break points. + """ + + # Get only the header to which this contig mostly aligns to and filter out smaller alignments. + uniq_aln = UniqueContigAlignment(alns) + best_header = uniq_aln.ref_chrom + ctg_alns = copy.deepcopy(alns) + ctg_alns.filter_ref_chroms([best_header]) + ctg_alns.filter_lengths(l) + + # If there are no longer any alignments after length filtering, give up + if not len(ctg_alns.ref_headers): + return + + # Sort the alignments with respect to the reference start and end positions. + ctg_alns.sort_by_ref() + + # Make a list of distance between alignments + # first with respect to (wrt) the reference. + distances_wrt_ref = [] + for i in range(len(ctg_alns.ref_headers)-1): + distances_wrt_ref.append(ctg_alns.ref_starts[i+1] - ctg_alns.ref_starts[i]) + + # next, with respect to (wrt) the contig. + distances_wrt_ctg = [] + for i in range(len(ctg_alns.ref_headers) - 1): + distances_wrt_ctg.append(abs(ctg_alns.query_starts[i + 1] - ctg_alns.query_starts[i])) + + # Next, assign the following two identities. + # 1. When ordered by the reference, the alignments start at the beginning or the end of the query + # 2. For the alignment which will be broken on, is it on the forward or reverse strand. + + is_query_start = True + + if ctg_alns.query_starts[0] >= ctg_alns.query_lens[0]/5: + is_query_start = False + + # This conditional essentially checks if there are any break points for this contig. + # Returns None otherwise (no return statement) + if distances_wrt_ref: + if max(distances_wrt_ref) > d: + gap_index = distances_wrt_ref.index(max(distances_wrt_ref)) + a_alns_strands = ctg_alns.strands[:gap_index] + if is_query_start: + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig starts at the end of the contig + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index]), (ctg_alns.query_starts[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) + + if max(distances_wrt_ctg) > c: + gap_index = distances_wrt_ctg.index(max(distances_wrt_ctg)) + 1 + a_alns_strands = ctg_alns.strands[:gap_index] + if is_query_start: + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_ends[0]), (ctg_alns.query_ends[0], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_ends[gap_index]), (ctg_alns.query_ends[gap_index], ctg_alns.query_lens[0])]) + else: + # The first subcontig starts at the end of the contig + if a_alns_strands.count('-') > a_alns_strands.count('+'): + # The first subcontig is on the reverse strand + return (ctg_alns.contig, [(0, ctg_alns.query_starts[gap_index-1]), (ctg_alns.query_starts[gap_index-1], ctg_alns.query_lens[0])]) + else: + # The first subcontig is on the forward strand. + return (ctg_alns.contig, [(0, ctg_alns.query_starts[0]), (ctg_alns.query_starts[0], ctg_alns.query_lens[0])]) +