Mercurial > repos > dereeper > ragoo
comparison RaGOO/filter_gap_SVs.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 #!/usr/bin/env python | |
2 import re | |
3 | |
4 from intervaltree import IntervalTree | |
5 | |
6 from ragoo_utilities.SeqReader import SeqReader | |
7 | |
8 | |
9 class BaseSequence(object): | |
10 | |
11 def __init__(self, in_sequence): | |
12 if not isinstance(in_sequence, str): | |
13 raise AttributeError('Only a string can be used to instantiate this class.') | |
14 self.sequence = in_sequence.upper() | |
15 | |
16 | |
17 class GapSequence(BaseSequence): | |
18 def get_gap_coords(self): | |
19 """ Find all of the gap string indices for this sequence. """ | |
20 return re.finditer(r'N+', self.sequence) | |
21 | |
22 | |
23 def make_gaps_tree(in_file): | |
24 # A dictionary to store an interval tree for each chromosome header. | |
25 all_trees = dict() | |
26 x = SeqReader(in_file) | |
27 if in_file.endswith(".gz"): | |
28 for header, sequence in x.parse_gzip_fasta(): | |
29 # Remove the greater than sign and only get first token if delimited by spaces | |
30 header = header[1:].split(' ')[0] | |
31 all_trees[header] = IntervalTree() | |
32 gap_sequence = GapSequence(sequence) | |
33 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()] | |
34 for i in all_coordinates: | |
35 all_trees[header][i[0]:i[1]] = i | |
36 else: | |
37 for header, sequence in x.parse_fasta(): | |
38 # Remove the greater than sign and only get first token if delimited by spaces | |
39 header = header[1:].split(' ')[0] | |
40 all_trees[header] = IntervalTree() | |
41 gap_sequence = GapSequence(sequence) | |
42 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()] | |
43 for i in all_coordinates: | |
44 all_trees[header][i[0]:i[1]] = i | |
45 return all_trees | |
46 | |
47 | |
48 def get_query_bed_coords(in_line): | |
49 c_loc = [m.start() for m in re.finditer(':', in_line)] | |
50 c1, c2 = c_loc[-1], c_loc[-2] | |
51 header = in_line[:c2] | |
52 L1 = in_line[c2+1:c1].split('-') | |
53 start, stop = int(L1[0]), int(L1[1]) | |
54 return header, start, stop | |
55 | |
56 | |
57 def make_svs_bed(in_trees_q, in_trees_r): | |
58 final_lines = [] | |
59 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f: | |
60 | |
61 # Make a new header column for the field I am about to create | |
62 header = f.readline().rstrip() + '\tquery_gap_ovlp' | |
63 final_lines.append(header) | |
64 | |
65 # Calculate the percentage overlap for each structural variant. | |
66 for line in f: | |
67 pct_ovlp = 0 | |
68 ovlp = 0 | |
69 L1 = line.rstrip().split('\t') | |
70 head, start, end = get_query_bed_coords(L1[9]) | |
71 query = in_trees_q[head][start:end] | |
72 if query: | |
73 for interval in query: | |
74 gap_start, gap_end= interval.begin, interval.end | |
75 | |
76 # Calculate the amount these to intervals overlap | |
77 ovlp += max(0, min(end, gap_end) - max(start, gap_start)) | |
78 | |
79 pct_ovlp = ovlp/(end-start) | |
80 | |
81 # Add the new value to the original line | |
82 L1.append(pct_ovlp) | |
83 L1 = [str(i) for i in L1] | |
84 final_lines.append('\t'.join(L1)) | |
85 | |
86 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file: | |
87 out_file.write('\n'.join(final_lines)) | |
88 | |
89 # Now repeat but with respect to the reference | |
90 final_lines = [] | |
91 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f: | |
92 # Make a new header column for the field I am about to create | |
93 header = f.readline().rstrip() + '\tref_gap_ovlp' | |
94 final_lines.append(header) | |
95 | |
96 # Calculate the percentage overlap for each structural variant. | |
97 for line in f: | |
98 pct_ovlp = 0 | |
99 ovlp = 0 | |
100 L1 = line.rstrip().split('\t') | |
101 head, start, end = L1[0], int(L1[1]), int(L1[2]) | |
102 query = in_trees_r[head][start:end] | |
103 if query: | |
104 for interval in query: | |
105 gap_start, gap_end = interval.begin, interval.end | |
106 | |
107 # Calculate the amount these to intervals overlap | |
108 ovlp += max(0, min(end, gap_end) - max(start, gap_start)) | |
109 | |
110 pct_ovlp = ovlp / (end - start) | |
111 | |
112 # Add the new value to the original line | |
113 L1.append(pct_ovlp) | |
114 L1 = [str(i) for i in L1] | |
115 final_lines.append('\t'.join(L1)) | |
116 | |
117 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file: | |
118 out_file.write('\n'.join(final_lines)) | |
119 | |
120 | |
121 def main(): | |
122 import argparse | |
123 | |
124 parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap') | |
125 parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file") | |
126 args = parser.parse_args() | |
127 ref_file = args.ref_file | |
128 | |
129 # Make a new bed file w.r.t the query | |
130 all_trees_q = make_gaps_tree('../ragoo.fasta') | |
131 all_trees_r = make_gaps_tree(ref_file) | |
132 make_svs_bed(all_trees_q, all_trees_r) | |
133 | |
134 | |
135 if __name__ == "__main__": | |
136 main() |