13
|
1 #!/usr/bin/env python
|
|
2 import re
|
|
3
|
|
4 from intervaltree import IntervalTree
|
|
5
|
|
6 from ragoo_utilities.SeqReader import SeqReader
|
|
7
|
|
8
|
|
9 class BaseSequence(object):
|
|
10
|
|
11 def __init__(self, in_sequence):
|
|
12 if not isinstance(in_sequence, str):
|
|
13 raise AttributeError('Only a string can be used to instantiate this class.')
|
|
14 self.sequence = in_sequence.upper()
|
|
15
|
|
16
|
|
17 class GapSequence(BaseSequence):
|
|
18 def get_gap_coords(self):
|
|
19 """ Find all of the gap string indices for this sequence. """
|
|
20 return re.finditer(r'N+', self.sequence)
|
|
21
|
|
22
|
|
23 def make_gaps_tree(in_file):
|
|
24 # A dictionary to store an interval tree for each chromosome header.
|
|
25 all_trees = dict()
|
|
26 x = SeqReader(in_file)
|
|
27 if in_file.endswith(".gz"):
|
|
28 for header, sequence in x.parse_gzip_fasta():
|
|
29 # Remove the greater than sign and only get first token if delimited by spaces
|
|
30 header = header[1:].split(' ')[0]
|
|
31 all_trees[header] = IntervalTree()
|
|
32 gap_sequence = GapSequence(sequence)
|
|
33 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
|
|
34 for i in all_coordinates:
|
|
35 all_trees[header][i[0]:i[1]] = i
|
|
36 else:
|
|
37 for header, sequence in x.parse_fasta():
|
|
38 # Remove the greater than sign and only get first token if delimited by spaces
|
|
39 header = header[1:].split(' ')[0]
|
|
40 all_trees[header] = IntervalTree()
|
|
41 gap_sequence = GapSequence(sequence)
|
|
42 all_coordinates = [(m.start(0), m.end(0)) for m in gap_sequence.get_gap_coords()]
|
|
43 for i in all_coordinates:
|
|
44 all_trees[header][i[0]:i[1]] = i
|
|
45 return all_trees
|
|
46
|
|
47
|
|
48 def get_query_bed_coords(in_line):
|
|
49 c_loc = [m.start() for m in re.finditer(':', in_line)]
|
|
50 c1, c2 = c_loc[-1], c_loc[-2]
|
|
51 header = in_line[:c2]
|
|
52 L1 = in_line[c2+1:c1].split('-')
|
|
53 start, stop = int(L1[0]), int(L1[1])
|
|
54 return header, start, stop
|
|
55
|
|
56
|
|
57 def make_svs_bed(in_trees_q, in_trees_r):
|
|
58 final_lines = []
|
|
59 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
|
|
60
|
|
61 # Make a new header column for the field I am about to create
|
|
62 header = f.readline().rstrip() + '\tquery_gap_ovlp'
|
|
63 final_lines.append(header)
|
|
64
|
|
65 # Calculate the percentage overlap for each structural variant.
|
|
66 for line in f:
|
|
67 pct_ovlp = 0
|
|
68 ovlp = 0
|
|
69 L1 = line.rstrip().split('\t')
|
|
70 head, start, end = get_query_bed_coords(L1[9])
|
|
71 query = in_trees_q[head][start:end]
|
|
72 if query:
|
|
73 for interval in query:
|
|
74 gap_start, gap_end= interval.begin, interval.end
|
|
75
|
|
76 # Calculate the amount these to intervals overlap
|
|
77 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
|
|
78
|
|
79 pct_ovlp = ovlp/(end-start)
|
|
80
|
|
81 # Add the new value to the original line
|
|
82 L1.append(pct_ovlp)
|
|
83 L1 = [str(i) for i in L1]
|
|
84 final_lines.append('\t'.join(L1))
|
|
85
|
|
86 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
|
|
87 out_file.write('\n'.join(final_lines))
|
|
88
|
|
89 # Now repeat but with respect to the reference
|
|
90 final_lines = []
|
|
91 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'r') as f:
|
|
92 # Make a new header column for the field I am about to create
|
|
93 header = f.readline().rstrip() + '\tref_gap_ovlp'
|
|
94 final_lines.append(header)
|
|
95
|
|
96 # Calculate the percentage overlap for each structural variant.
|
|
97 for line in f:
|
|
98 pct_ovlp = 0
|
|
99 ovlp = 0
|
|
100 L1 = line.rstrip().split('\t')
|
|
101 head, start, end = L1[0], int(L1[1]), int(L1[2])
|
|
102 query = in_trees_r[head][start:end]
|
|
103 if query:
|
|
104 for interval in query:
|
|
105 gap_start, gap_end = interval.begin, interval.end
|
|
106
|
|
107 # Calculate the amount these to intervals overlap
|
|
108 ovlp += max(0, min(end, gap_end) - max(start, gap_start))
|
|
109
|
|
110 pct_ovlp = ovlp / (end - start)
|
|
111
|
|
112 # Add the new value to the original line
|
|
113 L1.append(pct_ovlp)
|
|
114 L1 = [str(i) for i in L1]
|
|
115 final_lines.append('\t'.join(L1))
|
|
116
|
|
117 with open('assemblytics_out.Assemblytics_structural_variants.bed', 'w') as out_file:
|
|
118 out_file.write('\n'.join(final_lines))
|
|
119
|
|
120
|
|
121 def main():
|
|
122 import argparse
|
|
123
|
|
124 parser = argparse.ArgumentParser(description='Annotate the SV bed file with the amount degree in which the SV overlaps with a gap')
|
|
125 parser.add_argument("ref_file", metavar="<ref.fasta>", type=str,help="reference fasta file")
|
|
126 args = parser.parse_args()
|
|
127 ref_file = args.ref_file
|
|
128
|
|
129 # Make a new bed file w.r.t the query
|
|
130 all_trees_q = make_gaps_tree('../ragoo.fasta')
|
|
131 all_trees_r = make_gaps_tree(ref_file)
|
|
132 make_svs_bed(all_trees_q, all_trees_r)
|
|
133
|
|
134
|
|
135 if __name__ == "__main__":
|
|
136 main()
|