13
|
1 #! /usr/bin/env python
|
|
2
|
|
3
|
|
4 # Author: Maria Nattestad
|
|
5 # Email: mnattest@cshl.edu
|
|
6 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
|
|
7
|
|
8
|
|
9 import argparse
|
|
10 import gzip
|
|
11 # from intervaltree import *
|
|
12 import time
|
|
13
|
|
14 import numpy as np
|
|
15 import operator
|
|
16
|
|
17
|
|
18
|
|
def run(args):
    """Filter a MUMmer delta file, keeping only alignments anchored by unique query sequence.

    Reads the delta file twice. Pass 1 collects, per query sequence, the
    (min, max) query coordinates of every alignment plus the ">" header line
    each alignment appeared under. summarize_planesweep() then decides which
    alignment indices to keep per query. Pass 2 re-reads the file and writes
    the surviving records, alongside coords tables and basic assembly stats.

    Outputs (all prefixed by args.out):
      - .Assemblytics.unique_length_filtered_l<N>.delta : filtered delta file
      - .coords.tab / .coords.csv                       : alignment coordinates
      - .Assemblytics_assembly_stats.txt                : reference/query stats
    """
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = args.keep_small_uniques

    f = open(filename)
    header1 = f.readline()
    # Detect gzip input by its two-byte magic number and reopen transparently.
    # NOTE(review): comparing a text-mode read against "\x1f\x8b" only works
    # under Python 2 (str == bytes); confirm before porting to Python 3.
    if header1[0:2]=="\x1f\x8b":
        f.close()
        f = gzip.open(filename)

    linecounter = 0

    current_query_name = ""
    current_header = ""

    # Pass 1 accumulators, keyed by query sequence name.
    lines_by_query = {}
    header_lines_by_query = {}

    before = time.time()
    last = before

    existing_query_names = set()

    for line in f:
        if line[0]==">":
            # Header line: ">ref_name query_name ref_len query_len".
            fields = line.strip().split()
            current_query_name = fields[1]
            current_header = line.strip()
            if current_query_name not in existing_query_names:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
                existing_query_names.add(current_query_name)
        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # sometimes start and end are the other way around, but for this they need to be in order
                query_min = min([int(fields[2]),int(fields[3])])
                query_max = max([int(fields[2]),int(fields[3])])

                lines_by_query[current_query_name].append((query_min,query_max))
                header_lines_by_query[current_query_name].append(current_header)

    f.close()

    before = time.time()
    alignments_to_keep = {}
    num_queries = len(lines_by_query)

    # Progress-report step size; currently unused (leftover from removed
    # progress-printing code).
    num_query_step_to_report = num_queries/100
    if num_queries < 100:
        num_query_step_to_report = num_queries/10
    if num_queries < 10:
        num_query_step_to_report = 1

    query_counter = 0

    # Decide, per query, which alignment indices (in file order) survive.
    for query in lines_by_query:
        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length,keep_small_uniques=keep_small_uniques)

        query_counter += 1

    before = time.time()

    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')

    # Pass 2: re-open the input, again handling the gzip case.
    f = open(filename)
    header1 = f.readline()
    if header1[0:2]=="\x1f\x8b":
        f.close()
        f = gzip.open(filename)
        header1 = f.readline()

    fout.write(header1) # write the first line that we read already
    fout.write(f.readline())

    linecounter = 0

    # For filtered delta file:
    list_of_alignments_to_keep = []
    alignment_counter = {}
    keep_printing = False

    # For coords:
    current_query_name = ""
    current_query_position = 0
    fcoords_out_tab = open(output_filename + ".coords.tab",'w')
    fcoords_out_csv = open(output_filename + ".coords.csv",'w')
    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")

    # For basic assembly stats:
    ref_sequences = set()
    query_sequences = set()
    ref_lengths = []
    query_lengths = []

    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")

    for line in f:
        linecounter += 1
        if line[0]==">":
            fields = line.strip().split()

            # For delta file output:
            query = fields[1]
            list_of_alignments_to_keep = alignments_to_keep[query]

            # Emit the header only if at least one alignment kept for this
            # query was recorded under exactly this header line.
            header_needed = False
            for index in list_of_alignments_to_keep:
                if line.strip() == header_lines_by_query[query][index]:
                    header_needed = True
            if header_needed == True:
                fout.write(line) # if we have any alignments under this header, print the header
            # Initialize this query's running alignment index on first sight.
            alignment_counter[query] = alignment_counter.get(query,0)

            # For coords:
            current_reference_name = fields[0][1:]
            current_query_name = fields[1]

            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            # For basic assembly stats: record each sequence's length once.
            if not current_reference_name in ref_sequences:
                ref_lengths.append(current_reference_size)
                ref_sequences.add(current_reference_name)
            if not current_query_name in query_sequences:
                query_lengths.append(current_query_size)
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # Alignment record line. For coords:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                if alignment_counter[query] in list_of_alignments_to_keep:
                    fout.write(line)
                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start, query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
                alignment_counter[query] = alignment_counter[query] + 1

            elif keep_printing == True:
                # Continuation lines (indel offsets) of a kept alignment.
                fout.write(line)

    fcoords_out_tab.close()
    fcoords_out_csv.close()

    ref_lengths.sort()
    query_lengths.sort()

    # Assembly statistics
    ref_lengths = np.array(ref_lengths)
    query_lengths = np.array(query_lengths)

    # header1 is the delta file's first line: "<ref_path> <query_path>";
    # keep only the basename of each path for the report.
    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(ref_lengths)))
    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(ref_lengths)))
    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(ref_lengths)))
    f_stats_out.write( "N50: %s\n" % gig_meg(N50(ref_lengths)))
    f_stats_out.write( "\n\n")
    f_stats_out.write( "Query: %s\n" % header1.split()[1].split("/")[-1])
    f_stats_out.write( "Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
    f_stats_out.write( "Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
    f_stats_out.write( "Mean: %s\n" % gig_meg(np.mean(query_lengths)))
    f_stats_out.write( "Min: %s\n" % gig_meg(np.min(query_lengths)))
    f_stats_out.write( "Max: %s\n" % gig_meg(np.max(query_lengths)))
    f_stats_out.write( "N50: %s\n" % gig_meg(N50(query_lengths)))

    f.close()
    fout.close()
    f_stats_out.close()
|
|
243
|
|
def N50(sorted_list):
    """Return the N50 of a list of sequence lengths sorted in increasing order.

    The N50 is the length L such that sequences of length >= L together cover
    at least half of the total sequence length.

    Returns None for an empty input (no length can satisfy the condition).
    """
    # Hoist the target out of the loop: the original recomputed
    # sum(sorted_list) on every iteration, making the scan O(n^2).
    half_total = sum(sorted_list)/2

    # Walk from the largest element down, accumulating until we reach half.
    cumsum = 0
    for length in sorted_list[::-1]:
        cumsum += length
        if cumsum >= half_total:
            return length
|
|
253
|
|
254
|
|
def gig_meg(number, digits=2):
    """Format a base-pair count with a human-readable Gbp/Mbp/Kbp/bp unit.

    Picks the largest unit the value strictly exceeds; values of 1000 or
    less are reported as plain "bp". *digits* controls rounding precision.
    """
    # Unit thresholds, largest first; fall through to plain "bp".
    scales = (
        (1000000000., " Gbp"),
        (1000000., " Mbp"),
        (1000., " Kbp"),
    )
    for threshold, suffix in scales:
        if number > threshold:
            return str(round(number / threshold, digits)) + suffix
    return str(number) + " bp"
|
|
268
|
|
def intWithCommas(x):
    """Return integer *x* formatted with commas as thousands separators.

    Raises:
        TypeError: if x is not a plain integer.
    """
    # The original `type(x) not in [type(0)]` check rejected bool (a subclass
    # of int), so exclude it explicitly to keep the contract unchanged while
    # using the idiomatic isinstance test.
    if not isinstance(x, int) or isinstance(x, bool):
        raise TypeError("Parameter must be an integer.")
    # The stdlib format spec handles grouping and negative signs directly,
    # replacing the hand-rolled divmod loop.
    return "{:,}".format(x)
|
|
279
|
|
280
|
|
def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):
    """Select alignments with enough unique query coverage via a plane sweep.

    Args:
        lines: list of (query_min, query_max) coordinate pairs, one per
            alignment, in the same order as the input file.
        unique_length_required: minimum number of query basepairs covered by
            exactly one alignment for that alignment to be kept.
        keep_small_uniques: if True, also keep an alignment shorter than the
            threshold when its interval is entirely unique (exactly matches a
            coverage-1 region).

    Returns:
        List of indices into *lines* for the alignments to keep.
    """
    alignments_to_keep = []

    # If no alignments:
    if len(lines)==0:
        return []

    # If only one alignment:
    if len(lines) == 1:
        if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        else:
            return []

    # Build the sweep events: every interval contributes a "start" and a
    # "stop" event at its endpoints.
    starts_and_stops = []
    for query_min,query_max in lines:
        starts_and_stops.append((query_min,"start"))
        starts_and_stops.append((query_max,"stop"))

    sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))

    # Sweep left to right, tracking how many intervals overlap the current
    # position. Whenever coverage was exactly 1 between two consecutive
    # events, that span is "unique" sequence; record it in two parallel
    # sorted lists (left endpoints, right endpoints).
    current_coverage = 0
    last_position = -1
    sorted_unique_intervals_left = []
    sorted_unique_intervals_right = []
    for pos,change in sorted_starts_and_stops:
        if current_coverage == 1:
            sorted_unique_intervals_left.append(last_position)
            sorted_unique_intervals_right.append(pos)

        if change == "start":
            current_coverage += 1
        else:
            current_coverage -= 1
        last_position = pos

    # For each alignment, sum the unique spans that fall entirely inside it.
    linecounter = 0
    for query_min,query_max in lines:

        # Index of the first unique span starting at or after query_min.
        # NOTE(review): binary_search can return len(list) when query_min
        # exceeds every left endpoint, which would make the indexing below
        # raise IndexError; presumably every alignment overlaps at least one
        # unique span in practice — confirm against real inputs.
        i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))

        # An exact match means the alignment is entirely unique (used for
        # the keep_small_uniques exception below).
        exact_match = False
        if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
            exact_match = True
        sum_uniq = 0
        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
            i += 1

        if sum_uniq >= unique_length_required:
            alignments_to_keep.append(linecounter)
        elif keep_small_uniques == True and exact_match == True:
            alignments_to_keep.append(linecounter)

        linecounter += 1

    return alignments_to_keep
|
|
361
|
|
362
|
|
363
|
|
def binary_search(query, numbers, left, right):
    """Binary search over the sorted list *numbers* within [left, right).

    Returns the index of an element equal to *query*, or, when absent, the
    index of the first element to its right (which may equal *right* when
    *query* is greater than everything in range).
    """
    # Iterative form of the halving search: narrow [left, right) until the
    # window is empty or the midpoint hits the query exactly.
    while left < right:
        mid = (left + right) // 2
        value = numbers[mid]
        if value == query:
            return mid
        if query < value:
            right = mid
        else:
            left = mid + 1
    return right
|
|
378
|
|
379
|
|
380 # def summarize_intervaltree(lines, unique_length_required):
|
|
381
|
|
382 # alignments_to_keep = []
|
|
383 # # print len(lines)
|
|
384
|
|
385 # if len(lines)==0:
|
|
386 # return alignments_to_keep
|
|
387
|
|
388 # if len(lines) == 1:
|
|
389 # if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
|
|
390 # return [0]
|
|
391
|
|
392
|
|
393 # starts_and_stops = []
|
|
394 # for query_min,query_max in lines:
|
|
395 # starts_and_stops.append((query_min,query_max))
|
|
396
|
|
397 # # build full tree
|
|
398 # tree = IntervalTree.from_tuples(starts_and_stops)
|
|
399
|
|
400
|
|
401 # # for each interval (keeping the same order as the lines in the input file)
|
|
402 # line_counter = 0
|
|
403 # for query_min,query_max in lines:
|
|
404
|
|
405 # # create a tree object from the current interval
|
|
406 # this_interval = IntervalTree.from_tuples([(query_min,query_max)])
|
|
407
|
|
408 # # create a copy of the tree without this one interval
|
|
409 # rest_of_tree = tree - this_interval
|
|
410
|
|
411 # # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
|
|
412 # for other_interval in rest_of_tree:
|
|
413 # this_interval.chop(other_interval.begin, other_interval.end)
|
|
414
|
|
415 # # loop through to count the total number of unique basepairs
|
|
416 # total_unique_length = 0
|
|
417 # for sub_interval in this_interval:
|
|
418 # total_unique_length += sub_interval.end - sub_interval.begin
|
|
419
|
|
420 # # if the total unique length is above our threshold, add the index to the list we are reporting
|
|
421 # if total_unique_length >= unique_length_required:
|
|
422 # alignments_to_keep.append(line_counter)
|
|
423 # line_counter += 1
|
|
424
|
|
425
|
|
426 # return alignments_to_keep
|
|
427
|
|
428
|
|
def main():
    """Command-line entry point: parse arguments and dispatch to run()."""
    parser = argparse.ArgumentParser(
        description="Filters alignments in delta file based whether each alignment has a unique sequence anchoring it")
    parser.add_argument("--delta", dest="delta", type=str, required=True,
                        help="delta file")
    parser.add_argument("--out", dest="out", type=str, required=True,
                        help="output file")
    parser.add_argument("--unique-length", dest="unique_length", type=int, default=10000,
                        help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000")
    parser.add_argument("--keep-small-uniques", dest="keep_small_uniques", action="store_true",
                        help="Keep small aligments (below the unique anchor length) if they are completely unique without any part of the alignment mapping multiple places")
    parser.set_defaults(func=run)
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
|