RaGOO/Assemblytics_uniq_anchor.py @ 13:b9a3aeb162ab (draft, default, tip)
comparison against 12:68a9ec9ce51e

author:      dereeper
date:        Mon, 26 Jul 2021 18:22:37 +0000
description: Uploaded
#! /usr/bin/env python


# Author: Maria Nattestad
# Email: mnattest@cshl.edu
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
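#
# Example invocation (hypothetical file names), matching the arguments
# defined in main() below:
#   Assemblytics_uniq_anchor.py --delta out.delta --out prefix \
#       --unique-length 10000 --keep-small-uniques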


import argparse
import gzip
# from intervaltree import *
import time

import numpy as np
import operator



def run(args):
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = args.keep_small_uniques

    # Detect gzipped input by its two magic bytes (0x1f 0x8b) and open in
    # text mode, so the string comparisons below work under Python 3; the
    # original text-mode sniff of the first line can raise UnicodeDecodeError
    # on gzipped input.
    with open(filename, "rb") as ftest:
        magic = ftest.read(2)
    if magic == b"\x1f\x8b":
        f = gzip.open(filename, "rt")
    else:
        f = open(filename)

    linecounter = 0

    current_query_name = ""
    current_header = ""

    lines_by_query = {}
    header_lines_by_query = {}

    before = time.time()
    last = before

    existing_query_names = set()

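    # First pass. A nucmer delta file lists, per reference/query pair, a
    # header line ">ref_name query_name ref_len query_len" followed by one
    # 7-number line per alignment, whose first four fields are
    # ref_start ref_end query_start query_end; the field checks below rely
    # on that layout.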
    for line in f:
        if line[0]==">":

            fields = line.strip().split()
            current_query_name = fields[1]
            current_header = line.strip()
            if current_query_name not in existing_query_names:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
                existing_query_names.add(current_query_name)
        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # sometimes start and end are the other way around, but for this they need to be in order
                query_min = min([int(fields[2]),int(fields[3])])
                query_max = max([int(fields[2]),int(fields[3])])

                ########## TESTING ONLY ###########
                # lines_by_query[current_query_name] = (query_min,query_max)
                # test_list = test_list + [(query_min,query_max)]
                #####################################

                lines_by_query[current_query_name].append((query_min,query_max))
                header_lines_by_query[current_query_name].append(current_header)
                # linecounter += 1
                # if linecounter % 10000000 == 0:
                #     print("%d,%f" % (linecounter, time.time()-last))
                #     last = time.time()


    f.close()


    before = time.time()
    alignments_to_keep = {}
    num_queries = len(lines_by_query)

    # Step size for progress reporting, kept as integer division; vestigial,
    # since the progress printing that used it has been removed.
    num_query_step_to_report = num_queries//100
    if num_queries < 100:
        num_query_step_to_report = num_queries//10
    if num_queries < 10:
        num_query_step_to_report = 1

    query_counter = 0

    for query in lines_by_query:

        ################ TESTING ####################

        # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
        # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)

        # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length)
        # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
        # if intervaltree_filtered_out == planesweep_filtered_out :
        #     num_matches += 1
        # else:
        #     num_mismatches += 1
        #     print("MISMATCH:")
        #     print("number of alignments:", len(lines_by_query[query]))
        #     print("results_intervaltree:")
        #     print(results_intervaltree)
        #     for i in results_intervaltree:
        #         print(lines_by_query[query][i])
        #     print("results_planesweep:")
        #     print(results_planesweep)
        #     for i in results_planesweep:
        #         print(lines_by_query[query][i])
        ################ TESTING ####################

        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length, keep_small_uniques=keep_small_uniques)

        query_counter += 1

    before = time.time()


    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length), 'w')


    # Second pass: reopen the input, reusing the gzip detection from above.
    if magic == b"\x1f\x8b":
        f = gzip.open(filename, "rt")
    else:
        f = open(filename)
    header1 = f.readline()

    fout.write(header1)  # the delta header line naming the reference and query files
    fout.write(f.readline())  # the program line (e.g. "NUCMER")
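
    # This pass copies through only the alignments selected by
    # summarize_planesweep, and at the same time writes the coords
    # table/CSV and collects sequence lengths for the assembly-stats report.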

    linecounter = 0

    # For filtered delta file:
    list_of_alignments_to_keep = []
    alignment_counter = {}
    keep_printing = False

    # For coords:
    current_query_name = ""
    current_query_position = 0
    fcoords_out_tab = open(output_filename + ".coords.tab", 'w')
    fcoords_out_csv = open(output_filename + ".coords.csv", 'w')
    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")


    # For basic assembly stats:
    ref_sequences = set()
    query_sequences = set()
    ref_lengths = []
    query_lengths = []

    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt", "w")

    for line in f:
        linecounter += 1
        if line[0]==">":
            fields = line.strip().split()

            # For delta file output:
            query = fields[1]
            list_of_alignments_to_keep = alignments_to_keep[query]

            header_needed = False
            for index in list_of_alignments_to_keep:
                if line.strip() == header_lines_by_query[query][index]:
                    header_needed = True
            if header_needed == True:
                fout.write(line) # if we have any alignments under this header, print the header
            alignment_counter[query] = alignment_counter.get(query,0)
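            # alignment_counter tracks, per query, the file-order index of the
            # next alignment line, matching the indices that
            # summarize_planesweep returned in the first pass; the line above
            # only initializes the counter the first time a query is seen.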

            # For coords:
            current_reference_name = fields[0][1:]
            current_query_name = fields[1]

            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            # For basic assembly stats:
            if not current_reference_name in ref_sequences:
                ref_lengths.append(current_reference_size)
                ref_sequences.add(current_reference_name)
            if not current_query_name in query_sequences:
                query_lengths.append(current_query_size)
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # For coords:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                if alignment_counter[query] in list_of_alignments_to_keep:
                    fout.write(line)
                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start,query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start,query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
                alignment_counter[query] = alignment_counter[query] + 1

            elif keep_printing == True:
                fout.write(line)

    fcoords_out_tab.close()
    fcoords_out_csv.close()


    ref_lengths.sort()
    query_lengths.sort()

    # Assembly statistics
    ref_lengths = np.array(ref_lengths)
    query_lengths = np.array(query_lengths)

    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
    f_stats_out.write("Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
    f_stats_out.write("Total sequence length: %s\n" % gig_meg(sum(ref_lengths)))
    f_stats_out.write("Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
    f_stats_out.write("Min: %s\n" % gig_meg(np.min(ref_lengths)))
    f_stats_out.write("Max: %s\n" % gig_meg(np.max(ref_lengths)))
    f_stats_out.write("N50: %s\n" % gig_meg(N50(ref_lengths)))
    f_stats_out.write("\n\n")
    f_stats_out.write("Query: %s\n" % header1.split()[1].split("/")[-1])
    f_stats_out.write("Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
    f_stats_out.write("Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
    f_stats_out.write("Mean: %s\n" % gig_meg(np.mean(query_lengths)))
    f_stats_out.write("Min: %s\n" % gig_meg(np.min(query_lengths)))
    f_stats_out.write("Max: %s\n" % gig_meg(np.max(query_lengths)))
    f_stats_out.write("N50: %s\n" % gig_meg(N50(query_lengths)))


    f.close()
    fout.close()
    f_stats_out.close()

def N50(sorted_list):
    # The list should be sorted in increasing order.
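    # N50: the length L such that sequences of length >= L together cover at
    # least half of the total assembly length.
    # Example: N50([2, 3, 4, 5, 6]) == 5, since 6 + 5 >= 20/2.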

    # We flip the list around here so we start with the largest element;
    # the half-total is computed once rather than re-summing every iteration.
    half_total = sum(sorted_list) / 2.0
    cumsum = 0
    for length in sorted_list[::-1]:
        cumsum += length
        if cumsum >= half_total:
            return length


def gig_meg(number, digits = 2):
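    # Format a base-pair count with a human-readable unit.
    # Example: gig_meg(1234567) -> "1.23 Mbp"; gig_meg(512) -> "512 bp".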
    gig = 1000000000.
    meg = 1000000.
    kil = 1000.

    if number > gig:
        return str(round(number/gig, digits)) + " Gbp"
    elif number > meg:
        return str(round(number/meg, digits)) + " Mbp"
    elif number > kil:
        return str(round(number/kil, digits)) + " Kbp"
    else:
        return str(number) + " bp"

def intWithCommas(x):
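    # Render an integer with thousands separators.
    # Example: intWithCommas(1234567) -> "1,234,567".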
    if not isinstance(x, int):
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        return '-' + intWithCommas(-x)
    result = ''
    while x >= 1000:
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)


def summarize_planesweep(lines, unique_length_required, keep_small_uniques=False):
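    """Plane sweep over query coordinates.

    Returns the indices (in input order) of alignments that cover at least
    unique_length_required bp of the query not covered by any other
    alignment; with keep_small_uniques, shorter alignments are also kept
    when they coincide exactly with a coverage-1 interval.
    """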

    alignments_to_keep = []
    # print(len(lines))

    # If no alignments:
    if len(lines)==0:
        return []

    # If only one alignment:
    if len(lines) == 1:
        if keep_small_uniques == True or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        else:
            return []

    starts_and_stops = []
    for query_min,query_max in lines:
        # print(query_min, query_max)
        starts_and_stops.append((query_min,"start"))
        starts_and_stops.append((query_max,"stop"))


    sorted_starts_and_stops = sorted(starts_and_stops, key=operator.itemgetter(0))
    # print(sorted_starts_and_stops)

    current_coverage = 0
    last_position = -1
    # sorted_unique_intervals = []
    sorted_unique_intervals_left = []
    sorted_unique_intervals_right = []
    for pos,change in sorted_starts_and_stops:
        # print(sorted_starts_and_stops[i])
        # pos = sorted_starts_and_stops[i][0]
        # change = sorted_starts_and_stops[i][1]

        # print(pos, change)
        # First alignment only:
        # if last_position == -1:
        #     last_position = pos
        #     continue

        # print(last_position, pos, current_coverage)

        if current_coverage == 1:
            # sorted_unique_intervals.append((last_position,pos))
            sorted_unique_intervals_left.append(last_position)
            sorted_unique_intervals_right.append(pos)

        if change == "start":
            current_coverage += 1
        else:
            current_coverage -= 1
        last_position = pos


    linecounter = 0
    for query_min,query_max in lines:

        i = binary_search(query_min, sorted_unique_intervals_left, 0, len(sorted_unique_intervals_left))

        exact_match = False
        # Bounds check added so i cannot index past the end of the interval lists.
        if i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
            exact_match = True
        sum_uniq = 0
        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
            i += 1

        # print(query_min, query_max, sum_uniq)
        if sum_uniq >= unique_length_required:
            alignments_to_keep.append(linecounter)
        elif keep_small_uniques == True and exact_match == True:
            alignments_to_keep.append(linecounter)
            # print("Keeping small alignment:", query_min, query_max)
            # print(sorted_unique_intervals_left[i-1], sorted_unique_intervals_right[i-1])

        linecounter += 1

    return alignments_to_keep
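
# Worked example for summarize_planesweep (toy numbers, not from the source):
# lines = [(0, 15000), (10000, 30000)] with unique_length_required = 10000
# gives coverage-1 segments (0, 10000) and (15000, 30000), so alignment 0
# anchors 10000 unique bp and alignment 1 anchors 15000; both are kept: [0, 1].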



def binary_search(query, numbers, left, right):
    # Returns index of the matching element, or of the first element to the right

    if left >= right:
        return right
    mid = (right+left)//2


    if query == numbers[mid]:
        return mid
    elif query < numbers[mid]:
        return binary_search(query, numbers, left, mid)
    else: # if query > numbers[mid]:
        return binary_search(query, numbers, mid+1, right)
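
# Example: binary_search(5, [1, 4, 6, 9], 0, 4) returns 2, the index of 6,
# the first element to the right of 5; binary_search(6, ...) returns 2 exactly.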


# def summarize_intervaltree(lines, unique_length_required):

#     alignments_to_keep = []
#     # print(len(lines))

#     if len(lines)==0:
#         return alignments_to_keep

#     if len(lines) == 1:
#         if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
#             return [0]


#     starts_and_stops = []
#     for query_min,query_max in lines:
#         starts_and_stops.append((query_min,query_max))

#     # build full tree
#     tree = IntervalTree.from_tuples(starts_and_stops)


#     # for each interval (keeping the same order as the lines in the input file)
#     line_counter = 0
#     for query_min,query_max in lines:

#         # create a tree object from the current interval
#         this_interval = IntervalTree.from_tuples([(query_min,query_max)])

#         # create a copy of the tree without this one interval
#         rest_of_tree = tree - this_interval

#         # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
#         for other_interval in rest_of_tree:
#             this_interval.chop(other_interval.begin, other_interval.end)

#         # loop through to count the total number of unique basepairs
#         total_unique_length = 0
#         for sub_interval in this_interval:
#             total_unique_length += sub_interval.end - sub_interval.begin

#         # if the total unique length is above our threshold, add the index to the list we are reporting
#         if total_unique_length >= unique_length_required:
#             alignments_to_keep.append(line_counter)
#         line_counter += 1


#     return alignments_to_keep


def main():
    parser = argparse.ArgumentParser(description="Filters alignments in a delta file based on whether each alignment has a unique sequence anchoring it")
    parser.add_argument("--delta", help="delta file", dest="delta", type=str, required=True)
    parser.add_argument("--out", help="prefix for the output files", dest="out", type=str, required=True)
    parser.add_argument("--unique-length", help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000", dest="unique_length", type=int, default=10000)
    parser.add_argument("--keep-small-uniques", help="Keep small alignments (below the unique anchor length) if they are completely unique, without any part of the alignment mapping to multiple places", dest="keep_small_uniques", action="store_true")
    parser.set_defaults(func=run)
    args = parser.parse_args()
    args.func(args)

if __name__=="__main__":
    main()