RaGOO/Assemblytics_uniq_anchor.py @ 13:b9a3aeb162ab (draft, default, tip)

| author | dereeper |
|---|---|
| date | Mon, 26 Jul 2021 18:22:37 +0000 |
| description | Uploaded |
comparison: 12:68a9ec9ce51e → 13:b9a3aeb162ab
#! /usr/bin/env python


# Author: Maria Nattestad
# Email: mnattest@cshl.edu
# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
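# Example invocation (illustrative; the flags are defined in main() below,
# and the file names are hypothetical):
#   python Assemblytics_uniq_anchor.py --delta alignments.delta \
#       --out my_prefix --unique-length 10000 --keep-small-uniques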


import argparse
import gzip
# from intervaltree import *
import time

import numpy as np
import operator


def run(args):
    filename = args.delta
    unique_length = args.unique_length
    output_filename = args.out
    keep_small_uniques = args.keep_small_uniques

    # Sniff the first two bytes for the gzip magic number (0x1f 0x8b) so both
    # plain and gzip-compressed delta files are accepted; sniffing in binary
    # mode avoids decoding errors on compressed input.
    with open(filename, "rb") as ftest:
        is_gzipped = ftest.read(2) == b"\x1f\x8b"
    if is_gzipped:
        f = gzip.open(filename, "rt")
    else:
        f = open(filename)
    header1 = f.readline()

    linecounter = 0

    current_query_name = ""
    current_header = ""

    lines_by_query = {}
    header_lines_by_query = {}

    before = time.time()
    last = before

    existing_query_names = set()

    for line in f:
        if line[0]==">":

            fields = line.strip().split()
            current_query_name = fields[1]
            current_header = line.strip()
            if current_query_name not in existing_query_names:
                lines_by_query[current_query_name] = []
                header_lines_by_query[current_query_name] = []
                existing_query_names.add(current_query_name)
        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # sometimes start and end are the other way around, but for this they need to be in order
                query_min = min([int(fields[2]),int(fields[3])])
                query_max = max([int(fields[2]),int(fields[3])])

                ########## TESTING ONLY ###########
                # lines_by_query[current_query_name] = (query_min,query_max)
                # test_list = test_list + [(query_min,query_max)]
                #####################################

                lines_by_query[current_query_name].append((query_min,query_max))
                header_lines_by_query[current_query_name].append(current_header)
        # linecounter += 1
        # if linecounter % 10000000 == 0:
        #     print "%d,%f" % (linecounter, time.time()-last)
        #     last = time.time()


    f.close()

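    # For reference, a MUMmer delta record looks like this (paths and numbers
    # illustrative):
    #   /path/to/reference.fa /path/to/assembly.fa
    #   NUCMER
    #   >chr1 contig_42 50000 12000
    #   1 9000 101 9100 12 12 0
    #   0
    # Header lines (">") carry the reference name, query name, and their
    # lengths; alignment lines start with ref_start ref_end query_start
    # query_end, and only the query interval is kept above, normalized so
    # that min <= max.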

    before = time.time()
    alignments_to_keep = {}
    num_queries = len(lines_by_query)

    # Step size for progress reporting (integer division); leftover from
    # progress printing that has been commented out, so currently unused.
    num_query_step_to_report = num_queries//100
    if num_queries < 100:
        num_query_step_to_report = num_queries//10
    if num_queries < 10:
        num_query_step_to_report = 1

    query_counter = 0

    for query in lines_by_query:

        ################ TESTING ####################

        # results_intervaltree = summarize_intervaltree(lines_by_query[query], unique_length_required = unique_length)
        # intervaltree_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_intervaltree)

        # results_planesweep = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length)
        # planesweep_filtered_out = set(range(0,len(lines_by_query[query]))) - set(results_planesweep)
        # if intervaltree_filtered_out == planesweep_filtered_out :
        #     num_matches += 1
        # else:
        #     num_mismatches += 1
        #     print "MISMATCH:"
        #     print "number of alignments:", len(lines_by_query[query])
        #     print "results_intervaltree:"
        #     print results_intervaltree
        #     for i in results_intervaltree:
        #         print lines_by_query[query][i]
        #     print "results_planesweep:"
        #     print results_planesweep
        #     for i in results_planesweep:
        #         print lines_by_query[query][i]
        ################ TESTING ####################

        alignments_to_keep[query] = summarize_planesweep(lines_by_query[query], unique_length_required = unique_length, keep_small_uniques=keep_small_uniques)

        query_counter += 1

    before = time.time()


    fout = open(output_filename + ".Assemblytics.unique_length_filtered_l%d.delta" % (unique_length),'w')


    # Reopen the input for a second pass, reusing the gzip sniff result from above.
    if is_gzipped:
        f = gzip.open(filename, "rt")
    else:
        f = open(filename)
    header1 = f.readline()

    fout.write(header1)      # the paths header line that we read already
    fout.write(f.readline()) # the program line (e.g. "NUCMER")

    linecounter = 0

    # For filtered delta file:
    list_of_alignments_to_keep = []
    alignment_counter = {}
    keep_printing = False

    # For coords:
    current_query_name = ""
    current_query_position = 0
    fcoords_out_tab = open(output_filename + ".coords.tab",'w')
    fcoords_out_csv = open(output_filename + ".coords.csv",'w')
    fcoords_out_csv.write("ref_start,ref_end,query_start,query_end,ref_length,query_length,ref,query,tag\n")
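    # Note: the .coords.tab file lists only the alignments that pass the
    # unique-anchor filter, while the .coords.csv file lists every alignment,
    # tagged "unique" or "repetitive".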


    # For basic assembly stats:
    ref_sequences = set()
    query_sequences = set()
    ref_lengths = []
    query_lengths = []

    f_stats_out = open(output_filename + ".Assemblytics_assembly_stats.txt","w")

    for line in f:
        linecounter += 1
        if line[0]==">":
            fields = line.strip().split()

            # For delta file output:
            query = fields[1]
            list_of_alignments_to_keep = alignments_to_keep[query]

            header_needed = False
            for index in list_of_alignments_to_keep:
                if line.strip() == header_lines_by_query[query][index]:
                    header_needed = True
            if header_needed:
                fout.write(line) # if we have any alignments under this header, print the header
            alignment_counter[query] = alignment_counter.get(query,0)

            # For coords:
            current_reference_name = fields[0][1:]
            current_query_name = fields[1]

            current_reference_size = int(fields[2])
            current_query_size = int(fields[3])

            # For basic assembly stats:
            if current_reference_name not in ref_sequences:
                ref_lengths.append(current_reference_size)
                ref_sequences.add(current_reference_name)
            if current_query_name not in query_sequences:
                query_lengths.append(current_query_size)
                query_sequences.add(current_query_name)

        else:
            fields = line.strip().split()
            if len(fields) > 4:
                # For coords:
                ref_start = int(fields[0])
                ref_end = int(fields[1])
                query_start = int(fields[2])
                query_end = int(fields[3])
                csv_tag = "repetitive"
                if alignment_counter[query] in list_of_alignments_to_keep:
                    fout.write(line)
                    fcoords_out_tab.write("\t".join(map(str,[ref_start,ref_end,query_start,query_end,current_reference_size,current_query_size,current_reference_name,current_query_name])) + "\n")
                    csv_tag = "unique"
                    keep_printing = True
                else:
                    keep_printing = False
                fcoords_out_csv.write(",".join(map(str,[ref_start,ref_end,query_start,query_end,current_reference_size,current_query_size,current_reference_name.replace(",","_"),current_query_name.replace(",","_"),csv_tag])) + "\n")
                alignment_counter[query] = alignment_counter[query] + 1

            elif keep_printing:
                fout.write(line)

    fcoords_out_tab.close()
    fcoords_out_csv.close()


    ref_lengths.sort()
    query_lengths.sort()

    # Assembly statistics
    ref_lengths = np.array(ref_lengths)
    query_lengths = np.array(query_lengths)

    f_stats_out.write("Reference: %s\n" % (header1.split()[0].split("/")[-1]))
    f_stats_out.write("Number of sequences: %s\n" % intWithCommas(len(ref_lengths)))
    f_stats_out.write("Total sequence length: %s\n" % gig_meg(sum(ref_lengths)))
    f_stats_out.write("Mean: %s\n" % gig_meg(np.mean(ref_lengths)))
    f_stats_out.write("Min: %s\n" % gig_meg(np.min(ref_lengths)))
    f_stats_out.write("Max: %s\n" % gig_meg(np.max(ref_lengths)))
    f_stats_out.write("N50: %s\n" % gig_meg(N50(ref_lengths)))
    f_stats_out.write("\n\n")
    f_stats_out.write("Query: %s\n" % header1.split()[1].split("/")[-1])
    f_stats_out.write("Number of sequences: %s\n" % intWithCommas(len(query_lengths)))
    f_stats_out.write("Total sequence length: %s\n" % gig_meg(sum(query_lengths)))
    f_stats_out.write("Mean: %s\n" % gig_meg(np.mean(query_lengths)))
    f_stats_out.write("Min: %s\n" % gig_meg(np.min(query_lengths)))
    f_stats_out.write("Max: %s\n" % gig_meg(np.max(query_lengths)))
    f_stats_out.write("N50: %s\n" % gig_meg(N50(query_lengths)))


    f.close()
    fout.close()
    f_stats_out.close()

def N50(sorted_list):
    # The list should be sorted in increasing order.

    # Walk the list from the largest element down, returning the first length
    # at which the cumulative sum reaches half of the total.
    total = sum(sorted_list)
    cumsum = 0
    for length in sorted_list[::-1]:
        cumsum += length
        if cumsum >= total/2:
            return length
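
# Example (illustrative): for sorted lengths [2, 3, 4, 5, 6] the total is 20;
# walking from the largest element gives 6, then 6+5=11 >= 10, so the N50 is 5.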


def gig_meg(number,digits = 2):
    gig = 1000000000.
    meg = 1000000.
    kil = 1000.

    if number > gig:
        return str(round(number/gig,digits)) + " Gbp"
    elif number > meg:
        return str(round(number/meg,digits)) + " Mbp"
    elif number > kil:
        return str(round(number/kil,digits)) + " Kbp"
    else:
        return str(number) + " bp"
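
# Example (illustrative): gig_meg(12345678) returns "12.35 Mbp", and
# gig_meg(512) returns "512 bp".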

def intWithCommas(x):
    if not isinstance(x, int):
        raise TypeError("Parameter must be an integer.")
    if x < 0:
        return '-' + intWithCommas(-x)
    result = ''
    while x >= 1000:
        x, r = divmod(x, 1000)
        result = ",%03d%s" % (r, result)
    return "%d%s" % (x, result)
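
# Example (illustrative): intWithCommas(1234567) returns "1,234,567".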


def summarize_planesweep(lines,unique_length_required, keep_small_uniques=False):

    alignments_to_keep = []
    # print len(lines)

    # If no alignments:
    if len(lines)==0:
        return []

    # If only one alignment:
    if len(lines) == 1:
        if keep_small_uniques or abs(lines[0][1] - lines[0][0]) >= unique_length_required:
            return [0]
        else:
            return []

    starts_and_stops = []
    for query_min,query_max in lines:
        # print query_min, query_max
        starts_and_stops.append((query_min,"start"))
        starts_and_stops.append((query_max,"stop"))


    sorted_starts_and_stops = sorted(starts_and_stops,key=operator.itemgetter(0))
    # print sorted_starts_and_stops

    current_coverage = 0
    last_position = -1
    # sorted_unique_intervals = []
    sorted_unique_intervals_left = []
    sorted_unique_intervals_right = []
    for pos,change in sorted_starts_and_stops:
        # print sorted_starts_and_stops[i]
        # pos = sorted_starts_and_stops[i][0]
        # change = sorted_starts_and_stops[i][1]

        # print pos,change
        # First alignment only:
        # if last_position == -1:
        #     last_position = pos
        #     continue

        # print last_position,pos,current_coverage

        if current_coverage == 1:
            # sorted_unique_intervals.append((last_position,pos))
            sorted_unique_intervals_left.append(last_position)
            sorted_unique_intervals_right.append(pos)

        if change == "start":
            current_coverage += 1
        else:
            current_coverage -= 1
        last_position = pos


    linecounter = 0
    for query_min,query_max in lines:

        i = binary_search(query_min,sorted_unique_intervals_left,0,len(sorted_unique_intervals_left))

        exact_match = False
        if sorted_unique_intervals_left[i] == query_min and sorted_unique_intervals_right[i] == query_max:
            exact_match = True
        sum_uniq = 0
        while i < len(sorted_unique_intervals_left) and sorted_unique_intervals_left[i] >= query_min and sorted_unique_intervals_right[i] <= query_max:
            sum_uniq += sorted_unique_intervals_right[i] - sorted_unique_intervals_left[i]
            i += 1

        # print query_min,query_max,sum_uniq
        if sum_uniq >= unique_length_required:
            alignments_to_keep.append(linecounter)
        elif keep_small_uniques and exact_match:
            alignments_to_keep.append(linecounter)
            # print "Keeping small alignment:", query_min, query_max
            # print sorted_unique_intervals_left[i-1],sorted_unique_intervals_right[i-1]

        linecounter += 1

    return alignments_to_keep
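
# Worked example (illustrative): with unique_length_required = 10000 and
# lines = [(0, 15000), (12000, 20000)], the sweep marks the single-coverage
# segments [0, 12000) and [15000, 20000). The first alignment is credited
# 12000 unique bp and is kept; the second gets only 5000 and is dropped, so
# the function returns [0]. With keep_small_uniques=True, an alignment that
# exactly matches one single-coverage segment is kept even when it is shorter
# than the threshold.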



def binary_search(query, numbers, left, right):
    # Returns index of the matching element or the first element to the right

    if left >= right:
        return right
    mid = (right+left)//2


    if query == numbers[mid]:
        return mid
    elif query < numbers[mid]:
        return binary_search(query,numbers,left,mid)
    else: # if query > numbers[mid]:
        return binary_search(query,numbers,mid+1,right)
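
# Example (illustrative): binary_search(5, [0, 10], 0, 2) returns 1, the index
# of the first element to the right of 5; binary_search(10, [0, 10], 0, 2)
# also returns 1, the index of the exact match.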


# def summarize_intervaltree(lines, unique_length_required):

#     alignments_to_keep = []
#     # print len(lines)

#     if len(lines)==0:
#         return alignments_to_keep

#     if len(lines) == 1:
#         if abs(lines[0][1] - lines[0][0]) >= unique_length_required:
#             return [0]


#     starts_and_stops = []
#     for query_min,query_max in lines:
#         starts_and_stops.append((query_min,query_max))

#     # build full tree
#     tree = IntervalTree.from_tuples(starts_and_stops)


#     # for each interval (keeping the same order as the lines in the input file)
#     line_counter = 0
#     for query_min,query_max in lines:

#         # create a tree object from the current interval
#         this_interval = IntervalTree.from_tuples([(query_min,query_max)])

#         # create a copy of the tree without this one interval
#         rest_of_tree = tree - this_interval

#         # find difference between this interval and the rest of the tree by subtracting out the other intervals one by one
#         for other_interval in rest_of_tree:
#             this_interval.chop(other_interval.begin, other_interval.end)

#         # loop through to count the total number of unique basepairs
#         total_unique_length = 0
#         for sub_interval in this_interval:
#             total_unique_length += sub_interval.end - sub_interval.begin

#         # if the total unique length is above our threshold, add the index to the list we are reporting
#         if total_unique_length >= unique_length_required:
#             alignments_to_keep.append(line_counter)
#         line_counter += 1


#     return alignments_to_keep

def main():
    parser=argparse.ArgumentParser(description="Filters alignments in a delta file based on whether each alignment has a unique sequence anchoring it")
    parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
    parser.add_argument("--out",help="output file" ,dest="out", type=str, required=True)
    parser.add_argument("--unique-length",help="The total length of unique sequence an alignment must have on the query side to be retained. Default: 10000" ,dest="unique_length",type=int, default=10000)
    parser.add_argument("--keep-small-uniques",help="Keep small alignments (below the unique anchor length) if they are completely unique, without any part of the alignment mapping to multiple places" ,dest="keep_small_uniques",action="store_true")
    parser.set_defaults(func=run)
    args=parser.parse_args()
    args.func(args)

if __name__=="__main__":
    main()
