13
+ − 1 #! /usr/bin/env python
+ − 2 import argparse
+ − 3
+ − 4 import gzip
+ − 5 # Author: Maria Nattestad
+ − 6 # Email: mnattest@cshl.edu
+ − 7 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer.
+ − 8
+ − 9
+ − 10 def run(args):
+ − 11 filename = args.delta
+ − 12 minimum_variant_size = args.minimum_variant_size
+ − 13
+ − 14 f = open(filename)
+ − 15 header1 = f.readline()
+ − 16 if header1[0:2]=="\x1f\x8b":
+ − 17 f.close()
+ − 18 f = gzip.open(filename)
+ − 19 header1 = f.readline()
+ − 20
+ − 21 # Ignore the first two lines for now
+ − 22 f.readline()
+ − 23
+ − 24 linecounter = 0
+ − 25
+ − 26 current_reference_name = ""
+ − 27 current_reference_position = 0
+ − 28
+ − 29 current_query_name = ""
+ − 30 current_query_position = 0
+ − 31
+ − 32 variants = []
+ − 33
+ − 34 for line in f:
+ − 35 if line[0]==">":
+ − 36 # linecounter += 1
+ − 37 # if linecounter > 1:
+ − 38 # break
+ − 39 fields = line.strip().split()
+ − 40 current_reference_name = fields[0][1:]
+ − 41 current_query_name = fields[1]
+ − 42 else:
+ − 43 fields = line.strip().split()
+ − 44 if len(fields) > 4:
+ − 45 # current_reference_position = int(fields[0])
+ − 46 current_reference_position = min(int(fields[0]),int(fields[1]))
+ − 47 # fields[1] is the reference position at the end of the alignment
+ − 48 # current_query_position = int(fields[2])
+ − 49 current_query_position = min(int(fields[2]),int(fields[3]))
+ − 50 # fields[3] is the query position at the end of the alignment
+ − 51 else:
+ − 52 tick = int(fields[0])
+ − 53 if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size
+ − 54 report = variants[-1]
+ − 55 report[4] = report[4] + 1 # size
+ − 56 if tick > 0: # deletion, moves in reference
+ − 57 report[2] = report[2] + 1 # reference end position
+ − 58 report[7] = report[7] + 1 # reference gap size
+ − 59 current_reference_position += 1 # update reference position after deletion
+ − 60 elif tick < 0: # insertion, moves in query
+ − 61 report[8] = report[8] + 1 # query gap size
+ − 62 report[12] = report[12] + 1 # query end position
+ − 63 current_query_position += 1 # update query position after insertion
+ − 64 else: # report the last one and continue
+ − 65 current_reference_position += abs(tick) - 1
+ − 66 current_query_position += abs(tick) - 1
+ − 67 if tick > 0:
+ − 68 size = 1
+ − 69 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment")
+ − 70 report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position]
+ − 71 current_reference_position += size # update reference position after deletion
+ − 72 variants.append(report)
+ − 73 elif tick < 0:
+ − 74 size = 1
+ − 75 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment")
+ − 76 report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size]
+ − 77 current_query_position += size # update query position after insertion
+ − 78 variants.append(report)
+ − 79 # TESTING
+ − 80 # print line, report
+ − 81
+ − 82
+ − 83 f.close()
+ − 84
+ − 85 newcounter = 1
+ − 86 for line in variants:
+ − 87 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line
+ − 88 if line[4] >= minimum_variant_size:
+ − 89 line[3] = "Assemblytics_w_%d" % (newcounter)
+ − 90 print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10])
+ − 91 # print "\t".join(map(str,line))
+ − 92 newcounter += 1
+ − 93
+ − 94
+ − 95 def main():
+ − 96 parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment")
+ − 97 parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True)
+ − 98 parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50)
+ − 99 parser.set_defaults(func=run)
+ − 100 args=parser.parse_args()
+ − 101 args.func(args)
+ − 102
+ − 103 if __name__=="__main__":
+ − 104 main()
+ − 105