Mercurial > repos > dereeper > ragoo
comparison RaGOO/Assemblytics_within_alignment.py @ 13:b9a3aeb162ab draft default tip
Uploaded
| author | dereeper |
|---|---|
| date | Mon, 26 Jul 2021 18:22:37 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:68a9ec9ce51e | 13:b9a3aeb162ab |
|---|---|
| 1 #! /usr/bin/env python | |
| 2 import argparse | |
| 3 | |
| 4 import gzip | |
| 5 # Author: Maria Nattestad | |
| 6 # Email: mnattest@cshl.edu | |
| 7 # This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. | |
| 8 | |
| 9 | |
| 10 def run(args): | |
| 11 filename = args.delta | |
| 12 minimum_variant_size = args.minimum_variant_size | |
| 13 | |
| 14 f = open(filename) | |
| 15 header1 = f.readline() | |
| 16 if header1[0:2]=="\x1f\x8b": | |
| 17 f.close() | |
| 18 f = gzip.open(filename) | |
| 19 header1 = f.readline() | |
| 20 | |
| 21 # Ignore the first two lines for now | |
| 22 f.readline() | |
| 23 | |
| 24 linecounter = 0 | |
| 25 | |
| 26 current_reference_name = "" | |
| 27 current_reference_position = 0 | |
| 28 | |
| 29 current_query_name = "" | |
| 30 current_query_position = 0 | |
| 31 | |
| 32 variants = [] | |
| 33 | |
| 34 for line in f: | |
| 35 if line[0]==">": | |
| 36 # linecounter += 1 | |
| 37 # if linecounter > 1: | |
| 38 # break | |
| 39 fields = line.strip().split() | |
| 40 current_reference_name = fields[0][1:] | |
| 41 current_query_name = fields[1] | |
| 42 else: | |
| 43 fields = line.strip().split() | |
| 44 if len(fields) > 4: | |
| 45 # current_reference_position = int(fields[0]) | |
| 46 current_reference_position = min(int(fields[0]),int(fields[1])) | |
| 47 # fields[1] is the reference position at the end of the alignment | |
| 48 # current_query_position = int(fields[2]) | |
| 49 current_query_position = min(int(fields[2]),int(fields[3])) | |
| 50 # fields[3] is the query position at the end of the alignment | |
| 51 else: | |
| 52 tick = int(fields[0]) | |
| 53 if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size | |
| 54 report = variants[-1] | |
| 55 report[4] = report[4] + 1 # size | |
| 56 if tick > 0: # deletion, moves in reference | |
| 57 report[2] = report[2] + 1 # reference end position | |
| 58 report[7] = report[7] + 1 # reference gap size | |
| 59 current_reference_position += 1 # update reference position after deletion | |
| 60 elif tick < 0: # insertion, moves in query | |
| 61 report[8] = report[8] + 1 # query gap size | |
| 62 report[12] = report[12] + 1 # query end position | |
| 63 current_query_position += 1 # update query position after insertion | |
| 64 else: # report the last one and continue | |
| 65 current_reference_position += abs(tick) - 1 | |
| 66 current_query_position += abs(tick) - 1 | |
| 67 if tick > 0: | |
| 68 size = 1 | |
| 69 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment") | |
| 70 report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position] | |
| 71 current_reference_position += size # update reference position after deletion | |
| 72 variants.append(report) | |
| 73 elif tick < 0: | |
| 74 size = 1 | |
| 75 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment") | |
| 76 report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size] | |
| 77 current_query_position += size # update query position after insertion | |
| 78 variants.append(report) | |
| 79 # TESTING | |
| 80 # print line, report | |
| 81 | |
| 82 | |
| 83 f.close() | |
| 84 | |
| 85 newcounter = 1 | |
| 86 for line in variants: | |
| 87 # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line | |
| 88 if line[4] >= minimum_variant_size: | |
| 89 line[3] = "Assemblytics_w_%d" % (newcounter) | |
| 90 print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10]) | |
| 91 # print "\t".join(map(str,line)) | |
| 92 newcounter += 1 | |
| 93 | |
| 94 | |
| 95 def main(): | |
| 96 parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment") | |
| 97 parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True) | |
| 98 parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50) | |
| 99 parser.set_defaults(func=run) | |
| 100 args=parser.parse_args() | |
| 101 args.func(args) | |
| 102 | |
| 103 if __name__=="__main__": | |
| 104 main() | |
| 105 |
