Mercurial > repos > dereeper > ragoo
diff RaGOO/Assemblytics_within_alignment.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/RaGOO/Assemblytics_within_alignment.py Mon Jul 26 18:22:37 2021 +0000 @@ -0,0 +1,105 @@ +#! /usr/bin/env python +import argparse + +import gzip +# Author: Maria Nattestad +# Email: mnattest@cshl.edu +# This script is part of Assemblytics, a program to detect and analyze structural variants from an assembly aligned to a reference genome using MUMmer. + + +def run(args): + filename = args.delta + minimum_variant_size = args.minimum_variant_size + + f = open(filename) + header1 = f.readline() + if header1[0:2]=="\x1f\x8b": + f.close() + f = gzip.open(filename) + header1 = f.readline() + + # Ignore the first two lines for now + f.readline() + + linecounter = 0 + + current_reference_name = "" + current_reference_position = 0 + + current_query_name = "" + current_query_position = 0 + + variants = [] + + for line in f: + if line[0]==">": + # linecounter += 1 + # if linecounter > 1: + # break + fields = line.strip().split() + current_reference_name = fields[0][1:] + current_query_name = fields[1] + else: + fields = line.strip().split() + if len(fields) > 4: + # current_reference_position = int(fields[0]) + current_reference_position = min(int(fields[0]),int(fields[1])) + # fields[1] is the reference position at the end of the alignment + # current_query_position = int(fields[2]) + current_query_position = min(int(fields[2]),int(fields[3])) + # fields[3] is the query position at the end of the alignment + else: + tick = int(fields[0]) + if abs(tick) == 1: # then go back and edit the last entry to add 1 more to its size + report = variants[-1] + report[4] = report[4] + 1 # size + if tick > 0: # deletion, moves in reference + report[2] = report[2] + 1 # reference end position + report[7] = report[7] + 1 # reference gap size + current_reference_position += 1 # update reference position after deletion + elif tick < 0: # insertion, moves in query + report[8] = report[8] + 1 # query gap size + report[12] = report[12] + 1 # query end position + current_query_position += 1 # update query position after insertion + else: # report the last one and continue + current_reference_position += abs(tick) - 1 + current_query_position += abs(tick) - 1 + if tick > 0: + size = 1 + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position+size,len(variants)+1,size,"+","Deletion",size,0,current_query_name,"within_alignment") + report = [current_reference_name,current_reference_position,current_reference_position+size,"Assemblytics_w_"+str(len(variants)+1),size,"+","Deletion",size,0,current_query_name,"within_alignment",current_query_position,current_query_position] + current_reference_position += size # update reference position after deletion + variants.append(report) + elif tick < 0: + size = 1 + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % (current_reference_name,current_reference_position,current_reference_position,len(variants)+1,size,"+","Insertion",0,size,current_query_name,"within_alignment") + report = [current_reference_name,current_reference_position,current_reference_position,"Assemblytics_w_"+str(len(variants)+1),size,"+","Insertion",0,size,current_query_name,"within_alignment",current_query_position,current_query_position+size] + current_query_position += size # update query position after insertion + variants.append(report) + # TESTING + # print line, report + + + f.close() + + newcounter = 1 + for line in variants: + # report = "%s\t%d\t%d\tAssemblytics_%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\n" % line + if line[4] >= minimum_variant_size: + line[3] = "Assemblytics_w_%d" % (newcounter) + print ("\t".join(map(str,line[0:10])) + ":" + str(line[11]) + "-" + str(line[12]) + ":+\t" + line[10]) + # print "\t".join(map(str,line)) + newcounter += 1 + + +def main(): + parser=argparse.ArgumentParser(description="Outputs MUMmer coordinates annotated with length of unique sequence for each alignment") + parser.add_argument("--delta",help="delta file" ,dest="delta", type=str, required=True) + parser.add_argument("--min",help="Minimum size (bp) of variant to include, default = 50" ,dest="minimum_variant_size",type=int, default=50) + parser.set_defaults(func=run) + args=parser.parse_args() + args.func(args) + +if __name__=="__main__": + main() +