Mercurial > repos > dereeper > ragoo
view RaGOO/lift_over.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env python def get_contig_lengths(in_fai): """ Get contig lengths from a fasta index. """ lens = dict() with open(in_fai, 'r') as f: for line in f: L1 = line.rstrip().split('\t') lens[L1[0]] = int(L1[1]) return lens def get_contig_orderings(in_groupings): """ From the orderings files, return the following: 1. Dictionary associating a reference header with a list of ordered contig headers 2. Dictionary associating a contig with an orientation 3. Dicitonary associating a contig with its assigned reference header """ orderings = dict() orientations = dict() ref_chr = dict() # Iterate through the orderings files with open(in_groupings, 'r') as f1: for line1 in f1: header = line1.rstrip().replace('_orderings.txt', '_RaGOO') header = header[header.rfind('/') + 1:] # Initialize the list for the orderings orderings[header] = [] with open(line1.rstrip(), 'r') as f2: for line2 in f2: L1 = line2.rstrip().split('\t') orderings[header].append(L1[0]) orientations[L1[0]] = L1[1] ref_chr[L1[0]] = header return orderings, orientations, ref_chr def get_reverse_coords(start_coord, end_coord, seq_length): """ Returns new genomic coordinates of a region that has undergone reverse complementation. new start coordinate = seqLength - endCoord new end coordinate = seqLength - startCoord """ return seq_length - (end_coord - 1), seq_length - (start_coord - 1) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.' 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.' 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.') parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over") parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'") parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'") parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.") # Get the command line arguments args = parser.parse_args() gff_file = args.gff orderings_file = args.orderings fai_file = args.fai gap_size = args.g # Get the contig lengths ctg_lens = get_contig_lengths(fai_file) # Get the contig orderings and orientations ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file) #Iterate through the GFF features and update offsets = dict() with open(gff_file) as f: for gff_line in f: if gff_line.startswith("#"): print(gff_line.rstrip()) else: gff_fields = gff_line.rstrip().split('\t') gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6] new_header = ctg_chr[gff_header] # Check that this contig header is in our list of headers if gff_header not in ctg_lens.keys(): err_msg = """ %s was not found in the list of orderings files provided. Please check that you are using the correct files. If chimeric contig correction was used, please use the corrected gff and fai file. """ raise ValueError(err_msg) # Check if the contig has been reverse complemented. Update accordingly if ctg_orientations[gff_header] == '-': start, end = get_reverse_coords(start, end, ctg_lens[gff_header]) # Set the opposite strand if strand == '+': strand = '-' else: strand = '+' # Check if the offset for this contig has already been calculated if gff_header in offsets: offset = offsets[gff_header] else: ctg_idx = ctg_orderings[new_header].index(gff_header) offset = 0 for ctg in ctg_orderings[new_header][:ctg_idx]: offset += ctg_lens[ctg] offset += gap_size # memoize the offset offsets[gff_header] = offset new_start = start + offset new_end = end + offset gff_fields[0] = new_header gff_fields[3] = str(new_start) gff_fields[4] = str(new_end) gff_fields[6] = strand print('\t'.join(gff_fields))