Mercurial > repos > dereeper > ragoo
comparison RaGOO/lift_over.py @ 13:b9a3aeb162ab draft default tip
Uploaded
author | dereeper |
---|---|
date | Mon, 26 Jul 2021 18:22:37 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
12:68a9ec9ce51e | 13:b9a3aeb162ab |
---|---|
1 #!/usr/bin/env python | |
2 def get_contig_lengths(in_fai): | |
3 """ Get contig lengths from a fasta index. """ | |
4 lens = dict() | |
5 with open(in_fai, 'r') as f: | |
6 for line in f: | |
7 L1 = line.rstrip().split('\t') | |
8 lens[L1[0]] = int(L1[1]) | |
9 return lens | |
10 | |
11 | |
12 def get_contig_orderings(in_groupings): | |
13 """ | |
14 From the orderings files, return the following: | |
15 | |
16 1. Dictionary associating a reference header with a list of ordered contig headers | |
17 2. Dictionary associating a contig with an orientation | |
18 3. Dicitonary associating a contig with its assigned reference header | |
19 """ | |
20 orderings = dict() | |
21 orientations = dict() | |
22 ref_chr = dict() | |
23 | |
24 # Iterate through the orderings files | |
25 with open(in_groupings, 'r') as f1: | |
26 for line1 in f1: | |
27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO') | |
28 header = header[header.rfind('/') + 1:] | |
29 | |
30 # Initialize the list for the orderings | |
31 orderings[header] = [] | |
32 with open(line1.rstrip(), 'r') as f2: | |
33 for line2 in f2: | |
34 L1 = line2.rstrip().split('\t') | |
35 orderings[header].append(L1[0]) | |
36 orientations[L1[0]] = L1[1] | |
37 ref_chr[L1[0]] = header | |
38 return orderings, orientations, ref_chr | |
39 | |
40 | |
41 def get_reverse_coords(start_coord, end_coord, seq_length): | |
42 """ | |
43 Returns new genomic coordinates of a region that has undergone reverse complementation. | |
44 new start coordinate = seqLength - endCoord | |
45 new end coordinate = seqLength - startCoord | |
46 """ | |
47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1) | |
48 | |
49 | |
50 if __name__ == "__main__": | |
51 import argparse | |
52 | |
53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.' | |
54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.' | |
55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.') | |
56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over") | |
57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'") | |
58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'") | |
59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.") | |
60 | |
61 | |
62 # Get the command line arguments | |
63 args = parser.parse_args() | |
64 gff_file = args.gff | |
65 orderings_file = args.orderings | |
66 fai_file = args.fai | |
67 gap_size = args.g | |
68 | |
69 # Get the contig lengths | |
70 ctg_lens = get_contig_lengths(fai_file) | |
71 | |
72 # Get the contig orderings and orientations | |
73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file) | |
74 | |
75 #Iterate through the GFF features and update | |
76 offsets = dict() | |
77 with open(gff_file) as f: | |
78 for gff_line in f: | |
79 if gff_line.startswith("#"): | |
80 print(gff_line.rstrip()) | |
81 else: | |
82 gff_fields = gff_line.rstrip().split('\t') | |
83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6] | |
84 new_header = ctg_chr[gff_header] | |
85 | |
86 # Check that this contig header is in our list of headers | |
87 if gff_header not in ctg_lens.keys(): | |
88 err_msg = """ %s was not found in the list of orderings files provided. | |
89 Please check that you are using the correct files. If chimeric contig correction was | |
90 used, please use the corrected gff and fai file. | |
91 """ | |
92 raise ValueError(err_msg) | |
93 | |
94 # Check if the contig has been reverse complemented. Update accordingly | |
95 if ctg_orientations[gff_header] == '-': | |
96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header]) | |
97 | |
98 # Set the opposite strand | |
99 if strand == '+': | |
100 strand = '-' | |
101 else: | |
102 strand = '+' | |
103 | |
104 # Check if the offset for this contig has already been calculated | |
105 if gff_header in offsets: | |
106 offset = offsets[gff_header] | |
107 else: | |
108 ctg_idx = ctg_orderings[new_header].index(gff_header) | |
109 offset = 0 | |
110 | |
111 for ctg in ctg_orderings[new_header][:ctg_idx]: | |
112 offset += ctg_lens[ctg] | |
113 offset += gap_size | |
114 | |
115 # memoize the offset | |
116 offsets[gff_header] = offset | |
117 | |
118 new_start = start + offset | |
119 new_end = end + offset | |
120 | |
121 gff_fields[0] = new_header | |
122 gff_fields[3] = str(new_start) | |
123 gff_fields[4] = str(new_end) | |
124 gff_fields[6] = strand | |
125 print('\t'.join(gff_fields)) |