comparison RaGOO/lift_over.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
comparison
equal deleted inserted replaced
12:68a9ec9ce51e 13:b9a3aeb162ab
1 #!/usr/bin/env python
2 def get_contig_lengths(in_fai):
3 """ Get contig lengths from a fasta index. """
4 lens = dict()
5 with open(in_fai, 'r') as f:
6 for line in f:
7 L1 = line.rstrip().split('\t')
8 lens[L1[0]] = int(L1[1])
9 return lens
10
11
12 def get_contig_orderings(in_groupings):
13 """
14 From the orderings files, return the following:
15
16 1. Dictionary associating a reference header with a list of ordered contig headers
17 2. Dictionary associating a contig with an orientation
18 3. Dicitonary associating a contig with its assigned reference header
19 """
20 orderings = dict()
21 orientations = dict()
22 ref_chr = dict()
23
24 # Iterate through the orderings files
25 with open(in_groupings, 'r') as f1:
26 for line1 in f1:
27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
28 header = header[header.rfind('/') + 1:]
29
30 # Initialize the list for the orderings
31 orderings[header] = []
32 with open(line1.rstrip(), 'r') as f2:
33 for line2 in f2:
34 L1 = line2.rstrip().split('\t')
35 orderings[header].append(L1[0])
36 orientations[L1[0]] = L1[1]
37 ref_chr[L1[0]] = header
38 return orderings, orientations, ref_chr
39
40
41 def get_reverse_coords(start_coord, end_coord, seq_length):
42 """
43 Returns new genomic coordinates of a region that has undergone reverse complementation.
44 new start coordinate = seqLength - endCoord
45 new end coordinate = seqLength - startCoord
46 """
47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
48
49
50 if __name__ == "__main__":
51 import argparse
52
53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.'
54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.'
55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over")
57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'")
58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'")
59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.")
60
61
62 # Get the command line arguments
63 args = parser.parse_args()
64 gff_file = args.gff
65 orderings_file = args.orderings
66 fai_file = args.fai
67 gap_size = args.g
68
69 # Get the contig lengths
70 ctg_lens = get_contig_lengths(fai_file)
71
72 # Get the contig orderings and orientations
73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file)
74
75 #Iterate through the GFF features and update
76 offsets = dict()
77 with open(gff_file) as f:
78 for gff_line in f:
79 if gff_line.startswith("#"):
80 print(gff_line.rstrip())
81 else:
82 gff_fields = gff_line.rstrip().split('\t')
83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6]
84 new_header = ctg_chr[gff_header]
85
86 # Check that this contig header is in our list of headers
87 if gff_header not in ctg_lens.keys():
88 err_msg = """ %s was not found in the list of orderings files provided.
89 Please check that you are using the correct files. If chimeric contig correction was
90 used, please use the corrected gff and fai file.
91 """
92 raise ValueError(err_msg)
93
94 # Check if the contig has been reverse complemented. Update accordingly
95 if ctg_orientations[gff_header] == '-':
96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header])
97
98 # Set the opposite strand
99 if strand == '+':
100 strand = '-'
101 else:
102 strand = '+'
103
104 # Check if the offset for this contig has already been calculated
105 if gff_header in offsets:
106 offset = offsets[gff_header]
107 else:
108 ctg_idx = ctg_orderings[new_header].index(gff_header)
109 offset = 0
110
111 for ctg in ctg_orderings[new_header][:ctg_idx]:
112 offset += ctg_lens[ctg]
113 offset += gap_size
114
115 # memoize the offset
116 offsets[gff_header] = offset
117
118 new_start = start + offset
119 new_end = end + offset
120
121 gff_fields[0] = new_header
122 gff_fields[3] = str(new_start)
123 gff_fields[4] = str(new_end)
124 gff_fields[6] = strand
125 print('\t'.join(gff_fields))