annotate RaGOO/lift_over.py @ 13:b9a3aeb162ab draft default tip

Uploaded
author dereeper
date Mon, 26 Jul 2021 18:22:37 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
13
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/env python
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
2 def get_contig_lengths(in_fai):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
3 """ Get contig lengths from a fasta index. """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
4 lens = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
5 with open(in_fai, 'r') as f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
6 for line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
7 L1 = line.rstrip().split('\t')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
8 lens[L1[0]] = int(L1[1])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
9 return lens
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
10
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
11
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
12 def get_contig_orderings(in_groupings):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
13 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
14 From the orderings files, return the following:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
15
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
16 1. Dictionary associating a reference header with a list of ordered contig headers
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
17 2. Dictionary associating a contig with an orientation
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
18 3. Dicitonary associating a contig with its assigned reference header
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
19 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
20 orderings = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
21 orientations = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
22 ref_chr = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
23
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
24 # Iterate through the orderings files
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
25 with open(in_groupings, 'r') as f1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
26 for line1 in f1:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
28 header = header[header.rfind('/') + 1:]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
29
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
30 # Initialize the list for the orderings
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
31 orderings[header] = []
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
32 with open(line1.rstrip(), 'r') as f2:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
33 for line2 in f2:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
34 L1 = line2.rstrip().split('\t')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
35 orderings[header].append(L1[0])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
36 orientations[L1[0]] = L1[1]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
37 ref_chr[L1[0]] = header
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
38 return orderings, orientations, ref_chr
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
39
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
40
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
41 def get_reverse_coords(start_coord, end_coord, seq_length):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
42 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
43 Returns new genomic coordinates of a region that has undergone reverse complementation.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
44 new start coordinate = seqLength - endCoord
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
45 new end coordinate = seqLength - startCoord
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
46 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
48
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
49
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
50 if __name__ == "__main__":
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
51 import argparse
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
52
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.")
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
60
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
61
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
62 # Get the command line arguments
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
63 args = parser.parse_args()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
64 gff_file = args.gff
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
65 orderings_file = args.orderings
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
66 fai_file = args.fai
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
67 gap_size = args.g
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
68
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
69 # Get the contig lengths
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
70 ctg_lens = get_contig_lengths(fai_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
71
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
72 # Get the contig orderings and orientations
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
74
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
75 #Iterate through the GFF features and update
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
76 offsets = dict()
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
77 with open(gff_file) as f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
78 for gff_line in f:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
79 if gff_line.startswith("#"):
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
80 print(gff_line.rstrip())
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
81 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
82 gff_fields = gff_line.rstrip().split('\t')
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
84 new_header = ctg_chr[gff_header]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
85
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
86 # Check that this contig header is in our list of headers
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
87 if gff_header not in ctg_lens.keys():
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
88 err_msg = """ %s was not found in the list of orderings files provided.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
89 Please check that you are using the correct files. If chimeric contig correction was
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
90 used, please use the corrected gff and fai file.
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
91 """
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
92 raise ValueError(err_msg)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
93
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
94 # Check if the contig has been reverse complemented. Update accordingly
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
95 if ctg_orientations[gff_header] == '-':
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header])
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
97
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
98 # Set the opposite strand
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
99 if strand == '+':
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
100 strand = '-'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
101 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
102 strand = '+'
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
103
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
104 # Check if the offset for this contig has already been calculated
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
105 if gff_header in offsets:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
106 offset = offsets[gff_header]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
107 else:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
108 ctg_idx = ctg_orderings[new_header].index(gff_header)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
109 offset = 0
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
110
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
111 for ctg in ctg_orderings[new_header][:ctg_idx]:
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
112 offset += ctg_lens[ctg]
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
113 offset += gap_size
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
114
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
115 # memoize the offset
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
116 offsets[gff_header] = offset
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
117
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
118 new_start = start + offset
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
119 new_end = end + offset
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
120
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
121 gff_fields[0] = new_header
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
122 gff_fields[3] = str(new_start)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
123 gff_fields[4] = str(new_end)
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
124 gff_fields[6] = strand
b9a3aeb162ab Uploaded
dereeper
parents:
diff changeset
125 print('\t'.join(gff_fields))