13
|
1 #!/usr/bin/env python
|
|
2 def get_contig_lengths(in_fai):
|
|
3 """ Get contig lengths from a fasta index. """
|
|
4 lens = dict()
|
|
5 with open(in_fai, 'r') as f:
|
|
6 for line in f:
|
|
7 L1 = line.rstrip().split('\t')
|
|
8 lens[L1[0]] = int(L1[1])
|
|
9 return lens
|
|
10
|
|
11
|
|
12 def get_contig_orderings(in_groupings):
|
|
13 """
|
|
14 From the orderings files, return the following:
|
|
15
|
|
16 1. Dictionary associating a reference header with a list of ordered contig headers
|
|
17 2. Dictionary associating a contig with an orientation
|
|
18 3. Dicitonary associating a contig with its assigned reference header
|
|
19 """
|
|
20 orderings = dict()
|
|
21 orientations = dict()
|
|
22 ref_chr = dict()
|
|
23
|
|
24 # Iterate through the orderings files
|
|
25 with open(in_groupings, 'r') as f1:
|
|
26 for line1 in f1:
|
|
27 header = line1.rstrip().replace('_orderings.txt', '_RaGOO')
|
|
28 header = header[header.rfind('/') + 1:]
|
|
29
|
|
30 # Initialize the list for the orderings
|
|
31 orderings[header] = []
|
|
32 with open(line1.rstrip(), 'r') as f2:
|
|
33 for line2 in f2:
|
|
34 L1 = line2.rstrip().split('\t')
|
|
35 orderings[header].append(L1[0])
|
|
36 orientations[L1[0]] = L1[1]
|
|
37 ref_chr[L1[0]] = header
|
|
38 return orderings, orientations, ref_chr
|
|
39
|
|
40
|
|
41 def get_reverse_coords(start_coord, end_coord, seq_length):
|
|
42 """
|
|
43 Returns new genomic coordinates of a region that has undergone reverse complementation.
|
|
44 new start coordinate = seqLength - endCoord
|
|
45 new end coordinate = seqLength - startCoord
|
|
46 """
|
|
47 return seq_length - (end_coord - 1), seq_length - (start_coord - 1)
|
|
48
|
|
49
|
|
50 if __name__ == "__main__":
|
|
51 import argparse
|
|
52
|
|
53 parser = argparse.ArgumentParser(description='Lift-over gff coordinates to from contigs to RaGOO pseudomolecules.'
|
|
54 'Please make sure that you are using the updated gff file and set of contigs if chimera correction was used.'
|
|
55 'Also make sure that the gap size (-g) matches that which was used during ordering and orienting.')
|
|
56 parser.add_argument("gff", metavar="<genes.gff>", type=str, help="Gff file to be lifted-over")
|
|
57 parser.add_argument("orderings", metavar="<orderings.fofn>", type=str, help="List of RaGOO 'orderings' files. 'ls ragoo_output/orderings/* > orderings.fofn'")
|
|
58 parser.add_argument("fai", metavar="<contigs.fasta.fai>", type=str, help="Fasta index for contigs.'")
|
|
59 parser.add_argument("-g", metavar="100", type=int, default=100, help="Gap size for padding in pseudomolecules (must match what was used for 'ragoo.py'.")
|
|
60
|
|
61
|
|
62 # Get the command line arguments
|
|
63 args = parser.parse_args()
|
|
64 gff_file = args.gff
|
|
65 orderings_file = args.orderings
|
|
66 fai_file = args.fai
|
|
67 gap_size = args.g
|
|
68
|
|
69 # Get the contig lengths
|
|
70 ctg_lens = get_contig_lengths(fai_file)
|
|
71
|
|
72 # Get the contig orderings and orientations
|
|
73 ctg_orderings, ctg_orientations, ctg_chr = get_contig_orderings(orderings_file)
|
|
74
|
|
75 #Iterate through the GFF features and update
|
|
76 offsets = dict()
|
|
77 with open(gff_file) as f:
|
|
78 for gff_line in f:
|
|
79 if gff_line.startswith("#"):
|
|
80 print(gff_line.rstrip())
|
|
81 else:
|
|
82 gff_fields = gff_line.rstrip().split('\t')
|
|
83 gff_header, start, end, strand = gff_fields[0], int(gff_fields[3]), int(gff_fields[4]), gff_fields[6]
|
|
84 new_header = ctg_chr[gff_header]
|
|
85
|
|
86 # Check that this contig header is in our list of headers
|
|
87 if gff_header not in ctg_lens.keys():
|
|
88 err_msg = """ %s was not found in the list of orderings files provided.
|
|
89 Please check that you are using the correct files. If chimeric contig correction was
|
|
90 used, please use the corrected gff and fai file.
|
|
91 """
|
|
92 raise ValueError(err_msg)
|
|
93
|
|
94 # Check if the contig has been reverse complemented. Update accordingly
|
|
95 if ctg_orientations[gff_header] == '-':
|
|
96 start, end = get_reverse_coords(start, end, ctg_lens[gff_header])
|
|
97
|
|
98 # Set the opposite strand
|
|
99 if strand == '+':
|
|
100 strand = '-'
|
|
101 else:
|
|
102 strand = '+'
|
|
103
|
|
104 # Check if the offset for this contig has already been calculated
|
|
105 if gff_header in offsets:
|
|
106 offset = offsets[gff_header]
|
|
107 else:
|
|
108 ctg_idx = ctg_orderings[new_header].index(gff_header)
|
|
109 offset = 0
|
|
110
|
|
111 for ctg in ctg_orderings[new_header][:ctg_idx]:
|
|
112 offset += ctg_lens[ctg]
|
|
113 offset += gap_size
|
|
114
|
|
115 # memoize the offset
|
|
116 offsets[gff_header] = offset
|
|
117
|
|
118 new_start = start + offset
|
|
119 new_end = end + offset
|
|
120
|
|
121 gff_fields[0] = new_header
|
|
122 gff_fields[3] = str(new_start)
|
|
123 gff_fields[4] = str(new_end)
|
|
124 gff_fields[6] = strand
|
|
125 print('\t'.join(gff_fields))
|