0
|
1 #! /usr/bin/env python
|
|
2
|
|
3 # Read pair-wise alignments in MAF or LAST tabular format: write an
|
|
4 # "Oxford grid", a.k.a. dotplot.
|
|
5
|
|
6 # TODO: Currently, pixels with zero aligned nt-pairs are white, and
|
|
7 # pixels with one or more aligned nt-pairs are black. This can look
|
|
8 # too crowded for large genome alignments. I tried shading each pixel
|
|
9 # according to the number of aligned nt-pairs within it, but the
|
|
10 # result is too faint. How can this be done better?
|
|
11
|
|
12 import fileinput, itertools, optparse, os, re, sys
|
|
13
|
|
14 # Try to make PIL/PILLOW work:
|
|
15 try: from PIL import Image, ImageDraw, ImageFont, ImageColor
|
|
16 except ImportError: import Image, ImageDraw, ImageFont, ImageColor
|
|
17
|
|
18 my_name = os.path.basename(sys.argv[0])
|
|
19 usage = """
|
|
20 %prog --help
|
|
21 %prog [options] last-tabular-output dotplot.png
|
|
22 %prog [options] last-tabular-output dotplot.gif
|
|
23 etc."""
|
|
24 parser = optparse.OptionParser(usage=usage)
|
|
25 # Replace "width" & "height" with a single "length" option?
|
|
26 parser.add_option("-x", "--width", type="int", dest="width", default=1000,
|
|
27 help="maximum width in pixels (default: %default)")
|
|
28 parser.add_option("-y", "--height", type="int", dest="height", default=1000,
|
|
29 help="maximum height in pixels (default: %default)")
|
|
30 parser.add_option("-f", "--fontfile", dest="fontfile",
|
|
31 help="TrueType or OpenType font file")
|
|
32 parser.add_option("-s", "--fontsize", type="int", dest="fontsize", default=11,
|
|
33 help="TrueType or OpenType font size (default: %default)")
|
|
34 parser.add_option("-c", "--forwardcolor", dest="forwardcolor", default="red",
|
|
35 help="Color for forward alignments (default: %default)")
|
|
36 parser.add_option("-r", "--reversecolor", dest="reversecolor", default="blue",
|
|
37 help="Color for reverse alignments (default: %default)")
|
|
38 (opts, args) = parser.parse_args()
|
|
39 if len(args) != 2: parser.error("2 arguments needed")
|
|
40
|
|
41 if opts.fontfile: font = ImageFont.truetype(opts.fontfile, opts.fontsize)
|
|
42 else: font = ImageFont.load_default()
|
|
43
|
|
44 # Make these options too?
|
|
45 text_color = "black"
|
|
46 background_color = "white"
|
|
47 pix_tween_seqs = 2 # number of border pixels between sequences
|
|
48 border_shade = 239, 239, 239 # the shade of grey to use for border pixels
|
|
49 label_space = 5 # minimum number of pixels between axis labels
|
|
50
|
|
51 image_mode = 'RGB'
|
|
52 forward_color = ImageColor.getcolor(opts.forwardcolor, image_mode)
|
|
53 reverse_color = ImageColor.getcolor(opts.reversecolor, image_mode)
|
|
54 overlap_color = tuple([(i+j)//2 for i, j in zip(forward_color, reverse_color)])
|
|
55
|
|
56 def tabBlocks(beg1, beg2, blocks):
|
|
57 '''Get the gapless blocks of an alignment, from LAST tabular format.'''
|
|
58 for i in blocks.split(","):
|
|
59 if ":" in i:
|
|
60 x, y = i.split(":")
|
|
61 beg1 += int(x)
|
|
62 beg2 += int(y)
|
|
63 else:
|
|
64 size = int(i)
|
|
65 yield beg1, beg2, size
|
|
66 beg1 += size
|
|
67 beg2 += size
|
|
68
|
|
69 def mafBlocks(beg1, beg2, seq1, seq2):
|
|
70 '''Get the gapless blocks of an alignment, from MAF format.'''
|
|
71 size = 0
|
|
72 for x, y in itertools.izip(seq1, seq2):
|
|
73 if x == "-":
|
|
74 if size:
|
|
75 yield beg1, beg2, size
|
|
76 beg1 += size
|
|
77 beg2 += size
|
|
78 size = 0
|
|
79 beg2 += 1
|
|
80 elif y == "-":
|
|
81 if size:
|
|
82 yield beg1, beg2, size
|
|
83 beg1 += size
|
|
84 beg2 += size
|
|
85 size = 0
|
|
86 beg1 += 1
|
|
87 else:
|
|
88 size += 1
|
|
89 if size: yield beg1, beg2, size
|
|
90
|
|
91 def alignmentInput(lines):
|
|
92 '''Get alignments and sequence lengths, from MAF or tabular format.'''
|
|
93 mafCount = 0
|
|
94 for line in lines:
|
|
95 w = line.split()
|
|
96 if line[0].isdigit(): # tabular format
|
|
97 chr1, beg1, seqlen1 = w[1], int(w[2]), int(w[5])
|
|
98 if w[4] == "-": beg1 -= seqlen1
|
|
99 chr2, beg2, seqlen2 = w[6], int(w[7]), int(w[10])
|
|
100 if w[9] == "-": beg2 -= seqlen2
|
|
101 blocks = tabBlocks(beg1, beg2, w[11])
|
|
102 yield chr1, seqlen1, chr2, seqlen2, blocks
|
|
103 elif line[0] == "s": # MAF format
|
|
104 if mafCount == 0:
|
|
105 chr1, beg1, seqlen1, seq1 = w[1], int(w[2]), int(w[5]), w[6]
|
|
106 if w[4] == "-": beg1 -= seqlen1
|
|
107 mafCount = 1
|
|
108 else:
|
|
109 chr2, beg2, seqlen2, seq2 = w[1], int(w[2]), int(w[5]), w[6]
|
|
110 if w[4] == "-": beg2 -= seqlen2
|
|
111 blocks = mafBlocks(beg1, beg2, seq1, seq2)
|
|
112 yield chr1, seqlen1, chr2, seqlen2, blocks
|
|
113 mafCount = 0
|
|
114
|
|
115 def readAlignments(lines):
|
|
116 '''Get alignments and sequence lengths, from MAF or tabular format.'''
|
|
117 alignments = []
|
|
118 seqLengths1 = {}
|
|
119 seqLengths2 = {}
|
|
120 for chr1, seqlen1, chr2, seqlen2, blocks in alignmentInput(lines):
|
|
121 aln = chr1, chr2, blocks
|
|
122 alignments.append(aln)
|
|
123 seqLengths1[chr1] = seqlen1
|
|
124 seqLengths2[chr2] = seqlen2
|
|
125 return alignments, seqLengths1, seqLengths2
|
|
126
|
|
127 sys.stderr.write(my_name + ": reading alignments...\n")
|
|
128 input = fileinput.input(args[0])
|
|
129 alignments, seq_size_dic1, seq_size_dic2 = readAlignments(input)
|
|
130 sys.stderr.write(my_name + ": done\n")
|
|
131
|
|
132 if not alignments:
|
|
133 sys.exit(my_name + ": there are no alignments")
|
|
134
|
|
135 def natural_sort_key(my_string):
|
|
136 '''Return a sort key for "natural" ordering, e.g. chr9 < chr10.'''
|
|
137 parts = re.split(r'(\d+)', my_string)
|
|
138 parts[1::2] = map(int, parts[1::2])
|
|
139 return parts
|
|
140
|
|
141 def get_text_sizes(my_strings):
|
|
142 '''Get widths & heights, in pixels, of some strings.'''
|
|
143 if opts.fontsize == 0: return [(0, 0) for i in my_strings]
|
|
144 image_size = 1, 1
|
|
145 im = Image.new(image_mode, image_size)
|
|
146 draw = ImageDraw.Draw(im)
|
|
147 return [draw.textsize(i, font=font) for i in my_strings]
|
|
148
|
|
149 def get_seq_info(seq_size_dic):
|
|
150 '''Return miscellaneous information about the sequences.'''
|
|
151 seq_names = seq_size_dic.keys()
|
|
152 seq_names.sort(key=natural_sort_key)
|
|
153 seq_sizes = [seq_size_dic[i] for i in seq_names]
|
|
154 name_sizes = get_text_sizes(seq_names)
|
|
155 margin = max(zip(*name_sizes)[1]) # maximum text height
|
|
156 return seq_names, seq_sizes, name_sizes, margin
|
|
157
|
|
158 seq_names1, seq_sizes1, name_sizes1, margin1 = get_seq_info(seq_size_dic1)
|
|
159 seq_names2, seq_sizes2, name_sizes2, margin2 = get_seq_info(seq_size_dic2)
|
|
160
|
|
161 def div_ceil(x, y):
|
|
162 '''Return x / y rounded up.'''
|
|
163 q, r = divmod(x, y)
|
|
164 return q + (r != 0)
|
|
165
|
|
166 def tot_seq_pix(seq_sizes, bp_per_pix):
|
|
167 '''Return the total pixels needed for sequences of the given sizes.'''
|
|
168 return sum([div_ceil(i, bp_per_pix) for i in seq_sizes])
|
|
169
|
|
170 def get_bp_per_pix(seq_sizes, pix_limit):
|
|
171 '''Get the minimum bp-per-pixel that fits in the size limit.'''
|
|
172 seq_num = len(seq_sizes)
|
|
173 seq_pix_limit = pix_limit - pix_tween_seqs * (seq_num - 1)
|
|
174 if seq_pix_limit < seq_num:
|
|
175 sys.exit(my_name + ": can't fit the image: too many sequences?")
|
|
176 lower_bound = div_ceil(sum(seq_sizes), seq_pix_limit)
|
|
177 for bp_per_pix in itertools.count(lower_bound): # slow linear search
|
|
178 if tot_seq_pix(seq_sizes, bp_per_pix) <= seq_pix_limit: break
|
|
179 return bp_per_pix
|
|
180
|
|
181 sys.stderr.write(my_name + ": choosing bp per pixel...\n")
|
|
182 bp_per_pix1 = get_bp_per_pix(seq_sizes1, opts.width - margin1)
|
|
183 bp_per_pix2 = get_bp_per_pix(seq_sizes2, opts.height - margin2)
|
|
184 bp_per_pix = max(bp_per_pix1, bp_per_pix2)
|
|
185 sys.stderr.write(my_name + ": bp per pixel = " + str(bp_per_pix) + "\n")
|
|
186
|
|
187 def get_seq_starts(seq_pix, pix_tween_seqs, margin):
|
|
188 '''Get the start pixel for each sequence.'''
|
|
189 seq_starts = []
|
|
190 pix_tot = margin - pix_tween_seqs
|
|
191 for i in seq_pix:
|
|
192 pix_tot += pix_tween_seqs
|
|
193 seq_starts.append(pix_tot)
|
|
194 pix_tot += i
|
|
195 return seq_starts
|
|
196
|
|
197 def get_pix_info(seq_sizes, margin):
|
|
198 '''Return pixel information about the sequences.'''
|
|
199 seq_pix = [div_ceil(i, bp_per_pix) for i in seq_sizes]
|
|
200 seq_starts = get_seq_starts(seq_pix, pix_tween_seqs, margin)
|
|
201 tot_pix = seq_starts[-1] + seq_pix[-1]
|
|
202 return seq_pix, seq_starts, tot_pix
|
|
203
|
|
204 seq_pix1, seq_starts1, width = get_pix_info(seq_sizes1, margin1)
|
|
205 seq_pix2, seq_starts2, height = get_pix_info(seq_sizes2, margin2)
|
|
206 seq_start_dic1 = dict(zip(seq_names1, seq_starts1))
|
|
207 seq_start_dic2 = dict(zip(seq_names2, seq_starts2))
|
|
208 hits = [0] * (width * height) # the image data
|
|
209
|
|
210 sys.stderr.write(my_name + ": processing alignments...\n")
|
|
211 for seq1, seq2, blocks in alignments:
|
|
212 seq_start1 = seq_start_dic1[seq1]
|
|
213 seq_start2 = seq_start_dic2[seq2]
|
|
214 my_start = seq_start2 * width + seq_start1
|
|
215 for beg1, beg2, size in blocks:
|
|
216 end1 = beg1 + size
|
|
217 end2 = beg2 + size
|
|
218 if beg1 >= 0: j = xrange(beg1, end1)
|
|
219 else: j = xrange(-1 - beg1, -1 - end1, -1)
|
|
220 if beg2 >= 0: k = xrange(beg2, end2)
|
|
221 else: k = xrange(-1 - beg2, -1 - end2, -1)
|
|
222 if beg2 >= 0: store_value = 1
|
|
223 else: store_value = 2
|
|
224 for real_pos1, real_pos2 in itertools.izip(j, k):
|
|
225 pix1 = real_pos1 // bp_per_pix
|
|
226 pix2 = real_pos2 // bp_per_pix
|
|
227 hits[my_start + pix2 * width + pix1] |= store_value
|
|
228 sys.stderr.write(my_name + ": done\n")
|
|
229
|
|
230 def make_label(text, text_size, range_start, range_size):
|
|
231 '''Return an axis label with endpoint & sort-order information.'''
|
|
232 text_width = text_size[0]
|
|
233 label_start = range_start + (range_size - text_width) // 2
|
|
234 label_end = label_start + text_width
|
|
235 sort_key = text_width - range_size
|
|
236 return sort_key, label_start, label_end, text
|
|
237
|
|
238 def get_nonoverlapping_labels(labels):
|
|
239 '''Get a subset of non-overlapping axis labels, greedily.'''
|
|
240 nonoverlapping_labels = []
|
|
241 for i in labels:
|
|
242 if True not in [i[1] < j[2] + label_space and j[1] < i[2] + label_space
|
|
243 for j in nonoverlapping_labels]:
|
|
244 nonoverlapping_labels.append(i)
|
|
245 return nonoverlapping_labels
|
|
246
|
|
247 def get_axis_image(seq_names, name_sizes, seq_starts, seq_pix):
|
|
248 '''Make an image of axis labels.'''
|
|
249 min_pos = seq_starts[0]
|
|
250 max_pos = seq_starts[-1] + seq_pix[-1]
|
|
251 height = max(zip(*name_sizes)[1])
|
|
252 labels = [make_label(i, j, k, l) for i, j, k, l in
|
|
253 zip(seq_names, name_sizes, seq_starts, seq_pix)]
|
|
254 labels = [i for i in labels if i[1] >= min_pos and i[2] <= max_pos]
|
|
255 labels.sort()
|
|
256 labels = get_nonoverlapping_labels(labels)
|
|
257 image_size = max_pos, height
|
|
258 im = Image.new(image_mode, image_size, border_shade)
|
|
259 draw = ImageDraw.Draw(im)
|
|
260 for i in labels:
|
|
261 position = i[1], 0
|
|
262 draw.text(position, i[3], font=font, fill=text_color)
|
|
263 return im
|
|
264
|
|
265 image_size = width, height
|
|
266 im = Image.new(image_mode, image_size, background_color)
|
|
267
|
|
268 for i in range(height):
|
|
269 for j in range(width):
|
|
270 store_value = hits[i * width + j]
|
|
271 xy = j, i
|
|
272 if store_value == 1: im.putpixel(xy, forward_color)
|
|
273 elif store_value == 2: im.putpixel(xy, reverse_color)
|
|
274 elif store_value == 3: im.putpixel(xy, overlap_color)
|
|
275
|
|
276 if opts.fontsize != 0:
|
|
277 axis1 = get_axis_image(seq_names1, name_sizes1, seq_starts1, seq_pix1)
|
|
278 axis2 = get_axis_image(seq_names2, name_sizes2, seq_starts2, seq_pix2)
|
|
279 axis2 = axis2.rotate(270)
|
|
280 im.paste(axis1, (0, 0))
|
|
281 im.paste(axis2, (0, 0))
|
|
282
|
|
283 for i in seq_starts1[1:]:
|
|
284 box = i - pix_tween_seqs, margin2, i, height
|
|
285 im.paste(border_shade, box)
|
|
286
|
|
287 for i in seq_starts2[1:]:
|
|
288 box = margin1, i - pix_tween_seqs, width, i
|
|
289 im.paste(border_shade, box)
|
|
290
|
|
291 im.save(args[1])
|