Mercurial > repos > iuc > repmatch_gff3
comparison repmatch_gff3_util.py @ 0:a072f0f30ea3 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 0e04a4c237677c1f5be1950babcf8591097996a9
author | iuc |
---|---|
date | Wed, 23 Dec 2015 09:25:42 -0500 |
parents | |
children | e5c7fffdc078 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a072f0f30ea3 |
---|---|
1 import bisect | |
2 import csv | |
3 import os | |
4 import shutil | |
5 import sys | |
6 import tempfile | |
7 import matplotlib | |
8 matplotlib.use('Agg') | |
9 from matplotlib import pyplot | |
10 | |
11 # Graph settings | |
12 Y_LABEL = 'Counts' | |
13 X_LABEL = 'Number of matched replicates' | |
14 TICK_WIDTH = 3 | |
15 # Amount to shift the graph to make labels fit, [left, right, top, bottom] | |
16 ADJUST = [0.180, 0.9, 0.9, 0.1] | |
17 # Length of tick marks, use TICK_WIDTH for width | |
18 pyplot.rc('xtick.major', size=10.00) | |
19 pyplot.rc('ytick.major', size=10.00) | |
20 pyplot.rc('lines', linewidth=4.00) | |
21 pyplot.rc('axes', linewidth=3.00) | |
22 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) | |
23 | |
24 COLORS = 'krb' | |
25 | |
26 | |
27 class Replicate(object): | |
28 | |
29 def __init__(self, id, dataset_path): | |
30 self.id = id | |
31 self.dataset_path = dataset_path | |
32 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) | |
33 | |
34 def parse(self, reader): | |
35 self.chromosomes = {} | |
36 for line in reader: | |
37 if line[0].startswith("#") or line[0].startswith('"'): | |
38 continue | |
39 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line | |
40 attrs = parse_gff_attrs(attrs) | |
41 distance = attrs['cw_distance'] | |
42 mid = int(mid) | |
43 midplus = int(midplus) | |
44 value = float(value) | |
45 distance = int(distance) | |
46 if cname not in self.chromosomes: | |
47 self.chromosomes[cname] = Chromosome(cname) | |
48 chrom = self.chromosomes[cname] | |
49 chrom.add_peak(Peak(cname, mid, value, distance, self)) | |
50 for chrom in self.chromosomes.values(): | |
51 chrom.sort_by_index() | |
52 | |
53 def filter(self, up_limit, low_limit): | |
54 for chrom in self.chromosomes.values(): | |
55 chrom.filter(up_limit, low_limit) | |
56 | |
57 def size(self): | |
58 return sum([len(c.peaks) for c in self.chromosomes.values()]) | |
59 | |
60 | |
61 class Chromosome(object): | |
62 | |
63 def __init__(self, name): | |
64 self.name = name | |
65 self.peaks = [] | |
66 | |
67 def add_peak(self, peak): | |
68 self.peaks.append(peak) | |
69 | |
70 def sort_by_index(self): | |
71 self.peaks.sort(key=lambda peak: peak.midpoint) | |
72 self.keys = make_keys(self.peaks) | |
73 | |
74 def remove_peak(self, peak): | |
75 i = bisect.bisect_left(self.keys, peak.midpoint) | |
76 # If the peak was actually found | |
77 if i < len(self.peaks) and self.peaks[i].midpoint == peak.midpoint: | |
78 del self.keys[i] | |
79 del self.peaks[i] | |
80 | |
81 def filter(self, up_limit, low_limit): | |
82 self.peaks = [p for p in self.peaks if low_limit <= p.distance <= up_limit] | |
83 self.keys = make_keys(self.peaks) | |
84 | |
85 | |
86 class Peak(object): | |
87 | |
88 def __init__(self, chrom, midpoint, value, distance, replicate): | |
89 self.chrom = chrom | |
90 self.value = value | |
91 self.midpoint = midpoint | |
92 self.distance = distance | |
93 self.replicate = replicate | |
94 | |
95 def normalized_value(self, med): | |
96 return self.value * med / self.replicate.median | |
97 | |
98 | |
99 class PeakGroup(object): | |
100 | |
101 def __init__(self): | |
102 self.peaks = {} | |
103 | |
104 def add_peak(self, repid, peak): | |
105 self.peaks[repid] = peak | |
106 | |
107 @property | |
108 def chrom(self): | |
109 return self.peaks.values()[0].chrom | |
110 | |
111 @property | |
112 def midpoint(self): | |
113 return median([peak.midpoint for peak in self.peaks.values()]) | |
114 | |
115 @property | |
116 def num_replicates(self): | |
117 return len(self.peaks) | |
118 | |
119 @property | |
120 def median_distance(self): | |
121 return median([peak.distance for peak in self.peaks.values()]) | |
122 | |
123 @property | |
124 def value_sum(self): | |
125 return sum([peak.value for peak in self.peaks.values()]) | |
126 | |
127 def normalized_value(self, med): | |
128 values = [] | |
129 for peak in self.peaks.values(): | |
130 values.append(peak.normalized_value(med)) | |
131 return median(values) | |
132 | |
133 @property | |
134 def peakpeak_distance(self): | |
135 keys = self.peaks.keys() | |
136 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) | |
137 | |
138 | |
139 class FrequencyDistribution(object): | |
140 | |
141 def __init__(self, d=None): | |
142 self.dist = d or {} | |
143 | |
144 def add(self, x): | |
145 self.dist[x] = self.dist.get(x, 0) + 1 | |
146 | |
147 def graph_series(self): | |
148 x = [] | |
149 y = [] | |
150 for key, val in self.dist.items(): | |
151 x.append(key) | |
152 y.append(val) | |
153 return x, y | |
154 | |
155 def mode(self): | |
156 return max(self.dist.items(), key=lambda data: data[1])[0] | |
157 | |
158 def size(self): | |
159 return sum(self.dist.values()) | |
160 | |
161 | |
162 def stop_err(msg): | |
163 sys.stderr.write(msg) | |
164 sys.exit(1) | |
165 | |
166 | |
167 def median(data): | |
168 """ | |
169 Find the integer median of the data set. | |
170 """ | |
171 if not data: | |
172 return 0 | |
173 sdata = sorted(data) | |
174 if len(data) % 2 == 0: | |
175 return (sdata[len(data)//2] + sdata[len(data)//2-1]) / 2 | |
176 else: | |
177 return sdata[len(data)//2] | |
178 | |
179 | |
180 def make_keys(peaks): | |
181 return [data.midpoint for data in peaks] | |
182 | |
183 | |
184 def get_window(chromosome, target_peaks, distance): | |
185 """ | |
186 Returns a window of all peaks from a replicate within a certain distance of | |
187 a peak from another replicate. | |
188 """ | |
189 lower = target_peaks[0].midpoint | |
190 upper = target_peaks[0].midpoint | |
191 for peak in target_peaks: | |
192 lower = min(lower, peak.midpoint - distance) | |
193 upper = max(upper, peak.midpoint + distance) | |
194 start_index = bisect.bisect_left(chromosome.keys, lower) | |
195 end_index = bisect.bisect_right(chromosome.keys, upper) | |
196 return (chromosome.peaks[start_index: end_index], chromosome.name) | |
197 | |
198 | |
199 def match_largest(window, peak, chrum): | |
200 if not window: | |
201 return None | |
202 if peak.chrom != chrum: | |
203 return None | |
204 return max(window, key=lambda cpeak: cpeak.value) | |
205 | |
206 | |
207 def match_closest(window, peak, chrum): | |
208 if not window: | |
209 return None | |
210 if peak.chrom != chrum: | |
211 return None | |
212 return min(window, key=lambda match: abs(match.midpoint - peak.midpoint)) | |
213 | |
214 | |
215 def frequency_histogram(freqs, dataset_path, labels=[], title=''): | |
216 pyplot.clf() | |
217 pyplot.figure(figsize=(10, 10)) | |
218 for i, freq in enumerate(freqs): | |
219 xvals, yvals = freq.graph_series() | |
220 # Go from high to low | |
221 xvals.reverse() | |
222 pyplot.bar([x-0.4 + 0.8/len(freqs)*i for x in xvals], yvals, width=0.8/len(freqs), color=COLORS[i]) | |
223 pyplot.xticks(range(min(xvals), max(xvals)+1), map(str, reversed(range(min(xvals), max(xvals)+1)))) | |
224 pyplot.xlabel(X_LABEL) | |
225 pyplot.ylabel(Y_LABEL) | |
226 pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) | |
227 ax = pyplot.gca() | |
228 for l in ax.get_xticklines() + ax.get_yticklines(): | |
229 l.set_markeredgewidth(TICK_WIDTH) | |
230 pyplot.savefig(dataset_path) | |
231 | |
232 | |
233 METHODS = {'closest': match_closest, 'largest': match_largest} | |
234 | |
235 | |
236 def gff_attrs(d): | |
237 if not d: | |
238 return '.' | |
239 return ';'.join('%s=%s' % item for item in d.items()) | |
240 | |
241 | |
242 def parse_gff_attrs(s): | |
243 d = {} | |
244 if s == '.': | |
245 return d | |
246 for item in s.split(';'): | |
247 key, val = item.split('=') | |
248 d[key] = val | |
249 return d | |
250 | |
251 | |
252 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | |
253 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | |
254 | |
255 | |
256 def get_temporary_plot_path(): | |
257 """ | |
258 Return the path to a temporary file with a valid image format | |
259 file extension that can be used with bioformats. | |
260 """ | |
261 tmp_dir = tempfile.mkdtemp(prefix='tmp-repmatch-') | |
262 fd, name = tempfile.mkstemp(suffix='.pdf', dir=tmp_dir) | |
263 os.close(fd) | |
264 return name | |
265 | |
266 | |
267 def process_files(dataset_paths, galaxy_hids, method, distance, step, replicates, up_limit, low_limit, output_files, | |
268 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): | |
269 output_statistics_histogram_file = output_files in ["all"] and method in ["all"] | |
270 if len(dataset_paths) < 2: | |
271 return | |
272 if method == 'all': | |
273 match_methods = METHODS.keys() | |
274 else: | |
275 match_methods = [method] | |
276 for match_method in match_methods: | |
277 statistics = perform_process(dataset_paths, | |
278 galaxy_hids, | |
279 match_method, | |
280 distance, | |
281 step, | |
282 replicates, | |
283 up_limit, | |
284 low_limit, | |
285 output_files, | |
286 output_matched_peaks, | |
287 output_unmatched_peaks, | |
288 output_detail, | |
289 output_statistics_table, | |
290 output_statistics_histogram) | |
291 if output_statistics_histogram_file: | |
292 tmp_statistics_histogram_path = get_temporary_plot_path() | |
293 frequency_histogram([stat['distribution'] for stat in [statistics]], | |
294 tmp_statistics_histogram_path, | |
295 METHODS.keys()) | |
296 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) | |
297 | |
298 | |
299 def perform_process(dataset_paths, galaxy_hids, method, distance, step, num_required, up_limit, low_limit, output_files, | |
300 output_matched_peaks, output_unmatched_peaks, output_detail, output_statistics_table, output_statistics_histogram): | |
301 output_detail_file = output_files in ["all"] and output_detail is not None | |
302 output_statistics_table_file = output_files in ["all"] and output_statistics_table is not None | |
303 output_unmatched_peaks_file = output_files in ["all", "matched_peaks_unmatched_peaks"] and output_unmatched_peaks is not None | |
304 output_statistics_histogram_file = output_files in ["all"] and output_statistics_histogram is not None | |
305 replicates = [] | |
306 for i, dataset_path in enumerate(dataset_paths): | |
307 try: | |
308 galaxy_hid = galaxy_hids[i] | |
309 r = Replicate(galaxy_hid, dataset_path) | |
310 replicates.append(r) | |
311 except Exception, e: | |
312 stop_err('Unable to parse file "%s", exception: %s' % (dataset_path, str(e))) | |
313 attrs = 'd%sr%s' % (distance, num_required) | |
314 if up_limit != 1000: | |
315 attrs += 'u%d' % up_limit | |
316 if low_limit != -1000: | |
317 attrs += 'l%d' % low_limit | |
318 if step != 0: | |
319 attrs += 's%d' % step | |
320 | |
321 def td_writer(file_path): | |
322 # Returns a tab-delimited writer for a certain output | |
323 return csv.writer(open(file_path, 'wt'), delimiter='\t') | |
324 | |
325 labels = ('chrom', | |
326 'median midpoint', | |
327 'median midpoint+1', | |
328 'median normalized reads', | |
329 'replicates', | |
330 'median c-w distance', | |
331 'reads sum') | |
332 for replicate in replicates: | |
333 labels += ('chrom', | |
334 'median midpoint', | |
335 'median midpoint+1', | |
336 'c-w sum', | |
337 'c-w distance', | |
338 'replicate id') | |
339 matched_peaks_output = td_writer(output_matched_peaks) | |
340 if output_statistics_table_file: | |
341 statistics_table_output = td_writer(output_statistics_table) | |
342 statistics_table_output.writerow(('data', 'median read count')) | |
343 if output_detail_file: | |
344 detail_output = td_writer(output_detail) | |
345 detail_output.writerow(labels) | |
346 if output_unmatched_peaks_file: | |
347 unmatched_peaks_output = td_writer(output_unmatched_peaks) | |
348 unmatched_peaks_output.writerow(('chrom', 'midpoint', 'midpoint+1', 'c-w sum', 'c-w distance', 'replicate id')) | |
349 # Perform filtering | |
350 if up_limit < 1000 or low_limit > -1000: | |
351 for replicate in replicates: | |
352 replicate.filter(up_limit, low_limit) | |
353 # Actually merge the peaks | |
354 peak_groups = [] | |
355 unmatched_peaks = [] | |
356 freq = FrequencyDistribution() | |
357 | |
358 def do_match(reps, distance): | |
359 # Copy list because we will mutate it, but keep replicate references. | |
360 reps = reps[:] | |
361 while len(reps) > 1: | |
362 # Iterate over each replicate as "main" | |
363 main = reps[0] | |
364 reps.remove(main) | |
365 for chromosome in main.chromosomes.values(): | |
366 peaks_by_value = chromosome.peaks[:] | |
367 # Sort main replicate by value | |
368 peaks_by_value.sort(key=lambda peak: -peak.value) | |
369 | |
370 def search_for_matches(group): | |
371 # Here we use multiple passes, expanding the window to be | |
372 # +- distance from any previously matched peak. | |
373 while True: | |
374 new_match = False | |
375 for replicate in reps: | |
376 if replicate.id in group.peaks: | |
377 # Stop if match already found for this replicate | |
378 continue | |
379 try: | |
380 # Lines changed to remove a major bug by Rohit Reja. | |
381 window, chrum = get_window(replicate.chromosomes[chromosome.name], | |
382 group.peaks.values(), | |
383 distance) | |
384 match = METHODS[method](window, peak, chrum) | |
385 except KeyError: | |
386 continue | |
387 if match: | |
388 group.add_peak(replicate.id, match) | |
389 new_match = True | |
390 if not new_match: | |
391 break | |
392 # Attempt to enlarge existing peak groups | |
393 for group in peak_groups: | |
394 old_peaks = group.peaks.values()[:] | |
395 search_for_matches(group) | |
396 for peak in group.peaks.values(): | |
397 if peak not in old_peaks: | |
398 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
399 # Attempt to find new peaks groups. For each peak in the | |
400 # main replicate, search for matches in the other replicates | |
401 for peak in peaks_by_value: | |
402 matches = PeakGroup() | |
403 matches.add_peak(main.id, peak) | |
404 search_for_matches(matches) | |
405 # Were enough replicates matched? | |
406 if matches.num_replicates >= num_required: | |
407 for peak in matches.peaks.values(): | |
408 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | |
409 peak_groups.append(matches) | |
410 # Zero or less = no stepping | |
411 if step <= 0: | |
412 do_match(replicates, distance) | |
413 else: | |
414 for d in range(0, distance, step): | |
415 do_match(replicates, d) | |
416 for group in peak_groups: | |
417 freq.add(group.num_replicates) | |
418 # Collect together the remaining unmatched_peaks | |
419 for replicate in replicates: | |
420 for chromosome in replicate.chromosomes.values(): | |
421 for peak in chromosome.peaks: | |
422 freq.add(1) | |
423 unmatched_peaks.append(peak) | |
424 # Average the unmatched_peaks count in the graph by # replicates | |
425 med = median([peak.value for group in peak_groups for peak in group.peaks.values()]) | |
426 for replicate in replicates: | |
427 replicate.median = median([peak.value for group in peak_groups for peak in group.peaks.values() if peak.replicate == replicate]) | |
428 statistics_table_output.writerow((replicate.id, replicate.median)) | |
429 for group in peak_groups: | |
430 # Output matched_peaks (matched pairs). | |
431 matched_peaks_output.writerow(gff_row(cname=group.chrom, | |
432 start=group.midpoint, | |
433 end=group.midpoint+1, | |
434 source='repmatch', | |
435 score=group.normalized_value(med), | |
436 attrs={'median_distance': group.median_distance, | |
437 'replicates': group.num_replicates, | |
438 'value_sum': group.value_sum})) | |
439 if output_detail_file: | |
440 matched_peaks = (group.chrom, | |
441 group.midpoint, | |
442 group.midpoint+1, | |
443 group.normalized_value(med), | |
444 group.num_replicates, | |
445 group.median_distance, | |
446 group.value_sum) | |
447 for peak in group.peaks.values(): | |
448 matched_peaks += (peak.chrom, peak.midpoint, peak.midpoint+1, peak.value, peak.distance, peak.replicate.id) | |
449 detail_output.writerow(matched_peaks) | |
450 if output_unmatched_peaks_file: | |
451 for unmatched_peak in unmatched_peaks: | |
452 unmatched_peaks_output.writerow((unmatched_peak.chrom, | |
453 unmatched_peak.midpoint, | |
454 unmatched_peak.midpoint+1, | |
455 unmatched_peak.value, | |
456 unmatched_peak.distance, | |
457 unmatched_peak.replicate.id)) | |
458 if output_statistics_histogram_file: | |
459 tmp_statistics_histogram_path = get_temporary_plot_path() | |
460 frequency_histogram([freq], tmp_statistics_histogram_path) | |
461 shutil.move(tmp_statistics_histogram_path, output_statistics_histogram) | |
462 return {'distribution': freq} |