Mercurial > repos > iuc > repmatch_gff3
comparison repmatch_gff3_util.py @ 4:6acaa2c93f47 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author | iuc |
---|---|
date | Wed, 12 Jul 2017 10:11:53 -0400 |
parents | e5c7fffdc078 |
children | 2365720de36d |
comparison
equal
deleted
inserted
replaced
3:f7608d0363bf | 4:6acaa2c93f47 |
---|---|
21 pyplot.rc('lines', linewidth=4.00) | 21 pyplot.rc('lines', linewidth=4.00) |
22 pyplot.rc('axes', linewidth=3.00) | 22 pyplot.rc('axes', linewidth=3.00) |
23 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) | 23 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) |
24 | 24 |
25 COLORS = 'krb' | 25 COLORS = 'krb' |
26 ISPY2 = sys.version_info[0] == 2 | |
26 | 27 |
27 | 28 |
28 class Replicate(object): | 29 class Replicate(object): |
29 | 30 |
30 def __init__(self, id, dataset_path): | 31 def __init__(self, id, dataset_path): |
31 self.id = id | 32 self.id = id |
32 self.dataset_path = dataset_path | 33 self.dataset_path = dataset_path |
33 self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) | 34 if ISPY2: |
35 fh = open(dataset_path, 'rb') | |
36 else: | |
37 fh = open(dataset_path, 'r', newline='') | |
38 self.parse(csv.reader(fh, delimiter='\t')) | |
34 | 39 |
35 def parse(self, reader): | 40 def parse(self, reader): |
36 self.chromosomes = {} | 41 self.chromosomes = {} |
37 for line in reader: | 42 for line in reader: |
38 if line[0].startswith("#") or line[0].startswith('"'): | 43 if line[0].startswith("#") or line[0].startswith('"'): |
39 continue | 44 continue |
40 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line | 45 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line |
41 attrs = parse_gff_attrs(attrs) | 46 attrs = parse_gff_attrs(attrs) |
42 distance = attrs['cw_distance'] | 47 distance = int(attrs['cw_distance']) |
43 mid = int(mid) | 48 mid = int(mid) |
44 midplus = int(midplus) | 49 midplus = int(midplus) |
45 value = float(value) | 50 value = float(value) |
46 distance = int(distance) | |
47 if cname not in self.chromosomes: | 51 if cname not in self.chromosomes: |
48 self.chromosomes[cname] = Chromosome(cname) | 52 self.chromosomes[cname] = Chromosome(cname) |
49 chrom = self.chromosomes[cname] | 53 chrom = self.chromosomes[cname] |
50 chrom.add_peak(Peak(cname, mid, value, distance, self)) | 54 chrom.add_peak(Peak(cname, mid, value, distance, self)) |
51 for chrom in self.chromosomes.values(): | 55 for chrom in self.chromosomes.values(): |
105 def add_peak(self, repid, peak): | 109 def add_peak(self, repid, peak): |
106 self.peaks[repid] = peak | 110 self.peaks[repid] = peak |
107 | 111 |
108 @property | 112 @property |
109 def chrom(self): | 113 def chrom(self): |
110 return self.peaks.values()[0].chrom | 114 return list(self.peaks.values())[0].chrom |
111 | 115 |
112 @property | 116 @property |
113 def midpoint(self): | 117 def midpoint(self): |
114 return median([peak.midpoint for peak in self.peaks.values()]) | 118 return int(median([peak.midpoint for peak in self.peaks.values()])) |
115 | 119 |
116 @property | 120 @property |
117 def num_replicates(self): | 121 def num_replicates(self): |
118 return len(self.peaks) | 122 return len(self.peaks) |
119 | 123 |
120 @property | 124 @property |
121 def median_distance(self): | 125 def median_distance(self): |
122 return median([peak.distance for peak in self.peaks.values()]) | 126 return int(median([peak.distance for peak in self.peaks.values()])) |
123 | 127 |
124 @property | 128 @property |
125 def value_sum(self): | 129 def value_sum(self): |
126 return sum([peak.value for peak in self.peaks.values()]) | 130 return sum([peak.value for peak in self.peaks.values()]) |
127 | 131 |
131 values.append(peak.normalized_value(med)) | 135 values.append(peak.normalized_value(med)) |
132 return median(values) | 136 return median(values) |
133 | 137 |
134 @property | 138 @property |
135 def peakpeak_distance(self): | 139 def peakpeak_distance(self): |
136 keys = self.peaks.keys() | 140 keys = list(self.peaks.keys()) |
137 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) | 141 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) |
138 | 142 |
139 | 143 |
140 class FrequencyDistribution(object): | 144 class FrequencyDistribution(object): |
141 | 145 |
185 def get_window(chromosome, target_peaks, distance): | 189 def get_window(chromosome, target_peaks, distance): |
186 """ | 190 """ |
187 Returns a window of all peaks from a replicate within a certain distance of | 191 Returns a window of all peaks from a replicate within a certain distance of |
188 a peak from another replicate. | 192 a peak from another replicate. |
189 """ | 193 """ |
190 lower = target_peaks[0].midpoint | 194 lower = list(target_peaks)[0].midpoint |
191 upper = target_peaks[0].midpoint | 195 upper = list(target_peaks)[0].midpoint |
192 for peak in target_peaks: | 196 for peak in target_peaks: |
193 lower = min(lower, peak.midpoint - distance) | 197 lower = min(lower, peak.midpoint - distance) |
194 upper = max(upper, peak.midpoint + distance) | 198 upper = max(upper, peak.midpoint + distance) |
195 start_index = bisect.bisect_left(chromosome.keys, lower) | 199 start_index = bisect.bisect_left(chromosome.keys, lower) |
196 end_index = bisect.bisect_right(chromosome.keys, upper) | 200 end_index = bisect.bisect_right(chromosome.keys, upper) |
232 | 236 |
233 | 237 |
234 METHODS = {'closest': match_closest, 'largest': match_largest} | 238 METHODS = {'closest': match_closest, 'largest': match_largest} |
235 | 239 |
236 | 240 |
237 def gff_attrs(d): | 241 def gff_attrs(l): |
238 if not d: | 242 if len(l) == 0: |
239 return '.' | 243 return '.' |
240 return ';'.join('%s=%s' % item for item in d.items()) | 244 return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l) |
241 | 245 |
242 | 246 |
243 def parse_gff_attrs(s): | 247 def parse_gff_attrs(s): |
244 d = {} | 248 d = {} |
245 if s == '.': | 249 if s == '.': |
248 key, val = item.split('=') | 252 key, val = item.split('=') |
249 d[key] = val | 253 d[key] = val |
250 return d | 254 return d |
251 | 255 |
252 | 256 |
253 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | 257 def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None): |
254 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | 258 return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or [])) |
255 | 259 |
256 | 260 |
257 def get_temporary_plot_path(): | 261 def get_temporary_plot_path(): |
258 """ | 262 """ |
259 Return the path to a temporary file with a valid image format | 263 Return the path to a temporary file with a valid image format |
319 if step != 0: | 323 if step != 0: |
320 attrs += 's%d' % step | 324 attrs += 's%d' % step |
321 | 325 |
322 def td_writer(file_path): | 326 def td_writer(file_path): |
323 # Returns a tab-delimited writer for a certain output | 327 # Returns a tab-delimited writer for a certain output |
324 return csv.writer(open(file_path, 'wt'), delimiter='\t') | 328 if ISPY2: |
329 fh = open(file_path, 'wb') | |
330 return csv.writer(fh, delimiter='\t') | |
331 else: | |
332 fh = open(file_path, 'w', newline='') | |
333 return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE) | |
325 | 334 |
326 labels = ('chrom', | 335 labels = ('chrom', |
327 'median midpoint', | 336 'median midpoint', |
328 'median midpoint+1', | 337 'median midpoint+1', |
329 'median normalized reads', | 338 'median normalized reads', |
361 reps = reps[:] | 370 reps = reps[:] |
362 while len(reps) > 1: | 371 while len(reps) > 1: |
363 # Iterate over each replicate as "main" | 372 # Iterate over each replicate as "main" |
364 main = reps[0] | 373 main = reps[0] |
365 reps.remove(main) | 374 reps.remove(main) |
366 for chromosome in main.chromosomes.values(): | 375 for chromosome in list(main.chromosomes.values()): |
367 peaks_by_value = chromosome.peaks[:] | 376 peaks_by_value = chromosome.peaks[:] |
368 # Sort main replicate by value | 377 # Sort main replicate by value |
369 peaks_by_value.sort(key=lambda peak: -peak.value) | 378 peaks_by_value.sort(key=lambda peak: -peak.value) |
370 | 379 |
371 def search_for_matches(group): | 380 def search_for_matches(group): |
377 if replicate.id in group.peaks: | 386 if replicate.id in group.peaks: |
378 # Stop if match already found for this replicate | 387 # Stop if match already found for this replicate |
379 continue | 388 continue |
380 try: | 389 try: |
381 # Lines changed to remove a major bug by Rohit Reja. | 390 # Lines changed to remove a major bug by Rohit Reja. |
382 window, chrum = get_window(replicate.chromosomes[chromosome.name], | 391 window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance) |
383 group.peaks.values(), | |
384 distance) | |
385 match = METHODS[method](window, peak, chrum) | 392 match = METHODS[method](window, peak, chrum) |
386 except KeyError: | 393 except KeyError: |
387 continue | 394 continue |
388 if match: | 395 if match: |
389 group.add_peak(replicate.id, match) | 396 group.add_peak(replicate.id, match) |
390 new_match = True | 397 new_match = True |
391 if not new_match: | 398 if not new_match: |
392 break | 399 break |
393 # Attempt to enlarge existing peak groups | 400 # Attempt to enlarge existing peak groups |
394 for group in peak_groups: | 401 for group in peak_groups: |
395 old_peaks = group.peaks.values()[:] | 402 old_peaks = list(group.peaks.values()) |
396 search_for_matches(group) | 403 search_for_matches(group) |
397 for peak in group.peaks.values(): | 404 for peak in list(group.peaks.values()): |
398 if peak not in old_peaks: | 405 if peak not in old_peaks: |
399 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | 406 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) |
400 # Attempt to find new peaks groups. For each peak in the | 407 # Attempt to find new peaks groups. For each peak in the |
401 # main replicate, search for matches in the other replicates | 408 # main replicate, search for matches in the other replicates |
402 for peak in peaks_by_value: | 409 for peak in peaks_by_value: |
403 matches = PeakGroup() | 410 matches = PeakGroup() |
404 matches.add_peak(main.id, peak) | 411 matches.add_peak(main.id, peak) |
405 search_for_matches(matches) | 412 search_for_matches(matches) |
406 # Were enough replicates matched? | 413 # Were enough replicates matched? |
407 if matches.num_replicates >= num_required: | 414 if matches.num_replicates >= num_required: |
408 for peak in matches.peaks.values(): | 415 for peak in list(matches.peaks.values()): |
409 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) | 416 peak.replicate.chromosomes[chromosome.name].remove_peak(peak) |
410 peak_groups.append(matches) | 417 peak_groups.append(matches) |
411 # Zero or less = no stepping | 418 # Zero or less = no stepping |
412 if step <= 0: | 419 if step <= 0: |
413 do_match(replicates, distance) | 420 do_match(replicates, distance) |
430 for group in peak_groups: | 437 for group in peak_groups: |
431 # Output matched_peaks (matched pairs). | 438 # Output matched_peaks (matched pairs). |
432 matched_peaks_output.writerow(gff_row(cname=group.chrom, | 439 matched_peaks_output.writerow(gff_row(cname=group.chrom, |
433 start=group.midpoint, | 440 start=group.midpoint, |
434 end=group.midpoint + 1, | 441 end=group.midpoint + 1, |
442 score=group.normalized_value(med), | |
435 source='repmatch', | 443 source='repmatch', |
436 score=group.normalized_value(med), | 444 stype='.', |
437 attrs={'median_distance': group.median_distance, | 445 strand='.', |
438 'replicates': group.num_replicates, | 446 phase='.', |
439 'value_sum': group.value_sum})) | 447 attrs=[('median_distance', group.median_distance), |
448 ('value_sum', group.value_sum), | |
449 ('replicates', group.num_replicates)])) | |
440 if output_detail_file: | 450 if output_detail_file: |
441 matched_peaks = (group.chrom, | 451 matched_peaks = (group.chrom, |
442 group.midpoint, | 452 group.midpoint, |
443 group.midpoint + 1, | 453 group.midpoint + 1, |
444 group.normalized_value(med), | 454 group.normalized_value(med), |