Mercurial > repos > iuc > repmatch_gff3
diff repmatch_gff3_util.py @ 4:6acaa2c93f47 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9
author | iuc |
---|---|
date | Wed, 12 Jul 2017 10:11:53 -0400 |
parents | e5c7fffdc078 |
children | 2365720de36d |
line wrap: on
line diff
--- a/repmatch_gff3_util.py Fri Jan 13 10:52:02 2017 -0500 +++ b/repmatch_gff3_util.py Wed Jul 12 10:11:53 2017 -0400 @@ -23,6 +23,7 @@ pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) COLORS = 'krb' +ISPY2 = sys.version_info[0] == 2 class Replicate(object): @@ -30,7 +31,11 @@ def __init__(self, id, dataset_path): self.id = id self.dataset_path = dataset_path - self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t')) + if ISPY2: + fh = open(dataset_path, 'rb') + else: + fh = open(dataset_path, 'r', newline='') + self.parse(csv.reader(fh, delimiter='\t')) def parse(self, reader): self.chromosomes = {} @@ -39,11 +44,10 @@ continue cname, junk, junk, mid, midplus, value, strand, junk, attrs = line attrs = parse_gff_attrs(attrs) - distance = attrs['cw_distance'] + distance = int(attrs['cw_distance']) mid = int(mid) midplus = int(midplus) value = float(value) - distance = int(distance) if cname not in self.chromosomes: self.chromosomes[cname] = Chromosome(cname) chrom = self.chromosomes[cname] @@ -107,11 +111,11 @@ @property def chrom(self): - return self.peaks.values()[0].chrom + return list(self.peaks.values())[0].chrom @property def midpoint(self): - return median([peak.midpoint for peak in self.peaks.values()]) + return int(median([peak.midpoint for peak in self.peaks.values()])) @property def num_replicates(self): @@ -119,7 +123,7 @@ @property def median_distance(self): - return median([peak.distance for peak in self.peaks.values()]) + return int(median([peak.distance for peak in self.peaks.values()])) @property def value_sum(self): @@ -133,7 +137,7 @@ @property def peakpeak_distance(self): - keys = self.peaks.keys() + keys = list(self.peaks.keys()) return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint) @@ -187,8 +191,8 @@ Returns a window of all peaks from a replicate within a certain distance of a peak from another replicate. """ - lower = target_peaks[0].midpoint - upper = target_peaks[0].midpoint + lower = list(target_peaks)[0].midpoint + upper = list(target_peaks)[0].midpoint for peak in target_peaks: lower = min(lower, peak.midpoint - distance) upper = max(upper, peak.midpoint + distance) @@ -234,10 +238,10 @@ METHODS = {'closest': match_closest, 'largest': match_largest} -def gff_attrs(d): - if not d: +def gff_attrs(l): + if len(l) == 0: return '.' - return ';'.join('%s=%s' % item for item in d.items()) + return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l) def parse_gff_attrs(s): @@ -250,8 +254,8 @@ return d -def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): - return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) +def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None): + return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or [])) def get_temporary_plot_path(): @@ -321,7 +325,12 @@ def td_writer(file_path): # Returns a tab-delimited writer for a certain output - return csv.writer(open(file_path, 'wt'), delimiter='\t') + if ISPY2: + fh = open(file_path, 'wb') + return csv.writer(fh, delimiter='\t') + else: + fh = open(file_path, 'w', newline='') + return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE) labels = ('chrom', 'median midpoint', @@ -363,7 +372,7 @@ # Iterate over each replicate as "main" main = reps[0] reps.remove(main) - for chromosome in main.chromosomes.values(): + for chromosome in list(main.chromosomes.values()): peaks_by_value = chromosome.peaks[:] # Sort main replicate by value peaks_by_value.sort(key=lambda peak: -peak.value) @@ -379,9 +388,7 @@ continue try: # Lines changed to remove a major bug by Rohit Reja. - window, chrum = get_window(replicate.chromosomes[chromosome.name], - group.peaks.values(), - distance) + window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance) match = METHODS[method](window, peak, chrum) except KeyError: continue @@ -392,9 +399,9 @@ break # Attempt to enlarge existing peak groups for group in peak_groups: - old_peaks = group.peaks.values()[:] + old_peaks = list(group.peaks.values()) search_for_matches(group) - for peak in group.peaks.values(): + for peak in list(group.peaks.values()): if peak not in old_peaks: peak.replicate.chromosomes[chromosome.name].remove_peak(peak) # Attempt to find new peaks groups. For each peak in the @@ -405,7 +412,7 @@ search_for_matches(matches) # Were enough replicates matched? if matches.num_replicates >= num_required: - for peak in matches.peaks.values(): + for peak in list(matches.peaks.values()): peak.replicate.chromosomes[chromosome.name].remove_peak(peak) peak_groups.append(matches) # Zero or less = no stepping @@ -432,11 +439,14 @@ matched_peaks_output.writerow(gff_row(cname=group.chrom, start=group.midpoint, end=group.midpoint + 1, + score=group.normalized_value(med), source='repmatch', - score=group.normalized_value(med), - attrs={'median_distance': group.median_distance, - 'replicates': group.num_replicates, - 'value_sum': group.value_sum})) + stype='.', + strand='.', + phase='.', + attrs=[('median_distance', group.median_distance), + ('value_sum', group.value_sum), + ('replicates', group.num_replicates)])) if output_detail_file: matched_peaks = (group.chrom, group.midpoint,