repmatch_gff3: repmatch_gff3_util.py comparison

comparison repmatch_gff3_util.py @ 4:6acaa2c93f47 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/repmatch_gff3 commit 315c3ddcdbf38a27d43753aae3b6d379306be5a9

author	iuc
date	Wed, 12 Jul 2017 10:11:53 -0400
parents	e5c7fffdc078
children	2365720de36d

comparison

Legend: equal, deleted, inserted, replaced

-:f7608d0363bf
+:6acaa2c93f47
 pyplot.rc('lines', linewidth=4.00)
 pyplot.rc('axes', linewidth=3.00)
 pyplot.rc('font', family='Bitstream Vera Sans', size=32.0)
 COLORS = 'krb'
+ISPY2 = sys.version_info[0] == 2
 class Replicate(object):
 def __init__(self, id, dataset_path):
 self.id = id
 self.dataset_path = dataset_path
-self.parse(csv.reader(open(dataset_path, 'rt'), delimiter='\t'))
+if ISPY2:
+fh = open(dataset_path, 'rb')
+else:
+fh = open(dataset_path, 'r', newline='')
+self.parse(csv.reader(fh, delimiter='\t'))
 def parse(self, reader):
 self.chromosomes = {}
 for line in reader:
 if line[0].startswith("#") or line[0].startswith('"'):
 continue
 cname, junk, junk, mid, midplus, value, strand, junk, attrs = line
 attrs = parse_gff_attrs(attrs)
-distance = attrs['cw_distance']
+distance = int(attrs['cw_distance'])
 mid = int(mid)
 midplus = int(midplus)
 value = float(value)
-distance = int(distance)
 if cname not in self.chromosomes:
 self.chromosomes[cname] = Chromosome(cname)
 chrom = self.chromosomes[cname]
 chrom.add_peak(Peak(cname, mid, value, distance, self))
 for chrom in self.chromosomes.values():
 def add_peak(self, repid, peak):
 self.peaks[repid] = peak
 @property
 def chrom(self):
-return self.peaks.values()[0].chrom
+return list(self.peaks.values())[0].chrom
 @property
 def midpoint(self):
-return median([peak.midpoint for peak in self.peaks.values()])
+return int(median([peak.midpoint for peak in self.peaks.values()]))
 @property
 def num_replicates(self):
 return len(self.peaks)
 @property
 def median_distance(self):
-return median([peak.distance for peak in self.peaks.values()])
+return int(median([peak.distance for peak in self.peaks.values()]))
 @property
 def value_sum(self):
 return sum([peak.value for peak in self.peaks.values()])
 values.append(peak.normalized_value(med))
 return median(values)
 @property
 def peakpeak_distance(self):
-keys = self.peaks.keys()
+keys = list(self.peaks.keys())
 return abs(self.peaks[keys[0]].midpoint - self.peaks[keys[1]].midpoint)
 class FrequencyDistribution(object):
 def get_window(chromosome, target_peaks, distance):
 """
 Returns a window of all peaks from a replicate within a certain distance of
 a peak from another replicate.
 """
-lower = target_peaks[0].midpoint
+lower = list(target_peaks)[0].midpoint
-upper = target_peaks[0].midpoint
+upper = list(target_peaks)[0].midpoint
 for peak in target_peaks:
 lower = min(lower, peak.midpoint - distance)
 upper = max(upper, peak.midpoint + distance)
 start_index = bisect.bisect_left(chromosome.keys, lower)
 end_index = bisect.bisect_right(chromosome.keys, upper)
 METHODS = {'closest': match_closest, 'largest': match_largest}
-def gff_attrs(d):
+def gff_attrs(l):
-if not d:
+if len(l) == 0:
 return '.'
-return ';'.join('%s=%s' % item for item in d.items())
+return ';'.join('%s=%s' % (tup[0], tup[1]) for tup in l)
 def parse_gff_attrs(s):
 d = {}
 if s == '.':
 key, val = item.split('=')
 d[key] = val
 return d
-def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
+def gff_row(cname, start, end, score, source, stype='.', strand='.', phase='.', attrs=None):
-return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
+return (cname, source, stype, start, end, score, strand, phase, gff_attrs(attrs or []))
 def get_temporary_plot_path():
 """
 Return the path to a temporary file with a valid image format
 if step != 0:
 attrs += 's%d' % step
 def td_writer(file_path):
 # Returns a tab-delimited writer for a certain output
-return csv.writer(open(file_path, 'wt'), delimiter='\t')
+if ISPY2:
+fh = open(file_path, 'wb')
+return csv.writer(fh, delimiter='\t')
+else:
+fh = open(file_path, 'w', newline='')
+return csv.writer(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
 labels = ('chrom',
 'median midpoint',
 'median midpoint+1',
 'median normalized reads',
 reps = reps[:]
 while len(reps) > 1:
 # Iterate over each replicate as "main"
 main = reps[0]
 reps.remove(main)
-for chromosome in main.chromosomes.values():
+for chromosome in list(main.chromosomes.values()):
 peaks_by_value = chromosome.peaks[:]
 # Sort main replicate by value
 peaks_by_value.sort(key=lambda peak: -peak.value)
 def search_for_matches(group):
 if replicate.id in group.peaks:
 # Stop if match already found for this replicate
 continue
 try:
 # Lines changed by Rohit Reja to remove a major bug.
-window, chrum = get_window(replicate.chromosomes[chromosome.name],
+window, chrum = get_window(replicate.chromosomes[chromosome.name], list(group.peaks.values()), distance)
-group.peaks.values(),
-distance)
 match = METHODS[method](window, peak, chrum)
 except KeyError:
 continue
 if match:
 group.add_peak(replicate.id, match)
 new_match = True
 if not new_match:
 break
 # Attempt to enlarge existing peak groups
 for group in peak_groups:
-old_peaks = group.peaks.values()[:]
+old_peaks = list(group.peaks.values())
 search_for_matches(group)
-for peak in group.peaks.values():
+for peak in list(group.peaks.values()):
 if peak not in old_peaks:
 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
 # Attempt to find new peak groups.  For each peak in the
 # main replicate, search for matches in the other replicates
 for peak in peaks_by_value:
 matches = PeakGroup()
 matches.add_peak(main.id, peak)
 search_for_matches(matches)
 # Were enough replicates matched?
 if matches.num_replicates >= num_required:
-for peak in matches.peaks.values():
+for peak in list(matches.peaks.values()):
 peak.replicate.chromosomes[chromosome.name].remove_peak(peak)
 peak_groups.append(matches)
 # Zero or less = no stepping
 if step <= 0:
 do_match(replicates, distance)
 for group in peak_groups:
 # Output matched_peaks (matched pairs).
 matched_peaks_output.writerow(gff_row(cname=group.chrom,
 start=group.midpoint,
 end=group.midpoint + 1,
+score=group.normalized_value(med),
 source='repmatch',
-score=group.normalized_value(med),
+stype='.',
-attrs={'median_distance': group.median_distance,
+strand='.',
-'replicates': group.num_replicates,
+phase='.',
-'value_sum': group.value_sum}))
+attrs=[('median_distance', group.median_distance),
+('value_sum', group.value_sum),
+('replicates', group.num_replicates)]))
 if output_detail_file:
 matched_peaks = (group.chrom,
 group.midpoint,
 group.midpoint + 1,
 group.normalized_value(med),

Mercurial > repos > iuc > repmatch_gff3

comparison repmatch_gff3_util.py @ 4:6acaa2c93f47 draft