cwpair2: cwpair2_util.py comparison

comparison cwpair2_util.py @ 6:c4b926c9831c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit ca66053a0d915edba3d7f1d4ce014718e7f4f0f2

author	iuc
date	Thu, 02 May 2019 08:30:22 -0400
parents	71188f3f4b76
children

comparison

equal deleted inserted replaced

-:71188f3f4b76
+:c4b926c9831c
 MATCHED_PAIRS = 'MP'
 ORPHANS = 'O'
 # Data output formats
 GFF_EXT = 'gff'
 TABULAR_EXT = 'tabular'
-# Statistics historgrams output directory.
+# Statistics histograms output directory.
 HISTOGRAM = 'H'
 # Statistics outputs
 FINAL_PLOTS = 'F'
 PREVIEW_PLOTS = 'P'
 STATS_GRAPH = 'C'
 self.dist = d or {}
 self.binsize = binsize
 def get_bin(self, x):
 """
-Returns the bin in which a data point falls
+Returns the centre of the bin in which a data point falls
 """
 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0
 def add(self, x):
 x = self.get_bin(x)
 x.append(center)
 y.append(self.dist.get(center, 0))
 return x, y
 def mode(self):
-return max(self.dist.items(), key=lambda data: data[1])[0]
+# There could be more than one mode for a frequency distribution,
+# return the median of the modes to be consistent
+max_frequency = max(self.dist.values())
+modes = sorted(_[0] for _ in self.dist.items() if _[1] == max_frequency)
+median_index = len(modes) // 2
+return modes[median_index]
 def size(self):
 return sum(self.dist.values())
 sys.stderr.write(msg)
 sys.exit(1)
 def distance(peak1, peak2):
-return (peak2[1] + peak2[2]) / 2 - (peak1[1] + peak1[2]) / 2
+return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0
 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
 def parse_chromosomes(reader):
 # This version of cwpair2 accepts only gff format as input.
 chromosomes = {}
-next(reader)
 for line in reader:
-cname, junk, junk, start, end, value, strand, junk, junk = line
+line = line.rstrip("\r\n")
+if not line or line.startswith('#'):
+continue
+cname, _, _, start, end, value, strand, _, _ = line.split("\t")
 start = int(start)
 end = int(end)
 value = float(value)
 if cname not in chromosomes:
 chromosomes[cname] = []
 values.sort()
 # Get 95% value
 return values[int(len(values) * 0.95)]
-def filter(chromosomes, threshold=0.05):
+def peak_filter(chromosomes, threshold):
 """
 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
 as a proportion of the maximum, >=1.0 as an absolute value.
 """
 if threshold < 1:
 return watson, crick
 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
-for cname, data in chromosomes.items():
+for data in chromosomes.values():
 watson, crick = split_strands(data)
 crick.sort(key=lambda data: float(data[1]))
 keys = make_keys(crick)
 for peak in watson:
 for cpeak in get_window(crick, peak, up_distance, down_distance, keys):
 output_details = output_files in ["all", "matched_pair_orphan_detail"]
 output_plots = output_files in ["all"]
 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]
 # Keep track of statistics for the output file
 statistics = {}
-input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
 fpath, fname = os.path.split(dataset_path)
 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
 statistics['dir'] = fpath
 if threshold >= 1:
 filter_string = 'fa%d' % threshold
 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
 def td_writer(output_type, extension, fname):
 # Returns a tab-delimited writer for a specified output.
 output_file_path = make_path(output_type, extension, fname)
-return csv.writer(open(output_file_path, 'wt'), delimiter='\t')
+return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n")
-try:
+with open(dataset_path, 'rt') as input:
-chromosomes = parse_chromosomes(input)
+try:
-except Exception:
+chromosomes = parse_chromosomes(input)
-stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
+except Exception:
+stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
 if output_details:
 # Details
 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname)
 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
 if output_plots:
 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
 if output_plots:
 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname)
 statistics['perc95'] = perc95(chromosomes)
 if threshold > 0:
-# Apply filter
+# Apply peak_filter
-filter(chromosomes, threshold)
+peak_filter(chromosomes, threshold)
 if method == 'mode':
 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
 mode = freq.mode()
 statistics['preview_mode'] = mode
 if output_plots:

Mercurial > repos > iuc > cwpair2

comparison cwpair2_util.py @ 6:c4b926c9831c draft default tip