Mercurial > repos > iuc > cwpair2
diff cwpair2_util.py @ 0:8600bfe7ed52 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit e96df94dba60050fa28aaf55b5bb095717a5f260
| author | iuc | 
|---|---|
| date | Tue, 22 Dec 2015 17:03:46 -0500 | 
| parents | |
| children | abc464ca7260 | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cwpair2_util.py Tue Dec 22 17:03:46 2015 -0500 @@ -0,0 +1,393 @@ +import bisect +import csv +import os +import sys +import traceback +import matplotlib +matplotlib.use('Agg') +from matplotlib import pyplot + +# Data outputs +DETAILS = 'D' +MATCHED_PAIRS = 'MP' +ORPHANS = 'O' +# Data output formats +GFF_EXT = 'gff' +TABULAR_EXT = 'tabular' +# Statistics historgrams output directory. +HISTOGRAM = 'H' +# Statistics outputs +FINAL_PLOTS = 'F' +PREVIEW_PLOTS = 'P' +STATS_GRAPH = 'C' + +# Graph settings. +COLORS = 'krg' +Y_LABEL = 'Peak-pair counts' +X_LABEL = 'Peak-pair distance (bp)' +TICK_WIDTH = 3 +ADJUST = [0.140, 0.9, 0.9, 0.1] +PLOT_FORMAT = 'pdf' +pyplot.rc('xtick.major', size=10.00) +pyplot.rc('ytick.major', size=10.00) +pyplot.rc('lines', linewidth=4.00) +pyplot.rc('axes', linewidth=3.00) +pyplot.rc('font', family='Bitstream Vera Sans', size=32.0) + + +class FrequencyDistribution(object): + + def __init__(self, start, end, binsize=10, d=None): + self.start = start + self.end = end + self.dist = d or {} + self.binsize = binsize + + def get_bin(self, x): + """ + Returns the bin in which a data point falls + """ + return self.start + (x-self.start) // self.binsize * self.binsize + self.binsize/2.0 + + def add(self, x): + x = self.get_bin(x) + self.dist[x] = self.dist.get(x, 0) + 1 + + def graph_series(self): + x = [] + y = [] + for i in range(self.start, self.end, self.binsize): + center = self.get_bin(i) + x.append(center) + y.append(self.dist.get(center, 0)) + return x, y + + def mode(self): + return max(self.dist.items(), key=lambda data: data[1])[0] + + def size(self): + return sum(self.dist.values()) + + +def stop_err(msg): + sys.stderr.write(msg) + sys.exit(1) + + +def distance(peak1, peak2): + return (peak2[1]+peak2[2])/2 - (peak1[1]+peak1[2])/2 + + +def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): + return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) + + +def gff_attrs(d): + if not d: + return '.' + return ';'.join('%s=%s' % item for item in d.items()) + + +def parse_chromosomes(reader): + # This version of cwpair2 accepts only gff format as input. + chromosomes = {} + reader.next() + for line in reader: + cname, junk, junk, start, end, value, strand, junk, junk = line + start = int(start) + end = int(end) + value = float(value) + if cname not in chromosomes: + chromosomes[cname] = [] + peaks = chromosomes[cname] + peaks.append((strand, start, end, value)) + return chromosomes + + +def perc95(chromosomes): + """ + Returns the 95th percentile value of the given chromosomes. + """ + values = [] + for peaks in chromosomes.values(): + for peak in peaks: + values.append(peak[3]) + values.sort() + # Get 95% value + return values[int(len(values)*0.95)] + + +def filter(chromosomes, threshold=0.05): + """ + Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted + as a proportion of the maximum, >=1.0 as an absolute value. + """ + if threshold < 1: + p95 = perc95(chromosomes) + threshold = p95 * threshold + # Make the threshold a proportion of the + for cname, peaks in chromosomes.items(): + chromosomes[cname] = [peak for peak in peaks if peak[3] > threshold] + + +def split_strands(chromosome): + watson = [peak for peak in chromosome if peak[0] == '+'] + crick = [peak for peak in chromosome if peak[0] == '-'] + return watson, crick + + +def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): + dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) + for cname, data in chromosomes.items(): + watson, crick = split_strands(data) + crick.sort(key=lambda data: float(data[1])) + keys = make_keys(crick) + for peak in watson: + for cpeak in get_window(crick, peak, up_distance, down_distance, keys): + dist.add(distance(peak, cpeak)) + return dist + + +def make_keys(crick): + return [(data[1] + data[2])//2 for data in crick] + + +def get_window(crick, peak, up_distance, down_distance, keys=None): + """ + Returns a window of all crick peaks within a distance of a watson peak. + crick strand MUST be sorted by distance + """ + strand, start, end, value = peak + midpoint = (start + end) // 2 + lower = midpoint - up_distance + upper = midpoint + down_distance + keys = keys or make_keys(crick) + start_index = bisect.bisect_left(keys, lower) + end_index = bisect.bisect_right(keys, upper) + return [cpeak for cpeak in crick[start_index:end_index]] + + +def match_largest(window, peak): + if not window: + return None + return max(window, key=lambda cpeak: cpeak[3]) + + +def match_closest(window, peak): + if not window: + return None + + def key(cpeak): + d = distance(peak, cpeak) + # Search negative distances last + if d < 0: + # And then prefer less negative distances + d = 10000 - d + return d + return min(window, key=key) + + +def match_mode(window, peak, mode): + if not window: + return None + return min(window, key=lambda cpeak: abs(distance(peak, cpeak)-mode)) + +METHODS = {'mode': match_mode, 'closest': match_closest, 'largest': match_largest} + + +def frequency_plot(freqs, fname, labels=[], title=''): + pyplot.clf() + pyplot.figure(figsize=(10, 10)) + for i, freq in enumerate(freqs): + x, y = freq.graph_series() + pyplot.plot(x, y, '%s-' % COLORS[i]) + if len(freqs) > 1: + pyplot.legend(labels) + pyplot.xlim(freq.start, freq.end) + pyplot.ylim(ymin=0) + pyplot.ylabel(Y_LABEL) + pyplot.xlabel(X_LABEL) + pyplot.subplots_adjust(left=ADJUST[0], right=ADJUST[1], top=ADJUST[2], bottom=ADJUST[3]) + # Get the current axes + ax = pyplot.gca() + for l in ax.get_xticklines() + ax.get_yticklines(): + l.set_markeredgewidth(TICK_WIDTH) + pyplot.savefig(fname) + + +def create_directories(): + # Output histograms in pdf. + os.mkdir(HISTOGRAM) + os.mkdir('data_%s' % DETAILS) + os.mkdir('data_%s' % ORPHANS) + os.mkdir('data_%s' % MATCHED_PAIRS) + + +def process_file(dataset_path, galaxy_hid, method, threshold, up_distance, + down_distance, binsize, output_files): + if method == 'all': + match_methods = METHODS.keys() + else: + match_methods = [method] + statistics = [] + for match_method in match_methods: + stats = perform_process(dataset_path, + galaxy_hid, + match_method, + threshold, + up_distance, + down_distance, + binsize, + output_files) + statistics.append(stats) + if output_files == 'all' and method == 'all': + frequency_plot([s['dist'] for s in statistics], + statistics[0]['graph_path'], + labels=METHODS.keys()) + return statistics + + +def perform_process(dataset_path, galaxy_hid, method, threshold, up_distance, + down_distance, binsize, output_files): + output_details = output_files in ["all", "matched_pair_orphan_detail"] + output_plots = output_files in ["all"] + output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] + # Keep track of statistics for the output file + statistics = {} + input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') + fpath, fname = os.path.split(dataset_path) + statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) + statistics['dir'] = fpath + if threshold >= 1: + filter_string = 'fa%d' % threshold + else: + filter_string = 'f%d' % (threshold * 100) + fname = '%s_%su%dd%d_on_data_%s' % (method, filter_string, up_distance, down_distance, galaxy_hid) + + def make_histogram_path(output_type, fname): + return os.path.join(HISTOGRAM, 'histogram_%s_%s.%s' % (output_type, fname, PLOT_FORMAT)) + + def make_path(output_type, extension, fname): + # Returns the full path for an output. + return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) + + def td_writer(output_type, extension, fname): + # Returns a tab-delimited writer for a specified output. + output_file_path = make_path(output_type, extension, fname) + return csv.writer(open(output_file_path, 'wt'), delimiter='\t') + + try: + chromosomes = parse_chromosomes(input) + except Exception: + stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) + if output_details: + # Details + detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) + detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) + if output_plots: + # Final Plot + final_plot_path = make_histogram_path(FINAL_PLOTS, fname) + if output_orphans: + # Orphans + orphan_output = td_writer('data_%s' % ORPHANS, TABULAR_EXT, fname) + orphan_output.writerow(('chrom', 'strand', 'start', 'end', 'value')) + if output_plots: + # Preview Plot + preview_plot_path = make_histogram_path(PREVIEW_PLOTS, fname) + # Matched Pairs. + matched_pairs_output = td_writer('data_%s' % MATCHED_PAIRS, GFF_EXT, fname) + statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT + if output_plots: + statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) + statistics['perc95'] = perc95(chromosomes) + if threshold > 0: + # Apply filter + filter(chromosomes, threshold) + if method == 'mode': + freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) + mode = freq.mode() + statistics['preview_mode'] = mode + if output_plots: + frequency_plot([freq], preview_plot_path, title='Preview frequency plot') + else: + statistics['preview_mode'] = 'NA' + dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) + orphans = 0 + # x will be used to archive the summary dataset + x = [] + for cname, chromosome in chromosomes.items(): + # Each peak is (strand, start, end, value) + watson, crick = split_strands(chromosome) + # Sort by value of each peak + watson.sort(key=lambda data: -float(data[3])) + # Sort by position to facilitate binary search + crick.sort(key=lambda data: float(data[1])) + keys = make_keys(crick) + for peak in watson: + window = get_window(crick, peak, up_distance, down_distance, keys) + if method == 'mode': + match = match_mode(window, peak, mode) + else: + match = METHODS[method](window, peak) + if match: + midpoint = (match[1] + match[2] + peak[1] + peak[2]) // 4 + d = distance(peak, match) + dist.add(d) + # Simple output in gff format. + x.append(gff_row(cname, + source='cwpair', + start=midpoint, + end=midpoint + 1, + score=peak[3] + match[3], + attrs={'cw_distance': d})) + if output_details: + detailed_output.writerow((cname, + peak[1], + peak[2], + peak[3], + '+', + cname, + match[1], + match[2], + match[3], '-', + midpoint, + peak[3]+match[3], + d)) + i = bisect.bisect_left(keys, (match[1]+match[2])/2) + del crick[i] + del keys[i] + else: + if output_orphans: + orphan_output.writerow((cname, peak[0], peak[1], peak[2], peak[3])) + # Keep track of orphans for statistics. + orphans += 1 + # Remaining crick peaks are orphans + if output_orphans: + for cpeak in crick: + orphan_output.writerow((cname, cpeak[0], cpeak[1], cpeak[2], cpeak[3])) + # Keep track of orphans for statistics. + orphans += len(crick) + # Sort output descending by score. + x.sort(key=lambda data: float(data[5]), reverse=True) + # Writing a summary to gff format file + for row in x: + row_tmp = list(row) + # Dataset in tuple cannot be modified in Python, so row will + # be converted to list format to add 'chr'. + if row_tmp[0] == "999": + row_tmp[0] = 'chrM' + elif row_tmp[0] == "998": + row_tmp[0] = 'chrY' + elif row_tmp[0] == "997": + row_tmp[0] = 'chrX' + else: + row_tmp[0] = row_tmp[0] + # Print row_tmp. + matched_pairs_output.writerow(row_tmp) + statistics['paired'] = dist.size() * 2 + statistics['orphans'] = orphans + statistics['final_mode'] = dist.mode() + if output_plots: + frequency_plot([dist], final_plot_path, title='Frequency distribution') + statistics['dist'] = dist + return statistics
