Mercurial > repos > iuc > cwpair2
comparison cwpair2_util.py @ 6:c4b926c9831c draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit ca66053a0d915edba3d7f1d4ce014718e7f4f0f2
| author | iuc |
|---|---|
| date | Thu, 02 May 2019 08:30:22 -0400 |
| parents | 71188f3f4b76 |
| children |
comparison
equal
deleted
inserted
replaced
| 5:71188f3f4b76 | 6:c4b926c9831c |
|---|---|
| 13 MATCHED_PAIRS = 'MP' | 13 MATCHED_PAIRS = 'MP' |
| 14 ORPHANS = 'O' | 14 ORPHANS = 'O' |
| 15 # Data output formats | 15 # Data output formats |
| 16 GFF_EXT = 'gff' | 16 GFF_EXT = 'gff' |
| 17 TABULAR_EXT = 'tabular' | 17 TABULAR_EXT = 'tabular' |
| 18 # Statistics historgrams output directory. | 18 # Statistics histograms output directory. |
| 19 HISTOGRAM = 'H' | 19 HISTOGRAM = 'H' |
| 20 # Statistics outputs | 20 # Statistics outputs |
| 21 FINAL_PLOTS = 'F' | 21 FINAL_PLOTS = 'F' |
| 22 PREVIEW_PLOTS = 'P' | 22 PREVIEW_PLOTS = 'P' |
| 23 STATS_GRAPH = 'C' | 23 STATS_GRAPH = 'C' |
| 44 self.dist = d or {} | 44 self.dist = d or {} |
| 45 self.binsize = binsize | 45 self.binsize = binsize |
| 46 | 46 |
| 47 def get_bin(self, x): | 47 def get_bin(self, x): |
| 48 """ | 48 """ |
| 49 Returns the bin in which a data point falls | 49 Returns the centre of the bin in which a data point falls |
| 50 """ | 50 """ |
| 51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 | 51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 |
| 52 | 52 |
| 53 def add(self, x): | 53 def add(self, x): |
| 54 x = self.get_bin(x) | 54 x = self.get_bin(x) |
| 62 x.append(center) | 62 x.append(center) |
| 63 y.append(self.dist.get(center, 0)) | 63 y.append(self.dist.get(center, 0)) |
| 64 return x, y | 64 return x, y |
| 65 | 65 |
| 66 def mode(self): | 66 def mode(self): |
| 67 return max(self.dist.items(), key=lambda data: data[1])[0] | 67 # There could be more than one mode for a frequency distribution, |
| 68 # return the median of the modes to be consistent | |
| 69 max_frequency = max(self.dist.values()) | |
| 70 modes = sorted(_[0] for _ in self.dist.items() if _[1] == max_frequency) | |
| 71 median_index = len(modes) // 2 | |
| 72 return modes[median_index] | |
| 68 | 73 |
| 69 def size(self): | 74 def size(self): |
| 70 return sum(self.dist.values()) | 75 return sum(self.dist.values()) |
| 71 | 76 |
| 72 | 77 |
| 74 sys.stderr.write(msg) | 79 sys.stderr.write(msg) |
| 75 sys.exit(1) | 80 sys.exit(1) |
| 76 | 81 |
| 77 | 82 |
| 78 def distance(peak1, peak2): | 83 def distance(peak1, peak2): |
| 79 return (peak2[1] + peak2[2]) / 2 - (peak1[1] + peak1[2]) / 2 | 84 return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0 |
| 80 | 85 |
| 81 | 86 |
| 82 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | 87 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): |
| 83 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | 88 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) |
| 84 | 89 |
| 90 | 95 |
| 91 | 96 |
| 92 def parse_chromosomes(reader): | 97 def parse_chromosomes(reader): |
| 93 # This version of cwpair2 accepts only gff format as input. | 98 # This version of cwpair2 accepts only gff format as input. |
| 94 chromosomes = {} | 99 chromosomes = {} |
| 95 next(reader) | |
| 96 for line in reader: | 100 for line in reader: |
| 97 cname, junk, junk, start, end, value, strand, junk, junk = line | 101 line = line.rstrip("\r\n") |
| 102 if not line or line.startswith('#'): | |
| 103 continue | |
| 104 cname, _, _, start, end, value, strand, _, _ = line.split("\t") | |
| 98 start = int(start) | 105 start = int(start) |
| 99 end = int(end) | 106 end = int(end) |
| 100 value = float(value) | 107 value = float(value) |
| 101 if cname not in chromosomes: | 108 if cname not in chromosomes: |
| 102 chromosomes[cname] = [] | 109 chromosomes[cname] = [] |
| 116 values.sort() | 123 values.sort() |
| 117 # Get 95% value | 124 # Get 95% value |
| 118 return values[int(len(values) * 0.95)] | 125 return values[int(len(values) * 0.95)] |
| 119 | 126 |
| 120 | 127 |
| 121 def filter(chromosomes, threshold=0.05): | 128 def peak_filter(chromosomes, threshold): |
| 122 """ | 129 """ |
| 123 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted | 130 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted |
| 124 as a proportion of the maximum, >=1.0 as an absolute value. | 131 as a proportion of the maximum, >=1.0 as an absolute value. |
| 125 """ | 132 """ |
| 126 if threshold < 1: | 133 if threshold < 1: |
| 137 return watson, crick | 144 return watson, crick |
| 138 | 145 |
| 139 | 146 |
| 140 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): | 147 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): |
| 141 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | 148 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) |
| 142 for cname, data in chromosomes.items(): | 149 for data in chromosomes.values(): |
| 143 watson, crick = split_strands(data) | 150 watson, crick = split_strands(data) |
| 144 crick.sort(key=lambda data: float(data[1])) | 151 crick.sort(key=lambda data: float(data[1])) |
| 145 keys = make_keys(crick) | 152 keys = make_keys(crick) |
| 146 for peak in watson: | 153 for peak in watson: |
| 147 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): | 154 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): |
| 254 output_details = output_files in ["all", "matched_pair_orphan_detail"] | 261 output_details = output_files in ["all", "matched_pair_orphan_detail"] |
| 255 output_plots = output_files in ["all"] | 262 output_plots = output_files in ["all"] |
| 256 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] | 263 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] |
| 257 # Keep track of statistics for the output file | 264 # Keep track of statistics for the output file |
| 258 statistics = {} | 265 statistics = {} |
| 259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') | |
| 260 fpath, fname = os.path.split(dataset_path) | 266 fpath, fname = os.path.split(dataset_path) |
| 261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) | 267 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) |
| 262 statistics['dir'] = fpath | 268 statistics['dir'] = fpath |
| 263 if threshold >= 1: | 269 if threshold >= 1: |
| 264 filter_string = 'fa%d' % threshold | 270 filter_string = 'fa%d' % threshold |
| 274 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) | 280 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) |
| 275 | 281 |
| 276 def td_writer(output_type, extension, fname): | 282 def td_writer(output_type, extension, fname): |
| 277 # Returns a tab-delimited writer for a specified output. | 283 # Returns a tab-delimited writer for a specified output. |
| 278 output_file_path = make_path(output_type, extension, fname) | 284 output_file_path = make_path(output_type, extension, fname) |
| 279 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') | 285 return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n") |
| 280 | 286 |
| 281 try: | 287 with open(dataset_path, 'rt') as input: |
| 282 chromosomes = parse_chromosomes(input) | 288 try: |
| 283 except Exception: | 289 chromosomes = parse_chromosomes(input) |
| 284 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | 290 except Exception: |
| 291 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | |
| 285 if output_details: | 292 if output_details: |
| 286 # Details | 293 # Details |
| 287 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) | 294 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) |
| 288 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) | 295 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) |
| 289 if output_plots: | 296 if output_plots: |
| 301 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT | 308 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT |
| 302 if output_plots: | 309 if output_plots: |
| 303 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) | 310 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) |
| 304 statistics['perc95'] = perc95(chromosomes) | 311 statistics['perc95'] = perc95(chromosomes) |
| 305 if threshold > 0: | 312 if threshold > 0: |
| 306 # Apply filter | 313 # Apply peak_filter |
| 307 filter(chromosomes, threshold) | 314 peak_filter(chromosomes, threshold) |
| 308 if method == 'mode': | 315 if method == 'mode': |
| 309 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) | 316 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) |
| 310 mode = freq.mode() | 317 mode = freq.mode() |
| 311 statistics['preview_mode'] = mode | 318 statistics['preview_mode'] = mode |
| 312 if output_plots: | 319 if output_plots: |
