Mercurial > repos > iuc > cwpair2
comparison cwpair2_util.py @ 6:c4b926c9831c draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit ca66053a0d915edba3d7f1d4ce014718e7f4f0f2
author | iuc |
---|---|
date | Thu, 02 May 2019 08:30:22 -0400 |
parents | 71188f3f4b76 |
children |
comparison
equal
deleted
inserted
replaced
5:71188f3f4b76 | 6:c4b926c9831c |
---|---|
13 MATCHED_PAIRS = 'MP' | 13 MATCHED_PAIRS = 'MP' |
14 ORPHANS = 'O' | 14 ORPHANS = 'O' |
15 # Data output formats | 15 # Data output formats |
16 GFF_EXT = 'gff' | 16 GFF_EXT = 'gff' |
17 TABULAR_EXT = 'tabular' | 17 TABULAR_EXT = 'tabular' |
18 # Statistics historgrams output directory. | 18 # Statistics histograms output directory. |
19 HISTOGRAM = 'H' | 19 HISTOGRAM = 'H' |
20 # Statistics outputs | 20 # Statistics outputs |
21 FINAL_PLOTS = 'F' | 21 FINAL_PLOTS = 'F' |
22 PREVIEW_PLOTS = 'P' | 22 PREVIEW_PLOTS = 'P' |
23 STATS_GRAPH = 'C' | 23 STATS_GRAPH = 'C' |
44 self.dist = d or {} | 44 self.dist = d or {} |
45 self.binsize = binsize | 45 self.binsize = binsize |
46 | 46 |
47 def get_bin(self, x): | 47 def get_bin(self, x): |
48 """ | 48 """ |
49 Returns the bin in which a data point falls | 49 Returns the centre of the bin in which a data point falls |
50 """ | 50 """ |
51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 | 51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 |
52 | 52 |
53 def add(self, x): | 53 def add(self, x): |
54 x = self.get_bin(x) | 54 x = self.get_bin(x) |
62 x.append(center) | 62 x.append(center) |
63 y.append(self.dist.get(center, 0)) | 63 y.append(self.dist.get(center, 0)) |
64 return x, y | 64 return x, y |
65 | 65 |
66 def mode(self): | 66 def mode(self): |
67 return max(self.dist.items(), key=lambda data: data[1])[0] | 67 # There could be more than one mode for a frequency distribution, |
68 # return the median of the modes to be consistent | |
69 max_frequency = max(self.dist.values()) | |
70 modes = sorted(_[0] for _ in self.dist.items() if _[1] == max_frequency) | |
71 median_index = len(modes) // 2 | |
72 return modes[median_index] | |
68 | 73 |
69 def size(self): | 74 def size(self): |
70 return sum(self.dist.values()) | 75 return sum(self.dist.values()) |
71 | 76 |
72 | 77 |
74 sys.stderr.write(msg) | 79 sys.stderr.write(msg) |
75 sys.exit(1) | 80 sys.exit(1) |
76 | 81 |
77 | 82 |
78 def distance(peak1, peak2): | 83 def distance(peak1, peak2): |
79 return (peak2[1] + peak2[2]) / 2 - (peak1[1] + peak1[2]) / 2 | 84 return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0 |
80 | 85 |
81 | 86 |
82 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): | 87 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): |
83 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) | 88 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) |
84 | 89 |
90 | 95 |
91 | 96 |
92 def parse_chromosomes(reader): | 97 def parse_chromosomes(reader): |
93 # This version of cwpair2 accepts only gff format as input. | 98 # This version of cwpair2 accepts only gff format as input. |
94 chromosomes = {} | 99 chromosomes = {} |
95 next(reader) | |
96 for line in reader: | 100 for line in reader: |
97 cname, junk, junk, start, end, value, strand, junk, junk = line | 101 line = line.rstrip("\r\n") |
102 if not line or line.startswith('#'): | |
103 continue | |
104 cname, _, _, start, end, value, strand, _, _ = line.split("\t") | |
98 start = int(start) | 105 start = int(start) |
99 end = int(end) | 106 end = int(end) |
100 value = float(value) | 107 value = float(value) |
101 if cname not in chromosomes: | 108 if cname not in chromosomes: |
102 chromosomes[cname] = [] | 109 chromosomes[cname] = [] |
116 values.sort() | 123 values.sort() |
117 # Get 95% value | 124 # Get 95% value |
118 return values[int(len(values) * 0.95)] | 125 return values[int(len(values) * 0.95)] |
119 | 126 |
120 | 127 |
121 def filter(chromosomes, threshold=0.05): | 128 def peak_filter(chromosomes, threshold): |
122 """ | 129 """ |
123 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted | 130 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted |
124 as a proportion of the maximum, >=1.0 as an absolute value. | 131 as a proportion of the maximum, >=1.0 as an absolute value. |
125 """ | 132 """ |
126 if threshold < 1: | 133 if threshold < 1: |
137 return watson, crick | 144 return watson, crick |
138 | 145 |
139 | 146 |
140 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): | 147 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): |
141 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) | 148 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) |
142 for cname, data in chromosomes.items(): | 149 for data in chromosomes.values(): |
143 watson, crick = split_strands(data) | 150 watson, crick = split_strands(data) |
144 crick.sort(key=lambda data: float(data[1])) | 151 crick.sort(key=lambda data: float(data[1])) |
145 keys = make_keys(crick) | 152 keys = make_keys(crick) |
146 for peak in watson: | 153 for peak in watson: |
147 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): | 154 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): |
254 output_details = output_files in ["all", "matched_pair_orphan_detail"] | 261 output_details = output_files in ["all", "matched_pair_orphan_detail"] |
255 output_plots = output_files in ["all"] | 262 output_plots = output_files in ["all"] |
256 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] | 263 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] |
257 # Keep track of statistics for the output file | 264 # Keep track of statistics for the output file |
258 statistics = {} | 265 statistics = {} |
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t') | |
260 fpath, fname = os.path.split(dataset_path) | 266 fpath, fname = os.path.split(dataset_path) |
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) | 267 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) |
262 statistics['dir'] = fpath | 268 statistics['dir'] = fpath |
263 if threshold >= 1: | 269 if threshold >= 1: |
264 filter_string = 'fa%d' % threshold | 270 filter_string = 'fa%d' % threshold |
274 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) | 280 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) |
275 | 281 |
276 def td_writer(output_type, extension, fname): | 282 def td_writer(output_type, extension, fname): |
277 # Returns a tab-delimited writer for a specified output. | 283 # Returns a tab-delimited writer for a specified output. |
278 output_file_path = make_path(output_type, extension, fname) | 284 output_file_path = make_path(output_type, extension, fname) |
279 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') | 285 return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n") |
280 | 286 |
281 try: | 287 with open(dataset_path, 'rt') as input: |
282 chromosomes = parse_chromosomes(input) | 288 try: |
283 except Exception: | 289 chromosomes = parse_chromosomes(input) |
284 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | 290 except Exception: |
291 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) | |
285 if output_details: | 292 if output_details: |
286 # Details | 293 # Details |
287 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) | 294 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) |
288 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) | 295 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) |
289 if output_plots: | 296 if output_plots: |
301 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT | 308 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT |
302 if output_plots: | 309 if output_plots: |
303 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) | 310 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) |
304 statistics['perc95'] = perc95(chromosomes) | 311 statistics['perc95'] = perc95(chromosomes) |
305 if threshold > 0: | 312 if threshold > 0: |
306 # Apply filter | 313 # Apply peak_filter |
307 filter(chromosomes, threshold) | 314 peak_filter(chromosomes, threshold) |
308 if method == 'mode': | 315 if method == 'mode': |
309 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) | 316 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) |
310 mode = freq.mode() | 317 mode = freq.mode() |
311 statistics['preview_mode'] = mode | 318 statistics['preview_mode'] = mode |
312 if output_plots: | 319 if output_plots: |