comparison cwpair2_util.py @ 6:c4b926c9831c draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/cwpair2 commit ca66053a0d915edba3d7f1d4ce014718e7f4f0f2
author iuc
date Thu, 02 May 2019 08:30:22 -0400
parents 71188f3f4b76
children
comparison
equal deleted inserted replaced
5:71188f3f4b76 6:c4b926c9831c
13 MATCHED_PAIRS = 'MP' 13 MATCHED_PAIRS = 'MP'
14 ORPHANS = 'O' 14 ORPHANS = 'O'
15 # Data output formats 15 # Data output formats
16 GFF_EXT = 'gff' 16 GFF_EXT = 'gff'
17 TABULAR_EXT = 'tabular' 17 TABULAR_EXT = 'tabular'
18 # Statistics historgrams output directory. 18 # Statistics histograms output directory.
19 HISTOGRAM = 'H' 19 HISTOGRAM = 'H'
20 # Statistics outputs 20 # Statistics outputs
21 FINAL_PLOTS = 'F' 21 FINAL_PLOTS = 'F'
22 PREVIEW_PLOTS = 'P' 22 PREVIEW_PLOTS = 'P'
23 STATS_GRAPH = 'C' 23 STATS_GRAPH = 'C'
44 self.dist = d or {} 44 self.dist = d or {}
45 self.binsize = binsize 45 self.binsize = binsize
46 46
47 def get_bin(self, x): 47 def get_bin(self, x):
48 """ 48 """
49 Returns the bin in which a data point falls 49 Returns the centre of the bin in which a data point falls
50 """ 50 """
51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0 51 return self.start + (x - self.start) // self.binsize * self.binsize + self.binsize / 2.0
52 52
53 def add(self, x): 53 def add(self, x):
54 x = self.get_bin(x) 54 x = self.get_bin(x)
62 x.append(center) 62 x.append(center)
63 y.append(self.dist.get(center, 0)) 63 y.append(self.dist.get(center, 0))
64 return x, y 64 return x, y
65 65
66 def mode(self): 66 def mode(self):
67 return max(self.dist.items(), key=lambda data: data[1])[0] 67 # There could be more than one mode for a frequency distribution,
68 # return the median of the modes to be consistent
69 max_frequency = max(self.dist.values())
70 modes = sorted(_[0] for _ in self.dist.items() if _[1] == max_frequency)
71 median_index = len(modes) // 2
72 return modes[median_index]
68 73
69 def size(self): 74 def size(self):
70 return sum(self.dist.values()) 75 return sum(self.dist.values())
71 76
72 77
74 sys.stderr.write(msg) 79 sys.stderr.write(msg)
75 sys.exit(1) 80 sys.exit(1)
76 81
77 82
78 def distance(peak1, peak2): 83 def distance(peak1, peak2):
79 return (peak2[1] + peak2[2]) / 2 - (peak1[1] + peak1[2]) / 2 84 return (peak2[1] + peak2[2]) / 2.0 - (peak1[1] + peak1[2]) / 2.0
80 85
81 86
82 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}): 87 def gff_row(cname, start, end, score, source, type='.', strand='.', phase='.', attrs={}):
83 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs)) 88 return (cname, source, type, start, end, score, strand, phase, gff_attrs(attrs))
84 89
90 95
91 96
92 def parse_chromosomes(reader): 97 def parse_chromosomes(reader):
93 # This version of cwpair2 accepts only gff format as input. 98 # This version of cwpair2 accepts only gff format as input.
94 chromosomes = {} 99 chromosomes = {}
95 next(reader)
96 for line in reader: 100 for line in reader:
97 cname, junk, junk, start, end, value, strand, junk, junk = line 101 line = line.rstrip("\r\n")
102 if not line or line.startswith('#'):
103 continue
104 cname, _, _, start, end, value, strand, _, _ = line.split("\t")
98 start = int(start) 105 start = int(start)
99 end = int(end) 106 end = int(end)
100 value = float(value) 107 value = float(value)
101 if cname not in chromosomes: 108 if cname not in chromosomes:
102 chromosomes[cname] = [] 109 chromosomes[cname] = []
116 values.sort() 123 values.sort()
117 # Get 95% value 124 # Get 95% value
118 return values[int(len(values) * 0.95)] 125 return values[int(len(values) * 0.95)]
119 126
120 127
121 def filter(chromosomes, threshold=0.05): 128 def peak_filter(chromosomes, threshold):
122 """ 129 """
123 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted 130 Filters the peaks to those above a threshold. Threshold < 1.0 is interpreted
124 as a proportion of the maximum, >=1.0 as an absolute value. 131 as a proportion of the maximum, >=1.0 as an absolute value.
125 """ 132 """
126 if threshold < 1: 133 if threshold < 1:
137 return watson, crick 144 return watson, crick
138 145
139 146
140 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize): 147 def all_pair_distribution(chromosomes, up_distance, down_distance, binsize):
141 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize) 148 dist = FrequencyDistribution(-up_distance, down_distance, binsize=binsize)
142 for cname, data in chromosomes.items(): 149 for data in chromosomes.values():
143 watson, crick = split_strands(data) 150 watson, crick = split_strands(data)
144 crick.sort(key=lambda data: float(data[1])) 151 crick.sort(key=lambda data: float(data[1]))
145 keys = make_keys(crick) 152 keys = make_keys(crick)
146 for peak in watson: 153 for peak in watson:
147 for cpeak in get_window(crick, peak, up_distance, down_distance, keys): 154 for cpeak in get_window(crick, peak, up_distance, down_distance, keys):
254 output_details = output_files in ["all", "matched_pair_orphan_detail"] 261 output_details = output_files in ["all", "matched_pair_orphan_detail"]
255 output_plots = output_files in ["all"] 262 output_plots = output_files in ["all"]
256 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"] 263 output_orphans = output_files in ["all", "matched_pair_orphan", "matched_pair_orphan_detail"]
257 # Keep track of statistics for the output file 264 # Keep track of statistics for the output file
258 statistics = {} 265 statistics = {}
259 input = csv.reader(open(dataset_path, 'rt'), delimiter='\t')
260 fpath, fname = os.path.split(dataset_path) 266 fpath, fname = os.path.split(dataset_path)
261 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid)) 267 statistics['fname'] = '%s: data %s' % (method, str(galaxy_hid))
262 statistics['dir'] = fpath 268 statistics['dir'] = fpath
263 if threshold >= 1: 269 if threshold >= 1:
264 filter_string = 'fa%d' % threshold 270 filter_string = 'fa%d' % threshold
274 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension)) 280 return os.path.join(output_type, '%s_%s.%s' % (output_type, fname, extension))
275 281
276 def td_writer(output_type, extension, fname): 282 def td_writer(output_type, extension, fname):
277 # Returns a tab-delimited writer for a specified output. 283 # Returns a tab-delimited writer for a specified output.
278 output_file_path = make_path(output_type, extension, fname) 284 output_file_path = make_path(output_type, extension, fname)
279 return csv.writer(open(output_file_path, 'wt'), delimiter='\t') 285 return csv.writer(open(output_file_path, 'wt'), delimiter='\t', lineterminator="\n")
280 286
281 try: 287 with open(dataset_path, 'rt') as input:
282 chromosomes = parse_chromosomes(input) 288 try:
283 except Exception: 289 chromosomes = parse_chromosomes(input)
284 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc())) 290 except Exception:
291 stop_err('Unable to parse file "%s".\n%s' % (dataset_path, traceback.format_exc()))
285 if output_details: 292 if output_details:
286 # Details 293 # Details
287 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname) 294 detailed_output = td_writer('data_%s' % DETAILS, TABULAR_EXT, fname)
288 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)')) 295 detailed_output.writerow(('chrom', 'start', 'end', 'value', 'strand') * 2 + ('midpoint', 'c-w reads sum', 'c-w distance (bp)'))
289 if output_plots: 296 if output_plots:
301 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT 308 statistics['stats_path'] = 'statistics.%s' % TABULAR_EXT
302 if output_plots: 309 if output_plots:
303 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname) 310 statistics['graph_path'] = make_histogram_path(STATS_GRAPH, fname)
304 statistics['perc95'] = perc95(chromosomes) 311 statistics['perc95'] = perc95(chromosomes)
305 if threshold > 0: 312 if threshold > 0:
306 # Apply filter 313 # Apply peak_filter
307 filter(chromosomes, threshold) 314 peak_filter(chromosomes, threshold)
308 if method == 'mode': 315 if method == 'mode':
309 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize) 316 freq = all_pair_distribution(chromosomes, up_distance, down_distance, binsize)
310 mode = freq.mode() 317 mode = freq.mode()
311 statistics['preview_mode'] = mode 318 statistics['preview_mode'] = mode
312 if output_plots: 319 if output_plots: