Mercurial > repos > jay > padaug_peptide_sequence_analysis
comparison PDAUG_Fishers_Plot/PDAUG_Fishers_Plot.py @ 0:5d01ab729b2b draft
"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author | jay |
---|---|
date | Wed, 28 Oct 2020 01:43:12 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5d01ab729b2b |
---|---|
1 import matplotlib | |
2 matplotlib.use('Agg') | |
3 import os | |
4 import sys | |
5 sys.path.insert(0, os.path.abspath('..')) | |
6 | |
7 import quantiprot | |
8 from quantiprot.utils.io import load_fasta_file | |
9 from quantiprot.utils.feature import Feature, FeatureSet | |
10 from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy | |
11 from quantiprot.metrics.basic import average | |
12 | |
13 from matplotlib import pyplot as plt | |
14 | |
15 | |
16 from math import log10, floor | |
17 import numpy as np | |
18 from matplotlib import pyplot as plt | |
19 from scipy.stats import fisher_exact | |
20 from quantiprot.utils.sequence import SequenceSet, compact | |
21 | |
22 | |
23 def _count_frame(data, frame_range, num_bins): | |
24 """ | |
25 Count instances in a 2D frame | |
26 | |
27 The function discretizes the feature space into a grid of cells. | |
28 Then it counts the number of instances that fall into each cell. | |
29 An efficient method for counting instances is used. It performs parallel | |
30 logical comparisons of data instances to vectors that hold information on | |
31 grid lines. | |
32 | |
33 Args: | |
34 data (numpy.matrix): a Nx2 data matrix | |
35 frame_range (numpy.matrix): a 2x2 matrix which defines feature ranges | |
36 num_bins (list): a pair defining the resolution of the 2D grid | |
37 Returns: | |
38 cell_counts (numpy.matrix): a matrix holding counts of instances in | |
39 each grid cell | |
40 bin_ranges (tuple): a pair of numpy matrices holding information on | |
41 bin(grid_cell) ranges | |
42 """ | |
43 grid_x = np.linspace(start=frame_range[0, 0], stop=frame_range[1, 0],\ | |
44 num=num_bins[0]+1, endpoint=True) | |
45 grid_y = np.linspace(start=frame_range[0, 1], stop=frame_range[1, 1],\ | |
46 num=num_bins[1]+1, endpoint=True) | |
47 # copy because we add ones in the next lines | |
48 bin_ranges = (np.copy(grid_x), np.copy(grid_y)) | |
49 | |
50 | |
51 #Count points in each grid cell | |
52 grid_x[-1] += 1 # the last cell has to contain data at the border | |
53 grid_y[-1] += 1 # the last cell has to contain data at the border | |
54 | |
55 gte_x = np.matrix(data[:, 0] >= grid_x, dtype='float64') | |
56 lt_x = np.matrix(data[:, 0] < grid_x, dtype='float64') | |
57 gte_y = np.matrix(data[:, 1] >= grid_y, dtype='float64') | |
58 lt_y = np.matrix(data[:, 1] < grid_y, dtype='float64') | |
59 | |
60 dif_x = gte_x - lt_x | |
61 dif_y = gte_y - lt_y | |
62 | |
63 bins_x = dif_x.argmin(axis=1) - 1 | |
64 bins_y = dif_y.argmin(axis=1) - 1 | |
65 | |
66 coords = np.concatenate((bins_x, bins_y), axis=1) | |
67 | |
68 cell_counts = np.zeros(shape=(len(grid_x)-1, len(grid_y)-1)) | |
69 | |
70 for i in range(coords.shape[0]): | |
71 cell_counts[coords[i, 0], coords[i, 1]] += 1 | |
72 | |
73 return cell_counts, bin_ranges | |
74 | |
75 | |
76 def local_fisher_2d(set1, set2, features=None, \ | |
77 windows_per_frame=10, overlap_factor=1, frame_range=None): | |
78 """ | |
79 Compare local and global distribution of samples from two populations | |
80 in the 2d feature space using the Fisher's exact test. | |
81 | |
82 The function performs the Fisher Exact Test for comparing local and global | |
83 ratia of instance counts from two different populations. It uses the | |
84 '_count_frame' function to discretize the feature space and get instance | |
85 counts. Then it scans the 2d feature space with a sliding window and | |
86 performs the Fisher Exact test. | |
87 | |
88 Args: | |
89 set1 (SequenceSet or numpy.matrix): the first set with at least | |
90 2 sequence features. | |
91 set2 (SequenceSet or numpy.matrix): the second set with at least | |
92 2 sequence features. | |
93 features (tuple or list): strings with feature names for running | |
94 the 2d Fisher test. If None then the first two features are | |
95 used. Relevant only if 'set1' or 'set2' are SequenceSets. | |
96 windows_per_frame (int): ratio between the whole feature space and | |
97 the sliding window (default 10). | |
98 overlap_factor (int):ratio between the size of a sliding window | |
99 and a discretization grid cell (default 1). | |
100 frame_range(numpy.matrix): 2x2 matrix with range of features | |
101 in both dimensions. | |
102 | |
103 Returns final_res (dict): a dictionary including: | |
104 'odds_ratio' (numpy.matrix): a matrix of odds_ratios obtained | |
105 in each sliding window position. | |
106 'p_value' (numpy.matrix): a matrix containing Fisher test outcome | |
107 pvalues in each sliding window position. | |
108 'w_counts1' (numpy.matrix): a matrix with first population instance | |
109 counts in each sliding window position. | |
110 'w_counts2' (numpy.matrix): a matrix with second population instance | |
111 counts in each sliding window position. | |
112 'w_center_x' (numpy.matrix): matrix containing coordinates of window | |
113 centers in the X dimension. | |
114 'w_center_y' (numpy.matrix): matrix containing coordinates of window | |
115 centers in the Y dimension. | |
116 '_bin_ranges_x' (numpy.matrix): matrix containing bin(grid_cell) | |
117 ranges in the X dimension. | |
118 '_bin_ranges_y' (numpy.matrix): matrix containing bin(grid_cell) | |
119 ranges in the Y dimension. | |
120 """ | |
121 | |
122 if isinstance(set1, SequenceSet): | |
123 mat1 = np.transpose(np.matrix(compact(set1, | |
124 features=features).columns())) | |
125 if isinstance(set2, SequenceSet): | |
126 mat2 = np.transpose(np.matrix(compact(set2, | |
127 features=features).columns())) | |
128 | |
129 #Deal with window_per_frame and overlap_factor | |
130 #given either as a scalar or as a list-like | |
131 if not hasattr(windows_per_frame, "__len__"): | |
132 w_per_frame = (windows_per_frame, windows_per_frame) | |
133 else: | |
134 w_per_frame = (windows_per_frame[0], windows_per_frame[1]) | |
135 | |
136 if not hasattr(overlap_factor, "__len__"): | |
137 w_size = (overlap_factor, overlap_factor) | |
138 else: | |
139 w_size = (overlap_factor[0], overlap_factor[1]) | |
140 | |
141 num_bins = (w_per_frame[0]*w_size[0], w_per_frame[1]*w_size[1]) | |
142 | |
143 if frame_range is None: | |
144 #Evaluate the range of features in both populations. | |
145 | |
146 frame_range = np.concatenate((np.minimum(mat1.min(0), mat2.min(0)),\ | |
147 np.maximum(mat1.max(0), mat2.max(0)))) | |
148 | |
149 margin_x = (frame_range[1, 0] - frame_range[0, 0])/w_per_frame[0] | |
150 margin_y = (frame_range[1, 1] - frame_range[0, 1])/w_per_frame[1] | |
151 | |
152 frame_range[0, 0] -= margin_x | |
153 frame_range[1, 0] += margin_x | |
154 | |
155 frame_range[0, 1] -= margin_y | |
156 frame_range[1, 1] += margin_y | |
157 | |
158 #Discretize feature space into NxM grid, | |
159 #where N = w_per_frame[0]*w_size[0]. | |
160 # M = w_per_frame[1]*w_size[1]. | |
161 #count instances of population1 and population2 in each grid cell. | |
162 #both bin ranges are always the same because the frame range is common. | |
163 cell_counts1, bin_ranges = _count_frame(mat1, frame_range=frame_range,\ | |
164 num_bins=num_bins) | |
165 cell_counts2, _ = _count_frame(mat2, frame_range=frame_range,\ | |
166 num_bins=num_bins) | |
167 | |
168 #Number of windows that fit in a single row/column of a frame | |
169 w_number = (cell_counts1.shape[0]-w_size[0]+1, | |
170 cell_counts1.shape[1]-w_size[1]+1) | |
171 | |
172 #Initialize matrices holding counts at scanning window positions. | |
173 window_counts1 = np.zeros(shape=w_number) | |
174 window_counts2 = np.zeros(shape=w_number) | |
175 | |
176 #Initialize matrices holding window coordinates | |
177 window_center_x = np.zeros(shape=w_number[0]) | |
178 window_center_y = np.zeros(shape=w_number[1]) | |
179 | |
180 #Initialize matrices holding Fisher Exact test results | |
181 fisher_pv = np.ones(shape=w_number) | |
182 odds_ratio = np.ones(shape=w_number) | |
183 | |
184 #Calculate population totals in the whole feature space | |
185 all1 = cell_counts1.sum() | |
186 all2 = cell_counts2.sum() | |
187 | |
188 #Calculate window centers | |
189 for start_x in range(0, w_number[0]): | |
190 window_center_x[start_x] = (bin_ranges[0][start_x]+ \ | |
191 bin_ranges[0][start_x+w_size[0]])/2 | |
192 for start_y in range(0, w_number[1]): | |
193 window_center_y[start_y] = (bin_ranges[1][start_y]+ \ | |
194 bin_ranges[1][start_y+w_size[1]])/2 | |
195 | |
196 #Scan the feature space with a step of 1 cell. | |
197 for start_x in range(0, w_number[0]): | |
198 | |
199 for start_y in range(0, w_number[1]): | |
200 #Count instances of each population in the window | |
201 window_counts1[start_x, start_y] = \ | |
202 cell_counts1[start_x:(start_x+w_size[0]), \ | |
203 start_y:(start_y+w_size[1])].sum() | |
204 window_counts2[start_x, start_y] = \ | |
205 cell_counts2[start_x:(start_x+w_size[0]), \ | |
206 start_y:(start_y+w_size[1])].sum() | |
207 #Perform the Fisher Exact Test against | |
208 #h0: population ratio in the window the same as in the whole space. | |
209 odds_ratio[start_x, start_y], fisher_pv[start_x, start_y] =\ | |
210 fisher_exact([[all1, window_counts1[start_x, start_y]],\ | |
211 [all2, window_counts2[start_x, start_y]]]) | |
212 | |
213 fisher_res = {'p_value':fisher_pv, 'odds_ratio':odds_ratio,\ | |
214 'w_counts1':window_counts1, 'w_counts2':window_counts2,\ | |
215 'w_center_x':window_center_x, 'w_center_y':window_center_y,\ | |
216 '_bin_ranges_x':bin_ranges[0], '_bin_ranges_y':bin_ranges[1]} | |
217 | |
218 return fisher_res | |
219 | |
220 | |
221 def _plot_local_fisher_2d(fisher_res, xlabel="feat_1", ylabel="feat_2", | |
222 pop1_label="pop_1", pop2_label="pop_2", out_file_path=None, fig_width=8, fig_hight=8, fig_hspace=0.35, fig_wspace=0.25): | |
223 """ | |
224 Plot results of the local Fisher's extact test in the 2d space. | |
225 | |
226 Args: | |
227 fisher_res (dict): output from 'fisher_local_2d'. | |
228 xlabel (str): name of the 1st feature to appear in the plots | |
229 (default: "feat_1") | |
230 ylabel (str): name of the 2nd feature to appear in the plots | |
231 (default: "feat_2") | |
232 pop1_label (str): name of the 1st population to appear in the plots | |
233 (default: "pop_1") | |
234 pop2_label (str): name of the 2nd population to appear in the plots | |
235 (default: "pop_2") | |
236 """ | |
237 fisher_or = fisher_res["odds_ratio"] | |
238 fisher_c1 = fisher_res["w_counts1"] | |
239 fisher_c2 = fisher_res["w_counts2"] | |
240 fisher_pv = fisher_res["p_value"] | |
241 | |
242 for pos_x in range(len(fisher_or)): | |
243 for pos_y in range(len(fisher_or[0])): | |
244 if fisher_c1[pos_x][pos_y] == 0 and fisher_c2[pos_x][pos_y] == 0: | |
245 fisher_or[pos_x][pos_y] = np.nan | |
246 elif fisher_c1[pos_x][pos_y] == 0: | |
247 fisher_or[pos_x][pos_y] = np.inf | |
248 elif fisher_c2[pos_x][pos_y] == 0: | |
249 fisher_or[pos_x][pos_y] = -np.inf | |
250 elif fisher_or[pos_x][pos_y] < 1: | |
251 fisher_or[pos_x][pos_y] = -1.0/fisher_or[pos_x][pos_y] | |
252 | |
253 vmax_abs = np.nanmax(np.abs([x for x in np.array(fisher_or).flatten() | |
254 if x > -np.inf and x < np.inf])) | |
255 | |
256 for pos_x in range(len(fisher_or)): | |
257 for pos_y in range(len(fisher_or[0])): | |
258 if abs(fisher_or[pos_x][pos_y]) == np.inf: | |
259 fisher_or[pos_x][pos_y] = np.sign(fisher_or[pos_x][pos_y])*vmax_abs | |
260 | |
261 ##### Extra Fig perimeters added ################################ | |
262 plt.figure(figsize=(fig_width, fig_hight)) # Figure size | |
263 plt.subplots_adjust(hspace = fig_hspace, wspace = fig_wspace) # space between the subplots. | |
264 ################################################################## | |
265 | |
266 plt.subplot(221) | |
267 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
268 np.ma.masked_invalid(fisher_c1).T, cmap="Reds") | |
269 plt.colorbar() | |
270 plt.xlabel(xlabel) | |
271 plt.ylabel(ylabel) | |
272 plt.title("Counts "+pop1_label) | |
273 | |
274 plt.subplot(222) | |
275 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
276 np.ma.masked_invalid(fisher_c2).T, cmap="Reds") | |
277 plt.colorbar() | |
278 plt.xlabel(xlabel) | |
279 plt.ylabel(ylabel) | |
280 plt.title("Counts "+pop2_label) | |
281 | |
282 cmap = plt.get_cmap('RdBu') | |
283 cmap.set_bad(color='k', alpha=1.) | |
284 | |
285 cbar_lo = 1.0/vmax_abs | |
286 cbar_lo_places = max(0, -floor(log10(cbar_lo))+1) | |
287 cbar_hi = vmax_abs | |
288 cbar_hi_places = max(0, -floor(log10(cbar_hi))+1) | |
289 | |
290 plt.subplot(223) | |
291 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
292 np.ma.masked_invalid(fisher_or).T, cmap=cmap, | |
293 vmin=-vmax_abs, vmax=vmax_abs) | |
294 cbar = plt.colorbar(ticks=([-vmax_abs, 0, vmax_abs])) | |
295 cbar.ax.set_yticklabels(['< '+str(round(cbar_lo, int(cbar_lo_places))), '1', | |
296 '> '+str(round(cbar_hi, int(cbar_hi_places)))]) | |
297 plt.xlabel(xlabel) | |
298 plt.ylabel(ylabel) | |
299 plt.title("Odds ratio") | |
300 | |
301 plt.subplot(224) | |
302 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"], | |
303 np.log10(np.ma.masked_invalid(fisher_pv)).T, cmap="RdGy") | |
304 plt.colorbar() | |
305 plt.xlabel(xlabel) | |
306 plt.ylabel(ylabel) | |
307 plt.title("Fisher test\np-value (logarithm of 10)") | |
308 | |
309 #Savefig function added with preserving default behavior | |
310 | |
311 if out_file_path==None: | |
312 plt.show() | |
313 else: | |
314 plt.savefig(out_file_path,dpi=300) | |
315 | |
316 | |
317 def HTML_Gen(html): | |
318 | |
319 out_html = open(html,'w') | |
320 part_1 = """ | |
321 | |
322 <!DOCTYPE html> | |
323 <html lang="en"> | |
324 <head> | |
325 <title>Bootstrap Example</title> | |
326 <meta charset="utf-8"> | |
327 <meta name="viewport" content="width=device-width, initial-scale=1"> | |
328 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css"> | |
329 <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script> | |
330 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script> | |
331 <body> | |
332 <style> | |
333 div.container_1 { | |
334 width:600px; | |
335 margin: auto; | |
336 padding-right: 10; | |
337 } | |
338 div.table { | |
339 width:600px; | |
340 margin: auto; | |
341 padding-right: 10; | |
342 } | |
343 </style> | |
344 </head> | |
345 <div class="jumbotron text-center"> | |
346 <h1> Fisher's Plot </h1> | |
347 </div> | |
348 <div class="container"> | |
349 <div class="row"> | |
350 <div class="col-sm-4"> | |
351 <img src="1.png" alt="Smiley face" height="800" width="800"> | |
352 </div> | |
353 | |
354 </div> | |
355 </div> | |
356 </body> | |
357 </html> | |
358 """ | |
359 out_html.write(part_1) | |
360 out_html.close() | |
361 # Load sets of amyloidogenic and non-amyloidogenic peptides: | |
362 | |
363 def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel, pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath): | |
364 | |
365 if not os.path.exists(htmlOutDir): | |
366 os.makedirs(htmlOutDir) | |
367 | |
368 amyload_pos_seq = load_fasta_file(Fasta1) | |
369 amyload_neg_seq = load_fasta_file(Fasta2) | |
370 | |
371 # Calculate quantitive features: volume and hydropathy | |
372 mean_volume = Feature(get_aa2volume()).then(average) | |
373 mean_hydropathy = Feature(get_aa2hydropathy()).then(average) | |
374 | |
375 fs = FeatureSet("volume'n'hydropathy") | |
376 fs.add(mean_volume) | |
377 fs.add(mean_hydropathy) | |
378 | |
379 amyload_pos_conv_seq = fs(amyload_pos_seq) | |
380 amyload_neg_conv_seq = fs(amyload_neg_seq) | |
381 | |
382 # Do local Fisher: | |
383 result = local_fisher_2d(amyload_pos_conv_seq, amyload_neg_conv_seq, | |
384 windows_per_frame=int(windows_per_frame), overlap_factor=int(overlap_factor)) | |
385 | |
386 # Plot local Fisher: | |
387 _plot_local_fisher_2d(result, xlabel=xlabel, | |
388 ylabel=ylabel, | |
389 pop1_label=pop1_label, | |
390 pop2_label=pop2_label, | |
391 out_file_path =os.path.join(os.getcwd(), "out.png") | |
392 ) | |
393 | |
394 | |
395 # plt.savefig(os.path.join(Workdirpath, htmlOutDir, "1.png")) | |
396 | |
397 HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname)) | |
398 | |
399 if __name__=="__main__": | |
400 | |
401 | |
402 import argparse | |
403 | |
404 parser = argparse.ArgumentParser() | |
405 | |
406 parser.add_argument("-f1", "--Fasta1", required=True, default=None, help="First fasta file ") | |
407 parser.add_argument("-f2", "--Fasta2", required=True, default=None, help="Second fasta file") | |
408 parser.add_argument("-o", "--overlap_factor", required=False, default=5, help="Overlap factor") | |
409 parser.add_argument("-w", "--windows_per_frame", required=False, default=5, help="Windows per frame") | |
410 parser.add_argument("-x", "--xlabel", required=True, default=None, help="X label") | |
411 parser.add_argument("-y", "--ylabel", required=True, default=None, help="Y label") | |
412 parser.add_argument("-p1", "--pop1_label", required=True, default=None, help="First population label") | |
413 parser.add_argument("-p2", "--pop2_label", required=True, default=None, help="Second population label") | |
414 parser.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="Path to html dir") | |
415 parser.add_argument("--htmlFname", required=False, help="html output file", default="report.html") | |
416 parser.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Path to output Working Directory") | |
417 args = parser.parse_args() | |
418 | |
419 run(args.Fasta1, args.Fasta2, args.windows_per_frame, args.overlap_factor, args.xlabel, args.ylabel, args.pop1_label, args.pop2_label, args.htmlOutDir, args.htmlFname, args.Workdirpath) | |
420 |