comparison PDAUG_Fishers_Plot/PDAUG_Fishers_Plot.py @ 0:5d01ab729b2b draft

"planemo upload for repository https://github.com/jaidevjoshi83/pdaug commit a9bd83f6a1afa6338cb6e4358b63ebff5bed155e"
author jay
date Wed, 28 Oct 2020 01:43:12 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5d01ab729b2b
1 import matplotlib
2 matplotlib.use('Agg')
3 import os
4 import sys
5 sys.path.insert(0, os.path.abspath('..'))
6
7 import quantiprot
8 from quantiprot.utils.io import load_fasta_file
9 from quantiprot.utils.feature import Feature, FeatureSet
10 from quantiprot.metrics.aaindex import get_aa2volume, get_aa2hydropathy
11 from quantiprot.metrics.basic import average
12
13 from matplotlib import pyplot as plt
14
15
16 from math import log10, floor
17 import numpy as np
18 from matplotlib import pyplot as plt
19 from scipy.stats import fisher_exact
20 from quantiprot.utils.sequence import SequenceSet, compact
21
22
23 def _count_frame(data, frame_range, num_bins):
24 """
25 Count instances in a 2D frame
26
27 The function discretizes the feature space into a grid of cells.
28 Then it counts the number of instances that fall into each cell.
29 An efficient method for counting instances is used. It performs parallel
30 logical comparisons of data instances to vectors that hold information on
31 grid lines.
32
33 Args:
34 data (numpy.matrix): a Nx2 data matrix
35 frame_range (numpy.matrix): a 2x2 matrix which defines feature ranges
36 num_bins (list): a pair defining the resolution of the 2D grid
37 Returns:
38 cell_counts (numpy.matrix): a matrix holding counts of instances in
39 each grid cell
40 bin_ranges (tuple): a pair of numpy matrices holding information on
41 bin(grid_cell) ranges
42 """
43 grid_x = np.linspace(start=frame_range[0, 0], stop=frame_range[1, 0],\
44 num=num_bins[0]+1, endpoint=True)
45 grid_y = np.linspace(start=frame_range[0, 1], stop=frame_range[1, 1],\
46 num=num_bins[1]+1, endpoint=True)
47 # copy because we add ones in the next lines
48 bin_ranges = (np.copy(grid_x), np.copy(grid_y))
49
50
51 #Count points in each grid cell
52 grid_x[-1] += 1 # the last cell has to contain data at the border
53 grid_y[-1] += 1 # the last cell has to contain data at the border
54
55 gte_x = np.matrix(data[:, 0] >= grid_x, dtype='float64')
56 lt_x = np.matrix(data[:, 0] < grid_x, dtype='float64')
57 gte_y = np.matrix(data[:, 1] >= grid_y, dtype='float64')
58 lt_y = np.matrix(data[:, 1] < grid_y, dtype='float64')
59
60 dif_x = gte_x - lt_x
61 dif_y = gte_y - lt_y
62
63 bins_x = dif_x.argmin(axis=1) - 1
64 bins_y = dif_y.argmin(axis=1) - 1
65
66 coords = np.concatenate((bins_x, bins_y), axis=1)
67
68 cell_counts = np.zeros(shape=(len(grid_x)-1, len(grid_y)-1))
69
70 for i in range(coords.shape[0]):
71 cell_counts[coords[i, 0], coords[i, 1]] += 1
72
73 return cell_counts, bin_ranges
74
75
76 def local_fisher_2d(set1, set2, features=None, \
77 windows_per_frame=10, overlap_factor=1, frame_range=None):
78 """
79 Compare local and global distribution of samples from two populations
80 in the 2d feature space using the Fisher's exact test.
81
82 The function performs the Fisher Exact Test for comparing local and global
83 ratia of instance counts from two different populations. It uses the
84 '_count_frame' function to discretize the feature space and get instance
85 counts. Then it scans the 2d feature space with a sliding window and
86 performs the Fisher Exact test.
87
88 Args:
89 set1 (SequenceSet or numpy.matrix): the first set with at least
90 2 sequence features.
91 set2 (SequenceSet or numpy.matrix): the second set with at least
92 2 sequence features.
93 features (tuple or list): strings with feature names for running
94 the 2d Fisher test. If None then the first two features are
95 used. Relevant only if 'set1' or 'set2' are SequenceSets.
96 windows_per_frame (int): ratio between the whole feature space and
97 the sliding window (default 10).
98 overlap_factor (int):ratio between the size of a sliding window
99 and a discretization grid cell (default 1).
100 frame_range(numpy.matrix): 2x2 matrix with range of features
101 in both dimensions.
102
103 Returns final_res (dict): a dictionary including:
104 'odds_ratio' (numpy.matrix): a matrix of odds_ratios obtained
105 in each sliding window position.
106 'p_value' (numpy.matrix): a matrix containing Fisher test outcome
107 pvalues in each sliding window position.
108 'w_counts1' (numpy.matrix): a matrix with first population instance
109 counts in each sliding window position.
110 'w_counts2' (numpy.matrix): a matrix with second population instance
111 counts in each sliding window position.
112 'w_center_x' (numpy.matrix): matrix containing coordinates of window
113 centers in the X dimension.
114 'w_center_y' (numpy.matrix): matrix containing coordinates of window
115 centers in the Y dimension.
116 '_bin_ranges_x' (numpy.matrix): matrix containing bin(grid_cell)
117 ranges in the X dimension.
118 '_bin_ranges_y' (numpy.matrix): matrix containing bin(grid_cell)
119 ranges in the Y dimension.
120 """
121
122 if isinstance(set1, SequenceSet):
123 mat1 = np.transpose(np.matrix(compact(set1,
124 features=features).columns()))
125 if isinstance(set2, SequenceSet):
126 mat2 = np.transpose(np.matrix(compact(set2,
127 features=features).columns()))
128
129 #Deal with window_per_frame and overlap_factor
130 #given either as a scalar or as a list-like
131 if not hasattr(windows_per_frame, "__len__"):
132 w_per_frame = (windows_per_frame, windows_per_frame)
133 else:
134 w_per_frame = (windows_per_frame[0], windows_per_frame[1])
135
136 if not hasattr(overlap_factor, "__len__"):
137 w_size = (overlap_factor, overlap_factor)
138 else:
139 w_size = (overlap_factor[0], overlap_factor[1])
140
141 num_bins = (w_per_frame[0]*w_size[0], w_per_frame[1]*w_size[1])
142
143 if frame_range is None:
144 #Evaluate the range of features in both populations.
145
146 frame_range = np.concatenate((np.minimum(mat1.min(0), mat2.min(0)),\
147 np.maximum(mat1.max(0), mat2.max(0))))
148
149 margin_x = (frame_range[1, 0] - frame_range[0, 0])/w_per_frame[0]
150 margin_y = (frame_range[1, 1] - frame_range[0, 1])/w_per_frame[1]
151
152 frame_range[0, 0] -= margin_x
153 frame_range[1, 0] += margin_x
154
155 frame_range[0, 1] -= margin_y
156 frame_range[1, 1] += margin_y
157
158 #Discretize feature space into NxM grid,
159 #where N = w_per_frame[0]*w_size[0].
160 # M = w_per_frame[1]*w_size[1].
161 #count instances of population1 and population2 in each grid cell.
162 #both bin ranges are always the same because the frame range is common.
163 cell_counts1, bin_ranges = _count_frame(mat1, frame_range=frame_range,\
164 num_bins=num_bins)
165 cell_counts2, _ = _count_frame(mat2, frame_range=frame_range,\
166 num_bins=num_bins)
167
168 #Number of windows that fit in a single row/column of a frame
169 w_number = (cell_counts1.shape[0]-w_size[0]+1,
170 cell_counts1.shape[1]-w_size[1]+1)
171
172 #Initialize matrices holding counts at scanning window positions.
173 window_counts1 = np.zeros(shape=w_number)
174 window_counts2 = np.zeros(shape=w_number)
175
176 #Initialize matrices holding window coordinates
177 window_center_x = np.zeros(shape=w_number[0])
178 window_center_y = np.zeros(shape=w_number[1])
179
180 #Initialize matrices holding Fisher Exact test results
181 fisher_pv = np.ones(shape=w_number)
182 odds_ratio = np.ones(shape=w_number)
183
184 #Calculate population totals in the whole feature space
185 all1 = cell_counts1.sum()
186 all2 = cell_counts2.sum()
187
188 #Calculate window centers
189 for start_x in range(0, w_number[0]):
190 window_center_x[start_x] = (bin_ranges[0][start_x]+ \
191 bin_ranges[0][start_x+w_size[0]])/2
192 for start_y in range(0, w_number[1]):
193 window_center_y[start_y] = (bin_ranges[1][start_y]+ \
194 bin_ranges[1][start_y+w_size[1]])/2
195
196 #Scan the feature space with a step of 1 cell.
197 for start_x in range(0, w_number[0]):
198
199 for start_y in range(0, w_number[1]):
200 #Count instances of each population in the window
201 window_counts1[start_x, start_y] = \
202 cell_counts1[start_x:(start_x+w_size[0]), \
203 start_y:(start_y+w_size[1])].sum()
204 window_counts2[start_x, start_y] = \
205 cell_counts2[start_x:(start_x+w_size[0]), \
206 start_y:(start_y+w_size[1])].sum()
207 #Perform the Fisher Exact Test against
208 #h0: population ratio in the window the same as in the whole space.
209 odds_ratio[start_x, start_y], fisher_pv[start_x, start_y] =\
210 fisher_exact([[all1, window_counts1[start_x, start_y]],\
211 [all2, window_counts2[start_x, start_y]]])
212
213 fisher_res = {'p_value':fisher_pv, 'odds_ratio':odds_ratio,\
214 'w_counts1':window_counts1, 'w_counts2':window_counts2,\
215 'w_center_x':window_center_x, 'w_center_y':window_center_y,\
216 '_bin_ranges_x':bin_ranges[0], '_bin_ranges_y':bin_ranges[1]}
217
218 return fisher_res
219
220
221 def _plot_local_fisher_2d(fisher_res, xlabel="feat_1", ylabel="feat_2",
222 pop1_label="pop_1", pop2_label="pop_2", out_file_path=None, fig_width=8, fig_hight=8, fig_hspace=0.35, fig_wspace=0.25):
223 """
224 Plot results of the local Fisher's extact test in the 2d space.
225
226 Args:
227 fisher_res (dict): output from 'fisher_local_2d'.
228 xlabel (str): name of the 1st feature to appear in the plots
229 (default: "feat_1")
230 ylabel (str): name of the 2nd feature to appear in the plots
231 (default: "feat_2")
232 pop1_label (str): name of the 1st population to appear in the plots
233 (default: "pop_1")
234 pop2_label (str): name of the 2nd population to appear in the plots
235 (default: "pop_2")
236 """
237 fisher_or = fisher_res["odds_ratio"]
238 fisher_c1 = fisher_res["w_counts1"]
239 fisher_c2 = fisher_res["w_counts2"]
240 fisher_pv = fisher_res["p_value"]
241
242 for pos_x in range(len(fisher_or)):
243 for pos_y in range(len(fisher_or[0])):
244 if fisher_c1[pos_x][pos_y] == 0 and fisher_c2[pos_x][pos_y] == 0:
245 fisher_or[pos_x][pos_y] = np.nan
246 elif fisher_c1[pos_x][pos_y] == 0:
247 fisher_or[pos_x][pos_y] = np.inf
248 elif fisher_c2[pos_x][pos_y] == 0:
249 fisher_or[pos_x][pos_y] = -np.inf
250 elif fisher_or[pos_x][pos_y] < 1:
251 fisher_or[pos_x][pos_y] = -1.0/fisher_or[pos_x][pos_y]
252
253 vmax_abs = np.nanmax(np.abs([x for x in np.array(fisher_or).flatten()
254 if x > -np.inf and x < np.inf]))
255
256 for pos_x in range(len(fisher_or)):
257 for pos_y in range(len(fisher_or[0])):
258 if abs(fisher_or[pos_x][pos_y]) == np.inf:
259 fisher_or[pos_x][pos_y] = np.sign(fisher_or[pos_x][pos_y])*vmax_abs
260
261 ##### Extra Fig perimeters added ################################
262 plt.figure(figsize=(fig_width, fig_hight)) # Figure size
263 plt.subplots_adjust(hspace = fig_hspace, wspace = fig_wspace) # space between the subplots.
264 ##################################################################
265
266 plt.subplot(221)
267 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
268 np.ma.masked_invalid(fisher_c1).T, cmap="Reds")
269 plt.colorbar()
270 plt.xlabel(xlabel)
271 plt.ylabel(ylabel)
272 plt.title("Counts "+pop1_label)
273
274 plt.subplot(222)
275 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
276 np.ma.masked_invalid(fisher_c2).T, cmap="Reds")
277 plt.colorbar()
278 plt.xlabel(xlabel)
279 plt.ylabel(ylabel)
280 plt.title("Counts "+pop2_label)
281
282 cmap = plt.get_cmap('RdBu')
283 cmap.set_bad(color='k', alpha=1.)
284
285 cbar_lo = 1.0/vmax_abs
286 cbar_lo_places = max(0, -floor(log10(cbar_lo))+1)
287 cbar_hi = vmax_abs
288 cbar_hi_places = max(0, -floor(log10(cbar_hi))+1)
289
290 plt.subplot(223)
291 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
292 np.ma.masked_invalid(fisher_or).T, cmap=cmap,
293 vmin=-vmax_abs, vmax=vmax_abs)
294 cbar = plt.colorbar(ticks=([-vmax_abs, 0, vmax_abs]))
295 cbar.ax.set_yticklabels(['< '+str(round(cbar_lo, int(cbar_lo_places))), '1',
296 '> '+str(round(cbar_hi, int(cbar_hi_places)))])
297 plt.xlabel(xlabel)
298 plt.ylabel(ylabel)
299 plt.title("Odds ratio")
300
301 plt.subplot(224)
302 plt.pcolormesh(fisher_res["w_center_x"], fisher_res["w_center_y"],
303 np.log10(np.ma.masked_invalid(fisher_pv)).T, cmap="RdGy")
304 plt.colorbar()
305 plt.xlabel(xlabel)
306 plt.ylabel(ylabel)
307 plt.title("Fisher test\np-value (logarithm of 10)")
308
309 #Savefig function added with preserving default behavior
310
311 if out_file_path==None:
312 plt.show()
313 else:
314 plt.savefig(out_file_path,dpi=300)
315
316
317 def HTML_Gen(html):
318
319 out_html = open(html,'w')
320 part_1 = """
321
322 <!DOCTYPE html>
323 <html lang="en">
324 <head>
325 <title>Bootstrap Example</title>
326 <meta charset="utf-8">
327 <meta name="viewport" content="width=device-width, initial-scale=1">
328 <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/css/bootstrap.min.css">
329 <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.4.0/jquery.min.js"></script>
330 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.0/js/bootstrap.min.js"></script>
331 <body>
332 <style>
333 div.container_1 {
334 width:600px;
335 margin: auto;
336 padding-right: 10;
337 }
338 div.table {
339 width:600px;
340 margin: auto;
341 padding-right: 10;
342 }
343 </style>
344 </head>
345 <div class="jumbotron text-center">
346 <h1> Fisher's Plot </h1>
347 </div>
348 <div class="container">
349 <div class="row">
350 <div class="col-sm-4">
351 <img src="1.png" alt="Smiley face" height="800" width="800">
352 </div>
353
354 </div>
355 </div>
356 </body>
357 </html>
358 """
359 out_html.write(part_1)
360 out_html.close()
361 # Load sets of amyloidogenic and non-amyloidogenic peptides:
362
363 def run(Fasta1, Fasta2, windows_per_frame, overlap_factor, xlabel, ylabel, pop1_label, pop2_label, htmlOutDir, htmlFname, Workdirpath):
364
365 if not os.path.exists(htmlOutDir):
366 os.makedirs(htmlOutDir)
367
368 amyload_pos_seq = load_fasta_file(Fasta1)
369 amyload_neg_seq = load_fasta_file(Fasta2)
370
371 # Calculate quantitive features: volume and hydropathy
372 mean_volume = Feature(get_aa2volume()).then(average)
373 mean_hydropathy = Feature(get_aa2hydropathy()).then(average)
374
375 fs = FeatureSet("volume'n'hydropathy")
376 fs.add(mean_volume)
377 fs.add(mean_hydropathy)
378
379 amyload_pos_conv_seq = fs(amyload_pos_seq)
380 amyload_neg_conv_seq = fs(amyload_neg_seq)
381
382 # Do local Fisher:
383 result = local_fisher_2d(amyload_pos_conv_seq, amyload_neg_conv_seq,
384 windows_per_frame=int(windows_per_frame), overlap_factor=int(overlap_factor))
385
386 # Plot local Fisher:
387 _plot_local_fisher_2d(result, xlabel=xlabel,
388 ylabel=ylabel,
389 pop1_label=pop1_label,
390 pop2_label=pop2_label,
391 out_file_path =os.path.join(os.getcwd(), "out.png")
392 )
393
394
395 # plt.savefig(os.path.join(Workdirpath, htmlOutDir, "1.png"))
396
397 HTML_Gen(os.path.join(Workdirpath, htmlOutDir, htmlFname))
398
399 if __name__=="__main__":
400
401
402 import argparse
403
404 parser = argparse.ArgumentParser()
405
406 parser.add_argument("-f1", "--Fasta1", required=True, default=None, help="First fasta file ")
407 parser.add_argument("-f2", "--Fasta2", required=True, default=None, help="Second fasta file")
408 parser.add_argument("-o", "--overlap_factor", required=False, default=5, help="Overlap factor")
409 parser.add_argument("-w", "--windows_per_frame", required=False, default=5, help="Windows per frame")
410 parser.add_argument("-x", "--xlabel", required=True, default=None, help="X label")
411 parser.add_argument("-y", "--ylabel", required=True, default=None, help="Y label")
412 parser.add_argument("-p1", "--pop1_label", required=True, default=None, help="First population label")
413 parser.add_argument("-p2", "--pop2_label", required=True, default=None, help="Second population label")
414 parser.add_argument("--htmlOutDir", required=False, default=os.path.join(os.getcwd(),'report_dir'), help="Path to html dir")
415 parser.add_argument("--htmlFname", required=False, help="html output file", default="report.html")
416 parser.add_argument("--Workdirpath", required=False, default=os.getcwd(), help="Path to output Working Directory")
417 args = parser.parse_args()
418
419 run(args.Fasta1, args.Fasta2, args.windows_per_frame, args.overlap_factor, args.xlabel, args.ylabel, args.pop1_label, args.pop2_label, args.htmlOutDir, args.htmlFname, args.Workdirpath)
420