1
|
1 from collections import Counter
|
|
2
|
|
3 import numpy as np
|
|
4 import scipy as sp
|
|
5 import pandas as pd
|
|
6
|
|
7
|
|
8 def compute_influence(W,
|
|
9 alpha=0.9,
|
|
10 beta=0.1,
|
|
11 S=None,
|
|
12 rtype='df',
|
|
13 outputs=None,
|
|
14 n2i=None,
|
|
15 max_iter=1000,
|
|
16 tol=1e-7,
|
|
17 get_iter=False,
|
|
18 device="cpu",
|
|
19 sparse=False):
|
|
20 r"""Compute the influence.
|
|
21 It estimates the effects of a node to the other nodes,
|
|
22 by calculating partial derivative with respect to source nodes,
|
|
23 based on a simple iterative method.
|
|
24
|
|
25 Based on the below difference equation,
|
|
26
|
|
27 x(t+1) = alpha*W.dot(x(t)) + (1-alpha)*b
|
|
28
|
|
29 The influence matrix, S, is computed using chain rule of
|
|
30 partial derivative as follows.
|
|
31
|
|
32 \begin{align}
|
|
33 S_{ij} &= \frac{\partial{x_i}}{\partial{x_j}} \\
|
|
34 &= (I + \alpha W + \alpha^2 W^2 + ... + \alpha^{\infty}W^{\infty})_{ij} \\
|
|
35 &\approx (I + \alpha W + \alpha^2 W^2 + ... + \alpha^{l}W^{l})_{ij} \\
|
|
36 \end{align}
|
|
37
|
|
38 This is the summation of the weight multiplications along all paths
|
|
39 including cycles. $S_{ij}$ denotes the influence of node (j) on node (i).
|
|
40
|
|
41 An iterative method for an approximated solution is as follows.
|
|
42
|
|
43 S(t+1) = \alpha WS(t) + I,
|
|
44
|
|
45 where $S(0) = \beta I$ and $S(1) = \beta(I + \alpha W)$ $(t>1)$.
|
|
46
|
|
47 The iteration continues until $||S(t+1) - S(t)|| \leq tol$.
|
|
48
|
|
49
|
|
50 Parameters
|
|
51 ----------
|
|
52 W : numpy.ndarray
|
|
53 Weight matrix.
|
|
54 alpha : float, optional
|
|
55 Hyperparameter for adjusting the effect of signal flow.
|
|
56 beta : float, optional
|
|
57 Hyperparameter for adjusting the effect of basal activity.
|
|
58 S : numpy.ndarray, optional
|
|
59 Initial influence matrix.
|
|
60 rtype: str (optional)
|
|
61 Return object type: 'df' or 'array'.
|
|
62 outputs: list (or iterable) of str, optional
|
|
63 Names of output nodes, which is necessary for 'df' rtype.
|
|
64 n2i: dict, optional
|
|
65 Name to index dict, which is necessary for 'df' rtype.
|
|
66 max_iter : int, optional
|
|
67 The maximum iteration number for the estimation.
|
|
68 tol : float, optional
|
|
69 Tolerance for terminating the iteration.
|
|
70 get_iter : bool, optional
|
|
71 Determine whether the actual iteration number is returned.
|
|
72 device : str, optional, {'CPU', 'GPU:0', 'GPU:1', ...}
|
|
73 Select which device to use. 'CPU' is default.
|
|
74 sparse : bool, optional
|
|
75 Use sparse matrices for the computation.
|
|
76
|
|
77 Returns
|
|
78 -------
|
|
79 S : numpy.ndarray, optional
|
|
80 2D array of influence.
|
|
81 df : pd.DataFrame, optional
|
|
82 Influences for each output in DataFrame.
|
|
83 num_iter : int, optional
|
|
84 The actual number of iteration.
|
|
85 """
|
|
86 # TODO: Test rendering the above mathematical expressions in LaTeX form.
|
|
87
|
|
88 if max_iter < 2:
|
|
89 raise ValueError("max_iter should be greater than 2.")
|
|
90
|
|
91 device = device.lower()
|
|
92
|
|
93 if 'cpu' in device:
|
|
94 if sparse:
|
|
95 ret = _compute_influence_cpu_sparse(W, alpha, beta, S,
|
|
96 max_iter, tol, get_iter)
|
|
97 else:
|
|
98 ret = _compute_influence_cpu(W, alpha, beta, S,
|
|
99 max_iter, tol, get_iter)
|
|
100 elif 'gpu'in device:
|
|
101 _, id_device = device.split(':')
|
|
102 ret = _compute_influence_gpu(W, alpha, beta, S,
|
|
103 max_iter, tol, get_iter, id_device)
|
|
104
|
|
105 if rtype == 'df':
|
|
106 import cupy as cp
|
|
107 if isinstance(ret, cp.core.core.ndarray):
|
|
108 ret = cp.asnumpy(ret)
|
|
109
|
|
110 if get_iter:
|
|
111 S_ret, num_iter = ret
|
|
112 else:
|
|
113 S_ret = ret
|
|
114
|
|
115 if rtype == 'array':
|
|
116 return ret
|
|
117 elif rtype == 'df':
|
|
118 if not outputs:
|
|
119 err_msg = "outputs should be designated for 'df' return type."
|
|
120 raise ValueError(err_msg)
|
|
121
|
|
122 df = pd.DataFrame(columns=outputs)
|
|
123
|
|
124 for trg in outputs:
|
|
125 for src in n2i:
|
|
126 if src == trg:
|
|
127 df.loc[src, trg] = np.inf
|
|
128
|
|
129 idx_src = n2i[src]
|
|
130 idx_trg = n2i[trg]
|
|
131 df.loc[src, trg] = S_ret[idx_trg, idx_src]
|
|
132
|
|
133 if get_iter:
|
|
134 return df, num_iter
|
|
135 else:
|
|
136 return df
|
|
137 else:
|
|
138 raise ValueError("Unknown return type: %s"%(rtype))
|
|
139
|
|
140
|
|
141 def _compute_influence_cpu(W, alpha=0.5, beta=0.5, S=None,
|
|
142 max_iter=1000, tol=1e-6, get_iter=False):
|
|
143 N = W.shape[0]
|
|
144 if S is not None:
|
|
145 S1 = S
|
|
146 else:
|
|
147 S1 = np.eye(N, dtype=np.float)
|
|
148
|
|
149 I = np.eye(N, dtype=np.float)
|
|
150 S2 = np.zeros_like(W)
|
|
151 aW = alpha * W
|
|
152 for cnt in range(max_iter):
|
|
153 S2[:, :] = S1.dot(aW) + I
|
|
154 norm = np.linalg.norm(S2 - S1)
|
|
155 if norm < tol:
|
|
156 break
|
|
157 # end of if
|
|
158 S1[:, :] = S2
|
|
159 # end of for
|
|
160
|
|
161 S_fin = beta * S2
|
|
162 if get_iter:
|
|
163 return S_fin, cnt
|
|
164 else:
|
|
165 return S_fin
|
|
166
|
|
167
|
|
168 def _compute_influence_cpu_sparse(W, alpha, beta, S,
|
|
169 max_iter, tol, get_iter):
|
|
170 N = W.shape[0]
|
|
171 if S is not None:
|
|
172 S1 = S
|
|
173 else:
|
|
174 S1 = sp.sparse.lil_matrix(sp.sparse.eye(N, dtype=np.float))
|
|
175
|
|
176
|
|
177 I = sp.sparse.eye(N, dtype=np.float)
|
|
178 S2 = sp.sparse.lil_matrix((N,N), dtype=np.float)
|
|
179 aW = sp.sparse.csc_matrix(alpha * W)
|
|
180 for cnt in range(max_iter):
|
|
181 S2[:, :] = S1.dot(aW) + I
|
|
182 norm = sp.sparse.linalg.norm(S2 - S1)
|
|
183 if norm < tol:
|
|
184 break
|
|
185 # end of if
|
|
186 S1[:, :] = S2
|
|
187 # end of for
|
|
188
|
|
189 S_fin = beta * S2
|
|
190 if get_iter:
|
|
191 return S_fin, cnt
|
|
192 else:
|
|
193 return S_fin
|
|
194
|
|
195
|
|
196 def _compute_influence_gpu(W, alpha=0.5, beta=0.5, S=None,
|
|
197 max_iter=1000, tol=1e-6, get_iter=False,
|
|
198 id_device=0):
|
|
199 import cupy as cp
|
|
200 cp.cuda.Device(id_device).use()
|
|
201 N = W.shape[0]
|
|
202 I = cp.eye(N, dtype=cp.float32) #np.eye(N, N, dtype=np.float)
|
|
203 if S is not None:
|
|
204 S1 = cp.array(S, dtype=cp.float32)
|
|
205 else:
|
|
206 S1 = cp.eye(N, dtype=cp.float32)
|
|
207
|
|
208 S2 = cp.zeros((N,N), dtype=cp.float32)
|
|
209 aW = alpha * cp.array(W, dtype=cp.float32)
|
|
210
|
|
211 tol_gpu = cp.array(tol)
|
|
212
|
|
213 for cnt in range(max_iter):
|
|
214 S2[:, :] = cp.dot(S1, aW) + I
|
|
215 mat_norm = cp.linalg.norm(S2 - S1)
|
|
216 if mat_norm < tol_gpu:
|
|
217 break
|
|
218 # end of if
|
|
219 S1[:, :] = S2
|
|
220 # end of for
|
|
221
|
|
222 S_fin = beta*S2
|
|
223 if get_iter:
|
|
224 return S_fin, cnt
|
|
225 else:
|
|
226 return S_fin
|
|
227
|
|
228
|
|
229
|
|
230 def arrange_si(
|
|
231 df_splo,
|
|
232 df_inf,
|
|
233 output,
|
|
234 min_splo=None,
|
|
235 max_splo=None,
|
|
236 thr_inf=1e-10,
|
|
237 ascending=True):
|
|
238
|
|
239 # SPLO-Influence data
|
|
240 if not min_splo:
|
|
241 min_splo = df_splo.min()
|
|
242
|
|
243 if not max_splo:
|
|
244 max_splo = df_splo.max()
|
|
245
|
|
246 mask_splo = (min_splo <= df_splo) & (df_splo <= max_splo)
|
|
247 df_splo = df_splo[mask_splo]
|
|
248
|
|
249 df_splo = pd.DataFrame(df_splo)
|
|
250 df_splo.columns = ['SPLO']
|
|
251
|
|
252 if output in df_splo.index:
|
|
253 df_splo.drop(output, inplace=True)
|
|
254
|
|
255 index_common = df_splo.index.intersection(df_inf.index)
|
|
256 df_inf = pd.DataFrame(df_inf.loc[index_common])
|
|
257
|
|
258 mark_drop = df_inf[output].abs() <= thr_inf
|
|
259 df_inf.drop(df_inf.loc[mark_drop, output].index,
|
|
260 inplace=True)
|
|
261
|
|
262
|
|
263 df_si = df_inf.join(df_splo.loc[index_common])
|
|
264 df_si.index.name = 'Source'
|
|
265 df_si.reset_index(inplace=True)
|
|
266
|
|
267 cnt_splo = Counter(df_si['SPLO'])
|
|
268 splos = sorted(cnt_splo.keys())
|
|
269
|
|
270 si = {}
|
|
271 for i, splo in enumerate(splos):
|
|
272 df_sub = df_si[df_si['SPLO'] == splo]
|
|
273 df_sub = df_sub.sort_values(by=output,
|
|
274 ascending=ascending)
|
|
275 #num_items = df_sub[output].count()
|
|
276 #influence = np.zeros((cnt_max,)) # Influence
|
|
277 #num_empty = cnt_max - num_items
|
|
278 #influence[num_empty:] = df_sub[output]
|
|
279 #names = df_sub['Source'].tolist()
|
|
280 si[splo] = df_sub #[output]
|
|
281
|
|
282 return si
|
|
283
|
|
284
|
|
285 def prioritize(df_splo,
|
|
286 df_inf,
|
|
287 output,
|
|
288 dac,
|
|
289 thr_rank=3,
|
|
290 min_group_size=0,
|
|
291 min_splo=None,
|
|
292 max_splo=None,
|
|
293 thr_inf=1e-10,
|
|
294 ):
|
|
295 """Prioritize target candiates.
|
|
296
|
|
297 Parameters
|
|
298 ----------
|
|
299 df_splo : pandas.DataFrame
|
|
300 Dataframe for SPLO information.
|
|
301 df_inf : pandas.DataFrame
|
|
302 Dataframe for influence information.
|
|
303 output : str
|
|
304 Names of output node, which is necessary for 'df_inf'.
|
|
305 dac : int
|
|
306 Direction of activity change (DAC) of the output.
|
|
307 thr_rank : int or float
|
|
308 Rank to filter out the entities.
|
|
309 The entities whose ranks are greater than thr_rank survive.
|
|
310 min_group_size : int
|
|
311 Minimum group size to be satisfied.
|
|
312 """
|
|
313 ascending = True if dac < 0 else False
|
|
314
|
|
315 df_inf_dac = df_inf[np.sign(df_inf[output]) == dac]
|
|
316 si = arrange_si(df_splo,
|
|
317 df_inf_dac,
|
|
318 output,
|
|
319 min_splo,
|
|
320 max_splo,
|
|
321 thr_inf,
|
|
322 ascending)
|
|
323 targets = []
|
|
324 for splo in si:
|
|
325 # Get the group of this SPLO.
|
|
326 df_sub = si[splo]
|
|
327
|
|
328 if df_sub.shape[0] < min_group_size:
|
|
329 continue
|
|
330
|
|
331 # Get the entities that have the designated dac.
|
|
332 df_sub = df_sub[np.sign(df_sub[output]) == dac]
|
|
333
|
|
334 # Get the enetities whose rank exceeds the threshods.
|
|
335 if 0 < thr_rank < 1:
|
|
336 ix_max_rank = int(thr_rank * df_sub.shape[0])
|
|
337 if ix_max_rank == 0:
|
|
338 ix_max_rank = df_sub.shape[0]
|
|
339 else:
|
|
340 ix_max_rank = thr_rank
|
|
341
|
|
342 #print(ix_max_rank)
|
|
343 df_top = df_sub.iloc[:ix_max_rank, :]
|
|
344
|
|
345 targets.extend(df_top['Source'].tolist())
|
|
346 # end of for
|
|
347 return targets |