comparison env/lib/python3.9/site-packages/networkx/algorithms/link_analysis/pagerank_alg.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:4f3585e2f14b
1 """PageRank analysis of graph structure. """
2 import networkx as nx
3 from networkx.utils import not_implemented_for
4
5 __all__ = ["pagerank", "pagerank_numpy", "pagerank_scipy", "google_matrix"]
6
7
8 @not_implemented_for("multigraph")
9 def pagerank(
10 G,
11 alpha=0.85,
12 personalization=None,
13 max_iter=100,
14 tol=1.0e-6,
15 nstart=None,
16 weight="weight",
17 dangling=None,
18 ):
19 """Returns the PageRank of the nodes in the graph.
20
21 PageRank computes a ranking of the nodes in the graph G based on
22 the structure of the incoming links. It was originally designed as
23 an algorithm to rank web pages.
24
25 Parameters
26 ----------
27 G : graph
28 A NetworkX graph. Undirected graphs will be converted to a directed
29 graph with two directed edges for each undirected edge.
30
31 alpha : float, optional
32 Damping parameter for PageRank, default=0.85.
33
34 personalization: dict, optional
35 The "personalization vector" consisting of a dictionary with a
36 key some subset of graph nodes and personalization value each of those.
37 At least one personalization value must be non-zero.
38 If not specfiied, a nodes personalization value will be zero.
39 By default, a uniform distribution is used.
40
41 max_iter : integer, optional
42 Maximum number of iterations in power method eigenvalue solver.
43
44 tol : float, optional
45 Error tolerance used to check convergence in power method solver.
46
47 nstart : dictionary, optional
48 Starting value of PageRank iteration for each node.
49
50 weight : key, optional
51 Edge data key to use as weight. If None weights are set to 1.
52
53 dangling: dict, optional
54 The outedges to be assigned to any "dangling" nodes, i.e., nodes without
55 any outedges. The dict key is the node the outedge points to and the dict
56 value is the weight of that outedge. By default, dangling nodes are given
57 outedges according to the personalization vector (uniform if not
58 specified). This must be selected to result in an irreducible transition
59 matrix (see notes under google_matrix). It may be common to have the
60 dangling dict to be the same as the personalization dict.
61
62 Returns
63 -------
64 pagerank : dictionary
65 Dictionary of nodes with PageRank as value
66
67 Examples
68 --------
69 >>> G = nx.DiGraph(nx.path_graph(4))
70 >>> pr = nx.pagerank(G, alpha=0.9)
71
72 Notes
73 -----
74 The eigenvector calculation is done by the power iteration method
75 and has no guarantee of convergence. The iteration will stop after
76 an error tolerance of ``len(G) * tol`` has been reached. If the
77 number of iterations exceed `max_iter`, a
78 :exc:`networkx.exception.PowerIterationFailedConvergence` exception
79 is raised.
80
81 The PageRank algorithm was designed for directed graphs but this
82 algorithm does not check if the input graph is directed and will
83 execute on undirected graphs by converting each edge in the
84 directed graph to two edges.
85
86 See Also
87 --------
88 pagerank_numpy, pagerank_scipy, google_matrix
89
90 Raises
91 ------
92 PowerIterationFailedConvergence
93 If the algorithm fails to converge to the specified tolerance
94 within the specified number of iterations of the power iteration
95 method.
96
97 References
98 ----------
99 .. [1] A. Langville and C. Meyer,
100 "A survey of eigenvector methods of web information retrieval."
101 http://citeseer.ist.psu.edu/713792.html
102 .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
103 The PageRank citation ranking: Bringing order to the Web. 1999
104 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
105
106 """
107 if len(G) == 0:
108 return {}
109
110 if not G.is_directed():
111 D = G.to_directed()
112 else:
113 D = G
114
115 # Create a copy in (right) stochastic form
116 W = nx.stochastic_graph(D, weight=weight)
117 N = W.number_of_nodes()
118
119 # Choose fixed starting vector if not given
120 if nstart is None:
121 x = dict.fromkeys(W, 1.0 / N)
122 else:
123 # Normalized nstart vector
124 s = float(sum(nstart.values()))
125 x = {k: v / s for k, v in nstart.items()}
126
127 if personalization is None:
128 # Assign uniform personalization vector if not given
129 p = dict.fromkeys(W, 1.0 / N)
130 else:
131 s = float(sum(personalization.values()))
132 p = {k: v / s for k, v in personalization.items()}
133
134 if dangling is None:
135 # Use personalization vector if dangling vector not specified
136 dangling_weights = p
137 else:
138 s = float(sum(dangling.values()))
139 dangling_weights = {k: v / s for k, v in dangling.items()}
140 dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
141
142 # power iteration: make up to max_iter iterations
143 for _ in range(max_iter):
144 xlast = x
145 x = dict.fromkeys(xlast.keys(), 0)
146 danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
147 for n in x:
148 # this matrix multiply looks odd because it is
149 # doing a left multiply x^T=xlast^T*W
150 for nbr in W[n]:
151 x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
152 x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
153 # check convergence, l1 norm
154 err = sum([abs(x[n] - xlast[n]) for n in x])
155 if err < N * tol:
156 return x
157 raise nx.PowerIterationFailedConvergence(max_iter)
158
159
160 def google_matrix(
161 G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None
162 ):
163 """Returns the Google matrix of the graph.
164
165 Parameters
166 ----------
167 G : graph
168 A NetworkX graph. Undirected graphs will be converted to a directed
169 graph with two directed edges for each undirected edge.
170
171 alpha : float
172 The damping factor.
173
174 personalization: dict, optional
175 The "personalization vector" consisting of a dictionary with a
176 key some subset of graph nodes and personalization value each of those.
177 At least one personalization value must be non-zero.
178 If not specfiied, a nodes personalization value will be zero.
179 By default, a uniform distribution is used.
180
181 nodelist : list, optional
182 The rows and columns are ordered according to the nodes in nodelist.
183 If nodelist is None, then the ordering is produced by G.nodes().
184
185 weight : key, optional
186 Edge data key to use as weight. If None weights are set to 1.
187
188 dangling: dict, optional
189 The outedges to be assigned to any "dangling" nodes, i.e., nodes without
190 any outedges. The dict key is the node the outedge points to and the dict
191 value is the weight of that outedge. By default, dangling nodes are given
192 outedges according to the personalization vector (uniform if not
193 specified) This must be selected to result in an irreducible transition
194 matrix (see notes below). It may be common to have the dangling dict to
195 be the same as the personalization dict.
196
197 Returns
198 -------
199 A : NumPy matrix
200 Google matrix of the graph
201
202 Notes
203 -----
204 The matrix returned represents the transition matrix that describes the
205 Markov chain used in PageRank. For PageRank to converge to a unique
206 solution (i.e., a unique stationary distribution in a Markov chain), the
207 transition matrix must be irreducible. In other words, it must be that
208 there exists a path between every pair of nodes in the graph, or else there
209 is the potential of "rank sinks."
210
211 This implementation works with Multi(Di)Graphs. For multigraphs the
212 weight between two nodes is set to be the sum of all edge weights
213 between those nodes.
214
215 See Also
216 --------
217 pagerank, pagerank_numpy, pagerank_scipy
218 """
219 import numpy as np
220
221 if nodelist is None:
222 nodelist = list(G)
223
224 M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
225 N = len(G)
226 if N == 0:
227 return M
228
229 # Personalization vector
230 if personalization is None:
231 p = np.repeat(1.0 / N, N)
232 else:
233 p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
234 p /= p.sum()
235
236 # Dangling nodes
237 if dangling is None:
238 dangling_weights = p
239 else:
240 # Convert the dangling dictionary into an array in nodelist order
241 dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
242 dangling_weights /= dangling_weights.sum()
243 dangling_nodes = np.where(M.sum(axis=1) == 0)[0]
244
245 # Assign dangling_weights to any dangling nodes (nodes with no out links)
246 for node in dangling_nodes:
247 M[node] = dangling_weights
248
249 M /= M.sum(axis=1) # Normalize rows to sum to 1
250
251 return alpha * M + (1 - alpha) * p
252
253
254 def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
255 """Returns the PageRank of the nodes in the graph.
256
257 PageRank computes a ranking of the nodes in the graph G based on
258 the structure of the incoming links. It was originally designed as
259 an algorithm to rank web pages.
260
261 Parameters
262 ----------
263 G : graph
264 A NetworkX graph. Undirected graphs will be converted to a directed
265 graph with two directed edges for each undirected edge.
266
267 alpha : float, optional
268 Damping parameter for PageRank, default=0.85.
269
270 personalization: dict, optional
271 The "personalization vector" consisting of a dictionary with a
272 key some subset of graph nodes and personalization value each of those.
273 At least one personalization value must be non-zero.
274 If not specfiied, a nodes personalization value will be zero.
275 By default, a uniform distribution is used.
276
277 weight : key, optional
278 Edge data key to use as weight. If None weights are set to 1.
279
280 dangling: dict, optional
281 The outedges to be assigned to any "dangling" nodes, i.e., nodes without
282 any outedges. The dict key is the node the outedge points to and the dict
283 value is the weight of that outedge. By default, dangling nodes are given
284 outedges according to the personalization vector (uniform if not
285 specified) This must be selected to result in an irreducible transition
286 matrix (see notes under google_matrix). It may be common to have the
287 dangling dict to be the same as the personalization dict.
288
289 Returns
290 -------
291 pagerank : dictionary
292 Dictionary of nodes with PageRank as value.
293
294 Examples
295 --------
296 >>> G = nx.DiGraph(nx.path_graph(4))
297 >>> pr = nx.pagerank_numpy(G, alpha=0.9)
298
299 Notes
300 -----
301 The eigenvector calculation uses NumPy's interface to the LAPACK
302 eigenvalue solvers. This will be the fastest and most accurate
303 for small graphs.
304
305 This implementation works with Multi(Di)Graphs. For multigraphs the
306 weight between two nodes is set to be the sum of all edge weights
307 between those nodes.
308
309 See Also
310 --------
311 pagerank, pagerank_scipy, google_matrix
312
313 References
314 ----------
315 .. [1] A. Langville and C. Meyer,
316 "A survey of eigenvector methods of web information retrieval."
317 http://citeseer.ist.psu.edu/713792.html
318 .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
319 The PageRank citation ranking: Bringing order to the Web. 1999
320 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
321 """
322 import numpy as np
323
324 if len(G) == 0:
325 return {}
326 M = google_matrix(
327 G, alpha, personalization=personalization, weight=weight, dangling=dangling
328 )
329 # use numpy LAPACK solver
330 eigenvalues, eigenvectors = np.linalg.eig(M.T)
331 ind = np.argmax(eigenvalues)
332 # eigenvector of largest eigenvalue is at ind, normalized
333 largest = np.array(eigenvectors[:, ind]).flatten().real
334 norm = float(largest.sum())
335 return dict(zip(G, map(float, largest / norm)))
336
337
338 def pagerank_scipy(
339 G,
340 alpha=0.85,
341 personalization=None,
342 max_iter=100,
343 tol=1.0e-6,
344 nstart=None,
345 weight="weight",
346 dangling=None,
347 ):
348 """Returns the PageRank of the nodes in the graph.
349
350 PageRank computes a ranking of the nodes in the graph G based on
351 the structure of the incoming links. It was originally designed as
352 an algorithm to rank web pages.
353
354 Parameters
355 ----------
356 G : graph
357 A NetworkX graph. Undirected graphs will be converted to a directed
358 graph with two directed edges for each undirected edge.
359
360 alpha : float, optional
361 Damping parameter for PageRank, default=0.85.
362
363 personalization: dict, optional
364 The "personalization vector" consisting of a dictionary with a
365 key some subset of graph nodes and personalization value each of those.
366 At least one personalization value must be non-zero.
367 If not specfiied, a nodes personalization value will be zero.
368 By default, a uniform distribution is used.
369
370 max_iter : integer, optional
371 Maximum number of iterations in power method eigenvalue solver.
372
373 tol : float, optional
374 Error tolerance used to check convergence in power method solver.
375
376 nstart : dictionary, optional
377 Starting value of PageRank iteration for each node.
378
379 weight : key, optional
380 Edge data key to use as weight. If None weights are set to 1.
381
382 dangling: dict, optional
383 The outedges to be assigned to any "dangling" nodes, i.e., nodes without
384 any outedges. The dict key is the node the outedge points to and the dict
385 value is the weight of that outedge. By default, dangling nodes are given
386 outedges according to the personalization vector (uniform if not
387 specified) This must be selected to result in an irreducible transition
388 matrix (see notes under google_matrix). It may be common to have the
389 dangling dict to be the same as the personalization dict.
390
391 Returns
392 -------
393 pagerank : dictionary
394 Dictionary of nodes with PageRank as value
395
396 Examples
397 --------
398 >>> G = nx.DiGraph(nx.path_graph(4))
399 >>> pr = nx.pagerank_scipy(G, alpha=0.9)
400
401 Notes
402 -----
403 The eigenvector calculation uses power iteration with a SciPy
404 sparse matrix representation.
405
406 This implementation works with Multi(Di)Graphs. For multigraphs the
407 weight between two nodes is set to be the sum of all edge weights
408 between those nodes.
409
410 See Also
411 --------
412 pagerank, pagerank_numpy, google_matrix
413
414 Raises
415 ------
416 PowerIterationFailedConvergence
417 If the algorithm fails to converge to the specified tolerance
418 within the specified number of iterations of the power iteration
419 method.
420
421 References
422 ----------
423 .. [1] A. Langville and C. Meyer,
424 "A survey of eigenvector methods of web information retrieval."
425 http://citeseer.ist.psu.edu/713792.html
426 .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
427 The PageRank citation ranking: Bringing order to the Web. 1999
428 http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
429 """
430 import numpy as np
431 import scipy.sparse
432
433 N = len(G)
434 if N == 0:
435 return {}
436
437 nodelist = list(G)
438 M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight, dtype=float)
439 S = np.array(M.sum(axis=1)).flatten()
440 S[S != 0] = 1.0 / S[S != 0]
441 Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format="csr")
442 M = Q * M
443
444 # initial vector
445 if nstart is None:
446 x = np.repeat(1.0 / N, N)
447 else:
448 x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
449 x = x / x.sum()
450
451 # Personalization vector
452 if personalization is None:
453 p = np.repeat(1.0 / N, N)
454 else:
455 p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
456 p = p / p.sum()
457
458 # Dangling nodes
459 if dangling is None:
460 dangling_weights = p
461 else:
462 # Convert the dangling dictionary into an array in nodelist order
463 dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
464 dangling_weights /= dangling_weights.sum()
465 is_dangling = np.where(S == 0)[0]
466
467 # power iteration: make up to max_iter iterations
468 for _ in range(max_iter):
469 xlast = x
470 x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
471 # check convergence, l1 norm
472 err = np.absolute(x - xlast).sum()
473 if err < N * tol:
474 return dict(zip(nodelist, map(float, x)))
475 raise nx.PowerIterationFailedConvergence(max_iter)