pagerank_alg.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. """PageRank analysis of graph structure. """
  2. from warnings import warn
  3. import networkx as nx
  4. __all__ = ["pagerank", "google_matrix"]
  5. @nx._dispatch
  6. def pagerank(
  7. G,
  8. alpha=0.85,
  9. personalization=None,
  10. max_iter=100,
  11. tol=1.0e-6,
  12. nstart=None,
  13. weight="weight",
  14. dangling=None,
  15. ):
  16. """Returns the PageRank of the nodes in the graph.
  17. PageRank computes a ranking of the nodes in the graph G based on
  18. the structure of the incoming links. It was originally designed as
  19. an algorithm to rank web pages.
  20. Parameters
  21. ----------
  22. G : graph
  23. A NetworkX graph. Undirected graphs will be converted to a directed
  24. graph with two directed edges for each undirected edge.
  25. alpha : float, optional
  26. Damping parameter for PageRank, default=0.85.
  27. personalization: dict, optional
  28. The "personalization vector" consisting of a dictionary with a
  29. key some subset of graph nodes and personalization value each of those.
  30. At least one personalization value must be non-zero.
  31. If not specified, a nodes personalization value will be zero.
  32. By default, a uniform distribution is used.
  33. max_iter : integer, optional
  34. Maximum number of iterations in power method eigenvalue solver.
  35. tol : float, optional
  36. Error tolerance used to check convergence in power method solver.
  37. The iteration will stop after a tolerance of ``len(G) * tol`` is reached.
  38. nstart : dictionary, optional
  39. Starting value of PageRank iteration for each node.
  40. weight : key, optional
  41. Edge data key to use as weight. If None weights are set to 1.
  42. dangling: dict, optional
  43. The outedges to be assigned to any "dangling" nodes, i.e., nodes without
  44. any outedges. The dict key is the node the outedge points to and the dict
  45. value is the weight of that outedge. By default, dangling nodes are given
  46. outedges according to the personalization vector (uniform if not
  47. specified). This must be selected to result in an irreducible transition
  48. matrix (see notes under google_matrix). It may be common to have the
  49. dangling dict to be the same as the personalization dict.
  50. Returns
  51. -------
  52. pagerank : dictionary
  53. Dictionary of nodes with PageRank as value
  54. Examples
  55. --------
  56. >>> G = nx.DiGraph(nx.path_graph(4))
  57. >>> pr = nx.pagerank(G, alpha=0.9)
  58. Notes
  59. -----
  60. The eigenvector calculation is done by the power iteration method
  61. and has no guarantee of convergence. The iteration will stop after
  62. an error tolerance of ``len(G) * tol`` has been reached. If the
  63. number of iterations exceed `max_iter`, a
  64. :exc:`networkx.exception.PowerIterationFailedConvergence` exception
  65. is raised.
  66. The PageRank algorithm was designed for directed graphs but this
  67. algorithm does not check if the input graph is directed and will
  68. execute on undirected graphs by converting each edge in the
  69. directed graph to two edges.
  70. See Also
  71. --------
  72. google_matrix
  73. Raises
  74. ------
  75. PowerIterationFailedConvergence
  76. If the algorithm fails to converge to the specified tolerance
  77. within the specified number of iterations of the power iteration
  78. method.
  79. References
  80. ----------
  81. .. [1] A. Langville and C. Meyer,
  82. "A survey of eigenvector methods of web information retrieval."
  83. http://citeseer.ist.psu.edu/713792.html
  84. .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
  85. The PageRank citation ranking: Bringing order to the Web. 1999
  86. http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
  87. """
  88. return _pagerank_scipy(
  89. G, alpha, personalization, max_iter, tol, nstart, weight, dangling
  90. )
  91. def _pagerank_python(
  92. G,
  93. alpha=0.85,
  94. personalization=None,
  95. max_iter=100,
  96. tol=1.0e-6,
  97. nstart=None,
  98. weight="weight",
  99. dangling=None,
  100. ):
  101. if len(G) == 0:
  102. return {}
  103. D = G.to_directed()
  104. # Create a copy in (right) stochastic form
  105. W = nx.stochastic_graph(D, weight=weight)
  106. N = W.number_of_nodes()
  107. # Choose fixed starting vector if not given
  108. if nstart is None:
  109. x = dict.fromkeys(W, 1.0 / N)
  110. else:
  111. # Normalized nstart vector
  112. s = sum(nstart.values())
  113. x = {k: v / s for k, v in nstart.items()}
  114. if personalization is None:
  115. # Assign uniform personalization vector if not given
  116. p = dict.fromkeys(W, 1.0 / N)
  117. else:
  118. s = sum(personalization.values())
  119. p = {k: v / s for k, v in personalization.items()}
  120. if dangling is None:
  121. # Use personalization vector if dangling vector not specified
  122. dangling_weights = p
  123. else:
  124. s = sum(dangling.values())
  125. dangling_weights = {k: v / s for k, v in dangling.items()}
  126. dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]
  127. # power iteration: make up to max_iter iterations
  128. for _ in range(max_iter):
  129. xlast = x
  130. x = dict.fromkeys(xlast.keys(), 0)
  131. danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
  132. for n in x:
  133. # this matrix multiply looks odd because it is
  134. # doing a left multiply x^T=xlast^T*W
  135. for _, nbr, wt in W.edges(n, data=weight):
  136. x[nbr] += alpha * xlast[n] * wt
  137. x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
  138. # check convergence, l1 norm
  139. err = sum(abs(x[n] - xlast[n]) for n in x)
  140. if err < N * tol:
  141. return x
  142. raise nx.PowerIterationFailedConvergence(max_iter)
  143. @nx._dispatch
  144. def google_matrix(
  145. G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None
  146. ):
  147. """Returns the Google matrix of the graph.
  148. Parameters
  149. ----------
  150. G : graph
  151. A NetworkX graph. Undirected graphs will be converted to a directed
  152. graph with two directed edges for each undirected edge.
  153. alpha : float
  154. The damping factor.
  155. personalization: dict, optional
  156. The "personalization vector" consisting of a dictionary with a
  157. key some subset of graph nodes and personalization value each of those.
  158. At least one personalization value must be non-zero.
  159. If not specified, a nodes personalization value will be zero.
  160. By default, a uniform distribution is used.
  161. nodelist : list, optional
  162. The rows and columns are ordered according to the nodes in nodelist.
  163. If nodelist is None, then the ordering is produced by G.nodes().
  164. weight : key, optional
  165. Edge data key to use as weight. If None weights are set to 1.
  166. dangling: dict, optional
  167. The outedges to be assigned to any "dangling" nodes, i.e., nodes without
  168. any outedges. The dict key is the node the outedge points to and the dict
  169. value is the weight of that outedge. By default, dangling nodes are given
  170. outedges according to the personalization vector (uniform if not
  171. specified) This must be selected to result in an irreducible transition
  172. matrix (see notes below). It may be common to have the dangling dict to
  173. be the same as the personalization dict.
  174. Returns
  175. -------
  176. A : 2D NumPy ndarray
  177. Google matrix of the graph
  178. Notes
  179. -----
  180. The array returned represents the transition matrix that describes the
  181. Markov chain used in PageRank. For PageRank to converge to a unique
  182. solution (i.e., a unique stationary distribution in a Markov chain), the
  183. transition matrix must be irreducible. In other words, it must be that
  184. there exists a path between every pair of nodes in the graph, or else there
  185. is the potential of "rank sinks."
  186. This implementation works with Multi(Di)Graphs. For multigraphs the
  187. weight between two nodes is set to be the sum of all edge weights
  188. between those nodes.
  189. See Also
  190. --------
  191. pagerank
  192. """
  193. import numpy as np
  194. if nodelist is None:
  195. nodelist = list(G)
  196. A = nx.to_numpy_array(G, nodelist=nodelist, weight=weight)
  197. N = len(G)
  198. if N == 0:
  199. return A
  200. # Personalization vector
  201. if personalization is None:
  202. p = np.repeat(1.0 / N, N)
  203. else:
  204. p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
  205. if p.sum() == 0:
  206. raise ZeroDivisionError
  207. p /= p.sum()
  208. # Dangling nodes
  209. if dangling is None:
  210. dangling_weights = p
  211. else:
  212. # Convert the dangling dictionary into an array in nodelist order
  213. dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
  214. dangling_weights /= dangling_weights.sum()
  215. dangling_nodes = np.where(A.sum(axis=1) == 0)[0]
  216. # Assign dangling_weights to any dangling nodes (nodes with no out links)
  217. A[dangling_nodes] = dangling_weights
  218. A /= A.sum(axis=1)[:, np.newaxis] # Normalize rows to sum to 1
  219. return alpha * A + (1 - alpha) * p
  220. def _pagerank_numpy(
  221. G, alpha=0.85, personalization=None, weight="weight", dangling=None
  222. ):
  223. """Returns the PageRank of the nodes in the graph.
  224. PageRank computes a ranking of the nodes in the graph G based on
  225. the structure of the incoming links. It was originally designed as
  226. an algorithm to rank web pages.
  227. Parameters
  228. ----------
  229. G : graph
  230. A NetworkX graph. Undirected graphs will be converted to a directed
  231. graph with two directed edges for each undirected edge.
  232. alpha : float, optional
  233. Damping parameter for PageRank, default=0.85.
  234. personalization: dict, optional
  235. The "personalization vector" consisting of a dictionary with a
  236. key some subset of graph nodes and personalization value each of those.
  237. At least one personalization value must be non-zero.
  238. If not specified, a nodes personalization value will be zero.
  239. By default, a uniform distribution is used.
  240. weight : key, optional
  241. Edge data key to use as weight. If None weights are set to 1.
  242. dangling: dict, optional
  243. The outedges to be assigned to any "dangling" nodes, i.e., nodes without
  244. any outedges. The dict key is the node the outedge points to and the dict
  245. value is the weight of that outedge. By default, dangling nodes are given
  246. outedges according to the personalization vector (uniform if not
  247. specified) This must be selected to result in an irreducible transition
  248. matrix (see notes under google_matrix). It may be common to have the
  249. dangling dict to be the same as the personalization dict.
  250. Returns
  251. -------
  252. pagerank : dictionary
  253. Dictionary of nodes with PageRank as value.
  254. Examples
  255. --------
  256. >>> from networkx.algorithms.link_analysis.pagerank_alg import _pagerank_numpy
  257. >>> G = nx.DiGraph(nx.path_graph(4))
  258. >>> pr = _pagerank_numpy(G, alpha=0.9)
  259. Notes
  260. -----
  261. The eigenvector calculation uses NumPy's interface to the LAPACK
  262. eigenvalue solvers. This will be the fastest and most accurate
  263. for small graphs.
  264. This implementation works with Multi(Di)Graphs. For multigraphs the
  265. weight between two nodes is set to be the sum of all edge weights
  266. between those nodes.
  267. See Also
  268. --------
  269. pagerank, google_matrix
  270. References
  271. ----------
  272. .. [1] A. Langville and C. Meyer,
  273. "A survey of eigenvector methods of web information retrieval."
  274. http://citeseer.ist.psu.edu/713792.html
  275. .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
  276. The PageRank citation ranking: Bringing order to the Web. 1999
  277. http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
  278. """
  279. import numpy as np
  280. if len(G) == 0:
  281. return {}
  282. M = google_matrix(
  283. G, alpha, personalization=personalization, weight=weight, dangling=dangling
  284. )
  285. # use numpy LAPACK solver
  286. eigenvalues, eigenvectors = np.linalg.eig(M.T)
  287. ind = np.argmax(eigenvalues)
  288. # eigenvector of largest eigenvalue is at ind, normalized
  289. largest = np.array(eigenvectors[:, ind]).flatten().real
  290. norm = largest.sum()
  291. return dict(zip(G, map(float, largest / norm)))
  292. def _pagerank_scipy(
  293. G,
  294. alpha=0.85,
  295. personalization=None,
  296. max_iter=100,
  297. tol=1.0e-6,
  298. nstart=None,
  299. weight="weight",
  300. dangling=None,
  301. ):
  302. """Returns the PageRank of the nodes in the graph.
  303. PageRank computes a ranking of the nodes in the graph G based on
  304. the structure of the incoming links. It was originally designed as
  305. an algorithm to rank web pages.
  306. Parameters
  307. ----------
  308. G : graph
  309. A NetworkX graph. Undirected graphs will be converted to a directed
  310. graph with two directed edges for each undirected edge.
  311. alpha : float, optional
  312. Damping parameter for PageRank, default=0.85.
  313. personalization: dict, optional
  314. The "personalization vector" consisting of a dictionary with a
  315. key some subset of graph nodes and personalization value each of those.
  316. At least one personalization value must be non-zero.
  317. If not specified, a nodes personalization value will be zero.
  318. By default, a uniform distribution is used.
  319. max_iter : integer, optional
  320. Maximum number of iterations in power method eigenvalue solver.
  321. tol : float, optional
  322. Error tolerance used to check convergence in power method solver.
  323. The iteration will stop after a tolerance of ``len(G) * tol`` is reached.
  324. nstart : dictionary, optional
  325. Starting value of PageRank iteration for each node.
  326. weight : key, optional
  327. Edge data key to use as weight. If None weights are set to 1.
  328. dangling: dict, optional
  329. The outedges to be assigned to any "dangling" nodes, i.e., nodes without
  330. any outedges. The dict key is the node the outedge points to and the dict
  331. value is the weight of that outedge. By default, dangling nodes are given
  332. outedges according to the personalization vector (uniform if not
  333. specified) This must be selected to result in an irreducible transition
  334. matrix (see notes under google_matrix). It may be common to have the
  335. dangling dict to be the same as the personalization dict.
  336. Returns
  337. -------
  338. pagerank : dictionary
  339. Dictionary of nodes with PageRank as value
  340. Examples
  341. --------
  342. >>> from networkx.algorithms.link_analysis.pagerank_alg import _pagerank_scipy
  343. >>> G = nx.DiGraph(nx.path_graph(4))
  344. >>> pr = _pagerank_scipy(G, alpha=0.9)
  345. Notes
  346. -----
  347. The eigenvector calculation uses power iteration with a SciPy
  348. sparse matrix representation.
  349. This implementation works with Multi(Di)Graphs. For multigraphs the
  350. weight between two nodes is set to be the sum of all edge weights
  351. between those nodes.
  352. See Also
  353. --------
  354. pagerank
  355. Raises
  356. ------
  357. PowerIterationFailedConvergence
  358. If the algorithm fails to converge to the specified tolerance
  359. within the specified number of iterations of the power iteration
  360. method.
  361. References
  362. ----------
  363. .. [1] A. Langville and C. Meyer,
  364. "A survey of eigenvector methods of web information retrieval."
  365. http://citeseer.ist.psu.edu/713792.html
  366. .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
  367. The PageRank citation ranking: Bringing order to the Web. 1999
  368. http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
  369. """
  370. import numpy as np
  371. import scipy as sp
  372. import scipy.sparse # call as sp.sparse
  373. N = len(G)
  374. if N == 0:
  375. return {}
  376. nodelist = list(G)
  377. A = nx.to_scipy_sparse_array(G, nodelist=nodelist, weight=weight, dtype=float)
  378. S = A.sum(axis=1)
  379. S[S != 0] = 1.0 / S[S != 0]
  380. # TODO: csr_array
  381. Q = sp.sparse.csr_array(sp.sparse.spdiags(S.T, 0, *A.shape))
  382. A = Q @ A
  383. # initial vector
  384. if nstart is None:
  385. x = np.repeat(1.0 / N, N)
  386. else:
  387. x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
  388. x /= x.sum()
  389. # Personalization vector
  390. if personalization is None:
  391. p = np.repeat(1.0 / N, N)
  392. else:
  393. p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
  394. if p.sum() == 0:
  395. raise ZeroDivisionError
  396. p /= p.sum()
  397. # Dangling nodes
  398. if dangling is None:
  399. dangling_weights = p
  400. else:
  401. # Convert the dangling dictionary into an array in nodelist order
  402. dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
  403. dangling_weights /= dangling_weights.sum()
  404. is_dangling = np.where(S == 0)[0]
  405. # power iteration: make up to max_iter iterations
  406. for _ in range(max_iter):
  407. xlast = x
  408. x = alpha * (x @ A + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
  409. # check convergence, l1 norm
  410. err = np.absolute(x - xlast).sum()
  411. if err < N * tol:
  412. return dict(zip(nodelist, map(float, x)))
  413. raise nx.PowerIterationFailedConvergence(max_iter)