hits_alg.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. """Hubs and authorities analysis of graph structure.
  2. """
  3. import networkx as nx
  4. __all__ = ["hits"]
  5. @nx._dispatch
  6. def hits(G, max_iter=100, tol=1.0e-8, nstart=None, normalized=True):
  7. """Returns HITS hubs and authorities values for nodes.
  8. The HITS algorithm computes two numbers for a node.
  9. Authorities estimates the node value based on the incoming links.
  10. Hubs estimates the node value based on outgoing links.
  11. Parameters
  12. ----------
  13. G : graph
  14. A NetworkX graph
  15. max_iter : integer, optional
  16. Maximum number of iterations in power method.
  17. tol : float, optional
  18. Error tolerance used to check convergence in power method iteration.
  19. nstart : dictionary, optional
  20. Starting value of each node for power method iteration.
  21. normalized : bool (default=True)
  22. Normalize results by the sum of all of the values.
  23. Returns
  24. -------
  25. (hubs,authorities) : two-tuple of dictionaries
  26. Two dictionaries keyed by node containing the hub and authority
  27. values.
  28. Raises
  29. ------
  30. PowerIterationFailedConvergence
  31. If the algorithm fails to converge to the specified tolerance
  32. within the specified number of iterations of the power iteration
  33. method.
  34. Examples
  35. --------
  36. >>> G = nx.path_graph(4)
  37. >>> h, a = nx.hits(G)
  38. Notes
  39. -----
  40. The eigenvector calculation is done by the power iteration method
  41. and has no guarantee of convergence. The iteration will stop
  42. after max_iter iterations or an error tolerance of
  43. number_of_nodes(G)*tol has been reached.
  44. The HITS algorithm was designed for directed graphs but this
  45. algorithm does not check if the input graph is directed and will
  46. execute on undirected graphs.
  47. References
  48. ----------
  49. .. [1] A. Langville and C. Meyer,
  50. "A survey of eigenvector methods of web information retrieval."
  51. http://citeseer.ist.psu.edu/713792.html
  52. .. [2] Jon Kleinberg,
  53. Authoritative sources in a hyperlinked environment
  54. Journal of the ACM 46 (5): 604-32, 1999.
  55. doi:10.1145/324133.324140.
  56. http://www.cs.cornell.edu/home/kleinber/auth.pdf.
  57. """
  58. import numpy as np
  59. import scipy as sp
  60. import scipy.sparse.linalg # call as sp.sparse.linalg
  61. if len(G) == 0:
  62. return {}, {}
  63. A = nx.adjacency_matrix(G, nodelist=list(G), dtype=float)
  64. if nstart is None:
  65. _, _, vt = sp.sparse.linalg.svds(A, k=1, maxiter=max_iter, tol=tol)
  66. else:
  67. nstart = np.array(list(nstart.values()))
  68. _, _, vt = sp.sparse.linalg.svds(A, k=1, v0=nstart, maxiter=max_iter, tol=tol)
  69. a = vt.flatten().real
  70. h = A @ a
  71. if normalized:
  72. h /= h.sum()
  73. a /= a.sum()
  74. hubs = dict(zip(G, map(float, h)))
  75. authorities = dict(zip(G, map(float, a)))
  76. return hubs, authorities
  77. def _hits_python(G, max_iter=100, tol=1.0e-8, nstart=None, normalized=True):
  78. if isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)):
  79. raise Exception("hits() not defined for graphs with multiedges.")
  80. if len(G) == 0:
  81. return {}, {}
  82. # choose fixed starting vector if not given
  83. if nstart is None:
  84. h = dict.fromkeys(G, 1.0 / G.number_of_nodes())
  85. else:
  86. h = nstart
  87. # normalize starting vector
  88. s = 1.0 / sum(h.values())
  89. for k in h:
  90. h[k] *= s
  91. for _ in range(max_iter): # power iteration: make up to max_iter iterations
  92. hlast = h
  93. h = dict.fromkeys(hlast.keys(), 0)
  94. a = dict.fromkeys(hlast.keys(), 0)
  95. # this "matrix multiply" looks odd because it is
  96. # doing a left multiply a^T=hlast^T*G
  97. for n in h:
  98. for nbr in G[n]:
  99. a[nbr] += hlast[n] * G[n][nbr].get("weight", 1)
  100. # now multiply h=Ga
  101. for n in h:
  102. for nbr in G[n]:
  103. h[n] += a[nbr] * G[n][nbr].get("weight", 1)
  104. # normalize vector
  105. s = 1.0 / max(h.values())
  106. for n in h:
  107. h[n] *= s
  108. # normalize vector
  109. s = 1.0 / max(a.values())
  110. for n in a:
  111. a[n] *= s
  112. # check convergence, l1 norm
  113. err = sum(abs(h[n] - hlast[n]) for n in h)
  114. if err < tol:
  115. break
  116. else:
  117. raise nx.PowerIterationFailedConvergence(max_iter)
  118. if normalized:
  119. s = 1.0 / sum(a.values())
  120. for n in a:
  121. a[n] *= s
  122. s = 1.0 / sum(h.values())
  123. for n in h:
  124. h[n] *= s
  125. return h, a
  126. def _hits_numpy(G, normalized=True):
  127. """Returns HITS hubs and authorities values for nodes.
  128. The HITS algorithm computes two numbers for a node.
  129. Authorities estimates the node value based on the incoming links.
  130. Hubs estimates the node value based on outgoing links.
  131. Parameters
  132. ----------
  133. G : graph
  134. A NetworkX graph
  135. normalized : bool (default=True)
  136. Normalize results by the sum of all of the values.
  137. Returns
  138. -------
  139. (hubs,authorities) : two-tuple of dictionaries
  140. Two dictionaries keyed by node containing the hub and authority
  141. values.
  142. Examples
  143. --------
  144. >>> G = nx.path_graph(4)
  145. The `hubs` and `authorities` are given by the eigenvectors corresponding to the
  146. maximum eigenvalues of the hubs_matrix and the authority_matrix, respectively.
  147. The ``hubs`` and ``authority`` matrices are computed from the adjancency
  148. matrix:
  149. >>> adj_ary = nx.to_numpy_array(G)
  150. >>> hubs_matrix = adj_ary @ adj_ary.T
  151. >>> authority_matrix = adj_ary.T @ adj_ary
  152. `_hits_numpy` maps the eigenvector corresponding to the maximum eigenvalue
  153. of the respective matrices to the nodes in `G`:
  154. >>> from networkx.algorithms.link_analysis.hits_alg import _hits_numpy
  155. >>> hubs, authority = _hits_numpy(G)
  156. Notes
  157. -----
  158. The eigenvector calculation uses NumPy's interface to LAPACK.
  159. The HITS algorithm was designed for directed graphs but this
  160. algorithm does not check if the input graph is directed and will
  161. execute on undirected graphs.
  162. References
  163. ----------
  164. .. [1] A. Langville and C. Meyer,
  165. "A survey of eigenvector methods of web information retrieval."
  166. http://citeseer.ist.psu.edu/713792.html
  167. .. [2] Jon Kleinberg,
  168. Authoritative sources in a hyperlinked environment
  169. Journal of the ACM 46 (5): 604-32, 1999.
  170. doi:10.1145/324133.324140.
  171. http://www.cs.cornell.edu/home/kleinber/auth.pdf.
  172. """
  173. import numpy as np
  174. if len(G) == 0:
  175. return {}, {}
  176. adj_ary = nx.to_numpy_array(G)
  177. # Hub matrix
  178. H = adj_ary @ adj_ary.T
  179. e, ev = np.linalg.eig(H)
  180. h = ev[:, np.argmax(e)] # eigenvector corresponding to the maximum eigenvalue
  181. # Authority matrix
  182. A = adj_ary.T @ adj_ary
  183. e, ev = np.linalg.eig(A)
  184. a = ev[:, np.argmax(e)] # eigenvector corresponding to the maximum eigenvalue
  185. if normalized:
  186. h /= h.sum()
  187. a /= a.sum()
  188. else:
  189. h /= h.max()
  190. a /= a.max()
  191. hubs = dict(zip(G, map(float, h)))
  192. authorities = dict(zip(G, map(float, a)))
  193. return hubs, authorities
  194. def _hits_scipy(G, max_iter=100, tol=1.0e-6, nstart=None, normalized=True):
  195. """Returns HITS hubs and authorities values for nodes.
  196. The HITS algorithm computes two numbers for a node.
  197. Authorities estimates the node value based on the incoming links.
  198. Hubs estimates the node value based on outgoing links.
  199. Parameters
  200. ----------
  201. G : graph
  202. A NetworkX graph
  203. max_iter : integer, optional
  204. Maximum number of iterations in power method.
  205. tol : float, optional
  206. Error tolerance used to check convergence in power method iteration.
  207. nstart : dictionary, optional
  208. Starting value of each node for power method iteration.
  209. normalized : bool (default=True)
  210. Normalize results by the sum of all of the values.
  211. Returns
  212. -------
  213. (hubs,authorities) : two-tuple of dictionaries
  214. Two dictionaries keyed by node containing the hub and authority
  215. values.
  216. Examples
  217. --------
  218. >>> from networkx.algorithms.link_analysis.hits_alg import _hits_scipy
  219. >>> G = nx.path_graph(4)
  220. >>> h, a = _hits_scipy(G)
  221. Notes
  222. -----
  223. This implementation uses SciPy sparse matrices.
  224. The eigenvector calculation is done by the power iteration method
  225. and has no guarantee of convergence. The iteration will stop
  226. after max_iter iterations or an error tolerance of
  227. number_of_nodes(G)*tol has been reached.
  228. The HITS algorithm was designed for directed graphs but this
  229. algorithm does not check if the input graph is directed and will
  230. execute on undirected graphs.
  231. Raises
  232. ------
  233. PowerIterationFailedConvergence
  234. If the algorithm fails to converge to the specified tolerance
  235. within the specified number of iterations of the power iteration
  236. method.
  237. References
  238. ----------
  239. .. [1] A. Langville and C. Meyer,
  240. "A survey of eigenvector methods of web information retrieval."
  241. http://citeseer.ist.psu.edu/713792.html
  242. .. [2] Jon Kleinberg,
  243. Authoritative sources in a hyperlinked environment
  244. Journal of the ACM 46 (5): 604-632, 1999.
  245. doi:10.1145/324133.324140.
  246. http://www.cs.cornell.edu/home/kleinber/auth.pdf.
  247. """
  248. import numpy as np
  249. if len(G) == 0:
  250. return {}, {}
  251. A = nx.to_scipy_sparse_array(G, nodelist=list(G))
  252. (n, _) = A.shape # should be square
  253. ATA = A.T @ A # authority matrix
  254. # choose fixed starting vector if not given
  255. if nstart is None:
  256. x = np.ones((n, 1)) / n
  257. else:
  258. x = np.array([nstart.get(n, 0) for n in list(G)], dtype=float)
  259. x /= x.sum()
  260. # power iteration on authority matrix
  261. i = 0
  262. while True:
  263. xlast = x
  264. x = ATA @ x
  265. x /= x.max()
  266. # check convergence, l1 norm
  267. err = np.absolute(x - xlast).sum()
  268. if err < tol:
  269. break
  270. if i > max_iter:
  271. raise nx.PowerIterationFailedConvergence(max_iter)
  272. i += 1
  273. a = x.flatten()
  274. h = A @ a
  275. if normalized:
  276. h /= h.sum()
  277. a /= a.sum()
  278. hubs = dict(zip(G, map(float, h)))
  279. authorities = dict(zip(G, map(float, a)))
  280. return hubs, authorities