  1. """Functions for detecting communities based on modularity."""
  2. from collections import defaultdict
  3. import networkx as nx
  4. from networkx.algorithms.community.quality import modularity
  5. from networkx.utils import not_implemented_for
  6. from networkx.utils.mapped_queue import MappedQueue
  7. __all__ = [
  8. "greedy_modularity_communities",
  9. "naive_greedy_modularity_communities",
  10. ]
def _greedy_modularity_communities_generator(G, weight=None, resolution=1):
    r"""Yield community partitions of G and the modularity change at each step.

    This function performs Clauset-Newman-Moore greedy modularity maximization [2]_
    At each step of the process it yields the change in modularity that will occur in
    the next step followed by yielding the new community partition after that step.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities that lead to the largest
    modularity until one community contains all nodes (the partition has one set).

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight. If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    Yields
    ------
    Alternating yield statements produce the following two objects:

    communities: dict_values
        A dict_values of frozensets of nodes, one for each community.
        This represents a partition of the nodes of the graph into communities.
        The first yield is the partition with each node in its own community.

    dq: float
        The change in modularity when merging the next two communities
        that leads to the largest modularity.

    See Also
    --------
    modularity

    References
    ----------
    .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
       Detection" Phys. Rev. E74, 2006.
    .. [4] Newman, M. E. J."Analysis of weighted networks"
       Physical Review E 70(5 Pt 2):056131, 2004.
    """
    directed = G.is_directed()
    N = G.number_of_nodes()  # NOTE(review): not referenced again in this function
    # Count edges (or the sum of edge-weights for weighted graphs)
    m = G.size(weight)
    # NOTE(review): raises ZeroDivisionError if G has no edges (m == 0);
    # callers are expected to handle the edgeless case before calling.
    q0 = 1 / m
    # Calculate degrees (notation from the papers)
    # a : the fraction of (weighted) out-degree for each node
    # b : the fraction of (weighted) in-degree for each node
    if directed:
        a = {node: deg_out * q0 for node, deg_out in G.out_degree(weight=weight)}
        b = {node: deg_in * q0 for node, deg_in in G.in_degree(weight=weight)}
    else:
        # Undirected: a == b == deg / (2m), via the extra 0.5 factor.
        a = b = {node: deg * q0 * 0.5 for node, deg in G.degree(weight=weight)}

    # this preliminary step collects the edge weights for each node pair
    # It handles multigraph and digraph and works fine for graph.
    # Self-loops are skipped: they never affect a merge of two distinct
    # communities.
    dq_dict = defaultdict(lambda: defaultdict(float))
    for u, v, wt in G.edges(data=weight, default=1):
        if u == v:
            continue
        dq_dict[u][v] += wt
        dq_dict[v][u] += wt

    # now scale and subtract the expected edge-weights term
    for u, nbrdict in dq_dict.items():
        for v, wt in nbrdict.items():
            dq_dict[u][v] = q0 * wt - resolution * (a[u] * b[v] + b[u] * a[v])

    # Use -dq to get a max_heap instead of a min_heap
    # dq_heap holds a heap for each node's neighbors
    dq_heap = {u: MappedQueue({(u, v): -dq for v, dq in dq_dict[u].items()}) for u in G}
    # H -> all_dq_heap holds a heap with the best items for each node
    H = MappedQueue([dq_heap[n].heap[0] for n in G if len(dq_heap[n]) > 0])

    # Initialize single-node communities
    communities = {n: frozenset([n]) for n in G}
    yield communities.values()

    # Merge the two communities that lead to the largest modularity
    while len(H) > 1:
        # Find best merge
        # Remove from heap of row maxes
        # Ties will be broken by choosing the pair with lowest min community id
        try:
            negdq, u, v = H.pop()
        except IndexError:
            break
        dq = -negdq
        yield dq
        # Remove best merge from row u heap
        dq_heap[u].pop()
        # Push new row max onto H
        if len(dq_heap[u]) > 0:
            H.push(dq_heap[u].heap[0])
        # If this element was also at the root of row v, we need to remove the
        # duplicate entry from H
        if dq_heap[v].heap[0] == (v, u):
            H.remove((v, u))
            # Remove best merge from row v heap
            dq_heap[v].remove((v, u))
            # Push new row max onto H
            if len(dq_heap[v]) > 0:
                H.push(dq_heap[v].heap[0])
        else:
            # Duplicate wasn't in H, just remove from row v heap
            dq_heap[v].remove((v, u))

        # Perform merge: u is absorbed into v; u's entry is deleted below.
        communities[v] = frozenset(communities[u] | communities[v])
        del communities[u]

        # Get neighbor communities connected to the merged communities
        u_nbrs = set(dq_dict[u])
        v_nbrs = set(dq_dict[v])
        all_nbrs = (u_nbrs | v_nbrs) - {u, v}
        both_nbrs = u_nbrs & v_nbrs
        # Update dq for merge of u into v
        for w in all_nbrs:
            # Calculate new dq value
            if w in both_nbrs:
                dq_vw = dq_dict[v][w] + dq_dict[u][w]
            elif w in v_nbrs:
                dq_vw = dq_dict[v][w] - resolution * (a[u] * b[w] + a[w] * b[u])
            else:  # w in u_nbrs
                dq_vw = dq_dict[u][w] - resolution * (a[v] * b[w] + a[w] * b[v])
            # Update rows v and w
            for row, col in [(v, w), (w, v)]:
                dq_heap_row = dq_heap[row]
                # Update dict for v,w only (u is removed below)
                dq_dict[row][col] = dq_vw
                # Save old max of per-row heap
                if len(dq_heap_row) > 0:
                    d_oldmax = dq_heap_row.heap[0]
                else:
                    d_oldmax = None
                # Add/update heaps
                d = (row, col)
                d_negdq = -dq_vw
                # Save old value for finding heap index
                if w in v_nbrs:
                    # Update existing element in per-row heap
                    dq_heap_row.update(d, d, priority=d_negdq)
                else:
                    # We're creating a new nonzero element, add to heap
                    dq_heap_row.push(d, priority=d_negdq)
                # Update heap of row maxes if necessary
                if d_oldmax is None:
                    # No entries previously in this row, push new max
                    H.push(d, priority=d_negdq)
                else:
                    # We've updated an entry in this row, has the max changed?
                    row_max = dq_heap_row.heap[0]
                    if d_oldmax != row_max or d_oldmax.priority != row_max.priority:
                        H.update(d_oldmax, row_max)

        # Remove row/col u from dq_dict matrix
        for w in dq_dict[u]:
            # Remove from dict
            dq_old = dq_dict[w][u]
            del dq_dict[w][u]
            # Remove from heaps if we haven't already
            if w != v:
                # Remove both row and column
                for row, col in [(w, u), (u, w)]:
                    dq_heap_row = dq_heap[row]
                    # Check if replaced dq is row max
                    d_old = (row, col)
                    if dq_heap_row.heap[0] == d_old:
                        # Update per-row heap and heap of row maxes
                        dq_heap_row.remove(d_old)
                        H.remove(d_old)
                        # Update row max
                        if len(dq_heap_row) > 0:
                            H.push(dq_heap_row.heap[0])
                    else:
                        # Only update per-row heap
                        dq_heap_row.remove(d_old)

        del dq_dict[u]
        # Mark row u as deleted, but keep placeholder
        dq_heap[u] = MappedQueue()
        # Merge u into v and update a
        a[v] += a[u]
        a[u] = 0
        if directed:
            b[v] += b[u]
            b[u] = 0

        yield communities.values()
  196. def greedy_modularity_communities(
  197. G,
  198. weight=None,
  199. resolution=1,
  200. cutoff=1,
  201. best_n=None,
  202. ):
  203. r"""Find communities in G using greedy modularity maximization.
  204. This function uses Clauset-Newman-Moore greedy modularity maximization [2]_
  205. to find the community partition with the largest modularity.
  206. Greedy modularity maximization begins with each node in its own community
  207. and repeatedly joins the pair of communities that lead to the largest
  208. modularity until no further increase in modularity is possible (a maximum).
  209. Two keyword arguments adjust the stopping condition. `cutoff` is a lower
  210. limit on the number of communities so you can stop the process before
  211. reaching a maximum (used to save computation time). `best_n` is an upper
  212. limit on the number of communities so you can make the process continue
  213. until at most n communities remain even if the maximum modularity occurs
  214. for more. To obtain exactly n communities, set both `cutoff` and `best_n` to n.
  215. This function maximizes the generalized modularity, where `resolution`
  216. is the resolution parameter, often expressed as $\gamma$.
  217. See :func:`~networkx.algorithms.community.quality.modularity`.
  218. Parameters
  219. ----------
  220. G : NetworkX graph
  221. weight : string or None, optional (default=None)
  222. The name of an edge attribute that holds the numerical value used
  223. as a weight. If None, then each edge has weight 1.
  224. The degree is the sum of the edge weights adjacent to the node.
  225. resolution : float, optional (default=1)
  226. If resolution is less than 1, modularity favors larger communities.
  227. Greater than 1 favors smaller communities.
  228. cutoff : int, optional (default=1)
  229. A minimum number of communities below which the merging process stops.
  230. The process stops at this number of communities even if modularity
  231. is not maximized. The goal is to let the user stop the process early.
  232. The process stops before the cutoff if it finds a maximum of modularity.
  233. best_n : int or None, optional (default=None)
  234. A maximum number of communities above which the merging process will
  235. not stop. This forces community merging to continue after modularity
  236. starts to decrease until `best_n` communities remain.
  237. If ``None``, don't force it to continue beyond a maximum.
  238. Raises
  239. ------
  240. ValueError : If the `cutoff` or `best_n` value is not in the range
  241. ``[1, G.number_of_nodes()]``, or if `best_n` < `cutoff`.
  242. Returns
  243. -------
  244. communities: list
  245. A list of frozensets of nodes, one for each community.
  246. Sorted by length with largest communities first.
  247. Examples
  248. --------
  249. >>> G = nx.karate_club_graph()
  250. >>> c = nx.community.greedy_modularity_communities(G)
  251. >>> sorted(c[0])
  252. [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
  253. See Also
  254. --------
  255. modularity
  256. References
  257. ----------
  258. .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
  259. Oxford University Press 2011.
  260. .. [2] Clauset, A., Newman, M. E., & Moore, C.
  261. "Finding community structure in very large networks."
  262. Physical Review E 70(6), 2004.
  263. .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
  264. Detection" Phys. Rev. E74, 2006.
  265. .. [4] Newman, M. E. J."Analysis of weighted networks"
  266. Physical Review E 70(5 Pt 2):056131, 2004.
  267. """
  268. if (cutoff < 1) or (cutoff > G.number_of_nodes()):
  269. raise ValueError(f"cutoff must be between 1 and {len(G)}. Got {cutoff}.")
  270. if best_n is not None:
  271. if (best_n < 1) or (best_n > G.number_of_nodes()):
  272. raise ValueError(f"best_n must be between 1 and {len(G)}. Got {best_n}.")
  273. if best_n < cutoff:
  274. raise ValueError(f"Must have best_n >= cutoff. Got {best_n} < {cutoff}")
  275. if best_n == 1:
  276. return [set(G)]
  277. else:
  278. best_n = G.number_of_nodes()
  279. # retrieve generator object to construct output
  280. community_gen = _greedy_modularity_communities_generator(
  281. G, weight=weight, resolution=resolution
  282. )
  283. # construct the first best community
  284. communities = next(community_gen)
  285. # continue merging communities until one of the breaking criteria is satisfied
  286. while len(communities) > cutoff:
  287. try:
  288. dq = next(community_gen)
  289. # StopIteration occurs when communities are the connected components
  290. except StopIteration:
  291. communities = sorted(communities, key=len, reverse=True)
  292. # if best_n requires more merging, merge big sets for highest modularity
  293. while len(communities) > best_n:
  294. comm1, comm2, *rest = communities
  295. communities = [comm1 ^ comm2]
  296. communities.extend(rest)
  297. return communities
  298. # keep going unless max_mod is reached or best_n says to merge more
  299. if dq < 0 and len(communities) <= best_n:
  300. break
  301. communities = next(community_gen)
  302. return sorted(communities, key=len, reverse=True)
  303. @not_implemented_for("directed")
  304. @not_implemented_for("multigraph")
  305. def naive_greedy_modularity_communities(G, resolution=1, weight=None):
  306. r"""Find communities in G using greedy modularity maximization.
  307. This implementation is O(n^4), much slower than alternatives, but it is
  308. provided as an easy-to-understand reference implementation.
  309. Greedy modularity maximization begins with each node in its own community
  310. and joins the pair of communities that most increases modularity until no
  311. such pair exists.
  312. This function maximizes the generalized modularity, where `resolution`
  313. is the resolution parameter, often expressed as $\gamma$.
  314. See :func:`~networkx.algorithms.community.quality.modularity`.
  315. Parameters
  316. ----------
  317. G : NetworkX graph
  318. resolution : float (default=1)
  319. If resolution is less than 1, modularity favors larger communities.
  320. Greater than 1 favors smaller communities.
  321. weight : string or None, optional (default=None)
  322. The name of an edge attribute that holds the numerical value used
  323. as a weight. If None, then each edge has weight 1.
  324. The degree is the sum of the edge weights adjacent to the node.
  325. Returns
  326. -------
  327. list
  328. A list of sets of nodes, one for each community.
  329. Sorted by length with largest communities first.
  330. Examples
  331. --------
  332. >>> G = nx.karate_club_graph()
  333. >>> c = nx.community.naive_greedy_modularity_communities(G)
  334. >>> sorted(c[0])
  335. [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
  336. See Also
  337. --------
  338. greedy_modularity_communities
  339. modularity
  340. """
  341. # First create one community for each node
  342. communities = [frozenset([u]) for u in G.nodes()]
  343. # Track merges
  344. merges = []
  345. # Greedily merge communities until no improvement is possible
  346. old_modularity = None
  347. new_modularity = modularity(G, communities, resolution=resolution, weight=weight)
  348. while old_modularity is None or new_modularity > old_modularity:
  349. # Save modularity for comparison
  350. old_modularity = new_modularity
  351. # Find best pair to merge
  352. trial_communities = list(communities)
  353. to_merge = None
  354. for i, u in enumerate(communities):
  355. for j, v in enumerate(communities):
  356. # Skip i==j and empty communities
  357. if j <= i or len(u) == 0 or len(v) == 0:
  358. continue
  359. # Merge communities u and v
  360. trial_communities[j] = u | v
  361. trial_communities[i] = frozenset([])
  362. trial_modularity = modularity(
  363. G, trial_communities, resolution=resolution, weight=weight
  364. )
  365. if trial_modularity >= new_modularity:
  366. # Check if strictly better or tie
  367. if trial_modularity > new_modularity:
  368. # Found new best, save modularity and group indexes
  369. new_modularity = trial_modularity
  370. to_merge = (i, j, new_modularity - old_modularity)
  371. elif to_merge and min(i, j) < min(to_merge[0], to_merge[1]):
  372. # Break ties by choosing pair with lowest min id
  373. new_modularity = trial_modularity
  374. to_merge = (i, j, new_modularity - old_modularity)
  375. # Un-merge
  376. trial_communities[i] = u
  377. trial_communities[j] = v
  378. if to_merge is not None:
  379. # If the best merge improves modularity, use it
  380. merges.append(to_merge)
  381. i, j, dq = to_merge
  382. u, v = communities[i], communities[j]
  383. communities[j] = u | v
  384. communities[i] = frozenset([])
  385. # Remove empty communities and sort
  386. return sorted((c for c in communities if len(c) > 0), key=len, reverse=True)