label_propagation.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. """
  2. Label propagation community detection algorithms.
  3. """
  4. from collections import Counter, defaultdict
  5. import networkx as nx
  6. from networkx.utils import groups, not_implemented_for, py_random_state
  7. __all__ = ["label_propagation_communities", "asyn_lpa_communities"]
  8. @py_random_state(2)
  9. def asyn_lpa_communities(G, weight=None, seed=None):
  10. """Returns communities in `G` as detected by asynchronous label
  11. propagation.
  12. The asynchronous label propagation algorithm is described in
  13. [1]_. The algorithm is probabilistic and the found communities may
  14. vary on different executions.
  15. The algorithm proceeds as follows. After initializing each node with
  16. a unique label, the algorithm repeatedly sets the label of a node to
  17. be the label that appears most frequently among that nodes
  18. neighbors. The algorithm halts when each node has the label that
  19. appears most frequently among its neighbors. The algorithm is
  20. asynchronous because each node is updated without waiting for
  21. updates on the remaining nodes.
  22. This generalized version of the algorithm in [1]_ accepts edge
  23. weights.
  24. Parameters
  25. ----------
  26. G : Graph
  27. weight : string
  28. The edge attribute representing the weight of an edge.
  29. If None, each edge is assumed to have weight one. In this
  30. algorithm, the weight of an edge is used in determining the
  31. frequency with which a label appears among the neighbors of a
  32. node: a higher weight means the label appears more often.
  33. seed : integer, random_state, or None (default)
  34. Indicator of random number generation state.
  35. See :ref:`Randomness<randomness>`.
  36. Returns
  37. -------
  38. communities : iterable
  39. Iterable of communities given as sets of nodes.
  40. Notes
  41. -----
  42. Edge weight attributes must be numerical.
  43. References
  44. ----------
  45. .. [1] Raghavan, Usha Nandini, Réka Albert, and Soundar Kumara. "Near
  46. linear time algorithm to detect community structures in large-scale
  47. networks." Physical Review E 76.3 (2007): 036106.
  48. """
  49. labels = {n: i for i, n in enumerate(G)}
  50. cont = True
  51. while cont:
  52. cont = False
  53. nodes = list(G)
  54. seed.shuffle(nodes)
  55. for node in nodes:
  56. if not G[node]:
  57. continue
  58. # Get label frequencies among adjacent nodes.
  59. # Depending on the order they are processed in,
  60. # some nodes will be in iteration t and others in t-1,
  61. # making the algorithm asynchronous.
  62. if weight is None:
  63. # initialising a Counter from an iterator of labels is
  64. # faster for getting unweighted label frequencies
  65. label_freq = Counter(map(labels.get, G[node]))
  66. else:
  67. # updating a defaultdict is substantially faster
  68. # for getting weighted label frequencies
  69. label_freq = defaultdict(float)
  70. for _, v, wt in G.edges(node, data=weight, default=1):
  71. label_freq[labels[v]] += wt
  72. # Get the labels that appear with maximum frequency.
  73. max_freq = max(label_freq.values())
  74. best_labels = [
  75. label for label, freq in label_freq.items() if freq == max_freq
  76. ]
  77. # If the node does not have one of the maximum frequency labels,
  78. # randomly choose one of them and update the node's label.
  79. # Continue the iteration as long as at least one node
  80. # doesn't have a maximum frequency label.
  81. if labels[node] not in best_labels:
  82. labels[node] = seed.choice(best_labels)
  83. cont = True
  84. yield from groups(labels).values()
  85. @not_implemented_for("directed")
  86. def label_propagation_communities(G):
  87. """Generates community sets determined by label propagation
  88. Finds communities in `G` using a semi-synchronous label propagation
  89. method [1]_. This method combines the advantages of both the synchronous
  90. and asynchronous models. Not implemented for directed graphs.
  91. Parameters
  92. ----------
  93. G : graph
  94. An undirected NetworkX graph.
  95. Returns
  96. -------
  97. communities : iterable
  98. A dict_values object that contains a set of nodes for each community.
  99. Raises
  100. ------
  101. NetworkXNotImplemented
  102. If the graph is directed
  103. References
  104. ----------
  105. .. [1] Cordasco, G., & Gargano, L. (2010, December). Community detection
  106. via semi-synchronous label propagation algorithms. In Business
  107. Applications of Social Network Analysis (BASNA), 2010 IEEE International
  108. Workshop on (pp. 1-8). IEEE.
  109. """
  110. coloring = _color_network(G)
  111. # Create a unique label for each node in the graph
  112. labeling = {v: k for k, v in enumerate(G)}
  113. while not _labeling_complete(labeling, G):
  114. # Update the labels of every node with the same color.
  115. for color, nodes in coloring.items():
  116. for n in nodes:
  117. _update_label(n, labeling, G)
  118. clusters = defaultdict(set)
  119. for node, label in labeling.items():
  120. clusters[label].add(node)
  121. return clusters.values()
  122. def _color_network(G):
  123. """Colors the network so that neighboring nodes all have distinct colors.
  124. Returns a dict keyed by color to a set of nodes with that color.
  125. """
  126. coloring = {} # color => set(node)
  127. colors = nx.coloring.greedy_color(G)
  128. for node, color in colors.items():
  129. if color in coloring:
  130. coloring[color].add(node)
  131. else:
  132. coloring[color] = {node}
  133. return coloring
  134. def _labeling_complete(labeling, G):
  135. """Determines whether or not LPA is done.
  136. Label propagation is complete when all nodes have a label that is
  137. in the set of highest frequency labels amongst its neighbors.
  138. Nodes with no neighbors are considered complete.
  139. """
  140. return all(
  141. labeling[v] in _most_frequent_labels(v, labeling, G) for v in G if len(G[v]) > 0
  142. )
  143. def _most_frequent_labels(node, labeling, G):
  144. """Returns a set of all labels with maximum frequency in `labeling`.
  145. Input `labeling` should be a dict keyed by node to labels.
  146. """
  147. if not G[node]:
  148. # Nodes with no neighbors are themselves a community and are labeled
  149. # accordingly, hence the immediate if statement.
  150. return {labeling[node]}
  151. # Compute the frequencies of all neighbours of node
  152. freqs = Counter(labeling[q] for q in G[node])
  153. max_freq = max(freqs.values())
  154. return {label for label, freq in freqs.items() if freq == max_freq}
  155. def _update_label(node, labeling, G):
  156. """Updates the label of a node using the Prec-Max tie breaking algorithm
  157. The algorithm is explained in: 'Community Detection via Semi-Synchronous
  158. Label Propagation Algorithms' Cordasco and Gargano, 2011
  159. """
  160. high_labels = _most_frequent_labels(node, labeling, G)
  161. if len(high_labels) == 1:
  162. labeling[node] = high_labels.pop()
  163. elif len(high_labels) > 1:
  164. # Prec-Max
  165. if labeling[node] not in high_labels:
  166. labeling[node] = max(high_labels)