- """Functions for detecting communities based on modularity."""
- from collections import defaultdict
- import networkx as nx
- from networkx.algorithms.community.quality import modularity
- from networkx.utils import not_implemented_for
- from networkx.utils.mapped_queue import MappedQueue
- __all__ = [
- "greedy_modularity_communities",
- "naive_greedy_modularity_communities",
- ]


def _greedy_modularity_communities_generator(G, weight=None, resolution=1):
    r"""Yield community partitions of G and the modularity change at each step.

    This function performs Clauset-Newman-Moore greedy modularity maximization [2]_.
    At each step of the process it yields the change in modularity that will occur in
    the next step followed by yielding the new community partition after that step.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities that lead to the largest
    modularity until one community contains all nodes (the partition has one set).

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight. If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    Yields
    ------
    Alternating yield statements produce the following two objects:

    communities : dict_values
        A dict_values of frozensets of nodes, one for each community.
        This represents a partition of the nodes of the graph into communities.
        The first yield is the partition with each node in its own community.

    dq : float
        The change in modularity when merging the next two communities
        that leads to the largest modularity.

    See Also
    --------
    modularity

    References
    ----------
    .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
       Detection" Phys. Rev. E74, 2006.
    .. [4] Newman, M. E. J. "Analysis of weighted networks"
       Physical Review E 70(5 Pt 2):056131, 2004.
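
    Examples
    --------
    A minimal illustration of the alternating yields on the karate club graph;
    the exact merge order and gains depend on the input, this only shows the
    calling pattern:

    >>> G = nx.karate_club_graph()
    >>> community_gen = _greedy_modularity_communities_generator(G)
    >>> communities = next(community_gen)  # initial singleton partition
    >>> len(communities)
    34
    >>> dq = next(community_gen)  # modularity gain of the best merge
    >>> communities = next(community_gen)  # partition after that merge
    >>> len(communities)
    33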
- """
- directed = G.is_directed()
- N = G.number_of_nodes()
- # Count edges (or the sum of edge-weights for weighted graphs)
- m = G.size(weight)
- q0 = 1 / m
- # Calculate degrees (notation from the papers)
- # a : the fraction of (weighted) out-degree for each node
- # b : the fraction of (weighted) in-degree for each node
- if directed:
- a = {node: deg_out * q0 for node, deg_out in G.out_degree(weight=weight)}
- b = {node: deg_in * q0 for node, deg_in in G.in_degree(weight=weight)}
- else:
- a = b = {node: deg * q0 * 0.5 for node, deg in G.degree(weight=weight)}
- # this preliminary step collects the edge weights for each node pair
- # It handles multigraph and digraph and works fine for graph.
- dq_dict = defaultdict(lambda: defaultdict(float))
- for u, v, wt in G.edges(data=weight, default=1):
- if u == v:
- continue
- dq_dict[u][v] += wt
- dq_dict[v][u] += wt
- # now scale and subtract the expected edge-weights term
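    # For two singleton communities {u} and {v} of an undirected graph this
    # reduces to the usual CNM gain dq = w_uv / m - resolution * k_u * k_v / (2 * m**2),
    # i.e. the observed minus the expected (weighted) edges between the two communities.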
    for u, nbrdict in dq_dict.items():
        for v, wt in nbrdict.items():
            dq_dict[u][v] = q0 * wt - resolution * (a[u] * b[v] + b[u] * a[v])

    # Use -dq to get a max_heap instead of a min_heap
    # dq_heap holds a heap for each node's neighbors
    dq_heap = {u: MappedQueue({(u, v): -dq for v, dq in dq_dict[u].items()}) for u in G}
    # H -> all_dq_heap holds a heap with the best items for each node
    H = MappedQueue([dq_heap[n].heap[0] for n in G if len(dq_heap[n]) > 0])

    # Initialize single-node communities
    communities = {n: frozenset([n]) for n in G}
    yield communities.values()

    # Merge the two communities that lead to the largest modularity
    while len(H) > 1:
        # Find best merge
        # Remove from heap of row maxes
        # Ties will be broken by choosing the pair with lowest min community id
        try:
            negdq, u, v = H.pop()
        except IndexError:
            break
        dq = -negdq
        yield dq
        # Remove best merge from row u heap
        dq_heap[u].pop()
        # Push new row max onto H
        if len(dq_heap[u]) > 0:
            H.push(dq_heap[u].heap[0])
        # If this element was also at the root of row v, we need to remove the
        # duplicate entry from H
        if dq_heap[v].heap[0] == (v, u):
            H.remove((v, u))
            # Remove best merge from row v heap
            dq_heap[v].remove((v, u))
            # Push new row max onto H
            if len(dq_heap[v]) > 0:
                H.push(dq_heap[v].heap[0])
        else:
            # Duplicate wasn't in H, just remove from row v heap
            dq_heap[v].remove((v, u))

        # Perform merge
        communities[v] = frozenset(communities[u] | communities[v])
        del communities[u]

        # Get neighbor communities connected to the merged communities
        u_nbrs = set(dq_dict[u])
        v_nbrs = set(dq_dict[v])
        all_nbrs = (u_nbrs | v_nbrs) - {u, v}
        both_nbrs = u_nbrs & v_nbrs
        # Update dq for merge of u into v
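        # For each neighbor w of the merged pair:
        #  - if w was adjacent to both u and v, the two existing dq entries simply
        #    add up (their expected-weight terms are already accounted for);
        #  - if w was adjacent to only one of them, the expected-weight term for
        #    the previously non-adjacent pair is subtracted now, since that pair
        #    had no dq entry before the merge.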
        for w in all_nbrs:
            # Calculate new dq value
            if w in both_nbrs:
                dq_vw = dq_dict[v][w] + dq_dict[u][w]
            elif w in v_nbrs:
                dq_vw = dq_dict[v][w] - resolution * (a[u] * b[w] + a[w] * b[u])
            else:  # w in u_nbrs
                dq_vw = dq_dict[u][w] - resolution * (a[v] * b[w] + a[w] * b[v])
            # Update rows v and w
            for row, col in [(v, w), (w, v)]:
                dq_heap_row = dq_heap[row]
                # Update dict for v,w only (u is removed below)
                dq_dict[row][col] = dq_vw
                # Save old max of per-row heap
                if len(dq_heap_row) > 0:
                    d_oldmax = dq_heap_row.heap[0]
                else:
                    d_oldmax = None
                # Add/update heaps
                d = (row, col)
                d_negdq = -dq_vw
                # Save old value for finding heap index
                if w in v_nbrs:
                    # Update existing element in per-row heap
                    dq_heap_row.update(d, d, priority=d_negdq)
                else:
                    # We're creating a new nonzero element, add to heap
                    dq_heap_row.push(d, priority=d_negdq)
                # Update heap of row maxes if necessary
                if d_oldmax is None:
                    # No entries previously in this row, push new max
                    H.push(d, priority=d_negdq)
                else:
                    # We've updated an entry in this row, has the max changed?
                    row_max = dq_heap_row.heap[0]
                    if d_oldmax != row_max or d_oldmax.priority != row_max.priority:
                        H.update(d_oldmax, row_max)

        # Remove row/col u from dq_dict matrix
        for w in dq_dict[u]:
            # Remove from dict
            dq_old = dq_dict[w][u]
            del dq_dict[w][u]
            # Remove from heaps if we haven't already
            if w != v:
                # Remove both row and column
                for row, col in [(w, u), (u, w)]:
                    dq_heap_row = dq_heap[row]
                    # Check if replaced dq is row max
                    d_old = (row, col)
                    if dq_heap_row.heap[0] == d_old:
                        # Update per-row heap and heap of row maxes
                        dq_heap_row.remove(d_old)
                        H.remove(d_old)
                        # Update row max
                        if len(dq_heap_row) > 0:
                            H.push(dq_heap_row.heap[0])
                    else:
                        # Only update per-row heap
                        dq_heap_row.remove(d_old)

        del dq_dict[u]
        # Mark row u as deleted, but keep placeholder
        dq_heap[u] = MappedQueue()
        # Merge u into v and update a
        a[v] += a[u]
        a[u] = 0
        if directed:
            b[v] += b[u]
            b[u] = 0

        yield communities.values()


def greedy_modularity_communities(
    G,
    weight=None,
    resolution=1,
    cutoff=1,
    best_n=None,
):
    r"""Find communities in G using greedy modularity maximization.

    This function uses Clauset-Newman-Moore greedy modularity maximization [2]_
    to find the community partition with the largest modularity.

    Greedy modularity maximization begins with each node in its own community
    and repeatedly joins the pair of communities that lead to the largest
    modularity until no further increase in modularity is possible (a maximum).

    Two keyword arguments adjust the stopping condition. `cutoff` is a lower
    limit on the number of communities so you can stop the process before
    reaching a maximum (used to save computation time). `best_n` is an upper
    limit on the number of communities so you can make the process continue
    until at most n communities remain even if the maximum modularity occurs
    for more. To obtain exactly n communities, set both `cutoff` and `best_n` to n.

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight. If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    resolution : float, optional (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    cutoff : int, optional (default=1)
        A minimum number of communities below which the merging process stops.
        The process stops at this number of communities even if modularity
        is not maximized. The goal is to let the user stop the process early.
        The process stops before the cutoff if it finds a maximum of modularity.

    best_n : int or None, optional (default=None)
        A maximum number of communities above which the merging process will
        not stop. This forces community merging to continue after modularity
        starts to decrease until `best_n` communities remain.
        If ``None``, don't force it to continue beyond a maximum.

    Raises
    ------
    ValueError : If the `cutoff` or `best_n` value is not in the range
        ``[1, G.number_of_nodes()]``, or if `best_n` < `cutoff`.

    Returns
    -------
    communities : list
        A list of frozensets of nodes, one for each community.
        Sorted by length with largest communities first.

    Examples
    --------
    >>> G = nx.karate_club_graph()
    >>> c = nx.community.greedy_modularity_communities(G)
    >>> sorted(c[0])
    [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
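
    Setting both `cutoff` and `best_n` to the same value makes the process stop
    at exactly that number of communities, as described above:

    >>> c = nx.community.greedy_modularity_communities(G, cutoff=4, best_n=4)
    >>> len(c)
    4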

    See Also
    --------
    modularity

    References
    ----------
    .. [1] Newman, M. E. J. "Networks: An Introduction", page 224
       Oxford University Press 2011.
    .. [2] Clauset, A., Newman, M. E., & Moore, C.
       "Finding community structure in very large networks."
       Physical Review E 70(6), 2004.
    .. [3] Reichardt and Bornholdt "Statistical Mechanics of Community
       Detection" Phys. Rev. E74, 2006.
    .. [4] Newman, M. E. J. "Analysis of weighted networks"
       Physical Review E 70(5 Pt 2):056131, 2004.
    """
    if (cutoff < 1) or (cutoff > G.number_of_nodes()):
        raise ValueError(f"cutoff must be between 1 and {len(G)}. Got {cutoff}.")
    if best_n is not None:
        if (best_n < 1) or (best_n > G.number_of_nodes()):
            raise ValueError(f"best_n must be between 1 and {len(G)}. Got {best_n}.")
        if best_n < cutoff:
            raise ValueError(f"Must have best_n >= cutoff. Got {best_n} < {cutoff}")
        if best_n == 1:
            return [set(G)]
    else:
        best_n = G.number_of_nodes()

    # retrieve generator object to construct output
    community_gen = _greedy_modularity_communities_generator(
        G, weight=weight, resolution=resolution
    )

    # construct the first best community
    communities = next(community_gen)

    # continue merging communities until one of the breaking criteria is satisfied
    while len(communities) > cutoff:
        try:
            dq = next(community_gen)
        # StopIteration occurs when communities are the connected components
        except StopIteration:
            communities = sorted(communities, key=len, reverse=True)
            # if best_n requires more merging, merge big sets for highest modularity
            while len(communities) > best_n:
                comm1, comm2, *rest = communities
                # the communities are disjoint, so union merges the two largest sets
                communities = [comm1 | comm2]
                communities.extend(rest)
            return communities

        # keep going unless max_mod is reached or best_n says to merge more
        if dq < 0 and len(communities) <= best_n:
            break
        communities = next(community_gen)

    return sorted(communities, key=len, reverse=True)


@not_implemented_for("directed")
@not_implemented_for("multigraph")
def naive_greedy_modularity_communities(G, resolution=1, weight=None):
    r"""Find communities in G using greedy modularity maximization.

    This implementation is O(n^4), much slower than alternatives, but it is
    provided as an easy-to-understand reference implementation.

    Greedy modularity maximization begins with each node in its own community
    and joins the pair of communities that most increases modularity until no
    such pair exists.

    This function maximizes the generalized modularity, where `resolution`
    is the resolution parameter, often expressed as $\gamma$.
    See :func:`~networkx.algorithms.community.quality.modularity`.

    Parameters
    ----------
    G : NetworkX graph

    resolution : float (default=1)
        If resolution is less than 1, modularity favors larger communities.
        Greater than 1 favors smaller communities.

    weight : string or None, optional (default=None)
        The name of an edge attribute that holds the numerical value used
        as a weight. If None, then each edge has weight 1.
        The degree is the sum of the edge weights adjacent to the node.

    Returns
    -------
    list
        A list of sets of nodes, one for each community.
        Sorted by length with largest communities first.

    Examples
    --------
    >>> G = nx.karate_club_graph()
    >>> c = nx.community.naive_greedy_modularity_communities(G)
    >>> sorted(c[0])
    [8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
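
    A resolution below 1 favors larger communities; the call below only
    illustrates passing the parameter (the resulting partition depends on the
    graph):

    >>> c2 = nx.community.naive_greedy_modularity_communities(G, resolution=0.5)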

    See Also
    --------
    greedy_modularity_communities
    modularity
    """
    # First create one community for each node
    communities = [frozenset([u]) for u in G.nodes()]
    # Track merges
    merges = []
    # Greedily merge communities until no improvement is possible
    old_modularity = None
    new_modularity = modularity(G, communities, resolution=resolution, weight=weight)
    while old_modularity is None or new_modularity > old_modularity:
        # Save modularity for comparison
        old_modularity = new_modularity
        # Find best pair to merge
        trial_communities = list(communities)
        to_merge = None
        for i, u in enumerate(communities):
            for j, v in enumerate(communities):
                # Skip i==j and empty communities
                if j <= i or len(u) == 0 or len(v) == 0:
                    continue
                # Merge communities u and v
                trial_communities[j] = u | v
                trial_communities[i] = frozenset([])
                trial_modularity = modularity(
                    G, trial_communities, resolution=resolution, weight=weight
                )
                if trial_modularity >= new_modularity:
                    # Check if strictly better or tie
                    if trial_modularity > new_modularity:
                        # Found new best, save modularity and group indexes
                        new_modularity = trial_modularity
                        to_merge = (i, j, new_modularity - old_modularity)
                    elif to_merge and min(i, j) < min(to_merge[0], to_merge[1]):
                        # Break ties by choosing pair with lowest min id
                        new_modularity = trial_modularity
                        to_merge = (i, j, new_modularity - old_modularity)
                # Un-merge
                trial_communities[i] = u
                trial_communities[j] = v
        if to_merge is not None:
            # If the best merge improves modularity, use it
            merges.append(to_merge)
            i, j, dq = to_merge
            u, v = communities[i], communities[j]
            communities[j] = u | v
            communities[i] = frozenset([])
    # Remove empty communities and sort
    return sorted((c for c in communities if len(c) > 0), key=len, reverse=True)