similarity.py 57 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690
  1. """ Functions measuring similarity using graph edit distance.
  2. The graph edit distance is the number of edge/node changes needed
  3. to make two graphs isomorphic.
  4. The default algorithm/implementation is sub-optimal for some graphs.
  5. The problem of finding the exact Graph Edit Distance (GED) is NP-hard
  6. so it is often slow. If the simple interface `graph_edit_distance`
  7. takes too long for your graph, try `optimize_graph_edit_distance`
  8. and/or `optimize_edit_paths`.
  9. At the same time, I encourage capable people to investigate
  10. alternative GED algorithms, in order to improve the choices available.
  11. """
  12. import math
  13. import time
  14. import warnings
  15. from dataclasses import dataclass
  16. from itertools import product
  17. import networkx as nx
  18. __all__ = [
  19. "graph_edit_distance",
  20. "optimal_edit_paths",
  21. "optimize_graph_edit_distance",
  22. "optimize_edit_paths",
  23. "simrank_similarity",
  24. "panther_similarity",
  25. "generate_random_paths",
  26. ]
  27. def debug_print(*args, **kwargs):
  28. print(*args, **kwargs)
  29. def graph_edit_distance(
  30. G1,
  31. G2,
  32. node_match=None,
  33. edge_match=None,
  34. node_subst_cost=None,
  35. node_del_cost=None,
  36. node_ins_cost=None,
  37. edge_subst_cost=None,
  38. edge_del_cost=None,
  39. edge_ins_cost=None,
  40. roots=None,
  41. upper_bound=None,
  42. timeout=None,
  43. ):
  44. """Returns GED (graph edit distance) between graphs G1 and G2.
  45. Graph edit distance is a graph similarity measure analogous to
  46. Levenshtein distance for strings. It is defined as minimum cost
  47. of edit path (sequence of node and edge edit operations)
  48. transforming graph G1 to graph isomorphic to G2.
  49. Parameters
  50. ----------
  51. G1, G2: graphs
  52. The two graphs G1 and G2 must be of the same type.
  53. node_match : callable
  54. A function that returns True if node n1 in G1 and n2 in G2
  55. should be considered equal during matching.
  56. The function will be called like
  57. node_match(G1.nodes[n1], G2.nodes[n2]).
  58. That is, the function will receive the node attribute
  59. dictionaries for n1 and n2 as inputs.
  60. Ignored if node_subst_cost is specified. If neither
  61. node_match nor node_subst_cost are specified then node
  62. attributes are not considered.
  63. edge_match : callable
  64. A function that returns True if the edge attribute dictionaries
  65. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  66. be considered equal during matching.
  67. The function will be called like
  68. edge_match(G1[u1][v1], G2[u2][v2]).
  69. That is, the function will receive the edge attribute
  70. dictionaries of the edges under consideration.
  71. Ignored if edge_subst_cost is specified. If neither
  72. edge_match nor edge_subst_cost are specified then edge
  73. attributes are not considered.
  74. node_subst_cost, node_del_cost, node_ins_cost : callable
  75. Functions that return the costs of node substitution, node
  76. deletion, and node insertion, respectively.
  77. The functions will be called like
  78. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  79. node_del_cost(G1.nodes[n1]),
  80. node_ins_cost(G2.nodes[n2]).
  81. That is, the functions will receive the node attribute
  82. dictionaries as inputs. The functions are expected to return
  83. positive numeric values.
  84. Function node_subst_cost overrides node_match if specified.
  85. If neither node_match nor node_subst_cost are specified then
  86. default node substitution cost of 0 is used (node attributes
  87. are not considered during matching).
  88. If node_del_cost is not specified then default node deletion
  89. cost of 1 is used. If node_ins_cost is not specified then
  90. default node insertion cost of 1 is used.
  91. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  92. Functions that return the costs of edge substitution, edge
  93. deletion, and edge insertion, respectively.
  94. The functions will be called like
  95. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  96. edge_del_cost(G1[u1][v1]),
  97. edge_ins_cost(G2[u2][v2]).
  98. That is, the functions will receive the edge attribute
  99. dictionaries as inputs. The functions are expected to return
  100. positive numeric values.
  101. Function edge_subst_cost overrides edge_match if specified.
  102. If neither edge_match nor edge_subst_cost are specified then
  103. default edge substitution cost of 0 is used (edge attributes
  104. are not considered during matching).
  105. If edge_del_cost is not specified then default edge deletion
  106. cost of 1 is used. If edge_ins_cost is not specified then
  107. default edge insertion cost of 1 is used.
  108. roots : 2-tuple
  109. Tuple where first element is a node in G1 and the second
  110. is a node in G2.
  111. These nodes are forced to be matched in the comparison to
  112. allow comparison between rooted graphs.
  113. upper_bound : numeric
  114. Maximum edit distance to consider. Return None if no edit
  115. distance under or equal to upper_bound exists.
  116. timeout : numeric
  117. Maximum number of seconds to execute.
  118. After timeout is met, the current best GED is returned.
  119. Examples
  120. --------
  121. >>> G1 = nx.cycle_graph(6)
  122. >>> G2 = nx.wheel_graph(7)
  123. >>> nx.graph_edit_distance(G1, G2)
  124. 7.0
  125. >>> G1 = nx.star_graph(5)
  126. >>> G2 = nx.star_graph(5)
  127. >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
  128. 0.0
  129. >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
  130. 8.0
  131. See Also
  132. --------
  133. optimal_edit_paths, optimize_graph_edit_distance,
  134. is_isomorphic: test for graph edit distance of 0
  135. References
  136. ----------
  137. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  138. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  139. Pattern Recognition Problems. 4th International Conference on
  140. Pattern Recognition Applications and Methods 2015, Jan 2015,
  141. Lisbon, Portugal. 2015,
  142. <10.5220/0005209202710278>. <hal-01168816>
  143. https://hal.archives-ouvertes.fr/hal-01168816
  144. """
  145. bestcost = None
  146. for _, _, cost in optimize_edit_paths(
  147. G1,
  148. G2,
  149. node_match,
  150. edge_match,
  151. node_subst_cost,
  152. node_del_cost,
  153. node_ins_cost,
  154. edge_subst_cost,
  155. edge_del_cost,
  156. edge_ins_cost,
  157. upper_bound,
  158. True,
  159. roots,
  160. timeout,
  161. ):
  162. # assert bestcost is None or cost < bestcost
  163. bestcost = cost
  164. return bestcost
  165. def optimal_edit_paths(
  166. G1,
  167. G2,
  168. node_match=None,
  169. edge_match=None,
  170. node_subst_cost=None,
  171. node_del_cost=None,
  172. node_ins_cost=None,
  173. edge_subst_cost=None,
  174. edge_del_cost=None,
  175. edge_ins_cost=None,
  176. upper_bound=None,
  177. ):
  178. """Returns all minimum-cost edit paths transforming G1 to G2.
  179. Graph edit path is a sequence of node and edge edit operations
  180. transforming graph G1 to graph isomorphic to G2. Edit operations
  181. include substitutions, deletions, and insertions.
  182. Parameters
  183. ----------
  184. G1, G2: graphs
  185. The two graphs G1 and G2 must be of the same type.
  186. node_match : callable
  187. A function that returns True if node n1 in G1 and n2 in G2
  188. should be considered equal during matching.
  189. The function will be called like
  190. node_match(G1.nodes[n1], G2.nodes[n2]).
  191. That is, the function will receive the node attribute
  192. dictionaries for n1 and n2 as inputs.
  193. Ignored if node_subst_cost is specified. If neither
  194. node_match nor node_subst_cost are specified then node
  195. attributes are not considered.
  196. edge_match : callable
  197. A function that returns True if the edge attribute dictionaries
  198. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  199. be considered equal during matching.
  200. The function will be called like
  201. edge_match(G1[u1][v1], G2[u2][v2]).
  202. That is, the function will receive the edge attribute
  203. dictionaries of the edges under consideration.
  204. Ignored if edge_subst_cost is specified. If neither
  205. edge_match nor edge_subst_cost are specified then edge
  206. attributes are not considered.
  207. node_subst_cost, node_del_cost, node_ins_cost : callable
  208. Functions that return the costs of node substitution, node
  209. deletion, and node insertion, respectively.
  210. The functions will be called like
  211. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  212. node_del_cost(G1.nodes[n1]),
  213. node_ins_cost(G2.nodes[n2]).
  214. That is, the functions will receive the node attribute
  215. dictionaries as inputs. The functions are expected to return
  216. positive numeric values.
  217. Function node_subst_cost overrides node_match if specified.
  218. If neither node_match nor node_subst_cost are specified then
  219. default node substitution cost of 0 is used (node attributes
  220. are not considered during matching).
  221. If node_del_cost is not specified then default node deletion
  222. cost of 1 is used. If node_ins_cost is not specified then
  223. default node insertion cost of 1 is used.
  224. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  225. Functions that return the costs of edge substitution, edge
  226. deletion, and edge insertion, respectively.
  227. The functions will be called like
  228. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  229. edge_del_cost(G1[u1][v1]),
  230. edge_ins_cost(G2[u2][v2]).
  231. That is, the functions will receive the edge attribute
  232. dictionaries as inputs. The functions are expected to return
  233. positive numeric values.
  234. Function edge_subst_cost overrides edge_match if specified.
  235. If neither edge_match nor edge_subst_cost are specified then
  236. default edge substitution cost of 0 is used (edge attributes
  237. are not considered during matching).
  238. If edge_del_cost is not specified then default edge deletion
  239. cost of 1 is used. If edge_ins_cost is not specified then
  240. default edge insertion cost of 1 is used.
  241. upper_bound : numeric
  242. Maximum edit distance to consider.
  243. Returns
  244. -------
  245. edit_paths : list of tuples (node_edit_path, edge_edit_path)
  246. node_edit_path : list of tuples (u, v)
  247. edge_edit_path : list of tuples ((u1, v1), (u2, v2))
  248. cost : numeric
  249. Optimal edit path cost (graph edit distance).
  250. Examples
  251. --------
  252. >>> G1 = nx.cycle_graph(4)
  253. >>> G2 = nx.wheel_graph(5)
  254. >>> paths, cost = nx.optimal_edit_paths(G1, G2)
  255. >>> len(paths)
  256. 40
  257. >>> cost
  258. 5.0
  259. See Also
  260. --------
  261. graph_edit_distance, optimize_edit_paths
  262. References
  263. ----------
  264. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  265. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  266. Pattern Recognition Problems. 4th International Conference on
  267. Pattern Recognition Applications and Methods 2015, Jan 2015,
  268. Lisbon, Portugal. 2015,
  269. <10.5220/0005209202710278>. <hal-01168816>
  270. https://hal.archives-ouvertes.fr/hal-01168816
  271. """
  272. paths = []
  273. bestcost = None
  274. for vertex_path, edge_path, cost in optimize_edit_paths(
  275. G1,
  276. G2,
  277. node_match,
  278. edge_match,
  279. node_subst_cost,
  280. node_del_cost,
  281. node_ins_cost,
  282. edge_subst_cost,
  283. edge_del_cost,
  284. edge_ins_cost,
  285. upper_bound,
  286. False,
  287. ):
  288. # assert bestcost is None or cost <= bestcost
  289. if bestcost is not None and cost < bestcost:
  290. paths = []
  291. paths.append((vertex_path, edge_path))
  292. bestcost = cost
  293. return paths, bestcost
  294. def optimize_graph_edit_distance(
  295. G1,
  296. G2,
  297. node_match=None,
  298. edge_match=None,
  299. node_subst_cost=None,
  300. node_del_cost=None,
  301. node_ins_cost=None,
  302. edge_subst_cost=None,
  303. edge_del_cost=None,
  304. edge_ins_cost=None,
  305. upper_bound=None,
  306. ):
  307. """Returns consecutive approximations of GED (graph edit distance)
  308. between graphs G1 and G2.
  309. Graph edit distance is a graph similarity measure analogous to
  310. Levenshtein distance for strings. It is defined as minimum cost
  311. of edit path (sequence of node and edge edit operations)
  312. transforming graph G1 to graph isomorphic to G2.
  313. Parameters
  314. ----------
  315. G1, G2: graphs
  316. The two graphs G1 and G2 must be of the same type.
  317. node_match : callable
  318. A function that returns True if node n1 in G1 and n2 in G2
  319. should be considered equal during matching.
  320. The function will be called like
  321. node_match(G1.nodes[n1], G2.nodes[n2]).
  322. That is, the function will receive the node attribute
  323. dictionaries for n1 and n2 as inputs.
  324. Ignored if node_subst_cost is specified. If neither
  325. node_match nor node_subst_cost are specified then node
  326. attributes are not considered.
  327. edge_match : callable
  328. A function that returns True if the edge attribute dictionaries
  329. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  330. be considered equal during matching.
  331. The function will be called like
  332. edge_match(G1[u1][v1], G2[u2][v2]).
  333. That is, the function will receive the edge attribute
  334. dictionaries of the edges under consideration.
  335. Ignored if edge_subst_cost is specified. If neither
  336. edge_match nor edge_subst_cost are specified then edge
  337. attributes are not considered.
  338. node_subst_cost, node_del_cost, node_ins_cost : callable
  339. Functions that return the costs of node substitution, node
  340. deletion, and node insertion, respectively.
  341. The functions will be called like
  342. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  343. node_del_cost(G1.nodes[n1]),
  344. node_ins_cost(G2.nodes[n2]).
  345. That is, the functions will receive the node attribute
  346. dictionaries as inputs. The functions are expected to return
  347. positive numeric values.
  348. Function node_subst_cost overrides node_match if specified.
  349. If neither node_match nor node_subst_cost are specified then
  350. default node substitution cost of 0 is used (node attributes
  351. are not considered during matching).
  352. If node_del_cost is not specified then default node deletion
  353. cost of 1 is used. If node_ins_cost is not specified then
  354. default node insertion cost of 1 is used.
  355. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  356. Functions that return the costs of edge substitution, edge
  357. deletion, and edge insertion, respectively.
  358. The functions will be called like
  359. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  360. edge_del_cost(G1[u1][v1]),
  361. edge_ins_cost(G2[u2][v2]).
  362. That is, the functions will receive the edge attribute
  363. dictionaries as inputs. The functions are expected to return
  364. positive numeric values.
  365. Function edge_subst_cost overrides edge_match if specified.
  366. If neither edge_match nor edge_subst_cost are specified then
  367. default edge substitution cost of 0 is used (edge attributes
  368. are not considered during matching).
  369. If edge_del_cost is not specified then default edge deletion
  370. cost of 1 is used. If edge_ins_cost is not specified then
  371. default edge insertion cost of 1 is used.
  372. upper_bound : numeric
  373. Maximum edit distance to consider.
  374. Returns
  375. -------
  376. Generator of consecutive approximations of graph edit distance.
  377. Examples
  378. --------
  379. >>> G1 = nx.cycle_graph(6)
  380. >>> G2 = nx.wheel_graph(7)
  381. >>> for v in nx.optimize_graph_edit_distance(G1, G2):
  382. ... minv = v
  383. >>> minv
  384. 7.0
  385. See Also
  386. --------
  387. graph_edit_distance, optimize_edit_paths
  388. References
  389. ----------
  390. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  391. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  392. Pattern Recognition Problems. 4th International Conference on
  393. Pattern Recognition Applications and Methods 2015, Jan 2015,
  394. Lisbon, Portugal. 2015,
  395. <10.5220/0005209202710278>. <hal-01168816>
  396. https://hal.archives-ouvertes.fr/hal-01168816
  397. """
  398. for _, _, cost in optimize_edit_paths(
  399. G1,
  400. G2,
  401. node_match,
  402. edge_match,
  403. node_subst_cost,
  404. node_del_cost,
  405. node_ins_cost,
  406. edge_subst_cost,
  407. edge_del_cost,
  408. edge_ins_cost,
  409. upper_bound,
  410. True,
  411. ):
  412. yield cost
  413. def optimize_edit_paths(
  414. G1,
  415. G2,
  416. node_match=None,
  417. edge_match=None,
  418. node_subst_cost=None,
  419. node_del_cost=None,
  420. node_ins_cost=None,
  421. edge_subst_cost=None,
  422. edge_del_cost=None,
  423. edge_ins_cost=None,
  424. upper_bound=None,
  425. strictly_decreasing=True,
  426. roots=None,
  427. timeout=None,
  428. ):
  429. """GED (graph edit distance) calculation: advanced interface.
  430. Graph edit path is a sequence of node and edge edit operations
  431. transforming graph G1 to graph isomorphic to G2. Edit operations
  432. include substitutions, deletions, and insertions.
  433. Graph edit distance is defined as minimum cost of edit path.
  434. Parameters
  435. ----------
  436. G1, G2: graphs
  437. The two graphs G1 and G2 must be of the same type.
  438. node_match : callable
  439. A function that returns True if node n1 in G1 and n2 in G2
  440. should be considered equal during matching.
  441. The function will be called like
  442. node_match(G1.nodes[n1], G2.nodes[n2]).
  443. That is, the function will receive the node attribute
  444. dictionaries for n1 and n2 as inputs.
  445. Ignored if node_subst_cost is specified. If neither
  446. node_match nor node_subst_cost are specified then node
  447. attributes are not considered.
  448. edge_match : callable
  449. A function that returns True if the edge attribute dictionaries
  450. for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
  451. be considered equal during matching.
  452. The function will be called like
  453. edge_match(G1[u1][v1], G2[u2][v2]).
  454. That is, the function will receive the edge attribute
  455. dictionaries of the edges under consideration.
  456. Ignored if edge_subst_cost is specified. If neither
  457. edge_match nor edge_subst_cost are specified then edge
  458. attributes are not considered.
  459. node_subst_cost, node_del_cost, node_ins_cost : callable
  460. Functions that return the costs of node substitution, node
  461. deletion, and node insertion, respectively.
  462. The functions will be called like
  463. node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
  464. node_del_cost(G1.nodes[n1]),
  465. node_ins_cost(G2.nodes[n2]).
  466. That is, the functions will receive the node attribute
  467. dictionaries as inputs. The functions are expected to return
  468. positive numeric values.
  469. Function node_subst_cost overrides node_match if specified.
  470. If neither node_match nor node_subst_cost are specified then
  471. default node substitution cost of 0 is used (node attributes
  472. are not considered during matching).
  473. If node_del_cost is not specified then default node deletion
  474. cost of 1 is used. If node_ins_cost is not specified then
  475. default node insertion cost of 1 is used.
  476. edge_subst_cost, edge_del_cost, edge_ins_cost : callable
  477. Functions that return the costs of edge substitution, edge
  478. deletion, and edge insertion, respectively.
  479. The functions will be called like
  480. edge_subst_cost(G1[u1][v1], G2[u2][v2]),
  481. edge_del_cost(G1[u1][v1]),
  482. edge_ins_cost(G2[u2][v2]).
  483. That is, the functions will receive the edge attribute
  484. dictionaries as inputs. The functions are expected to return
  485. positive numeric values.
  486. Function edge_subst_cost overrides edge_match if specified.
  487. If neither edge_match nor edge_subst_cost are specified then
  488. default edge substitution cost of 0 is used (edge attributes
  489. are not considered during matching).
  490. If edge_del_cost is not specified then default edge deletion
  491. cost of 1 is used. If edge_ins_cost is not specified then
  492. default edge insertion cost of 1 is used.
  493. upper_bound : numeric
  494. Maximum edit distance to consider.
  495. strictly_decreasing : bool
  496. If True, return consecutive approximations of strictly
  497. decreasing cost. Otherwise, return all edit paths of cost
  498. less than or equal to the previous minimum cost.
  499. roots : 2-tuple
  500. Tuple where first element is a node in G1 and the second
  501. is a node in G2.
  502. These nodes are forced to be matched in the comparison to
  503. allow comparison between rooted graphs.
  504. timeout : numeric
  505. Maximum number of seconds to execute.
  506. After timeout is met, the current best GED is returned.
  507. Returns
  508. -------
  509. Generator of tuples (node_edit_path, edge_edit_path, cost)
  510. node_edit_path : list of tuples (u, v)
  511. edge_edit_path : list of tuples ((u1, v1), (u2, v2))
  512. cost : numeric
  513. See Also
  514. --------
  515. graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
  516. References
  517. ----------
  518. .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
  519. Martineau. An Exact Graph Edit Distance Algorithm for Solving
  520. Pattern Recognition Problems. 4th International Conference on
  521. Pattern Recognition Applications and Methods 2015, Jan 2015,
  522. Lisbon, Portugal. 2015,
  523. <10.5220/0005209202710278>. <hal-01168816>
  524. https://hal.archives-ouvertes.fr/hal-01168816
  525. """
  526. # TODO: support DiGraph
  527. import numpy as np
  528. import scipy as sp
  529. import scipy.optimize # call as sp.optimize
  530. @dataclass
  531. class CostMatrix:
  532. C: ...
  533. lsa_row_ind: ...
  534. lsa_col_ind: ...
  535. ls: ...
  536. def make_CostMatrix(C, m, n):
  537. # assert(C.shape == (m + n, m + n))
  538. lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)
  539. # Fixup dummy assignments:
  540. # each substitution i<->j should have dummy assignment m+j<->n+i
  541. # NOTE: fast reduce of Cv relies on it
  542. # assert len(lsa_row_ind) == len(lsa_col_ind)
  543. indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
  544. subst_ind = [k for k, i, j in indexes if i < m and j < n]
  545. indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
  546. dummy_ind = [k for k, i, j in indexes if i >= m and j >= n]
  547. # assert len(subst_ind) == len(dummy_ind)
  548. lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
  549. lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n
  550. return CostMatrix(
  551. C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
  552. )
  553. def extract_C(C, i, j, m, n):
  554. # assert(C.shape == (m + n, m + n))
  555. row_ind = [k in i or k - m in j for k in range(m + n)]
  556. col_ind = [k in j or k - n in i for k in range(m + n)]
  557. return C[row_ind, :][:, col_ind]
  558. def reduce_C(C, i, j, m, n):
  559. # assert(C.shape == (m + n, m + n))
  560. row_ind = [k not in i and k - m not in j for k in range(m + n)]
  561. col_ind = [k not in j and k - n not in i for k in range(m + n)]
  562. return C[row_ind, :][:, col_ind]
  563. def reduce_ind(ind, i):
  564. # assert set(ind) == set(range(len(ind)))
  565. rind = ind[[k not in i for k in ind]]
  566. for k in set(i):
  567. rind[rind >= k] -= 1
  568. return rind
  569. def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=None):
  570. """
  571. Parameters:
  572. u, v: matched vertices, u=None or v=None for
  573. deletion/insertion
  574. pending_g, pending_h: lists of edges not yet mapped
  575. Ce: CostMatrix of pending edge mappings
  576. matched_uv: partial vertex edit path
  577. list of tuples (u, v) of previously matched vertex
  578. mappings u<->v, u=None or v=None for
  579. deletion/insertion
  580. Returns:
  581. list of (i, j): indices of edge mappings g<->h
  582. localCe: local CostMatrix of edge mappings
  583. (basically submatrix of Ce at cross of rows i, cols j)
  584. """
  585. M = len(pending_g)
  586. N = len(pending_h)
  587. # assert Ce.C.shape == (M + N, M + N)
  588. # only attempt to match edges after one node match has been made
  589. # this will stop self-edges on the first node being automatically deleted
  590. # even when a substitution is the better option
  591. if matched_uv is None or len(matched_uv) == 0:
  592. g_ind = []
  593. h_ind = []
  594. else:
  595. g_ind = [
  596. i
  597. for i in range(M)
  598. if pending_g[i][:2] == (u, u)
  599. or any(
  600. pending_g[i][:2] in ((p, u), (u, p), (p, p)) for p, q in matched_uv
  601. )
  602. ]
  603. h_ind = [
  604. j
  605. for j in range(N)
  606. if pending_h[j][:2] == (v, v)
  607. or any(
  608. pending_h[j][:2] in ((q, v), (v, q), (q, q)) for p, q in matched_uv
  609. )
  610. ]
  611. m = len(g_ind)
  612. n = len(h_ind)
  613. if m or n:
  614. C = extract_C(Ce.C, g_ind, h_ind, M, N)
  615. # assert C.shape == (m + n, m + n)
  616. # Forbid structurally invalid matches
  617. # NOTE: inf remembered from Ce construction
  618. for k, i in enumerate(g_ind):
  619. g = pending_g[i][:2]
  620. for l, j in enumerate(h_ind):
  621. h = pending_h[j][:2]
  622. if nx.is_directed(G1) or nx.is_directed(G2):
  623. if any(
  624. g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
  625. for p, q in matched_uv
  626. ):
  627. continue
  628. else:
  629. if any(
  630. g in ((p, u), (u, p)) and h in ((q, v), (v, q))
  631. for p, q in matched_uv
  632. ):
  633. continue
  634. if g == (u, u) or any(g == (p, p) for p, q in matched_uv):
  635. continue
  636. if h == (v, v) or any(h == (q, q) for p, q in matched_uv):
  637. continue
  638. C[k, l] = inf
  639. localCe = make_CostMatrix(C, m, n)
  640. ij = [
  641. (
  642. g_ind[k] if k < m else M + h_ind[l],
  643. h_ind[l] if l < n else N + g_ind[k],
  644. )
  645. for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
  646. if k < m or l < n
  647. ]
  648. else:
  649. ij = []
  650. localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
  651. return ij, localCe
  652. def reduce_Ce(Ce, ij, m, n):
  653. if len(ij):
  654. i, j = zip(*ij)
  655. m_i = m - sum(1 for t in i if t < m)
  656. n_j = n - sum(1 for t in j if t < n)
  657. return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
  658. return Ce
  659. def get_edit_ops(
  660. matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
  661. ):
  662. """
  663. Parameters:
  664. matched_uv: partial vertex edit path
  665. list of tuples (u, v) of vertex mappings u<->v,
  666. u=None or v=None for deletion/insertion
  667. pending_u, pending_v: lists of vertices not yet mapped
  668. Cv: CostMatrix of pending vertex mappings
  669. pending_g, pending_h: lists of edges not yet mapped
  670. Ce: CostMatrix of pending edge mappings
  671. matched_cost: cost of partial edit path
  672. Returns:
  673. sequence of
  674. (i, j): indices of vertex mapping u<->v
  675. Cv_ij: reduced CostMatrix of pending vertex mappings
  676. (basically Cv with row i, col j removed)
  677. list of (x, y): indices of edge mappings g<->h
  678. Ce_xy: reduced CostMatrix of pending edge mappings
  679. (basically Ce with rows x, cols y removed)
  680. cost: total cost of edit operation
  681. NOTE: most promising ops first
  682. """
  683. m = len(pending_u)
  684. n = len(pending_v)
  685. # assert Cv.C.shape == (m + n, m + n)
  686. # 1) a vertex mapping from optimal linear sum assignment
  687. i, j = min(
  688. (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
  689. )
  690. xy, localCe = match_edges(
  691. pending_u[i] if i < m else None,
  692. pending_v[j] if j < n else None,
  693. pending_g,
  694. pending_h,
  695. Ce,
  696. matched_uv,
  697. )
  698. Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
  699. # assert Ce.ls <= localCe.ls + Ce_xy.ls
  700. if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
  701. pass
  702. else:
  703. # get reduced Cv efficiently
  704. Cv_ij = CostMatrix(
  705. reduce_C(Cv.C, (i,), (j,), m, n),
  706. reduce_ind(Cv.lsa_row_ind, (i, m + j)),
  707. reduce_ind(Cv.lsa_col_ind, (j, n + i)),
  708. Cv.ls - Cv.C[i, j],
  709. )
  710. yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
  711. # 2) other candidates, sorted by lower-bound cost estimate
  712. other = []
  713. fixed_i, fixed_j = i, j
  714. if m <= n:
  715. candidates = (
  716. (t, fixed_j)
  717. for t in range(m + n)
  718. if t != fixed_i and (t < m or t == m + fixed_j)
  719. )
  720. else:
  721. candidates = (
  722. (fixed_i, t)
  723. for t in range(m + n)
  724. if t != fixed_j and (t < n or t == n + fixed_i)
  725. )
  726. for i, j in candidates:
  727. if prune(matched_cost + Cv.C[i, j] + Ce.ls):
  728. continue
  729. Cv_ij = make_CostMatrix(
  730. reduce_C(Cv.C, (i,), (j,), m, n),
  731. m - 1 if i < m else m,
  732. n - 1 if j < n else n,
  733. )
  734. # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
  735. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
  736. continue
  737. xy, localCe = match_edges(
  738. pending_u[i] if i < m else None,
  739. pending_v[j] if j < n else None,
  740. pending_g,
  741. pending_h,
  742. Ce,
  743. matched_uv,
  744. )
  745. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
  746. continue
  747. Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
  748. # assert Ce.ls <= localCe.ls + Ce_xy.ls
  749. if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
  750. continue
  751. other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
  752. yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)
  753. def get_edit_paths(
  754. matched_uv,
  755. pending_u,
  756. pending_v,
  757. Cv,
  758. matched_gh,
  759. pending_g,
  760. pending_h,
  761. Ce,
  762. matched_cost,
  763. ):
  764. """
  765. Parameters:
  766. matched_uv: partial vertex edit path
  767. list of tuples (u, v) of vertex mappings u<->v,
  768. u=None or v=None for deletion/insertion
  769. pending_u, pending_v: lists of vertices not yet mapped
  770. Cv: CostMatrix of pending vertex mappings
  771. matched_gh: partial edge edit path
  772. list of tuples (g, h) of edge mappings g<->h,
  773. g=None or h=None for deletion/insertion
  774. pending_g, pending_h: lists of edges not yet mapped
  775. Ce: CostMatrix of pending edge mappings
  776. matched_cost: cost of partial edit path
  777. Returns:
  778. sequence of (vertex_path, edge_path, cost)
  779. vertex_path: complete vertex edit path
  780. list of tuples (u, v) of vertex mappings u<->v,
  781. u=None or v=None for deletion/insertion
  782. edge_path: complete edge edit path
  783. list of tuples (g, h) of edge mappings g<->h,
  784. g=None or h=None for deletion/insertion
  785. cost: total cost of edit path
  786. NOTE: path costs are non-increasing
  787. """
  788. # debug_print('matched-uv:', matched_uv)
  789. # debug_print('matched-gh:', matched_gh)
  790. # debug_print('matched-cost:', matched_cost)
  791. # debug_print('pending-u:', pending_u)
  792. # debug_print('pending-v:', pending_v)
  793. # debug_print(Cv.C)
  794. # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
  795. # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
  796. # debug_print('pending-g:', pending_g)
  797. # debug_print('pending-h:', pending_h)
  798. # debug_print(Ce.C)
  799. # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
  800. # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
  801. # debug_print()
  802. if prune(matched_cost + Cv.ls + Ce.ls):
  803. return
  804. if not max(len(pending_u), len(pending_v)):
  805. # assert not len(pending_g)
  806. # assert not len(pending_h)
  807. # path completed!
  808. # assert matched_cost <= maxcost_value
  809. nonlocal maxcost_value
  810. maxcost_value = min(maxcost_value, matched_cost)
  811. yield matched_uv, matched_gh, matched_cost
  812. else:
  813. edit_ops = get_edit_ops(
  814. matched_uv,
  815. pending_u,
  816. pending_v,
  817. Cv,
  818. pending_g,
  819. pending_h,
  820. Ce,
  821. matched_cost,
  822. )
  823. for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
  824. i, j = ij
  825. # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
  826. if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
  827. continue
  828. # dive deeper
  829. u = pending_u.pop(i) if i < len(pending_u) else None
  830. v = pending_v.pop(j) if j < len(pending_v) else None
  831. matched_uv.append((u, v))
  832. for x, y in xy:
  833. len_g = len(pending_g)
  834. len_h = len(pending_h)
  835. matched_gh.append(
  836. (
  837. pending_g[x] if x < len_g else None,
  838. pending_h[y] if y < len_h else None,
  839. )
  840. )
  841. sortedx = sorted(x for x, y in xy)
  842. sortedy = sorted(y for x, y in xy)
  843. G = [
  844. (pending_g.pop(x) if x < len(pending_g) else None)
  845. for x in reversed(sortedx)
  846. ]
  847. H = [
  848. (pending_h.pop(y) if y < len(pending_h) else None)
  849. for y in reversed(sortedy)
  850. ]
  851. yield from get_edit_paths(
  852. matched_uv,
  853. pending_u,
  854. pending_v,
  855. Cv_ij,
  856. matched_gh,
  857. pending_g,
  858. pending_h,
  859. Ce_xy,
  860. matched_cost + edit_cost,
  861. )
  862. # backtrack
  863. if u is not None:
  864. pending_u.insert(i, u)
  865. if v is not None:
  866. pending_v.insert(j, v)
  867. matched_uv.pop()
  868. for x, g in zip(sortedx, reversed(G)):
  869. if g is not None:
  870. pending_g.insert(x, g)
  871. for y, h in zip(sortedy, reversed(H)):
  872. if h is not None:
  873. pending_h.insert(y, h)
  874. for _ in xy:
  875. matched_gh.pop()
  876. # Initialization
  877. pending_u = list(G1.nodes)
  878. pending_v = list(G2.nodes)
  879. initial_cost = 0
  880. if roots:
  881. root_u, root_v = roots
  882. if root_u not in pending_u or root_v not in pending_v:
  883. raise nx.NodeNotFound("Root node not in graph.")
  884. # remove roots from pending
  885. pending_u.remove(root_u)
  886. pending_v.remove(root_v)
  887. # cost matrix of vertex mappings
  888. m = len(pending_u)
  889. n = len(pending_v)
  890. C = np.zeros((m + n, m + n))
  891. if node_subst_cost:
  892. C[0:m, 0:n] = np.array(
  893. [
  894. node_subst_cost(G1.nodes[u], G2.nodes[v])
  895. for u in pending_u
  896. for v in pending_v
  897. ]
  898. ).reshape(m, n)
  899. if roots:
  900. initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
  901. elif node_match:
  902. C[0:m, 0:n] = np.array(
  903. [
  904. 1 - int(node_match(G1.nodes[u], G2.nodes[v]))
  905. for u in pending_u
  906. for v in pending_v
  907. ]
  908. ).reshape(m, n)
  909. if roots:
  910. initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
  911. else:
  912. # all zeroes
  913. pass
  914. # assert not min(m, n) or C[0:m, 0:n].min() >= 0
  915. if node_del_cost:
  916. del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
  917. else:
  918. del_costs = [1] * len(pending_u)
  919. # assert not m or min(del_costs) >= 0
  920. if node_ins_cost:
  921. ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
  922. else:
  923. ins_costs = [1] * len(pending_v)
  924. # assert not n or min(ins_costs) >= 0
  925. inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
  926. C[0:m, n : n + m] = np.array(
  927. [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
  928. ).reshape(m, m)
  929. C[m : m + n, 0:n] = np.array(
  930. [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
  931. ).reshape(n, n)
  932. Cv = make_CostMatrix(C, m, n)
  933. # debug_print(f"Cv: {m} x {n}")
  934. # debug_print(Cv.C)
  935. pending_g = list(G1.edges)
  936. pending_h = list(G2.edges)
  937. # cost matrix of edge mappings
  938. m = len(pending_g)
  939. n = len(pending_h)
  940. C = np.zeros((m + n, m + n))
  941. if edge_subst_cost:
  942. C[0:m, 0:n] = np.array(
  943. [
  944. edge_subst_cost(G1.edges[g], G2.edges[h])
  945. for g in pending_g
  946. for h in pending_h
  947. ]
  948. ).reshape(m, n)
  949. elif edge_match:
  950. C[0:m, 0:n] = np.array(
  951. [
  952. 1 - int(edge_match(G1.edges[g], G2.edges[h]))
  953. for g in pending_g
  954. for h in pending_h
  955. ]
  956. ).reshape(m, n)
  957. else:
  958. # all zeroes
  959. pass
  960. # assert not min(m, n) or C[0:m, 0:n].min() >= 0
  961. if edge_del_cost:
  962. del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
  963. else:
  964. del_costs = [1] * len(pending_g)
  965. # assert not m or min(del_costs) >= 0
  966. if edge_ins_cost:
  967. ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
  968. else:
  969. ins_costs = [1] * len(pending_h)
  970. # assert not n or min(ins_costs) >= 0
  971. inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
  972. C[0:m, n : n + m] = np.array(
  973. [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
  974. ).reshape(m, m)
  975. C[m : m + n, 0:n] = np.array(
  976. [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
  977. ).reshape(n, n)
  978. Ce = make_CostMatrix(C, m, n)
  979. # debug_print(f'Ce: {m} x {n}')
  980. # debug_print(Ce.C)
  981. # debug_print()
  982. maxcost_value = Cv.C.sum() + Ce.C.sum() + 1
  983. if timeout is not None:
  984. if timeout <= 0:
  985. raise nx.NetworkXError("Timeout value must be greater than 0")
  986. start = time.perf_counter()
  987. def prune(cost):
  988. if timeout is not None:
  989. if time.perf_counter() - start > timeout:
  990. return True
  991. if upper_bound is not None:
  992. if cost > upper_bound:
  993. return True
  994. if cost > maxcost_value:
  995. return True
  996. if strictly_decreasing and cost >= maxcost_value:
  997. return True
  998. return False
  999. # Now go!
  1000. done_uv = [] if roots is None else [roots]
  1001. for vertex_path, edge_path, cost in get_edit_paths(
  1002. done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
  1003. ):
  1004. # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
  1005. # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
  1006. # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
  1007. # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
  1008. # print(vertex_path, edge_path, cost, file = sys.stderr)
  1009. # assert cost == maxcost_value
  1010. yield list(vertex_path), list(edge_path), cost
  1011. def simrank_similarity(
  1012. G,
  1013. source=None,
  1014. target=None,
  1015. importance_factor=0.9,
  1016. max_iterations=1000,
  1017. tolerance=1e-4,
  1018. ):
  1019. """Returns the SimRank similarity of nodes in the graph ``G``.
  1020. SimRank is a similarity metric that says "two objects are considered
  1021. to be similar if they are referenced by similar objects." [1]_.
  1022. The pseudo-code definition from the paper is::
  1023. def simrank(G, u, v):
  1024. in_neighbors_u = G.predecessors(u)
  1025. in_neighbors_v = G.predecessors(v)
  1026. scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
  1027. return scale * sum(simrank(G, w, x)
  1028. for w, x in product(in_neighbors_u,
  1029. in_neighbors_v))
  1030. where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
  1031. and ``C`` is a float decay or importance factor between 0 and 1.
  1032. The SimRank algorithm for determining node similarity is defined in
  1033. [2]_.
  1034. Parameters
  1035. ----------
  1036. G : NetworkX graph
  1037. A NetworkX graph
  1038. source : node
  1039. If this is specified, the returned dictionary maps each node
  1040. ``v`` in the graph to the similarity between ``source`` and
  1041. ``v``.
  1042. target : node
  1043. If both ``source`` and ``target`` are specified, the similarity
  1044. value between ``source`` and ``target`` is returned. If
  1045. ``target`` is specified but ``source`` is not, this argument is
  1046. ignored.
  1047. importance_factor : float
  1048. The relative importance of indirect neighbors with respect to
  1049. direct neighbors.
  1050. max_iterations : integer
  1051. Maximum number of iterations.
  1052. tolerance : float
  1053. Error tolerance used to check convergence. When an iteration of
  1054. the algorithm finds that no similarity value changes more than
  1055. this amount, the algorithm halts.
  1056. Returns
  1057. -------
  1058. similarity : dictionary or float
  1059. If ``source`` and ``target`` are both ``None``, this returns a
  1060. dictionary of dictionaries, where keys are node pairs and value
  1061. are similarity of the pair of nodes.
  1062. If ``source`` is not ``None`` but ``target`` is, this returns a
  1063. dictionary mapping node to the similarity of ``source`` and that
  1064. node.
  1065. If neither ``source`` nor ``target`` is ``None``, this returns
  1066. the similarity value for the given pair of nodes.
  1067. Examples
  1068. --------
  1069. >>> G = nx.cycle_graph(2)
  1070. >>> nx.simrank_similarity(G)
  1071. {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
  1072. >>> nx.simrank_similarity(G, source=0)
  1073. {0: 1.0, 1: 0.0}
  1074. >>> nx.simrank_similarity(G, source=0, target=0)
  1075. 1.0
  1076. The result of this function can be converted to a numpy array
  1077. representing the SimRank matrix by using the node order of the
  1078. graph to determine which row and column represent each node.
  1079. Other ordering of nodes is also possible.
  1080. >>> import numpy as np
  1081. >>> sim = nx.simrank_similarity(G)
  1082. >>> np.array([[sim[u][v] for v in G] for u in G])
  1083. array([[1., 0.],
  1084. [0., 1.]])
  1085. >>> sim_1d = nx.simrank_similarity(G, source=0)
  1086. >>> np.array([sim[0][v] for v in G])
  1087. array([1., 0.])
  1088. References
  1089. ----------
  1090. .. [1] https://en.wikipedia.org/wiki/SimRank
  1091. .. [2] G. Jeh and J. Widom.
  1092. "SimRank: a measure of structural-context similarity",
  1093. In KDD'02: Proceedings of the Eighth ACM SIGKDD
  1094. International Conference on Knowledge Discovery and Data Mining,
  1095. pp. 538--543. ACM Press, 2002.
  1096. """
  1097. import numpy as np
  1098. nodelist = list(G)
  1099. s_indx = None if source is None else nodelist.index(source)
  1100. t_indx = None if target is None else nodelist.index(target)
  1101. x = _simrank_similarity_numpy(
  1102. G, s_indx, t_indx, importance_factor, max_iterations, tolerance
  1103. )
  1104. if isinstance(x, np.ndarray):
  1105. if x.ndim == 1:
  1106. return dict(zip(G, x))
  1107. # else x.ndim == 2
  1108. return {u: dict(zip(G, row)) for u, row in zip(G, x)}
  1109. return x
  1110. def _simrank_similarity_python(
  1111. G,
  1112. source=None,
  1113. target=None,
  1114. importance_factor=0.9,
  1115. max_iterations=1000,
  1116. tolerance=1e-4,
  1117. ):
  1118. """Returns the SimRank similarity of nodes in the graph ``G``.
  1119. This pure Python version is provided for pedagogical purposes.
  1120. Examples
  1121. --------
  1122. >>> G = nx.cycle_graph(2)
  1123. >>> nx.similarity._simrank_similarity_python(G)
  1124. {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
  1125. >>> nx.similarity._simrank_similarity_python(G, source=0)
  1126. {0: 1, 1: 0.0}
  1127. >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
  1128. 1
  1129. """
  1130. # build up our similarity adjacency dictionary output
  1131. newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
  1132. # These functions compute the update to the similarity value of the nodes
  1133. # `u` and `v` with respect to the previous similarity values.
  1134. def avg_sim(s):
  1135. return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0
  1136. Gadj = G.pred if G.is_directed() else G.adj
  1137. def sim(u, v):
  1138. return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))
  1139. for its in range(max_iterations):
  1140. oldsim = newsim
  1141. newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
  1142. is_close = all(
  1143. all(
  1144. abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
  1145. for v, old in nbrs.items()
  1146. )
  1147. for u, nbrs in oldsim.items()
  1148. )
  1149. if is_close:
  1150. break
  1151. if its + 1 == max_iterations:
  1152. raise nx.ExceededMaxIterations(
  1153. f"simrank did not converge after {max_iterations} iterations."
  1154. )
  1155. if source is not None and target is not None:
  1156. return newsim[source][target]
  1157. if source is not None:
  1158. return newsim[source]
  1159. return newsim
  1160. def _simrank_similarity_numpy(
  1161. G,
  1162. source=None,
  1163. target=None,
  1164. importance_factor=0.9,
  1165. max_iterations=1000,
  1166. tolerance=1e-4,
  1167. ):
  1168. """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
  1169. The SimRank algorithm for determining node similarity is defined in
  1170. [1]_.
  1171. Parameters
  1172. ----------
  1173. G : NetworkX graph
  1174. A NetworkX graph
  1175. source : node
  1176. If this is specified, the returned dictionary maps each node
  1177. ``v`` in the graph to the similarity between ``source`` and
  1178. ``v``.
  1179. target : node
  1180. If both ``source`` and ``target`` are specified, the similarity
  1181. value between ``source`` and ``target`` is returned. If
  1182. ``target`` is specified but ``source`` is not, this argument is
  1183. ignored.
  1184. importance_factor : float
  1185. The relative importance of indirect neighbors with respect to
  1186. direct neighbors.
  1187. max_iterations : integer
  1188. Maximum number of iterations.
  1189. tolerance : float
  1190. Error tolerance used to check convergence. When an iteration of
  1191. the algorithm finds that no similarity value changes more than
  1192. this amount, the algorithm halts.
  1193. Returns
  1194. -------
  1195. similarity : numpy array or float
  1196. If ``source`` and ``target`` are both ``None``, this returns a
  1197. 2D array containing SimRank scores of the nodes.
  1198. If ``source`` is not ``None`` but ``target`` is, this returns an
  1199. 1D array containing SimRank scores of ``source`` and that
  1200. node.
  1201. If neither ``source`` nor ``target`` is ``None``, this returns
  1202. the similarity value for the given pair of nodes.
  1203. Examples
  1204. --------
  1205. >>> G = nx.cycle_graph(2)
  1206. >>> nx.similarity._simrank_similarity_numpy(G)
  1207. array([[1., 0.],
  1208. [0., 1.]])
  1209. >>> nx.similarity._simrank_similarity_numpy(G, source=0)
  1210. array([1., 0.])
  1211. >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
  1212. 1.0
  1213. References
  1214. ----------
  1215. .. [1] G. Jeh and J. Widom.
  1216. "SimRank: a measure of structural-context similarity",
  1217. In KDD'02: Proceedings of the Eighth ACM SIGKDD
  1218. International Conference on Knowledge Discovery and Data Mining,
  1219. pp. 538--543. ACM Press, 2002.
  1220. """
  1221. # This algorithm follows roughly
  1222. #
  1223. # S = max{C * (A.T * S * A), I}
  1224. #
  1225. # where C is the importance factor, A is the column normalized
  1226. # adjacency matrix, and I is the identity matrix.
  1227. import numpy as np
  1228. adjacency_matrix = nx.to_numpy_array(G)
  1229. # column-normalize the ``adjacency_matrix``
  1230. s = np.array(adjacency_matrix.sum(axis=0))
  1231. s[s == 0] = 1
  1232. adjacency_matrix /= s # adjacency_matrix.sum(axis=0)
  1233. newsim = np.eye(len(G), dtype=np.float64)
  1234. for its in range(max_iterations):
  1235. prevsim = newsim.copy()
  1236. newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
  1237. np.fill_diagonal(newsim, 1.0)
  1238. if np.allclose(prevsim, newsim, atol=tolerance):
  1239. break
  1240. if its + 1 == max_iterations:
  1241. raise nx.ExceededMaxIterations(
  1242. f"simrank did not converge after {max_iterations} iterations."
  1243. )
  1244. if source is not None and target is not None:
  1245. return newsim[source, target]
  1246. if source is not None:
  1247. return newsim[source]
  1248. return newsim
  1249. def panther_similarity(G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None):
  1250. r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``.
  1251. Panther is a similarity metric that says "two objects are considered
  1252. to be similar if they frequently appear on the same paths." [1]_.
  1253. Parameters
  1254. ----------
  1255. G : NetworkX graph
  1256. A NetworkX graph
  1257. source : node
  1258. Source node for which to find the top `k` similar other nodes
  1259. k : int (default = 5)
  1260. The number of most similar nodes to return
  1261. path_length : int (default = 5)
  1262. How long the randomly generated paths should be (``T`` in [1]_)
  1263. c : float (default = 0.5)
  1264. A universal positive constant used to scale the number
  1265. of sample random paths to generate.
  1266. delta : float (default = 0.1)
  1267. The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
  1268. where $R$ is the number of random paths and $\phi$ is the probability
  1269. that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
  1270. eps : float or None (default = None)
  1271. The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore,
  1272. if no value is provided, the recommended computed value will be used.
  1273. Returns
  1274. -------
  1275. similarity : dictionary
  1276. Dictionary of nodes to similarity scores (as floats). Note:
  1277. the self-similarity (i.e., ``v``) will not be included in
  1278. the returned dictionary.
  1279. Examples
  1280. --------
  1281. >>> G = nx.star_graph(10)
  1282. >>> sim = nx.panther_similarity(G, 0)
  1283. References
  1284. ----------
  1285. .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
  1286. Panther: Fast top-k similarity search on large networks.
  1287. In Proceedings of the ACM SIGKDD International Conference
  1288. on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
  1289. Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
  1290. """
  1291. import numpy as np
  1292. num_nodes = G.number_of_nodes()
  1293. if num_nodes < k:
  1294. warnings.warn(
  1295. f"Number of nodes is {num_nodes}, but requested k is {k}. "
  1296. "Setting k to number of nodes."
  1297. )
  1298. k = num_nodes
  1299. # According to [1], they empirically determined
  1300. # a good value for ``eps`` to be sqrt( 1 / |E| )
  1301. if eps is None:
  1302. eps = np.sqrt(1.0 / G.number_of_edges())
  1303. inv_node_map = {name: index for index, name in enumerate(G.nodes)}
  1304. node_map = np.array(G)
  1305. # Calculate the sample size ``R`` for how many paths
  1306. # to randomly generate
  1307. t_choose_2 = math.comb(path_length, 2)
  1308. sample_size = int((c / eps**2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
  1309. index_map = {}
  1310. _ = list(
  1311. generate_random_paths(
  1312. G, sample_size, path_length=path_length, index_map=index_map
  1313. )
  1314. )
  1315. S = np.zeros(num_nodes)
  1316. inv_sample_size = 1 / sample_size
  1317. source_paths = set(index_map[source])
  1318. # Calculate the path similarities
  1319. # between ``source`` (v) and ``node`` (v_j)
  1320. # using our inverted index mapping of
  1321. # vertices to paths
  1322. for node, paths in index_map.items():
  1323. # Only consider paths where both
  1324. # ``node`` and ``source`` are present
  1325. common_paths = source_paths.intersection(paths)
  1326. S[inv_node_map[node]] = len(common_paths) * inv_sample_size
  1327. # Retrieve top ``k`` similar
  1328. # Note: the below performed anywhere from 4-10x faster
  1329. # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
  1330. top_k_unsorted = np.argpartition(S, -k)[-k:]
  1331. top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]
  1332. # Add back the similarity scores
  1333. top_k_sorted_names = (node_map[n] for n in top_k_sorted)
  1334. top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted]))
  1335. # Remove the self-similarity
  1336. top_k_with_val.pop(source, None)
  1337. return top_k_with_val
  1338. def generate_random_paths(G, sample_size, path_length=5, index_map=None):
  1339. """Randomly generate `sample_size` paths of length `path_length`.
  1340. Parameters
  1341. ----------
  1342. G : NetworkX graph
  1343. A NetworkX graph
  1344. sample_size : integer
  1345. The number of paths to generate. This is ``R`` in [1]_.
  1346. path_length : integer (default = 5)
  1347. The maximum size of the path to randomly generate.
  1348. This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
  1349. recommended.
  1350. index_map : dictionary, optional
  1351. If provided, this will be populated with the inverted
  1352. index of nodes mapped to the set of generated random path
  1353. indices within ``paths``.
  1354. Returns
  1355. -------
  1356. paths : generator of lists
  1357. Generator of `sample_size` paths each with length `path_length`.
  1358. Examples
  1359. --------
  1360. Note that the return value is the list of paths:
  1361. >>> G = nx.star_graph(3)
  1362. >>> random_path = nx.generate_random_paths(G, 2)
  1363. By passing a dictionary into `index_map`, it will build an
  1364. inverted index mapping of nodes to the paths in which that node is present:
  1365. >>> G = nx.star_graph(3)
  1366. >>> index_map = {}
  1367. >>> random_path = nx.generate_random_paths(G, 3, index_map=index_map)
  1368. >>> paths_containing_node_0 = [random_path[path_idx] for path_idx in index_map.get(0, [])]
  1369. References
  1370. ----------
  1371. .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
  1372. Panther: Fast top-k similarity search on large networks.
  1373. In Proceedings of the ACM SIGKDD International Conference
  1374. on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
  1375. Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
  1376. """
  1377. import numpy as np
  1378. # Calculate transition probabilities between
  1379. # every pair of vertices according to Eq. (3)
  1380. adj_mat = nx.to_numpy_array(G)
  1381. inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1)
  1382. transition_probabilities = adj_mat * inv_row_sums
  1383. node_map = np.array(G)
  1384. num_nodes = G.number_of_nodes()
  1385. for path_index in range(sample_size):
  1386. # Sample current vertex v = v_i uniformly at random
  1387. node_index = np.random.randint(0, high=num_nodes)
  1388. node = node_map[node_index]
  1389. # Add v into p_r and add p_r into the path set
  1390. # of v, i.e., P_v
  1391. path = [node]
  1392. # Build the inverted index (P_v) of vertices to paths
  1393. if index_map is not None:
  1394. if node in index_map:
  1395. index_map[node].add(path_index)
  1396. else:
  1397. index_map[node] = {path_index}
  1398. starting_index = node_index
  1399. for _ in range(path_length):
  1400. # Randomly sample a neighbor (v_j) according
  1401. # to transition probabilities from ``node`` (v) to its neighbors
  1402. neighbor_index = np.random.choice(
  1403. num_nodes, p=transition_probabilities[starting_index]
  1404. )
  1405. # Set current vertex (v = v_j)
  1406. starting_index = neighbor_index
  1407. # Add v into p_r
  1408. neighbor_node = node_map[neighbor_index]
  1409. path.append(neighbor_node)
  1410. # Add p_r into P_v
  1411. if index_map is not None:
  1412. if neighbor_node in index_map:
  1413. index_map[neighbor_node].add(path_index)
  1414. else:
  1415. index_map[neighbor_node] = {path_index}
  1416. yield path