12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920 |
- # Copyright Anne M. Archibald 2008
- # Released under the scipy license
- import numpy as np
- from ._ckdtree import cKDTree, cKDTreeNode
- __all__ = ['minkowski_distance_p', 'minkowski_distance',
- 'distance_matrix',
- 'Rectangle', 'KDTree']
def minkowski_distance_p(x, y, p=2):
    """Compute the pth power of the L**p distance between two arrays.

    For efficiency, this function computes the L**p distance but does
    not extract the pth root. If `p` is 1 or infinity, this is equal to
    the actual L**p distance.

    The last dimensions of `x` and `y` must be the same length. Any
    other dimensions must be compatible for broadcasting.

    Parameters
    ----------
    x : (..., K) array_like
        Input array.
    y : (..., K) array_like
        Input array.
    p : float, 1 <= p <= infinity
        Which Minkowski p-norm to use.

    Returns
    -------
    dist : ndarray
        pth power of the distance between the input arrays.

    Examples
    --------
    >>> from scipy.spatial import minkowski_distance_p
    >>> minkowski_distance_p([[0, 0], [0, 0]], [[1, 1], [0, 1]])
    array([2., 1.])
    """
    x = np.asarray(x)
    y = np.asarray(y)

    # Find smallest common datatype with float64 (return type of this
    # function) - addresses #10262.
    # Don't just cast to float64 for complex input case.
    common_datatype = np.promote_types(np.promote_types(x.dtype, y.dtype),
                                       'float64')

    # Cast to the common dtype; copy=False skips the copy when the input
    # already has the target dtype (np.asarray above never copies arrays).
    x = x.astype(common_datatype, copy=False)
    y = y.astype(common_datatype, copy=False)

    if p == np.inf:
        return np.amax(np.abs(y - x), axis=-1)
    elif p == 1:
        return np.sum(np.abs(y - x), axis=-1)
    else:
        return np.sum(np.abs(y - x)**p, axis=-1)
def minkowski_distance(x, y, p=2):
    """Compute the L**p distance between two arrays.

    The last dimensions of `x` and `y` must be the same length. Any
    other dimensions must be compatible for broadcasting.

    Parameters
    ----------
    x : (..., K) array_like
        Input array.
    y : (..., K) array_like
        Input array.
    p : float, 1 <= p <= infinity
        Which Minkowski p-norm to use.

    Returns
    -------
    dist : ndarray
        Distance between the input arrays.

    Examples
    --------
    >>> from scipy.spatial import minkowski_distance
    >>> minkowski_distance([[0, 0], [0, 0]], [[1, 1], [0, 1]])
    array([ 1.41421356,  1.        ])
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # For p = 1 and p = infinity the pth-power distance already equals
    # the true distance, so no root extraction is needed.
    if p in (1, np.inf):
        return minkowski_distance_p(x, y, p)
    return minkowski_distance_p(x, y, p)**(1./p)
class Rectangle:
    """Hyperrectangle class.

    Represents a Cartesian product of intervals.
    """

    def __init__(self, maxes, mins):
        """Construct a hyperrectangle from two opposite corners."""
        # Normalize so mins <= maxes componentwise regardless of the
        # order in which the two corners were supplied.
        self.maxes = np.maximum(maxes, mins).astype(float)
        self.mins = np.minimum(maxes, mins).astype(float)
        self.m, = self.maxes.shape

    def __repr__(self):
        intervals = list(zip(self.mins, self.maxes))
        return "<Rectangle %s>" % intervals

    def volume(self):
        """Total volume."""
        return np.prod(self.maxes - self.mins)

    def split(self, d, split):
        """Produce two hyperrectangles by splitting.

        In general, if you need to compute maximum and minimum
        distances to the children, it can be done more efficiently
        by updating the maximum and minimum distances to the parent.

        Parameters
        ----------
        d : int
            Axis to split hyperrectangle along.
        split : float
            Position along axis `d` to split at.
        """
        upper = np.copy(self.maxes)
        upper[d] = split
        lower = np.copy(self.mins)
        lower[d] = split
        return Rectangle(self.mins, upper), Rectangle(lower, self.maxes)

    def min_distance_point(self, x, p=2.):
        """
        Return the minimum distance between input and points in the
        hyperrectangle.

        Parameters
        ----------
        x : array_like
            Input.
        p : float, optional
            Input.
        """
        # Componentwise gap between x and the rectangle; zero where x
        # lies inside the interval along that axis.
        gap = np.maximum(0, np.maximum(self.mins - x, x - self.maxes))
        return minkowski_distance(0, gap, p)

    def max_distance_point(self, x, p=2.):
        """
        Return the maximum distance between input and points in the
        hyperrectangle.

        Parameters
        ----------
        x : array_like
            Input array.
        p : float, optional
            Input.
        """
        # Farthest corner is the one maximizing the per-axis separation.
        span = np.maximum(self.maxes - x, x - self.mins)
        return minkowski_distance(0, span, p)

    def min_distance_rectangle(self, other, p=2.):
        """
        Compute the minimum distance between points in the two
        hyperrectangles.

        Parameters
        ----------
        other : hyperrectangle
            Input.
        p : float
            Input.
        """
        gap = np.maximum(0, np.maximum(self.mins - other.maxes,
                                       other.mins - self.maxes))
        return minkowski_distance(0, gap, p)

    def max_distance_rectangle(self, other, p=2.):
        """
        Compute the maximum distance between points in the two
        hyperrectangles.

        Parameters
        ----------
        other : hyperrectangle
            Input.
        p : float, optional
            Input.
        """
        span = np.maximum(self.maxes - other.mins, other.maxes - self.mins)
        return minkowski_distance(0, span, p)
class KDTree(cKDTree):
    """kd-tree for quick nearest-neighbor lookup.

    This class provides an index into a set of k-dimensional points
    which can be used to rapidly look up the nearest neighbors of any
    point.

    Parameters
    ----------
    data : array_like, shape (n,m)
        The n data points of dimension m to be indexed. This array is
        not copied unless this is necessary to produce a contiguous
        array of doubles, and so modifying this data will result in
        bogus results. The data are also copied if the kd-tree is built
        with copy_data=True.
    leafsize : positive int, optional
        The number of points at which the algorithm switches over to
        brute-force. Default: 10.
    compact_nodes : bool, optional
        If True, the kd-tree is built to shrink the hyperrectangles to
        the actual data range. This usually gives a more compact tree that
        is robust against degenerated input data and gives faster queries
        at the expense of longer build time. Default: True.
    copy_data : bool, optional
        If True the data is always copied to protect the kd-tree against
        data corruption. Default: False.
    balanced_tree : bool, optional
        If True, the median is used to split the hyperrectangles instead of
        the midpoint. This usually gives a more compact tree and
        faster queries at the expense of longer build time. Default: True.
    boxsize : array_like or scalar, optional
        Apply a m-d toroidal topology to the KDTree. The topology is
        generated by :math:`x_i + n_i L_i` where :math:`n_i` are integers
        and :math:`L_i` is the boxsize along i-th dimension. The input data
        shall be wrapped into :math:`[0, L_i)`. A ValueError is raised if
        any of the data is outside of this bound.

    Notes
    -----
    The algorithm used is described in Maneewongvatana and Mount 1999.
    The general idea is that the kd-tree is a binary tree, each of whose
    nodes represents an axis-aligned hyperrectangle. Each node specifies
    an axis and splits the set of points based on whether their coordinate
    along that axis is greater than or less than a particular value.

    During construction, the axis and splitting point are chosen by the
    "sliding midpoint" rule, which ensures that the cells do not all
    become long and thin.

    The tree can be queried for the r closest neighbors of any given point
    (optionally returning only those within some maximum distance of the
    point). It can also be queried, with a substantial gain in efficiency,
    for the r approximate closest neighbors.

    For large dimensions (20 is already large) do not expect this to run
    significantly faster than brute force. High-dimensional nearest-neighbor
    queries are a substantial open problem in computer science.

    Attributes
    ----------
    data : ndarray, shape (n,m)
        The n data points of dimension m to be indexed. This array is
        not copied unless this is necessary to produce a contiguous
        array of doubles. The data are also copied if the kd-tree is built
        with `copy_data=True`.
    leafsize : positive int
        The number of points at which the algorithm switches over to
        brute-force.
    m : int
        The dimension of a single data-point.
    n : int
        The number of data points.
    maxes : ndarray, shape (m,)
        The maximum value in each dimension of the n data points.
    mins : ndarray, shape (m,)
        The minimum value in each dimension of the n data points.
    size : int
        The number of nodes in the tree.
    """

    class node:
        @staticmethod
        def _create(ckdtree_node=None):
            """Create either an inner or leaf node, wrapping a cKDTreeNode
            instance."""
            if ckdtree_node is None:
                return KDTree.node(ckdtree_node)
            if ckdtree_node.split_dim == -1:
                # A split dimension of -1 marks a leaf in the C tree.
                return KDTree.leafnode(ckdtree_node)
            return KDTree.innernode(ckdtree_node)

        def __init__(self, ckdtree_node=None):
            self._node = cKDTreeNode() if ckdtree_node is None else ckdtree_node

        # Nodes are ordered by identity so they can be placed in sorted
        # containers (e.g. heaps) deterministically within a process.
        def __lt__(self, other):
            return id(self) < id(other)

        def __gt__(self, other):
            return id(self) > id(other)

        def __le__(self, other):
            return id(self) <= id(other)

        def __ge__(self, other):
            return id(self) >= id(other)

        def __eq__(self, other):
            return id(self) == id(other)

    class leafnode(node):
        @property
        def idx(self):
            # Indices of the data points stored in this leaf.
            return self._node.indices

        @property
        def children(self):
            return self._node.children

    class innernode(node):
        def __init__(self, ckdtreenode):
            assert isinstance(ckdtreenode, cKDTreeNode)
            super().__init__(ckdtreenode)
            # Wrap both subtrees eagerly.
            self.less = KDTree.node._create(ckdtreenode.lesser)
            self.greater = KDTree.node._create(ckdtreenode.greater)

        @property
        def split_dim(self):
            return self._node.split_dim

        @property
        def split(self):
            return self._node.split

        @property
        def children(self):
            return self._node.children

    @property
    def tree(self):
        # Lazily wrap the underlying cKDTree node structure on first access.
        if not hasattr(self, "_tree"):
            self._tree = KDTree.node._create(super().tree)
        return self._tree

    def __init__(self, data, leafsize=10, compact_nodes=True, copy_data=False,
                 balanced_tree=True, boxsize=None):
        data = np.asarray(data)
        if data.dtype.kind == 'c':
            raise TypeError("KDTree does not work with complex data")
        # Note KDTree has different default leafsize from cKDTree
        super().__init__(data, leafsize, compact_nodes, copy_data,
                         balanced_tree, boxsize)

    def query(
            self, x, k=1, eps=0, p=2, distance_upper_bound=np.inf, workers=1):
        r"""Query the kd-tree for nearest neighbors.

        Parameters
        ----------
        x : array_like, last dimension self.m
            An array of points to query.
        k : int or Sequence[int], optional
            Either the number of nearest neighbors to return, or a list of
            the k-th nearest neighbors to return, starting from 1.
        eps : nonnegative float, optional
            Return approximate nearest neighbors; the kth returned value
            is guaranteed to be no further than (1+eps) times the
            distance to the real kth nearest neighbor.
        p : float, 1<=p<=infinity, optional
            Which Minkowski p-norm to use.
            1 is the sum-of-absolute-values distance ("Manhattan" distance).
            2 is the usual Euclidean distance.
            infinity is the maximum-coordinate-difference distance.
            A large, finite p may cause a ValueError if overflow can occur.
        distance_upper_bound : nonnegative float, optional
            Return only neighbors within this distance. This is used to
            prune tree searches, so if you are doing a series of
            nearest-neighbor queries, it may help to supply the distance to
            the nearest neighbor of the most recent point.
        workers : int, optional
            Number of workers to use for parallel processing. If -1 is
            given all CPU threads are used. Default: 1.

            .. versionadded:: 1.6.0

        Returns
        -------
        d : float or array of floats
            The distances to the nearest neighbors.
            If ``x`` has shape ``tuple+(self.m,)``, then ``d`` has shape
            ``tuple+(k,)``. When k == 1, the last dimension of the output
            is squeezed. Missing neighbors are indicated with infinite
            distances. Hits are sorted by distance (nearest first).

            .. versionchanged:: 1.9.0
               Previously if ``k=None``, then `d` was an object array of
               shape ``tuple``, containing lists of distances. This
               behavior has been removed, use `query_ball_point` instead.

        i : integer or array of integers
            The index of each neighbor in ``self.data``.
            ``i`` is the same shape as d. Missing neighbors are indicated
            with ``self.n``.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> x, y = np.mgrid[0:5, 2:8]
        >>> tree = KDTree(np.c_[x.ravel(), y.ravel()])

        To query the nearest neighbours and return squeezed result, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=1)
        >>> print(dd, ii, sep='\n')
        [2.         0.2236068]
        [ 0 13]

        To query the nearest neighbours and return unsqueezed result, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1])
        >>> print(dd, ii, sep='\n')
        [[2.        ]
         [0.2236068]]
        [[ 0]
         [13]]

        To query the first and second nearest neighbours, use

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=2)
        >>> print(dd, ii, sep='\n')
        [[2.         2.23606798]
         [0.2236068  0.80622577]]
        [[ 0  6]
         [13 19]]

        or, be more specific

        >>> dd, ii = tree.query([[0, 0], [2.2, 2.9]], k=[1, 2])
        >>> print(dd, ii, sep='\n')
        [[2.         2.23606798]
         [0.2236068  0.80622577]]
        [[ 0  6]
         [13 19]]
        """
        x = np.asarray(x)
        if x.dtype.kind == 'c':
            raise TypeError("KDTree does not work with complex data")
        if k is None:
            raise ValueError("k must be an integer or a sequence of integers")
        d, i = super().query(x, k, eps, p, distance_upper_bound, workers)
        # Normalize a scalar index to a NumPy integer for a consistent
        # return type.
        if isinstance(i, int):
            i = np.intp(i)
        return d, i

    def query_ball_point(self, x, r, p=2., eps=0, workers=1,
                         return_sorted=None, return_length=False):
        """Find all points within distance r of point(s) x.

        Parameters
        ----------
        x : array_like, shape tuple + (self.m,)
            The point or points to search for neighbors of.
        r : array_like, float
            The radius of points to return, must broadcast to the length
            of x.
        p : float, optional
            Which Minkowski p-norm to use. Should be in the range [1, inf].
            A finite large p may cause a ValueError if overflow can occur.
        eps : nonnegative float, optional
            Approximate search. Branches of the tree are not explored if
            their nearest points are further than ``r / (1 + eps)``, and
            branches are added in bulk if their furthest points are nearer
            than ``r * (1 + eps)``.
        workers : int, optional
            Number of jobs to schedule for parallel processing. If -1 is
            given all processors are used. Default: 1.

            .. versionadded:: 1.6.0

        return_sorted : bool, optional
            Sorts returned indicies if True and does not sort them if
            False. If None, does not sort single point queries, but does
            sort multi-point queries which was the behavior before this
            option was added.

            .. versionadded:: 1.6.0

        return_length : bool, optional
            Return the number of points inside the radius instead of a
            list of the indices.

            .. versionadded:: 1.6.0

        Returns
        -------
        results : list or array of lists
            If `x` is a single point, returns a list of the indices of the
            neighbors of `x`. If `x` is an array of points, returns an
            object array of shape tuple containing lists of neighbors.

        Notes
        -----
        If you have many points whose neighbors you want to find, you may
        save substantial amounts of time by putting them in a KDTree and
        using query_ball_tree.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy import spatial
        >>> x, y = np.mgrid[0:5, 0:5]
        >>> points = np.c_[x.ravel(), y.ravel()]
        >>> tree = spatial.KDTree(points)
        >>> sorted(tree.query_ball_point([2, 0], 1))
        [5, 10, 11, 15]
        """
        x = np.asarray(x)
        if x.dtype.kind == 'c':
            raise TypeError("KDTree does not work with complex data")
        return super().query_ball_point(
            x, r, p, eps, workers, return_sorted, return_length)

    def query_ball_tree(self, other, r, p=2., eps=0):
        """
        Find all pairs of points between `self` and `other` whose distance
        is at most r.

        Parameters
        ----------
        other : KDTree instance
            The tree containing points to search against.
        r : float
            The maximum distance, has to be positive.
        p : float, optional
            Which Minkowski norm to use. `p` has to meet the condition
            ``1 <= p <= infinity``.
        eps : float, optional
            Approximate search. Branches of the tree are not explored
            if their nearest points are further than ``r/(1+eps)``, and
            branches are added in bulk if their furthest points are nearer
            than ``r * (1+eps)``. `eps` has to be non-negative.

        Returns
        -------
        results : list of lists
            For each element ``self.data[i]`` of this tree, ``results[i]``
            is a list of the indices of its neighbors in ``other.data``.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> rng = np.random.default_rng()
        >>> kd_tree1 = KDTree(rng.random((15, 2)))
        >>> kd_tree2 = KDTree(rng.random((15, 2)))
        >>> indexes = kd_tree1.query_ball_tree(kd_tree2, r=0.2)
        """
        return super().query_ball_tree(other, r, p, eps)

    def query_pairs(self, r, p=2., eps=0, output_type='set'):
        """Find all pairs of points in `self` whose distance is at most r.

        Parameters
        ----------
        r : positive float
            The maximum distance.
        p : float, optional
            Which Minkowski norm to use. `p` has to meet the condition
            ``1 <= p <= infinity``.
        eps : float, optional
            Approximate search. Branches of the tree are not explored
            if their nearest points are further than ``r/(1+eps)``, and
            branches are added in bulk if their furthest points are nearer
            than ``r * (1+eps)``. `eps` has to be non-negative.
        output_type : string, optional
            Choose the output container, 'set' or 'ndarray'. Default: 'set'

            .. versionadded:: 1.6.0

        Returns
        -------
        results : set or ndarray
            Set of pairs ``(i,j)``, with ``i < j``, for which the
            corresponding positions are close. If output_type is 'ndarray',
            an ndarry is returned instead of a set.

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> rng = np.random.default_rng()
        >>> kd_tree = KDTree(rng.random((20, 2)))
        >>> pairs = kd_tree.query_pairs(r=0.2)
        """
        return super().query_pairs(r, p, eps, output_type)

    def count_neighbors(self, other, r, p=2., weights=None, cumulative=True):
        """Count how many nearby pairs can be formed.

        Count the number of pairs ``(x1,x2)`` can be formed, with ``x1``
        drawn from ``self`` and ``x2`` drawn from ``other``, and where
        ``distance(x1, x2, p) <= r``.

        Data points on ``self`` and ``other`` are optionally weighted by
        the ``weights`` argument. (See below)

        This is adapted from the "two-point correlation" algorithm
        described by Gray and Moore [1]_. See notes for further discussion.

        Parameters
        ----------
        other : KDTree
            The other tree to draw points from, can be the same tree as
            self.
        r : float or one-dimensional array of floats
            The radius to produce a count for. Multiple radii are searched
            with a single tree traversal.
            If the count is non-cumulative(``cumulative=False``), ``r``
            defines the edges of the bins, and must be non-decreasing.
        p : float, optional
            1<=p<=infinity.
            Which Minkowski p-norm to use.
            Default 2.0.
            A finite large p may cause a ValueError if overflow can occur.
        weights : tuple, array_like, or None, optional
            If None, the pair-counting is unweighted.
            If given as a tuple, weights[0] is the weights of points in
            ``self``, and weights[1] is the weights of points in ``other``;
            either can be None to indicate the points are unweighted.
            If given as an array_like, weights is the weights of points in
            ``self`` and ``other``. For this to make sense, ``self`` and
            ``other`` must be the same tree. If ``self`` and ``other`` are
            two different trees, a ``ValueError`` is raised.
            Default: None

            .. versionadded:: 1.6.0

        cumulative : bool, optional
            Whether the returned counts are cumulative. When cumulative is
            set to ``False`` the algorithm is optimized to work with a
            large number of bins (>10) specified by ``r``. When
            ``cumulative`` is set to True, the algorithm is optimized to
            work with a small number of ``r``. Default: True

            .. versionadded:: 1.6.0

        Returns
        -------
        result : scalar or 1-D array
            The number of pairs. For unweighted counts, the result is
            integer. For weighted counts, the result is float.
            If cumulative is False, ``result[i]`` contains the counts with
            ``(-inf if i == 0 else r[i-1]) < R <= r[i]``

        Notes
        -----
        Pair-counting is the basic operation used to calculate the two
        point correlation functions from a data set composed of position
        of objects.

        The Landy-Szalay estimator for the two point correlation function
        of ``D`` measures the clustering signal in ``D``. [2]_

        .. math::

            \\xi(r) = \\frac{<D, D> - 2 f <D, R> + f^2<R, R>}{f^2<R, R>},

        where the brackets represents counting pairs between two data sets
        in a finite bin around ``r`` (distance), corresponding to setting
        `cumulative=False`, and ``f = float(len(D)) / float(len(R))`` is
        the ratio between number of objects from data and random.

        The algorithm implemented here is loosely based on the dual-tree
        algorithm described in [1]_. We switch between two different
        pair-cumulation scheme depending on the setting of ``cumulative``.

        As an extension to the naive pair-counting, weighted pair-counting
        counts the product of weights instead of number of pairs. Weighted
        pair-counting is used to estimate marked correlation functions
        ([3]_, section 2.2), or to properly calculate the average of data
        per distance bin (e.g. [4]_, section 2.1 on redshift).

        .. [1] Gray and Moore,
               "N-body problems in statistical learning",
               Mining the sky, 2000, https://arxiv.org/abs/astro-ph/0012333
        .. [2] Landy and Szalay,
               "Bias and variance of angular correlation functions",
               The Astrophysical Journal, 1993,
               http://adsabs.harvard.edu/abs/1993ApJ...412...64L
        .. [3] Sheth, Connolly and Skibba,
               "Marked correlations in galaxy formation models",
               Arxiv e-print, 2005, https://arxiv.org/abs/astro-ph/0511773
        .. [4] Hawkins, et al.,
               "The 2dF Galaxy Redshift Survey: correlation functions,
               peculiar velocities and the matter density of the Universe",
               Monthly Notices of the Royal Astronomical Society, 2002,
               http://adsabs.harvard.edu/abs/2003MNRAS.346...78H
        .. [5] https://github.com/scipy/scipy/pull/5647#issuecomment-168474926

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> rng = np.random.default_rng()
        >>> kd_tree1 = KDTree(rng.random((5, 2)))
        >>> kd_tree2 = KDTree(rng.random((5, 2)))
        >>> kd_tree1.count_neighbors(kd_tree2, 0.2)  # doctest: +SKIP
        1
        """
        return super().count_neighbors(other, r, p, weights, cumulative)

    def sparse_distance_matrix(
            self, other, max_distance, p=2., output_type='dok_matrix'):
        """Compute a sparse distance matrix.

        Computes a distance matrix between two KDTrees, leaving as zero
        any distance greater than max_distance.

        Parameters
        ----------
        other : KDTree
        max_distance : positive float
        p : float, 1<=p<=infinity
            Which Minkowski p-norm to use.
            A finite large p may cause a ValueError if overflow can occur.
        output_type : string, optional
            Which container to use for output data. Options: 'dok_matrix',
            'coo_matrix', 'dict', or 'ndarray'. Default: 'dok_matrix'.

            .. versionadded:: 1.6.0

        Returns
        -------
        result : dok_matrix, coo_matrix, dict or ndarray
            Sparse matrix representing the results in "dictionary of keys"
            format. If a dict is returned the keys are (i,j) tuples of
            indices. If output_type is 'ndarray' a record array with fields
            'i', 'j', and 'v' is returned,

        Examples
        --------
        >>> import numpy as np
        >>> from scipy.spatial import KDTree
        >>> rng = np.random.default_rng()
        >>> kd_tree1 = KDTree(rng.random((5, 2)))
        >>> kd_tree2 = KDTree(rng.random((5, 2)))
        >>> sdm = kd_tree1.sparse_distance_matrix(kd_tree2, 0.3)
        """
        return super().sparse_distance_matrix(
            other, max_distance, p, output_type)
def distance_matrix(x, y, p=2, threshold=1000000):
    """Compute the distance matrix.

    Returns the matrix of all pair-wise distances.

    Parameters
    ----------
    x : (M, K) array_like
        Matrix of M vectors in K dimensions.
    y : (N, K) array_like
        Matrix of N vectors in K dimensions.
    p : float, 1 <= p <= infinity
        Which Minkowski p-norm to use.
    threshold : positive int
        If ``M * N * K`` > `threshold`, algorithm uses a Python loop
        instead of large temporary arrays.

    Returns
    -------
    result : (M, N) ndarray
        Matrix containing the distance from every vector in `x` to every
        vector in `y`.

    Examples
    --------
    >>> from scipy.spatial import distance_matrix
    >>> distance_matrix([[0,0],[0,1]], [[1,0],[1,1]])
    array([[ 1.        ,  1.41421356],
           [ 1.41421356,  1.        ]])
    """
    x = np.asarray(x)
    y = np.asarray(y)
    m, k = x.shape
    n, kk = y.shape
    if k != kk:
        raise ValueError(f"x contains {k}-dimensional vectors but y contains "
                         f"{kk}-dimensional vectors")

    if m * n * k <= threshold:
        # Small enough: compute everything in one broadcast expression.
        return minkowski_distance(x[:, np.newaxis, :], y[np.newaxis, :, :], p)

    # Too large for a single temporary: fill the result row- or
    # column-wise, looping over the shorter axis to minimize Python
    # overhead.
    result = np.empty((m, n), dtype=float)  # FIXME: figure out the best dtype
    if m < n:
        for i in range(m):
            result[i, :] = minkowski_distance(x[i], y, p)
    else:
        for j in range(n):
            result[:, j] = minkowski_distance(x, y[j], p)
    return result
|