123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132 |
- """
- This module provides functions to perform full Procrustes analysis.
- This code was originally written by Justin Kucynski and ported over from
- scikit-bio by Yoshiki Vazquez-Baeza.
- """
- import numpy as np
- from scipy.linalg import orthogonal_procrustes
- __all__ = ['procrustes']
- def procrustes(data1, data2):
- r"""Procrustes analysis, a similarity test for two data sets.
- Each input matrix is a set of points or vectors (the rows of the matrix).
- The dimension of the space is the number of columns of each matrix. Given
- two identically sized matrices, procrustes standardizes both such that:
- - :math:`tr(AA^{T}) = 1`.
- - Both sets of points are centered around the origin.
- Procrustes ([1]_, [2]_) then applies the optimal transform to the second
- matrix (including scaling/dilation, rotations, and reflections) to minimize
- :math:`M^{2}=\sum(data1-data2)^{2}`, or the sum of the squares of the
- pointwise differences between the two input datasets.
- This function was not designed to handle datasets with different numbers of
- datapoints (rows). If two data sets have different dimensionality
- (different number of columns), simply add columns of zeros to the smaller
- of the two.
- Parameters
- ----------
- data1 : array_like
- Matrix, n rows represent points in k (columns) space `data1` is the
- reference data, after it is standardised, the data from `data2` will be
- transformed to fit the pattern in `data1` (must have >1 unique points).
- data2 : array_like
- n rows of data in k space to be fit to `data1`. Must be the same
- shape ``(numrows, numcols)`` as data1 (must have >1 unique points).
- Returns
- -------
- mtx1 : array_like
- A standardized version of `data1`.
- mtx2 : array_like
- The orientation of `data2` that best fits `data1`. Centered, but not
- necessarily :math:`tr(AA^{T}) = 1`.
- disparity : float
- :math:`M^{2}` as defined above.
- Raises
- ------
- ValueError
- If the input arrays are not two-dimensional.
- If the shape of the input arrays is different.
- If the input arrays have zero columns or zero rows.
- See Also
- --------
- scipy.linalg.orthogonal_procrustes
- scipy.spatial.distance.directed_hausdorff : Another similarity test
- for two data sets
- Notes
- -----
- - The disparity should not depend on the order of the input matrices, but
- the output matrices will, as only the first output matrix is guaranteed
- to be scaled such that :math:`tr(AA^{T}) = 1`.
- - Duplicate data points are generally ok, duplicating a data point will
- increase its effect on the procrustes fit.
- - The disparity scales as the number of points per input matrix.
- References
- ----------
- .. [1] Krzanowski, W. J. (2000). "Principles of Multivariate analysis".
- .. [2] Gower, J. C. (1975). "Generalized procrustes analysis".
- Examples
- --------
- >>> import numpy as np
- >>> from scipy.spatial import procrustes
- The matrix ``b`` is a rotated, shifted, scaled and mirrored version of
- ``a`` here:
- >>> a = np.array([[1, 3], [1, 2], [1, 1], [2, 1]], 'd')
- >>> b = np.array([[4, -2], [4, -4], [4, -6], [2, -6]], 'd')
- >>> mtx1, mtx2, disparity = procrustes(a, b)
- >>> round(disparity)
- 0.0
- """
- mtx1 = np.array(data1, dtype=np.double, copy=True)
- mtx2 = np.array(data2, dtype=np.double, copy=True)
- if mtx1.ndim != 2 or mtx2.ndim != 2:
- raise ValueError("Input matrices must be two-dimensional")
- if mtx1.shape != mtx2.shape:
- raise ValueError("Input matrices must be of same shape")
- if mtx1.size == 0:
- raise ValueError("Input matrices must be >0 rows and >0 cols")
- # translate all the data to the origin
- mtx1 -= np.mean(mtx1, 0)
- mtx2 -= np.mean(mtx2, 0)
- norm1 = np.linalg.norm(mtx1)
- norm2 = np.linalg.norm(mtx2)
- if norm1 == 0 or norm2 == 0:
- raise ValueError("Input matrices must contain >1 unique points")
- # change scaling of data (in rows) such that trace(mtx*mtx') = 1
- mtx1 /= norm1
- mtx2 /= norm2
- # transform mtx2 to minimize disparity
- R, s = orthogonal_procrustes(mtx1, mtx2)
- mtx2 = np.dot(mtx2, R.T) * s
- # measure the dissimilarity between the two datasets
- disparity = np.sum(np.square(mtx1 - mtx2))
- return mtx1, mtx2, disparity
|