_sketches.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179
  1. """ Sketching-based Matrix Computations """
  2. # Author: Jordi Montes <jomsdev@gmail.com>
  3. # August 28, 2017
  4. import numpy as np
  5. from scipy._lib._util import check_random_state, rng_integers
  6. from scipy.sparse import csc_matrix
  7. __all__ = ['clarkson_woodruff_transform']
  8. def cwt_matrix(n_rows, n_columns, seed=None):
  9. r"""
  10. Generate a matrix S which represents a Clarkson-Woodruff transform.
  11. Given the desired size of matrix, the method returns a matrix S of size
  12. (n_rows, n_columns) where each column has all the entries set to 0
  13. except for one position which has been randomly set to +1 or -1 with
  14. equal probability.
  15. Parameters
  16. ----------
  17. n_rows : int
  18. Number of rows of S
  19. n_columns : int
  20. Number of columns of S
  21. seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
  22. If `seed` is None (or `np.random`), the `numpy.random.RandomState`
  23. singleton is used.
  24. If `seed` is an int, a new ``RandomState`` instance is used,
  25. seeded with `seed`.
  26. If `seed` is already a ``Generator`` or ``RandomState`` instance then
  27. that instance is used.
  28. Returns
  29. -------
  30. S : (n_rows, n_columns) csc_matrix
  31. The returned matrix has ``n_columns`` nonzero entries.
  32. Notes
  33. -----
  34. Given a matrix A, with probability at least 9/10,
  35. .. math:: \|SA\| = (1 \pm \epsilon)\|A\|
  36. Where the error epsilon is related to the size of S.
  37. """
  38. rng = check_random_state(seed)
  39. rows = rng_integers(rng, 0, n_rows, n_columns)
  40. cols = np.arange(n_columns+1)
  41. signs = rng.choice([1, -1], n_columns)
  42. S = csc_matrix((signs, rows, cols),shape=(n_rows, n_columns))
  43. return S
  44. def clarkson_woodruff_transform(input_matrix, sketch_size, seed=None):
  45. r"""
  46. Applies a Clarkson-Woodruff Transform/sketch to the input matrix.
  47. Given an input_matrix ``A`` of size ``(n, d)``, compute a matrix ``A'`` of
  48. size (sketch_size, d) so that
  49. .. math:: \|Ax\| \approx \|A'x\|
  50. with high probability via the Clarkson-Woodruff Transform, otherwise
  51. known as the CountSketch matrix.
  52. Parameters
  53. ----------
  54. input_matrix : array_like
  55. Input matrix, of shape ``(n, d)``.
  56. sketch_size : int
  57. Number of rows for the sketch.
  58. seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
  59. If `seed` is None (or `np.random`), the `numpy.random.RandomState`
  60. singleton is used.
  61. If `seed` is an int, a new ``RandomState`` instance is used,
  62. seeded with `seed`.
  63. If `seed` is already a ``Generator`` or ``RandomState`` instance then
  64. that instance is used.
  65. Returns
  66. -------
  67. A' : array_like
  68. Sketch of the input matrix ``A``, of size ``(sketch_size, d)``.
  69. Notes
  70. -----
  71. To make the statement
  72. .. math:: \|Ax\| \approx \|A'x\|
  73. precise, observe the following result which is adapted from the
  74. proof of Theorem 14 of [2]_ via Markov's Inequality. If we have
  75. a sketch size ``sketch_size=k`` which is at least
  76. .. math:: k \geq \frac{2}{\epsilon^2\delta}
  77. Then for any fixed vector ``x``,
  78. .. math:: \|Ax\| = (1\pm\epsilon)\|A'x\|
  79. with probability at least one minus delta.
  80. This implementation takes advantage of sparsity: computing
  81. a sketch takes time proportional to ``A.nnz``. Data ``A`` which
  82. is in ``scipy.sparse.csc_matrix`` format gives the quickest
  83. computation time for sparse input.
  84. >>> import numpy as np
  85. >>> from scipy import linalg
  86. >>> from scipy import sparse
  87. >>> rng = np.random.default_rng()
  88. >>> n_rows, n_columns, density, sketch_n_rows = 15000, 100, 0.01, 200
  89. >>> A = sparse.rand(n_rows, n_columns, density=density, format='csc')
  90. >>> B = sparse.rand(n_rows, n_columns, density=density, format='csr')
  91. >>> C = sparse.rand(n_rows, n_columns, density=density, format='coo')
  92. >>> D = rng.standard_normal((n_rows, n_columns))
  93. >>> SA = linalg.clarkson_woodruff_transform(A, sketch_n_rows) # fastest
  94. >>> SB = linalg.clarkson_woodruff_transform(B, sketch_n_rows) # fast
  95. >>> SC = linalg.clarkson_woodruff_transform(C, sketch_n_rows) # slower
  96. >>> SD = linalg.clarkson_woodruff_transform(D, sketch_n_rows) # slowest
  97. That said, this method does perform well on dense inputs, just slower
  98. on a relative scale.
  99. References
  100. ----------
  101. .. [1] Kenneth L. Clarkson and David P. Woodruff. Low rank approximation
  102. and regression in input sparsity time. In STOC, 2013.
  103. .. [2] David P. Woodruff. Sketching as a tool for numerical linear algebra.
  104. In Foundations and Trends in Theoretical Computer Science, 2014.
  105. Examples
  106. --------
  107. Create a big dense matrix ``A`` for the example:
  108. >>> import numpy as np
  109. >>> from scipy import linalg
  110. >>> n_rows, n_columns = 15000, 100
  111. >>> rng = np.random.default_rng()
  112. >>> A = rng.standard_normal((n_rows, n_columns))
  113. Apply the transform to create a new matrix with 200 rows:
  114. >>> sketch_n_rows = 200
  115. >>> sketch = linalg.clarkson_woodruff_transform(A, sketch_n_rows, seed=rng)
  116. >>> sketch.shape
  117. (200, 100)
  118. Now with high probability, the true norm is close to the sketched norm
  119. in absolute value.
  120. >>> linalg.norm(A)
  121. 1224.2812927123198
  122. >>> linalg.norm(sketch)
  123. 1226.518328407333
  124. Similarly, applying our sketch preserves the solution to a linear
  125. regression of :math:`\min \|Ax - b\|`.
  126. >>> b = rng.standard_normal(n_rows)
  127. >>> x = linalg.lstsq(A, b)[0]
  128. >>> Ab = np.hstack((A, b.reshape(-1, 1)))
  129. >>> SAb = linalg.clarkson_woodruff_transform(Ab, sketch_n_rows, seed=rng)
  130. >>> SA, Sb = SAb[:, :-1], SAb[:, -1]
  131. >>> x_sketched = linalg.lstsq(SA, Sb)[0]
  132. As with the matrix norm example, ``linalg.norm(A @ x - b)`` is close
  133. to ``linalg.norm(A @ x_sketched - b)`` with high probability.
  134. >>> linalg.norm(A @ x - b)
  135. 122.83242365433877
  136. >>> linalg.norm(A @ x_sketched - b)
  137. 166.58473879945151
  138. """
  139. S = cwt_matrix(sketch_size, input_matrix.shape[0], seed)
  140. return S.dot(input_matrix)