cuda_sparse_matrix.cc 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226
  1. // Ceres Solver - A fast non-linear least squares minimizer
  2. // Copyright 2023 Google Inc. All rights reserved.
  3. // http://ceres-solver.org/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice,
  9. // this list of conditions and the following disclaimer.
  10. // * Redistributions in binary form must reproduce the above copyright notice,
  11. // this list of conditions and the following disclaimer in the documentation
  12. // and/or other materials provided with the distribution.
  13. // * Neither the name of Google Inc. nor the names of its contributors may be
  14. // used to endorse or promote products derived from this software without
  15. // specific prior written permission.
  16. //
  17. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22. // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23. // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25. // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26. // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27. // POSSIBILITY OF SUCH DAMAGE.
  28. //
  29. // Author: joydeepb@cs.utexas.edu (Joydeep Biswas)
  30. //
  31. // A CUDA sparse matrix linear operator.
  32. // This include must come before any #ifndef check on Ceres compile options.
  33. // clang-format off
  34. #include "ceres/internal/config.h"
  35. // clang-format on
  36. #include "ceres/cuda_sparse_matrix.h"
  37. #include <math.h>
  38. #include <memory>
  39. #include "ceres/block_sparse_matrix.h"
  40. #include "ceres/compressed_row_sparse_matrix.h"
  41. #include "ceres/context_impl.h"
  42. #include "ceres/crs_matrix.h"
  43. #include "ceres/internal/export.h"
  44. #include "ceres/types.h"
  45. #include "ceres/wall_time.h"
  46. #ifndef CERES_NO_CUDA
  47. #include "ceres/cuda_buffer.h"
  48. #include "ceres/cuda_kernels_vector_ops.h"
  49. #include "ceres/cuda_vector.h"
  50. #include "cuda_runtime_api.h"
  51. #include "cusparse.h"
  52. namespace ceres::internal {
  53. namespace {
  54. // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of
  55. // CUSPARSE_SPMV_ALG_DEFAULT.
  56. #if CUDART_VERSION >= 11021
  57. const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT;
  58. #else // CUDART_VERSION >= 11021
  59. const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT;
  60. #endif // CUDART_VERSION >= 11021
  61. size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle,
  62. const cusparseOperation_t op,
  63. const cusparseDnVecDescr_t& x,
  64. const cusparseDnVecDescr_t& y,
  65. const cusparseSpMatDescr_t& A) {
  66. size_t buffer_size;
  67. const double alpha = 1.0;
  68. const double beta = 1.0;
  69. CHECK_NE(A, nullptr);
  70. CHECK_EQ(cusparseSpMV_bufferSize(handle,
  71. op,
  72. &alpha,
  73. A,
  74. x,
  75. &beta,
  76. y,
  77. CUDA_R_64F,
  78. kSpMVAlgorithm,
  79. &buffer_size),
  80. CUSPARSE_STATUS_SUCCESS);
  81. return buffer_size;
  82. }
  83. size_t GetTempBufferSize(const cusparseHandle_t& handle,
  84. const cusparseDnVecDescr_t& left,
  85. const cusparseDnVecDescr_t& right,
  86. const cusparseSpMatDescr_t& A) {
  87. CHECK_NE(A, nullptr);
  88. return std::max(GetTempBufferSizeForOp(
  89. handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A),
  90. GetTempBufferSizeForOp(
  91. handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A));
  92. }
  93. } // namespace
  94. CudaSparseMatrix::CudaSparseMatrix(int num_cols,
  95. CudaBuffer<int32_t>&& rows,
  96. CudaBuffer<int32_t>&& cols,
  97. ContextImpl* context)
  98. : num_rows_(rows.size() - 1),
  99. num_cols_(num_cols),
  100. num_nonzeros_(cols.size()),
  101. context_(context),
  102. rows_(std::move(rows)),
  103. cols_(std::move(cols)),
  104. values_(context, num_nonzeros_),
  105. spmv_buffer_(context) {
  106. Initialize();
  107. }
  108. CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context,
  109. const CompressedRowSparseMatrix& crs_matrix)
  110. : num_rows_(crs_matrix.num_rows()),
  111. num_cols_(crs_matrix.num_cols()),
  112. num_nonzeros_(crs_matrix.num_nonzeros()),
  113. context_(context),
  114. rows_(context, num_rows_ + 1),
  115. cols_(context, num_nonzeros_),
  116. values_(context, num_nonzeros_),
  117. spmv_buffer_(context) {
  118. rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1);
  119. cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_);
  120. values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
  121. Initialize();
  122. }
  123. CudaSparseMatrix::~CudaSparseMatrix() {
  124. CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS);
  125. descr_ = nullptr;
  126. CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_));
  127. CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_));
  128. }
  129. void CudaSparseMatrix::CopyValuesFromCpu(
  130. const CompressedRowSparseMatrix& crs_matrix) {
  131. // There is no quick and easy way to verify that the structure is unchanged,
  132. // but at least we can check that the size of the matrix and the number of
  133. // nonzeros is unchanged.
  134. CHECK_EQ(num_rows_, crs_matrix.num_rows());
  135. CHECK_EQ(num_cols_, crs_matrix.num_cols());
  136. CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros());
  137. values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_);
  138. }
  139. void CudaSparseMatrix::Initialize() {
  140. CHECK(context_->IsCudaInitialized());
  141. CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
  142. cusparseCreateCsr(&descr_,
  143. num_rows_,
  144. num_cols_,
  145. num_nonzeros_,
  146. rows_.data(),
  147. cols_.data(),
  148. values_.data(),
  149. CUSPARSE_INDEX_32I,
  150. CUSPARSE_INDEX_32I,
  151. CUSPARSE_INDEX_BASE_ZERO,
  152. CUDA_R_64F));
  153. // Note: values_.data() is used as non-zero pointer to device memory
  154. // When there is no non-zero values, data-pointer of values_ array will be a
  155. // nullptr; but in this case left/right products are trivial and temporary
  156. // buffer (and vector descriptors) is not required
  157. if (!num_nonzeros_) return;
  158. CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
  159. cusparseCreateDnVec(
  160. &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F));
  161. CHECK_EQ(CUSPARSE_STATUS_SUCCESS,
  162. cusparseCreateDnVec(
  163. &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F));
  164. size_t buffer_size = GetTempBufferSize(
  165. context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_);
  166. spmv_buffer_.Reserve(buffer_size);
  167. }
  168. void CudaSparseMatrix::SpMv(cusparseOperation_t op,
  169. const cusparseDnVecDescr_t& x,
  170. const cusparseDnVecDescr_t& y) const {
  171. const double alpha = 1.0;
  172. const double beta = 1.0;
  173. CHECK_EQ(cusparseSpMV(context_->cusparse_handle_,
  174. op,
  175. &alpha,
  176. descr_,
  177. x,
  178. &beta,
  179. y,
  180. CUDA_R_64F,
  181. kSpMVAlgorithm,
  182. spmv_buffer_.data()),
  183. CUSPARSE_STATUS_SUCCESS);
  184. }
  185. void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x,
  186. CudaVector* y) const {
  187. DCHECK(GetTempBufferSize(
  188. context_->cusparse_handle_, y->descr(), x.descr(), descr_) <=
  189. spmv_buffer_.size());
  190. SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr());
  191. }
  192. void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x,
  193. CudaVector* y) const {
  194. // TODO(Joydeep Biswas): We should consider storing a transposed copy of the
  195. // matrix by converting CSR to CSC. From the cuSPARSE documentation:
  196. // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA
  197. // != CUSPARSE_OPERATION_NON_TRANSPOSE"
  198. DCHECK(GetTempBufferSize(
  199. context_->cusparse_handle_, x.descr(), y->descr(), descr_) <=
  200. spmv_buffer_.size());
  201. SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr());
  202. }
  203. } // namespace ceres::internal
  204. #endif // CERES_NO_CUDA