// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2023 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Author: joydeepb@cs.utexas.edu (Joydeep Biswas) // // A CUDA sparse matrix linear operator. // This include must come before any #ifndef check on Ceres compile options. // clang-format off #include "ceres/internal/config.h" // clang-format on #include "ceres/cuda_sparse_matrix.h" #include #include #include "ceres/block_sparse_matrix.h" #include "ceres/compressed_row_sparse_matrix.h" #include "ceres/context_impl.h" #include "ceres/crs_matrix.h" #include "ceres/internal/export.h" #include "ceres/types.h" #include "ceres/wall_time.h" #ifndef CERES_NO_CUDA #include "ceres/cuda_buffer.h" #include "ceres/cuda_kernels_vector_ops.h" #include "ceres/cuda_vector.h" #include "cuda_runtime_api.h" #include "cusparse.h" namespace ceres::internal { namespace { // Starting in CUDA 11.2.1, CUSPARSE_MV_ALG_DEFAULT was deprecated in favor of // CUSPARSE_SPMV_ALG_DEFAULT. #if CUDART_VERSION >= 11021 const auto kSpMVAlgorithm = CUSPARSE_SPMV_ALG_DEFAULT; #else // CUDART_VERSION >= 11021 const auto kSpMVAlgorithm = CUSPARSE_MV_ALG_DEFAULT; #endif // CUDART_VERSION >= 11021 size_t GetTempBufferSizeForOp(const cusparseHandle_t& handle, const cusparseOperation_t op, const cusparseDnVecDescr_t& x, const cusparseDnVecDescr_t& y, const cusparseSpMatDescr_t& A) { size_t buffer_size; const double alpha = 1.0; const double beta = 1.0; CHECK_NE(A, nullptr); CHECK_EQ(cusparseSpMV_bufferSize(handle, op, &alpha, A, x, &beta, y, CUDA_R_64F, kSpMVAlgorithm, &buffer_size), CUSPARSE_STATUS_SUCCESS); return buffer_size; } size_t GetTempBufferSize(const cusparseHandle_t& handle, const cusparseDnVecDescr_t& left, const cusparseDnVecDescr_t& right, const cusparseSpMatDescr_t& A) { CHECK_NE(A, nullptr); return std::max(GetTempBufferSizeForOp( handle, CUSPARSE_OPERATION_NON_TRANSPOSE, right, left, A), GetTempBufferSizeForOp( handle, CUSPARSE_OPERATION_TRANSPOSE, left, right, A)); } } // namespace CudaSparseMatrix::CudaSparseMatrix(int num_cols, CudaBuffer&& rows, CudaBuffer&& cols, ContextImpl* context) : num_rows_(rows.size() - 1), num_cols_(num_cols), num_nonzeros_(cols.size()), context_(context), rows_(std::move(rows)), cols_(std::move(cols)), values_(context, num_nonzeros_), spmv_buffer_(context) { Initialize(); } CudaSparseMatrix::CudaSparseMatrix(ContextImpl* context, const CompressedRowSparseMatrix& crs_matrix) : num_rows_(crs_matrix.num_rows()), num_cols_(crs_matrix.num_cols()), num_nonzeros_(crs_matrix.num_nonzeros()), context_(context), rows_(context, num_rows_ + 1), cols_(context, num_nonzeros_), values_(context, num_nonzeros_), spmv_buffer_(context) { rows_.CopyFromCpu(crs_matrix.rows(), num_rows_ + 1); cols_.CopyFromCpu(crs_matrix.cols(), num_nonzeros_); values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); Initialize(); } CudaSparseMatrix::~CudaSparseMatrix() { CHECK_EQ(cusparseDestroySpMat(descr_), CUSPARSE_STATUS_SUCCESS); descr_ = nullptr; CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_left_)); CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseDestroyDnVec(descr_vec_right_)); } void CudaSparseMatrix::CopyValuesFromCpu( const CompressedRowSparseMatrix& crs_matrix) { // There is no quick and easy way to verify that the structure is unchanged, // but at least we can check that the size of the matrix and the number of // nonzeros is unchanged. CHECK_EQ(num_rows_, crs_matrix.num_rows()); CHECK_EQ(num_cols_, crs_matrix.num_cols()); CHECK_EQ(num_nonzeros_, crs_matrix.num_nonzeros()); values_.CopyFromCpu(crs_matrix.values(), num_nonzeros_); } void CudaSparseMatrix::Initialize() { CHECK(context_->IsCudaInitialized()); CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseCreateCsr(&descr_, num_rows_, num_cols_, num_nonzeros_, rows_.data(), cols_.data(), values_.data(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_64F)); // Note: values_.data() is used as non-zero pointer to device memory // When there is no non-zero values, data-pointer of values_ array will be a // nullptr; but in this case left/right products are trivial and temporary // buffer (and vector descriptors) is not required if (!num_nonzeros_) return; CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseCreateDnVec( &descr_vec_left_, num_rows_, values_.data(), CUDA_R_64F)); CHECK_EQ(CUSPARSE_STATUS_SUCCESS, cusparseCreateDnVec( &descr_vec_right_, num_cols_, values_.data(), CUDA_R_64F)); size_t buffer_size = GetTempBufferSize( context_->cusparse_handle_, descr_vec_left_, descr_vec_right_, descr_); spmv_buffer_.Reserve(buffer_size); } void CudaSparseMatrix::SpMv(cusparseOperation_t op, const cusparseDnVecDescr_t& x, const cusparseDnVecDescr_t& y) const { const double alpha = 1.0; const double beta = 1.0; CHECK_EQ(cusparseSpMV(context_->cusparse_handle_, op, &alpha, descr_, x, &beta, y, CUDA_R_64F, kSpMVAlgorithm, spmv_buffer_.data()), CUSPARSE_STATUS_SUCCESS); } void CudaSparseMatrix::RightMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { DCHECK(GetTempBufferSize( context_->cusparse_handle_, y->descr(), x.descr(), descr_) <= spmv_buffer_.size()); SpMv(CUSPARSE_OPERATION_NON_TRANSPOSE, x.descr(), y->descr()); } void CudaSparseMatrix::LeftMultiplyAndAccumulate(const CudaVector& x, CudaVector* y) const { // TODO(Joydeep Biswas): We should consider storing a transposed copy of the // matrix by converting CSR to CSC. From the cuSPARSE documentation: // "In general, opA == CUSPARSE_OPERATION_NON_TRANSPOSE is 3x faster than opA // != CUSPARSE_OPERATION_NON_TRANSPOSE" DCHECK(GetTempBufferSize( context_->cusparse_handle_, x.descr(), y->descr(), descr_) <= spmv_buffer_.size()); SpMv(CUSPARSE_OPERATION_TRANSPOSE, x.descr(), y->descr()); } } // namespace ceres::internal #endif // CERES_NO_CUDA