// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2023 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. 
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include "ceres/cuda_partitioned_block_sparse_crs_view.h"

#ifndef CERES_NO_CUDA

#include "ceres/cuda_block_structure.h"
#include "ceres/cuda_kernels_bsm_to_crs.h"

namespace ceres::internal {

// Builds a GPU-resident view of bsm partitioned into two CRS matrices:
// E — the first num_col_blocks_e column blocks — and F — the remaining
// column blocks. The CRS structure of both partitions is computed on the
// device by FillCRSStructurePartitioned; values are uploaded by
// UpdateValues().
//
// NOTE(review): template arguments on the smart-pointer/buffer
// constructions were lost in this copy of the file and have been restored
// from the accessors used below (num_nonzeros_e(), num_row_blocks_e(),
// mutable_values(), rows()) — confirm against the project headers.
CudaPartitionedBlockSparseCRSView::CudaPartitionedBlockSparseCRSView(
    const BlockSparseMatrix& bsm,
    const int num_col_blocks_e,
    ContextImpl* context)
    : context_(context) {
  const auto& bs = *bsm.block_structure();
  block_structure_ =
      std::make_unique<CudaBlockSparseStructure>(bs, num_col_blocks_e, context);
  // Determine number of non-zeros in left submatrix
  // Row-blocks are at least 1 row high, thus we can use a temporary array of
  // num_rows for ComputeNonZerosInColumnBlockSubMatrix; and later reuse it for
  // FillCRSStructurePartitioned
  const int num_rows = bsm.num_rows();
  const int num_nonzeros_e = block_structure_->num_nonzeros_e();
  // F holds everything not assigned to E.
  const int num_nonzeros_f = bsm.num_nonzeros() - num_nonzeros_e;

  // If every column block belongs to E, partition F is empty (zero columns).
  const int num_cols_e = num_col_blocks_e < bs.cols.size()
                             ? bs.cols[num_col_blocks_e].position
                             : bsm.num_cols();
  const int num_cols_f = bsm.num_cols() - num_cols_e;

  // Device-side CRS row-offset (num_rows + 1) and column-index arrays for
  // each partition; ownership is moved into the CudaSparseMatrix objects
  // below.
  CudaBuffer<int32_t> rows_e(context, num_rows + 1);
  CudaBuffer<int32_t> cols_e(context, num_nonzeros_e);
  CudaBuffer<int32_t> rows_f(context, num_rows + 1);
  CudaBuffer<int32_t> cols_f(context, num_nonzeros_f);

  num_row_blocks_e_ = block_structure_->num_row_blocks_e();
  // Single device pass that fills the CRS structure of both partitions.
  FillCRSStructurePartitioned(block_structure_->num_row_blocks(),
                              num_rows,
                              num_row_blocks_e_,
                              num_col_blocks_e,
                              num_nonzeros_e,
                              block_structure_->first_cell_in_row_block(),
                              block_structure_->cells(),
                              block_structure_->row_blocks(),
                              block_structure_->col_blocks(),
                              rows_e.data(),
                              cols_e.data(),
                              rows_f.data(),
                              cols_f.data(),
                              context->DefaultStream(),
                              context->is_cuda_memory_pools_supported_);
  f_is_crs_compatible_ = block_structure_->IsCrsCompatible();
  if (f_is_crs_compatible_) {
    // Values of F are already laid out in CRS order inside bsm.values();
    // the block structure is no longer needed for value updates.
    block_structure_ = nullptr;
  } else {
    // F needs a permutation kernel on every update; stage host values
    // through a streamed (pinned) buffer of bounded size.
    streamed_buffer_ = std::make_unique<CudaStreamedBuffer<double>>(
        context, kMaxTemporaryArraySize);
  }
  matrix_e_ = std::make_unique<CudaSparseMatrix>(
      num_cols_e, std::move(rows_e), std::move(cols_e), context);
  matrix_f_ = std::make_unique<CudaSparseMatrix>(
      num_cols_f, std::move(rows_f), std::move(cols_f), context);

  // Every non-zero of bsm must land in exactly one partition.
  CHECK_EQ(bsm.num_nonzeros(),
           matrix_e_->num_nonzeros() + matrix_f_->num_nonzeros());

  UpdateValues(bsm);
}

// Uploads the values of bsm into the E and F device matrices.
//
// When F is CRS-compatible, bsm.values() is already ordered as
// [values of E | values of F in CRS order], so two async host-to-device
// copies on the default stream suffice. Otherwise the F values are streamed
// to the GPU in chunks and permuted into CRS order on-device by
// PermuteToCRSPartitionedF, while E is still a plain contiguous copy.
void CudaPartitionedBlockSparseCRSView::UpdateValues(
    const BlockSparseMatrix& bsm) {
  if (f_is_crs_compatible_) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpyAsync(matrix_e_->mutable_values(),
                             bsm.values(),
                             matrix_e_->num_nonzeros() * sizeof(double),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()));
    CHECK_EQ(cudaSuccess,
             cudaMemcpyAsync(matrix_f_->mutable_values(),
                             bsm.values() + matrix_e_->num_nonzeros(),
                             matrix_f_->num_nonzeros() * sizeof(double),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()));
    return;
  }
  // Chunked upload: for each chunk the callback permutes the freshly copied
  // values of F into their CRS positions; the offset identifies the chunk's
  // position within bsm.values().
  streamed_buffer_->CopyToGpu(
      bsm.values(),
      bsm.num_nonzeros(),
      [block_structure = block_structure_.get(),
       num_nonzeros_e = matrix_e_->num_nonzeros(),
       num_row_blocks_e = num_row_blocks_e_,
       values_f = matrix_f_->mutable_values(),
       rows_f = matrix_f_->rows()](
          const double* values, int num_values, int offset, auto stream) {
        PermuteToCRSPartitionedF(num_nonzeros_e + offset,
                                 num_values,
                                 block_structure->num_row_blocks(),
                                 num_row_blocks_e,
                                 block_structure->first_cell_in_row_block(),
                                 block_structure->value_offset_row_block_f(),
                                 block_structure->cells(),
                                 block_structure->row_blocks(),
                                 block_structure->col_blocks(),
                                 rows_f,
                                 values,
                                 values_f,
                                 stream);
      });
  // E is contiguous at the front of bsm.values() regardless of F's layout.
  CHECK_EQ(cudaSuccess,
           cudaMemcpyAsync(matrix_e_->mutable_values(),
                           bsm.values(),
                           matrix_e_->num_nonzeros() * sizeof(double),
                           cudaMemcpyHostToDevice,
                           context_->DefaultStream()));
}

}  // namespace ceres::internal
#endif  // CERES_NO_CUDA