cuda_partitioned_block_sparse_crs_view.h 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. // Ceres Solver - A fast non-linear least squares minimizer
  2. // Copyright 2023 Google Inc. All rights reserved.
  3. // http://ceres-solver.org/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice,
  9. // this list of conditions and the following disclaimer.
  10. // * Redistributions in binary form must reproduce the above copyright notice,
  11. // this list of conditions and the following disclaimer in the documentation
  12. // and/or other materials provided with the distribution.
  13. // * Neither the name of Google Inc. nor the names of its contributors may be
  14. // used to endorse or promote products derived from this software without
  15. // specific prior written permission.
  16. //
  17. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22. // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23. // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25. // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26. // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27. // POSSIBILITY OF SUCH DAMAGE.
  28. //
  29. // Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)
  30. //
  31. #ifndef CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
  32. #define CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_
  33. #include "ceres/internal/config.h"
  34. #ifndef CERES_NO_CUDA
  35. #include <memory>
  36. #include "ceres/block_sparse_matrix.h"
  37. #include "ceres/cuda_block_structure.h"
  38. #include "ceres/cuda_buffer.h"
  39. #include "ceres/cuda_sparse_matrix.h"
  40. #include "ceres/cuda_streamed_buffer.h"
  41. namespace ceres::internal {
  42. // We use the cuSPARSE library for SpMV operations. However, it supports
  43. // neither block-sparse formats with varying block sizes nor
  44. // submatrix-vector products. Thus, we perform the following operations in order
  45. // to compute products of partitioned block-sparse matrices and dense vectors on
  46. // gpu:
  47. // - Once per block-sparse structure update:
  48. // - Compute CRS structures of left and right submatrices from block-sparse
  49. // structure
  50. // - Check if values of F sub-matrix can be copied without permutation
  51. // matrices
  52. // - Once per block-sparse values update:
  53. // - Copy values of E sub-matrix
  54. // - Permute or copy values of F sub-matrix
  55. //
  56. // It is assumed that cells of the block-sparse matrix are laid out
  57. // sequentially in both sub-matrices, that there is exactly one cell of the E
  58. // sub-matrix in each of the first num_row_blocks_e_ row blocks, and that
  59. // there are no cells of the E sub-matrix below those row blocks.
  60. //
  61. // This class avoids storing both CRS and block-sparse values in GPU memory.
  62. // Instead, block-sparse values are transferred to gpu memory as a disjoint set
  63. // of small contiguous segments with simultaneous permutation of the values into
  64. // correct order using block-structure.
  65. class CERES_NO_EXPORT CudaPartitionedBlockSparseCRSView {
  66. public:
  67. // Initializes internal CRS matrix and block-sparse structure on GPU side
  68. // values. The following objects are stored in gpu memory for the whole
  69. // lifetime of the object
  70. // - matrix_e_: left CRS submatrix
  71. // - matrix_f_: right CRS submatrix
  72. // - block_structure_: copy of block-sparse structure on GPU
  73. // - streamed_buffer_: helper for value updating
  74. CudaPartitionedBlockSparseCRSView(const BlockSparseMatrix& bsm,
  75. const int num_col_blocks_e,
  76. ContextImpl* context);
  77. // Update values of CRS submatrices using values of block-sparse matrix.
  78. // Assumes that bsm has the same block-sparse structure as matrix that was
  79. // used for construction.
  80. void UpdateValues(const BlockSparseMatrix& bsm);
  81. const CudaSparseMatrix* matrix_e() const { return matrix_e_.get(); }
  82. const CudaSparseMatrix* matrix_f() const { return matrix_f_.get(); }
  83. CudaSparseMatrix* mutable_matrix_e() { return matrix_e_.get(); }
  84. CudaSparseMatrix* mutable_matrix_f() { return matrix_f_.get(); }
  85. private:
  86. // Value permutation kernel performs a single element-wise operation per
  87. // thread, thus performing permutation in blocks of 8 megabytes of
  88. // block-sparse values seems reasonable
  89. static constexpr int kMaxTemporaryArraySize = 1 * 1024 * 1024;
  90. std::unique_ptr<CudaSparseMatrix> matrix_e_;
  91. std::unique_ptr<CudaSparseMatrix> matrix_f_;
  92. std::unique_ptr<CudaStreamedBuffer<double>> streamed_buffer_;
  93. std::unique_ptr<CudaBlockSparseStructure> block_structure_;
  94. bool f_is_crs_compatible_;
  95. int num_row_blocks_e_;
  96. ContextImpl* context_;
  97. };
  98. } // namespace ceres::internal
  99. #endif // CERES_NO_CUDA
  100. #endif // CERES_INTERNAL_CUDA_PARTITIONED_BLOCK_SPARSE_CRS_VIEW_H_