// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
// // Authors: joydeepb@cs.utexas.edu (Joydeep Biswas) #include #include #include #include "Eigen/Dense" #include "benchmark/benchmark.h" #include "ceres/block_jacobi_preconditioner.h" #include "ceres/block_sparse_matrix.h" #include "ceres/context_impl.h" #include "ceres/cuda_sparse_matrix.h" #include "ceres/cuda_vector.h" #include "ceres/fake_bundle_adjustment_jacobian.h" #include "ceres/internal/config.h" #include "ceres/internal/eigen.h" #include "ceres/linear_solver.h" #ifndef CERES_NO_CUDA #include "cuda_runtime.h" #endif namespace ceres::internal { constexpr int kNumCameras = 1000; constexpr int kNumPoints = 10000; constexpr int kCameraSize = 6; constexpr int kPointSize = 3; constexpr double kVisibility = 0.1; constexpr int kNumRowBlocks = 100000; constexpr int kNumColBlocks = 10000; constexpr int kMinRowBlockSize = 1; constexpr int kMaxRowBlockSize = 5; constexpr int kMinColBlockSize = 1; constexpr int kMaxColBlockSize = 15; constexpr double kBlockDensity = 5.0 / kNumColBlocks; static void BM_BlockSparseRightMultiplyAndAccumulateBA( benchmark::State& state) { const int num_threads = static_cast(state.range(0)); std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); ContextImpl context; context.EnsureMinimumThreads(num_threads); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->RightMultiplyAndAccumulate( x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA) ->Arg(1) ->Arg(2) ->Arg(4) ->Arg(8) ->Arg(16); static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured( benchmark::State& state) { const int num_threads = static_cast(state.range(0)); BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; options.min_row_block_size 
= kMinRowBlockSize; options.min_col_block_size = kMinColBlockSize; options.max_row_block_size = kMaxRowBlockSize; options.max_col_block_size = kMaxColBlockSize; options.block_density = kBlockDensity; std::mt19937 prng; auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); ContextImpl context; context.EnsureMinimumThreads(num_threads); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->RightMultiplyAndAccumulate( x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured) ->Arg(1) ->Arg(2) ->Arg(4) ->Arg(8) ->Arg(16); static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA); static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured( benchmark::State& state) { BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = 100000; options.num_col_blocks = 10000; options.min_row_block_size = 1; options.min_col_block_size = 1; options.max_row_block_size = 10; options.max_col_block_size = 15; options.block_density = 5.0 / options.num_col_blocks; std::mt19937 prng; auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured); 
static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) { const int num_threads = static_cast(state.range(0)); std::mt19937 prng; auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); ContextImpl context; context.EnsureMinimumThreads(num_threads); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->RightMultiplyAndAccumulate( x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA) ->Arg(1) ->Arg(2) ->Arg(4) ->Arg(8) ->Arg(16); static void BM_CRSRightMultiplyAndAccumulateUnstructured( benchmark::State& state) { const int num_threads = static_cast(state.range(0)); BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; options.min_row_block_size = kMinRowBlockSize; options.min_col_block_size = kMinColBlockSize; options.max_row_block_size = kMaxRowBlockSize; options.max_col_block_size = kMaxColBlockSize; options.block_density = kBlockDensity; std::mt19937 prng; auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); ContextImpl context; context.EnsureMinimumThreads(num_threads); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { jacobian->RightMultiplyAndAccumulate( x.data(), y.data(), &context, num_threads); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured) ->Arg(1) ->Arg(2) ->Arg(4) ->Arg(8) ->Arg(16); static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng; // Perform setup here auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, 
kNumPoints, kCameraSize, kPointSize, kVisibility, prng); auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { // This code gets timed jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA); static void BM_CRSLeftMultiplyAndAccumulateUnstructured( benchmark::State& state) { BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; options.min_row_block_size = kMinRowBlockSize; options.min_col_block_size = kMinColBlockSize; options.max_row_block_size = kMaxRowBlockSize; options.max_col_block_size = kMaxColBlockSize; options.block_density = kBlockDensity; std::mt19937 prng; auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix(); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); double sum = 0; for (auto _ : state) { // This code gets timed jacobian->LeftMultiplyAndAccumulate(x.data(), y.data()); sum += y.norm(); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured); #ifndef CERES_NO_CUDA static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); ContextImpl context; std::string message; context.InitCuda(&message); auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); CudaVector cuda_x(&context, 0); CudaVector cuda_y(&context, 0); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); cuda_x.CopyFromCpu(x); cuda_y.CopyFromCpu(y); double sum = 0; for (auto _ : state) { 
cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); sum += cuda_y.Norm(); CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA); static void BM_CudaRightMultiplyAndAccumulateUnstructured( benchmark::State& state) { BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; options.min_row_block_size = kMinRowBlockSize; options.min_col_block_size = kMinColBlockSize; options.max_row_block_size = kMaxRowBlockSize; options.max_col_block_size = kMaxColBlockSize; options.block_density = kBlockDensity; std::mt19937 prng; auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); ContextImpl context; std::string message; context.InitCuda(&message); auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); CudaVector cuda_x(&context, 0); CudaVector cuda_y(&context, 0); Vector x(jacobian->num_cols()); Vector y(jacobian->num_rows()); x.setRandom(); y.setRandom(); cuda_x.CopyFromCpu(x); cuda_y.CopyFromCpu(y); double sum = 0; for (auto _ : state) { cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y); sum += cuda_y.Norm(); CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured); static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) { std::mt19937 prng; auto jacobian = CreateFakeBundleAdjustmentJacobian( kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng); ContextImpl context; std::string message; context.InitCuda(&message); auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); CudaVector cuda_x(&context, 0); CudaVector cuda_y(&context, 0); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); cuda_x.CopyFromCpu(x); cuda_y.CopyFromCpu(y); double sum = 0; for 
(auto _ : state) { cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); sum += cuda_y.Norm(); CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA); static void BM_CudaLeftMultiplyAndAccumulateUnstructured( benchmark::State& state) { BlockSparseMatrix::RandomMatrixOptions options; options.num_row_blocks = kNumRowBlocks; options.num_col_blocks = kNumColBlocks; options.min_row_block_size = kMinRowBlockSize; options.min_col_block_size = kMinColBlockSize; options.max_row_block_size = kMaxRowBlockSize; options.max_col_block_size = kMaxColBlockSize; options.block_density = kBlockDensity; std::mt19937 prng; auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng); ContextImpl context; std::string message; context.InitCuda(&message); auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix(); CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs); CudaVector cuda_x(&context, 0); CudaVector cuda_y(&context, 0); Vector x(jacobian->num_rows()); Vector y(jacobian->num_cols()); x.setRandom(); y.setRandom(); cuda_x.CopyFromCpu(x); cuda_y.CopyFromCpu(y); double sum = 0; for (auto _ : state) { cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y); sum += cuda_y.Norm(); CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess); } CHECK_NE(sum, 0.0); } BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured); #endif } // namespace ceres::internal BENCHMARK_MAIN();