// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include <memory>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"

namespace ceres::internal {

// Downcasts a unique_ptr from Base to Derived. If the dynamic_cast fails,
// nullptr is returned (and the released object is leaked); every caller below
// CHECKs the result, so a type mismatch aborts immediately.
template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}

// The benchmark library might invoke a benchmark function multiple times.
// In order to save the time required to parse BAL data, each dataset is
// loaded at most once.
// Each type of Jacobian is also cached after its first creation.
struct BALData {
  using PartitionedView = PartitionedMatrixView<2, 3, 9>;

  explicit BALData(const std::string& path) {
    bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
    CHECK(bal_problem != nullptr);

    auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);

    preprocessed_problem = std::make_unique<PreprocessedProblem>();
    Solver::Options options = bal_problem->options();
    options.linear_solver_type = ITERATIVE_SCHUR;
    CHECK(preprocessor->Preprocess(
        options, problem_impl, preprocessed_problem.get()));

    auto program = preprocessed_problem->reduced_program.get();

    parameters.resize(program->NumParameters());
    program->ParameterBlocksToStateVector(parameters.data());

    // Fill the right-hand side b and the diagonal D with random values so
    // that the products computed in the benchmarks are non-trivial.
    const int num_residuals = program->NumResiduals();
    b.resize(num_residuals);
    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    for (int i = 0; i < num_residuals; ++i) {
      b[i] = rnorm(rng);
    }

    const int num_parameters = program->NumParameters();
    D.resize(num_parameters);
    for (int i = 0; i < num_parameters; ++i) {
      D[i] = rnorm(rng);
    }
  }

  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
      ContextImpl* context, bool sequential) {
    auto problem = bal_problem->mutable_problem();
    auto problem_impl = problem->mutable_impl();
    CHECK(problem_impl != nullptr);

    Evaluator::Options options;
    options.linear_solver_type = ITERATIVE_SCHUR;
    options.num_threads = 1;
    options.context = context;
    options.num_eliminate_blocks = bal_problem->num_points();

    std::string error;
    auto program = preprocessed_problem->reduced_program.get();
    auto evaluator = Evaluator::Create(options, program, &error);
    CHECK(evaluator != nullptr);

    auto jacobian = evaluator->CreateJacobian();
    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
    CHECK(block_sparse != nullptr);

    if (sequential) {
      // Renumber cell positions row block by row block so that the values of
      // each row block are stored contiguously.
      auto block_structure_sequential =
          std::make_unique<CompressedRowBlockStructure>(
              *block_sparse->block_structure());
      int num_nonzeros = 0;
      for (auto& row_block : block_structure_sequential->rows) {
        const int row_block_size = row_block.block.size;
        for (auto& cell : row_block.cells) {
          const int col_block_size =
              block_structure_sequential->cols[cell.block_id].size;
          cell.position = num_nonzeros;
          num_nonzeros += col_block_size * row_block_size;
        }
      }
      block_sparse = std::make_unique<BlockSparseMatrix>(
          block_structure_sequential.release(),
#ifndef CERES_NO_CUDA
          true
#else
          false
#endif
      );
    }

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    const int nnz = block_sparse->num_nonzeros();
    auto values = block_sparse->mutable_values();
    for (int i = 0; i < nnz; ++i) {
      values[i] = rnorm(rng);
    }

    return block_sparse;
  }

  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
      ContextImpl* context) {
    auto block_sparse = BlockSparseJacobian(context);
    return block_sparse->ToCompressedRowSparseMatrix();
  }

  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
    if (!block_sparse_jacobian) {
      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
    }
    return block_sparse_jacobian.get();
  }

  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
      ContextImpl* context) {
    if (!block_sparse_jacobian_partitioned) {
      block_sparse_jacobian_partitioned =
          CreateBlockSparseJacobian(context, false);
    }
    return block_sparse_jacobian_partitioned.get();
  }

  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
      ContextImpl* context) {
    if (!crs_jacobian) {
      crs_jacobian = CreateCompressedRowSparseJacobian(context);
    }
    return crs_jacobian.get();
  }
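  // In the partitioned view of a bundle-adjustment Jacobian, the E columns
  // correspond to the point (eliminated) parameter blocks and the F columns
  // to the camera blocks; PartitionedMatrixView<2, 3, 9> matches the 2x3
  // point and 2x9 camera cells of BAL problems. The PMV* benchmarks below
  // exercise products with each part separately.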
  std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    return std::make_unique<PartitionedView>(options, *block_sparse);
  }

  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
    if (!block_diagonal_ete) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
    }
    return block_diagonal_ete.get();
  }

  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
    if (!block_diagonal_ftf) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
    }
    return block_diagonal_ftf.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
    return implicit_schur_complement.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement_diag =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
    return implicit_schur_complement_diag.get();
  }

  Vector parameters;
  Vector D;
  Vector b;

  std::unique_ptr<BundleAdjustmentProblem> bal_problem;
  std::unique_ptr<PreprocessedProblem> preprocessed_problem;

  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};

static void Residuals(benchmark::State& state,
                      BALData* data,
                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              nullptr));
  }
}

static void ResidualsAndJacobian(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());
  auto jacobian = evaluator->CreateJacobian();

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              jacobian.get()));
  }
}
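// Plus measures the cost of applying a random step delta to the parameter
// vector via Evaluator::Plus, i.e. the plus operations of the manifolds of
// the reduced program.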
static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  Vector state_plus_delta = Vector::Zero(program->NumParameters());
  Vector delta = Vector::Random(program->NumEffectiveParameters());

  for (auto _ : state) {
    CHECK(evaluator->Plus(
        data->parameters.data(), delta.data(), state_plus_delta.data()));
  }
  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}

static void PSEPreconditioner(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
  Preconditioner::Options preconditioner_options(options);

  PowerSeriesExpansionPreconditioner preconditioner(
      jacobian, 10, 0, preconditioner_options);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_f());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_f());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_e());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_e());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}
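// The UpdateBlockDiagonal benchmarks measure recomputing the block-diagonal
// approximations of E^T E and F^T F that the implicit Schur complement and
// its preconditioners keep up to date.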
static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ete = data->BlockDiagonalEtE(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
  }
}

static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
  }
}

// The ISC benchmarks measure one product with the implicit Schur complement,
// without and with the regularization diagonal D.
static void ISCRightMultiplyNoDiag(benchmark::State& state,
                                   BALData* data,
                                   ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void ISCRightMultiplyDiag(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianToCRS(benchmark::State& state,
                          BALData* data,
                          ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CompressedRowSparseMatrix> matrix;
  for (auto _ : state) {
    matrix = jacobian->ToCompressedRowSparseMatrix();
  }
  CHECK(matrix != nullptr);
}
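// The CUDA partitioned-view benchmarks below upload the matrix once and then
// time only the device-side products; host threading is irrelevant here, so
// num_threads is pinned to 1.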
#ifndef CERES_NO_CUDA
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_f());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_f());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_e());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_e());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

// CudaBlockSparseCRSView should be no slower than an explicit conversion to
// CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaBlockSparseCRSView> matrix;
  for (auto _ : state) {
    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
  }
  CHECK(matrix != nullptr);
}

static void JacobianToCRSMatrix(benchmark::State& state,
                                BALData* data,
                                ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaSparseMatrix> matrix;
  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
  for (auto _ : state) {
    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  }
  CHECK(matrix != nullptr);
}

// Updating values in CudaBlockSparseCRSView should be roughly as fast as
// copying the values directly (the time spent permuting values has to be
// hidden by the PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
                                    BALData* data,
                                    ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
  for (auto _ : state) {
    matrix.UpdateValues(*jacobian);
  }
}
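// Baseline for the benchmark above: a plain host-to-device copy of values
// that are already in CRS order.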
static void JacobianToCRSMatrixUpdate(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  for (auto _ : state) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpy(matrix->mutable_values(),
                        matrix_cpu->values(),
                        matrix->num_nonzeros() * sizeof(double),
                        cudaMemcpyHostToDevice));
  }
}
#endif

static void JacobianSquaredColumnNorm(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector x = Vector::Zero(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
  }
  CHECK_GT(x.squaredNorm(), 0.);
}

static void JacobianScaleColumns(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian_const = data->BlockSparseJacobian(context);
  // ScaleColumns mutates the matrix; scaling by ones leaves the cached
  // Jacobian unchanged while still exercising the code path.
  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);

  Vector x = Vector::Ones(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->ScaleColumns(x.data(), context, num_threads);
  }
}

static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

#ifndef CERES_NO_CUDA
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
                                                   BALData* data,
                                                   ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_cols());
  Vector y(crs_jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
                                                  BALData* data,
                                                  ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_rows());
  Vector y(crs_jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}
#endif

}  // namespace ceres::internal
// Older versions of the benchmark library may lack the ::benchmark::Shutdown
// function. We provide an empty fallback variant so that both older and newer
// versions are supported: the unqualified call in main() picks up
// ::benchmark::Shutdown when it exists and this no-op otherwise.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
}  // namespace benchmark_shutdown_fallback
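// Example invocation (any google/benchmark flags may precede the BAL dataset
// paths; the filter expression and the file name below are placeholders):
//
//   ./evaluation_benchmark --benchmark_filter='Residuals.*' path_to_BAL_data.txt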
int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
  if (argc == 1) {
    LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
               << " [benchmark flags] path_to_BAL_data_1.txt ... "
                  "path_to_BAL_data_N.txt";
    return -1;
  }

  ceres::internal::ContextImpl context;
  context.EnsureMinimumThreads(16);
#ifndef CERES_NO_CUDA
  std::string message;
  context.InitCuda(&message);
#endif

  for (int i = 1; i < argc; ++i) {
    const std::string path(argv[i]);
    const std::string name_residuals = "Residuals<" + path + ">";
    benchmark_data.emplace_back(
        std::make_unique<ceres::internal::BALData>(path));
    auto data = benchmark_data.back().get();
    ::benchmark::RegisterBenchmark(
        name_residuals.c_str(), ceres::internal::Residuals, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
    ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                   ceres::internal::ResidualsAndJacobian,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_plus = "Plus<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_plus.c_str(), ceres::internal::Plus, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product =
        "JacobianRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product_partitioned_f =
        "PMVRightMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_f_cuda =
        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_right_product_partitioned_e =
        "PMVRightMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_e_cuda =
        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

    const std::string name_update_block_diagonal_ftf =
        "PMVUpdateBlockDiagonalFtF<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_pse =
        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_no_diag =
        "ISCRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyNoDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_update_block_diagonal_ete =
        "PMVUpdateBlockDiagonalEtE<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_diag =
        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_cuda =
        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_cuda.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_left_product =
        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_left_product_partitioned_f =
        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_f_cuda =
        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_left_product_partitioned_e =
        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_e_cuda =
        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
        data,
        &context);

    const std::string name_left_product_cuda =
        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_cuda.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_squared_column_norm =
        "JacobianSquaredColumnNorm<" + path + ">";
    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
                                   ceres::internal::JacobianSquaredColumnNorm,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
                                   ceres::internal::JacobianScaleColumns,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);

#ifndef CERES_NO_CUDA
    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
                                   ceres::internal::JacobianToCRSView,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
                                   ceres::internal::JacobianToCRSMatrix,
                                   data,
                                   &context);

    const std::string name_to_crs_view_update =
        "JacobianToCRSViewUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
                                   ceres::internal::JacobianToCRSViewUpdate,
                                   data,
                                   &context);

    const std::string name_to_crs_matrix_update =
        "JacobianToCRSMatrixUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
                                   ceres::internal::JacobianToCRSMatrixUpdate,
                                   data,
                                   &context);
#endif
  }

  ::benchmark::RunSpecifiedBenchmarks();

  // Unqualified lookup: resolves to ::benchmark::Shutdown when the library
  // provides it, and to the no-op fallback otherwise.
  using namespace ::benchmark;
  using namespace benchmark_shutdown_fallback;
  Shutdown();
  return 0;
}