// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include <memory>
#include <random>
#include <string>
#include <vector>

#include "benchmark/benchmark.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/bundle_adjustment_test_util.h"
#include "ceres/cuda_block_sparse_crs_view.h"
#include "ceres/cuda_partitioned_block_sparse_crs_view.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/evaluator.h"
#include "ceres/implicit_schur_complement.h"
#include "ceres/partitioned_matrix_view.h"
#include "ceres/power_series_expansion_preconditioner.h"
#include "ceres/preprocessor.h"
#include "ceres/problem.h"
#include "ceres/problem_impl.h"
#include "ceres/program.h"
#include "ceres/sparse_matrix.h"

namespace ceres::internal {

// Downcasts a unique_ptr from Base to Derived. If the dynamic_cast fails,
// nullptr is returned (and the released object is leaked); every caller below
// CHECKs the result, so a type mismatch aborts immediately.
template <typename Derived, typename Base>
std::unique_ptr<Derived> downcast_unique_ptr(std::unique_ptr<Base>& base) {
  return std::unique_ptr<Derived>(dynamic_cast<Derived*>(base.release()));
}

// The benchmark library might invoke a benchmark function multiple times.
// In order to save the time required to parse BAL data, each dataset is
// loaded at most once.
// Each type of Jacobian is also cached after its first creation.
struct BALData {
  using PartitionedView = PartitionedMatrixView<2, 3, 9>;

  explicit BALData(const std::string& path) {
    bal_problem = std::make_unique<BundleAdjustmentProblem>(path);
    CHECK(bal_problem != nullptr);

    auto problem_impl = bal_problem->mutable_problem()->mutable_impl();
    auto preprocessor = Preprocessor::Create(MinimizerType::TRUST_REGION);

    preprocessed_problem = std::make_unique<PreprocessedProblem>();
    Solver::Options options = bal_problem->options();
    options.linear_solver_type = ITERATIVE_SCHUR;
    CHECK(preprocessor->Preprocess(
        options, problem_impl, preprocessed_problem.get()));

    auto program = preprocessed_problem->reduced_program.get();

    parameters.resize(program->NumParameters());
    program->ParameterBlocksToStateVector(parameters.data());

    // Fill the right-hand side b and the diagonal D with random values so
    // that the products computed in the benchmarks are non-trivial.
    const int num_residuals = program->NumResiduals();
    b.resize(num_residuals);
    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    for (int i = 0; i < num_residuals; ++i) {
      b[i] = rnorm(rng);
    }

    const int num_parameters = program->NumParameters();
    D.resize(num_parameters);
    for (int i = 0; i < num_parameters; ++i) {
      D[i] = rnorm(rng);
    }
  }

  std::unique_ptr<BlockSparseMatrix> CreateBlockSparseJacobian(
      ContextImpl* context, bool sequential) {
    auto problem = bal_problem->mutable_problem();
    auto problem_impl = problem->mutable_impl();
    CHECK(problem_impl != nullptr);

    Evaluator::Options options;
    options.linear_solver_type = ITERATIVE_SCHUR;
    options.num_threads = 1;
    options.context = context;
    options.num_eliminate_blocks = bal_problem->num_points();

    std::string error;
    auto program = preprocessed_problem->reduced_program.get();
    auto evaluator = Evaluator::Create(options, program, &error);
    CHECK(evaluator != nullptr);

    auto jacobian = evaluator->CreateJacobian();
    auto block_sparse = downcast_unique_ptr<BlockSparseMatrix>(jacobian);
    CHECK(block_sparse != nullptr);

    if (sequential) {
      // Renumber cell positions row block by row block so that the values of
      // each row block are stored contiguously.
      auto block_structure_sequential =
          std::make_unique<CompressedRowBlockStructure>(
              *block_sparse->block_structure());
      int num_nonzeros = 0;
      for (auto& row_block : block_structure_sequential->rows) {
        const int row_block_size = row_block.block.size;
        for (auto& cell : row_block.cells) {
          const int col_block_size =
              block_structure_sequential->cols[cell.block_id].size;
          cell.position = num_nonzeros;
          num_nonzeros += col_block_size * row_block_size;
        }
      }
      block_sparse = std::make_unique<BlockSparseMatrix>(
          block_structure_sequential.release(),
#ifndef CERES_NO_CUDA
          true
#else
          false
#endif
      );
    }

    std::mt19937 rng;
    std::normal_distribution<double> rnorm;
    const int nnz = block_sparse->num_nonzeros();
    auto values = block_sparse->mutable_values();
    for (int i = 0; i < nnz; ++i) {
      values[i] = rnorm(rng);
    }

    return block_sparse;
  }

  std::unique_ptr<CompressedRowSparseMatrix> CreateCompressedRowSparseJacobian(
      ContextImpl* context) {
    auto block_sparse = BlockSparseJacobian(context);
    return block_sparse->ToCompressedRowSparseMatrix();
  }

  const BlockSparseMatrix* BlockSparseJacobian(ContextImpl* context) {
    if (!block_sparse_jacobian) {
      block_sparse_jacobian = CreateBlockSparseJacobian(context, true);
    }
    return block_sparse_jacobian.get();
  }

  const BlockSparseMatrix* BlockSparseJacobianPartitioned(
      ContextImpl* context) {
    if (!block_sparse_jacobian_partitioned) {
      block_sparse_jacobian_partitioned =
          CreateBlockSparseJacobian(context, false);
    }
    return block_sparse_jacobian_partitioned.get();
  }

  const CompressedRowSparseMatrix* CompressedRowSparseJacobian(
      ContextImpl* context) {
    if (!crs_jacobian) {
      crs_jacobian = CreateCompressedRowSparseJacobian(context);
    }
    return crs_jacobian.get();
  }
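  // In the partitioned view of a bundle-adjustment Jacobian, the E columns
  // correspond to the point (eliminated) parameter blocks and the F columns
  // to the camera blocks; PartitionedMatrixView<2, 3, 9> matches the 2x3
  // point and 2x9 camera cells of BAL problems. The PMV* benchmarks below
  // exercise products with each part separately.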
  std::unique_ptr<PartitionedView> PartitionedMatrixViewJacobian(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    return std::make_unique<PartitionedView>(options, *block_sparse);
  }

  BlockSparseMatrix* BlockDiagonalEtE(const LinearSolver::Options& options) {
    if (!block_diagonal_ete) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ete = partitioned_view->CreateBlockDiagonalEtE();
    }
    return block_diagonal_ete.get();
  }

  BlockSparseMatrix* BlockDiagonalFtF(const LinearSolver::Options& options) {
    if (!block_diagonal_ftf) {
      auto partitioned_view = PartitionedMatrixViewJacobian(options);
      block_diagonal_ftf = partitioned_view->CreateBlockDiagonalFtF();
    }
    return block_diagonal_ftf.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithoutDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement->Init(*block_sparse, nullptr, b.data());
    return implicit_schur_complement.get();
  }

  const ImplicitSchurComplement* ImplicitSchurComplementWithDiagonal(
      const LinearSolver::Options& options) {
    auto block_sparse = BlockSparseJacobianPartitioned(options.context);
    implicit_schur_complement_diag =
        std::make_unique<ImplicitSchurComplement>(options);
    implicit_schur_complement_diag->Init(*block_sparse, D.data(), b.data());
    return implicit_schur_complement_diag.get();
  }

  Vector parameters;
  Vector D;
  Vector b;

  std::unique_ptr<BundleAdjustmentProblem> bal_problem;
  std::unique_ptr<PreprocessedProblem> preprocessed_problem;

  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian_partitioned;
  std::unique_ptr<BlockSparseMatrix> block_sparse_jacobian;
  std::unique_ptr<CompressedRowSparseMatrix> crs_jacobian;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ete;
  std::unique_ptr<BlockSparseMatrix> block_diagonal_ftf;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement;
  std::unique_ptr<ImplicitSchurComplement> implicit_schur_complement_diag;
};

static void Residuals(benchmark::State& state,
                      BALData* data,
                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              nullptr));
  }
}

static void ResidualsAndJacobian(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  double cost = 0.;
  Vector residuals = Vector::Zero(program->NumResiduals());
  auto jacobian = evaluator->CreateJacobian();

  Evaluator::EvaluateOptions eval_options;
  for (auto _ : state) {
    CHECK(evaluator->Evaluate(eval_options,
                              data->parameters.data(),
                              &cost,
                              residuals.data(),
                              nullptr,
                              jacobian.get()));
  }
}
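// Plus measures the cost of applying a random step delta to the parameter
// vector via Evaluator::Plus, i.e. the plus operations of the manifolds of
// the reduced program.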
static void Plus(benchmark::State& state, BALData* data, ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  Evaluator::Options options;
  options.linear_solver_type = SPARSE_NORMAL_CHOLESKY;
  options.num_threads = num_threads;
  options.context = context;
  options.num_eliminate_blocks = 0;

  std::string error;
  CHECK(data->preprocessed_problem != nullptr);
  auto program = data->preprocessed_problem->reduced_program.get();
  CHECK(program != nullptr);
  auto evaluator = Evaluator::Create(options, program, &error);
  CHECK(evaluator != nullptr);

  Vector state_plus_delta = Vector::Zero(program->NumParameters());
  Vector delta = Vector::Random(program->NumEffectiveParameters());

  for (auto _ : state) {
    CHECK(evaluator->Plus(
        data->parameters.data(), delta.data(), state_plus_delta.data()));
  }
  CHECK_GT(state_plus_delta.squaredNorm(), 0.);
}

static void PSEPreconditioner(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;

  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);
  Preconditioner::Options preconditioner_options(options);

  PowerSeriesExpansionPreconditioner preconditioner(
      jacobian, 10, 0, preconditioner_options);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    preconditioner.RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateF(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_f());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateF(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_f());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateF(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVRightMultiplyAndAccumulateE(benchmark::State& state,
                                           BALData* data,
                                           ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols_e());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateE(benchmark::State& state,
                                          BALData* data,
                                          ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);

  Vector y = Vector::Zero(jacobian->num_cols_e());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulateE(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}
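// The UpdateBlockDiagonal benchmarks measure recomputing the block-diagonal
// approximations of E^T E and F^T F that the implicit Schur complement and
// its preconditioners keep up to date.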
static void PMVUpdateBlockDiagonalEtE(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ete = data->BlockDiagonalEtE(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalEtE(block_diagonal_ete);
  }
}

static void PMVUpdateBlockDiagonalFtF(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto block_diagonal_ftf = data->BlockDiagonalFtF(options);

  for (auto _ : state) {
    jacobian->UpdateBlockDiagonalFtF(block_diagonal_ftf);
  }
}

// The ISC benchmarks measure one product with the implicit Schur complement,
// without and with the regularization diagonal D.
static void ISCRightMultiplyNoDiag(benchmark::State& state,
                                   BALData* data,
                                   ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithoutDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void ISCRightMultiplyDiag(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  LinearSolver::Options options;
  options.num_threads = static_cast<int>(state.range(0));
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  auto jacobian = data->ImplicitSchurComplementWithDiagonal(options);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(x.data(), y.data());
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianToCRS(benchmark::State& state,
                          BALData* data,
                          ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CompressedRowSparseMatrix> matrix;
  for (auto _ : state) {
    matrix = jacobian->ToCompressedRowSparseMatrix();
  }
  CHECK(matrix != nullptr);
}
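// The CUDA partitioned-view benchmarks below upload the matrix once and then
// time only the device-side products; host threading is irrelevant here, so
// num_threads is pinned to 1.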
#ifndef CERES_NO_CUDA
static void PMVRightMultiplyAndAccumulateFCuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_f());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateFCuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_f());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_f();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVRightMultiplyAndAccumulateECuda(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_cols_e());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_rows());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

static void PMVLeftMultiplyAndAccumulateECuda(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  LinearSolver::Options options;
  options.elimination_groups.push_back(data->bal_problem->num_points());
  options.context = context;
  options.num_threads = 1;
  auto jacobian = data->PartitionedMatrixViewJacobian(options);
  auto underlying_matrix = data->BlockSparseJacobianPartitioned(context);
  CudaPartitionedBlockSparseCRSView view(
      *underlying_matrix, jacobian->num_col_blocks_e(), context);

  Vector x = Vector::Random(jacobian->num_rows());
  CudaVector cuda_x(context, x.size());
  CudaVector cuda_y(context, jacobian->num_cols_e());

  cuda_x.CopyFromCpu(x);
  cuda_y.SetZero();

  auto matrix = view.matrix_e();
  for (auto _ : state) {
    matrix->LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  }
  CHECK_GT(cuda_y.Norm(), 0.);
}

// CudaBlockSparseCRSView should be no slower than an explicit conversion to
// CRS on the CPU.
static void JacobianToCRSView(benchmark::State& state,
                              BALData* data,
                              ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaBlockSparseCRSView> matrix;
  for (auto _ : state) {
    matrix = std::make_unique<CudaBlockSparseCRSView>(*jacobian, context);
  }
  CHECK(matrix != nullptr);
}

static void JacobianToCRSMatrix(benchmark::State& state,
                                BALData* data,
                                ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  std::unique_ptr<CudaSparseMatrix> matrix;
  std::unique_ptr<CompressedRowSparseMatrix> matrix_cpu;
  for (auto _ : state) {
    matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
    matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  }
  CHECK(matrix != nullptr);
}

// Updating values in CudaBlockSparseCRSView should be roughly as fast as
// copying the values directly (the time spent permuting values has to be
// hidden by the PCIe transfer).
static void JacobianToCRSViewUpdate(benchmark::State& state,
                                    BALData* data,
                                    ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix = CudaBlockSparseCRSView(*jacobian, context);
  for (auto _ : state) {
    matrix.UpdateValues(*jacobian);
  }
}
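// Baseline for the benchmark above: a plain host-to-device copy of values
// that are already in CRS order.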
static void JacobianToCRSMatrixUpdate(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  auto jacobian = data->BlockSparseJacobian(context);

  auto matrix_cpu = jacobian->ToCompressedRowSparseMatrix();
  auto matrix = std::make_unique<CudaSparseMatrix>(context, *matrix_cpu);
  for (auto _ : state) {
    CHECK_EQ(cudaSuccess,
             cudaMemcpy(matrix->mutable_values(),
                        matrix_cpu->values(),
                        matrix->num_nonzeros() * sizeof(double),
                        cudaMemcpyHostToDevice));
  }
}
#endif

static void JacobianSquaredColumnNorm(benchmark::State& state,
                                      BALData* data,
                                      ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector x = Vector::Zero(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->SquaredColumnNorm(x.data(), context, num_threads);
  }
  CHECK_GT(x.squaredNorm(), 0.);
}

static void JacobianScaleColumns(benchmark::State& state,
                                 BALData* data,
                                 ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian_const = data->BlockSparseJacobian(context);
  // ScaleColumns mutates the matrix; scaling by ones leaves the cached
  // Jacobian unchanged while still exercising the code path.
  auto jacobian = const_cast<BlockSparseMatrix*>(jacobian_const);

  Vector x = Vector::Ones(jacobian->num_cols());
  for (auto _ : state) {
    jacobian->ScaleColumns(x.data(), context, num_threads);
  }
}

static void JacobianRightMultiplyAndAccumulate(benchmark::State& state,
                                               BALData* data,
                                               ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_rows());
  Vector x = Vector::Random(jacobian->num_cols());

  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

static void JacobianLeftMultiplyAndAccumulate(benchmark::State& state,
                                              BALData* data,
                                              ContextImpl* context) {
  const int num_threads = static_cast<int>(state.range(0));

  auto jacobian = data->BlockSparseJacobian(context);

  Vector y = Vector::Zero(jacobian->num_cols());
  Vector x = Vector::Random(jacobian->num_rows());

  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(
        x.data(), y.data(), context, num_threads);
  }
  CHECK_GT(y.squaredNorm(), 0.);
}

#ifndef CERES_NO_CUDA
static void JacobianRightMultiplyAndAccumulateCuda(benchmark::State& state,
                                                   BALData* data,
                                                   ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_cols());
  Vector y(crs_jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

static void JacobianLeftMultiplyAndAccumulateCuda(benchmark::State& state,
                                                  BALData* data,
                                                  ContextImpl* context) {
  auto crs_jacobian = data->CompressedRowSparseJacobian(context);
  CudaSparseMatrix cuda_jacobian(context, *crs_jacobian);
  CudaVector cuda_x(context, 0);
  CudaVector cuda_y(context, 0);

  Vector x(crs_jacobian->num_rows());
  Vector y(crs_jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}
#endif

}  // namespace ceres::internal
// Older versions of the benchmark library may lack the ::benchmark::Shutdown
// function. We provide an empty fallback variant so that both older and newer
// versions are supported: the unqualified call in main() picks up
// ::benchmark::Shutdown when it exists and this no-op otherwise.
namespace benchmark_shutdown_fallback {
template <typename... Args>
void Shutdown(Args... args) {}
}  // namespace benchmark_shutdown_fallback
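// Example invocation (any google/benchmark flags may precede the BAL dataset
// paths; the filter expression and the file name below are placeholders):
//
//   ./evaluation_benchmark --benchmark_filter='Residuals.*' path_to_BAL_data.txt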
int main(int argc, char** argv) {
  ::benchmark::Initialize(&argc, argv);

  std::vector<std::unique_ptr<ceres::internal::BALData>> benchmark_data;
  if (argc == 1) {
    LOG(FATAL) << "No input datasets specified. Usage: " << argv[0]
               << " [benchmark flags] path_to_BAL_data_1.txt ... "
                  "path_to_BAL_data_N.txt";
    return -1;
  }

  ceres::internal::ContextImpl context;
  context.EnsureMinimumThreads(16);
#ifndef CERES_NO_CUDA
  std::string message;
  context.InitCuda(&message);
#endif

  for (int i = 1; i < argc; ++i) {
    const std::string path(argv[i]);
    const std::string name_residuals = "Residuals<" + path + ">";
    benchmark_data.emplace_back(
        std::make_unique<ceres::internal::BALData>(path));
    auto data = benchmark_data.back().get();
    ::benchmark::RegisterBenchmark(
        name_residuals.c_str(), ceres::internal::Residuals, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_jacobians = "ResidualsAndJacobian<" + path + ">";
    ::benchmark::RegisterBenchmark(name_jacobians.c_str(),
                                   ceres::internal::ResidualsAndJacobian,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_plus = "Plus<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_plus.c_str(), ceres::internal::Plus, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product =
        "JacobianRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_right_product_partitioned_f =
        "PMVRightMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_f_cuda =
        "PMVRightMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_right_product_partitioned_e =
        "PMVRightMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_partitioned_e_cuda =
        "PMVRightMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVRightMultiplyAndAccumulateECuda,
        data,
        &context);
#endif

    const std::string name_update_block_diagonal_ftf =
        "PMVUpdateBlockDiagonalFtF<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ftf.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalFtF,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_pse =
        "PSEPreconditionerRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_pse.c_str(), ceres::internal::PSEPreconditioner, data, &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_no_diag =
        "ISCRightMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_no_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyNoDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_update_block_diagonal_ete =
        "PMVUpdateBlockDiagonalEtE<" + path + ">";
    ::benchmark::RegisterBenchmark(name_update_block_diagonal_ete.c_str(),
                                   ceres::internal::PMVUpdateBlockDiagonalEtE,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_isc_diag =
        "ISCRightMultiplyAndAccumulateDiag<" + path + ">";
    ::benchmark::RegisterBenchmark(name_isc_diag.c_str(),
                                   ceres::internal::ISCRightMultiplyDiag,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_right_product_cuda =
        "JacobianRightMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_right_product_cuda.c_str(),
        ceres::internal::JacobianRightMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_left_product =
        "JacobianLeftMultiplyAndAccumulate<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulate,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_left_product_partitioned_f =
        "PMVLeftMultiplyAndAccumulateF<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateF,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_f_cuda =
        "PMVLeftMultiplyAndAccumulateFCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_f_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateFCuda,
        data,
        &context);
#endif

    const std::string name_left_product_partitioned_e =
        "PMVLeftMultiplyAndAccumulateE<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateE,
        data,
        &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

#ifndef CERES_NO_CUDA
    const std::string name_left_product_partitioned_e_cuda =
        "PMVLeftMultiplyAndAccumulateECuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_partitioned_e_cuda.c_str(),
        ceres::internal::PMVLeftMultiplyAndAccumulateECuda,
        data,
        &context);

    const std::string name_left_product_cuda =
        "JacobianLeftMultiplyAndAccumulateCuda<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_left_product_cuda.c_str(),
        ceres::internal::JacobianLeftMultiplyAndAccumulateCuda,
        data,
        &context)
        ->Arg(1);
#endif

    const std::string name_squared_column_norm =
        "JacobianSquaredColumnNorm<" + path + ">";
    ::benchmark::RegisterBenchmark(name_squared_column_norm.c_str(),
                                   ceres::internal::JacobianSquaredColumnNorm,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_scale_columns = "JacobianScaleColumns<" + path + ">";
    ::benchmark::RegisterBenchmark(name_scale_columns.c_str(),
                                   ceres::internal::JacobianScaleColumns,
                                   data,
                                   &context)
        ->Arg(1)
        ->Arg(2)
        ->Arg(4)
        ->Arg(8)
        ->Arg(16);

    const std::string name_to_crs = "JacobianToCRS<" + path + ">";
    ::benchmark::RegisterBenchmark(
        name_to_crs.c_str(), ceres::internal::JacobianToCRS, data, &context);

#ifndef CERES_NO_CUDA
    const std::string name_to_crs_view = "JacobianToCRSView<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view.c_str(),
                                   ceres::internal::JacobianToCRSView,
                                   data,
                                   &context);
    const std::string name_to_crs_matrix = "JacobianToCRSMatrix<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix.c_str(),
                                   ceres::internal::JacobianToCRSMatrix,
                                   data,
                                   &context);

    const std::string name_to_crs_view_update =
        "JacobianToCRSViewUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_view_update.c_str(),
                                   ceres::internal::JacobianToCRSViewUpdate,
                                   data,
                                   &context);

    const std::string name_to_crs_matrix_update =
        "JacobianToCRSMatrixUpdate<" + path + ">";
    ::benchmark::RegisterBenchmark(name_to_crs_matrix_update.c_str(),
                                   ceres::internal::JacobianToCRSMatrixUpdate,
                                   data,
                                   &context);
#endif
  }

  ::benchmark::RunSpecifiedBenchmarks();

  // Unqualified lookup: resolves to ::benchmark::Shutdown when the library
  // provides it, and to the no-op fallback otherwise.
  using namespace ::benchmark;
  using namespace benchmark_shutdown_fallback;
  Shutdown();
  return 0;
}