// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)

#include <memory>
#include <random>
#include <string>

#include "Eigen/Dense"
#include "benchmark/benchmark.h"
#include "ceres/block_jacobi_preconditioner.h"
#include "ceres/block_sparse_matrix.h"
#include "ceres/context_impl.h"
#include "ceres/cuda_sparse_matrix.h"
#include "ceres/cuda_vector.h"
#include "ceres/fake_bundle_adjustment_jacobian.h"
#include "ceres/internal/config.h"
#include "ceres/internal/eigen.h"
#include "ceres/linear_solver.h"

#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
#endif

namespace ceres::internal {

constexpr int kNumCameras = 1000;
constexpr int kNumPoints = 10000;
constexpr int kCameraSize = 6;
constexpr int kPointSize = 3;
constexpr double kVisibility = 0.1;

constexpr int kNumRowBlocks = 100000;
constexpr int kNumColBlocks = 10000;
constexpr int kMinRowBlockSize = 1;
constexpr int kMaxRowBlockSize = 5;
constexpr int kMinColBlockSize = 1;
constexpr int kMaxColBlockSize = 15;
constexpr double kBlockDensity = 5.0 / kNumColBlocks;

static void BM_BlockSparseRightMultiplyAndAccumulateBA(
    benchmark::State& state) {
  const int num_threads = static_cast<int>(state.range(0));
  std::mt19937 prng;
  auto jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

  ContextImpl context;
  context.EnsureMinimumThreads(num_threads);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), &context, num_threads);
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
    ->Arg(8)
    ->Arg(16);

static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  const int num_threads = static_cast<int>(state.range(0));
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = kNumRowBlocks;
  options.num_col_blocks = kNumColBlocks;
  options.min_row_block_size = kMinRowBlockSize;
  options.min_col_block_size = kMinColBlockSize;
  options.max_row_block_size = kMaxRowBlockSize;
  options.max_col_block_size = kMaxColBlockSize;
  options.block_density = kBlockDensity;
  std::mt19937 prng;

  auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);

  ContextImpl context;
  context.EnsureMinimumThreads(num_threads);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), &context, num_threads);
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
    ->Arg(8)
    ->Arg(16);

static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  std::mt19937 prng;
  auto jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA);

static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = 100000;
  options.num_col_blocks = 10000;
  options.min_row_block_size = 1;
  options.min_col_block_size = 1;
  options.max_row_block_size = 10;
  options.max_col_block_size = 15;
  options.block_density = 5.0 / options.num_col_blocks;
  std::mt19937 prng;

  auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);

static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
  const int num_threads = static_cast<int>(state.range(0));
  std::mt19937 prng;
  auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);

  auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

  ContextImpl context;
  context.EnsureMinimumThreads(num_threads);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), &context, num_threads);
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
    ->Arg(8)
    ->Arg(16);

static void BM_CRSRightMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  const int num_threads = static_cast<int>(state.range(0));
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = kNumRowBlocks;
  options.num_col_blocks = kNumColBlocks;
  options.min_row_block_size = kMinRowBlockSize;
  options.min_col_block_size = kMinColBlockSize;
  options.max_row_block_size = kMaxRowBlockSize;
  options.max_col_block_size = kMaxColBlockSize;
  options.block_density = kBlockDensity;
  std::mt19937 prng;

  auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

  ContextImpl context;
  context.EnsureMinimumThreads(num_threads);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    jacobian->RightMultiplyAndAccumulate(
        x.data(), y.data(), &context, num_threads);
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
    ->Arg(1)
    ->Arg(2)
    ->Arg(4)
    ->Arg(8)
    ->Arg(16);

static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  std::mt19937 prng;
  // Perform setup here
  auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    // This code gets timed
    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA);

static void BM_CRSLeftMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = kNumRowBlocks;
  options.num_col_blocks = kNumColBlocks;
  options.min_row_block_size = kMinRowBlockSize;
  options.min_col_block_size = kMinColBlockSize;
  options.max_row_block_size = kMaxRowBlockSize;
  options.max_col_block_size = kMaxColBlockSize;
  options.block_density = kBlockDensity;
  std::mt19937 prng;

  auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();

  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();
  double sum = 0;
  for (auto _ : state) {
    // This code gets timed
    jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
    sum += y.norm();
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured);

#ifndef CERES_NO_CUDA
static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) {
  std::mt19937 prng;
  auto jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  ContextImpl context;
  std::string message;
  context.InitCuda(&message);
  auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  CudaVector cuda_x(&context, 0);
  CudaVector cuda_y(&context, 0);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA);

static void BM_CudaRightMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = kNumRowBlocks;
  options.num_col_blocks = kNumColBlocks;
  options.min_row_block_size = kMinRowBlockSize;
  options.min_col_block_size = kMinColBlockSize;
  options.max_row_block_size = kMaxRowBlockSize;
  options.max_col_block_size = kMaxColBlockSize;
  options.block_density = kBlockDensity;
  std::mt19937 prng;

  auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  ContextImpl context;
  std::string message;
  context.InitCuda(&message);
  auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  CudaVector cuda_x(&context, 0);
  CudaVector cuda_y(&context, 0);

  Vector x(jacobian->num_cols());
  Vector y(jacobian->num_rows());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured);

static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  std::mt19937 prng;
  auto jacobian = CreateFakeBundleAdjustmentJacobian(
      kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  ContextImpl context;
  std::string message;
  context.InitCuda(&message);
  auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  CudaVector cuda_x(&context, 0);
  CudaVector cuda_y(&context, 0);

  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA);

static void BM_CudaLeftMultiplyAndAccumulateUnstructured(
    benchmark::State& state) {
  BlockSparseMatrix::RandomMatrixOptions options;
  options.num_row_blocks = kNumRowBlocks;
  options.num_col_blocks = kNumColBlocks;
  options.min_row_block_size = kMinRowBlockSize;
  options.min_col_block_size = kMinColBlockSize;
  options.max_row_block_size = kMaxRowBlockSize;
  options.max_col_block_size = kMaxColBlockSize;
  options.block_density = kBlockDensity;
  std::mt19937 prng;

  auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  ContextImpl context;
  std::string message;
  context.InitCuda(&message);
  auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  CudaVector cuda_x(&context, 0);
  CudaVector cuda_y(&context, 0);

  Vector x(jacobian->num_rows());
  Vector y(jacobian->num_cols());
  x.setRandom();
  y.setRandom();

  cuda_x.CopyFromCpu(x);
  cuda_y.CopyFromCpu(y);
  double sum = 0;
  for (auto _ : state) {
    cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
    sum += cuda_y.Norm();
    CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  }
  CHECK_NE(sum, 0.0);
}

BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured);

#endif

}  // namespace ceres::internal

BENCHMARK_MAIN();