// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: sameeragarwal@google.com (Sameer Agarwal)

#include "ceres/block_sparse_matrix.h"

#include <algorithm>
#include <cstddef>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#include "ceres/block_structure.h"
#include "ceres/crs_matrix.h"
#include "ceres/internal/eigen.h"
#include "ceres/parallel_for.h"
#include "ceres/parallel_vector_ops.h"
#include "ceres/small_blas.h"
#include "ceres/triplet_sparse_matrix.h"
#include "glog/logging.h"

#ifndef CERES_NO_CUDA
#include "cuda_runtime.h"
#endif

namespace ceres::internal {

namespace {
void ComputeCumulativeNumberOfNonZeros(std::vector<CompressedList>& rows) {
  if (rows.empty()) {
    return;
  }
  rows[0].cumulative_nnz = rows[0].nnz;
  for (int c = 1; c < rows.size(); ++c) {
    const int curr_nnz = rows[c].nnz;
    rows[c].cumulative_nnz = curr_nnz + rows[c - 1].cumulative_nnz;
  }
}

// Allocates a CompressedRowSparseMatrix with the given dimensions and number
// of non-zeros, and fills in its row-offset and column-index arrays from
// block_structure. The values array of the result is not written here; the
// values argument is accepted but not read.
//
// When transpose is true, block_structure is expected to be a transposed block
// structure whose per-row-block nnz field is already filled in; otherwise the
// number of non-zeros per row is recomputed from the cells.
template <bool transpose>
std::unique_ptr<CompressedRowSparseMatrix>
CreateStructureOfCompressedRowSparseMatrix(
    const double* values,
    int num_rows,
    int num_cols,
    int num_nonzeros,
    const CompressedRowBlockStructure* block_structure) {
  auto crs_matrix = std::make_unique<CompressedRowSparseMatrix>(
      num_rows, num_cols, num_nonzeros);
  auto col_cursor = crs_matrix->mutable_cols();
  auto row_cursor = crs_matrix->mutable_rows();
  const auto& col_blocks = block_structure->cols;

  // Number of scalar non-zeros emitted so far; doubles as the next row offset.
  int nnz_so_far = 0;
  *row_cursor++ = 0;
  for (const auto& row_block : block_structure->rows) {
    const auto& cells = row_block.cells;
    // Empty row block: only requires setting row offsets
    if (cells.empty()) {
      std::fill_n(row_cursor, row_block.block.size, nnz_so_far);
      row_cursor += row_block.block.size;
      continue;
    }

    int nnz_per_row = 0;
    if constexpr (transpose) {
      // Transposed block structure comes with nnz in row-block filled-in
      nnz_per_row = row_block.nnz / row_block.block.size;
    } else {
      // Nnz field of non-transposed block structure is not filled and it can
      // have non-sequential structure (consider the case of jacobian for
      // Schur-complement solver: E and F blocks are stored separately).
      for (const auto& cell : cells) {
        nnz_per_row += col_blocks[cell.block_id].size;
      }
    }

    // Every scalar row of the block gets the same run of column indices: the
    // concatenation of the column ranges of all cells in this row block.
    for (int r = 0; r < row_block.block.size; ++r) {
      nnz_so_far += nnz_per_row;
      *row_cursor++ = nnz_so_far;
      for (const auto& cell : cells) {
        const int num_scalar_cols = col_blocks[cell.block_id].size;
        const int first_scalar_col = col_blocks[cell.block_id].position;
        std::iota(col_cursor, col_cursor + num_scalar_cols, first_scalar_col);
        col_cursor += num_scalar_cols;
      }
    }
  }
  return crs_matrix;
}

// Copies the cell values referenced by block_structure from values into the
// values array of crs_matrix. The row offsets already stored in crs_matrix are
// used to determine how many non-zeros each scalar row holds, so the structure
// of crs_matrix must have been set up beforehand.
//
// When transpose is true, block_structure is the transposed block structure
// and every cell is transposed while it is copied.
template <bool transpose>
void UpdateCompressedRowSparseMatrixImpl(
    CompressedRowSparseMatrix* crs_matrix,
    const double* values,
    const CompressedRowBlockStructure* block_structure) {
  auto dst_values = crs_matrix->mutable_values();
  auto row_offsets = crs_matrix->mutable_rows();
  const auto& col_blocks = block_structure->cols;
  for (const auto& row_block : block_structure->rows) {
    const int num_scalar_rows = row_block.block.size;
    // Non-zeros per scalar row, read from the first row of this row block.
    const int nnz_per_row = row_offsets[1] - row_offsets[0];
    row_offsets += num_scalar_rows;

    if (nnz_per_row == 0) {
      continue;
    }

    // View the destination values of this row block as a
    // num_scalar_rows x nnz_per_row matrix and fill it cell by cell.
    MatrixRef dst_row_block(dst_values, num_scalar_rows, nnz_per_row);
    int first_col = 0;
    for (const auto& cell : row_block.cells) {
      const int num_scalar_cols = col_blocks[cell.block_id].size;
      auto dst_cell =
          dst_row_block.block(0, first_col, num_scalar_rows, num_scalar_cols);
      if constexpr (transpose) {
        // Transposed matrix is filled using transposed block-structure: the
        // stored cell has num_scalar_cols rows and num_scalar_rows columns.
        ConstMatrixRef src_cell(
            values + cell.position, num_scalar_cols, num_scalar_rows);
        dst_cell = src_cell.transpose();
      } else {
        ConstMatrixRef src_cell(
            values + cell.position, num_scalar_rows, num_scalar_cols);
        dst_cell = src_cell;
      }
      first_col += num_scalar_cols;
    }
    dst_values += nnz_per_row * num_scalar_rows;
  }
}

// Copies the row and column block descriptions of block_structure into the
// row_blocks / col_blocks vectors of crs_matrix.
void SetBlockStructureOfCompressedRowSparseMatrix(
    CompressedRowSparseMatrix* crs_matrix,
    CompressedRowBlockStructure* block_structure) {
  const auto& src_rows = block_structure->rows;
  auto& dst_row_blocks = *crs_matrix->mutable_row_blocks();
  dst_row_blocks.resize(src_rows.size());
  // Only the block (size and position) of every compressed row is kept.
  std::transform(src_rows.begin(),
                 src_rows.end(),
                 dst_row_blocks.begin(),
                 [](const auto& row) { return row.block; });

  *crs_matrix->mutable_col_blocks() = block_structure->cols;
}

}  // namespace

// Builds a matrix with the given block structure. The pointer is stored in
// block_structure_ and must not be null. The matrix dimensions and number of
// non-zeros are derived from the structure, the values array is allocated
// (page-locked if use_page_locked_memory is set) and the transposed block
// structure is created. The values themselves are not initialized here.
BlockSparseMatrix::BlockSparseMatrix(
    CompressedRowBlockStructure* block_structure, bool use_page_locked_memory)
    : use_page_locked_memory_(use_page_locked_memory),
      num_rows_(0),
      num_cols_(0),
      num_nonzeros_(0),
      block_structure_(block_structure) {
  CHECK(block_structure_ != nullptr);

  // The number of columns is the sum of all column block sizes.
  const auto& col_blocks = block_structure_->cols;
  for (const auto& col_block : col_blocks) {
    num_cols_ += col_block.size;
  }

  // Rows are the sum of row block sizes; every cell contributes
  // (row block size) x (column block size) non-zero entries.
  for (const auto& row : block_structure_->rows) {
    const int row_block_size = row.block.size;
    num_rows_ += row_block_size;
    for (const auto& cell : row.cells) {
      const int col_block_size = col_blocks[cell.block_id].size;
      num_nonzeros_ += col_block_size * row_block_size;
    }
  }

  CHECK_GE(num_rows_, 0);
  CHECK_GE(num_cols_, 0);
  CHECK_GE(num_nonzeros_, 0);
  VLOG(2) << "Allocating values array with " << num_nonzeros_ * sizeof(double)
          << " bytes.";  // NOLINT

  values_ = AllocateValues(num_nonzeros_);
  max_num_nonzeros_ = num_nonzeros_;
  CHECK(values_ != nullptr);
  AddTransposeBlockStructure();
}

// Releases the values array through FreeValues, which selects the
// deallocation routine based on use_page_locked_memory_.
BlockSparseMatrix::~BlockSparseMatrix() { FreeValues(values_); }

// Creates the transposed block structure from block_structure_ unless one is
// already present, in which case this is a no-op.
void BlockSparseMatrix::AddTransposeBlockStructure() {
  if (transpose_block_structure_ != nullptr) {
    return;
  }
  transpose_block_structure_ = CreateTranspose(*block_structure_);
}

// Sets the first num_nonzeros_ entries of the values array to zero.
void BlockSparseMatrix::SetZero() {
  std::fill_n(values_, num_nonzeros_, 0.0);
}

// Zeroes the first num_nonzeros_ entries of the values array by delegating to
// ParallelSetZero with the given context and thread count.
void BlockSparseMatrix::SetZero(ContextImpl* context, int num_threads) {
  ParallelSetZero(context, num_threads, values_, num_nonzeros_);
}

// Convenience overload: forwards to the context-taking overload with a null
// context and a single thread.
void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
                                                   double* y) const {
  RightMultiplyAndAccumulate(x, y, nullptr, 1);
}

void BlockSparseMatrix::RightMultiplyAndAccumulate(const double* x,
                                                   double* y,
                                                   ContextImpl* context,
                                                   int num_threads) const {
  CHECK(x != nullptr);
  CHECK(y != nullptr);

  const auto values = values_;
  const auto block_structure = block_structure_.get();
  const auto num_row_blocks = block_structure->rows.size();

  ParallelFor(context,
              0,
              num_row_blocks,
              num_threads,
              [values, block_structure, x, y](int row_block_id) {
                const int row_block_pos =
                    block_structure->rows[row_block_id].block.position;
                const int row_block_size =
                    block_structure->rows[row_block_id].block.size;
                const auto& cells = block_structure->rows[row_block_id].cells;
                for (const auto& cell : cells) {
                  const int col_block_id = cell.block_id;
                  const int col_block_size =
                      block_structure->cols[col_block_id].size;
                  const int col_block_pos =
                      block_structure->cols[col_block_id].position;
                  MatrixVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
                      values + cell.position,
                      row_block_size,
                      col_block_size,
                      x + col_block_pos,
                      y + row_block_pos);
                }
              });
}

// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
//
// Multiplies the transpose of the matrix by x and accumulates the result into
// y. Both x and y must be non-null. Falls back to the serial overload when no
// transposed block structure is available or a single thread is requested.
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
                                                  double* y,
                                                  ContextImpl* context,
                                                  int num_threads) const {
  // While utilizing transposed structure allows to perform parallel
  // left-multiplication by dense vector, it makes access patterns to matrix
  // elements scattered. Thus, multiplication using transposed structure
  // is only useful for parallel execution
  CHECK(x != nullptr);
  CHECK(y != nullptr);
  const bool use_transpose =
      transpose_block_structure_ != nullptr && num_threads != 1;
  if (!use_transpose) {
    LeftMultiplyAndAccumulate(x, y);
    return;
  }

  auto transpose_bs = transpose_block_structure_.get();
  const auto values = values_;
  const int num_col_blocks = transpose_bs->rows.size();
  if (num_col_blocks == 0) {
    return;
  }

  // Use non-zero count as iteration cost for guided parallel-for loop
  ParallelFor(
      context,
      0,
      num_col_blocks,
      num_threads,
      [values, transpose_bs, x, y](int row_block_id) {
        // A row block of the transposed structure is a column block of the
        // original matrix, so each iteration owns a distinct slice of y.
        const auto& transpose_row = transpose_bs->rows[row_block_id];
        const int row_block_size = transpose_row.block.size;
        double* y_block = y + transpose_row.block.position;

        for (const auto& cell : transpose_row.cells) {
          const auto& transpose_col = transpose_bs->cols[cell.block_id];
          const int col_block_size = transpose_col.size;
          // The cell is stored with col_block_size rows and row_block_size
          // columns, hence the swapped dimension arguments.
          MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
              values + cell.position,
              col_block_size,
              row_block_size,
              x + transpose_col.position,
              y_block);
        }
      },
      transpose_bs->rows.data(),
      [](const CompressedRow& row) { return row.cumulative_nnz; });
}

// Serial version: multiplies the transpose of the matrix by x and accumulates
// the result into y. Both x and y must be non-null.
void BlockSparseMatrix::LeftMultiplyAndAccumulate(const double* x,
                                                  double* y) const {
  CHECK(x != nullptr);
  CHECK(y != nullptr);
  // Single-threaded left products are always computed using a non-transpose
  // block structure, because it has linear access pattern to matrix elements
  const auto& col_blocks = block_structure_->cols;
  for (const auto& row : block_structure_->rows) {
    const int row_block_size = row.block.size;
    const double* x_block = x + row.block.position;
    for (const auto& cell : row.cells) {
      const auto& col_block = col_blocks[cell.block_id];
      const int col_block_size = col_block.size;
      MatrixTransposeVectorMultiply<Eigen::Dynamic, Eigen::Dynamic, 1>(
          values_ + cell.position,
          row_block_size,
          col_block_size,
          x_block,
          y + col_block.position);
    }
  }
}

// Serial version: overwrites the first num_cols_ entries of x with the squared
// Euclidean norm of every column of the matrix. x must be non-null.
void BlockSparseMatrix::SquaredColumnNorm(double* x) const {
  CHECK(x != nullptr);
  VectorRef(x, num_cols_).setZero();
  const auto& col_blocks = block_structure_->cols;
  for (const auto& row : block_structure_->rows) {
    const int row_block_size = row.block.size;
    for (const auto& cell : row.cells) {
      const auto& col_block = col_blocks[cell.block_id];
      const int col_block_size = col_block.size;
      const MatrixRef m(
          values_ + cell.position, row_block_size, col_block_size);
      // Add this cell's per-column squared norms to the matching slice of x.
      VectorRef(x + col_block.position, col_block_size) +=
          m.colwise().squaredNorm();
    }
  }
}

// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
void BlockSparseMatrix::SquaredColumnNorm(double* x,
                                          ContextImpl* context,
                                          int num_threads) const {
  if (transpose_block_structure_ == nullptr || num_threads == 1) {
    SquaredColumnNorm(x);
    return;
  }

  CHECK(x != nullptr);
  ParallelSetZero(context, num_threads, x, num_cols_);

  auto transpose_bs = transpose_block_structure_.get();
  const auto values = values_;
  const int num_col_blocks = transpose_bs->rows.size();
  ParallelFor(
      context,
      0,
      num_col_blocks,
      num_threads,
      [values, transpose_bs, x](int row_block_id) {
        const auto& row = transpose_bs->rows[row_block_id];

        for (auto& cell : row.cells) {
          const auto& col = transpose_bs->cols[cell.block_id];
          const MatrixRef m(values + cell.position, col.size, row.block.size);
          VectorRef(x + row.block.position, row.block.size) +=
              m.colwise().squaredNorm();
        }
      },
      transpose_bs->rows.data(),
      [](const CompressedRow& row) { return row.cumulative_nnz; });
}

void BlockSparseMatrix::ScaleColumns(const double* scale) {
  CHECK(scale != nullptr);

  for (int i = 0; i < block_structure_->rows.size(); ++i) {
    int row_block_size = block_structure_->rows[i].block.size;
    auto& cells = block_structure_->rows[i].cells;
    for (const auto& cell : cells) {
      int col_block_id = cell.block_id;
      int col_block_size = block_structure_->cols[col_block_id].size;
      int col_block_pos = block_structure_->cols[col_block_id].position;
      MatrixRef m(values_ + cell.position, row_block_size, col_block_size);
      m *= ConstVectorRef(scale + col_block_pos, col_block_size).asDiagonal();
    }
  }
}

// TODO(https://github.com/ceres-solver/ceres-solver/issues/933): This method
// might benefit from caching column-block partition
void BlockSparseMatrix::ScaleColumns(const double* scale,
                                     ContextImpl* context,
                                     int num_threads) {
  if (transpose_block_structure_ == nullptr || num_threads == 1) {
    ScaleColumns(scale);
    return;
  }

  CHECK(scale != nullptr);
  auto transpose_bs = transpose_block_structure_.get();
  auto values = values_;
  const int num_col_blocks = transpose_bs->rows.size();
  ParallelFor(
      context,
      0,
      num_col_blocks,
      num_threads,
      [values, transpose_bs, scale](int row_block_id) {
        const auto& row = transpose_bs->rows[row_block_id];

        for (auto& cell : row.cells) {
          const auto& col = transpose_bs->cols[cell.block_id];
          MatrixRef m(values + cell.position, col.size, row.block.size);
          m *= ConstVectorRef(scale + row.block.position, row.block.size)
                   .asDiagonal();
        }
      },
      transpose_bs->rows.data(),
      [](const CompressedRow& row) { return row.cumulative_nnz; });
}
// Returns a newly allocated CompressedRowSparseMatrix holding the transpose of
// this matrix (num_cols_ rows, num_rows_ columns), built from the transposed
// block structure.
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrixTranspose() const {
  auto* const transpose_bs = transpose_block_structure_.get();

  // Sparsity pattern first, then block annotations, then the values.
  auto transposed = CreateStructureOfCompressedRowSparseMatrix<true>(
      values(), num_cols_, num_rows_, num_nonzeros_, transpose_bs);
  SetBlockStructureOfCompressedRowSparseMatrix(transposed.get(), transpose_bs);
  UpdateCompressedRowSparseMatrixTranspose(transposed.get());
  return transposed;
}

// Returns a newly allocated CompressedRowSparseMatrix with the same dimensions,
// block structure and values as this matrix.
std::unique_ptr<CompressedRowSparseMatrix>
BlockSparseMatrix::ToCompressedRowSparseMatrix() const {
  auto* const bs = block_structure_.get();

  // Sparsity pattern first, then block annotations, then the values.
  auto converted = CreateStructureOfCompressedRowSparseMatrix<false>(
      values(), num_rows_, num_cols_, num_nonzeros_, bs);
  SetBlockStructureOfCompressedRowSparseMatrix(converted.get(), bs);
  UpdateCompressedRowSparseMatrix(converted.get());
  return converted;
}

// Overwrites the values of crs_matrix with the values of the transpose of this
// matrix. crs_matrix must be non-null and must already have the dimensions of
// the transpose and the same number of non-zeros as this matrix.
void BlockSparseMatrix::UpdateCompressedRowSparseMatrixTranspose(
    CompressedRowSparseMatrix* crs_matrix) const {
  CHECK(crs_matrix != nullptr);
  CHECK_EQ(crs_matrix->num_rows(), num_cols_);
  CHECK_EQ(crs_matrix->num_cols(), num_rows_);
  CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);

  const auto* transpose_bs = transpose_block_structure_.get();
  UpdateCompressedRowSparseMatrixImpl<true>(crs_matrix, values(), transpose_bs);
}
// Overwrites the values of crs_matrix with the values of this matrix.
// crs_matrix must be non-null and must already have the same dimensions and
// number of non-zeros as this matrix.
void BlockSparseMatrix::UpdateCompressedRowSparseMatrix(
    CompressedRowSparseMatrix* crs_matrix) const {
  CHECK(crs_matrix != nullptr);
  CHECK_EQ(crs_matrix->num_rows(), num_rows_);
  CHECK_EQ(crs_matrix->num_cols(), num_cols_);
  CHECK_EQ(crs_matrix->num_nonzeros(), num_nonzeros_);

  const auto* bs = block_structure_.get();
  UpdateCompressedRowSparseMatrixImpl<false>(crs_matrix, values(), bs);
}

// Resizes dense_matrix to num_rows_ x num_cols_, zeroes it and adds every
// stored cell at its position. dense_matrix must be non-null.
void BlockSparseMatrix::ToDenseMatrix(Matrix* dense_matrix) const {
  CHECK(dense_matrix != nullptr);

  dense_matrix->resize(num_rows_, num_cols_);
  dense_matrix->setZero();

  const auto& col_blocks = block_structure_->cols;
  for (const auto& row : block_structure_->rows) {
    const int row_block_pos = row.block.position;
    const int row_block_size = row.block.size;
    for (const auto& cell : row.cells) {
      const auto& col_block = col_blocks[cell.block_id];
      const int col_block_size = col_block.size;
      const int col_block_pos = col_block.position;
      // Cells are accumulated rather than assigned, so cells that refer to the
      // same (row block, column block) pair are summed.
      dense_matrix->block(
          row_block_pos, col_block_pos, row_block_size, col_block_size) +=
          MatrixRef(values_ + cell.position, row_block_size, col_block_size);
    }
  }
}

// Writes this matrix into matrix in triplet form. matrix must be non-null. It
// is reserved for num_nonzeros_ entries, resized to num_rows_ x num_cols_ and
// zeroed; every stored value is then written at the triplet index equal to its
// offset in the values array.
void BlockSparseMatrix::ToTripletSparseMatrix(
    TripletSparseMatrix* matrix) const {
  CHECK(matrix != nullptr);

  matrix->Reserve(num_nonzeros_);
  matrix->Resize(num_rows_, num_cols_);
  matrix->SetZero();

  const auto& col_blocks = block_structure_->cols;
  for (const auto& row : block_structure_->rows) {
    const int row_block_pos = row.block.position;
    const int row_block_size = row.block.size;
    for (const auto& cell : row.cells) {
      const auto& col_block = col_blocks[cell.block_id];
      const int col_block_size = col_block.size;
      const int col_block_pos = col_block.position;
      // Walk the cell in storage order: the column index varies fastest.
      int jac_pos = cell.position;
      for (int r = 0; r < row_block_size; ++r) {
        const int scalar_row = row_block_pos + r;
        for (int c = 0; c < col_block_size; ++c, ++jac_pos) {
          matrix->mutable_rows()[jac_pos] = scalar_row;
          matrix->mutable_cols()[jac_pos] = col_block_pos + c;
          matrix->mutable_values()[jac_pos] = values_[jac_pos];
        }
      }
    }
  }
  matrix->set_num_nonzeros(num_nonzeros_);
}

// Return a pointer to the block structure. We continue to hold
// ownership of the object though, so the caller must not delete it.
const CompressedRowBlockStructure* BlockSparseMatrix::block_structure() const {
  return block_structure_.get();
}

// Return a pointer to the block structure of matrix transpose. We continue to
// hold ownership of the object though, so the caller must not delete it. The
// result is null if no transposed block structure has been created.
const CompressedRowBlockStructure*
BlockSparseMatrix::transpose_block_structure() const {
  return transpose_block_structure_.get();
}

// Writes every stored entry to file as one "row column value" line, walking
// row blocks, then cells, then the entries of each cell in storage order.
// file must be non-null.
void BlockSparseMatrix::ToTextFile(FILE* file) const {
  CHECK(file != nullptr);
  const auto& col_blocks = block_structure_->cols;
  for (const auto& row : block_structure_->rows) {
    const int row_block_pos = row.block.position;
    const int row_block_size = row.block.size;
    for (const auto& cell : row.cells) {
      const auto& col_block = col_blocks[cell.block_id];
      const int col_block_size = col_block.size;
      const int col_block_pos = col_block.position;
      const double* next_value = values_ + cell.position;
      for (int r = 0; r < row_block_size; ++r) {
        const int scalar_row = row_block_pos + r;
        for (int c = 0; c < col_block_size; ++c, ++next_value) {
          fprintf(file,
                  "% 10d % 10d %17f\n",
                  scalar_row,
                  col_block_pos + c,
                  *next_value);
        }
      }
    }
  }
}

std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateDiagonalMatrix(
    const double* diagonal, const std::vector<Block>& column_blocks) {
  // Create the block structure for the diagonal matrix.
  auto* bs = new CompressedRowBlockStructure();
  bs->cols = column_blocks;
  int position = 0;
  bs->rows.resize(column_blocks.size(), CompressedRow(1));
  for (int i = 0; i < column_blocks.size(); ++i) {
    CompressedRow& row = bs->rows[i];
    row.block = column_blocks[i];
    Cell& cell = row.cells[0];
    cell.block_id = i;
    cell.position = position;
    position += row.block.size * row.block.size;
  }

  // Create the BlockSparseMatrix with the given block structure.
  auto matrix = std::make_unique<BlockSparseMatrix>(bs);
  matrix->SetZero();

  // Fill the values array of the block sparse matrix.
  double* values = matrix->mutable_values();
  for (const auto& column_block : column_blocks) {
    const int size = column_block.size;
    for (int j = 0; j < size; ++j) {
      // (j + 1) * size is compact way of accessing the (j,j) entry.
      values[j * (size + 1)] = diagonal[j];
    }
    diagonal += size;
    values += size * size;
  }

  return matrix;
}

// Appends the row blocks of m below the existing rows of this matrix. m must
// have the same number of columns and the same number of column blocks as this
// matrix. The block structure, the transposed block structure (if present),
// num_rows_, num_nonzeros_ and the values array are all updated; the values
// array is reallocated only when the new number of non-zeros exceeds
// max_num_nonzeros_.
//
// NOTE(review): the row vector of block_structure_ is resized while the rows
// of m are still being read, so passing *this as m looks unsupported --
// confirm with callers.
void BlockSparseMatrix::AppendRows(const BlockSparseMatrix& m) {
  CHECK_EQ(m.num_cols(), num_cols());
  const CompressedRowBlockStructure* m_bs = m.block_structure();
  CHECK_EQ(m_bs->cols.size(), block_structure_->cols.size());

  // Remember the pre-append sizes: new cells are placed after the old values.
  const int old_num_nonzeros = num_nonzeros_;
  const int old_num_row_blocks = block_structure_->rows.size();
  block_structure_->rows.resize(old_num_row_blocks + m_bs->rows.size());

  for (int i = 0; i < m_bs->rows.size(); ++i) {
    const CompressedRow& m_row = m_bs->rows[i];
    const int row_block_id = old_num_row_blocks + i;
    CompressedRow& row = block_structure_->rows[row_block_id];
    // The new row block starts right after the current last scalar row.
    row.block.size = m_row.block.size;
    row.block.position = num_rows_;
    num_rows_ += m_row.block.size;
    row.cells.resize(m_row.cells.size());
    if (transpose_block_structure_) {
      // A new row block is a new column block of the transpose.
      transpose_block_structure_->cols.emplace_back(row.block);
    }
    for (int c = 0; c < m_row.cells.size(); ++c) {
      const int block_id = m_row.cells[c].block_id;
      row.cells[c].block_id = block_id;
      // Cells are laid out one after another at the end of the values array,
      // in the order in which they are visited here.
      row.cells[c].position = num_nonzeros_;

      const int cell_nnz = m_row.block.size * m_bs->cols[block_id].size;
      if (transpose_block_structure_) {
        // Mirror the cell in the transposed structure; it refers to the same
        // offset in the values array.
        transpose_block_structure_->rows[block_id].cells.emplace_back(
            row_block_id, num_nonzeros_);
        transpose_block_structure_->rows[block_id].nnz += cell_nnz;
      }

      num_nonzeros_ += cell_nnz;
    }
  }

  // Grow the values array if needed, preserving the existing values.
  if (num_nonzeros_ > max_num_nonzeros_) {
    double* old_values = values_;
    values_ = AllocateValues(num_nonzeros_);
    std::copy_n(old_values, old_num_nonzeros, values_);
    max_num_nonzeros_ = num_nonzeros_;
    FreeValues(old_values);
  }

  // Copy the values of m verbatim behind the pre-existing values.
  // NOTE(review): this presumably relies on the cells of m being stored
  // contiguously in the order of its rows and cells -- confirm.
  std::copy(
      m.values(), m.values() + m.num_nonzeros(), values_ + old_num_nonzeros);

  if (transpose_block_structure_ == nullptr) {
    return;
  }
  // The per-row nnz of the transpose changed, so refresh the prefix sums.
  ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}

// Removes the last delta_row_blocks row blocks from the matrix. num_rows_,
// num_nonzeros_, the block structure and the transposed block structure (if
// present) are updated; the values array is neither shrunk nor reallocated.
//
// NOTE(review): delta_row_blocks is not range-checked here; it presumably must
// lie in [0, number of row blocks] -- confirm with callers.
void BlockSparseMatrix::DeleteRowBlocks(const int delta_row_blocks) {
  const int num_row_blocks = block_structure_->rows.size();
  const int new_num_row_blocks = num_row_blocks - delta_row_blocks;
  int delta_num_nonzeros = 0;
  int delta_num_rows = 0;
  const std::vector<Block>& column_blocks = block_structure_->cols;
  // Visit the row blocks to be removed, starting from the last one.
  for (int i = 0; i < delta_row_blocks; ++i) {
    const CompressedRow& row = block_structure_->rows[num_row_blocks - i - 1];
    delta_num_rows += row.block.size;
    for (int c = 0; c < row.cells.size(); ++c) {
      const Cell& cell = row.cells[c];
      delta_num_nonzeros += row.block.size * column_blocks[cell.block_id].size;

      if (transpose_block_structure_) {
        // Drop every trailing cell of the matching transposed row that refers
        // to a row block being deleted, and reduce that row's nnz accordingly.
        // Once popped, later visits to the same column block find nothing
        // left to remove.
        auto& col_cells = transpose_block_structure_->rows[cell.block_id].cells;
        while (!col_cells.empty() &&
               col_cells.back().block_id >= new_num_row_blocks) {
          const int del_block_id = col_cells.back().block_id;
          const int del_block_rows =
              block_structure_->rows[del_block_id].block.size;
          const int del_block_cols = column_blocks[cell.block_id].size;
          const int del_cell_nnz = del_block_rows * del_block_cols;
          transpose_block_structure_->rows[cell.block_id].nnz -= del_cell_nnz;
          col_cells.pop_back();
        }
      }
    }
  }
  num_nonzeros_ -= delta_num_nonzeros;
  num_rows_ -= delta_num_rows;
  block_structure_->rows.resize(new_num_row_blocks);

  if (transpose_block_structure_ == nullptr) {
    return;
  }
  // Each deleted row block is a trailing column block of the transpose.
  for (int i = 0; i < delta_row_blocks; ++i) {
    transpose_block_structure_->cols.pop_back();
  }

  // The per-row nnz of the transpose changed, so refresh the prefix sums.
  ComputeCumulativeNumberOfNonZeros(transpose_block_structure_->rows);
}

// Creates a random block sparse matrix.
//
// Row block sizes are drawn uniformly from
// [options.min_row_block_size, options.max_row_block_size]. If
// options.col_blocks is empty, options.num_col_blocks column blocks are
// generated with sizes drawn uniformly from
// [options.min_col_block_size, options.max_col_block_size]; otherwise
// options.col_blocks is used as is and the column size options are ignored.
// A cell is present when a uniform draw from [0, 1) does not exceed
// options.block_density. The values are drawn from a standard normal
// distribution. All randomness comes from prng.
std::unique_ptr<BlockSparseMatrix> BlockSparseMatrix::CreateRandomMatrix(
    const BlockSparseMatrix::RandomMatrixOptions& options,
    std::mt19937& prng,
    bool use_page_locked_memory) {
  CHECK_GT(options.num_row_blocks, 0);
  CHECK_GT(options.min_row_block_size, 0);
  CHECK_GT(options.max_row_block_size, 0);
  CHECK_LE(options.min_row_block_size, options.max_row_block_size);
  CHECK_GT(options.block_density, 0.0);
  CHECK_LE(options.block_density, 1.0);

  auto bs = std::make_unique<CompressedRowBlockStructure>();
  if (options.col_blocks.empty()) {
    CHECK_GT(options.num_col_blocks, 0);
    CHECK_GT(options.min_col_block_size, 0);
    CHECK_GT(options.max_col_block_size, 0);
    CHECK_LE(options.min_col_block_size, options.max_col_block_size);

    // std::uniform_int_distribution requires min <= max, so it is only
    // constructed once the range has been validated above, and only when
    // column blocks actually need to be generated.
    std::uniform_int_distribution<int> col_distribution(
        options.min_col_block_size, options.max_col_block_size);

    // Generate the col block structure.
    int col_block_position = 0;
    for (int i = 0; i < options.num_col_blocks; ++i) {
      const int col_block_size = col_distribution(prng);
      bs->cols.emplace_back(col_block_size, col_block_position);
      col_block_position += col_block_size;
    }
  } else {
    bs->cols = options.col_blocks;
  }

  std::uniform_int_distribution<int> row_distribution(
      options.min_row_block_size, options.max_row_block_size);
  std::uniform_real_distribution<double> uniform01(0.0, 1.0);
  const int num_col_blocks = static_cast<int>(bs->cols.size());

  // Regenerate the row structure until it contains at least one cell.
  bool matrix_has_blocks = false;
  while (!matrix_has_blocks) {
    VLOG(1) << "Clearing";
    bs->rows.clear();
    int row_block_position = 0;
    int value_position = 0;
    for (int r = 0; r < options.num_row_blocks; ++r) {
      const int row_block_size = row_distribution(prng);
      bs->rows.emplace_back();
      CompressedRow& row = bs->rows.back();
      row.block.size = row_block_size;
      row.block.position = row_block_position;
      row_block_position += row_block_size;
      for (int c = 0; c < num_col_blocks; ++c) {
        if (uniform01(prng) > options.block_density) continue;

        // Cells are laid out back to back in the values array.
        row.cells.emplace_back();
        Cell& cell = row.cells.back();
        cell.block_id = c;
        cell.position = value_position;
        value_position += row_block_size * bs->cols[c].size;
        matrix_has_blocks = true;
      }
    }
  }

  // The matrix takes over the raw block structure pointer.
  auto matrix =
      std::make_unique<BlockSparseMatrix>(bs.release(), use_page_locked_memory);
  double* values = matrix->mutable_values();
  std::normal_distribution<double> standard_normal_distribution;
  std::generate_n(
      values, matrix->num_nonzeros(), [&standard_normal_distribution, &prng] {
        return standard_normal_distribution(prng);
      });

  return matrix;
}

// Builds the block structure of the transpose of bs. Column blocks of bs
// become row blocks and vice versa. Every cell keeps its original offset into
// the values array, so the transposed structure refers to the same storage.
// The nnz and cumulative_nnz fields of the transposed rows are filled in.
std::unique_ptr<CompressedRowBlockStructure> CreateTranspose(
    const CompressedRowBlockStructure& bs) {
  auto transpose = std::make_unique<CompressedRowBlockStructure>();
  auto& transpose_rows = transpose->rows;
  auto& transpose_cols = transpose->cols;

  // One transposed row per column block, initially without cells.
  transpose_rows.resize(bs.cols.size());
  auto transpose_row = transpose_rows.begin();
  for (const auto& col_block : bs.cols) {
    transpose_row->block = col_block;
    transpose_row->nnz = 0;
    ++transpose_row;
  }

  transpose_cols.resize(bs.rows.size());
  int row_block_id = 0;
  for (const auto& row : bs.rows) {
    transpose_cols[row_block_id] = row.block;

    const int nrows = row.block.size;
    for (const auto& cell : row.cells) {
      // Rows of bs are visited in order, so the cells of each transposed row
      // end up sorted by increasing row block id.
      auto& target_row = transpose_rows[cell.block_id];
      target_row.cells.emplace_back(row_block_id, cell.position);
      const int ncols = target_row.block.size;
      target_row.nnz += nrows * ncols;
    }
    ++row_block_id;
  }
  ComputeCumulativeNumberOfNonZeros(transpose_rows);
  return transpose;
}

// Allocates an uninitialized array of size doubles. Without page-locked memory
// the array comes from new[]; otherwise it is allocated with cudaHostAlloc,
// which is only possible when Ceres is built with CUDA support.
double* BlockSparseMatrix::AllocateValues(int size) {
  if (use_page_locked_memory_) {
#ifndef CERES_NO_CUDA
    double* values = nullptr;
    CHECK_EQ(cudaSuccess,
             cudaHostAlloc(&values, sizeof(double) * size, cudaHostAllocDefault));
    return values;
#else
    LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
               << "This is a Ceres bug; please contact the developers!";
    return nullptr;
#endif
  }

  return new double[size];
}

// Releases an array obtained from AllocateValues with the routine matching
// use_page_locked_memory_ (delete[] or cudaFreeHost) and resets the caller's
// pointer to null.
void BlockSparseMatrix::FreeValues(double*& values) {
  if (use_page_locked_memory_) {
#ifndef CERES_NO_CUDA
    CHECK_EQ(cudaSuccess, cudaFreeHost(values));
    values = nullptr;
#else
    LOG(FATAL) << "Page locked memory requested when CUDA is not available. "
               << "This is a Ceres bug; please contact the developers!";
#endif
    return;
  }

  delete[] values;
  values = nullptr;
}

}  // namespace ceres::internal