// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2023 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Author: sameeragarwal@google.com (Sameer Agarwal) #ifndef CERES_INTERNAL_DENSE_CHOLESKY_H_ #define CERES_INTERNAL_DENSE_CHOLESKY_H_ // This include must come before any #ifndef check on Ceres compile options. // clang-format off #include "ceres/internal/config.h" // clang-format on #include #include #include "Eigen/Dense" #include "ceres/context_impl.h" #include "ceres/cuda_buffer.h" #include "ceres/linear_solver.h" #include "glog/logging.h" #ifndef CERES_NO_CUDA #include "ceres/context_impl.h" #include "cuda_runtime.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA namespace ceres::internal { // An interface that abstracts away the internal details of various dense linear // algebra libraries and offers a simple API for solving dense symmetric // positive definite linear systems using a Cholesky factorization. class CERES_NO_EXPORT DenseCholesky { public: static std::unique_ptr Create( const LinearSolver::Options& options); virtual ~DenseCholesky(); // Computes the Cholesky factorization of the given matrix. // // The input matrix lhs is assumed to be a column-major num_cols x num_cols // matrix, that is symmetric positive definite with its lower triangular part // containing the left hand side of the linear system being solved. // // The input matrix lhs may be modified by the implementation to store the // factorization, irrespective of whether the factorization succeeds or not. // As a result it is the user's responsibility to ensure that lhs is valid // when Solve is called. virtual LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) = 0; // Computes the solution to the equation // // lhs * solution = rhs // // Calling Solve without calling Factorize is undefined behaviour. It is the // user's responsibility to ensure that the input matrix lhs passed to // Factorize has not been freed/modified when Solve is called. virtual LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) = 0; // Convenience method which combines a call to Factorize and Solve. Solve is // only called if Factorize returns LinearSolverTerminationType::SUCCESS. // // The input matrix lhs may be modified by the implementation to store the // factorization, irrespective of whether the method succeeds or not. It is // the user's responsibility to ensure that lhs is valid if and when Solve is // called again after this call. LinearSolverTerminationType FactorAndSolve(int num_cols, double* lhs, const double* rhs, double* solution, std::string* message); }; class CERES_NO_EXPORT EigenDenseCholesky final : public DenseCholesky { public: LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: using LLTType = Eigen::LLT, Eigen::Lower>; std::unique_ptr llt_; }; class CERES_NO_EXPORT FloatEigenDenseCholesky final : public DenseCholesky { public: LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: Eigen::MatrixXf lhs_; Eigen::VectorXf rhs_; Eigen::VectorXf solution_; using LLTType = Eigen::LLT; std::unique_ptr llt_; }; #ifndef CERES_NO_LAPACK class CERES_NO_EXPORT LAPACKDenseCholesky final : public DenseCholesky { public: LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: double* lhs_ = nullptr; int num_cols_ = -1; LinearSolverTerminationType termination_type_ = LinearSolverTerminationType::FATAL_ERROR; }; class CERES_NO_EXPORT FloatLAPACKDenseCholesky final : public DenseCholesky { public: LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: Eigen::MatrixXf lhs_; Eigen::VectorXf rhs_and_solution_; int num_cols_ = -1; LinearSolverTerminationType termination_type_ = LinearSolverTerminationType::FATAL_ERROR; }; #endif // CERES_NO_LAPACK class DenseIterativeRefiner; // Computes an initial solution using the given instance of // DenseCholesky, and then refines it using the DenseIterativeRefiner. class CERES_NO_EXPORT RefinedDenseCholesky final : public DenseCholesky { public: RefinedDenseCholesky( std::unique_ptr dense_cholesky, std::unique_ptr iterative_refiner); ~RefinedDenseCholesky() override; LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: std::unique_ptr dense_cholesky_; std::unique_ptr iterative_refiner_; double* lhs_ = nullptr; int num_cols_; }; #ifndef CERES_NO_CUDA // CUDA implementation of DenseCholesky using the cuSolverDN library using the // 32-bit legacy interface for maximum compatibility. class CERES_NO_EXPORT CUDADenseCholesky final : public DenseCholesky { public: static std::unique_ptr Create( const LinearSolver::Options& options); CUDADenseCholesky(const CUDADenseCholesky&) = delete; CUDADenseCholesky& operator=(const CUDADenseCholesky&) = delete; LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: explicit CUDADenseCholesky(ContextImpl* context); ContextImpl* context_ = nullptr; // Number of columns in the A matrix, to be cached between calls to *Factorize // and *Solve. size_t num_cols_ = 0; // GPU memory allocated for the A matrix (lhs matrix). CudaBuffer lhs_; // GPU memory allocated for the B matrix (rhs vector). CudaBuffer rhs_; // Scratch space for cuSOLVER on the GPU. CudaBuffer device_workspace_; // Required for error handling with cuSOLVER. CudaBuffer error_; // Cache the result of Factorize to ensure that when Solve is called, the // factorization of lhs is valid. LinearSolverTerminationType factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; }; // A mixed-precision iterative refinement dense Cholesky solver using FP32 CUDA // Dense Cholesky for inner iterations, and FP64 outer refinements. // This class implements a modified version of the "Classical iterative // refinement" (Algorithm 4.1) from the following paper: // Haidar, Azzam, Harun Bayraktar, Stanimire Tomov, Jack Dongarra, and Nicholas // J. Higham. "Mixed-precision iterative refinement using tensor cores on GPUs // to accelerate solution of linear systems." Proceedings of the Royal Society A // 476, no. 2243 (2020): 20200110. // // The three key modifications from Algorithm 4.1 in the paper are: // 1. We use Cholesky factorization instead of LU factorization since our A is // symmetric positive definite. // 2. During the solution update, the up-cast and accumulation is performed in // one step with a custom kernel. class CERES_NO_EXPORT CUDADenseCholeskyMixedPrecision final : public DenseCholesky { public: static std::unique_ptr Create( const LinearSolver::Options& options); CUDADenseCholeskyMixedPrecision(const CUDADenseCholeskyMixedPrecision&) = delete; CUDADenseCholeskyMixedPrecision& operator=( const CUDADenseCholeskyMixedPrecision&) = delete; LinearSolverTerminationType Factorize(int num_cols, double* lhs, std::string* message) override; LinearSolverTerminationType Solve(const double* rhs, double* solution, std::string* message) override; private: CUDADenseCholeskyMixedPrecision(ContextImpl* context, int max_num_refinement_iterations); // Helper function to wrap Cuda boilerplate needed to call Spotrf. LinearSolverTerminationType CudaCholeskyFactorize(std::string* message); // Helper function to wrap Cuda boilerplate needed to call Spotrs. LinearSolverTerminationType CudaCholeskySolve(std::string* message); // Picks up the cuSolverDN and cuStream handles from the context in the // options, and the number of refinement iterations from the options. If // the context is unable to initialize CUDA, returns false with a // human-readable message indicating the reason. bool Init(const LinearSolver::Options& options, std::string* message); ContextImpl* context_ = nullptr; // Number of columns in the A matrix, to be cached between calls to *Factorize // and *Solve. size_t num_cols_ = 0; CudaBuffer lhs_fp64_; CudaBuffer rhs_fp64_; CudaBuffer lhs_fp32_; // Scratch space for cuSOLVER on the GPU. CudaBuffer device_workspace_; // Required for error handling with cuSOLVER. CudaBuffer error_; // Solution to lhs * x = rhs. CudaBuffer x_fp64_; // Incremental correction to x. CudaBuffer correction_fp32_; // Residual to iterative refinement. CudaBuffer residual_fp32_; CudaBuffer residual_fp64_; // Number of inner refinement iterations to perform. int max_num_refinement_iterations_ = 0; // Cache the result of Factorize to ensure that when Solve is called, the // factorization of lhs is valid. LinearSolverTerminationType factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; }; #endif // CERES_NO_CUDA } // namespace ceres::internal #endif // CERES_INTERNAL_DENSE_CHOLESKY_H_