// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Author: joydeepb@cs.utexas.edu (Joydeep Biswas)

#ifndef CERES_INTERNAL_CUDA_BUFFER_H_
#define CERES_INTERNAL_CUDA_BUFFER_H_

#include "ceres/context_impl.h"
#include "ceres/internal/config.h"

#ifndef CERES_NO_CUDA

#include <vector>

#include "cuda_runtime.h"
#include "glog/logging.h"

namespace ceres::internal {

// An encapsulated buffer to maintain GPU memory, and handle transfers between
// GPU and system memory. It is the responsibility of the user to ensure that
// the appropriate GPU device is selected before each subroutine is called.
// This is particularly important when using multiple GPU devices on different
// CPU threads, since active CUDA devices are determined by the CUDA runtime
// on a per-thread basis.
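//
// Example usage (an illustrative sketch, not part of this header's API; it
// assumes a ContextImpl whose CUDA state has already been initialized, e.g.
// via ContextImpl::InitCuda, whose exact signature may differ):
//
//   ContextImpl context;
//   std::string error;
//   CHECK(context.InitCuda(&error)) << error;
//
//   std::vector<double> host = {1.0, 2.0, 3.0};
//   CudaBuffer<double> buffer(&context);
//   buffer.CopyFromCpuVector(host);  // Asynchronous host-to-device copy.
//
//   std::vector<double> result(host.size());
//   // Synchronizes on the default stream before returning.
//   buffer.CopyToCpu(result.data(), result.size());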
template <typename T>
class CudaBuffer {
 public:
  explicit CudaBuffer(ContextImpl* context) : context_(context) {}
  CudaBuffer(ContextImpl* context, int size) : context_(context) {
    Reserve(size);
  }

  CudaBuffer(CudaBuffer&& other)
      : data_(other.data_), size_(other.size_), context_(other.context_) {
    other.data_ = nullptr;
    other.size_ = 0;
  }

  CudaBuffer(const CudaBuffer&) = delete;
  CudaBuffer& operator=(const CudaBuffer&) = delete;

  ~CudaBuffer() {
    if (data_ != nullptr) {
      CHECK_EQ(cudaFree(data_), cudaSuccess);
    }
  }

  // Grow the GPU memory buffer if needed to accommodate data of the specified
  // size.
  void Reserve(const size_t size) {
    if (size > size_) {
      if (data_ != nullptr) {
        CHECK_EQ(cudaFree(data_), cudaSuccess);
      }
      CHECK_EQ(cudaMalloc(&data_, size * sizeof(T)), cudaSuccess)
          << "Failed to allocate " << size * sizeof(T)
          << " bytes of GPU memory";
      size_ = size;
    }
  }

  // Perform an asynchronous copy from CPU memory to GPU memory managed by
  // this CudaBuffer instance on the context's default stream.
  void CopyFromCpu(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from a vector in CPU memory to GPU memory
  // managed by this CudaBuffer instance.
  void CopyFromCpuVector(const std::vector<T>& data) {
    Reserve(data.size());
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data.data(),
                             data.size() * sizeof(T),
                             cudaMemcpyHostToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Perform an asynchronous copy from another GPU memory array to the GPU
  // memory managed by this CudaBuffer instance on the context's default
  // stream.
  void CopyFromGPUArray(const T* data, const size_t size) {
    Reserve(size);
    CHECK_EQ(cudaMemcpyAsync(data_,
                             data,
                             size * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Copy data from the GPU memory managed by this CudaBuffer instance to CPU
  // memory. It is the caller's responsibility to ensure that the CPU memory
  // pointer is valid, i.e. it is not null, and that it points to memory of
  // at least this->size() size. This method ensures all previously dispatched
  // GPU operations on the context's default stream have completed before
  // copying the data to CPU memory.
  void CopyToCpu(T* data, const size_t size) const {
    CHECK(data_ != nullptr);
    CHECK_EQ(cudaMemcpyAsync(data,
                             data_,
                             size * sizeof(T),
                             cudaMemcpyDeviceToHost,
                             context_->DefaultStream()),
             cudaSuccess);
    CHECK_EQ(cudaStreamSynchronize(context_->DefaultStream()), cudaSuccess);
  }

  // Copy N items from another GPU memory array to the GPU memory managed by
  // this CudaBuffer instance, growing this buffer's size if needed. This copy
  // is asynchronous, and operates on the context's default stream.
  void CopyNItemsFrom(int n, const CudaBuffer<T>& other) {
    Reserve(n);
    CHECK(other.data_ != nullptr);
    CHECK(data_ != nullptr);
    // Copy exactly n items; size_ may exceed n if this buffer was previously
    // grown past n elements, and copying size_ items would then read past the
    // end of other's allocation.
    CHECK_EQ(cudaMemcpyAsync(data_,
                             other.data_,
                             n * sizeof(T),
                             cudaMemcpyDeviceToDevice,
                             context_->DefaultStream()),
             cudaSuccess);
  }

  // Return a pointer to the GPU memory managed by this CudaBuffer instance.
  T* data() { return data_; }
  const T* data() const { return data_; }

  // Return the number of items of type T that can fit in the GPU memory
  // allocated so far by this CudaBuffer instance.
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
  ContextImpl* context_ = nullptr;
};

}  // namespace ceres::internal

#endif  // CERES_NO_CUDA

#endif  // CERES_INTERNAL_CUDA_BUFFER_H_
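
// A further illustrative sketch (not part of this header): chaining the
// asynchronous copies above through the context's shared default stream,
// assuming `context` is an initialized ContextImpl and `host_src`/`host_dst`
// are hypothetical host arrays of at least n elements:
//
//   CudaBuffer<float> a(&context, n);  // Allocates n elements up front.
//   CudaBuffer<float> b(&context);
//   a.CopyFromCpu(host_src, n);  // Async host-to-device.
//   b.CopyNItemsFrom(n, a);      // Async device-to-device.
//   b.CopyToCpu(host_dst, n);    // Drains the stream, then returns.
//
// Because all three copies are issued on the same stream, they execute in
// order; only CopyToCpu blocks the calling CPU thread.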