|
- #ifndef CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
- #define CERES_INTERNAL_CUDA_STREAMED_BUFFER_H_
- #include "ceres/internal/config.h"
- #ifndef CERES_NO_CUDA
- #include "ceres/cuda_buffer.h"
- namespace ceres::internal {
- // Uploads a host array to the GPU in fixed-size batches and hands every
- // uploaded batch to a user-supplied callback.
- //
- // The device buffer owned by this object is split into kNumBatches slices of
- // kValuesPerBatch values each. Consecutive batches cycle through the slices,
- // and slice i is always copied on the i-th CUDA stream of the context, so the
- // upload of one slice can be in flight while work enqueued by the callback for
- // another slice is still running.
- //
- // If the source pointer is reported by CUDA as unregistered host memory, every
- // batch is first copied on the host into a pinned staging buffer owned by this
- // object and uploaded from there; otherwise the batch is uploaded directly
- // from the source pointer.
- //
- // Objects of this class are not copyable.
- template <typename T>
- class CERES_NO_EXPORT CudaStreamedBuffer {
-  public:
-   // Number of slices the device and staging buffers are divided into, which is
-   // also the number of context streams used by CopyToGpu().
-   static constexpr int kNumBatches = 2;
-
-   // Allocates the device buffer, the pinned host staging buffer and one event
-   // per batch slot.
-   //
-   // context:         supplies the CUDA streams and is forwarded to the
-   //                  CudaBuffer that owns the device memory. The pointer is
-   //                  stored, not owned.
-   // max_buffer_size: total capacity in values of T; must be at least
-   //                  kNumBatches. Each batch holds
-   //                  max_buffer_size / kNumBatches values (integer division),
-   //                  so a remainder is not allocated.
-   //
-   // Any CUDA failure aborts via CHECK.
-   CudaStreamedBuffer(ContextImpl* context, const int max_buffer_size)
-       : kValuesPerBatch(max_buffer_size / kNumBatches),
-         context_(context),
-         values_gpu_(context, kValuesPerBatch * kNumBatches) {
-     static_assert(ContextImpl::kNumCudaStreams >= kNumBatches);
-     // Guarantees kValuesPerBatch >= 1, which CopyToGpu() relies on to make
-     // progress. Note that the member initializers above have already run when
-     // this check is evaluated.
-     CHECK_GE(max_buffer_size, kNumBatches);
-     // The staging buffer is pinned and write-combined: the host only ever
-     // writes into it (std::copy_n in CopyToGpu) and it is then used as the
-     // source of host-to-device copies.
-     CHECK_EQ(cudaSuccess,
-              cudaHostAlloc(&values_cpu_pinned_,
-                            sizeof(T) * kValuesPerBatch * kNumBatches,
-                            cudaHostAllocWriteCombined));
-     // One event per batch slot; timing is disabled because the events are only
-     // used for synchronization.
-     for (auto& e : copy_finished_) {
-       CHECK_EQ(cudaSuccess,
-                cudaEventCreateWithFlags(&e, cudaEventDisableTiming));
-     }
-   }
-
-   CudaStreamedBuffer(const CudaStreamedBuffer&) = delete;
-   // Already implicitly deleted because of the const member; spelled out so the
-   // non-copyable contract is visible next to the deleted copy constructor.
-   CudaStreamedBuffer& operator=(const CudaStreamedBuffer&) = delete;
-
-   // Releases the pinned staging buffer and the per-batch events. The device
-   // buffer is released by the CudaBuffer member.
-   ~CudaStreamedBuffer() {
-     CHECK_EQ(cudaSuccess, cudaFreeHost(values_cpu_pinned_));
-     for (auto& e : copy_finished_) {
-       CHECK_EQ(cudaSuccess, cudaEventDestroy(e));
-     }
-   }
-
-   // Transfers num_values values starting at `from` to the GPU, one batch of at
-   // most kValuesPerBatch values at a time, and invokes `callback` once per
-   // batch.
-   //
-   // from:       source pointer; must not be device memory (checked). May be
-   //             ordinary host memory or memory known to the CUDA runtime.
-   // num_values: number of values to transfer. If it is not positive, no batch
-   //             is transferred and the callback is never invoked.
-   // callback:   invoked on the calling thread as
-   //               callback(device_ptr, batch_size, offset, stream)
-   //             right after the copy of the batch has been enqueued on
-   //             `stream`. device_ptr points at the slice of the device buffer
-   //             receiving the batch, batch_size is the number of values in
-   //             it and offset is the index in `from` of its first value. The
-   //             same slice is refilled kNumBatches batches later by a copy
-   //             enqueued on the same stream, so work that reads device_ptr
-   //             should be enqueued on `stream` to be ordered correctly with
-   //             respect to both copies.
-   //
-   // The call returns after all kNumBatches streams have been synchronized.
-   // Any CUDA failure aborts via CHECK.
-   template <typename Fun>
-   void CopyToGpu(const T* from, const int num_values, Fun&& callback) {
-     // Wait for everything previously enqueued on the default stream.
-     // NOTE(review): presumably so that callbacks may depend on results
-     // produced there -- confirm.
-     CHECK_EQ(cudaSuccess, cudaStreamSynchronize(context_->DefaultStream()));
-
-     // Decide once whether batches have to be staged through pinned memory.
-     const bool copy_to_pinned_memory = MemoryTypeResultsInSynchronousCopy(from);
-     T* batch_values_gpu[kNumBatches];
-     T* batch_values_cpu[kNumBatches];
-     auto streams = context_->streams_;
-     for (int i = 0; i < kNumBatches; ++i) {
-       batch_values_gpu[i] = values_gpu_.data() + kValuesPerBatch * i;
-       batch_values_cpu[i] = values_cpu_pinned_ + kValuesPerBatch * i;
-     }
-
-     int batch_id = 0;
-     int offset = 0;
-     while (offset < num_values) {
-       const int num_values_batch =
-           std::min(num_values - offset, kValuesPerBatch);
-       const T* batch_from = from + offset;
-       T* batch_to = batch_values_gpu[batch_id];
-       auto stream = streams[batch_id];
-       auto copy_finished = copy_finished_[batch_id];
-       if (copy_to_pinned_memory) {
-         // The staging slice may still be the source of the copy enqueued for
-         // this slot on an earlier iteration; wait until that copy has
-         // completed before overwriting it.
-         CHECK_EQ(cudaSuccess, cudaEventSynchronize(copy_finished));
-         std::copy_n(batch_from, num_values_batch, batch_values_cpu[batch_id]);
-         batch_from = batch_values_cpu[batch_id];
-       }
-       CHECK_EQ(cudaSuccess,
-                cudaMemcpyAsync(batch_to,
-                                batch_from,
-                                sizeof(T) * num_values_batch,
-                                cudaMemcpyHostToDevice,
-                                stream));
-       if (copy_to_pinned_memory) {
-         // Recorded right after the copy and before the callback enqueues
-         // anything, so the event completes as soon as the staging slice has
-         // been read, independently of how long the callback's work takes.
-         CHECK_EQ(cudaSuccess, cudaEventRecord(copy_finished, stream));
-       }
-       callback(batch_to, num_values_batch, offset, stream);
-       batch_id = (batch_id + 1) % kNumBatches;
-       // Advance by the number of values actually transferred. Every batch but
-       // the last is exactly kValuesPerBatch long, so the sequence of offsets
-       // is the same as stepping by kValuesPerBatch, but offset never exceeds
-       // num_values and therefore cannot overflow int when num_values is close
-       // to INT_MAX.
-       offset += num_values_batch;
-     }
-
-     // Do not return before all copies and all work enqueued by the callbacks
-     // on the batch streams have completed.
-     for (int i = 0; i < kNumBatches; ++i) {
-       CHECK_EQ(cudaSuccess, cudaStreamSynchronize(streams[i]));
-     }
-   }
-
-  private:
-   // Returns true if `ptr` is reported by the CUDA runtime as unregistered host
-   // memory, in which case CopyToGpu() stages the data through the pinned
-   // buffer instead of copying from `ptr` directly. Aborts if `ptr` is device
-   // memory or if the pointer query fails unexpectedly.
-   static bool MemoryTypeResultsInSynchronousCopy(const void* ptr) {
-     cudaPointerAttributes attributes;
-     auto status = cudaPointerGetAttributes(&attributes, ptr);
- #if CUDART_VERSION < 11000
-     // With runtimes older than CUDA 11.0 the query fails with
-     // cudaErrorInvalidValue for host memory that was not allocated or
-     // registered through CUDA.
-     if (status == cudaErrorInvalidValue) {
-       // The failure above is expected, but it is also stored as the runtime's
-       // "last error". Reset it so that a later cudaGetLastError() (e.g. after
-       // a kernel launch in the user callback) does not report it as a
-       // spurious failure.
-       (void)cudaGetLastError();
-       return true;
-     }
- #endif
-     CHECK_EQ(status, cudaSuccess);
-     // Device memory is not a valid source for a host-to-device copy.
-     CHECK_NE(attributes.type, cudaMemoryTypeDevice);
-     // Every remaining memory type that is not "unregistered" is copied
-     // directly from the source pointer, without staging.
-     return attributes.type == cudaMemoryTypeUnregistered;
-   }
-
-   // Capacity of a single batch slice, in values of T.
-   const int kValuesPerBatch;
-   // Not owned; provides the CUDA streams.
-   ContextImpl* context_ = nullptr;
-   // Device buffer holding kNumBatches slices of kValuesPerBatch values.
-   CudaBuffer<T> values_gpu_;
-   // Pinned host staging buffer with the same layout as values_gpu_.
-   T* values_cpu_pinned_ = nullptr;
-   // copy_finished_[i] is recorded after the host-to-device copy out of staging
-   // slice i has been enqueued; it is only recorded when staging is in use.
-   cudaEvent_t copy_finished_[kNumBatches] = {nullptr};
- };
- }
- #endif
- #endif
|