- #pragma once
- #include <c10/core/Allocator.h>
- #include <c10/cuda/CUDAGraphsC10Utils.h>
- #include <c10/cuda/CUDAMacros.h>
- #include <c10/cuda/CUDAStream.h>
- #include <c10/util/Registry.h>
- #include <array>
- #include <atomic>
- #include <cstdint>
- #include <functional>
- #include <memory>
- #include <mutex>
- #include <string>
- #include <vector>
- namespace c10 {
- // The caching allocator will execute every registered callback if it is unable
- // to find a block inside the already allocated area.
- class C10_CUDA_API FreeMemoryCallback {
- public:
- virtual ~FreeMemoryCallback() = default;
- virtual bool Execute() = 0;
- };
- C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
- #define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
- C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
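- // A hypothetical registration sketch (names are illustrative, not part of this
- // header): subclass FreeMemoryCallback, override Execute(), and register the
- // class at namespace scope so the allocator can invoke it before retrying a
- // failed allocation.
- //
- //   struct MyFreeMemoryCallback final : public FreeMemoryCallback {
- //     bool Execute() override {
- //       // Release CUDA memory cached outside this allocator here; return
- //       // true only if something was actually freed.
- //       return false;
- //     }
- //   };
- //   REGISTER_FREE_MEMORY_CALLBACK(my_free_memory_callback, MyFreeMemoryCallback);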
- namespace cuda {
- // TODO: Turn this into an honest to goodness class. I briefly attempted to do
- // this, but it was a bit irritating to figure out how to also correctly
- // apply pimpl pattern so I didn't have to leak any internal implementation
- // details in the header (CUDACachingAllocator could be made a pimpl, but
- // you also need to appropriately define a class which is a subclass
- // of Allocator. Not impossible, but required a bit more surgery than
- // I wanted to do at the time.)
- //
- // Why is this using a namespace rather than old-style THCCachingAllocator_
- // prefix? Mostly because it made the HIPify rules easier to write; _ is
- // not counted as a word boundary, so you would otherwise have to list each
- // of these functions.
- namespace CUDACachingAllocator {
- struct Stat {
- int64_t current = 0;
- int64_t peak = 0;
- int64_t allocated = 0;
- int64_t freed = 0;
- };
- enum struct StatType : uint64_t {
- AGGREGATE = 0,
- SMALL_POOL = 1,
- LARGE_POOL = 2,
- NUM_TYPES = 3 // remember to update this whenever a new stat type is added
- };
- typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray;
- // Struct containing memory allocator summary statistics for a device.
- struct DeviceStats {
- // COUNT: allocations requested by client code
- StatArray allocation;
- // COUNT: number of allocated segments from cudaMalloc().
- StatArray segment;
- // COUNT: number of active memory blocks (allocated or used by stream)
- StatArray active;
- // COUNT: number of inactive, split memory blocks (unallocated but can't be
- // released via cudaFree)
- StatArray inactive_split;
- // SUM: bytes allocated by this memory allocator
- StatArray allocated_bytes;
- // SUM: bytes reserved by this memory allocator (both free and used)
- StatArray reserved_bytes;
- // SUM: bytes within active memory blocks
- StatArray active_bytes;
- // SUM: bytes within inactive, split memory blocks
- StatArray inactive_split_bytes;
- // SUM: bytes requested by client code
- StatArray requested_bytes;
- // COUNT: total number of failed calls to CUDA malloc necessitating cache
- // flushes.
- int64_t num_alloc_retries = 0;
- // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
- int64_t num_ooms = 0;
- // COUNT: total number of oversize blocks allocated from pool
- Stat oversize_allocations;
- // COUNT: total number of oversize blocks requiring malloc
- Stat oversize_segments;
- // SIZE: maximum block size that is allowed to be split.
- int64_t max_split_size = 0;
- };
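- // Usage sketch (device 0 chosen for illustration): each StatArray above is
- // indexed by StatType, so the bytes currently allocated across both pools can
- // be read from the result of getDeviceStats (declared later in this header):
- //
- //   DeviceStats stats = getDeviceStats(0);
- //   int64_t allocated_now =
- //       stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)].current;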
- struct Context {
- virtual ~Context() = default;
- };
- typedef std::shared_ptr<Context> (*CreateContextFn)(void);
- struct History {
- void* addr;
- size_t real_size; // unrounded, actually requested size
- std::shared_ptr<Context> context; // per-watcher context
- };
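- // A context_recorder passed to recordHistory (declared below) is a plain
- // function returning a fresh Context per allocation; watchers typically
- // subclass Context to carry their own data. Hypothetical sketch:
- //
- //   struct TagContext : public Context {
- //     std::string tag;
- //   };
- //   std::shared_ptr<Context> makeTagContext() {
- //     auto ctx = std::make_shared<TagContext>();
- //     ctx->tag = "my-annotation";
- //     return ctx;
- //   }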
- // Struct containing info of an allocation block (i.e. a fractional part of a
- // cudaMalloc).
- struct BlockInfo {
- int64_t size = 0;
- int64_t requested_size = 0;
- int32_t gc_counter = 0;
- bool allocated = false;
- bool active = false;
- std::vector<History> history;
- };
- // Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
- struct SegmentInfo {
- int64_t device = 0;
- int64_t address = 0;
- int64_t total_size = 0;
- int64_t requested_size = 0;
- int64_t allocated_size = 0;
- int64_t active_size = 0;
- cudaStream_t stream = 0;
- bool is_large = false;
- std::vector<BlockInfo> blocks;
- };
- struct TraceEntry {
- enum Action {
- ALLOC, // API call made to the caching allocator for new memory
- FREE_REQUESTED, // API call made to the caching allocator to free memory
- FREE_COMPLETED, // The allocator might have to delay a free because
- // it is still in use on another stream via record_stream
- // This event is generated when a free actually completes.
- SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
- SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
- defragment or empty_caches)
- SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
- // events
- OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
- // bytes reported by cuda)
- };
- TraceEntry(
- Action action,
- int64_t addr,
- size_t size,
- cudaStream_t stream,
- std::shared_ptr<Context> context = nullptr)
- : action_(action),
- addr_(addr),
- context_(context),
- stream_(stream),
- size_(size) {}
- Action action_;
- int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
- std::shared_ptr<Context> context_;
- cudaStream_t stream_;
- int64_t size_;
- };
- struct SnapshotInfo {
- std::vector<SegmentInfo> segments;
- std::vector<std::vector<TraceEntry>> device_traces;
- };
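- // Usage sketch: snapshot() (declared below) returns one SegmentInfo per live
- // cudaMalloc'd segment and, when history recording is enabled, a per-device
- // list of TraceEntry events:
- //
- //   SnapshotInfo info = snapshot();
- //   for (const auto& seg : info.segments) {
- //     for (const auto& block : seg.blocks) {
- //       // inspect block.size, block.allocated, block.history, ...
- //     }
- //   }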
- C10_CUDA_API void setAllocatorSettings(const std::string& env);
- // Size pretty-printer
- std::string format_size(uint64_t size);
- using OutOfMemoryObserver = std::function<void(
- int64_t device,
- int64_t allocated,
- int64_t device_total,
- int64_t device_free)>;
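- // An observer is attached via attachOutOfMemoryObserver (declared below) and
- // runs when an allocation still fails after the cache has been flushed.
- // Illustrative sketch; all arguments are in bytes except `device`:
- //
- //   attachOutOfMemoryObserver(
- //       [](int64_t device,
- //          int64_t allocated,
- //          int64_t device_total,
- //          int64_t device_free) {
- //         // log or record the OOM here
- //       });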
- class CUDAAllocator : public Allocator {
- public:
- virtual void* raw_alloc(size_t nbytes) = 0;
- virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
- virtual void raw_delete(void* ptr) = 0;
- virtual void init(int device_count) = 0;
- virtual bool initialized() = 0;
- virtual void setMemoryFraction(double fraction, int device) = 0;
- virtual void emptyCache() = 0;
- virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
- virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
- virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
- virtual DeviceStats getDeviceStats(int device) = 0;
- virtual void resetAccumulatedStats(int device) = 0;
- virtual void resetPeakStats(int device) = 0;
- virtual SnapshotInfo snapshot() = 0;
- virtual void notifyCaptureBegin(
- int device,
- CaptureId_t graph_id,
- MempoolId_t mempool_id) = 0;
- virtual void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) = 0;
- virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) = 0;
- virtual void notifyCaptureDestroy(int device, MempoolId_t mempool_id) = 0;
- virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
- virtual void recordHistory(
- bool enabled,
- CreateContextFn context_recorder,
- size_t alloc_trace_max_entries,
- bool alloc_trace_record_context) = 0;
- virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
- virtual bool needsPoolSpecificPeerAccess() = 0;
- virtual std::string name() = 0;
- };
- // Allocator object, statically initialized
- // See BackendInitializer in CUDACachingAllocator.cpp.
- // Atomic loads on x86 are just normal loads
- // (atomic stores are different), so reading this value
- // is no different from loading a pointer.
- C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;
- inline CUDAAllocator* get() {
- return allocator.load();
- }
- // Called directly by clients.
- inline void* raw_alloc(size_t nbytes) {
- return get()->raw_alloc(nbytes);
- }
- inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
- return get()->raw_alloc_with_stream(nbytes, stream);
- }
- inline void raw_delete(void* ptr) {
- return get()->raw_delete(ptr);
- }
- inline void init(int device_count) {
- return get()->init(device_count);
- }
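- // setMemoryFraction below caps how much of a device's total memory this
- // process's caching allocator may use. Illustrative sketch limiting device 0
- // to half of its memory (the fraction is expected to lie in [0, 1]):
- //
- //   setMemoryFraction(0.5, /*device=*/0);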
- inline void setMemoryFraction(double fraction, int device) {
- return get()->setMemoryFraction(fraction, device);
- }
- inline void emptyCache() {
- return get()->emptyCache();
- }
- inline void cacheInfo(int dev_id, size_t* largestBlock) {
- return get()->cacheInfo(dev_id, largestBlock);
- }
- inline void* getBaseAllocation(void* ptr, size_t* size) {
- return get()->getBaseAllocation(ptr, size);
- }
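- // recordStream (below) must be called whenever memory from this allocator is
- // used on a stream other than the one it was allocated on; the block is then
- // not reused until the recorded stream's pending work completes. Sketch with
- // hypothetical variables: `ptr` is a DataPtr obtained from this allocator and
- // `side_stream` is the extra CUDAStream consuming it:
- //
- //   recordStream(ptr, side_stream);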
- inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
- return get()->recordStream(dataPtr, stream);
- }
- inline DeviceStats getDeviceStats(int device) {
- return get()->getDeviceStats(device);
- }
- inline void resetAccumulatedStats(int device) {
- return get()->resetAccumulatedStats(device);
- }
- inline void resetPeakStats(int device) {
- return get()->resetPeakStats(device);
- }
- inline SnapshotInfo snapshot() {
- return get()->snapshot();
- }
- // CUDAGraph interactions
- inline void notifyCaptureBegin(
- int device,
- CaptureId_t graph_id,
- MempoolId_t mempool_id) {
- return get()->notifyCaptureBegin(device, graph_id, mempool_id);
- }
- inline void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) {
- return get()->notifyCaptureAboutToEnd(device, graph_id);
- }
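- // For example, history recording can be enabled without a per-allocation
- // context recorder (a null context_recorder is assumed to be accepted here;
- // the entry bound is illustrative):
- //
- //   recordHistory(
- //       /*enabled=*/true,
- //       /*context_recorder=*/nullptr,
- //       /*alloc_trace_max_entries=*/10000,
- //       /*alloc_trace_record_context=*/false);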
- inline void recordHistory(
- bool enabled,
- CreateContextFn context_recorder,
- size_t alloc_trace_max_entries,
- bool alloc_trace_record_context) {
- return get()->recordHistory(
- enabled,
- context_recorder,
- alloc_trace_max_entries,
- alloc_trace_record_context);
- }
- inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
- return get()->attachOutOfMemoryObserver(observer);
- }
- inline void notifyCaptureEnded(int device, CaptureId_t graph_id) {
- return get()->notifyCaptureEnded(device, graph_id);
- }
- inline void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
- return get()->notifyCaptureDestroy(device, mempool_id);
- }
- // Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
- inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
- return get()->getIpcDevPtr(handle);
- }
- inline std::string name() {
- return get()->name();
- }
- } // namespace CUDACachingAllocator
- } // namespace cuda
- } // namespace c10