// CUDACachingAllocator.h

#pragma once

#include <c10/core/Allocator.h>
#include <c10/cuda/CUDAGraphsC10Utils.h>
#include <c10/cuda/CUDAMacros.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/util/Registry.h>

#include <array>
#include <mutex>

namespace c10 {

// The caching allocator will execute every registered callback if it is
// unable to find a block inside of the already allocated area.
class C10_CUDA_API FreeMemoryCallback {
 public:
  virtual ~FreeMemoryCallback() = default;
  virtual bool Execute() = 0;
};

C10_DECLARE_REGISTRY(FreeCudaMemoryCallbacksRegistry, FreeMemoryCallback);
#define REGISTER_FREE_MEMORY_CALLBACK(name, ...) \
  C10_REGISTER_CLASS(FreeCudaMemoryCallbacksRegistry, name, __VA_ARGS__);
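
// Example (illustrative sketch, not part of the original header): registering
// a callback that the allocator can invoke to reclaim memory before retrying
// cudaMalloc. The class name `MyFreeMemoryCallback` is hypothetical.
//
//   class MyFreeMemoryCallback : public FreeMemoryCallback {
//    public:
//     bool Execute() override {
//       // Try to release externally held CUDA memory here; return true
//       // only if some memory was actually freed.
//       return false;
//     }
//   };
//   REGISTER_FREE_MEMORY_CALLBACK(my_free_memory_callback, MyFreeMemoryCallback);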
namespace cuda {

// TODO: Turn this into an honest to goodness class. I briefly attempted to do
// this, but it was a bit irritating to figure out how to also correctly
// apply pimpl pattern so I didn't have to leak any internal implementation
// details in the header (CUDACachingAllocator could be made a pimpl, but
// you also need to appropriately define a class which is a subclass
// of Allocator. Not impossible, but required a bit more surgery than
// I wanted to do at the time.)
//
// Why is this using a namespace rather than old-style THCCachingAllocator_
// prefix? Mostly because it made the HIPify rules easier to write; _ is
// not counted as a word boundary, so you would otherwise have to list each
// of these functions.

namespace CUDACachingAllocator {

struct Stat {
  int64_t current = 0;
  int64_t peak = 0;
  int64_t allocated = 0;
  int64_t freed = 0;
};

enum struct StatType : uint64_t {
  AGGREGATE = 0,
  SMALL_POOL = 1,
  LARGE_POOL = 2,
  NUM_TYPES = 3 // remember to update this whenever a new stat type is added
};

typedef std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)> StatArray;
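
// Example (illustrative): a StatArray holds one Stat per StatType, so a
// given pool's counters are read by casting the enum value to an index.
//
//   StatArray allocated_bytes; // e.g. a field of DeviceStats, defined below
//   int64_t small_pool_current =
//       allocated_bytes[static_cast<size_t>(StatType::SMALL_POOL)].current;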
// Struct containing memory allocator summary statistics for a device.
struct DeviceStats {
  // COUNT: allocations requested by client code
  StatArray allocation;
  // COUNT: number of allocated segments from cudaMalloc().
  StatArray segment;
  // COUNT: number of active memory blocks (allocated or used by stream)
  StatArray active;
  // COUNT: number of inactive, split memory blocks (unallocated but can't be
  // released via cudaFree)
  StatArray inactive_split;

  // SUM: bytes allocated by this memory allocator
  StatArray allocated_bytes;
  // SUM: bytes reserved by this memory allocator (both free and used)
  StatArray reserved_bytes;
  // SUM: bytes within active memory blocks
  StatArray active_bytes;
  // SUM: bytes within inactive, split memory blocks
  StatArray inactive_split_bytes;
  // SUM: bytes requested by client code
  StatArray requested_bytes;

  // COUNT: total number of failed calls to CUDA malloc necessitating cache
  // flushes.
  int64_t num_alloc_retries = 0;

  // COUNT: total number of OOMs (i.e. failed calls to CUDA after cache flush)
  int64_t num_ooms = 0;

  // COUNT: total number of oversize blocks allocated from pool
  Stat oversize_allocations;

  // COUNT: total number of oversize blocks requiring malloc
  Stat oversize_segments;

  // SIZE: maximum block size that is allowed to be split.
  int64_t max_split_size = 0;
};
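
// Example (illustrative sketch): reading the aggregate statistics for a
// device via getDeviceStats() and format_size(), both declared further below.
//
//   DeviceStats stats = CUDACachingAllocator::getDeviceStats(/*device=*/0);
//   const auto agg = static_cast<size_t>(StatType::AGGREGATE);
//   std::cout << "allocated: " << format_size(stats.allocated_bytes[agg].current)
//             << ", reserved: " << format_size(stats.reserved_bytes[agg].current)
//             << std::endl;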
struct Context {
  virtual ~Context() = default;
};

typedef std::shared_ptr<Context> (*CreateContextFn)(void);

struct History {
  void* addr;
  size_t real_size; // unrounded, actually requested size
  std::shared_ptr<Context> context; // per-watcher context
};

// Struct containing info of an allocation block (i.e. a fractional part of a
// cudaMalloc).
struct BlockInfo {
  int64_t size = 0;
  int64_t requested_size = 0;
  int32_t gc_counter = 0;
  bool allocated = false;
  bool active = false;
  std::vector<History> history;
};

// Struct containing info of a memory segment (i.e. one contiguous cudaMalloc).
struct SegmentInfo {
  int64_t device = 0;
  int64_t address = 0;
  int64_t total_size = 0;
  int64_t requested_size = 0;
  int64_t allocated_size = 0;
  int64_t active_size = 0;
  cudaStream_t stream = 0;
  bool is_large = false;
  std::vector<BlockInfo> blocks;
};
struct TraceEntry {
  enum Action {
    ALLOC, // API call made to the caching allocator for new memory
    FREE_REQUESTED, // API call made to the caching allocator to free memory
    FREE_COMPLETED, // The allocator might have to delay a free because
                    // it is still in use on another stream via record_stream.
                    // This event is generated when a free actually completes.
    SEGMENT_ALLOC, // a call to cudaMalloc to get more memory from the OS
    SEGMENT_FREE, // a call to cudaFree to return memory to the OS (e.g. to
                  // defragment or empty_caches)
    SNAPSHOT, // a call to snapshot, used to correlate memory snapshots to trace
              // events
    OOM // the allocator threw an OutOfMemoryError (addr_ is the amount of free
        // bytes reported by cuda)
  };
  TraceEntry(
      Action action,
      int64_t addr,
      size_t size,
      cudaStream_t stream,
      std::shared_ptr<Context> context = nullptr)
      : action_(action),
        addr_(addr),
        context_(context),
        stream_(stream),
        size_(size) {}
  Action action_;
  int64_t addr_; // for OOM, this is the amount of free bytes reported by cuda
  std::shared_ptr<Context> context_;
  cudaStream_t stream_;
  int64_t size_;
};
struct SnapshotInfo {
  std::vector<SegmentInfo> segments;
  std::vector<std::vector<TraceEntry>> device_traces;
};

C10_CUDA_API void setAllocatorSettings(const std::string& env);

// Size pretty-printer
std::string format_size(uint64_t size);

using OutOfMemoryObserver = std::function<void(
    int64_t device,
    int64_t allocated,
    int64_t device_total,
    int64_t device_free)>;
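
// Example (illustrative): an OutOfMemoryObserver is a plain std::function, so
// a lambda with the matching signature can be attached through
// attachOutOfMemoryObserver() (declared below), e.g. to log OOM events.
//
//   attachOutOfMemoryObserver([](int64_t device,
//                                int64_t allocated,
//                                int64_t device_total,
//                                int64_t device_free) {
//     std::cerr << "OOM on device " << device << ": allocated "
//               << format_size(allocated) << ", free "
//               << format_size(device_free) << " of "
//               << format_size(device_total) << std::endl;
//   });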
class CUDAAllocator : public Allocator {
 public:
  virtual void* raw_alloc(size_t nbytes) = 0;
  virtual void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) = 0;
  virtual void raw_delete(void* ptr) = 0;
  virtual void init(int device_count) = 0;
  virtual bool initialized() = 0;
  virtual void setMemoryFraction(double fraction, int device) = 0;
  virtual void emptyCache() = 0;
  virtual void cacheInfo(int dev_id, size_t* largestBlock) = 0;
  virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
  virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
  virtual DeviceStats getDeviceStats(int device) = 0;
  virtual void resetAccumulatedStats(int device) = 0;
  virtual void resetPeakStats(int device) = 0;
  virtual SnapshotInfo snapshot() = 0;
  virtual void notifyCaptureBegin(
      int device,
      CaptureId_t graph_id,
      MempoolId_t mempool_id) = 0;
  virtual void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) = 0;
  virtual void notifyCaptureEnded(int device, CaptureId_t graph_id) = 0;
  virtual void notifyCaptureDestroy(int device, MempoolId_t mempool_id) = 0;
  virtual std::shared_ptr<void> getIpcDevPtr(std::string handle) = 0;
  virtual void recordHistory(
      bool enabled,
      CreateContextFn context_recorder,
      size_t alloc_trace_max_entries,
      bool alloc_trace_record_context) = 0;
  virtual void attachOutOfMemoryObserver(OutOfMemoryObserver observer) = 0;
  virtual bool needsPoolSpecificPeerAccess() = 0;
  virtual std::string name() = 0;
};
// Allocator object, statically initialized.
// See BackendInitializer in CUDACachingAllocator.cpp.
// Atomic loads on x86 are just normal loads (atomic stores are different),
// so reading this value is no different from loading a pointer.
C10_CUDA_API extern std::atomic<CUDAAllocator*> allocator;

inline CUDAAllocator* get() {
  return allocator.load();
}
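
// Example (illustrative sketch): a raw allocation round trip through the
// currently installed backend. Memory obtained from raw_alloc() must be
// returned via raw_delete(), not cudaFree(), so the cache stays consistent.
//
//   void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(1 << 20); // 1 MiB
//   // ... launch kernels that use buf on the current stream ...
//   c10::cuda::CUDACachingAllocator::raw_delete(buf);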
// Called directly by clients.
inline void* raw_alloc(size_t nbytes) {
  return get()->raw_alloc(nbytes);
}

inline void* raw_alloc_with_stream(size_t nbytes, cudaStream_t stream) {
  return get()->raw_alloc_with_stream(nbytes, stream);
}

inline void raw_delete(void* ptr) {
  return get()->raw_delete(ptr);
}

inline void init(int device_count) {
  return get()->init(device_count);
}

inline void setMemoryFraction(double fraction, int device) {
  return get()->setMemoryFraction(fraction, device);
}

inline void emptyCache() {
  return get()->emptyCache();
}

inline void cacheInfo(int dev_id, size_t* largestBlock) {
  return get()->cacheInfo(dev_id, largestBlock);
}

inline void* getBaseAllocation(void* ptr, size_t* size) {
  return get()->getBaseAllocation(ptr, size);
}

inline void recordStream(const DataPtr& dataPtr, CUDAStream stream) {
  return get()->recordStream(dataPtr, stream);
}
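
// Example (illustrative sketch): recordStream() tells the allocator that a
// block is also in use on a stream other than the one it was allocated on,
// so the block is not reused until that stream's pending work completes.
// `data_ptr` is assumed to come from some tensor/storage owned by the caller.
//
//   // DataPtr& data_ptr = ...;
//   CUDAStream side_stream = getStreamFromPool();
//   recordStream(data_ptr, side_stream);
//   // The allocator now defers reuse of the block until side_stream catches up.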
inline DeviceStats getDeviceStats(int device) {
  return get()->getDeviceStats(device);
}

inline void resetAccumulatedStats(int device) {
  return get()->resetAccumulatedStats(device);
}

inline void resetPeakStats(int device) {
  return get()->resetPeakStats(device);
}

inline SnapshotInfo snapshot() {
  return get()->snapshot();
}
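
// Example (illustrative sketch): walking a snapshot to total the reserved
// memory across all recorded segments.
//
//   SnapshotInfo info = snapshot();
//   int64_t reserved = 0;
//   for (const SegmentInfo& seg : info.segments) {
//     reserved += seg.total_size;
//   }
//   std::cout << "reserved: " << format_size(reserved) << std::endl;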
// CUDAGraph interactions
inline void notifyCaptureBegin(
    int device,
    CaptureId_t graph_id,
    MempoolId_t mempool_id) {
  return get()->notifyCaptureBegin(device, graph_id, mempool_id);
}

inline void notifyCaptureAboutToEnd(int device, CaptureId_t graph_id) {
  return get()->notifyCaptureAboutToEnd(device, graph_id);
}

inline void recordHistory(
    bool enabled,
    CreateContextFn context_recorder,
    size_t alloc_trace_max_entries,
    bool alloc_trace_record_context) {
  return get()->recordHistory(
      enabled,
      context_recorder,
      alloc_trace_max_entries,
      alloc_trace_record_context);
}

inline void attachOutOfMemoryObserver(OutOfMemoryObserver observer) {
  return get()->attachOutOfMemoryObserver(observer);
}

inline void notifyCaptureEnded(int device, CaptureId_t graph_id) {
  return get()->notifyCaptureEnded(device, graph_id);
}

inline void notifyCaptureDestroy(int device, MempoolId_t mempool_id) {
  return get()->notifyCaptureDestroy(device, mempool_id);
}

// Not part of CUDA_ALLOCATOR_BACKEND_INTERFACE
inline std::shared_ptr<void> getIpcDevPtr(std::string handle) {
  return get()->getIpcDevPtr(handle);
}

inline std::string name() {
  return get()->name();
}

} // namespace CUDACachingAllocator
} // namespace cuda
} // namespace c10