#pragma once

#include <cstdint>
#include <functional>
#include <ostream>
#include <tuple>
#include <utility>

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
/*
 * Stream pool note.
 *
 * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
 * are backed by cuStreams, but they use several pools to minimize the costs
 * associated with creating, retaining, and destroying cuStreams.
 *
 * There are three pools per device, and a device's pools are lazily created.
 *
 * The first pool contains only the default stream. When the default stream
 * is requested it's returned.
 *
 * The second pool is the "low priority" or "default priority" streams. In
 * HIP builds there is no distinction between streams in this pool and streams
 * in the third pool (below). There are 32 of these streams per device, and
 * when a stream is requested one of these streams is returned round-robin.
 * That is, the first stream requested is at index 0, the second at index 1...
 * to index 31, then index 0 again.
 *
 * This means that if 33 low priority streams are requested, the first and
 * last streams requested are actually the same stream (under the covers)
 * and kernels enqueued on them cannot run concurrently.
 *
 * The third pool is the "high priority" streams. The third pool acts like
 * the second pool except the streams are created with a higher priority.
 *
 * These pools suggest that stream users should prefer many short-lived streams,
 * as the cost of acquiring and releasing streams is effectively zero. If
 * many longer-lived streams are required in performance critical scenarios
 * then the functionality here may need to be extended to allow, for example,
 * "reserving" a subset of the pool so that other streams do not accidentally
 * overlap the performance critical streams.
 *
 * Note: although the notion of "current stream for device" is thread local
 * (every OS thread has a separate current stream, as one might expect),
 * the stream pool is global across all threads; stream 0 is always stream 0
 * no matter which thread you use it on. Multiple threads can synchronize
 * on the same stream. Although the CUDA documentation is not very clear
 * on the matter, streams are thread safe; e.g., it is safe to enqueue
 * a kernel on the same stream from two different threads.
 */
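
// Example (an illustrative sketch, not part of the original header): the
// round-robin behavior described above, observed through getStreamFromPool()
// declared later in this file.
//
//   c10::cuda::CUDAStream s0 = c10::cuda::getStreamFromPool();
//   c10::cuda::CUDAStream s1 = c10::cuda::getStreamFromPool();
//   // s0 and s1 wrap distinct cudaStream_t handles, so kernels enqueued
//   // on them may run concurrently. After 32 requests the pool wraps:
//   // the 33rd stream shares s0's underlying cudaStream_t, and work
//   // enqueued on the two is implicitly serialized.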
namespace c10 {
namespace cuda {

// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
 public:
  enum Unchecked { UNCHECKED };

  /// Construct a CUDAStream from a Stream. This construction is checked,
  /// and will raise an error if the Stream is not, in fact, a CUDA stream.
  explicit CUDAStream(Stream stream) : stream_(stream) {
    TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
  }

  /// Construct a CUDAStream from a Stream with no error checking.
  /// This constructor uses the "named" constructor idiom, and can
  /// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
  explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
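
  // Example (a sketch, not part of the original header): checked vs.
  // unchecked construction. The unchecked form skips the TORCH_CHECK on
  // the device type and should only be used when the Stream is already
  // known to be a CUDA stream.
  //
  //   c10::Stream s = /* some c10::Stream known to be CUDA */;
  //   CUDAStream checked(s);                         // validates device type
  //   CUDAStream trusted(CUDAStream::UNCHECKED, s);  // no validation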
  bool operator==(const CUDAStream& other) const noexcept {
    return unwrap() == other.unwrap();
  }

  bool operator!=(const CUDAStream& other) const noexcept {
    return unwrap() != other.unwrap();
  }

  /// Implicit conversion to cudaStream_t.
  operator cudaStream_t() const {
    return stream();
  }

  /// Implicit conversion to Stream (a.k.a., forget that the stream is a
  /// CUDA stream).
  operator Stream() const {
    return unwrap();
  }
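
  // Example (a sketch; dst, src, n, blocks, threads, and my_kernel are
  // hypothetical): thanks to the implicit conversion above, a CUDAStream
  // can be passed directly wherever a cudaStream_t is expected.
  //
  //   CUDAStream stream = getCurrentCUDAStream();
  //   cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, stream);
  //   my_kernel<<<blocks, threads, 0, stream>>>(dst);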
  /// Used to avoid baking in device type explicitly to Python-side API.
  DeviceType device_type() const {
    return DeviceType::CUDA;
  }

  /// Get the CUDA device index that this stream is associated with.
  DeviceIndex device_index() const {
    return stream_.device_index();
  }

  /// Get the full Device that this stream is associated with. The Device
  /// is guaranteed to be a CUDA device.
  Device device() const {
    return Device(DeviceType::CUDA, device_index());
  }

  /// Return the stream ID corresponding to this particular stream.
  StreamId id() const {
    return stream_.id();
  }

  /// Return true if all work enqueued on this stream so far has completed;
  /// this call does not block.
  bool query() const {
    DeviceGuard guard{stream_.device()};
    cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));

    if (err == cudaSuccess) {
      return true;
    } else if (err != cudaErrorNotReady) {
      C10_CUDA_CHECK(err);
    } else {
      // ignore and clear the error if not ready
      (void)cudaGetLastError();
    }

    return false;
  }
  /// Block the calling host thread until all work enqueued on this stream
  /// has completed.
  void synchronize() const {
    DeviceGuard guard{stream_.device()};
    c10::cuda::stream_synchronize(stream());
  }
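
  // Example (a sketch): non-blocking completion check vs. blocking wait.
  //
  //   if (stream.query()) {
  //     // all work enqueued on `stream` so far has finished
  //   } else {
  //     stream.synchronize();  // block the host thread until it has
  //   }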
  /// Return the CUDA priority of the underlying stream (lower numbers mean
  /// higher priority).
  int priority() const {
    DeviceGuard guard{stream_.device()};
    int priority = 0;
    C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
    return priority;
  }
  /// Explicit conversion to cudaStream_t.
  cudaStream_t stream() const;

  /// Explicit conversion to Stream.
  Stream unwrap() const {
    return stream_;
  }

  /// Reversibly pack a CUDAStream into a struct representation.
  /// Previously the stream's data was packed into a single int64_t,
  /// as it was assumed the fields would not require more than
  /// 64 bits of storage in total.
  /// See https://github.com/pytorch/pytorch/issues/75854
  /// for more information regarding newer platforms that may violate
  /// this assumption.
  ///
  /// The CUDAStream can be unpacked using unpack3().
  struct c10::StreamData3 pack3() const {
    return stream_.pack3();
  }

  // Unpack a CUDAStream from the 3 fields generated by pack3().
  static CUDAStream unpack3(
      StreamId stream_id,
      DeviceIndex device_index,
      DeviceType device_type) {
    return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
  }
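
  // Example (a sketch; field names assume the c10::StreamData3 layout in
  // c10/core/Stream.h): a pack3()/unpack3() round-trip, e.g. for shuttling
  // a stream identity across an FFI boundary.
  //
  //   c10::StreamData3 d = stream.pack3();
  //   CUDAStream same =
  //       CUDAStream::unpack3(d.stream_id, d.device_index, d.device_type);
  //   TORCH_INTERNAL_ASSERT(same == stream);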
  static std::tuple<int, int> priority_range() {
    // Note: this returns the range of priority **supported by PyTorch**, not
    // the range of priority **supported by CUDA**. The former is a subset of
    // the latter. Currently PyTorch only supports 0 and -1, which are "low"
    // and "high" priority.
    int least_priority, greatest_priority;
    C10_CUDA_CHECK(
        cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
    TORCH_INTERNAL_ASSERT(
        least_priority >= 0, "Unexpected CUDA stream priority range");
    TORCH_INTERNAL_ASSERT(
        greatest_priority <= -1, "Unexpected CUDA stream priority range");
    return std::make_tuple(0, -1);
  }
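
  // Example (a sketch): relating priority_range() to priority(). A stream
  // from the high priority pool reports the high end of the range.
  //
  //   auto [low, high] = CUDAStream::priority_range();  // (0, -1)
  //   CUDAStream hi_stream = getStreamFromPool(/*isHighPriority=*/true);
  //   TORCH_INTERNAL_ASSERT(hi_stream.priority() == high);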
  // Deleted for now; use CUDAEvent::block instead
  // void synchronize_with(const CUDAEvent& event) const;

 private:
  Stream stream_;
};
/**
 * Get a new stream from the CUDA stream pool. You can think of this
 * as "creating" a new stream, but no such creation actually happens;
 * instead, streams are preallocated from the pool and returned in a
 * round-robin fashion.
 *
 * You can request a stream from the high priority pool by setting
 * isHighPriority to true, or a stream for a specific device by setting
 * device (defaulting to the current device if -1 is passed).
 */
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
/**
 * Get a CUDAStream from an externally allocated one.
 *
 * This is mainly for interoperability with different libraries where we
 * want to operate on a non-torch allocated stream for data exchange or
 * similar purposes.
 */
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
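
// Example (a sketch, not part of the original header): wrapping a stream
// created directly with the CUDA runtime so work can be enqueued on it via
// the usual c10 machinery. Device index 0 is assumed; the caller remains
// responsible for destroying `raw`.
//
//   cudaStream_t raw = nullptr;
//   C10_CUDA_CHECK(cudaStreamCreate(&raw));
//   c10::cuda::CUDAStream ext = c10::cuda::getStreamFromExternal(raw, 0);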
/**
 * Get the default CUDA stream, for the passed CUDA device, or for the
 * current device if no device index is passed. The default stream is
 * where most computation occurs when you aren't explicitly using
 * streams.
 */
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
/**
 * Get the current CUDA stream, for the passed CUDA device, or for the
 * current device if no device index is passed. The current CUDA stream
 * will usually be the default CUDA stream for the device, but it may
 * be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
 * or 'CUDAStreamGuard'.
 */
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
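
// Example (a sketch; my_kernel and its arguments are hypothetical): most
// call sites fetch the current stream rather than naming one explicitly,
// so they compose with whatever stream guard is active up the call stack.
//
//   cudaStream_t cur = c10::cuda::getCurrentCUDAStream();  // implicit conversion
//   my_kernel<<<blocks, threads, 0, cur>>>(args);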
/**
 * Set the current stream on the device of the passed in stream to be
 * the passed in stream. Yes, you read that right: this function
 * has *nothing* to do with the current device: it toggles the current
 * stream of the device of the passed stream.
 *
 * Confused? Avoid using this function; prefer using 'CUDAStreamGuard'
 * instead (which will switch both your current device and current stream
 * in the way you expect, and reset it back to its original state
 * afterwards).
 */
C10_API void setCurrentCUDAStream(CUDAStream stream);
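
// Example (a sketch): prefer the RAII guard over calling
// setCurrentCUDAStream directly. CUDAStreamGuard is declared in
// c10/cuda/CUDAGuard.h, not in this file.
//
//   {
//     c10::cuda::CUDAStreamGuard guard(stream);
//     // the current device *and* current stream now match `stream`
//   } // both are restored here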
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);

} // namespace cuda
} // namespace c10
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
  size_t operator()(c10::cuda::CUDAStream s) const noexcept {
    return std::hash<c10::Stream>{}(s.unwrap());
  }
};
} // namespace std
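
// Example (a sketch): the hash specialization above makes CUDAStream usable
// as a key in unordered containers (requires <unordered_set>).
//
//   std::unordered_set<c10::cuda::CUDAStream> seen;
//   seen.insert(c10::cuda::getCurrentCUDAStream());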