123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250 |
- #pragma once
- #include <ATen/jit_macros.h>
- #if AT_USE_JITERATOR()
- #include <c10/util/variant.h>
- #include <ATen/native/TensorIterator.h>
- #include <ATen/cuda/detail/OffsetCalculator.cuh>
- #include <ATen/native/cuda/jit_utils.h>
- #include <ATen/native/cuda/MemoryAccess.cuh>
- #include <ATen/native/cuda/JitLoops.cuh>
- #include <string>
- #include <vector>
- namespace at {
- namespace native {
- #define AT_FOR_8_CASES(_) \
- _(1) \
- _(2) \
- _(3) \
- _(4) \
- _(5) \
- _(6) \
- _(7) \
- _(8)
- #define AT_FOR_8_CASES_WITH_COMMA(_) \
- _(1) , \
- _(2) , \
- _(3) , \
- _(4) , \
- _(5) , \
- _(6) , \
- _(7) , \
- _(8)
- c10::SmallVector<std::string> get_extra_args_typenames(const c10::SmallVector<at::Scalar>& extra_args) {
- c10::SmallVector<std::string> args_typenames(extra_args.size());
- for (auto i = 0; i < extra_args.size(); ++i) {
- args_typenames[i] = at::cuda::jit::typeName(extra_args[i].type());
- }
- return args_typenames;
- }
- int can_vectorize_up_to(at::ScalarType type, char* pointer) {
- switch(type) {
- #define DEFINE_CASE(ctype, scalartype) \
- case ScalarType::scalartype : return memory::can_vectorize_up_to<ctype>(pointer);
- AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
- #undef DEFINE_CASE
- default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type);
- }
- }
- // jitted version of the above
- // See Note [Jiterator], this relies on the assumptions enumerated there
- int jitted_can_vectorize_up_to(const TensorIteratorBase& iter) {
- const at::ScalarType common_dtype = iter.common_dtype();
- const at::ScalarType result_dtype = common_dtype;
- // Deals with output
- int result = can_vectorize_up_to(result_dtype, static_cast<char*>(iter.data_ptr(0)));
- // Incorporates input(s)
- for (auto i = 1; i < iter.ntensors(); ++i) {
- result = std::min<int>(result, can_vectorize_up_to(common_dtype, static_cast<char*>(iter.data_ptr(i))));
- }
- return result;
- }
- template<bool IS_INPUT, int N>
- static std::unique_ptr<OffsetCalculator<N>> make_unique_offset_calculator(
- const TensorIteratorBase& iter) {
- // array size can not be 0, this happens when N == 0
- constexpr int array_size = std::max<int>(N, 1);
- TORCH_INTERNAL_ASSERT(N == (IS_INPUT ? iter.ninputs() : iter.noutputs()));
- std::array<const int64_t*, array_size> strides;
- int64_t element_sizes[array_size];
- for (int i = 0; i < N; i++) {
- int index = IS_INPUT ? i + iter.noutputs() : i;
- strides[i] = iter.strides(index).data();
- element_sizes[i] = iter.element_size(index);
- }
- return std::make_unique<OffsetCalculator<N>>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
- }
- template <bool IS_INPUT>
- struct OffsetCalculatorVariant {
- #define DEFINE_CASE(index) std::unique_ptr<OffsetCalculator<index>>
- using OffsetCalculatorTypes = c10::variant<
- AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
- >;
- #undef DEFINE_CASE
- OffsetCalculatorVariant(const TensorIteratorBase& iter) {
- int num = IS_INPUT ? iter.ninputs() : iter.noutputs();
- switch(num) {
- #define DEFINE_CASE(index) \
- case index : v = make_unique_offset_calculator<IS_INPUT, index>(iter); break;
- AT_FOR_8_CASES(DEFINE_CASE)
- #undef DEFINE_CASE
- default:
- TORCH_CHECK(false, "OffsetCalculatorVariant is not implemented for num_tensor = ", num);
- }
- }
- void* data_ptr() {
- return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
- }
- private:
- OffsetCalculatorTypes v;
- };
- struct ArrayVariant {
- // works for up to 8 input + 8 outputs
- #define DEFINE_CASE(index) at::detail::Array<char*, index>, at::detail::Array<char*, index+8>
- using ArrayTypes = c10::variant<
- AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
- >;
- #undef DEFINE_CASE
- ArrayVariant(const TensorIteratorBase& iter) {
- int ntensors = iter.ntensors();
- switch(ntensors) {
- #define DEFINE_CASE(index) \
- case index: array = at::detail::Array<char*, index>{}; break; \
- case index+8: array = at::detail::Array<char*, index+8>{}; break;
- AT_FOR_8_CASES(DEFINE_CASE)
- #undef DEFINE_CASE
- default:
- TORCH_CHECK(false, "ArrayVariant is not implemented for ntensors = ", ntensors);
- }
- c10::visit([&](auto& a) {
- for (auto i = 0; i < ntensors; ++i) {
- a[i] = (char*)iter.data_ptr(i);
- }
- }, array);
- }
- void* data_ptr() {
- return c10::visit([](auto & a){ return static_cast<void*>(&a); }, array);
- }
- private:
- ArrayTypes array;
- };
- struct TrivialOffsetCalculatorVariant {
- #define DEFINE_CASE(index) TrivialOffsetCalculator<index>
- using TrivialOffsetCalculatorTypes = c10::variant<
- AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
- >;
- #undef DEFINE_CASE
- TrivialOffsetCalculatorVariant(int num) {
- switch(num) {
- #define DEFINE_CASE(index) \
- case index: v = TrivialOffsetCalculator<index>(); break;
- AT_FOR_8_CASES(DEFINE_CASE)
- #undef DEFINE_CASE
- default:
- TORCH_CHECK(false, "TrivialOffsetCalculatorVariant is not implemented for num_tensors = ", num);
- }
- }
- void* data_ptr() {
- return c10::visit([](auto & v){ return static_cast<void*>(&v); }, v);
- }
- private:
- TrivialOffsetCalculatorTypes v;
- };
- struct LoadWithCastVariant {
- #define DEFINE_CASE(index) std::unique_ptr<memory::LoadWithCast<index>>
- using LoadWithCastPtr = c10::variant<
- AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
- >;
- #undef DEFINE_CASE
- LoadWithCastVariant(const TensorIteratorBase& iter) {
- int arity = iter.ninputs();
- switch(arity) {
- #define DEFINE_CASE(index) \
- case index: v = std::make_unique<memory::LoadWithCast<index>>(iter); break;
- AT_FOR_8_CASES(DEFINE_CASE)
- #undef DEFINE_CASE
- default:
- TORCH_CHECK(false, "LoadWithCastVariant is not implemented for ninputs = ", arity);
- }
- }
- void* data_ptr() {
- return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
- }
- private:
- LoadWithCastPtr v;
- };
- struct StoreWithCastVariant {
- #define DEFINE_CASE(index) std::unique_ptr<memory::StoreWithCast<index>>
- using StoreWithCastPtr = c10::variant<
- AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
- >;
- #undef DEFINE_CASE
- StoreWithCastVariant(const TensorIteratorBase& iter) {
- int num = iter.noutputs();
- switch(num) {
- #define DEFINE_CASE(index) \
- case index: v = std::make_unique<memory::StoreWithCast<index>>(iter); break;
- AT_FOR_8_CASES(DEFINE_CASE)
- #undef DEFINE_CASE
- default:
- TORCH_CHECK(false, "StoreWithCastVariant is not implemented for noutputs = ", num);
- }
- }
- void* data_ptr() {
- return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
- }
- private:
- StoreWithCastPtr v;
- };
- }} // namespace at::native
- #endif // AT_USE_JITERATOR()
|