jiterator_impl.h

#pragma once
#include <ATen/jit_macros.h>

#if AT_USE_JITERATOR()

#include <c10/util/variant.h>
#include <ATen/native/TensorIterator.h>
#include <ATen/cuda/detail/OffsetCalculator.cuh>
#include <ATen/native/cuda/jit_utils.h>
#include <ATen/native/cuda/MemoryAccess.cuh>
#include <ATen/native/cuda/JitLoops.cuh>

#include <algorithm>
#include <array>
#include <memory>
#include <string>
#include <vector>

namespace at {
namespace native {
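// Helpers used by the jiterator to dispatch, at runtime, over the number of
// tensors a TensorIterator holds. The macros below expand a callback once for
// each arity from 1 to 8 and are used to stamp out one variant alternative
// (or switch case) per supported tensor count.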
#define AT_FOR_8_CASES(_)  \
  _(1)                     \
  _(2)                     \
  _(3)                     \
  _(4)                     \
  _(5)                     \
  _(6)                     \
  _(7)                     \
  _(8)

#define AT_FOR_8_CASES_WITH_COMMA(_)  \
  _(1) ,                              \
  _(2) ,                              \
  _(3) ,                              \
  _(4) ,                              \
  _(5) ,                              \
  _(6) ,                              \
  _(7) ,                              \
  _(8)
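// Resolves the C++ type name of each extra scalar argument (via
// at::cuda::jit::typeName) so it can be used during kernel code generation.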
c10::SmallVector<std::string> get_extra_args_typenames(const c10::SmallVector<at::Scalar>& extra_args) {
  c10::SmallVector<std::string> args_typenames(extra_args.size());
  for (auto i = 0; i < extra_args.size(); ++i) {
    args_typenames[i] = at::cuda::jit::typeName(extra_args[i].type());
  }
  return args_typenames;
}
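// Dispatches on the runtime ScalarType to query the maximum vectorized
// load/store width (in elements) supported for the given pointer.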
int can_vectorize_up_to(at::ScalarType type, char* pointer) {
  switch(type) {
#define DEFINE_CASE(ctype, scalartype)                   \
    case ScalarType::scalartype : return memory::can_vectorize_up_to<ctype>(pointer);

    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_CASE)
#undef DEFINE_CASE

    default: TORCH_INTERNAL_ASSERT(false, "Unrecognized ScalarType: ", type);
  }
}
// jitted version of the above
// See Note [Jiterator], this relies on the assumptions enumerated there
int jitted_can_vectorize_up_to(const TensorIteratorBase& iter) {
  const at::ScalarType common_dtype = iter.common_dtype();
  const at::ScalarType result_dtype = common_dtype;

  // Deals with output
  int result = can_vectorize_up_to(result_dtype, static_cast<char*>(iter.data_ptr(0)));

  // Incorporates input(s)
  for (auto i = 1; i < iter.ntensors(); ++i) {
    result = std::min<int>(result, can_vectorize_up_to(common_dtype, static_cast<char*>(iter.data_ptr(i))));
  }

  return result;
}
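// Builds an OffsetCalculator for either the N inputs or the N outputs of the
// iterator, collecting the per-tensor strides and element sizes it needs.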
template<bool IS_INPUT, int N>
static std::unique_ptr<OffsetCalculator<N>> make_unique_offset_calculator(
    const TensorIteratorBase& iter) {
  // array size cannot be 0, so clamp it to 1 when N == 0
  constexpr int array_size = std::max<int>(N, 1);
  TORCH_INTERNAL_ASSERT(N == (IS_INPUT ? iter.ninputs() : iter.noutputs()));

  std::array<const int64_t*, array_size> strides;
  int64_t element_sizes[array_size];
  for (int i = 0; i < N; i++) {
    int index = IS_INPUT ? i + iter.noutputs() : i;
    strides[i] = iter.strides(index).data();
    element_sizes[i] = iter.element_size(index);
  }
  return std::make_unique<OffsetCalculator<N>>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
}
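// Type-erased holder for an OffsetCalculator<N>: the constructor selects the
// instantiation matching the runtime input (or output) count, and data_ptr()
// exposes the underlying object as a void*.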
template <bool IS_INPUT>
struct OffsetCalculatorVariant {
#define DEFINE_CASE(index) std::unique_ptr<OffsetCalculator<index>>
  using OffsetCalculatorTypes = c10::variant<
    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
  >;
#undef DEFINE_CASE

  OffsetCalculatorVariant(const TensorIteratorBase& iter) {
    int num = IS_INPUT ? iter.ninputs() : iter.noutputs();

    switch(num) {
#define DEFINE_CASE(index)        \
      case index : v = make_unique_offset_calculator<IS_INPUT, index>(iter); break;

      AT_FOR_8_CASES(DEFINE_CASE)
#undef DEFINE_CASE
      default:
        TORCH_CHECK(false, "OffsetCalculatorVariant is not implemented for num_tensor = ", num);
    }
  }

  void* data_ptr() {
    return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
  }

 private:
  OffsetCalculatorTypes v;
};
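// Variant over at::detail::Array<char*, N> that stores the data pointer of
// every tensor in the iterator; the alternatives cover 1-8 and 9-16 tensors
// (index and index+8).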
struct ArrayVariant {
  // works for up to 8 inputs + 8 outputs
#define DEFINE_CASE(index) at::detail::Array<char*, index>, at::detail::Array<char*, index+8>
  using ArrayTypes = c10::variant<
    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
  >;
#undef DEFINE_CASE

  ArrayVariant(const TensorIteratorBase& iter) {
    int ntensors = iter.ntensors();
    switch(ntensors) {
#define DEFINE_CASE(index)                                            \
      case index: array = at::detail::Array<char*, index>{}; break;   \
      case index+8: array = at::detail::Array<char*, index+8>{}; break;

      AT_FOR_8_CASES(DEFINE_CASE)
#undef DEFINE_CASE

      default:
        TORCH_CHECK(false, "ArrayVariant is not implemented for ntensors = ", ntensors);
    }

    c10::visit([&](auto& a) {
      for (auto i = 0; i < ntensors; ++i) {
        a[i] = (char*)iter.data_ptr(i);
      }
    }, array);
  }

  void* data_ptr() {
    return c10::visit([](auto & a){ return static_cast<void*>(&a); }, array);
  }

 private:
  ArrayTypes array;
};
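// Variant over TrivialOffsetCalculator<N> (one alternative per arity 1-8),
// whose offsets are simply the linear index, so no stride data is required.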
struct TrivialOffsetCalculatorVariant {
#define DEFINE_CASE(index) TrivialOffsetCalculator<index>
  using TrivialOffsetCalculatorTypes = c10::variant<
    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
  >;
#undef DEFINE_CASE

  TrivialOffsetCalculatorVariant(int num) {
    switch(num) {
#define DEFINE_CASE(index)      \
      case index: v = TrivialOffsetCalculator<index>(); break;

      AT_FOR_8_CASES(DEFINE_CASE)
#undef DEFINE_CASE

      default:
        TORCH_CHECK(false, "TrivialOffsetCalculatorVariant is not implemented for num_tensors = ", num);
    }
  }

  void* data_ptr() {
    return c10::visit([](auto & v){ return static_cast<void*>(&v); }, v);
  }

 private:
  TrivialOffsetCalculatorTypes v;
};
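// Variant over memory::LoadWithCast<arity> (one alternative per number of
// inputs), which loads input elements and casts them to the computation dtype.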
struct LoadWithCastVariant {
#define DEFINE_CASE(index) std::unique_ptr<memory::LoadWithCast<index>>
  using LoadWithCastPtr = c10::variant<
    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
  >;
#undef DEFINE_CASE

  LoadWithCastVariant(const TensorIteratorBase& iter) {
    int arity = iter.ninputs();
    switch(arity) {
#define DEFINE_CASE(index)      \
      case index: v = std::make_unique<memory::LoadWithCast<index>>(iter); break;

      AT_FOR_8_CASES(DEFINE_CASE)
#undef DEFINE_CASE

      default:
        TORCH_CHECK(false, "LoadWithCastVariant is not implemented for ninputs = ", arity);
    }
  }

  void* data_ptr() {
    return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
  }

 private:
  LoadWithCastPtr v;
};
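// Output-side counterpart of LoadWithCastVariant: memory::StoreWithCast casts
// computed values to the output dtype before writing them out.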
struct StoreWithCastVariant {
#define DEFINE_CASE(index) std::unique_ptr<memory::StoreWithCast<index>>
  using StoreWithCastPtr = c10::variant<
    AT_FOR_8_CASES_WITH_COMMA(DEFINE_CASE)
  >;
#undef DEFINE_CASE

  StoreWithCastVariant(const TensorIteratorBase& iter) {
    int num = iter.noutputs();
    switch(num) {
#define DEFINE_CASE(index)      \
      case index: v = std::make_unique<memory::StoreWithCast<index>>(iter); break;

      AT_FOR_8_CASES(DEFINE_CASE)
#undef DEFINE_CASE

      default:
        TORCH_CHECK(false, "StoreWithCastVariant is not implemented for noutputs = ", num);
    }
  }

  void* data_ptr() {
    return c10::visit([](auto & v){ return static_cast<void*>(v.get()); }, v);
  }

 private:
  StoreWithCastPtr v;
};

}} // namespace at::native

#endif // AT_USE_JITERATOR()