jit_utils.h

#pragma once
#include <string>
#include <sstream>
#include <unordered_map>
#include <vector>
#include <c10/util/irange.h>
#include <ATen/jit_macros.h>
#include <ATen/cuda/detail/LazyNVRTC.h>

namespace at { namespace cuda { namespace jit {

enum class BinaryFuncVariant {NoScalar, RhsScalar, LhsScalar};

struct NvrtcFunction {
  CUmodule module = CUmodule();
  CUfunction function = nullptr;
};

struct KernelDescriptor {
  std::string name;
  std::string f;
  c10::ScalarType f_inputs_type;
  c10::ScalarType result_type;
  c10::SmallVector<c10::ScalarType> extra_args_types;
  int nInputs, nOutputs;
};
// Helper function to return a SmallVector of ScalarTypes
// corresponding to the types of the arguments in the parameter pack.
template <typename... Args>
c10::SmallVector<at::ScalarType> get_extra_args_types() {
  return {c10::CppTypeToScalarType<Args>::value ...};
}
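// Example (illustrative): get_extra_args_types<float, int64_t>() yields
// {ScalarType::Float, ScalarType::Long}.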
template <
    typename result_type,
    typename f_inputs_type,
    typename... ExtraArgs>
KernelDescriptor make_kernel_descriptor(
    std::string name,
    std::string f,
    int nInputs,
    int nOutputs) {
  KernelDescriptor ret;
  ret.name = std::move(name);
  ret.f = std::move(f);
  ret.f_inputs_type = c10::CppTypeToScalarType<f_inputs_type>::value;
  ret.result_type = c10::CppTypeToScalarType<result_type>::value;
  ret.extra_args_types = get_extra_args_types<ExtraArgs...>();
  ret.nInputs = nInputs;
  ret.nOutputs = nOutputs;
  return ret;
}
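// Example (illustrative; the kernel name and source string are hypothetical):
//   auto desc = make_kernel_descriptor<float, float>(
//       "my_add", my_add_src, /*nInputs=*/2, /*nOutputs=*/1);
// Here result_type and f_inputs_type both resolve to ScalarType::Float and
// extra_args_types is empty.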
inline int can_vectorize_up_to(size_t default_alignment, void *pointer) {
  auto ip = reinterpret_cast<uintptr_t>(pointer);
  if (ip % (4 * default_alignment) == 0) {
    return 4;
  }
  if (ip % (2 * default_alignment) == 0) {
    return 2;
  }
  return 1;
}
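// Example (illustrative): with default_alignment = sizeof(float) = 4, a
// pointer whose address is a multiple of 16 vectorizes up to 4 elements,
// a multiple of 8 (but not 16) up to 2, and anything else up to 1.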
inline int can_vectorize_up_to(const KernelDescriptor &desc, c10::ArrayRef<char*> pointers) {
  TORCH_INTERNAL_ASSERT(desc.nOutputs == 1);
  TORCH_INTERNAL_ASSERT(static_cast<int64_t>(pointers.size()) == 1 + desc.nInputs);

  // Deals with output
  auto result_size = c10::scalarTypeToTypeMeta(desc.result_type).itemsize();
  int result = can_vectorize_up_to(result_size, pointers[0]);

  // Incorporates input(s)
  auto input_size = c10::scalarTypeToTypeMeta(desc.f_inputs_type).itemsize();
  for (auto i : c10::irange(1, pointers.size())) {
    result = std::min(result, can_vectorize_up_to(input_size, pointers[i]));
  }

  return result;
}
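// Illustrative note: for a float output and at::Half inputs, the overall
// factor is the minimum over the output pointer (checked at a 4-byte item
// size) and every input pointer (checked at a 2-byte item size), so a single
// misaligned input pointer lowers the result.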
std::string generate_code(
    int nInputs,
    int nOutputs,
    const std::string& func,
    const std::string& name,
    const std::string& f_input_type,
    const std::string& compute_type,
    const std::string& result_type,
    bool contiguous,
    bool dynamic_casting,
    BinaryFuncVariant scalar_pos,
    c10::SmallVector<std::string>& extra_args_typenames,
    bool vectorized=false,
    int vec_size=0,
    bool return_by_ref=false);

std::string generate_code(
    const KernelDescriptor &desc,
    bool contiguous,
    bool dynamic_casting,
    BinaryFuncVariant scalar_pos,
    bool vectorized=false,
    int vec_size=0,
    bool return_by_ref=false);
std::string generate_reduction_code(
    int nOutputs,
    const std::string& func,
    const std::string& name,
    const int vt0,
    const std::string& f_inputs_type,
    const std::string& reduction_accum_type,
    const std::string& result_type,
    bool contiguous,
    bool vectorized,
    int vec_size,
    int max_threads_codegen);

std::string generate_reduction_code(
    const KernelDescriptor &desc,
    const int vt0,
    bool contiguous,
    bool vectorized,
    int vec_size,
    int max_threads_codegen);
NvrtcFunction jit_pwise_function(
    const std::string& code,
    const std::string& kernel_name);

void launch_jitted_pwise_function(
    NvrtcFunction function,
    void* args[],
    const dim3 nBlocks,
    const dim3 kBlockSize,
    const int smem=0);
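// Typical usage sketch (illustrative only; the local variables below are
// hypothetical):
//   KernelDescriptor desc = make_kernel_descriptor<float, float>(
//       "my_kernel", kernel_src, /*nInputs=*/2, /*nOutputs=*/1);
//   std::string code = generate_code(
//       desc, /*contiguous=*/true, /*dynamic_casting=*/false,
//       BinaryFuncVariant::NoScalar);
//   NvrtcFunction fn = jit_pwise_function(code, desc.name);
//   launch_jitted_pwise_function(fn, args, nBlocks, kBlockSize);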
template <typename T>
struct delayed_false : std::false_type {
};
// Defines type names
// NOTE: The general case is instantiated only for invalid types.
// All the valid types have specializations via the TYPE_NAME_FN
// macro below.
template <typename T>
inline std::string typeName() {
  // We can't use static_assert(false) directly here, as the program
  // would fail to compile even if this template is never instantiated,
  // so we use `delayed_false` to make sure the compiler doesn't
  // eagerly fail this assertion.
  static_assert(delayed_false<T>::value, "invalid type for jiterator");
  return "void";
}
#define TYPE_NAME_FN(ctype, name) \
template <> inline std::string typeName<ctype>(){ \
  return std::string(#ctype); \
}

AT_FORALL_SCALAR_TYPES(TYPE_NAME_FN)
#undef TYPE_NAME_FN
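// For example, the instantiations above make typeName<float>() return
// "float" and typeName<int64_t>() return "int64_t".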
// JIT uses std::complex directly, because nvRTC compiles programs
// with -default-device, so there is no issue such as
// "std::sin(complex) is __host__ only".
template <> inline std::string typeName<bool>(){
  return "bool";
}
template <> inline std::string typeName<c10::complex<at::Half>>(){
  return "std::complex<at::Half>";
}
template <> inline std::string typeName<c10::complex<float>>(){
  return "std::complex<float>";
}
template <> inline std::string typeName<c10::complex<double>>(){
  return "std::complex<double>";
}
template <> inline std::string typeName<at::Half>(){
  return "at::Half";
}
template <> inline std::string typeName<at::BFloat16>(){
  return "at::BFloat16";
}
#define TYPE_NAME_CASE(ctype, scalartype) \
  case ScalarType::scalartype: return typeName<ctype>();
inline std::string typeName(ScalarType t) {
  switch (t) {
    AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(TYPE_NAME_CASE)
    default:
      TORCH_CHECK(false, "invalid type for jiterator");
  }
}
#undef TYPE_NAME_CASE
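// Example (illustrative): typeName(ScalarType::Float) returns "float", while
// typeName(ScalarType::Half) dispatches to the at::Half specialization above
// and returns "at::Half".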
TORCH_CUDA_CPP_API void initializeCudaContext();

}}} // namespace at::cuda::jit