cpp.py 79 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333
  1. import contextlib
  2. import dataclasses
  3. import functools
  4. import math
  5. import sys
  6. from copy import copy, deepcopy
  7. from pathlib import Path
  8. from typing import ClassVar, Dict, List
  9. import numpy
  10. import sympy
  11. import torch
  12. import torch.fx
  13. from torch._prims_common import is_float_dtype
  14. from .. import codecache, config, ir, metrics
  15. from ..codegen.wrapper import WrapperCodeGen
  16. from ..utils import cache_on_self, sympy_product, sympy_subs, sympy_symbol
  17. from ..virtualized import ops, V
  18. from .common import (
  19. BracesBuffer,
  20. CppWrapperKernelArgs,
  21. CSEVariable,
  22. DeferredIndentedBuffer,
  23. ExprPrinter,
  24. IndentedBuffer,
  25. Kernel,
  26. KernelArgs,
  27. OpOverrides,
  28. )
  29. DTYPE_TO_CPP = {
  30. torch.float32: "float",
  31. torch.float64: "double",
  32. torch.float16: "half",
  33. torch.int64: "long",
  34. torch.int32: "int",
  35. torch.int16: "short",
  36. torch.int8: "signed char",
  37. torch.uint8: "unsigned char",
  38. torch.bool: "bool",
  39. torch.bfloat16: "bfloat16",
  40. }
  41. DTYPE_TO_ATEN = {
  42. torch.float32: "at::ScalarType::Float",
  43. torch.float64: "at::ScalarType::Double",
  44. torch.float16: "at::ScalarType::Half",
  45. torch.int64: "at::ScalarType::Long",
  46. torch.int32: "at::ScalarType::Int",
  47. torch.int16: "at::ScalarType::Short",
  48. torch.int8: "at::ScalarType::Char",
  49. torch.uint8: "at::ScalarType::Byte",
  50. torch.bool: "at::ScalarType::Bool",
  51. torch.bfloat16: "at::ScalarType::BFloat16",
  52. }
  53. INDEX_TYPE = "long"
  54. RTYPE_TO_CPP = {
  55. "sum": "+",
  56. "min": "min",
  57. "max": "max",
  58. "argmin": "argmin",
  59. "argmax": "argmax",
  60. "any": "||",
  61. }
  62. def reduction_init(reduction_type, dtype):
  63. if reduction_type in ("sum", "any"):
  64. return 0
  65. if reduction_type in {"max", "argmax"}:
  66. return (
  67. f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
  68. if is_float_dtype(dtype)
  69. else f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::min()"
  70. )
  71. if reduction_type in {"min", "argmin"}:
  72. return (
  73. f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
  74. if is_float_dtype(dtype)
  75. else f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::max()"
  76. )
  77. raise AssertionError(reduction_type)
  78. def reduction_combine(reduction_type, var, next_value):
  79. if reduction_type == "sum":
  80. return f"{var} += {next_value}"
  81. if reduction_type == "any":
  82. return f"{var} = {var} || {next_value}"
  83. return f"{var} = std::{reduction_type}({var}, {next_value})"
  84. def reduction_combine_vec(reduction_type, var, next_value):
  85. if reduction_type == "max":
  86. return f"{var} = at::vec::maximum({var}, {next_value})"
  87. elif reduction_type == "min":
  88. return f"{var} = at::vec::minimum({var}, {next_value})"
  89. elif reduction_type == "sum":
  90. return f"{var} += {next_value}"
  91. else:
  92. raise NotImplementedError()
  93. index_value_name_counter = 1
  94. def argmax_argmin_prefix(reduction_type, src_dtype, tmpvar):
  95. global index_value_name_counter
  96. struct_name = f"IndexValue_{index_value_name_counter}"
  97. index_value_name_counter += 1
  98. # A small annoyance, due to it being a little cumbersome to just throw {} into strings
  99. prefix = [
  100. f"struct {struct_name} {{size_t index; {DTYPE_TO_CPP[src_dtype]} value;}};",
  101. f"{struct_name} {tmpvar}{{0, {reduction_init(reduction_type, src_dtype)}}};",
  102. ]
  103. if reduction_type == "argmax":
  104. prefix.extend(
  105. [
  106. f"#pragma omp declare reduction(argmax : struct {struct_name} :\\",
  107. " omp_out.value = omp_in.value < omp_out.value ? omp_out.value : omp_in.value,\\",
  108. " omp_out.index = omp_in.value < omp_out.value ? omp_out.index : omp_in.index)\\",
  109. f"\tinitializer(omp_priv = {{0, {reduction_init(reduction_type, src_dtype)}}})",
  110. ]
  111. )
  112. elif reduction_type == "argmin":
  113. prefix.extend(
  114. [
  115. f"#pragma omp declare reduction(argmin : struct {struct_name} :\\",
  116. " omp_out.value = omp_in.value > omp_out.value ? omp_out.value : omp_in.value,\\",
  117. " omp_out.index = omp_in.value > omp_out.value ? omp_out.index : omp_in.index)\\",
  118. f"\tinitializer(omp_priv = {{0, {reduction_init(reduction_type, src_dtype)}}})",
  119. ]
  120. )
  121. return prefix
  122. def float16_reduction_prefix(rtype):
  123. # TODO: This user-defined reduction uses float16 accumulation for sum. To reduce numerical
  124. # errors, float32 accumulation should be used instead.
  125. assert rtype in (
  126. "sum",
  127. "any",
  128. ), f"float16 user-defined reduction only supports 'sum' and 'any' but got {rtype}"
  129. prefix = [
  130. f"#pragma omp declare reduction({RTYPE_TO_CPP[rtype]}:{DTYPE_TO_CPP[torch.float16]}:"
  131. + f"omp_out = omp_out {RTYPE_TO_CPP[rtype]} omp_in)"
  132. ]
  133. return prefix
  134. def parallel_num_threads():
  135. threads = config.cpp.threads
  136. if threads < 1:
  137. threads = torch.get_num_threads()
  138. return threads
  139. @functools.lru_cache()
  140. def cpp_prefix():
  141. path = Path(__file__).parent / "cpp_prefix.h"
  142. with path.open() as f:
  143. _, filename = codecache.write(
  144. f.read(),
  145. "h",
  146. )
  147. return f'#include "{filename}"'
  148. class CppPrinter(ExprPrinter):
  149. def _print_ModularIndexing(self, expr):
  150. x, div, mod = expr.args
  151. x = self.paren(self.doprint(x))
  152. div = self.paren(self.doprint(div))
  153. mod = self.paren(self.doprint(mod))
  154. if div != "1":
  155. x = f"({x} / {div})"
  156. return f"{x} % {mod}"
  157. def _print_FloorDiv(self, expr):
  158. x, div = expr.args
  159. x = self.paren(self.doprint(x))
  160. div = self.paren(self.doprint(div))
  161. return f"({x} / {div})"
  162. cexpr = CppPrinter().doprint
  163. @dataclasses.dataclass
  164. class OptimizationContext:
  165. key: ClassVar[str] = "opt_ctx"
  166. # Masked load
  167. is_masked_load: bool = False
  168. # Load value as mask
  169. is_load_as_mask: bool = False
  170. dtype: torch.dtype = torch.float
  171. ops_name: str = ""
  172. is_most_inner_loop_irrevelant: bool = False
  173. class RecordOptimizationContext:
  174. def __init__(self, func_name: str = ""):
  175. self.func_name = func_name
  176. self.current_node: torch.fx.Node = None
  177. self.opt_ctx: OptimizationContext = None
  178. def __enter__(self):
  179. assert V.interpreter
  180. assert V.interpreter.current_node
  181. self.current_node: torch.fx.Node = V.interpreter.current_node
  182. if OptimizationContext.key in self.current_node.meta:
  183. self.opt_ctx = self.current_node.meta[OptimizationContext.key]
  184. else:
  185. self.opt_ctx = OptimizationContext()
  186. self.opt_ctx.ops_name = self.func_name
  187. return self
  188. def __exit__(self, exc_type, exc_val, exc_tb):
  189. assert self.current_node
  190. assert self.opt_ctx
  191. self.current_node.meta[OptimizationContext.key] = self.opt_ctx
  192. def get_opt_ctx(self):
  193. return self.opt_ctx
  194. def get_fx_node(self):
  195. assert self.current_node
  196. return self.current_node
  197. def get_current_node_opt_ctx() -> OptimizationContext:
  198. assert V.interpreter.current_node
  199. if OptimizationContext.key in V.interpreter.current_node.meta:
  200. return V.interpreter.current_node.meta[OptimizationContext.key]
  201. else:
  202. return None
  203. class CppVecOverrides(OpOverrides):
  204. """Map element-wise ops to aten vectorization C++"""
  205. @staticmethod
  206. def add(a, b):
  207. return f"{a} + {b}"
  208. @staticmethod
  209. def sub(a, b):
  210. return f"{a} - {b}"
  211. @staticmethod
  212. def mul(a, b):
  213. return f"{a} * {b}"
  214. @staticmethod
  215. def div(a, b):
  216. return f"{a} / {b}"
  217. @staticmethod
  218. def abs(x):
  219. return f"{x}.abs()"
  220. @staticmethod
  221. def sin(x):
  222. return f"{x}.sin()"
  223. @staticmethod
  224. def cos(x):
  225. return f"{x}.cos()"
  226. @staticmethod
  227. def exp(x):
  228. return f"{x}.exp()"
  229. @staticmethod
  230. def exp2(x):
  231. return f"{x}.exp2()"
  232. @staticmethod
  233. def expm1(x):
  234. # decompose for a better performance
  235. vec_one = f"decltype({x})(1)"
  236. return f"{x}.exp() - {vec_one}"
  237. @staticmethod
  238. def erf(x):
  239. return f"{x}.erf()"
  240. @staticmethod
  241. def sqrt(x):
  242. return f"{x}.sqrt()"
  243. @staticmethod
  244. def eq(x, y):
  245. return f"{x} == {y}"
  246. @staticmethod
  247. def ne(x, y):
  248. return f"{x} != {y}"
  249. @staticmethod
  250. def lt(x, y):
  251. return f"{x} < {y}"
  252. @staticmethod
  253. def gt(x, y):
  254. return f"{x} > {y}"
  255. @staticmethod
  256. def le(x, y):
  257. return f"{x} <= {y}"
  258. @staticmethod
  259. def ge(x, y):
  260. return f"{x} >= {y}"
  261. @staticmethod
  262. def and_(x, y):
  263. return f"{x} & {y}"
  264. @staticmethod
  265. def rsqrt(x):
  266. return f"{x}.rsqrt()"
  267. @staticmethod
  268. def pow(a, b):
  269. return f"{a}.pow({b})"
  270. @staticmethod
  271. def log(x):
  272. return f"{x}.log()"
  273. @staticmethod
  274. def round(x):
  275. return f"{x}.round()"
  276. @staticmethod
  277. def floor(x):
  278. return f"{x}.floor()"
  279. @staticmethod
  280. def ceil(x):
  281. return f"{x}.ceil()"
  282. @staticmethod
  283. def trunc(x):
  284. return f"{x}.trunc()"
  285. @staticmethod
  286. def fmod(a, b):
  287. return f"{a}.fmod({b})"
  288. @staticmethod
  289. def lgamma(x):
  290. return f"{x}.lgamma()"
  291. """
  292. #TODO: support logical_and and logical_or vectorization
  293. @staticmethod
  294. def logical_and(a, b):
  295. return f"{a} && {b}"
  296. @staticmethod
  297. def logical_or(a, b):
  298. return f"{a} || {b}"
  299. """
  300. @staticmethod
  301. def tan(a):
  302. return f"{a}.tan()"
  303. @staticmethod
  304. def tanh(a):
  305. vec_one = f"decltype({a})(1)"
  306. vec_two = f"decltype({a})(2)"
  307. vec_minus_two = f"decltype({a})(-2)"
  308. return f"{vec_two} / ({vec_one} + ({vec_minus_two} * {a}).exp()) - {vec_one}"
  309. @staticmethod
  310. def reciprocal(a):
  311. return f"{a}.reciprocal()"
  312. @staticmethod
  313. def atan(x):
  314. return f"{x}.atan()"
  315. @staticmethod
  316. def acos(x):
  317. return f"{x}.acos()"
  318. @staticmethod
  319. def asin(x):
  320. return f"{x}.asin()"
  321. @staticmethod
  322. def log10(x):
  323. return f"{x}.log10()"
  324. @staticmethod
  325. def erfc(x):
  326. return f"{x}.erfc()"
  327. @staticmethod
  328. def nextafter(x):
  329. return f"{x}.nextafter()"
  330. @staticmethod
  331. def copysign(a, b):
  332. return f"{a}.copysign({b})"
  333. @staticmethod
  334. def atan2(a, b):
  335. return f"{a}.atan2({b})"
  336. @staticmethod
  337. def hypot(a, b):
  338. return f"{a}.hypot({b})"
  339. @staticmethod
  340. def atanh(x):
  341. # For real x, atanh(x) = 1/2 * log((1+x)/(1-x))
  342. vec_one = f"decltype({x})(1)"
  343. vec_one_half = f"decltype({x})(0.5)"
  344. return f"{vec_one_half} * (({vec_one} + {x})/({vec_one} - {x})).log()"
  345. @staticmethod
  346. def asinh(x):
  347. # For real x, asinh(x) = log(x + sqrt(1 + x**2))
  348. vec_one = f"decltype({x})(1)"
  349. return f"({x} + ({vec_one} + {x}*{x}).sqrt()).log()"
  350. @staticmethod
  351. def acosh(x):
  352. # For real x, acosh(x) = log(x + sqrt(x**2 -1))
  353. vec_one = f"decltype({x})(1)"
  354. return f"({x} + ({x}*{x} - {vec_one}).sqrt()).log()"
  355. @staticmethod
  356. def constant(val, dtype):
  357. opt_ctx: OptimizationContext = get_current_node_opt_ctx()
  358. assert opt_ctx
  359. assert opt_ctx.dtype in [torch.int32, torch.float32]
  360. proposed_dtype = opt_ctx.dtype
  361. if val == float("inf"):
  362. assert proposed_dtype == torch.float
  363. quote = f"std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::infinity()"
  364. elif val == float("-inf"):
  365. assert proposed_dtype == torch.float
  366. quote = f"-std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::infinity()"
  367. elif math.isnan(val):
  368. quote = f"std::numeric_limits<{DTYPE_TO_CPP[proposed_dtype]}>::quiet_NaN()"
  369. elif val is True or val is False:
  370. quote = f"static_cast<{DTYPE_TO_CPP[proposed_dtype]}>({str(val).lower()})"
  371. else:
  372. quote = f"static_cast<{DTYPE_TO_CPP[proposed_dtype]}>({repr(val)})"
  373. return f"at::vec::Vectorized<{DTYPE_TO_CPP[proposed_dtype]}>({quote})"
  374. @staticmethod
  375. def relu(x):
  376. return f"at::vec::clamp_min({x}, decltype({x})(0))"
  377. @staticmethod
  378. def sigmoid(x):
  379. return f"decltype({x})(1)/(decltype({x})(1) + {x}.neg().exp())"
  380. @staticmethod
  381. def neg(x):
  382. return f"{x}.neg()"
  383. @staticmethod
  384. def floordiv(a, b):
  385. # a and b are integer type
  386. _t = f"decltype({a})"
  387. quot = f"{a} / {b}"
  388. rem = f"{a} % {b}"
  389. return f"(({a} < {_t}(0)) != ({b} < {_t}(0)) ? ({rem} != {_t}(0) ? {quot} - {_t}(1) : {quot}) : {quot})"
  390. @staticmethod
  391. def truncdiv(a, b):
  392. # a and b are integer type
  393. return f"{a} / {b}"
  394. @staticmethod
  395. def minimum(a, b):
  396. return f"at::vec::minimum({a}, {b})"
  397. @staticmethod
  398. def maximum(a, b):
  399. return f"at::vec::maximum({a}, {b})"
  400. @staticmethod
  401. def square(a):
  402. return f"{a}.pow(2)"
  403. @staticmethod
  404. def where(a, b, c):
  405. return f"decltype({b})::blendv({c}, {b}, {a})"
  406. @staticmethod
  407. def sign(x):
  408. code = BracesBuffer()
  409. # auto tmp5 = tmp4 < 0 ? -1 : 1;
  410. vec_zero = f"decltype({x})(0)"
  411. vec_one = f"decltype({x})(1)"
  412. blendv = f"decltype({x})::blendv({vec_zero}, {vec_one}, {vec_zero} < {x})"
  413. left = V.kernel.cse.newvar()
  414. code.writeline(f"auto {left} = {blendv};")
  415. # auto tmp6 = tmp4 == 0 ? 0 : tmp5;
  416. blendv = f"decltype({x})::blendv({vec_zero}, {vec_one}, {x} < {vec_zero})"
  417. right = V.kernel.cse.newvar()
  418. code.writeline(f"auto {right} = {blendv};")
  419. result = V.kernel.cse.newvar()
  420. code.writeline(f"auto {result} = {left} - {right};")
  421. V.kernel.compute.splice(code)
  422. return result
  423. @staticmethod
  424. def to_dtype(x, dtype):
  425. assert dtype in [torch.bool], f"{__name__} does not support {dtype}"
  426. return f"({x})"
  427. @staticmethod
  428. def log1p(x):
  429. return f"{x}.log1p()"
  430. @staticmethod
  431. def masked(mask, body, other):
  432. opt_ctx: OptimizationContext = get_current_node_opt_ctx()
  433. assert opt_ctx
  434. assert opt_ctx.is_masked_load
  435. code = BracesBuffer()
  436. var = V.kernel.cse.newvar()
  437. if other == float("-inf"):
  438. code.writeline(
  439. f"auto {var} = at::vec::Vectorized<float>(-std::numeric_limits<float>::infinity());"
  440. )
  441. elif other == float("inf"):
  442. code.writeline(
  443. f"auto {var} = at::vec::Vectorized<float>(std::numeric_limits<float>::infinity());"
  444. )
  445. else:
  446. code.writeline(f"auto {var} = at::vec::Vectorized<float>({other!r});")
  447. with V.kernel.swap_buffers(code), code.indent():
  448. result = body()
  449. zero_val = "at::vec::Vectorized<float>(0)"
  450. float_mask = f"to_float_mask({mask})"
  451. blendv = f"decltype({result})::blendv({var}, {result}, {float_mask} != {zero_val})"
  452. code.writeline(f"{var} = {blendv};")
  453. V.kernel.compute.splice(code)
  454. return var
  455. @staticmethod
  456. def index_expr(expr, dtype):
  457. assert dtype == torch.int64
  458. opt_ctx: OptimizationContext = get_current_node_opt_ctx()
  459. assert opt_ctx
  460. assert opt_ctx.dtype == torch.int32
  461. assert opt_ctx.is_most_inner_loop_irrevelant
  462. return f"at::vec::Vectorized<int>(static_cast<int>({cexpr(V.kernel.rename_indexing(expr))}))"
  463. class CppOverrides(OpOverrides):
  464. """Map element-wise ops to C++"""
  465. @staticmethod
  466. def to_dtype(x, dtype):
  467. assert dtype in DTYPE_TO_CPP, f"{dtype} missing from {__name__}.DTYPE_TO_CPP"
  468. return f"static_cast<{DTYPE_TO_CPP[dtype]}>({x})"
  469. @staticmethod
  470. def abs(x):
  471. return f"std::abs({x})"
  472. @staticmethod
  473. def sin(x):
  474. return f"std::sin({x})"
  475. @staticmethod
  476. def cos(x):
  477. return f"std::cos({x})"
  478. @staticmethod
  479. def neg(x):
  480. return f"decltype({x})(-{x})"
  481. @staticmethod
  482. def exp(x):
  483. # return f"Sleef_expf_u10({x})"
  484. return f"std::exp({x})"
  485. @staticmethod
  486. def exp2(x):
  487. return f"std::exp2({x})"
  488. @staticmethod
  489. def expm1(x):
  490. return f"std::expm1({x})"
  491. @staticmethod
  492. def erf(x):
  493. return f"std::erf({x})"
  494. @staticmethod
  495. def sqrt(x):
  496. return f"std::sqrt({x})"
  497. @staticmethod
  498. def rsqrt(x):
  499. return f"1 / std::sqrt({x})"
  500. @staticmethod
  501. def log1p(x):
  502. return f"std::log1p({x})"
  503. @staticmethod
  504. def tan(x):
  505. return f"std::tan({x})"
  506. @staticmethod
  507. def tanh(x):
  508. return f"std::tanh({x})"
  509. @staticmethod
  510. def signbit(x):
  511. return f"std::signbit({x})"
  512. @staticmethod
  513. def pow(a, b):
  514. return f"std::pow({a}, {b})"
  515. @staticmethod
  516. def log(x):
  517. return f"std::log({x})"
  518. @staticmethod
  519. def round(x):
  520. return f"std::nearbyint({x})"
  521. @staticmethod
  522. def floor(x):
  523. return f"std::floor({x})"
  524. @staticmethod
  525. def floordiv(a, b):
  526. # a and b are integer type
  527. quot = f"{a} / {b}"
  528. rem = f"{a} % {b}"
  529. return f"(({a} < 0) != ({b} < 0) ? ({rem} != 0 ? {quot} - 1 : {quot}) : {quot})"
  530. @staticmethod
  531. def ceil(x):
  532. return f"std::ceil({x})"
  533. @staticmethod
  534. def trunc(x):
  535. return f"std::trunc({x})"
  536. @staticmethod
  537. def truncdiv(a, b):
  538. # a and b are integer type
  539. return f"{a} / {b}"
  540. @staticmethod
  541. def fmod(a, b):
  542. return f"std::fmod({a}, {b})"
  543. @staticmethod
  544. def isinf(x):
  545. return f"std::isinf({x})"
  546. @staticmethod
  547. def isnan(x):
  548. return f"std::isnan({x})"
  549. @staticmethod
  550. def lgamma(x):
  551. return f"std::lgamma({x})"
  552. @staticmethod
  553. def acos(x):
  554. return f"std::acos({x})"
  555. @staticmethod
  556. def acosh(x):
  557. return f"std::acosh({x})"
  558. @staticmethod
  559. def asin(x):
  560. return f"std::asin({x})"
  561. @staticmethod
  562. def asinh(x):
  563. return f"std::asinh({x})"
  564. @staticmethod
  565. def atan2(x, y):
  566. return f"std::atan2({x}, {y})"
  567. @staticmethod
  568. def atan(x):
  569. return f"std::atan({x})"
  570. @staticmethod
  571. def atanh(x):
  572. return f"std::atanh({x})"
  573. @staticmethod
  574. def copysign(x, y):
  575. return f"std::copysign({x}, {y})"
  576. @staticmethod
  577. def hypot(x, y):
  578. return f"std::hypot({x}, {y})"
  579. @staticmethod
  580. def erfc(x):
  581. return f"std::erfc({x})"
  582. @staticmethod
  583. def log10(x):
  584. return f"std::log10({x})"
  585. @staticmethod
  586. def nextafter(x, y):
  587. return f"std::nextafter({x}, {y})"
  588. @staticmethod
  589. def relu(x):
  590. return f"{x} * ({x}>0)"
  591. @staticmethod
  592. def minimum(a, b):
  593. return f"({b} != {b}) ? {b} : std::min({a}, {b})"
  594. @staticmethod
  595. def maximum(a, b):
  596. return f"({b} != {b}) ? {b} : std::max({a}, {b})"
  597. @staticmethod
  598. def where(a, b, c):
  599. return f"{a} ? {b} : {c}"
  600. @staticmethod
  601. def mod(a, b):
  602. return f"mod({a}, {b})"
  603. @staticmethod
  604. def constant(val, dtype):
  605. if dtype in (torch.float16, torch.bfloat16):
  606. # Since load promotes all half-precision inputs to float, constants
  607. # must be promoted as well
  608. dtype = torch.float32
  609. if val == float("inf"):
  610. return f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
  611. elif val == float("-inf"):
  612. return f"-std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::infinity()"
  613. elif math.isnan(val):
  614. return f"std::numeric_limits<{DTYPE_TO_CPP[dtype]}>::quiet_NaN()"
  615. elif val is True or val is False:
  616. return ops.to_dtype(str(val).lower(), dtype)
  617. return ops.to_dtype(repr(val), dtype)
  618. @staticmethod
  619. def index_expr(expr, dtype):
  620. return ops.to_dtype(cexpr(V.kernel.rename_indexing(expr)), dtype)
  621. @staticmethod
  622. def masked(mask, body, other):
  623. code = BracesBuffer()
  624. # Write masked operation into a lambda
  625. body_var = V.kernel.cse.newvar()
  626. code.writeline(f"auto {body_var} = [&]")
  627. with V.kernel.swap_buffers(code), code.indent():
  628. result = body()
  629. code.writeline(f"return {result};")
  630. code.writeline(";")
  631. V.kernel.compute.splice(code)
  632. # Use the lambda's return type as the type of other
  633. type = f"decltype({body_var}())"
  634. if other == float("-inf"):
  635. other_code = f"-std::numeric_limits<{type}>::infinity()"
  636. elif other == float("inf"):
  637. other_code = "std::numeric_limits<{type}>::infinity()"
  638. elif isinstance(other, bool):
  639. other_code = f"static_cast<{type}>({str(other).lower()})"
  640. else:
  641. other_code = f"static_cast<{type}>({repr(other)})"
  642. return f"{mask} ? {body_var}() : {other_code}"
  643. @staticmethod
  644. def logical_and(a, b):
  645. return f"{a} && {b}"
  646. @staticmethod
  647. def logical_or(a, b):
  648. return f"{a} || {b}"
  649. @staticmethod
  650. def rand(seed: sympy.Expr, offset: sympy.Expr, dtype):
  651. return f"static_cast<{DTYPE_TO_CPP[dtype]}>(normalized_rand_cpu({seed}, {offset}));"
  652. @staticmethod
  653. def randn(seed: sympy.Expr, offset: sympy.Expr, dtype):
  654. return f"static_cast<{DTYPE_TO_CPP[dtype]}>(randn_cpu({seed}, {offset}));"
  655. @staticmethod
  656. def sigmoid(x):
  657. return f"decltype({x})(1) / (decltype({x})(1) + std::exp(-{x}))"
  658. @staticmethod
  659. def sign(x):
  660. code = BracesBuffer()
  661. # auto tmp5 = tmp4 < 0 ? -1 : 1;
  662. left = V.kernel.cse.newvar()
  663. right = V.kernel.cse.newvar()
  664. result = V.kernel.cse.newvar()
  665. code.writeline(f"auto {left} = {x} > 0 ? 1 : 0;")
  666. code.writeline(f"auto {right} = {x} < 0 ? 1 : 0;")
  667. code.writeline(f"auto {result} = {left} - {right};")
  668. V.kernel.compute.splice(code)
  669. return result
  670. class CppKernel(Kernel):
  671. overrides = CppOverrides
  672. sexpr = cexpr
  673. newvar_prefix = "auto "
  674. suffix = ";"
  675. def __init__(self, args, num_threads):
  676. super().__init__(args)
  677. self.call_ranges = None
  678. self.ranges = None
  679. self.itervars = None
  680. self.reduction_depth = None
  681. self.reduction_prefix = IndentedBuffer()
  682. self.reduction_suffix = DeferredIndentedBuffer()
  683. self.reduction_var_map = {}
  684. self.preloads = IndentedBuffer()
  685. self.poststores = DeferredIndentedBuffer()
  686. self.num_threads = num_threads # num_threads the kernel specialized for
  687. def scale_index_with_offset(
  688. self, index: sympy.Expr, scale, itervar_idx=-1, offset=0
  689. ):
  690. expanded_index = sympy.expand(index)
  691. var = self.itervars[itervar_idx]
  692. replacement = {var: var * scale + offset}
  693. new_index = sympy_subs(expanded_index, replacement)
  694. return new_index
  695. def load(self, name: str, index: sympy.Expr):
  696. var = self.args.input(name)
  697. index = self.rename_indexing(index)
  698. line = f"{var}[{cexpr(index)}]"
  699. if V.graph.get_dtype(name) in (torch.float16, torch.bfloat16):
  700. line = f"static_cast<float>({line})"
  701. return self.cse.generate(self.loads, line)
  702. def store(self, name, index, value, mode=None):
  703. assert "buf" in name
  704. var = self.args.output(name)
  705. index = self.rename_indexing(index)
  706. if mode is None:
  707. line = f"{var}[{cexpr(index)}] = {value};"
  708. elif mode == "atomic_add":
  709. if not config.cpp.dynamic_threads and self.num_threads == 1:
  710. line = f"{var}[{cexpr(index)}] += {value};"
  711. else:
  712. line = f"atomic_add(&{var}[{cexpr(index)}], {value});"
  713. else:
  714. raise NotImplementedError(f"store mode={mode}")
  715. self.stores.writeline(name, line)
  716. def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
  717. argmax_or_argmin = reduction_type in {"argmax", "argmin"}
  718. tmpvar = self.cse.generate(
  719. self.loads, f"reduction {name} {cexpr(index)}", write=False
  720. )
  721. index = self.rename_indexing(index)
  722. self.reduction_var_map[tmpvar] = reduction_type
  723. if argmax_or_argmin:
  724. self.reduction_prefix.writelines(
  725. argmax_argmin_prefix(reduction_type, src_dtype, tmpvar)
  726. )
  727. compare_op = "<" if reduction_type == "argmax" else ">"
  728. self.stores.writelines(
  729. None,
  730. [
  731. f"if ({tmpvar}.value {compare_op} {value}) {{",
  732. f" {tmpvar}.index = {self.itervars[-1]}; {tmpvar}.value = {value};",
  733. "}",
  734. ],
  735. )
  736. else:
  737. if dtype == torch.float16:
  738. self.reduction_prefix.writelines(
  739. float16_reduction_prefix(reduction_type)
  740. )
  741. self.reduction_prefix.writeline(
  742. f"{DTYPE_TO_CPP[dtype]} {tmpvar} = {reduction_init(reduction_type, dtype)};"
  743. )
  744. self.stores.writeline(
  745. None, f"{reduction_combine(reduction_type, tmpvar, value)};"
  746. )
  747. if name not in V.graph.removed_buffers:
  748. var = self.args.output(name)
  749. member_name = ".index" if argmax_or_argmin else ""
  750. self.reduction_suffix.writeline(
  751. name, f"{var}[{cexpr(index)}] = {tmpvar}{member_name};"
  752. )
  753. self.cse.store_cache[name] = tmpvar
  754. def set_ranges(self, lengths, reduction_lengths):
  755. if self.call_ranges:
  756. assert self.call_ranges == tuple(lengths) + tuple(
  757. reduction_lengths
  758. ), f"{self.call_ranges} == {tuple(lengths)} + {tuple(reduction_lengths)}"
  759. assert self.reduction_depth == len(lengths)
  760. else:
  761. self.call_ranges = tuple(lengths) + tuple(reduction_lengths)
  762. self.ranges = [self.rename_indexing(x) for x in self.call_ranges]
  763. self.itervars = [sympy_symbol(f"i{n}") for n in range(len(self.ranges))]
  764. self.reduction_depth = len(lengths)
  765. return (
  766. self.itervars[: self.reduction_depth],
  767. self.itervars[self.reduction_depth :],
  768. )
  769. def size_hint(self):
  770. return V.graph.sizevars.size_hint(sympy_product(self.call_ranges))
  771. def codegen_loops_impl(self, loop_nest, code, worksharing):
  772. threads = parallel_num_threads()
  773. par_depth = self.decide_parallel_depth(
  774. self.call_ranges[: loop_nest.max_parallel_depth()], threads
  775. )
  776. with contextlib.ExitStack() as stack:
  777. if par_depth:
  778. if loop_nest.is_reduction_only():
  779. # need to close the worksharing scope to define reduction vars outside it
  780. worksharing.close()
  781. else:
  782. worksharing.parallel(threads)
  783. loop_nest.mark_parallel(par_depth)
  784. elif threads > 1:
  785. if worksharing.single():
  786. stack.enter_context(code.indent())
  787. def gen_kernel(kernel):
  788. with contextlib.ExitStack() as stack:
  789. assert kernel
  790. if hasattr(kernel, "codegen_inner_loops"):
  791. code.splice(kernel.preloads)
  792. kernel.codegen_inner_loops(code)
  793. stack.enter_context(code.indent())
  794. code.splice(kernel.loads)
  795. code.splice(kernel.compute)
  796. code.splice(kernel.stores)
  797. if hasattr(kernel, "codegen_inner_loops"):
  798. code.splice(kernel.poststores)
  799. def gen_loops(loops: List[LoopLevel], in_reduction=False):
  800. with contextlib.ExitStack() as stack_outer:
  801. if loops:
  802. loop = loops[0]
  803. if loop.is_reduction() and not in_reduction:
  804. kernels = loop.get_kernels()
  805. assert kernels
  806. # TODO(jgong5): should gen prefix for all kernels.
  807. # currently, Vec kernel generates prefix for both
  808. # vector and scalar kernels.
  809. if kernels[0].reduction_prefix:
  810. stack_outer.enter_context(code.indent())
  811. code.splice(kernels[0].reduction_prefix)
  812. if loop_nest.is_reduction_only() and loop.parallel:
  813. worksharing.parallel(threads)
  814. for loop in loops:
  815. gen_loop(loop, in_reduction)
  816. if loops:
  817. if loop_nest.is_reduction_only() and loop.parallel:
  818. worksharing.close()
  819. for loop in loops:
  820. if loop.is_reduction() and not in_reduction:
  821. kernels = loop.get_kernels()
  822. for kernel in kernels:
  823. code.splice(kernel.reduction_suffix)
  824. def gen_loop(loop: LoopLevel, in_reduction=False):
  825. with contextlib.ExitStack() as stack:
  826. code.writelines(loop.lines())
  827. stack.enter_context(code.indent())
  828. # generate inner loops or loop body
  829. if loop.inner:
  830. gen_loops(loop.inner, loop.is_reduction())
  831. else:
  832. kernels = loop.get_kernels()
  833. assert len(kernels) == 1
  834. gen_kernel(kernels[0])
  835. stack.enter_context(code.indent())
  836. if loop_nest.root:
  837. gen_loops(loop_nest.root)
  838. else:
  839. gen_kernel(loop_nest.kernel)
  840. def codegen_loops(self, code, worksharing):
  841. loop_nest = LoopNestWithSplit.build(self)
  842. self.codegen_loops_impl(loop_nest, code, worksharing)
  843. def decide_parallel_depth(self, ranges, threads):
  844. seq = self.size_hint()
  845. par = 1
  846. depth = 0
  847. for expr in ranges:
  848. hint = V.graph.sizevars.size_hint(expr)
  849. if par >= 2 * threads or par == threads:
  850. break
  851. if seq // threads < config.cpp.min_chunk_size:
  852. # not enough work
  853. break
  854. depth += 1
  855. par *= hint
  856. seq /= hint
  857. # if we assume thread number is dynamic, make sure we
  858. # have at least one parallel scope and let OMP runtime
  859. # to manage the serial vs. parallel.
  860. if config.cpp.dynamic_threads and depth == 0 and len(ranges) > 0:
  861. depth = 1
  862. return depth
  863. @contextlib.contextmanager
  864. def write_to_suffix(self):
  865. prior = (self.loads, self.compute, self.stores, self.cse)
  866. self.loads = IndentedBuffer()
  867. self.compute = IndentedBuffer()
  868. self.stores = DeferredIndentedBuffer()
  869. self.cse = self.cse.clone()
  870. yield
  871. self.reduction_suffix.splice(self.loads)
  872. self.reduction_suffix.splice(self.compute)
  873. self.reduction_suffix.splice(self.stores)
  874. (self.loads, self.compute, self.stores, self.cse) = prior
  875. class CppVecKernel(CppKernel):
  876. overrides = CppVecOverrides
  877. def __init__(self, args, num_threads, tiling_factor=0):
  878. super().__init__(args, num_threads)
  879. assert codecache.pick_vec_isa()
  880. if tiling_factor == 0:
  881. tiling_factor = codecache.pick_vec_isa().nelements()
  882. self.tiling_factor = tiling_factor
  883. self.reduction_omp_dec: Dict[str, str] = {}
  884. self.var_vec_buf_map: Dict[str, str] = {}
  885. metrics.generated_cpp_vec_kernel_count += 1
  886. def stride_at(self, var: sympy.Symbol, index: sympy.Expr):
  887. replacement = {var: var + 1}
  888. new_index = sympy_subs(index, replacement)
  889. return sympy.simplify(new_index - index)
  890. def is_stride1_at(self, var: sympy.Symbol, index: sympy.Expr):
  891. return self.stride_at(var, index) == 1
  892. def is_invariant_under(self, var: sympy.Symbol, index: sympy.Expr):
  893. expanded_index = sympy.expand(index)
  894. return not expanded_index.has(var)
  895. def load(self, name: str, index: sympy.Expr):
  896. var = self.args.input(name)
  897. index = self.rename_indexing(index)
  898. expanded_index = sympy.expand(index)
  899. new_index = self.scale_index_with_offset(index, self.tiling_factor)
  900. is_broadcast = expanded_index == new_index
  901. var_expr = (
  902. f"{var}[{cexpr(index)}]" if is_broadcast else f"{var} + {cexpr(new_index)}"
  903. )
  904. if V.graph.get_dtype(name) in [torch.bool, torch.uint8]:
  905. nelements = codecache.pick_vec_isa().nelements()
  906. if var not in self.var_vec_buf_map:
  907. self.var_vec_buf_map[var] = f"g_tmp_buffer_{var}"
  908. self.loads.writeline(
  909. f"float {self.var_vec_buf_map[var]}[{nelements}] = {{0}};"
  910. )
  911. self.loads.writeline(
  912. f"flag_to_float({var_expr}, {self.var_vec_buf_map[var]}, {nelements});"
  913. )
  914. line = f"at::vec::Vectorized<float>::loadu({self.var_vec_buf_map[var]})"
  915. elif is_broadcast:
  916. line = f"at::vec::Vectorized<float>({var_expr})"
  917. else:
  918. line = f"at::vec::Vectorized<float>::loadu({var_expr})"
  919. return self.cse.generate(self.loads, line)
  920. def store(self, name, index, value, mode=None):
  921. assert "buf" in name
  922. var = self.args.output(name)
  923. index = self.rename_indexing(index)
  924. assert mode is None
  925. expanded_index = sympy.expand(index)
  926. new_index = self.scale_index_with_offset(index, self.tiling_factor)
  927. assert new_index != expanded_index
  928. line = f"{value}.store({var} + {cexpr(new_index)});"
  929. self.stores.writeline(name, line)
  930. def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
  931. assert reduction_type in {"max", "min", "sum"}
  932. assert dtype == torch.float
  933. assert src_dtype == torch.float
  934. reduce_map = {"max": "maximum", "min": "minimum"}
  935. vec_ns = "at::vec"
  936. vec = f"{vec_ns}::Vectorized<{DTYPE_TO_CPP[dtype]}>"
  937. if reduction_type not in self.reduction_omp_dec:
  938. vec_reduc_prefix = "#pragma omp declare reduction("
  939. vec_reduc_prefix += f"{RTYPE_TO_CPP[reduction_type]}:{vec}:"
  940. if reduction_type == "sum":
  941. vec_reduc_prefix += "omp_out += omp_in"
  942. else:
  943. vec_reduc_prefix += (
  944. f"omp_out = {vec_ns}::{reduce_map[reduction_type]}(omp_out, omp_in)"
  945. )
  946. vec_reduc_prefix += ")"
  947. vec_reduc_prefix += " initializer("
  948. vec_reduc_prefix += "omp_priv={{"
  949. vec_reduc_prefix += f"{reduction_init(reduction_type, dtype)}"
  950. vec_reduc_prefix += "}})"
  951. self.reduction_omp_dec[reduction_type] = RTYPE_TO_CPP[reduction_type]
  952. self.reduction_prefix.writeline(vec_reduc_prefix)
  953. tmpvar = self.cse.generate(
  954. self.loads, f"reduction {name} {cexpr(index)}", write=False
  955. )
  956. tmpvar_vec = f"{tmpvar}_vec"
  957. index = self.rename_indexing(index)
  958. self.reduction_var_map[tmpvar_vec] = reduction_type
  959. self.reduction_prefix.writeline(
  960. f"{DTYPE_TO_CPP[dtype]} {tmpvar} = {reduction_init(reduction_type, dtype)};"
  961. )
  962. self.reduction_prefix.writeline(
  963. f"auto {tmpvar_vec} = at::vec::Vectorized<{DTYPE_TO_CPP[dtype]}>({tmpvar});"
  964. )
  965. self.stores.writeline(
  966. None, f"{reduction_combine_vec(reduction_type, tmpvar_vec, value)};"
  967. )
  968. reduce_all_body = "{"
  969. if reduction_type == "sum":
  970. reduce_all_body += "return x + y;"
  971. else:
  972. reduce_all_body += f"return {vec_ns}::{reduce_map[reduction_type]}(x, y);"
  973. reduce_all_body += "}"
  974. vec_reduce_all_func = f"{vec_ns}::vec_reduce_all<{DTYPE_TO_CPP[dtype]}>"
  975. next_value = f"{vec_reduce_all_func}([]({vec}& x, {vec}&y) {reduce_all_body}, {tmpvar_vec})"
  976. self.reduction_suffix.writeline(
  977. name,
  978. f"{reduction_combine(reduction_type, tmpvar, next_value)};",
  979. )
  980. # NOTE(jgong5): we do not generate the real stores here with the assumption that
  981. # the scalar kernel that handles the loop tail would be generated and generates
  982. # the stores there.
  983. self.cse.store_cache[name] = tmpvar
  984. class CppTile2DKernel(CppVecKernel):
  985. """
  986. A vector kernel that handles the 2d tiles with the tile size defined in `tiling_factor` on
  987. the inner-most loop level and one of the outer loop level (`outer_tiling_idx`). When the data
  988. tile is accessed in a contiguous way from the outer loop axis, a transposition is applied on the
  989. tile to make the access contiguous from the inner-most loop axis. Then, the same vectorization
  990. logic from its parent `CppVecKernel` is leveraged for load/store/compute. The transposed tile load
  991. and store are generated into kernel.preloads and kernel.poststores buffers.
  992. The loop structure looks like below:
  993. for ...
  994. for i_outer ...
  995. for ...
  996. for inner_most ...
  997. // generated by CppTile2DKernel
  998. float tmp0[16*16]; at::vec::transpose_mxn<...>(tmp0, in_ptr0 + ..., ...); // into kernel.preloads
  999. float tmp1[16*16]; // into kernel.preloads
  1000. for i_inner ... { // the kernel inner loop
  1001. vectorized loads/compute/stores (e.g., load tmp0, store tmp1) // into kernel.loads/compute/stores
  1002. }
  1003. at::vec::transpose_mxn(out_ptr0 + ..., tmp1, ...) // into kernel.poststores
  1004. for inner_most ... (tail)
  1005. // generated by CppTile2DTailKernel
  1006. ...
  1007. for i_outer ... (tail)
  1008. for ...
  1009. for ...
  1010. // generated by CppKernel
  1011. ...
  1012. """
  1013. def __init__(self, args, num_threads, tiling_factor, outer_tiling_idx):
  1014. super().__init__(args, num_threads, tiling_factor)
  1015. self.outer_tiling_idx = outer_tiling_idx
  1016. def inner_itervar(self):
  1017. return sympy.symbols(f"{self.itervars[self.outer_tiling_idx]}_inner")
  1018. def need_vec_transpose(self, index):
  1019. return self.is_stride1_at(
  1020. self.itervars[self.outer_tiling_idx], index
  1021. ) and not self.is_invariant_under(self.itervars[-1], index)
  1022. def gen_transposed_tile_load_store(self, name, var, index, is_store):
  1023. # transposed tile load/store outside the kernel inner loop
  1024. factor = self.tiling_factor
  1025. new_index = self.scale_index_with_offset(index, factor, itervar_idx=-1)
  1026. new_index = self.scale_index_with_offset(
  1027. new_index, factor, itervar_idx=self.outer_tiling_idx
  1028. )
  1029. src = f"{var} + {cexpr(new_index)}"
  1030. dst = "__place_holder__"
  1031. ld_src = f"{cexpr(self.stride_at(self.itervars[-1], index))}"
  1032. ld_dst = f"{factor}"
  1033. if is_store:
  1034. src, dst = dst, src
  1035. ld_src, ld_dst = ld_dst, ld_src
  1036. need_define = True
  1037. load_or_store = f"at::vec::transpose_mxn<float,{factor},{factor}>({src}, {ld_src}, {dst}, {ld_dst});"
  1038. if is_store:
  1039. tile_var = self.cse.newvar()
  1040. elif load_or_store not in self.cse.cache:
  1041. tile_var = self.cse.generate(self.preloads, load_or_store, write=False)
  1042. else:
  1043. need_define = False
  1044. tile_var = self.cse.cache[load_or_store]
  1045. if need_define:
  1046. define_line = f"float {tile_var}[{factor}*{factor}] __attribute__ ((aligned ({factor})));"
  1047. self.preloads.writeline(define_line)
  1048. load_or_store = load_or_store.replace("__place_holder__", str(tile_var))
  1049. if is_store:
  1050. self.poststores.writeline(name, load_or_store)
  1051. else:
  1052. self.preloads.writeline(load_or_store)
  1053. return tile_var
  1054. def load(self, name: str, index: sympy.Expr):
  1055. var = self.args.input(name)
  1056. index = self.rename_indexing(index)
  1057. inner = self.inner_itervar()
  1058. expanded_index = sympy.expand(index)
  1059. if self.need_vec_transpose(expanded_index):
  1060. tile_var = self.gen_transposed_tile_load_store(
  1061. name, var, expanded_index, is_store=False
  1062. )
  1063. # vector load inside the kernel inner loop
  1064. line = f"at::vec::Vectorized<float>::loadu({tile_var} + {cexpr(inner * self.tiling_factor)})"
  1065. return self.cse.generate(self.loads, line)
  1066. else:
  1067. new_index = self.scale_index_with_offset(
  1068. expanded_index,
  1069. self.tiling_factor,
  1070. itervar_idx=self.outer_tiling_idx,
  1071. offset=inner,
  1072. )
  1073. return super().load(name, new_index)
  1074. def store(self, name, index, value, mode=None):
  1075. assert "buf" in name
  1076. var = self.args.output(name)
  1077. inner = self.inner_itervar()
  1078. index = self.rename_indexing(index)
  1079. assert mode is None
  1080. # TODO(jgong5): assert the index is an affine expression on the itervars in concern
  1081. expanded_index = sympy.expand(index)
  1082. if self.need_vec_transpose(expanded_index):
  1083. tile_var = self.gen_transposed_tile_load_store(
  1084. name, var, expanded_index, is_store=True
  1085. )
  1086. # vector store inside the kernel inner loop
  1087. line = f"{value}.store({tile_var} + {cexpr(inner * self.tiling_factor)});"
  1088. self.stores.writeline(name, line)
  1089. else:
  1090. new_index = self.scale_index_with_offset(
  1091. expanded_index,
  1092. self.tiling_factor,
  1093. itervar_idx=self.outer_tiling_idx,
  1094. offset=inner,
  1095. )
  1096. super().store(name, new_index, value, mode)
  1097. def codegen_inner_loops(self, code):
  1098. inner = self.inner_itervar()
  1099. code.writeline(
  1100. f"for (long {inner} = 0; {inner} < {self.tiling_factor}; {inner}++)"
  1101. )
  1102. class CppTile2DTailKernel(CppKernel):
  1103. """
  1104. A scalar kernel that handles the tail of inner-most loop split from a 2d tiling. The tile of the outer
  1105. loop axis is handled with a kernel inner loop (see method `codegen_inner_loops`).
  1106. """
  1107. def __init__(self, args, num_threads, tiling_factor, outer_tiling_idx):
  1108. super().__init__(args, num_threads)
  1109. self.outer_tiling_idx = outer_tiling_idx
  1110. self.tiling_factor = tiling_factor
  1111. def inner_itervar(self):
  1112. return sympy.symbols(f"{self.itervars[self.outer_tiling_idx]}_inner")
  1113. def transform_index(self, index):
  1114. index = self.rename_indexing(index)
  1115. expanded_index = sympy.expand(index)
  1116. new_index = self.scale_index_with_offset(
  1117. expanded_index,
  1118. self.tiling_factor,
  1119. itervar_idx=self.outer_tiling_idx,
  1120. offset=self.inner_itervar(),
  1121. )
  1122. return new_index
  1123. def load(self, name: str, index: sympy.Expr):
  1124. new_index = self.transform_index(index)
  1125. return super().load(name, new_index)
  1126. def store(self, name, index, value, mode=None):
  1127. assert "buf" in name
  1128. var = self.args.output(name)
  1129. assert mode is None
  1130. new_index = self.transform_index(index)
  1131. super().store(name, new_index, value, mode)
  1132. def codegen_inner_loops(self, code):
  1133. inner = self.inner_itervar()
  1134. code.writeline(
  1135. f"for (long {inner} = 0; {inner} < {self.tiling_factor}; {inner}++)"
  1136. )
  1137. class CppVecKernelChecker(CppVecKernel):
  1138. def __init__(self, args, num_threads, tiling_factor):
  1139. super().__init__(args, num_threads, tiling_factor)
  1140. # Since this kernel is only for checker but does not genreate any
  1141. # code, so we need to decrease the kernel count.
  1142. metrics.generated_kernel_count -= 1
  1143. metrics.generated_cpp_vec_kernel_count -= 1
  1144. # Used to recorde the graph wrapper code as the wrapper_code status could be
  1145. # changed during graph run.
  1146. self._orig_wrapper_code = None
  1147. self.simd_vec = True
  1148. self.fast_vec_list = []
  1149. for k, v in CppVecOverrides.__dict__.items():
  1150. if isinstance(v, staticmethod):
  1151. self.fast_vec_list.append(k)
  1152. self.exit_stack = contextlib.ExitStack()
  1153. # Cache all the load result
  1154. self.load_results: list[CSEVariable] = []
  1155. self.load_supported_dtypes: list[torch.dtype] = [
  1156. torch.float,
  1157. torch.float32,
  1158. torch.bool,
  1159. torch.uint8,
  1160. ]
  1161. self.store_supported_dtypes: list[torch.dtype] = [torch.float, torch.float32]
  1162. # Cache the dtypes of the store operation. If the store is mixing dtypes, the
  1163. # vectorization would not support it as it is hard to determine the vec dtype
  1164. self.store_dtypes: list[torch.dtype] = []
  1165. # The dtype is used for vectorization
  1166. self.vec_dtype: torch.dtype = torch.float32
  1167. def is_indirect_indexing(self, index: sympy.Expr):
  1168. for _load_res in self.load_results:
  1169. # The index expression contains a value that loads from memory
  1170. if index.count(sympy_symbol(_load_res.name)) > 0:
  1171. return True
  1172. return False
  1173. def could_vec(self, name: str, index: sympy.Expr):
  1174. assert self.itervars is not None
  1175. # Not a loop
  1176. if len(self.itervars) == 0:
  1177. return False
  1178. if self.is_indirect_indexing(index):
  1179. return False
  1180. most_inner_var = self.itervars[-1]
  1181. return self.is_invariant_under(most_inner_var, index) or self.is_stride1_at(
  1182. most_inner_var, index
  1183. )
  1184. def is_mask(self, name: str, users: Dict[torch.fx.Node, None]):
  1185. load_type = V.graph.get_dtype(name)
  1186. if load_type == torch.bool:
  1187. return all(user.target in ("where", "masked") for user in users.keys())
  1188. elif load_type == torch.uint8:
  1189. """
  1190. If the load value is torch.uint8, then we only support the loaded
  1191. value is as the mask.
  1192. """
  1193. if not all(
  1194. user.target == "to_dtype" and user.args[-1] == torch.bool
  1195. for user in users.keys()
  1196. ):
  1197. return False
  1198. for to_dtype_node in users.keys():
  1199. assert to_dtype_node.target == "to_dtype"
  1200. if not all(
  1201. user.target in ("where", "masked")
  1202. for user in to_dtype_node.users.keys()
  1203. ):
  1204. return False
  1205. return True
  1206. else:
  1207. return False
  1208. def load(self, name: str, index: sympy.Expr):
  1209. with RecordOptimizationContext(__name__) as node_ctx:
  1210. load_dtype = V.graph.get_dtype(name)
  1211. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1212. assert opt_ctx
  1213. opt_ctx.dtype = load_dtype
  1214. opt_ctx.is_load_as_mask = self.is_mask(name, node_ctx.get_fx_node().users)
  1215. var = self.cse.newvar()
  1216. self.load_results.append(var)
  1217. if load_dtype in [torch.bool, torch.uint8] and not opt_ctx.is_load_as_mask:
  1218. self.simd_vec = False
  1219. return var
  1220. if load_dtype not in self.load_supported_dtypes:
  1221. self.simd_vec = False
  1222. return var
  1223. index = self.rename_indexing(index)
  1224. self.simd_vec = self.simd_vec and self.could_vec(name, index)
  1225. return var
  1226. def store(self, name, index, value, mode=None):
  1227. with RecordOptimizationContext(__name__) as node_ctx:
  1228. store_dtype = V.graph.get_dtype(name)
  1229. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1230. assert opt_ctx
  1231. opt_ctx.dtype = store_dtype
  1232. store_dtype = torch.float if store_dtype == torch.float32 else store_dtype
  1233. self.store_dtypes.append(store_dtype)
  1234. if store_dtype not in self.store_supported_dtypes:
  1235. self.simd_vec = False
  1236. return self.simd_vec
  1237. assert "buf" in name
  1238. index = self.rename_indexing(index)
  1239. if mode:
  1240. self.simd_vec = False
  1241. return False
  1242. self.simd_vec = self.simd_vec and self.could_vec(name, index)
  1243. return self.simd_vec
  1244. def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
  1245. if (
  1246. dtype == torch.float
  1247. and src_dtype == torch.float
  1248. and reduction_type in ["max", "min", "sum"]
  1249. ):
  1250. pass
  1251. else:
  1252. self.simd_vec = False
  1253. return self.simd_vec
  1254. def is_supported_cmp(self, node: torch.fx.Node):
  1255. def get_node_dtype(node):
  1256. if type(node) == torch.fx.Node:
  1257. opt_ctx: OptimizationContext = get_current_node_opt_ctx()
  1258. return opt_ctx.dtype if opt_ctx else None
  1259. else:
  1260. return None
  1261. def get_cmp_dtypes(node: torch.fx.Node):
  1262. return get_node_dtype(node.args[-2]), get_node_dtype(node.args[-1])
  1263. assert len(node.args) >= 2
  1264. # cmp(x, y): y is a magic value like x >= 1
  1265. if type(node.args[-1]) in [int, float]:
  1266. return True
  1267. # cmp(x, y): x is a magic value like 1 >= y
  1268. if type(node.args[-2]) in [int, float]:
  1269. return False
  1270. left_dtype, right_dtype = get_cmp_dtypes(node)
  1271. if left_dtype is None or right_dtype is None:
  1272. # TODO(Eikan): To record, deduce and propagate the data type of every expression.
  1273. return True
  1274. else:
  1275. return left_dtype == right_dtype
  1276. def is_load_only_block(self, sub_graph: torch.fx.Graph):
  1277. # The sub graph only contains "placeholder", "output", "get_index", "load"
  1278. is_load_only = False
  1279. load_dtype = None
  1280. skip_io_nodes = ["placeholder", "output"]
  1281. for _node in sub_graph.nodes:
  1282. if _node.op in skip_io_nodes:
  1283. continue
  1284. if _node.target not in ["load", "get_index"]:
  1285. # The body contains non load node
  1286. is_load_only = False
  1287. break
  1288. if _node.target == "load":
  1289. _, name, _ = _node.args
  1290. load_dtype = V.graph.get_dtype(name)
  1291. is_load_only = True
  1292. return is_load_only, load_dtype
  1293. def __exit__(self, exc_type, exc_val, exc_tb):
  1294. assert self._orig_wrapper_code is not None
  1295. # Restore the wrapper_code
  1296. V.graph.wrapper_code = self._orig_wrapper_code
  1297. self.exit_stack.__exit__(exc_type, exc_val, exc_tb)
  1298. def __enter__(self):
  1299. # Recorde the graph wrapper code. The wrapper_code status could be
  1300. # changed during graph run. Regarding this checker, we also need to
  1301. # run the graph but we don't expect to change any status that would
  1302. # impact the code generation. Hence, we record the graph wapper code
  1303. # and replace it with a dummy warpper_code and then restore to the
  1304. # original one as long as the checker is finished.
  1305. self._orig_wrapper_code = V.graph.wrapper_code
  1306. V.graph.wrapper_code = WrapperCodeGen()
  1307. class VecCheckerProxy:
  1308. @staticmethod
  1309. def _bin_cmp_op(x, y):
  1310. current_node: torch.fx.Node = V.interpreter.current_node
  1311. if not self.is_supported_cmp(current_node):
  1312. self.simd_vec = False
  1313. return self.simd_vec
  1314. @staticmethod
  1315. def __getattr__(name):
  1316. bin_cmp_ops = ["eq", "ne", "le", "ge", "lt", "gt"]
  1317. def inner(*args, **kwargs):
  1318. if name in bin_cmp_ops:
  1319. return VecCheckerProxy._bin_cmp_op(args, kwargs)
  1320. if not (name in self.fast_vec_list):
  1321. self.simd_vec = False
  1322. return self.simd_vec
  1323. return inner
  1324. @staticmethod
  1325. def load(name: str, index: sympy.Expr):
  1326. return self.load(name, index)
  1327. @staticmethod
  1328. def store(name, index, value, mode=None):
  1329. return self.store(name, index, value, mode=mode)
  1330. @staticmethod
  1331. def reduction(name, dtype, src_dtype, reduction_type, index, value):
  1332. return self.reduction(
  1333. name, dtype, src_dtype, reduction_type, index, value
  1334. )
  1335. @staticmethod
  1336. def constant(val, dtype):
  1337. with RecordOptimizationContext(__name__) as node_ctx:
  1338. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1339. assert opt_ctx
  1340. opt_ctx.dtype = dtype
  1341. i32_iinfo = numpy.iinfo(numpy.int32)
  1342. if (
  1343. dtype == torch.int64
  1344. and val <= i32_iinfo.max
  1345. and val >= i32_iinfo.min
  1346. ):
  1347. opt_ctx.dtype = torch.int32
  1348. f32_iinfo = numpy.finfo(numpy.float32)
  1349. if dtype == torch.double:
  1350. if (
  1351. (val <= f32_iinfo.max and val >= f32_iinfo.min)
  1352. or (val == numpy.inf)
  1353. or (val == -numpy.inf)
  1354. ):
  1355. opt_ctx.dtype = torch.float32
  1356. supported_dtype = (torch.float32, torch.int32)
  1357. is_supported_dtype = opt_ctx.dtype in (supported_dtype)
  1358. if not is_supported_dtype:
  1359. self.simd_vec = False
  1360. return is_supported_dtype
  1361. @staticmethod
  1362. def index_expr(expr, dtype):
  1363. current_node: torch.fx.Node = V.interpreter.current_node
  1364. assert len(self.ranges) == len(self.itervars)
  1365. if not len(self.ranges) or not all(
  1366. not isinstance(range, sympy.Expr) or sympy.simplify(range).is_number
  1367. for range in self.ranges
  1368. ):
  1369. # if the range value is sympy.Expr, we might could not deduce the accurate loop interval.
  1370. self.simd_vec = False
  1371. return self.cse.newvar()
  1372. def mod_indexing_rep(x, y, z):
  1373. if z.is_constant():
  1374. return x / y
  1375. # never really happens, we'll bail on optimizing
  1376. return (x / y) % z
  1377. def indexing_div_rep(x, y):
  1378. return x / y
  1379. with RecordOptimizationContext(__name__) as node_ctx:
  1380. assert len(self.ranges) == len(self.itervars)
  1381. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1382. assert opt_ctx
  1383. max_expr = expr.replace(
  1384. ir.ModularIndexing, mod_indexing_rep
  1385. ).replace(ir.FloorDiv, indexing_div_rep)
  1386. min_expr = max_expr
  1387. for idx in range(len(self.ranges)):
  1388. max_expr = sympy.maximum(
  1389. max_expr,
  1390. self.itervars[idx],
  1391. sympy.Interval(0, self.ranges[idx]),
  1392. )
  1393. min_expr = sympy.minimum(
  1394. min_expr,
  1395. self.itervars[idx],
  1396. sympy.Interval(0, self.ranges[idx]),
  1397. )
  1398. i32_iinfo = numpy.iinfo(numpy.int32)
  1399. if (
  1400. dtype == torch.int64
  1401. and max_expr.is_number
  1402. and min_expr.is_number
  1403. and max_expr <= i32_iinfo.max
  1404. and min_expr >= i32_iinfo.min
  1405. ):
  1406. opt_ctx.dtype = torch.int32
  1407. else:
  1408. opt_ctx.dtype = dtype
  1409. self.simd_vec = False
  1410. # Pick the most inner loop variable since we always vectorize the
  1411. # most inner loop
  1412. most_inner_var = self.itervars[-1]
  1413. most_inner_loop_irrevelant = self.is_invariant_under(
  1414. most_inner_var, expr
  1415. )
  1416. if not most_inner_loop_irrevelant:
  1417. self.simd_vec = False
  1418. opt_ctx.is_most_inner_loop_irrevelant = most_inner_loop_irrevelant
  1419. tmp_var = self.cse.newvar()
  1420. return tmp_var
  1421. @staticmethod
  1422. def indirect_indexing(index_var):
  1423. self.simd_vec = False
  1424. return sympy.Symbol(str(index_var))
  1425. @staticmethod
  1426. def masked(mask, body, other):
  1427. with RecordOptimizationContext(__name__) as node_ctx:
  1428. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1429. assert opt_ctx
  1430. is_masked_load, load_dtype = self.is_load_only_block(body.graph)
  1431. opt_ctx.dtype = load_dtype
  1432. opt_ctx.is_masked_load = is_masked_load
  1433. _simd_vec = is_masked_load and load_dtype in [
  1434. torch.float32,
  1435. torch.float,
  1436. ]
  1437. if not _simd_vec:
  1438. self.simd_vec = False
  1439. tmp_var = self.cse.newvar()
  1440. return tmp_var
  1441. @staticmethod
  1442. def to_dtype(x, dtype):
  1443. with RecordOptimizationContext(__name__) as node_ctx:
  1444. opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
  1445. assert opt_ctx
  1446. opt_ctx.dtype = dtype
  1447. if dtype != torch.bool:
  1448. self.simd_vec = False
  1449. return x
  1450. self.exit_stack.enter_context(V.set_ops_handler(VecCheckerProxy()))
  1451. self.exit_stack.enter_context(V.set_kernel_handler(self))
  1452. return self
  class CppTile2DKernelChecker(CppVecKernelChecker):
      """
      Checker that decides whether a kernel may be generated with 2D tiling.

      Currently, we only address the situations with following constraints.
      1. There exists one and only one fp32 load/store with outer loop var
         having contiguous buffer accesses.
      2. When a load/store doesn't have contiguous access in an outer loop
         var, the access should be vectorizable from the inner-most dim.
      3. No reduction.

      The verdict is kept in ``can_tile2d``; ``outer_tiling_idx`` is the index
      of the outer loop to tile (``-1`` until one is found).
      """

      def __init__(self, args, num_threads, tiling_factor):
          super().__init__(args, num_threads, tiling_factor)
          self.can_tile2d = True
          self.outer_tiling_idx = -1

      def check_can_tile2d(self, name: str, index: sympy.Expr):
          """Update ``can_tile2d`` for one access of buffer ``name`` at ``index``.

          Returns ``None`` when the check bails out early (already rejected, or
          rejected by one of the guards below) and ``self.can_tile2d`` when it
          runs to the end.
          """
          if not self.can_tile2d:
              return

          # make sure the transpose_mxn(src, ld_src, dst, ld_dst) ld_src
          # doesn't depend on most inner var.
          if len(self.itervars) > 0:
              innermost_var = self.itervars[-1]
              ld_src = self.stride_at(innermost_var, index)
              if not self.is_invariant_under(innermost_var, ld_src):
                  self.can_tile2d = False
                  return

          # check contiguity from any of the outer loops
          found_stride1 = False
          for outer_idx, outer_var in enumerate(self.itervars[:-1]):
              if not self.is_stride1_at(outer_var, index):
                  continue
              # only support 2d tile now: the buffer must be fp32 and every
              # contiguous access must agree on the same outer loop
              if V.graph.get_dtype(name) not in [torch.float, torch.float32] or (
                  self.outer_tiling_idx >= 0 and self.outer_tiling_idx != outer_idx
              ):
                  self.can_tile2d = False
                  return
              self.outer_tiling_idx = outer_idx
              found_stride1 = True

          if not found_stride1 and not self.could_vec(name, index):
              self.can_tile2d = False
          return self.can_tile2d

      def load(self, name: str, index: sympy.Expr):
          """Check a load; only float32, bool and uint8 buffers are accepted."""
          loadable_dtypes = [torch.float, torch.float32, torch.bool, torch.uint8]
          if V.graph.get_dtype(name) not in loadable_dtypes:
              self.can_tile2d = False
              return self.can_tile2d
          renamed_index = self.rename_indexing(index)
          return self.check_can_tile2d(name, renamed_index)

      def store(self, name, index, value, mode=None):
          """Check a store; only float32 buffers are accepted."""
          storable_dtypes = [torch.float, torch.float32]
          if V.graph.get_dtype(name) not in storable_dtypes:
              self.can_tile2d = False
              return self.can_tile2d
          renamed_index = self.rename_indexing(index)
          return self.check_can_tile2d(name, renamed_index)

      def reduction(self, name, dtype, src_dtype, reduction_type, index, value):
          """Reject 2D tiling as soon as any reduction is seen."""
          self.can_tile2d = False
          return self.can_tile2d

      def __exit__(self, exc_type, exc_val, exc_tb):
          super().__exit__(exc_type, exc_val, exc_tb)
          # 2D tiling also needs simd_vec to hold and an outer loop to have
          # been selected for tiling.
          if not (self.simd_vec and self.outer_tiling_idx >= 0):
              self.can_tile2d = False
  class CppKernelProxy(CppKernel):
      """
      Kernel that generates a scalar kernel for a set of nodes and, when the
      checkers allow it, vectorized or 2D-tiled kernels for the same nodes,
      all attached to one split loop nest.
      """

      def __init__(self, kernel_group):
          super().__init__(kernel_group.args, kernel_group.ws.num_threads)
          self.kernel_group = kernel_group
          self.loop_nest = None
          self.call_ranges = None
          self.picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()

      def codegen_nodes(self, nodes):
          """Generate the kernels for ``nodes`` and build ``self.loop_nest``.

          A scalar kernel is always generated. If a vector ISA was picked, the
          nodes are re-run through the vectorization and 2D-tiling checkers and
          the loop nest is split accordingly.
          """
          kernel_group = self.kernel_group
          reduction_first = max(nodes, key=lambda x: int(x.is_reduction()))
          _, (group, reduction_group) = reduction_first.group

          def codegen_kernel(cls, *args):
              with kernel_group.new_kernel(cls, *args) as kernel:
                  run(kernel)

                  # Ugly hack to maintain the metrics kernel count since
                  # we only count in CppKernelProxy, not those contained in it
                  metrics.generated_kernel_count -= 1

                  return kernel

          def run(kernel):
              iter_vars, reduction_vars = kernel.set_ranges(group, reduction_group)
              body_groups = [
                  (group, reduction_group),
                  (group + reduction_group, ()),
              ]
              in_suffix = False
              for node in nodes:
                  if node.group[1] in body_groups:
                      assert not in_suffix
                      node.run(iter_vars, reduction_vars)
                      continue

                  in_suffix = True
                  assert node.group[1] == (
                      group,
                      (),
                  ), f"unexpected group: {node.group[1]} != {group}, {reduction_group}"
                  # we can fuse in some extra pointwise into the suffix
                  with kernel.write_to_suffix():
                      node.run(iter_vars, ())

          scalar_kernel = codegen_kernel(CppKernel)
          inner_most_idx = len(scalar_kernel.itervars) - 1
          self.call_ranges = scalar_kernel.call_ranges
          self.loop_nest = LoopNestWithSplit.build(scalar_kernel)

          if not self.picked_vec_isa:
              return

          # TODO(jgong5): support alternative tiling factors and data types
          tiling_factor = self.picked_vec_isa.nelements(dtype=torch.float)

          def new_checker(checker_cls):
              # Each checker works on its own copy of the kernel args.
              return checker_cls(
                  deepcopy(self.kernel_group.args),
                  parallel_num_threads(),
                  tiling_factor,
              )

          # Kernels share the same global contexts like V.graph.wrapper_code,
          # V.kernel.args. But the generated scalar kernel has updated these
          # global contexts. Hence, the other kernels should not do this again
          # to avoid context conflict. By now, we only control the
          # config.inplace_buffers. In the future, we could maintain more
          # contexts.
          with torch._inductor.config.patch(inplace_buffers=False):
              with new_checker(CppVecKernelChecker) as vec_checker:
                  run(vec_checker)

              with new_checker(CppTile2DKernelChecker) as tile2d_checker:
                  run(tile2d_checker)

              if vec_checker.simd_vec:
                  main_loop, tail_loop = self.loop_nest.split_with_tiling(
                      inner_most_idx, factor=tiling_factor
                  )
                  vec_kernel = codegen_kernel(CppVecKernel, tiling_factor)
                  main_loop.set_kernel(vec_kernel)
                  tail_loop.set_kernel(scalar_kernel)
                  main_loop.simd_vec = True
                  tail_loop.simd_omp = True
                  # We chop the loop into two cubes by the nelements - main
                  # loop and tail loop. Regarding the main loop, it is
                  # straightforward that it could be vectorized with nelements.
                  # But for the tail loop, it still could be vectorized. For
                  # example, if the nelements is 8(256bits), then the tail loop
                  # still could be vectorized as 4(128bits).
                  tail_loop.simd_nelements = tiling_factor // 2
              elif tile2d_checker.can_tile2d:
                  outer_tiling_idx = tile2d_checker.outer_tiling_idx
                  assert outer_tiling_idx < inner_most_idx
                  outer_main_loop, outer_tail_loop = self.loop_nest.split_with_tiling(
                      outer_tiling_idx, factor=tiling_factor
                  )
                  outer_tail_loop.set_kernel(scalar_kernel)
                  # The inner split depth is relative to the outer main loop.
                  inner_main_loop, inner_tail_loop = outer_main_loop.split_with_tiling(
                      inner_most_idx - outer_tiling_idx, factor=tiling_factor
                  )
                  tile_kernel = codegen_kernel(
                      CppTile2DKernel, tiling_factor, outer_tiling_idx
                  )
                  inner_main_loop.set_kernel(tile_kernel)
                  tile_tail_kernel = codegen_kernel(
                      CppTile2DTailKernel, tiling_factor, outer_tiling_idx
                  )
                  inner_tail_loop.set_kernel(tile_tail_kernel)

      def codegen_loops(self, code, worksharing):
          """Emit the loops of ``self.loop_nest`` into ``code``."""
          self.codegen_loops_impl(self.loop_nest, code, worksharing)
  class CppScheduling:
      """Scheduling backend that turns fused nodes into C++ kernels."""

      def __init__(self, scheduler):
          self.scheduler = scheduler
          self.get_kernel_group()

      def group_fn(self, sizes):
          """Return ``sizes`` as a tuple of tuples of simplified expressions."""
          return tuple(
              tuple(V.graph.sizevars.simplify(dim) for dim in size) for size in sizes
          )

      def get_kernel_group(self):
          """Install a fresh kernel group matching the active wrapper codegen."""
          from .wrapper import CppWrapperCodeGen

          if isinstance(V.graph.wrapper_code, CppWrapperCodeGen):
              group_cls = CppWrapperKernelGroup
          else:
              group_cls = KernelGroup
          self.kernel_group = group_cls()

      @staticmethod
      def can_fuse_horizontal(node1, node2):
          """Return True when the two nodes have compatible iteration groups.

          Compatible means either identical (vars, reduction) groups, or
          ``node1`` has no reduction and its vars equal ``node2``'s vars
          followed by ``node2``'s reduction vars.
          """
          _, (vars1, reduce1) = node1.group
          _, (vars2, reduce2) = node2.group

          if vars1 == vars2 and reduce1 == reduce2:
              return True
          # TODO(jansel): allow fusion pointwise (vars1, ()) suffix?
          return bool(reduce1 == () and vars1 == vars2 + reduce2)

      @classmethod
      def can_fuse_vertical(cls, node1, node2):
          """Like horizontal fusion, but ``node1`` must not be a reduction."""
          if not cls.can_fuse_horizontal(node1, node2):
              return False
          return not node1.is_reduction()

      def codegen_nodes(self, nodes):
          """
          Turn a set of pre-fused nodes into a C++ kernel.
          """
          group = self.kernel_group
          proxy = CppKernelProxy(group)
          proxy.codegen_nodes(nodes)
          group.finalize_kernel(proxy, None)

      def codegen_sync(self):
          """No-op."""
          pass

      def flush(self):
          """Emit the accumulated kernel group, then start a new one."""
          self.kernel_group.codegen_define_and_call(V.graph.wrapper_code)
          self.get_kernel_group()
  class KernelGroup:
      """
      Collects the loops of several kernels into one buffer and emits them as
      a single ``extern "C"`` function plus the call to it.
      """

      def __init__(self):
          super().__init__()
          self.args = KernelArgs()
          self.loops_code = BracesBuffer()
          self.ws = WorkSharing(self.loops_code)
          self.stack = contextlib.ExitStack()
          self.stack.enter_context(self.ws)
          # number of kernels finalized into this group
          self.count = 0

      def new_kernel(self, cls, *args):
          """Instantiate kernel class ``cls`` sharing this group's args."""
          return cls(self.args, parallel_num_threads(), *args)

      def finalize_kernel(self, new_kernel, scheduler):
          """Count ``new_kernel`` and let it write its loops into the group."""
          self.count += 1
          new_kernel.codegen_loops(self.loops_code, self.ws)

      def codegen_define_and_call(self, wrapper):
          """Define the group's C++ function on ``wrapper`` and emit its call.

          Nothing is emitted when no kernel was finalized into the group.
          """
          self.stack.close()
          if self.count == 0:
              return

          kernel_name = "kernel_cpp_" + wrapper.next_kernel_suffix()
          arg_defs, call_args, arg_types = self.args.cpp_argdefs()
          # one argument per line, continuation lines padded with spaces
          arg_separator = ",\n".ljust(25)
          arg_defs = arg_separator.join(arg_defs)
          arg_types = ",".join(arg_types)

          code = BracesBuffer()
          # TODO: support kernel profile on other platforms
          enable_kernel_profile = (
              config.cpp.enable_kernel_profile and sys.platform == "linux"
          )
          if enable_kernel_profile:
              code.writelines(["#include <ATen/record_function.h>"])
          signature = f'extern "C" void kernel({arg_defs})'
          code.writelines([cpp_prefix(), signature])

          with code.indent():
              if enable_kernel_profile:
                  graph_id = V.graph.graph_id
                  if graph_id is not None:
                      prefix = "graph_" + str(graph_id) + "_"
                  else:
                      prefix = ""
                  code.writelines(
                      [
                          f'RECORD_FUNCTION("{prefix + kernel_name}", c10::ArrayRef<c10::IValue>({{}}));'
                      ]
                  )
              for old, new in self.args.aliases():
                  code.writeline(f"auto {old} = {new};")
              code.splice(self.loops_code)

          codecache_def = IndentedBuffer()
          codecache_def.writeline("async_compile.cpp('''")
          codecache_def.splice(code)
          codecache_def.writeline("''')")

          # TODO(voz): Ostensibly, we should not need this. But there are cases
          # where C++ codegen does not use BracesBuffer, so we have no good
          # indicator of a C++ buffer atm.
          codecache_str = codecache_def.getvalue().replace("#pragma CMT", "//")
          wrapper.define_kernel(kernel_name, codecache_str)
          wrapper.load_kernel(kernel_name, code, arg_types)
          # generate the code to call this
          wrapper.generate_kernel_call(kernel_name, call_args)
  class CppWrapperKernelGroup(KernelGroup):
      """Kernel group whose arguments are tracked by ``CppWrapperKernelArgs``."""

      def __init__(self):
          super().__init__()
          # Replace the args object created by the base initializer.
          self.args = CppWrapperKernelArgs()
  class WorkSharing:
      """
      Tracks whether the code buffer is currently inside an
      ``#pragma omp parallel`` region and opens/closes that region on demand.
      """

      def __init__(self, code):
          self.code = code
          self.in_parallel = False
          self.num_threads = None
          self.stack = contextlib.ExitStack()

      def parallel(self, threads):
          """Make sure a parallel region for ``threads`` threads is open."""
          if self.in_parallel:
              if threads != self.num_threads:
                  # wrong number of threads
                  self.close()
              else:
                  return

          self.num_threads = threads
          self.in_parallel = True
          if config.cpp.dynamic_threads:
              pragma = "#pragma omp parallel"
          else:
              pragma = f"#pragma omp parallel num_threads({threads})"
          self.code.writeline(pragma)
          # the indentation is undone when the stack is closed
          self.stack.enter_context(self.code.indent())

      def single(self):
          """Emit ``#pragma omp single`` if in a parallel region; report it."""
          inside = self.in_parallel
          if inside:
              self.code.writeline("#pragma omp single")
          return inside

      def close(self):
          """Leave the parallel region, if any."""
          self.stack.close()
          self.in_parallel = False

      def __enter__(self):
          self.stack.__enter__()
          return self

      def __exit__(self, exc_type, exc_val, exc_tb):
          self.stack.__exit__(exc_type, exc_val, exc_tb)
  @dataclasses.dataclass
  class LoopLevel:
      """One ``for`` loop of a loop nest, with its nested loops and leaf kernel."""

      var: sympy.Expr = None
      size: sympy.Expr = None
      offset: sympy.Expr = sympy.Integer(0)
      steps: sympy.Expr = sympy.Integer(1)
      parallel: int = 0
      simd_omp: bool = False
      picked_vec_isa: codecache.VecISA = codecache.pick_vec_isa()
      simd_nelements: int = picked_vec_isa.nelements() if picked_vec_isa else 0
      simd_vec: bool = False
      collapsed: bool = False
      reduction_var_map: Dict[str, str] = None
      parent: "LoopLevel" = None
      # the next inner level of the loop, empty if it is inner-most
      # contains >1 LoopLevel if the inner level of loop is split
      inner: List["LoopLevel"] = dataclasses.field(default_factory=list)
      # kernel assigned to this loop level, only valid when it is a leaf
      kernel: CppKernel = None

      def get_kernels(self) -> List[CppKernel]:
          """Get all kernel objects under this loop level"""
          if self.kernel:
              return [self.kernel]
          return [
              kernel for child in self.inner for kernel in child.get_kernels()
          ]

      def set_kernel(self, kernel: CppKernel):
          """
          Set the kernel under this loop level. No split is allowed under
          this loop level.
          """
          if self.inner:
              # not a leaf yet: descend through the single inner loop
              assert len(self.inner) == 1
              self.inner[0].set_kernel(kernel)
              return

          self.kernel = kernel
          if not self.is_reduction():
              return
          # The leaf takes its own copy of the kernel's reduction vars; every
          # enclosing reduction loop is extended with them.
          self.reduction_var_map = kernel.reduction_var_map.copy()
          ancestor = self.parent
          while ancestor is not None and ancestor.is_reduction():
              ancestor.reduction_var_map.update(kernel.reduction_var_map)
              ancestor = ancestor.parent

      def get_loops_at(self, depth) -> List["LoopLevel"]:
          """Return the loop levels ``depth`` levels below this one."""
          if depth == 0:
              return [self]
          return [
              level
              for child in self.inner
              for level in child.get_loops_at(depth - 1)
          ]

      def is_reduction(self):
          """Whether this level has a non-empty reduction var map."""
          return bool(self.reduction_var_map)

      def split_with_tiling(self, depth, factor):
          """Split the loop ``depth`` levels below into (main_loop, tail_loop).

          The main loop runs over ``FloorDiv(size, factor)``; the tail loop
          keeps the original size and starts at that range times ``factor``.
          Both receive clones of the inner loops. When the split loop has a
          parent, the parent's inner loops are replaced by the two new loops.
          """
          if depth != 0:
              assert len(self.inner) == 1
              return self.inner[0].split_with_tiling(depth - 1, factor)

          def derive(size, offset=None):
              # New level over the same var with cloned inner loops.
              piece = LoopLevel(self.var, size)
              if offset is not None:
                  piece.offset = offset
              piece.parallel = self.parallel
              piece.collapsed = False
              piece.reduction_var_map = self.reduction_var_map
              piece.inner = [child.clone() for child in self.inner or ()]
              for child in piece.inner:
                  child.parent = piece
              return piece

          sympy_factor = sympy.Integer(factor)
          main_loop_range = ir.FloorDiv(self.size, sympy_factor)
          main_loop = derive(main_loop_range)
          tail_loop = derive(self.size, main_loop_range * sympy_factor)

          parent = self.parent
          if parent:
              parent.inner = [main_loop, tail_loop]
              main_loop.parent = parent
              tail_loop.parent = parent
          return main_loop, tail_loop

      def clone(self):
          """Copy this level, cloning inner loops and deep-copying the kernel."""
          duplicate = copy(self)
          duplicate.inner = []
          for child in self.inner or ():
              child_copy = child.clone()
              child_copy.parent = duplicate
              duplicate.inner.append(child_copy)
          duplicate.kernel = deepcopy(self.kernel)
          return duplicate

      def lines(self):
          """Return the source lines opening this loop: optional pragma + ``for``."""
          if self.reduction_var_map:
              clauses = [
                  f"reduction({RTYPE_TO_CPP[rtype]}:{var})"
                  for var, rtype in self.reduction_var_map.items()
              ]
              reduction = " " + " ".join(clauses)
          else:
              reduction = ""

          if self.simd_omp and self.simd_nelements > 1:
              simd = f"simd simdlen({self.simd_nelements}) "
          else:
              simd = ""

          pragma = ""
          if self.parallel:
              # TODO(jansel): look into chunk size and other schedules
              pragma = f"#pragma omp for{reduction} "
              if self.parallel > 1:
                  pragma += f" collapse({self.parallel})"
              if self.simd_omp:
                  pragma = pragma.replace(" for ", f" for {simd}")
          elif not self.simd_vec:
              if self.simd_omp:
                  pragma = f"#pragma omp {simd}{reduction}"
              elif not self.reduction_var_map and codecache.is_gcc():
                  pragma = "#pragma GCC ivdep"

          var = self.var
          header = f"for({INDEX_TYPE} {var}={cexpr(self.offset)}; {var}<{cexpr(self.size)}; {var}+={cexpr(self.steps)})"
          # a collapsed loop never carries its own pragma
          if pragma and not self.collapsed:
              return [pragma, header]
          return [header]
  @dataclasses.dataclass
  class LoopNestWithSplit:
      """
      A loop-nest like structure but with some loop level split along
      the loop range into the main tiling loop and the tail. It is built
      with the `build` method as a loop nest and then split with
      `split_with_tiling` at some depth.

      A typical case is for vectorization where we typically split at the
      inner-most loop level. A more complicated case is 2D tiling where we
      split at both inner-most and outer levels.
      """

      # top-level loops; holds two entries once the outer-most loop is split
      root: List[LoopLevel] = None
      # kernel of the nest itself, only set when the nest has no loops
      kernel: CppKernel = None

      @staticmethod
      def build(kernel: CppKernel):
          """Build a LoopNest with the given `kernel` as the leaf.

          One `LoopLevel` is created per (itervar, range) pair of `kernel`,
          each nested inside the previous one. Levels at or beyond
          `kernel.reduction_depth` get a copy of the kernel's reduction var
          map. The kernel is attached to the inner-most level, or to the nest
          itself when the kernel has no loops.
          """
          itervars = kernel.itervars
          ranges = kernel.ranges
          reduction_depth = kernel.reduction_depth

          root: List[LoopLevel] = []
          levels: List[LoopLevel] = root
          loop: LoopLevel = None
          for loop_idx, (var, size) in enumerate(zip(itervars, ranges)):
              loop = LoopLevel(var, size, parent=loop)
              if loop_idx >= reduction_depth:
                  loop.reduction_var_map = kernel.reduction_var_map.copy()
              levels.append(loop)
              levels = loop.inner

          # Only `root` is passed: the second positional field is `kernel`,
          # so passing anything else there would store a non-kernel value.
          loop_nest = LoopNestWithSplit(root)
          if loop is not None:
              loop.kernel = kernel
          else:
              loop_nest.kernel = kernel
          return loop_nest

      def __bool__(self):
          """A nest is truthy when it has at least one top-level loop."""
          return bool(self.root)

      def get_loops_at(self, depth) -> List[LoopLevel]:
          """Get all the loop levels at the given `depth` (most outer loop has depth 0)"""
          loops = []
          for loop in self.root:
              loops += loop.get_loops_at(depth)
          return loops

      # NOTE(review): the result goes through `cache_on_self`; presumably it is
      # not queried before the nest is split -- confirm against callers.
      @cache_on_self
      def max_parallel_depth(self):
          """
          Maximal allowed depth for parallelism:
          1) Levels without splitting and
          2) All reduction or non-reduction levels
          When the loop is split at the top level, the max depth is 1.
          """
          max_depth = 0
          loops = self.root
          if len(loops) > 1:
              return 1
          is_reduction = loops[0].is_reduction() if loops else False
          while len(loops) == 1 and loops[0].is_reduction() == is_reduction:
              max_depth += 1
              loops = loops[0].inner
          return max_depth

      def is_reduction_only(self):
          """
          Whether all the loops are for reduction. Reduction loops
          are always the inner most ones.

          Always returns a bool, including for a nest without loops.
          """
          return bool(self.root) and self.root[0].is_reduction()

      def mark_parallel(self, par_depth):
          """Mark the outer `par_depth` levels as one collapsed parallel loop.

          Every top-level loop gets `parallel = par_depth`; the levels below
          the first one, down to `par_depth`, are flagged as collapsed.
          """
          assert (
              par_depth <= self.max_parallel_depth()
          ), "Parallel depth cannot exceed the maximal allowed parallel depth"
          loops = self.root
          for loop in loops:
              loop.parallel = par_depth
          for _ in range(1, par_depth):
              loops = loops[0].inner
              loops[0].collapsed = True

      def split_with_tiling(self, depth, factor):
          """
          Split the loop into main and tail loops at given `depth` so that the range
          of the main loop has range `floor_div(range, factor) * factor` and
          the tail loop handles the remainder. The main loop is tiled
          according to the `factor`.

          Returns the `(main_loop, tail_loop)` pair.
          """
          loops = self.get_loops_at(depth)
          assert len(loops) == 1
          split_loops = loops[0].split_with_tiling(0, factor)
          if depth == 0:
              # A top-level loop has no parent to re-link, so the nest's root
              # is replaced here; it is kept as a list like the one `build`
              # creates.
              self.root = list(split_loops)
          return split_loops