// cxx11_tensor_gpu.cu
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX

#define EIGEN_USE_GPU

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>

#define EIGEN_GPU_TEST_C99_MATH EIGEN_HAS_CXX11

using Eigen::Tensor;

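// Each test below follows the same pattern: fill host tensors, copy them to
// device buffers with gpuMemcpy, evaluate a tensor expression on an
// Eigen::GpuDevice backed by a GpuStreamDevice, copy the result back with
// gpuMemcpyAsync, synchronize the stream, and verify the results element by
// element on the host.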
void test_gpu_nullary() {
  Tensor<float, 1, 0, int> in1(2);
  Tensor<float, 1, 0, int> in2(2);
  in1.setRandom();
  in2.setRandom();

  std::size_t tensor_bytes = in1.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  gpuMalloc((void**)(&d_in1), tensor_bytes);
  gpuMalloc((void**)(&d_in2), tensor_bytes);
  gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(
      d_in1, 2);
  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(
      d_in2, 2);

  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
  gpu_in2.device(gpu_device) = gpu_in2.random();

  Tensor<float, 1, 0, int> new1(2);
  Tensor<float, 1, 0, int> new2(2);

  assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(new1(i), 3.14f);
    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
}

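// Element-wise sum of two 2-element tensors evaluated on the device.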
void test_gpu_elementwise_small() {
  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
  in1.setRandom();
  in2.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_in2), in2_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 2; ++i) {
    VERIFY_IS_APPROX(
        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
  gpuFree(d_out);
}

void test_gpu_elementwise()
{
  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  in1.setRandom();
  in2.setRandom();
  in3.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t in2_bytes = in2.size() * sizeof(float);
  std::size_t in3_bytes = in3.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_in2;
  float* d_in3;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_in2), in2_bytes);
  gpuMalloc((void**)(&d_in3), in3_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));

  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 53; ++j) {
      for (int k = 0; k < 97; ++k) {
        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)),
                         in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) +
                         in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
      }
    }
  }

  gpuFree(d_in1);
  gpuFree(d_in2);
  gpuFree(d_in3);
  gpuFree(d_out);
}

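// isnan() applied element-wise on the device, producing a tensor of bools.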
void test_gpu_props() {
  Tensor<float, 1> in1(200);
  Tensor<bool, 1> out(200);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(bool);

  float* d_in1;
  bool* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
      d_in1, 200);
  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(
      d_out, 200);

  gpu_out.device(gpu_device) = (gpu_in1.isnan)();

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 200; ++i) {
    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
  }

  gpuFree(d_in1);
  gpuFree(d_out);
}

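// Maximum reduction over dimensions 1 and 3 of a 4D tensor, verified with an
// explicit max over the reduced indices on the host.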
void test_gpu_reduction()
{
  Tensor<float, 4> in1(72,53,97,113);
  Tensor<float, 2> out(72,97);
  in1.setRandom();

  std::size_t in1_bytes = in1.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_in1;
  float* d_out;
  gpuMalloc((void**)(&d_in1), in1_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);

  array<Eigen::DenseIndex, 2> reduction_axis;
  reduction_axis[0] = 1;
  reduction_axis[1] = 3;

  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      float expected = 0;
      for (int k = 0; k < 53; ++k) {
        for (int l = 0; l < 113; ++l) {
          expected = std::max<float>(expected, in1(i, k, j, l));
        }
      }
      VERIFY_IS_APPROX(out(i,j), expected);
    }
  }

  gpuFree(d_in1);
  gpuFree(d_out);
}

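// Tensor contraction over two index pairs, verified against the equivalent
// flattened matrix product computed on the host.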
template<int DataLayout>
void test_gpu_contraction()
{
  // with these dimensions, the output has 300 * 140 elements, which is
  // more than 30 * 1024, which is the number of threads in blocks on
  // a 15 SM GK110 GPU
  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));

  t_left.setRandom();
  t_right.setRandom();

  std::size_t t_left_bytes = t_left.size() * sizeof(float);
  std::size_t t_right_bytes = t_right.size() * sizeof(float);
  std::size_t t_result_bytes = t_result.size() * sizeof(float);

  float* d_t_left;
  float* d_t_right;
  float* d_t_result;
  gpuMalloc((void**)(&d_t_left), t_left_bytes);
  gpuMalloc((void**)(&d_t_right), t_right_bytes);
  gpuMalloc((void**)(&d_t_result), t_result_bytes);

  gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);

  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
  MapXf m_left(t_left.data(), 300, 93);
  MapXf m_right(t_right.data(), 93, 140);
  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);

  typedef Tensor<float, 1>::DimensionPair DimPair;
  Eigen::array<DimPair, 2> dims;
  dims[0] = DimPair(2, 0);
  dims[1] = DimPair(3, 1);

  m_result = m_left * m_right;
  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);

  gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);

  for (DenseIndex i = 0; i < t_result.size(); i++) {
    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
      std::cout << "mismatch detected at index " << i << ": "
                << t_result.data()[i] << " vs " << m_result.data()[i] << std::endl;
      assert(false);
    }
  }

  gpuFree(d_t_left);
  gpuFree(d_t_right);
  gpuFree(d_t_result);
}

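// 1D convolutions with a 4-tap kernel along various dimensions; each output
// entry is verified against the explicit dot product computed on the host.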
template<int DataLayout>
void test_gpu_convolution_1d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 1, DataLayout> kernel(4);
  Tensor<float, 4, DataLayout> out(74,34,11,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);

  Eigen::array<Eigen::DenseIndex, 1> dims(1);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 34; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

void test_gpu_convolution_inner_dim_col_major_1d()
{
  Tensor<float, 4, ColMajor> input(74,9,11,7);
  Tensor<float, 1, ColMajor> kernel(4);
  Tensor<float, 4, ColMajor> out(71,9,11,7);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input, 74,9,11,7);
  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out, 71,9,11,7);

  Eigen::array<Eigen::DenseIndex, 1> dims(0);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 71; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 7; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

void test_gpu_convolution_inner_dim_row_major_1d()
{
  Tensor<float, 4, RowMajor> input(7,9,11,74);
  Tensor<float, 1, RowMajor> kernel(4);
  Tensor<float, 4, RowMajor> out(7,9,11,71);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);

  Eigen::array<Eigen::DenseIndex, 1> dims(3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 7; ++i) {
    for (int j = 0; j < 9; ++j) {
      for (int k = 0; k < 11; ++k) {
        for (int l = 0; l < 71; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

template<int DataLayout>
void test_gpu_convolution_2d()
{
  Tensor<float, 4, DataLayout> input(74,37,11,137);
  Tensor<float, 2, DataLayout> kernel(3,4);
  Tensor<float, 4, DataLayout> out(74,35,8,137);
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel, 3,4);
  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,35,8,137);

  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 137; ++l) {
          const float result = out(i,j,k,l);
          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
                                 input(i,j+1,k+0,l) * kernel(1,0) +
                                 input(i,j+2,k+0,l) * kernel(2,0) +
                                 input(i,j+0,k+1,l) * kernel(0,1) +
                                 input(i,j+1,k+1,l) * kernel(1,1) +
                                 input(i,j+2,k+1,l) * kernel(2,1) +
                                 input(i,j+0,k+2,l) * kernel(0,2) +
                                 input(i,j+1,k+2,l) * kernel(1,2) +
                                 input(i,j+2,k+2,l) * kernel(2,2) +
                                 input(i,j+0,k+3,l) * kernel(0,3) +
                                 input(i,j+1,k+3,l) * kernel(1,3) +
                                 input(i,j+2,k+3,l) * kernel(2,3);
          VERIFY_IS_APPROX(result, expected);
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

template<int DataLayout>
void test_gpu_convolution_3d()
{
  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
  Tensor<float, 3, DataLayout> kernel(3,4,2);
  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
  input = input.constant(10.0f) + input.random();
  kernel = kernel.constant(7.0f) + kernel.random();

  std::size_t input_bytes = input.size() * sizeof(float);
  std::size_t kernel_bytes = kernel.size() * sizeof(float);
  std::size_t out_bytes = out.size() * sizeof(float);

  float* d_input;
  float* d_kernel;
  float* d_out;
  gpuMalloc((void**)(&d_input), input_bytes);
  gpuMalloc((void**)(&d_kernel), kernel_bytes);
  gpuMalloc((void**)(&d_out), out_bytes);

  gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input, 74,37,11,137,17);
  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel, 3,4,2);
  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out, 74,35,8,136,17);

  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);

  assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 74; ++i) {
    for (int j = 0; j < 35; ++j) {
      for (int k = 0; k < 8; ++k) {
        for (int l = 0; l < 136; ++l) {
          for (int m = 0; m < 17; ++m) {
            const float result = out(i,j,k,l,m);
            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
            VERIFY_IS_APPROX(result, expected);
          }
        }
      }
    }
  }

  gpuFree(d_input);
  gpuFree(d_kernel);
  gpuFree(d_out);
}

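// The remaining tests evaluate special functions (lgamma, digamma, zeta,
// polygamma, igamma/igammac, erf/erfc, ndtri, betainc, Bessel i0e/i1e and the
// igamma/gamma-sample derivatives) on the device and compare the results
// against std:: math functions or precomputed reference values.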
#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
void test_gpu_lgamma(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.lgamma();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}
#endif

template <typename Scalar>
void test_gpu_digamma()
{
  Tensor<Scalar, 1> in(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in(0) = Scalar(1);
  in(1) = Scalar(1.5);
  in(2) = Scalar(4);
  in(3) = Scalar(-10.5);
  in(4) = Scalar(10000.5);
  in(5) = Scalar(0);
  in(6) = Scalar(-1);

  expected_out(0) = Scalar(-0.5772156649015329);
  expected_out(1) = Scalar(0.03648997397857645);
  expected_out(2) = Scalar(1.2561176684318);
  expected_out(3) = Scalar(2.398239129535781);
  expected_out(4) = Scalar(9.210340372392849);
  expected_out(5) = std::numeric_limits<Scalar>::infinity();
  expected_out(6) = std::numeric_limits<Scalar>::infinity();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in.digamma();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 5; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }
  for (int i = 5; i < 7; ++i) {
    VERIFY_IS_EQUAL(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_zeta()
{
  Tensor<Scalar, 1> in_x(6);
  Tensor<Scalar, 1> in_q(6);
  Tensor<Scalar, 1> out(6);
  Tensor<Scalar, 1> expected_out(6);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(1.5);
  in_x(2) = Scalar(4);
  in_x(3) = Scalar(-10.5);
  in_x(4) = Scalar(10000.5);
  in_x(5) = Scalar(3);

  in_q(0) = Scalar(1.2345);
  in_q(1) = Scalar(2);
  in_q(2) = Scalar(1.5);
  in_q(3) = Scalar(3);
  in_q(4) = Scalar(1.0001);
  in_q(5) = Scalar(-2.5);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = Scalar(1.61237534869);
  expected_out(2) = Scalar(0.234848505667);
  expected_out(3) = Scalar(1.03086757337e-5);
  expected_out(4) = Scalar(0.367879440865);
  expected_out(5) = Scalar(0.054102025820864097);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_q;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_q), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);

  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY((std::isnan)(out(3)));

  for (int i = 1; i < 6; ++i) {
    if (i != 3) {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  gpuFree(d_in_x);
  gpuFree(d_in_q);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_polygamma()
{
  Tensor<Scalar, 1> in_x(7);
  Tensor<Scalar, 1> in_n(7);
  Tensor<Scalar, 1> out(7);
  Tensor<Scalar, 1> expected_out(7);
  out.setZero();

  in_n(0) = Scalar(1);
  in_n(1) = Scalar(1);
  in_n(2) = Scalar(1);
  in_n(3) = Scalar(17);
  in_n(4) = Scalar(31);
  in_n(5) = Scalar(28);
  in_n(6) = Scalar(8);

  in_x(0) = Scalar(2);
  in_x(1) = Scalar(3);
  in_x(2) = Scalar(25.5);
  in_x(3) = Scalar(4.7);
  in_x(4) = Scalar(11.8);
  in_x(5) = Scalar(17.7);
  in_x(6) = Scalar(30.2);

  expected_out(0) = Scalar(0.644934066848);
  expected_out(1) = Scalar(0.394934066848);
  expected_out(2) = Scalar(0.0399946696496);
  expected_out(3) = Scalar(293.334565435);
  expected_out(4) = Scalar(0.445487887616);
  expected_out(5) = Scalar(-2.47810300902e-07);
  expected_out(6) = Scalar(-8.29668781082e-09);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_n;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_n), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);

  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 7; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in_x);
  gpuFree(d_in_n);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_igamma()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
                          {0.0, 0.6321205588285578, 0.7768698398515702,
                           0.9816843611112658, 9.999500016666262e-05, 1.0},
                          {0.0, 0.4275932955291202, 0.608374823728911,
                           0.9539882943107686, 7.522076445089201e-07, 1.0},
                          {0.0, 0.01898815687615381, 0.06564245437845008,
                           0.5665298796332909, 4.166333347221828e-18, 1.0},
                          {0.0, 0.9999780593618628, 0.9999899967080838,
                           0.9999996219837988, 0.9991370418689945, 1.0},
                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);

  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igamma_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
      }
    }
  }

  gpuFree(d_a);
  gpuFree(d_x);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_igammac()
{
  Tensor<Scalar, 2> a(6, 6);
  Tensor<Scalar, 2> x(6, 6);
  Tensor<Scalar, 2> out(6, 6);
  out.setZero();

  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      a(i, j) = a_s[i];
      x(i, j) = x_s[j];
    }
  }

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
                           {1.0, 0.36787944117144233, 0.22313016014842982,
                            0.018315638888734182, 0.9999000049998333, 0.0},
                           {1.0, 0.5724067044708798, 0.3916251762710878,
                            0.04601170568923136, 0.9999992477923555, 0.0},
                           {1.0, 0.9810118431238462, 0.9343575456215499,
                            0.4334701203667089, 1.0, 0.0},
                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
                            3.7801620118431334e-07, 0.0008629581310054535,
                            0.0},
                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};

  std::size_t bytes = a.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_a), bytes);
  gpuMalloc((void**)(&d_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);

  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 6; ++j) {
      if ((std::isnan)(igammac_s[i][j])) {
        VERIFY((std::isnan)(out(i, j)));
      } else {
        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
      }
    }
  }

  gpuFree(d_a);
  gpuFree(d_x);
  gpuFree(d_out);
}

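// erf and erfc of randomly scaled inputs, compared against std::erf and
// std::erfc on the host.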
#if EIGEN_GPU_TEST_C99_MATH
template <typename Scalar>
void test_gpu_erf(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess);
  assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erf();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_erfc(const Scalar stddev)
{
  Tensor<Scalar, 2> in(72,97);
  in.setRandom();
  in *= in.constant(stddev);
  Tensor<Scalar, 2> out(72,97);
  out.setZero();

  std::size_t bytes = in.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);

  gpu_out.device(gpu_device) = gpu_in.erfc();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 72; ++i) {
    for (int j = 0; j < 97; ++j) {
      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));
    }
  }

  gpuFree(d_in);
  gpuFree(d_out);
}
#endif

template <typename Scalar>
void test_gpu_ndtri()
{
  Tensor<Scalar, 1> in_x(9);
  Tensor<Scalar, 1> out(9);
  Tensor<Scalar, 1> expected_out(9);
  out.setZero();

  in_x(0) = Scalar(1);
  in_x(1) = Scalar(0.);
  in_x(2) = Scalar(0.5);
  in_x(3) = Scalar(0.2);
  in_x(4) = Scalar(0.8);
  in_x(5) = Scalar(0.9);
  in_x(6) = Scalar(0.1);
  in_x(7) = Scalar(0.99);
  in_x(8) = Scalar(0.01);

  expected_out(0) = std::numeric_limits<Scalar>::infinity();
  expected_out(1) = -std::numeric_limits<Scalar>::infinity();
  expected_out(2) = Scalar(0.0);
  expected_out(3) = Scalar(-0.8416212335729142);
  expected_out(4) = Scalar(0.8416212335729142);
  expected_out(5) = Scalar(1.2815515655446004);
  expected_out(6) = Scalar(-1.2815515655446004);
  expected_out(7) = Scalar(2.3263478740408408);
  expected_out(8) = Scalar(-2.3263478740408408);

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 9);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 9);

  gpu_out.device(gpu_device) = gpu_in_x.ndtri();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  // ndtri(1) and ndtri(0) are +/- infinity; the remaining entries are finite.
  VERIFY_IS_EQUAL(out(0), expected_out(0));
  VERIFY_IS_EQUAL(out(1), expected_out(1));
  for (int i = 2; i < 9; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in_x);
  gpuFree(d_out);
}

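// betainc(a, b, x) over a 5x5x5 grid of (a, b, x) combinations; the expected
// values (NaN where the inputs are outside the function's domain) are listed
// explicitly below.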
template <typename Scalar>
void test_gpu_betainc()
{
  Tensor<Scalar, 1> in_x(125);
  Tensor<Scalar, 1> in_a(125);
  Tensor<Scalar, 1> in_b(125);
  Tensor<Scalar, 1> out(125);
  Tensor<Scalar, 1> expected_out(125);
  out.setZero();

  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();

  Array<Scalar, 1, Dynamic> x(125);
  Array<Scalar, 1, Dynamic> a(125);
  Array<Scalar, 1, Dynamic> b(125);
  Array<Scalar, 1, Dynamic> v(125);

  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;

  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
      999.999, 999.999, 999.999;

  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;

  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
      0.5000000000004947, 0.9999999999999999, nan;

  for (int i = 0; i < 125; ++i) {
    in_x(i) = x(i);
    in_a(i) = a(i);
    in_b(i) = b(i);
    expected_out(i) = v(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in_x;
  Scalar* d_in_a;
  Scalar* d_in_b;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in_x), bytes);
  gpuMalloc((void**)(&d_in_a), bytes);
  gpuMalloc((void**)(&d_in_b), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);

  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 1; i < 125; ++i) {
    if ((std::isnan)(expected_out(i))) {
      VERIFY((std::isnan)(out(i)));
    } else {
      VERIFY_IS_APPROX(out(i), expected_out(i));
    }
  }

  gpuFree(d_in_x);
  gpuFree(d_in_a);
  gpuFree(d_in_b);
  gpuFree(d_out);
}

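// Exponentially scaled modified Bessel functions i0e and i1e, checked against
// precomputed reference values on a symmetric grid of inputs.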
template <typename Scalar>
void test_gpu_i0e()
{
  Tensor<Scalar, 1> in_x(21);
  Tensor<Scalar, 1> out(21);
  Tensor<Scalar, 1> expected_out(21);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_x_array(21);
  Array<Scalar, 1, Dynamic> expected_out_array(21);

  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;

  expected_out_array << 0.0897803118848, 0.0947062952128, 0.100544127361,
      0.107615251671, 0.116426221213, 0.127833337163, 0.143431781857,
      0.16665743264, 0.207001921224, 0.308508322554, 1.0, 0.308508322554,
      0.207001921224, 0.16665743264, 0.143431781857, 0.127833337163,
      0.116426221213, 0.107615251671, 0.100544127361, 0.0947062952128,
      0.0897803118848;

  for (int i = 0; i < 21; ++i) {
    in_x(i) = in_x_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);

  gpu_out.device(gpu_device) = gpu_in.bessel_i0e();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 21; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}

template <typename Scalar>
void test_gpu_i1e()
{
  Tensor<Scalar, 1> in_x(21);
  Tensor<Scalar, 1> out(21);
  Tensor<Scalar, 1> expected_out(21);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_x_array(21);
  Array<Scalar, 1, Dynamic> expected_out_array(21);

  in_x_array << -20.0, -18.0, -16.0, -14.0, -12.0, -10.0, -8.0, -6.0, -4.0,
      -2.0, 0.0, 2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0;

  expected_out_array << -0.0875062221833, -0.092036796872, -0.0973496147565,
      -0.103697667463, -0.11146429929, -0.121262681384, -0.134142493293,
      -0.152051459309, -0.178750839502, -0.215269289249, 0.0, 0.215269289249,
      0.178750839502, 0.152051459309, 0.134142493293, 0.121262681384,
      0.11146429929, 0.103697667463, 0.0973496147565, 0.092036796872,
      0.0875062221833;

  for (int i = 0; i < 21; ++i) {
    in_x(i) = in_x_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_in;
  Scalar* d_out;
  gpuMalloc((void**)(&d_in), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 21);

  gpu_out.device(gpu_device) = gpu_in.bessel_i1e();

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 21; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_in);
  gpuFree(d_out);
}
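// Checks the derivative of the regularized lower incomplete gamma function
// with respect to its first argument, i.e. igamma_der_a(a, x) = d/da igamma(a, x).
// Inputs and expected outputs are hard-coded; the comment inside the function
// points at special_functions.cpp for the script that generated them.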
template <typename Scalar>
void test_gpu_igamma_der_a()
{
  Tensor<Scalar, 1> in_x(30);
  Tensor<Scalar, 1> in_a(30);
  Tensor<Scalar, 1> out(30);
  Tensor<Scalar, 1> expected_out(30);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_a_array(30);
  Array<Scalar, 1, Dynamic> in_x_array(30);
  Array<Scalar, 1, Dynamic> expected_out_array(30);

  // See special_functions.cpp for the Python code that generates the test data.

  in_a_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0, 1.0,
      1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0, 100.0,
      100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;

  in_x_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;

  expected_out_array << -32.7256441441, -36.4394150514, -9.66467612263,
      -36.4394150514, -36.4394150514, -1.0891900302, -2.66351229645,
      -2.48666868596, -0.929700494428, -3.56327722764, -0.455320135314,
      -0.391437214323, -0.491352055991, -0.350454834292, -0.471773162921,
      -0.104084440522, -0.0723646747909, -0.0992828975532, -0.121638215446,
      -0.122619605294, -0.0317670267286, -0.0359974812869, -0.0154359225363,
      -0.0375775365921, -0.00794899153653, -0.00777303219211, -0.00796085782042,
      -0.0125850719397, -0.00455500206958, -0.00476436993148;

  for (int i = 0; i < 30; ++i) {
    in_x(i) = in_x_array(i);
    in_a(i) = in_a_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_x.size() * sizeof(Scalar);

  Scalar* d_a;
  Scalar* d_x;
  Scalar* d_out;
  gpuMalloc((void**)(&d_a), bytes);
  gpuMalloc((void**)(&d_x), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_x(d_x, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);

  gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 30; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_a);
  gpuFree(d_x);
  gpuFree(d_out);
}
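// Checks gamma_sample_der_alpha(alpha, sample): the derivative of a
// Gamma(alpha, 1) sample with respect to alpha (typically used for implicit
// reparameterization gradients). As above, the inputs and expected outputs
// are hard-coded and come from the script referenced in special_functions.cpp.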
template <typename Scalar>
void test_gpu_gamma_sample_der_alpha()
{
  Tensor<Scalar, 1> in_alpha(30);
  Tensor<Scalar, 1> in_sample(30);
  Tensor<Scalar, 1> out(30);
  Tensor<Scalar, 1> expected_out(30);
  out.setZero();

  Array<Scalar, 1, Dynamic> in_alpha_array(30);
  Array<Scalar, 1, Dynamic> in_sample_array(30);
  Array<Scalar, 1, Dynamic> expected_out_array(30);

  // See special_functions.cpp for the Python code that generates the test data.

  in_alpha_array << 0.01, 0.01, 0.01, 0.01, 0.01, 0.1, 0.1, 0.1, 0.1, 0.1, 1.0,
      1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 100.0, 100.0, 100.0,
      100.0, 100.0, 1000.0, 1000.0, 1000.0, 1000.0, 1000.0;

  in_sample_array << 1.25668890405e-26, 1.17549435082e-38, 1.20938905072e-05,
      1.17549435082e-38, 1.17549435082e-38, 5.66572070696e-16, 0.0132865061065,
      0.0200034203853, 6.29263709118e-17, 1.37160367764e-06, 0.333412038288,
      1.18135687766, 0.580629033777, 0.170631439426, 0.786686768458,
      7.63873279537, 13.1944344379, 11.896042354, 10.5830172417, 10.5020942233,
      92.8918587747, 95.003720371, 86.3715926467, 96.0330217672, 82.6389930677,
      968.702906754, 969.463546828, 1001.79726022, 955.047416547, 1044.27458568;

  expected_out_array << 7.42424742367e-23, 1.02004297287e-34, 0.0130155240738,
      1.02004297287e-34, 1.02004297287e-34, 1.96505168277e-13, 0.525575786243,
      0.713903991771, 2.32077561808e-14, 0.000179348049886, 0.635500453302,
      1.27561284917, 0.878125852156, 0.41565819538, 1.03606488534,
      0.885964824887, 1.16424049334, 1.10764479598, 1.04590810812,
      1.04193666963, 0.965193152414, 0.976217589464, 0.93008035061,
      0.98153216096, 0.909196397698, 0.98434963993, 0.984738050206,
      1.00106492525, 0.97734200649, 1.02198794179;

  for (int i = 0; i < 30; ++i) {
    in_alpha(i) = in_alpha_array(i);
    in_sample(i) = in_sample_array(i);
    expected_out(i) = expected_out_array(i);
  }

  std::size_t bytes = in_alpha.size() * sizeof(Scalar);

  Scalar* d_alpha;
  Scalar* d_sample;
  Scalar* d_out;
  gpuMalloc((void**)(&d_alpha), bytes);
  gpuMalloc((void**)(&d_sample), bytes);
  gpuMalloc((void**)(&d_out), bytes);

  gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice);
  gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice);

  Eigen::GpuStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);

  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_sample(d_sample, 30);
  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 30);

  gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample);

  assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost,
                        gpu_device.stream()) == gpuSuccess);
  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);

  for (int i = 0; i < 30; ++i) {
    VERIFY_IS_APPROX(out(i), expected_out(i));
  }

  gpuFree(d_alpha);
  gpuFree(d_sample);
  gpuFree(d_out);
}
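// Test driver. The tests are grouped into CALL_SUBTEST_1..6 so that Eigen's
// test harness can split this large test into smaller parts; the special
// function tests are gated on EIGEN_GPU_TEST_C99_MATH, and a few groups are
// skipped on HIP (see the comments below).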
EIGEN_DECLARE_TEST(cxx11_tensor_gpu)
{
  CALL_SUBTEST_1(test_gpu_nullary());
  CALL_SUBTEST_1(test_gpu_elementwise_small());
  CALL_SUBTEST_1(test_gpu_elementwise());
  CALL_SUBTEST_1(test_gpu_props());
  CALL_SUBTEST_1(test_gpu_reduction());

  CALL_SUBTEST_2(test_gpu_contraction<ColMajor>());
  CALL_SUBTEST_2(test_gpu_contraction<RowMajor>());

  CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d());
  CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d());
  CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>());
#if !defined(EIGEN_USE_HIP)
  // Disable these tests on HIP for now: they hang and need to be investigated
  // and fixed.
  CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>());
  CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>());
#endif
#if EIGEN_GPU_TEST_C99_MATH
  // std::erf, std::erfc, and so on were only added in C++11. We use them as a
  // golden reference to validate the results produced by Eigen, so these tests
  // can only run with a C++11 compiler.
  CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01));
  CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001));

  CALL_SUBTEST_4(test_gpu_erf<float>(1.0f));
  CALL_SUBTEST_4(test_gpu_erf<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_erf<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_erf<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f));
  // CALL_SUBTEST(test_gpu_erfc<float>(100.0f));
  CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f));  // GPU erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f));
  CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f));

  CALL_SUBTEST_4(test_gpu_erf<double>(1.0));
  CALL_SUBTEST_4(test_gpu_erf<double>(100.0));
  CALL_SUBTEST_4(test_gpu_erf<double>(0.01));
  CALL_SUBTEST_4(test_gpu_erf<double>(0.001));

  CALL_SUBTEST_4(test_gpu_erfc<double>(1.0));
  // CALL_SUBTEST(test_gpu_erfc<double>(100.0));
  CALL_SUBTEST_4(test_gpu_erfc<double>(5.0));  // GPU erfc lacks precision for large inputs
  CALL_SUBTEST_4(test_gpu_erfc<double>(0.01));
  CALL_SUBTEST_4(test_gpu_erfc<double>(0.001));
#if !defined(EIGEN_USE_HIP)
  // Disable these tests on HIP for now.
  CALL_SUBTEST_5(test_gpu_ndtri<float>());
  CALL_SUBTEST_5(test_gpu_ndtri<double>());
  CALL_SUBTEST_5(test_gpu_digamma<float>());
  CALL_SUBTEST_5(test_gpu_digamma<double>());
  CALL_SUBTEST_5(test_gpu_polygamma<float>());
  CALL_SUBTEST_5(test_gpu_polygamma<double>());
  CALL_SUBTEST_5(test_gpu_zeta<float>());
  CALL_SUBTEST_5(test_gpu_zeta<double>());
#endif

  CALL_SUBTEST_5(test_gpu_igamma<float>());
  CALL_SUBTEST_5(test_gpu_igammac<float>());
  CALL_SUBTEST_5(test_gpu_igamma<double>());
  CALL_SUBTEST_5(test_gpu_igammac<double>());
#if !defined(EIGEN_USE_HIP)
  // Disable these tests on HIP for now.
  CALL_SUBTEST_6(test_gpu_betainc<float>());
  CALL_SUBTEST_6(test_gpu_betainc<double>());

  CALL_SUBTEST_6(test_gpu_i0e<float>());
  CALL_SUBTEST_6(test_gpu_i0e<double>());

  CALL_SUBTEST_6(test_gpu_i1e<float>());
  CALL_SUBTEST_6(test_gpu_i1e<double>());

  CALL_SUBTEST_6(test_gpu_igamma_der_a<float>());
  CALL_SUBTEST_6(test_gpu_igamma_der_a<double>());

  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>());
  CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>());
#endif
#endif
}