// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

// workaround issue between gcc >= 4.7 and cuda 5.5
#if (defined __GNUC__) && (__GNUC__>4 || __GNUC_MINOR__>=7)
  #undef _GLIBCXX_ATOMIC_BUILTINS
  #undef _GLIBCXX_USE_INT128
#endif

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "main.h"
#include "gpu_common.h"

// Check that dense modules can be properly parsed by nvcc
#include <Eigen/Dense>

// struct Foo{
//   EIGEN_DEVICE_FUNC
//   void operator()(int i, const float* mats, float* vecs) const {
//     using namespace Eigen;
//   //   Matrix3f M(data);
//   //   Vector3f x(data+9);
//   //   Map<Vector3f>(data+9) = M.inverse() * x;
//     Matrix3f M(mats+i/16);
//     Vector3f x(vecs+i*3);
//   //   using std::min;
//   //   using std::sqrt;
//     Map<Vector3f>(vecs+i*3) << x.minCoeff(), 1, 2;// / x.dot(x);//(M.inverse() * x) / x.x();
//     //x = x*2 + x.y() * x + x * x.maxCoeff() - x / x.sum();
//   }
// };
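
// Coefficient-wise kernel: evaluates the mixed scalar/array expression
// (in[0] * x1 + x2) .* x3 on device and accumulates it into the output block.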
template<typename T>
struct coeff_wise {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    T x2(in+i+1);
    T x3(in+i+2);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);

    res.array() += (in[0] * x1 + x2).array() * x3.array();
  }
};
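
// Complex square-root kernel: the instance with i == 0 first pushes a fixed
// list of IEEE special values (signed zeros, infinities, NaNs) through
// cwiseSqrt(); every instance then takes the coefficient-wise sqrt of its
// own input block.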
template<typename T>
struct complex_sqrt {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_special_inputs = 18;

    if (i == 0) {
      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
      SpecialInputs special_in;
      special_in.setZero();

      int idx = 0;
      special_in[idx++] = ComplexType(0, 0);
      special_in[idx++] = ComplexType(-0, 0);
      special_in[idx++] = ComplexType(0, -0);
      special_in[idx++] = ComplexType(-0, -0);
      // GCC's fallback sqrt implementation fails for inf inputs.
      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
      // clang includes the GCC header (which temporarily disables
      // _GLIBCXX_USE_C99_COMPLEX)
      #if !defined(_GLIBCXX_COMPLEX) || \
          (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
      const ValueType inf = std::numeric_limits<ValueType>::infinity();
      special_in[idx++] = ComplexType(1.0, inf);
      special_in[idx++] = ComplexType(nan, inf);
      special_in[idx++] = ComplexType(1.0, -inf);
      special_in[idx++] = ComplexType(nan, -inf);
      special_in[idx++] = ComplexType(-inf, 1.0);
      special_in[idx++] = ComplexType(inf, 1.0);
      special_in[idx++] = ComplexType(-inf, -1.0);
      special_in[idx++] = ComplexType(inf, -1.0);
      special_in[idx++] = ComplexType(-inf, nan);
      special_in[idx++] = ComplexType(inf, nan);
      #endif
      special_in[idx++] = ComplexType(1.0, nan);
      special_in[idx++] = ComplexType(nan, 1.0);
      special_in[idx++] = ComplexType(nan, -1.0);
      special_in[idx++] = ComplexType(nan, nan);

      Map<SpecialInputs> special_out(out);
      special_out = special_in.cwiseSqrt();
    }

    T x1(in + i);
    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
    res = x1.cwiseSqrt();
  }
};
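
// Exercises the full set of complex scalar and vector operators on device:
// unary +/-, mixed complex/real +,-,*,/, compound assignment, and (in)equality.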
template<typename T>
struct complex_operators {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef typename T::Scalar ComplexType;
    typedef typename T::Scalar::value_type ValueType;
    const int num_scalar_operators = 24;
    const int num_vector_operators = 23;  // no unary + operator.
    int out_idx = i * (num_scalar_operators + num_vector_operators * T::MaxSizeAtCompileTime);

    // Scalar operators.
    const ComplexType a = in[i];
    const ComplexType b = in[i + 1];

    out[out_idx++] = +a;
    out[out_idx++] = -a;

    out[out_idx++] = a + b;
    out[out_idx++] = a + numext::real(b);
    out[out_idx++] = numext::real(a) + b;
    out[out_idx++] = a - b;
    out[out_idx++] = a - numext::real(b);
    out[out_idx++] = numext::real(a) - b;
    out[out_idx++] = a * b;
    out[out_idx++] = a * numext::real(b);
    out[out_idx++] = numext::real(a) * b;
    out[out_idx++] = a / b;
    out[out_idx++] = a / numext::real(b);
    out[out_idx++] = numext::real(a) / b;

    out[out_idx] = a; out[out_idx++] += b;
    out[out_idx] = a; out[out_idx++] -= b;
    out[out_idx] = a; out[out_idx++] *= b;
    out[out_idx] = a; out[out_idx++] /= b;

    const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
    const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
    out[out_idx++] = (a == b ? true_value : false_value);
    out[out_idx++] = (a == numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) == b ? true_value : false_value);
    out[out_idx++] = (a != b ? true_value : false_value);
    out[out_idx++] = (a != numext::real(b) ? true_value : false_value);
    out[out_idx++] = (numext::real(a) != b ? true_value : false_value);

    // Vector versions.
    T x1(in + i);
    T x2(in + i + 1);
    const int res_size = T::MaxSizeAtCompileTime * num_scalar_operators;
    const int size = T::MaxSizeAtCompileTime;
    int block_idx = 0;

    Map<VectorX<ComplexType>> res(out + out_idx, res_size);

    res.segment(block_idx, size) = -x1;
    block_idx += size;
    res.segment(block_idx, size) = x1 + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 + x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() + x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1 - x2.real();
    block_idx += size;
    res.segment(block_idx, size) = x1.real() - x2;
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() * x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() * x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1.array() / x2.real().array();
    block_idx += size;
    res.segment(block_idx, size) = x1.real().array() / x2.array();
    block_idx += size;

    res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() *= x2.array();
    block_idx += size;
    res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array();
    block_idx += size;

    const T true_vector = T::Constant(true_value);
    const T false_vector = T::Constant(false_value);
    res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector);
    block_idx += size;
    // Mixing types in equality comparison does not work.
    // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector);
    // block_idx += size;
    res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector);
    block_idx += size;
    // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector);
    // block_idx += size;
    // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector);
    // block_idx += size;
  }
};
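
// Replicate kernel: checks Replicate expressions (full, colwise, rowwise)
// evaluated into dynamic-size Maps on device.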
template<typename T>
struct replicate {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T x1(in+i);
    int step   = x1.size() * 4;
    int stride = 3 * step;

    typedef Map<Array<typename T::Scalar,Dynamic,Dynamic> > MapType;
    MapType(out+i*stride+0*step, x1.rows()*2, x1.cols()*2) = x1.replicate(2,2);
    MapType(out+i*stride+1*step, x1.rows()*3, x1.cols()) = in[i] * x1.colwise().replicate(3);
    MapType(out+i*stride+2*step, x1.rows(), x1.cols()*3) = in[i] * x1.rowwise().replicate(3);
  }
};
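
// Checks device-side operator new/delete (both scalar and array forms) by
// round-tripping values through heap-allocated temporaries.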
template<typename T>
struct alloc_new_delete {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    int offset = 2*i*T::MaxSizeAtCompileTime;
    T* x = new T(in + offset);
    Eigen::Map<T> u(out + offset);
    u = *x;
    delete x;

    offset += T::MaxSizeAtCompileTime;
    T* y = new T[1];
    y[0] = T(in + offset);
    Eigen::Map<T> v(out + offset);
    v = y[0];
    delete[] y;
  }
};
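
// Reduction kernel: min/max/sum/prod plus norms and partial (col/row-wise)
// reductions, all evaluated on device.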
template<typename T>
struct redux {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    int N = 10;
    T x1(in+i);
    out[i*N+0] = x1.minCoeff();
    out[i*N+1] = x1.maxCoeff();
    out[i*N+2] = x1.sum();
    out[i*N+3] = x1.prod();
    out[i*N+4] = x1.matrix().squaredNorm();
    out[i*N+5] = x1.matrix().norm();
    out[i*N+6] = x1.colwise().sum().maxCoeff();
    out[i*N+7] = x1.rowwise().maxCoeff().sum();
    out[i*N+8] = x1.matrix().colwise().squaredNorm().sum();
  }
};
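
// Matrix product kernel: accumulates the scaled product in[i] * x1 * x2 into a
// mapped result whose size is deduced from the operands' compile-time dimensions.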
template<typename T1, typename T2>
struct prod_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T1::Scalar, T1::RowsAtCompileTime, T2::ColsAtCompileTime> T3;
    T1 x1(in+i);
    T2 x2(in+i+1);
    Map<T3> res(out+i*T3::MaxSizeAtCompileTime);
    res += in[i] * x1 * x2;
  }
};
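
// Diagonal extraction kernel: accumulates the diagonal of a square matrix
// into a mapped vector.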
template<typename T1, typename T2>
struct diagonal {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T1::Scalar* in, typename T1::Scalar* out) const
  {
    using namespace Eigen;
    T1 x1(in+i);
    Map<T2> res(out+i*T2::MaxSizeAtCompileTime);
    res += x1.diagonal();
  }
};
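
// Self-adjoint eigenvalue kernel using the closed-form computeDirect() path
// (available for small fixed-size matrices).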
template<typename T>
struct eigenvalues_direct {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.computeDirect(A);
    res = eig.eigenvalues();
  }
};
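
// Self-adjoint eigenvalue kernel using the iterative compute() path.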
template<typename T>
struct eigenvalues {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    typedef Matrix<typename T::Scalar, T::RowsAtCompileTime, 1> Vec;
    T M(in+i);
    Map<Vec> res(out+i*Vec::MaxSizeAtCompileTime);
    T A = M*M.adjoint();
    SelfAdjointEigenSolver<T> eig;
    eig.compute(A);
    res = eig.eigenvalues();
  }
};
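
// Fixed-size matrix inversion on device.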
template<typename T>
struct matrix_inverse {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    using namespace Eigen;
    T M(in+i);
    Map<T> res(out+i*T::MaxSizeAtCompileTime);
    res = M.inverse();
  }
};
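
// Writes epsilon/max/min/infinity/quiet_NaN from numext::numeric_limits<float>
// so the device values can be compared against the host ones.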
template<typename T>
struct numeric_limits_test {
  EIGEN_DEVICE_FUNC
  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
  {
    EIGEN_UNUSED_VARIABLE(in)
    int out_idx = i * 5;
    out[out_idx++] = numext::numeric_limits<float>::epsilon();
    out[out_idx++] = (numext::numeric_limits<float>::max)();
    out[out_idx++] = (numext::numeric_limits<float>::min)();
    out[out_idx++] = numext::numeric_limits<float>::infinity();
    out[out_idx++] = numext::numeric_limits<float>::quiet_NaN();
  }
};
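
// Approximate-equality check that also accepts matching NaNs, so kernels that
// intentionally produce NaN/inf can still be compared against the CPU reference.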
template<typename Type1, typename Type2>
bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen types only
{
  if (a.rows() != b.rows()) {
    return false;
  }
  if (a.cols() != b.cols()) {
    return false;
  }
  for (Index r = 0; r < a.rows(); ++r) {
    for (Index c = 0; c < a.cols(); ++c) {
      if (a(r, c) != b(r, c)
          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
          && !test_isApprox(a(r, c), b(r, c))) {
        return false;
      }
    }
  }
  return true;
}
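
// Runs a kernel on both CPU and GPU and compares the outputs with the
// NaN-tolerant check above.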
template<typename Kernel, typename Input, typename Output>
void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
{
  Output out_ref, out_gpu;
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  out_ref = out_gpu = out;
#else
  EIGEN_UNUSED_VARIABLE(in);
  EIGEN_UNUSED_VARIABLE(out);
#endif
  run_on_cpu(ker, n, in, out_ref);
  run_on_gpu(ker, n, in, out_gpu);
#if !defined(EIGEN_GPU_COMPILE_PHASE)
  verifyIsApproxWithInfsNans(out_ref, out_gpu);
#endif
}
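
// Test driver: input/output buffers are initialized on the host (outside the
// GPU compile phase), then each kernel is exercised via run_and_compare_to_gpu
// or test_with_infs_nans.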
EIGEN_DECLARE_TEST(gpu_basic)
{
  ei_test_init_gpu();

  int nthreads = 100;
  Eigen::VectorXf in, out;
  Eigen::VectorXcf cfin, cfout;

#if !defined(EIGEN_GPU_COMPILE_PHASE)
  int data_size = nthreads * 512;
  in.setRandom(data_size);
  out.setConstant(data_size, -1);
  cfin.setRandom(data_size);
  cfout.setConstant(data_size, -1);
#endif

  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Array44f>(), nthreads, in, out) );

#if !defined(EIGEN_USE_HIP)
  // FIXME
  // These subtests result in a compile failure on the HIP platform
  //
  //  eigen-upstream/Eigen/src/Core/Replicate.h:61:65: error:
  //    base class 'internal::dense_xpr_base<Replicate<Array<float, 4, 1, 0, 4, 1>, -1, -1> >::type'
  //    (aka 'ArrayBase<Eigen::Replicate<Eigen::Array<float, 4, 1, 0, 4, 1>, -1, -1> >') has protected default constructor
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(replicate<Array33f>(), nthreads, in, out) );

  // HIP does not support new/delete on device.
  CALL_SUBTEST( run_and_compare_to_gpu(alloc_new_delete<Vector3f>(), nthreads, in, out) );
#endif

  CALL_SUBTEST( run_and_compare_to_gpu(redux<Array4f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(redux<Matrix3f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix3f,Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(prod_test<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix3f,Vector3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(diagonal<Matrix4f,Vector4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix2f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(matrix_inverse<Matrix4f>(), nthreads, in, out) );

  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );

  // Test std::complex.
  CALL_SUBTEST( run_and_compare_to_gpu(complex_operators<Vector3cf>(), nthreads, cfin, cfout) );
  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );

  // numeric_limits
  CALL_SUBTEST( test_with_infs_nans(numeric_limits_test<Vector3f>(), 1, in, out) );

#if defined(__NVCC__)
  // FIXME
  // These subtests compile only with nvcc and fail with HIPCC and clang-cuda.
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix4f>(), nthreads, in, out) );
  typedef Matrix<float,6,6> Matrix6f;
  CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues<Matrix6f>(), nthreads, in, out) );
#endif
}