spmv_benchmark.cc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445
  1. // Ceres Solver - A fast non-linear least squares minimizer
  2. // Copyright 2023 Google Inc. All rights reserved.
  3. // http://ceres-solver.org/
  4. //
  5. // Redistribution and use in source and binary forms, with or without
  6. // modification, are permitted provided that the following conditions are met:
  7. //
  8. // * Redistributions of source code must retain the above copyright notice,
  9. // this list of conditions and the following disclaimer.
  10. // * Redistributions in binary form must reproduce the above copyright notice,
  11. // this list of conditions and the following disclaimer in the documentation
  12. // and/or other materials provided with the distribution.
  13. // * Neither the name of Google Inc. nor the names of its contributors may be
  14. // used to endorse or promote products derived from this software without
  15. // specific prior written permission.
  16. //
  17. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  18. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  21. // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  22. // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  23. // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  24. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  25. // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  26. // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  27. // POSSIBILITY OF SUCH DAMAGE.
  28. //
  29. // Authors: joydeepb@cs.utexas.edu (Joydeep Biswas)
  30. #include <memory>
  31. #include <random>
  32. #include <string>
  33. #include "Eigen/Dense"
  34. #include "benchmark/benchmark.h"
  35. #include "ceres/block_jacobi_preconditioner.h"
  36. #include "ceres/block_sparse_matrix.h"
  37. #include "ceres/context_impl.h"
  38. #include "ceres/cuda_sparse_matrix.h"
  39. #include "ceres/cuda_vector.h"
  40. #include "ceres/fake_bundle_adjustment_jacobian.h"
  41. #include "ceres/internal/config.h"
  42. #include "ceres/internal/eigen.h"
  43. #include "ceres/linear_solver.h"
  44. #ifndef CERES_NO_CUDA
  45. #include "cuda_runtime.h"
  46. #endif
  47. namespace ceres::internal {
  48. constexpr int kNumCameras = 1000;
  49. constexpr int kNumPoints = 10000;
  50. constexpr int kCameraSize = 6;
  51. constexpr int kPointSize = 3;
  52. constexpr double kVisibility = 0.1;
  53. constexpr int kNumRowBlocks = 100000;
  54. constexpr int kNumColBlocks = 10000;
  55. constexpr int kMinRowBlockSize = 1;
  56. constexpr int kMaxRowBlockSize = 5;
  57. constexpr int kMinColBlockSize = 1;
  58. constexpr int kMaxColBlockSize = 15;
  59. constexpr double kBlockDensity = 5.0 / kNumColBlocks;
  60. static void BM_BlockSparseRightMultiplyAndAccumulateBA(
  61. benchmark::State& state) {
  62. const int num_threads = static_cast<int>(state.range(0));
  63. std::mt19937 prng;
  64. auto jacobian = CreateFakeBundleAdjustmentJacobian(
  65. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  66. ContextImpl context;
  67. context.EnsureMinimumThreads(num_threads);
  68. Vector x(jacobian->num_cols());
  69. Vector y(jacobian->num_rows());
  70. x.setRandom();
  71. y.setRandom();
  72. double sum = 0;
  73. for (auto _ : state) {
  74. jacobian->RightMultiplyAndAccumulate(
  75. x.data(), y.data(), &context, num_threads);
  76. sum += y.norm();
  77. }
  78. CHECK_NE(sum, 0.0);
  79. }
  80. BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateBA)
  81. ->Arg(1)
  82. ->Arg(2)
  83. ->Arg(4)
  84. ->Arg(8)
  85. ->Arg(16);
  86. static void BM_BlockSparseRightMultiplyAndAccumulateUnstructured(
  87. benchmark::State& state) {
  88. const int num_threads = static_cast<int>(state.range(0));
  89. BlockSparseMatrix::RandomMatrixOptions options;
  90. options.num_row_blocks = kNumRowBlocks;
  91. options.num_col_blocks = kNumColBlocks;
  92. options.min_row_block_size = kMinRowBlockSize;
  93. options.min_col_block_size = kMinColBlockSize;
  94. options.max_row_block_size = kMaxRowBlockSize;
  95. options.max_col_block_size = kMaxColBlockSize;
  96. options.block_density = kBlockDensity;
  97. std::mt19937 prng;
  98. auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  99. ContextImpl context;
  100. context.EnsureMinimumThreads(num_threads);
  101. Vector x(jacobian->num_cols());
  102. Vector y(jacobian->num_rows());
  103. x.setRandom();
  104. y.setRandom();
  105. double sum = 0;
  106. for (auto _ : state) {
  107. jacobian->RightMultiplyAndAccumulate(
  108. x.data(), y.data(), &context, num_threads);
  109. sum += y.norm();
  110. }
  111. CHECK_NE(sum, 0.0);
  112. }
  113. BENCHMARK(BM_BlockSparseRightMultiplyAndAccumulateUnstructured)
  114. ->Arg(1)
  115. ->Arg(2)
  116. ->Arg(4)
  117. ->Arg(8)
  118. ->Arg(16);
  119. static void BM_BlockSparseLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  120. std::mt19937 prng;
  121. auto jacobian = CreateFakeBundleAdjustmentJacobian(
  122. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  123. Vector x(jacobian->num_rows());
  124. Vector y(jacobian->num_cols());
  125. x.setRandom();
  126. y.setRandom();
  127. double sum = 0;
  128. for (auto _ : state) {
  129. jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
  130. sum += y.norm();
  131. }
  132. CHECK_NE(sum, 0.0);
  133. }
  134. BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateBA);
  135. static void BM_BlockSparseLeftMultiplyAndAccumulateUnstructured(
  136. benchmark::State& state) {
  137. BlockSparseMatrix::RandomMatrixOptions options;
  138. options.num_row_blocks = 100000;
  139. options.num_col_blocks = 10000;
  140. options.min_row_block_size = 1;
  141. options.min_col_block_size = 1;
  142. options.max_row_block_size = 10;
  143. options.max_col_block_size = 15;
  144. options.block_density = 5.0 / options.num_col_blocks;
  145. std::mt19937 prng;
  146. auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  147. Vector x(jacobian->num_rows());
  148. Vector y(jacobian->num_cols());
  149. x.setRandom();
  150. y.setRandom();
  151. double sum = 0;
  152. for (auto _ : state) {
  153. jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
  154. sum += y.norm();
  155. }
  156. CHECK_NE(sum, 0.0);
  157. }
  158. BENCHMARK(BM_BlockSparseLeftMultiplyAndAccumulateUnstructured);
  159. static void BM_CRSRightMultiplyAndAccumulateBA(benchmark::State& state) {
  160. const int num_threads = static_cast<int>(state.range(0));
  161. std::mt19937 prng;
  162. auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
  163. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  164. auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();
  165. ContextImpl context;
  166. context.EnsureMinimumThreads(num_threads);
  167. Vector x(jacobian->num_cols());
  168. Vector y(jacobian->num_rows());
  169. x.setRandom();
  170. y.setRandom();
  171. double sum = 0;
  172. for (auto _ : state) {
  173. jacobian->RightMultiplyAndAccumulate(
  174. x.data(), y.data(), &context, num_threads);
  175. sum += y.norm();
  176. }
  177. CHECK_NE(sum, 0.0);
  178. }
  179. BENCHMARK(BM_CRSRightMultiplyAndAccumulateBA)
  180. ->Arg(1)
  181. ->Arg(2)
  182. ->Arg(4)
  183. ->Arg(8)
  184. ->Arg(16);
  185. static void BM_CRSRightMultiplyAndAccumulateUnstructured(
  186. benchmark::State& state) {
  187. const int num_threads = static_cast<int>(state.range(0));
  188. BlockSparseMatrix::RandomMatrixOptions options;
  189. options.num_row_blocks = kNumRowBlocks;
  190. options.num_col_blocks = kNumColBlocks;
  191. options.min_row_block_size = kMinRowBlockSize;
  192. options.min_col_block_size = kMinColBlockSize;
  193. options.max_row_block_size = kMaxRowBlockSize;
  194. options.max_col_block_size = kMaxColBlockSize;
  195. options.block_density = kBlockDensity;
  196. std::mt19937 prng;
  197. auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  198. auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();
  199. ContextImpl context;
  200. context.EnsureMinimumThreads(num_threads);
  201. Vector x(jacobian->num_cols());
  202. Vector y(jacobian->num_rows());
  203. x.setRandom();
  204. y.setRandom();
  205. double sum = 0;
  206. for (auto _ : state) {
  207. jacobian->RightMultiplyAndAccumulate(
  208. x.data(), y.data(), &context, num_threads);
  209. sum += y.norm();
  210. }
  211. CHECK_NE(sum, 0.0);
  212. }
  213. BENCHMARK(BM_CRSRightMultiplyAndAccumulateUnstructured)
  214. ->Arg(1)
  215. ->Arg(2)
  216. ->Arg(4)
  217. ->Arg(8)
  218. ->Arg(16);
  219. static void BM_CRSLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  220. std::mt19937 prng;
  221. // Perform setup here
  222. auto bsm_jacobian = CreateFakeBundleAdjustmentJacobian(
  223. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  224. auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();
  225. Vector x(jacobian->num_rows());
  226. Vector y(jacobian->num_cols());
  227. x.setRandom();
  228. y.setRandom();
  229. double sum = 0;
  230. for (auto _ : state) {
  231. // This code gets timed
  232. jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
  233. sum += y.norm();
  234. }
  235. CHECK_NE(sum, 0.0);
  236. }
  237. BENCHMARK(BM_CRSLeftMultiplyAndAccumulateBA);
  238. static void BM_CRSLeftMultiplyAndAccumulateUnstructured(
  239. benchmark::State& state) {
  240. BlockSparseMatrix::RandomMatrixOptions options;
  241. options.num_row_blocks = kNumRowBlocks;
  242. options.num_col_blocks = kNumColBlocks;
  243. options.min_row_block_size = kMinRowBlockSize;
  244. options.min_col_block_size = kMinColBlockSize;
  245. options.max_row_block_size = kMaxRowBlockSize;
  246. options.max_col_block_size = kMaxColBlockSize;
  247. options.block_density = kBlockDensity;
  248. std::mt19937 prng;
  249. auto bsm_jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  250. auto jacobian = bsm_jacobian->ToCompressedRowSparseMatrix();
  251. Vector x(jacobian->num_rows());
  252. Vector y(jacobian->num_cols());
  253. x.setRandom();
  254. y.setRandom();
  255. double sum = 0;
  256. for (auto _ : state) {
  257. // This code gets timed
  258. jacobian->LeftMultiplyAndAccumulate(x.data(), y.data());
  259. sum += y.norm();
  260. }
  261. CHECK_NE(sum, 0.0);
  262. }
  263. BENCHMARK(BM_CRSLeftMultiplyAndAccumulateUnstructured);
  264. #ifndef CERES_NO_CUDA
  265. static void BM_CudaRightMultiplyAndAccumulateBA(benchmark::State& state) {
  266. std::mt19937 prng;
  267. auto jacobian = CreateFakeBundleAdjustmentJacobian(
  268. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  269. ContextImpl context;
  270. std::string message;
  271. context.InitCuda(&message);
  272. auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  273. CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  274. CudaVector cuda_x(&context, 0);
  275. CudaVector cuda_y(&context, 0);
  276. Vector x(jacobian->num_cols());
  277. Vector y(jacobian->num_rows());
  278. x.setRandom();
  279. y.setRandom();
  280. cuda_x.CopyFromCpu(x);
  281. cuda_y.CopyFromCpu(y);
  282. double sum = 0;
  283. for (auto _ : state) {
  284. cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  285. sum += cuda_y.Norm();
  286. CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  287. }
  288. CHECK_NE(sum, 0.0);
  289. }
  290. BENCHMARK(BM_CudaRightMultiplyAndAccumulateBA);
  291. static void BM_CudaRightMultiplyAndAccumulateUnstructured(
  292. benchmark::State& state) {
  293. BlockSparseMatrix::RandomMatrixOptions options;
  294. options.num_row_blocks = kNumRowBlocks;
  295. options.num_col_blocks = kNumColBlocks;
  296. options.min_row_block_size = kMinRowBlockSize;
  297. options.min_col_block_size = kMinColBlockSize;
  298. options.max_row_block_size = kMaxRowBlockSize;
  299. options.max_col_block_size = kMaxColBlockSize;
  300. options.block_density = kBlockDensity;
  301. std::mt19937 prng;
  302. auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  303. ContextImpl context;
  304. std::string message;
  305. context.InitCuda(&message);
  306. auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  307. CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  308. CudaVector cuda_x(&context, 0);
  309. CudaVector cuda_y(&context, 0);
  310. Vector x(jacobian->num_cols());
  311. Vector y(jacobian->num_rows());
  312. x.setRandom();
  313. y.setRandom();
  314. cuda_x.CopyFromCpu(x);
  315. cuda_y.CopyFromCpu(y);
  316. double sum = 0;
  317. for (auto _ : state) {
  318. cuda_jacobian.RightMultiplyAndAccumulate(cuda_x, &cuda_y);
  319. sum += cuda_y.Norm();
  320. CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  321. }
  322. CHECK_NE(sum, 0.0);
  323. }
  324. BENCHMARK(BM_CudaRightMultiplyAndAccumulateUnstructured);
  325. static void BM_CudaLeftMultiplyAndAccumulateBA(benchmark::State& state) {
  326. std::mt19937 prng;
  327. auto jacobian = CreateFakeBundleAdjustmentJacobian(
  328. kNumCameras, kNumPoints, kCameraSize, kPointSize, kVisibility, prng);
  329. ContextImpl context;
  330. std::string message;
  331. context.InitCuda(&message);
  332. auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  333. CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  334. CudaVector cuda_x(&context, 0);
  335. CudaVector cuda_y(&context, 0);
  336. Vector x(jacobian->num_rows());
  337. Vector y(jacobian->num_cols());
  338. x.setRandom();
  339. y.setRandom();
  340. cuda_x.CopyFromCpu(x);
  341. cuda_y.CopyFromCpu(y);
  342. double sum = 0;
  343. for (auto _ : state) {
  344. cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  345. sum += cuda_y.Norm();
  346. CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  347. }
  348. CHECK_NE(sum, 0.0);
  349. }
  350. BENCHMARK(BM_CudaLeftMultiplyAndAccumulateBA);
  351. static void BM_CudaLeftMultiplyAndAccumulateUnstructured(
  352. benchmark::State& state) {
  353. BlockSparseMatrix::RandomMatrixOptions options;
  354. options.num_row_blocks = kNumRowBlocks;
  355. options.num_col_blocks = kNumColBlocks;
  356. options.min_row_block_size = kMinRowBlockSize;
  357. options.min_col_block_size = kMinColBlockSize;
  358. options.max_row_block_size = kMaxRowBlockSize;
  359. options.max_col_block_size = kMaxColBlockSize;
  360. options.block_density = kBlockDensity;
  361. std::mt19937 prng;
  362. auto jacobian = BlockSparseMatrix::CreateRandomMatrix(options, prng);
  363. ContextImpl context;
  364. std::string message;
  365. context.InitCuda(&message);
  366. auto jacobian_crs = jacobian->ToCompressedRowSparseMatrix();
  367. CudaSparseMatrix cuda_jacobian(&context, *jacobian_crs);
  368. CudaVector cuda_x(&context, 0);
  369. CudaVector cuda_y(&context, 0);
  370. Vector x(jacobian->num_rows());
  371. Vector y(jacobian->num_cols());
  372. x.setRandom();
  373. y.setRandom();
  374. cuda_x.CopyFromCpu(x);
  375. cuda_y.CopyFromCpu(y);
  376. double sum = 0;
  377. for (auto _ : state) {
  378. cuda_jacobian.LeftMultiplyAndAccumulate(cuda_x, &cuda_y);
  379. sum += cuda_y.Norm();
  380. CHECK_EQ(cudaDeviceSynchronize(), cudaSuccess);
  381. }
  382. CHECK_NE(sum, 0.0);
  383. }
  384. BENCHMARK(BM_CudaLeftMultiplyAndAccumulateUnstructured);
  385. #endif
  386. } // namespace ceres::internal
  387. BENCHMARK_MAIN();