cuda_kernels_bsm_to_crs.cu.cc

// Ceres Solver - A fast non-linear least squares minimizer
// Copyright 2023 Google Inc. All rights reserved.
// http://ceres-solver.org/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright notice,
//   this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright notice,
//   this list of conditions and the following disclaimer in the documentation
//   and/or other materials provided with the distribution.
// * Neither the name of Google Inc. nor the names of its contributors may be
//   used to endorse or promote products derived from this software without
//   specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Authors: dmitriy.korchemkin@gmail.com (Dmitriy Korchemkin)

#include "ceres/cuda_kernels_bsm_to_crs.h"

#include <cuda_runtime.h>
#include <thrust/execution_policy.h>
#include <thrust/scan.h>

#include "ceres/block_structure.h"
#include "ceres/cuda_kernels_utils.h"

namespace ceres {
namespace internal {

namespace {

inline auto ThrustCudaStreamExecutionPolicy(cudaStream_t stream) {
  // par_nosync execution policy was added in Thrust 1.16
  // https://github.com/NVIDIA/thrust/blob/main/CHANGELOG.md#thrust-1160
#if THRUST_VERSION < 101700
  return thrust::cuda::par.on(stream);
#else
  return thrust::cuda::par_nosync.on(stream);
#endif
}
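
// CudaMalloc and CudaFree below allocate and release device memory. On
// CUDA 11.2+ with memory pool support they use the stream-ordered allocation
// API (cudaMallocAsync / cudaFreeAsync), so the allocation and deallocation
// are ordered with respect to other work submitted to the given stream;
// otherwise they fall back to the synchronous cudaMalloc / cudaFree.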
void* CudaMalloc(size_t size,
                 cudaStream_t stream,
                 bool memory_pools_supported) {
  void* data = nullptr;
  // Stream-ordered allocation API is available since CUDA 11.2, but might not
  // be implemented by a particular device
#if CUDART_VERSION < 11020
#warning \
    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+"
  cudaMalloc(&data, size);
#else
  if (memory_pools_supported) {
    cudaMallocAsync(&data, size, stream);
  } else {
    cudaMalloc(&data, size);
  }
#endif
  return data;
}

void CudaFree(void* data, cudaStream_t stream, bool memory_pools_supported) {
  // Stream-ordered allocation API is available since CUDA 11.2, but might not
  // be implemented by a particular device
#if CUDART_VERSION < 11020
#warning \
    "Stream-ordered allocations are unavailable, consider updating CUDA toolkit to version 11.2+"
  cudaFree(data);
#else
  if (memory_pools_supported) {
    cudaFreeAsync(data, stream);
  } else {
    cudaFree(data);
  }
#endif
}

template <typename T>
T* CudaAllocate(size_t num_elements,
                cudaStream_t stream,
                bool memory_pools_supported) {
  T* data = static_cast<T*>(
      CudaMalloc(num_elements * sizeof(T), stream, memory_pools_supported));
  return data;
}
}  // namespace

// Fill row block id and nnz for each row using block-sparse structure
// represented by a set of flat arrays.
// Inputs:
// - num_row_blocks: number of row-blocks in block-sparse structure
// - first_cell_in_row_block: index of the first cell of the row-block; size:
//   num_row_blocks + 1
// - cells: cells of block-sparse structure as a contiguous array
// - row_blocks: row blocks of block-sparse structure stored sequentially
// - col_blocks: column blocks of block-sparse structure stored sequentially
// Outputs:
// - rows: rows[i + 1] will contain number of non-zeros in i-th row, rows[0]
//   will be set to 0; rows are filled with a shift by one element in order
//   to obtain row-index array of CRS matrix with an inclusive scan afterwards
// - row_block_ids: row_block_ids[i] will be set to index of row-block that
//   contains i-th row.
// Computation is performed row-block-wise.
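//
// For illustration (non-partitioned case), consider a matrix with two
// row-blocks: the first of size 2 with a single cell in a column-block of
// size 3, the second of size 1 with cells in column-blocks of sizes 3 and 2.
// The kernel then produces
//   rows_f        = [0, 3, 3, 5]
//   row_block_ids = [0, 0, 1]
// and the inclusive scan performed afterwards turns rows_f into the CRS
// row-index array [0, 3, 6, 11].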
template <bool partitioned = false>
__global__ void RowBlockIdAndNNZ(
    const int num_row_blocks,
    const int num_col_blocks_e,
    const int num_row_blocks_e,
    const int* __restrict__ first_cell_in_row_block,
    const Cell* __restrict__ cells,
    const Block* __restrict__ row_blocks,
    const Block* __restrict__ col_blocks,
    int* __restrict__ rows_e,
    int* __restrict__ rows_f,
    int* __restrict__ row_block_ids) {
  const int row_block_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (row_block_id > num_row_blocks) {
    // No synchronization is performed in this kernel, thus it is safe to return
    return;
  }
  if (row_block_id == num_row_blocks) {
    // one extra thread sets the first element
    rows_f[0] = 0;
    if constexpr (partitioned) {
      rows_e[0] = 0;
    }
    return;
  }
  const auto& row_block = row_blocks[row_block_id];
  auto first_cell = cells + first_cell_in_row_block[row_block_id];
  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
  int row_nnz_e = 0;
  if (partitioned && row_block_id < num_row_blocks_e) {
    // First cell is a cell from E
    row_nnz_e = col_blocks[first_cell->block_id].size;
    ++first_cell;
  }
  int row_nnz_f = 0;
  for (auto cell = first_cell; cell < last_cell; ++cell) {
    row_nnz_f += col_blocks[cell->block_id].size;
  }
  const int first_row = row_block.position;
  const int last_row = first_row + row_block.size;
  for (int i = first_row; i < last_row; ++i) {
    if constexpr (partitioned) {
      rows_e[i + 1] = row_nnz_e;
    }
    rows_f[i + 1] = row_nnz_f;
    row_block_ids[i] = row_block_id;
  }
}

// Row-wise creation of CRS structure
// Inputs:
// - num_rows: number of rows in matrix
// - first_cell_in_row_block: index of the first cell of the row-block; size:
//   num_row_blocks + 1
// - cells: cells of block-sparse structure as a contiguous array
// - row_blocks: row blocks of block-sparse structure stored sequentially
// - col_blocks: column blocks of block-sparse structure stored sequentially
// - row_block_ids: index of row-block that corresponds to row
// - rows: row-index array of CRS structure
// Outputs:
// - cols: column-index array of CRS structure
// Computation is performed row-wise.
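//
// Note that in the partitioned case the columns of the F sub-matrix are
// re-based to zero by subtracting the total width of the E column-blocks
// (col_blocks[num_col_blocks_e].position); in the non-partitioned case
// num_col_blocks_e is 0 and the subtracted value is col_blocks[0].position,
// i.e. 0.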
template <bool partitioned>
__global__ void ComputeColumns(const int num_rows,
                               const int num_row_blocks_e,
                               const int num_col_blocks_e,
                               const int* __restrict__ first_cell_in_row_block,
                               const Cell* __restrict__ cells,
                               const Block* __restrict__ row_blocks,
                               const Block* __restrict__ col_blocks,
                               const int* __restrict__ row_block_ids,
                               const int* __restrict__ rows_e,
                               int* __restrict__ cols_e,
                               const int* __restrict__ rows_f,
                               int* __restrict__ cols_f) {
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  if (row >= num_rows) {
    // No synchronization is performed in this kernel, thus it is safe to return
    return;
  }
  const int row_block_id = row_block_ids[row];
  // position in crs matrix
  auto first_cell = cells + first_cell_in_row_block[row_block_id];
  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
  const int num_cols_e = col_blocks[num_col_blocks_e].position;
  // For each cell of the row-block only the current row is filled
  if (partitioned && row_block_id < num_row_blocks_e) {
    // The first cell is a cell from E
    const auto& col_block = col_blocks[first_cell->block_id];
    const int col_block_size = col_block.size;
    int column_idx = col_block.position;
    int crs_position_e = rows_e[row];
    // Column indices for each element of row_in_block row of current cell
    for (int i = 0; i < col_block_size; ++i, ++crs_position_e) {
      cols_e[crs_position_e] = column_idx++;
    }
    ++first_cell;
  }
  int crs_position_f = rows_f[row];
  for (auto cell = first_cell; cell < last_cell; ++cell) {
    const auto& col_block = col_blocks[cell->block_id];
    const int col_block_size = col_block.size;
    int column_idx = col_block.position - num_cols_e;
    // Column indices for each element of row_in_block row of current cell
    for (int i = 0; i < col_block_size; ++i, ++crs_position_f) {
      cols_f[crs_position_f] = column_idx++;
    }
  }
}

void FillCRSStructure(const int num_row_blocks,
                      const int num_rows,
                      const int* first_cell_in_row_block,
                      const Cell* cells,
                      const Block* row_blocks,
                      const Block* col_blocks,
                      int* rows,
                      int* cols,
                      cudaStream_t stream,
                      bool memory_pools_supported) {
  // Set number of non-zeros per row in rows array and row to row-block map in
  // row_block_ids array
  int* row_block_ids =
      CudaAllocate<int>(num_rows, stream, memory_pools_supported);
  const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
  RowBlockIdAndNNZ<false><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
      num_row_blocks,
      0,
      0,
      first_cell_in_row_block,
      cells,
      row_blocks,
      col_blocks,
      nullptr,
      rows,
      row_block_ids);
  // Finalize row-index array of CRS structure by computing prefix sum
  thrust::inclusive_scan(
      ThrustCudaStreamExecutionPolicy(stream), rows, rows + num_rows + 1, rows);
  // Fill cols array of CRS structure
  const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
  ComputeColumns<false><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
      num_rows,
      0,
      0,
      first_cell_in_row_block,
      cells,
      row_blocks,
      col_blocks,
      row_block_ids,
      nullptr,
      nullptr,
      rows,
      cols);
  CudaFree(row_block_ids, stream, memory_pools_supported);
}

void FillCRSStructurePartitioned(const int num_row_blocks,
                                 const int num_rows,
                                 const int num_row_blocks_e,
                                 const int num_col_blocks_e,
                                 const int num_nonzeros_e,
                                 const int* first_cell_in_row_block,
                                 const Cell* cells,
                                 const Block* row_blocks,
                                 const Block* col_blocks,
                                 int* rows_e,
                                 int* cols_e,
                                 int* rows_f,
                                 int* cols_f,
                                 cudaStream_t stream,
                                 bool memory_pools_supported) {
  // Set number of non-zeros per row in rows array and row to row-block map in
  // row_block_ids array
  int* row_block_ids =
      CudaAllocate<int>(num_rows, stream, memory_pools_supported);
  const int num_blocks_blockwise = NumBlocksInGrid(num_row_blocks + 1);
  RowBlockIdAndNNZ<true><<<num_blocks_blockwise, kCudaBlockSize, 0, stream>>>(
      num_row_blocks,
      num_col_blocks_e,
      num_row_blocks_e,
      first_cell_in_row_block,
      cells,
      row_blocks,
      col_blocks,
      rows_e,
      rows_f,
      row_block_ids);
  // Finalize row-index arrays of CRS structure by computing prefix sums
  thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
                         rows_e,
                         rows_e + num_rows + 1,
                         rows_e);
  thrust::inclusive_scan(ThrustCudaStreamExecutionPolicy(stream),
                         rows_f,
                         rows_f + num_rows + 1,
                         rows_f);
  // Fill cols array of CRS structure
  const int num_blocks_rowwise = NumBlocksInGrid(num_rows);
  ComputeColumns<true><<<num_blocks_rowwise, kCudaBlockSize, 0, stream>>>(
      num_rows,
      num_row_blocks_e,
      num_col_blocks_e,
      first_cell_in_row_block,
      cells,
      row_blocks,
      col_blocks,
      row_block_ids,
      rows_e,
      cols_e,
      rows_f,
      cols_f);
  CudaFree(row_block_ids, stream, memory_pools_supported);
}
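
// Binary search for the partition point of data[first, last): assuming the
// predicate is true on a prefix of the range and false on the remainder,
// returns the index of the first element for which the predicate is false
// (or last if the predicate holds for the whole range).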
template <typename T, typename Predicate>
__device__ int PartitionPoint(const T* data,
                              int first,
                              int last,
                              Predicate&& predicate) {
  if (!predicate(data[first])) {
    return first;
  }
  while (last - first > 1) {
    const auto midpoint = first + (last - first) / 2;
    if (predicate(data[midpoint])) {
      first = midpoint;
    } else {
      last = midpoint;
    }
  }
  return last;
}

// Element-wise reordering of block-sparse values
// - first_cell_in_row_block - position of the first cell of row-block
// - block_sparse_values - segment of block-sparse values starting from
//   block_sparse_offset, containing num_values
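//
// Each thread handles a single value: its row-block is located with a binary
// search over the per-row-block value offsets (first cell positions in the
// non-partitioned case, value_offset_row_block_f for the F part of a
// partitioned matrix), and the containing cell is then found with a linear
// scan over the cells of that row-block.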
template <bool partitioned>
__global__ void PermuteToCrsKernel(
    const int block_sparse_offset,
    const int num_values,
    const int num_row_blocks,
    const int num_row_blocks_e,
    const int* __restrict__ first_cell_in_row_block,
    const int* __restrict__ value_offset_row_block_f,
    const Cell* __restrict__ cells,
    const Block* __restrict__ row_blocks,
    const Block* __restrict__ col_blocks,
    const int* __restrict__ crs_rows,
    const double* __restrict__ block_sparse_values,
    double* __restrict__ crs_values) {
  const int value_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (value_id >= num_values) {
    return;
  }
  const int block_sparse_value_id = value_id + block_sparse_offset;
  // Find the corresponding row-block with a binary search
  const int row_block_id =
      (partitioned
           ? PartitionPoint(value_offset_row_block_f,
                            0,
                            num_row_blocks,
                            [block_sparse_value_id] __device__(
                                const int row_block_offset) {
                              return row_block_offset <= block_sparse_value_id;
                            })
           : PartitionPoint(first_cell_in_row_block,
                            0,
                            num_row_blocks,
                            [cells, block_sparse_value_id] __device__(
                                const int row_block_offset) {
                              return cells[row_block_offset].position <=
                                     block_sparse_value_id;
                            })) -
      1;
  // Find cell and calculate offset within the row with a linear scan
  const auto& row_block = row_blocks[row_block_id];
  auto first_cell = cells + first_cell_in_row_block[row_block_id];
  const auto last_cell = cells + first_cell_in_row_block[row_block_id + 1];
  const int row_block_size = row_block.size;
  int num_cols_before = 0;
  if (partitioned && row_block_id < num_row_blocks_e) {
    ++first_cell;
  }
  for (const Cell* cell = first_cell; cell < last_cell; ++cell) {
    const auto& col_block = col_blocks[cell->block_id];
    const int col_block_size = col_block.size;
    const int cell_size = row_block_size * col_block_size;
    if (cell->position + cell_size > block_sparse_value_id) {
      const int pos_in_cell = block_sparse_value_id - cell->position;
      const int row_in_cell = pos_in_cell / col_block_size;
      const int col_in_cell = pos_in_cell % col_block_size;
      const int row = row_in_cell + row_block.position;
      crs_values[crs_rows[row] + num_cols_before + col_in_cell] =
          block_sparse_values[value_id];
      break;
    }
    num_cols_before += col_block_size;
  }
}

void PermuteToCRS(const int block_sparse_offset,
                  const int num_values,
                  const int num_row_blocks,
                  const int* first_cell_in_row_block,
                  const Cell* cells,
                  const Block* row_blocks,
                  const Block* col_blocks,
                  const int* crs_rows,
                  const double* block_sparse_values,
                  double* crs_values,
                  cudaStream_t stream) {
  const int num_blocks_valuewise = NumBlocksInGrid(num_values);
  PermuteToCrsKernel<false>
      <<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
          block_sparse_offset,
          num_values,
          num_row_blocks,
          0,
          first_cell_in_row_block,
          nullptr,
          cells,
          row_blocks,
          col_blocks,
          crs_rows,
          block_sparse_values,
          crs_values);
}

void PermuteToCRSPartitionedF(const int block_sparse_offset,
                              const int num_values,
                              const int num_row_blocks,
                              const int num_row_blocks_e,
                              const int* first_cell_in_row_block,
                              const int* value_offset_row_block_f,
                              const Cell* cells,
                              const Block* row_blocks,
                              const Block* col_blocks,
                              const int* crs_rows,
                              const double* block_sparse_values,
                              double* crs_values,
                              cudaStream_t stream) {
  const int num_blocks_valuewise = NumBlocksInGrid(num_values);
  PermuteToCrsKernel<true><<<num_blocks_valuewise, kCudaBlockSize, 0, stream>>>(
      block_sparse_offset,
      num_values,
      num_row_blocks,
      num_row_blocks_e,
      first_cell_in_row_block,
      value_offset_row_block_f,
      cells,
      row_blocks,
      col_blocks,
      crs_rows,
      block_sparse_values,
      crs_values);
}

}  // namespace internal
}  // namespace ceres