// decoder.cpp
  1. #include "decoder.h"
  2. #include <c10/util/Logging.h>
  3. #include <nppi_color_conversion.h>
  4. #include <cmath>
  5. #include <cstring>
  6. #include <unordered_map>
  7. static float chroma_height_factor(cudaVideoSurfaceFormat surface_format) {
  8. return (surface_format == cudaVideoSurfaceFormat_YUV444 ||
  9. surface_format == cudaVideoSurfaceFormat_YUV444_16Bit)
  10. ? 1.0
  11. : 0.5;
  12. }
  13. static int chroma_plane_count(cudaVideoSurfaceFormat surface_format) {
  14. return (surface_format == cudaVideoSurfaceFormat_YUV444 ||
  15. surface_format == cudaVideoSurfaceFormat_YUV444_16Bit)
  16. ? 2
  17. : 1;
  18. }
  19. /* Initialise cu_context and video_codec, create context lock and create parser
  20. * object.
  21. */
  22. void Decoder::init(CUcontext context, cudaVideoCodec codec) {
  23. cu_context = context;
  24. video_codec = codec;
  25. check_for_cuda_errors(
  26. cuvidCtxLockCreate(&ctx_lock, cu_context), __LINE__, __FILE__);
  27. CUVIDPARSERPARAMS parser_params = {};
  28. parser_params.CodecType = codec;
  29. parser_params.ulMaxNumDecodeSurfaces = 1;
  30. parser_params.ulClockRate = 1000;
  31. parser_params.ulMaxDisplayDelay = 0u;
  32. parser_params.pUserData = this;
  33. parser_params.pfnSequenceCallback = video_sequence_handler;
  34. parser_params.pfnDecodePicture = picture_decode_handler;
  35. parser_params.pfnDisplayPicture = picture_display_handler;
  36. parser_params.pfnGetOperatingPoint = operating_point_handler;
  37. check_for_cuda_errors(
  38. cuvidCreateVideoParser(&parser, &parser_params), __LINE__, __FILE__);
  39. }
  40. /* Destroy parser object and context lock.
  41. */
  42. Decoder::~Decoder() {
  43. if (parser) {
  44. cuvidDestroyVideoParser(parser);
  45. }
  46. cuvidCtxLockDestroy(ctx_lock);
  47. }
  48. /* Destroy CUvideodecoder object and free up all the unreturned decoded frames.
  49. */
  50. void Decoder::release() {
  51. cuCtxPushCurrent(cu_context);
  52. if (decoder) {
  53. cuvidDestroyDecoder(decoder);
  54. }
  55. cuCtxPopCurrent(NULL);
  56. }
  57. /* Trigger video decoding.
  58. */
  59. void Decoder::decode(const uint8_t* data, unsigned long size) {
  60. CUVIDSOURCEDATAPACKET pkt = {};
  61. pkt.flags = CUVID_PKT_TIMESTAMP;
  62. pkt.payload_size = size;
  63. pkt.payload = data;
  64. pkt.timestamp = 0;
  65. if (!data || size == 0) {
  66. pkt.flags |= CUVID_PKT_ENDOFSTREAM;
  67. }
  68. check_for_cuda_errors(cuvidParseVideoData(parser, &pkt), __LINE__, __FILE__);
  69. cuvidStream = 0;
  70. }
  71. /* Fetch a decoded frame and remove it from the queue.
  72. */
  73. torch::Tensor Decoder::fetch_frame() {
  74. if (decoded_frames.empty()) {
  75. auto options =
  76. torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA);
  77. return torch::zeros({0}, options);
  78. }
  79. torch::Tensor frame = decoded_frames.front();
  80. decoded_frames.pop();
  81. return frame;
  82. }
  83. /* Called when a picture is ready to be decoded.
  84. */
  85. int Decoder::handle_picture_decode(CUVIDPICPARAMS* pic_params) {
  86. if (!decoder) {
  87. TORCH_CHECK(false, "Uninitialised decoder");
  88. }
  89. pic_num_in_decode_order[pic_params->CurrPicIdx] = decode_pic_count++;
  90. check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__);
  91. check_for_cuda_errors(
  92. cuvidDecodePicture(decoder, pic_params), __LINE__, __FILE__);
  93. check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);
  94. return 1;
  95. }
  96. /* Process the decoded data and copy it to a cuda memory location.
  97. */
  98. int Decoder::handle_picture_display(CUVIDPARSERDISPINFO* disp_info) {
  99. CUVIDPROCPARAMS proc_params = {};
  100. proc_params.progressive_frame = disp_info->progressive_frame;
  101. proc_params.second_field = disp_info->repeat_first_field + 1;
  102. proc_params.top_field_first = disp_info->top_field_first;
  103. proc_params.unpaired_field = disp_info->repeat_first_field < 0;
  104. proc_params.output_stream = cuvidStream;
  105. CUdeviceptr source_frame = 0;
  106. unsigned int source_pitch = 0;
  107. check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__);
  108. check_for_cuda_errors(
  109. cuvidMapVideoFrame(
  110. decoder,
  111. disp_info->picture_index,
  112. &source_frame,
  113. &source_pitch,
  114. &proc_params),
  115. __LINE__,
  116. __FILE__);
  117. CUVIDGETDECODESTATUS decode_status;
  118. memset(&decode_status, 0, sizeof(decode_status));
  119. CUresult result =
  120. cuvidGetDecodeStatus(decoder, disp_info->picture_index, &decode_status);
  121. if (result == CUDA_SUCCESS &&
  122. (decode_status.decodeStatus == cuvidDecodeStatus_Error ||
  123. decode_status.decodeStatus == cuvidDecodeStatus_Error_Concealed)) {
  124. VLOG(1) << "Decode Error occurred for picture "
  125. << pic_num_in_decode_order[disp_info->picture_index];
  126. }
  127. auto options = torch::TensorOptions().dtype(torch::kU8).device(torch::kCUDA);
  128. torch::Tensor decoded_frame = torch::empty({get_height(), width, 3}, options);
  129. uint8_t* frame_ptr = decoded_frame.data_ptr<uint8_t>();
  130. const uint8_t* const source_arr[] = {
  131. (const uint8_t* const)source_frame,
  132. (const uint8_t* const)(source_frame + source_pitch * ((surface_height + 1) & ~1))};
  133. auto err = nppiNV12ToRGB_709CSC_8u_P2C3R(
  134. source_arr,
  135. source_pitch,
  136. frame_ptr,
  137. width * 3,
  138. {(int)decoded_frame.size(1), (int)decoded_frame.size(0)});
  139. TORCH_CHECK(
  140. err == NPP_NO_ERROR,
  141. "Failed to convert from NV12 to RGB. Error code:",
  142. err);
  143. check_for_cuda_errors(cuStreamSynchronize(cuvidStream), __LINE__, __FILE__);
  144. decoded_frames.push(decoded_frame);
  145. check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);
  146. check_for_cuda_errors(
  147. cuvidUnmapVideoFrame(decoder, source_frame), __LINE__, __FILE__);
  148. return 1;
  149. }
  150. /* Query the capabilities of the underlying hardware video decoder and
  151. * verify if the hardware supports decoding the passed video.
  152. */
  153. void Decoder::query_hardware(CUVIDEOFORMAT* video_format) {
  154. CUVIDDECODECAPS decode_caps = {};
  155. decode_caps.eCodecType = video_format->codec;
  156. decode_caps.eChromaFormat = video_format->chroma_format;
  157. decode_caps.nBitDepthMinus8 = video_format->bit_depth_luma_minus8;
  158. check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__);
  159. check_for_cuda_errors(cuvidGetDecoderCaps(&decode_caps), __LINE__, __FILE__);
  160. check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);
  161. if (!decode_caps.bIsSupported) {
  162. TORCH_CHECK(false, "Codec not supported on this GPU");
  163. }
  164. if ((video_format->coded_width > decode_caps.nMaxWidth) ||
  165. (video_format->coded_height > decode_caps.nMaxHeight)) {
  166. TORCH_CHECK(
  167. false,
  168. "Resolution : ",
  169. video_format->coded_width,
  170. "x",
  171. video_format->coded_height,
  172. "\nMax Supported (wxh) : ",
  173. decode_caps.nMaxWidth,
  174. "x",
  175. decode_caps.nMaxHeight,
  176. "\nResolution not supported on this GPU");
  177. }
  178. if ((video_format->coded_width >> 4) * (video_format->coded_height >> 4) >
  179. decode_caps.nMaxMBCount) {
  180. TORCH_CHECK(
  181. false,
  182. "MBCount : ",
  183. (video_format->coded_width >> 4) * (video_format->coded_height >> 4),
  184. "\nMax Supported mbcnt : ",
  185. decode_caps.nMaxMBCount,
  186. "\nMBCount not supported on this GPU");
  187. }
  188. // Check if output format supported. If not, check fallback options
  189. if (!(decode_caps.nOutputFormatMask & (1 << video_output_format))) {
  190. if (decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12)) {
  191. video_output_format = cudaVideoSurfaceFormat_NV12;
  192. } else if (
  193. decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_P016)) {
  194. video_output_format = cudaVideoSurfaceFormat_P016;
  195. } else if (
  196. decode_caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_YUV444)) {
  197. video_output_format = cudaVideoSurfaceFormat_YUV444;
  198. } else if (
  199. decode_caps.nOutputFormatMask &
  200. (1 << cudaVideoSurfaceFormat_YUV444_16Bit)) {
  201. video_output_format = cudaVideoSurfaceFormat_YUV444_16Bit;
  202. } else {
  203. TORCH_CHECK(false, "No supported output format found");
  204. }
  205. }
  206. }
  207. /* Called before decoding frames and/or whenever there is a configuration
  208. * change.
  209. */
  210. int Decoder::handle_video_sequence(CUVIDEOFORMAT* video_format) {
  211. // video_codec has been set in init(). Here it's set
  212. // again for potential correction.
  213. video_codec = video_format->codec;
  214. video_chroma_format = video_format->chroma_format;
  215. bit_depth_minus8 = video_format->bit_depth_luma_minus8;
  216. bytes_per_pixel = bit_depth_minus8 > 0 ? 2 : 1;
  217. // Set the output surface format same as chroma format
  218. switch (video_chroma_format) {
  219. case cudaVideoChromaFormat_Monochrome:
  220. case cudaVideoChromaFormat_420:
  221. video_output_format = video_format->bit_depth_luma_minus8
  222. ? cudaVideoSurfaceFormat_P016
  223. : cudaVideoSurfaceFormat_NV12;
  224. break;
  225. case cudaVideoChromaFormat_444:
  226. video_output_format = video_format->bit_depth_luma_minus8
  227. ? cudaVideoSurfaceFormat_YUV444_16Bit
  228. : cudaVideoSurfaceFormat_YUV444;
  229. break;
  230. case cudaVideoChromaFormat_422:
  231. video_output_format = cudaVideoSurfaceFormat_NV12;
  232. }
  233. query_hardware(video_format);
  234. if (width && luma_height && chroma_height) {
  235. // cuvidCreateDecoder() has been called before and now there's possible
  236. // config change.
  237. return reconfigure_decoder(video_format);
  238. }
  239. cu_video_format = *video_format;
  240. unsigned long decode_surface = video_format->min_num_decode_surfaces;
  241. cudaVideoDeinterlaceMode deinterlace_mode = cudaVideoDeinterlaceMode_Adaptive;
  242. if (video_format->progressive_sequence) {
  243. deinterlace_mode = cudaVideoDeinterlaceMode_Weave;
  244. }
  245. CUVIDDECODECREATEINFO video_decode_create_info = {};
  246. video_decode_create_info.ulWidth = video_format->coded_width;
  247. video_decode_create_info.ulHeight = video_format->coded_height;
  248. video_decode_create_info.ulNumDecodeSurfaces = decode_surface;
  249. video_decode_create_info.CodecType = video_format->codec;
  250. video_decode_create_info.ChromaFormat = video_format->chroma_format;
  251. // With PreferCUVID, JPEG is still decoded by CUDA while video is decoded
  252. // by NVDEC hardware
  253. video_decode_create_info.ulCreationFlags = cudaVideoCreate_PreferCUVID;
  254. video_decode_create_info.bitDepthMinus8 = video_format->bit_depth_luma_minus8;
  255. video_decode_create_info.OutputFormat = video_output_format;
  256. video_decode_create_info.DeinterlaceMode = deinterlace_mode;
  257. video_decode_create_info.ulNumOutputSurfaces = 2;
  258. video_decode_create_info.vidLock = ctx_lock;
  259. // AV1 has max width/height of sequence in sequence header
  260. if (video_format->codec == cudaVideoCodec_AV1 &&
  261. video_format->seqhdr_data_length > 0) {
  262. CUVIDEOFORMATEX* video_format_ex = (CUVIDEOFORMATEX*)video_format;
  263. max_width = video_format_ex->av1.max_width;
  264. max_height = video_format_ex->av1.max_height;
  265. }
  266. if (max_width < video_format->coded_width) {
  267. max_width = video_format->coded_width;
  268. }
  269. if (max_height < video_format->coded_height) {
  270. max_height = video_format->coded_height;
  271. }
  272. video_decode_create_info.ulMaxWidth = max_width;
  273. video_decode_create_info.ulMaxHeight = max_height;
  274. width = video_format->display_area.right - video_format->display_area.left;
  275. luma_height =
  276. video_format->display_area.bottom - video_format->display_area.top;
  277. video_decode_create_info.ulTargetWidth = video_format->coded_width;
  278. video_decode_create_info.ulTargetHeight = video_format->coded_height;
  279. chroma_height =
  280. (int)(ceil(luma_height * chroma_height_factor(video_output_format)));
  281. num_chroma_planes = chroma_plane_count(video_output_format);
  282. surface_height = video_decode_create_info.ulTargetHeight;
  283. surface_width = video_decode_create_info.ulTargetWidth;
  284. display_rect.bottom = video_decode_create_info.display_area.bottom;
  285. display_rect.top = video_decode_create_info.display_area.top;
  286. display_rect.left = video_decode_create_info.display_area.left;
  287. display_rect.right = video_decode_create_info.display_area.right;
  288. check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__);
  289. check_for_cuda_errors(
  290. cuvidCreateDecoder(&decoder, &video_decode_create_info),
  291. __LINE__,
  292. __FILE__);
  293. check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);
  294. return decode_surface;
  295. }
  296. int Decoder::reconfigure_decoder(CUVIDEOFORMAT* video_format) {
  297. if (video_format->bit_depth_luma_minus8 !=
  298. cu_video_format.bit_depth_luma_minus8 ||
  299. video_format->bit_depth_chroma_minus8 !=
  300. cu_video_format.bit_depth_chroma_minus8) {
  301. TORCH_CHECK(false, "Reconfigure not supported for bit depth change");
  302. }
  303. if (video_format->chroma_format != cu_video_format.chroma_format) {
  304. TORCH_CHECK(false, "Reconfigure not supported for chroma format change");
  305. }
  306. bool decode_res_change =
  307. !(video_format->coded_width == cu_video_format.coded_width &&
  308. video_format->coded_height == cu_video_format.coded_height);
  309. bool display_rect_change =
  310. !(video_format->display_area.bottom ==
  311. cu_video_format.display_area.bottom &&
  312. video_format->display_area.top == cu_video_format.display_area.top &&
  313. video_format->display_area.left == cu_video_format.display_area.left &&
  314. video_format->display_area.right == cu_video_format.display_area.right);
  315. unsigned int decode_surface = video_format->min_num_decode_surfaces;
  316. if ((video_format->coded_width > max_width) ||
  317. (video_format->coded_height > max_height)) {
  318. // For VP9, let driver handle the change if new width/height >
  319. // maxwidth/maxheight
  320. if (video_codec != cudaVideoCodec_VP9) {
  321. TORCH_CHECK(
  322. false,
  323. "Reconfigure not supported when width/height > maxwidth/maxheight");
  324. }
  325. return 1;
  326. }
  327. if (!decode_res_change) {
  328. // If the coded_width/coded_height hasn't changed but display resolution has
  329. // changed, then need to update width/height for correct output without
  330. // cropping. Example : 1920x1080 vs 1920x1088.
  331. if (display_rect_change) {
  332. width =
  333. video_format->display_area.right - video_format->display_area.left;
  334. luma_height =
  335. video_format->display_area.bottom - video_format->display_area.top;
  336. chroma_height =
  337. (int)ceil(luma_height * chroma_height_factor(video_output_format));
  338. num_chroma_planes = chroma_plane_count(video_output_format);
  339. }
  340. return 1;
  341. }
  342. cu_video_format.coded_width = video_format->coded_width;
  343. cu_video_format.coded_height = video_format->coded_height;
  344. CUVIDRECONFIGUREDECODERINFO reconfig_params = {};
  345. reconfig_params.ulWidth = video_format->coded_width;
  346. reconfig_params.ulHeight = video_format->coded_height;
  347. reconfig_params.ulTargetWidth = surface_width;
  348. reconfig_params.ulTargetHeight = surface_height;
  349. reconfig_params.ulNumDecodeSurfaces = decode_surface;
  350. reconfig_params.display_area.bottom = display_rect.bottom;
  351. reconfig_params.display_area.top = display_rect.top;
  352. reconfig_params.display_area.left = display_rect.left;
  353. reconfig_params.display_area.right = display_rect.right;
  354. check_for_cuda_errors(cuCtxPushCurrent(cu_context), __LINE__, __FILE__);
  355. check_for_cuda_errors(
  356. cuvidReconfigureDecoder(decoder, &reconfig_params), __LINE__, __FILE__);
  357. check_for_cuda_errors(cuCtxPopCurrent(NULL), __LINE__, __FILE__);
  358. return decode_surface;
  359. }
  360. /* Called from AV1 sequence header to get operating point of an AV1 bitstream.
  361. */
  362. int Decoder::get_operating_point(CUVIDOPERATINGPOINTINFO* oper_point_info) {
  363. return oper_point_info->codec == cudaVideoCodec_AV1 &&
  364. oper_point_info->av1.operating_points_cnt > 1
  365. ? 0
  366. : -1;
  367. }