trt_inference.cpp

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "trt_inference.h"
#include <stdlib.h>
#include <sys/time.h>
#include <assert.h>
#include <fstream>   // ifstream/ofstream are used directly below
#include <sstream>
#include <iostream>
#include <sys/stat.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <iterator>

static const int TIMING_ITERATIONS = 1;
static const int NUM_BINDINGS = 3;
static const int FILTER_NUM = 6;

// Evaluate the status expression exactly once (it may be a CUDA call),
// and use do/while(0) so CHECK behaves as a single statement in if/else.
#define CHECK(status)                                              \
    do                                                             \
    {                                                              \
        const cudaError_t err = (status);                          \
        if (err != 0)                                              \
        {                                                          \
            std::cout << "Cuda failure: " << err << std::endl;     \
            abort();                                               \
        }                                                          \
    } while (0)

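// Usage: wrap any call that returns a cudaError_t, e.g.
//   CHECK(cudaMalloc(&buf, bytes));
// (`buf`/`bytes` are illustrative. A richer variant could print
// cudaGetErrorString(err) instead of the bare numeric code.)
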
// Logger for TRT info/warning/error messages
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Suppress info-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

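// The builder and runtime below share this logger. A stricter threshold is a
// one-line change (sketch, assuming the usual Severity ordering in which
// kVERBOSE and kINFO compare greater than kWARNING):
//   if (severity > Severity::kWARNING) return; // also drops INFO and VERBOSE
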
class Profiler : public IProfiler
{
    typedef std::pair<std::string, float> Record;
    std::vector<Record> mProfile;

    virtual void reportLayerTime(const char* layerName, float ms) noexcept
    {
        auto record = std::find_if(mProfile.begin(), mProfile.end(),
                          [&](const Record& r) { return r.first == layerName; });
        if (record == mProfile.end())
            mProfile.push_back(std::make_pair(layerName, ms));
        else
            record->second += ms;
    }

    void printLayerTimes()
    {
        float totalTime = 0;
        for (size_t i = 0; i < mProfile.size(); i++)
        {
            printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
                mProfile[i].second / TIMING_ITERATIONS);
            totalTime += mProfile[i].second;
        }
        printf("Time over all layers: %4.3fms\n", totalTime / TIMING_ITERATIONS);
    }
};

class Int8EntropyCalibrator : public IInt8EntropyCalibrator
{
public:
    Int8EntropyCalibrator(bool readCache = true, bool onnxModel = false)
    {
        mReadCache = readCache;
        mOnnxModel = onnxModel;
    }

    virtual ~Int8EntropyCalibrator()
    {
    }

    // INT8 calibration batches are not supported yet [TODO]; INT8 only works
    // with a pre-generated calibration cache (see readCalibrationCache()).
    int getBatchSize() const noexcept override { return 0 /*mBF.m_Dims.n()*/; }

    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
    {
        return false;
    }

    const void* readCalibrationCache(size_t& length) noexcept override
    {
        mCalibrationCache.clear();
        const char* CACHE_PATH = mOnnxModel ? "../../data/Model/resnet10/CalibrationTable_ONNX"
                                            : "../../data/Model/resnet10/CalibrationTable_CAFFE";
        std::ifstream input(CACHE_PATH, std::ios::binary);
        input >> std::noskipws;
        if (mReadCache && input.good())
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                      std::back_inserter(mCalibrationCache));
        length = mCalibrationCache.size();
        return length ? &mCalibrationCache[0] : nullptr;
    }

    void writeCalibrationCache(const void* cache, size_t length) noexcept override
    {
        std::ofstream output("CalibrationTable", std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }

private:
    bool mReadCache{ true };
    bool mOnnxModel{ false };
    std::vector<char> mCalibrationCache;
};

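// Note on the calibration contract: during an INT8 build TensorRT calls
// readCalibrationCache() first; when it returns a non-null pointer the
// getBatch() loop is skipped entirely, which is why this stub still enables
// INT8 as long as a pre-generated table exists at CACHE_PATH.
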
string stringtrim(string);

// Trim leading and trailing spaces from a string.
string
stringtrim(string s)
{
    size_t start = 0;
    while (start < s.size() && s[start] == ' ')
    {
        start++;
    }
    s = s.substr(start);
    // Guard against an all-space (now empty) string before indexing from the back.
    if (s.empty())
    {
        return s;
    }
    size_t end = s.size() - 1;
    while (end > 0 && s[end] == ' ')
    {
        end--;
    }
    s = s.substr(0, end + 1);
    return s;
}

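// Example: stringtrim("  3 ") yields "3"; parseNet() below uses this to clean
// the values split out of "input_dim: <n>" lines in the Caffe deploy file.
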
int
TRT_Context::getNetWidth() const
{
    return net_width;
}

int
TRT_Context::getNetHeight() const
{
    return net_height;
}

int
TRT_Context::getFilterNum() const
{
    return filter_num;
}

void
TRT_Context::setFilterNum(const unsigned int& filter_num)
{
    this->filter_num = filter_num;
}

void*&
TRT_Context::getBuffer(const int& index)
{
    assert(index >= 0 && index < num_bindings);
    return buffers[index];
}

float*&
TRT_Context::getInputBuf()
{
    return input_buf;
}

uint32_t
TRT_Context::getNumTrtInstances() const
{
    return trtinstance_num;
}

uint32_t
TRT_Context::getBatchSize() const
{
    return batch_size;
}

int
TRT_Context::getModelClassCnt() const
{
    return g_pModelNetAttr->classCnt;
}

void*
TRT_Context::getScales() const
{
    return scales_gpu;
}

void*
TRT_Context::getOffsets() const
{
    return offset_gpu;
}

// mode: 0 = fp16, 1 = fp32, 2 = int8
void
TRT_Context::setMode(const int& mode)
{
    this->mode = mode;
}

void
TRT_Context::setBatchSize(const uint32_t& batchsize)
{
    this->batch_size = batchsize;
}

void
TRT_Context::setDumpResult(const bool& dump_result)
{
    this->dump_result = dump_result;
}

void
TRT_Context::setTrtProfilerEnabled(const bool& enable_trt_profiler)
{
    this->enable_trt_profiler = enable_trt_profiler;
}

int
TRT_Context::getChannel() const
{
    return channel;
}

TRT_Context::TRT_Context()
{
    net_width = 0;
    net_height = 0;
    filter_num = FILTER_NUM;
    buffers = new void *[NUM_BINDINGS];
    for (int i = 0; i < NUM_BINDINGS; i++)
    {
        buffers[i] = NULL;
    }
    input_buf = NULL;
    output_cov_buf = NULL;
    output_bbox_buf = NULL;
    runtime = NULL;
    engine = NULL;
    context = NULL;
    pResultArray = new uint32_t[100 * 4];
    channel = 0;
    num_bindings = NUM_BINDINGS;
    batch_size = 1;
    trtinstance_num = 1;
    mode = MODE_FP16;
    elapsed_frame_num = 0;
    elapsed_time = 0;
    enable_trt_profiler = 1;
    dump_result = 0;
    frame_num = 0;
    result_file = "result.txt";
    pLogger = new Logger;
    pProfiler = new Profiler;
}

void
TRT_Context::allocateMemory(bool bUseCPUBuf)
{
    const ICudaEngine& cuda_engine = context->getEngine();
    // Input and output buffer pointers that we pass to the engine;
    // the engine requires exactly IEngine::getNbBindings() of these.
    // In this case we know the model has one input and two outputs
    // (coverage and, optionally, bbox).
    assert(cuda_engine.getNbBindings() == num_bindings);
    // In order to bind the buffers, we need to know the names of the input
    // and output tensors. Note that indices are guaranteed to be less than
    // IEngine::getNbBindings().
    inputIndex = cuda_engine.getBindingIndex(g_pModelNetAttr->INPUT_BLOB_NAME);
    outputIndex = cuda_engine.getBindingIndex(g_pModelNetAttr->OUTPUT_BLOB_NAME);
    outputIndexBBOX = cuda_engine.getBindingIndex(g_pModelNetAttr->OUTPUT_BBOX_NAME);
    // Allocate GPU buffers
    if (is_onnx_model)
    {
        // ONNX engines use explicit batch, so the binding dims include batch
        Dims inputD = cuda_engine.getBindingDimensions(inputIndex);
        Dims outputD = cuda_engine.getBindingDimensions(outputIndex);
        Dims outputDB = cuda_engine.getBindingDimensions(outputIndexBBOX);
        inputDims = Dims3{inputD.d[1], inputD.d[2], inputD.d[3]};
        outputDims = Dims3{outputD.d[1], outputD.d[2], outputD.d[3]};
        outputDimsBBOX = Dims3{outputDB.d[1], outputDB.d[2], outputDB.d[3]};
    }
    else
    {
        inputDims = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(inputIndex));
        outputDims = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(outputIndex));
        outputDimsBBOX = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(outputIndexBBOX));
    }
    net_height = inputDims.d[1];
    net_width = inputDims.d[2];
    inputSize = batch_size * inputDims.d[0] * inputDims.d[1] * inputDims.d[2] * sizeof(float);
    outputSize = batch_size * outputDims.d[0] * outputDims.d[1] *
        outputDims.d[2] * sizeof(float);
    outputSizeBBOX = batch_size * outputDimsBBOX.d[0] * outputDimsBBOX.d[1] *
        outputDimsBBOX.d[2] * sizeof(float);
    if (bUseCPUBuf && input_buf == NULL)
    {
        input_buf = (float *)malloc(inputSize);
        assert(input_buf != NULL);
    }
    if (output_cov_buf == NULL)
    {
        output_cov_buf = (float *)malloc(outputSize);
        assert(output_cov_buf != NULL);
    }
    if (outputIndexBBOX >= 0)
    {
        if (output_bbox_buf == NULL)
        {
            output_bbox_buf = (float *)malloc(outputSizeBBOX);
            assert(output_bbox_buf != NULL);
        }
    }
    // Create GPU buffers
    if (buffers[inputIndex] == NULL)
    {
        CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
    }
    if (buffers[outputIndex] == NULL)
    {
        CHECK(cudaMalloc(&buffers[outputIndex], outputSize));
    }
    if (outputIndexBBOX >= 0)
    {
        if (buffers[outputIndexBBOX] == NULL)
        {
            CHECK(cudaMalloc(&buffers[outputIndexBBOX], outputSizeBBOX));
        }
    }
    CHECK(cudaMalloc(&offset_gpu, sizeof(int) * 3));
    CHECK(cudaMalloc(&scales_gpu, sizeof(float) * 3));
    CHECK(cudaMemcpy(offset_gpu, (void*)g_pModelNetAttr->offsets,
                     sizeof(int) * 3,
                     cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(scales_gpu, (void*)g_pModelNetAttr->input_scale,
                     sizeof(float) * 3,
                     cudaMemcpyHostToDevice));
    if (dump_result)
    {
        fstream.open(result_file.c_str(), ios::out);
    }
}

void
TRT_Context::releaseMemory(bool bUseCPUBuf)
{
    for (int i = 0; i < NUM_BINDINGS; i++)
    {
        if (buffers[i] != NULL)
        {
            CHECK(cudaFree(buffers[i]));
            buffers[i] = NULL;
        }
    }
    if (bUseCPUBuf && input_buf != NULL)
    {
        free(input_buf);
        input_buf = NULL;
    }
    if (output_cov_buf != NULL)
    {
        free(output_cov_buf);
        output_cov_buf = NULL;
    }
    if (output_bbox_buf != NULL)
    {
        free(output_bbox_buf);
        output_bbox_buf = NULL;
    }
    if (pResultArray != NULL)
    {
        delete []pResultArray;
        pResultArray = NULL;
    }
    if (dump_result)
    {
        fstream.close();
    }
    CHECK(cudaFree(offset_gpu));
    CHECK(cudaFree(scales_gpu));
}

TRT_Context::~TRT_Context()
{
    delete pLogger;
    delete pProfiler;
    delete []buffers;
}

void
TRT_Context::caffeToTRTModel(const string& deployfile, const string& modelfile)
{
    Int8EntropyCalibrator calibrator;
    IInt8Calibrator* int8Calibrator = &calibrator;
    // Create the API root class - it must span the lifetime of the engine usage
    IBuilder *builder = createInferBuilder(*pLogger);
    INetworkDefinition *network = builder->createNetworkV2(0U);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Parse the caffe model to populate the network, then set the outputs
    ICaffeParser *parser = createCaffeParser();
    bool hasFp16 = builder->platformHasFastFp16();
    // Honor the user-specified precision mode
    if (mode == MODE_FP16)
    {
        if (hasFp16)
        {
            printf("mode has been set to 0 (using fp16)\n");
        }
        else
        {
            printf("platform does not have fast fp16; falling back to 1 (using fp32)\n");
        }
    }
    else if (mode >= MODE_FP32)
    {
        printf("mode >= 1 (using fp32 or int8)\n");
        hasFp16 = 0;
    }
    // Create a 16-bit model if it's natively supported
    DataType modelDataType = hasFp16 ? DataType::kHALF : DataType::kFLOAT;
    const IBlobNameToTensor *blobNameToTensor =
        parser->parse(deployfile.c_str(), // caffe deploy file
                      modelfile.c_str(),  // caffe model file
                      *network,           // network definition the parser populates
                      modelDataType);
    assert(blobNameToTensor != nullptr);
    // The caffe file has no notion of outputs, so we need to say explicitly
    // which tensors the engine should generate
    outputs = {g_pModelNetAttr->OUTPUT_BLOB_NAME,
               g_pModelNetAttr->OUTPUT_BBOX_NAME};
    for (auto& s : outputs)
    {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
        printf("outputs %s\n", s.c_str());
    }
    // Build the engine
    builder->setMaxBatchSize(batch_size);
    config->setMaxWorkspaceSize(g_pModelNetAttr->WORKSPACE_SIZE);
    if (mode == MODE_INT8)
    {
        config->setFlag(BuilderFlag::kINT8);
        config->setInt8Calibrator(int8Calibrator);
    }
    // Eliminate the side effect of the delayed GPU frequency boost
    config->setMinTimingIterations(3);
    config->setAvgTimingIterations(2);
    // Set up the network for paired-fp16 format, only on DriveCX
    if (hasFp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    // We don't need the network any more, and we can destroy the parser
    delete network;
    delete parser;
    delete config;
    // Serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    delete engine;
    delete builder;
    shutdownProtobufLibrary();
}

void
TRT_Context::onnxToTRTModel(const string& modelfile)
{
    Int8EntropyCalibrator calibrator(true, true);
    IInt8Calibrator* int8Calibrator = &calibrator;
    // Create the API root class - it must span the lifetime of the engine usage
    IBuilder *builder = createInferBuilder(*pLogger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    IBuilderConfig* config = builder->createBuilderConfig();
    auto parser = nvonnxparser::createParser(*network, *pLogger);
    bool hasFp16 = builder->platformHasFastFp16();
    // Honor the user-specified precision mode
    if (mode == MODE_FP16)
    {
        if (hasFp16)
        {
            printf("mode has been set to 0 (using fp16)\n");
        }
        else
        {
            printf("platform does not have fast fp16; falling back to 1 (using fp32)\n");
        }
    }
    else if (mode >= MODE_FP32)
    {
        printf("mode >= 1 (using fp32 or int8)\n");
        hasFp16 = 0;
    }
    auto parsed = parser->parseFromFile(modelfile.c_str(), static_cast<int>(ILogger::Severity::kWARNING));
    if (!parsed)
    {
        printf("Failed to parse onnx model\n");
        return;
    }
    // Only one input is supported for resnet10 (the default model)
    auto input = network->getInput(0);
    auto inputDims = input->getDimensions();
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    const int batch = static_cast<int>(batch_size);
    profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    // Don't hide the call inside assert(): it would be compiled out with NDEBUG
    int profileIndex = config->addOptimizationProfile(profile);
    assert(profileIndex != -1);
    assert(profile->isValid());
    config->setCalibrationProfile(profile);
    // Build the engine
    config->setMaxWorkspaceSize(g_pModelNetAttr->WORKSPACE_SIZE);
    if (mode == MODE_INT8)
    {
        config->setFlag(BuilderFlag::kINT8);
        config->setInt8Calibrator(int8Calibrator);
    }
    // Eliminate the side effect of the delayed GPU frequency boost
    config->setMinTimingIterations(3);
    config->setAvgTimingIterations(2);
    // Set up the network for paired-fp16 format, only on DriveCX
    if (hasFp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    // We don't need the network any more, and we can destroy the parser
    delete network;
    delete parser;
    delete config;
    // Serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    delete engine;
    delete builder;
    shutdownProtobufLibrary();
}

void
TRT_Context::setModelIndex(int index)
{
    assert(index == GOOGLENET_SINGLE_CLASS ||
           index == GOOGLENET_THREE_CLASS ||
           index == RESNET_THREE_CLASS);
    g_pModelNetAttr = gModelNetAttr + index;
    assert(g_pModelNetAttr->classCnt > 0);
    assert(g_pModelNetAttr->STRIDE > 0);
    assert(g_pModelNetAttr->WORKSPACE_SIZE > 0);
}

void
TRT_Context::buildTrtContext(const string& deployfile,
        const string& modelfile, bool bUseCPUBuf, bool isOnnxModel)
{
    this->is_onnx_model = isOnnxModel;
    if (!parseNet(deployfile) && !isOnnxModel)
    {
        cout << "parse net failed, exit!" << endl;
        exit(0);
    }
    ifstream trtModelFile("trtModel.cache", ios::binary);
    if (trtModelFile.good())
    {
        cout << "Using cached TRT model" << endl;
        // Get the cache file length
        trtModelFile.seekg(0, ios::end);
        size_t size = trtModelFile.tellg();
        trtModelFile.seekg(0, ios::beg);
        char *buff = new char[size];
        trtModelFile.read(buff, size);
        trtModelFile.close();
        runtime = createInferRuntime(*pLogger);
        engine = runtime->deserializeCudaEngine((void *)buff, size, nullptr);
        delete[] buff;
    }
    else
    {
        if (isOnnxModel)
        {
            onnxToTRTModel(modelfile);
        }
        else
        {
            caffeToTRTModel(deployfile, modelfile);
        }
        cout << "Create TRT model cache" << endl;
        ofstream trtModelFile("trtModel.cache", ios::binary);
        trtModelFile.write((char *)trtModelStream->data(), trtModelStream->size());
        trtModelFile.close();
        runtime = createInferRuntime(*pLogger);
        engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
        trtModelStream->destroy();
    }
    context = engine->createExecutionContext();
    allocateMemory(bUseCPUBuf);
}

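// A minimal usage sketch (illustrative, not part of this file): callers are
// expected to drive the context roughly in this order. `ctx` and the file
// paths are assumptions made for the example.
//
//   TRT_Context ctx;
//   ctx.setModelIndex(RESNET_THREE_CLASS);
//   ctx.setBatchSize(1);
//   ctx.setMode(MODE_FP16);
//   ctx.buildTrtContext("resnet10.prototxt", "resnet10.caffemodel",
//                       true /*bUseCPUBuf*/, false /*isOnnxModel*/);
//   // ...fill ctx.getInputBuf() per frame, then call doInference()...
//   ctx.destroyTrtContext(true);
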
void
TRT_Context::destroyTrtContext(bool bUseCPUBuf)
{
    releaseMemory(bUseCPUBuf);
    delete context;
    delete engine;
    delete runtime;
}

void
TRT_Context::doInference(
    queue< vector<cv::Rect> >* rectList_queue,
    float *input)
{
    struct timeval input_time;
    struct timeval output_time;
    if (!enable_trt_profiler)
    {
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));
        // DMA the input to the GPU, execute the batch asynchronously
        // and DMA the results back
        if (input != NULL) // NULL means the input is already mapped into GPU memory
        {
            CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputSize,
                    cudaMemcpyHostToDevice, stream));
        }
        if (is_onnx_model)
        {
            context->setBindingDimensions(0, Dims4{static_cast<int>(batch_size),
                    inputDims.d[0], inputDims.d[1], inputDims.d[2]});
            context->executeV2(buffers);
        }
        else
        {
            context->enqueue(batch_size, buffers, stream, nullptr);
        }
        CHECK(cudaMemcpyAsync(output_cov_buf, buffers[outputIndex], outputSize,
                cudaMemcpyDeviceToHost, stream));
        if (outputIndexBBOX >= 0)
        {
            CHECK(cudaMemcpyAsync(output_bbox_buf, buffers[outputIndexBBOX],
                    outputSizeBBOX, cudaMemcpyDeviceToHost, stream));
        }
        cudaStreamSynchronize(stream);
        // Release the stream
        cudaStreamDestroy(stream);
    }
    else
    {
        // DMA the input to the GPU, execute the batch synchronously
        // and DMA the results back
        if (input != NULL) // NULL means the input is already mapped into GPU memory
        {
            CHECK(cudaMemcpy(buffers[inputIndex], input, inputSize,
                    cudaMemcpyHostToDevice));
        }
        gettimeofday(&input_time, NULL);
        if (is_onnx_model)
        {
            context->setBindingDimensions(0, Dims4{static_cast<int>(batch_size),
                    inputDims.d[0], inputDims.d[1], inputDims.d[2]});
            context->executeV2(buffers);
        }
        else
        {
            context->execute(batch_size, buffers);
        }
        gettimeofday(&output_time, NULL);
        CHECK(cudaMemcpy(output_cov_buf, buffers[outputIndex], outputSize,
                cudaMemcpyDeviceToHost));
        if (outputIndexBBOX >= 0)
        {
            CHECK(cudaMemcpy(output_bbox_buf, buffers[outputIndexBBOX],
                    outputSizeBBOX, cudaMemcpyDeviceToHost));
        }
        elapsed_frame_num += batch_size;
        elapsed_time += (output_time.tv_sec - input_time.tv_sec) * 1000 +
            (output_time.tv_usec - input_time.tv_usec) / 1000;
        if (elapsed_frame_num >= 100)
        {
            printf("Time elapsed: %ld ms per frame in past %ld frames\n",
                elapsed_time / elapsed_frame_num, elapsed_frame_num);
            elapsed_frame_num = 0;
            elapsed_time = 0;
        }
    }
    vector<cv::Rect> rectList[getModelClassCnt()];
    for (uint32_t i = 0; i < batch_size; i++)
    {
        if (g_pModelNetAttr->ParseFunc_ID == 0)
            parseBbox(rectList, i);
        else if (g_pModelNetAttr->ParseFunc_ID == 1)
            ParseResnet10Bbox(rectList, i);
        if (dump_result)
        {
            for (int class_num = 0;
                 class_num < (g_pModelNetAttr->ParseFunc_ID == 1 ? getModelClassCnt() - 1 : getModelClassCnt());
                 class_num++)
            {
                fstream << "frame:" << frame_num << " class num:" << class_num
                        << " has rect:" << rectList[class_num].size() << endl;
                for (uint32_t j = 0; j < rectList[class_num].size(); j++)
                {
                    fstream << "\tx,y,w,h:"
                            << (float) rectList[class_num][j].x / net_width << " "
                            << (float) rectList[class_num][j].y / net_height << " "
                            << (float) rectList[class_num][j].width / net_width << " "
                            << (float) rectList[class_num][j].height / net_height << endl;
                }
                fstream << endl;
            }
            frame_num++;
        }
        for (int class_num = 0; class_num < getModelClassCnt(); class_num++)
        {
            rectList_queue[class_num].push(rectList[class_num]);
        }
    }
}

void
TRT_Context::parseBbox(vector<cv::Rect>* rectList, int batch_th)
{
    int gridsize = outputDims.d[1] * outputDims.d[2];
    int gridoffset = outputDims.d[0] * outputDims.d[1] * outputDims.d[2] * batch_th;
    for (int class_num = 0; class_num < getModelClassCnt(); class_num++)
    {
        float *output_x1 = output_bbox_buf +
            outputDimsBBOX.d[0] * outputDimsBBOX.d[1] * outputDimsBBOX.d[2] * batch_th +
            class_num * 4 * outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y1 = output_x1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_x2 = output_y1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y2 = output_x2 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        for (int i = 0; i < gridsize; ++i)
        {
            if (output_cov_buf[gridoffset + class_num * gridsize + i] >=
                g_pModelNetAttr->THRESHOLD[class_num])
            {
                int g_x = i % outputDims.d[2];
                int g_y = i / outputDims.d[2];
                int i_x = g_x * g_pModelNetAttr->STRIDE;
                int i_y = g_y * g_pModelNetAttr->STRIDE;
                int rectx1 = g_pModelNetAttr->bbox_output_scales[0] * output_x1[i] + i_x;
                int recty1 = g_pModelNetAttr->bbox_output_scales[1] * output_y1[i] + i_y;
                int rectx2 = g_pModelNetAttr->bbox_output_scales[2] * output_x2[i] + i_x;
                int recty2 = g_pModelNetAttr->bbox_output_scales[3] * output_y2[i] + i_y;
                // Clamp the box to the net input area
                if (rectx1 < 0)
                    rectx1 = 0;
                if (rectx2 < 0)
                    rectx2 = 0;
                if (recty1 < 0)
                    recty1 = 0;
                if (recty2 < 0)
                    recty2 = 0;
                if (rectx1 >= (int)net_width)
                    rectx1 = net_width - 1;
                if (rectx2 >= (int)net_width)
                    rectx2 = net_width - 1;
                if (recty1 >= (int)net_height)
                    recty1 = net_height - 1;
                if (recty2 >= (int)net_height)
                    recty2 = net_height - 1;
                rectList[class_num].push_back(cv::Rect(rectx1, recty1,
                        rectx2 - rectx1, recty2 - recty1));
            }
        }
        cv::groupRectangles(rectList[class_num], 3, 0.2);
    }
}

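// Worked example (illustrative; assumes STRIDE == 16 and a grid width of
// outputDims.d[2] == 60): grid index i = 123 maps to cell
// (g_x, g_y) = (123 % 60, 123 / 60) = (3, 2), i.e. anchor pixel
// (i_x, i_y) = (48, 32); the four bbox channels are then scaled by
// bbox_output_scales and offset by that anchor before clamping.
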
void
TRT_Context::ParseResnet10Bbox(vector<cv::Rect>* rectList, int batch_th)
{
    int grid_x_ = outputDims.d[2];
    int grid_y_ = outputDims.d[1];
    int gridsize_ = grid_x_ * grid_y_;
    int target_shape[2] = {grid_x_, grid_y_};
    float bbox_norm[2] = {35.0, 35.0};
    float gc_centers_0[target_shape[0]];
    float gc_centers_1[target_shape[1]];
    for (int i = 0; i < target_shape[0]; i++)
        gc_centers_0[i] = (float)(i * 16 + 0.5) / bbox_norm[0];
    for (int i = 0; i < target_shape[1]; i++)
        gc_centers_1[i] = (float)(i * 16 + 0.5) / bbox_norm[1];
    for (int class_num = 0;
         class_num < (g_pModelNetAttr->ParseFunc_ID == 1 ? getModelClassCnt() - 1 : getModelClassCnt());
         class_num++)
    {
        float *output_x1 = output_bbox_buf + class_num * 4 * outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y1 = output_x1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_x2 = output_y1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y2 = output_x2 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        for (int h = 0; h < grid_y_; h++)
        {
            for (int w = 0; w < grid_x_; w++)
            {
                int i = w + h * grid_x_;
                if (output_cov_buf[class_num * gridsize_ + i] >=
                    g_pModelNetAttr->THRESHOLD[class_num])
                {
                    float rectx1_f, recty1_f, rectx2_f, recty2_f;
                    int rectx1, recty1, rectx2, recty2;
                    rectx1_f = output_x1[i] - gc_centers_0[w];
                    recty1_f = output_y1[i] - gc_centers_1[h];
                    rectx2_f = output_x2[i] + gc_centers_0[w];
                    recty2_f = output_y2[i] + gc_centers_1[h];
                    rectx1_f *= (float)(-bbox_norm[0]);
                    recty1_f *= (float)(-bbox_norm[1]);
                    rectx2_f *= (float)(bbox_norm[0]);
                    recty2_f *= (float)(bbox_norm[1]);
                    rectx1 = (int)rectx1_f;
                    recty1 = (int)recty1_f;
                    rectx2 = (int)rectx2_f;
                    recty2 = (int)recty2_f;
                    // Clamp to the net input area
                    rectx1 = rectx1 < 0 ? 0 : (rectx1 >= net_width ? (net_width - 1) : rectx1);
                    rectx2 = rectx2 < 0 ? 0 : (rectx2 >= net_width ? (net_width - 1) : rectx2);
                    recty1 = recty1 < 0 ? 0 : (recty1 >= net_height ? (net_height - 1) : recty1);
                    recty2 = recty2 < 0 ? 0 : (recty2 >= net_height ? (net_height - 1) : recty2);
                    rectList[class_num].push_back(cv::Rect(rectx1, recty1,
                            rectx2 - rectx1, recty2 - recty1));
                }
            }
        }
        cv::groupRectangles(rectList[class_num], 1, 0.1);
    }
}

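// Worked example: with bbox_norm == 35.0 the grid-cell centers are
// gc_centers_0[i] = (i * 16 + 0.5) / 35.0, so gc_centers_0[3] ≈ 1.386.
// The raw network outputs are offset by these normalized centers and then
// rescaled by bbox_norm back to pixel units before the clamping above.
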
int
TRT_Context::parseNet(const string& deployfile)
{
    ifstream readfile;
    string line;
    readfile.open(deployfile, ios::in);
    if (!readfile)
    {
        return 0;
    }
    int iterator = 0;
    // Scan for the four "input_dim" lines; stop at EOF so a deploy file
    // without them cannot make this loop spin forever.
    while (getline(readfile, line))
    {
        string::size_type index;
        index = line.find("input_dim");
        if (index == std::string::npos)
        {
            continue;
        }
        index = line.find_first_of(":", 0);
        string second = line.substr(index + 1);
        switch (iterator)
        {
            case 0: // batch size
                // Deprecated; the app sets the batch size instead
                //batch_size = atoi(stringtrim(second).c_str());
                //assert(batch_size > 0);
                break;
            case 1: // channel count
                channel = atoi(stringtrim(second).c_str());
                assert(channel > 0);
                break;
            case 2: // net height
                net_height = atoi(stringtrim(second).c_str());
                assert(net_height > 0);
                break;
            case 3: // net width
                net_width = atoi(stringtrim(second).c_str());
                assert(net_width > 0);
                break;
            default:
                break;
        }
        if (iterator == 3)
        {
            break;
        }
        iterator++;
    }
    readfile.close();
    if (iterator < 3)
    {
        return 0;
    }
    cout << "Net has batch_size, channel, net_height, net_width:" <<
        batch_size << " " << channel << " " << net_height << " " <<
        net_width << endl;
    return 1;
}
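
// parseNet() expects the legacy Caffe input declaration near the top of the
// deploy file, in this order (values below are illustrative only):
//
//   input: "data"
//   input_dim: 1    <- batch size (ignored; set by the app)
//   input_dim: 3    <- channels
//   input_dim: 368  <- net height
//   input_dim: 640  <- net width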