trt_inference.cpp

/*
 * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "trt_inference.h"
#include <stdlib.h>
#include <sys/time.h>
#include <assert.h>
#include <fstream>   // ifstream/ofstream are used directly below
#include <sstream>
#include <iostream>
#include <sys/stat.h>
#include <cmath>
#include <cuda_runtime_api.h>
#include <algorithm>
#include <iterator>

static const int TIMING_ITERATIONS = 1;
static const int NUM_BINDINGS = 3;
static const int FILTER_NUM = 6;

// Evaluate the status expression exactly once (it may be a CUDA call),
// and use do/while(0) so CHECK behaves as a single statement in if/else.
#define CHECK(status)                                              \
    do                                                             \
    {                                                              \
        const cudaError_t err = (status);                          \
        if (err != 0)                                              \
        {                                                          \
            std::cout << "Cuda failure: " << err << std::endl;     \
            abort();                                               \
        }                                                          \
    } while (0)

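// Usage: wrap any call that returns a cudaError_t, e.g.
//   CHECK(cudaMalloc(&buf, bytes));
// (`buf`/`bytes` are illustrative. A richer variant could print
// cudaGetErrorString(err) instead of the bare numeric code.)
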
// Logger for TRT info/warning/error messages
class Logger : public ILogger
{
    void log(Severity severity, const char* msg) noexcept override
    {
        // Suppress info-level messages
        if (severity != Severity::kINFO)
            std::cout << msg << std::endl;
    }
};

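// The builder and runtime below share this logger. A stricter threshold is a
// one-line change (sketch, assuming the usual Severity ordering in which
// kVERBOSE and kINFO compare greater than kWARNING):
//   if (severity > Severity::kWARNING) return; // also drops INFO and VERBOSE
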
class Profiler : public IProfiler
{
    typedef std::pair<std::string, float> Record;
    std::vector<Record> mProfile;

    virtual void reportLayerTime(const char* layerName, float ms) noexcept
    {
        auto record = std::find_if(mProfile.begin(), mProfile.end(),
                          [&](const Record& r) { return r.first == layerName; });
        if (record == mProfile.end())
            mProfile.push_back(std::make_pair(layerName, ms));
        else
            record->second += ms;
    }

    void printLayerTimes()
    {
        float totalTime = 0;
        for (size_t i = 0; i < mProfile.size(); i++)
        {
            printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
                mProfile[i].second / TIMING_ITERATIONS);
            totalTime += mProfile[i].second;
        }
        printf("Time over all layers: %4.3fms\n", totalTime / TIMING_ITERATIONS);
    }
};

class Int8EntropyCalibrator : public IInt8EntropyCalibrator
{
public:
    Int8EntropyCalibrator(bool readCache = true, bool onnxModel = false)
    {
        mReadCache = readCache;
        mOnnxModel = onnxModel;
    }

    virtual ~Int8EntropyCalibrator()
    {
    }

    // INT8 calibration batches are not supported yet [TODO]; INT8 only works
    // with a pre-generated calibration cache (see readCalibrationCache()).
    int getBatchSize() const noexcept override { return 0 /*mBF.m_Dims.n()*/; }

    bool getBatch(void* bindings[], const char* names[], int nbBindings) noexcept override
    {
        return false;
    }

    const void* readCalibrationCache(size_t& length) noexcept override
    {
        mCalibrationCache.clear();
        const char* CACHE_PATH = mOnnxModel ? "../../data/Model/resnet10/CalibrationTable_ONNX"
                                            : "../../data/Model/resnet10/CalibrationTable_CAFFE";
        std::ifstream input(CACHE_PATH, std::ios::binary);
        input >> std::noskipws;
        if (mReadCache && input.good())
            std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(),
                      std::back_inserter(mCalibrationCache));
        length = mCalibrationCache.size();
        return length ? &mCalibrationCache[0] : nullptr;
    }

    void writeCalibrationCache(const void* cache, size_t length) noexcept override
    {
        std::ofstream output("CalibrationTable", std::ios::binary);
        output.write(reinterpret_cast<const char*>(cache), length);
    }

private:
    bool mReadCache{ true };
    bool mOnnxModel{ false };
    std::vector<char> mCalibrationCache;
};

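// Note on the calibration contract: during an INT8 build TensorRT calls
// readCalibrationCache() first; when it returns a non-null pointer the
// getBatch() loop is skipped entirely, which is why this stub still enables
// INT8 as long as a pre-generated table exists at CACHE_PATH.
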
string stringtrim(string);

// Trim leading and trailing spaces from a string.
string
stringtrim(string s)
{
    size_t start = 0;
    while (start < s.size() && s[start] == ' ')
    {
        start++;
    }
    s = s.substr(start);
    // Guard against an all-space (now empty) string before indexing from the back.
    if (s.empty())
    {
        return s;
    }
    size_t end = s.size() - 1;
    while (end > 0 && s[end] == ' ')
    {
        end--;
    }
    s = s.substr(0, end + 1);
    return s;
}

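// Example: stringtrim("  3 ") yields "3"; parseNet() below uses this to clean
// the values split out of "input_dim: <n>" lines in the Caffe deploy file.
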
int
TRT_Context::getNetWidth() const
{
    return net_width;
}

int
TRT_Context::getNetHeight() const
{
    return net_height;
}

int
TRT_Context::getFilterNum() const
{
    return filter_num;
}

void
TRT_Context::setFilterNum(const unsigned int& filter_num)
{
    this->filter_num = filter_num;
}

void*&
TRT_Context::getBuffer(const int& index)
{
    assert(index >= 0 && index < num_bindings);
    return buffers[index];
}

float*&
TRT_Context::getInputBuf()
{
    return input_buf;
}

uint32_t
TRT_Context::getNumTrtInstances() const
{
    return trtinstance_num;
}

uint32_t
TRT_Context::getBatchSize() const
{
    return batch_size;
}

int
TRT_Context::getModelClassCnt() const
{
    return g_pModelNetAttr->classCnt;
}

void*
TRT_Context::getScales() const
{
    return scales_gpu;
}

void*
TRT_Context::getOffsets() const
{
    return offset_gpu;
}

// mode: 0 = fp16, 1 = fp32, 2 = int8
void
TRT_Context::setMode(const int& mode)
{
    this->mode = mode;
}

void
TRT_Context::setBatchSize(const uint32_t& batchsize)
{
    this->batch_size = batchsize;
}

void
TRT_Context::setDumpResult(const bool& dump_result)
{
    this->dump_result = dump_result;
}

void
TRT_Context::setTrtProfilerEnabled(const bool& enable_trt_profiler)
{
    this->enable_trt_profiler = enable_trt_profiler;
}

int
TRT_Context::getChannel() const
{
    return channel;
}

TRT_Context::TRT_Context()
{
    net_width = 0;
    net_height = 0;
    filter_num = FILTER_NUM;
    buffers = new void *[NUM_BINDINGS];
    for (int i = 0; i < NUM_BINDINGS; i++)
    {
        buffers[i] = NULL;
    }
    input_buf = NULL;
    output_cov_buf = NULL;
    output_bbox_buf = NULL;
    runtime = NULL;
    engine = NULL;
    context = NULL;
    pResultArray = new uint32_t[100 * 4];
    channel = 0;
    num_bindings = NUM_BINDINGS;
    batch_size = 1;
    trtinstance_num = 1;
    mode = MODE_FP16;
    elapsed_frame_num = 0;
    elapsed_time = 0;
    enable_trt_profiler = 1;
    dump_result = 0;
    frame_num = 0;
    result_file = "result.txt";
    pLogger = new Logger;
    pProfiler = new Profiler;
}

void
TRT_Context::allocateMemory(bool bUseCPUBuf)
{
    const ICudaEngine& cuda_engine = context->getEngine();
    // Input and output buffer pointers that we pass to the engine;
    // the engine requires exactly IEngine::getNbBindings() of these.
    // In this case we know the model has one input and two outputs
    // (coverage and, optionally, bbox).
    assert(cuda_engine.getNbBindings() == num_bindings);
    // In order to bind the buffers, we need to know the names of the input
    // and output tensors. Note that indices are guaranteed to be less than
    // IEngine::getNbBindings().
    inputIndex = cuda_engine.getBindingIndex(g_pModelNetAttr->INPUT_BLOB_NAME);
    outputIndex = cuda_engine.getBindingIndex(g_pModelNetAttr->OUTPUT_BLOB_NAME);
    outputIndexBBOX = cuda_engine.getBindingIndex(g_pModelNetAttr->OUTPUT_BBOX_NAME);
    // Allocate GPU buffers
    if (is_onnx_model)
    {
        // ONNX engines use explicit batch, so the binding dims include batch
        Dims inputD = cuda_engine.getBindingDimensions(inputIndex);
        Dims outputD = cuda_engine.getBindingDimensions(outputIndex);
        Dims outputDB = cuda_engine.getBindingDimensions(outputIndexBBOX);
        inputDims = Dims3{inputD.d[1], inputD.d[2], inputD.d[3]};
        outputDims = Dims3{outputD.d[1], outputD.d[2], outputD.d[3]};
        outputDimsBBOX = Dims3{outputDB.d[1], outputDB.d[2], outputDB.d[3]};
    }
    else
    {
        inputDims = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(inputIndex));
        outputDims = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(outputIndex));
        outputDimsBBOX = static_cast<Dims3&&>(cuda_engine.getBindingDimensions(outputIndexBBOX));
    }
    net_height = inputDims.d[1];
    net_width = inputDims.d[2];
    inputSize = batch_size * inputDims.d[0] * inputDims.d[1] * inputDims.d[2] * sizeof(float);
    outputSize = batch_size * outputDims.d[0] * outputDims.d[1] *
        outputDims.d[2] * sizeof(float);
    outputSizeBBOX = batch_size * outputDimsBBOX.d[0] * outputDimsBBOX.d[1] *
        outputDimsBBOX.d[2] * sizeof(float);
    if (bUseCPUBuf && input_buf == NULL)
    {
        input_buf = (float *)malloc(inputSize);
        assert(input_buf != NULL);
    }
    if (output_cov_buf == NULL)
    {
        output_cov_buf = (float *)malloc(outputSize);
        assert(output_cov_buf != NULL);
    }
    if (outputIndexBBOX >= 0)
    {
        if (output_bbox_buf == NULL)
        {
            output_bbox_buf = (float *)malloc(outputSizeBBOX);
            assert(output_bbox_buf != NULL);
        }
    }
    // Create GPU buffers
    if (buffers[inputIndex] == NULL)
    {
        CHECK(cudaMalloc(&buffers[inputIndex], inputSize));
    }
    if (buffers[outputIndex] == NULL)
    {
        CHECK(cudaMalloc(&buffers[outputIndex], outputSize));
    }
    if (outputIndexBBOX >= 0)
    {
        if (buffers[outputIndexBBOX] == NULL)
        {
            CHECK(cudaMalloc(&buffers[outputIndexBBOX], outputSizeBBOX));
        }
    }
    CHECK(cudaMalloc(&offset_gpu, sizeof(int) * 3));
    CHECK(cudaMalloc(&scales_gpu, sizeof(float) * 3));
    CHECK(cudaMemcpy(offset_gpu, (void*)g_pModelNetAttr->offsets,
                     sizeof(int) * 3,
                     cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(scales_gpu, (void*)g_pModelNetAttr->input_scale,
                     sizeof(float) * 3,
                     cudaMemcpyHostToDevice));
    if (dump_result)
    {
        fstream.open(result_file.c_str(), ios::out);
    }
}

void
TRT_Context::releaseMemory(bool bUseCPUBuf)
{
    for (int i = 0; i < NUM_BINDINGS; i++)
    {
        if (buffers[i] != NULL)
        {
            CHECK(cudaFree(buffers[i]));
            buffers[i] = NULL;
        }
    }
    if (bUseCPUBuf && input_buf != NULL)
    {
        free(input_buf);
        input_buf = NULL;
    }
    if (output_cov_buf != NULL)
    {
        free(output_cov_buf);
        output_cov_buf = NULL;
    }
    if (output_bbox_buf != NULL)
    {
        free(output_bbox_buf);
        output_bbox_buf = NULL;
    }
    if (pResultArray != NULL)
    {
        delete []pResultArray;
        pResultArray = NULL;
    }
    if (dump_result)
    {
        fstream.close();
    }
    CHECK(cudaFree(offset_gpu));
    CHECK(cudaFree(scales_gpu));
}

TRT_Context::~TRT_Context()
{
    delete pLogger;
    delete pProfiler;
    delete []buffers;
}

void
TRT_Context::caffeToTRTModel(const string& deployfile, const string& modelfile)
{
    Int8EntropyCalibrator calibrator;
    IInt8Calibrator* int8Calibrator = &calibrator;
    // Create the API root class - it must span the lifetime of the engine usage
    IBuilder *builder = createInferBuilder(*pLogger);
    INetworkDefinition *network = builder->createNetworkV2(0U);
    IBuilderConfig* config = builder->createBuilderConfig();
    // Parse the caffe model to populate the network, then set the outputs
    ICaffeParser *parser = createCaffeParser();
    bool hasFp16 = builder->platformHasFastFp16();
    // Honor the user-specified precision mode
    if (mode == MODE_FP16)
    {
        if (hasFp16)
        {
            printf("mode has been set to 0 (using fp16)\n");
        }
        else
        {
            printf("platform does not have fast fp16; falling back to 1 (using fp32)\n");
        }
    }
    else if (mode >= MODE_FP32)
    {
        printf("mode >= 1 (using fp32 or int8)\n");
        hasFp16 = 0;
    }
    // Create a 16-bit model if it's natively supported
    DataType modelDataType = hasFp16 ? DataType::kHALF : DataType::kFLOAT;
    const IBlobNameToTensor *blobNameToTensor =
        parser->parse(deployfile.c_str(), // caffe deploy file
                      modelfile.c_str(),  // caffe model file
                      *network,           // network definition the parser populates
                      modelDataType);
    assert(blobNameToTensor != nullptr);
    // The caffe file has no notion of outputs, so we need to say explicitly
    // which tensors the engine should generate
    outputs = {g_pModelNetAttr->OUTPUT_BLOB_NAME,
               g_pModelNetAttr->OUTPUT_BBOX_NAME};
    for (auto& s : outputs)
    {
        network->markOutput(*blobNameToTensor->find(s.c_str()));
        printf("outputs %s\n", s.c_str());
    }
    // Build the engine
    builder->setMaxBatchSize(batch_size);
    config->setMaxWorkspaceSize(g_pModelNetAttr->WORKSPACE_SIZE);
    if (mode == MODE_INT8)
    {
        config->setFlag(BuilderFlag::kINT8);
        config->setInt8Calibrator(int8Calibrator);
    }
    // Eliminate the side effect of the delayed GPU frequency boost
    config->setMinTimingIterations(3);
    config->setAvgTimingIterations(2);
    // Set up the network for paired-fp16 format, only on DriveCX
    if (hasFp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    // We don't need the network any more, and we can destroy the parser
    delete network;
    delete parser;
    delete config;
    // Serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    delete engine;
    delete builder;
    shutdownProtobufLibrary();
}

void
TRT_Context::onnxToTRTModel(const string& modelfile)
{
    Int8EntropyCalibrator calibrator(true, true);
    IInt8Calibrator* int8Calibrator = &calibrator;
    // Create the API root class - it must span the lifetime of the engine usage
    IBuilder *builder = createInferBuilder(*pLogger);
    const auto explicitBatch = 1U << static_cast<uint32_t>(NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    INetworkDefinition* network = builder->createNetworkV2(explicitBatch);
    IBuilderConfig* config = builder->createBuilderConfig();
    auto parser = nvonnxparser::createParser(*network, *pLogger);
    bool hasFp16 = builder->platformHasFastFp16();
    // Honor the user-specified precision mode
    if (mode == MODE_FP16)
    {
        if (hasFp16)
        {
            printf("mode has been set to 0 (using fp16)\n");
        }
        else
        {
            printf("platform does not have fast fp16; falling back to 1 (using fp32)\n");
        }
    }
    else if (mode >= MODE_FP32)
    {
        printf("mode >= 1 (using fp32 or int8)\n");
        hasFp16 = 0;
    }
    auto parsed = parser->parseFromFile(modelfile.c_str(), static_cast<int>(ILogger::Severity::kWARNING));
    if (!parsed)
    {
        printf("Failed to parse onnx model\n");
        return;
    }
    // Only one input is supported for resnet10 (the default model)
    auto input = network->getInput(0);
    auto inputDims = input->getDimensions();
    IOptimizationProfile* profile = builder->createOptimizationProfile();
    const int batch = static_cast<int>(batch_size);
    profile->setDimensions(input->getName(), OptProfileSelector::kMIN, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    profile->setDimensions(input->getName(), OptProfileSelector::kOPT, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    profile->setDimensions(input->getName(), OptProfileSelector::kMAX, Dims4{batch, inputDims.d[1], inputDims.d[2], inputDims.d[3]});
    // Don't hide the call inside assert(): it would be compiled out with NDEBUG
    int profileIndex = config->addOptimizationProfile(profile);
    assert(profileIndex != -1);
    assert(profile->isValid());
    config->setCalibrationProfile(profile);
    // Build the engine
    config->setMaxWorkspaceSize(g_pModelNetAttr->WORKSPACE_SIZE);
    if (mode == MODE_INT8)
    {
        config->setFlag(BuilderFlag::kINT8);
        config->setInt8Calibrator(int8Calibrator);
    }
    // Eliminate the side effect of the delayed GPU frequency boost
    config->setMinTimingIterations(3);
    config->setAvgTimingIterations(2);
    // Set up the network for paired-fp16 format, only on DriveCX
    if (hasFp16)
    {
        config->setFlag(BuilderFlag::kFP16);
    }
    ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
    assert(engine);
    // We don't need the network any more, and we can destroy the parser
    delete network;
    delete parser;
    delete config;
    // Serialize the engine, then close everything down
    trtModelStream = engine->serialize();
    delete engine;
    delete builder;
    shutdownProtobufLibrary();
}

void
TRT_Context::setModelIndex(int index)
{
    assert(index == GOOGLENET_SINGLE_CLASS ||
           index == GOOGLENET_THREE_CLASS ||
           index == RESNET_THREE_CLASS);
    g_pModelNetAttr = gModelNetAttr + index;
    assert(g_pModelNetAttr->classCnt > 0);
    assert(g_pModelNetAttr->STRIDE > 0);
    assert(g_pModelNetAttr->WORKSPACE_SIZE > 0);
}

void
TRT_Context::buildTrtContext(const string& deployfile,
        const string& modelfile, bool bUseCPUBuf, bool isOnnxModel)
{
    this->is_onnx_model = isOnnxModel;
    if (!parseNet(deployfile) && !isOnnxModel)
    {
        cout << "parse net failed, exit!" << endl;
        exit(0);
    }
    ifstream trtModelFile("trtModel.cache", ios::binary);
    if (trtModelFile.good())
    {
        cout << "Using cached TRT model" << endl;
        // Get the cache file length
        trtModelFile.seekg(0, ios::end);
        size_t size = trtModelFile.tellg();
        trtModelFile.seekg(0, ios::beg);
        char *buff = new char[size];
        trtModelFile.read(buff, size);
        trtModelFile.close();
        runtime = createInferRuntime(*pLogger);
        engine = runtime->deserializeCudaEngine((void *)buff, size, nullptr);
        delete[] buff;
    }
    else
    {
        if (isOnnxModel)
        {
            onnxToTRTModel(modelfile);
        }
        else
        {
            caffeToTRTModel(deployfile, modelfile);
        }
        cout << "Create TRT model cache" << endl;
        ofstream trtModelFile("trtModel.cache", ios::binary);
        trtModelFile.write((char *)trtModelStream->data(), trtModelStream->size());
        trtModelFile.close();
        runtime = createInferRuntime(*pLogger);
        engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
        trtModelStream->destroy();
    }
    context = engine->createExecutionContext();
    allocateMemory(bUseCPUBuf);
}

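// A minimal usage sketch (illustrative, not part of this file): callers are
// expected to drive the context roughly in this order. `ctx` and the file
// paths are assumptions made for the example.
//
//   TRT_Context ctx;
//   ctx.setModelIndex(RESNET_THREE_CLASS);
//   ctx.setBatchSize(1);
//   ctx.setMode(MODE_FP16);
//   ctx.buildTrtContext("resnet10.prototxt", "resnet10.caffemodel",
//                       true /*bUseCPUBuf*/, false /*isOnnxModel*/);
//   // ...fill ctx.getInputBuf() per frame, then call doInference()...
//   ctx.destroyTrtContext(true);
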
void
TRT_Context::destroyTrtContext(bool bUseCPUBuf)
{
    releaseMemory(bUseCPUBuf);
    delete context;
    delete engine;
    delete runtime;
}

void
TRT_Context::doInference(
    queue< vector<cv::Rect> >* rectList_queue,
    float *input)
{
    struct timeval input_time;
    struct timeval output_time;
    if (!enable_trt_profiler)
    {
        cudaStream_t stream;
        CHECK(cudaStreamCreate(&stream));
        // DMA the input to the GPU, execute the batch asynchronously
        // and DMA the results back
        if (input != NULL) // NULL means the input is already mapped into GPU memory
        {
            CHECK(cudaMemcpyAsync(buffers[inputIndex], input, inputSize,
                    cudaMemcpyHostToDevice, stream));
        }
        if (is_onnx_model)
        {
            context->setBindingDimensions(0, Dims4{static_cast<int>(batch_size),
                    inputDims.d[0], inputDims.d[1], inputDims.d[2]});
            context->executeV2(buffers);
        }
        else
        {
            context->enqueue(batch_size, buffers, stream, nullptr);
        }
        CHECK(cudaMemcpyAsync(output_cov_buf, buffers[outputIndex], outputSize,
                cudaMemcpyDeviceToHost, stream));
        if (outputIndexBBOX >= 0)
        {
            CHECK(cudaMemcpyAsync(output_bbox_buf, buffers[outputIndexBBOX],
                    outputSizeBBOX, cudaMemcpyDeviceToHost, stream));
        }
        cudaStreamSynchronize(stream);
        // Release the stream
        cudaStreamDestroy(stream);
    }
    else
    {
        // DMA the input to the GPU, execute the batch synchronously
        // and DMA the results back
        if (input != NULL) // NULL means the input is already mapped into GPU memory
        {
            CHECK(cudaMemcpy(buffers[inputIndex], input, inputSize,
                    cudaMemcpyHostToDevice));
        }
        gettimeofday(&input_time, NULL);
        if (is_onnx_model)
        {
            context->setBindingDimensions(0, Dims4{static_cast<int>(batch_size),
                    inputDims.d[0], inputDims.d[1], inputDims.d[2]});
            context->executeV2(buffers);
        }
        else
        {
            context->execute(batch_size, buffers);
        }
        gettimeofday(&output_time, NULL);
        CHECK(cudaMemcpy(output_cov_buf, buffers[outputIndex], outputSize,
                cudaMemcpyDeviceToHost));
        if (outputIndexBBOX >= 0)
        {
            CHECK(cudaMemcpy(output_bbox_buf, buffers[outputIndexBBOX],
                    outputSizeBBOX, cudaMemcpyDeviceToHost));
        }
        elapsed_frame_num += batch_size;
        elapsed_time += (output_time.tv_sec - input_time.tv_sec) * 1000 +
            (output_time.tv_usec - input_time.tv_usec) / 1000;
        if (elapsed_frame_num >= 100)
        {
            printf("Time elapsed: %ld ms per frame in past %ld frames\n",
                elapsed_time / elapsed_frame_num, elapsed_frame_num);
            elapsed_frame_num = 0;
            elapsed_time = 0;
        }
    }
    vector<cv::Rect> rectList[getModelClassCnt()];
    for (uint32_t i = 0; i < batch_size; i++)
    {
        if (g_pModelNetAttr->ParseFunc_ID == 0)
            parseBbox(rectList, i);
        else if (g_pModelNetAttr->ParseFunc_ID == 1)
            ParseResnet10Bbox(rectList, i);
        if (dump_result)
        {
            for (int class_num = 0;
                 class_num < (g_pModelNetAttr->ParseFunc_ID == 1 ? getModelClassCnt() - 1 : getModelClassCnt());
                 class_num++)
            {
                fstream << "frame:" << frame_num << " class num:" << class_num
                        << " has rect:" << rectList[class_num].size() << endl;
                for (uint32_t j = 0; j < rectList[class_num].size(); j++)
                {
                    fstream << "\tx,y,w,h:"
                            << (float) rectList[class_num][j].x / net_width << " "
                            << (float) rectList[class_num][j].y / net_height << " "
                            << (float) rectList[class_num][j].width / net_width << " "
                            << (float) rectList[class_num][j].height / net_height << endl;
                }
                fstream << endl;
            }
            frame_num++;
        }
        for (int class_num = 0; class_num < getModelClassCnt(); class_num++)
        {
            rectList_queue[class_num].push(rectList[class_num]);
        }
    }
}

void
TRT_Context::parseBbox(vector<cv::Rect>* rectList, int batch_th)
{
    int gridsize = outputDims.d[1] * outputDims.d[2];
    int gridoffset = outputDims.d[0] * outputDims.d[1] * outputDims.d[2] * batch_th;
    for (int class_num = 0; class_num < getModelClassCnt(); class_num++)
    {
        float *output_x1 = output_bbox_buf +
            outputDimsBBOX.d[0] * outputDimsBBOX.d[1] * outputDimsBBOX.d[2] * batch_th +
            class_num * 4 * outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y1 = output_x1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_x2 = output_y1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y2 = output_x2 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        for (int i = 0; i < gridsize; ++i)
        {
            if (output_cov_buf[gridoffset + class_num * gridsize + i] >=
                g_pModelNetAttr->THRESHOLD[class_num])
            {
                int g_x = i % outputDims.d[2];
                int g_y = i / outputDims.d[2];
                int i_x = g_x * g_pModelNetAttr->STRIDE;
                int i_y = g_y * g_pModelNetAttr->STRIDE;
                int rectx1 = g_pModelNetAttr->bbox_output_scales[0] * output_x1[i] + i_x;
                int recty1 = g_pModelNetAttr->bbox_output_scales[1] * output_y1[i] + i_y;
                int rectx2 = g_pModelNetAttr->bbox_output_scales[2] * output_x2[i] + i_x;
                int recty2 = g_pModelNetAttr->bbox_output_scales[3] * output_y2[i] + i_y;
                // Clamp the box to the net input area
                if (rectx1 < 0)
                    rectx1 = 0;
                if (rectx2 < 0)
                    rectx2 = 0;
                if (recty1 < 0)
                    recty1 = 0;
                if (recty2 < 0)
                    recty2 = 0;
                if (rectx1 >= (int)net_width)
                    rectx1 = net_width - 1;
                if (rectx2 >= (int)net_width)
                    rectx2 = net_width - 1;
                if (recty1 >= (int)net_height)
                    recty1 = net_height - 1;
                if (recty2 >= (int)net_height)
                    recty2 = net_height - 1;
                rectList[class_num].push_back(cv::Rect(rectx1, recty1,
                        rectx2 - rectx1, recty2 - recty1));
            }
        }
        cv::groupRectangles(rectList[class_num], 3, 0.2);
    }
}

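// Worked example (illustrative; assumes STRIDE == 16 and a grid width of
// outputDims.d[2] == 60): grid index i = 123 maps to cell
// (g_x, g_y) = (123 % 60, 123 / 60) = (3, 2), i.e. anchor pixel
// (i_x, i_y) = (48, 32); the four bbox channels are then scaled by
// bbox_output_scales and offset by that anchor before clamping.
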
void
TRT_Context::ParseResnet10Bbox(vector<cv::Rect>* rectList, int batch_th)
{
    int grid_x_ = outputDims.d[2];
    int grid_y_ = outputDims.d[1];
    int gridsize_ = grid_x_ * grid_y_;
    int target_shape[2] = {grid_x_, grid_y_};
    float bbox_norm[2] = {35.0, 35.0};
    float gc_centers_0[target_shape[0]];
    float gc_centers_1[target_shape[1]];
    for (int i = 0; i < target_shape[0]; i++)
        gc_centers_0[i] = (float)(i * 16 + 0.5) / bbox_norm[0];
    for (int i = 0; i < target_shape[1]; i++)
        gc_centers_1[i] = (float)(i * 16 + 0.5) / bbox_norm[1];
    for (int class_num = 0;
         class_num < (g_pModelNetAttr->ParseFunc_ID == 1 ? getModelClassCnt() - 1 : getModelClassCnt());
         class_num++)
    {
        float *output_x1 = output_bbox_buf + class_num * 4 * outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y1 = output_x1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_x2 = output_y1 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        float *output_y2 = output_x2 + outputDimsBBOX.d[1] * outputDimsBBOX.d[2];
        for (int h = 0; h < grid_y_; h++)
        {
            for (int w = 0; w < grid_x_; w++)
            {
                int i = w + h * grid_x_;
                if (output_cov_buf[class_num * gridsize_ + i] >=
                    g_pModelNetAttr->THRESHOLD[class_num])
                {
                    float rectx1_f, recty1_f, rectx2_f, recty2_f;
                    int rectx1, recty1, rectx2, recty2;
                    rectx1_f = output_x1[i] - gc_centers_0[w];
                    recty1_f = output_y1[i] - gc_centers_1[h];
                    rectx2_f = output_x2[i] + gc_centers_0[w];
                    recty2_f = output_y2[i] + gc_centers_1[h];
                    rectx1_f *= (float)(-bbox_norm[0]);
                    recty1_f *= (float)(-bbox_norm[1]);
                    rectx2_f *= (float)(bbox_norm[0]);
                    recty2_f *= (float)(bbox_norm[1]);
                    rectx1 = (int)rectx1_f;
                    recty1 = (int)recty1_f;
                    rectx2 = (int)rectx2_f;
                    recty2 = (int)recty2_f;
                    // Clamp to the net input area
                    rectx1 = rectx1 < 0 ? 0 : (rectx1 >= net_width ? (net_width - 1) : rectx1);
                    rectx2 = rectx2 < 0 ? 0 : (rectx2 >= net_width ? (net_width - 1) : rectx2);
                    recty1 = recty1 < 0 ? 0 : (recty1 >= net_height ? (net_height - 1) : recty1);
                    recty2 = recty2 < 0 ? 0 : (recty2 >= net_height ? (net_height - 1) : recty2);
                    rectList[class_num].push_back(cv::Rect(rectx1, recty1,
                            rectx2 - rectx1, recty2 - recty1));
                }
            }
        }
        cv::groupRectangles(rectList[class_num], 1, 0.1);
    }
}

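// Worked example: with bbox_norm == 35.0 the grid-cell centers are
// gc_centers_0[i] = (i * 16 + 0.5) / 35.0, so gc_centers_0[3] ≈ 1.386.
// The raw network outputs are offset by these normalized centers and then
// rescaled by bbox_norm back to pixel units before the clamping above.
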
int
TRT_Context::parseNet(const string& deployfile)
{
    ifstream readfile;
    string line;
    readfile.open(deployfile, ios::in);
    if (!readfile)
    {
        return 0;
    }
    int iterator = 0;
    // Scan for the four "input_dim" lines; stop at EOF so a deploy file
    // without them cannot make this loop spin forever.
    while (getline(readfile, line))
    {
        string::size_type index;
        index = line.find("input_dim");
        if (index == std::string::npos)
        {
            continue;
        }
        index = line.find_first_of(":", 0);
        string second = line.substr(index + 1);
        switch (iterator)
        {
            case 0: // batch size
                // Deprecated; the app sets the batch size instead
                //batch_size = atoi(stringtrim(second).c_str());
                //assert(batch_size > 0);
                break;
            case 1: // channel count
                channel = atoi(stringtrim(second).c_str());
                assert(channel > 0);
                break;
            case 2: // net height
                net_height = atoi(stringtrim(second).c_str());
                assert(net_height > 0);
                break;
            case 3: // net width
                net_width = atoi(stringtrim(second).c_str());
                assert(net_width > 0);
                break;
            default:
                break;
        }
        if (iterator == 3)
        {
            break;
        }
        iterator++;
    }
    readfile.close();
    if (iterator < 3)
    {
        return 0;
    }
    cout << "Net has batch_size, channel, net_height, net_width:" <<
        batch_size << " " << channel << " " << net_height << " " <<
        net_width << endl;
    return 1;
}
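
// parseNet() expects the legacy Caffe input declaration near the top of the
// deploy file, in this order (values below are illustrative only):
//
//   input: "data"
//   input_dim: 1    <- batch size (ignored; set by the app)
//   input_dim: 3    <- channels
//   input_dim: 368  <- net height
//   input_dim: 640  <- net width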