- /******************************************************************************
- * Copyright 2020 The Apollo Authors. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *****************************************************************************/
- /*
- * Copyright 2018-2019 Autoware Foundation. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- /**
- * @author Kosuke Murakami
- * @date 2019/02/26
- */
- /**
- * @author Yan haixu
- * Contact: just github.com/hova88
- * @date 2021/04/30
- */
- #include "pointpillars.h"
- #include <chrono>
- #include <fstream>
- #include <iostream>
- #include <limits>
- #include <sstream>
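- /**
-  * @brief Construct the PointPillars pipeline: load parameters from the YAML
-  *        config, build or deserialize the PFE and backbone TensorRT engines,
-  *        allocate device memory, and create the CUDA pre/post-processing helpers.
-  */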
- PointPillars::PointPillars(const float score_threshold,
- const float nms_overlap_threshold,
- const bool use_onnx,
- const std::string pfe_file,
- const std::string backbone_file,
- const std::string pp_config)
- : score_threshold_(score_threshold),
- nms_overlap_threshold_(nms_overlap_threshold),
- use_onnx_(use_onnx),
- pfe_file_(pfe_file),
- backbone_file_(backbone_file),
- pp_config_(pp_config)
- {
- InitParams();
- InitTRT(use_onnx_);
- DeviceMemoryMalloc();
- preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
- kNumThreads,
- kMaxNumPillars,
- kMaxNumPointsPerPillar,
- kNumPointFeature,
- kNumIndsForScan,
- kGridXSize, kGridYSize, kGridZSize,
- kPillarXSize, kPillarYSize, kPillarZSize,
- kMinXRange, kMinYRange, kMinZRange));
- scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));
- const float float_min = std::numeric_limits<float>::lowest();
- const float float_max = std::numeric_limits<float>::max();
- postprocess_cuda_ptr_.reset(
- new PostprocessCuda(kNumThreads,
- float_min, float_max,
- kNumClass, kNumAnchorPerCls,
- kMultiheadLabelMapping,
- score_threshold_,
- nms_overlap_threshold_,
- kNmsPreMaxsize,
- kNmsPostMaxsize,
- kNumBoxCorners,
- kNumInputBoxFeature,
- 7)); /*kNumOutputBoxFeature*/
- }
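- /**
-  * @brief Free all device buffers and destroy the TensorRT engines and
-  *        execution contexts.
-  */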
- PointPillars::~PointPillars() {
- // for pillars
- GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
- GPU_CHECK(cudaFree(dev_x_coors_));
- GPU_CHECK(cudaFree(dev_y_coors_));
- GPU_CHECK(cudaFree(dev_pillar_point_feature_));
- GPU_CHECK(cudaFree(dev_pillar_coors_));
- // for sparse map
- GPU_CHECK(cudaFree(dev_sparse_pillar_map_));
- GPU_CHECK(cudaFree(dev_cumsum_along_x_));
- GPU_CHECK(cudaFree(dev_cumsum_along_y_));
- // for pfe forward
- GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
-
- GPU_CHECK(cudaFree(pfe_buffers_[0]));
- GPU_CHECK(cudaFree(pfe_buffers_[1]));
- GPU_CHECK(cudaFree(rpn_buffers_[0]));
- GPU_CHECK(cudaFree(rpn_buffers_[1]));
- GPU_CHECK(cudaFree(rpn_buffers_[2]));
- GPU_CHECK(cudaFree(rpn_buffers_[3]));
- GPU_CHECK(cudaFree(rpn_buffers_[4]));
- GPU_CHECK(cudaFree(rpn_buffers_[5]));
- GPU_CHECK(cudaFree(rpn_buffers_[6]));
- GPU_CHECK(cudaFree(rpn_buffers_[7]));
- pfe_context_->destroy();
- backbone_context_->destroy();
- pfe_engine_->destroy();
- backbone_engine_->destroy();
- // for post process
- GPU_CHECK(cudaFree(dev_scattered_feature_));
- GPU_CHECK(cudaFree(dev_filtered_box_));
- GPU_CHECK(cudaFree(dev_filtered_score_));
- GPU_CHECK(cudaFree(dev_filtered_label_));
- GPU_CHECK(cudaFree(dev_filtered_dir_));
- GPU_CHECK(cudaFree(dev_box_for_nms_));
- GPU_CHECK(cudaFree(dev_filter_count_));
- }
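- /**
-  * @brief Load voxelization, anchor, and NMS parameters from the
-  *        OpenPCDet-style YAML config and derive the secondary sizes
-  *        (BEV grid, anchor counts, RPN output sizes).
-  */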
- void PointPillars::InitParams()
- {
- YAML::Node params = YAML::LoadFile(pp_config_);
- kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
- kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
- kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
- kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
- kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
- kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
- kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
- kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
- kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
- kNumClass = params["CLASS_NAMES"].size();
- kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
- kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
- kNumPointFeature = 5; // [x, y, z, intensity, 0]
- kNumInputBoxFeature = 7;
- kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
- kBatchSize = 1;
- kNumIndsForScan = 1024;
- kNumThreads = 64;
- kNumBoxCorners = 8;
- kAnchorStrides = 4;
- kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
- kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();
- // Parameters for anchor initialization, adapted to OpenPCDet.
- kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
- for (int i = 0; i < static_cast<int>(kAnchorNames.size()); ++i)
- {
- kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
- kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
- kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
- kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
- }
- for (int idx_head = 0; idx_head < static_cast<int>(params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size()); ++idx_head)
- {
- int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
- std::vector<int> value;
- for (int i = 0; i < num_cls_per_head; ++i)
- {
- value.emplace_back(idx_head + i);
- }
- kMultiheadLabelMapping.emplace_back(value);
- }
- // Generate secondary parameters based on above.
- kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize); //512
- kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize); //512
- kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize); //1
- kRpnInputSize = 64 * kGridYSize * kGridXSize;
- kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides); // Width
- kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides); // Height
- kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass; // H * W * Ro * N = 196608
- kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2; //H * W * Ro = 32768
- kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
- kRpnClsOutputSize = kNumAnchor * kNumClass;
- kRpnDirOutputSize = kNumAnchor * 2;
- }
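- /**
-  * @brief Allocate the device buffers: pillar data, cumulative-sum maps,
-  *        TensorRT I/O bindings, and postprocess filter outputs.
-  */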
- void PointPillars::DeviceMemoryMalloc() {
- // for pillars
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float))); // M
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int))); // M
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int))); // M
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float))); // [M, m, 5] (kNumPointFeature)
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_), kMaxNumPillars * 4 * sizeof(float))); // [M , 4]
- // for sparse map
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int))); // [1024 , 1024]
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));// [1024 , 1024]
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
- kMaxNumPillars * kMaxNumPointsPerPillar *
- kNumGatherPointFeature * sizeof(float)));
- // for trt inference
- // create GPU buffers and a stream
- GPU_CHECK(
- cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar *
- kNumGatherPointFeature * sizeof(float)));
- GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[0], kRpnInputSize * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[1], kNumAnchorPerCls * sizeof(float))); //classes
- GPU_CHECK(cudaMalloc(&rpn_buffers_[2], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[3], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[4], kNumAnchorPerCls * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[5], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
- GPU_CHECK(cudaMalloc(&rpn_buffers_[6], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
-
- GPU_CHECK(cudaMalloc(&rpn_buffers_[7], kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float))); //boxes
- // for scatter kernel
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
- kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
- // for filter
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
- kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
- kNumAnchor * sizeof(float)));
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
- kNumAnchor * sizeof(int)));
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
- kNumAnchor * sizeof(int)));
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
- kNumAnchor * kNumBoxCorners * sizeof(float)));
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
- }
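- /**
-  * @brief Zero the per-frame device buffers before each inference call.
-  */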
- void PointPillars::SetDeviceMemoryToZero() {
- GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_x_coors_, 0, kMaxNumPillars * sizeof(int)));
- GPU_CHECK(cudaMemset(dev_y_coors_, 0, kMaxNumPillars * sizeof(int)));
- GPU_CHECK(cudaMemset(dev_pillar_point_feature_, 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_pillar_coors_, 0, kMaxNumPillars * 4 * sizeof(float)));
- // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_, 0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
- GPU_CHECK(cudaMemset(dev_pfe_gather_feature_, 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
-
- // GPU_CHECK(cudaMemset(pfe_buffers_[0], 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(pfe_buffers_[1], 0, kMaxNumPillars * 64 * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_scattered_feature_, 0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
-
- // GPU_CHECK(cudaMemset(rpn_buffers_[0], 0, kRpnInputSize * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[1], 0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[2], 0, kNumAnchorPerCls * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[3], 0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[4], 0, kNumAnchorPerCls * 4 * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[5], 0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[6], 0, kNumAnchorPerCls * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[7], 0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[8], 0, kNumAnchorPerCls * 4 * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[9], 0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
- // GPU_CHECK(cudaMemset(rpn_buffers_[10], 0, kNumAnchorPerCls * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_filtered_box_, 0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_filtered_score_, 0, kNumAnchor * sizeof(float)));
- GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
- }
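- /**
-  * @brief Create the PFE and backbone engines. With use_onnx, build them from
-  *        the ONNX files and cache serialized .trt copies next to them;
-  *        otherwise deserialize pre-built .trt engines directly.
-  */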
- void PointPillars::InitTRT(const bool use_onnx) {
- if (use_onnx) {
- // create a TensorRT model from the onnx model and load it into an engine
- OnnxToTRTModel(pfe_file_, &pfe_engine_);
- SaveEngine(pfe_engine_, pfe_file_.substr(0, pfe_file_.rfind('.')) + ".trt");
- OnnxToTRTModel(backbone_file_, &backbone_engine_);
- SaveEngine(backbone_engine_, backbone_file_.substr(0, backbone_file_.rfind('.')) + ".trt");
- }else {
- EngineToTRTModel(pfe_file_, &pfe_engine_);
- EngineToTRTModel(backbone_file_, &backbone_engine_);
- }
- if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
- std::cerr << "Failed to create TensorRT engine." << std::endl;
- exit(EXIT_FAILURE);
- }
- // create execution context from the engine
- pfe_context_ = pfe_engine_->createExecutionContext();
- backbone_context_ = backbone_engine_->createExecutionContext();
- if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
- std::cerr << "Failed to create TensorRT execution context." << std::endl;
- exit(EXIT_FAILURE);
- }
- }
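- /**
-  * @brief Parse an ONNX model and build a TensorRT engine from it
-  *        (explicit batch, FP16 enabled).
-  */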
- void PointPillars::OnnxToTRTModel(
- const std::string& model_file, // name of the onnx model
- nvinfer1::ICudaEngine** engine_ptr) {
- int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
- // create the builder
- const auto explicit_batch =
- static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
- nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
- nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
- nvinfer1::INetworkDefinition* network =
- builder->createNetworkV2(explicit_batch);
- // parse onnx model
- auto parser = nvonnxparser::createParser(*network, g_logger_);
- if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
- std::string msg("failed to parse onnx file");
- g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
- exit(EXIT_FAILURE);
- }
- // Build the engine
- builder->setMaxBatchSize(kBatchSize);
- builder->setHalf2Mode(true);
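- // Note: setMaxBatchSize / setHalf2Mode are deprecated in newer TensorRT
- // releases (config->setFlag(nvinfer1::BuilderFlag::kFP16) is the modern
- // FP16 switch); they are kept here for older TensorRT versions.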
- nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
- config->setMaxWorkspaceSize(1 << 25);
- nvinfer1::ICudaEngine* engine =
- builder->buildEngineWithConfig(*network, *config);
- *engine_ptr = engine;
- parser->destroy();
- network->destroy();
- config->destroy();
- builder->destroy();
- }
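- /**
-  * @brief Serialize a built engine to disk so later runs can skip the
-  *        ONNX parse and engine build.
-  */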
- void PointPillars::SaveEngine(const nvinfer1::ICudaEngine* engine, const std::string& engine_filepath)
- {
- // serialize the engine, then release the serialized blob once written
- nvinfer1::IHostMemory* trt_model_stream = engine->serialize();
- std::ofstream file(engine_filepath, std::ios::binary | std::ios::out);
- if (!file.is_open())
- {
- std::cerr << "Failed to open engine file " << engine_filepath << " for writing" << std::endl;
- trt_model_stream->destroy();
- return;
- }
- file.write(reinterpret_cast<const char*>(trt_model_stream->data()), std::streamsize(trt_model_stream->size()));
- file.close();
- trt_model_stream->destroy();
- }
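- /**
-  * @brief Read a serialized engine file from disk and deserialize it into a
-  *        ready-to-use ICudaEngine.
-  */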
- void PointPillars::EngineToTRTModel(
- const std::string &engine_file ,
- nvinfer1::ICudaEngine** engine_ptr) {
- int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);
- std::stringstream gieModelStream;
- gieModelStream.seekg(0, gieModelStream.beg);
- std::ifstream cache(engine_file, std::ios::binary); // engine files are binary data
- gieModelStream << cache.rdbuf();
- cache.close();
- nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_);
- if (runtime == nullptr) {
- std::string msg("failed to build runtime parser");
- g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
- exit(EXIT_FAILURE);
- }
- gieModelStream.seekg(0, std::ios::end);
- const int modelSize = gieModelStream.tellg();
- gieModelStream.seekg(0, std::ios::beg);
- void* modelMem = malloc(modelSize);
- gieModelStream.read((char*)modelMem, modelSize);
- std::cout << " "<< std::endl;
- std::cout << "------------------------------------------------------------------"<< std::endl;
- std::cout << ">>>> >>>>"<< std::endl;
- std::cout << " "<< std::endl;
- std::cout << "Input filename: " << engine_file << std::endl;
- std::cout << " "<< std::endl;
- std::cout << ">>>> >>>>"<< std::endl;
- std::cout << "------------------------------------------------------------------"<< std::endl;
- std::cout << " "<< std::endl;
- nvinfer1::ICudaEngine* engine = runtime->deserializeCudaEngine(modelMem, modelSize, NULL);
- free(modelMem); // the engine keeps its own copy, so the raw blob can be freed
- if (engine == nullptr) {
- std::string msg("failed to deserialize the engine file");
- g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
- exit(EXIT_FAILURE);
- }
- *engine_ptr = engine;
- }
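- /**
-  * @brief Run the full detection pipeline on a single point cloud:
-  *        preprocess -> PFE -> scatter -> backbone -> multihead postprocess.
-  * @param[in]  in_points_array  Host buffer of in_num_points points, each with
-  *                              kNumPointFeature floats.
-  * @param[in]  in_num_points    Number of input points.
-  * @param[out] out_detections   Flattened boxes (kNumOutputBoxFeature per box).
-  * @param[out] out_labels       Class label per box.
-  * @param[out] out_scores       Confidence score per box.
-  */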
- void PointPillars::DoInference(const float* in_points_array,
- const int in_num_points,
- std::vector<float>* out_detections,
- std::vector<int>* out_labels,
- std::vector<float>* out_scores)
- {
- SetDeviceMemoryToZero();
- cudaDeviceSynchronize();
- // [STEP 1] : load pointcloud
- float* dev_points;
- GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
- in_num_points * kNumPointFeature * sizeof(float))); // in_num_points , 5
- GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
- GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
- in_num_points * kNumPointFeature * sizeof(float),
- cudaMemcpyHostToDevice));
-
- // [STEP 2] : preprocess
- host_pillar_count_[0] = 0;
- auto preprocess_start = std::chrono::high_resolution_clock::now();
- preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
- dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
- dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
- dev_sparse_pillar_map_, host_pillar_count_ ,
- dev_pfe_gather_feature_ );
- cudaDeviceSynchronize();
- auto preprocess_end = std::chrono::high_resolution_clock::now();
- // DEVICE_SAVE<float>(dev_pfe_gather_feature_, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature , "0_Model_pfe_input_gather_feature");
- // [STEP 3] : pfe forward
- cudaStream_t stream;
- GPU_CHECK(cudaStreamCreate(&stream));
- auto pfe_start = std::chrono::high_resolution_clock::now();
- GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
- kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float),
- cudaMemcpyDeviceToDevice, stream));
- pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
- cudaDeviceSynchronize();
- auto pfe_end = std::chrono::high_resolution_clock::now();
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]), kMaxNumPillars * 64 , "1_Model_pfe_output_buffers_[1]");
- // [STEP 4] : scatter pillar feature
- auto scatter_start = std::chrono::high_resolution_clock::now();
- scatter_cuda_ptr_->DoScatterCuda(
- host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
- reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
- cudaDeviceSynchronize();
- auto scatter_end = std::chrono::high_resolution_clock::now();
- // DEVICE_SAVE<float>(dev_scattered_feature_ , kRpnInputSize,"2_Model_backbone_input_dev_scattered_feature");
- // [STEP 5] : backbone forward
- auto backbone_start = std::chrono::high_resolution_clock::now();
- GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
- kBatchSize * kRpnInputSize * sizeof(float),
- cudaMemcpyDeviceToDevice, stream));
- backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
- cudaDeviceSynchronize();
- auto backbone_end = std::chrono::high_resolution_clock::now();
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]) , kNumAnchorPerCls ,"3_rpn_buffers_[1]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]) , kNumAnchorPerCls * 4,"3_rpn_buffers_[2]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]) , kNumAnchorPerCls * 4,"3_rpn_buffers_[3]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]) , kNumAnchorPerCls ,"3_rpn_buffers_[4]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]) , kNumAnchorPerCls * 4,"3_rpn_buffers_[5]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]) , kNumAnchorPerCls * 4,"3_rpn_buffers_[6]");
- // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]) , kNumAnchorPerCls * kNumClass * 9 ,"3_rpn_buffers_[7]");
- // [STEP 6]: postprocess (multihead)
- auto postprocess_start = std::chrono::high_resolution_clock::now();
- GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
- postprocess_cuda_ptr_->DoPostprocessCuda(
- reinterpret_cast<float*>(rpn_buffers_[1]), // [cls] kNumAnchorPerCls
- reinterpret_cast<float*>(rpn_buffers_[2]), // [cls] kNumAnchorPerCls * 2 * 2
- reinterpret_cast<float*>(rpn_buffers_[3]), // [cls] kNumAnchorPerCls * 2 * 2
- reinterpret_cast<float*>(rpn_buffers_[4]), // [cls] kNumAnchorPerCls
- reinterpret_cast<float*>(rpn_buffers_[5]), // [cls] kNumAnchorPerCls * 2 * 2
- reinterpret_cast<float*>(rpn_buffers_[6]), // [cls] kNumAnchorPerCls * 2 * 2
- reinterpret_cast<float*>(rpn_buffers_[7]), // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
- dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
- *out_detections, *out_labels, *out_scores);
- cudaDeviceSynchronize();
- auto postprocess_end = std::chrono::high_resolution_clock::now();
- // release the stream and the buffers
- // std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
- // std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
- // std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
- // std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
- // std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
- // std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
- // std::cout << "------------------------------------" << std::endl;
- // std::cout << setiosflags(ios::left) << setw(14) << "Module" << setw(12) << "Time" << resetiosflags(ios::left) << std::endl;
- // std::cout << "------------------------------------" << std::endl;
- // std::string Modules[] = {"Preprocess" , "Pfe" , "Scatter" , "Backbone" , "Postprocess" , "Summary"};
- // double Times[] = {preprocess_cost.count() , pfe_cost.count() , scatter_cost.count() , backbone_cost.count() , postprocess_cost.count() , pointpillars_cost.count()};
- // for (int i =0 ; i < 6 ; ++i) {
- // std::cout << setiosflags(ios::left) << setw(14) << Modules[i] << setw(8) << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
- // }
- // std::cout << "------------------------------------" << std::endl;
- cudaStreamDestroy(stream);
- GPU_CHECK(cudaFree(dev_points)); // free the per-frame input buffer
- }
|