// pointpillars.cc

/******************************************************************************
 * Copyright 2020 The Apollo Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *****************************************************************************/

/*
 * Copyright 2018-2019 Autoware Foundation. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * @author Kosuke Murakami
 * @date 2019/02/26
 */

/**
 * @author Yan haixu
 * Contact: just github.com/hova88
 * @date 2021/04/30
 */
#include "pointpillars.h"

#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
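
// PointPillars wraps the full TensorRT inference pipeline for the OpenPCDet
// multi-head PointPillars model: preprocess (pillar generation) -> PFE
// (pillar feature encoder) -> scatter to a BEV pseudo-image -> RPN backbone ->
// multi-head postprocessing with per-class NMS.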
PointPillars::PointPillars(const float score_threshold,
                           const float nms_overlap_threshold,
                           const bool use_onnx,
                           const std::string pfe_file,
                           const std::string backbone_file,
                           const std::string pp_config)
    : score_threshold_(score_threshold),
      nms_overlap_threshold_(nms_overlap_threshold),
      use_onnx_(use_onnx),
      pfe_file_(pfe_file),
      backbone_file_(backbone_file),
      pp_config_(pp_config) {
  InitParams();
  InitTRT(use_onnx_);
  DeviceMemoryMalloc();

  preprocess_points_cuda_ptr_.reset(new PreprocessPointsCuda(
      kNumThreads,
      kMaxNumPillars,
      kMaxNumPointsPerPillar,
      kNumPointFeature,
      kNumIndsForScan,
      kGridXSize, kGridYSize, kGridZSize,
      kPillarXSize, kPillarYSize, kPillarZSize,
      kMinXRange, kMinYRange, kMinZRange));

  scatter_cuda_ptr_.reset(new ScatterCuda(kNumThreads, kGridXSize, kGridYSize));

  const float float_min = std::numeric_limits<float>::lowest();
  const float float_max = std::numeric_limits<float>::max();
  postprocess_cuda_ptr_.reset(
      new PostprocessCuda(kNumThreads,
                          float_min, float_max,
                          kNumClass, kNumAnchorPerCls,
                          kMultiheadLabelMapping,
                          score_threshold_,
                          nms_overlap_threshold_,
                          kNmsPreMaxsize,
                          kNmsPostMaxsize,
                          kNumBoxCorners,
                          kNumInputBoxFeature,
                          7 /* kNumOutputBoxFeature */));
}
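
// Minimal usage sketch (for illustration only; the file names, the point
// loader, and the thresholds below are assumptions, not values taken from
// this repository):
//
//   PointPillars pp(/*score_threshold=*/0.1f, /*nms_overlap_threshold=*/0.2f,
//                   /*use_onnx=*/true, "pfe.onnx", "backbone.onnx",
//                   "pointpillar.yaml");
//   std::vector<float> points = LoadPointsXYZI("frame.bin");  // hypothetical loader, 5 floats per point
//   std::vector<float> boxes;
//   std::vector<int> labels;
//   std::vector<float> scores;
//   pp.DoInference(points.data(),
//                  static_cast<int>(points.size()) / 5,  // kNumPointFeature == 5
//                  &boxes, &labels, &scores);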
PointPillars::~PointPillars() {
  // for pillars
  GPU_CHECK(cudaFree(dev_num_points_per_pillar_));
  GPU_CHECK(cudaFree(dev_x_coors_));
  GPU_CHECK(cudaFree(dev_y_coors_));
  GPU_CHECK(cudaFree(dev_pillar_point_feature_));
  GPU_CHECK(cudaFree(dev_pillar_coors_));
  // for sparse map
  GPU_CHECK(cudaFree(dev_sparse_pillar_map_));
  GPU_CHECK(cudaFree(dev_cumsum_along_x_));
  GPU_CHECK(cudaFree(dev_cumsum_along_y_));
  // for pfe forward
  GPU_CHECK(cudaFree(dev_pfe_gather_feature_));
  GPU_CHECK(cudaFree(pfe_buffers_[0]));
  GPU_CHECK(cudaFree(pfe_buffers_[1]));
  GPU_CHECK(cudaFree(rpn_buffers_[0]));
  GPU_CHECK(cudaFree(rpn_buffers_[1]));
  GPU_CHECK(cudaFree(rpn_buffers_[2]));
  GPU_CHECK(cudaFree(rpn_buffers_[3]));
  GPU_CHECK(cudaFree(rpn_buffers_[4]));
  GPU_CHECK(cudaFree(rpn_buffers_[5]));
  GPU_CHECK(cudaFree(rpn_buffers_[6]));
  GPU_CHECK(cudaFree(rpn_buffers_[7]));

  pfe_context_->destroy();
  backbone_context_->destroy();
  pfe_engine_->destroy();
  backbone_engine_->destroy();
  // for post process
  GPU_CHECK(cudaFree(dev_scattered_feature_));
  GPU_CHECK(cudaFree(dev_filtered_box_));
  GPU_CHECK(cudaFree(dev_filtered_score_));
  GPU_CHECK(cudaFree(dev_filtered_label_));
  GPU_CHECK(cudaFree(dev_filtered_dir_));
  GPU_CHECK(cudaFree(dev_box_for_nms_));
  GPU_CHECK(cudaFree(dev_filter_count_));
}
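
// InitParams() reads the OpenPCDet-style YAML config: voxel (pillar) size,
// point-cloud range, class names, anchor sizes and bottom heights, the
// multi-head class mapping, and NMS limits, then derives the BEV grid and
// anchor counts used for buffer sizing below.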
void PointPillars::InitParams() {
  YAML::Node params = YAML::LoadFile(pp_config_);
  kPillarXSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][0].as<float>();
  kPillarYSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][1].as<float>();
  kPillarZSize = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["VOXEL_SIZE"][2].as<float>();
  kMinXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][0].as<float>();
  kMinYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][1].as<float>();
  kMinZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][2].as<float>();
  kMaxXRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][3].as<float>();
  kMaxYRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][4].as<float>();
  kMaxZRange = params["DATA_CONFIG"]["POINT_CLOUD_RANGE"][5].as<float>();
  kNumClass = params["CLASS_NAMES"].size();
  kMaxNumPillars = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_NUMBER_OF_VOXELS"]["test"].as<int>();
  kMaxNumPointsPerPillar = params["DATA_CONFIG"]["DATA_PROCESSOR"][2]["MAX_POINTS_PER_VOXEL"].as<int>();
  kNumPointFeature = 5;  // [x, y, z, intensity, 0]
  kNumInputBoxFeature = 7;
  kNumOutputBoxFeature = params["MODEL"]["DENSE_HEAD"]["TARGET_ASSIGNER_CONFIG"]["BOX_CODER_CONFIG"]["code_size"].as<int>();
  kBatchSize = 1;
  kNumIndsForScan = 1024;
  kNumThreads = 64;
  kNumBoxCorners = 8;
  kAnchorStrides = 4;
  kNmsPreMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_PRE_MAXSIZE"].as<int>();
  kNmsPostMaxsize = params["MODEL"]["POST_PROCESSING"]["NMS_CONFIG"]["NMS_POST_MAXSIZE"].as<int>();

  // Params for initializing anchors, adapted to the OpenPCDet config layout.
  kAnchorNames = params["CLASS_NAMES"].as<std::vector<std::string>>();
  for (size_t i = 0; i < kAnchorNames.size(); ++i) {
    kAnchorDxSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][0].as<float>());
    kAnchorDySizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][1].as<float>());
    kAnchorDzSizes.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_sizes"][0][2].as<float>());
    kAnchorBottom.emplace_back(params["MODEL"]["DENSE_HEAD"]["ANCHOR_GENERATOR_CONFIG"][i]["anchor_bottom_heights"][0].as<float>());
  }
  for (int idx_head = 0; idx_head < params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"].size(); ++idx_head) {
    int num_cls_per_head = params["MODEL"]["DENSE_HEAD"]["RPN_HEAD_CFGS"][idx_head]["HEAD_CLS_NAME"].size();
    std::vector<int> value;
    for (int i = 0; i < num_cls_per_head; ++i) {
      value.emplace_back(idx_head + i);
    }
    kMultiheadLabelMapping.emplace_back(value);
  }

  // Generate secondary parameters based on the above.
  kGridXSize = static_cast<int>((kMaxXRange - kMinXRange) / kPillarXSize);  // e.g. 512
  kGridYSize = static_cast<int>((kMaxYRange - kMinYRange) / kPillarYSize);  // e.g. 512
  kGridZSize = static_cast<int>((kMaxZRange - kMinZRange) / kPillarZSize);  // e.g. 1
  kRpnInputSize = 64 * kGridYSize * kGridXSize;
  kNumAnchorXinds = static_cast<int>(kGridXSize / kAnchorStrides);  // width
  kNumAnchorYinds = static_cast<int>(kGridYSize / kAnchorStrides);  // height
  kNumAnchor = kNumAnchorXinds * kNumAnchorYinds * 2 * kNumClass;   // H * W * Ro * N = 196608
  kNumAnchorPerCls = kNumAnchorXinds * kNumAnchorYinds * 2;         // H * W * Ro = 32768
  kRpnBoxOutputSize = kNumAnchor * kNumOutputBoxFeature;
  kRpnClsOutputSize = kNumAnchor * kNumClass;
  kRpnDirOutputSize = kNumAnchor * 2;
}
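
// For reference, the config accessed above follows the OpenPCDet layout,
// roughly as sketched below (reconstructed from the keys read in
// InitParams(); the values shown are placeholders, not from a real config):
//
//   CLASS_NAMES: [...]
//   DATA_CONFIG:
//     POINT_CLOUD_RANGE: [x_min, y_min, z_min, x_max, y_max, z_max]
//     DATA_PROCESSOR:
//       - ...
//       - ...
//       - VOXEL_SIZE: [dx, dy, dz]
//         MAX_POINTS_PER_VOXEL: ...
//         MAX_NUMBER_OF_VOXELS: {train: ..., test: ...}
//   MODEL:
//     DENSE_HEAD:
//       ANCHOR_GENERATOR_CONFIG: [{anchor_sizes: [[dx, dy, dz]], anchor_bottom_heights: [h]}, ...]
//       RPN_HEAD_CFGS: [{HEAD_CLS_NAME: [...]}, ...]
//       TARGET_ASSIGNER_CONFIG: {BOX_CODER_CONFIG: {code_size: ...}}
//     POST_PROCESSING:
//       NMS_CONFIG: {NMS_PRE_MAXSIZE: ..., NMS_POST_MAXSIZE: ...}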
void PointPillars::DeviceMemoryMalloc() {
  // for pillars
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_num_points_per_pillar_), kMaxNumPillars * sizeof(float)));  // M
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_x_coors_), kMaxNumPillars * sizeof(int)));  // M
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_y_coors_), kMaxNumPillars * sizeof(int)));  // M
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_point_feature_), kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));  // [M, m, 5]
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pillar_coors_), kMaxNumPillars * 4 * sizeof(float)));  // [M, 4]
  // for sparse map
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sparse_pillar_map_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));  // [1024, 1024]
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_x_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));     // [1024, 1024]
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_cumsum_along_y_), kNumIndsForScan * kNumIndsForScan * sizeof(int)));     // [1024, 1024]
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_pfe_gather_feature_),
                       kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));

  // for TensorRT inference: create the GPU bindings
  GPU_CHECK(cudaMalloc(&pfe_buffers_[0], kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
  GPU_CHECK(cudaMalloc(&pfe_buffers_[1], kMaxNumPillars * 64 * sizeof(float)));

  GPU_CHECK(cudaMalloc(&rpn_buffers_[0], kRpnInputSize * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[1], kNumAnchorPerCls * sizeof(float)));  // classes
  GPU_CHECK(cudaMalloc(&rpn_buffers_[2], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[3], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[4], kNumAnchorPerCls * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[5], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[6], kNumAnchorPerCls * 2 * 2 * sizeof(float)));
  GPU_CHECK(cudaMalloc(&rpn_buffers_[7], kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature * sizeof(float)));  // boxes

  // for scatter kernel
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_scattered_feature_),
                       kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
  // for filter
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_box_),
                       kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_score_),
                       kNumAnchor * sizeof(float)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_label_),
                       kNumAnchor * sizeof(int)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filtered_dir_),
                       kNumAnchor * sizeof(int)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_box_for_nms_),
                       kNumAnchor * kNumBoxCorners * sizeof(float)));
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_filter_count_), kNumClass * sizeof(int)));
}
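
// Binding layout assumed by DoInference() (sizes mirror the cudaMalloc calls
// above): pfe_buffers_[0] holds the gathered point features and
// pfe_buffers_[1] the learned pillar features; rpn_buffers_[0] is the
// scattered BEV input, rpn_buffers_[1..6] are the per-head class score
// outputs, and rpn_buffers_[7] is the box regression output.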
void PointPillars::SetDeviceMemoryToZero() {
  GPU_CHECK(cudaMemset(dev_num_points_per_pillar_, 0, kMaxNumPillars * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_x_coors_, 0, kMaxNumPillars * sizeof(int)));
  GPU_CHECK(cudaMemset(dev_y_coors_, 0, kMaxNumPillars * sizeof(int)));
  GPU_CHECK(cudaMemset(dev_pillar_point_feature_, 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumPointFeature * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_pillar_coors_, 0, kMaxNumPillars * 4 * sizeof(float)));
  // GPU_CHECK(cudaMemset(dev_sparse_pillar_map_, 0, kNumIndsForScan * kNumIndsForScan * sizeof(int)));
  GPU_CHECK(cudaMemset(dev_pfe_gather_feature_, 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
  // GPU_CHECK(cudaMemset(pfe_buffers_[0], 0, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float)));
  // GPU_CHECK(cudaMemset(pfe_buffers_[1], 0, kMaxNumPillars * 64 * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_scattered_feature_, 0, kNumThreads * kGridYSize * kGridXSize * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[0], 0, kRpnInputSize * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[1], 0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[2], 0, kNumAnchorPerCls * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[3], 0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[4], 0, kNumAnchorPerCls * 4 * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[5], 0, kNumAnchorPerCls * kNumOutputBoxFeature * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[6], 0, kNumAnchorPerCls * sizeof(float)));
  // GPU_CHECK(cudaMemset(rpn_buffers_[7], 0, kNumAnchorPerCls * 2 * kNumOutputBoxFeature * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_filtered_box_, 0, kNumAnchor * kNumOutputBoxFeature * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_filtered_score_, 0, kNumAnchor * sizeof(float)));
  GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
}
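
// InitTRT() either builds the PFE and backbone engines from ONNX (and caches
// them next to the .onnx files as .trt), or deserializes previously saved
// engine files, then creates one execution context per engine.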
void PointPillars::InitTRT(const bool use_onnx) {
  if (use_onnx) {
    // create TensorRT engines from the ONNX models and cache them as .trt files
    OnnxToTRTModel(pfe_file_, &pfe_engine_);
    SaveEngine(pfe_engine_, pfe_file_.substr(0, pfe_file_.find(".")) + ".trt");
    OnnxToTRTModel(backbone_file_, &backbone_engine_);
    SaveEngine(backbone_engine_, backbone_file_.substr(0, backbone_file_.find(".")) + ".trt");
  } else {
    EngineToTRTModel(pfe_file_, &pfe_engine_);
    EngineToTRTModel(backbone_file_, &backbone_engine_);
  }
  if (pfe_engine_ == nullptr || backbone_engine_ == nullptr) {
    std::cerr << "Failed to load the PFE or backbone engine." << std::endl;
    exit(EXIT_FAILURE);
  }
  // create execution contexts from the engines
  pfe_context_ = pfe_engine_->createExecutionContext();
  backbone_context_ = backbone_engine_->createExecutionContext();
  if (pfe_context_ == nullptr || backbone_context_ == nullptr) {
    std::cerr << "Failed to create TensorRT execution contexts." << std::endl;
    exit(EXIT_FAILURE);
  }
}
void PointPillars::OnnxToTRTModel(
    const std::string& model_file,  // name of the onnx model
    nvinfer1::ICudaEngine** engine_ptr) {
  int verbosity = static_cast<int>(nvinfer1::ILogger::Severity::kWARNING);

  // create the builder
  const auto explicit_batch =
      static_cast<uint32_t>(kBatchSize) << static_cast<uint32_t>(
          nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
  nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(g_logger_);
  nvinfer1::INetworkDefinition* network =
      builder->createNetworkV2(explicit_batch);

  // parse the onnx model
  auto parser = nvonnxparser::createParser(*network, g_logger_);
  if (!parser->parseFromFile(model_file.c_str(), verbosity)) {
    std::string msg("failed to parse onnx file");
    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
    exit(EXIT_FAILURE);
  }

  // build the engine
  builder->setMaxBatchSize(kBatchSize);
  builder->setHalf2Mode(true);
  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(1 << 25);
  nvinfer1::ICudaEngine* engine =
      builder->buildEngineWithConfig(*network, *config);
  *engine_ptr = engine;

  parser->destroy();
  network->destroy();
  config->destroy();
  builder->destroy();
}
void PointPillars::SaveEngine(const nvinfer1::ICudaEngine* engine,
                              const std::string& engine_filepath) {
  // serialize the engine and write it to disk
  nvinfer1::IHostMemory& trtModelStream = *(engine->serialize());
  std::ofstream file;
  file.open(engine_filepath, std::ios::binary | std::ios::out);
  if (!file.is_open()) {
    std::cerr << "Failed to create engine file: " << engine_filepath << std::endl;
    return;
  }
  file.write(reinterpret_cast<const char*>(trtModelStream.data()),
             std::streamsize(trtModelStream.size()));
  file.close();
  trtModelStream.destroy();
}
void PointPillars::EngineToTRTModel(const std::string& engine_file,
                                    nvinfer1::ICudaEngine** engine_ptr) {
  // read the serialized engine from disk
  std::stringstream gieModelStream;
  gieModelStream.seekg(0, gieModelStream.beg);
  std::ifstream cache(engine_file, std::ios::binary);
  gieModelStream << cache.rdbuf();
  cache.close();

  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(g_logger_);
  if (runtime == nullptr) {
    std::string msg("failed to create the TensorRT runtime");
    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
    exit(EXIT_FAILURE);
  }

  gieModelStream.seekg(0, std::ios::end);
  const int modelSize = gieModelStream.tellg();
  gieModelStream.seekg(0, std::ios::beg);
  void* modelMem = malloc(modelSize);
  gieModelStream.read(reinterpret_cast<char*>(modelMem), modelSize);

  std::cout << "------------------------------------------------------------------" << std::endl;
  std::cout << "Input filename: " << engine_file << std::endl;
  std::cout << "------------------------------------------------------------------" << std::endl;

  nvinfer1::ICudaEngine* engine =
      runtime->deserializeCudaEngine(modelMem, modelSize, nullptr);
  free(modelMem);
  if (engine == nullptr) {
    std::string msg("failed to deserialize the TensorRT engine");
    g_logger_.log(nvinfer1::ILogger::Severity::kERROR, msg.c_str());
    exit(EXIT_FAILURE);
  }
  *engine_ptr = engine;
}
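
// DoInference() runs one frame end to end: copy the point cloud to the GPU,
// voxelize it into pillars, run the PFE, scatter the pillar features onto the
// BEV canvas, run the RPN backbone, and decode/NMS the multi-head outputs
// into out_detections (kNumOutputBoxFeature floats per box), out_labels and
// out_scores.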
void PointPillars::DoInference(const float* in_points_array,
                               const int in_num_points,
                               std::vector<float>* out_detections,
                               std::vector<int>* out_labels,
                               std::vector<float>* out_scores) {
  SetDeviceMemoryToZero();
  cudaDeviceSynchronize();

  // [STEP 1] : load pointcloud
  float* dev_points;
  GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_points),
                       in_num_points * kNumPointFeature * sizeof(float)));  // [in_num_points, 5]
  GPU_CHECK(cudaMemset(dev_points, 0, in_num_points * kNumPointFeature * sizeof(float)));
  GPU_CHECK(cudaMemcpy(dev_points, in_points_array,
                       in_num_points * kNumPointFeature * sizeof(float),
                       cudaMemcpyHostToDevice));

  // [STEP 2] : preprocess
  host_pillar_count_[0] = 0;
  auto preprocess_start = std::chrono::high_resolution_clock::now();
  preprocess_points_cuda_ptr_->DoPreprocessPointsCuda(
      dev_points, in_num_points, dev_x_coors_, dev_y_coors_,
      dev_num_points_per_pillar_, dev_pillar_point_feature_, dev_pillar_coors_,
      dev_sparse_pillar_map_, host_pillar_count_,
      dev_pfe_gather_feature_);
  cudaDeviceSynchronize();
  auto preprocess_end = std::chrono::high_resolution_clock::now();
  // DEVICE_SAVE<float>(dev_pfe_gather_feature_, kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature, "0_Model_pfe_input_gather_feature");

  // [STEP 3] : pfe forward
  cudaStream_t stream;
  GPU_CHECK(cudaStreamCreate(&stream));
  auto pfe_start = std::chrono::high_resolution_clock::now();
  GPU_CHECK(cudaMemcpyAsync(pfe_buffers_[0], dev_pfe_gather_feature_,
                            kMaxNumPillars * kMaxNumPointsPerPillar * kNumGatherPointFeature * sizeof(float),
                            cudaMemcpyDeviceToDevice, stream));
  pfe_context_->enqueueV2(pfe_buffers_, stream, nullptr);
  cudaDeviceSynchronize();
  auto pfe_end = std::chrono::high_resolution_clock::now();
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(pfe_buffers_[1]), kMaxNumPillars * 64, "1_Model_pfe_output_buffers_[1]");

  // [STEP 4] : scatter pillar feature
  auto scatter_start = std::chrono::high_resolution_clock::now();
  scatter_cuda_ptr_->DoScatterCuda(
      host_pillar_count_[0], dev_x_coors_, dev_y_coors_,
      reinterpret_cast<float*>(pfe_buffers_[1]), dev_scattered_feature_);
  cudaDeviceSynchronize();
  auto scatter_end = std::chrono::high_resolution_clock::now();
  // DEVICE_SAVE<float>(dev_scattered_feature_, kRpnInputSize, "2_Model_backbone_input_dev_scattered_feature");

  // [STEP 5] : backbone forward
  auto backbone_start = std::chrono::high_resolution_clock::now();
  GPU_CHECK(cudaMemcpyAsync(rpn_buffers_[0], dev_scattered_feature_,
                            kBatchSize * kRpnInputSize * sizeof(float),
                            cudaMemcpyDeviceToDevice, stream));
  backbone_context_->enqueueV2(rpn_buffers_, stream, nullptr);
  cudaDeviceSynchronize();
  auto backbone_end = std::chrono::high_resolution_clock::now();
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[1]), kNumAnchorPerCls,     "3_rpn_buffers_[1]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[2]), kNumAnchorPerCls * 4, "3_rpn_buffers_[2]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[3]), kNumAnchorPerCls * 4, "3_rpn_buffers_[3]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[4]), kNumAnchorPerCls,     "3_rpn_buffers_[4]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[5]), kNumAnchorPerCls * 4, "3_rpn_buffers_[5]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[6]), kNumAnchorPerCls * 4, "3_rpn_buffers_[6]");
  // DEVICE_SAVE<float>(reinterpret_cast<float*>(rpn_buffers_[7]), kNumAnchorPerCls * kNumClass * 9, "3_rpn_buffers_[7]");

  // [STEP 6] : postprocess (multihead)
  auto postprocess_start = std::chrono::high_resolution_clock::now();
  GPU_CHECK(cudaMemset(dev_filter_count_, 0, kNumClass * sizeof(int)));
  postprocess_cuda_ptr_->DoPostprocessCuda(
      reinterpret_cast<float*>(rpn_buffers_[1]),  // [cls]   kNumAnchorPerCls
      reinterpret_cast<float*>(rpn_buffers_[2]),  // [cls]   kNumAnchorPerCls * 2 * 2
      reinterpret_cast<float*>(rpn_buffers_[3]),  // [cls]   kNumAnchorPerCls * 2 * 2
      reinterpret_cast<float*>(rpn_buffers_[4]),  // [cls]   kNumAnchorPerCls
      reinterpret_cast<float*>(rpn_buffers_[5]),  // [cls]   kNumAnchorPerCls * 2 * 2
      reinterpret_cast<float*>(rpn_buffers_[6]),  // [cls]   kNumAnchorPerCls * 2 * 2
      reinterpret_cast<float*>(rpn_buffers_[7]),  // [boxes] kNumAnchorPerCls * kNumClass * kNumOutputBoxFeature
      dev_filtered_box_, dev_filtered_score_, dev_filter_count_,
      *out_detections, *out_labels, *out_scores);
  cudaDeviceSynchronize();
  auto postprocess_end = std::chrono::high_resolution_clock::now();

  // std::chrono::duration<double> preprocess_cost = preprocess_end - preprocess_start;
  // std::chrono::duration<double> pfe_cost = pfe_end - pfe_start;
  // std::chrono::duration<double> scatter_cost = scatter_end - scatter_start;
  // std::chrono::duration<double> backbone_cost = backbone_end - backbone_start;
  // std::chrono::duration<double> postprocess_cost = postprocess_end - postprocess_start;
  // std::chrono::duration<double> pointpillars_cost = postprocess_end - preprocess_start;
  // std::cout << "------------------------------------" << std::endl;
  // std::cout << setiosflags(ios::left) << setw(14) << "Module" << setw(12) << "Time" << resetiosflags(ios::left) << std::endl;
  // std::cout << "------------------------------------" << std::endl;
  // std::string Modules[] = {"Preprocess", "Pfe", "Scatter", "Backbone", "Postprocess", "Summary"};
  // double Times[] = {preprocess_cost.count(), pfe_cost.count(), scatter_cost.count(), backbone_cost.count(), postprocess_cost.count(), pointpillars_cost.count()};
  // for (int i = 0; i < 6; ++i) {
  //   std::cout << setiosflags(ios::left) << setw(14) << Modules[i] << setw(8) << Times[i] * 1000 << " ms" << resetiosflags(ios::left) << std::endl;
  // }
  // std::cout << "------------------------------------" << std::endl;

  // release the stream and the per-frame point buffer
  GPU_CHECK(cudaFree(dev_points));
  cudaStreamDestroy(stream);
}