// postprocess_cuda.cu
  1. /******************************************************************************
  2. * Copyright 2020 The Apollo Authors. All Rights Reserved.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. *****************************************************************************/
  16. /*
  17. * Copyright 2018-2019 Autoware Foundation. All rights reserved.
  18. *
  19. * Licensed under the Apache License, Version 2.0 (the "License");
  20. * you may not use this file except in compliance with the License.
  21. * You may obtain a copy of the License at
  22. *
  23. * http://www.apache.org/licenses/LICENSE-2.0
  24. *
  25. * Unless required by applicable law or agreed to in writing, software
  26. * distributed under the License is distributed on an "AS IS" BASIS,
  27. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  28. * See the License for the specific language governing permissions and
  29. * limitations under the License.
  30. */
  31. // headers in CUDA
  32. #include <thrust/sort.h>
  33. // headers in local files
  34. #include "postprocess_cuda.h"
  35. namespace apollo {
  36. namespace perception {
  37. namespace lidar {
  38. __global__ void filter_kernel(
  39. const float* box_preds, const float* cls_preds, const float* dir_preds,
  40. const int* anchor_mask, const float* dev_anchors_px,
  41. const float* dev_anchors_py, const float* dev_anchors_pz,
  42. const float* dev_anchors_dx, const float* dev_anchors_dy,
  43. const float* dev_anchors_dz, const float* dev_anchors_ro,
  44. float* filtered_box, float* filtered_score, int* filtered_label,
  45. int* filtered_dir, float* box_for_nms, int* filter_count,
  46. const float float_min, const float float_max, const float score_threshold,
  47. const int num_box_corners, const int num_output_box_feature,
  48. const int num_class) {
  49. // boxes ([N, 7] Tensor): normal boxes: x, y, z, w, l, h, r
  50. int tid = threadIdx.x + blockIdx.x * blockDim.x;
  51. // sigmoid function
  52. float top_score = 0;
  53. int top_label = 0;
  54. for (int i = 0; i < num_class; ++i) {
  55. float score = 1 / (1 + expf(-cls_preds[tid * num_class + i]));
  56. if (score > top_score) {
  57. top_score = score;
  58. top_label = i;
  59. }
  60. }
  61. if (anchor_mask[tid] == 1 && top_score > score_threshold) {
  62. int counter = atomicAdd(filter_count, 1);
  63. float za = dev_anchors_pz[tid] + dev_anchors_dz[tid] / 2;
  64. // decode network output
  65. float diagonal = sqrtf(dev_anchors_dx[tid] * dev_anchors_dx[tid] +
  66. dev_anchors_dy[tid] * dev_anchors_dy[tid]);
  67. float box_px = box_preds[tid * num_output_box_feature + 0] * diagonal +
  68. dev_anchors_px[tid];
  69. float box_py = box_preds[tid * num_output_box_feature + 1] * diagonal +
  70. dev_anchors_py[tid];
  71. float box_pz =
  72. box_preds[tid * num_output_box_feature + 2] * dev_anchors_dz[tid] + za;
  73. float box_dx =
  74. expf(box_preds[tid * num_output_box_feature + 3]) * dev_anchors_dx[tid];
  75. float box_dy =
  76. expf(box_preds[tid * num_output_box_feature + 4]) * dev_anchors_dy[tid];
  77. float box_dz =
  78. expf(box_preds[tid * num_output_box_feature + 5]) * dev_anchors_dz[tid];
  79. float box_ro =
  80. box_preds[tid * num_output_box_feature + 6] + dev_anchors_ro[tid];
  81. box_pz = box_pz - box_dz / 2;
  82. filtered_box[counter * num_output_box_feature + 0] = box_px;
  83. filtered_box[counter * num_output_box_feature + 1] = box_py;
  84. filtered_box[counter * num_output_box_feature + 2] = box_pz;
  85. filtered_box[counter * num_output_box_feature + 3] = box_dx;
  86. filtered_box[counter * num_output_box_feature + 4] = box_dy;
  87. filtered_box[counter * num_output_box_feature + 5] = box_dz;
  88. filtered_box[counter * num_output_box_feature + 6] = box_ro;
  89. filtered_score[counter] = top_score;
  90. filtered_label[counter] = top_label;
  91. int direction_label;
  92. if (dir_preds[tid * 2 + 0] < dir_preds[tid * 2 + 1]) {
  93. direction_label = 1;
  94. } else {
  95. direction_label = 0;
  96. }
  97. filtered_dir[counter] = direction_label;
  98. // convrt normal box(normal boxes: x, y, z, w, l, h, r) to box(xmin, ymin,
  99. // xmax, ymax) for nms calculation First: dx, dy -> box(x0y0, x0y1, x1y0,
  100. // x1y1)
  101. float corners[NUM_3D_BOX_CORNERS_MACRO] = {
  102. static_cast<float>(-0.5 * box_dx), static_cast<float>(-0.5 * box_dy),
  103. static_cast<float>(-0.5 * box_dx), static_cast<float>(0.5 * box_dy),
  104. static_cast<float>(0.5 * box_dx), static_cast<float>(0.5 * box_dy),
  105. static_cast<float>(0.5 * box_dx), static_cast<float>(-0.5 * box_dy)};
  106. // Second: Rotate, Offset and convert to point(xmin. ymin, xmax, ymax)
  107. float rotated_corners[NUM_3D_BOX_CORNERS_MACRO];
  108. float offset_corners[NUM_3D_BOX_CORNERS_MACRO];
  109. float sin_yaw = sinf(box_ro);
  110. float cos_yaw = cosf(box_ro);
  111. float xmin = float_max;
  112. float ymin = float_max;
  113. float xmax = float_min;
  114. float ymax = float_min;
  115. for (size_t i = 0; i < num_box_corners; ++i) {
  116. rotated_corners[i * 2 + 0] =
  117. cos_yaw * corners[i * 2 + 0] - sin_yaw * corners[i * 2 + 1];
  118. rotated_corners[i * 2 + 1] =
  119. sin_yaw * corners[i * 2 + 0] + cos_yaw * corners[i * 2 + 1];
  120. offset_corners[i * 2 + 0] = rotated_corners[i * 2 + 0] + box_px;
  121. offset_corners[i * 2 + 1] = rotated_corners[i * 2 + 1] + box_py;
  122. xmin = fminf(xmin, offset_corners[i * 2 + 0]);
  123. ymin = fminf(ymin, offset_corners[i * 2 + 1]);
  124. xmax = fmaxf(xmin, offset_corners[i * 2 + 0]);
  125. ymax = fmaxf(ymax, offset_corners[i * 2 + 1]);
  126. }
  127. // box_for_nms(num_box, 4)
  128. box_for_nms[counter * num_box_corners + 0] = xmin;
  129. box_for_nms[counter * num_box_corners + 1] = ymin;
  130. box_for_nms[counter * num_box_corners + 2] = xmax;
  131. box_for_nms[counter * num_box_corners + 3] = ymax;
  132. }
  133. }
  134. __global__ void sort_boxes_by_indexes_kernel(
  135. float* filtered_box, int* filtered_label, int* filtered_dir,
  136. float* box_for_nms, int* indexes, int filter_count,
  137. float* sorted_filtered_boxes, int* sorted_filtered_label,
  138. int* sorted_filtered_dir, float* sorted_box_for_nms,
  139. const int num_box_corners, const int num_output_box_feature) {
  140. int tid = threadIdx.x + blockIdx.x * blockDim.x;
  141. if (tid < filter_count) {
  142. int sort_index = indexes[tid];
  143. sorted_filtered_boxes[tid * num_output_box_feature + 0] =
  144. filtered_box[sort_index * num_output_box_feature + 0];
  145. sorted_filtered_boxes[tid * num_output_box_feature + 1] =
  146. filtered_box[sort_index * num_output_box_feature + 1];
  147. sorted_filtered_boxes[tid * num_output_box_feature + 2] =
  148. filtered_box[sort_index * num_output_box_feature + 2];
  149. sorted_filtered_boxes[tid * num_output_box_feature + 3] =
  150. filtered_box[sort_index * num_output_box_feature + 3];
  151. sorted_filtered_boxes[tid * num_output_box_feature + 4] =
  152. filtered_box[sort_index * num_output_box_feature + 4];
  153. sorted_filtered_boxes[tid * num_output_box_feature + 5] =
  154. filtered_box[sort_index * num_output_box_feature + 5];
  155. sorted_filtered_boxes[tid * num_output_box_feature + 6] =
  156. filtered_box[sort_index * num_output_box_feature + 6];
  157. sorted_filtered_label[tid] = filtered_label[sort_index];
  158. sorted_filtered_dir[tid] = filtered_dir[sort_index];
  159. sorted_box_for_nms[tid * num_box_corners + 0] =
  160. box_for_nms[sort_index * num_box_corners + 0];
  161. sorted_box_for_nms[tid * num_box_corners + 1] =
  162. box_for_nms[sort_index * num_box_corners + 1];
  163. sorted_box_for_nms[tid * num_box_corners + 2] =
  164. box_for_nms[sort_index * num_box_corners + 2];
  165. sorted_box_for_nms[tid * num_box_corners + 3] =
  166. box_for_nms[sort_index * num_box_corners + 3];
  167. }
  168. }
// Constructs the post-processing stage, caching all launch/threshold
// configuration and creating the GPU NMS helper.
//
// @param float_min / float_max  sentinel values used to initialize the
//        min/max corner search in filter_kernel
// @param num_anchor             total anchor count (sizes the filter grid)
// @param num_class              number of classification outputs per anchor
// @param score_threshold        minimum sigmoid score to keep a detection
// @param num_threads            threads per block for all kernel launches
// @param nms_overlap_threshold  IoU threshold forwarded to NmsCuda
// @param num_box_corners        values per NMS bound (stride of box_for_nms)
// @param num_output_box_feature values per decoded box (stride of boxes)
PostprocessCuda::PostprocessCuda(const float float_min, const float float_max,
                                 const int num_anchor, const int num_class,
                                 const float score_threshold,
                                 const int num_threads,
                                 const float nms_overlap_threshold,
                                 const int num_box_corners,
                                 const int num_output_box_feature)
    : float_min_(float_min),
      float_max_(float_max),
      num_anchor_(num_anchor),
      num_class_(num_class),
      score_threshold_(score_threshold),
      num_threads_(num_threads),
      nms_overlap_threshold_(nms_overlap_threshold),
      num_box_corners_(num_box_corners),
      num_output_box_feature_(num_output_box_feature) {
  // NMS also runs on the GPU; it shares the thread/corner configuration.
  nms_cuda_ptr_.reset(
      new NmsCuda(num_threads, num_box_corners, nms_overlap_threshold));
}
  188. void PostprocessCuda::DoPostprocessCuda(
  189. const float* rpn_box_output, const float* rpn_cls_output,
  190. const float* rpn_dir_output, int* dev_anchor_mask,
  191. const float* dev_anchors_px, const float* dev_anchors_py,
  192. const float* dev_anchors_pz, const float* dev_anchors_dx,
  193. const float* dev_anchors_dy, const float* dev_anchors_dz,
  194. const float* dev_anchors_ro, float* dev_filtered_box,
  195. float* dev_filtered_score, int* dev_filtered_label, int* dev_filtered_dir,
  196. float* dev_box_for_nms, int* dev_filter_count,
  197. std::vector<float>* out_detection, std::vector<int>* out_label) {
  198. const int num_blocks_filter_kernel = DIVUP(num_anchor_, num_threads_);
  199. filter_kernel<<<num_blocks_filter_kernel, num_threads_>>>(
  200. rpn_box_output, rpn_cls_output, rpn_dir_output, dev_anchor_mask,
  201. dev_anchors_px, dev_anchors_py, dev_anchors_pz, dev_anchors_dx,
  202. dev_anchors_dy, dev_anchors_dz, dev_anchors_ro, dev_filtered_box,
  203. dev_filtered_score, dev_filtered_label, dev_filtered_dir, dev_box_for_nms,
  204. dev_filter_count, float_min_, float_max_, score_threshold_,
  205. num_box_corners_, num_output_box_feature_, num_class_);
  206. int host_filter_count[1] = {0};
  207. GPU_CHECK(cudaMemcpy(host_filter_count, dev_filter_count, sizeof(int),
  208. cudaMemcpyDeviceToHost));
  209. if (host_filter_count[0] == 0) {
  210. return;
  211. }
  212. int* dev_indexes;
  213. float *dev_sorted_filtered_box, *dev_sorted_box_for_nms;
  214. int *dev_sorted_filtered_label, *dev_sorted_filtered_dir;
  215. GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_indexes),
  216. host_filter_count[0] * sizeof(int)));
  217. GPU_CHECK(cudaMalloc(
  218. reinterpret_cast<void**>(&dev_sorted_filtered_box),
  219. num_output_box_feature_ * host_filter_count[0] * sizeof(float)));
  220. GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sorted_filtered_label),
  221. host_filter_count[0] * sizeof(int)));
  222. GPU_CHECK(cudaMalloc(reinterpret_cast<void**>(&dev_sorted_filtered_dir),
  223. host_filter_count[0] * sizeof(int)));
  224. GPU_CHECK(
  225. cudaMalloc(reinterpret_cast<void**>(&dev_sorted_box_for_nms),
  226. num_box_corners_ * host_filter_count[0] * sizeof(float)));
  227. thrust::sequence(thrust::device, dev_indexes,
  228. dev_indexes + host_filter_count[0]);
  229. thrust::sort_by_key(thrust::device, dev_filtered_score,
  230. dev_filtered_score + size_t(host_filter_count[0]),
  231. dev_indexes, thrust::greater<float>());
  232. const int num_blocks = DIVUP(host_filter_count[0], num_threads_);
  233. sort_boxes_by_indexes_kernel<<<num_blocks, num_threads_>>>(
  234. dev_filtered_box, dev_filtered_label, dev_filtered_dir, dev_box_for_nms,
  235. dev_indexes, host_filter_count[0], dev_sorted_filtered_box,
  236. dev_sorted_filtered_label, dev_sorted_filtered_dir,
  237. dev_sorted_box_for_nms, num_box_corners_, num_output_box_feature_);
  238. int keep_inds[host_filter_count[0]];
  239. memset(keep_inds, 0, host_filter_count[0] * sizeof(int));
  240. int out_num_objects = 0;
  241. nms_cuda_ptr_->DoNmsCuda(host_filter_count[0], dev_sorted_box_for_nms,
  242. keep_inds, &out_num_objects);
  243. float host_filtered_box[host_filter_count[0] * num_output_box_feature_];
  244. int host_filtered_label[host_filter_count[0]];
  245. int host_filtered_dir[host_filter_count[0]];
  246. GPU_CHECK(
  247. cudaMemcpy(host_filtered_box, dev_sorted_filtered_box,
  248. num_output_box_feature_ * host_filter_count[0] * sizeof(float),
  249. cudaMemcpyDeviceToHost));
  250. GPU_CHECK(cudaMemcpy(host_filtered_label, dev_sorted_filtered_label,
  251. host_filter_count[0] * sizeof(int),
  252. cudaMemcpyDeviceToHost));
  253. GPU_CHECK(cudaMemcpy(host_filtered_dir, dev_sorted_filtered_dir,
  254. host_filter_count[0] * sizeof(int),
  255. cudaMemcpyDeviceToHost));
  256. for (size_t i = 0; i < out_num_objects; ++i) {
  257. out_detection->push_back(
  258. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 0]);
  259. out_detection->push_back(
  260. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 1]);
  261. out_detection->push_back(
  262. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 2]);
  263. out_detection->push_back(
  264. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 3]);
  265. out_detection->push_back(
  266. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 4]);
  267. out_detection->push_back(
  268. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 5]);
  269. if (host_filtered_dir[keep_inds[i]] == 0) {
  270. out_detection->push_back(
  271. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 6] + M_PI);
  272. } else {
  273. out_detection->push_back(
  274. host_filtered_box[keep_inds[i] * num_output_box_feature_ + 6]);
  275. }
  276. out_label->push_back(host_filtered_label[keep_inds[i]]);
  277. }
  278. GPU_CHECK(cudaFree(dev_indexes));
  279. GPU_CHECK(cudaFree(dev_sorted_filtered_box));
  280. GPU_CHECK(cudaFree(dev_sorted_filtered_label));
  281. GPU_CHECK(cudaFree(dev_sorted_filtered_dir));
  282. GPU_CHECK(cudaFree(dev_sorted_box_for_nms));
  283. }
  284. } // namespace lidar
  285. } // namespace perception
  286. } // namespace apollo