preprocess_kernel.cu
// Copyright 2024 TIER IV, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/*
 * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "cuda_utils.hpp"
#include "preprocess/preprocess_kernel.hpp"

#include <cstdint>

namespace autoware::lidar_transfusion
{

PreprocessCuda::PreprocessCuda(const TransfusionConfig & config, cudaStream_t & stream)
: stream_(stream), config_(config)
{
  mask_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_;
  voxels_size_ = config_.grid_z_size_ * config_.grid_y_size_ * config_.grid_x_size_ *
                   config_.max_num_points_per_pillar_ * config_.num_point_feature_size_ +
                 1;
  mask_ = cuda::make_unique<unsigned int[]>(mask_size_);
  voxels_ = cuda::make_unique<float[]>(voxels_size_);
}
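
// Runs the full voxelization pass on stream_: clears the per-voxel point counters (mask_) and
// the dense voxel buffer (voxels_), scatters the input points into voxels, then compacts the
// occupied voxels into the pillar feature / count / index outputs consumed by the network.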
void PreprocessCuda::generateVoxels(
  float * points, unsigned int points_size, unsigned int * pillar_num, float * voxel_features,
  unsigned int * voxel_num, unsigned int * voxel_idxs)
{
  cuda::clear_async(mask_.get(), mask_size_, stream_);
  cuda::clear_async(voxels_.get(), voxels_size_, stream_);
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));

  CHECK_CUDA_ERROR(generateVoxels_random_launch(points, points_size, mask_.get(), voxels_.get()));
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));

  CHECK_CUDA_ERROR(generateBaseFeatures_launch(
    mask_.get(), voxels_.get(), pillar_num, voxel_features, voxel_num, voxel_idxs));
  CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_));
}
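
// One thread per input point: range-gate the point, map it to its (x, y) pillar, reserve a slot
// with an atomic counter in `mask`, and store the 5 point features (x, y, z, intensity, time)
// into the dense `voxels` buffer, keeping at most `points_per_voxel` points per pillar.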
__global__ void generateVoxels_random_kernel(
  float * points, unsigned int points_size, float min_x_range, float max_x_range, float min_y_range,
  float max_y_range, float min_z_range, float max_z_range, float pillar_x_size, float pillar_y_size,
  float pillar_z_size, int grid_y_size, int grid_x_size, int points_per_voxel, unsigned int * mask,
  float * voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (point_idx >= points_size) return;

  float x = points[point_idx * 5];
  float y = points[point_idx * 5 + 1];
  float z = points[point_idx * 5 + 2];
  float i = points[point_idx * 5 + 3];
  float t = points[point_idx * 5 + 4];

  if (
    x <= min_x_range || x >= max_x_range || y <= min_y_range || y >= max_y_range ||
    z <= min_z_range || z >= max_z_range)
    return;

  int voxel_idx = floorf((x - min_x_range) / pillar_x_size);
  int voxel_idy = floorf((y - min_y_range) / pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;

  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
  if (point_id >= points_per_voxel) return;

  float * address = voxels + (voxel_index * points_per_voxel + point_id) * 5;
  atomicExch(address + 0, x);
  atomicExch(address + 1, y);
  atomicExch(address + 2, z);
  atomicExch(address + 3, i);
  atomicExch(address + 4, t);
}

cudaError_t PreprocessCuda::generateVoxels_random_launch(
  float * points, unsigned int points_size, unsigned int * mask, float * voxels)
{
  if (points_size == 0) {
    return cudaGetLastError();
  }
  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
  dim3 threads(config_.threads_for_voxel_);

  generateVoxels_random_kernel<<<blocks, threads, 0, stream_>>>(
    points, points_size, config_.min_x_range_, config_.max_x_range_, config_.min_y_range_,
    config_.max_y_range_, config_.min_z_range_, config_.max_z_range_, config_.voxel_x_size_,
    config_.voxel_y_size_, config_.voxel_z_size_, config_.grid_y_size_, config_.grid_x_size_,
    config_.points_per_voxel_, mask, voxels);
  cudaError_t err = cudaGetLastError();
  return err;
}
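
// One thread per grid cell: cells with a non-zero point count claim a pillar slot via atomicAdd
// on `pillar_num`, record their clamped point count and (y, x) grid coordinates as a uint4,
// and copy their points from the dense voxel buffer into the compacted `voxel_features` output.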
__global__ void generateBaseFeatures_kernel(
  unsigned int * mask, float * voxels, int grid_y_size, int grid_x_size, float points_per_voxel,
  float max_voxels, unsigned int * pillar_num, float * voxel_features, unsigned int * voxel_num,
  unsigned int * voxel_idxs)
{
  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;

  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
  unsigned int count = mask[voxel_index];
  if (!(count > 0)) return;
  count = count < points_per_voxel ? count : points_per_voxel;

  unsigned int current_pillarId = 0;
  current_pillarId = atomicAdd(pillar_num, 1);
  if (current_pillarId >= max_voxels) return;

  voxel_num[current_pillarId] = count;

  uint4 idx = {0, 0, voxel_idy, voxel_idx};
  ((uint4 *)voxel_idxs)[current_pillarId] = idx;

  for (int i = 0; i < count; i++) {
    int inIndex = voxel_index * points_per_voxel + i;
    int outIndex = current_pillarId * points_per_voxel + i;
    voxel_features[outIndex * 5] = voxels[inIndex * 5];
    voxel_features[outIndex * 5 + 1] = voxels[inIndex * 5 + 1];
    voxel_features[outIndex * 5 + 2] = voxels[inIndex * 5 + 2];
    voxel_features[outIndex * 5 + 3] = voxels[inIndex * 5 + 3];
    voxel_features[outIndex * 5 + 4] = voxels[inIndex * 5 + 4];
  }

  // clear buffer for next infer
  atomicExch(mask + voxel_index, 0);
}

// create 4 channels
cudaError_t PreprocessCuda::generateBaseFeatures_launch(
  unsigned int * mask, float * voxels, unsigned int * pillar_num, float * voxel_features,
  unsigned int * voxel_num, unsigned int * voxel_idxs)
{
  dim3 threads = {32, 32};
  dim3 blocks = {divup(config_.grid_x_size_, threads.x), divup(config_.grid_y_size_, threads.y)};

  generateBaseFeatures_kernel<<<blocks, threads, 0, stream_>>>(
    mask, voxels, config_.grid_y_size_, config_.grid_x_size_, config_.points_per_voxel_,
    config_.max_voxels_, pillar_num, voxel_features, voxel_num, voxel_idxs);
  cudaError_t err = cudaGetLastError();
  return err;
}
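
// One thread per raw point: reassembles the little-endian float32 x/y/z fields from the
// byte-packed input cloud and writes a 5-feature output point. Note that the transform_array /
// time_lag handling is currently disabled (kept as the commented-out block below); points are
// passed through untransformed and the timestamp feature is set to 0.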
__global__ void generateSweepPoints_kernel(
  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
  const float * transform_array, int num_features, float * output_points)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (point_idx >= points_size) return;

  union {
    uint32_t raw{0};
    float value;
  } input_x, input_y, input_z;

#pragma unroll
  for (int i = 0; i < 4; i++) {  // 4 bytes for float32
    input_x.raw |= input_data[point_idx * input_point_step + i] << i * 8;
    input_y.raw |= input_data[point_idx * input_point_step + i + 4] << i * 8;
    input_z.raw |= input_data[point_idx * input_point_step + i + 8] << i * 8;
  }
  float input_intensity = static_cast<float>(input_data[point_idx * input_point_step + 12]);

  // Transform + time_lag variant, kept for reference:
  // output_points[point_idx * num_features] =
  //   transform_array[0] * input_x.value + transform_array[4] * input_y.value +
  //   transform_array[8] * input_z.value + transform_array[12];
  // output_points[point_idx * num_features + 1] =
  //   transform_array[1] * input_x.value + transform_array[5] * input_y.value +
  //   transform_array[9] * input_z.value + transform_array[13];
  // output_points[point_idx * num_features + 2] =
  //   transform_array[2] * input_x.value + transform_array[6] * input_y.value +
  //   transform_array[10] * input_z.value + transform_array[14];
  // output_points[point_idx * num_features + 3] = input_intensity;
  // output_points[point_idx * num_features + 4] = time_lag;

  output_points[point_idx * num_features] = input_x.value;
  output_points[point_idx * num_features + 1] = input_y.value;
  output_points[point_idx * num_features + 2] = input_z.value;
  output_points[point_idx * num_features + 3] = input_intensity;
  output_points[point_idx * num_features + 4] = 0;
}

cudaError_t PreprocessCuda::generateSweepPoints_launch(
  const uint8_t * input_data, size_t points_size, int input_point_step, float time_lag,
  const float * transform_array, float * output_points)
{
  dim3 blocks(divup(points_size, config_.threads_for_voxel_));
  dim3 threads(config_.threads_for_voxel_);

  generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
    input_data, points_size, input_point_step, time_lag, transform_array,
    config_.num_point_feature_size_, output_points);
  cudaError_t err = cudaGetLastError();
  return err;
}

}  // namespace autoware::lidar_transfusion