preprocess_kernels.cu

/*
 * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "kernel.h"
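
// PointPillars-style point-cloud preprocessing:
//   generateVoxels_random_kernel  scatters raw points into per-pillar slots;
//   generateBaseFeatures_kernel   compacts non-empty pillars into dense
//                                 4-channel buffers;
//   generateFeatures_kernel       expands each point to the 10-channel feature input.
// generateVoxelsList_kernel / generateVoxels_kernel provide a two-pass
// alternative to the single-pass random voxelization.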

__global__ void generateVoxels_random_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (point_idx >= points_size) return;

  // Each input point is (x, y, z, intensity).
  float4 point = ((float4*)points)[point_idx];

  // Discard points outside the detection range.
  if (point.x < min_x_range || point.x >= max_x_range
      || point.y < min_y_range || point.y >= max_y_range
      || point.z < min_z_range || point.z >= max_z_range) return;

  // Map the point to its pillar (2D voxel) in the BEV grid.
  int voxel_idx = floorf((point.x - min_x_range) / pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range) / pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;

  // Claim a slot in the pillar; drop the point if the pillar is already full.
  unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
  if (point_id >= POINTS_PER_VOXEL) return;

  // Store via atomicExch so each component is written as an independent
  // 32-bit transaction.
  float *address = voxels + (voxel_index * POINTS_PER_VOXEL + point_id) * 4;
  atomicExch(address + 0, point.x);
  atomicExch(address + 1, point.y);
  atomicExch(address + 2, point.z);
  atomicExch(address + 3, point.w);
}

cudaError_t generateVoxels_random_launch(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, float *voxels,
        cudaStream_t stream)
{
  int threadNum = THREADS_FOR_VOXEL;
  dim3 blocks((points_size + threadNum - 1) / threadNum);
  dim3 threads(threadNum);
  generateVoxels_random_kernel<<<blocks, threads, 0, stream>>>
      (points, points_size,
       min_x_range, max_x_range,
       min_y_range, max_y_range,
       min_z_range, max_z_range,
       pillar_x_size, pillar_y_size, pillar_z_size,
       grid_y_size, grid_x_size,
       mask, voxels);
  cudaError_t err = cudaGetLastError();
  return err;
}
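
// Usage sketch (illustrative only; the buffer names and range/size values are
// assumptions, not defined in this file). The caller owns `mask` (one counter
// per pillar, zeroed before the first launch) and `voxels` (POINTS_PER_VOXEL
// float4 slots per pillar):
//
//   size_t num_cells = (size_t)grid_y_size * grid_x_size;
//   unsigned int *mask;  float *voxels;
//   cudaMalloc(&mask, num_cells * sizeof(unsigned int));
//   cudaMemset(mask, 0, num_cells * sizeof(unsigned int));
//   cudaMalloc(&voxels, num_cells * POINTS_PER_VOXEL * 4 * sizeof(float));
//   generateVoxels_random_launch(d_points, num_points,
//       min_x, max_x, min_y, max_y, min_z, max_z,
//       pillar_x, pillar_y, pillar_z, grid_y_size, grid_x_size,
//       mask, voxels, stream);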

// Two-pass alternative to generateVoxels_random_kernel. Pass 1: record each
// point's pillar index (or -1 if out of range) and count points per pillar.
__global__ void generateVoxelsList_kernel(float *points, size_t points_size,
        float min_x_range, float max_x_range,
        float min_y_range, float max_y_range,
        float min_z_range, float max_z_range,
        float pillar_x_size, float pillar_y_size, float pillar_z_size,
        int grid_y_size, int grid_x_size,
        unsigned int *mask, int *voxelsList)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (point_idx >= points_size) return;

  float4 point = ((float4*)points)[point_idx];
  if (point.x < min_x_range || point.x >= max_x_range
      || point.y < min_y_range || point.y >= max_y_range
      || point.z < min_z_range || point.z >= max_z_range)
  {
    voxelsList[point_idx] = -1;
    return;
  }

  int voxel_idx = floorf((point.x - min_x_range) / pillar_x_size);
  int voxel_idy = floorf((point.y - min_y_range) / pillar_y_size);
  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;

  atomicAdd(&(mask[voxel_index]), 1);
  voxelsList[point_idx] = voxel_index;
}

// Pass 2: scatter each in-range point into its pillar's slot array, using the
// pillar index precomputed in pass 1.
__global__ void generateVoxels_kernel(float *points, size_t points_size,
        int *voxelsList,
        unsigned int *mask, float *voxels)
{
  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (point_idx >= points_size) return;

  int voxel_index = voxelsList[point_idx];
  if (voxel_index == -1) return;

  int point_id = atomicAdd(&(mask[voxel_index]), 1);
  if (point_id >= POINTS_PER_VOXEL) return;

  float *address = voxels + (voxel_index * POINTS_PER_VOXEL + point_id) * 4;
  float4 point = ((float4*)points)[point_idx];
  atomicExch(address + 0, point.x);
  atomicExch(address + 1, point.y);
  atomicExch(address + 2, point.z);
  atomicExch(address + 3, point.w);
}
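
// Launch sketch for the two-pass path (hypothetical; no launch wrappers for
// these kernels appear in this file, and the mask reset between passes is an
// assumption, needed so pass 2's slot counters start from zero):
//
//   generateVoxelsList_kernel<<<blocks, threads, 0, stream>>>(d_points, num_points,
//       min_x, max_x, min_y, max_y, min_z, max_z,
//       pillar_x, pillar_y, pillar_z, grid_y_size, grid_x_size,
//       mask, voxelsList);
//   cudaMemsetAsync(mask, 0, num_cells * sizeof(unsigned int), stream);
//   generateVoxels_kernel<<<blocks, threads, 0, stream>>>(d_points, num_points,
//       voxelsList, mask, voxels);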

__global__ void generateBaseFeatures_kernel(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs)
{
  unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
  if (voxel_idx >= grid_x_size || voxel_idy >= grid_y_size) return;

  unsigned int voxel_index = voxel_idy * grid_x_size + voxel_idx;
  unsigned int count = mask[voxel_index];
  if (!(count > 0)) return;
  count = count < POINTS_PER_VOXEL ? count : POINTS_PER_VOXEL;

  // Claim the next dense output slot for this non-empty pillar.
  unsigned int current_pillarId = 0;
  current_pillarId = atomicAdd(pillar_num, 1);

  voxel_num[current_pillarId] = count;

  // Record the pillar's grid coordinates as (0, 0, y, x).
  uint4 idx = {0, 0, voxel_idy, voxel_idx};
  ((uint4*)voxel_idxs)[current_pillarId] = idx;

  // Copy the pillar's points from the sparse grid into the dense buffer.
  for (int i = 0; i < count; i++) {
    int inIndex = voxel_index * POINTS_PER_VOXEL + i;
    int outIndex = current_pillarId * POINTS_PER_VOXEL + i;
    ((float4*)voxel_features)[outIndex] = ((float4*)voxels)[inIndex];
  }

  // Clear the counter for the next inference.
  atomicExch(mask + voxel_index, 0);
}

// Compact non-empty pillars into dense buffers, 4 channels (x, y, z, w) per point.
cudaError_t generateBaseFeatures_launch(unsigned int *mask, float *voxels,
        int grid_y_size, int grid_x_size,
        unsigned int *pillar_num,
        float *voxel_features,
        unsigned int *voxel_num,
        unsigned int *voxel_idxs,
        cudaStream_t stream)
{
  dim3 threads = {32, 32};
  dim3 blocks = {(grid_x_size + threads.x - 1) / threads.x,
                 (grid_y_size + threads.y - 1) / threads.y};
  generateBaseFeatures_kernel<<<blocks, threads, 0, stream>>>
      (mask, voxels, grid_y_size, grid_x_size,
       pillar_num,
       voxel_features,
       voxel_num,
       voxel_idxs);
  cudaError_t err = cudaGetLastError();
  return err;
}
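
// Caller note (an inference from the atomicAdd above, not stated in this file):
// *pillar_num must be zeroed before each launch, since every non-empty pillar
// increments it to claim its dense output slot.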

// 4 channels -> 10 channels per point: (x, y, z, w), the offset from the
// pillar's point mean, and the offset from the pillar's geometric center.
__global__ void generateFeatures_kernel(float* voxel_features,
        unsigned int* voxel_num, unsigned int* voxel_idxs, unsigned int *params,
        float voxel_x, float voxel_y, float voxel_z,
        float range_min_x, float range_min_y, float range_min_z,
        float* features)
{
  // One warp per pillar; one lane per point slot within the pillar.
  int pillar_idx = blockIdx.x * WARPS_PER_BLOCK + threadIdx.x / WARP_SIZE;
  int point_idx = threadIdx.x % WARP_SIZE;
  int pillar_idx_inBlock = threadIdx.x / WARP_SIZE;
  unsigned int num_pillars = params[0];
  if (pillar_idx >= num_pillars) return;

  __shared__ float4 pillarSM[WARPS_PER_BLOCK][WARP_SIZE];   // raw points
  __shared__ float4 pillarSumSM[WARPS_PER_BLOCK];           // per-pillar coordinate sums
  __shared__ uint4 idxsSM[WARPS_PER_BLOCK];                 // pillar grid coordinates
  __shared__ int pointsNumSM[WARPS_PER_BLOCK];              // valid points per pillar
  __shared__ float pillarOutSM[WARPS_PER_BLOCK][WARP_SIZE][FEATURES_SIZE];

  // The first WARPS_PER_BLOCK threads stage the per-pillar metadata.
  if (threadIdx.x < WARPS_PER_BLOCK) {
    pointsNumSM[threadIdx.x] = voxel_num[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
    idxsSM[threadIdx.x] = ((uint4*)voxel_idxs)[blockIdx.x * WARPS_PER_BLOCK + threadIdx.x];
    pillarSumSM[threadIdx.x] = {0, 0, 0, 0};
  }

  pillarSM[pillar_idx_inBlock][point_idx] = ((float4*)voxel_features)[pillar_idx * WARP_SIZE + point_idx];
  __syncthreads();

  // Accumulate the per-pillar coordinate sum in shared memory.
  if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].x), pillarSM[pillar_idx_inBlock][point_idx].x);
    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].y), pillarSM[pillar_idx_inBlock][point_idx].y);
    atomicAdd(&(pillarSumSM[pillar_idx_inBlock].z), pillarSM[pillar_idx_inBlock][point_idx].z);
  }
  __syncthreads();

  // Feature: offset from the pillar mean (`mean` is reused to hold the offset).
  float4 mean;
  float validPoints = pointsNumSM[pillar_idx_inBlock];
  mean.x = pillarSumSM[pillar_idx_inBlock].x / validPoints;
  mean.y = pillarSumSM[pillar_idx_inBlock].y / validPoints;
  mean.z = pillarSumSM[pillar_idx_inBlock].z / validPoints;
  mean.x = pillarSM[pillar_idx_inBlock][point_idx].x - mean.x;
  mean.y = pillarSM[pillar_idx_inBlock][point_idx].y - mean.y;
  mean.z = pillarSM[pillar_idx_inBlock][point_idx].z - mean.z;

  // Pillar center; idxsSM holds (0, 0, y, x), so .w is the x index and .z the y index.
  float x_offset = voxel_x / 2 + idxsSM[pillar_idx_inBlock].w * voxel_x + range_min_x;
  float y_offset = voxel_y / 2 + idxsSM[pillar_idx_inBlock].z * voxel_y + range_min_y;
  float z_offset = voxel_z / 2 + idxsSM[pillar_idx_inBlock].y * voxel_z + range_min_z;

  // Feature: offset from the pillar center.
  float4 center;
  center.x = pillarSM[pillar_idx_inBlock][point_idx].x - x_offset;
  center.y = pillarSM[pillar_idx_inBlock][point_idx].y - y_offset;
  center.z = pillarSM[pillar_idx_inBlock][point_idx].z - z_offset;

  // Assemble the 10-channel output; zero-pad unused point slots.
  if (point_idx < pointsNumSM[pillar_idx_inBlock]) {
    pillarOutSM[pillar_idx_inBlock][point_idx][0] = pillarSM[pillar_idx_inBlock][point_idx].x;
    pillarOutSM[pillar_idx_inBlock][point_idx][1] = pillarSM[pillar_idx_inBlock][point_idx].y;
    pillarOutSM[pillar_idx_inBlock][point_idx][2] = pillarSM[pillar_idx_inBlock][point_idx].z;
    pillarOutSM[pillar_idx_inBlock][point_idx][3] = pillarSM[pillar_idx_inBlock][point_idx].w;
    pillarOutSM[pillar_idx_inBlock][point_idx][4] = mean.x;
    pillarOutSM[pillar_idx_inBlock][point_idx][5] = mean.y;
    pillarOutSM[pillar_idx_inBlock][point_idx][6] = mean.z;
    pillarOutSM[pillar_idx_inBlock][point_idx][7] = center.x;
    pillarOutSM[pillar_idx_inBlock][point_idx][8] = center.y;
    pillarOutSM[pillar_idx_inBlock][point_idx][9] = center.z;
  } else {
    for (int i = 0; i < FEATURES_SIZE; i++) {
      pillarOutSM[pillar_idx_inBlock][point_idx][i] = 0;
    }
  }
  __syncthreads();

  // Flat, warp-coalesced copy of the pillar's WARP_SIZE*FEATURES_SIZE floats
  // to global memory; each iteration writes 32 consecutive floats.
  for (int i = 0; i < FEATURES_SIZE; i++) {
    int outputSMId = pillar_idx_inBlock * WARP_SIZE * FEATURES_SIZE + i * WARP_SIZE + point_idx;
    int outputId = pillar_idx * WARP_SIZE * FEATURES_SIZE + i * WARP_SIZE + point_idx;
    features[outputId] = ((float*)pillarOutSM)[outputSMId];
  }
}

cudaError_t generateFeatures_launch(float* voxel_features,
        unsigned int * voxel_num,
        unsigned int* voxel_idxs,
        unsigned int *params,
        float voxel_x, float voxel_y, float voxel_z,
        float range_min_x, float range_min_y, float range_min_z,
        float* features,
        cudaStream_t stream)
{
  // Launch enough warps to cover the maximum pillar count; the kernel exits
  // early for pillar slots beyond params[0].
  dim3 blocks((MAX_VOXELS + WARPS_PER_BLOCK - 1) / WARPS_PER_BLOCK);
  dim3 threads(WARPS_PER_BLOCK * WARP_SIZE);
  generateFeatures_kernel<<<blocks, threads, 0, stream>>>
      (voxel_features,
       voxel_num,
       voxel_idxs,
       params,
       voxel_x, voxel_y, voxel_z,
       range_min_x, range_min_y, range_min_z,
       features);
  cudaError_t err = cudaGetLastError();
  return err;
}
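
// End-to-end usage sketch (hypothetical host code; variable names and buffer
// sizing are assumptions, not defined in this file). Passing pillar_num as
// `params` assumes params[0] is read as the pillar count, which matches the
// kernel above:
//
//   cudaMemsetAsync(pillar_num, 0, sizeof(unsigned int), stream);
//   generateVoxels_random_launch(d_points, num_points,
//       min_x, max_x, min_y, max_y, min_z, max_z,
//       pillar_x, pillar_y, pillar_z, grid_y, grid_x, mask, voxels, stream);
//   generateBaseFeatures_launch(mask, voxels, grid_y, grid_x,
//       pillar_num, voxel_features, voxel_num, voxel_idxs, stream);
//   generateFeatures_launch(voxel_features, voxel_num, voxel_idxs, pillar_num,
//       pillar_x, pillar_y, pillar_z, min_x, min_y, min_z, features, stream);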