// kernel.cu — CUDA pre-processing (ImageNet mean/BGR planar repack, ROI crop) and softmax kernels
  1. /*
  2. * http://github.com/dusty-nv/jetson-inference
  3. */
  4. #include "cuda/cudaUtility.h"
  5. #include <iostream>
  6. // gpuPreImageNet
  7. __global__ void gpuPreImageNet( float2 scale, float4* input, int iWidth, float* output, int oWidth, int oHeight )
  8. {
  9. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  10. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  11. const int n = oWidth * oHeight;
  12. if( x >= oWidth || y >= oHeight )
  13. return;
  14. const int dx = ((float)x * scale.x);
  15. const int dy = ((float)y * scale.y);
  16. const float4 px = input[ dy * iWidth + dx ];
  17. const float3 bgr = make_float3(px.z, px.y, px.x);
  18. output[n * 0 + y * oWidth + x] = bgr.x;
  19. output[n * 1 + y * oWidth + x] = bgr.y;
  20. output[n * 2 + y * oWidth + x] = bgr.z;
  21. }
  22. // cudaPreImageNet
  23. cudaError_t cudaPreImageNet( float4* input, size_t inputWidth, size_t inputHeight,
  24. float* output, size_t outputWidth, size_t outputHeight )
  25. {
  26. if( !input || !output )
  27. return cudaErrorInvalidDevicePointer;
  28. if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 )
  29. return cudaErrorInvalidValue;
  30. const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
  31. float(inputHeight) / float(outputHeight) );
  32. // launch kernel
  33. const dim3 blockDim(8, 8);
  34. const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
  35. gpuPreImageNet<<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight);
  36. return CUDA(cudaGetLastError());
  37. }
  38. // gpuPreImageNetMean
  39. __global__ void gpuPreImageNetMean( float2 scale, float3* input, int iWidth, float* output, int oWidth, int oHeight, float3 mean_value )
  40. {
  41. const int x = blockIdx.x * blockDim.x + threadIdx.x;
  42. const int y = blockIdx.y * blockDim.y + threadIdx.y;
  43. const int n = oWidth * oHeight;
  44. if( x >= oWidth || y >= oHeight )
  45. return;
  46. const int dx = ((float)x * scale.x);
  47. const int dy = ((float)y * scale.y);
  48. const float3 px = input[ dy * iWidth + dx ];
  49. const float3 bgr = make_float3(px.z - mean_value.x, px.y - mean_value.y, px.x - mean_value.z);
  50. output[n * 0 + y * oWidth + x] = bgr.x;
  51. output[n * 1 + y * oWidth + x] = bgr.y;
  52. output[n * 2 + y * oWidth + x] = bgr.z;
  53. }
  54. // cudaPreImageNetMean
  55. cudaError_t cudaPreImageNetMean( float3* input, size_t inputWidth, size_t inputHeight,
  56. float* output, size_t outputWidth, size_t outputHeight, const float3& mean_value )
  57. {
  58. if( !input || !output ){
  59. std::cout << "error here. "<< std::endl;
  60. return cudaErrorInvalidDevicePointer;
  61. }
  62. if( inputWidth == 0 || outputWidth == 0 || inputHeight == 0 || outputHeight == 0 ){
  63. std::cout << "Or here. " << std::endl;
  64. return cudaErrorInvalidValue;
  65. }
  66. const float2 scale = make_float2( float(inputWidth) / float(outputWidth),
  67. float(inputHeight) / float(outputHeight) );
  68. // launch kernel
  69. const dim3 blockDim(8, 8);
  70. const dim3 gridDim(iDivUp(outputWidth,blockDim.x), iDivUp(outputHeight,blockDim.y));
  71. gpuPreImageNetMean<<<gridDim, blockDim>>>(scale, input, inputWidth, output, outputWidth, outputHeight, mean_value);
  72. return CUDA(cudaGetLastError());
  73. }
// Crop a bounding box from a planar (CHW) float image and resample it to
// output_w x output_h with bilinear interpolation, rounding each channel value.
//
// Launch: 2D grid covering output_w x output_h, one thread per output pixel.
// in_plane_* / out_plane_* are element offsets of the R/G/B planes inside
// input/output. bbox_* is the source ROI in input pixel coordinates; the
// caller is expected to clamp it inside the source image (see convertROI).
// NOTE(review): `mean` is currently unused — the mean subtraction is
// commented out at the bottom of the kernel.
__global__ void kernel_extract_roi(float* input, float* output, char* mean,
const int input_w, const int output_w, const int output_h,
const int in_plane_r, const int in_plane_g, const int in_plane_b,
const int out_plane_r, const int out_plane_g, const int out_plane_b,
const int bbox_x, const int bbox_y, const int bbox_w, const int bbox_h)
{
uint x = blockIdx.x * blockDim.x + threadIdx.x;
uint y = blockIdx.y * blockDim.y + threadIdx.y;
// bounds guard: grid may overshoot the output extent
if( x < output_w && y < output_h)
{
// sub-pixel source coordinate of this output pixel inside the bbox
float r[2] = { float(x) * bbox_w / output_w + bbox_x,
float(y) * bbox_h / output_h + bbox_y };
// the four surrounding integer pixels: TL, TR, BL, BR
int pos[4][2] = { { int(floor(r[0])), int(floor(r[1])) },
{ int( ceil(r[0])), int(floor(r[1])) },
{ int(floor(r[0])), int(ceil(r[1])) },
{ int( ceil(r[0])), int(ceil(r[1])) } };
// fractional position within the 2x2 neighborhood
float u = r[0]-floor(r[0]);
float v = r[1]-floor(r[1]);
// bilinear weights for TL, TR, BL, BR (sum to 1)
float s[4] = { (1-u)*(1-v), u*(1-v), (1-u)*v, u*v };
// flat per-plane element index of each neighbor
int map[4] = { pos[0][1]*input_w + pos[0][0], pos[1][1]*input_w + pos[1][0],
pos[2][1]*input_w + pos[2][0], pos[3][1]*input_w + pos[3][0]};
int idx = y * output_w + x;
// weighted sum per channel, rounded; mean subtraction left disabled
output[idx+out_plane_r] = round( s[0]*input[map[0]+in_plane_r]
+ s[1]*input[map[1]+in_plane_r]
+ s[2]*input[map[2]+in_plane_r]
+ s[3]*input[map[3]+in_plane_r] );// float(mean[idx+out_plane_r]));
output[idx+out_plane_g] = round( s[0]*input[map[0]+in_plane_g]
+ s[1]*input[map[1]+in_plane_g]
+ s[2]*input[map[2]+in_plane_g]
+ s[3]*input[map[3]+in_plane_g] );//float(mean[idx+out_plane_g]));
output[idx+out_plane_b] = round( s[0]*input[map[0]+in_plane_b]
+ s[1]*input[map[1]+in_plane_b]
+ s[2]*input[map[2]+in_plane_b]
+ s[3]*input[map[3]+in_plane_b] );//float(mean[idx+out_plane_b]));
}
}
  110. void convertROI(float* input, float* output, char* mean, const int* srcSize, const int* dstSize, const int* roi, cudaStream_t stream)
  111. {
  112. int in_plane_r = 0;
  113. int in_plane_g = srcSize[1] * srcSize[2];
  114. int in_plane_b = srcSize[1] * srcSize[2] * 2;
  115. int out_plane_r = 0;
  116. int out_plane_g = dstSize[1] * dstSize[2];
  117. int out_plane_b = dstSize[1] * dstSize[2] * 2;
  118. int bbox_x = min(max(roi[0], 0), srcSize[2]-1);
  119. int bbox_y = min(max(roi[1], 0), srcSize[1]-1);
  120. int bbox_w = min(max(roi[2]-roi[0], 0), srcSize[2]-bbox_x-1 );
  121. int bbox_h = min(max(roi[3]-roi[1], 0), srcSize[1]-bbox_y-1 );
  122. dim3 dimBlock(32,32);
  123. dim3 dimGrid(dstSize[2]/dimBlock.x+1, dstSize[1]/dimBlock.y+1);
  124. std::cout << "ROI: " << bbox_x << " " << bbox_y << " " << bbox_w << " " << bbox_h << std::endl;
  125. kernel_extract_roi <<< dimGrid, dimBlock, 0, stream >>> (input, output, mean,
  126. srcSize[2], dstSize[2], dstSize[1],
  127. in_plane_r, in_plane_g, in_plane_b,
  128. out_plane_r, out_plane_g, out_plane_b,
  129. bbox_x, bbox_y, bbox_w, bbox_h);
  130. }
  131. __global__ void kernelSoftmax( float* x, int channels, float* y)
  132. {
  133. extern __shared__ float mem[];
  134. __shared__ float sum_value;
  135. sum_value=0;
  136. float number = *(x + blockDim.x*blockIdx.x + threadIdx.x);
  137. float number_exp = __expf(number);
  138. // sum_value += number_exp ;
  139. /* *
  140. * @TODO: Can do with the help of atomicAdd.
  141. * */
  142. atomicAdd(&sum_value, number_exp);
  143. __syncthreads();
  144. // mem[threadIdx.x] = number_exp;
  145. /* *
  146. * @TODO: Can do with the help of a for loop. Try different methods and find the time taken.
  147. * */
  148. // float sum = 0.0f;
  149. // for (int i=0;i<channels;i++)
  150. // {
  151. // sum += mem[i];
  152. // }
  153. y[blockDim.x*blockIdx.x + threadIdx.x] = __fdiv_rd(number_exp, sum_value);
  154. }
  155. void cudaSoftmax(int n, int channels, float* x, float*y)
  156. {
  157. kernelSoftmax<<< (n/channels), channels, channels*sizeof(float)>>>( x, channels, y);
  158. cudaDeviceSynchronize();
  159. }