// hdr_synthesis_kernel.cu — CUDA kernels for HDR exposure fusion:
// per-pixel weight map, weighted two-image merge, and Laplacian-pyramid add/sub.
  1. #include "hdr_synthesis.h"
  2. #include "hdr_synthesis_priv.h"
  3. static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
  4. static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
  5. static constexpr float smooth_coff = 1e-6;
  6. static constexpr float u8_to_f32_coff = 1.0f / 255;
  7. static constexpr float f32_to_u8_coff = 255;
  8. static constexpr auto block_size = 16;
  9. static_assert(block_size % 2 == 0); // TODO: even block_size will not work
  10. __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
  11. float *out_ptr, size_t out_pitch,
  12. size_t width, size_t height) {
  13. auto x = blockIdx.x * blockDim.x + threadIdx.x;
  14. auto y = blockIdx.y * blockDim.y + threadIdx.y;
  15. if (x >= width || y >= height) return;
  16. auto in_data = *smart_offset((char3 *) in_ptr, in_pitch, x, y);
  17. float r = (float) in_data.x * u8_to_f32_coff;
  18. float g = (float) in_data.y * u8_to_f32_coff;
  19. float b = (float) in_data.z * u8_to_f32_coff;
  20. float rgb_mean = (r + g + b) / 3.0f;
  21. float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
  22. float sat_weight = rgb_std * saturation_coff;
  23. float rgb_max = fmaxf(fmaxf(r, g), b);
  24. float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
  25. auto out_data = smart_offset(out_ptr, out_pitch, x, y);
  26. *out_data = sat_weight * expo_weight + smooth_coff;
  27. }
  28. __global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
  29. const float *weight_a, const float *weight_b, size_t weight_pitch,
  30. size_t width, size_t height) {
  31. auto x = blockIdx.x * blockDim.x + threadIdx.x;
  32. auto y = blockIdx.y * blockDim.y + threadIdx.y;
  33. if (x >= width || y >= height) return;
  34. auto ppa = smart_offset((float3 *) image_a, image_pitch, x, y);
  35. auto pa = *ppa;
  36. auto pb = *smart_offset((float3 *) image_b, image_pitch, x, y);
  37. auto wa = *smart_offset(weight_a, weight_pitch, x, y);
  38. auto wb = *smart_offset(weight_b, weight_pitch, x, y);
  39. auto w_cof = 1.0f / (wa + wb) * f32_to_u8_coff;
  40. pa.x = (pa.x * wa + pb.x * wb) * w_cof;
  41. pa.y = (pa.y * wa + pb.y * wb) * w_cof;
  42. pa.z = (pa.z * wa + pb.z * wb) * w_cof;
  43. *ppa = pa;
  44. }
  45. __device__ float3 mirrored_access(const float *src_ptr, size_t pitch,
  46. NppiSize src_size, NppiSize warp_size,
  47. int x, int y) {
  48. if (x < 0) [[unlikely]] {
  49. x = -x;
  50. } else if (x >= src_size.width) [[unlikely]] {
  51. x = warp_size.width - x;
  52. }
  53. if (y < 0)[[unlikely]] {
  54. y = -y;
  55. } else if (y >= src_size.height) [[unlikely]] {
  56. y = warp_size.height - y;
  57. }
  58. return *smart_offset((float3 *) src_ptr, pitch, x, y);
  59. }
  60. // up-sampling, filter and add/sub from dst
// Up-samples src by 2x, applies a separable filter, and adds (IsAdd=true) or
// subtracts (IsAdd=false) the result into dst in place.
// Expects a block_size x block_size thread block, one thread per dst pixel.
// Shared memory: the filter taps plus the half-resolution src tile (with halo)
// that the whole block reads.
// NOTE(review): filter_size comes from hdr_synthesis_priv.h; the indexing below
// assumes it is odd and <= block_size (partially enforced by the asserts) —
// confirm against the header.
template<bool IsAdd>
__global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t pitch,
                                    const float *filter_ptr, NppiSize src_size, NppiSize dst_size) {
    auto x = blockIdx.x * blockDim.x + threadIdx.x;
    auto y = blockIdx.y * blockDim.y + threadIdx.y;
    // copy filter coefficients (first row of the block stages them into shared)
    static_assert(block_size >= filter_size);
    __shared__ float filter[filter_size];
    if (threadIdx.y == 0 && threadIdx.x < filter_size) {
        filter[threadIdx.x] = filter_ptr[threadIdx.x];
    }
    // copy related pixels: each dst pixel reads src near (x/2, y/2), so a block
    // needs a (block_size/2 + halo + 1) square tile of src pixels
    static constexpr auto board_offset = filter_size >> 1;
    static constexpr auto relate_size = (block_size >> 1) + board_offset + 1;
    static_assert(relate_size <= block_size);
    // mirror period handed to mirrored_access for out-of-range tile coordinates
    auto warp_size = NppiSize{(src_size.width - 1) << 1,
                              (src_size.height - 1) << 1};
    __shared__ float3 pixels[relate_size][relate_size];
    // top-left src coordinate of this block's tile; the (int) cast matters:
    // the halo offset can make the value negative, and the arithmetic shift of
    // the signed value keeps the correct (rounded-down) half-resolution index
    auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
    auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
    if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
        auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
                                   sx + (int) threadIdx.x, sy + (int) threadIdx.y);
        pixels[threadIdx.y][threadIdx.x] = val;
    }
    __syncthreads(); // barrier reached by all threads before the divergent return below
    if (x >= dst_size.width || y >= dst_size.height) return;
    // do work for each pixel: accumulate the separable filter over the tile;
    // the >> 1 maps each filter tap back onto the half-resolution shared tile
    auto pout = smart_offset((float3 *) dst_ptr, pitch, x, y);
    auto old_val = *pout;
    for (auto ly = 0; ly < filter_size; ++ly) {
        auto y_cof = filter[ly];
        for (auto lx = 0; lx < filter_size; ++lx) {
            auto x_cof = filter[lx];
            auto rx = (threadIdx.x + lx) >> 1;
            auto ry = (threadIdx.y + ly) >> 1;
            auto cof = x_cof * y_cof;
            float3 pix = pixels[ry][rx];
            if constexpr (IsAdd) {
                old_val.x += pix.x * cof;
                old_val.y += pix.y * cof;
                old_val.z += pix.z * cof;
            } else {
                old_val.x -= pix.x * cof;
                old_val.y -= pix.y * cof;
                old_val.z -= pix.z * cof;
            }
        }
    }
    // write back answer
    *pout = old_val;
}
  113. auto calc_dims(size_t width, size_t height) {
  114. auto block_dims = dim3{block_size, block_size};
  115. auto grid_dims = dim3{(uint32_t) width / block_size + (width % block_size != 0),
  116. (uint32_t) height / block_size + (height % block_size != 0)};
  117. return std::make_tuple(block_dims, grid_dims);
  118. }
  119. void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
  120. Npp32f *out_ptr, size_t out_pitch,
  121. size_t width, size_t height, cudaStream_t stream) {
  122. auto [block_dims, grid_dims] = calc_dims(width, height);
  123. hdr_weight<<<grid_dims, block_dims, 0, stream>>>(
  124. in_ptr, in_pitch, out_ptr, out_pitch, width, height);
  125. }
  126. void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
  127. const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
  128. size_t width, size_t height, cudaStream_t stream) {
  129. auto [block_dims, grid_dims] = calc_dims(width, height);
  130. hdr_merge<<<grid_dims, block_dims, 0, stream>>>(
  131. image_a, image_b, image_pitch,
  132. weight_a, weight_b, weight_pitch, width, height);
  133. }
  134. void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
  135. const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
  136. bool is_add, cudaStream_t stream) {
  137. auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
  138. if (is_add) {
  139. laplacian_operation<true><<<grid_dims, block_dims, 0, stream>>>(
  140. src, dst, pitch, filter, src_size, dst_size);
  141. } else {
  142. laplacian_operation<false><<<grid_dims, block_dims, 0, stream>>>(
  143. src, dst, pitch, filter, src_size, dst_size);
  144. }
  145. }