|
@@ -1,6 +1,5 @@
|
|
|
#include "hdr_synthesis.h"
|
|
#include "hdr_synthesis.h"
|
|
|
-
|
|
|
|
|
-#include <nppdefs.h>
|
|
|
|
|
|
|
+#include "hdr_synthesis_priv.h"
|
|
|
|
|
|
|
|
static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
|
|
static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
|
|
|
static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
|
|
static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
|
|
@@ -9,23 +8,19 @@ static constexpr float u8_to_f32_coff = 1.0f / 255;
|
|
|
static constexpr float f32_to_u8_coff = 255;
|
|
static constexpr float f32_to_u8_coff = 255;
|
|
|
|
|
|
|
|
static constexpr auto block_size = 16;
|
|
static constexpr auto block_size = 16;
|
|
|
-
|
|
|
|
|
-template<typename T>
|
|
|
|
|
-__device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
|
|
|
|
|
- return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
|
|
|
|
|
-}
|
|
|
|
|
|
|
+static_assert(block_size % 2 == 0); // TODO: even block_size will not work
|
|
|
|
|
|
|
|
__global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
|
|
__global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
|
|
|
float *out_ptr, size_t out_pitch,
|
|
float *out_ptr, size_t out_pitch,
|
|
|
size_t width, size_t height) {
|
|
size_t width, size_t height) {
|
|
|
auto x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
auto x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
auto y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
auto y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
- if (x >= height || y >= width) return;
|
|
|
|
|
|
|
+ if (x >= width || y >= height) return;
|
|
|
|
|
|
|
|
- auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
|
|
|
|
|
- float r = (float) in_data[0] * u8_to_f32_coff;
|
|
|
|
|
- float g = (float) in_data[1] * u8_to_f32_coff;
|
|
|
|
|
- float b = (float) in_data[2] * u8_to_f32_coff;
|
|
|
|
|
|
|
+ auto in_data = *smart_offset((char3 *) in_ptr, in_pitch, x, y);
|
|
|
|
|
+ float r = (float) in_data.x * u8_to_f32_coff;
|
|
|
|
|
+ float g = (float) in_data.y * u8_to_f32_coff;
|
|
|
|
|
+ float b = (float) in_data.z * u8_to_f32_coff;
|
|
|
|
|
|
|
|
float rgb_mean = (r + g + b) / 3.0f;
|
|
float rgb_mean = (r + g + b) / 3.0f;
|
|
|
float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
|
|
float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
|
|
@@ -38,46 +33,131 @@ __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
|
|
|
*out_data = sat_weight * expo_weight + smooth_coff;
|
|
*out_data = sat_weight * expo_weight + smooth_coff;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-//__device__ float pixel_merge(const float *img_a, const float *img_b,
|
|
|
|
|
-// const float *wei_a, const float *wei_b,
|
|
|
|
|
-// size_t offset) {
|
|
|
|
|
-// auto w_a = *wei_a, w_b = *wei_b;
|
|
|
|
|
-// return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
|
|
|
|
|
-//}
|
|
|
|
|
-
|
|
|
|
|
__global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
|
|
__global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
|
|
|
const float *weight_a, const float *weight_b, size_t weight_pitch,
|
|
const float *weight_a, const float *weight_b, size_t weight_pitch,
|
|
|
size_t width, size_t height) {
|
|
size_t width, size_t height) {
|
|
|
auto x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
auto x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
auto y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
auto y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
- if (x >= height || y >= width) return;
|
|
|
|
|
|
|
+ if (x >= width || y >= height) return;
|
|
|
|
|
|
|
|
- auto ppa = smart_offset(image_a, image_pitch, x, y, 3);
|
|
|
|
|
- auto ppb = smart_offset(image_b, image_pitch, x, y, 3);
|
|
|
|
|
|
|
+ auto ppa = smart_offset((float3 *) image_a, image_pitch, x, y);
|
|
|
|
|
+ auto pa = *ppa;
|
|
|
|
|
+ auto pb = *smart_offset((float3 *) image_b, image_pitch, x, y);
|
|
|
auto wa = *smart_offset(weight_a, weight_pitch, x, y);
|
|
auto wa = *smart_offset(weight_a, weight_pitch, x, y);
|
|
|
auto wb = *smart_offset(weight_b, weight_pitch, x, y);
|
|
auto wb = *smart_offset(weight_b, weight_pitch, x, y);
|
|
|
|
|
|
|
|
-#pragma unroll
|
|
|
|
|
- for (auto i = 0; i < 3; ++i) {
|
|
|
|
|
- ppa[i] = (ppa[i] * wa + ppb[i] * wb) / (wa + wb) * f32_to_u8_coff;
|
|
|
|
|
|
|
+ auto w_cof = 1.0f / (wa + wb) * f32_to_u8_coff;
|
|
|
|
|
+ pa.x = (pa.x * wa + pb.x * wb) * w_cof;
|
|
|
|
|
+ pa.y = (pa.y * wa + pb.y * wb) * w_cof;
|
|
|
|
|
+ pa.z = (pa.z * wa + pb.z * wb) * w_cof;
|
|
|
|
|
+
|
|
|
|
|
+ *ppa = pa;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+__device__ float3 mirrored_access(const float *src_ptr, size_t pitch,
|
|
|
|
|
+ NppiSize src_size, NppiSize warp_size,
|
|
|
|
|
+ int x, int y) {
|
|
|
|
|
+ if (x < 0) [[unlikely]] {
|
|
|
|
|
+ x = -x;
|
|
|
|
|
+ } else if (x >= src_size.width) [[unlikely]] {
|
|
|
|
|
+ x = warp_size.width - x;
|
|
|
|
|
+ }
|
|
|
|
|
+ if (y < 0)[[unlikely]] {
|
|
|
|
|
+ y = -y;
|
|
|
|
|
+ } else if (y >= src_size.height) [[unlikely]] {
|
|
|
|
|
+ y = warp_size.height - y;
|
|
|
|
|
+ }
|
|
|
|
|
+ return *smart_offset((float3 *) src_ptr, pitch, x, y);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+// up-sampling, filter and add/sub from dst
|
|
|
|
|
+template<bool IsAdd>
|
|
|
|
|
+__global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t pitch,
|
|
|
|
|
+ const float *filter_ptr, NppiSize src_size, NppiSize dst_size) {
|
|
|
|
|
+ auto x = blockIdx.x * blockDim.x + threadIdx.x;
|
|
|
|
|
+ auto y = blockIdx.y * blockDim.y + threadIdx.y;
|
|
|
|
|
+
|
|
|
|
|
+ // copy filter coefficients
|
|
|
|
|
+ static_assert(block_size >= filter_size);
|
|
|
|
|
+ __shared__ float filter[filter_size];
|
|
|
|
|
+ if (threadIdx.y == 0 && threadIdx.x < filter_size) {
|
|
|
|
|
+ filter[threadIdx.x] = filter_ptr[threadIdx.x];
|
|
|
}
|
|
}
|
|
|
|
|
+
|
|
|
|
|
+ // copy related pixels
|
|
|
|
|
+ static constexpr auto board_offset = filter_size >> 1;
|
|
|
|
|
+ static constexpr auto relate_size = (block_size >> 1) + board_offset + 1;
|
|
|
|
|
+ static_assert(relate_size <= block_size);
|
|
|
|
|
+ auto warp_size = NppiSize{(src_size.width - 1) << 1,
|
|
|
|
|
+ (src_size.height - 1) << 1};
|
|
|
|
|
+ __shared__ float pixels[3][relate_size][relate_size + 1];
|
|
|
|
|
+ auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
|
|
|
|
|
+ auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
|
|
|
|
|
+ if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
|
|
|
|
|
+ auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
|
|
|
|
|
+ sx + (int) threadIdx.x, sy + (int) threadIdx.y);
|
|
|
|
|
+ pixels[0][threadIdx.y][threadIdx.x] = val.x;
|
|
|
|
|
+ pixels[1][threadIdx.y][threadIdx.x] = val.y;
|
|
|
|
|
+ pixels[2][threadIdx.y][threadIdx.x] = val.z;
|
|
|
|
|
+ }
|
|
|
|
|
+ __syncthreads();
|
|
|
|
|
+
|
|
|
|
|
+ if (x >= dst_size.width || y >= dst_size.height) return;
|
|
|
|
|
+
|
|
|
|
|
+ // do work for each pixel
|
|
|
|
|
+ auto pout = smart_offset((float3 *) dst_ptr, pitch, x, y);
|
|
|
|
|
+ auto old_val = *pout;
|
|
|
|
|
+ for (auto ly = 0; ly < filter_size; ++ly) {
|
|
|
|
|
+ auto y_cof = filter[ly];
|
|
|
|
|
+ for (auto lx = 0; lx < filter_size; ++lx) {
|
|
|
|
|
+ auto x_cof = filter[lx];
|
|
|
|
|
+ auto rx = (threadIdx.x + lx) >> 1;
|
|
|
|
|
+ auto ry = (threadIdx.y + ly) >> 1;
|
|
|
|
|
+ auto cof = x_cof * y_cof;
|
|
|
|
|
+ if constexpr (IsAdd) {
|
|
|
|
|
+ old_val.x += pixels[0][ry][rx] * cof;
|
|
|
|
|
+ old_val.y += pixels[1][ry][rx] * cof;
|
|
|
|
|
+ old_val.z += pixels[2][ry][rx] * cof;
|
|
|
|
|
+ } else {
|
|
|
|
|
+ old_val.x -= pixels[0][ry][rx] * cof;
|
|
|
|
|
+ old_val.y -= pixels[1][ry][rx] * cof;
|
|
|
|
|
+ old_val.z -= pixels[2][ry][rx] * cof;
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ // write back answer
|
|
|
|
|
+ *pout = old_val;
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+auto calc_dims(size_t width, size_t height) {
|
|
|
|
|
+ auto block_dims = dim3{block_size, block_size};
|
|
|
|
|
+ auto grid_dims = dim3{(uint32_t) width / block_size + (width % block_size != 0),
|
|
|
|
|
+ (uint32_t) height / block_size + (height % block_size != 0)};
|
|
|
|
|
+ return std::make_tuple(block_dims, grid_dims);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
|
|
void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
|
|
|
Npp32f *out_ptr, size_t out_pitch,
|
|
Npp32f *out_ptr, size_t out_pitch,
|
|
|
size_t width, size_t height) {
|
|
size_t width, size_t height) {
|
|
|
- auto block_dims = dim3{block_size, block_size};
|
|
|
|
|
- auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
|
|
|
|
|
- (uint32_t) width / block_size + (width % block_size != 0)};
|
|
|
|
|
|
|
+ auto [block_dims, grid_dims] = calc_dims(width, height);
|
|
|
hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
|
|
hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
|
|
void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
|
|
|
const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
|
|
const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
|
|
|
size_t width, size_t height) {
|
|
size_t width, size_t height) {
|
|
|
- auto block_dims = dim3{block_size, block_size};
|
|
|
|
|
- auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
|
|
|
|
|
- (uint32_t) width / block_size + (width % block_size != 0)};
|
|
|
|
|
|
|
+ auto [block_dims, grid_dims] = calc_dims(width, height);
|
|
|
hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
|
|
hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
|
|
|
weight_a, weight_b, weight_pitch, width, height);
|
|
weight_a, weight_b, weight_pitch, width, height);
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
|
|
|
|
|
+ const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add) {
|
|
|
|
|
+ auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
|
|
|
|
|
+ if (is_add) {
|
|
|
|
|
+ laplacian_operation<true><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
|
|
|
|
|
+ } else {
|
|
|
|
|
+ laplacian_operation<false><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
|
|
|
|
|
+ }
|
|
|
}
|
|
}
|