// hdr_synthesis_kernel.cu — CUDA kernels for HDR exposure fusion:
// per-pixel weight map, weighted two-image merge, and Laplacian-pyramid add/sub.
  1. #include "hdr_synthesis.h"
  2. #include "hdr_synthesis_priv.h"
  3. static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
  4. static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
  5. static constexpr float smooth_coff = 1e-6;
  6. static constexpr float u8_to_f32_coff = 1.0f / 255;
  7. static constexpr float f32_to_u8_coff = 255;
  8. static constexpr auto block_size = 16;
  9. static_assert(block_size % 2 == 0); // TODO: even block_size will not work
  10. __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
  11. float *out_ptr, size_t out_pitch,
  12. size_t width, size_t height) {
  13. auto x = blockIdx.x * blockDim.x + threadIdx.x;
  14. auto y = blockIdx.y * blockDim.y + threadIdx.y;
  15. if (x >= width || y >= height) return;
  16. auto in_data = *smart_offset((char3 *) in_ptr, in_pitch, x, y);
  17. float r = (float) in_data.x * u8_to_f32_coff;
  18. float g = (float) in_data.y * u8_to_f32_coff;
  19. float b = (float) in_data.z * u8_to_f32_coff;
  20. float rgb_mean = (r + g + b) / 3.0f;
  21. float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
  22. float sat_weight = rgb_std * saturation_coff;
  23. float rgb_max = fmaxf(fmaxf(r, g), b);
  24. float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
  25. auto out_data = smart_offset(out_ptr, out_pitch, x, y);
  26. *out_data = sat_weight * expo_weight + smooth_coff;
  27. }
  28. __global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
  29. const float *weight_a, const float *weight_b, size_t weight_pitch,
  30. size_t width, size_t height) {
  31. auto x = blockIdx.x * blockDim.x + threadIdx.x;
  32. auto y = blockIdx.y * blockDim.y + threadIdx.y;
  33. if (x >= width || y >= height) return;
  34. auto ppa = smart_offset((float3 *) image_a, image_pitch, x, y);
  35. auto pa = *ppa;
  36. auto pb = *smart_offset((float3 *) image_b, image_pitch, x, y);
  37. auto wa = *smart_offset(weight_a, weight_pitch, x, y);
  38. auto wb = *smart_offset(weight_b, weight_pitch, x, y);
  39. auto w_cof = 1.0f / (wa + wb) * f32_to_u8_coff;
  40. pa.x = (pa.x * wa + pb.x * wb) * w_cof;
  41. pa.y = (pa.y * wa + pb.y * wb) * w_cof;
  42. pa.z = (pa.z * wa + pb.z * wb) * w_cof;
  43. *ppa = pa;
  44. }
  45. __device__ float3 mirrored_access(const float *src_ptr, size_t pitch,
  46. NppiSize src_size, NppiSize warp_size,
  47. int x, int y) {
  48. if (x < 0) [[unlikely]] {
  49. x = -x;
  50. } else if (x >= src_size.width) [[unlikely]] {
  51. x = warp_size.width - x;
  52. }
  53. if (y < 0)[[unlikely]] {
  54. y = -y;
  55. } else if (y >= src_size.height) [[unlikely]] {
  56. y = warp_size.height - y;
  57. }
  58. return *smart_offset((float3 *) src_ptr, pitch, x, y);
  59. }
  60. // up-sampling, filter and add/sub from dst
// Up-samples src by 2x, applies a separable filter, and adds (IsAdd=true) or
// subtracts (IsAdd=false) the result into dst in place.
// Expects a block_size x block_size thread block, one thread per dst pixel.
// Shared memory: the filter taps plus the half-resolution src tile (with halo)
// that the whole block reads.
// NOTE(review): filter_size comes from hdr_synthesis_priv.h; the indexing below
// assumes it is odd and <= block_size (partially enforced by the asserts) —
// confirm against the header.
template<bool IsAdd>
__global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t pitch,
                                    const float *filter_ptr, NppiSize src_size, NppiSize dst_size) {
    auto x = blockIdx.x * blockDim.x + threadIdx.x;
    auto y = blockIdx.y * blockDim.y + threadIdx.y;
    // copy filter coefficients (first row of the block stages them into shared)
    static_assert(block_size >= filter_size);
    __shared__ float filter[filter_size];
    if (threadIdx.y == 0 && threadIdx.x < filter_size) {
        filter[threadIdx.x] = filter_ptr[threadIdx.x];
    }
    // copy related pixels: each dst pixel reads src near (x/2, y/2), so a block
    // needs a (block_size/2 + halo + 1) square tile of src pixels
    static constexpr auto board_offset = filter_size >> 1;
    static constexpr auto relate_size = (block_size >> 1) + board_offset + 1;
    static_assert(relate_size <= block_size);
    // mirror period handed to mirrored_access for out-of-range tile coordinates
    auto warp_size = NppiSize{(src_size.width - 1) << 1,
                              (src_size.height - 1) << 1};
    __shared__ float3 pixels[relate_size][relate_size];
    // top-left src coordinate of this block's tile; the (int) cast matters:
    // the halo offset can make the value negative, and the arithmetic shift of
    // the signed value keeps the correct (rounded-down) half-resolution index
    auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
    auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
    if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
        auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
                                   sx + (int) threadIdx.x, sy + (int) threadIdx.y);
        pixels[threadIdx.y][threadIdx.x] = val;
    }
    __syncthreads(); // barrier reached by all threads before the divergent return below
    if (x >= dst_size.width || y >= dst_size.height) return;
    // do work for each pixel: accumulate the separable filter over the tile;
    // the >> 1 maps each filter tap back onto the half-resolution shared tile
    auto pout = smart_offset((float3 *) dst_ptr, pitch, x, y);
    auto old_val = *pout;
    for (auto ly = 0; ly < filter_size; ++ly) {
        auto y_cof = filter[ly];
        for (auto lx = 0; lx < filter_size; ++lx) {
            auto x_cof = filter[lx];
            auto rx = (threadIdx.x + lx) >> 1;
            auto ry = (threadIdx.y + ly) >> 1;
            auto cof = x_cof * y_cof;
            float3 pix = pixels[ry][rx];
            if constexpr (IsAdd) {
                old_val.x += pix.x * cof;
                old_val.y += pix.y * cof;
                old_val.z += pix.z * cof;
            } else {
                old_val.x -= pix.x * cof;
                old_val.y -= pix.y * cof;
                old_val.z -= pix.z * cof;
            }
        }
    }
    // write back answer
    *pout = old_val;
}
  113. auto calc_dims(size_t width, size_t height) {
  114. auto block_dims = dim3{block_size, block_size};
  115. auto grid_dims = dim3{(uint32_t) width / block_size + (width % block_size != 0),
  116. (uint32_t) height / block_size + (height % block_size != 0)};
  117. return std::make_tuple(block_dims, grid_dims);
  118. }
  119. void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
  120. Npp32f *out_ptr, size_t out_pitch,
  121. size_t width, size_t height, cudaStream_t stream) {
  122. auto [block_dims, grid_dims] = calc_dims(width, height);
  123. hdr_weight<<<grid_dims, block_dims, 0, stream>>>(
  124. in_ptr, in_pitch, out_ptr, out_pitch, width, height);
  125. }
  126. void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
  127. const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
  128. size_t width, size_t height, cudaStream_t stream) {
  129. auto [block_dims, grid_dims] = calc_dims(width, height);
  130. hdr_merge<<<grid_dims, block_dims, 0, stream>>>(
  131. image_a, image_b, image_pitch,
  132. weight_a, weight_b, weight_pitch, width, height);
  133. }
  134. void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
  135. const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
  136. bool is_add, cudaStream_t stream) {
  137. auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
  138. if (is_add) {
  139. laplacian_operation<true><<<grid_dims, block_dims, 0, stream>>>(
  140. src, dst, pitch, filter, src_size, dst_size);
  141. } else {
  142. laplacian_operation<false><<<grid_dims, block_dims, 0, stream>>>(
  143. src, dst, pitch, filter, src_size, dst_size);
  144. }
  145. }