瀏覽代碼

First working version.

jcsyshc 2 年之前
當前提交
cf4ac0c671
共有 7 個文件被更改,包括 612 次插入0 次删除
  1. 29 0
      CMakeLists.txt
  2. 38 0
      src/cuda_helper.cpp
  3. 22 0
      src/cuda_helper.h
  4. 322 0
      src/hdr_synthesis.cpp
  5. 29 0
      src/hdr_synthesis.h
  6. 89 0
      src/hdr_synthesis_kernel.cu
  7. 83 0
      src/main.cpp

+ 29 - 0
CMakeLists.txt

@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.26)
+project(HDRSynthesis LANGUAGES C CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 20)
+
+add_executable(${PROJECT_NAME} src/main.cpp
+        src/cuda_helper.cpp
+        src/hdr_synthesis.cpp
+        src/hdr_synthesis_kernel.cu)
+
+# OpenCV config
+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
+target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
+
+# spdlog config
+find_package(spdlog REQUIRED)
+target_link_libraries(${PROJECT_NAME} spdlog::spdlog)
+target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
+
+# Boost config
+find_package(Boost REQUIRED COMPONENTS iostreams)
+target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
+target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
+
+# CUDA config
+find_package(CUDAToolkit REQUIRED)
+target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
+target_link_libraries(${PROJECT_NAME} nppc nppicc nppif nppial nppidei)

+ 38 - 0
src/cuda_helper.cpp

@@ -0,0 +1,38 @@
+#include "cuda_helper.h"
+
+#include <spdlog/spdlog.h>
+
+#define RET_ERROR \
+    assert(false); \
+    return false; \
+    (void) 0
+
+bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
+                         const char *file_name, const char *api_call_str) {
+    if (api_ret == CUDA_SUCCESS) [[likely]] return true;
+    const char *error_name, *error_str;
+    auto ret = cuGetErrorName(api_ret, &error_name);
+    if (ret != CUDA_SUCCESS) [[unlikely]] error_name = "Unknown";
+    ret = cuGetErrorString(api_ret, &error_str);
+    if (ret != CUDA_SUCCESS) [[unlikely]] error_str = "Unknown";
+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}:{}, {}.",
+                 api_call_str, file_name, line_number,
+                 (int) api_ret, error_name, error_str);
+    RET_ERROR;
+}
+
+bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
+                                const char *file_name, const char *api_call_str) {
+    if (api_ret == cudaSuccess) [[likely]] return true;
+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}.",
+                 api_call_str, file_name, line_number, (int) api_ret);
+    RET_ERROR;
+}
+
+bool check_cuda_api_call(NppStatus api_ret, unsigned int line_number,
+                         const char *file_name, const char *api_call_str) {
+    if (api_ret == NPP_SUCCESS) [[likely]] return true;
+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}.",
+                 api_call_str, file_name, line_number, (int) api_ret);
+    RET_ERROR;
+}

+ 22 - 0
src/cuda_helper.h

@@ -0,0 +1,22 @@
+#ifndef HDRSYNTHESIS_CUDA_HELPER_H
+#define HDRSYNTHESIS_CUDA_HELPER_H
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <nppdefs.h>
+
+bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
+                         const char *file_name, const char *api_call_str);
+
+bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
+                         const char *file_name, const char *api_call_str);
+
+bool check_cuda_api_call(NppStatus api_ret, unsigned int line_number,
+                         const char *file_name, const char *api_call_str);
+
+#define CUDA_API_CHECK(api_call) \
+    if (!check_cuda_api_call( \
+        api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
+        return false
+
+#endif //HDRSYNTHESIS_CUDA_HELPER_H

+ 322 - 0
src/hdr_synthesis.cpp

@@ -0,0 +1,322 @@
+#include "hdr_synthesis.h"
+#include "cuda_helper.h"
+
+#include <nppi_arithmetic_and_logical_operations.h>
+#include <nppi_color_conversion.h>
+#include <nppi_data_exchange_and_initialization.h>
+#include <nppi_filtering_functions.h>
+
+#include <cassert>
+
+void call_hdr_weight(void *in_f32, size_t in_pitch,
+                     void *out_f32, size_t out_pitch,
+                     size_t width, size_t height);
+
+void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
+                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
+                    void *out_f32, size_t out_pitch,
+                    size_t width, size_t height);
+
+struct hdr_synthesizer::impl {
+
+    static constexpr auto u8_to_f32_coff = 1.0f / 255;
+    static constexpr float u8_to_f32_coff_arr[] = {u8_to_f32_coff,
+                                                   u8_to_f32_coff,
+                                                   u8_to_f32_coff};
+    static constexpr float identity_filter_coff[] = {1};
+    static constexpr float gaussian_filter_coff[] = {1 / 16.0f,
+                                                     4 / 16.0f,
+                                                     6 / 16.0f,
+                                                     4 / 16.0f,
+                                                     1 / 16.0f};
+    static constexpr auto identity_filter_len = 1;
+    static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
+
+    struct image_buffer {
+        void *pyr_image_f32, *pyr_weight_f32;
+        size_t image_pitch, weight_pitch;
+    };
+
+    uint16_t width, height, pyr_height;
+    uint8_t pyr_level;
+
+    NppiSize full_size;
+    NppiRect full_rect;
+    NppiPoint origin_point;
+    size_t *pyr_offset_arr;
+    NppiSize *pyr_size_arr;
+    void *identity_filter_coff_f32;
+    void *gaussian_filter_coff_f32;
+
+    // global temporary memory
+    void *raw_u8, *rgb_u8, *rgb_f32;
+    size_t raw_u8_pitch, rgb_u8_pitch, rgb_f32_pitch;
+
+    void *rgb2_f32, *rgb3_f32; // for test
+
+    impl(uint16_t _width, uint16_t _height, uint8_t _level)
+            : width(_width), height(_height), pyr_level(_level) {
+        pyr_height = height + (height >> 1);
+        malloc_global_memory();
+
+        full_size = NppiSize{width, height};
+        full_rect = NppiRect{0, 0, width, height};
+        origin_point = NppiPoint{0, 0};
+
+        pyr_offset_arr = new size_t[pyr_level];
+        pyr_size_arr = new NppiSize[pyr_level];
+        auto cur_width = width, cur_height = height;
+        for (auto i = 0; i < pyr_level; ++i) {
+//            assert(cur_width % 2 == 0);
+//            assert(cur_height % 2 == 0);
+            pyr_offset_arr[i] = (i == 0) ? 0 : (pyr_offset_arr[i - 1] + cur_width);
+            cur_width >>= 1;
+            cur_height >>= 1;
+            pyr_size_arr[i] = NppiSize{cur_width, cur_height};
+        }
+    }
+
+    ~impl() {
+        free_global_memory();
+
+        delete pyr_offset_arr;
+        delete pyr_size_arr;
+    }
+
+    bool malloc_global_memory() {
+        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
+                                       width * sizeof(unsigned char), height));
+        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
+                                       width * 3 * sizeof(unsigned char), height));
+        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
+                                       width * 3 * sizeof(float), pyr_height));
+        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
+                                       width * 3 * sizeof(float), pyr_height)); // for test
+        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
+                                       width * 3 * sizeof(float), pyr_height)); // for test
+
+        // upload gaussian kernel coefficient
+        CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
+        CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
+                                  sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
+        CUDA_API_CHECK(cudaMalloc(&identity_filter_coff_f32, sizeof(identity_filter_coff)));
+        CUDA_API_CHECK(cudaMemcpy(identity_filter_coff_f32, identity_filter_coff,
+                                  sizeof(identity_filter_coff), cudaMemcpyHostToDevice));
+
+        return true;
+    }
+
+    bool free_global_memory() {
+        CUDA_API_CHECK(cudaFree(raw_u8));
+        CUDA_API_CHECK(cudaFree(rgb_u8));
+        CUDA_API_CHECK(cudaFree(rgb_f32));
+        CUDA_API_CHECK(cudaFree(rgb2_f32));
+        CUDA_API_CHECK(cudaFree(rgb3_f32));
+        CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
+        CUDA_API_CHECK(cudaFree(identity_filter_coff_f32));
+        return true;
+    }
+
+    bool malloc_buffer(image_buffer *buf) const {
+        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_image_f32, &buf->image_pitch,
+                                       width * sizeof(float) * 3, pyr_height));
+        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_weight_f32, &buf->weight_pitch,
+                                       width * sizeof(float), pyr_height));
+        return true;
+    }
+
+    bool gaussian_pyramid(void *ptr_f32, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
+        CUDA_API_CHECK((is_rgb ?
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
+                               ((Npp32f *) ptr_f32, pitch, full_size, origin_point,
+                                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
+                                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+        for (int i = 0; i < pyr_level - 1; ++i) {
+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float) * (is_rgb ? 3 : 1);
+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * (is_rgb ? 3 : 1);
+            CUDA_API_CHECK((is_rgb ?
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
+                                   ((Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
+                                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
+                                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+        }
+        return true;
+    }
+
+    bool laplacian_operation(void *src_f32, NppiSize src_size,
+                             void *dst_f32, NppiSize dst_size,
+                             size_t pitch) const {
+        // up-sampling
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
+                (Npp32f *) src_f32, pitch, src_size, origin_point,
+                (Npp32f *) rgb_f32, rgb_f32_pitch, dst_size,
+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+        // gaussian blur
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb_f32, rgb_f32_pitch, dst_size, origin_point,
+                                                     (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
+                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
+        // add
+//        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb_f32, rgb_f32_pitch,
+//                                        (Npp32f *) dst_f32, pitch,
+//                                        dst_size));
+        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
+                                        (Npp32f *) dst_f32, pitch,
+                                        dst_size));
+        return true;
+    }
+
+    bool laplacian_pyramid(void *ptr_f32, size_t pitch) const { // for rgb image only
+        // generate gaussian pyramid first
+        gaussian_pyramid(ptr_f32, pitch, true);
+
+        // generate laplacian pyramid by up-sampling and subtraction
+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
+        laplacian_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
+        for (int i = 0; i < pyr_level - 1; ++i) {
+            laplacian_operation(pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * 3, pyr_size_arr[i + 1],
+                                pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
+                                pitch);
+        }
+        return true;
+    }
+
+    bool reconstruct_operation(void *src_f32, NppiSize src_size,
+                               void *dst_f32, NppiSize dst_size,
+                               size_t pitch) const {
+        // up-sampling
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
+                (Npp32f *) src_f32, pitch, src_size, origin_point,
+                (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+        // gaussian blur
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size, origin_point,
+                                                     (Npp32f *) rgb3_f32, rgb_f32_pitch, dst_size,
+                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
+        // add
+//        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
+//                                        (Npp32f *) dst_f32, pitch,
+//                                        dst_size));
+        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb3_f32, rgb_f32_pitch,
+                                        (Npp32f *) dst_f32, pitch,
+                                        dst_size));
+        return true;
+    }
+
+    // reconstruct from laplacian pyramid, for rgb image only
+    bool pyramid_reconstruct(void *ptr_f32, size_t pitch) const {
+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
+        for (int i = pyr_level - 1; i > 0; --i) {
+            reconstruct_operation(pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
+                                  pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float) * 3, pyr_size_arr[i - 1],
+                                  pitch);
+        }
+        reconstruct_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
+        return true;
+    }
+
+    bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
+        // upload image
+        CUDA_API_CHECK(cudaMemcpy2D(raw_u8, raw_u8_pitch,
+                                    raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
+                                    height, cudaMemcpyHostToDevice));
+
+        // debayer image
+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R((Npp8u *) raw_u8, raw_u8_pitch,
+                                             full_size, full_rect,
+                                             (Npp8u *) rgb_u8, rgb_u8_pitch,
+                                             NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
+
+        // convert to float
+        CUDA_API_CHECK(nppiConvert_8u32f_C3R((Npp8u *) rgb_u8, rgb_u8_pitch,
+                                             (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
+                                             full_size));
+        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
+                                         (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
+                                         full_size)); // normalize
+
+        // calc weight
+        call_hdr_weight(buf->pyr_image_f32, buf->image_pitch,
+                        buf->pyr_weight_f32, buf->weight_pitch,
+                        width, height);
+
+        // construct image pyramid
+        gaussian_pyramid(buf->pyr_weight_f32, buf->weight_pitch, false);
+        laplacian_pyramid(buf->pyr_image_f32, buf->image_pitch);
+
+        return true;
+    }
+
+    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
+                     void *out_u8, size_t out_pitch) const {
+        // merge
+        assert(buf_a->image_pitch == buf_b->image_pitch);
+        assert(buf_a->weight_pitch == buf_b->weight_pitch);
+        call_hdr_merge(buf_a->pyr_image_f32, buf_b->pyr_image_f32, buf_a->image_pitch,
+                       buf_a->pyr_weight_f32, buf_b->pyr_weight_f32, buf_a->weight_pitch,
+                       rgb_f32, rgb_f32_pitch,
+                       width, pyr_height);
+
+        // reconstruct image from laplacian pyramid
+        pyramid_reconstruct(rgb_f32, rgb_f32_pitch);
+
+        // convert to uint8 and copy
+        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) rgb_f32, rgb_f32_pitch,
+                                             (Npp8u *) out_u8, out_pitch,
+                                             full_size, NPP_RND_NEAR));
+
+        return true;
+    }
+
+//    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
+//                     void *out_u8, size_t out_pitch) const {
+//        // reconstruct image from laplacian pyramid
+//        pyramid_reconstruct(buf_a->pyr_image_f32, buf_a->image_pitch);
+//
+//        // convert to uint8 and copy
+//        float arr[] = {255, 255, 255};
+//        nppiMulC_32f_C3IR(arr, (Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch, full_size);
+//        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch,
+//                                             (Npp8u *) out_u8, out_pitch,
+//                                             full_size, NPP_RND_NEAR));
+//
+//        return true;
+//    }
+
+};
+
+hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
+        : pimpl(std::make_unique<impl>(width, height, pyramid_level)) {
+}
+
+hdr_synthesizer::~hdr_synthesizer() = default;
+
+bool hdr_synthesizer::malloc_buffer(void **out_buf) {
+    *out_buf = new impl::image_buffer{};
+    return pimpl->malloc_buffer((impl::image_buffer *) *out_buf);
+}
+
+bool hdr_synthesizer::free_buffer(void *out_buf) {
+    auto ptr = (impl::image_buffer *) out_buf;
+    CUDA_API_CHECK(cudaFree(ptr->pyr_image_f32));
+    CUDA_API_CHECK(cudaFree(ptr->pyr_weight_f32));
+    delete ptr;
+    return true;
+}
+
+bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
+    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr);
+}
+
+bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
+                                  void *img_u8, size_t img_pitch) {
+    return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
+                              img_u8, img_pitch);
+}
+
+void hdr_synthesizer::test_func(void **ptr, size_t *pitch) {
+    *ptr = pimpl->rgb_f32;
+    *pitch = pimpl->rgb_f32_pitch;
+}

+ 29 - 0
src/hdr_synthesis.h

@@ -0,0 +1,29 @@
+#ifndef HDRSYNTHESIS_HDR_SYNTHESIS_H
+#define HDRSYNTHESIS_HDR_SYNTHESIS_H
+
+#include <cstdint>
+#include <memory>
+
+class hdr_synthesizer {
+public:
+
+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
+
+    ~hdr_synthesizer();
+
+    bool malloc_buffer(void **out_buf); // per image buffer
+
+    static bool free_buffer(void *out_buf);
+
+    bool preprocess_image(void *img_buf, uint8_t *img_ptr);
+
+    bool merge_image(void *buf_a, void *buf_b, void *img_u8, size_t img_pitch);
+
+    void test_func(void **ptr, size_t *pitch);
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+#endif //HDRSYNTHESIS_HDR_SYNTHESIS_H

+ 89 - 0
src/hdr_synthesis_kernel.cu

@@ -0,0 +1,89 @@
+#include "hdr_synthesis.h"
+
+static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
+static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
+static constexpr float smooth_coff = 1e-6;
+static constexpr float f32_to_u8_coff = 255;
+
+static constexpr auto block_size = 16;
+
+template<typename T>
+__device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
+    return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
+}
+
+__global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
+                           float *out_ptr, size_t out_pitch,
+                           size_t width, size_t height) {
+    auto x = blockIdx.x * blockDim.x + threadIdx.x;
+    auto y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= height || y >= width) return;
+
+//    auto in_offset = x * in_pitch + y * 3 * sizeof(float);
+//    auto in_data = (float *) ((char *) in_ptr + in_offset);
+    auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
+    float r = in_data[0];
+    float g = in_data[1];
+    float b = in_data[2];
+
+    float rgb_mean = (r + g + b) / 3.0f;
+    float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
+    float sat_weight = rgb_std * saturation_coff;
+
+    float rgb_max = fmaxf(fmaxf(r, g), b);
+    float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
+
+//    auto out_offset = x * out_pitch + y * sizeof(float);
+//    auto out_data = (float *) ((char *) out_ptr + out_offset);
+    auto out_data = smart_offset(out_ptr, out_pitch, x, y);
+    *out_data = sat_weight * expo_weight + smooth_coff;
+}
+
+__device__ float pixel_merge(const float *img_a, const float *img_b,
+                             const float *wei_a, const float *wei_b,
+                             size_t offset) {
+    auto w_a = *wei_a, w_b = *wei_b;
+    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
+}
+
+__global__ void hdr_merge(const float *img_a, const float *img_b, size_t img_pitch,
+                          const float *wei_a, const float *wei_b, size_t wei_pitch,
+                          float *out_ptr, size_t out_pitch,
+                          size_t width, size_t height) {
+    auto x = blockIdx.x * blockDim.x + threadIdx.x;
+    auto y = blockIdx.y * blockDim.y + threadIdx.y;
+    if (x >= height || y >= width) return;
+
+    auto cur_img_a = smart_offset(img_a, img_pitch, x, y, 3);
+    auto cur_img_b = smart_offset(img_b, img_pitch, x, y, 3);
+    auto cur_wei_a = smart_offset(wei_a, wei_pitch, x, y);
+    auto cur_wei_b = smart_offset(wei_b, wei_pitch, x, y);
+    auto cur_out = smart_offset(out_ptr, out_pitch, x, y, 3);
+    for (auto i = 0; i < 3; ++i) {
+        cur_out[i] = pixel_merge(cur_img_a, cur_img_b, cur_wei_a, cur_wei_b, i) * f32_to_u8_coff;
+    }
+}
+
+void call_hdr_weight(void *in_f32, size_t in_pitch,
+                     void *out_f32, size_t out_pitch,
+                     size_t width, size_t height) {
+    auto block_dims = dim3{block_size, block_size};
+    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
+                          (uint32_t) width / block_size + (width % block_size != 0)};
+    hdr_weight<<<grid_dims, block_dims>>>((float *) in_f32, in_pitch,
+                                          (float *) out_f32, out_pitch,
+                                          width, height);
+}
+
+void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
+                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
+                    void *out_f32, size_t out_pitch,
+                    size_t width, size_t height) {
+    auto block_dims = dim3{block_size, block_size};
+    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
+                          (uint32_t) width / block_size + (width % block_size != 0)};
+    hdr_merge<<<grid_dims, block_dims>>>((float *) img_a_f32, (float *) img_b_f32, img_pitch,
+                                         (float *) wei_a_f32, (float *) wei_b_f32, wei_pitch,
+                                         (float *) out_f32, out_pitch,
+                                         width, height);
+}

+ 83 - 0
src/main.cpp

@@ -0,0 +1,83 @@
+#include "cuda_helper.h"
+#include "hdr_synthesis.h"
+
+#include <cuda_profiler_api.h>
+
+#include <nppi_color_conversion.h>
+#include <nppi_filtering_functions.h>
+
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/imgcodecs.hpp>
+
+#include <boost/iostreams/device/mapped_file.hpp>
+
+#include <iostream>
+#include <chrono>
+
+static constexpr auto image_width = 2448;
+static constexpr auto image_height = 2048;
+
+cv::Mat download_image(void *ptr, size_t pitch, size_t width, size_t height, int type) {
+    auto gpu_mat = cv::cuda::GpuMat{(int) height, (int) width, type, ptr, pitch};
+    cv::Mat mat;
+    gpu_mat.download(mat);
+    return mat;
+}
+
+struct image_buffer {
+    void *pyr_image_f32, *pyr_weight_f32;
+    size_t image_pitch, weight_pitch;
+};
+
+int main() {
+    auto path_a = "/home/tpx/project/HDRSynthesis/data/phantom2_5ms.raw";
+    auto path_b = "/home/tpx/project/HDRSynthesis/data/phantom2_20ms.raw";
+
+    using boost::iostreams::mapped_file;
+    auto img_file_a = mapped_file{path_a, boost::iostreams::mapped_file_base::readonly};
+    auto img_file_b = mapped_file{path_b, boost::iostreams::mapped_file_base::readonly};
+
+    auto hdr = hdr_synthesizer{image_width, image_height};
+    void *buf_a, *buf_b;
+    hdr.malloc_buffer(&buf_a);
+    hdr.malloc_buffer(&buf_b);
+    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
+
+    auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
+    hdr.merge_image(buf_b, buf_a, img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+
+//    for (int i = 0; i < 8; ++i) {
+//        auto start_ts = std::chrono::system_clock::now();
+//        cudaDeviceSynchronize();
+//        hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
+//        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
+//        cudaDeviceSynchronize();
+//        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
+//                std::chrono::system_clock::now() - start_ts).count() << std::endl;
+//    }
+//
+//    cudaProfilerStart();
+//    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
+//    cudaProfilerStop();
+//
+
+    auto real_ptr = (image_buffer *) buf_b;
+    auto host_rgb_a = download_image(img_hdr_dev.cudaPtr(), img_hdr_dev.step1(),
+                                     image_width, image_height, CV_8UC3);
+//    auto host_rgb_a = download_image((char *) real_ptr->pyr_image_f32, real_ptr->image_pitch,
+//                                     image_width, image_height, CV_32FC3);
+//    void *ptr;
+//    size_t pitch;
+//    hdr.test_func(&ptr, &pitch);
+//    auto host_rgb_a = download_image(ptr, pitch,
+//                                     image_width, image_height, CV_32FC3);
+
+    double min_val, max_val;
+    cv::minMaxLoc(host_rgb_a, &min_val, &max_val);
+    std::cout << min_val << " " << max_val << " " << cv::mean(host_rgb_a) << std::endl;
+
+    cv::imwrite("test.bmp", host_rgb_a);
+
+    return 0;
+}