2 年之前 · cf4ac0c671
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,29 @@
 
				+cmake_minimum_required(VERSION 3.26)
			
 
				+project(HDRSynthesis LANGUAGES C CXX CUDA)
			
 
				+
			
 
				+set(CMAKE_CXX_STANDARD 20)
			
 
				+
			
 
				+add_executable(${PROJECT_NAME} src/main.cpp
			
 
				+        src/cuda_helper.cpp
			
 
				+        src/hdr_synthesis.cpp
			
 
				+        src/hdr_synthesis_kernel.cu)
			
 
				+
			
 
				+# OpenCV config
			
 
				+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
			
 
				+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
			
 
				+
			
 
				+# spdlog config
			
 
				+find_package(spdlog REQUIRED)
			
 
				+target_link_libraries(${PROJECT_NAME} spdlog::spdlog)
			
 
				+target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
			
 
				+
			
 
				+# Boost config
			
 
				+find_package(Boost REQUIRED COMPONENTS iostreams)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
			
 
				+target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
			
 
				+
			
 
				+# CUDA config
			
 
				+find_package(CUDAToolkit REQUIRED)
			
 
				+target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
			
 
				+target_link_libraries(${PROJECT_NAME} nppc nppicc nppif nppial nppidei)
			
--- a/src/cuda_helper.cpp
+++ b/src/cuda_helper.cpp
@@ -0,0 +1,38 @@
 
				+#include "cuda_helper.h"
			
 
				+
			
 
				+#include <spdlog/spdlog.h>
			
 
				+
			
 
				+#define RET_ERROR \
			
 
				+    assert(false); \
			
 
				+    return false; \
			
 
				+    (void) 0
			
 
				+
			
 
				+bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
			
 
				+                         const char *file_name, const char *api_call_str) {
			
 
				+    if (api_ret == CUDA_SUCCESS) [[likely]] return true;
			
 
				+    const char *error_name, *error_str;
			
 
				+    auto ret = cuGetErrorName(api_ret, &error_name);
			
 
				+    if (ret != CUDA_SUCCESS) [[unlikely]] error_name = "Unknown";
			
 
				+    ret = cuGetErrorString(api_ret, &error_str);
			
 
				+    if (ret != CUDA_SUCCESS) [[unlikely]] error_str = "Unknown";
			
 
				+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}:{}, {}.",
			
 
				+                 api_call_str, file_name, line_number,
			
 
				+                 (int) api_ret, error_name, error_str);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
			
 
				+                                const char *file_name, const char *api_call_str) {
			
 
				+    if (api_ret == cudaSuccess) [[likely]] return true;
			
 
				+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}.",
			
 
				+                 api_call_str, file_name, line_number, (int) api_ret);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+bool check_cuda_api_call(NppStatus api_ret, unsigned int line_number,
			
 
				+                         const char *file_name, const char *api_call_str) {
			
 
				+    if (api_ret == NPP_SUCCESS) [[likely]] return true;
			
 
				+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}.",
			
 
				+                 api_call_str, file_name, line_number, (int) api_ret);
			
 
				+    RET_ERROR;
			
 
				+}
			
--- a/src/cuda_helper.h
+++ b/src/cuda_helper.h
@@ -0,0 +1,22 @@
 
				+#ifndef HDRSYNTHESIS_CUDA_HELPER_H
			
 
				+#define HDRSYNTHESIS_CUDA_HELPER_H
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+#include <nppdefs.h>
			
 
				+
			
 
				+bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
			
 
				+                         const char *file_name, const char *api_call_str);
			
 
				+
			
 
				+bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
			
 
				+                         const char *file_name, const char *api_call_str);
			
 
				+
			
 
				+bool check_cuda_api_call(NppStatus api_ret, unsigned int line_number,
			
 
				+                         const char *file_name, const char *api_call_str);
			
 
				+
			
 
				+#define CUDA_API_CHECK(api_call) \
			
 
				+    if (!check_cuda_api_call( \
			
 
				+        api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
			
 
				+        return false
			
 
				+
			
 
				+#endif //HDRSYNTHESIS_CUDA_HELPER_H
			
--- a/src/hdr_synthesis.cpp
+++ b/src/hdr_synthesis.cpp
@@ -0,0 +1,322 @@
 
				+#include "hdr_synthesis.h"
			
 
				+#include "cuda_helper.h"
			
 
				+
			
 
				+#include <nppi_arithmetic_and_logical_operations.h>
			
 
				+#include <nppi_color_conversion.h>
			
 
				+#include <nppi_data_exchange_and_initialization.h>
			
 
				+#include <nppi_filtering_functions.h>
			
 
				+
			
 
				+#include <cassert>
			
 
				+
			
 
				+void call_hdr_weight(void *in_f32, size_t in_pitch,
			
 
				+                     void *out_f32, size_t out_pitch,
			
 
				+                     size_t width, size_t height);
			
 
				+
			
 
				+void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
			
 
				+                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
			
 
				+                    void *out_f32, size_t out_pitch,
			
 
				+                    size_t width, size_t height);
			
 
				+
			
 
				+struct hdr_synthesizer::impl {
			
 
				+
			
 
				+    static constexpr auto u8_to_f32_coff = 1.0f / 255;
			
 
				+    static constexpr float u8_to_f32_coff_arr[] = {u8_to_f32_coff,
			
 
				+                                                   u8_to_f32_coff,
			
 
				+                                                   u8_to_f32_coff};
			
 
				+    static constexpr float identity_filter_coff[] = {1};
			
 
				+    static constexpr float gaussian_filter_coff[] = {1 / 16.0f,
			
 
				+                                                     4 / 16.0f,
			
 
				+                                                     6 / 16.0f,
			
 
				+                                                     4 / 16.0f,
			
 
				+                                                     1 / 16.0f};
			
 
				+    static constexpr auto identity_filter_len = 1;
			
 
				+    static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
			
 
				+
			
 
				+    struct image_buffer {
			
 
				+        void *pyr_image_f32, *pyr_weight_f32;
			
 
				+        size_t image_pitch, weight_pitch;
			
 
				+    };
			
 
				+
			
 
				+    uint16_t width, height, pyr_height;
			
 
				+    uint8_t pyr_level;
			
 
				+
			
 
				+    NppiSize full_size;
			
 
				+    NppiRect full_rect;
			
 
				+    NppiPoint origin_point;
			
 
				+    size_t *pyr_offset_arr;
			
 
				+    NppiSize *pyr_size_arr;
			
 
				+    void *identity_filter_coff_f32;
			
 
				+    void *gaussian_filter_coff_f32;
			
 
				+
			
 
				+    // global temporary memory
			
 
				+    void *raw_u8, *rgb_u8, *rgb_f32;
			
 
				+    size_t raw_u8_pitch, rgb_u8_pitch, rgb_f32_pitch;
			
 
				+
			
 
				+    void *rgb2_f32, *rgb3_f32; // for test
			
 
				+
			
 
				+    impl(uint16_t _width, uint16_t _height, uint8_t _level)
			
 
				+            : width(_width), height(_height), pyr_level(_level) {
			
 
				+        pyr_height = height + (height >> 1);
			
 
				+        malloc_global_memory();
			
 
				+
			
 
				+        full_size = NppiSize{width, height};
			
 
				+        full_rect = NppiRect{0, 0, width, height};
			
 
				+        origin_point = NppiPoint{0, 0};
			
 
				+
			
 
				+        pyr_offset_arr = new size_t[pyr_level];
			
 
				+        pyr_size_arr = new NppiSize[pyr_level];
			
 
				+        auto cur_width = width, cur_height = height;
			
 
				+        for (auto i = 0; i < pyr_level; ++i) {
			
 
				+//            assert(cur_width % 2 == 0);
			
 
				+//            assert(cur_height % 2 == 0);
			
 
				+            pyr_offset_arr[i] = (i == 0) ? 0 : (pyr_offset_arr[i - 1] + cur_width);
			
 
				+            cur_width >>= 1;
			
 
				+            cur_height >>= 1;
			
 
				+            pyr_size_arr[i] = NppiSize{cur_width, cur_height};
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    ~impl() {
			
 
				+        free_global_memory();
			
 
				+
			
 
				+        delete pyr_offset_arr;
			
 
				+        delete pyr_size_arr;
			
 
				+    }
			
 
				+
			
 
				+    bool malloc_global_memory() {
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
			
 
				+                                       width * sizeof(unsigned char), height));
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
			
 
				+                                       width * 3 * sizeof(unsigned char), height));
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
			
 
				+                                       width * 3 * sizeof(float), pyr_height));
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
			
 
				+                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
			
 
				+                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				+
			
 
				+        // upload gaussian kernel coefficient
			
 
				+        CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
			
 
				+        CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
			
 
				+                                  sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
			
 
				+        CUDA_API_CHECK(cudaMalloc(&identity_filter_coff_f32, sizeof(identity_filter_coff)));
			
 
				+        CUDA_API_CHECK(cudaMemcpy(identity_filter_coff_f32, identity_filter_coff,
			
 
				+                                  sizeof(identity_filter_coff), cudaMemcpyHostToDevice));
			
 
				+
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool free_global_memory() {
			
 
				+        CUDA_API_CHECK(cudaFree(raw_u8));
			
 
				+        CUDA_API_CHECK(cudaFree(rgb_u8));
			
 
				+        CUDA_API_CHECK(cudaFree(rgb_f32));
			
 
				+        CUDA_API_CHECK(cudaFree(rgb2_f32));
			
 
				+        CUDA_API_CHECK(cudaFree(rgb3_f32));
			
 
				+        CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
			
 
				+        CUDA_API_CHECK(cudaFree(identity_filter_coff_f32));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool malloc_buffer(image_buffer *buf) const {
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_image_f32, &buf->image_pitch,
			
 
				+                                       width * sizeof(float) * 3, pyr_height));
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_weight_f32, &buf->weight_pitch,
			
 
				+                                       width * sizeof(float), pyr_height));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool gaussian_pyramid(void *ptr_f32, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
			
 
				+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				+        CUDA_API_CHECK((is_rgb ?
			
 
				+                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				+                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
			
 
				+                               ((Npp32f *) ptr_f32, pitch, full_size, origin_point,
			
 
				+                                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
			
 
				+                                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+        for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float) * (is_rgb ? 3 : 1);
			
 
				+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * (is_rgb ? 3 : 1);
			
 
				+            CUDA_API_CHECK((is_rgb ?
			
 
				+                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				+                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
			
 
				+                                   ((Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
			
 
				+                                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
			
 
				+                                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+        }
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool laplacian_operation(void *src_f32, NppiSize src_size,
			
 
				+                             void *dst_f32, NppiSize dst_size,
			
 
				+                             size_t pitch) const {
			
 
				+        // up-sampling
			
 
				+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				+                (Npp32f *) src_f32, pitch, src_size, origin_point,
			
 
				+                (Npp32f *) rgb_f32, rgb_f32_pitch, dst_size,
			
 
				+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+        // gaussian blur
			
 
				+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb_f32, rgb_f32_pitch, dst_size, origin_point,
			
 
				+                                                     (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
			
 
				+                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
			
 
				+        // add
			
 
				+//        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb_f32, rgb_f32_pitch,
			
 
				+//                                        (Npp32f *) dst_f32, pitch,
			
 
				+//                                        dst_size));
			
 
				+        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
			
 
				+                                        (Npp32f *) dst_f32, pitch,
			
 
				+                                        dst_size));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool laplacian_pyramid(void *ptr_f32, size_t pitch) const { // for rgb image only
			
 
				+        // generate gaussian pyramid first
			
 
				+        gaussian_pyramid(ptr_f32, pitch, true);
			
 
				+
			
 
				+        // generate laplacian pyramid by up-sampling and subtraction
			
 
				+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				+        laplacian_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
			
 
				+        for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				+            laplacian_operation(pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * 3, pyr_size_arr[i + 1],
			
 
				+                                pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
			
 
				+                                pitch);
			
 
				+        }
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool reconstruct_operation(void *src_f32, NppiSize src_size,
			
 
				+                               void *dst_f32, NppiSize dst_size,
			
 
				+                               size_t pitch) const {
			
 
				+        // up-sampling
			
 
				+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				+                (Npp32f *) src_f32, pitch, src_size, origin_point,
			
 
				+                (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
			
 
				+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+        // gaussian blur
			
 
				+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size, origin_point,
			
 
				+                                                     (Npp32f *) rgb3_f32, rgb_f32_pitch, dst_size,
			
 
				+                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
			
 
				+        // add
			
 
				+//        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
			
 
				+//                                        (Npp32f *) dst_f32, pitch,
			
 
				+//                                        dst_size));
			
 
				+        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb3_f32, rgb_f32_pitch,
			
 
				+                                        (Npp32f *) dst_f32, pitch,
			
 
				+                                        dst_size));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    // reconstruct from laplacian pyramid, for rgb image only
			
 
				+    bool pyramid_reconstruct(void *ptr_f32, size_t pitch) const {
			
 
				+        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				+        for (int i = pyr_level - 1; i > 0; --i) {
			
 
				+            reconstruct_operation(pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
			
 
				+                                  pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float) * 3, pyr_size_arr[i - 1],
			
 
				+                                  pitch);
			
 
				+        }
			
 
				+        reconstruct_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
			
 
				+        // upload image
			
 
				+        CUDA_API_CHECK(cudaMemcpy2D(raw_u8, raw_u8_pitch,
			
 
				+                                    raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
			
 
				+                                    height, cudaMemcpyHostToDevice));
			
 
				+
			
 
				+        // debayer image
			
 
				+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R((Npp8u *) raw_u8, raw_u8_pitch,
			
 
				+                                             full_size, full_rect,
			
 
				+                                             (Npp8u *) rgb_u8, rgb_u8_pitch,
			
 
				+                                             NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
			
 
				+
			
 
				+        // convert to float
			
 
				+        CUDA_API_CHECK(nppiConvert_8u32f_C3R((Npp8u *) rgb_u8, rgb_u8_pitch,
			
 
				+                                             (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
			
 
				+                                             full_size));
			
 
				+        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
			
 
				+                                         (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
			
 
				+                                         full_size)); // normalize
			
 
				+
			
 
				+        // calc weight
			
 
				+        call_hdr_weight(buf->pyr_image_f32, buf->image_pitch,
			
 
				+                        buf->pyr_weight_f32, buf->weight_pitch,
			
 
				+                        width, height);
			
 
				+
			
 
				+        // construct image pyramid
			
 
				+        gaussian_pyramid(buf->pyr_weight_f32, buf->weight_pitch, false);
			
 
				+        laplacian_pyramid(buf->pyr_image_f32, buf->image_pitch);
			
 
				+
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				+                     void *out_u8, size_t out_pitch) const {
			
 
				+        // merge
			
 
				+        assert(buf_a->image_pitch == buf_b->image_pitch);
			
 
				+        assert(buf_a->weight_pitch == buf_b->weight_pitch);
			
 
				+        call_hdr_merge(buf_a->pyr_image_f32, buf_b->pyr_image_f32, buf_a->image_pitch,
			
 
				+                       buf_a->pyr_weight_f32, buf_b->pyr_weight_f32, buf_a->weight_pitch,
			
 
				+                       rgb_f32, rgb_f32_pitch,
			
 
				+                       width, pyr_height);
			
 
				+
			
 
				+        // reconstruct image from laplacian pyramid
			
 
				+        pyramid_reconstruct(rgb_f32, rgb_f32_pitch);
			
 
				+
			
 
				+        // convert to uint8 and copy
			
 
				+        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) rgb_f32, rgb_f32_pitch,
			
 
				+                                             (Npp8u *) out_u8, out_pitch,
			
 
				+                                             full_size, NPP_RND_NEAR));
			
 
				+
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+//    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				+//                     void *out_u8, size_t out_pitch) const {
			
 
				+//        // reconstruct image from laplacian pyramid
			
 
				+//        pyramid_reconstruct(buf_a->pyr_image_f32, buf_a->image_pitch);
			
 
				+//
			
 
				+//        // convert to uint8 and copy
			
 
				+//        float arr[] = {255, 255, 255};
			
 
				+//        nppiMulC_32f_C3IR(arr, (Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch, full_size);
			
 
				+//        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch,
			
 
				+//                                             (Npp8u *) out_u8, out_pitch,
			
 
				+//                                             full_size, NPP_RND_NEAR));
			
 
				+//
			
 
				+//        return true;
			
 
				+//    }
			
 
				+
			
 
				+};
			
 
				+
			
 
				+hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
			
 
				+        : pimpl(std::make_unique<impl>(width, height, pyramid_level)) {
			
 
				+}
			
 
				+
			
 
				+hdr_synthesizer::~hdr_synthesizer() = default;
			
 
				+
			
 
				+bool hdr_synthesizer::malloc_buffer(void **out_buf) {
			
 
				+    *out_buf = new impl::image_buffer{};
			
 
				+    return pimpl->malloc_buffer((impl::image_buffer *) *out_buf);
			
 
				+}
			
 
				+
			
 
				+bool hdr_synthesizer::free_buffer(void *out_buf) {
			
 
				+    auto ptr = (impl::image_buffer *) out_buf;
			
 
				+    CUDA_API_CHECK(cudaFree(ptr->pyr_image_f32));
			
 
				+    CUDA_API_CHECK(cudaFree(ptr->pyr_weight_f32));
			
 
				+    delete ptr;
			
 
				+    return true;
			
 
				+}
			
 
				+
			
 
				+bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
			
 
				+    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr);
			
 
				+}
			
 
				+
			
 
				+bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
			
 
				+                                  void *img_u8, size_t img_pitch) {
			
 
				+    return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
			
 
				+                              img_u8, img_pitch);
			
 
				+}
			
 
				+
			
 
				+void hdr_synthesizer::test_func(void **ptr, size_t *pitch) {
			
 
				+    *ptr = pimpl->rgb_f32;
			
 
				+    *pitch = pimpl->rgb_f32_pitch;
			
 
				+}
			
--- a/src/hdr_synthesis.h
+++ b/src/hdr_synthesis.h
@@ -0,0 +1,29 @@
 
				+#ifndef HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				+#define HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				+
			
 
				+#include <cstdint>
			
 
				+#include <memory>
			
 
				+
			
 
				+class hdr_synthesizer {
			
 
				+public:
			
 
				+
			
 
				+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
			
 
				+
			
 
				+    ~hdr_synthesizer();
			
 
				+
			
 
				+    bool malloc_buffer(void **out_buf); // per image buffer
			
 
				+
			
 
				+    static bool free_buffer(void *out_buf);
			
 
				+
			
 
				+    bool preprocess_image(void *img_buf, uint8_t *img_ptr);
			
 
				+
			
 
				+    bool merge_image(void *buf_a, void *buf_b, void *img_u8, size_t img_pitch);
			
 
				+
			
 
				+    void test_func(void **ptr, size_t *pitch);
			
 
				+
			
 
				+private:
			
 
				+    struct impl;
			
 
				+    std::unique_ptr<impl> pimpl;
			
 
				+};
			
 
				+
			
 
				+#endif //HDRSYNTHESIS_HDR_SYNTHESIS_H
			
--- a/src/hdr_synthesis_kernel.cu
+++ b/src/hdr_synthesis_kernel.cu
@@ -0,0 +1,89 @@
 
				+#include "hdr_synthesis.h"
			
 
				+
			
 
				+static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
			
 
				+static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
			
 
				+static constexpr float smooth_coff = 1e-6;
			
 
				+static constexpr float f32_to_u8_coff = 255;
			
 
				+
			
 
				+static constexpr auto block_size = 16;
			
 
				+
			
 
				+template<typename T>
			
 
				+__device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
			
 
				+    return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
			
 
				+}
			
 
				+
			
 
				+__global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
			
 
				+                           float *out_ptr, size_t out_pitch,
			
 
				+                           size_t width, size_t height) {
			
 
				+    auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+    auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				+    if (x >= height || y >= width) return;
			
 
				+
			
 
				+//    auto in_offset = x * in_pitch + y * 3 * sizeof(float);
			
 
				+//    auto in_data = (float *) ((char *) in_ptr + in_offset);
			
 
				+    auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
			
 
				+    float r = in_data[0];
			
 
				+    float g = in_data[1];
			
 
				+    float b = in_data[2];
			
 
				+
			
 
				+    float rgb_mean = (r + g + b) / 3.0f;
			
 
				+    float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
			
 
				+    float sat_weight = rgb_std * saturation_coff;
			
 
				+
			
 
				+    float rgb_max = fmaxf(fmaxf(r, g), b);
			
 
				+    float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
			
 
				+
			
 
				+//    auto out_offset = x * out_pitch + y * sizeof(float);
			
 
				+//    auto out_data = (float *) ((char *) out_ptr + out_offset);
			
 
				+    auto out_data = smart_offset(out_ptr, out_pitch, x, y);
			
 
				+    *out_data = sat_weight * expo_weight + smooth_coff;
			
 
				+}
			
 
				+
			
 
				+__device__ float pixel_merge(const float *img_a, const float *img_b,
			
 
				+                             const float *wei_a, const float *wei_b,
			
 
				+                             size_t offset) {
			
 
				+    auto w_a = *wei_a, w_b = *wei_b;
			
 
				+    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
			
 
				+}
			
 
				+
			
 
				+__global__ void hdr_merge(const float *img_a, const float *img_b, size_t img_pitch,
			
 
				+                          const float *wei_a, const float *wei_b, size_t wei_pitch,
			
 
				+                          float *out_ptr, size_t out_pitch,
			
 
				+                          size_t width, size_t height) {
			
 
				+    auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+    auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				+    if (x >= height || y >= width) return;
			
 
				+
			
 
				+    auto cur_img_a = smart_offset(img_a, img_pitch, x, y, 3);
			
 
				+    auto cur_img_b = smart_offset(img_b, img_pitch, x, y, 3);
			
 
				+    auto cur_wei_a = smart_offset(wei_a, wei_pitch, x, y);
			
 
				+    auto cur_wei_b = smart_offset(wei_b, wei_pitch, x, y);
			
 
				+    auto cur_out = smart_offset(out_ptr, out_pitch, x, y, 3);
			
 
				+    for (auto i = 0; i < 3; ++i) {
			
 
				+        cur_out[i] = pixel_merge(cur_img_a, cur_img_b, cur_wei_a, cur_wei_b, i) * f32_to_u8_coff;
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void call_hdr_weight(void *in_f32, size_t in_pitch,
			
 
				+                     void *out_f32, size_t out_pitch,
			
 
				+                     size_t width, size_t height) {
			
 
				+    auto block_dims = dim3{block_size, block_size};
			
 
				+    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				+                          (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				+    hdr_weight<<<grid_dims, block_dims>>>((float *) in_f32, in_pitch,
			
 
				+                                          (float *) out_f32, out_pitch,
			
 
				+                                          width, height);
			
 
				+}
			
 
				+
			
 
				+void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
			
 
				+                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
			
 
				+                    void *out_f32, size_t out_pitch,
			
 
				+                    size_t width, size_t height) {
			
 
				+    auto block_dims = dim3{block_size, block_size};
			
 
				+    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				+                          (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				+    hdr_merge<<<grid_dims, block_dims>>>((float *) img_a_f32, (float *) img_b_f32, img_pitch,
			
 
				+                                         (float *) wei_a_f32, (float *) wei_b_f32, wei_pitch,
			
 
				+                                         (float *) out_f32, out_pitch,
			
 
				+                                         width, height);
			
 
				+}
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -0,0 +1,83 @@
 
				+#include "cuda_helper.h"
			
 
				+#include "hdr_synthesis.h"
			
 
				+
			
 
				+#include <cuda_profiler_api.h>
			
 
				+
			
 
				+#include <nppi_color_conversion.h>
			
 
				+#include <nppi_filtering_functions.h>
			
 
				+
			
 
				+#include <opencv2/core/cuda.hpp>
			
 
				+#include <opencv2/imgcodecs.hpp>
			
 
				+
			
 
				+#include <boost/iostreams/device/mapped_file.hpp>
			
 
				+
			
 
				+#include <iostream>
			
 
				+#include <chrono>
			
 
				+
			
 
				+static constexpr auto image_width = 2448;
			
 
				+static constexpr auto image_height = 2048;
			
 
				+
			
 
				+cv::Mat download_image(void *ptr, size_t pitch, size_t width, size_t height, int type) {
			
 
				+    auto gpu_mat = cv::cuda::GpuMat{(int) height, (int) width, type, ptr, pitch};
			
 
				+    cv::Mat mat;
			
 
				+    gpu_mat.download(mat);
			
 
				+    return mat;
			
 
				+}
			
 
				+
			
 
				+struct image_buffer {
			
 
				+    void *pyr_image_f32, *pyr_weight_f32;
			
 
				+    size_t image_pitch, weight_pitch;
			
 
				+};
			
 
				+
			
 
				+int main() {
			
 
				+    auto path_a = "/home/tpx/project/HDRSynthesis/data/phantom2_5ms.raw";
			
 
				+    auto path_b = "/home/tpx/project/HDRSynthesis/data/phantom2_20ms.raw";
			
 
				+
			
 
				+    using boost::iostreams::mapped_file;
			
 
				+    auto img_file_a = mapped_file{path_a, boost::iostreams::mapped_file_base::readonly};
			
 
				+    auto img_file_b = mapped_file{path_b, boost::iostreams::mapped_file_base::readonly};
			
 
				+
			
 
				+    auto hdr = hdr_synthesizer{image_width, image_height};
			
 
				+    void *buf_a, *buf_b;
			
 
				+    hdr.malloc_buffer(&buf_a);
			
 
				+    hdr.malloc_buffer(&buf_b);
			
 
				+    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				+
			
 
				+    auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				+    hdr.merge_image(buf_b, buf_a, img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+
			
 
				+//    for (int i = 0; i < 8; ++i) {
			
 
				+//        auto start_ts = std::chrono::system_clock::now();
			
 
				+//        cudaDeviceSynchronize();
			
 
				+//        hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				+//        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				+//        cudaDeviceSynchronize();
			
 
				+//        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
			
 
				+//                std::chrono::system_clock::now() - start_ts).count() << std::endl;
			
 
				+//    }
			
 
				+//
			
 
				+//    cudaProfilerStart();
			
 
				+//    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				+//    cudaProfilerStop();
			
 
				+//
			
 
				+
			
 
				+    auto real_ptr = (image_buffer *) buf_b;
			
 
				+    auto host_rgb_a = download_image(img_hdr_dev.cudaPtr(), img_hdr_dev.step1(),
			
 
				+                                     image_width, image_height, CV_8UC3);
			
 
				+//    auto host_rgb_a = download_image((char *) real_ptr->pyr_image_f32, real_ptr->image_pitch,
			
 
				+//                                     image_width, image_height, CV_32FC3);
			
 
				+//    void *ptr;
			
 
				+//    size_t pitch;
			
 
				+//    hdr.test_func(&ptr, &pitch);
			
 
				+//    auto host_rgb_a = download_image(ptr, pitch,
			
 
				+//                                     image_width, image_height, CV_32FC3);
			
 
				+
			
 
				+    double min_val, max_val;
			
 
				+    cv::minMaxLoc(host_rgb_a, &min_val, &max_val);
			
 
				+    std::cout << min_val << " " << max_val << " " << cv::mean(host_rgb_a) << std::endl;
			
 
				+
			
 
				+    cv::imwrite("test.bmp", host_rgb_a);
			
 
				+
			
 
				+    return 0;
			
 
				+}