il y a 2 ans · 2571498b00
--- a/src/hdr/CMakeLists.txt
+++ b/src/hdr/CMakeLists.txt
@@ -19,4 +19,9 @@ target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
 
				 # CUDA config
			
 
				 find_package(CUDAToolkit REQUIRED)
			
 
				 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
			
 
				-target_link_libraries(${PROJECT_NAME} nppc nppicc nppif nppial nppidei)
			
 
				+target_link_libraries(${PROJECT_NAME} nppc nppicc nppif nppial nppidei)
			
 
				+
			
 
				+# OpenCV config
			
 
				+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
			
 
				+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
			
--- a/src/hdr/hdr_synthesis.cpp
+++ b/src/hdr/hdr_synthesis.cpp
@@ -1,5 +1,6 @@
 
				 #include "hdr_synthesis.h"
			
 
				 #include "hdr_synthesis_priv.h"
			
 
				+#include "../config.h"
			
 
				 #include "../cuda_helper.hpp"
			
 
				 
			
 
				 #include <nppi_arithmetic_and_logical_operations.h>
			
@@ -8,24 +9,9 @@
 
				 #include <nppi_filtering_functions.h>
			
 
				 
			
 
				 #include <cassert>
			
 
				-#include <numeric>
			
 
				 
			
 
				-template<typename T>
			
 
				-struct smart_buffer {
			
 
				-    T *ptr = nullptr;
			
 
				-    size_t pitch = 0;
			
 
				-
			
 
				-    size_t width, height, elem_cnt;
			
 
				-
			
 
				-    smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
			
 
				-            : width(_width), height(_height), elem_cnt(_elem_cnt) {
			
 
				-        auto width_bytes = width * elem_cnt * sizeof(T);
			
 
				-        CALL_ASSERT(cudaMallocPitch(&ptr, &pitch, width_bytes, height) == cudaSuccess);
			
 
				-    }
			
 
				-
			
 
				-    ~smart_buffer() {
			
 
				-        CALL_ASSERT(cudaFree(ptr) == cudaSuccess);
			
 
				-    }
			
 
				+struct hdr_buffer {
			
 
				+    cv::cuda::GpuMat image_pyr, weight_pyr;
			
 
				 };
			
 
				 
			
 
				 struct hdr_synthesizer::impl {
			
@@ -41,11 +27,7 @@ struct hdr_synthesizer::impl {
 
				                                                      1 / 16.0f};
			
 
				     static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
			
 
				 
			
 
				-    struct image_buffer {
			
 
				-        smart_buffer<Npp32f> *image_pyr, *weight_pyr;
			
 
				-    };
			
 
				-
			
 
				-    uint16_t width, height, pyr_height;
			
 
				+    cv::Size size, pyr_size;
			
 
				     uint8_t pyr_level;
			
 
				 
			
 
				     NppiSize full_size;
			
@@ -55,32 +37,33 @@ struct hdr_synthesizer::impl {
 
				     NppiSize *pyr_size_arr;
			
 
				     void *gaussian_filter_coff_f32;
			
 
				 
			
 
				-    NppStreamContext npp_ctx, extra_npp_ctx;
			
 
				-    cudaStream_t main_stream, extra_stream;
			
 
				+    NppStreamContext main_npp_ctx, extra_npp_ctx;
			
 
				+    cv::cuda::Stream &main_stream;
			
 
				+    cv::cuda::Stream extra_stream;
			
 
				     cudaEvent_t sync_event;
			
 
				 
			
 
				     // global temporary memory
			
 
				-    smart_buffer<Npp8u> *rgb_u8;
			
 
				-    smart_buffer<Npp32f> *rgb_f32[2];
			
 
				-
			
 
				-    impl(uint16_t _width, uint16_t _height, uint8_t _level, cudaStream_t stream)
			
 
				-            : width(_width), height(_height), pyr_level(_level), main_stream(stream) {
			
 
				-        rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
			
 
				-        rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				-        rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				+    cv::cuda::GpuMat rgb_u8, rgb_f32[2];
			
 
				+
			
 
				+    impl(cv::Size _size, uint8_t _level, cv::cuda::Stream &stream)
			
 
				+            : size(_size), pyr_level(_level), main_stream(stream) {
			
 
				+        rgb_u8.create(size, CV_8UC3);
			
 
				+        for (auto &mat: rgb_f32) {
			
 
				+            mat.create(size, CV_32FC3);
			
 
				+        }
			
 
				         CALL_ASSERT(malloc_dev_mem());
			
 
				 
			
 
				-        init_npp_ctx(&npp_ctx, main_stream);
			
 
				+        init_npp_ctx(&main_npp_ctx, main_stream);
			
 
				         init_npp_ctx(&extra_npp_ctx, extra_stream);
			
 
				 
			
 
				-        pyr_height = height + (height >> 1);
			
 
				-        full_size = NppiSize{width, height};
			
 
				-        full_rect = NppiRect{0, 0, width, height};
			
 
				+        pyr_size = cv::Size{size.width, size.height + (size.height >> 1)};
			
 
				+        full_size = NppiSize{size.width, size.height};
			
 
				+        full_rect = NppiRect{0, 0, size.width, size.height};
			
 
				         origin_point = NppiPoint{0, 0};
			
 
				 
			
 
				         pyr_offset_arr = new size_t[pyr_level];
			
 
				         pyr_size_arr = new NppiSize[pyr_level];
			
 
				-        auto cur_width = width, cur_height = height;
			
 
				+        auto cur_width = size.width, cur_height = size.height;
			
 
				         for (auto i = 0; i < pyr_level; ++i) {
			
 
				 //            assert(cur_width % 2 == 0);
			
 
				 //            assert(cur_height % 2 == 0);
			
@@ -92,11 +75,7 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     ~impl() {
			
 
				-        delete rgb_u8;
			
 
				-        delete rgb_f32[0];
			
 
				-        delete rgb_f32[1];
			
 
				         CALL_ASSERT(free_dev_mem());
			
 
				-
			
 
				         delete pyr_offset_arr;
			
 
				         delete pyr_size_arr;
			
 
				     }
			
@@ -107,7 +86,6 @@ struct hdr_synthesizer::impl {
 
				         CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
			
 
				                                   sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
			
 
				 
			
 
				-        CUDA_API_CHECK(cudaStreamCreate(&extra_stream));
			
 
				         CUDA_API_CHECK(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));
			
 
				 
			
 
				         return true;
			
@@ -115,13 +93,12 @@ struct hdr_synthesizer::impl {
 
				 
			
 
				     bool free_dev_mem() {
			
 
				         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
			
 
				-        CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
			
 
				         CUDA_API_CHECK(cudaEventDestroy(sync_event));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    static bool init_npp_ctx(NppStreamContext *ctx, cudaStream_t stream) {
			
 
				-        ctx->hStream = stream;
			
 
				+    static bool init_npp_ctx(NppStreamContext *ctx, cv::cuda::Stream &stream) {
			
 
				+        ctx->hStream = (cudaStream_t) stream.cudaPtr();
			
 
				         CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
			
 
				         cudaDeviceProp dev_prop = {};
			
 
				         CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
			
@@ -133,34 +110,33 @@ struct hdr_synthesizer::impl {
 
				                                               cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
			
 
				         CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
			
 
				                                               cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
			
 
				-        CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
			
 
				-        return true;
			
 
				-    }
			
 
				-
			
 
				-    bool malloc_buffer(image_buffer *buf) const {
			
 
				-        buf->image_pyr = new smart_buffer<Npp32f>(width, pyr_height, 3);
			
 
				-        buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
			
 
				+        CUDA_API_CHECK(cudaStreamGetFlags(ctx->hStream, &ctx->nStreamFlags));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb,
			
 
				-                          NppStreamContext *ctx) const { // construct gaussian pyramid
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, is_rgb ? 3 : 1);
			
 
				+    bool gaussian_pyramid(cv::InputOutputArray img, NppStreamContext &ctx) const { // construct gaussian pyramid
			
 
				+        assert(img.isGpuMat());
			
 
				+        auto &img_dev = img.getGpuMatRef();
			
 
				+        assert(img_dev.size() == pyr_size);
			
 
				+        auto img_ptr = (float *) img_dev.cudaPtr();
			
 
				+        auto pyr_ptr = (float *) img_dev.ptr(size.height); // start position of pyramid images
			
 
				+        bool is_rgb = (img_dev.channels() == 3);
			
 
				+        assert(is_rgb || img_dev.channels() == 1);
			
 
				         CUDA_API_CHECK((is_rgb ?
			
 
				                         nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
			
 
				                         nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
			
 
				-                ptr, pitch, full_size, origin_point,
			
 
				-                pyr_ptr, pitch, pyr_size_arr[0],
			
 
				-                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
			
 
				+                img_ptr, img_dev.step, full_size, origin_point,
			
 
				+                pyr_ptr, img_dev.step, pyr_size_arr[0],
			
 
				+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, ctx));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, is_rgb ? 3 : 1);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, is_rgb ? 3 : 1);
			
 
				+            auto src_ptr = smart_offset(pyr_ptr, img_dev.step, pyr_offset_arr[i], 0, img_dev.channels());
			
 
				+            auto dst_ptr = smart_offset(pyr_ptr, img_dev.step, pyr_offset_arr[i + 1], 0, img_dev.channels());
			
 
				             CUDA_API_CHECK((is_rgb ?
			
 
				                             nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
			
 
				                             nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
			
 
				-                    src_ptr, pitch, pyr_size_arr[i], origin_point,
			
 
				-                    dst_ptr, pitch, pyr_size_arr[i + 1],
			
 
				-                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
			
 
				+                    src_ptr, img_dev.step, pyr_size_arr[i], origin_point,
			
 
				+                    dst_ptr, img_dev.step, pyr_size_arr[i + 1],
			
 
				+                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, ctx));
			
 
				         }
			
 
				         return true;
			
 
				     }
			
@@ -172,18 +148,22 @@ struct hdr_synthesizer::impl {
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
			
 
				-                           NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
			
 
				+    // construct laplacian pyramid
			
 
				+    bool laplacian_pyramid(cv::InputOutputArray img, NppStreamContext &ctx, cv::cuda::Stream &stream) const {
			
 
				         // generate gaussian pyramid first
			
 
				-        CALL_CHECK(gaussian_pyramid(ptr, pitch, true, ctx));
			
 
				+        CALL_CHECK(gaussian_pyramid(img, ctx));
			
 
				 
			
 
				         // generate laplacian pyramid by up-sampling and subtraction
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        CALL_CHECK(laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
			
 
				+        auto &img_dev = img.getGpuMatRef();
			
 
				+        auto img_ptr = (float *) img_dev.cudaPtr();
			
 
				+        auto pyr_ptr = (float *) img_dev.ptr(size.height);
			
 
				+        auto cuda_stream = (cudaStream_t) stream.cudaPtr();
			
 
				+        CALL_CHECK(laplacian_operation(pyr_ptr, img_ptr, img_dev.step, pyr_size_arr[0], full_size, cuda_stream));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				-            CALL_CHECK(laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream));
			
 
				+            auto src_ptr = smart_offset(pyr_ptr, img_dev.step, pyr_offset_arr[i + 1], 0, img_dev.channels());
			
 
				+            auto dst_ptr = smart_offset(pyr_ptr, img_dev.step, pyr_offset_arr[i], 0, img_dev.channels());
			
 
				+            CALL_CHECK(laplacian_operation(src_ptr, dst_ptr, img_dev.step,
			
 
				+                                           pyr_size_arr[i + 1], pyr_size_arr[i], cuda_stream));
			
 
				         }
			
 
				         return true;
			
 
				     }
			
@@ -196,96 +176,106 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     // reconstruct from laplacian pyramid, for rgb image only
			
 
				-    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, cudaStream_t stream) const {
			
 
				+    bool pyramid_reconstruct(cv::InputOutputArray img, cv::cuda::Stream &stream) const {
			
 
				+        auto &img_dev = img.getGpuMatRef();
			
 
				+        auto img_ptr = (float *) img_dev.cudaPtr();
			
 
				+        auto cuda_stream = (cudaStream_t) stream.cudaPtr();
			
 
				         for (int i = pyr_level - 1; i > 0; --i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
			
 
				-            CALL_CHECK(reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream));
			
 
				+            auto src_ptr = smart_offset(img_ptr, img_dev.step, pyr_offset_arr[i], size.height, img_dev.channels());
			
 
				+            auto dst_ptr = smart_offset(img_ptr, img_dev.step, pyr_offset_arr[i - 1], size.height, img_dev.channels());
			
 
				+            CALL_CHECK(reconstruct_operation(src_ptr, dst_ptr, img_dev.step,
			
 
				+                                             pyr_size_arr[i], pyr_size_arr[i - 1], cuda_stream));
			
 
				         }
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        CALL_CHECK(reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
			
 
				+        auto pyr_ptr = (float *) img_dev.ptr(size.height);
			
 
				+        CALL_CHECK(reconstruct_operation(pyr_ptr, img_ptr, img_dev.step, pyr_size_arr[0], full_size, cuda_stream));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool preprocess_image(image_buffer *buf, void *raw_u8, size_t pitch) {
			
 
				+    bool preprocess_image(hdr_buffer *buf, cv::InputArray img) {
			
 
				+        assert(img.isGpuMat());
			
 
				+        auto img_dev = img.getGpuMat();
			
 
				+        assert(img_dev.size() == size);
			
 
				+
			
 
				         // debayer image
			
 
				-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx((Npp8u *) raw_u8, pitch, full_size, full_rect,
			
 
				-                                                 rgb_u8->ptr, rgb_u8->pitch,
			
 
				-                                                 NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
			
 
				+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx((Npp8u *) img_dev.cudaPtr(), img_dev.step, full_size, full_rect,
			
 
				+                                                 (Npp8u *) rgb_u8.cudaPtr(), rgb_u8.step,
			
 
				+                                                 NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, main_npp_ctx));
			
 
				 
			
 
				         // convert to float
			
 
				-        CUDA_API_CHECK(nppiConvert_8u32f_C3R_Ctx(rgb_u8->ptr, rgb_u8->pitch,
			
 
				-                                                 buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				-                                                 full_size, npp_ctx));
			
 
				+        CUDA_API_CHECK(nppiConvert_8u32f_C3R_Ctx((Npp8u *) rgb_u8.cudaPtr(), rgb_u8.step,
			
 
				+                                                 (Npp32f *) buf->image_pyr.cudaPtr(), buf->image_pyr.step,
			
 
				+                                                 full_size, main_npp_ctx));
			
 
				         CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
			
 
				-                                             buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				-                                             full_size, npp_ctx));
			
 
				-        CUDA_API_CHECK(cudaEventRecord(sync_event, main_stream));
			
 
				+                                             (Npp32f *) buf->image_pyr.cudaPtr(), buf->image_pyr.step,
			
 
				+                                             full_size, main_npp_ctx));
			
 
				+        CUDA_API_CHECK(cudaEventRecord(sync_event, (cudaStream_t) main_stream.cudaPtr()));
			
 
				 
			
 
				         // calc weight and construct pyramid
			
 
				-        CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
			
 
				-        call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
			
 
				-                        buf->weight_pyr->ptr, buf->weight_pyr->pitch,
			
 
				-                        width, height, extra_stream); // parallel execution for weight related calculation
			
 
				-        CALL_CHECK(gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx));
			
 
				-        CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
			
 
				+        CUDA_API_CHECK(cudaStreamWaitEvent((cudaStream_t) extra_stream.cudaPtr(), sync_event));
			
 
				+        // parallel execution for weight related calculation
			
 
				+        call_hdr_weight((Npp8u *) rgb_u8.cudaPtr(), rgb_u8.step,
			
 
				+                        (Npp32f *) buf->weight_pyr.cudaPtr(), buf->weight_pyr.step,
			
 
				+                        size.width, size.height, (cudaStream_t) extra_stream.cudaPtr());
			
 
				+        CALL_CHECK(gaussian_pyramid(buf->weight_pyr, extra_npp_ctx));
			
 
				+        CUDA_API_CHECK(cudaEventRecord(sync_event, (cudaStream_t) extra_stream.cudaPtr()));
			
 
				 
			
 
				         // construct image pyramid
			
 
				-        CALL_CHECK(laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, main_stream));
			
 
				-        CUDA_API_CHECK(cudaStreamWaitEvent(main_stream, sync_event));
			
 
				+        CALL_CHECK(laplacian_pyramid(buf->image_pyr, main_npp_ctx, main_stream));
			
 
				+        CUDA_API_CHECK(cudaStreamWaitEvent((cudaStream_t) main_stream.cudaPtr(), sync_event));
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				-                     uint8_t *out_ptr, size_t out_pitch) {
			
 
				-        assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
			
 
				-        assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
			
 
				+    bool merge_image(hdr_buffer *buf_a, hdr_buffer *buf_b, cv::OutputArray img_out) {
			
 
				+        assert(buf_a->image_pyr.step == buf_b->image_pyr.step);
			
 
				+        assert(buf_a->weight_pyr.step == buf_b->weight_pyr.step);
			
 
				 
			
 
				         // merge
			
 
				-        call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				-                       buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
			
 
				-                       width, pyr_height, main_stream);
			
 
				+        call_hdr_merge((Npp32f *) buf_a->image_pyr.cudaPtr(),
			
 
				+                       (Npp32f *) buf_b->image_pyr.cudaPtr(),
			
 
				+                       buf_a->image_pyr.step,
			
 
				+                       (Npp32f *) buf_a->weight_pyr.cudaPtr(),
			
 
				+                       (Npp32f *) buf_b->weight_pyr.cudaPtr(),
			
 
				+                       buf_a->weight_pyr.step,
			
 
				+                       pyr_size.width, pyr_size.height,
			
 
				+                       (cudaStream_t) main_stream.cudaPtr());
			
 
				 
			
 
				         // reconstruct image from laplacian pyramid
			
 
				-        CALL_CHECK(pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, main_stream));
			
 
				+        CALL_CHECK(pyramid_reconstruct(buf_a->image_pyr, main_stream));
			
 
				 
			
 
				         // convert to uint8
			
 
				-        CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				-                                                 out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
			
 
				+        assert(img_out.isGpuMat());
			
 
				+        auto &img_dev = img_out.getGpuMatRef();
			
 
				+        CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx((Npp32f *) buf_a->image_pyr.cudaPtr(), buf_a->image_pyr.step,
			
 
				+                                                 (Npp8u *) img_dev.cudaPtr(), img_dev.step,
			
 
				+                                                 full_size, NPP_RND_NEAR, main_npp_ctx));
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				 };
			
 
				 
			
 
				-hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height,
			
 
				-                                 cudaStream_t stream, uint8_t pyramid_level)
			
 
				-        : pimpl(std::make_unique<impl>(width, height, pyramid_level, stream)) {
			
 
				+hdr_synthesizer::hdr_synthesizer(cv::Size size, uint8_t pyramid_level, cv::cuda::Stream &stream)
			
 
				+        : pimpl(std::make_unique<impl>(size, pyramid_level, stream)) {
			
 
				 }
			
 
				 
			
 
				 hdr_synthesizer::~hdr_synthesizer() = default;
			
 
				 
			
 
				-bool hdr_synthesizer::malloc_buffer(void **out_buf) {
			
 
				-    *out_buf = new impl::image_buffer{};
			
 
				-    return pimpl->malloc_buffer((impl::image_buffer *) *out_buf);
			
 
				+hdr_buffer *hdr_synthesizer::malloc_buffer() {
			
 
				+    auto ret = new hdr_buffer;
			
 
				+    ret->image_pyr.create(pimpl->pyr_size, CV_32FC3);
			
 
				+    ret->weight_pyr.create(pimpl->pyr_size, CV_32FC1);
			
 
				+    return ret;
			
 
				 }
			
 
				 
			
 
				-bool hdr_synthesizer::free_buffer(void *out_buf) {
			
 
				-    if (out_buf == nullptr) return true;
			
 
				-    auto ptr = (impl::image_buffer *) out_buf;
			
 
				-    delete ptr->image_pyr;
			
 
				-    delete ptr->weight_pyr;
			
 
				-    delete ptr;
			
 
				-    return true;
			
 
				+void hdr_synthesizer::free_buffer(hdr_buffer *buf) {
			
 
				+    delete buf;
			
 
				 }
			
 
				 
			
 
				-bool hdr_synthesizer::preprocess_image(void *img_buf, void *img_ptr, size_t pitch) {
			
 
				-    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, pitch);
			
 
				+void hdr_synthesizer::preprocess_image(hdr_buffer *buf, cv::InputArray img_dev) {
			
 
				+    CALL_ASSERT(pimpl->preprocess_image(buf, img_dev));
			
 
				 }
			
 
				 
			
 
				-bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
			
 
				-                                  uint8_t *img_ptr, size_t img_pitch) {
			
 
				-    return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
			
 
				-                              img_ptr, img_pitch);
			
 
				+void hdr_synthesizer::merge_image(hdr_buffer *buf_a, hdr_buffer *buf_b, cv::OutputArray img_dev_out) {
			
 
				+    CALL_ASSERT(pimpl->merge_image(buf_a, buf_b, img_dev_out));
			
 
				 }
			
--- a/src/hdr/hdr_synthesis.h
+++ b/src/hdr/hdr_synthesis.h
@@ -1,27 +1,31 @@
 
				 #ifndef HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				 #define HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				 
			
 
				-#include <cuda_runtime.h>
			
 
				+#include <opencv2/core/cuda.hpp>
			
 
				 
			
 
				 #include <cstdint>
			
 
				 #include <memory>
			
 
				 
			
 
				+static constexpr auto default_pyramid_level = 6;
			
 
				+
			
 
				+class hdr_buffer;
			
 
				+
			
 
				 // TODO: use half precision float point number to further optimize
			
 
				 class hdr_synthesizer {
			
 
				 public:
			
 
				 
			
 
				-    hdr_synthesizer(uint16_t width, uint16_t height,
			
 
				-                    cudaStream_t stream = nullptr, uint8_t pyramid_level = 4);
			
 
				+    explicit hdr_synthesizer(cv::Size size, uint8_t pyramid_level = default_pyramid_level,
			
 
				+                             cv::cuda::Stream &stream = cv::cuda::Stream::Null());
			
 
				 
			
 
				     ~hdr_synthesizer();
			
 
				 
			
 
				-    bool malloc_buffer(void **out_buf); // per image buffer
			
 
				+    hdr_buffer *malloc_buffer(); // per image buffer
			
 
				 
			
 
				-    static bool free_buffer(void *out_buf);
			
 
				+    static void free_buffer(hdr_buffer *buf);
			
 
				 
			
 
				-    bool preprocess_image(void *img_buf, void *img_ptr, size_t pitch);
			
 
				+    void preprocess_image(hdr_buffer *buf, cv::InputArray img_dev);
			
 
				 
			
 
				-    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
			
 
				+    void merge_image(hdr_buffer *buf_a, hdr_buffer *buf_b, cv::OutputArray img_dev_out);
			
 
				 
			
 
				 private:
			
 
				     struct impl;
			
--- a/src/mvs_camera.cpp
+++ b/src/mvs_camera.cpp
@@ -40,8 +40,8 @@ bool check_mvs_api_call(int api_ret, unsigned int line_number,
 
				         return false
			
 
				 
			
 
				 struct mvs_camera::impl {
			
 
				-    static constexpr auto image_width = 2448;
			
 
				-    static constexpr auto image_height = 2048;
			
 
				+    static constexpr auto image_width = 1224;
			
 
				+    static constexpr auto image_height = 1024;
			
 
				     static constexpr auto image_size = image_width * sizeof(uint8_t) * image_height;
			
 
				 
			
 
				     void *handle = nullptr;
			
@@ -99,6 +99,8 @@ struct mvs_camera::impl {
 
				         // config camera
			
 
				         assert(handle != nullptr);
			
 
				         MVS_API_CHECK(MV_CC_SetEnumValue(handle, "PixelFormat", PixelType_Gvsp_BayerRG8));
			
 
				+        MVS_API_CHECK(MV_CC_SetEnumValue(handle, "BinningHorizontal", 2));
			
 
				+        MVS_API_CHECK(MV_CC_SetEnumValue(handle, "BinningVertical", 2));
			
 
				         MVS_API_CHECK(MV_CC_SetEnumValue(handle, "AcquisitionMode",
			
 
				                                          MV_CAM_ACQUISITION_MODE::MV_ACQ_MODE_CONTINUOUS));
			
 
				         MVS_API_CHECK(MV_CC_SetEnumValue(handle, "TriggerMode", MV_TRIGGER_MODE_ON));
			
--- a/src/stereo_camera.cpp
+++ b/src/stereo_camera.cpp
@@ -16,7 +16,7 @@ struct advanced_camera {
 
				 
			
 
				     bool use_hdr = false;
			
 
				     std::unique_ptr<hdr_synthesizer> hdr;
			
 
				-    void *hdr_buf_last = nullptr, *hdr_buf_cur = nullptr;
			
 
				+    hdr_buffer *hdr_buf_last = nullptr, *hdr_buf_cur = nullptr;
			
 
				 
			
 
				     image_buffer *img_buf = nullptr;
			
 
				     cv::cuda::GpuMat out_rgb_u8;
			
@@ -29,8 +29,7 @@ struct advanced_camera {
 
				         mvs_cam->set_ring_buffer(ring_buf.get());
			
 
				 
			
 
				         auto img_size = mvs_camera::get_output_size(); // TODO: optimize signature of hdr_synthesizer
			
 
				-        hdr = std::make_unique<hdr_synthesizer>(img_size.width, img_size.height,
			
 
				-                                                (cudaStream_t) cuda_stream->cudaPtr(), 6);
			
 
				+        hdr = std::make_unique<hdr_synthesizer>(img_size, 6, *cuda_stream);
			
 
				 
			
 
				         out_rgb_u8.create(img_size, CV_8UC3); // TODO: get output size from hdr_synthesizer
			
 
				     }
			
@@ -40,8 +39,8 @@ struct advanced_camera {
 
				     }
			
 
				 
			
 
				     bool clear_hdr_buf() {
			
 
				-        CALL_CHECK(hdr_synthesizer::free_buffer(hdr_buf_last));
			
 
				-        CALL_CHECK(hdr_synthesizer::free_buffer(hdr_buf_cur));
			
 
				+        hdr_synthesizer::free_buffer(hdr_buf_last);
			
 
				+        hdr_synthesizer::free_buffer(hdr_buf_cur);
			
 
				         hdr_buf_last = nullptr;
			
 
				         hdr_buf_cur = nullptr;
			
 
				         return true;
			
@@ -53,11 +52,10 @@ struct advanced_camera {
 
				         if (use_hdr) {
			
 
				             std::swap(hdr_buf_last, hdr_buf_cur);
			
 
				             if (hdr_buf_cur == nullptr) [[unlikely]] {
			
 
				-                CALL_ASSERT(hdr->malloc_buffer(&hdr_buf_cur)); // TODO: make hdr_synthesizer throw no error
			
 
				+                hdr_buf_cur = hdr->malloc_buffer();
			
 
				             }
			
 
				-            CALL_ASSERT(hdr->preprocess_image(hdr_buf_cur, img_buf->cuda_img.cudaPtr(), img_buf->cuda_img.step1()));
			
 
				-            CALL_ASSERT(hdr->merge_image(hdr_buf_last ? hdr_buf_last : hdr_buf_cur, hdr_buf_cur,
			
 
				-                                         (uint8_t *) out_rgb_u8.cudaPtr(), out_rgb_u8.step1()));
			
 
				+            hdr->preprocess_image(hdr_buf_cur, img_buf->cuda_img);
			
 
				+            hdr->merge_image(hdr_buf_last ? hdr_buf_last : hdr_buf_cur, hdr_buf_cur, out_rgb_u8);
			
 
				         } else { // hdr disabled
			
 
				             CALL_ASSERT(cudaStreamSynchronize((cudaStream_t) cuda_stream->cudaPtr()) == cudaSuccess);
			
 
				             cv::cuda::cvtColor(img_buf->cuda_img, out_rgb_u8,
			
@@ -66,6 +64,7 @@ struct advanced_camera {
 
				     }
			
 
				 
			
 
				     void finish_frame() {
			
 
				+        CALL_ASSERT(cudaStreamSynchronize((cudaStream_t) cuda_stream->cudaPtr()) == cudaSuccess);
			
 
				         ring_buf->recycle_buffer(img_buf);
			
 
				         img_buf = nullptr;
			
 
				     }