2 năm trước cách đây · 00886fa2b9
--- a/src/cuda_helper.h
+++ b/src/cuda_helper.h
@@ -5,6 +5,11 @@
 
				 #include <cuda_runtime.h>
			
 
				 #include <nppdefs.h>
			
 
				 
			
 
				+#include <spdlog/spdlog.h>
			
 
				+#include <fmt/format.h>
			
 
				+
			
 
				+#include <stdexcept>
			
 
				+
			
 
				 bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
			
 
				                          const char *file_name, const char *api_call_str);
			
 
				 
			
@@ -14,9 +19,39 @@ bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
 
				 bool check_cuda_api_call(NppStatus api_ret, unsigned int line_number,
			
 
				                          const char *file_name, const char *api_call_str);
			
 
				 
			
 
				+#define RET_ERROR \
			
 
				+    assert(false); \
			
 
				+    return false; \
			
 
				+    (void) 0
			
 
				+
			
 
				+inline bool check_function_call(bool function_ret, unsigned int line_number,
			
 
				+                                const char *file_name, const char *function_call_str) {
			
 
				+    if (function_ret) [[likely]] return true;
			
 
				+    SPDLOG_ERROR("Function call {} failed at {}:{}.",
			
 
				+                 function_call_str, file_name, line_number);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+inline void check_function_call_exception(bool function_ret, unsigned int line_number,
			
 
				+                                          const char *file_name, const char *function_call_str) {
			
 
				+    if (function_ret) [[likely]] return;
			
 
				+    auto msg = fmt::format("Function call {} failed at {}:{}.",
			
 
				+                           function_call_str, file_name, line_number);
			
 
				+    throw std::runtime_error(msg);
			
 
				+}
			
 
				+
			
 
				 #define CUDA_API_CHECK(api_call) \
			
 
				     if (!check_cuda_api_call( \
			
 
				         api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
			
 
				         return false
			
 
				 
			
 
				+#define CALL_CHECK(function_call) \
			
 
				+    if (!check_function_call( \
			
 
				+        function_call, __LINE__, __FILE__, #function_call)) [[unlikely]] \
			
 
				+        return false
			
 
				+
			
 
				+#define CALL_ASSERT_EXCEPTION(function_call) \
			
 
				+    check_function_call_exception( \
			
 
				+        function_call, __LINE__, __FILE__, #function_call)
			
 
				+
			
 
				 #endif //HDRSYNTHESIS_CUDA_HELPER_H
			
--- a/src/hdr_synthesis.cpp
+++ b/src/hdr_synthesis.cpp
@@ -19,21 +19,12 @@ struct smart_buffer {
 
				 
			
 
				     smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
			
 
				             : width(_width), height(_height), elem_cnt(_elem_cnt) {
			
 
				-        malloc_memory();
			
 
				+        auto width_bytes = width * elem_cnt * sizeof(T);
			
 
				+        CALL_ASSERT_EXCEPTION(cudaMallocPitch(&ptr, &pitch, width_bytes, height) == cudaSuccess);
			
 
				     }
			
 
				 
			
 
				     ~smart_buffer() {
			
 
				-        free_memory();
			
 
				-    }
			
 
				-
			
 
				-    bool malloc_memory() {
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
			
 
				-        return true;
			
 
				-    }
			
 
				-
			
 
				-    bool free_memory() {
			
 
				-        CUDA_API_CHECK(cudaFree(ptr));
			
 
				-        return true;
			
 
				+        CALL_ASSERT_EXCEPTION(cudaFree(ptr) == cudaSuccess);
			
 
				     }
			
 
				 };
			
 
				 
			
@@ -64,20 +55,23 @@ struct hdr_synthesizer::impl {
 
				     NppiSize *pyr_size_arr;
			
 
				     void *gaussian_filter_coff_f32;
			
 
				 
			
 
				-    NppStreamContext npp_ctx; // user provided stream
			
 
				-    NppStreamContext extra_npp_ctx;
			
 
				-    cudaStream_t extra_stream;
			
 
				+    NppStreamContext npp_ctx, extra_npp_ctx;
			
 
				+    cudaStream_t main_stream, extra_stream;
			
 
				     cudaEvent_t sync_event;
			
 
				 
			
 
				     // global temporary memory
			
 
				-    smart_buffer<Npp8u> *raw_u8, *rgb_u8;
			
 
				+    smart_buffer<Npp8u> *rgb_u8;
			
 
				     smart_buffer<Npp32f> *rgb_f32[2];
			
 
				 
			
 
				-    impl(uint16_t _width, uint16_t _height, uint8_t _level)
			
 
				-            : width(_width), height(_height), pyr_level(_level) {
			
 
				-        malloc_global_memory();
			
 
				-        init_npp_ctx(&npp_ctx);
			
 
				-        init_npp_ctx(&extra_npp_ctx);
			
 
				+    impl(uint16_t _width, uint16_t _height, uint8_t _level, cudaStream_t stream)
			
 
				+            : width(_width), height(_height), pyr_level(_level), main_stream(stream) {
			
 
				+        rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
			
 
				+        rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				+        rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				+        CALL_ASSERT_EXCEPTION(malloc_dev_mem());
			
 
				+
			
 
				+        init_npp_ctx(&npp_ctx, main_stream);
			
 
				+        init_npp_ctx(&extra_npp_ctx, extra_stream);
			
 
				 
			
 
				         pyr_height = height + (height >> 1);
			
 
				         full_size = NppiSize{width, height};
			
@@ -98,18 +92,16 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     ~impl() {
			
 
				-        free_global_memory();
			
 
				+        delete rgb_u8;
			
 
				+        delete rgb_f32[0];
			
 
				+        delete rgb_f32[1];
			
 
				+        CALL_ASSERT_EXCEPTION(free_dev_mem());
			
 
				 
			
 
				         delete pyr_offset_arr;
			
 
				         delete pyr_size_arr;
			
 
				     }
			
 
				 
			
 
				-    bool malloc_global_memory() {
			
 
				-        raw_u8 = new smart_buffer<Npp8u>(width, height, 1);
			
 
				-        rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
			
 
				-        rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				-        rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
			
 
				-
			
 
				+    bool malloc_dev_mem() {
			
 
				         // upload gaussian kernel coefficient
			
 
				         CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
			
 
				         CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
			
@@ -121,21 +113,15 @@ struct hdr_synthesizer::impl {
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool free_global_memory() {
			
 
				+    bool free_dev_mem() {
			
 
				         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
			
 
				         CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
			
 
				         CUDA_API_CHECK(cudaEventDestroy(sync_event));
			
 
				-
			
 
				-        delete raw_u8;
			
 
				-        delete rgb_u8;
			
 
				-        delete rgb_f32[0];
			
 
				-        delete rgb_f32[1];
			
 
				-
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    static bool init_npp_ctx(NppStreamContext *ctx) {
			
 
				-        ctx->hStream = nullptr;
			
 
				+    static bool init_npp_ctx(NppStreamContext *ctx, cudaStream_t stream) {
			
 
				+        ctx->hStream = stream;
			
 
				         CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
			
 
				         cudaDeviceProp dev_prop = {};
			
 
				         CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
			
@@ -147,13 +133,6 @@ struct hdr_synthesizer::impl {
 
				                                               cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
			
 
				         CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
			
 
				                                               cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
			
 
				-        CUDA_API_CHECK(cudaStreamGetFlags(nullptr, &ctx->nStreamFlags));
			
 
				-        return true;
			
 
				-    }
			
 
				-
			
 
				-    static bool set_npp_stream(NppStreamContext *ctx, cudaStream_t stream) {
			
 
				-        if (ctx->hStream == stream) [[likely]] return true;
			
 
				-        ctx->hStream = stream;
			
 
				         CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
			
 
				         return true;
			
 
				     }
			
@@ -196,15 +175,15 @@ struct hdr_synthesizer::impl {
 
				     bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
			
 
				                            NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
			
 
				         // generate gaussian pyramid first
			
 
				-        gaussian_pyramid(ptr, pitch, true, ctx);
			
 
				+        CALL_CHECK(gaussian_pyramid(ptr, pitch, true, ctx));
			
 
				 
			
 
				         // generate laplacian pyramid by up-sampling and subtraction
			
 
				         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
			
 
				+        CALL_CHECK(laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
			
 
				             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				-            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream);
			
 
				+            CALL_CHECK(laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream));
			
 
				         }
			
 
				         return true;
			
 
				     }
			
@@ -221,23 +200,16 @@ struct hdr_synthesizer::impl {
 
				         for (int i = pyr_level - 1; i > 0; --i) {
			
 
				             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
			
 
				-            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream);
			
 
				+            CALL_CHECK(reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream));
			
 
				         }
			
 
				         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
			
 
				+        CALL_CHECK(reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool preprocess_image(image_buffer *buf, uint8_t *raw, cudaStream_t stream) {
			
 
				-        // upload image
			
 
				-        CUDA_API_CHECK(cudaMemcpy2DAsync(raw_u8->ptr, raw_u8->pitch,
			
 
				-                                         raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
			
 
				-                                         height, cudaMemcpyHostToDevice, stream));
			
 
				-
			
 
				+    bool preprocess_image(image_buffer *buf, void *raw_u8, size_t pitch) {
			
 
				         // debayer image
			
 
				-        set_npp_stream(&npp_ctx, stream);
			
 
				-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx(raw_u8->ptr, raw_u8->pitch,
			
 
				-                                                 full_size, full_rect,
			
 
				+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx((Npp8u *) raw_u8, pitch, full_size, full_rect,
			
 
				                                                  rgb_u8->ptr, rgb_u8->pitch,
			
 
				                                                  NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
			
 
				 
			
@@ -248,39 +220,37 @@ struct hdr_synthesizer::impl {
 
				         CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
			
 
				                                              buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				                                              full_size, npp_ctx));
			
 
				-        CUDA_API_CHECK(cudaEventRecord(sync_event, stream));
			
 
				+        CUDA_API_CHECK(cudaEventRecord(sync_event, main_stream));
			
 
				 
			
 
				         // calc weight and construct pyramid
			
 
				         CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
			
 
				         call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
			
 
				                         buf->weight_pyr->ptr, buf->weight_pyr->pitch,
			
 
				                         width, height, extra_stream); // parallel execution for weight related calculation
			
 
				-        set_npp_stream(&extra_npp_ctx, extra_stream);
			
 
				-        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx);
			
 
				+        CALL_CHECK(gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx));
			
 
				         CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
			
 
				 
			
 
				         // construct image pyramid
			
 
				-        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, stream);
			
 
				-        CUDA_API_CHECK(cudaStreamWaitEvent(stream, sync_event));
			
 
				+        CALL_CHECK(laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, main_stream));
			
 
				+        CUDA_API_CHECK(cudaStreamWaitEvent(main_stream, sync_event));
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				-                     uint8_t *out_ptr, size_t out_pitch, cudaStream_t stream) {
			
 
				+                     uint8_t *out_ptr, size_t out_pitch) {
			
 
				         assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
			
 
				         assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
			
 
				 
			
 
				         // merge
			
 
				         call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				                        buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
			
 
				-                       width, pyr_height, stream);
			
 
				+                       width, pyr_height, main_stream);
			
 
				 
			
 
				         // reconstruct image from laplacian pyramid
			
 
				-        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, stream);
			
 
				+        CALL_CHECK(pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, main_stream));
			
 
				 
			
 
				         // convert to uint8
			
 
				-        set_npp_stream(&npp_ctx, stream);
			
 
				         CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				                                                  out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
			
 
				 
			
@@ -289,8 +259,9 @@ struct hdr_synthesizer::impl {
 
				 
			
 
				 };
			
 
				 
			
 
				-hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
			
 
				-        : pimpl(std::make_unique<impl>(width, height, pyramid_level)) {
			
 
				+hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height,
			
 
				+                                 cudaStream_t stream, uint8_t pyramid_level)
			
 
				+        : pimpl(std::make_unique<impl>(width, height, pyramid_level, stream)) {
			
 
				 }
			
 
				 
			
 
				 hdr_synthesizer::~hdr_synthesizer() = default;
			
@@ -308,12 +279,12 @@ bool hdr_synthesizer::free_buffer(void *out_buf) {
 
				     return true;
			
 
				 }
			
 
				 
			
 
				-bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream) {
			
 
				-    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, (cudaStream_t) stream);
			
 
				+bool hdr_synthesizer::preprocess_image(void *img_buf, void *img_ptr, size_t pitch) {
			
 
				+    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, pitch);
			
 
				 }
			
 
				 
			
 
				 bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
			
 
				-                                  uint8_t *img_ptr, size_t img_pitch, void *stream) {
			
 
				+                                  uint8_t *img_ptr, size_t img_pitch) {
			
 
				     return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
			
 
				-                              img_ptr, img_pitch, (cudaStream_t) stream);
			
 
				+                              img_ptr, img_pitch);
			
 
				 }
			
--- a/src/hdr_synthesis.h
+++ b/src/hdr_synthesis.h
@@ -1,6 +1,8 @@
 
				 #ifndef HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				 #define HDRSYNTHESIS_HDR_SYNTHESIS_H
			
 
				 
			
 
				+#include <cuda_runtime.h>
			
 
				+
			
 
				 #include <cstdint>
			
 
				 #include <memory>
			
 
				 
			
@@ -8,7 +10,8 @@
 
				 class hdr_synthesizer {
			
 
				 public:
			
 
				 
			
 
				-    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
			
 
				+    hdr_synthesizer(uint16_t width, uint16_t height,
			
 
				+                    cudaStream_t stream = nullptr, uint8_t pyramid_level = 4);
			
 
				 
			
 
				     ~hdr_synthesizer();
			
 
				 
			
@@ -16,9 +19,9 @@ public:
 
				 
			
 
				     static bool free_buffer(void *out_buf);
			
 
				 
			
 
				-    bool preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream = nullptr);
			
 
				+    bool preprocess_image(void *img_buf, void *img_ptr, size_t pitch);
			
 
				 
			
 
				-    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch, void *stream = nullptr);
			
 
				+    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
			
 
				 
			
 
				 private:
			
 
				     struct impl;
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -3,70 +3,14 @@
 
				 
			
 
				 #include <cuda_profiler_api.h>
			
 
				 
			
 
				-#include <nppi_color_conversion.h>
			
 
				-#include <nppi_filtering_functions.h>
			
 
				-
			
 
				 #include <opencv2/core/cuda.hpp>
			
 
				 #include <opencv2/imgcodecs.hpp>
			
 
				 
			
 
				 #include <boost/iostreams/device/mapped_file.hpp>
			
 
				 
			
 
				-#include <iostream>
			
 
				-#include <chrono>
			
 
				-
			
 
				 static constexpr auto image_width = 2448;
			
 
				 static constexpr auto image_height = 2048;
			
 
				 
			
 
				-cv::Mat download_image(void *ptr, size_t pitch, size_t width, size_t height, int type) {
			
 
				-    auto gpu_mat = cv::cuda::GpuMat{(int) height, (int) width, type, ptr, pitch};
			
 
				-    cv::Mat mat;
			
 
				-    gpu_mat.download(mat);
			
 
				-    return mat;
			
 
				-}
			
 
				-
			
 
				-struct image_buffer {
			
 
				-    void *pyr_image_f32, *pyr_weight_f32;
			
 
				-    size_t image_pitch, weight_pitch;
			
 
				-};
			
 
				-
			
 
				-void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				-                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add);
			
 
				-
			
 
				-//int main() {
			
 
				-//    float filter_cof[] = {.2, .2, .2, .2, .2};
			
 
				-//    void *filter_ptr;
			
 
				-//    cudaMalloc(&filter_ptr, sizeof(filter_cof));
			
 
				-//    cudaMemcpy(filter_ptr, filter_cof, sizeof(filter_cof), cudaMemcpyHostToDevice);
			
 
				-//
			
 
				-//    float dst_data[4 * 8 * 3], src_data[4 * 8 * 3];
			
 
				-//    for (auto &val: dst_data) val = 3;
			
 
				-//    for (auto &val: src_data) val = 0;
			
 
				-//    src_data[1 * 8 * 3 + 1 * 3 + 1] = 1;
			
 
				-////    for (auto &val: src_data) val = 1;
			
 
				-////    for (int i = 0; i < 4; ++i)
			
 
				-////        for (int j = 0; j < 8; ++j)
			
 
				-////            for (int k = 0; k < 3; ++k) {
			
 
				-////
			
 
				-////            }
			
 
				-//    size_t pitch;
			
 
				-//    Npp32f *dst_ptr, *src_ptr;
			
 
				-//    cudaMallocPitch(&dst_ptr, &pitch, 8 * 3 * sizeof(float), 4);
			
 
				-//    cudaMallocPitch(&src_ptr, &pitch, 8 * 3 * sizeof(float), 4);
			
 
				-//    cudaMemcpy2D(dst_ptr, pitch, dst_data, 8 * 3 * sizeof(float), 8 * 3 * sizeof(float), 4, cudaMemcpyHostToDevice);
			
 
				-//    cudaMemcpy2D(src_ptr, pitch, src_data, 8 * 3 * sizeof(float), 8 * 3 * sizeof(float), 4, cudaMemcpyHostToDevice);
			
 
				-//    call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				-//                             (Npp32f *) filter_ptr, {4, 2}, {8, 4}, true);
			
 
				-//    for (auto &val: dst_data) val = 0;
			
 
				-//    cudaMemcpy2D(dst_data, 8 * 3 * sizeof(float), dst_ptr, pitch, 8 * 3 * sizeof(float), 4, cudaMemcpyDeviceToHost);
			
 
				-//    for (int i = 0; i < 4; ++i) {
			
 
				-//        for (int j = 0; j < 8; ++j)
			
 
				-//            for (int k = 1; k < 2; ++k)
			
 
				-//                std::cout << dst_data[i * 8 * 3 + j * 3 + k] << " ";
			
 
				-//        std::cout << std::endl;
			
 
				-//    }
			
 
				-//    return 0;
			
 
				-//}
			
 
				-
			
 
				 int main() {
			
 
				     auto path_a = "/home/tpx/project/HDRSynthesis/data/chess_4ms.raw";
			
 
				     auto path_b = "/home/tpx/project/HDRSynthesis/data/chess_50ms.raw";
			
@@ -75,63 +19,36 @@ int main() {
 
				     auto img_file_a = mapped_file{path_a, boost::iostreams::mapped_file_base::readonly};
			
 
				     auto img_file_b = mapped_file{path_b, boost::iostreams::mapped_file_base::readonly};
			
 
				 
			
 
				-    auto hdr = hdr_synthesizer{image_width, image_height};
			
 
				-    void *buf_a, *buf_b;
			
 
				-    hdr.malloc_buffer(&buf_a);
			
 
				-    hdr.malloc_buffer(&buf_b);
			
 
				-    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				-    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				-
			
 
				-    auto hdr2 = hdr_synthesizer{image_width, image_height};
			
 
				-    void *buf_a2, *buf_b2;
			
 
				-    hdr2.malloc_buffer(&buf_a2);
			
 
				-    hdr2.malloc_buffer(&buf_b2);
			
 
				-    hdr2.preprocess_image(buf_a2, (uint8_t *) img_file_a.const_data());
			
 
				-
			
 
				-    auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				-    auto img_hdr_dev2 = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				-    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				-
			
 
				-    cudaStream_t stream, stream2;
			
 
				+    void *img_ptr_a, *img_ptr_b;
			
 
				+    size_t img_pitch;
			
 
				+    auto src_pitch = image_width * sizeof(uint8_t);
			
 
				+    cudaMallocPitch(&img_ptr_a, &img_pitch, src_pitch, image_height);
			
 
				+    cudaMallocPitch(&img_ptr_b, &img_pitch, src_pitch, image_height);
			
 
				+    cudaMemcpy2D(img_ptr_a, img_pitch, img_file_a.const_data(), src_pitch,
			
 
				+                 src_pitch, image_height, cudaMemcpyHostToDevice);
			
 
				+    cudaMemcpy2D(img_ptr_b, img_pitch, img_file_b.const_data(), src_pitch,
			
 
				+                 src_pitch, image_height, cudaMemcpyHostToDevice);
			
 
				+
			
 
				+    cudaStream_t stream;
			
 
				     cudaStreamCreate(&stream);
			
 
				-    cudaStreamCreate(&stream2);
			
 
				+    auto hdr = hdr_synthesizer{image_width, image_height, stream};
			
 
				+    void *img_buf_a, *img_buf_b;
			
 
				+    hdr.malloc_buffer(&img_buf_a);
			
 
				+    hdr.malloc_buffer(&img_buf_b);
			
 
				 
			
 
				-    for (int i = 0; i < 8; ++i) {
			
 
				-        auto start_ts = std::chrono::system_clock::now();
			
 
				-        cudaDeviceSynchronize();
			
 
				-        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
			
 
				-        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
			
 
				-        cudaDeviceSynchronize();
			
 
				-        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
			
 
				-                std::chrono::system_clock::now() - start_ts).count() << std::endl;
			
 
				-    }
			
 
				-
			
 
				-    cudaDeviceSynchronize();
			
 
				+    hdr.preprocess_image(img_buf_a, img_ptr_a, img_pitch);
			
 
				+    auto img_out_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				     cudaProfilerStart();
			
 
				-    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
			
 
				-//    hdr2.preprocess_image(buf_b2, (uint8_t *) img_file_a.const_data(), stream2);
			
 
				-    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
			
 
				-//    hdr2.merge_image(buf_b2, buf_a2, (uint8_t *) img_hdr_dev2.cudaPtr(), img_hdr_dev.step1(), stream2);
			
 
				-    cudaDeviceSynchronize();
			
 
				+    hdr.preprocess_image(img_buf_b, img_ptr_b, img_pitch);
			
 
				+    hdr.merge_image(img_buf_a, img_buf_b, (uint8_t *) img_out_dev.cudaPtr(), img_out_dev.step1());
			
 
				     cudaProfilerStop();
			
 
				 
			
 
				+    cv::Mat img_out;
			
 
				+    img_out_dev.download(img_out);
			
 
				+    cv::imwrite("result.bmp", img_out);
			
 
				 
			
 
				-    auto real_ptr = (image_buffer *) buf_b;
			
 
				-    auto host_rgb_a = download_image(img_hdr_dev.cudaPtr(), img_hdr_dev.step1(),
			
 
				-                                     image_width, image_height, CV_8UC3);
			
 
				-//    auto host_rgb_a = download_image((char *) real_ptr->pyr_image_f32, real_ptr->image_pitch,
			
 
				-//                                     image_width, image_height, CV_32FC3);
			
 
				-//    void *ptr;
			
 
				-//    size_t pitch;
			
 
				-//    hdr.test_func(&ptr, &pitch);
			
 
				-//    auto host_rgb_a = download_image(ptr, pitch,
			
 
				-//                                     image_width, image_height, CV_32FC3);
			
 
				-
			
 
				-    double min_val, max_val;
			
 
				-    cv::minMaxLoc(host_rgb_a, &min_val, &max_val);
			
 
				-    std::cout << min_val << " " << max_val << " " << cv::mean(host_rgb_a) << std::endl;
			
 
				-
			
 
				-    cv::imwrite("test.bmp", host_rgb_a);
			
 
				+    hdr_synthesizer::free_buffer(img_buf_a);
			
 
				+    hdr_synthesizer::free_buffer(img_buf_b);
			
 
				 
			
 
				     return 0;
			
 
				 }