2 年前 · 725f3d068c
--- a/src/hdr_synthesis.cpp
+++ b/src/hdr_synthesis.cpp
@@ -8,14 +8,73 @@
 
				 
			
 
				 #include <cassert>
			
 
				 
			
 
				-void call_hdr_weight(void *in_f32, size_t in_pitch,
			
 
				-                     void *out_f32, size_t out_pitch,
			
 
				+void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				+                     Npp32f *out_ptr, size_t out_pitch,
			
 
				                      size_t width, size_t height);
			
 
				 
			
 
				-void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
			
 
				-                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
			
 
				-                    void *out_f32, size_t out_pitch,
			
 
				-                    size_t width, size_t height);
			
 
				+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
			
 
				+                    const Npp32f *weight_a, const Npp32f *weight_b,
			
 
				+                    size_t pitch, size_t width, size_t height);
			
 
				+
			
 
				+template<typename T>
			
 
				+struct smart_buffer {
			
 
				+    T *ptr = nullptr;
			
 
				+    size_t pitch = 0;
			
 
				+
			
 
				+    size_t width, height, elem_cnt;
			
 
				+
			
 
				+    smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
			
 
				+            : width(_width), height(_height), elem_cnt(_elem_cnt) {
			
 
				+        malloc_memory();
			
 
				+    }
			
 
				+
			
 
				+    ~smart_buffer() {
			
 
				+        free_memory();
			
 
				+    }
			
 
				+
			
 
				+    bool malloc_memory() {
			
 
				+        CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool free_memory() {
			
 
				+        CUDA_API_CHECK(cudaFree(ptr));
			
 
				+        return true;
			
 
				+    }
			
 
				+};
			
 
				+
			
 
				+template<typename T, size_t Len>
			
 
				+struct smart_bundle_buffer {
			
 
				+    std::array<T *, Len> ptrs;
			
 
				+    size_t pitch = 0;
			
 
				+
			
 
				+    size_t width, height, elem_cnt;
			
 
				+
			
 
				+    smart_bundle_buffer(size_t _width, size_t _height, size_t _elem_cnt)
			
 
				+            : width(_width), height(_height), elem_cnt(_elem_cnt) {
			
 
				+        malloc_memory();
			
 
				+    }
			
 
				+
			
 
				+    ~smart_bundle_buffer() {
			
 
				+        free_memory();
			
 
				+    }
			
 
				+
			
 
				+    bool malloc_memory() {
			
 
				+        for (auto &ptr: ptrs) {
			
 
				+            auto old_pitch = pitch;
			
 
				+            CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
			
 
				+            assert(old_pitch == 0 || pitch == old_pitch);
			
 
				+        }
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    bool free_memory() {
			
 
				+        for (auto &ptr: ptrs) {
			
 
				+            CUDA_API_CHECK(cudaFree(ptr));
			
 
				+        }
			
 
				+        return true;
			
 
				+    }
			
 
				+};
			
 
				 
			
 
				 struct hdr_synthesizer::impl {
			
 
				 
			
@@ -33,8 +92,8 @@ struct hdr_synthesizer::impl {
 
				     static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
			
 
				 
			
 
				     struct image_buffer {
			
 
				-        void *pyr_image_f32, *pyr_weight_f32;
			
 
				-        size_t image_pitch, weight_pitch;
			
 
				+        smart_buffer<Npp32f> *weight_pyr;
			
 
				+        smart_bundle_buffer<Npp32f, 3> *image_pyr;
			
 
				     };
			
 
				 
			
 
				     uint16_t width, height, pyr_height;
			
@@ -49,10 +108,9 @@ struct hdr_synthesizer::impl {
 
				     void *gaussian_filter_coff_f32;
			
 
				 
			
 
				     // global temporary memory
			
 
				-    void *raw_u8, *rgb_u8, *rgb_f32;
			
 
				-    size_t raw_u8_pitch, rgb_u8_pitch, rgb_f32_pitch;
			
 
				-
			
 
				-    void *rgb2_f32, *rgb3_f32; // for test
			
 
				+    smart_buffer<Npp8u> *raw_u8, *rgb_u8;
			
 
				+    smart_bundle_buffer<Npp8u, 3> *planer_u8;
			
 
				+    smart_bundle_buffer<Npp32f, 3> *planer_f32[2];
			
 
				 
			
 
				     impl(uint16_t _width, uint16_t _height, uint8_t _level)
			
 
				             : width(_width), height(_height), pyr_level(_level) {
			
@@ -84,16 +142,22 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     bool malloc_global_memory() {
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
			
 
				-                                       width * sizeof(unsigned char), height));
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
			
 
				-                                       width * 3 * sizeof(unsigned char), height));
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
			
 
				-                                       width * 3 * sizeof(float), pyr_height));
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
			
 
				-                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
			
 
				-                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				+//        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
			
 
				+//                                       width * sizeof(unsigned char), height));
			
 
				+//        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
			
 
				+//                                       width * 3 * sizeof(unsigned char), height));
			
 
				+//        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
			
 
				+//                                       width * 3 * sizeof(float), pyr_height));
			
 
				+//        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
			
 
				+//                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				+//        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
			
 
				+//                                       width * 3 * sizeof(float), pyr_height)); // for test
			
 
				+
			
 
				+        raw_u8 = new smart_buffer<Npp8u>(width, height, 1);
			
 
				+        rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
			
 
				+        planer_u8 = new smart_bundle_buffer<Npp8u, 3>(width, height, 1);
			
 
				+        planer_f32[0] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
			
 
				+        planer_f32[1] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
			
 
				 
			
 
				         // upload gaussian kernel coefficient
			
 
				         CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
			
@@ -107,184 +171,180 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     bool free_global_memory() {
			
 
				-        CUDA_API_CHECK(cudaFree(raw_u8));
			
 
				-        CUDA_API_CHECK(cudaFree(rgb_u8));
			
 
				-        CUDA_API_CHECK(cudaFree(rgb_f32));
			
 
				-        CUDA_API_CHECK(cudaFree(rgb2_f32));
			
 
				-        CUDA_API_CHECK(cudaFree(rgb3_f32));
			
 
				+//        CUDA_API_CHECK(cudaFree(raw_u8));
			
 
				+//        CUDA_API_CHECK(cudaFree(rgb_u8));
			
 
				+//        CUDA_API_CHECK(cudaFree(rgb_f32));
			
 
				+//        CUDA_API_CHECK(cudaFree(rgb2_f32));
			
 
				+//        CUDA_API_CHECK(cudaFree(rgb3_f32));
			
 
				         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
			
 
				         CUDA_API_CHECK(cudaFree(identity_filter_coff_f32));
			
 
				+
			
 
				+        delete raw_u8;
			
 
				+        delete planer_u8;
			
 
				+        delete planer_f32[0];
			
 
				+        delete planer_f32[1];
			
 
				+
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool malloc_buffer(image_buffer *buf) const {
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_image_f32, &buf->image_pitch,
			
 
				-                                       width * sizeof(float) * 3, pyr_height));
			
 
				-        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_weight_f32, &buf->weight_pitch,
			
 
				-                                       width * sizeof(float), pyr_height));
			
 
				+        buf->image_pyr = new smart_bundle_buffer<Npp32f, 3>(width, pyr_height, 1);
			
 
				+        buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool gaussian_pyramid(void *ptr_f32, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
			
 
				-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				-        CUDA_API_CHECK((is_rgb ?
			
 
				-                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				-                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
			
 
				-                               ((Npp32f *) ptr_f32, pitch, full_size, origin_point,
			
 
				-                                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
			
 
				-                                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+    bool gaussian_pyramid(Npp32f *ptr, size_t pitch) const { // construct gaussian pyramid
			
 
				+        auto pyr_ptr = (char *) ptr + pitch * height;
			
 
				+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
			
 
				+                ptr, pitch, full_size, origin_point,
			
 
				+                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
			
 
				+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float) * (is_rgb ? 3 : 1);
			
 
				-            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * (is_rgb ? 3 : 1);
			
 
				-            CUDA_API_CHECK((is_rgb ?
			
 
				-                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				-                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
			
 
				-                                   ((Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
			
 
				-                                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
			
 
				-                                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
			
 
				+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
			
 
				+            CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
			
 
				+                    (Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
			
 
				+                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
			
 
				+                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				         }
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool laplacian_operation(void *src_f32, NppiSize src_size,
			
 
				-                             void *dst_f32, NppiSize dst_size,
			
 
				-                             size_t pitch) const {
			
 
				+    bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				+                             NppiSize src_size, NppiSize dst_size, int buf_index) const {
			
 
				         // up-sampling
			
 
				-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				-                (Npp32f *) src_f32, pitch, src_size, origin_point,
			
 
				-                (Npp32f *) rgb_f32, rgb_f32_pitch, dst_size,
			
 
				+        // TODO: check why gaussian blur is not performed
			
 
				+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
			
 
				+                src_ptr, pitch, src_size, origin_point,
			
 
				+                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
			
 
				                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				         // gaussian blur
			
 
				-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb_f32, rgb_f32_pitch, dst_size, origin_point,
			
 
				-                                                     (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
			
 
				-                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
			
 
				-        // add
			
 
				-//        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb_f32, rgb_f32_pitch,
			
 
				-//                                        (Npp32f *) dst_f32, pitch,
			
 
				-//                                        dst_size));
			
 
				-        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
			
 
				-                                        (Npp32f *) dst_f32, pitch,
			
 
				-                                        dst_size));
			
 
				+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
			
 
				+                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
			
 
				+                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
			
 
				+                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
			
 
				+        // subtraction
			
 
				+        CUDA_API_CHECK(nppiSub_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
			
 
				+                                        dst_ptr, pitch, dst_size));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool laplacian_pyramid(void *ptr_f32, size_t pitch) const { // for rgb image only
			
 
				+    bool laplacian_pyramid(Npp32f *ptr, size_t pitch, int buf_index) const { // construct laplacian pyramid
			
 
				         // generate gaussian pyramid first
			
 
				-        gaussian_pyramid(ptr_f32, pitch, true);
			
 
				+        gaussian_pyramid(ptr, pitch);
			
 
				 
			
 
				         // generate laplacian pyramid by up-sampling and subtraction
			
 
				-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				-        laplacian_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
			
 
				+        auto pyr_ptr = (char *) ptr + pitch * height;
			
 
				+        laplacian_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            laplacian_operation(pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * 3, pyr_size_arr[i + 1],
			
 
				-                                pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
			
 
				-                                pitch);
			
 
				+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
			
 
				+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
			
 
				+            laplacian_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
			
 
				+                                pyr_size_arr[i + 1], pyr_size_arr[i], buf_index);
			
 
				         }
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool reconstruct_operation(void *src_f32, NppiSize src_size,
			
 
				-                               void *dst_f32, NppiSize dst_size,
			
 
				-                               size_t pitch) const {
			
 
				+    bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				+                               NppiSize src_size, NppiSize dst_size, int buf_index) const {
			
 
				         // up-sampling
			
 
				-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				-                (Npp32f *) src_f32, pitch, src_size, origin_point,
			
 
				-                (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
			
 
				+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
			
 
				+                src_ptr, pitch, src_size, origin_point,
			
 
				+                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
			
 
				                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				         // gaussian blur
			
 
				-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size, origin_point,
			
 
				-                                                     (Npp32f *) rgb3_f32, rgb_f32_pitch, dst_size,
			
 
				-                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
			
 
				+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
			
 
				+                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
			
 
				+                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
			
 
				+                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
			
 
				         // add
			
 
				-//        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
			
 
				-//                                        (Npp32f *) dst_f32, pitch,
			
 
				-//                                        dst_size));
			
 
				-        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb3_f32, rgb_f32_pitch,
			
 
				-                                        (Npp32f *) dst_f32, pitch,
			
 
				-                                        dst_size));
			
 
				+        CUDA_API_CHECK(nppiAdd_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
			
 
				+                                        dst_ptr, pitch, dst_size));
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     // reconstruct from laplacian pyramid, for rgb image only
			
 
				-    bool pyramid_reconstruct(void *ptr_f32, size_t pitch) const {
			
 
				-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
			
 
				+    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, int buf_index) const {
			
 
				+        auto pyr_ptr = (char *) ptr + pitch * height;
			
 
				         for (int i = pyr_level - 1; i > 0; --i) {
			
 
				-            reconstruct_operation(pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
			
 
				-                                  pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float) * 3, pyr_size_arr[i - 1],
			
 
				-                                  pitch);
			
 
				+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
			
 
				+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float);
			
 
				+            reconstruct_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
			
 
				+                                  pyr_size_arr[i], pyr_size_arr[i - 1], buf_index);
			
 
				         }
			
 
				-        reconstruct_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
			
 
				+        reconstruct_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
			
 
				         // upload image
			
 
				-        CUDA_API_CHECK(cudaMemcpy2D(raw_u8, raw_u8_pitch,
			
 
				+        CUDA_API_CHECK(cudaMemcpy2D(raw_u8->ptr, raw_u8->pitch,
			
 
				                                     raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
			
 
				                                     height, cudaMemcpyHostToDevice));
			
 
				 
			
 
				         // debayer image
			
 
				-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R((Npp8u *) raw_u8, raw_u8_pitch,
			
 
				+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R(raw_u8->ptr, raw_u8->pitch,
			
 
				                                              full_size, full_rect,
			
 
				-                                             (Npp8u *) rgb_u8, rgb_u8_pitch,
			
 
				+                                             rgb_u8->ptr, rgb_u8->pitch,
			
 
				                                              NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
			
 
				 
			
 
				+        // split channels
			
 
				+        CUDA_API_CHECK(nppiCopy_8u_C3P3R(rgb_u8->ptr, rgb_u8->pitch,
			
 
				+                                         planer_u8->ptrs.data(), planer_u8->pitch,
			
 
				+                                         full_size));
			
 
				+
			
 
				         // convert to float
			
 
				-        CUDA_API_CHECK(nppiConvert_8u32f_C3R((Npp8u *) rgb_u8, rgb_u8_pitch,
			
 
				-                                             (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
			
 
				+        for (auto i = 0; i < 3; ++i) {
			
 
				+            CUDA_API_CHECK(nppiConvert_8u32f_C1R(planer_u8->ptrs[i], planer_u8->pitch,
			
 
				+                                                 buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
			
 
				+                                                 full_size));
			
 
				+            CUDA_API_CHECK(nppiMulC_32f_C1IR(u8_to_f32_coff,
			
 
				+                                             buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
			
 
				                                              full_size));
			
 
				-        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
			
 
				-                                         (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
			
 
				-                                         full_size)); // normalize
			
 
				+        }
			
 
				 
			
 
				         // calc weight
			
 
				-        call_hdr_weight(buf->pyr_image_f32, buf->image_pitch,
			
 
				-                        buf->pyr_weight_f32, buf->weight_pitch,
			
 
				+        call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
			
 
				+                        buf->weight_pyr->ptr, buf->weight_pyr->pitch,
			
 
				                         width, height);
			
 
				 
			
 
				         // construct image pyramid
			
 
				-        gaussian_pyramid(buf->pyr_weight_f32, buf->weight_pitch, false);
			
 
				-        laplacian_pyramid(buf->pyr_image_f32, buf->image_pitch);
			
 
				+        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch);
			
 
				+        for (auto i = 0; i < 3; ++i) {
			
 
				+            laplacian_pyramid(buf->image_pyr->ptrs[i], buf->image_pyr->pitch, i);
			
 
				+        }
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				-                     void *out_u8, size_t out_pitch) const {
			
 
				-        // merge
			
 
				-        assert(buf_a->image_pitch == buf_b->image_pitch);
			
 
				-        assert(buf_a->weight_pitch == buf_b->weight_pitch);
			
 
				-        call_hdr_merge(buf_a->pyr_image_f32, buf_b->pyr_image_f32, buf_a->image_pitch,
			
 
				-                       buf_a->pyr_weight_f32, buf_b->pyr_weight_f32, buf_a->weight_pitch,
			
 
				-                       rgb_f32, rgb_f32_pitch,
			
 
				-                       width, pyr_height);
			
 
				-
			
 
				-        // reconstruct image from laplacian pyramid
			
 
				-        pyramid_reconstruct(rgb_f32, rgb_f32_pitch);
			
 
				-
			
 
				-        // convert to uint8 and copy
			
 
				-        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) rgb_f32, rgb_f32_pitch,
			
 
				-                                             (Npp8u *) out_u8, out_pitch,
			
 
				-                                             full_size, NPP_RND_NEAR));
			
 
				+                     uint8_t *out_ptr, size_t out_pitch) const {
			
 
				+        auto pitch = buf_a->image_pyr->pitch;
			
 
				+        assert(pitch == buf_b->image_pyr->pitch);
			
 
				+        assert(pitch == buf_b->weight_pyr->pitch);
			
 
				+        assert(pitch == buf_a->weight_pyr->pitch);
			
 
				+
			
 
				+        for (auto i = 0; i < 3; ++i) {
			
 
				+            // merge
			
 
				+            call_hdr_merge(buf_a->image_pyr->ptrs[i], buf_b->image_pyr->ptrs[i],
			
 
				+                           buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr,
			
 
				+                           pitch, width, pyr_height);
			
 
				+
			
 
				+            // reconstruct image from laplacian pyramid
			
 
				+            pyramid_reconstruct(buf_a->image_pyr->ptrs[i], pitch, i);
			
 
				+
			
 
				+            // convert to uint8
			
 
				+            CUDA_API_CHECK(nppiConvert_32f8u_C1R(buf_a->image_pyr->ptrs[i], buf_a->image_pyr->pitch,
			
 
				+                                                 planer_u8->ptrs[i], planer_u8->pitch, full_size, NPP_RND_NEAR));
			
 
				+        }
			
 
				+
			
 
				+        // packet and copy
			
 
				+        CUDA_API_CHECK(nppiCopy_8u_P3C3R(planer_u8->ptrs.data(), planer_u8->pitch,
			
 
				+                                         out_ptr, out_pitch, full_size));
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-//    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				-//                     void *out_u8, size_t out_pitch) const {
			
 
				-//        // reconstruct image from laplacian pyramid
			
 
				-//        pyramid_reconstruct(buf_a->pyr_image_f32, buf_a->image_pitch);
			
 
				-//
			
 
				-//        // convert to uint8 and copy
			
 
				-//        float arr[] = {255, 255, 255};
			
 
				-//        nppiMulC_32f_C3IR(arr, (Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch, full_size);
			
 
				-//        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch,
			
 
				-//                                             (Npp8u *) out_u8, out_pitch,
			
 
				-//                                             full_size, NPP_RND_NEAR));
			
 
				-//
			
 
				-//        return true;
			
 
				-//    }
			
 
				-
			
 
				 };
			
 
				 
			
 
				 hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
			
@@ -300,8 +360,8 @@ bool hdr_synthesizer::malloc_buffer(void **out_buf) {
 
				 
			
 
				 bool hdr_synthesizer::free_buffer(void *out_buf) {
			
 
				     auto ptr = (impl::image_buffer *) out_buf;
			
 
				-    CUDA_API_CHECK(cudaFree(ptr->pyr_image_f32));
			
 
				-    CUDA_API_CHECK(cudaFree(ptr->pyr_weight_f32));
			
 
				+    delete ptr->image_pyr;
			
 
				+    delete ptr->weight_pyr;
			
 
				     delete ptr;
			
 
				     return true;
			
 
				 }
			
@@ -311,12 +371,7 @@ bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
 
				 }
			
 
				 
			
 
				 bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
			
 
				-                                  void *img_u8, size_t img_pitch) {
			
 
				+                                  uint8_t *img_ptr, size_t img_pitch) {
			
 
				     return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
			
 
				-                              img_u8, img_pitch);
			
 
				-}
			
 
				-
			
 
				-void hdr_synthesizer::test_func(void **ptr, size_t *pitch) {
			
 
				-    *ptr = pimpl->rgb_f32;
			
 
				-    *pitch = pimpl->rgb_f32_pitch;
			
 
				+                              img_ptr, img_pitch);
			
 
				 }
			
--- a/src/hdr_synthesis.h
+++ b/src/hdr_synthesis.h
@@ -7,7 +7,7 @@
 
				 class hdr_synthesizer {
			
 
				 public:
			
 
				 
			
 
				-    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
			
 
				+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 4);
			
 
				 
			
 
				     ~hdr_synthesizer();
			
 
				 
			
@@ -17,9 +17,7 @@ public:
 
				 
			
 
				     bool preprocess_image(void *img_buf, uint8_t *img_ptr);
			
 
				 
			
 
				-    bool merge_image(void *buf_a, void *buf_b, void *img_u8, size_t img_pitch);
			
 
				-
			
 
				-    void test_func(void **ptr, size_t *pitch);
			
 
				+    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
			
 
				 
			
 
				 private:
			
 
				     struct impl;
			
--- a/src/hdr_synthesis_kernel.cu
+++ b/src/hdr_synthesis_kernel.cu
@@ -1,8 +1,11 @@
 
				 #include "hdr_synthesis.h"
			
 
				 
			
 
				+#include <nppdefs.h>
			
 
				+
			
 
				 static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
			
 
				 static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
			
 
				 static constexpr float smooth_coff = 1e-6;
			
 
				+static constexpr float u8_to_f32_coff = 1.0f / 255;
			
 
				 static constexpr float f32_to_u8_coff = 255;
			
 
				 
			
 
				 static constexpr auto block_size = 16;
			
@@ -12,19 +15,17 @@ __device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem
 
				     return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
			
 
				 }
			
 
				 
			
 
				-__global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
			
 
				+__global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
			
 
				                            float *out_ptr, size_t out_pitch,
			
 
				                            size_t width, size_t height) {
			
 
				     auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				     auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				     if (x >= height || y >= width) return;
			
 
				 
			
 
				-//    auto in_offset = x * in_pitch + y * 3 * sizeof(float);
			
 
				-//    auto in_data = (float *) ((char *) in_ptr + in_offset);
			
 
				     auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
			
 
				-    float r = in_data[0];
			
 
				-    float g = in_data[1];
			
 
				-    float b = in_data[2];
			
 
				+    float r = (float) in_data[0] * u8_to_f32_coff;
			
 
				+    float g = (float) in_data[1] * u8_to_f32_coff;
			
 
				+    float b = (float) in_data[2] * u8_to_f32_coff;
			
 
				 
			
 
				     float rgb_mean = (r + g + b) / 3.0f;
			
 
				     float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
			
@@ -33,57 +34,47 @@ __global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
 
				     float rgb_max = fmaxf(fmaxf(r, g), b);
			
 
				     float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
			
 
				 
			
 
				-//    auto out_offset = x * out_pitch + y * sizeof(float);
			
 
				-//    auto out_data = (float *) ((char *) out_ptr + out_offset);
			
 
				     auto out_data = smart_offset(out_ptr, out_pitch, x, y);
			
 
				     *out_data = sat_weight * expo_weight + smooth_coff;
			
 
				 }
			
 
				 
			
 
				-__device__ float pixel_merge(const float *img_a, const float *img_b,
			
 
				-                             const float *wei_a, const float *wei_b,
			
 
				-                             size_t offset) {
			
 
				-    auto w_a = *wei_a, w_b = *wei_b;
			
 
				-    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
			
 
				-}
			
 
				+//__device__ float pixel_merge(const float *img_a, const float *img_b,
			
 
				+//                             const float *wei_a, const float *wei_b,
			
 
				+//                             size_t offset) {
			
 
				+//    auto w_a = *wei_a, w_b = *wei_b;
			
 
				+//    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
			
 
				+//}
			
 
				 
			
 
				-__global__ void hdr_merge(const float *img_a, const float *img_b, size_t img_pitch,
			
 
				-                          const float *wei_a, const float *wei_b, size_t wei_pitch,
			
 
				-                          float *out_ptr, size_t out_pitch,
			
 
				-                          size_t width, size_t height) {
			
 
				+__global__ void hdr_merge(float *image_a, const float *image_b,
			
 
				+                          const float *weight_a, const float *weight_b,
			
 
				+                          size_t pitch, size_t width, size_t height) {
			
 
				     auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				     auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				     if (x >= height || y >= width) return;
			
 
				 
			
 
				-    auto cur_img_a = smart_offset(img_a, img_pitch, x, y, 3);
			
 
				-    auto cur_img_b = smart_offset(img_b, img_pitch, x, y, 3);
			
 
				-    auto cur_wei_a = smart_offset(wei_a, wei_pitch, x, y);
			
 
				-    auto cur_wei_b = smart_offset(wei_b, wei_pitch, x, y);
			
 
				-    auto cur_out = smart_offset(out_ptr, out_pitch, x, y, 3);
			
 
				-    for (auto i = 0; i < 3; ++i) {
			
 
				-        cur_out[i] = pixel_merge(cur_img_a, cur_img_b, cur_wei_a, cur_wei_b, i) * f32_to_u8_coff;
			
 
				-    }
			
 
				+    auto ppa = smart_offset(image_a, pitch, x, y);
			
 
				+    auto pb = *smart_offset(image_b, pitch, x, y);
			
 
				+    auto wa = *smart_offset(weight_a, pitch, x, y);
			
 
				+    auto wb = *smart_offset(weight_b, pitch, x, y);
			
 
				+
			
 
				+    *ppa = (*ppa * wa + pb * wb) / (wa + wb) * f32_to_u8_coff;
			
 
				 }
			
 
				 
			
 
				-void call_hdr_weight(void *in_f32, size_t in_pitch,
			
 
				-                     void *out_f32, size_t out_pitch,
			
 
				+void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				+                     Npp32f *out_ptr, size_t out_pitch,
			
 
				                      size_t width, size_t height) {
			
 
				     auto block_dims = dim3{block_size, block_size};
			
 
				     auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				                           (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				-    hdr_weight<<<grid_dims, block_dims>>>((float *) in_f32, in_pitch,
			
 
				-                                          (float *) out_f32, out_pitch,
			
 
				-                                          width, height);
			
 
				+    hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
			
 
				 }
			
 
				 
			
 
				-void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
			
 
				-                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
			
 
				-                    void *out_f32, size_t out_pitch,
			
 
				-                    size_t width, size_t height) {
			
 
				+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
			
 
				+                    const Npp32f *weight_a, const Npp32f *weight_b,
			
 
				+                    size_t pitch, size_t width, size_t height) {
			
 
				     auto block_dims = dim3{block_size, block_size};
			
 
				     auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				                           (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				-    hdr_merge<<<grid_dims, block_dims>>>((float *) img_a_f32, (float *) img_b_f32, img_pitch,
			
 
				-                                         (float *) wei_a_f32, (float *) wei_b_f32, wei_pitch,
			
 
				-                                         (float *) out_f32, out_pitch,
			
 
				-                                         width, height);
			
 
				+    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, weight_a, weight_b,
			
 
				+                                         pitch, width, height);
			
 
				 }
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -30,8 +30,8 @@ struct image_buffer {
 
				 };
			
 
				 
			
 
				 int main() {
			
 
				-    auto path_a = "/home/tpx/project/HDRSynthesis/data/phantom2_5ms.raw";
			
 
				-    auto path_b = "/home/tpx/project/HDRSynthesis/data/phantom2_20ms.raw";
			
 
				+    auto path_a = "/home/tpx/project/HDRSynthesis/data/chess_4ms.raw";
			
 
				+    auto path_b = "/home/tpx/project/HDRSynthesis/data/chess_50ms.raw";
			
 
				 
			
 
				     using boost::iostreams::mapped_file;
			
 
				     auto img_file_a = mapped_file{path_a, boost::iostreams::mapped_file_base::readonly};
			
@@ -45,22 +45,25 @@ int main() {
 
				     hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				 
			
 
				     auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				-    hdr.merge_image(buf_b, buf_a, img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				-
			
 
				-//    for (int i = 0; i < 8; ++i) {
			
 
				-//        auto start_ts = std::chrono::system_clock::now();
			
 
				-//        cudaDeviceSynchronize();
			
 
				-//        hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				-//        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				-//        cudaDeviceSynchronize();
			
 
				-//        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
			
 
				-//                std::chrono::system_clock::now() - start_ts).count() << std::endl;
			
 
				-//    }
			
 
				-//
			
 
				-//    cudaProfilerStart();
			
 
				-//    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				-//    cudaProfilerStop();
			
 
				-//
			
 
				+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+
			
 
				+    for (int i = 0; i < 8; ++i) {
			
 
				+        auto start_ts = std::chrono::system_clock::now();
			
 
				+        cudaDeviceSynchronize();
			
 
				+        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				+        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+        cudaDeviceSynchronize();
			
 
				+        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
			
 
				+                std::chrono::system_clock::now() - start_ts).count() << std::endl;
			
 
				+    }
			
 
				+
			
 
				+    cudaDeviceSynchronize();
			
 
				+    cudaProfilerStart();
			
 
				+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+    cudaDeviceSynchronize();
			
 
				+    cudaProfilerStop();
			
 
				+
			
 
				 
			
 
				     auto real_ptr = (image_buffer *) buf_b;
			
 
				     auto host_rgb_a = download_image(img_hdr_dev.cudaPtr(), img_hdr_dev.step1(),