ソースを参照

Split planer, but works slower.

jcsyshc 2 年 前
コミット
725f3d068c
4 ファイル変更251 行追加204 行削除
  1. 198 143
      src/hdr_synthesis.cpp
  2. 2 4
      src/hdr_synthesis.h
  3. 30 39
      src/hdr_synthesis_kernel.cu
  4. 21 18
      src/main.cpp

+ 198 - 143
src/hdr_synthesis.cpp

@@ -8,14 +8,73 @@
 
 #include <cassert>
 
-void call_hdr_weight(void *in_f32, size_t in_pitch,
-                     void *out_f32, size_t out_pitch,
+void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
+                     Npp32f *out_ptr, size_t out_pitch,
                      size_t width, size_t height);
 
-void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
-                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
-                    void *out_f32, size_t out_pitch,
-                    size_t width, size_t height);
+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
+                    const Npp32f *weight_a, const Npp32f *weight_b,
+                    size_t pitch, size_t width, size_t height);
+
+template<typename T>
+struct smart_buffer {
+    T *ptr = nullptr;
+    size_t pitch = 0;
+
+    size_t width, height, elem_cnt;
+
+    smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
+            : width(_width), height(_height), elem_cnt(_elem_cnt) {
+        malloc_memory();
+    }
+
+    ~smart_buffer() {
+        free_memory();
+    }
+
+    bool malloc_memory() {
+        CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
+        return true;
+    }
+
+    bool free_memory() {
+        CUDA_API_CHECK(cudaFree(ptr));
+        return true;
+    }
+};
+
+template<typename T, size_t Len>
+struct smart_bundle_buffer {
+    std::array<T *, Len> ptrs;
+    size_t pitch = 0;
+
+    size_t width, height, elem_cnt;
+
+    smart_bundle_buffer(size_t _width, size_t _height, size_t _elem_cnt)
+            : width(_width), height(_height), elem_cnt(_elem_cnt) {
+        malloc_memory();
+    }
+
+    ~smart_bundle_buffer() {
+        free_memory();
+    }
+
+    bool malloc_memory() {
+        for (auto &ptr: ptrs) {
+            auto old_pitch = pitch;
+            CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
+            assert(old_pitch == 0 || pitch == old_pitch);
+        }
+        return true;
+    }
+
+    bool free_memory() {
+        for (auto &ptr: ptrs) {
+            CUDA_API_CHECK(cudaFree(ptr));
+        }
+        return true;
+    }
+};
 
 struct hdr_synthesizer::impl {
 
@@ -33,8 +92,8 @@ struct hdr_synthesizer::impl {
     static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
 
     struct image_buffer {
-        void *pyr_image_f32, *pyr_weight_f32;
-        size_t image_pitch, weight_pitch;
+        smart_buffer<Npp32f> *weight_pyr;
+        smart_bundle_buffer<Npp32f, 3> *image_pyr;
     };
 
     uint16_t width, height, pyr_height;
@@ -49,10 +108,9 @@ struct hdr_synthesizer::impl {
     void *gaussian_filter_coff_f32;
 
     // global temporary memory
-    void *raw_u8, *rgb_u8, *rgb_f32;
-    size_t raw_u8_pitch, rgb_u8_pitch, rgb_f32_pitch;
-
-    void *rgb2_f32, *rgb3_f32; // for test
+    smart_buffer<Npp8u> *raw_u8, *rgb_u8;
+    smart_bundle_buffer<Npp8u, 3> *planer_u8;
+    smart_bundle_buffer<Npp32f, 3> *planer_f32[2];
 
     impl(uint16_t _width, uint16_t _height, uint8_t _level)
             : width(_width), height(_height), pyr_level(_level) {
@@ -84,16 +142,22 @@ struct hdr_synthesizer::impl {
     }
 
     bool malloc_global_memory() {
-        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
-                                       width * sizeof(unsigned char), height));
-        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
-                                       width * 3 * sizeof(unsigned char), height));
-        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
-                                       width * 3 * sizeof(float), pyr_height));
-        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
-                                       width * 3 * sizeof(float), pyr_height)); // for test
-        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
-                                       width * 3 * sizeof(float), pyr_height)); // for test
+//        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
+//                                       width * sizeof(unsigned char), height));
+//        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
+//                                       width * 3 * sizeof(unsigned char), height));
+//        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
+//                                       width * 3 * sizeof(float), pyr_height));
+//        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
+//                                       width * 3 * sizeof(float), pyr_height)); // for test
+//        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
+//                                       width * 3 * sizeof(float), pyr_height)); // for test
+
+        raw_u8 = new smart_buffer<Npp8u>(width, height, 1);
+        rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
+        planer_u8 = new smart_bundle_buffer<Npp8u, 3>(width, height, 1);
+        planer_f32[0] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
+        planer_f32[1] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
 
         // upload gaussian kernel coefficient
         CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
@@ -107,184 +171,180 @@ struct hdr_synthesizer::impl {
     }
 
     bool free_global_memory() {
-        CUDA_API_CHECK(cudaFree(raw_u8));
-        CUDA_API_CHECK(cudaFree(rgb_u8));
-        CUDA_API_CHECK(cudaFree(rgb_f32));
-        CUDA_API_CHECK(cudaFree(rgb2_f32));
-        CUDA_API_CHECK(cudaFree(rgb3_f32));
+//        CUDA_API_CHECK(cudaFree(raw_u8));
+//        CUDA_API_CHECK(cudaFree(rgb_u8));
+//        CUDA_API_CHECK(cudaFree(rgb_f32));
+//        CUDA_API_CHECK(cudaFree(rgb2_f32));
+//        CUDA_API_CHECK(cudaFree(rgb3_f32));
         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
         CUDA_API_CHECK(cudaFree(identity_filter_coff_f32));
+
+        delete raw_u8;
+        delete planer_u8;
+        delete planer_f32[0];
+        delete planer_f32[1];
+
         return true;
     }
 
     bool malloc_buffer(image_buffer *buf) const {
-        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_image_f32, &buf->image_pitch,
-                                       width * sizeof(float) * 3, pyr_height));
-        CUDA_API_CHECK(cudaMallocPitch(&buf->pyr_weight_f32, &buf->weight_pitch,
-                                       width * sizeof(float), pyr_height));
+        buf->image_pyr = new smart_bundle_buffer<Npp32f, 3>(width, pyr_height, 1);
+        buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
         return true;
     }
 
-    bool gaussian_pyramid(void *ptr_f32, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
-        CUDA_API_CHECK((is_rgb ?
-                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
-                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
-                               ((Npp32f *) ptr_f32, pitch, full_size, origin_point,
-                                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
-                                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+    bool gaussian_pyramid(Npp32f *ptr, size_t pitch) const { // construct gaussian pyramid
+        auto pyr_ptr = (char *) ptr + pitch * height;
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
+                ptr, pitch, full_size, origin_point,
+                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         for (int i = 0; i < pyr_level - 1; ++i) {
-            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float) * (is_rgb ? 3 : 1);
-            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * (is_rgb ? 3 : 1);
-            CUDA_API_CHECK((is_rgb ?
-                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
-                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)
-                                   ((Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
-                                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
-                                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
+            CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
+                    (Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
+                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
+                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         }
         return true;
     }
 
-    bool laplacian_operation(void *src_f32, NppiSize src_size,
-                             void *dst_f32, NppiSize dst_size,
-                             size_t pitch) const {
+    bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
+                             NppiSize src_size, NppiSize dst_size, int buf_index) const {
         // up-sampling
-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
-                (Npp32f *) src_f32, pitch, src_size, origin_point,
-                (Npp32f *) rgb_f32, rgb_f32_pitch, dst_size,
+        // TODO: check why gaussian blur is not performed
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
+                src_ptr, pitch, src_size, origin_point,
+                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         // gaussian blur
-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb_f32, rgb_f32_pitch, dst_size, origin_point,
-                                                     (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
-                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
-        // add
-//        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb_f32, rgb_f32_pitch,
-//                                        (Npp32f *) dst_f32, pitch,
-//                                        dst_size));
-        CUDA_API_CHECK(nppiSub_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
-                                        (Npp32f *) dst_f32, pitch,
-                                        dst_size));
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
+                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
+                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
+                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
+        // subtraction
+        CUDA_API_CHECK(nppiSub_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
+                                        dst_ptr, pitch, dst_size));
         return true;
     }
 
-    bool laplacian_pyramid(void *ptr_f32, size_t pitch) const { // for rgb image only
+    bool laplacian_pyramid(Npp32f *ptr, size_t pitch, int buf_index) const { // construct laplacian pyramid
         // generate gaussian pyramid first
-        gaussian_pyramid(ptr_f32, pitch, true);
+        gaussian_pyramid(ptr, pitch);
 
         // generate laplacian pyramid by up-sampling and subtraction
-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
-        laplacian_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
+        auto pyr_ptr = (char *) ptr + pitch * height;
+        laplacian_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
         for (int i = 0; i < pyr_level - 1; ++i) {
-            laplacian_operation(pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float) * 3, pyr_size_arr[i + 1],
-                                pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
-                                pitch);
+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
+            laplacian_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
+                                pyr_size_arr[i + 1], pyr_size_arr[i], buf_index);
         }
         return true;
     }
 
-    bool reconstruct_operation(void *src_f32, NppiSize src_size,
-                               void *dst_f32, NppiSize dst_size,
-                               size_t pitch) const {
+    bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
+                               NppiSize src_size, NppiSize dst_size, int buf_index) const {
         // up-sampling
-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
-                (Npp32f *) src_f32, pitch, src_size, origin_point,
-                (Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size,
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
+                src_ptr, pitch, src_size, origin_point,
+                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         // gaussian blur
-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R((Npp32f *) rgb2_f32, rgb_f32_pitch, dst_size, origin_point,
-                                                     (Npp32f *) rgb3_f32, rgb_f32_pitch, dst_size,
-                                                     NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE)); // for test
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
+                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
+                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
+                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
         // add
-//        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb2_f32, rgb_f32_pitch,
-//                                        (Npp32f *) dst_f32, pitch,
-//                                        dst_size));
-        CUDA_API_CHECK(nppiAdd_32f_C3IR((Npp32f *) rgb3_f32, rgb_f32_pitch,
-                                        (Npp32f *) dst_f32, pitch,
-                                        dst_size));
+        CUDA_API_CHECK(nppiAdd_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
+                                        dst_ptr, pitch, dst_size));
         return true;
     }
 
     // reconstruct from laplacian pyramid, for rgb image only
-    bool pyramid_reconstruct(void *ptr_f32, size_t pitch) const {
-        auto pyr_ptr = (char *) ptr_f32 + pitch * height;
+    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, int buf_index) const {
+        auto pyr_ptr = (char *) ptr + pitch * height;
         for (int i = pyr_level - 1; i > 0; --i) {
-            reconstruct_operation(pyr_ptr + pyr_offset_arr[i] * sizeof(float) * 3, pyr_size_arr[i],
-                                  pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float) * 3, pyr_size_arr[i - 1],
-                                  pitch);
+            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
+            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float);
+            reconstruct_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
+                                  pyr_size_arr[i], pyr_size_arr[i - 1], buf_index);
         }
-        reconstruct_operation(pyr_ptr, pyr_size_arr[0], ptr_f32, full_size, pitch);
+        reconstruct_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
         return true;
     }
 
     bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
         // upload image
-        CUDA_API_CHECK(cudaMemcpy2D(raw_u8, raw_u8_pitch,
+        CUDA_API_CHECK(cudaMemcpy2D(raw_u8->ptr, raw_u8->pitch,
                                     raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
                                     height, cudaMemcpyHostToDevice));
 
         // debayer image
-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R((Npp8u *) raw_u8, raw_u8_pitch,
+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R(raw_u8->ptr, raw_u8->pitch,
                                              full_size, full_rect,
-                                             (Npp8u *) rgb_u8, rgb_u8_pitch,
+                                             rgb_u8->ptr, rgb_u8->pitch,
                                              NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
 
+        // split channels
+        CUDA_API_CHECK(nppiCopy_8u_C3P3R(rgb_u8->ptr, rgb_u8->pitch,
+                                         planer_u8->ptrs.data(), planer_u8->pitch,
+                                         full_size));
+
         // convert to float
-        CUDA_API_CHECK(nppiConvert_8u32f_C3R((Npp8u *) rgb_u8, rgb_u8_pitch,
-                                             (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
+        for (auto i = 0; i < 3; ++i) {
+            CUDA_API_CHECK(nppiConvert_8u32f_C1R(planer_u8->ptrs[i], planer_u8->pitch,
+                                                 buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
+                                                 full_size));
+            CUDA_API_CHECK(nppiMulC_32f_C1IR(u8_to_f32_coff,
+                                             buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
                                              full_size));
-        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
-                                         (Npp32f *) buf->pyr_image_f32, buf->image_pitch,
-                                         full_size)); // normalize
+        }
 
         // calc weight
-        call_hdr_weight(buf->pyr_image_f32, buf->image_pitch,
-                        buf->pyr_weight_f32, buf->weight_pitch,
+        call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
+                        buf->weight_pyr->ptr, buf->weight_pyr->pitch,
                         width, height);
 
         // construct image pyramid
-        gaussian_pyramid(buf->pyr_weight_f32, buf->weight_pitch, false);
-        laplacian_pyramid(buf->pyr_image_f32, buf->image_pitch);
+        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch);
+        for (auto i = 0; i < 3; ++i) {
+            laplacian_pyramid(buf->image_pyr->ptrs[i], buf->image_pyr->pitch, i);
+        }
 
         return true;
     }
 
     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
-                     void *out_u8, size_t out_pitch) const {
-        // merge
-        assert(buf_a->image_pitch == buf_b->image_pitch);
-        assert(buf_a->weight_pitch == buf_b->weight_pitch);
-        call_hdr_merge(buf_a->pyr_image_f32, buf_b->pyr_image_f32, buf_a->image_pitch,
-                       buf_a->pyr_weight_f32, buf_b->pyr_weight_f32, buf_a->weight_pitch,
-                       rgb_f32, rgb_f32_pitch,
-                       width, pyr_height);
-
-        // reconstruct image from laplacian pyramid
-        pyramid_reconstruct(rgb_f32, rgb_f32_pitch);
-
-        // convert to uint8 and copy
-        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) rgb_f32, rgb_f32_pitch,
-                                             (Npp8u *) out_u8, out_pitch,
-                                             full_size, NPP_RND_NEAR));
+                     uint8_t *out_ptr, size_t out_pitch) const {
+        auto pitch = buf_a->image_pyr->pitch;
+        assert(pitch == buf_b->image_pyr->pitch);
+        assert(pitch == buf_b->weight_pyr->pitch);
+        assert(pitch == buf_a->weight_pyr->pitch);
+
+        for (auto i = 0; i < 3; ++i) {
+            // merge
+            call_hdr_merge(buf_a->image_pyr->ptrs[i], buf_b->image_pyr->ptrs[i],
+                           buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr,
+                           pitch, width, pyr_height);
+
+            // reconstruct image from laplacian pyramid
+            pyramid_reconstruct(buf_a->image_pyr->ptrs[i], pitch, i);
+
+            // convert to uint8
+            CUDA_API_CHECK(nppiConvert_32f8u_C1R(buf_a->image_pyr->ptrs[i], buf_a->image_pyr->pitch,
+                                                 planer_u8->ptrs[i], planer_u8->pitch, full_size, NPP_RND_NEAR));
+        }
+
+        // packet and copy
+        CUDA_API_CHECK(nppiCopy_8u_P3C3R(planer_u8->ptrs.data(), planer_u8->pitch,
+                                         out_ptr, out_pitch, full_size));
 
         return true;
     }
 
-//    bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
-//                     void *out_u8, size_t out_pitch) const {
-//        // reconstruct image from laplacian pyramid
-//        pyramid_reconstruct(buf_a->pyr_image_f32, buf_a->image_pitch);
-//
-//        // convert to uint8 and copy
-//        float arr[] = {255, 255, 255};
-//        nppiMulC_32f_C3IR(arr, (Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch, full_size);
-//        CUDA_API_CHECK(nppiConvert_32f8u_C3R((Npp32f *) buf_a->pyr_image_f32, buf_a->image_pitch,
-//                                             (Npp8u *) out_u8, out_pitch,
-//                                             full_size, NPP_RND_NEAR));
-//
-//        return true;
-//    }
-
 };
 
 hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
@@ -300,8 +360,8 @@ bool hdr_synthesizer::malloc_buffer(void **out_buf) {
 
 bool hdr_synthesizer::free_buffer(void *out_buf) {
     auto ptr = (impl::image_buffer *) out_buf;
-    CUDA_API_CHECK(cudaFree(ptr->pyr_image_f32));
-    CUDA_API_CHECK(cudaFree(ptr->pyr_weight_f32));
+    delete ptr->image_pyr;
+    delete ptr->weight_pyr;
     delete ptr;
     return true;
 }
@@ -311,12 +371,7 @@ bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
 }
 
 bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
-                                  void *img_u8, size_t img_pitch) {
+                                  uint8_t *img_ptr, size_t img_pitch) {
     return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
-                              img_u8, img_pitch);
-}
-
-void hdr_synthesizer::test_func(void **ptr, size_t *pitch) {
-    *ptr = pimpl->rgb_f32;
-    *pitch = pimpl->rgb_f32_pitch;
+                              img_ptr, img_pitch);
 }

+ 2 - 4
src/hdr_synthesis.h

@@ -7,7 +7,7 @@
 class hdr_synthesizer {
 public:
 
-    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 4);
 
     ~hdr_synthesizer();
 
@@ -17,9 +17,7 @@ public:
 
     bool preprocess_image(void *img_buf, uint8_t *img_ptr);
 
-    bool merge_image(void *buf_a, void *buf_b, void *img_u8, size_t img_pitch);
-
-    void test_func(void **ptr, size_t *pitch);
+    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
 
 private:
     struct impl;

+ 30 - 39
src/hdr_synthesis_kernel.cu

@@ -1,8 +1,11 @@
 #include "hdr_synthesis.h"
 
+#include <nppdefs.h>
+
 static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
 static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
 static constexpr float smooth_coff = 1e-6;
+static constexpr float u8_to_f32_coff = 1.0f / 255;
 static constexpr float f32_to_u8_coff = 255;
 
 static constexpr auto block_size = 16;
@@ -12,19 +15,17 @@ __device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem
     return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
 }
 
-__global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
+__global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
                            float *out_ptr, size_t out_pitch,
                            size_t width, size_t height) {
     auto x = blockIdx.x * blockDim.x + threadIdx.x;
     auto y = blockIdx.y * blockDim.y + threadIdx.y;
     if (x >= height || y >= width) return;
 
-//    auto in_offset = x * in_pitch + y * 3 * sizeof(float);
-//    auto in_data = (float *) ((char *) in_ptr + in_offset);
     auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
-    float r = in_data[0];
-    float g = in_data[1];
-    float b = in_data[2];
+    float r = (float) in_data[0] * u8_to_f32_coff;
+    float g = (float) in_data[1] * u8_to_f32_coff;
+    float b = (float) in_data[2] * u8_to_f32_coff;
 
     float rgb_mean = (r + g + b) / 3.0f;
     float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
@@ -33,57 +34,47 @@ __global__ void hdr_weight(const float *in_ptr, size_t in_pitch,
     float rgb_max = fmaxf(fmaxf(r, g), b);
     float expo_weight = expf(-exposure_coff * (rgb_max - 0.5f) * (rgb_max - 0.5f));
 
-//    auto out_offset = x * out_pitch + y * sizeof(float);
-//    auto out_data = (float *) ((char *) out_ptr + out_offset);
     auto out_data = smart_offset(out_ptr, out_pitch, x, y);
     *out_data = sat_weight * expo_weight + smooth_coff;
 }
 
-__device__ float pixel_merge(const float *img_a, const float *img_b,
-                             const float *wei_a, const float *wei_b,
-                             size_t offset) {
-    auto w_a = *wei_a, w_b = *wei_b;
-    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
-}
+//__device__ float pixel_merge(const float *img_a, const float *img_b,
+//                             const float *wei_a, const float *wei_b,
+//                             size_t offset) {
+//    auto w_a = *wei_a, w_b = *wei_b;
+//    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
+//}
 
-__global__ void hdr_merge(const float *img_a, const float *img_b, size_t img_pitch,
-                          const float *wei_a, const float *wei_b, size_t wei_pitch,
-                          float *out_ptr, size_t out_pitch,
-                          size_t width, size_t height) {
+__global__ void hdr_merge(float *image_a, const float *image_b,
+                          const float *weight_a, const float *weight_b,
+                          size_t pitch, size_t width, size_t height) {
     auto x = blockIdx.x * blockDim.x + threadIdx.x;
     auto y = blockIdx.y * blockDim.y + threadIdx.y;
     if (x >= height || y >= width) return;
 
-    auto cur_img_a = smart_offset(img_a, img_pitch, x, y, 3);
-    auto cur_img_b = smart_offset(img_b, img_pitch, x, y, 3);
-    auto cur_wei_a = smart_offset(wei_a, wei_pitch, x, y);
-    auto cur_wei_b = smart_offset(wei_b, wei_pitch, x, y);
-    auto cur_out = smart_offset(out_ptr, out_pitch, x, y, 3);
-    for (auto i = 0; i < 3; ++i) {
-        cur_out[i] = pixel_merge(cur_img_a, cur_img_b, cur_wei_a, cur_wei_b, i) * f32_to_u8_coff;
-    }
+    auto ppa = smart_offset(image_a, pitch, x, y);
+    auto pb = *smart_offset(image_b, pitch, x, y);
+    auto wa = *smart_offset(weight_a, pitch, x, y);
+    auto wb = *smart_offset(weight_b, pitch, x, y);
+
+    *ppa = (*ppa * wa + pb * wb) / (wa + wb) * f32_to_u8_coff;
 }
 
-void call_hdr_weight(void *in_f32, size_t in_pitch,
-                     void *out_f32, size_t out_pitch,
+void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
+                     Npp32f *out_ptr, size_t out_pitch,
                      size_t width, size_t height) {
     auto block_dims = dim3{block_size, block_size};
     auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
                           (uint32_t) width / block_size + (width % block_size != 0)};
-    hdr_weight<<<grid_dims, block_dims>>>((float *) in_f32, in_pitch,
-                                          (float *) out_f32, out_pitch,
-                                          width, height);
+    hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
 }
 
-void call_hdr_merge(void *img_a_f32, void *img_b_f32, size_t img_pitch,
-                    void *wei_a_f32, void *wei_b_f32, size_t wei_pitch,
-                    void *out_f32, size_t out_pitch,
-                    size_t width, size_t height) {
+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
+                    const Npp32f *weight_a, const Npp32f *weight_b,
+                    size_t pitch, size_t width, size_t height) {
     auto block_dims = dim3{block_size, block_size};
     auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
                           (uint32_t) width / block_size + (width % block_size != 0)};
-    hdr_merge<<<grid_dims, block_dims>>>((float *) img_a_f32, (float *) img_b_f32, img_pitch,
-                                         (float *) wei_a_f32, (float *) wei_b_f32, wei_pitch,
-                                         (float *) out_f32, out_pitch,
-                                         width, height);
+    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, weight_a, weight_b,
+                                         pitch, width, height);
 }

+ 21 - 18
src/main.cpp

@@ -30,8 +30,8 @@ struct image_buffer {
 };
 
 int main() {
-    auto path_a = "/home/tpx/project/HDRSynthesis/data/phantom2_5ms.raw";
-    auto path_b = "/home/tpx/project/HDRSynthesis/data/phantom2_20ms.raw";
+    auto path_a = "/home/tpx/project/HDRSynthesis/data/chess_4ms.raw";
+    auto path_b = "/home/tpx/project/HDRSynthesis/data/chess_50ms.raw";
 
     using boost::iostreams::mapped_file;
     auto img_file_a = mapped_file{path_a, boost::iostreams::mapped_file_base::readonly};
@@ -45,22 +45,25 @@ int main() {
     hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
 
     auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
-    hdr.merge_image(buf_b, buf_a, img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
-
-//    for (int i = 0; i < 8; ++i) {
-//        auto start_ts = std::chrono::system_clock::now();
-//        cudaDeviceSynchronize();
-//        hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
-//        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
-//        cudaDeviceSynchronize();
-//        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
-//                std::chrono::system_clock::now() - start_ts).count() << std::endl;
-//    }
-//
-//    cudaProfilerStart();
-//    hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
-//    cudaProfilerStop();
-//
+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+
+    for (int i = 0; i < 8; ++i) {
+        auto start_ts = std::chrono::system_clock::now();
+        cudaDeviceSynchronize();
+        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
+        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+        cudaDeviceSynchronize();
+        std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
+                std::chrono::system_clock::now() - start_ts).count() << std::endl;
+    }
+
+    cudaDeviceSynchronize();
+    cudaProfilerStart();
+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+    cudaDeviceSynchronize();
+    cudaProfilerStop();
+
 
     auto real_ptr = (image_buffer *) buf_b;
     auto host_rgb_a = download_image(img_hdr_dev.cudaPtr(), img_hdr_dev.step1(),