Prechádzať zdrojové kódy

Combined channels again.

jcsyshc 2 rokov pred
rodič
commit
ef933db7aa
2 zmenil súbory, kde vykonal 92 pridanie a 160 odobranie
  1. 76 147
      src/hdr_synthesis.cpp
  2. 16 13
      src/hdr_synthesis_kernel.cu

+ 76 - 147
src/hdr_synthesis.cpp

@@ -12,9 +12,14 @@ void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
                      Npp32f *out_ptr, size_t out_pitch,
                      size_t width, size_t height);
 
-void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
-                    const Npp32f *weight_a, const Npp32f *weight_b,
-                    size_t pitch, size_t width, size_t height);
+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
+                    const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
+                    size_t width, size_t height);
+
+template<typename T>
+T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
+    return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
+}
 
 template<typename T>
 struct smart_buffer {
@@ -43,57 +48,21 @@ struct smart_buffer {
     }
 };
 
-template<typename T, size_t Len>
-struct smart_bundle_buffer {
-    std::array<T *, Len> ptrs;
-    size_t pitch = 0;
-
-    size_t width, height, elem_cnt;
-
-    smart_bundle_buffer(size_t _width, size_t _height, size_t _elem_cnt)
-            : width(_width), height(_height), elem_cnt(_elem_cnt) {
-        malloc_memory();
-    }
-
-    ~smart_bundle_buffer() {
-        free_memory();
-    }
-
-    bool malloc_memory() {
-        for (auto &ptr: ptrs) {
-            auto old_pitch = pitch;
-            CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
-            assert(old_pitch == 0 || pitch == old_pitch);
-        }
-        return true;
-    }
-
-    bool free_memory() {
-        for (auto &ptr: ptrs) {
-            CUDA_API_CHECK(cudaFree(ptr));
-        }
-        return true;
-    }
-};
-
 struct hdr_synthesizer::impl {
 
     static constexpr auto u8_to_f32_coff = 1.0f / 255;
     static constexpr float u8_to_f32_coff_arr[] = {u8_to_f32_coff,
                                                    u8_to_f32_coff,
                                                    u8_to_f32_coff};
-    static constexpr float identity_filter_coff[] = {1};
     static constexpr float gaussian_filter_coff[] = {1 / 16.0f,
                                                      4 / 16.0f,
                                                      6 / 16.0f,
                                                      4 / 16.0f,
                                                      1 / 16.0f};
-    static constexpr auto identity_filter_len = 1;
     static constexpr auto gaussian_filter_len = sizeof(gaussian_filter_coff) / sizeof(float);
 
     struct image_buffer {
-        smart_buffer<Npp32f> *weight_pyr;
-        smart_bundle_buffer<Npp32f, 3> *image_pyr;
+        smart_buffer<Npp32f> *image_pyr, *weight_pyr;
     };
 
     uint16_t width, height, pyr_height;
@@ -104,13 +73,11 @@ struct hdr_synthesizer::impl {
     NppiPoint origin_point;
     size_t *pyr_offset_arr;
     NppiSize *pyr_size_arr;
-    void *identity_filter_coff_f32;
     void *gaussian_filter_coff_f32;
 
     // global temporary memory
     smart_buffer<Npp8u> *raw_u8, *rgb_u8;
-    smart_bundle_buffer<Npp8u, 3> *planer_u8;
-    smart_bundle_buffer<Npp32f, 3> *planer_f32[2];
+    smart_buffer<Npp32f> *rgb_f32[2];
 
     impl(uint16_t _width, uint16_t _height, uint8_t _level)
             : width(_width), height(_height), pyr_level(_level) {
@@ -142,137 +109,116 @@ struct hdr_synthesizer::impl {
     }
 
     bool malloc_global_memory() {
-//        CUDA_API_CHECK(cudaMallocPitch(&raw_u8, &raw_u8_pitch,
-//                                       width * sizeof(unsigned char), height));
-//        CUDA_API_CHECK(cudaMallocPitch(&rgb_u8, &rgb_u8_pitch,
-//                                       width * 3 * sizeof(unsigned char), height));
-//        CUDA_API_CHECK(cudaMallocPitch(&rgb_f32, &rgb_f32_pitch,
-//                                       width * 3 * sizeof(float), pyr_height));
-//        CUDA_API_CHECK(cudaMallocPitch(&rgb2_f32, &rgb_f32_pitch,
-//                                       width * 3 * sizeof(float), pyr_height)); // for test
-//        CUDA_API_CHECK(cudaMallocPitch(&rgb3_f32, &rgb_f32_pitch,
-//                                       width * 3 * sizeof(float), pyr_height)); // for test
-
         raw_u8 = new smart_buffer<Npp8u>(width, height, 1);
         rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
-        planer_u8 = new smart_bundle_buffer<Npp8u, 3>(width, height, 1);
-        planer_f32[0] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
-        planer_f32[1] = new smart_bundle_buffer<Npp32f, 3>(width, height, 1);
+        rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
+        rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
 
         // upload gaussian kernel coefficient
         CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
         CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
                                   sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
-        CUDA_API_CHECK(cudaMalloc(&identity_filter_coff_f32, sizeof(identity_filter_coff)));
-        CUDA_API_CHECK(cudaMemcpy(identity_filter_coff_f32, identity_filter_coff,
-                                  sizeof(identity_filter_coff), cudaMemcpyHostToDevice));
 
         return true;
     }
 
     bool free_global_memory() {
-//        CUDA_API_CHECK(cudaFree(raw_u8));
-//        CUDA_API_CHECK(cudaFree(rgb_u8));
-//        CUDA_API_CHECK(cudaFree(rgb_f32));
-//        CUDA_API_CHECK(cudaFree(rgb2_f32));
-//        CUDA_API_CHECK(cudaFree(rgb3_f32));
         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
-        CUDA_API_CHECK(cudaFree(identity_filter_coff_f32));
 
         delete raw_u8;
-        delete planer_u8;
-        delete planer_f32[0];
-        delete planer_f32[1];
+        delete rgb_u8;
+        delete rgb_f32[0];
+        delete rgb_f32[1];
 
         return true;
     }
 
     bool malloc_buffer(image_buffer *buf) const {
-        buf->image_pyr = new smart_bundle_buffer<Npp32f, 3>(width, pyr_height, 1);
+        buf->image_pyr = new smart_buffer<Npp32f>(width, pyr_height, 3);
         buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
         return true;
     }
 
-    bool gaussian_pyramid(Npp32f *ptr, size_t pitch) const { // construct gaussian pyramid
-        auto pyr_ptr = (char *) ptr + pitch * height;
-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
+    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
+        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, is_rgb ? 3 : 1);
+        CUDA_API_CHECK((is_rgb ?
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
                 ptr, pitch, full_size, origin_point,
-                (Npp32f *) pyr_ptr, pitch, pyr_size_arr[0],
+                pyr_ptr, pitch, pyr_size_arr[0],
                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         for (int i = 0; i < pyr_level - 1; ++i) {
-            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
-            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
-            CUDA_API_CHECK(nppiFilterGaussPyramidLayerDownBorder_32f_C1R(
-                    (Npp32f *) src_f32, pitch, pyr_size_arr[i], origin_point,
-                    (Npp32f *) dst_f32, pitch, pyr_size_arr[i + 1],
+            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], is_rgb ? 3 : 1);
+            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i + 1], is_rgb ? 3 : 1);
+            CUDA_API_CHECK((is_rgb ?
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
+                    src_ptr, pitch, pyr_size_arr[i], origin_point,
+                    dst_ptr, pitch, pyr_size_arr[i + 1],
                     2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         }
         return true;
     }
 
     bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
-                             NppiSize src_size, NppiSize dst_size, int buf_index) const {
+                             NppiSize src_size, NppiSize dst_size) const {
         // up-sampling
         // TODO: check why gaussian blur is not performed
-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
                 src_ptr, pitch, src_size, origin_point,
-                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
+                rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size,
                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         // gaussian blur
-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
-                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
-                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R
+                               (rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size, origin_point,
+                                rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_size,
                                 NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
         // subtraction
-        CUDA_API_CHECK(nppiSub_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
-                                        dst_ptr, pitch, dst_size));
+        CUDA_API_CHECK(nppiSub_32f_C3IR(rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_ptr, pitch, dst_size));
         return true;
     }
 
-    bool laplacian_pyramid(Npp32f *ptr, size_t pitch, int buf_index) const { // construct laplacian pyramid
+    bool laplacian_pyramid(Npp32f *ptr, size_t pitch) const { // construct laplacian pyramid
         // generate gaussian pyramid first
-        gaussian_pyramid(ptr, pitch);
+        gaussian_pyramid(ptr, pitch, true);
 
         // generate laplacian pyramid by up-sampling and subtraction
-        auto pyr_ptr = (char *) ptr + pitch * height;
-        laplacian_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
+        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, 3);
+        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
         for (int i = 0; i < pyr_level - 1; ++i) {
-            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i + 1] * sizeof(float);
-            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
-            laplacian_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
-                                pyr_size_arr[i + 1], pyr_size_arr[i], buf_index);
+            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i + 1], 3);
+            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], 3);
+            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i]);
         }
         return true;
     }
 
     bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
-                               NppiSize src_size, NppiSize dst_size, int buf_index) const {
+                               NppiSize src_size, NppiSize dst_size) const {
         // up-sampling
-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C1R(
+        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
                 src_ptr, pitch, src_size, origin_point,
-                planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size,
+                rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size,
                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
         // gaussian blur
-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C1R
-                               (planer_f32[0]->ptrs[buf_index], planer_f32[0]->pitch, dst_size, origin_point,
-                                planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch, dst_size,
+        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R
+                               (rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size, origin_point,
+                                rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_size,
                                 NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
         // add
-        CUDA_API_CHECK(nppiAdd_32f_C1IR(planer_f32[1]->ptrs[buf_index], planer_f32[1]->pitch,
-                                        dst_ptr, pitch, dst_size));
+        CUDA_API_CHECK(nppiAdd_32f_C3IR(rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_ptr, pitch, dst_size));
         return true;
     }
 
     // reconstruct from laplacian pyramid, for rgb image only
-    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, int buf_index) const {
-        auto pyr_ptr = (char *) ptr + pitch * height;
+    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch) const {
         for (int i = pyr_level - 1; i > 0; --i) {
-            auto src_f32 = (char *) pyr_ptr + pyr_offset_arr[i] * sizeof(float);
-            auto dst_f32 = (char *) pyr_ptr + pyr_offset_arr[i - 1] * sizeof(float);
-            reconstruct_operation((Npp32f *) src_f32, (Npp32f *) dst_f32, pitch,
-                                  pyr_size_arr[i], pyr_size_arr[i - 1], buf_index);
+            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], 3);
+            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i - 1], 3);
+            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1]);
         }
-        reconstruct_operation((Npp32f *) pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, buf_index);
+        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, 3);
+        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
         return true;
     }
 
@@ -288,20 +234,13 @@ struct hdr_synthesizer::impl {
                                              rgb_u8->ptr, rgb_u8->pitch,
                                              NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
 
-        // split channels
-        CUDA_API_CHECK(nppiCopy_8u_C3P3R(rgb_u8->ptr, rgb_u8->pitch,
-                                         planer_u8->ptrs.data(), planer_u8->pitch,
-                                         full_size));
-
         // convert to float
-        for (auto i = 0; i < 3; ++i) {
-            CUDA_API_CHECK(nppiConvert_8u32f_C1R(planer_u8->ptrs[i], planer_u8->pitch,
-                                                 buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
-                                                 full_size));
-            CUDA_API_CHECK(nppiMulC_32f_C1IR(u8_to_f32_coff,
-                                             buf->image_pyr->ptrs[i], buf->image_pyr->pitch,
+        CUDA_API_CHECK(nppiConvert_8u32f_C3R(rgb_u8->ptr, rgb_u8->pitch,
+                                             buf->image_pyr->ptr, buf->image_pyr->pitch,
                                              full_size));
-        }
+        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
+                                         buf->image_pyr->ptr, buf->image_pyr->pitch,
+                                         full_size));
 
         // calc weight
         call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
@@ -309,38 +248,28 @@ struct hdr_synthesizer::impl {
                         width, height);
 
         // construct image pyramid
-        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch);
-        for (auto i = 0; i < 3; ++i) {
-            laplacian_pyramid(buf->image_pyr->ptrs[i], buf->image_pyr->pitch, i);
-        }
+        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false);
+        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch);
 
         return true;
     }
 
     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
                      uint8_t *out_ptr, size_t out_pitch) const {
-        auto pitch = buf_a->image_pyr->pitch;
-        assert(pitch == buf_b->image_pyr->pitch);
-        assert(pitch == buf_b->weight_pyr->pitch);
-        assert(pitch == buf_a->weight_pyr->pitch);
-
-        for (auto i = 0; i < 3; ++i) {
-            // merge
-            call_hdr_merge(buf_a->image_pyr->ptrs[i], buf_b->image_pyr->ptrs[i],
-                           buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr,
-                           pitch, width, pyr_height);
-
-            // reconstruct image from laplacian pyramid
-            pyramid_reconstruct(buf_a->image_pyr->ptrs[i], pitch, i);
-
-            // convert to uint8
-            CUDA_API_CHECK(nppiConvert_32f8u_C1R(buf_a->image_pyr->ptrs[i], buf_a->image_pyr->pitch,
-                                                 planer_u8->ptrs[i], planer_u8->pitch, full_size, NPP_RND_NEAR));
-        }
+        assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
+        assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
+
+        // merge
+        call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
+                       buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
+                       width, pyr_height);
+
+        // reconstruct image from laplacian pyramid
+        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch);
 
-        // packet and copy
-        CUDA_API_CHECK(nppiCopy_8u_P3C3R(planer_u8->ptrs.data(), planer_u8->pitch,
-                                         out_ptr, out_pitch, full_size));
+        // convert to uint8
+        CUDA_API_CHECK(nppiConvert_32f8u_C3R(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
+                                             out_ptr, out_pitch, full_size, NPP_RND_NEAR));
 
         return true;
     }

+ 16 - 13
src/hdr_synthesis_kernel.cu

@@ -45,19 +45,22 @@ __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
 //    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
 //}
 
-__global__ void hdr_merge(float *image_a, const float *image_b,
-                          const float *weight_a, const float *weight_b,
-                          size_t pitch, size_t width, size_t height) {
+__global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
+                          const float *weight_a, const float *weight_b, size_t weight_pitch,
+                          size_t width, size_t height) {
     auto x = blockIdx.x * blockDim.x + threadIdx.x;
     auto y = blockIdx.y * blockDim.y + threadIdx.y;
     if (x >= height || y >= width) return;
 
-    auto ppa = smart_offset(image_a, pitch, x, y);
-    auto pb = *smart_offset(image_b, pitch, x, y);
-    auto wa = *smart_offset(weight_a, pitch, x, y);
-    auto wb = *smart_offset(weight_b, pitch, x, y);
+    auto ppa = smart_offset(image_a, image_pitch, x, y, 3);
+    auto ppb = smart_offset(image_b, image_pitch, x, y, 3);
+    auto wa = *smart_offset(weight_a, weight_pitch, x, y);
+    auto wb = *smart_offset(weight_b, weight_pitch, x, y);
 
-    *ppa = (*ppa * wa + pb * wb) / (wa + wb) * f32_to_u8_coff;
+#pragma unroll
+    for (auto i = 0; i < 3; ++i) {
+        ppa[i] = (ppa[i] * wa + ppb[i] * wb) / (wa + wb) * f32_to_u8_coff;
+    }
 }
 
 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
@@ -69,12 +72,12 @@ void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
     hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
 }
 
-void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b,
-                    const Npp32f *weight_a, const Npp32f *weight_b,
-                    size_t pitch, size_t width, size_t height) {
+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
+                    const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
+                    size_t width, size_t height) {
     auto block_dims = dim3{block_size, block_size};
     auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
                           (uint32_t) width / block_size + (width % block_size != 0)};
-    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, weight_a, weight_b,
-                                         pitch, width, height);
+    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
+                                         weight_a, weight_b, weight_pitch, width, height);
 }