2 years ago · 9e66186a3f
--- a/src/hdr_synthesis.cpp
+++ b/src/hdr_synthesis.cpp
@@ -1,4 +1,5 @@
 
				 #include "hdr_synthesis.h"
			
 
				+#include "hdr_synthesis_priv.h"
			
 
				 #include "cuda_helper.h"
			
 
				 
			
 
				 #include <nppi_arithmetic_and_logical_operations.h>
			
@@ -7,19 +8,7 @@
 
				 #include <nppi_filtering_functions.h>
			
 
				 
			
 
				 #include <cassert>
			
 
				-
			
 
				-void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				-                     Npp32f *out_ptr, size_t out_pitch,
			
 
				-                     size_t width, size_t height);
			
 
				-
			
 
				-void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
			
 
				-                    const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
			
 
				-                    size_t width, size_t height);
			
 
				-
			
 
				-template<typename T>
			
 
				-T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
			
 
				-    return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
			
 
				-}
			
 
				+#include <numeric>
			
 
				 
			
 
				 template<typename T>
			
 
				 struct smart_buffer {
			
@@ -140,7 +129,7 @@ struct hdr_synthesizer::impl {
 
				     }
			
 
				 
			
 
				     bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, is_rgb ? 3 : 1);
			
 
				+        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, is_rgb ? 3 : 1);
			
 
				         CUDA_API_CHECK((is_rgb ?
			
 
				                         nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				                         nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
			
@@ -148,8 +137,8 @@ struct hdr_synthesizer::impl {
 
				                 pyr_ptr, pitch, pyr_size_arr[0],
			
 
				                 2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], is_rgb ? 3 : 1);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i + 1], is_rgb ? 3 : 1);
			
 
				+            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, is_rgb ? 3 : 1);
			
 
				+            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, is_rgb ? 3 : 1);
			
 
				             CUDA_API_CHECK((is_rgb ?
			
 
				                             nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				                             nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
			
@@ -162,19 +151,8 @@ struct hdr_synthesizer::impl {
 
				 
			
 
				     bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				                              NppiSize src_size, NppiSize dst_size) const {
			
 
				-        // up-sampling
			
 
				-        // TODO: check why gaussian blur is not performed
			
 
				-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				-                src_ptr, pitch, src_size, origin_point,
			
 
				-                rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size,
			
 
				-                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				-        // gaussian blur
			
 
				-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R
			
 
				-                               (rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size, origin_point,
			
 
				-                                rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_size,
			
 
				-                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
			
 
				-        // subtraction
			
 
				-        CUDA_API_CHECK(nppiSub_32f_C3IR(rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_ptr, pitch, dst_size));
			
 
				+        call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, false);
			
 
				         return true;
			
 
				     }
			
 
				 
			
@@ -183,11 +161,11 @@ struct hdr_synthesizer::impl {
 
				         gaussian_pyramid(ptr, pitch, true);
			
 
				 
			
 
				         // generate laplacian pyramid by up-sampling and subtraction
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, 3);
			
 
				+        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				         laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i + 1], 3);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], 3);
			
 
				+            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
			
 
				+            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				             laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i]);
			
 
				         }
			
 
				         return true;
			
@@ -195,29 +173,19 @@ struct hdr_synthesizer::impl {
 
				 
			
 
				     bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				                                NppiSize src_size, NppiSize dst_size) const {
			
 
				-        // up-sampling
			
 
				-        CUDA_API_CHECK(nppiFilterGaussPyramidLayerUpBorder_32f_C3R(
			
 
				-                src_ptr, pitch, src_size, origin_point,
			
 
				-                rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size,
			
 
				-                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				-        // gaussian blur
			
 
				-        CUDA_API_CHECK(nppiFilterGaussBorder_32f_C3R
			
 
				-                               (rgb_f32[0]->ptr, rgb_f32[0]->pitch, dst_size, origin_point,
			
 
				-                                rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_size,
			
 
				-                                NPP_MASK_SIZE_5_X_5, NPP_BORDER_REPLICATE));
			
 
				-        // add
			
 
				-        CUDA_API_CHECK(nppiAdd_32f_C3IR(rgb_f32[1]->ptr, rgb_f32[1]->pitch, dst_ptr, pitch, dst_size));
			
 
				+        call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, true);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     // reconstruct from laplacian pyramid, for rgb image only
			
 
				     bool pyramid_reconstruct(Npp32f *ptr, size_t pitch) const {
			
 
				         for (int i = pyr_level - 1; i > 0; --i) {
			
 
				-            auto src_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i], 3);
			
 
				-            auto dst_ptr = smart_offset(ptr, pitch, height, pyr_offset_arr[i - 1], 3);
			
 
				+            auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				+            auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
			
 
				             reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1]);
			
 
				         }
			
 
				-        auto pyr_ptr = smart_offset(ptr, pitch, height, 0, 3);
			
 
				+        auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				         reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
			
 
				         return true;
			
 
				     }
			
--- a/src/hdr_synthesis_kernel.cu
+++ b/src/hdr_synthesis_kernel.cu
@@ -1,6 +1,5 @@
 
				 #include "hdr_synthesis.h"
			
 
				-
			
 
				-#include <nppdefs.h>
			
 
				+#include "hdr_synthesis_priv.h"
			
 
				 
			
 
				 static constexpr float saturation_coff = 2.1213203435596424; // np.std([0, 0, 1])
			
 
				 static constexpr float exposure_coff = 6.25; // 1 / (2 * sigma)**2, with sigma = 0.2
			
@@ -9,23 +8,19 @@ static constexpr float u8_to_f32_coff = 1.0f / 255;
 
				 static constexpr float f32_to_u8_coff = 255;
			
 
				 
			
 
				 static constexpr auto block_size = 16;
			
 
				-
			
 
				-template<typename T>
			
 
				-__device__ T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
			
 
				-    return (T *) ((char *) ptr + x * pitch + y * sizeof(T) * elem_cnt);
			
 
				-}
			
 
				+static_assert(block_size % 2 == 0); // TODO: even block_size will not work
			
 
				 
			
 
				 __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
			
 
				                            float *out_ptr, size_t out_pitch,
			
 
				                            size_t width, size_t height) {
			
 
				     auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				     auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				-    if (x >= height || y >= width) return;
			
 
				+    if (x >= width || y >= height) return;
			
 
				 
			
 
				-    auto in_data = smart_offset(in_ptr, in_pitch, x, y, 3);
			
 
				-    float r = (float) in_data[0] * u8_to_f32_coff;
			
 
				-    float g = (float) in_data[1] * u8_to_f32_coff;
			
 
				-    float b = (float) in_data[2] * u8_to_f32_coff;
			
 
				+    auto in_data = *smart_offset((char3 *) in_ptr, in_pitch, x, y);
			
 
				+    float r = (float) in_data.x * u8_to_f32_coff;
			
 
				+    float g = (float) in_data.y * u8_to_f32_coff;
			
 
				+    float b = (float) in_data.z * u8_to_f32_coff;
			
 
				 
			
 
				     float rgb_mean = (r + g + b) / 3.0f;
			
 
				     float rgb_std = norm3df(r - rgb_mean, b - rgb_mean, g - rgb_mean) / sqrtf(3.0f);
			
@@ -38,46 +33,131 @@ __global__ void hdr_weight(const unsigned char *in_ptr, size_t in_pitch,
 
				     *out_data = sat_weight * expo_weight + smooth_coff;
			
 
				 }
			
 
				 
			
 
				-//__device__ float pixel_merge(const float *img_a, const float *img_b,
			
 
				-//                             const float *wei_a, const float *wei_b,
			
 
				-//                             size_t offset) {
			
 
				-//    auto w_a = *wei_a, w_b = *wei_b;
			
 
				-//    return (img_a[offset] * w_a + img_b[offset] * w_b) / (w_a + w_b);
			
 
				-//}
			
 
				-
			
 
				 __global__ void hdr_merge(float *image_a, const float *image_b, size_t image_pitch,
			
 
				                           const float *weight_a, const float *weight_b, size_t weight_pitch,
			
 
				                           size_t width, size_t height) {
			
 
				     auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				     auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				-    if (x >= height || y >= width) return;
			
 
				+    if (x >= width || y >= height) return;
			
 
				 
			
 
				-    auto ppa = smart_offset(image_a, image_pitch, x, y, 3);
			
 
				-    auto ppb = smart_offset(image_b, image_pitch, x, y, 3);
			
 
				+    auto ppa = smart_offset((float3 *) image_a, image_pitch, x, y);
			
 
				+    auto pa = *ppa;
			
 
				+    auto pb = *smart_offset((float3 *) image_b, image_pitch, x, y);
			
 
				     auto wa = *smart_offset(weight_a, weight_pitch, x, y);
			
 
				     auto wb = *smart_offset(weight_b, weight_pitch, x, y);
			
 
				 
			
 
				-#pragma unroll
			
 
				-    for (auto i = 0; i < 3; ++i) {
			
 
				-        ppa[i] = (ppa[i] * wa + ppb[i] * wb) / (wa + wb) * f32_to_u8_coff;
			
 
				+    auto w_cof = 1.0f / (wa + wb) * f32_to_u8_coff;
			
 
				+    pa.x = (pa.x * wa + pb.x * wb) * w_cof;
			
 
				+    pa.y = (pa.y * wa + pb.y * wb) * w_cof;
			
 
				+    pa.z = (pa.z * wa + pb.z * wb) * w_cof;
			
 
				+
			
 
				+    *ppa = pa;
			
 
				+}
			
 
				+
			
 
				+__device__ float3 mirrored_access(const float *src_ptr, size_t pitch,
			
 
				+                                  NppiSize src_size, NppiSize warp_size,
			
 
				+                                  int x, int y) {
			
 
				+    if (x < 0) [[unlikely]] {
			
 
				+        x = -x;
			
 
				+    } else if (x >= src_size.width) [[unlikely]] {
			
 
				+        x = warp_size.width - x;
			
 
				+    }
			
 
				+    if (y < 0)[[unlikely]] {
			
 
				+        y = -y;
			
 
				+    } else if (y >= src_size.height) [[unlikely]] {
			
 
				+        y = warp_size.height - y;
			
 
				+    }
			
 
				+    return *smart_offset((float3 *) src_ptr, pitch, x, y);
			
 
				+}
			
 
				+
			
 
				+// up-sampling, filter and add/sub from dst
			
 
				+template<bool IsAdd>
			
 
				+__global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t pitch,
			
 
				+                                    const float *filter_ptr, NppiSize src_size, NppiSize dst_size) {
			
 
				+    auto x = blockIdx.x * blockDim.x + threadIdx.x;
			
 
				+    auto y = blockIdx.y * blockDim.y + threadIdx.y;
			
 
				+
			
 
				+    // copy filter coefficients
			
 
				+    static_assert(block_size >= filter_size);
			
 
				+    __shared__ float filter[filter_size];
			
 
				+    if (threadIdx.y == 0 && threadIdx.x < filter_size) {
			
 
				+        filter[threadIdx.x] = filter_ptr[threadIdx.x];
			
 
				     }
			
 
				+
			
 
				+    // copy related pixels
			
 
				+    static constexpr auto board_offset = filter_size >> 1;
			
 
				+    static constexpr auto relate_size = (block_size >> 1) + board_offset + 1;
			
 
				+    static_assert(relate_size <= block_size);
			
 
				+    auto warp_size = NppiSize{(src_size.width - 1) << 1,
			
 
				+                              (src_size.height - 1) << 1};
			
 
				+    __shared__ float pixels[3][relate_size][relate_size + 1];
			
 
				+    auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
			
 
				+    auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
			
 
				+    if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
			
 
				+        auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
			
 
				+                                   sx + (int) threadIdx.x, sy + (int) threadIdx.y);
			
 
				+        pixels[0][threadIdx.y][threadIdx.x] = val.x;
			
 
				+        pixels[1][threadIdx.y][threadIdx.x] = val.y;
			
 
				+        pixels[2][threadIdx.y][threadIdx.x] = val.z;
			
 
				+    }
			
 
				+    __syncthreads();
			
 
				+
			
 
				+    if (x >= dst_size.width || y >= dst_size.height) return;
			
 
				+
			
 
				+    // do work for each pixel
			
 
				+    auto pout = smart_offset((float3 *) dst_ptr, pitch, x, y);
			
 
				+    auto old_val = *pout;
			
 
				+    for (auto ly = 0; ly < filter_size; ++ly) {
			
 
				+        auto y_cof = filter[ly];
			
 
				+        for (auto lx = 0; lx < filter_size; ++lx) {
			
 
				+            auto x_cof = filter[lx];
			
 
				+            auto rx = (threadIdx.x + lx) >> 1;
			
 
				+            auto ry = (threadIdx.y + ly) >> 1;
			
 
				+            auto cof = x_cof * y_cof;
			
 
				+            if constexpr (IsAdd) {
			
 
				+                old_val.x += pixels[0][ry][rx] * cof;
			
 
				+                old_val.y += pixels[1][ry][rx] * cof;
			
 
				+                old_val.z += pixels[2][ry][rx] * cof;
			
 
				+            } else {
			
 
				+                old_val.x -= pixels[0][ry][rx] * cof;
			
 
				+                old_val.y -= pixels[1][ry][rx] * cof;
			
 
				+                old_val.z -= pixels[2][ry][rx] * cof;
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    // write back answer
			
 
				+    *pout = old_val;
			
 
				+}
			
 
				+
			
 
				+auto calc_dims(size_t width, size_t height) {
			
 
				+    auto block_dims = dim3{block_size, block_size};
			
 
				+    auto grid_dims = dim3{(uint32_t) width / block_size + (width % block_size != 0),
			
 
				+                          (uint32_t) height / block_size + (height % block_size != 0)};
			
 
				+    return std::make_tuple(block_dims, grid_dims);
			
 
				 }
			
 
				 
			
 
				 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				                      Npp32f *out_ptr, size_t out_pitch,
			
 
				                      size_t width, size_t height) {
			
 
				-    auto block_dims = dim3{block_size, block_size};
			
 
				-    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				-                          (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				+    auto [block_dims, grid_dims] = calc_dims(width, height);
			
 
				     hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
			
 
				 }
			
 
				 
			
 
				 void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
			
 
				                     const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
			
 
				                     size_t width, size_t height) {
			
 
				-    auto block_dims = dim3{block_size, block_size};
			
 
				-    auto grid_dims = dim3{(uint32_t) height / block_size + (height % block_size != 0),
			
 
				-                          (uint32_t) width / block_size + (width % block_size != 0)};
			
 
				+    auto [block_dims, grid_dims] = calc_dims(width, height);
			
 
				     hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
			
 
				                                          weight_a, weight_b, weight_pitch, width, height);
			
 
				+}
			
 
				+
			
 
				+void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add) {
			
 
				+    auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
			
 
				+    if (is_add) {
			
 
				+        laplacian_operation<true><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
			
 
				+    } else {
			
 
				+        laplacian_operation<false><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
			
 
				+    }
			
 
				 }
			
--- a/src/hdr_synthesis_priv.h
+++ b/src/hdr_synthesis_priv.h
@@ -0,0 +1,24 @@
 
				+#ifndef HDRSYNTHESIS_HDR_SYNTHESIS_PRIV_H
			
 
				+#define HDRSYNTHESIS_HDR_SYNTHESIS_PRIV_H
			
 
				+
			
 
				+#include <nppdefs.h>
			
 
				+
			
 
				+static constexpr auto filter_size = 5;
			
 
				+
			
 
				+void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				+                     Npp32f *out_ptr, size_t out_pitch,
			
 
				+                     size_t width, size_t height);
			
 
				+
			
 
				+void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
			
 
				+                    const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
			
 
				+                    size_t width, size_t height);
			
 
				+
			
 
				+void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add);
			
 
				+
			
 
				+template<typename T>
			
 
				+__device__ __host__ inline T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
			
 
				+    return (T *) ((char *) ptr + y * pitch + x * sizeof(T) * elem_cnt);
			
 
				+}
			
 
				+
			
 
				+#endif //HDRSYNTHESIS_HDR_SYNTHESIS_PRIV_H
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -29,6 +29,44 @@ struct image_buffer {
 
				     size_t image_pitch, weight_pitch;
			
 
				 };
			
 
				 
			
 
				+void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add);
			
 
				+
			
 
				+//int main() {
			
 
				+//    float filter_cof[] = {.2, .2, .2, .2, .2};
			
 
				+//    void *filter_ptr;
			
 
				+//    cudaMalloc(&filter_ptr, sizeof(filter_cof));
			
 
				+//    cudaMemcpy(filter_ptr, filter_cof, sizeof(filter_cof), cudaMemcpyHostToDevice);
			
 
				+//
			
 
				+//    float dst_data[4 * 8 * 3], src_data[4 * 8 * 3];
			
 
				+//    for (auto &val: dst_data) val = 3;
			
 
				+//    for (auto &val: src_data) val = 0;
			
 
				+//    src_data[1 * 8 * 3 + 1 * 3 + 1] = 1;
			
 
				+////    for (auto &val: src_data) val = 1;
			
 
				+////    for (int i = 0; i < 4; ++i)
			
 
				+////        for (int j = 0; j < 8; ++j)
			
 
				+////            for (int k = 0; k < 3; ++k) {
			
 
				+////
			
 
				+////            }
			
 
				+//    size_t pitch;
			
 
				+//    Npp32f *dst_ptr, *src_ptr;
			
 
				+//    cudaMallocPitch(&dst_ptr, &pitch, 8 * 3 * sizeof(float), 4);
			
 
				+//    cudaMallocPitch(&src_ptr, &pitch, 8 * 3 * sizeof(float), 4);
			
 
				+//    cudaMemcpy2D(dst_ptr, pitch, dst_data, 8 * 3 * sizeof(float), 8 * 3 * sizeof(float), 4, cudaMemcpyHostToDevice);
			
 
				+//    cudaMemcpy2D(src_ptr, pitch, src_data, 8 * 3 * sizeof(float), 8 * 3 * sizeof(float), 4, cudaMemcpyHostToDevice);
			
 
				+//    call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				+//                             (Npp32f *) filter_ptr, {4, 2}, {8, 4}, true);
			
 
				+//    for (auto &val: dst_data) val = 0;
			
 
				+//    cudaMemcpy2D(dst_data, 8 * 3 * sizeof(float), dst_ptr, pitch, 8 * 3 * sizeof(float), 4, cudaMemcpyDeviceToHost);
			
 
				+//    for (int i = 0; i < 4; ++i) {
			
 
				+//        for (int j = 0; j < 8; ++j)
			
 
				+//            for (int k = 1; k < 2; ++k)
			
 
				+//                std::cout << dst_data[i * 8 * 3 + j * 3 + k] << " ";
			
 
				+//        std::cout << std::endl;
			
 
				+//    }
			
 
				+//    return 0;
			
 
				+//}
			
 
				+
			
 
				 int main() {
			
 
				     auto path_a = "/home/tpx/project/HDRSynthesis/data/chess_4ms.raw";
			
 
				     auto path_b = "/home/tpx/project/HDRSynthesis/data/chess_50ms.raw";