2 years ago · a88f184887
--- a/src/hdr_synthesis.cpp
+++ b/src/hdr_synthesis.cpp
@@ -64,15 +64,22 @@ struct hdr_synthesizer::impl {
 
				     NppiSize *pyr_size_arr;
			
 
				     void *gaussian_filter_coff_f32;
			
 
				 
			
 
				+    NppStreamContext npp_ctx; // user provided stream
			
 
				+    NppStreamContext extra_npp_ctx;
			
 
				+    cudaStream_t extra_stream;
			
 
				+    cudaEvent_t sync_event;
			
 
				+
			
 
				     // global temporary memory
			
 
				     smart_buffer<Npp8u> *raw_u8, *rgb_u8;
			
 
				     smart_buffer<Npp32f> *rgb_f32[2];
			
 
				 
			
 
				     impl(uint16_t _width, uint16_t _height, uint8_t _level)
			
 
				             : width(_width), height(_height), pyr_level(_level) {
			
 
				-        pyr_height = height + (height >> 1);
			
 
				         malloc_global_memory();
			
 
				+        init_npp_ctx(&npp_ctx);
			
 
				+        init_npp_ctx(&extra_npp_ctx);
			
 
				 
			
 
				+        pyr_height = height + (height >> 1);
			
 
				         full_size = NppiSize{width, height};
			
 
				         full_rect = NppiRect{0, 0, width, height};
			
 
				         origin_point = NppiPoint{0, 0};
			
@@ -108,11 +115,16 @@ struct hdr_synthesizer::impl {
 
				         CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
			
 
				                                   sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
			
 
				 
			
 
				+        CUDA_API_CHECK(cudaStreamCreate(&extra_stream));
			
 
				+        CUDA_API_CHECK(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));
			
 
				+
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool free_global_memory() {
			
 
				         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
			
 
				+        CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
			
 
				+        CUDA_API_CHECK(cudaEventDestroy(sync_event));
			
 
				 
			
 
				         delete raw_u8;
			
 
				         delete rgb_u8;
			
@@ -122,122 +134,155 @@ struct hdr_synthesizer::impl {
 
				         return true;
			
 
				     }
			
 
				 
			
 
				+    static bool init_npp_ctx(NppStreamContext *ctx) {
			
 
				+        ctx->hStream = nullptr;
			
 
				+        CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
			
 
				+        cudaDeviceProp dev_prop = {};
			
 
				+        CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
			
 
				+        ctx->nMultiProcessorCount = dev_prop.multiProcessorCount;
			
 
				+        ctx->nMaxThreadsPerMultiProcessor = dev_prop.maxThreadsPerMultiProcessor;
			
 
				+        ctx->nMaxThreadsPerBlock = dev_prop.maxThreadsPerBlock;
			
 
				+        ctx->nSharedMemPerBlock = dev_prop.sharedMemPerBlock;
			
 
				+        CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMajor,
			
 
				+                                              cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
			
 
				+        CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
			
 
				+                                              cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
			
 
				+        CUDA_API_CHECK(cudaStreamGetFlags(nullptr, &ctx->nStreamFlags));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    static bool set_npp_stream(NppStreamContext *ctx, cudaStream_t stream) {
			
 
				+        if (ctx->hStream == stream) [[likely]] return true;
			
 
				+        ctx->hStream = stream;
			
 
				+        CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				     bool malloc_buffer(image_buffer *buf) const {
			
 
				         buf->image_pyr = new smart_buffer<Npp32f>(width, pyr_height, 3);
			
 
				         buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
			
 
				+    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb,
			
 
				+                          NppStreamContext *ctx) const { // construct gaussian pyramid
			
 
				         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, is_rgb ? 3 : 1);
			
 
				         CUDA_API_CHECK((is_rgb ?
			
 
				-                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				-                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
			
 
				+                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
			
 
				+                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
			
 
				                 ptr, pitch, full_size, origin_point,
			
 
				                 pyr_ptr, pitch, pyr_size_arr[0],
			
 
				-                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, is_rgb ? 3 : 1);
			
 
				             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, is_rgb ? 3 : 1);
			
 
				             CUDA_API_CHECK((is_rgb ?
			
 
				-                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
			
 
				-                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
			
 
				+                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
			
 
				+                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
			
 
				                     src_ptr, pitch, pyr_size_arr[i], origin_point,
			
 
				                     dst_ptr, pitch, pyr_size_arr[i + 1],
			
 
				-                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
			
 
				+                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
			
 
				         }
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				-                             NppiSize src_size, NppiSize dst_size) const {
			
 
				+                             NppiSize src_size, NppiSize dst_size, cudaStream_t stream) const {
			
 
				         call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				-                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, false);
			
 
				+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, false, stream);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool laplacian_pyramid(Npp32f *ptr, size_t pitch) const { // construct laplacian pyramid
			
 
				+    bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
			
 
				+                           NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
			
 
				         // generate gaussian pyramid first
			
 
				-        gaussian_pyramid(ptr, pitch, true);
			
 
				+        gaussian_pyramid(ptr, pitch, true, ctx);
			
 
				 
			
 
				         // generate laplacian pyramid by up-sampling and subtraction
			
 
				         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
			
 
				+        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
			
 
				         for (int i = 0; i < pyr_level - 1; ++i) {
			
 
				             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
			
 
				             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				-            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i]);
			
 
				+            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream);
			
 
				         }
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
			
 
				-                               NppiSize src_size, NppiSize dst_size) const {
			
 
				+                               NppiSize src_size, NppiSize dst_size, cudaStream_t stream) const {
			
 
				         call_laplacian_operation(src_ptr, dst_ptr, pitch,
			
 
				-                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, true);
			
 
				+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, true, stream);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     // reconstruct from laplacian pyramid, for rgb image only
			
 
				-    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch) const {
			
 
				+    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, cudaStream_t stream) const {
			
 
				         for (int i = pyr_level - 1; i > 0; --i) {
			
 
				             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
			
 
				             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
			
 
				-            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1]);
			
 
				+            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream);
			
 
				         }
			
 
				         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
			
 
				-        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
			
 
				+        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				-    bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
			
 
				+    bool preprocess_image(image_buffer *buf, uint8_t *raw, cudaStream_t stream) {
			
 
				         // upload image
			
 
				-        CUDA_API_CHECK(cudaMemcpy2D(raw_u8->ptr, raw_u8->pitch,
			
 
				-                                    raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
			
 
				-                                    height, cudaMemcpyHostToDevice));
			
 
				+        CUDA_API_CHECK(cudaMemcpy2DAsync(raw_u8->ptr, raw_u8->pitch,
			
 
				+                                         raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
			
 
				+                                         height, cudaMemcpyHostToDevice, stream));
			
 
				 
			
 
				         // debayer image
			
 
				-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R(raw_u8->ptr, raw_u8->pitch,
			
 
				-                                             full_size, full_rect,
			
 
				-                                             rgb_u8->ptr, rgb_u8->pitch,
			
 
				-                                             NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
			
 
				+        set_npp_stream(&npp_ctx, stream);
			
 
				+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx(raw_u8->ptr, raw_u8->pitch,
			
 
				+                                                 full_size, full_rect,
			
 
				+                                                 rgb_u8->ptr, rgb_u8->pitch,
			
 
				+                                                 NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
			
 
				 
			
 
				         // convert to float
			
 
				-        CUDA_API_CHECK(nppiConvert_8u32f_C3R(rgb_u8->ptr, rgb_u8->pitch,
			
 
				+        CUDA_API_CHECK(nppiConvert_8u32f_C3R_Ctx(rgb_u8->ptr, rgb_u8->pitch,
			
 
				+                                                 buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				+                                                 full_size, npp_ctx));
			
 
				+        CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
			
 
				                                              buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				-                                             full_size));
			
 
				-        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
			
 
				-                                         buf->image_pyr->ptr, buf->image_pyr->pitch,
			
 
				-                                         full_size));
			
 
				+                                             full_size, npp_ctx));
			
 
				+        CUDA_API_CHECK(cudaEventRecord(sync_event, stream));
			
 
				 
			
 
				-        // calc weight
			
 
				+        // calc weight and construct pyramid
			
 
				+        CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
			
 
				         call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
			
 
				                         buf->weight_pyr->ptr, buf->weight_pyr->pitch,
			
 
				-                        width, height);
			
 
				+                        width, height, extra_stream); // parallel execution for weight related calculation
			
 
				+        set_npp_stream(&extra_npp_ctx, extra_stream);
			
 
				+        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx);
			
 
				+        CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
			
 
				 
			
 
				         // construct image pyramid
			
 
				-        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false);
			
 
				-        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch);
			
 
				+        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, stream);
			
 
				+        CUDA_API_CHECK(cudaStreamWaitEvent(stream, sync_event));
			
 
				 
			
 
				         return true;
			
 
				     }
			
 
				 
			
 
				     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
			
 
				-                     uint8_t *out_ptr, size_t out_pitch) const {
			
 
				+                     uint8_t *out_ptr, size_t out_pitch, cudaStream_t stream) {
			
 
				         assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
			
 
				         assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
			
 
				 
			
 
				         // merge
			
 
				         call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				                        buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
			
 
				-                       width, pyr_height);
			
 
				+                       width, pyr_height, stream);
			
 
				 
			
 
				         // reconstruct image from laplacian pyramid
			
 
				-        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch);
			
 
				+        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, stream);
			
 
				 
			
 
				         // convert to uint8
			
 
				-        CUDA_API_CHECK(nppiConvert_32f8u_C3R(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				-                                             out_ptr, out_pitch, full_size, NPP_RND_NEAR));
			
 
				+        set_npp_stream(&npp_ctx, stream);
			
 
				+        CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
			
 
				+                                                 out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
			
 
				 
			
 
				         return true;
			
 
				     }
			
@@ -263,12 +308,12 @@ bool hdr_synthesizer::free_buffer(void *out_buf) {
 
				     return true;
			
 
				 }
			
 
				 
			
 
				-bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
			
 
				-    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr);
			
 
				+bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream) {
			
 
				+    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, (cudaStream_t) stream);
			
 
				 }
			
 
				 
			
 
				 bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
			
 
				-                                  uint8_t *img_ptr, size_t img_pitch) {
			
 
				+                                  uint8_t *img_ptr, size_t img_pitch, void *stream) {
			
 
				     return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
			
 
				-                              img_ptr, img_pitch);
			
 
				+                              img_ptr, img_pitch, (cudaStream_t) stream);
			
 
				 }
			
--- a/src/hdr_synthesis.h
+++ b/src/hdr_synthesis.h
@@ -4,10 +4,11 @@
 
				 #include <cstdint>
			
 
				 #include <memory>
			
 
				 
			
 
				+// TODO: use half precision float point number to further optimize
			
 
				 class hdr_synthesizer {
			
 
				 public:
			
 
				 
			
 
				-    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 4);
			
 
				+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
			
 
				 
			
 
				     ~hdr_synthesizer();
			
 
				 
			
@@ -15,9 +16,9 @@ public:
 
				 
			
 
				     static bool free_buffer(void *out_buf);
			
 
				 
			
 
				-    bool preprocess_image(void *img_buf, uint8_t *img_ptr);
			
 
				+    bool preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream = nullptr);
			
 
				 
			
 
				-    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
			
 
				+    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch, void *stream = nullptr);
			
 
				 
			
 
				 private:
			
 
				     struct impl;
			
--- a/src/hdr_synthesis_kernel.cu
+++ b/src/hdr_synthesis_kernel.cu
@@ -90,15 +90,13 @@ __global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t
 
				     static_assert(relate_size <= block_size);
			
 
				     auto warp_size = NppiSize{(src_size.width - 1) << 1,
			
 
				                               (src_size.height - 1) << 1};
			
 
				-    __shared__ float pixels[3][relate_size][relate_size + 1];
			
 
				+    __shared__ float3 pixels[relate_size][relate_size];
			
 
				     auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
			
 
				     auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
			
 
				     if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
			
 
				         auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
			
 
				                                    sx + (int) threadIdx.x, sy + (int) threadIdx.y);
			
 
				-        pixels[0][threadIdx.y][threadIdx.x] = val.x;
			
 
				-        pixels[1][threadIdx.y][threadIdx.x] = val.y;
			
 
				-        pixels[2][threadIdx.y][threadIdx.x] = val.z;
			
 
				+        pixels[threadIdx.y][threadIdx.x] = val;
			
 
				     }
			
 
				     __syncthreads();
			
 
				 
			
@@ -114,14 +112,15 @@ __global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t
 
				             auto rx = (threadIdx.x + lx) >> 1;
			
 
				             auto ry = (threadIdx.y + ly) >> 1;
			
 
				             auto cof = x_cof * y_cof;
			
 
				+            float3 pix = pixels[ry][rx];
			
 
				             if constexpr (IsAdd) {
			
 
				-                old_val.x += pixels[0][ry][rx] * cof;
			
 
				-                old_val.y += pixels[1][ry][rx] * cof;
			
 
				-                old_val.z += pixels[2][ry][rx] * cof;
			
 
				+                old_val.x += pix.x * cof;
			
 
				+                old_val.y += pix.y * cof;
			
 
				+                old_val.z += pix.z * cof;
			
 
				             } else {
			
 
				-                old_val.x -= pixels[0][ry][rx] * cof;
			
 
				-                old_val.y -= pixels[1][ry][rx] * cof;
			
 
				-                old_val.z -= pixels[2][ry][rx] * cof;
			
 
				+                old_val.x -= pix.x * cof;
			
 
				+                old_val.y -= pix.y * cof;
			
 
				+                old_val.z -= pix.z * cof;
			
 
				             }
			
 
				         }
			
 
				     }
			
@@ -139,25 +138,30 @@ auto calc_dims(size_t width, size_t height) {
 
				 
			
 
				 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				                      Npp32f *out_ptr, size_t out_pitch,
			
 
				-                     size_t width, size_t height) {
			
 
				+                     size_t width, size_t height, cudaStream_t stream) {
			
 
				     auto [block_dims, grid_dims] = calc_dims(width, height);
			
 
				-    hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
			
 
				+    hdr_weight<<<grid_dims, block_dims, 0, stream>>>(
			
 
				+            in_ptr, in_pitch, out_ptr, out_pitch, width, height);
			
 
				 }
			
 
				 
			
 
				 void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
			
 
				                     const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
			
 
				-                    size_t width, size_t height) {
			
 
				+                    size_t width, size_t height, cudaStream_t stream) {
			
 
				     auto [block_dims, grid_dims] = calc_dims(width, height);
			
 
				-    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
			
 
				-                                         weight_a, weight_b, weight_pitch, width, height);
			
 
				+    hdr_merge<<<grid_dims, block_dims, 0, stream>>>(
			
 
				+            image_a, image_b, image_pitch,
			
 
				+            weight_a, weight_b, weight_pitch, width, height);
			
 
				 }
			
 
				 
			
 
				 void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				-                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add) {
			
 
				+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
			
 
				+                              bool is_add, cudaStream_t stream) {
			
 
				     auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
			
 
				     if (is_add) {
			
 
				-        laplacian_operation<true><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
			
 
				+        laplacian_operation<true><<<grid_dims, block_dims, 0, stream>>>(
			
 
				+                src, dst, pitch, filter, src_size, dst_size);
			
 
				     } else {
			
 
				-        laplacian_operation<false><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
			
 
				+        laplacian_operation<false><<<grid_dims, block_dims, 0, stream>>>(
			
 
				+                src, dst, pitch, filter, src_size, dst_size);
			
 
				     }
			
 
				 }
			
--- a/src/hdr_synthesis_priv.h
+++ b/src/hdr_synthesis_priv.h
@@ -7,14 +7,15 @@ static constexpr auto filter_size = 5;
 
				 
			
 
				 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
			
 
				                      Npp32f *out_ptr, size_t out_pitch,
			
 
				-                     size_t width, size_t height);
			
 
				+                     size_t width, size_t height, cudaStream_t stream);
			
 
				 
			
 
				 void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
			
 
				                     const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
			
 
				-                    size_t width, size_t height);
			
 
				+                    size_t width, size_t height, cudaStream_t stream);
			
 
				 
			
 
				 void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
			
 
				-                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add);
			
 
				+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
			
 
				+                              bool is_add, cudaStream_t stream);
			
 
				 
			
 
				 template<typename T>
			
 
				 __device__ __host__ inline T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -82,14 +82,25 @@ int main() {
 
				     hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
			
 
				     hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				 
			
 
				+    auto hdr2 = hdr_synthesizer{image_width, image_height};
			
 
				+    void *buf_a2, *buf_b2;
			
 
				+    hdr2.malloc_buffer(&buf_a2);
			
 
				+    hdr2.malloc_buffer(&buf_b2);
			
 
				+    hdr2.preprocess_image(buf_a2, (uint8_t *) img_file_a.const_data());
			
 
				+
			
 
				     auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				+    auto img_hdr_dev2 = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
			
 
				     hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				 
			
 
				+    cudaStream_t stream, stream2;
			
 
				+    cudaStreamCreate(&stream);
			
 
				+    cudaStreamCreate(&stream2);
			
 
				+
			
 
				     for (int i = 0; i < 8; ++i) {
			
 
				         auto start_ts = std::chrono::system_clock::now();
			
 
				         cudaDeviceSynchronize();
			
 
				-        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				-        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
			
 
				+        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
			
 
				         cudaDeviceSynchronize();
			
 
				         std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
			
 
				                 std::chrono::system_clock::now() - start_ts).count() << std::endl;
			
@@ -97,8 +108,10 @@ int main() {
 
				 
			
 
				     cudaDeviceSynchronize();
			
 
				     cudaProfilerStart();
			
 
				-    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
			
 
				-    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
			
 
				+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
			
 
				+//    hdr2.preprocess_image(buf_b2, (uint8_t *) img_file_a.const_data(), stream2);
			
 
				+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
			
 
				+//    hdr2.merge_image(buf_b2, buf_a2, (uint8_t *) img_hdr_dev2.cudaPtr(), img_hdr_dev.step1(), stream2);
			
 
				     cudaDeviceSynchronize();
			
 
				     cudaProfilerStop();