浏览代码

Added cuda stream support.

jcsyshc 2 年之前
父节点
当前提交
a88f184887
共有 5 个文件被更改,包括 137 次插入73 次删除
  1. 90 45
      src/hdr_synthesis.cpp
  2. 4 3
      src/hdr_synthesis.h
  3. 22 18
      src/hdr_synthesis_kernel.cu
  4. 4 3
      src/hdr_synthesis_priv.h
  5. 17 4
      src/main.cpp

+ 90 - 45
src/hdr_synthesis.cpp

@@ -64,15 +64,22 @@ struct hdr_synthesizer::impl {
     NppiSize *pyr_size_arr;
     void *gaussian_filter_coff_f32;
 
+    NppStreamContext npp_ctx; // user provided stream
+    NppStreamContext extra_npp_ctx;
+    cudaStream_t extra_stream;
+    cudaEvent_t sync_event;
+
     // global temporary memory
     smart_buffer<Npp8u> *raw_u8, *rgb_u8;
     smart_buffer<Npp32f> *rgb_f32[2];
 
     impl(uint16_t _width, uint16_t _height, uint8_t _level)
             : width(_width), height(_height), pyr_level(_level) {
-        pyr_height = height + (height >> 1);
         malloc_global_memory();
+        init_npp_ctx(&npp_ctx);
+        init_npp_ctx(&extra_npp_ctx);
 
+        pyr_height = height + (height >> 1);
         full_size = NppiSize{width, height};
         full_rect = NppiRect{0, 0, width, height};
         origin_point = NppiPoint{0, 0};
@@ -108,11 +115,16 @@ struct hdr_synthesizer::impl {
         CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
                                   sizeof(gaussian_filter_coff), cudaMemcpyHostToDevice));
 
+        CUDA_API_CHECK(cudaStreamCreate(&extra_stream));
+        CUDA_API_CHECK(cudaEventCreateWithFlags(&sync_event, cudaEventDisableTiming));
+
         return true;
     }
 
     bool free_global_memory() {
         CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
+        CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
+        CUDA_API_CHECK(cudaEventDestroy(sync_event));
 
         delete raw_u8;
         delete rgb_u8;
@@ -122,122 +134,155 @@ struct hdr_synthesizer::impl {
         return true;
     }
 
+    static bool init_npp_ctx(NppStreamContext *ctx) {
+        ctx->hStream = nullptr;
+        CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
+        cudaDeviceProp dev_prop = {};
+        CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
+        ctx->nMultiProcessorCount = dev_prop.multiProcessorCount;
+        ctx->nMaxThreadsPerMultiProcessor = dev_prop.maxThreadsPerMultiProcessor;
+        ctx->nMaxThreadsPerBlock = dev_prop.maxThreadsPerBlock;
+        ctx->nSharedMemPerBlock = dev_prop.sharedMemPerBlock;
+        CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMajor,
+                                              cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
+        CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
+                                              cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
+        CUDA_API_CHECK(cudaStreamGetFlags(nullptr, &ctx->nStreamFlags));
+        return true;
+    }
+
+    static bool set_npp_stream(NppStreamContext *ctx, cudaStream_t stream) {
+        if (ctx->hStream == stream) [[likely]] return true;
+        ctx->hStream = stream;
+        CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
+        return true;
+    }
+
     bool malloc_buffer(image_buffer *buf) const {
         buf->image_pyr = new smart_buffer<Npp32f>(width, pyr_height, 3);
         buf->weight_pyr = new smart_buffer<Npp32f>(width, pyr_height, 1);
         return true;
     }
 
-    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb) const { // construct gaussian pyramid
+    bool gaussian_pyramid(Npp32f *ptr, size_t pitch, bool is_rgb,
+                          NppStreamContext *ctx) const { // construct gaussian pyramid
         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, is_rgb ? 3 : 1);
         CUDA_API_CHECK((is_rgb ?
-                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
-                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
+                        nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
                 ptr, pitch, full_size, origin_point,
                 pyr_ptr, pitch, pyr_size_arr[0],
-                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+                2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
         for (int i = 0; i < pyr_level - 1; ++i) {
             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, is_rgb ? 3 : 1);
             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, is_rgb ? 3 : 1);
             CUDA_API_CHECK((is_rgb ?
-                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R :
-                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R)(
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C3R_Ctx :
+                            nppiFilterGaussPyramidLayerDownBorder_32f_C1R_Ctx)(
                     src_ptr, pitch, pyr_size_arr[i], origin_point,
                     dst_ptr, pitch, pyr_size_arr[i + 1],
-                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR));
+                    2, gaussian_filter_len, (Npp32f *) gaussian_filter_coff_f32, NPP_BORDER_MIRROR, *ctx));
         }
         return true;
     }
 
     bool laplacian_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
-                             NppiSize src_size, NppiSize dst_size) const {
+                             NppiSize src_size, NppiSize dst_size, cudaStream_t stream) const {
         call_laplacian_operation(src_ptr, dst_ptr, pitch,
-                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, false);
+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, false, stream);
         return true;
     }
 
-    bool laplacian_pyramid(Npp32f *ptr, size_t pitch) const { // construct laplacian pyramid
+    bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
+                           NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
         // generate gaussian pyramid first
-        gaussian_pyramid(ptr, pitch, true);
+        gaussian_pyramid(ptr, pitch, true, ctx);
 
         // generate laplacian pyramid by up-sampling and subtraction
         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
-        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
+        laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
         for (int i = 0; i < pyr_level - 1; ++i) {
             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
-            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i]);
+            laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream);
         }
         return true;
     }
 
     bool reconstruct_operation(Npp32f *src_ptr, Npp32f *dst_ptr, size_t pitch,
-                               NppiSize src_size, NppiSize dst_size) const {
+                               NppiSize src_size, NppiSize dst_size, cudaStream_t stream) const {
         call_laplacian_operation(src_ptr, dst_ptr, pitch,
-                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, true);
+                                 (Npp32f *) gaussian_filter_coff_f32, src_size, dst_size, true, stream);
         return true;
     }
 
     // reconstruct from laplacian pyramid, for rgb image only
-    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch) const {
+    bool pyramid_reconstruct(Npp32f *ptr, size_t pitch, cudaStream_t stream) const {
         for (int i = pyr_level - 1; i > 0; --i) {
             auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
             auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
-            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1]);
+            reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream);
         }
         auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
-        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size);
+        reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
         return true;
     }
 
-    bool preprocess_image(image_buffer *buf, uint8_t *raw) const {
+    bool preprocess_image(image_buffer *buf, uint8_t *raw, cudaStream_t stream) {
         // upload image
-        CUDA_API_CHECK(cudaMemcpy2D(raw_u8->ptr, raw_u8->pitch,
-                                    raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
-                                    height, cudaMemcpyHostToDevice));
+        CUDA_API_CHECK(cudaMemcpy2DAsync(raw_u8->ptr, raw_u8->pitch,
+                                         raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
+                                         height, cudaMemcpyHostToDevice, stream));
 
         // debayer image
-        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R(raw_u8->ptr, raw_u8->pitch,
-                                             full_size, full_rect,
-                                             rgb_u8->ptr, rgb_u8->pitch,
-                                             NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED));
+        set_npp_stream(&npp_ctx, stream);
+        CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx(raw_u8->ptr, raw_u8->pitch,
+                                                 full_size, full_rect,
+                                                 rgb_u8->ptr, rgb_u8->pitch,
+                                                 NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
 
         // convert to float
-        CUDA_API_CHECK(nppiConvert_8u32f_C3R(rgb_u8->ptr, rgb_u8->pitch,
+        CUDA_API_CHECK(nppiConvert_8u32f_C3R_Ctx(rgb_u8->ptr, rgb_u8->pitch,
+                                                 buf->image_pyr->ptr, buf->image_pyr->pitch,
+                                                 full_size, npp_ctx));
+        CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
                                              buf->image_pyr->ptr, buf->image_pyr->pitch,
-                                             full_size));
-        CUDA_API_CHECK(nppiMulC_32f_C3IR(u8_to_f32_coff_arr,
-                                         buf->image_pyr->ptr, buf->image_pyr->pitch,
-                                         full_size));
+                                             full_size, npp_ctx));
+        CUDA_API_CHECK(cudaEventRecord(sync_event, stream));
 
-        // calc weight
+        // calc weight and construct pyramid
+        CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
         call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
                         buf->weight_pyr->ptr, buf->weight_pyr->pitch,
-                        width, height);
+                        width, height, extra_stream); // parallel execution for weight related calculation
+        set_npp_stream(&extra_npp_ctx, extra_stream);
+        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx);
+        CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
 
         // construct image pyramid
-        gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false);
-        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch);
+        laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, stream);
+        CUDA_API_CHECK(cudaStreamWaitEvent(stream, sync_event));
 
         return true;
     }
 
     bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
-                     uint8_t *out_ptr, size_t out_pitch) const {
+                     uint8_t *out_ptr, size_t out_pitch, cudaStream_t stream) {
         assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
         assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
 
         // merge
         call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
                        buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
-                       width, pyr_height);
+                       width, pyr_height, stream);
 
         // reconstruct image from laplacian pyramid
-        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch);
+        pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, stream);
 
         // convert to uint8
-        CUDA_API_CHECK(nppiConvert_32f8u_C3R(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
-                                             out_ptr, out_pitch, full_size, NPP_RND_NEAR));
+        set_npp_stream(&npp_ctx, stream);
+        CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
+                                                 out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
 
         return true;
     }
@@ -263,12 +308,12 @@ bool hdr_synthesizer::free_buffer(void *out_buf) {
     return true;
 }
 
-bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr) {
-    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr);
+bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream) {
+    return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, (cudaStream_t) stream);
 }
 
 bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
-                                  uint8_t *img_ptr, size_t img_pitch) {
+                                  uint8_t *img_ptr, size_t img_pitch, void *stream) {
     return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
-                              img_ptr, img_pitch);
+                              img_ptr, img_pitch, (cudaStream_t) stream);
 }

+ 4 - 3
src/hdr_synthesis.h

@@ -4,10 +4,11 @@
 #include <cstdint>
 #include <memory>
 
+// TODO: use half precision float point number to further optimize
 class hdr_synthesizer {
 public:
 
-    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 4);
+    hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level = 8);
 
     ~hdr_synthesizer();
 
@@ -15,9 +16,9 @@ public:
 
     static bool free_buffer(void *out_buf);
 
-    bool preprocess_image(void *img_buf, uint8_t *img_ptr);
+    bool preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream = nullptr);
 
-    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch);
+    bool merge_image(void *buf_a, void *buf_b, uint8_t *img_ptr, size_t img_pitch, void *stream = nullptr);
 
 private:
     struct impl;

+ 22 - 18
src/hdr_synthesis_kernel.cu

@@ -90,15 +90,13 @@ __global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t
     static_assert(relate_size <= block_size);
     auto warp_size = NppiSize{(src_size.width - 1) << 1,
                               (src_size.height - 1) << 1};
-    __shared__ float pixels[3][relate_size][relate_size + 1];
+    __shared__ float3 pixels[relate_size][relate_size];
     auto sx = (int) (blockIdx.x * blockDim.x - board_offset) >> 1;
     auto sy = (int) (blockIdx.y * blockDim.y - board_offset) >> 1;
     if (threadIdx.x < relate_size && threadIdx.y < relate_size) {
         auto val = mirrored_access(src_ptr, pitch, src_size, warp_size,
                                    sx + (int) threadIdx.x, sy + (int) threadIdx.y);
-        pixels[0][threadIdx.y][threadIdx.x] = val.x;
-        pixels[1][threadIdx.y][threadIdx.x] = val.y;
-        pixels[2][threadIdx.y][threadIdx.x] = val.z;
+        pixels[threadIdx.y][threadIdx.x] = val;
     }
     __syncthreads();
 
@@ -114,14 +112,15 @@ __global__ void laplacian_operation(const float *src_ptr, float *dst_ptr, size_t
             auto rx = (threadIdx.x + lx) >> 1;
             auto ry = (threadIdx.y + ly) >> 1;
             auto cof = x_cof * y_cof;
+            float3 pix = pixels[ry][rx];
             if constexpr (IsAdd) {
-                old_val.x += pixels[0][ry][rx] * cof;
-                old_val.y += pixels[1][ry][rx] * cof;
-                old_val.z += pixels[2][ry][rx] * cof;
+                old_val.x += pix.x * cof;
+                old_val.y += pix.y * cof;
+                old_val.z += pix.z * cof;
             } else {
-                old_val.x -= pixels[0][ry][rx] * cof;
-                old_val.y -= pixels[1][ry][rx] * cof;
-                old_val.z -= pixels[2][ry][rx] * cof;
+                old_val.x -= pix.x * cof;
+                old_val.y -= pix.y * cof;
+                old_val.z -= pix.z * cof;
             }
         }
     }
@@ -139,25 +138,30 @@ auto calc_dims(size_t width, size_t height) {
 
 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
                      Npp32f *out_ptr, size_t out_pitch,
-                     size_t width, size_t height) {
+                     size_t width, size_t height, cudaStream_t stream) {
     auto [block_dims, grid_dims] = calc_dims(width, height);
-    hdr_weight<<<grid_dims, block_dims>>>(in_ptr, in_pitch, out_ptr, out_pitch, width, height);
+    hdr_weight<<<grid_dims, block_dims, 0, stream>>>(
+            in_ptr, in_pitch, out_ptr, out_pitch, width, height);
 }
 
 void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
                     const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
-                    size_t width, size_t height) {
+                    size_t width, size_t height, cudaStream_t stream) {
     auto [block_dims, grid_dims] = calc_dims(width, height);
-    hdr_merge<<<grid_dims, block_dims>>>(image_a, image_b, image_pitch,
-                                         weight_a, weight_b, weight_pitch, width, height);
+    hdr_merge<<<grid_dims, block_dims, 0, stream>>>(
+            image_a, image_b, image_pitch,
+            weight_a, weight_b, weight_pitch, width, height);
 }
 
 void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
-                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add) {
+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
+                              bool is_add, cudaStream_t stream) {
     auto [block_dims, grid_dims] = calc_dims(dst_size.width, dst_size.height);
     if (is_add) {
-        laplacian_operation<true><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
+        laplacian_operation<true><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst, pitch, filter, src_size, dst_size);
     } else {
-        laplacian_operation<false><<<grid_dims, block_dims>>>(src, dst, pitch, filter, src_size, dst_size);
+        laplacian_operation<false><<<grid_dims, block_dims, 0, stream>>>(
+                src, dst, pitch, filter, src_size, dst_size);
     }
 }

+ 4 - 3
src/hdr_synthesis_priv.h

@@ -7,14 +7,15 @@ static constexpr auto filter_size = 5;
 
 void call_hdr_weight(const Npp8u *in_ptr, size_t in_pitch,
                      Npp32f *out_ptr, size_t out_pitch,
-                     size_t width, size_t height);
+                     size_t width, size_t height, cudaStream_t stream);
 
 void call_hdr_merge(Npp32f *image_a, const Npp32f *image_b, size_t image_pitch,
                     const Npp32f *weight_a, const Npp32f *weight_b, size_t weight_pitch,
-                    size_t width, size_t height);
+                    size_t width, size_t height, cudaStream_t stream);
 
 void call_laplacian_operation(const Npp32f *src, Npp32f *dst, size_t pitch,
-                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size, bool is_add);
+                              const Npp32f *filter, NppiSize src_size, NppiSize dst_size,
+                              bool is_add, cudaStream_t stream);
 
 template<typename T>
 __device__ __host__ inline T *smart_offset(T *ptr, size_t pitch, size_t x, size_t y, size_t elem_cnt = 1) {

+ 17 - 4
src/main.cpp

@@ -82,14 +82,25 @@ int main() {
     hdr.preprocess_image(buf_a, (uint8_t *) img_file_a.const_data());
     hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
 
+    auto hdr2 = hdr_synthesizer{image_width, image_height};
+    void *buf_a2, *buf_b2;
+    hdr2.malloc_buffer(&buf_a2);
+    hdr2.malloc_buffer(&buf_b2);
+    hdr2.preprocess_image(buf_a2, (uint8_t *) img_file_a.const_data());
+
     auto img_hdr_dev = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
+    auto img_hdr_dev2 = cv::cuda::GpuMat{image_height, image_width, CV_8UC3};
     hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
 
+    cudaStream_t stream, stream2;
+    cudaStreamCreate(&stream);
+    cudaStreamCreate(&stream2);
+
     for (int i = 0; i < 8; ++i) {
         auto start_ts = std::chrono::system_clock::now();
         cudaDeviceSynchronize();
-        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
-        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+        hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
+        hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
         cudaDeviceSynchronize();
         std::cout << std::chrono::duration_cast<std::chrono::microseconds>(
                 std::chrono::system_clock::now() - start_ts).count() << std::endl;
@@ -97,8 +108,10 @@ int main() {
 
     cudaDeviceSynchronize();
     cudaProfilerStart();
-    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data());
-    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1());
+    hdr.preprocess_image(buf_b, (uint8_t *) img_file_b.const_data(), stream);
+//    hdr2.preprocess_image(buf_b2, (uint8_t *) img_file_a.const_data(), stream2);
+    hdr.merge_image(buf_b, buf_a, (uint8_t *) img_hdr_dev.cudaPtr(), img_hdr_dev.step1(), stream);
+//    hdr2.merge_image(buf_b2, buf_a2, (uint8_t *) img_hdr_dev2.cudaPtr(), img_hdr_dev.step1(), stream2);
     cudaDeviceSynchronize();
     cudaProfilerStop();