Преглед на файлове

Implemented image enhancement.

jcsyshc преди 2 години
родител
ревизия
0de7a8e49d
променени са 8 файла, в които са добавени 583 реда и са изтрити 9 реда
  1. 3 0
      CMakeLists.txt
  2. 7 2
      src/frame_sender.cpp
  3. 104 3
      src/image_process.cpp
  4. 1 1
      src/image_process.h
  5. 22 0
      src/image_process/CMakeLists.txt
  6. 399 0
      src/image_process/process_kernels.cu
  7. 36 0
      src/image_process/process_kernels.cuh
  8. 11 3
      src/main_ext.cpp

+ 3 - 0
CMakeLists.txt

@@ -11,6 +11,9 @@ add_executable(RemoteAR3 src/main.cpp
         src/simple_opengl.cpp
         src/third_party/rs.c)
 
+add_subdirectory(src/image_process)
+target_link_libraries(${PROJECT_NAME} ImageProcess)
+
 # CUDA config
 find_package(CUDAToolkit REQUIRED)
 target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)

+ 7 - 2
src/frame_sender.cpp

@@ -315,8 +315,13 @@ struct frame_sender::impl {
                 clear_frame_list();
                 mq().update_variable(REQUEST_IDR, true);
                 mq().update_variable(SENDER_CONNECTED, true);
-                SPDLOG_INFO("New client from {}:{}.",
-                            remote_ep->address().to_string(), remote_ep->port());
+
+                static uint32_t last_frame_id = 0;
+                if (req.frame_id != last_frame_id) {
+                    SPDLOG_INFO("New client from {}:{}.",
+                                remote_ep->address().to_string(), remote_ep->port());
+                    last_frame_id = req.frame_id;
+                }
                 return;
             }
             default: {

+ 104 - 3
src/image_process.cpp

@@ -1,10 +1,70 @@
+#include "cuda_helper.hpp"
 #include "image_process.h"
+#include "image_process/process_kernels.cuh"
 #include "utility.hpp"
 
 #include <opencv2/cudaimgproc.hpp>
 
+namespace process_impl {
+
+    // Owning, grow-only device buffer of `size` elements of T.
+    // create() only reallocates when the request exceeds the current capacity,
+    // so the buffer can be reused across frames without calling cudaMalloc again.
+    // `ptr` and `size` are public so the buffer can be handed to kernel wrappers.
+    template<typename T>
+    struct smart_gpu_buffer {
+        T *ptr = nullptr; // device pointer, nullptr while nothing is allocated
+        size_t size = 0;  // number of valid elements requested by the last create()
+
+        smart_gpu_buffer() = default;
+
+        // The object owns `ptr`: an implicit copy would make two objects call
+        // cudaFree on the same allocation, so copying is disabled.
+        smart_gpu_buffer(const smart_gpu_buffer &) = delete;
+
+        smart_gpu_buffer &operator=(const smart_gpu_buffer &) = delete;
+
+        ~smart_gpu_buffer() {
+            deallocate();
+        }
+
+        // Make room for `req_size` elements. Existing contents are discarded
+        // when a reallocation is needed and kept otherwise.
+        void create(size_t req_size) {
+            if (req_size > capacity) [[unlikely]] {
+                deallocate();
+                CUDA_API_CHECK(cudaMalloc(&ptr, req_size * sizeof(T)));
+                capacity = req_size;
+            }
+            size = req_size;
+        }
+
+    private:
+        size_t capacity = 0; // number of elements the current allocation can hold
+
+        // Release the allocation and return to the empty state. capacity and
+        // size are reset as well so that a failing cudaMalloc in create()
+        // cannot leave a null pointer paired with a non-zero capacity.
+        void deallocate() {
+            if (ptr == nullptr) return;
+            CUDA_API_CHECK(cudaFree(ptr));
+            ptr = nullptr;
+            capacity = 0;
+            size = 0;
+        }
+    };
+
+    // Copy the pitched rows of `in` into `out` as one contiguous run of
+    // elements (destination pitch == cols * elemSize). `out` is resized to
+    // hold every pixel and the copy is enqueued on `stream`.
+    template<typename T>
+    void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
+        assert(in.elemSize() == sizeof(T));
+        const auto img_size = in.size();
+        out->create(img_size.area());
+        const auto row_bytes = in.cols * in.elemSize(); // length of one packed row
+        CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, row_bytes,
+                                         in.cudaPtr(), in.step,
+                                         row_bytes, img_size.height,
+                                         cudaMemcpyDeviceToDevice, stream));
+    }
+
+    // Inverse of flatten(): copy the contiguous elements of `in` into the
+    // pitched GpuMat `*out`, which is (re)created with the given size and type.
+    // The copy is enqueued on `stream`.
+    template<typename T>
+    void unflatten(const smart_gpu_buffer<T> &in, cv::cuda::GpuMat *out,
+                   cv::Size size, int type, cudaStream_t stream) {
+        assert(sizeof(T) == CV_ELEM_SIZE(type));
+        assert(in.size == size.area());
+        out->create(size, type);
+        const auto row_bytes = out->cols * out->elemSize(); // length of one packed row
+        const auto row_cnt = out->size().height;
+        CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step,
+                                         in.ptr, row_bytes,
+                                         row_bytes, row_cnt,
+                                         cudaMemcpyDeviceToDevice, stream));
+    }
+
+}
+
+using namespace process_impl;
+
 struct monocular_processor::impl {
     cv::cuda::GpuMat raw_dev;
+    smart_gpu_buffer<uchar3> rgb_f;
+    smart_gpu_buffer<float> hsv_v_f;
+    smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;
+    smart_gpu_buffer<enhance_coeff> enhance_ext;
 
     static void debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
                         cv::cuda::Stream &stream) {
@@ -17,14 +77,55 @@ struct monocular_processor::impl {
         unreachable();
     }
 
+    // Enhance an 8-bit, 3-channel image using the process kernels: the image
+    // is packed into rgb_f, statistics of its V channel are reduced, the
+    // pixels are remapped in place and the result is written to *out.
+    // Every step is enqueued on `stream`.
+    void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
+        assert(in.type() == CV_8UC3);
+
+        // launch configuration shared by every kernel below
+        constexpr auto threads_per_block = 256;
+        constexpr auto block_cnt = 512;
+
+        // pack the pitched image into one contiguous run of pixels
+        flatten(in, &rgb_f, stream);
+        const auto pixel_cnt = rgb_f.size;
+
+        // V channel of HSV for every pixel
+        hsv_v_f.create(pixel_cnt);
+        call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, pixel_cnt,
+                           threads_per_block, block_cnt, stream);
+
+        // reductions over V: maximum and sum of logarithms
+        // (each scratch buffer holds one partial result per block)
+        hsv_v_max.create(block_cnt);
+        call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, pixel_cnt,
+                        threads_per_block, block_cnt, stream);
+        hsv_v_sum_log.create(block_cnt);
+        call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, pixel_cnt,
+                            threads_per_block, block_cnt, stream);
+
+        // derive the enhance coefficients from the two reductions
+        enhance_ext.create(1);
+        call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr,
+                                   pixel_cnt, enhance_ext.ptr, stream);
+
+        // remap the packed pixels in place
+        call_enhance_image(rgb_f.ptr, rgb_f.ptr, pixel_cnt, enhance_ext.ptr,
+                           threads_per_block, block_cnt, stream);
+
+        // copy the packed result back into a pitched GpuMat
+        unflatten(rgb_f, out, in.size(), CV_8UC3, stream);
+    }
+
     void process(const cv::Mat &in, cv::cuda::GpuMat *out,
-                 cv::cuda::Stream &stream) {
+                 bool enhance, cv::cuda::Stream &stream) {
+        // Pipeline: upload the host frame, debayer it into *out and, when
+        // `enhance` is set, run enhance_image on *out in place.
+        // All steps are issued on `stream`.
         // upload from host to device
         raw_dev.upload(in, stream);
 
         // debayer using OpenCV
         debayer(raw_dev, out, stream);
 
+        // enhance image
+        // (the raw CUDA stream handle is taken from the OpenCV stream wrapper)
+        auto cuda_stream = (cudaStream_t) stream.cudaPtr();
+        if (enhance) {
+            enhance_image(*out, out, cuda_stream);
+        }
+
         // TODO: un-distort
     }
 };
@@ -35,6 +136,6 @@ monocular_processor::monocular_processor()
 monocular_processor::~monocular_processor() = default;
 
 void monocular_processor::process(const cv::Mat &in, cv::cuda::GpuMat *out,
-                                  cv::cuda::Stream &stream) {
-    pimpl->process(in, out, stream);
+                                  bool enhance, cv::cuda::Stream &stream) {
+    // forward all arguments to the private implementation object
+    pimpl->process(in, out, enhance, stream);
 }

+ 1 - 1
src/image_process.h

@@ -13,7 +13,7 @@ public:
 
     ~monocular_processor();
 
-    void process(const cv::Mat &in, cv::cuda::GpuMat *out,
+    void process(const cv::Mat &in, cv::cuda::GpuMat *out, bool enhance = false,
                  cv::cuda::Stream &stream = cv::cuda::Stream::Null());
 
 private:

+ 22 - 0
src/image_process/CMakeLists.txt

@@ -0,0 +1,22 @@
+# Build script for the ImageProcess library, which contains the CUDA kernels
+# in process_kernels.cu.
+cmake_minimum_required(VERSION 3.25)
+project(ImageProcess LANGUAGES CXX CUDA)
+
+# NOTE(review): only the C++ standard is set here. process_kernels.cu uses
+# `if constexpr` and a floating-point non-type template parameter, so confirm
+# that the CUDA language standard (CMAKE_CUDA_STANDARD) is configured elsewhere.
+set(CMAKE_CXX_STANDARD 20)
+
+# No STATIC/SHARED keyword: the library type follows BUILD_SHARED_LIBS.
+add_library(${PROJECT_NAME}
+        process_kernels.cu)
+
+# CUDA config
+find_package(CUDAToolkit REQUIRED)
+target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
+target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
+
+# spdlog config
+find_package(spdlog REQUIRED)
+target_link_libraries(${PROJECT_NAME} spdlog::spdlog)
+target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
+
+# OpenCV config
+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
+target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})

+ 399 - 0
src/image_process/process_kernels.cu

@@ -0,0 +1,399 @@
+#include "process_kernels.cuh"
+
+#include <cassert>
+#include <type_traits>
+
+// kernel templates
+
+// Final stage of the block reduction: the calling warp folds the first
+// min(BlockSize, 64) shared-memory slots into s_buf[0].
+//
+// Must be called by all 32 threads of one warp with tdx in [0, 32).
+// For BlockSize < 64 the slots up to index tdx + BlockSize / 2 are still read,
+// so the buffer needs that many (possibly unused) elements.
+//
+// Each step reads the partner slot, synchronizes the warp, writes the result
+// and synchronizes again. Relying on `volatile` alone assumes the warp runs in
+// lock-step, which does not hold with independent thread scheduling (SM70+):
+// a lane could read a partner slot that was already overwritten in the same
+// step.
+template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
+__device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
+    static_assert(std::is_fundamental_v<OutT>);
+    // running value of this lane, kept in a register between steps
+    OutT acc = s_buf[tdx];
+    if constexpr (BlockSize >= 64) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 32]);
+        __syncwarp(); // all partner reads done before any slot is overwritten
+        s_buf[tdx] = acc;
+        __syncwarp(); // writes visible before the next step reads them
+    }
+    if constexpr (BlockSize >= 32) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 16]);
+        __syncwarp();
+        s_buf[tdx] = acc;
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 16) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 8]);
+        __syncwarp();
+        s_buf[tdx] = acc;
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 8) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 4]);
+        __syncwarp();
+        s_buf[tdx] = acc;
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 4) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 2]);
+        __syncwarp();
+        s_buf[tdx] = acc;
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 2) {
+        ReduceFunc::Op(&acc, s_buf[tdx + 1]);
+        __syncwarp();
+        s_buf[tdx] = acc;
+        __syncwarp();
+    }
+}
+
+// Block-level reduction kernel.
+// Every thread folds the elements in[i] it visits (starting at its global
+// index and advancing by BlockSize * gridDim.x) into a private value that
+// starts at InitVal, using UpdateFunc. The per-thread values are then combined
+// through shared memory with ReduceFunc, and thread 0 of each block stores the
+// block result in out[blockIdx.x].
+// Uses dynamically sized shared memory viewed as an array of OutT. The index
+// arithmetic assumes the kernel is launched with blockDim.x == BlockSize.
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+__global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
+    extern __shared__ int shmem[];
+    auto s_buf = (OutT *) shmem;
+
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = BlockSize * gridDim.x;
+
+    OutT t_out = InitVal;
+
+    // load per-thread data
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        UpdateFunc::Op(&t_out, in[i]);
+    }
+
+    // update to shared memory
+    s_buf[tdx] = t_out;
+    __syncthreads();
+
+    // tree reduction: each step folds the upper half of the remaining slots
+    // into the lower half; the barrier is outside the tdx test so that every
+    // thread of the block reaches it
+    if constexpr (BlockSize >= 512) {
+        if (tdx < 256) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 256) {
+        if (tdx < 128) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 128) {
+        if (tdx < 64) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
+        }
+        __syncthreads();
+    }
+
+    // the remaining slots are folded by the first 32 threads
+    if (tdx < 32) {
+        warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
+    }
+    // publish the result of this block
+    if (tdx == 0) {
+        out[bkx] = s_buf[0];
+    }
+}
+
+// Applies Func::Op(&out[i], in[i]) to every i in [0, n) with a grid-stride
+// loop, so any 1D launch configuration covers the whole range.
+template<typename InT, typename OutT, typename Func>
+__global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
+    const uint32_t stride = blockDim.x * gridDim.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while (idx < n) {
+        Func::Op(&out[idx], in[idx]);
+        idx += stride;
+    }
+}
+
+// Like elementwise_any, but every call additionally receives a copy of the
+// extra value stored at *p_ext: Func::Op(&out[i], in[i], ext) for i in [0, n).
+template<typename InT, typename OutT, typename ExtT, typename Func>
+__global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
+    // read the extra value once per thread, before the loop
+    const ExtT extra = *p_ext;
+
+    const uint32_t stride = blockDim.x * gridDim.x;
+    uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    while (idx < n) {
+        Func::Op(&out[idx], in[idx], extra);
+        idx += stride;
+    }
+}
+
+// Host-side dispatcher for reduce_any: selects the kernel instantiation that
+// matches the runtime `block_size` (128, 256 or 512) and launches it on
+// `stream` with `grid_dim` blocks. After the kernel, out[b] holds the partial
+// result of block b. Any other block size trips the assertion.
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
+                            uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    // one shared-memory slot per thread; doubled for blocks of at most one warp
+    const auto slot_cnt = block_size <= 32 ? 2 * block_size : 1 * block_size;
+    const auto shmem_bytes = slot_cnt * sizeof(OutT);
+    switch (block_size) {
+        case 512:
+            reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, 512>
+                    <<<grid_dim, 512, shmem_bytes, stream>>>(in, out, n);
+            break;
+        case 256:
+            reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, 256>
+                    <<<grid_dim, 256, shmem_bytes, stream>>>(in, out, n);
+            break;
+        case 128:
+            reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, 128>
+                    <<<grid_dim, 128, shmem_bytes, stream>>>(in, out, n);
+            break;
+        default:
+            assert(false);
+    }
+}
+
+// Two-pass reduction of in[0..n) on `stream`; the final result ends up in out[0].
+// `out` is also used as scratch space for one partial result per block.
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any(InT *in, OutT *out, uint32_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    // pass 1: grid_dim blocks reduce the input into out[0..grid_dim)
+    call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>(
+            in, out, n, block_size, grid_dim, stream);
+
+    // pass 2: a single block folds the partial results in place into out[0]
+    call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>(
+            out, out, grid_dim, block_size, 1, stream);
+}
+
+// working functions
+
+// Compile-time constant holding the largest value representable by T.
+template<typename T>
+struct type_max_value {
+    static constexpr T value{std::numeric_limits<T>::max()};
+};
+
+// Reduction step keeping the larger of *out and val in *out.
+template<typename T>
+struct reduce_max_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        const T current = *out;
+        *out = max(current, val);
+    }
+};
+
+// Reduction step keeping the smaller of *out and val in *out.
+template<typename T>
+struct reduce_min_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        const T current = *out;
+        *out = min(current, val);
+    }
+};
+
+// Reduction step adding val to *out.
+template<typename T>
+struct reduce_sum_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        const T current = *out;
+        *out = current + val;
+    }
+};
+
+// Accumulation step adding log(val + eps) to *out; eps keeps the logarithm
+// finite when val is zero.
+template<typename T>
+struct update_log_sum_func {
+    static constexpr T eps = (T) 1e-6;
+
+    static __device__ __forceinline__ void Op(T *out, T val) {
+        const T term = log(val + eps);
+        *out += term;
+    }
+};
+
+// Computes the V channel of HSV, i.e. the largest of the three components of
+// `in`. For a floating-point OutT the value is divided by the maximum of the
+// component type; otherwise it is stored unscaled.
+template<typename InT, typename OutT>
+struct rgb_extract_v_func {
+    static __device__ __forceinline__ void Op(OutT *out, InT in) {
+        const auto c_max = max(max(in.x, in.y), in.z);
+        if constexpr (std::is_floating_point_v<OutT>) {
+            using InElemT = decltype(in.x);
+            constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
+            *out = factor * c_max;
+        } else {
+            *out = c_max;
+        }
+    }
+};
+
+// Remaps one V value: out = norm_factor * log(in / log_avg + 1).
+struct enhance_v_func {
+    static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
+        const float ratio = in / ext.log_avg;
+        *out = ext.norm_factor * log(ratio + 1);
+    }
+};
+
+// Per-pixel enhancement: converts the RGB pixel `in` to HSV, remaps V with
+// v' = norm_factor * log(v / log_avg + 1) and converts back to RGB.
+// ImgT must have integral components x, y, z (interpreted as R, G, B).
+// The result is clamped to the component range before the integer conversion,
+// so NaN or slightly out-of-range values cannot cause an undefined
+// float-to-integer conversion.
+template<typename ImgT>
+struct enhance_image_func {
+    static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
+        // convert RGB to HSV
+        // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
+        using ImgElemT = decltype(in.x);
+        static_assert(std::is_integral_v<ImgElemT>);
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        ImgElemT c_min = min(min(in.x, in.y), in.z);
+        ImgElemT delta = c_max - c_min;
+
+        float h; // hue in sextants, range [0, 6): the factor 60 is eliminated
+        if (delta == 0) {
+            h = 0;
+        } else {
+            float delta_inv = 1.0f / delta;
+            if (c_max == in.x) { // c_max == r
+                h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
+                if (h < 0) {
+                    h += 6;
+                }
+            } else if (c_max == in.y) { // c_max == g
+                h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
+            } else { // c_max == b
+                h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 4
+            }
+        }
+
+        float s;
+        if (c_max == 0) {
+            s = 0;
+        } else {
+            s = (float) delta / c_max;
+        }
+
+        constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
+        float v = v_factor * (float) c_max;
+
+        // enhance V channel
+        v = ext.norm_factor * logf(v / ext.log_avg + 1);
+
+        // convert HSV to RGB
+        // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
+        float c = v * s;
+        float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
+        float m = v - c;
+        // zero-initialised so the values are defined even when the assertion
+        // in the default branch is compiled out
+        float r = 0, g = 0, b = 0;
+        switch ((uint8_t) h) {
+            case 0: {
+                r = c;
+                g = x;
+                b = 0;
+                break;
+            }
+            case 1: {
+                r = x;
+                g = c;
+                b = 0;
+                break;
+            }
+            case 2: {
+                r = 0;
+                g = c;
+                b = x;
+                break;
+            }
+            case 3: {
+                r = 0;
+                g = x;
+                b = c;
+                break;
+            }
+            case 4: {
+                r = x;
+                g = 0;
+                b = c;
+                break;
+            }
+            case 5: {
+                r = c;
+                g = 0;
+                b = x;
+                break;
+            }
+            default: {
+                assert(false);
+            }
+        }
+
+        // scale back to the component range; fmaxf/fminf map NaN to 0 and
+        // keep the value inside [0, max] before it is truncated to an integer
+        constexpr float out_factor = type_max_value<ImgElemT>::value;
+        ImgT out;
+        out.x = (ImgElemT) fminf(fmaxf(out_factor * (r + m), 0.0f), out_factor);
+        out.y = (ImgElemT) fminf(fmaxf(out_factor * (g + m), 0.0f), out_factor);
+        out.z = (ImgElemT) fminf(fmaxf(out_factor * (b + m), 0.0f), out_factor);
+
+        *p_out = out;
+    }
+};
+
+// special kernels
+
+// Derives the coefficients used by the enhance kernels from the reduced
+// statistics. Intended to be launched with a single thread (every thread
+// would write the same values).
+//   log_avg     = exp(*p_sum_log_v / n)
+//   norm_factor = 1 / log(*p_max_v / log_avg + 1)
+// Degenerate inputs are handled explicitly: with n == 0 the average is
+// undefined and 1 is used; when the logarithm is not positive (e.g. an
+// all-black image, where it is log(1) == 0) norm_factor is set to 0 instead
+// of inf/NaN, so the enhance kernels produce 0 rather than NaN.
+__global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
+                                      enhance_coeff *p_out) {
+    float max_v = *p_max_v;
+    float sum_log_v = *p_sum_log_v;
+    float log_avg = n > 0 ? expf(sum_log_v / n) : 1.0f;
+    float denom = logf(max_v / log_avg + 1.0f);
+    // the comparison is also false for NaN, which therefore maps to 0 as well
+    float norm_factor = denom > 0.0f ? 1.0f / denom : 0.0f;
+    p_out->log_avg = log_avg;
+    p_out->norm_factor = norm_factor;
+}
+
+// calling endpoints
+
+// Reduces in[0..n) to its maximum on `stream`; the result is stored in out[0].
+// `out` must provide room for grid_dim elements (one partial result per block).
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using FuncType = reduce_max_func<T>;
+    // lowest(), not min(): for floating-point types min() is the smallest
+    // positive value and would be a wrong identity for inputs <= 0
+    constexpr T InitVal = std::numeric_limits<T>::lowest();
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    helper_func(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+// Reduces in[0..n) to its minimum on `stream`; the result is stored in out[0].
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using MinOp = reduce_min_func<T>;
+    // identity of min: the largest representable value
+    constexpr T identity = std::numeric_limits<T>::max();
+    call_reduce_any<T, T, MinOp, MinOp, identity>(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+// Reduces in[0..n) to its sum on `stream`; the result is stored in out[0].
+template<typename T>
+void call_reduce_sum(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using SumOp = reduce_sum_func<T>;
+    call_reduce_any<T, T, SumOp, SumOp, (T) 0>(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+// Computes the sum of log(in[i] + eps) over in[0..n) on `stream`; the result
+// is stored in out[0]. Logarithms are taken while loading the input, and the
+// partial results are combined with plain addition.
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using LoadOp = update_log_sum_func<T>;
+    using CombineOp = reduce_sum_func<T>;
+    call_reduce_any<T, T, LoadOp, CombineOp, (T) 0>(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+
+// Launches the V-channel extraction for n pixels on `stream`:
+// out[i] receives the V value computed from in[i].
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    auto kernel_func = elementwise_any<InT, OutT, rgb_extract_v_func<InT, OutT>>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n);
+}
+
+template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+// Launches prepare_enhance_coeff with a single thread on `stream`; all
+// pointers are passed straight to the kernel.
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream) {
+    auto kernel_func = prepare_enhance_coeff;
+    kernel_func<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
+}
+
+// Launches the V-channel remapping for n values on `stream`;
+// the coefficients are read from *ext by the kernel.
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>
+            <<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
+}
+
+// Launches the per-pixel image enhancement for n pixels on `stream`;
+// the coefficients are read from *ext by the kernel.
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    elementwise_ext_any<ImgT, ImgT, enhance_coeff, enhance_image_func<ImgT>>
+            <<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
+}
+
+template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);

+ 36 - 0
src/image_process/process_kernels.cuh

@@ -0,0 +1,36 @@
+#ifndef IMAGEHDR_PROCESS_KERNELS_CUH
+#define IMAGEHDR_PROCESS_KERNELS_CUH
+
+// Host-callable entry points of the image processing kernels.
+// All functions only enqueue work on `stream`; all pointers are passed
+// directly to kernels. The templates are defined in process_kernels.cu and
+// are only usable for the types explicitly instantiated there.
+// NOTE: cudaStream_t is not declared by this header; the including file must
+// make it visible first.
+
+#include <cstddef>
+#include <cstdint>
+
+// Reductions over in[0..n). The result is written to out[0]; `out` is also
+// used as scratch space for one partial result per block (grid_dim elements).
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename T>
+void call_reduce_sum(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+// Extracts the V channel of HSV from n RGB pixels.
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+// Coefficients of the mapping v' = norm_factor * log(v / log_avg + 1).
+struct enhance_coeff {
+    float log_avg, norm_factor;
+};
+
+// Computes the coefficients from the reduced maximum and log-sum of n values.
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream);
+
+// Applies the mapping to n V values.
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+// Applies the mapping to the V channel of n RGB pixels.
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+#endif //IMAGEHDR_PROCESS_KERNELS_CUH

+ 11 - 3
src/main_ext.cpp

@@ -47,6 +47,7 @@ cudaStream_t left_cuda_stream = nullptr, right_cuda_stream = nullptr;
 std::unique_ptr<monocular_processor> left_processor, right_processor;
 std::unique_ptr<simple_render> opengl_render;
 float process_frame_rate = 0;
+bool enhance_image = false;
 
 std::unique_ptr<std::thread> encoder_thread;
 bool output_full_frame = false;
@@ -131,7 +132,10 @@ void load_config() {
     main_sender_conf.conn_mtu = sender_conf["mtu"].as<int>();
     main_sender_conf.parity_rate = sender_conf["parity"].as<float>();
     sender_listen_port = sender_conf["port"].as<int>();
-    mq().update_variable(SENDER_CONNECTED, false); // make variable exist
+
+    // make variables exist
+    mq().update_variable(SENDER_CONNECTED, false);
+    mq().update_variable(REQUEST_IDR, false);
 }
 
 void initialize_main_window() {
@@ -215,6 +219,7 @@ bool upload_capture_config_impl() {
 }
 
 void upload_capture_config() {
+    // skip the upload entirely when is_camera_opened() reports false
+    if (!is_camera_opened()) return;
     if (!upload_capture_config_impl()) {
         // TODO: show error msg
     }
@@ -375,6 +380,8 @@ void cleanup() {
     // avoid cudaErrorCudartUnloading
     opengl_render.reset();
     output_fbo.reset();
+    left_processor.reset();
+    right_processor.reset();
 }
 
 void prepare_imgui_frame() {
@@ -436,6 +443,7 @@ void prepare_imgui_frame() {
                                  0.1, 0, 23.4, "%.01f")) {
                 simple_eq.push(upload_capture_config);
             }
+            ImGui::Checkbox("Enhance", &enhance_image);
 
             if (is_capturing()) {
                 // preview config
@@ -589,8 +597,8 @@ void process_camera_frames() {
     right_raw_cnt = cur_cnt;
 
     // process images
-    left_processor->process(*left_raw_ptr, left_img_dev.get(), *left_stream);
-    right_processor->process(*right_raw_ptr, right_img_dev.get(), *right_stream);
+    left_processor->process(*left_raw_ptr, left_img_dev.get(), enhance_image, *left_stream);
+    right_processor->process(*right_raw_ptr, right_img_dev.get(), enhance_image, *right_stream);
 }
 
 void render_main_window() {