@@ -0,0 +1,399 @@
+#include "process_kernels.cuh"
|
|
|
+
|
|
|
+#include <cassert>
|
|
|
+#include <type_traits>
|
|
|
+
|
|
|
+// kernel templates
+
+template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
+__device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
+    static_assert(std::is_fundamental_v<OutT>);
+    // unrolled tree reduction within a single warp; __syncwarp() keeps the
+    // volatile shared-memory accesses ordered on architectures with
+    // independent thread scheduling (sm_70 and later)
+    if constexpr (BlockSize >= 64) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 32) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 16]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 16) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 8]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 8) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 4]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 4) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 2]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 2) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 1]);
+    }
+}
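+// note: an equivalent warp-level reduction could also be written with
+// __shfl_down_sync, avoiding shared-memory traffic for the last 32 lanes;
+// the shared-memory form is kept here because it matches ReduceFunc's
+// pointer-based Op interface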
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+__global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
+    extern __shared__ int shmem[];
+    auto s_buf = (OutT *) shmem;
+
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = BlockSize * gridDim.x; // grid stride; assumes blockDim.x == BlockSize
+
+    OutT t_out = InitVal;
+
+    // accumulate a per-thread partial over a grid-stride loop
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        UpdateFunc::Op(&t_out, in[i]);
+    }
+
+    // write the per-thread partial to shared memory
+    s_buf[tdx] = t_out;
+    __syncthreads();
+
+    // block-level tree reduction, unrolled at compile time
+    if constexpr (BlockSize >= 512) {
+        if (tdx < 256) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 256) {
+        if (tdx < 128) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 128) {
+        if (tdx < 64) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
+        }
+        __syncthreads();
+    }
+
+    if (tdx < 32) {
+        warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
+    }
+    if (tdx == 0) {
+        out[bkx] = s_buf[0];
+    }
+}
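+// each block writes exactly one partial result, so out must have room for
+// gridDim.x elements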
+
+template<typename InT, typename OutT, typename Func>
+__global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i]);
+    }
+}
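+// the grid-stride loop decouples launch geometry from n: any grid_dim and
+// block_size combination covers all n elements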
+
+template<typename InT, typename OutT, typename ExtT, typename Func>
+__global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    // load extra values
+    ExtT ext = *p_ext;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i], ext);
+    }
+}
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
+                            uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    // warp_reduce reads past block_size when block_size <= 32, hence the
+    // doubled shared-memory buffer in that case
+    auto shmem_size = block_size * (1 + (block_size <= 32));
+    auto shmem_length = shmem_size * sizeof(OutT);
+    // BlockSize is a template parameter, so the runtime block_size has to be
+    // dispatched to a compile-time constant through this switch
+    switch (block_size) {
+        case 512: {
+            constexpr uint16_t BlockSize = 512;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 256: {
+            constexpr uint16_t BlockSize = 256;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 128: {
+            constexpr uint16_t BlockSize = 128;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        default: {
+            assert(false); // only block sizes 128, 256, and 512 are supported
+        }
+    }
+}
+
+// two-pass reduction; the final result resides in out[0]
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any(InT *in, OutT *out, uint32_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    { // first step: each block writes one partial result to out[0..grid_dim)
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        helper_func(in, out, n, block_size, grid_dim, stream);
+    }
+    { // second step: a single block reduces the grid_dim partials in place
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        helper_func(out, out, grid_dim, block_size, 1, stream);
+    }
+}
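+// e.g. block_size = 256, grid_dim = 64: the first pass writes 64 partials;
+// the second pass reduces those 64 values with one 256-thread block, where
+// threads with no element simply keep InitVal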
+
+// functors used by the kernel templates
+
+template<typename T>
+struct type_max_value {
+    static constexpr T value = std::numeric_limits<T>::max();
+};
+
+template<typename T>
+struct reduce_max_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = max(*out, val);
+    }
+};
+
+template<typename T>
+struct reduce_min_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = min(*out, val);
+    }
+};
+
+template<typename T>
+struct reduce_sum_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = *out + val;
+    }
+};
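+// the volatile pointer parameter matches warp_reduce's volatile buffer and
+// keeps the compiler from caching shared-memory operands in registers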
+
+template<typename T>
+struct update_log_sum_func {
+    static constexpr T eps = (T) 1e-6; // avoids log(0) for zero-valued inputs
+
+    static __device__ __forceinline__ void Op(T *out, T val) {
+        *out += log(val + eps);
+    }
+};
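+// accumulating log(v + eps) feeds the log-average in prepare_enhance_coeff:
+//     log_avg = exp((1 / n) * sum_i log(v_i + eps))
+// i.e. the geometric mean of the (eps-shifted) V channel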
+
+template<typename InT, typename OutT>
+struct rgb_extract_v_func { // extract the V channel of HSV from an RGB pixel
+    static __device__ __forceinline__ void Op(OutT *out, InT in) {
+        if constexpr (std::is_floating_point_v<OutT>) {
+            // normalize to [0, 1] for floating-point outputs
+            using InElemT = decltype(in.x);
+            constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
+            *out = factor * max(max(in.x, in.y), in.z);
+        } else {
+            *out = max(max(in.x, in.y), in.z);
+        }
+    }
+};
+
+struct enhance_v_func {
+    // v' = log(v / log_avg + 1) / log(max_v / log_avg + 1); the denominator is
+    // precomputed as ext.norm_factor in prepare_enhance_coeff
+    static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
+        *out = ext.norm_factor * log(in / ext.log_avg + 1);
+    }
+};
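+// this curve maps v = 0 to 0 and v = max_v to 1, compressing highlights while
+// lifting dark regions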
+
+template<typename ImgT>
+struct enhance_image_func {
+    static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
+        // convert RGB to HSV
+        // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
+        using ImgElemT = decltype(in.x);
+        static_assert(std::is_integral_v<ImgElemT>);
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        ImgElemT c_min = min(min(in.x, in.y), in.z);
+        ImgElemT delta = c_max - c_min;
+
+        float h; // hue sector in [0, 6); the usual factor of 60 degrees is omitted
+        if (delta == 0) {
+            h = 0;
+        } else {
+            float delta_inv = 1.0f / delta;
+            if (c_max == in.x) { // c_max == r
+                h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
+                if (h < 0) {
+                    h += 6;
+                }
+            } else if (c_max == in.y) { // c_max == g
+                h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
+            } else { // c_max == b
+                h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 4
+            }
+        }
+
+        float s;
+        if (c_max == 0) {
+            s = 0;
+        } else {
+            s = (float) delta / c_max;
+        }
+
+        constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
+        float v = v_factor * (float) c_max;
+
+        // enhance V channel
+        v = ext.norm_factor * log(v / ext.log_avg + 1);
+
+        // convert HSV to RGB
+        // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
+        float c = v * s;
+        float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
+        float m = v - c;
+        float r, g, b;
+        switch ((uint8_t) h) { // hue sector 0-5
+            case 0: {
+                r = c;
+                g = x;
+                b = 0;
+                break;
+            }
+            case 1: {
+                r = x;
+                g = c;
+                b = 0;
+                break;
+            }
+            case 2: {
+                r = 0;
+                g = c;
+                b = x;
+                break;
+            }
+            case 3: {
+                r = 0;
+                g = x;
+                b = c;
+                break;
+            }
+            case 4: {
+                r = x;
+                g = 0;
+                b = c;
+                break;
+            }
+            case 5: {
+                r = c;
+                g = 0;
+                b = x;
+                break;
+            }
+            default: {
+                assert(false);
+                r = g = b = 0; // unreachable; silences uninitialized-use warnings
+            }
+        }
+
+        constexpr float out_factor = type_max_value<ImgElemT>::value;
+        ImgT out;
+        out.x = out_factor * (r + m);
+        out.y = out_factor * (g + m);
+        out.z = out_factor * (b + m);
+
+        *p_out = out;
+    }
+};
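+// worked example: pure red {255, 0, 0} gives c_max = 255, delta = 255, h = 0,
+// s = 1, v = 1; after enhancement v' = norm_factor * log(1 / log_avg + 1), and
+// the pixel converts back to {255 * v', 0, 0}, i.e. hue and saturation are
+// preserved while brightness is remapped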
+
+// special kernels
+
+// single-thread kernel: derives the enhancement coefficients on the device,
+// avoiding a round trip of the two reduction results to the host
+__global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
+                                      enhance_coeff *p_out) {
+    float max_v = *p_max_v;
+    float sum_log_v = *p_sum_log_v;
+    float log_avg = exp(sum_log_v / n); // geometric mean of the V channel
+    float norm_factor = 1.0f / (log(max_v / log_avg + 1)); // rescales so max_v maps to 1
+    p_out->log_avg = log_avg;
+    p_out->norm_factor = norm_factor;
+}
+
+// calling endpoints
+
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_max_func<T>;
+    // lowest() rather than min(): for floating-point T, min() is the smallest
+    // positive value, which would break the reduction for all-negative inputs
+    constexpr T InitVal = std::numeric_limits<T>::lowest();
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_min_func<T>;
+    constexpr T InitVal = std::numeric_limits<T>::max();
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_sum(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using UpdateFuncType = update_log_sum_func<T>;
+    using ReduceFuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = rgb_extract_v_func<InT, OutT>;
+    elementwise_any<InT, OutT, FuncType><<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n);
+}
+
+template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream) {
+    prepare_enhance_coeff<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
+}
+
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    auto kernel_func = elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n, ext);
+}
+
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = enhance_image_func<ImgT>;
+    auto kernel_func = elementwise_ext_any<ImgT, ImgT, enhance_coeff, FuncType>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n, ext);
+}
+
+template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
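+
+// Illustrative end-to-end pipeline (a sketch, not part of the API; buffer
+// names are hypothetical and error checking is omitted). d_max_v and
+// d_sum_log_v must hold at least grid_dim elements, since the first reduction
+// pass stores one partial per block; the final value lands in element 0.
+//
+//     uint32_t n = width * height;
+//     call_rgb_extract_v(d_img, d_v, n, 256, 64, stream);         // V channel
+//     call_reduce_max(d_v, d_max_v, n, 256, 64, stream);          // max V
+//     call_reduce_log_sum(d_v, d_sum_log_v, n, 256, 64, stream);  // sum of log V
+//     call_prepare_enhance_coeff(d_max_v, d_sum_log_v, n, d_coeff, stream);
+//     call_enhance_image(d_img, d_out, n, d_coeff, 256, 64, stream);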