Browse Source

Merged MVS camera modules.

jcsyshc 1 year ago
parent
commit
027c905d1d
42 changed files with 2824 additions and 36 deletions
  1. 44 0
      CMakeLists.txt
  2. 1 0
      data/config_depth_guide_20240410.yaml
  3. 32 0
      data/config_remote_ar_20240410.yaml
  4. 116 0
      src/core/image_utility.hpp
  5. 1 1
      src/core/impl/memory_pool_impl.h
  6. 3 0
      src/core/impl/object_manager.cpp
  7. 1 1
      src/core/memory_pool.h
  8. 39 0
      src/core/yaml_utility.hpp
  9. 210 0
      src/device/impl/mvs_camera.cpp
  10. 48 0
      src/device/impl/mvs_camera_impl.h
  11. 98 0
      src/device/impl/mvs_camera_ui.cpp
  12. 38 0
      src/device/impl/mvs_camera_ui_impl.h
  13. 10 7
      src/device/impl/orb_camera.cpp
  14. 1 1
      src/device/impl/orb_camera_impl.h
  15. 1 1
      src/device/impl/orb_camera_ui.cpp
  16. 91 0
      src/device/mvs_camera.h
  17. 41 0
      src/device/mvs_camera_ui.h
  18. 4 1
      src/device/orb_camera.h
  19. 58 0
      src/image_process/camera_utility.hpp
  20. 49 0
      src/image_process/image_process_ui.h
  21. 60 0
      src/image_process/impl/image_process_ui.cpp
  22. 37 0
      src/image_process/impl/image_process_ui_impl.h
  23. 24 0
      src/image_process_v3/cuda_impl/CMakeLists.txt
  24. 33 0
      src/image_process_v3/cuda_impl/kernel_types.cuh
  25. 128 0
      src/image_process_v3/cuda_impl/kernel_utility.cuh
  26. 508 0
      src/image_process_v3/cuda_impl/process_kernels.cu
  27. 50 0
      src/image_process_v3/cuda_impl/process_kernels.cuh
  28. 300 0
      src/image_process_v3/cuda_impl/vis_marker_kernels.cu
  29. 16 0
      src/image_process_v3/cuda_impl/vis_marker_kernels.cuh
  30. 442 0
      src/image_process_v3/image_process.cpp
  31. 42 0
      src/image_process_v3/image_process.h
  32. 3 0
      src/impl/app_base.h
  33. 62 0
      src/impl/apps/app_selector/app_selector.cpp
  34. 39 0
      src/impl/apps/app_selector/app_selector.h
  35. 93 0
      src/impl/apps/remote_ar/remote_ar.cpp
  36. 53 0
      src/impl/apps/remote_ar/remote_ar.h
  37. 7 6
      src/impl/main_impl.cpp
  38. 0 18
      src/impl/object_names.h
  39. 3 0
      src/module/image_streamer.h
  40. 3 0
      src/module/image_viewer.h
  41. 31 0
      src/module/impl/image_viewer.cpp
  42. 4 0
      src/module/impl/image_viewer_impl.h

+ 44 - 0
CMakeLists.txt

@@ -4,8 +4,11 @@ project(DepthGuide)
 set(CMAKE_CXX_STANDARD 20)
 
 add_executable(${PROJECT_NAME} src/main.cpp
+        src/image_process/impl/image_process_ui.cpp
         src/impl/main_impl.cpp
+        src/impl/apps/app_selector/app_selector.cpp
         src/impl/apps/depth_guide/depth_guide.cpp
+        src/impl/apps/remote_ar/remote_ar.cpp
         src/core/impl/event_timer.cpp
         src/core/impl/memory_pool.cpp
         src/core/impl/object_manager.cpp
@@ -25,6 +28,12 @@ add_executable(${PROJECT_NAME} src/main.cpp
 
 target_include_directories(${PROJECT_NAME} PRIVATE src)
 
+# image process sub-module
+add_subdirectory(src/image_process_v3/cuda_impl)
+target_link_libraries(${PROJECT_NAME} ImageProcessCudaV3)
+target_sources(${PROJECT_NAME} PRIVATE
+        src/image_process_v3/image_process.cpp)
+
 # CUDA config
 find_package(CUDAToolkit REQUIRED)
 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
@@ -83,11 +92,29 @@ target_sources(${PROJECT_NAME} PRIVATE
         ${IMGUI_BACKENDS_DIR}/imgui_impl_glfw.cpp
         ${IMGUI_BACKENDS_DIR}/imgui_impl_opengl3.cpp)
 
+# ImGuiFileDialog config
+set(ImGuiFileDialog_DIR /home/tpx/src/ImGuiFileDialog-0.6.7)
+add_subdirectory(${ImGuiFileDialog_DIR} third_party/imgui_file_dialog)
+target_include_directories(ImGuiFileDialog PRIVATE ${IMGUI_DIR})
+target_link_libraries(${PROJECT_NAME} ImGuiFileDialog)
+
+# yaml-cpp
+find_package(yaml-cpp REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${YAML_CPP_INCLUDE_DIR})
+target_link_libraries(${PROJECT_NAME} ${YAML_CPP_LIBRARIES})
+if (WIN32)
+    target_link_directories(${PROJECT_NAME} PRIVATE C:/BuildEssentials/VS2019Libs/lib)
+endif ()
+
 # Boost config
 find_package(Boost REQUIRED COMPONENTS iostreams)
 target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
 
+# Eigen3 config
+find_package(Eigen3 REQUIRED)
+target_link_libraries(${PROJECT_NAME} Eigen3::Eigen)
+
 # Orbbec config
 set(OrbbecSDK_DIR /home/tpx/src/OrbbecSDK-1.9.5)
 find_package(OrbbecSDK REQUIRED)
@@ -96,6 +123,23 @@ target_sources(${PROJECT_NAME} PRIVATE
         src/device/impl/orb_camera.cpp
         src/device/impl/orb_camera_ui.cpp)
 
+# MVS config
+if (WIN32)
+    set(MVS_DIR "C:/BuildEssentials/Library/MVS/Development")
+    set(MVS_INCLUDE_DIR ${MVS_DIR}/Includes)
+    set(MVS_LIB_DIR ${MVS_DIR}/Libraries/win64)
+else ()
+    set(MVS_DIR /opt/MVS)
+    set(MVS_INCLUDE_DIR ${MVS_DIR}/include)
+    set(MVS_LIB_DIR ${MVS_DIR}/lib/64)
+endif ()
+find_library(MVS_LIB MvCameraControl HINTS ${MVS_LIB_DIR})
+target_include_directories(${PROJECT_NAME} PRIVATE ${MVS_INCLUDE_DIR})
+target_link_libraries(${PROJECT_NAME} ${MVS_LIB})
+target_sources(${PROJECT_NAME} PRIVATE
+        src/device/impl/mvs_camera.cpp
+        src/device/impl/mvs_camera_ui.cpp)
+
 # Crypto++ config
 set(CRYPTOPP_DIR /home/tpx/usr)
 set(CRYPTOPP_LIB_DIR ${CRYPTOPP_DIR}/lib)

+ 1 - 0
data/config_depth_guide_20240410.yaml

@@ -0,0 +1 @@
+app_name: depth_guide

+ 32 - 0
data/config_remote_ar_20240410.yaml

@@ -0,0 +1,32 @@
+app_name: remote_ar
+
+left_camera_name: "LeftEye"
+right_camera_name: "RightEye"
+
+stereo_info:
+  left:
+    fx: 3566.07514575013
+    fy: 3565.09801365950
+    cx: 1230.14290684677
+    cy: 1026.56369172491
+    k0: -0.0668340443448111
+    k1: 0.0831050102080411
+    width: 2448
+    height: 2048
+  right:
+    fx: 3579.11217391698
+    fy: 3578.22682676712
+    cx: 1219.97484179738
+    cy: 1036.82186898493
+    k0: -0.0802083588903196
+    k1: 0.158880530651155
+    width: 2448
+    height: 2048
+  transform:
+    tx: -65.3265350213620
+    ty: 0.245313839615459
+    tz: -0.508569900882309
+    qw: 0.999987680292989
+    qx: -0.000428742826329354
+    qy: -0.00395921570373319
+    qz: -0.00296311539549921

+ 116 - 0
src/core/image_utility.hpp

@@ -4,6 +4,8 @@
 #include "cuda_helper.hpp"
 #include "memory_pool.h"
 
+#include <boost/integer.hpp>
+
 #include <opencv2/core/types.hpp>
 
 enum image_pixel_type {
@@ -22,6 +24,37 @@ constexpr inline int get_cv_type() {
     return 0;
 }
 
+template<typename T1, typename T2>
+constexpr inline auto binary_merge(T1 a, T2 b) {
+    constexpr auto bits_a = sizeof(T1) * 8;
+    constexpr auto bits_b = sizeof(T2) * 8;
+    using ret_type = boost::uint_t<bits_a + bits_b>::least;
+    return (ret_type(a) << bits_b) | b;
+}
+
+enum mem_copy_kind {
+    COPY_HOST_TO_HOST = binary_merge(MEM_HOST, MEM_HOST),
+    COPY_HOST_TO_CUDA = binary_merge(MEM_HOST, MEM_CUDA),
+    COPY_CUDA_TO_HOST = binary_merge(MEM_CUDA, MEM_HOST),
+    COPY_CUDA_TO_CUDA = binary_merge(MEM_CUDA, MEM_CUDA)
+};
+
+inline cudaMemcpyKind get_copy_kind(memory_location src,
+                                    memory_location dst) {
+    auto flag = binary_merge(src, dst);
+    switch (flag) {
+        // @formatter:off
+        case COPY_HOST_TO_HOST: { return cudaMemcpyHostToHost; }
+        case COPY_HOST_TO_CUDA: { return cudaMemcpyHostToDevice; }
+        case COPY_CUDA_TO_HOST: { return cudaMemcpyDeviceToHost; }
+        case COPY_CUDA_TO_CUDA: { return cudaMemcpyDeviceToDevice; }
+        // @formatter:on
+        default: {
+            RET_ERROR_E;
+        }
+    }
+}
+
 #define ALLOC_IMG(type, size, loc, pitch) \
     ALLOC_PITCH_SHARED(type, size.width, size.height, loc, pitch)
 
@@ -37,6 +70,7 @@ template<typename T>
 struct image_info_type {
 
     using pix_type = T;
+    using this_type = image_info_type<T>;
 
     std::shared_ptr<T> ptr;
     memory_location loc = MEM_HOST;
@@ -45,6 +79,65 @@ struct image_info_type {
 
     void *start_ptr() const { return ptr.get(); }
 
+    size_t size_in_bytes() const { return sizeof(T) * size.area(); }
+
+    size_t width_in_bytes() const { return sizeof(T) * size.width; }
+
+    bool is_continuous() const { return sizeof(T) * size.width == pitch; }
+
+    this_type flatten(smart_cuda_stream *stream) const {
+        if (is_continuous()) return *this;
+        assert(loc == MEM_CUDA); // image in host is always continuous
+        return flatten_cuda(stream);
+    }
+
+    this_type unflatten(smart_cuda_stream *stream) const {
+        assert(is_continuous());
+        if (loc == MEM_HOST) return *this; // image in host does not to be pitched
+        return unflatten_cuda(stream);
+    }
+
+    // use after create, force memory copy
+    void fill_from_async(const this_type &o,
+                         smart_cuda_stream *stream) {
+        assert(size == o.size);
+        assert(ptr != o.ptr);
+        auto copy_kind = get_copy_kind(o.loc, loc);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(
+                start_ptr(), pitch, o.start_ptr(), o.pitch,
+                width_in_bytes(), size.height, copy_kind, stream->cuda));
+    }
+
+    // use after create, force memory copy
+    void fill_from_async(void *data, size_t src_pitch,
+                         memory_location src_loc,
+                         smart_cuda_stream *stream) {
+        if (src_pitch == -1) {
+            src_pitch = width_in_bytes();
+        }
+        auto copy_kind = get_copy_kind(src_loc, loc);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(
+                start_ptr(), pitch, data, src_pitch,
+                width_in_bytes(), size.height, copy_kind, stream->cuda));
+    }
+
+    void fill_from_async(const cv::cuda::GpuMat &mat,
+                         smart_cuda_stream *stream) {
+        fill_from_async(mat.data, mat.step, MEM_CUDA, stream);
+    }
+
+    // use after create, force memory copy
+    void fill_from(void *data, size_t src_pitch = -1,
+                   memory_location src_loc = MEM_HOST) {
+        if (src_pitch == -1) {
+            src_pitch = width_in_bytes();
+        }
+        auto copy_kind = get_copy_kind(src_loc, loc);
+        CUDA_API_CHECK(cudaMemcpy2D(
+                start_ptr(), pitch, data, src_pitch,
+                width_in_bytes(), size.height, copy_kind));
+    }
+
     cv::Mat as_mat() const {
         assert(loc == MEM_HOST);
         return {size, get_cv_type<T>(), ptr.get(), pitch};
@@ -66,6 +159,28 @@ struct image_info_type {
         size = _size;
         ptr = ALLOC_IMG(T, size, loc, &pitch);
     }
+
+private:
+    this_type flatten_cuda(smart_cuda_stream *stream) const {
+        assert(loc == MEM_CUDA);
+        auto ret = this_type();
+        ret.ptr = ALLOC_SHARED(T, size.area(), MEM_CUDA);
+        ret.loc = MEM_CUDA;
+        ret.size = size;
+        ret.pitch = width_in_bytes();
+        ret.fill_from_async(*this, stream);
+        return ret;
+    }
+
+    this_type unflatten_cuda(smart_cuda_stream *stream) const {
+        static constexpr auto pitch_align = 32;
+        if ((pitch % pitch_align) == 0) return *this;
+        auto ret = this_type();
+        ret.create(size, MEM_CUDA);
+        ret.fill_from_async(*this, stream);
+        return ret;
+    }
+
 };
 
 template<typename T>
@@ -164,6 +279,7 @@ auto create_image(image_info_type<T> info) {
     return std::make_shared<smart_image<T>>(info);
 }
 
+using image_u8c1 = std::shared_ptr<smart_image<uchar1>>;
 using image_u8c3 = std::shared_ptr<smart_image<uchar3>>;
 using image_u8c4 = std::shared_ptr<smart_image<uchar4>>;
 using image_u16c1 = std::shared_ptr<smart_image<ushort1>>;

+ 1 - 1
src/core/impl/memory_pool_impl.h

@@ -37,7 +37,7 @@ struct memory_pool::impl {
 
     using reuse_host_pool_type = std::multimap<size_t, mem_info_type>;
     using reuse_cuda_linear_pool_type = std::multimap<size_t, mem_info_type>;
-    using reuse_cuda_pitch_pool_type = std::list<mem_info_type>;
+    using reuse_cuda_pitch_pool_type = std::list<mem_info_type>; // TODO: simulate pitched allocation manually
     reuse_host_pool_type reuse_host_pool;
     reuse_cuda_linear_pool_type reuse_cuda_linear_pool;
     reuse_cuda_pitch_pool_type reuse_cuda_pitch_pool;

+ 3 - 0
src/core/impl/object_manager.cpp

@@ -15,6 +15,7 @@ object_manager::impl::~impl() {
 
 object_manager::impl::obj_st_type *
 object_manager::impl::query_st(name_type obj_name) {
+    assert(switch_ctx() == nullptr);
     auto iter = obj_pool.find(obj_name);
     assert(iter != obj_pool.end());
     return &iter->second;
@@ -22,6 +23,7 @@ object_manager::impl::query_st(name_type obj_name) {
 
 std::optional<object_manager::obj_info>
 object_manager::impl::query_info(name_type obj_name) {
+    assert(switch_ctx() == nullptr);
     auto iter = obj_pool.find(obj_name);
     if (iter == obj_pool.end()) [[unlikely]] return {};
     auto &st = iter->second;
@@ -42,6 +44,7 @@ object_manager::impl::query_obj_stats(name_type obj_name) {
 
 void object_manager::impl::create_placeholder(name_type obj_name, std::type_index obj_type,
                                               void *ptr, del_func_type del_func) {
+    assert(switch_ctx() == nullptr);
     assert(!obj_pool.contains(obj_name));
     obj_pool.emplace(std::piecewise_construct,
                      std::forward_as_tuple(obj_name),

+ 1 - 1
src/core/memory_pool.h

@@ -4,7 +4,7 @@
 #include <cassert>
 #include <memory>
 
-enum memory_location {
+enum memory_location : uint8_t {
     MEM_HOST,
     MEM_CUDA
 };

+ 39 - 0
src/core/yaml_utility.hpp

@@ -0,0 +1,39 @@
+#ifndef DEPTHGUIDE_YAML_UTILITY_HPP
+#define DEPTHGUIDE_YAML_UTILITY_HPP
+
+#include <yaml-cpp/yaml.h>
+
+template<typename T>
+inline auto yaml_load_number(const YAML::Node &conf, const char *name) {
+    static_assert(std::is_arithmetic_v<T>);
+    assert(conf[name].IsScalar());
+    return conf[name].as<T>();
+}
+
+inline auto yaml_load_str(const YAML::Node &conf, const char *name) {
+    assert(conf[name].IsScalar());
+    return conf[name].as<std::string>();
+}
+
+inline auto yaml_load_sub(const YAML::Node &conf, const char *name) {
+    assert(conf[name].IsMap());
+    return conf[name];
+}
+
+#define LOAD_NUMBER(type, name) \
+    yaml_load_number<type>(conf, name)
+
+#define LOAD_STR(name) \
+    yaml_load_str(conf, name)
+
+#define LOAD_SUB(name) \
+    yaml_load_sub(conf, name)
+
+#define FROM_YAML_IMPL(type) \
+    static auto from_yaml(const YAML::Node &conf) { \
+        auto ret = type(); \
+        ret.fill_from_yaml(conf); \
+        return ret; \
+    }
+
+#endif //DEPTHGUIDE_YAML_UTILITY_HPP

+ 210 - 0
src/device/impl/mvs_camera.cpp

@@ -0,0 +1,210 @@
+#include "mvs_camera_impl.h"
+#include "core/image_utility.hpp"
+#include "third_party/scope_guard.hpp"
+
+namespace mvs_camera_impl {
+
+    bool check_api_call(int api_ret, unsigned int line_number,
+                        const char *file_name, const char *api_call_str) {
+        if (api_ret == MV_OK) [[likely]] return true;
+        SPDLOG_ERROR("MVS api call {} failed at {}:{} with error 0x{:x}",
+                     api_call_str, file_name, line_number, api_ret);
+        return false;
+    }
+
+#define API_CHECK(api_call) \
+    check_api_call( \
+        api_call, __LINE__, __FILE__, #api_call)
+
+#define API_CHECK_P(api_call) \
+    if (!check_api_call( \
+        api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
+        return nullptr
+
+#define API_CHECK_B(api_call) \
+    if (!check_api_call( \
+        api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
+        return false
+}
+
+using namespace mvs_camera_impl;
+
+mvs_camera::impl::~impl() {
+    if (is_capture) {
+        API_CHECK(MV_CC_StopGrabbing(handle));
+    }
+    API_CHECK(MV_CC_CloseDevice(handle));
+    API_CHECK(MV_CC_DestroyHandle(handle));
+    SPDLOG_INFO("MVS camera {} closed.", dev_name);
+}
+
+void mvs_camera::impl::on_image_impl(unsigned char *data, MV_FRAME_OUT_INFO_EX *frame_info) {
+    auto img_info = create_image_info<uchar1>(frame_size, MEM_HOST);
+    assert(frame_info->nFrameLen == img_info.size_in_bytes());
+    img_info.fill_from(data);
+    OBJ_SAVE(img_name, create_image(img_info));
+}
+
+MvGvspPixelType mvs_camera::impl::convert_pixel_type(pixel_type type) {
+    switch (type) {
+        case RG_8:
+            return PixelType_Gvsp_BayerRG8;
+        case MONO_8:
+            return PixelType_Gvsp_Mono8;
+        default: {
+            RET_ERROR_E;
+        }
+    }
+}
+
+mvs_camera::impl *mvs_camera::impl::create(create_config conf) {
+    // find
+    MV_CC_DEVICE_INFO_LIST dev_info_list;
+    API_CHECK_P(MV_CC_EnumDevices(MV_USB_DEVICE, &dev_info_list));
+    MV_CC_DEVICE_INFO *dev_info = nullptr;
+    for (int i = 0; i < dev_info_list.nDeviceNum; ++i) {
+        auto cur_dev_info = dev_info_list.pDeviceInfo[i];
+        auto cur_dev_name = (char *) cur_dev_info->SpecialInfo.stUsb3VInfo.chUserDefinedName;
+        if (cur_dev_name == conf.dev_name) {
+            dev_info = cur_dev_info;
+            break;
+        }
+    }
+    if (dev_info == nullptr) {
+        SPDLOG_ERROR("MVS camera with name {} not found.", conf.dev_name);
+        return nullptr;
+    }
+    void *handle = nullptr;
+    API_CHECK_P(MV_CC_CreateHandle(&handle, dev_info));
+    assert(handle != nullptr);
+
+    // open
+    API_CHECK_P(MV_CC_OpenDevice(handle, MV_ACCESS_Control));
+    API_CHECK_P(MV_CC_CloseDevice(handle)); // close and open again to fix some bug
+    API_CHECK_P(MV_CC_OpenDevice(handle, MV_ACCESS_Control));
+    SPDLOG_INFO("MVS camera {} opened.", conf.dev_name);
+
+    // config
+    API_CHECK_P(MV_CC_SetEnumValue(handle, "PixelFormat",
+                                   convert_pixel_type(conf.pixel)));
+    API_CHECK_P(MV_CC_SetEnumValue(handle, "AcquisitionMode",
+                                   MV_CAM_ACQUISITION_MODE::MV_ACQ_MODE_CONTINUOUS));
+    API_CHECK_P(MV_CC_SetEnumValue(handle, "TriggerMode", MV_TRIGGER_MODE_OFF));
+    API_CHECK_P(MV_CC_SetBoolValue(handle, "AcquisitionFrameRateEnable", true));
+
+    // create impl
+    auto ret = new impl;
+    auto closer = sg::make_scope_guard([&] { delete ret; });
+    ret->handle = handle;
+    ret->dev_name = conf.dev_name;
+    ret->type = conf.pixel;
+    ret->img_name = conf.img_name;
+    MVCC_INTVALUE int_val;
+    API_CHECK_P(MV_CC_GetIntValue(handle, "Width", &int_val));
+    ret->frame_size.width = int_val.nCurValue;
+    API_CHECK_P(MV_CC_GetIntValue(handle, "Height", &int_val));
+    ret->frame_size.height = int_val.nCurValue;
+
+    // register callbacks
+    API_CHECK_P(MV_CC_RegisterExceptionCallBack(handle, impl::on_error, ret));
+    API_CHECK_P(MV_CC_RegisterImageCallBackEx(handle, impl::on_image, ret));
+
+    closer.dismiss();
+    return ret;
+}
+
+bool mvs_camera::impl::set_capture_config(capture_config conf) {
+    API_CHECK_B(MV_CC_SetFloatValue(handle, "AcquisitionFrameRate", conf.frame_rate));
+    API_CHECK_B(MV_CC_SetFloatValue(handle, "ExposureTime", conf.expo_time_ms * 1000)); // ms -> us
+    API_CHECK_B(MV_CC_SetFloatValue(handle, "Gain", conf.gain_db));
+    return true;
+}
+
+bool mvs_camera::impl::start() {
+    assert(!is_capture);
+    API_CHECK_B(MV_CC_StartGrabbing(handle));
+    is_capture = true;
+    SPDLOG_INFO("MVS camera {} started capturing.", dev_name);
+    return true;
+}
+
+bool mvs_camera::impl::stop() {
+    assert(is_capture);
+    API_CHECK_B(MV_CC_StopGrabbing(handle));
+    is_capture = false;
+    SPDLOG_INFO("MVS camera {} stopped capturing.", dev_name);
+    return true;
+}
+
+mvs_camera::~mvs_camera() = default;
+
+mvs_camera::pointer mvs_camera::create(create_config conf) {
+    auto pimpl = impl::create(conf);
+    if (pimpl == nullptr) return nullptr;
+    auto ret = std::make_unique<this_type>();
+    ret->pimpl = std::unique_ptr<impl>(pimpl);
+    return ret;
+}
+
+bool mvs_camera::set_capture_config(capture_config conf) {
+    return pimpl->set_capture_config(conf);
+}
+
+bool mvs_camera::start() {
+    return pimpl->start();
+}
+
+bool mvs_camera::stop() {
+    return pimpl->stop();
+}
+
+bool mvs_camera::is_capturing() const {
+    return pimpl->is_capture;
+}
+
+mvs_camera_group::pointer mvs_camera_group::create(const create_config &conf) {
+    auto ret = std::make_unique<mvs_camera_group>();
+    auto cam_num = conf.cameras.size();
+    ret->cameras.resize(cam_num);
+    for (auto k = 0; k < cam_num; ++k) {
+        auto cam_info = conf.cameras[k];
+        auto cam_conf = mvs_camera::create_config{
+                .dev_name = cam_info.dev_name,
+                .pixel = conf.pixel,
+                .img_name = cam_info.img_name
+        };
+        auto cam = mvs_camera::create(cam_conf);
+        if (cam == nullptr) return nullptr;
+        ret->cameras[k] = std::move(cam);
+    }
+    return ret;
+}
+
+bool mvs_camera_group::set_capture_config(capture_config conf) {
+    for (auto &cam: cameras) {
+        auto ok = cam->set_capture_config(conf);
+        if (!ok) { RET_ERROR_B; }
+    }
+    return true;
+}
+
+bool mvs_camera_group::start() {
+    for (auto &cam: cameras) {
+        auto ok = cam->start();
+        if (!ok) { RET_ERROR_B; }
+    }
+    return true;
+}
+
+bool mvs_camera_group::stop() {
+    for (auto &cam: cameras) {
+        auto ok = cam->stop();
+        assert(ok);
+    }
+    return true;
+}
+
+bool mvs_camera_group::is_capturing() const {
+    assert(!cameras.empty());
+    return cameras.front()->is_capturing();
+}

+ 48 - 0
src/device/impl/mvs_camera_impl.h

@@ -0,0 +1,48 @@
+#ifndef DEPTHGUIDE_MVS_CAMERA_IMPL_H
+#define DEPTHGUIDE_MVS_CAMERA_IMPL_H
+
+#include "device/mvs_camera.h"
+
+#include <MvCameraControl.h>
+
+#include <opencv2/core/types.hpp>
+
+#include <spdlog/spdlog.h>
+
+struct mvs_camera::impl {
+    void *handle = nullptr;
+    std::string dev_name;
+    pixel_type type = RG_8;
+    obj_name_type img_name = invalid_obj_name;
+    cv::Size frame_size;
+    bool is_capture = false;
+
+    ~impl();
+
+    void on_error_impl(unsigned int msg_type) const {
+        if (msg_type == 0x8003) return; // stop capture event, not an error
+        SPDLOG_ERROR("MVS camera {} exception 0x{:x}.", dev_name, msg_type);
+    }
+
+    static void on_error(unsigned int msg_type, void *user_data) {
+        ((impl *) user_data)->on_error_impl(msg_type);
+    }
+
+    void on_image_impl(unsigned char *data, MV_FRAME_OUT_INFO_EX *frame_info);
+
+    static void on_image(unsigned char *data, MV_FRAME_OUT_INFO_EX *frame_info, void *user_data) {
+        ((impl *) user_data)->on_image_impl(data, frame_info);
+    }
+
+    static MvGvspPixelType convert_pixel_type(pixel_type type);
+
+    static impl *create(create_config conf);
+
+    bool set_capture_config(capture_config conf);
+
+    bool start();
+
+    bool stop();
+};
+
+#endif //DEPTHGUIDE_MVS_CAMERA_IMPL_H

+ 98 - 0
src/device/impl/mvs_camera_ui.cpp

@@ -0,0 +1,98 @@
+#include "mvs_camera_ui_impl.h"
+#include "core/imgui_utility.hpp"
+
+using boost::asio::post;
+
+mvs_camera_ui::impl::impl(const create_config &_conf) {
+    conf = _conf;
+    ctx = conf.ctx;
+}
+
+void mvs_camera_ui::impl::open_cameras() {
+    auto create_conf = mvs_camera_group::create_config{
+            .cameras = conf.cameras,
+            .pixel = pix_type
+    };
+    cameras = mvs_camera_group::create(create_conf);
+    assert(cameras != nullptr);
+    emit_cap_info_sig();
+}
+
+void mvs_camera_ui::impl::start_cameras() {
+    assert(cameras != nullptr);
+    cameras->set_capture_config(cap_conf);
+    cameras->start();
+}
+
+void mvs_camera_ui::impl::update_capture_config() {
+    if (cameras == nullptr || !cameras->is_capturing()) return;
+    cameras->set_capture_config(cap_conf);
+}
+
+void mvs_camera_ui::impl::emit_cap_info_sig() {
+    auto info = capture_info_type{
+            .frame_rate = cap_conf.frame_rate,
+            .is_mono = (pix_type == mvs_camera::MONO_8),
+    };
+    q_this->cap_info_sig(info);
+}
+
+void mvs_camera_ui::impl::show() {
+
+    // camera actions
+    ImGui::SeparatorText("Actions");
+    if (cameras == nullptr) {
+        if (ImGui::Button("Open")) {
+            post(*ctx, [this] { open_cameras(); });
+        }
+    } else { // cameras have been opened
+        if (ImGui::Button("Close")) {
+            post(*ctx, [this] { cameras = nullptr; });
+        }
+        ImGui::SameLine();
+        if (!cameras->is_capturing()) {
+            if (ImGui::Button("Start")) {
+                post(*ctx, [this] { start_cameras(); });
+            }
+        } else {
+            if (ImGui::Button("Stop")) {
+                post(*ctx, [this] { cameras->stop(); });
+            }
+        }
+    }
+
+    // camera configs
+    ImGui::SeparatorText("Capture Configs");
+    if (ImGui::DragInt("Frame Rate (fps)", &cap_conf.frame_rate, 1, 1, 60)) {
+        update_capture_config();
+        emit_cap_info_sig();
+    }
+    if (ImGui::DragFloat("Exposure Time (ms)", &cap_conf.expo_time_ms,
+                         0.1, 0.1, 1e3f / cap_conf.frame_rate, "%.01f")) {
+        update_capture_config();
+    }
+    if (ImGui::DragFloat("Analog Gain (dB)", &cap_conf.gain_db,
+                         0.1, 0, 23.4, "%.01f")) {
+        update_capture_config();
+    }
+
+    { // pixel type config
+        ImGui::SeparatorText("Pixel Type");
+        auto guard = imgui_disable_guard(cameras != nullptr);
+        auto pix_ptr = (int *) &pix_type;
+        ImGui::RadioButton("RG8", pix_ptr, pixel_type::RG_8);
+        ImGui::SameLine();
+        ImGui::RadioButton("MONO8", pix_ptr, pixel_type::MONO_8);
+    }
+}
+
+mvs_camera_ui::mvs_camera_ui(const create_config &conf)
+        : pimpl(std::make_unique<impl>(conf)) {
+    pimpl->q_this = this;
+}
+
+mvs_camera_ui::~mvs_camera_ui() = default;
+
+void mvs_camera_ui::show() {
+    pimpl->show();
+}

+ 38 - 0
src/device/impl/mvs_camera_ui_impl.h

@@ -0,0 +1,38 @@
+#ifndef DEPTHGUIDE_MVS_CAMERA_UI_IMPL_H
+#define DEPTHGUIDE_MVS_CAMERA_UI_IMPL_H
+
+#include "device/mvs_camera_ui.h"
+
+using boost::asio::io_context;
+
+struct mvs_camera_ui::impl {
+
+    static constexpr auto max_camera_count =
+            mvs_camera_group::max_camera_count;
+
+    mvs_camera_ui *q_this = nullptr;
+    io_context *ctx = nullptr;
+    create_config conf;
+
+    std::unique_ptr<mvs_camera_group> cameras;
+
+    using pixel_type = mvs_camera::pixel_type;
+    using capture_config = mvs_camera::capture_config;
+    pixel_type pix_type = pixel_type::RG_8;
+    capture_config cap_conf = {};
+
+    explicit impl(const create_config &conf);
+
+    void emit_cap_info_sig();
+
+    void update_capture_config();
+
+    void open_cameras();
+
+    void start_cameras();
+
+    void show();
+
+};
+
+#endif //DEPTHGUIDE_MVS_CAMERA_UI_IMPL_H

+ 10 - 7
src/device/impl/orb_camera.cpp

@@ -12,7 +12,10 @@ using boost::asio::post;
 
 namespace orb_camera_impl {
 
-    ob::Context ob_ctx;
+    ob::Context *ob_ctx() { // on request creation
+        static ob::Context ctx;
+        return &ctx;
+    }
 
     const char *get_fmt_name(OBFormat fmt) {
         switch (fmt) {
@@ -61,7 +64,7 @@ namespace orb_camera_impl {
 }
 
 std::shared_ptr<ob::Device> orb_camera::impl::get_device(const char *sn) {
-    auto dev_list = ob_ctx.queryDeviceList();
+    auto dev_list = ob_ctx()->queryDeviceList();
     return dev_list->getDeviceBySN(sn);
 }
 
@@ -178,11 +181,11 @@ void orb_camera::impl::stop() {
     is_capturing = false;
 }
 
-orb_camera *orb_camera::create(orb_camera::create_config conf) {
-    auto pimpl = std::unique_ptr<impl>(impl::create(conf));
+orb_camera::pointer orb_camera::create(create_config conf) {
+    auto pimpl = impl::create(conf);
     if (pimpl == nullptr) return nullptr;
-    auto ret = new orb_camera();
-    ret->pimpl = std::move(pimpl);
+    auto ret = std::make_unique<this_type>();
+    ret->pimpl = std::unique_ptr<impl>(pimpl);
     return ret;
 }
 
@@ -190,7 +193,7 @@ orb_camera::~orb_camera() = default;
 
 std::vector<orb_camera::device_info>
 orb_camera::query_device_info() {
-    auto dev_list = ob_ctx.queryDeviceList();
+    auto dev_list = ob_ctx()->queryDeviceList();
     std::vector<device_info> ret;
     ret.reserve(dev_list->deviceCount());
     for (auto k = 0; k < dev_list->deviceCount(); ++k) {

+ 1 - 1
src/device/impl/orb_camera_impl.h

@@ -10,7 +10,7 @@
 
 namespace orb_camera_impl {
 
-    extern ob::Context ob_ctx;
+    ob::Context *ob_ctx();
 
     const char *get_fmt_name(OBFormat fmt);
 

+ 1 - 1
src/device/impl/orb_camera_ui.cpp

@@ -24,7 +24,7 @@ void orb_camera_ui::impl::refresh_dev_info_list() {
 
 void orb_camera_ui::impl::open_camera() {
     cam_c_conf.sn_str = dev_info_list[dev_index].sn_str.c_str();
-    cam = std::unique_ptr<orb_camera>(orb_camera::create(cam_c_conf));
+    cam = orb_camera::create(cam_c_conf);
     assert(cam != nullptr);
 
     c_conf_list.clear();

+ 91 - 0
src/device/mvs_camera.h

@@ -0,0 +1,91 @@
+#ifndef REMOTEAR3_MVS_CAMERA_H
+#define REMOTEAR3_MVS_CAMERA_H
+
+#include "core/object_manager.h"
+
+#include <boost/container/static_vector.hpp>
+
+#include <memory>
+
+class mvs_camera {
+public:
+
+    ~mvs_camera();
+
+    enum pixel_type : int {
+        RG_8,
+        MONO_8
+    };
+
+    struct create_config {
+        std::string dev_name;
+        pixel_type pixel;
+        obj_name_type img_name;
+        // TODO: output RGB image
+    };
+
+    using this_type = mvs_camera;
+    using pointer = std::unique_ptr<this_type>;
+
+    static pointer create(create_config conf);
+
+    struct capture_config {
+        int frame_rate = 50; // frames per second
+        float expo_time_ms = 12.0f;
+        float gain_db = 20.0f;
+    };
+
+    bool set_capture_config(capture_config conf);
+
+    bool start();
+
+    bool stop();
+
+    bool is_capturing() const;
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+class mvs_camera_group {
+public:
+    static constexpr auto max_camera_count = 2;
+
+    struct camera_config {
+        std::string dev_name;
+        obj_name_type img_name; // image_u8c1
+    };
+
+    struct create_config {
+        using camera_list_type = boost::container::static_vector<
+                camera_config, max_camera_count>;
+        using pixel_type = mvs_camera::pixel_type;
+
+        camera_list_type cameras;
+        pixel_type pixel;
+    };
+
+    using this_type = mvs_camera_group;
+    using pointer = std::unique_ptr<this_type>;
+
+    static pointer create(const create_config &conf);
+
+    using capture_config = mvs_camera::capture_config;
+
+    bool set_capture_config(capture_config conf);
+
+    bool start();
+
+    bool stop();
+
+    bool is_capturing() const;
+
+private:
+    using camera_type = mvs_camera::pointer;
+    using cameras_type = boost::container::static_vector<
+            camera_type, max_camera_count>;
+    cameras_type cameras;
+};
+
+#endif //REMOTEAR3_MVS_CAMERA_H

+ 41 - 0
src/device/mvs_camera_ui.h

@@ -0,0 +1,41 @@
+#ifndef DEPTHGUIDE_MVS_CAMERA_UI_H
+#define DEPTHGUIDE_MVS_CAMERA_UI_H
+
+#include "mvs_camera.h"
+#include "core/object_manager.h"
+
+#include <boost/signals2.hpp>
+
+#include <memory>
+
+class mvs_camera_ui {
+public:
+
+    struct create_config {
+        using camera_list_type =
+                mvs_camera_group::create_config::camera_list_type;
+        camera_list_type cameras;
+        boost::asio::io_context *ctx = nullptr;
+    };
+
+    explicit mvs_camera_ui(const create_config &conf);
+
+    ~mvs_camera_ui();
+
+    struct capture_info_type {
+        int frame_rate = 0;
+        bool is_mono = false;
+    };
+
+    using capture_info_sig_type =
+            boost::signals2::signal<void(capture_info_type)>;
+    capture_info_sig_type cap_info_sig;
+
+    void show();
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+#endif //DEPTHGUIDE_MVS_CAMERA_UI_H

+ 4 - 1
src/device/orb_camera.h

@@ -29,7 +29,10 @@ public:
         boost::asio::io_context *ctx = nullptr;
     };
 
-    static orb_camera *create(create_config conf);
+    using this_type = orb_camera;
+    using pointer = std::unique_ptr<this_type>;
+
+    static pointer create(create_config conf);
 
     struct video_info {
         uint32_t index = 0;

+ 58 - 0
src/image_process/camera_utility.hpp

@@ -0,0 +1,58 @@
+#ifndef DEPTHGUIDE_CAMERA_UTILITY_HPP
+#define DEPTHGUIDE_CAMERA_UTILITY_HPP
+
+#include "core/yaml_utility.hpp"
+
+#include <Eigen/Geometry>
+
+#include <cstdint>
+
+struct camera_intrinsic {
+    float fx, fy; // focus length in pixel
+    float cx, cy; // optical center in pixel
+    float k[2]; // distort coefficients
+    uint32_t width, height;
+
+    void fill_from_yaml(const YAML::Node &conf) {
+        fx = LOAD_NUMBER(float, "fx");
+        fy = LOAD_NUMBER(float, "fy");
+        cx = LOAD_NUMBER(float, "cx");
+        cy = LOAD_NUMBER(float, "cy");
+        k[0] = LOAD_NUMBER(float, "k0");
+        k[1] = LOAD_NUMBER(float, "k1");
+        width = LOAD_NUMBER(int, "width");
+        height = LOAD_NUMBER(int, "height");
+    }
+
+    FROM_YAML_IMPL(camera_intrinsic)
+
+};
+
+inline Eigen::Isometry3f transform_from_yaml_f(const YAML::Node &conf) {
+    return Eigen::Translation3f(
+            LOAD_NUMBER(float, "tx"),
+            LOAD_NUMBER(float, "ty"),
+            LOAD_NUMBER(float, "tz")
+    ) * Eigen::Quaternionf(
+            LOAD_NUMBER(float, "qw"),
+            LOAD_NUMBER(float, "qx"),
+            LOAD_NUMBER(float, "qy"),
+            LOAD_NUMBER(float, "qz")
+    );
+}
+
+struct stereo_camera_info {
+    camera_intrinsic left = {}, right = {};
+    Eigen::Isometry3f transform; // left in right
+
+    void fill_from_yaml(const YAML::Node &conf) {
+        left.fill_from_yaml(LOAD_SUB("left"));
+        right.fill_from_yaml(LOAD_SUB("right"));
+        transform = transform_from_yaml_f(LOAD_SUB("transform"));
+    }
+
+    FROM_YAML_IMPL(stereo_camera_info)
+
+};
+
+#endif //DEPTHGUIDE_CAMERA_UTILITY_HPP

+ 49 - 0
src/image_process/image_process_ui.h

@@ -0,0 +1,49 @@
+#ifndef DEPTHGUIDE_IMAGE_PROCESS_UI_H
+#define DEPTHGUIDE_IMAGE_PROCESS_UI_H
+
+#include "core/object_manager.h"
+#include "image_process_v3/image_process.h"
+
+#include <opencv2/core/types.hpp>
+
+#include <memory>
+
+class image_process_ui {
+public:
+
+    // device related information
+    struct device_info_type {
+        cv::Size2f valid_range; // see calc_valid_range()
+        camera_intrinsic cam_int;
+    };
+
+    struct create_config {
+        device_info_type dev_info;
+        uint32_t output_height = 1080;
+
+        obj_name_type in_name = invalid_obj_name;
+        obj_name_type out_name = invalid_obj_name;
+        smart_cuda_stream *stream = nullptr;
+    };
+
+    explicit image_process_ui(create_config conf);
+
+    ~image_process_ui();
+
+    struct modifiable_config {
+        bool is_mono = false;
+    };
+
+    void change_config(modifiable_config conf);
+
+    void show();
+
+    void sync_with(image_process_ui *ui);
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+
+#endif //DEPTHGUIDE_IMAGE_PROCESS_UI_H

+ 60 - 0
src/image_process/impl/image_process_ui.cpp

@@ -0,0 +1,60 @@
+#include "image_process_ui_impl.h"
+#include "core/imgui_utility.hpp"
+
+image_process_ui::impl::impl(create_config _conf) {
+    conf = _conf;
+    img_cb_conn = OBJ_SIG(conf.in_name)->connect(
+            [this](auto name) { process(); });
+}
+
+image_process_ui::impl::~impl() {
+    img_cb_conn.disconnect();
+}
+
+void image_process_ui::impl::show() {
+    ImGui::Checkbox("Enhance", &ui_conf->enhance);
+    ImGui::SameLine();
+    ImGui::Checkbox("Undistort", &ui_conf->undistort);
+    if (!is_mono) {
+        auto guard = imgui_disable_guard(ui_conf->undistort);
+        ImGui::SameLine();
+        ImGui::Checkbox("Crude Debayer", &ui_conf->crude_debayer);
+    }
+}
+
+void image_process_ui::impl::process() {
+    auto proc_conf = monocular_processor::process_config();
+    proc_conf.is_mono = is_mono;
+    proc_conf.crude_debayer = ui_conf->crude_debayer;
+    proc_conf.enhance = ui_conf->enhance;
+    proc_conf.undistort = ui_conf->undistort;
+    proc_conf.valid_range = conf.dev_info.valid_range;
+    proc_conf.camera = conf.dev_info.cam_int;
+    proc_conf.resample_height = conf.output_height;
+    proc_conf.stream = conf.stream;
+    auto img_in = OBJ_QUERY(image_u8c1, conf.in_name);
+    auto img_out = processor.process(img_in, proc_conf);
+    OBJ_SAVE(conf.out_name, img_out);
+}
+
+void image_process_ui::impl::change_config(modifiable_config _conf) {
+    is_mono = _conf.is_mono;
+}
+
+image_process_ui::image_process_ui(create_config conf)
+        : pimpl(std::make_unique<impl>(conf)) {
+}
+
+image_process_ui::~image_process_ui() = default;
+
+void image_process_ui::change_config(modifiable_config conf) {
+    pimpl->change_config(conf);
+}
+
+void image_process_ui::show() {
+    pimpl->show();
+}
+
+void image_process_ui::sync_with(image_process_ui *ui) {
+    pimpl->ui_conf = ui->pimpl->ui_conf;
+}

+ 37 - 0
src/image_process/impl/image_process_ui_impl.h

@@ -0,0 +1,37 @@
+#ifndef DEPTHGUIDE_IMAGE_PROCESS_UI_IMPL_H
+#define DEPTHGUIDE_IMAGE_PROCESS_UI_IMPL_H
+
+#include "image_process/image_process_ui.h"
+
+struct image_process_ui::impl {
+
+    struct ui_config_type {
+        bool crude_debayer = true;
+        bool enhance = false;
+        bool undistort = true;
+    };
+
+    std::shared_ptr<ui_config_type> ui_conf =
+            std::make_shared<ui_config_type>();
+
+    create_config conf;
+    bool is_mono = false;
+
+    using conn_type = boost::signals2::connection;
+    conn_type img_cb_conn;
+
+    monocular_processor processor;
+
+    explicit impl(create_config conf);
+
+    ~impl();
+
+    void show();
+
+    void process();
+
+    void change_config(modifiable_config conf);
+
+};
+
+#endif //DEPTHGUIDE_IMAGE_PROCESS_UI_IMPL_H

+ 24 - 0
src/image_process_v3/cuda_impl/CMakeLists.txt

@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.25)
+project(ImageProcessCudaV3 LANGUAGES CXX CUDA)
+
+set(CMAKE_CXX_STANDARD 20)
+
+add_library(${PROJECT_NAME}
+        process_kernels.cu
+        vis_marker_kernels.cu)
+
+# CUDA config
+find_package(CUDAToolkit REQUIRED)
+target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
+target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "75;86")
+
+# spdlog config
+find_package(spdlog REQUIRED)
+target_link_libraries(${PROJECT_NAME} spdlog::spdlog)
+target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
+
+# OpenCV config
+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
+target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})

+ 33 - 0
src/image_process_v3/cuda_impl/kernel_types.cuh

@@ -0,0 +1,33 @@
+#ifndef REMOTEAR3_KERNEL_DEFS_CUH
+#define REMOTEAR3_KERNEL_DEFS_CUH
+
+#include <cstdint>
+
+template<typename ImgT>
+struct image_type {
+    ImgT *ptr;
+    uint32_t pitch; // in bytes
+    uint32_t width, height; // in pixels
+};
+
+template<typename T>
+struct filter_result_type {
+    uint32_t capacity;
+    uint32_t *next_pos;
+    T *data;
+};
+
+template<typename T>
+struct kernel_bunch {
+    uint8_t border_width;
+    uint8_t kernel_length;
+    T *data; // [2*bw+1, 2*bw+1, ks]
+};
+
+struct camera_info {
+    float fx, fy; // focus length in pixel / width (height)
+    float cx, cy; // optical center in pixel / width (height)
+    float k[2]; // distort coefficients
+};
+
+#endif //REMOTEAR3_KERNEL_DEFS_CUH

+ 128 - 0
src/image_process_v3/cuda_impl/kernel_utility.cuh

@@ -0,0 +1,128 @@
+#ifndef REMOTEAR3_KERNEL_UTILITY_CUH
+#define REMOTEAR3_KERNEL_UTILITY_CUH
+
+#include "kernel_types.cuh"
+
+#include <cassert>
+#include <climits>
+#include <cstdint>
+
+template<typename T>
+struct type_max_value {
+    static constexpr T value = std::numeric_limits<T>::max();
+};
+
+template<typename T>
+struct type_min_value {
+    static constexpr T value = std::numeric_limits<T>::min();
+};
+
+template<typename T, uint32_t Len>
+struct packed_type {
+};
+
+template<>
+struct packed_type<uint8_t, 2> {
+    using type = uchar2;
+};
+
+template<>
+struct packed_type<uint8_t, 3> {
+    using type = uchar3;
+};
+
+template<>
+struct packed_type<uint8_t, 4> {
+    using type = uchar4;
+};
+
+__device__ __forceinline__ uint32_t get_ix() { // thread index x of current work
+    return blockIdx.x * blockDim.x + threadIdx.x;
+}
+
+__device__ __forceinline__ uint32_t get_iy() { // thread index x of current work
+    return blockIdx.y * blockDim.y + threadIdx.y;
+}
+
+__device__ __forceinline__ uint32_t get_id() { // thread index in current block
+    return threadIdx.y * blockDim.x + threadIdx.x;
+}
+
+__device__ __forceinline__ uint32_t get_bs() { // number of threads in a block
+    return blockDim.x * blockDim.y;
+}
+
+__device__ __forceinline__ uint32_t get_gw() { // grid width
+    return blockDim.x * gridDim.x;
+}
+
+__device__ __forceinline__ uint32_t get_gh() { // grid height
+    return blockDim.y * gridDim.y;
+}
+
+__device__ __forceinline__ uint32_t get_bx() { // start position x of block
+    return blockIdx.x * blockDim.x;
+}
+
+__device__ __forceinline__ uint32_t get_by() { // start position y of block
+    return blockIdx.y * blockDim.y;
+}
+
+template<typename T>
+__device__ __forceinline__ T clip_value(T x, T min_x, T max_x) {
+    assert(max_x > min_x);
+    if (x < min_x) [[unlikely]] {
+        return min_x;
+    } else if (x > max_x) [[unlikely]] {
+        return max_x;
+    }
+    return x;
+}
+
+template<typename T, typename U=T>
+__device__ __host__ __forceinline__ auto ceil_div(T x, U y) {
+    return (x / y) + ((x % y) != 0);
+}
+
+template<typename T>
+__device__ __forceinline__ T pow2(T x) {
+    return x * x;
+}
+
+template<typename T, typename U=T>
+__device__ __forceinline__ U image_fetch(image_type<T> img, uint32_t x, uint32_t y) {
+    auto row_ptr = (uint8_t *) img.ptr + y * img.pitch;
+    return *((U *) row_ptr + x);
+}
+
+template<typename T, typename U=T>
+__device__ __forceinline__ U image_fetch_repeat(image_type<T> img, int x, int y) {
+    x = clip_value<int>(x, 0, img.width - 1);
+    y = clip_value<int>(y, 0, img.height - 1);
+    return image_fetch(img, x, y);
+}
+
+template<typename T>
+__device__ __forceinline__ void image_store(image_type<T> img, uint32_t x, uint32_t y, T elem) {
+    auto row_ptr = (uint8_t *) img.ptr + y * img.pitch;
+    *((T *) row_ptr + x) = elem;
+}
+
+template<typename T, uint32_t Size>
+__device__ __forceinline__ void simple_copy(T *dst, const T *src) {
+    auto bs = get_bs();
+    for (auto i = get_id(); i < Size; i += bs) {
+        dst[i] = src[i];
+    }
+}
+
+template<typename T>
+__host__ dim3 calc_grid_size(image_type<T> img, uint8_t block_width) {
+    return {
+            ceil_div(img.width, block_width),
+            ceil_div(img.height, block_width),
+            1
+    };
+}
+
+#endif //REMOTEAR3_KERNEL_UTILITY_CUH

+ 508 - 0
src/image_process_v3/cuda_impl/process_kernels.cu

@@ -0,0 +1,508 @@
+#include "kernel_utility.cuh"
+#include "process_kernels.cuh"
+
+#include <cassert>
+#include <limits>
+#include <type_traits>
+
+// kernel templates
+
+template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
+__device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
+    static_assert(std::is_fundamental_v<OutT>,
+                  "Only fundamental type can be reduced.");
+    if constexpr (BlockSize >= 64) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
+    }
+    if constexpr (BlockSize >= 32) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 16]);
+    }
+    if constexpr (BlockSize >= 16) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 8]);
+    }
+    if constexpr (BlockSize >= 8) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 4]);
+    }
+    if constexpr (BlockSize >= 4) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 2]);
+    }
+    if constexpr (BlockSize >= 2) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 1]);
+    }
+}
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, uint16_t BlockSize>
+__global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
+    extern __shared__ int shmem[];
+    auto s_buf = (OutT *) shmem;
+
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = BlockSize * gridDim.x;
+
+    OutT t_out = UpdateFunc::InitVal();
+
+    // load per-thread data
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        UpdateFunc::Op(&t_out, in[i]);
+    }
+
+    // update to shared memory
+    s_buf[tdx] = t_out;
+    __syncthreads();
+
+    if constexpr (BlockSize >= 512) {
+        if (tdx < 256) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 256) {
+        if (tdx < 128) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 128) {
+        if (tdx < 64) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
+        }
+        __syncthreads();
+    }
+
+    if (tdx < 32) {
+        warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
+    }
+    if (tdx == 0) {
+        out[bkx] = s_buf[0];
+    }
+}
+
+template<typename InT, typename OutT, typename Func>
+__global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i]);
+    }
+}
+
+template<typename InT, typename OutT, typename ExtT, typename Func>
+__global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    // load extra values
+    ExtT ext = *p_ext;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i], ext);
+    }
+}
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
+void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
+                            uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    auto shmem_size = block_size * (1 + (block_size <= 32));
+    auto shmem_length = shmem_size * sizeof(OutT);
+    switch (block_size) {
+        case 512: {
+            constexpr uint16_t BlockSize = 512;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 256: {
+            constexpr uint16_t BlockSize = 256;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 128: {
+            constexpr uint16_t BlockSize = 128;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+// result resides in out[0]
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
+void call_reduce_any(InT *in, OutT *out, uint32_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    { // first step
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc>;
+        helper_func(in, out, n, block_size, grid_dim, stream);
+    }
+    { // second step
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc>;
+        helper_func(out, out, grid_dim, block_size, 1, stream);
+    }
+}
+
+// working functions
+
+template<typename T>
+struct reduce_max_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = max(*out, val);
+    }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_min_value<T>::value;
+    }
+};
+
+template<typename T>
+struct reduce_min_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = min(*out, val);
+    }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_max_value<T>::value;
+    }
+};
+
+template<typename T>
+struct reduce_sum_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = *out + val;
+    }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
+};
+
+template<typename T>
+struct update_log_sum_func {
+    static constexpr T eps = (T) 1e-6;
+
+    static __device__ __forceinline__ void Op(T *out, T val) {
+        *out += log(val + eps);
+    }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
+};
+
+template<typename InT, typename OutT>
+struct rgb_extract_v_func { // Extract V value of HSV from RGB
+    static __device__ __forceinline__ void Op(OutT *out, InT in) {
+        if constexpr (std::is_floating_point_v<OutT>) {
+            using InElemT = decltype(in.x);
+            constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
+            *out = factor * max(max(in.x, in.y), in.z);
+        } else {
+            *out = max(max(in.x, in.y), in.z);
+        }
+    }
+};
+
+struct enhance_v_func {
+    static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
+        *out = ext.norm_factor * log(in / ext.log_avg + 1);
+    }
+};
+
+template<typename ImgT>
+struct enhance_image_func {
+    static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
+        // convert RGB to HSV
+        // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
+        using ImgElemT = decltype(in.x);
+        static_assert(std::is_integral_v<ImgElemT>,
+                      "Type of image element must be integer.");
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        ImgElemT c_min = min(min(in.x, in.y), in.z);
+        ImgElemT delta = c_max - c_min;
+
+        float h; // 60 is eliminated
+        if (delta == 0) {
+            h = 0;
+        } else {
+            float delta_inv = 1.0f / delta;
+            if (c_max == in.x) { // c_max == r
+                h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
+                if (h < 0) {
+                    h += 6;
+                }
+            } else if (c_max == in.y) { // c_max == g
+                h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
+            } else { // c_max == b
+                h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 2
+            }
+
+        }
+
+        float s;
+        if (c_max == 0) {
+            s = 0;
+        } else {
+            s = (float) delta / c_max;
+        }
+
+        constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
+        float v = v_factor * (float) c_max;
+
+        // enhance V channel
+        v = ext.norm_factor * log(v / ext.log_avg + 1);
+
+        // convert HSV to RGB
+        // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
+        float c = v * s;
+        float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
+        float m = v - c;
+        float r, g, b;
+        switch ((uint8_t) h) {
+            case 0: {
+                r = c;
+                g = x;
+                b = 0;
+                break;
+            }
+            case 1: {
+                r = x;
+                g = c;
+                b = 0;
+                break;
+            }
+            case 2: {
+                r = 0;
+                g = c;
+                b = x;
+                break;
+            }
+            case 3: {
+                r = 0;
+                g = x;
+                b = c;
+                break;
+            }
+            case 4: {
+                r = x;
+                g = 0;
+                b = c;
+                break;
+            }
+            case 5: {
+                r = c;
+                g = 0;
+                b = x;
+                break;
+            }
+            default: {
+                assert(false);
+            }
+        }
+
+        constexpr float out_factor = type_max_value<ImgElemT>::value;
+        ImgT out;
+        out.x = out_factor * (r + m);
+        out.y = out_factor * (g + m);
+        out.z = out_factor * (b + m);
+
+        *p_out = out;
+    }
+};
+
+// special kernels
+
+__global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
+                                      enhance_coeff *p_out) {
+    float max_v = *p_max_v;
+    float sum_log_v = *p_sum_log_v;
+    float log_avg = exp(sum_log_v / n);
+    float norm_factor = 1.0f / (log(max_v / log_avg + 1));
+    p_out->log_avg = log_avg;
+    p_out->norm_factor = norm_factor;
+}
+
+template<typename InT, typename OutT>
+__global__ void crude_debayer(image_type<InT> in, image_type<OutT> out) {
+    uint32_t gw = get_gw(), gh = get_gh();
+    for (uint32_t iy = get_iy(); iy < out.height; iy += gh)
+        for (uint32_t ix = get_ix(); ix < out.width; ix += gw) {
+            // fetch elements
+            using FetchType = typename packed_type<InT, 2>::type;
+            auto raw_rg = image_fetch<InT, FetchType>(in, ix, iy << 1);
+            auto raw_gb = image_fetch<InT, FetchType>(in, ix, (iy << 1) | 1);
+
+            // reconstruct
+            static_assert(std::is_integral_v<InT>,
+                          "Type of image element must be one channel integer.");
+            static_assert(sizeof(InT) < sizeof(uint32_t),
+                          "Size of image element must be smaller than int.");
+            InT r = raw_rg.x;
+            InT g = ((uint32_t) raw_rg.y + (uint32_t) raw_gb.x) >> 1;
+            InT b = raw_gb.y;
+
+            // store result
+            if constexpr (std::is_same_v<OutT, typename packed_type<InT, 3>::type>) {
+                image_store(out, ix, iy, {r, g, b});
+            } else if constexpr (std::is_same_v<OutT, typename packed_type<InT, 4>::type>) {
+                constexpr InT alpha_val = type_max_value<InT>::value;
+                image_store(out, ix, iy, {r, g, b, alpha_val});
+            }
+        }
+}
+
+template<typename ImgT>
+__global__ void resample_image(cudaTextureObject_t in, image_type<ImgT> out,
+                               resample_info info, camera_info cam) {
+    uint32_t gw = get_gw(), gh = get_gh();
+    for (uint32_t iy = get_iy(); iy < out.height; iy += gh)
+        for (uint32_t ix = get_ix(); ix < out.width; ix += gw) {
+            // undistorted coordinates
+            float u = info.x + info.ps * ix;
+            float v = info.y + info.ps * iy;
+
+            // distort coordinates
+            float r2 = u * u + v * v;
+            float k = 1 + cam.k[0] * r2 + cam.k[1] * r2 * r2;
+            u *= k;
+            v *= k;
+
+            // convert to normalized pixel plane
+            u = cam.fx * u + cam.cx;
+            v = cam.fy * v + cam.cy;
+
+            // sample origin image
+            if constexpr (std::is_integral_v<ImgT>) { // Mono -> Mono
+                constexpr auto factor = type_max_value<ImgT>::value;
+                auto val = tex2D<float>(in, u, v);
+                ImgT ret = val * factor;
+                image_store(out, ix, iy, ret);
+            } else { // RGBA -> RGB
+                ImgT ret;
+                using ElemT = decltype(ret.x);
+                constexpr auto factor = type_max_value<ElemT>::value;
+                auto val = tex2D<float4>(in, u, v);
+                ret.x = factor * val.x;
+                ret.y = factor * val.y;
+                ret.z = factor * val.z;
+                image_store(out, ix, iy, ret);
+            }
+        }
+}
+
+// calling endpoints
+
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using FuncType = reduce_max_func<T>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
+    helper_func(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using FuncType = reduce_min_func<T>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
+    helper_func(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_sum(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using FuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
+    helper_func(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    using UpdateFuncType = update_log_sum_func<T>;
+    using ReduceFuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType>;
+    helper_func(in, out, n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = rgb_extract_v_func<InT, OutT>;
+    elementwise_any<InT, OutT, FuncType><<<grid_dim, block_size, 0, stream>>>(in, out, n);
+}
+
+template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream) {
+    prepare_enhance_coeff<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
+}
+
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    auto kernel_func = elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
+}
+
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = enhance_image_func<ImgT>;
+    auto kernel_func = elementwise_ext_any<ImgT, ImgT, enhance_coeff, FuncType>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
+}
+
+template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
+
+template<typename InT, typename OutT>
+void call_crude_debayer(image_type<InT> in, image_type<OutT> out,
+                        uint2 _block_size, uint2 _grid_dim, cudaStream_t stream) {
+    auto block_size = dim3{_block_size.x, _block_size.y, 1};
+    auto grid_dim = dim3{_grid_dim.x, _grid_dim.y, 1};
+    assert((void *) in.ptr != (void *) out.ptr);
+    crude_debayer<<<grid_dim, block_size, 0, stream>>>(in, out);
+}
+
+template void call_crude_debayer(image_type<uint8_t>, image_type<uchar3>, uint2, uint2, cudaStream_t);
+
+template void call_crude_debayer(image_type<uint8_t>, image_type<uchar4>, uint2, uint2, cudaStream_t);
+
+template<typename ImgT>
+void call_resample_image(cudaTextureObject_t in, image_type<ImgT> out,
+                         resample_info info, camera_info cam,
+                         uint2 _block_size, uint2 _grid_dim, cudaStream_t stream) {
+    auto block_size = dim3{_block_size.x, _block_size.y, 1};
+    auto grid_dim = dim3{_grid_dim.x, _grid_dim.y, 1};
+    resample_image<<<grid_dim, block_size, 0, stream>>>(in, out, info, cam);
+}
+
+template void call_resample_image(cudaTextureObject_t, image_type<uchar3>,
+                                  resample_info, camera_info, uint2, uint2, cudaStream_t);
+
+template void call_resample_image(cudaTextureObject_t, image_type<uint8_t>,
+                                  resample_info, camera_info, uint2, uint2, cudaStream_t);

+ 50 - 0
src/image_process_v3/cuda_impl/process_kernels.cuh

@@ -0,0 +1,50 @@
+#ifndef IMAGEHDR_PROCESS_KERNELS_CUH
+#define IMAGEHDR_PROCESS_KERNELS_CUH
+
+#include "kernel_types.cuh"
+
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+struct enhance_coeff {
+    float log_avg, norm_factor;
+};
+
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream);
+
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
+
+template<typename InT, typename OutT>
+void call_crude_debayer(image_type<InT> in, image_type<OutT> out,
+                        uint2 _block_size, uint2 _grid_dim, cudaStream_t stream);
+
+struct resample_info {
+    float x, y; // coordinates of the start point in normalized plane
+    float ps; // pixel size in normalized plane
+};
+
+template<typename ImgT>
+void call_resample_image(cudaTextureObject_t in, image_type<ImgT> out,
+                         resample_info info, camera_info cam,
+                         uint2 _block_size, uint2 _grid_dim, cudaStream_t stream);
+
+#endif //IMAGEHDR_PROCESS_KERNELS_CUH

+ 300 - 0
src/image_process_v3/cuda_impl/vis_marker_kernels.cu

@@ -0,0 +1,300 @@
+#include "kernel_utility.cuh"
+#include "vis_marker_kernels.cuh"
+
+// p_ker pointer to kernel values
+// p_cnt pointer to positive elements in each kernel
+template<typename InT, typename OutT, uint8_t BlockW, uint8_t KBor, uint8_t KNum>
+__global__ void corner_score(image_type<InT> in, image_type<OutT> out,
+                             int8_t *p_ker, uint16_t *p_cnt) {
+    // copy kernels
+    static constexpr auto KLen = 2 * KBor + 1;
+    __shared__ int8_t ker_loc[KLen][KLen][KNum];
+    __shared__ uint16_t cnt_loc[KNum];
+    simple_copy<int8_t, KLen * KLen * KNum>((int8_t *) ker_loc, p_ker);
+    simple_copy<uint16_t, KNum>(cnt_loc, p_cnt);
+
+    // copy related pixels
+    static constexpr auto RefW = BlockW + 2 * KBor;
+    __shared__ float ref_img[RefW][RefW];
+    int ref_sx = get_bx() - KBor, ref_sy = get_by() - KBor;
+    for (auto j = threadIdx.y; j < RefW; j += BlockW)
+        for (auto i = threadIdx.x; i < RefW; i += BlockW) {
+            if constexpr (std::is_integral_v<InT>) { // normalize if needed
+                ref_img[j][i] = 1.0f * image_fetch_repeat(in, i + ref_sx, j + ref_sy)
+                                / type_max_value<InT>::value;
+            } else {
+                ref_img[j][i] = image_fetch_border(in, i + ref_sx, j + ref_sy);
+            }
+        }
+    __syncthreads();
+
+    auto x = get_ix(), y = get_iy();
+
+    // calculate patch mean
+    float patch_mean = 0;
+    for (int j = 0; j <= 2 * KBor; ++j) {
+        auto rj = j - KBor + y - ref_sy;
+        for (int i = 0; i <= 2 * KBor; ++i) {
+            auto ri = i - KBor + x - ref_sx;
+            patch_mean += ref_img[rj][ri];
+        }
+    }
+    static constexpr auto PatchW = 2 * KBor + 1;
+    static constexpr auto MeanThreshold = 0.01f; // TODO
+    patch_mean /= PatchW * PatchW;
+    if (patch_mean < MeanThreshold) [[unlikely]] {
+        image_store<OutT>(out, x, y, 0);
+        return;
+    }
+    auto patch_norm_factor = 0.5f / patch_mean;
+
+    // ind1, ind2 = (mask == 1), (mask == -1)
+    // m1, m2 = np.mean(patch[ind1]), np.mean(patch[ind2])
+    float mean_pos[KNum]{}, mean_neg[KNum]{};
+    for (int j = 0; j <= 2 * KBor; ++j) {
+        auto rj = j - KBor + y - ref_sy;
+        for (int i = 0; i <= 2 * KBor; ++i) {
+            auto ri = i - KBor + x - ref_sx;
+            for (auto k = 0; k < KNum; ++k) {
+                auto ker_val = ker_loc[j][i][k];
+                // patch = np.clip(patch/mean*0.5, 0, 1)
+                auto pix_val = clip_value<float>(ref_img[rj][ri] * patch_norm_factor, 0, 1);
+                if (ker_val > 0) {
+                    mean_pos[k] += pix_val;
+                } else if (ker_val < 0) {
+                    mean_neg[k] += pix_val;
+                }
+            }
+        }
+    }
+    for (auto k = 0; k < KNum; ++k) {
+        mean_pos[k] /= cnt_loc[k];
+        mean_neg[k] /= cnt_loc[k];
+    }
+
+    // s1, s2 = np.std(patch[ind1]), np.std(patch[ind2])
+    float std_pos[KNum]{}, std_neg[KNum]{};
+    for (int j = 0; j <= 2 * KBor; ++j) {
+        auto rj = j - KBor + y - ref_sy;
+        for (int i = 0; i <= 2 * KBor; ++i) {
+            auto ri = i - KBor + x - ref_sx;
+            for (auto k = 0; k < KNum; ++k) {
+                auto ker_val = ker_loc[j][i][k];
+                auto pix_val = clip_value<float>(ref_img[rj][ri] * patch_norm_factor, 0, 1);
+                if (ker_val > 0) {
+                    std_pos[k] += pow2(pix_val - mean_pos[k]);
+                } else if (ker_val < 0) {
+                    std_neg[k] += pow2(pix_val - mean_neg[k]);
+                }
+            }
+        }
+    }
+    for (auto k = 0; k < KNum; ++k) {
+        std_pos[k] = sqrt(std_pos[k] / cnt_loc[k]);
+        std_neg[k] = sqrt(std_neg[k] / cnt_loc[k]);
+    }
+
+    // s_in = (1-2*s1)*(1-2*s2)
+    // s_in = np.power(s_in, alpha1)
+    static constexpr auto IntraAlpha = 2;
+    float intra_score[KNum];
+    for (auto k = 0; k < KNum; ++k) {
+        intra_score[k] = (1 - 2 * std_pos[k])
+                         * (1 - 2 * std_neg[k]);
+        intra_score[k] = pow(intra_score[k], IntraAlpha);
+    }
+
+    // m1, m2 = max(m1, m2), min(m1, m2)
+    // s_out = 2*(m1/(m1+m2)-0.5)
+    static constexpr auto InterAlpha = 1;
+    float inter_score[KNum];
+    for (auto k = 0; k < KNum; ++k) {
+        auto mean_sum = mean_pos[k] + mean_neg[k];
+        if (mean_sum < MeanThreshold) {
+            image_store<OutT>(out, x, y, 0);
+            return;
+        }
+        auto mean_max = max(mean_neg[k], mean_pos[k]);
+        inter_score[k] = 2 * (mean_max / mean_sum - 0.5);
+        inter_score[k] = pow(inter_score[k], InterAlpha);
+    }
+
+    // return s_in*s_out {
+    // ret = max(ret, masked_score(patch, masks[k]))
+    float max_score = 0;
+    for (auto k = 0; k < KNum; ++k) {
+        auto cur_score = intra_score[k] * inter_score[k];
+        max_score = max(max_score, cur_score);
+    }
+
+    // store result
+    if constexpr (std::is_integral_v<OutT>) { // de-normalize if needed
+        image_store<OutT>(out, x, y, max_score * type_max_value<OutT>::value);
+    } else {
+        image_store(out, x, y, max_score);
+    }
+}
+
+// local maximum filter
+template<typename T, uint8_t BlockW, uint8_t KBor>
+__global__ void lm_filter(image_type<T> in, filter_result_type<ushort2> out, T threshold) {
+    // copy related pixels
+    static constexpr auto RefW = BlockW + 2 * KBor;
+    __shared__ float ref_img[RefW][RefW];
+    int ref_sx = get_bx() - KBor, ref_sy = get_by() - KBor;
+    for (auto j = threadIdx.y; j < RefW; j += BlockW)
+        for (auto i = threadIdx.x; i < RefW; i += BlockW) {
+            ref_img[j][i] = image_fetch_repeat(in, i + ref_sx, j + ref_sy);
+        }
+    __syncthreads();
+
+    // score *= (score == maximum_filter(score, footprint=np.ones((3, 3))))
+    auto x = get_ix(), y = get_iy();
+    auto pix_val = ref_img[y - ref_sy][x - ref_sx];
+    if (pix_val < threshold) return;
+    for (int j = 0; j <= 2 * KBor; ++j) {
+        auto rj = j - KBor + y - ref_sy;
+        for (int i = 0; i <= 2 * KBor; ++i) {
+            auto ri = i - KBor + x - ref_sx;
+            auto ref_val = ref_img[rj][ri];
+            if (ref_val >= pix_val) {
+                if (ref_val == pix_val) [[unlikely]] {
+                    if (j >= KBor && i >= KBor) continue;
+                }
+                return;
+            }
+        }
+    }
+
+    // current pixel is the local maximum
+    auto cur_pos = atomicAdd(out.next_pos, 1);
+    if (cur_pos < out.capacity) {
+        out.data[cur_pos] = {(uint16_t) x, (uint16_t) y};
+    }
+}
+
+template<typename InT, typename OutT, uint8_t BlockW, uint8_t KBor, uint8_t KNum>
+void call_corner_score_impl_3(image_type<InT> in, image_type<OutT> out,
+                              kernel_bunch<int8_t> ker, uint16_t *ker_weight, cudaStream_t stream) {
+    assert(ker.border_width == KBor);
+    assert(ker.kernel_length == KNum);
+    auto block_size = dim3{BlockW, BlockW, 1};
+    auto grid_dim = calc_grid_size(in, BlockW);
+    auto KernelFunc = corner_score<InT, OutT, BlockW, KBor, KNum>;
+    KernelFunc<<<grid_dim, block_size, 0, stream>>>(in, out, ker.data, ker_weight);
+}
+
+template<typename InT, typename OutT, uint8_t BlockW, uint8_t KBor>
+void call_corner_score_impl_2(image_type<InT> in, image_type<OutT> out,
+                              kernel_bunch<int8_t> ker, uint16_t *ker_weight, cudaStream_t stream) {
+    assert(ker.border_width == KBor);
+    switch (ker.kernel_length) {
+        case 6: {
+            constexpr uint8_t KNum = 6;
+            call_corner_score_impl_3<InT, OutT, BlockW, KBor, KNum>(in, out, ker, ker_weight, stream);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+template<typename InT, typename OutT, uint8_t BlockW>
+void call_corner_score_impl_1(image_type<InT> in, image_type<OutT> out,
+                              kernel_bunch<int8_t> ker, uint16_t *ker_weight, cudaStream_t stream) {
+    switch (ker.border_width) {
+        case 5: {
+            constexpr uint8_t KBor = 5;
+            call_corner_score_impl_2<InT, OutT, BlockW, KBor>(in, out, ker, ker_weight, stream);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+template<typename InT, typename OutT>
+void call_corner_score(image_type<InT> in, image_type<OutT> out,
+                       kernel_bunch<int8_t> ker, uint16_t *ker_weight,
+                       uint8_t block_width, cudaStream_t stream) {
+    switch (block_width) {
+        case 16: {
+            constexpr uint8_t BlockW = 16;
+            call_corner_score_impl_1<InT, OutT, BlockW>(in, out, ker, ker_weight, stream);
+            return;
+        }
+        case 8: {
+            constexpr uint8_t BlockW = 8;
+            call_corner_score_impl_1<InT, OutT, BlockW>(in, out, ker, ker_weight, stream);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+template void call_corner_score(image_type<uint8_t>, image_type<uint8_t>,
+                                kernel_bunch<int8_t>, uint16_t *, uint8_t, cudaStream_t);
+
+template void call_corner_score(image_type<uint16_t>, image_type<uint8_t>,
+                                kernel_bunch<int8_t>, uint16_t *, uint8_t, cudaStream_t);
+
+template<typename T, uint8_t BlockW, uint8_t KBor>
+void call_lm_filter_impl_2(image_type<T> in, filter_result_type<ushort2> ret,
+                           float threshold, cudaStream_t stream) {
+    auto block_size = dim3{BlockW, BlockW, 1};
+    auto grid_dim = calc_grid_size(in, BlockW);
+    auto KernelFunc = lm_filter<T, BlockW, KBor>;
+    if constexpr (std::is_integral_v<T>) {
+        KernelFunc<<<grid_dim, block_size, 0, stream>>>(
+                in, ret, threshold * std::numeric_limits<T>::max());
+    } else {
+        KernelFunc<<<grid_dim, block_size, 0, stream>>>(in, ret, threshold);
+    }
+}
+
+template<typename T, uint8_t BlockW>
+void call_lm_filter_impl_1(image_type<T> in, filter_result_type<ushort2> ret, float threshold,
+                           uint8_t border_width, cudaStream_t stream) {
+    switch (border_width) {
+        case 1: {
+            constexpr uint8_t KBor = 1;
+            call_lm_filter_impl_2<T, BlockW, KBor>(in, ret, threshold, stream);
+            return;
+        }
+        case 2: {
+            constexpr uint8_t KBor = 2;
+            call_lm_filter_impl_2<T, BlockW, KBor>(in, ret, threshold, stream);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+template<typename T>
+void call_lm_filter(image_type<T> in, filter_result_type<ushort2> ret, float threshold,
+                    uint8_t border_width, uint8_t block_width, cudaStream_t stream) {
+    switch (block_width) {
+        case 16: {
+            constexpr uint8_t BlockW = 16;
+            call_lm_filter_impl_1<T, BlockW>(in, ret, threshold, border_width, stream);
+            return;
+        }
+        case 8: {
+            constexpr uint8_t BlockW = 8;
+            call_lm_filter_impl_1<T, BlockW>(in, ret, threshold, border_width, stream);
+            return;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+template void call_lm_filter(image_type<uint8_t>, filter_result_type<ushort2>, float,
+                             uint8_t, uint8_t, cudaStream_t);

+ 16 - 0
src/image_process_v3/cuda_impl/vis_marker_kernels.cuh

@@ -0,0 +1,16 @@
+#ifndef REMOTEAR3_VIS_MARKER_KERNELS_CUH
+#define REMOTEAR3_VIS_MARKER_KERNELS_CUH
+
+#include "kernel_types.cuh"
+
+template<typename InT, typename OutT>
+void call_corner_score(image_type<InT> in, image_type<OutT> out,
+                       kernel_bunch<int8_t> ker, uint16_t *ker_weight,
+                       uint8_t block_width, cudaStream_t stream);
+
+// threshold in [0, 1]
+template<typename T>
+void call_lm_filter(image_type<T> in, filter_result_type<ushort2> ret, float threshold,
+                    uint8_t border_width, uint8_t block_width, cudaStream_t stream);
+
+#endif //REMOTEAR3_VIS_MARKER_KERNELS_CUH

+ 442 - 0
src/image_process_v3/image_process.cpp

@@ -0,0 +1,442 @@
+#include "image_process.h"
+#include "core/cuda_helper.hpp"
+#include "core/image_utility.hpp"
+#include "core/memory_pool.h"
+#include "cuda_impl/process_kernels.cuh"
+
+#include <opencv2/cudaimgproc.hpp>
+
+#include <boost/noncopyable.hpp>
+
+namespace process_impl {
+
+    template<typename T>
+    struct smart_buffer : private boost::noncopyable {
+        static_assert(std::is_trivial_v<T>);
+
+        T *ptr = nullptr;
+        size_t length = 0;
+
+        smart_buffer() = default;
+
+        template<typename U=T>
+        smart_buffer(const smart_buffer<U> &other) = delete;
+
+        ~smart_buffer() {
+            MEM_DEALLOC(ptr);
+        }
+
+        void create(size_t req_length) {
+            if (req_length > capacity) [[unlikely]] {
+                MEM_DEALLOC(ptr);
+                MEM_ALLOC(T, req_length, MEM_HOST);
+                capacity = req_length;
+            }
+            length = req_length;
+        }
+
+        size_t size() const {
+            return length * sizeof(T);
+        }
+
+    private:
+        size_t capacity = 0;
+    };
+
+    template<typename T>
+    struct smart_gpu_buffer : private boost::noncopyable {
+        T *ptr = nullptr;
+        size_t size = 0;
+
+        smart_gpu_buffer() = default;
+
+        template<typename U>
+        smart_gpu_buffer(const smart_gpu_buffer<T> &other) = delete;
+
+        ~smart_gpu_buffer() {
+            deallocate();
+        }
+
+        void create(size_t req_size) {
+            if (req_size > capacity) [[unlikely]] {
+                deallocate();
+                ptr = MEM_ALLOC(T, req_size, MEM_CUDA);
+                capacity = req_size;
+            }
+            size = req_size;
+        }
+
+        template<typename U=T>
+        void upload_from(const smart_buffer<U> &buf, cudaStream_t stream = nullptr) {
+            assert(buf.length * sizeof(U) % sizeof(T) == 0);
+            create(buf.length * sizeof(U) / sizeof(T));
+            if (stream == nullptr) {
+                CUDA_API_CHECK(cudaMemcpy(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice));
+            } else {
+                CUDA_API_CHECK(cudaMemcpyAsync(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice, stream));
+            }
+        }
+
+        template<typename U=T>
+        void upload_from(const U *src_ptr, size_t src_size, cudaStream_t stream = nullptr) {
+            assert(src_size * sizeof(U) % sizeof(T) == 0);
+            create(src_size * sizeof(U) / sizeof(T));
+            if (stream == nullptr) {
+                CUDA_API_CHECK(cudaMemcpy(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice));
+            } else {
+                CUDA_API_CHECK(cudaMemcpyAsync(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice, stream));
+            }
+        }
+
+        template<typename U=T>
+        void download_to(smart_buffer<U> *buf, cudaStream_t stream = nullptr) {
+            assert(size * sizeof(T) % sizeof(U) == 0);
+            buf->create(size * sizeof(T) / sizeof(U));
+            if (stream == nullptr) {
+                CUDA_API_CHECK(cudaMemcpy(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost));
+            } else {
+                CUDA_API_CHECK(cudaMemcpyAsync(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
+            }
+        }
+
+    private:
+        size_t capacity = 0;
+
+        void deallocate() {
+            if (ptr == nullptr) return;
+            MEM_DEALLOC(ptr);
+            ptr = nullptr;
+        }
+    };
+
+    struct smart_cuda_texture {
+        cudaTextureObject_t obj = 0;
+        int mat_type = -1;
+
+        ~smart_cuda_texture() {
+            deallocate();
+        }
+
+        smart_cuda_texture() = default;
+
+        smart_cuda_texture(const smart_cuda_texture &other) = delete;
+
+        void create(const cv::cuda::GpuMat &mat) {
+            if (last_ptr != mat.cudaPtr()) [[unlikely]] {
+                deallocate();
+                allocate(mat);
+            }
+        }
+
+    private:
+        void *last_ptr = nullptr;
+
+        void allocate(const cv::cuda::GpuMat &mat) {
+            auto res_desc = cudaResourceDesc{};
+            res_desc.resType = cudaResourceTypePitch2D;
+            res_desc.res.pitch2D.devPtr = mat.cudaPtr();
+            res_desc.res.pitch2D.width = mat.cols;
+            res_desc.res.pitch2D.height = mat.rows;
+            res_desc.res.pitch2D.pitchInBytes = mat.step;
+
+            auto tex_desc = cudaTextureDesc{};
+            tex_desc.addressMode[0] = cudaAddressModeClamp;
+            tex_desc.addressMode[1] = cudaAddressModeClamp;
+            tex_desc.filterMode = cudaFilterModeLinear;
+            tex_desc.readMode = cudaReadModeNormalizedFloat;
+            tex_desc.normalizedCoords = true;
+
+            mat_type = mat.type();
+            switch (mat_type) {
+                case CV_8UC1: {
+                    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
+                    break;
+                }
+                case CV_8UC4: {
+                    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uchar4>();
+                    break;
+                }
+                default: {
+                    RET_ERROR;
+                }
+            }
+
+            assert(obj == 0);
+            CUDA_API_CHECK(cudaCreateTextureObject(&obj, &res_desc, &tex_desc, nullptr));
+            last_ptr = mat.cudaPtr();
+        }
+
+        void deallocate() {
+            if (obj == 0) return;
+            CUDA_API_CHECK(cudaDestroyTextureObject(obj));
+            last_ptr = nullptr;
+            obj = 0;
+        }
+    };
+
+    camera_info to_camera_info(const camera_intrinsic &cam) {
+        camera_info ret{};
+        ret.fx = cam.fx / cam.width;
+        ret.fy = cam.fy / cam.height;
+        ret.cx = cam.cx / cam.width;
+        ret.cy = cam.cy / cam.height;
+        ret.k[0] = cam.k[0];
+        ret.k[1] = cam.k[1];
+        return ret;
+    }
+
+    void opencv_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
+        switch (in.type()) {
+            case CV_8UC1: {
+                cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2BGR, 3, stream);
+                return;
+            }
+        }
+        unreachable();
+    }
+
+    void opencv_gray2rgb(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
+        switch (in.type()) {
+            case CV_8UC1: {
+                cv::cuda::cvtColor(in, *out, cv::COLOR_GRAY2BGR, 3, stream);
+                return;
+            }
+        }
+        unreachable();
+    }
+
+    template<typename T>
+    image_type<T> to_image_type(const cv::cuda::GpuMat &mat) {
+        assert(sizeof(T) == CV_ELEM_SIZE(mat.type()));
+        auto ret = image_type<T>();
+        ret.ptr = (T *) mat.cudaPtr();
+        ret.pitch = mat.step;
+        ret.width = mat.cols;
+        ret.height = mat.rows;
+        return ret;
+    }
+
+    template<typename T>
+    void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
+        assert(in.elemSize() == sizeof(T));
+        out->create(in.size().area());
+        auto flatten_pitch = in.cols * in.elemSize();
+        CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, flatten_pitch, in.cudaPtr(), in.step,
+                                         flatten_pitch, in.size().height, cudaMemcpyDeviceToDevice, stream));
+    }
+
+    template<typename T>
+    void unflatten(const smart_gpu_buffer<T> &in, cv::cuda::GpuMat *out,
+                   cv::Size size, int type, cudaStream_t stream) {
+        assert(sizeof(T) == CV_ELEM_SIZE(type));
+        assert(in.size == size.area());
+        out->create(size, type);
+        auto flatten_pitch = out->cols * out->elemSize();
+        CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step, in.ptr, flatten_pitch,
+                                         flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream));
+    }
+
+    void crude_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
+                       bool alpha, cudaStream_t stream) {
+        constexpr uint2 block_size = {32, 4};
+        constexpr uint2 grid_dim = {8, 128};
+        auto out_size = cv::Size{in.cols >> 1, in.rows >> 1};
+        switch (in.type()) {
+            case CV_8UC1: {
+                if (alpha) {
+                    out->create(out_size, CV_8UC4);
+                    call_crude_debayer(to_image_type<uint8_t>(in),
+                                       to_image_type<uchar4>(*out),
+                                       block_size, grid_dim, stream);
+                } else {
+                    out->create(out_size, CV_8UC3);
+                    call_crude_debayer(to_image_type<uint8_t>(in),
+                                       to_image_type<uchar3>(*out),
+                                       block_size, grid_dim, stream);
+                }
+                return;
+            }
+            default: {
+                RET_ERROR;
+            }
+        }
+    }
+
+    // pixel coordinate to undistorted normalized plane
+    cv::Point2f undistort_point(const camera_intrinsic &info, cv::Point2f p) {
+        auto u = (p.x - info.cx) / info.fx;
+        auto v = (p.y - info.cy) / info.fy;
+        auto r0 = sqrtf(u * u + v * v);
+
+        // Newton's Method
+        constexpr auto SOLVE_ITERATION_CNT = 4;
+        auto r = r0;
+        for (auto k = 0; k < SOLVE_ITERATION_CNT; ++k) {
+            auto r2 = r * r;
+            auto r3 = r2 * r;
+            auto r4 = r3 * r;
+            auto r5 = r4 * r;
+            r -= (info.k[1] * r5 + info.k[0] * r3 + r - r0) /
+                 (5 * info.k[1] * r4 + 3 * info.k[0] * r2 + 1);
+        }
+
+        auto factor = r / r0;
+        u *= factor;
+        v *= factor;
+        return {u, v};
+    }
+
+    void resample_image(const smart_cuda_texture &in, cv::cuda::GpuMat *out,
+                        cv::Size2f range, const camera_intrinsic &cam, uint32_t height, cudaStream_t stream) {
+        constexpr uint2 block_size = {32, 4};
+        constexpr uint2 grid_dim = {8, 128};
+        float ps = 2 * range.height / height;
+        uint32_t width = 2 * range.width / ps;
+        width = (width + 3) & (-4); // make OpenGL happy
+        resample_info info{};
+        info.x = -range.width;
+        info.y = -range.height;
+        info.ps = ps;
+        switch (in.mat_type) {
+            case CV_8UC1: {
+                out->create(height, width, CV_8UC1);
+                call_resample_image(in.obj, to_image_type<uint8_t>(*out), info,
+                                    to_camera_info(cam), block_size, grid_dim, stream);
+                return;
+            }
+            case CV_8UC4: {
+                out->create(height, width, CV_8UC3);
+                call_resample_image(in.obj, to_image_type<uchar3>(*out), info,
+                                    to_camera_info(cam), block_size, grid_dim, stream);
+                return;
+            }
+            default: {
+                RET_ERROR;
+            }
+        }
+    }
+
+}
+
+using namespace process_impl;
+
+struct monocular_processor::impl {
+    cv::cuda::GpuMat rgba_dev;
+    cv::cuda::GpuMat resample_dev;
+    cv::cuda::GpuMat ugly_out; // TODO: ugly hack
+    smart_cuda_texture resample_tex;
+    smart_gpu_buffer<uchar3> rgb_f;
+    smart_gpu_buffer<float> hsv_v_f;
+    smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;
+    smart_gpu_buffer<enhance_coeff> enhance_ext;
+
+    void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
+        assert(in.type() == CV_8UC3);
+
+        // flatten image into a line
+        flatten(in, &rgb_f, stream);
+        auto line_size = rgb_f.size;
+
+        // extract V channel of HSV
+        constexpr auto block_size = 256;
+        constexpr auto grid_dim = 512;
+        hsv_v_f.create(rgb_f.size);
+        call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, line_size,
+                           block_size, grid_dim, stream);
+
+        // reduce enhance coefficients
+        hsv_v_max.create(grid_dim);
+        call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, line_size,
+                        block_size, grid_dim, stream);
+        hsv_v_sum_log.create(grid_dim);
+        call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, line_size,
+                            block_size, grid_dim, stream);
+
+        // prepare enhance coefficients
+        enhance_ext.create(1);
+        call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr,
+                                   line_size, enhance_ext.ptr, stream);
+
+        // enhance image
+        call_enhance_image(rgb_f.ptr, rgb_f.ptr, line_size, enhance_ext.ptr,
+                           block_size, grid_dim, stream);
+
+        // unflatten image
+        unflatten(rgb_f, out, in.size(), CV_8UC3, stream);
+    }
+
+    image_u8c3 process(const image_u8c1 &in, process_config conf) {
+        auto cuda_stream = conf.stream->cuda;
+        auto cv_stream = conf.stream->cv;
+        auto in_mat = in->as_cuda(conf.stream);
+        if (conf.is_mono) {
+
+            // undistort
+            if (conf.undistort) {
+                resample_tex.create(in_mat);
+                resample_image(resample_tex, &resample_dev, conf.valid_range,
+                               conf.camera, conf.resample_height, cuda_stream);
+            } else {
+                resample_dev = in_mat;
+            }
+
+            // Mono -> RGB
+            opencv_gray2rgb(resample_dev, &ugly_out, cv_stream);
+
+        } else {
+
+            // debayer
+            if (conf.crude_debayer) {
+                if (conf.undistort) {
+                    crude_debayer(in_mat, &rgba_dev, true, cuda_stream);
+                } else {
+                    crude_debayer(in_mat, &ugly_out, false, cuda_stream);
+                }
+            } else {
+                assert(!conf.undistort);
+                opencv_debayer(in_mat, &ugly_out, cv_stream);
+            }
+
+            // undistort
+            if (conf.undistort) {
+                assert(conf.crude_debayer);
+                resample_tex.create(rgba_dev);
+                resample_image(resample_tex, &ugly_out, conf.valid_range,
+                               conf.camera, conf.resample_height, cuda_stream);
+            }
+        }
+
+        // enhance image
+        if (conf.enhance) {
+            enhance_image(ugly_out, &ugly_out, cuda_stream);
+        }
+
+        auto out_info = create_image_info<uchar3>(ugly_out.size(), MEM_CUDA);
+        out_info.fill_from_async(ugly_out, conf.stream);
+        return create_image(out_info);
+    }
+};
+
+monocular_processor::monocular_processor()
+        : pimpl(std::make_unique<impl>()) {}
+
+monocular_processor::~monocular_processor() = default;
+
+image_u8c3 monocular_processor::process(const image_u8c1 &in, process_config conf) {
+    return pimpl->process(in, conf);
+}
+
+cv::Size2f calc_valid_range(const camera_intrinsic &left, const camera_intrinsic &right, float *angle) {
+    auto u_lim = std::min({-undistort_point(left, {0, left.cy}).x,
+                           undistort_point(left, {(float) left.width, left.cy}).x,
+                           -undistort_point(right, {0, right.cy}).x,
+                           undistort_point(right, {(float) right.width, right.cy}).x});
+    auto v_lim = std::min({-undistort_point(left, {left.cx, 0}).y,
+                           undistort_point(left, {left.cx, (float) left.height}).y,
+                           -undistort_point(right, {right.cx, 0}).y,
+                           undistort_point(right, {right.cx, (float) right.height}).y});
+    if (angle != nullptr) {
+        *angle = 2 * atanf(v_lim);
+    }
+    return {u_lim, v_lim};
+}

+ 42 - 0
src/image_process_v3/image_process.h

@@ -0,0 +1,42 @@
+#ifndef REMOTEAR3_IMAGE_PROCESS_H
+#define REMOTEAR3_IMAGE_PROCESS_H
+
+#include "core/image_utility.hpp"
+#include "image_process/camera_utility.hpp"
+
+#include <memory>
+
+// return 1/4 part of valid ranges
+// angle: view angle
+cv::Size2f calc_valid_range(const camera_intrinsic &left,
+                            const camera_intrinsic &right,
+                            float *angle = nullptr);
+
+class monocular_processor {
+public:
+
+    monocular_processor();
+
+    ~monocular_processor();
+
+    struct process_config {
+        bool is_mono = false;
+        bool crude_debayer = true;
+        bool enhance = false;
+
+        bool undistort = true;
+        cv::Size2f valid_range; // used for undistort
+        camera_intrinsic camera; // used for undistort
+        uint32_t resample_height; // used for undistort
+
+        smart_cuda_stream *stream = nullptr;
+    };
+
+    image_u8c3 process(const image_u8c1 &in, process_config conf);
+
+private:
+    struct impl;
+    std::unique_ptr<impl> pimpl;
+};
+
+#endif //REMOTEAR3_IMAGE_PROCESS_H

+ 3 - 0
src/impl/app_base.h

@@ -3,6 +3,8 @@
 
 #include "core/cuda_helper.hpp"
 
+#include <yaml-cpp/yaml.h>
+
 #include <boost/asio/io_context.hpp>
 
 class app_base {
@@ -13,6 +15,7 @@ public:
     struct create_config {
         io_context *asio_ctx = nullptr;
         CUcontext *cuda_ctx = nullptr;
+        YAML::Node ext_conf;
     };
 
     virtual ~app_base() = default;

+ 62 - 0
src/impl/apps/app_selector/app_selector.cpp

@@ -0,0 +1,62 @@
+#include "app_selector.h"
+#include "impl/apps/depth_guide/depth_guide.h"
+#include "impl/apps/remote_ar/remote_ar.h"
+
+#include <GLFW/glfw3.h>
+
+#include <ImGuiFileDialog.h>
+
+#include <boost/asio/post.hpp>
+
+using boost::asio::post;
+
+app_selector::app_selector(const create_config &_conf) {
+    conf = _conf;
+    auto dialog_conf = IGFD::FileDialogConfig();
+    dialog_conf.flags |= ImGuiFileDialogFlags_DisableCreateDirectoryButton;
+    dialog_conf.flags |= ImGuiFileDialogFlags_HideColumnType;
+    dialog_conf.flags |= ImGuiFileDialogFlags_ReadOnlyFileNameField;
+    dialog_conf.flags |= ImGuiFileDialogFlags_CaseInsensitiveExtention;
+    dialog_conf.path = "/home/tpx/project/DepthGuide/data"; // TODO: remember last value
+    dialog->OpenDialog(dialog_name, "Choose YAML file",
+                       "YAML files{.yaml,.yml}", dialog_conf);
+}
+
+app_selector::~app_selector() {
+    dialog->Close();
+}
+
+void app_selector::show_ui() {
+    if (dialog->Display(dialog_name)) {
+        if (dialog->IsOk()) {
+            load_app(dialog->GetFilePathName());
+        }
+    }
+}
+
+void app_selector::load_app(const std::string &conf_path) {
+    auto app_conf = YAML::LoadFile(conf_path);
+    SPDLOG_INFO("Load application from {}", conf_path);
+    auto app_name = app_conf["app_name"].as<std::string>();
+    auto create_conf = base_config{
+            .asio_ctx = conf.asio_ctx,
+            .cuda_ctx = conf.cuda_ctx,
+            .ext_conf = app_conf,
+    };
+
+    auto app = std::unique_ptr<app_base>();
+    if (app_name == "depth_guide") {
+        app = std::make_unique<app_depth_guide>(create_conf);
+    } else if (app_name == "remote_ar") {
+        app = std::make_unique<app_remote_ar>(create_conf);
+    }
+
+    // replace application
+    assert(app != nullptr);
+    auto app_ptr = conf.app_ptr;
+    auto window = conf.window;
+    post(*conf.asio_ctx, [=, app = std::move(app)]() mutable {
+        glfwSetWindowTitle(window, app->window_name());
+        *app_ptr = std::move(app);
+    });
+}

+ 39 - 0
src/impl/apps/app_selector/app_selector.h

@@ -0,0 +1,39 @@
+#ifndef DEPTHGUIDE_APP_SELECTOR_H
+#define DEPTHGUIDE_APP_SELECTOR_H
+
+#include "impl/app_base.h"
+
+#include <ImGuiFileDialog.h>
+
+class GLFWwindow;
+
+class app_selector : public app_base {
+public:
+
+    using base_config = app_base::create_config;
+    struct create_config : base_config {
+        std::unique_ptr<app_base> *app_ptr = nullptr;
+        GLFWwindow *window = nullptr;
+    };
+
+    explicit app_selector(const create_config &conf);
+
+    ~app_selector() override;
+
+    const char *window_name() override { return "Please select application YAML file"; }
+
+    void show_ui() override;
+
+    void render_background() override {}
+
+    void load_app(const std::string &conf_path);
+
+private:
+    create_config conf;
+
+    IGFD::FileDialog *dialog = ImGuiFileDialog::Instance();
+    const char *dialog_name = "ChooseConfigFile";
+};
+
+
+#endif //DEPTHGUIDE_APP_SELECTOR_H

+ 93 - 0
src/impl/apps/remote_ar/remote_ar.cpp

@@ -0,0 +1,93 @@
+#include "remote_ar.h"
+#include "core/imgui_utility.hpp"
+#include "core/yaml_utility.hpp"
+
+app_remote_ar::app_remote_ar(const create_config &_conf) {
+    conf = _conf.ext_conf;
+    asio_ctx = _conf.asio_ctx;
+    cuda_ctx = _conf.cuda_ctx;
+
+    // initialize object manager
+    OBJ_SAVE(raw_left, image_u8c1());
+    OBJ_SAVE(raw_right, image_u8c1());
+    OBJ_SAVE(rgb_left, image_u8c3());
+    OBJ_SAVE(rgb_right, image_u8c3());
+
+    // process callbacks caused by OBJ_SAVE
+    asio_ctx->poll();
+
+    // initialize modules
+    auto mvs_conf = mvs_camera_ui::create_config{.ctx = asio_ctx};
+    mvs_conf.cameras.push_back({.dev_name = LOAD_STR("left_camera_name"), .img_name = raw_left});
+    mvs_conf.cameras.push_back({.dev_name = LOAD_STR("right_camera_name"), .img_name = raw_right});
+    mvs_cam = std::make_unique<mvs_camera_ui>(mvs_conf);
+
+    auto stereo_info = stereo_camera_info::from_yaml(LOAD_SUB("stereo_info"));
+    float view_angle = 0.0f;
+    auto img_range = calc_valid_range(stereo_info.left, stereo_info.right, &view_angle);
+
+    auto cam_left_conf = image_process_ui::create_config{
+            .in_name = raw_left, .out_name = rgb_left, .stream = &cam_left.stream
+    };
+    cam_left_conf.dev_info = {.valid_range = img_range, .cam_int = stereo_info.left};
+    cam_left.img_proc = std::make_unique<image_process_ui>(cam_left_conf);
+
+    auto cam_right_conf = image_process_ui::create_config{
+            .in_name = raw_right, .out_name = rgb_right, .stream = &cam_right.stream
+    };
+    cam_right_conf.dev_info = {.valid_range = img_range, .cam_int = stereo_info.right};
+    cam_right.img_proc = std::make_unique<image_process_ui>(cam_right_conf);
+    cam_right.img_proc->sync_with(cam_left.img_proc.get());
+
+    mvs_cam->cap_info_sig.connect([this](auto info) {
+        cam_left.img_proc->change_config({.is_mono = info.is_mono});
+        cam_right.img_proc->change_config({.is_mono = info.is_mono});
+    });
+
+    auto bg_viewer_conf = image_viewer::create_config{
+            .mode = VIEW_STEREO, .flip_y = true,
+            .stream = default_cuda_stream,
+    };
+    auto &bg_extra_conf = bg_viewer_conf.extra.stereo;
+    bg_extra_conf.left_name = rgb_left;
+    bg_extra_conf.right_name = rgb_right;
+    bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
+}
+
+void app_remote_ar::show_ui() {
+    if (ImGui::Begin("Remote AR Control")) {
+        ImGui::PushItemWidth(200);
+
+        if (ImGui::CollapsingHeader("Camera")) {
+            auto id_guard = imgui_id_guard("camera");
+            mvs_cam->show();
+            ImGui::SeparatorText("Post-Processing");
+            cam_left.img_proc->show();
+        }
+
+        if (ImGui::CollapsingHeader("Debug")) {
+            if (ImGui::TreeNode("Background")) {
+                bg_viewer->show();
+                ImGui::TreePop();
+            }
+            if (ImGui::TreeNode("Memory Pool")) {
+                if (ImGui::Button("Purge")) {
+                    post(*asio_ctx, [] { global_mp.purge(); });
+                }
+                ImGui::TreePop();
+            }
+            if (ImGui::TreeNode("Performance")) {
+                ImGui::Text("UI Refresh Rate: %.2fms", perf_timer.query().interval);
+                ImGui::TreePop();
+            }
+        }
+
+        ImGui::PopItemWidth();
+    }
+    ImGui::End();
+    perf_timer.record();
+}
+
+void app_remote_ar::render_background() {
+    bg_viewer->render();
+}

+ 53 - 0
src/impl/apps/remote_ar/remote_ar.h

@@ -0,0 +1,53 @@
+#ifndef DEPTHGUIDE_REMOTE_AR_H
+#define DEPTHGUIDE_REMOTE_AR_H
+
+#include "core/event_timer.h"
+#include "core/object_manager.h"
+#include "device/mvs_camera_ui.h"
+#include "module/image_viewer.h"
+#include "image_process/image_process_ui.h"
+#include "impl/app_base.h"
+
+class app_remote_ar : public app_base {
+public:
+
+    explicit app_remote_ar(const create_config &conf);
+
+    ~app_remote_ar() override = default;
+
+    const char *window_name() override { return "RemoteAR V4.-1"; }
+
+    void show_ui() override;
+
+    void render_background() override;
+
+private:
+
+    enum obj_names : object_manager::name_type {
+        // raw images
+        raw_left, raw_right,
+        rgb_left, rgb_right,
+    };
+
+    struct camera_module {
+        std::unique_ptr<image_process_ui> img_proc;
+        smart_cuda_stream stream;
+    };
+
+    YAML::Node conf;
+    io_context *asio_ctx;
+    CUcontext *cuda_ctx;
+
+    // modules
+    std::unique_ptr<mvs_camera_ui> mvs_cam;
+    std::unique_ptr<image_viewer> bg_viewer; // background viewer
+
+    camera_module cam_left;
+    camera_module cam_right;
+
+    // miscellaneous
+    event_timer perf_timer; // performance timer
+};
+
+
+#endif //DEPTHGUIDE_REMOTE_AR_H

+ 7 - 6
src/impl/main_impl.cpp

@@ -1,6 +1,6 @@
 #include "main_impl.h"
 #include "core/object_manager.h"
-#include "impl/apps/depth_guide/depth_guide.h"
+#include "apps/app_selector/app_selector.h"
 
 #include <boost/asio/io_context.hpp>
 #include <boost/asio/post.hpp>
@@ -121,11 +121,12 @@ void init_all() {
     main_ctx = new io_context();
     main_ob = new object_manager({.ctx = main_ctx});
 
-    auto app_config = app_base::create_config{
-            .asio_ctx = main_ctx, .cuda_ctx = &cuda_ctx
-    };
-    // TODO: switch app here
-    app = std::make_unique<app_depth_guide>(app_config);
+    auto app_conf = app_selector::create_config();
+    app_conf.asio_ctx = main_ctx;
+    app_conf.cuda_ctx = &cuda_ctx;
+    app_conf.app_ptr = &app;
+    app_conf.window = window;
+    app = std::make_unique<app_selector>(app_conf);
 
     glfwSetWindowTitle(window, app->window_name());
     ui_interval = std::chrono::milliseconds(33); // TODO: select refresh rate

+ 0 - 18
src/impl/object_names.h

@@ -1,18 +0,0 @@
-#ifndef DEPTHGUIDE_OBJECT_NAMES_H
-#define DEPTHGUIDE_OBJECT_NAMES_H
-
-#include "core/object_manager.h"
-
-enum obj_names : object_manager::name_type {
-
-    // images from device
-    img_color, img_depth,
-
-    // background image
-    img_bg,
-
-    // output image
-    img_out,
-};
-
-#endif //DEPTHGUIDE_OBJECT_NAMES_H

+ 3 - 0
src/module/image_streamer.h

@@ -13,6 +13,7 @@ public:
         // image must be valid before start
         obj_name_type img_name = invalid_obj_name;
         boost::asio::io_context *asio_ctx = nullptr;
+        // TODO: add frame rate
 
         // for encoder
         CUcontext *cuda_ctx = nullptr;
@@ -25,6 +26,8 @@ public:
 
     void show();
 
+    // TODO: add change frame rate
+
     using size_change_sig_type = boost::signals2::signal<void(cv::Size)>;
     size_change_sig_type sig_size_changed;
 

+ 3 - 0
src/module/image_viewer.h

@@ -22,6 +22,9 @@ public:
             struct {
                 obj_name_type c_name, d_name;
             } color_depth;
+            struct {
+                obj_name_type left_name, right_name;
+            } stereo;
         } extra = {};
     };
 

+ 31 - 0
src/module/impl/image_viewer.cpp

@@ -26,12 +26,22 @@ void image_viewer::impl::show_color_depth() {
     ImGui::PopItemWidth();
 }
 
+void image_viewer::impl::show_stereo() {
+    ImGui::RadioButton("Left", &chose_index, 0);
+    ImGui::SameLine();
+    ImGui::RadioButton("Right", &chose_index, 1);
+}
+
 void image_viewer::impl::show() {
     switch (conf.mode) {
         case VIEW_COLOR_DEPTH: {
             show_color_depth();
             break;
         }
+        case VIEW_STEREO: {
+            show_stereo();
+            break;
+        }
         default: {
             RET_ERROR;
         }
@@ -69,12 +79,33 @@ void image_viewer::impl::render_color_depth() {
     }
 }
 
+void image_viewer::impl::render_stereo() {
+    auto info = conf.extra.stereo;
+    switch (chose_index) {
+        case 0: { // left
+            render_color_obj(info.left_name);
+            break;
+        }
+        case 1: {
+            render_color_obj(info.right_name);
+            break;
+        }
+        default: {
+            RET_ERROR;
+        }
+    }
+}
+
 void image_viewer::impl::render() {
     switch (conf.mode) {
         case VIEW_COLOR_DEPTH: {
             render_color_depth();
             break;
         }
+        case VIEW_STEREO: {
+            render_stereo();
+            break;
+        }
         default: {
             RET_ERROR;
         }

+ 4 - 0
src/module/impl/image_viewer_impl.h

@@ -27,6 +27,8 @@ struct image_viewer::impl {
 
     void show_color_depth();
 
+    void show_stereo();
+
     void show();
 
     void render_color_obj(obj_name_type name);
@@ -36,6 +38,8 @@ struct image_viewer::impl {
 
     void render_color_depth();
 
+    void render_stereo();
+
     void render();
 
     explicit impl(create_config conf);