
Re-implemented image utility.

jcsyshc committed 1 year ago
parent
commit 06595de6a6

+ 3 - 2
CMakeLists.txt

@@ -8,11 +8,12 @@ add_executable(${PROJECT_NAME} src/main.cpp
         src/image_process/impl/versatile_convertor.cpp
         src/impl/main_impl.cpp
         src/impl/apps/app_selector/app_selector.cpp
+        src/impl/apps/debug/app_debug.cpp
         src/impl/apps/depth_guide/depth_guide.cpp
         src/impl/apps/remote_ar/remote_ar.cpp
         src/impl/apps/tiny_player/tiny_player.cpp
         src/core/impl/event_timer.cpp
-#        src/core/impl/image_utility_v2.cpp
+        src/core/impl/image_utility_v2.cpp
         src/core/impl/memory_pool.cpp
         src/core/impl/object_manager.cpp
         src/module_v3/registration.cpp
@@ -69,7 +70,7 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LE
 
 # OpenCV config
 cmake_policy(SET CMP0146 OLD)
-find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs calib3d)
+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc calib3d imgcodecs cudastereo)
 target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
 

+ 1 - 0
data/config_debug.yaml

@@ -0,0 +1 @@
+app_name: debug

+ 7 - 0
src/core/cuda_helper.hpp

@@ -58,6 +58,13 @@ struct smart_cuda_stream {
 
 extern smart_cuda_stream *default_cuda_stream;
 
+inline cudaStream_t cuda_stream(smart_cuda_stream *stream = nullptr) {
+    return stream == nullptr ? nullptr : stream->cuda;
+}
+
+inline cv::cuda::Stream &cv_stream(smart_cuda_stream *stream = nullptr) {
+    return stream == nullptr ? cv::cuda::Stream::Null() : stream->cv;
+}
 
 template<typename T>
 inline void extend_pointer_life(const std::shared_ptr<T> &ptr,
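
A minimal usage sketch for the two new helpers (the caller below is hypothetical, and it assumes the OpenCV cudaarithm module plus the project's CUDA_API_CHECK macro are available): a function can take an optional smart_cuda_stream and fall back to the default stream when none is given.

    #include "core/cuda_helper.hpp"
    #include <opencv2/cudaarithm.hpp>

    // Hypothetical caller: adds two GPU images on an optional stream.
    // With stream == nullptr, cv_stream() yields cv::cuda::Stream::Null()
    // and cuda_stream() yields the default CUDA stream (nullptr).
    inline void add_async(const cv::cuda::GpuMat &a, const cv::cuda::GpuMat &b,
                          cv::cuda::GpuMat &out, smart_cuda_stream *stream = nullptr) {
        cv::cuda::add(a, b, out, cv::noArray(), -1, cv_stream(stream));
        CUDA_API_CHECK(cudaStreamSynchronize(cuda_stream(stream))); // only if a blocking wait is wanted
    }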

+ 2 - 0
src/core/image_utility.hpp

@@ -362,6 +362,8 @@ private:
     image_info_type<T> cuda_info;
 
     cudaEvent_t event = nullptr;
+
+    friend class generic_image;
 };
 
 template<typename T>

+ 87 - 13
src/core/image_utility_v2.h

@@ -1,45 +1,119 @@
 #ifndef DEPTHGUIDE_IMAGE_UTILITY_V2_H
 #define DEPTHGUIDE_IMAGE_UTILITY_V2_H
 
-#include <memory>
+#include "core/cuda_helper.hpp"
+#include "core/image_utility.hpp"
+#include "core/memory_pool.h"
+#include "image_process/cuda_impl/image_utility.cuh"
 
 #include <opencv2/core/types.hpp>
 
-enum pixel_enum : uint8_t {
-    PIX_GRAY,
-    PIX_RGB,
-    PIX_RGBA,
-    PIX_NV12
+#include <memory>
+#include <optional>
+
+enum pixel_format_enum {
+    PIX_NORMAL,
+    PIX_NV12,
 };
 
-// an image in certain device
-class generic_image_info {
+class generic_image;
+
+struct image_memory {
+    std::shared_ptr<generic_image> img;
+    std::shared_ptr<void> ptr;
+    size_t width, pitch; // in bytes
+    size_t height; // in pixel
 
+    void *start_ptr(int component = 0);
+
+    void *at(int row = 0, int col = 0, int component = 0);
+
+    void modified(smart_cuda_stream *stream = nullptr);
 };
 
 // collection of a same image in multiple devices
-class generic_image {
+class generic_image : public std::enable_shared_from_this<generic_image> {
 public:
 
     struct create_config {
-
+        cv::Size size; // display size
+        int type; // storage type, like CV_8UC3
+        pixel_format_enum pixel = PIX_NORMAL;
     };
 
     using pointer = std::shared_ptr<generic_image>;
 
-    pixel_enum pixel_type() const;
+    static pointer create(create_config conf);
 
-    int storage_type() const; // like CV_8UC3
+    static pointer create(cv::Size size, int type,
+                          pixel_format_enum pixel = PIX_NORMAL);
 
-    cv::Size size() const;
+    size_t width() const {
+        return size().width;
+    }
+
+    size_t height() const {
+        return size().height;
+    }
+
+    cv::Size size() const; // display size
+
+    size_t elem_size() const {
+        return CV_ELEM_SIZE(cv_type());
+    }
+
+    size_t size_in_bytes() const;
 
     size_t width_in_bytes() const;
 
+    int cv_type() const;
+
+    pixel_format_enum pixel_format() const;
+
+    // synchronization will be performed
+    cv::Mat cv_mat(smart_cuda_stream *stream = nullptr);
+
+    cv::cuda::GpuMat cv_gpumat(smart_cuda_stream *stream = nullptr);
+
+    template<typename T>
+    image_type_v2<T> cuda(smart_cuda_stream *stream = nullptr);
+
+    image_memory memory(memory_location loc,
+                        smart_cuda_stream *stream = nullptr);
+
+    pointer sub_image(int row = 0, int col = 0,
+                      int width = -1, int height = -1) const;
+
+    pointer bit_cast(int type);
+
+    template<typename T>
+    std::shared_ptr<smart_image<T>> v1() const;
+
+    void host_modified(smart_cuda_stream *stream = nullptr);
+
+    void cuda_modified(smart_cuda_stream *stream = nullptr);
+
+    using meta_key_type = size_t; // std::hash
+    using meta_value_type = uint64_t;
+
+    std::optional<meta_value_type> get_meta(meta_key_type key);
+
+    void set_meta(meta_key_type key, meta_value_type val);
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
+
+    pointer shallow_clone() const;
+
+    friend class image_memory;
 };
 
 using image_ptr = generic_image::pointer;
 
+inline image_ptr create_image(cv::Size size, int type,
+                              pixel_format_enum pixel = PIX_NORMAL) {
+    return generic_image::create(size, type, pixel);
+}
+
 #endif //DEPTHGUIDE_IMAGE_UTILITY_V2_H
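
A usage sketch inferred from the header above (the function name and sizes are illustrative): create one image, write it on the host, then hand the CUDA copy to OpenCV; host_modified / cuda_modified mark which copy has become stale.

    #include "core/image_utility_v2.h"

    void image_v2_demo(smart_cuda_stream *stream = nullptr) {
        auto img = generic_image::create({1920, 1080}, CV_8UC3);

        cv::Mat host = img->cv_mat(stream);            // allocates (and syncs) host storage
        host.setTo(cv::Scalar(0, 255, 0));
        img->host_modified(stream);                    // the CUDA copy is now stale

        cv::cuda::GpuMat gpu = img->cv_gpumat(stream); // re-uploads on demand
        // ... CUDA-side processing on gpu ...
        img->cuda_modified(stream);                    // the host copy is now stale
    }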

+ 384 - 1
src/core/impl/image_utility_v2.cpp

@@ -1 +1,384 @@
-#include "image_utility_v2_impl.h"
+#include "image_utility_v2_impl.h"
+
+namespace image_utility_impl {
+
+    std::type_index cv_type_id(int type) {
+        switch (type) {
+            // @formatter:off
+            case CV_8UC1: { return typeid(uchar1); }
+            case CV_8UC2: { return typeid(uchar2); }
+            case CV_8UC3: { return typeid(uchar3); }
+            case CV_8UC4: { return typeid(uchar4); }
+            case CV_16UC1: { return typeid(ushort1); }
+            case CV_32FC1: { return typeid(float1); }
+            default: { RET_ERROR; }
+                // @formatter:on
+        }
+    }
+
+}
+
+void *image_memory::start_ptr(int component) {
+    switch (img->pixel_format()) {
+        case PIX_NORMAL: {
+            assert(component == 0);
+            return ptr.get();
+        }
+        case PIX_NV12: {
+            if (component == 0) { return ptr.get(); }
+            if (component == 1) { return (uint8_t *) ptr.get() + pitch * img->height(); }
+            RET_ERROR_E;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+}
+
+void *image_memory::at(int row, int col, int component) {
+    auto sp = (uint8_t *) start_ptr(component);
+    return sp + row * pitch + col * img->elem_size();
+}
+
+void image_memory::modified(smart_cuda_stream *stream) {
+    auto &pimpl = img->pimpl;
+    if (ptr == pimpl->store_host.ptr) {
+        pimpl->q_this->host_modified(stream);
+    } else {
+        assert(ptr == pimpl->store_cuda.ptr);
+        pimpl->q_this->cuda_modified(stream);
+    }
+}
+
+void *generic_image::impl::storage_info::row_start(size_t row) {
+    return (uint8_t *) ptr.get() + row * pitch;
+}
+
+void generic_image::impl::storage_info::reset() {
+    ptr = nullptr;
+    pitch = 0;
+}
+
+generic_image::impl::impl(generic_image::create_config conf) {
+    type = conf.type;
+    pix_fmt = conf.pixel;
+    size = conf.size;
+
+    // adjust display size to storage size
+    switch (pix_fmt) {
+        case PIX_NORMAL: {
+            break;
+        }
+        case PIX_NV12: {
+            size.height = nv12_storage_height(size.height);
+            break;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+cv::Size generic_image::impl::display_size() const {
+    cv::Size ret = size;
+    switch (pix_fmt) {
+        case PIX_NORMAL: {
+            break;
+        }
+        case PIX_NV12: {
+            ret.height = nv12_display_height(ret.height);
+            break;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+    return ret;
+}
+
+size_t generic_image::impl::elem_bytes() const {
+    return CV_ELEM_SIZE(type);
+}
+
+size_t generic_image::impl::width_in_bytes() const {
+    return size.width * elem_bytes();
+}
+
+size_t generic_image::impl::size_in_bytes() const {
+    return size.height * width_in_bytes();
+}
+
+void generic_image::impl::create_host(smart_cuda_stream *stream) {
+    if (store_host.ptr != nullptr) {
+        SYNC_CREATE(store_host.ptr, stream);
+        return;
+    }
+    store_host.ptr = ALLOC_PITCH_SHARED(
+            uint8_t, width_in_bytes(), size.height, MEM_HOST, &store_host.pitch);
+    if (store_cuda.ptr != nullptr) {
+        SYNC_CREATE(store_cuda.ptr, stream);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(store_host.ptr.get(), store_host.pitch, // dst
+                                         store_cuda.ptr.get(), store_cuda.pitch, // src
+                                         width_in_bytes(), size.height,
+                                         cudaMemcpyDeviceToHost, cuda_stream(stream)));
+        REC_CREATE(store_host.ptr, stream);
+    }
+}
+
+void generic_image::impl::create_cuda(smart_cuda_stream *stream) {
+    if (store_cuda.ptr != nullptr) {
+        SYNC_CREATE(store_cuda.ptr, stream);
+        return;
+    }
+    store_cuda.ptr = ALLOC_PITCH_SHARED(
+            uint8_t, width_in_bytes(), size.height, MEM_CUDA, &store_cuda.pitch);
+    if (store_host.ptr != nullptr) {
+        SYNC_CREATE(store_host.ptr, stream);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(store_cuda.ptr.get(), store_cuda.pitch, // dst
+                                         store_host.ptr.get(), store_host.pitch, // src
+                                         width_in_bytes(), size.height,
+                                         cudaMemcpyHostToDevice, cuda_stream(stream)));
+        REC_CREATE(store_cuda.ptr, stream);
+    }
+}
+
+image_memory generic_image::impl::get_memory(memory_location loc,
+                                             smart_cuda_stream *stream) {
+    auto ret = image_memory();
+    ret.img = q_this->shared_from_this();
+    ret.width = width_in_bytes();
+    ret.height = size.height;
+
+    switch (loc) {
+        case MEM_HOST: {
+            create_host(stream);
+            ret.ptr = store_host.ptr;
+            ret.pitch = store_host.pitch;
+            break;
+        }
+        case MEM_CUDA: {
+            create_cuda(stream);
+            ret.ptr = store_cuda.ptr;
+            ret.pitch = store_cuda.pitch;
+            break;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+    return ret;
+}
+
+cv::Mat generic_image::impl::get_cv_mat(smart_cuda_stream *stream) {
+    create_host(stream);
+    return cv::Mat(size, type, store_host.ptr.get(), store_host.pitch);
+}
+
+cv::cuda::GpuMat generic_image::impl::get_cv_gpumat(smart_cuda_stream *stream) {
+    create_cuda(stream);
+    return cv::cuda::GpuMat(size, type, store_cuda.ptr.get(), store_cuda.pitch);
+}
+
+template<typename T>
+image_type_v2<T> generic_image::impl::get_image_type_v2(smart_cuda_stream *stream) {
+    create_cuda(stream);
+    assert(cv_type_id(type) == typeid(T));
+    assert(size.width <= std::numeric_limits<ushort>::max());
+    assert(size.height <= std::numeric_limits<ushort>::max());
+    return image_type_v2<T>(
+            store_cuda.ptr.get(), size.width, size.height, store_cuda.pitch);
+}
+
+template<typename T>
+std::shared_ptr<smart_image<T>> generic_image::impl::get_image_v1() const {
+    using ret_type = std::shared_ptr<smart_image<T>>;
+    auto ret = ret_type();
+    using info_type = image_info_type<T>;
+    if (store_host.ptr != nullptr) {
+        auto host_info = info_type{
+                .ptr = store_host.ptr, .loc = MEM_HOST,
+                .size = size, .pitch = store_host.pitch,
+        };
+        ret = std::make_shared<smart_image<T>>(host_info);
+    }
+
+    if (store_cuda.ptr != nullptr) {
+        auto cuda_info = info_type{
+                .ptr = store_cuda.ptr, .loc= MEM_CUDA,
+                .size = size, .pitch = store_cuda.pitch,
+        };
+        if (ret == nullptr) {
+            ret = std::make_shared<smart_image<T>>(cuda_info);
+        } else {
+            ret->cuda_info = cuda_info;
+        }
+    }
+
+    assert(ret != nullptr);
+    return ret;
+}
+
+void generic_image::impl::sub_image_inplace(int row, int col, int width, int height) {
+    // sub-image of other formats are not implemented
+    assert(pix_fmt == PIX_NORMAL);
+    if (store_host.ptr != nullptr) {
+        store_host.ptr = std::shared_ptr<void>(
+                (uint8_t *) store_host.row_start(row) + col * elem_bytes(),
+                [p = store_host.ptr](void *) {});
+    }
+    if (store_cuda.ptr != nullptr) {
+        store_cuda.ptr = std::shared_ptr<void>(
+                (uint8_t *) store_cuda.row_start(row) + col * elem_bytes(),
+                [p = store_cuda.ptr](void *) {});
+    }
+    size = cv::Size(width, height);
+}
+
+void generic_image::impl::type_cast_inplace(int _type) {
+    // bit-cast of other formats are not implemented
+    assert(pix_fmt == PIX_NORMAL);
+    auto _width = width_in_bytes() / CV_ELEM_SIZE(_type);
+    assert(_width * CV_ELEM_SIZE(_type) == width_in_bytes());
+    size.width = _width;
+    type = _type;
+}
+
+void generic_image::impl::host_modified(smart_cuda_stream *stream) {
+    assert(store_host.ptr != nullptr);
+    store_cuda.reset();
+    REC_CREATE(store_host.ptr, stream);
+}
+
+void generic_image::impl::cuda_modified(smart_cuda_stream *stream) {
+    assert(store_cuda.ptr != nullptr);
+    store_host.reset();
+    REC_CREATE(store_cuda.ptr, stream);
+}
+
+std::optional<generic_image::meta_value_type>
+generic_image::impl::get_meta(meta_key_type key) {
+    if (auto iter = meta_pool->find(key); iter != meta_pool->end()) {
+        return iter->second;
+    } else {
+        return {};
+    }
+}
+
+void generic_image::impl::set_meta(meta_key_type key, meta_value_type val) {
+    if (auto iter = meta_pool->find(key); iter != meta_pool->end()) {
+        iter->second = val;
+    } else {
+        meta_pool->emplace(key, val);
+    }
+}
+
+generic_image::pointer generic_image::create(create_config conf) {
+    auto ret = std::make_shared<generic_image>();
+    ret->pimpl = std::make_unique<impl>(conf);
+    ret->pimpl->q_this = ret.get();
+    return ret;
+}
+
+generic_image::pointer generic_image::create(cv::Size size, int type, pixel_format_enum pixel) {
+    auto conf = create_config{
+            .size = size, .type = type, .pixel = pixel,
+    };
+    return create(conf);
+}
+
+cv::Size generic_image::size() const {
+    return pimpl->display_size();
+}
+
+size_t generic_image::size_in_bytes() const {
+    return pimpl->size_in_bytes();
+}
+
+size_t generic_image::width_in_bytes() const {
+    return pimpl->width_in_bytes();
+}
+
+int generic_image::cv_type() const {
+    return pimpl->type;
+}
+
+pixel_format_enum generic_image::pixel_format() const {
+    return pimpl->pix_fmt;
+}
+
+image_memory generic_image::memory(memory_location loc,
+                                   smart_cuda_stream *stream) {
+    return pimpl->get_memory(loc, stream);
+}
+
+cv::Mat generic_image::cv_mat(smart_cuda_stream *stream) {
+    return pimpl->get_cv_mat(stream);
+}
+
+cv::cuda::GpuMat generic_image::cv_gpumat(smart_cuda_stream *stream) {
+    return pimpl->get_cv_gpumat(stream);
+}
+
+template<typename T>
+image_type_v2<T> generic_image::cuda(smart_cuda_stream *stream) {
+    return pimpl->get_image_type_v2<T>(stream);
+}
+
+// @formatter:off
+template image_type_v2<uchar1> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar2> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar3> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar4> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<ushort1> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<float1> generic_image::cuda(smart_cuda_stream *stream);
+// @formatter:on
+
+template<typename T>
+std::shared_ptr<smart_image<T>> generic_image::v1() const {
+    return pimpl->get_image_v1<T>();
+}
+
+// @formatter:off
+template std::shared_ptr<smart_image<uchar1>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar2>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar3>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar4>> generic_image::v1() const;
+template std::shared_ptr<smart_image<ushort1>> generic_image::v1() const;
+template std::shared_ptr<smart_image<float1>> generic_image::v1() const;
+// @formatter:on
+
+generic_image::pointer generic_image::shallow_clone() const {
+    auto ret = std::make_shared<generic_image>();
+    ret->pimpl = std::make_unique<impl>(*pimpl);
+    ret->pimpl->q_this = ret.get();
+    return ret;
+}
+
+generic_image::pointer generic_image::sub_image(int row, int col, int width, int height) const {
+    auto ret = shallow_clone();
+    ret->pimpl->sub_image_inplace(row, col, width, height);
+    return ret;
+}
+
+generic_image::pointer generic_image::bit_cast(int type) {
+    auto ret = shallow_clone();
+    ret->pimpl->type_cast_inplace(type);
+    return ret;
+}
+
+void generic_image::host_modified(smart_cuda_stream *stream) {
+    pimpl->host_modified(stream);
+}
+
+void generic_image::cuda_modified(smart_cuda_stream *stream) {
+    pimpl->cuda_modified(stream);
+}
+
+std::optional<generic_image::meta_value_type>
+generic_image::get_meta(meta_key_type key) {
+    return pimpl->get_meta(key);
+}
+
+void generic_image::set_meta(meta_key_type key, meta_value_type val) {
+    pimpl->set_meta(key, val);
+}
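
The view and metadata semantics implemented above, as a short sketch (names and values are illustrative): sub_image and bit_cast return shallow clones that share the underlying storage, and metadata is keyed by std::hash values.

    #include "core/image_utility_v2.h"

    #include <cassert>
    #include <functional>
    #include <string_view>

    void views_and_meta_demo() {
        auto full = generic_image::create({1280, 720}, CV_8UC3);

        auto roi = full->sub_image(100, 200, 640, 360); // 640x360 window at row 100, col 200
        auto raw = roi->bit_cast(CV_8UC1);              // same bytes, width becomes 640 * 3

        const auto key = std::hash<std::string_view>{}("frame_id"); // hypothetical key
        full->set_meta(key, 42);
        assert(full->get_meta(key).value() == 42);
    }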

+ 93 - 0
src/core/impl/image_utility_v2_impl.h

@@ -3,4 +3,97 @@
 
 #include "core/image_utility_v2.h"
 
+#include <boost/container/flat_map.hpp>
+#include <boost/container/static_vector.hpp>
+
+namespace image_utility_impl {
+
+    inline size_t nv12_display_height(size_t height) {
+        assert(height % 3 == 0);
+        return height / 3 * 2;
+    }
+
+    inline size_t nv12_storage_height(size_t height) {
+        assert(height % 2 == 0);
+        return height / 2 * 3;
+    }
+
+    std::type_index cv_type_id(int type);
+
+}
+
+using namespace image_utility_impl;
+
+struct generic_image::impl {
+
+    generic_image *q_this = nullptr;
+
+    static constexpr auto max_meta_count = 16;
+    using meta_pool_type =
+            boost::container::flat_map<meta_key_type, meta_value_type, std::less<>,
+                    boost::container::static_vector<std::pair<meta_key_type, meta_value_type>, max_meta_count>>;
+    std::shared_ptr<meta_pool_type> meta_pool =
+            std::make_shared<meta_pool_type>();
+
+    struct storage_info {
+        std::shared_ptr<void> ptr;
+        size_t pitch;
+
+        void *row_start(size_t row);
+
+        void reset();
+    };
+
+    storage_info store_host;
+    storage_info store_cuda;
+
+    cv::Size size; // storage size
+    int type;
+    pixel_format_enum pix_fmt;
+
+    explicit impl(create_config conf);
+
+    cv::Size display_size() const;
+
+    size_t elem_bytes() const;
+
+    size_t size_in_bytes() const;
+
+    size_t width_in_bytes() const;
+
+    // implies the caller wants to use the host copy on this stream
+    void create_host(smart_cuda_stream *stream);
+
+    // implies the caller wants to use the CUDA copy on this stream
+    void create_cuda(smart_cuda_stream *stream);
+
+    image_memory get_memory(memory_location loc,
+                            smart_cuda_stream *stream);
+
+    cv::Mat get_cv_mat(smart_cuda_stream *stream);
+
+    cv::cuda::GpuMat get_cv_gpumat(smart_cuda_stream *stream);
+
+    template<typename T>
+    image_type_v2<T> get_image_type_v2(smart_cuda_stream *stream);
+
+    template<typename T>
+    std::shared_ptr<smart_image<T>> get_image_v1() const;
+
+    // intended to be called right after copying impl (see shallow_clone)
+    void sub_image_inplace(int row = 0, int col = 0,
+                           int width = -1, int height = -1);
+
+    void type_cast_inplace(int type);
+
+    void host_modified(smart_cuda_stream *stream);
+
+    void cuda_modified(smart_cuda_stream *stream);
+
+    std::optional<meta_value_type> get_meta(meta_key_type key);
+
+    void set_meta(meta_key_type key, meta_value_type val);
+
+};
+
 #endif //DEPTHGUIDE_IMAGE_UTILITY_V2_IMPL_H
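
For reference, the NV12 helpers above just convert between the displayed luma height and the packed luma-plus-interleaved-UV storage height; a quick check with an illustrative 1080-row frame:

    #include "core/impl/image_utility_v2_impl.h"

    #include <cassert>

    void nv12_height_check() {
        // 1080 luma rows plus 540 interleaved-UV rows are stored as 1620 rows.
        assert(image_utility_impl::nv12_storage_height(1080) == 1620);
        assert(image_utility_impl::nv12_display_height(1620) == 1080);
    }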

+ 48 - 58
src/core/impl/memory_pool.cpp

@@ -31,54 +31,29 @@ void *memory_pool::impl::try_reuse_host(size_t count) {
     return mem_info.ptr;
 }
 
-void *memory_pool::impl::try_reuse_cuda_linear(size_t count) {
-    auto iter = reuse_cuda_linear_pool.lower_bound(count);
-    if (iter == reuse_cuda_linear_pool.end()) [[unlikely]] return nullptr;
+void *memory_pool::impl::try_reuse_cuda(size_t count) {
+    auto iter = reuse_cuda_pool.lower_bound(count);
+    if (iter == reuse_cuda_pool.end()) [[unlikely]] return nullptr;
     auto mem_info = iter->second;
     if (mem_info.count * reuse_threshold > count) [[unlikely]] return nullptr;
-    reuse_cuda_linear_pool.erase(iter);
+    reuse_cuda_pool.erase(iter);
     reg_allocate(mem_info);
     return mem_info.ptr;
 }
 
-void *memory_pool::impl::try_reuse_cuda_pitch(size_t width, size_t rows, size_t *pitch) {
-    auto iter = std::ranges::find_if(
-            reuse_cuda_pitch_pool,
-            [=](mem_info_type info) {
-                return info.pitch >= width && info.rows >= rows;
-            }
-    );
-    if (iter == reuse_cuda_pitch_pool.end()) [[unlikely]] return nullptr;
-    auto mem_info = *iter;
-    if (mem_info.count * reuse_threshold > width * rows) [[unlikely]] return nullptr;
-    reuse_cuda_pitch_pool.erase(iter);
-    reg_allocate(mem_info);
-    *pitch = mem_info.pitch;
-    return mem_info.ptr;
-}
-
 void *memory_pool::impl::direct_allocate_host(size_t count) {
     auto ptr = ::malloc(count);
     reg_allocate({.ptr = ptr, .loc = MEM_HOST, .lay = MEM_LINEAR, .count = count});
     return ptr;
 }
 
-void *memory_pool::impl::direct_allocate_cuda_linear(size_t count) {
+void *memory_pool::impl::direct_allocate_cuda(size_t count) {
     void *ptr = nullptr;
     CUDA_API_CHECK(cudaMalloc(&ptr, count));
     reg_allocate({.ptr = ptr, .loc = MEM_CUDA, .lay = MEM_LINEAR, .count = count});
     return ptr;
 }
 
-void *memory_pool::impl::direct_allocate_cuda_pitch(
-        size_t width, size_t rows, size_t *pitch) {
-    void *ptr = nullptr;
-    CUDA_API_CHECK(cudaMallocPitch(&ptr, pitch, width, rows));
-    reg_allocate({.ptr = ptr, .loc = MEM_CUDA, .lay = MEM_PITCH,
-                         .count = *pitch * rows, .pitch = *pitch, .rows = rows});
-    return ptr;
-}
-
 void *memory_pool::impl::allocate_host(size_t count) {
     if (auto ptr = try_reuse_host(count);
             ptr != nullptr) [[likely]] {
@@ -88,14 +63,15 @@ void *memory_pool::impl::allocate_host(size_t count) {
 }
 
 void *memory_pool::impl::allocate_cuda(size_t count) {
-    if (auto ptr = try_reuse_cuda_linear(count);
+    if (auto ptr = try_reuse_cuda(count);
             ptr != nullptr) [[likely]] {
         return ptr;
     }
-    return direct_allocate_cuda_linear(count);
+    return direct_allocate_cuda(count);
 }
 
 void *memory_pool::impl::allocate(size_t count, memory_location mem_loc) {
+    auto guard = std::lock_guard(mu);
     switch (mem_loc) {
         case MEM_HOST: {
             return allocate_host(count);
@@ -107,29 +83,41 @@ void *memory_pool::impl::allocate(size_t count, memory_location mem_loc) {
     RET_ERROR_P;
 }
 
-void *memory_pool::impl::allocate_pitch_cuda(size_t width, size_t rows, size_t *pitch) {
-    if (auto ptr = try_reuse_cuda_pitch(width, rows, pitch);
-            ptr != nullptr) [[likely]] {
-        return ptr;
-    }
-    return direct_allocate_cuda_pitch(width, rows, pitch);
-}
-
 void *memory_pool::impl::allocate_pitch(
         size_t width, size_t rows, memory_location mem_loc, size_t *pitch) {
+    auto guard = std::lock_guard(mu);
     switch (mem_loc) {
         case MEM_HOST: {
             *pitch = width;
             return allocate_host(width * rows);
         }
         case MEM_CUDA: {
-            return allocate_pitch_cuda(width, rows, pitch);
+            if (width & 0x1F) { // round up to the next multiple of 32
+                *pitch = (width + 0x1F) & ~size_t(0x1F);
+            } else {
+                *pitch = width;
+            }
+            return allocate_cuda(*pitch * rows);
         }
     }
     RET_ERROR_P;
 }
 
+cudaEvent_t memory_pool::impl::get_event(void *ptr) {
+    auto guard = std::lock_guard(mu);
+    auto iter = malloc_pool.lower_bound(ptr);
+    assert(iter != malloc_pool.end());
+    auto &mem_info = iter->second;
+    assert((char *) ptr - (char *) mem_info.ptr < mem_info.count);
+    if (mem_info.event == nullptr) [[unlikely]] {
+        CUDA_API_CHECK(cudaEventCreateWithFlags(&mem_info.event, cudaEventDisableTiming));
+    }
+    assert(mem_info.event != nullptr);
+    return mem_info.event;
+}
+
 void memory_pool::impl::deallocate(void *ptr) {
+    auto guard = std::lock_guard(mu);
     auto iter = malloc_pool.find(ptr);
     if (iter == malloc_pool.end()) {
         SPDLOG_WARN("Deallocate unknown pointer: {}.", fmt::ptr(ptr));
@@ -143,12 +131,7 @@ void memory_pool::impl::deallocate(void *ptr) {
             return;
         }
         case MEM_CUDA: {
-            if (mem_info.lay == MEM_LINEAR) {
-                reuse_cuda_linear_pool.emplace(mem_info.count, mem_info);
-            } else {
-                assert(mem_info.lay == MEM_PITCH);
-                reuse_cuda_pitch_pool.push_back(mem_info);
-            }
+            reuse_cuda_pool.emplace(mem_info.count, mem_info);
             return;
         }
     }
@@ -170,40 +153,47 @@ void memory_pool::impl::system_deallocate(mem_info_type mem_info) {
 }
 
 void memory_pool::impl::purge() {
+    auto guard = std::lock_guard(mu);
     for (auto item: reuse_host_pool | std::views::values) {
         system_deallocate(item);
     }
     reuse_host_pool.clear();
 
-    for (auto item: reuse_cuda_linear_pool | std::views::values) {
-        system_deallocate(item);
-    }
-    reuse_cuda_linear_pool.clear();
-
-    for (auto item: reuse_cuda_pitch_pool) {
+    for (auto item: reuse_cuda_pool | std::views::values) {
         system_deallocate(item);
     }
-    reuse_cuda_pitch_pool.clear();
+    reuse_cuda_pool.clear();
 }
 
 void *memory_pool::allocate_impl(size_t count, memory_location mem_loc) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->allocate(count, mem_loc);
 }
 
 void *memory_pool::allocate_pitch_impl(
         size_t width, size_t rows, memory_location mem_loc, size_t *pitch) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->allocate_pitch(width, rows, mem_loc, pitch);
 }
 
+void memory_pool::record_create(void *ptr, smart_cuda_stream *stream) {
+    if (stream == nullptr) return;
+    auto event = pimpl->get_event(ptr);
+    CUDA_API_CHECK(cudaEventRecord(event, stream->cuda));
+}
+
+void memory_pool::sync_create(void *ptr, smart_cuda_stream *stream) {
+    auto event = pimpl->get_event(ptr);
+    if (stream == nullptr) {
+        CUDA_API_CHECK(cudaEventSynchronize(event));
+    } else {
+        CUDA_API_CHECK(cudaStreamWaitEvent(stream->cuda, event));
+    }
+}
+
 void memory_pool::deallocate(void *ptr) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->deallocate(ptr);
 }
 
 void memory_pool::purge() {
-    auto guard = std::lock_guard(pimpl->mu);
     pimpl->purge();
 }
 

+ 10 - 19
src/core/impl/memory_pool_impl.h

@@ -9,14 +9,13 @@
 #include <map>
 #include <mutex>
 #include <thread>
-#include <unordered_map>
 
 struct memory_pool::impl {
 
     // reuse_length * reuse_threshold >= request_length
     static constexpr auto reuse_threshold = 0.5;
 
-    enum memory_layout {
+    enum memory_layout : uint8_t {
         MEM_LINEAR, MEM_PITCH
     };
 
@@ -28,19 +27,15 @@ struct memory_pool::impl {
         // for MEM_LINEAR and MEM_PITCH
         size_t count;
 
-        // for MEM_PITCH
-        size_t pitch, rows;
+        cudaEvent_t event = nullptr;
     };
 
-    using malloc_pool_type = std::unordered_map<void *, mem_info_type>;
+    // descending order, so lower_bound(ptr) finds the allocation block containing ptr
+    using malloc_pool_type = std::map<void *, mem_info_type, std::greater<>>;
     malloc_pool_type malloc_pool;
 
-    using reuse_host_pool_type = std::multimap<size_t, mem_info_type>;
-    using reuse_cuda_linear_pool_type = std::multimap<size_t, mem_info_type>;
-    using reuse_cuda_pitch_pool_type = std::list<mem_info_type>; // TODO: simulate pitched allocation manually
-    reuse_host_pool_type reuse_host_pool;
-    reuse_cuda_linear_pool_type reuse_cuda_linear_pool;
-    reuse_cuda_pitch_pool_type reuse_cuda_pitch_pool;
+    using reuse_pool_type = std::multimap<size_t, mem_info_type>;
+    reuse_pool_type reuse_host_pool;
+    reuse_pool_type reuse_cuda_pool;
 
     std::mutex mu;
 
@@ -48,15 +43,11 @@ struct memory_pool::impl {
 
     void *try_reuse_host(size_t count);
 
-    void *try_reuse_cuda_linear(size_t count);
-
-    void *try_reuse_cuda_pitch(size_t width, size_t rows, size_t *pitch);
+    void *try_reuse_cuda(size_t count);
 
     void *direct_allocate_host(size_t count);
 
-    void *direct_allocate_cuda_linear(size_t count);
-
-    void *direct_allocate_cuda_pitch(size_t width, size_t rows, size_t *pitch);
+    void *direct_allocate_cuda(size_t count);
 
     void *allocate_host(size_t count);
 
@@ -64,10 +55,10 @@ struct memory_pool::impl {
 
     void *allocate(size_t count, memory_location mem_loc);
 
-    void *allocate_pitch_cuda(size_t width, size_t rows, size_t *pitch);
-
     void *allocate_pitch(size_t width, size_t rows, memory_location mem_loc, size_t *pitch);
 
+    cudaEvent_t get_event(void *ptr);
+
     static void system_deallocate(mem_info_type mem_info);
 
     void deallocate(void *ptr);

+ 26 - 0
src/core/memory_pool.h

@@ -1,6 +1,8 @@
 #ifndef DEPTHGUIDE_MEMORY_POOL_H
 #define DEPTHGUIDE_MEMORY_POOL_H
 
+#include "core/cuda_helper.hpp"
+
 #include <cassert>
 #include <memory>
 
@@ -39,6 +41,21 @@ public:
         return as_shared(allocate_pitch<T>(cols, rows, mem_loc, pitch));
     }
 
+    // does nothing if stream == nullptr.
+    void record_create(void *ptr, smart_cuda_stream *stream);
+
+    void sync_create(void *ptr, smart_cuda_stream *stream = nullptr);
+
+    template<typename T>
+    void record_create(const std::shared_ptr<T> &ptr, smart_cuda_stream *stream) {
+        record_create(ptr.get(), stream);
+    }
+
+    template<typename T>
+    void sync_create(const std::shared_ptr<T> &ptr, smart_cuda_stream *stream = nullptr) {
+        sync_create(ptr.get(), stream);
+    }
+
     void deallocate(void *ptr);
 
     // free all unused memory
@@ -75,4 +92,13 @@ extern memory_pool global_mp;
 #define ALLOC_PITCH_SHARED(type, cols, rows, loc, pitch) \
     global_mp.allocate_pitch_shared<type>(cols, rows, loc, pitch)
 
+#define REC_CREATE(ptr, stream) \
+    global_mp.record_create(ptr, stream)
+
+#define SYNC_CREATE(ptr, stream) \
+    global_mp.sync_create(ptr, stream)
+
+#define WAIT_CREATE(ptr) \
+    global_mp.sync_create(ptr)
+
 #endif //DEPTHGUIDE_MEMORY_POOL_H
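
A sketch of the record / sync pattern the new macros enable (the producer / consumer split and the sizes are illustrative): the stream that fills a buffer records an event on it, any other stream waits on that event before touching the memory, and the host blocks when no stream is given.

    #include "core/memory_pool.h"

    void producer_consumer(smart_cuda_stream *producer, smart_cuda_stream *consumer) {
        size_t pitch = 0;
        auto buf = ALLOC_PITCH_SHARED(uint8_t, 1920 * 3, 1080, MEM_CUDA, &pitch);

        // ... enqueue work on producer->cuda that writes buf ...
        REC_CREATE(buf, producer);   // record "buf is ready" on the producer stream

        SYNC_CREATE(buf, consumer);  // make the consumer stream wait for that event
        // ... enqueue work on consumer->cuda that reads buf ...

        WAIT_CREATE(buf);            // or block the host until buf is ready
    }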

+ 3 - 0
src/impl/apps/app_selector/app_selector.cpp

@@ -1,4 +1,5 @@
 #include "app_selector.h"
+#include "impl/apps/debug/app_debug.h"
 #include "impl/apps/depth_guide/depth_guide.h"
 #include "impl/apps/remote_ar/remote_ar.h"
 #include "impl/apps/tiny_player/tiny_player.h"
@@ -52,6 +53,8 @@ void app_selector::load_app(const std::string &conf_path) {
         app = std::make_unique<app_remote_ar>(create_conf);
     } else if (app_name == "tiny_player") {
         app = std::make_unique<app_tiny_player>(create_conf);
+    } else if (app_name == "debug") {
+        app = std::make_unique<app_debug>(create_conf);
     }
 
     // change window title

+ 52 - 0
src/impl/apps/debug/app_debug.cpp

@@ -0,0 +1,52 @@
+#include "app_debug.h"
+#include "GLFW/glfw3.h"
+
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/cudastereo.hpp>
+#include <opencv2/cudaimgproc.hpp>
+
+app_debug::app_debug(const create_config &conf) {
+
+    auto left_path = "/home/tpx/project/DepthGuide/cmake-build-debug/Left.png";
+    auto right_path = "/home/tpx/project/DepthGuide/cmake-build-debug/Right.png";
+
+    auto left_img = cv::imread(left_path);
+    auto right_img = cv::imread(right_path);
+
+    auto left_img_cuda = cv::cuda::GpuMat();
+    auto right_img_cuda = cv::cuda::GpuMat();
+
+    left_img_cuda.upload(left_img);
+    right_img_cuda.upload(right_img);
+
+    // convert to gray (cv::imread loads BGR)
+    cv::cuda::cvtColor(left_img_cuda, left_img_cuda, cv::COLOR_BGR2GRAY);
+    cv::cuda::cvtColor(right_img_cuda, right_img_cuda, cv::COLOR_BGR2GRAY);
+
+    auto disparity_cuda = cv::cuda::GpuMat();
+    auto stereo = cv::cuda::createStereoSGM();
+    stereo->setNumDisparities(128);
+//    stereo->setMode(cv::StereoSGBM::MODE_HH);
+    stereo->compute(left_img_cuda, right_img_cuda, disparity_cuda);
+
+    auto filter = cv::cuda::createDisparityBilateralFilter();
+    filter->setNumDisparities(128);
+    filter->setRadius(5);
+    filter->setNumIters(3);
+    filter->apply(disparity_cuda, left_img_cuda, disparity_cuda);
+
+    auto disparity = cv::Mat();
+    disparity_cuda.download(disparity);
+
+    double min_val, max_val;
+    cv::minMaxLoc(disparity, &min_val, &max_val);
+    SPDLOG_INFO("Min: {}, Max: {}", min_val, max_val);
+
+    auto tmp = cv::Mat();
+    disparity.convertTo(tmp, CV_32FC1);
+    tmp = (tmp - min_val) / (max_val - min_val) * 255.f;
+    tmp.convertTo(disparity, CV_8UC1);
+    cv::imwrite("disparity.png", disparity);
+
+    glfwSetWindowShouldClose(glfwGetCurrentContext(), true);
+}

+ 20 - 0
src/impl/apps/debug/app_debug.h

@@ -0,0 +1,20 @@
+#ifndef DEPTHGUIDE_APP_DEBUG_H
+#define DEPTHGUIDE_APP_DEBUG_H
+
+#include "impl/app_base.h"
+
+class app_debug : public app_base {
+public:
+    explicit app_debug(const create_config &conf);
+
+    ~app_debug() override = default;
+
+    const char *window_name() override { return "You should not see this."; }
+
+    void show_ui() override {}
+
+    void render_background() override {}
+};
+
+
+#endif //DEPTHGUIDE_APP_DEBUG_H