
Re-implemented image utility.

jcsyshc committed 1 year ago
parent
commit 06595de6a6

+ 3 - 2
CMakeLists.txt

@@ -8,11 +8,12 @@ add_executable(${PROJECT_NAME} src/main.cpp
         src/image_process/impl/versatile_convertor.cpp
         src/impl/main_impl.cpp
         src/impl/apps/app_selector/app_selector.cpp
+        src/impl/apps/debug/app_debug.cpp
         src/impl/apps/depth_guide/depth_guide.cpp
         src/impl/apps/remote_ar/remote_ar.cpp
         src/impl/apps/tiny_player/tiny_player.cpp
         src/core/impl/event_timer.cpp
-#        src/core/impl/image_utility_v2.cpp
+        src/core/impl/image_utility_v2.cpp
         src/core/impl/memory_pool.cpp
         src/core/impl/object_manager.cpp
         src/module_v3/registration.cpp
@@ -69,7 +70,7 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LE
 
 # OpenCV config
 cmake_policy(SET CMP0146 OLD)
-find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs calib3d)
+find_package(OpenCV REQUIRED COMPONENTS cudaimgproc calib3d imgcodecs cudastereo)
 target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
 

+ 1 - 0
data/config_debug.yaml

@@ -0,0 +1 @@
+app_name: debug

+ 7 - 0
src/core/cuda_helper.hpp

@@ -58,6 +58,13 @@ struct smart_cuda_stream {
 
 extern smart_cuda_stream *default_cuda_stream;
 
+inline cudaStream_t cuda_stream(smart_cuda_stream *stream = nullptr) {
+    return stream == nullptr ? nullptr : stream->cuda;
+}
+
+inline cv::cuda::Stream &cv_stream(smart_cuda_stream *stream = nullptr) {
+    return stream == nullptr ? cv::cuda::Stream::Null() : stream->cv;
+}
 
 template<typename T>
 inline void extend_pointer_life(const std::shared_ptr<T> &ptr,
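
A minimal usage sketch for the two new helpers (the caller below is hypothetical, and it assumes the OpenCV cudaarithm module plus the project's CUDA_API_CHECK macro are available): a function can take an optional smart_cuda_stream and fall back to the default stream when none is given.

    #include "core/cuda_helper.hpp"
    #include <opencv2/cudaarithm.hpp>

    // Hypothetical caller: adds two GPU images on an optional stream.
    // With stream == nullptr, cv_stream() yields cv::cuda::Stream::Null()
    // and cuda_stream() yields the default CUDA stream (nullptr).
    inline void add_async(const cv::cuda::GpuMat &a, const cv::cuda::GpuMat &b,
                          cv::cuda::GpuMat &out, smart_cuda_stream *stream = nullptr) {
        cv::cuda::add(a, b, out, cv::noArray(), -1, cv_stream(stream));
        CUDA_API_CHECK(cudaStreamSynchronize(cuda_stream(stream))); // only if a blocking wait is wanted
    }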

+ 2 - 0
src/core/image_utility.hpp

@@ -362,6 +362,8 @@ private:
     image_info_type<T> cuda_info;
 
     cudaEvent_t event = nullptr;
+
+    friend class generic_image;
 };
 
 template<typename T>

+ 87 - 13
src/core/image_utility_v2.h

@@ -1,45 +1,119 @@
 #ifndef DEPTHGUIDE_IMAGE_UTILITY_V2_H
 #define DEPTHGUIDE_IMAGE_UTILITY_V2_H
 
-#include <memory>
+#include "core/cuda_helper.hpp"
+#include "core/image_utility.hpp"
+#include "core/memory_pool.h"
+#include "image_process/cuda_impl/image_utility.cuh"
 
 #include <opencv2/core/types.hpp>
 
-enum pixel_enum : uint8_t {
-    PIX_GRAY,
-    PIX_RGB,
-    PIX_RGBA,
-    PIX_NV12
+#include <memory>
+#include <optional>
+
+enum pixel_format_enum {
+    PIX_NORMAL,
+    PIX_NV12,
 };
 
-// an image in certain device
-class generic_image_info {
+class generic_image;
+
+struct image_memory {
+    std::shared_ptr<generic_image> img;
+    std::shared_ptr<void> ptr;
+    size_t width, pitch; // in bytes
+    size_t height; // in pixel
 
+    void *start_ptr(int component = 0);
+
+    void *at(int row = 0, int col = 0, int component = 0);
+
+    void modified(smart_cuda_stream *stream = nullptr);
 };
 
 // collection of a same image in multiple devices
-class generic_image {
+class generic_image : public std::enable_shared_from_this<generic_image> {
 public:
 
     struct create_config {
-
+        cv::Size size; // display size
+        int type; // storage type, like CV_8UC3
+        pixel_format_enum pixel = PIX_NORMAL;
     };
 
     using pointer = std::shared_ptr<generic_image>;
 
-    pixel_enum pixel_type() const;
+    static pointer create(create_config conf);
 
-    int storage_type() const; // like CV_8UC3
+    static pointer create(cv::Size size, int type,
+                          pixel_format_enum pixel = PIX_NORMAL);
 
-    cv::Size size() const;
+    size_t width() const {
+        return size().width;
+    }
+
+    size_t height() const {
+        return size().height;
+    }
+
+    cv::Size size() const; // display size
+
+    size_t elem_size() const {
+        return CV_ELEM_SIZE(cv_type());
+    }
+
+    size_t size_in_bytes() const;
 
     size_t width_in_bytes() const;
 
+    int cv_type() const;
+
+    pixel_format_enum pixel_format() const;
+
+    // synchronization will be performed
+    cv::Mat cv_mat(smart_cuda_stream *stream = nullptr);
+
+    cv::cuda::GpuMat cv_gpumat(smart_cuda_stream *stream = nullptr);
+
+    template<typename T>
+    image_type_v2<T> cuda(smart_cuda_stream *stream = nullptr);
+
+    image_memory memory(memory_location loc,
+                        smart_cuda_stream *stream = nullptr);
+
+    pointer sub_image(int row = 0, int col = 0,
+                      int width = -1, int height = -1) const;
+
+    pointer bit_cast(int type);
+
+    template<typename T>
+    std::shared_ptr<smart_image<T>> v1() const;
+
+    void host_modified(smart_cuda_stream *stream = nullptr);
+
+    void cuda_modified(smart_cuda_stream *stream = nullptr);
+
+    using meta_key_type = size_t; // std::hash
+    using meta_value_type = uint64_t;
+
+    std::optional<meta_value_type> get_meta(meta_key_type key);
+
+    void set_meta(meta_key_type key, meta_value_type val);
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
+
+    pointer shallow_clone() const;
+
+    friend class image_memory;
 };
 
 using image_ptr = generic_image::pointer;
 
+inline image_ptr create_image(cv::Size size, int type,
+                              pixel_format_enum pixel = PIX_NORMAL) {
+    return generic_image::create(size, type, pixel);
+}
+
 #endif //DEPTHGUIDE_IMAGE_UTILITY_V2_H
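
A usage sketch inferred from the header above (the function name and sizes are illustrative): create one image, write it on the host, then hand the CUDA copy to OpenCV; host_modified / cuda_modified mark which copy has become stale.

    #include "core/image_utility_v2.h"

    void image_v2_demo(smart_cuda_stream *stream = nullptr) {
        auto img = generic_image::create({1920, 1080}, CV_8UC3);

        cv::Mat host = img->cv_mat(stream);            // allocates (and syncs) host storage
        host.setTo(cv::Scalar(0, 255, 0));
        img->host_modified(stream);                    // the CUDA copy is now stale

        cv::cuda::GpuMat gpu = img->cv_gpumat(stream); // re-uploads on demand
        // ... CUDA-side processing on gpu ...
        img->cuda_modified(stream);                    // the host copy is now stale
    }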

+ 384 - 1
src/core/impl/image_utility_v2.cpp

@@ -1 +1,384 @@
-#include "image_utility_v2_impl.h"
+#include "image_utility_v2_impl.h"
+
+namespace image_utility_impl {
+
+    std::type_index cv_type_id(int type) {
+        switch (type) {
+            // @formatter:off
+            case CV_8UC1: { return typeid(uchar1); }
+            case CV_8UC2: { return typeid(uchar2); }
+            case CV_8UC3: { return typeid(uchar3); }
+            case CV_8UC4: { return typeid(uchar4); }
+            case CV_16UC1: { return typeid(ushort1); }
+            case CV_32FC1: { return typeid(float1); }
+            default: { RET_ERROR; }
+                // @formatter:on
+        }
+    }
+
+}
+
+void *image_memory::start_ptr(int component) {
+    switch (img->pixel_format()) {
+        case PIX_NORMAL: {
+            assert(component == 0);
+            return ptr.get();
+        }
+        case PIX_NV12: {
+            if (component == 0) { return ptr.get(); }
+            if (component == 1) { return (uint8_t *) ptr.get() + pitch * img->height(); }
+            RET_ERROR_E;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+}
+
+void *image_memory::at(int row, int col, int component) {
+    auto sp = (uint8_t *) start_ptr(component);
+    return sp + row * pitch + col * img->elem_size();
+}
+
+void image_memory::modified(smart_cuda_stream *stream) {
+    auto &pimpl = img->pimpl;
+    if (ptr == pimpl->store_host.ptr) {
+        pimpl->q_this->host_modified(stream);
+    } else {
+        assert(ptr == pimpl->store_cuda.ptr);
+        pimpl->q_this->cuda_modified(stream);
+    }
+}
+
+void *generic_image::impl::storage_info::row_start(size_t row) {
+    return (uint8_t *) ptr.get() + row * pitch;
+}
+
+void generic_image::impl::storage_info::reset() {
+    ptr = nullptr;
+    pitch = 0;
+}
+
+generic_image::impl::impl(generic_image::create_config conf) {
+    type = conf.type;
+    pix_fmt = conf.pixel;
+    size = conf.size;
+
+    // adjust display size to storage size
+    switch (pix_fmt) {
+        case PIX_NORMAL: {
+            break;
+        }
+        case PIX_NV12: {
+            size.height = nv12_storage_height(size.height);
+            break;
+        }
+        default: {
+            assert(false);
+        }
+    }
+}
+
+cv::Size generic_image::impl::display_size() const {
+    cv::Size ret = size;
+    switch (pix_fmt) {
+        case PIX_NORMAL: {
+            break;
+        }
+        case PIX_NV12: {
+            ret.height = nv12_display_height(ret.height);
+            break;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+    return ret;
+}
+
+size_t generic_image::impl::elem_bytes() const {
+    return CV_ELEM_SIZE(type);
+}
+
+size_t generic_image::impl::width_in_bytes() const {
+    return size.width * elem_bytes();
+}
+
+size_t generic_image::impl::size_in_bytes() const {
+    return size.height * width_in_bytes();
+}
+
+void generic_image::impl::create_host(smart_cuda_stream *stream) {
+    if (store_host.ptr != nullptr) {
+        SYNC_CREATE(store_host.ptr, stream);
+        return;
+    }
+    store_host.ptr = ALLOC_PITCH_SHARED(
+            uint8_t, width_in_bytes(), size.height, MEM_HOST, &store_host.pitch);
+    if (store_cuda.ptr != nullptr) {
+        SYNC_CREATE(store_cuda.ptr, stream);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(store_host.ptr.get(), store_host.pitch, // dst
+                                         store_cuda.ptr.get(), store_cuda.pitch, // src
+                                         width_in_bytes(), size.height,
+                                         cudaMemcpyDeviceToHost, cuda_stream(stream)));
+        REC_CREATE(store_host.ptr, stream);
+    }
+}
+
+void generic_image::impl::create_cuda(smart_cuda_stream *stream) {
+    if (store_cuda.ptr != nullptr) {
+        SYNC_CREATE(store_cuda.ptr, stream);
+        return;
+    }
+    store_cuda.ptr = ALLOC_PITCH_SHARED(
+            uint8_t, width_in_bytes(), size.height, MEM_CUDA, &store_cuda.pitch);
+    if (store_host.ptr != nullptr) {
+        SYNC_CREATE(store_host.ptr, stream);
+        CUDA_API_CHECK(cudaMemcpy2DAsync(store_cuda.ptr.get(), store_cuda.pitch, // dst
+                                         store_host.ptr.get(), store_host.pitch, // src
+                                         width_in_bytes(), size.height,
+                                         cudaMemcpyHostToDevice, cuda_stream(stream)));
+        REC_CREATE(store_cuda.ptr, stream);
+    }
+}
+
+image_memory generic_image::impl::get_memory(memory_location loc,
+                                             smart_cuda_stream *stream) {
+    auto ret = image_memory();
+    ret.img = q_this->shared_from_this();
+    ret.width = width_in_bytes();
+    ret.height = size.height;
+
+    switch (loc) {
+        case MEM_HOST: {
+            create_host(stream);
+            ret.ptr = store_host.ptr;
+            ret.pitch = store_host.pitch;
+            break;
+        }
+        case MEM_CUDA: {
+            create_cuda(stream);
+            ret.ptr = store_cuda.ptr;
+            ret.pitch = store_cuda.pitch;
+            break;
+        }
+        default: {
+            RET_ERROR_E;
+        }
+    }
+    return ret;
+}
+
+cv::Mat generic_image::impl::get_cv_mat(smart_cuda_stream *stream) {
+    create_host(stream);
+    return cv::Mat(size, type, store_host.ptr.get(), store_host.pitch);
+}
+
+cv::cuda::GpuMat generic_image::impl::get_cv_gpumat(smart_cuda_stream *stream) {
+    create_cuda(stream);
+    return cv::cuda::GpuMat(size, type, store_cuda.ptr.get(), store_cuda.pitch);
+}
+
+template<typename T>
+image_type_v2<T> generic_image::impl::get_image_type_v2(smart_cuda_stream *stream) {
+    create_cuda(stream);
+    assert(cv_type_id(type) == typeid(T));
+    assert(size.width <= std::numeric_limits<ushort>::max());
+    assert(size.height <= std::numeric_limits<ushort>::max());
+    return image_type_v2<T>(
+            store_cuda.ptr.get(), size.width, size.height, store_cuda.pitch);
+}
+
+template<typename T>
+std::shared_ptr<smart_image<T>> generic_image::impl::get_image_v1() const {
+    using ret_type = std::shared_ptr<smart_image<T>>;
+    auto ret = ret_type();
+    using info_type = image_info_type<T>;
+    if (store_host.ptr != nullptr) {
+        auto host_info = info_type{
+                .ptr = store_host.ptr, .loc = MEM_HOST,
+                .size = size, .pitch = store_host.pitch,
+        };
+        ret = std::make_shared<smart_image<T>>(host_info);
+    }
+
+    if (store_cuda.ptr != nullptr) {
+        auto cuda_info = info_type{
+                .ptr = store_cuda.ptr, .loc= MEM_CUDA,
+                .size = size, .pitch = store_cuda.pitch,
+        };
+        if (ret == nullptr) {
+            ret = std::make_shared<smart_image<T>>(cuda_info);
+        } else {
+            ret->cuda_info = cuda_info;
+        }
+    }
+
+    assert(ret != nullptr);
+    return ret;
+}
+
+void generic_image::impl::sub_image_inplace(int row, int col, int width, int height) {
+    // sub-image of other formats are not implemented
+    assert(pix_fmt == PIX_NORMAL);
+    if (store_host.ptr != nullptr) {
+        store_host.ptr = std::shared_ptr<void>(
+                (uint8_t *) store_host.row_start(row) + col * elem_bytes(),
+                [p = store_host.ptr](void *) {});
+    }
+    if (store_cuda.ptr != nullptr) {
+        store_cuda.ptr = std::shared_ptr<void>(
+                (uint8_t *) store_cuda.row_start(row) + col * elem_bytes(),
+                [p = store_cuda.ptr](void *) {});
+    }
+    size = cv::Size(width, height);
+}
+
+void generic_image::impl::type_cast_inplace(int _type) {
+    // bit-cast of other formats are not implemented
+    assert(pix_fmt == PIX_NORMAL);
+    auto _width = width_in_bytes() / CV_ELEM_SIZE(_type);
+    assert(_width * CV_ELEM_SIZE(_type) == width_in_bytes());
+    size.width = _width;
+    type = _type;
+}
+
+void generic_image::impl::host_modified(smart_cuda_stream *stream) {
+    assert(store_host.ptr != nullptr);
+    store_cuda.reset();
+    REC_CREATE(store_host.ptr, stream);
+}
+
+void generic_image::impl::cuda_modified(smart_cuda_stream *stream) {
+    assert(store_cuda.ptr != nullptr);
+    store_host.reset();
+    REC_CREATE(store_cuda.ptr, stream);
+}
+
+std::optional<generic_image::meta_value_type>
+generic_image::impl::get_meta(meta_key_type key) {
+    if (auto iter = meta_pool->find(key); iter != meta_pool->end()) {
+        return iter->second;
+    } else {
+        return {};
+    }
+}
+
+void generic_image::impl::set_meta(meta_key_type key, meta_value_type val) {
+    if (auto iter = meta_pool->find(key); iter != meta_pool->end()) {
+        iter->second = val;
+    } else {
+        meta_pool->emplace(key, val);
+    }
+}
+
+generic_image::pointer generic_image::create(create_config conf) {
+    auto ret = std::make_shared<generic_image>();
+    ret->pimpl = std::make_unique<impl>(conf);
+    ret->pimpl->q_this = ret.get();
+    return ret;
+}
+
+generic_image::pointer generic_image::create(cv::Size size, int type, pixel_format_enum pixel) {
+    auto conf = create_config{
+            .size = size, .type = type, .pixel = pixel,
+    };
+    return create(conf);
+}
+
+cv::Size generic_image::size() const {
+    return pimpl->display_size();
+}
+
+size_t generic_image::size_in_bytes() const {
+    return pimpl->size_in_bytes();
+}
+
+size_t generic_image::width_in_bytes() const {
+    return pimpl->width_in_bytes();
+}
+
+int generic_image::cv_type() const {
+    return pimpl->type;
+}
+
+pixel_format_enum generic_image::pixel_format() const {
+    return pimpl->pix_fmt;
+}
+
+image_memory generic_image::memory(memory_location loc,
+                                   smart_cuda_stream *stream) {
+    return pimpl->get_memory(loc, stream);
+}
+
+cv::Mat generic_image::cv_mat(smart_cuda_stream *stream) {
+    return pimpl->get_cv_mat(stream);
+}
+
+cv::cuda::GpuMat generic_image::cv_gpumat(smart_cuda_stream *stream) {
+    return pimpl->get_cv_gpumat(stream);
+}
+
+template<typename T>
+image_type_v2<T> generic_image::cuda(smart_cuda_stream *stream) {
+    return pimpl->get_image_type_v2<T>(stream);
+}
+
+// @formatter:off
+template image_type_v2<uchar1> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar2> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar3> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<uchar4> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<ushort1> generic_image::cuda(smart_cuda_stream *stream);
+template image_type_v2<float1> generic_image::cuda(smart_cuda_stream *stream);
+// @formatter:on
+
+template<typename T>
+std::shared_ptr<smart_image<T>> generic_image::v1() const {
+    return pimpl->get_image_v1<T>();
+}
+
+// @formatter:off
+template std::shared_ptr<smart_image<uchar1>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar2>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar3>> generic_image::v1() const;
+template std::shared_ptr<smart_image<uchar4>> generic_image::v1() const;
+template std::shared_ptr<smart_image<ushort1>> generic_image::v1() const;
+template std::shared_ptr<smart_image<float1>> generic_image::v1() const;
+// @formatter:on
+
+generic_image::pointer generic_image::shallow_clone() const {
+    auto ret = std::make_shared<generic_image>();
+    ret->pimpl = std::make_unique<impl>(*pimpl);
+    ret->pimpl->q_this = ret.get();
+    return ret;
+}
+
+generic_image::pointer generic_image::sub_image(int row, int col, int width, int height) const {
+    auto ret = shallow_clone();
+    ret->pimpl->sub_image_inplace(row, col, width, height);
+    return ret;
+}
+
+generic_image::pointer generic_image::bit_cast(int type) {
+    auto ret = shallow_clone();
+    ret->pimpl->type_cast_inplace(type);
+    return ret;
+}
+
+void generic_image::host_modified(smart_cuda_stream *stream) {
+    pimpl->host_modified(stream);
+}
+
+void generic_image::cuda_modified(smart_cuda_stream *stream) {
+    pimpl->cuda_modified(stream);
+}
+
+std::optional<generic_image::meta_value_type>
+generic_image::get_meta(meta_key_type key) {
+    return pimpl->get_meta(key);
+}
+
+void generic_image::set_meta(meta_key_type key, meta_value_type val) {
+    pimpl->set_meta(key, val);
+}
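
The view and metadata semantics implemented above, as a short sketch (names and values are illustrative): sub_image and bit_cast return shallow clones that share the underlying storage, and metadata is keyed by std::hash values.

    #include "core/image_utility_v2.h"

    #include <cassert>
    #include <functional>
    #include <string_view>

    void views_and_meta_demo() {
        auto full = generic_image::create({1280, 720}, CV_8UC3);

        auto roi = full->sub_image(100, 200, 640, 360); // 640x360 window at row 100, col 200
        auto raw = roi->bit_cast(CV_8UC1);              // same bytes, width becomes 640 * 3

        const auto key = std::hash<std::string_view>{}("frame_id"); // hypothetical key
        full->set_meta(key, 42);
        assert(full->get_meta(key).value() == 42);
    }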

+ 93 - 0
src/core/impl/image_utility_v2_impl.h

@@ -3,4 +3,97 @@
 
 #include "core/image_utility_v2.h"
 
+#include <boost/container/flat_map.hpp>
+#include <boost/container/static_vector.hpp>
+
+namespace image_utility_impl {
+
+    inline size_t nv12_display_height(size_t height) {
+        assert(height % 3 == 0);
+        return height / 3 * 2;
+    }
+
+    inline size_t nv12_storage_height(size_t height) {
+        assert(height % 2 == 0);
+        return height / 2 * 3;
+    }
+
+    std::type_index cv_type_id(int type);
+
+}
+
+using namespace image_utility_impl;
+
+struct generic_image::impl {
+
+    generic_image *q_this = nullptr;
+
+    static constexpr auto max_meta_count = 16;
+    using meta_pool_type =
+            boost::container::flat_map<meta_key_type, meta_value_type, std::less<>,
+                    boost::container::static_vector<std::pair<meta_key_type, meta_value_type>, max_meta_count>>;
+    std::shared_ptr<meta_pool_type> meta_pool =
+            std::make_shared<meta_pool_type>();
+
+    struct storage_info {
+        std::shared_ptr<void> ptr;
+        size_t pitch;
+
+        void *row_start(size_t row);
+
+        void reset();
+    };
+
+    storage_info store_host;
+    storage_info store_cuda;
+
+    cv::Size size; // storage size
+    int type;
+    pixel_format_enum pix_fmt;
+
+    explicit impl(create_config conf);
+
+    cv::Size display_size() const;
+
+    size_t elem_bytes() const;
+
+    size_t size_in_bytes() const;
+
+    size_t width_in_bytes() const;
+
+    // implies the caller wants to use the host copy on this stream
+    void create_host(smart_cuda_stream *stream);
+
+    // implies the caller wants to use the CUDA copy on this stream
+    void create_cuda(smart_cuda_stream *stream);
+
+    image_memory get_memory(memory_location loc,
+                            smart_cuda_stream *stream);
+
+    cv::Mat get_cv_mat(smart_cuda_stream *stream);
+
+    cv::cuda::GpuMat get_cv_gpumat(smart_cuda_stream *stream);
+
+    template<typename T>
+    image_type_v2<T> get_image_type_v2(smart_cuda_stream *stream);
+
+    template<typename T>
+    std::shared_ptr<smart_image<T>> get_image_v1() const;
+
+    // intended to be called right after copying impl (see shallow_clone)
+    void sub_image_inplace(int row = 0, int col = 0,
+                           int width = -1, int height = -1);
+
+    void type_cast_inplace(int type);
+
+    void host_modified(smart_cuda_stream *stream);
+
+    void cuda_modified(smart_cuda_stream *stream);
+
+    std::optional<meta_value_type> get_meta(meta_key_type key);
+
+    void set_meta(meta_key_type key, meta_value_type val);
+
+};
+
 #endif //DEPTHGUIDE_IMAGE_UTILITY_V2_IMPL_H
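
For reference, the NV12 helpers above just convert between the displayed luma height and the packed luma-plus-interleaved-UV storage height; a quick check with an illustrative 1080-row frame:

    #include "core/impl/image_utility_v2_impl.h"

    #include <cassert>

    void nv12_height_check() {
        // 1080 luma rows plus 540 interleaved-UV rows are stored as 1620 rows.
        assert(image_utility_impl::nv12_storage_height(1080) == 1620);
        assert(image_utility_impl::nv12_display_height(1620) == 1080);
    }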

+ 48 - 58
src/core/impl/memory_pool.cpp

@@ -31,54 +31,29 @@ void *memory_pool::impl::try_reuse_host(size_t count) {
     return mem_info.ptr;
 }
 
-void *memory_pool::impl::try_reuse_cuda_linear(size_t count) {
-    auto iter = reuse_cuda_linear_pool.lower_bound(count);
-    if (iter == reuse_cuda_linear_pool.end()) [[unlikely]] return nullptr;
+void *memory_pool::impl::try_reuse_cuda(size_t count) {
+    auto iter = reuse_cuda_pool.lower_bound(count);
+    if (iter == reuse_cuda_pool.end()) [[unlikely]] return nullptr;
     auto mem_info = iter->second;
     if (mem_info.count * reuse_threshold > count) [[unlikely]] return nullptr;
-    reuse_cuda_linear_pool.erase(iter);
+    reuse_cuda_pool.erase(iter);
     reg_allocate(mem_info);
     return mem_info.ptr;
 }
 
-void *memory_pool::impl::try_reuse_cuda_pitch(size_t width, size_t rows, size_t *pitch) {
-    auto iter = std::ranges::find_if(
-            reuse_cuda_pitch_pool,
-            [=](mem_info_type info) {
-                return info.pitch >= width && info.rows >= rows;
-            }
-    );
-    if (iter == reuse_cuda_pitch_pool.end()) [[unlikely]] return nullptr;
-    auto mem_info = *iter;
-    if (mem_info.count * reuse_threshold > width * rows) [[unlikely]] return nullptr;
-    reuse_cuda_pitch_pool.erase(iter);
-    reg_allocate(mem_info);
-    *pitch = mem_info.pitch;
-    return mem_info.ptr;
-}
-
 void *memory_pool::impl::direct_allocate_host(size_t count) {
     auto ptr = ::malloc(count);
     reg_allocate({.ptr = ptr, .loc = MEM_HOST, .lay = MEM_LINEAR, .count = count});
     return ptr;
 }
 
-void *memory_pool::impl::direct_allocate_cuda_linear(size_t count) {
+void *memory_pool::impl::direct_allocate_cuda(size_t count) {
     void *ptr = nullptr;
     CUDA_API_CHECK(cudaMalloc(&ptr, count));
     reg_allocate({.ptr = ptr, .loc = MEM_CUDA, .lay = MEM_LINEAR, .count = count});
     return ptr;
 }
 
-void *memory_pool::impl::direct_allocate_cuda_pitch(
-        size_t width, size_t rows, size_t *pitch) {
-    void *ptr = nullptr;
-    CUDA_API_CHECK(cudaMallocPitch(&ptr, pitch, width, rows));
-    reg_allocate({.ptr = ptr, .loc = MEM_CUDA, .lay = MEM_PITCH,
-                         .count = *pitch * rows, .pitch = *pitch, .rows = rows});
-    return ptr;
-}
-
 void *memory_pool::impl::allocate_host(size_t count) {
     if (auto ptr = try_reuse_host(count);
             ptr != nullptr) [[likely]] {
@@ -88,14 +63,15 @@ void *memory_pool::impl::allocate_host(size_t count) {
 }
 
 void *memory_pool::impl::allocate_cuda(size_t count) {
-    if (auto ptr = try_reuse_cuda_linear(count);
+    if (auto ptr = try_reuse_cuda(count);
             ptr != nullptr) [[likely]] {
         return ptr;
     }
-    return direct_allocate_cuda_linear(count);
+    return direct_allocate_cuda(count);
 }
 
 void *memory_pool::impl::allocate(size_t count, memory_location mem_loc) {
+    auto guard = std::lock_guard(mu);
     switch (mem_loc) {
         case MEM_HOST: {
             return allocate_host(count);
@@ -107,29 +83,41 @@ void *memory_pool::impl::allocate(size_t count, memory_location mem_loc) {
     RET_ERROR_P;
 }
 
-void *memory_pool::impl::allocate_pitch_cuda(size_t width, size_t rows, size_t *pitch) {
-    if (auto ptr = try_reuse_cuda_pitch(width, rows, pitch);
-            ptr != nullptr) [[likely]] {
-        return ptr;
-    }
-    return direct_allocate_cuda_pitch(width, rows, pitch);
-}
-
 void *memory_pool::impl::allocate_pitch(
         size_t width, size_t rows, memory_location mem_loc, size_t *pitch) {
+    auto guard = std::lock_guard(mu);
     switch (mem_loc) {
         case MEM_HOST: {
             *pitch = width;
             return allocate_host(width * rows);
         }
         case MEM_CUDA: {
-            return allocate_pitch_cuda(width, rows, pitch);
+            if (width & 0x1F) { // round up to the next multiple of 32
+                *pitch = (width + 0x1F) & ~size_t(0x1F);
+            } else {
+                *pitch = width;
+            }
+            return allocate_cuda(*pitch * rows);
         }
     }
     RET_ERROR_P;
 }
 
+cudaEvent_t memory_pool::impl::get_event(void *ptr) {
+    auto guard = std::lock_guard(mu);
+    auto iter = malloc_pool.lower_bound(ptr);
+    assert(iter != malloc_pool.end());
+    auto &mem_info = iter->second;
+    assert((char *) ptr - (char *) mem_info.ptr < mem_info.count);
+    if (mem_info.event == nullptr) [[unlikely]] {
+        CUDA_API_CHECK(cudaEventCreateWithFlags(&mem_info.event, cudaEventDisableTiming));
+    }
+    assert(mem_info.event != nullptr);
+    return mem_info.event;
+}
+
 void memory_pool::impl::deallocate(void *ptr) {
+    auto guard = std::lock_guard(mu);
     auto iter = malloc_pool.find(ptr);
     if (iter == malloc_pool.end()) {
         SPDLOG_WARN("Deallocate unknown pointer: {}.", fmt::ptr(ptr));
@@ -143,12 +131,7 @@ void memory_pool::impl::deallocate(void *ptr) {
             return;
         }
         case MEM_CUDA: {
-            if (mem_info.lay == MEM_LINEAR) {
-                reuse_cuda_linear_pool.emplace(mem_info.count, mem_info);
-            } else {
-                assert(mem_info.lay == MEM_PITCH);
-                reuse_cuda_pitch_pool.push_back(mem_info);
-            }
+            reuse_cuda_pool.emplace(mem_info.count, mem_info);
             return;
         }
     }
@@ -170,40 +153,47 @@ void memory_pool::impl::system_deallocate(mem_info_type mem_info) {
 }
 
 void memory_pool::impl::purge() {
+    auto guard = std::lock_guard(mu);
     for (auto item: reuse_host_pool | std::views::values) {
         system_deallocate(item);
     }
     reuse_host_pool.clear();
 
-    for (auto item: reuse_cuda_linear_pool | std::views::values) {
-        system_deallocate(item);
-    }
-    reuse_cuda_linear_pool.clear();
-
-    for (auto item: reuse_cuda_pitch_pool) {
+    for (auto item: reuse_cuda_pool | std::views::values) {
         system_deallocate(item);
     }
-    reuse_cuda_pitch_pool.clear();
+    reuse_cuda_pool.clear();
 }
 
 void *memory_pool::allocate_impl(size_t count, memory_location mem_loc) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->allocate(count, mem_loc);
 }
 
 void *memory_pool::allocate_pitch_impl(
         size_t width, size_t rows, memory_location mem_loc, size_t *pitch) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->allocate_pitch(width, rows, mem_loc, pitch);
 }
 
+void memory_pool::record_create(void *ptr, smart_cuda_stream *stream) {
+    if (stream == nullptr) return;
+    auto event = pimpl->get_event(ptr);
+    CUDA_API_CHECK(cudaEventRecord(event, stream->cuda));
+}
+
+void memory_pool::sync_create(void *ptr, smart_cuda_stream *stream) {
+    auto event = pimpl->get_event(ptr);
+    if (stream == nullptr) {
+        CUDA_API_CHECK(cudaEventSynchronize(event));
+    } else {
+        CUDA_API_CHECK(cudaStreamWaitEvent(stream->cuda, event));
+    }
+}
+
 void memory_pool::deallocate(void *ptr) {
-    auto guard = std::lock_guard(pimpl->mu);
     return pimpl->deallocate(ptr);
 }
 
 void memory_pool::purge() {
-    auto guard = std::lock_guard(pimpl->mu);
     pimpl->purge();
 }
 

+ 10 - 19
src/core/impl/memory_pool_impl.h

@@ -9,14 +9,13 @@
 #include <map>
 #include <mutex>
 #include <thread>
-#include <unordered_map>
 
 struct memory_pool::impl {
 
     // reuse_length * reuse_threshold >= request_length
     static constexpr auto reuse_threshold = 0.5;
 
-    enum memory_layout {
+    enum memory_layout : uint8_t {
         MEM_LINEAR, MEM_PITCH
     };
 
@@ -28,19 +27,15 @@ struct memory_pool::impl {
         // for MEM_LINEAR and MEM_PITCH
         size_t count;
 
-        // for MEM_PITCH
-        size_t pitch, rows;
+        cudaEvent_t event = nullptr;
     };
 
-    using malloc_pool_type = std::unordered_map<void *, mem_info_type>;
+    // descending order, so lower_bound(ptr) finds the allocation block containing ptr
+    using malloc_pool_type = std::map<void *, mem_info_type, std::greater<>>;
     malloc_pool_type malloc_pool;
 
-    using reuse_host_pool_type = std::multimap<size_t, mem_info_type>;
-    using reuse_cuda_linear_pool_type = std::multimap<size_t, mem_info_type>;
-    using reuse_cuda_pitch_pool_type = std::list<mem_info_type>; // TODO: simulate pitched allocation manually
-    reuse_host_pool_type reuse_host_pool;
-    reuse_cuda_linear_pool_type reuse_cuda_linear_pool;
-    reuse_cuda_pitch_pool_type reuse_cuda_pitch_pool;
+    using reuse_pool_type = std::multimap<size_t, mem_info_type>;
+    reuse_pool_type reuse_host_pool;
+    reuse_pool_type reuse_cuda_pool;
 
     std::mutex mu;
 
@@ -48,15 +43,11 @@ struct memory_pool::impl {
 
     void *try_reuse_host(size_t count);
 
-    void *try_reuse_cuda_linear(size_t count);
-
-    void *try_reuse_cuda_pitch(size_t width, size_t rows, size_t *pitch);
+    void *try_reuse_cuda(size_t count);
 
     void *direct_allocate_host(size_t count);
 
-    void *direct_allocate_cuda_linear(size_t count);
-
-    void *direct_allocate_cuda_pitch(size_t width, size_t rows, size_t *pitch);
+    void *direct_allocate_cuda(size_t count);
 
     void *allocate_host(size_t count);
 
@@ -64,10 +55,10 @@ struct memory_pool::impl {
 
     void *allocate(size_t count, memory_location mem_loc);
 
-    void *allocate_pitch_cuda(size_t width, size_t rows, size_t *pitch);
-
     void *allocate_pitch(size_t width, size_t rows, memory_location mem_loc, size_t *pitch);
 
+    cudaEvent_t get_event(void *ptr);
+
     static void system_deallocate(mem_info_type mem_info);
 
     void deallocate(void *ptr);

+ 26 - 0
src/core/memory_pool.h

@@ -1,6 +1,8 @@
 #ifndef DEPTHGUIDE_MEMORY_POOL_H
 #define DEPTHGUIDE_MEMORY_POOL_H
 
+#include "core/cuda_helper.hpp"
+
 #include <cassert>
 #include <memory>
 
@@ -39,6 +41,21 @@ public:
         return as_shared(allocate_pitch<T>(cols, rows, mem_loc, pitch));
     }
 
+    // does nothing if stream == nullptr.
+    void record_create(void *ptr, smart_cuda_stream *stream);
+
+    void sync_create(void *ptr, smart_cuda_stream *stream = nullptr);
+
+    template<typename T>
+    void record_create(const std::shared_ptr<T> &ptr, smart_cuda_stream *stream) {
+        record_create(ptr.get(), stream);
+    }
+
+    template<typename T>
+    void sync_create(const std::shared_ptr<T> &ptr, smart_cuda_stream *stream = nullptr) {
+        sync_create(ptr.get(), stream);
+    }
+
     void deallocate(void *ptr);
 
     // free all unused memory
@@ -75,4 +92,13 @@ extern memory_pool global_mp;
 #define ALLOC_PITCH_SHARED(type, cols, rows, loc, pitch) \
     global_mp.allocate_pitch_shared<type>(cols, rows, loc, pitch)
 
+#define REC_CREATE(ptr, stream) \
+    global_mp.record_create(ptr, stream)
+
+#define SYNC_CREATE(ptr, stream) \
+    global_mp.sync_create(ptr, stream)
+
+#define WAIT_CREATE(ptr) \
+    global_mp.sync_create(ptr)
+
 #endif //DEPTHGUIDE_MEMORY_POOL_H
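
A sketch of the record / sync pattern the new macros enable (the producer / consumer split and the sizes are illustrative): the stream that fills a buffer records an event on it, any other stream waits on that event before touching the memory, and the host blocks when no stream is given.

    #include "core/memory_pool.h"

    void producer_consumer(smart_cuda_stream *producer, smart_cuda_stream *consumer) {
        size_t pitch = 0;
        auto buf = ALLOC_PITCH_SHARED(uint8_t, 1920 * 3, 1080, MEM_CUDA, &pitch);

        // ... enqueue work on producer->cuda that writes buf ...
        REC_CREATE(buf, producer);   // record "buf is ready" on the producer stream

        SYNC_CREATE(buf, consumer);  // make the consumer stream wait for that event
        // ... enqueue work on consumer->cuda that reads buf ...

        WAIT_CREATE(buf);            // or block the host until buf is ready
    }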

+ 3 - 0
src/impl/apps/app_selector/app_selector.cpp

@@ -1,4 +1,5 @@
 #include "app_selector.h"
+#include "impl/apps/debug/app_debug.h"
 #include "impl/apps/depth_guide/depth_guide.h"
 #include "impl/apps/remote_ar/remote_ar.h"
 #include "impl/apps/tiny_player/tiny_player.h"
@@ -52,6 +53,8 @@ void app_selector::load_app(const std::string &conf_path) {
         app = std::make_unique<app_remote_ar>(create_conf);
     } else if (app_name == "tiny_player") {
         app = std::make_unique<app_tiny_player>(create_conf);
+    } else if (app_name == "debug") {
+        app = std::make_unique<app_debug>(create_conf);
     }
 
     // change window title

+ 52 - 0
src/impl/apps/debug/app_debug.cpp

@@ -0,0 +1,52 @@
+#include "app_debug.h"
+#include "GLFW/glfw3.h"
+
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/cudastereo.hpp>
+#include <opencv2/cudaimgproc.hpp>
+
+app_debug::app_debug(const create_config &conf) {
+
+    auto left_path = "/home/tpx/project/DepthGuide/cmake-build-debug/Left.png";
+    auto right_path = "/home/tpx/project/DepthGuide/cmake-build-debug/Right.png";
+
+    auto left_img = cv::imread(left_path);
+    auto right_img = cv::imread(right_path);
+
+    auto left_img_cuda = cv::cuda::GpuMat();
+    auto right_img_cuda = cv::cuda::GpuMat();
+
+    left_img_cuda.upload(left_img);
+    right_img_cuda.upload(right_img);
+
+    // convert to gray (cv::imread loads BGR)
+    cv::cuda::cvtColor(left_img_cuda, left_img_cuda, cv::COLOR_BGR2GRAY);
+    cv::cuda::cvtColor(right_img_cuda, right_img_cuda, cv::COLOR_BGR2GRAY);
+
+    auto disparity_cuda = cv::cuda::GpuMat();
+    auto stereo = cv::cuda::createStereoSGM();
+    stereo->setNumDisparities(128);
+//    stereo->setMode(cv::StereoSGBM::MODE_HH);
+    stereo->compute(left_img_cuda, right_img_cuda, disparity_cuda);
+
+    auto filter = cv::cuda::createDisparityBilateralFilter();
+    filter->setNumDisparities(128);
+    filter->setRadius(5);
+    filter->setNumIters(3);
+    filter->apply(disparity_cuda, left_img_cuda, disparity_cuda);
+
+    auto disparity = cv::Mat();
+    disparity_cuda.download(disparity);
+
+    double min_val, max_val;
+    cv::minMaxLoc(disparity, &min_val, &max_val);
+    SPDLOG_INFO("Min: {}, Max: {}", min_val, max_val);
+
+    auto tmp = cv::Mat();
+    disparity.convertTo(tmp, CV_32FC1);
+    tmp = (tmp - min_val) / (max_val - min_val) * 255.f;
+    tmp.convertTo(disparity, CV_8UC1);
+    cv::imwrite("disparity.png", disparity);
+
+    glfwSetWindowShouldClose(glfwGetCurrentContext(), true);
+}

+ 20 - 0
src/impl/apps/debug/app_debug.h

@@ -0,0 +1,20 @@
+#ifndef DEPTHGUIDE_APP_DEBUG_H
+#define DEPTHGUIDE_APP_DEBUG_H
+
+#include "impl/app_base.h"
+
+class app_debug : public app_base {
+public:
+    explicit app_debug(const create_config &conf);
+
+    ~app_debug() override = default;
+
+    const char *window_name() override { return "You should not see this."; }
+
+    void show_ui() override {}
+
+    void render_background() override {}
+};
+
+
+#endif //DEPTHGUIDE_APP_DEBUG_H