#ifndef DEPTHGUIDE_IMAGE_UTILITY_HPP #define DEPTHGUIDE_IMAGE_UTILITY_HPP #include "cuda_helper.hpp" #include "memory_pool.h" #include #include enum image_pixel_type { PIX_RGBA, PIX_RGB }; template constexpr inline int get_cv_type() { // @formatter:off if constexpr (std::is_same_v) { return CV_8UC1; } if constexpr (std::is_same_v) { return CV_8UC3; } if constexpr (std::is_same_v) { return CV_16UC1; } if constexpr (std::is_same_v) { return CV_32FC1; } // @formatter:on return 0; } template constexpr inline auto binary_merge(T1 a, T2 b) { constexpr auto bits_a = sizeof(T1) * 8; constexpr auto bits_b = sizeof(T2) * 8; using ret_type = boost::uint_t::least; return (ret_type(a) << bits_b) | b; } enum mem_copy_kind { COPY_HOST_TO_HOST = binary_merge(MEM_HOST, MEM_HOST), COPY_HOST_TO_CUDA = binary_merge(MEM_HOST, MEM_CUDA), COPY_CUDA_TO_HOST = binary_merge(MEM_CUDA, MEM_HOST), COPY_CUDA_TO_CUDA = binary_merge(MEM_CUDA, MEM_CUDA) }; inline cudaMemcpyKind get_copy_kind(memory_location src, memory_location dst) { auto flag = binary_merge(src, dst); switch (flag) { // @formatter:off case COPY_HOST_TO_HOST: { return cudaMemcpyHostToHost; } case COPY_HOST_TO_CUDA: { return cudaMemcpyHostToDevice; } case COPY_CUDA_TO_HOST: { return cudaMemcpyDeviceToHost; } case COPY_CUDA_TO_CUDA: { return cudaMemcpyDeviceToDevice; } // @formatter:on default: { RET_ERROR_E; } } } // calculate height of a nv12 image inline auto img_height_to_nv12(auto h) { assert(h % 2 == 0); return h / 2 * 3; } inline auto img_size_to_nv12(cv::Size size) { return cv::Size(size.width, img_height_to_nv12(size.height)); } // calculate image height from a nv12 image // inverse of img_height_to_nv12() inline auto nv12_height_to_img(auto h) { assert(h % 3 == 0); return h / 3 * 2; } inline auto nv12_size_to_img(cv::Size size) { return cv::Size(size.width, nv12_height_to_img(size.height)); } #define ALLOC_IMG(type, size, loc, pitch) \ ALLOC_PITCH_SHARED(type, size.width, size.height, loc, pitch) struct image_mem_info { std::shared_ptr ptr; memory_location loc = MEM_HOST; size_t width = 0, pitch = 0; // in bytes size_t height = 0; }; // mutable image storage type template struct image_info_type { using pix_type = T; using this_type = image_info_type; std::shared_ptr ptr; memory_location loc = MEM_HOST; cv::Size size = {}; size_t pitch = 0; // start pointer of specific pixel void *start_ptr(int row = 0, int col = 0) const { return ptr.get() + row * pitch + col * sizeof(T); } size_t size_in_bytes() const { return sizeof(T) * size.area(); } size_t width_in_bytes() const { return sizeof(T) * size.width; } bool is_continuous() const { return sizeof(T) * size.width == pitch; } this_type sub_image(int row = 0, int col = 0, int width = -1, int height = -1) const { if (width == -1) { width = size.width - col; } if (height == -1) { height = size.height - row; } auto ret_size = cv::Size(width, height); auto ret_ptr = (row == 0 && col == 0) ? ptr : std::shared_ptr((T *) start_ptr(row, col), [p = ptr](void *) {}); return {ret_ptr, loc, ret_size, pitch}; } template image_info_type cast() const { auto ret_width = size.width * sizeof(T) / sizeof(U); assert(size.width * sizeof(T) == ret_width * sizeof(U)); auto ret_size = cv::Size(ret_width, size.height); auto ret_ptr = std::reinterpret_pointer_cast(ptr); return {ret_ptr, loc, ret_size, pitch}; } this_type flatten(smart_cuda_stream *stream) const { if (is_continuous()) return *this; assert(loc == MEM_CUDA); // image in host is always continuous return flatten_cuda(stream); } this_type unflatten(smart_cuda_stream *stream) const { assert(is_continuous()); if (loc == MEM_HOST) return *this; // image in host does not to be pitched return unflatten_cuda(stream); } // use after create, force memory copy void fill_from_async(const this_type &o, smart_cuda_stream *stream) { assert(size == o.size); assert(ptr != o.ptr); auto copy_kind = get_copy_kind(o.loc, loc); CUDA_API_CHECK(cudaMemcpy2DAsync( start_ptr(), pitch, o.start_ptr(), o.pitch, width_in_bytes(), size.height, copy_kind, stream->cuda)); } // use after create, force memory copy void fill_from_async(void *data, size_t src_pitch, memory_location src_loc, smart_cuda_stream *stream) { if (src_pitch == -1) { src_pitch = width_in_bytes(); } auto copy_kind = get_copy_kind(src_loc, loc); CUDA_API_CHECK(cudaMemcpy2DAsync( start_ptr(), pitch, data, src_pitch, width_in_bytes(), size.height, copy_kind, stream->cuda)); } void fill_from_async(const cv::cuda::GpuMat &mat, smart_cuda_stream *stream) { fill_from_async(mat.data, mat.step, MEM_CUDA, stream); } // use after create, force memory copy void fill_from(void *data, size_t src_pitch = -1, memory_location src_loc = MEM_HOST) { if (src_pitch == -1) { src_pitch = width_in_bytes(); } auto copy_kind = get_copy_kind(src_loc, loc); CUDA_API_CHECK(cudaMemcpy2D( start_ptr(), pitch, data, src_pitch, width_in_bytes(), size.height, copy_kind)); } cv::Mat as_mat() const { assert(loc == MEM_HOST); return {size, get_cv_type(), ptr.get(), pitch}; } cv::cuda::GpuMat as_gpu_mat() const { assert(loc == MEM_CUDA); return {size, get_cv_type(), ptr.get(), pitch}; } image_mem_info mem_info() const { return {std::static_pointer_cast(ptr), loc, sizeof(T) * (size_t) size.width, pitch, (size_t) size.height}; } void create(cv::Size _size, memory_location _loc) { if (_size == size && _loc == loc) [[likely]] return; loc = _loc; size = _size; ptr = ALLOC_IMG(T, size, loc, &pitch); } private: this_type flatten_cuda(smart_cuda_stream *stream) const { assert(loc == MEM_CUDA); auto ret = this_type(); ret.ptr = ALLOC_SHARED(T, size.area(), MEM_CUDA); ret.loc = MEM_CUDA; ret.size = size; ret.pitch = width_in_bytes(); ret.fill_from_async(*this, stream); return ret; } this_type unflatten_cuda(smart_cuda_stream *stream) const { static constexpr auto pitch_align = 32; if ((pitch % pitch_align) == 0) return *this; auto ret = this_type(); ret.create(size, MEM_CUDA); ret.fill_from_async(*this, stream); return ret; } }; template auto create_image_info(cv::Size size, memory_location mem_loc) { auto info = image_info_type(); info.ptr = ALLOC_IMG(T, size, mem_loc, &info.pitch); info.loc = mem_loc; info.size = size; return info; } template auto create_image_info(const cv::Mat &img) { assert(get_cv_type() == img.type()); auto info = image_info_type(); info.ptr = std::shared_ptr( // extend cv::Mat's lifetime (T *) img.data, [_img = img](void *) {}); info.loc = MEM_HOST; info.size = img.size(); info.pitch = img.step; return info; } // read-only image type to decrease host-gpu memory copy template class smart_image { public: explicit smart_image(image_info_type info) { assert(info.ptr != nullptr); if (info.loc == MEM_HOST) { host_info = info; } else { assert(info.loc == MEM_CUDA); cuda_info = info; } } image_info_type as_host_info(smart_cuda_stream *stream = nullptr) { if (host_info.ptr == nullptr) { assert(cuda_info.ptr != nullptr); host_info = create_image_info(cuda_info.size, MEM_HOST); CUDA_API_CHECK(cudaMemcpy2DAsync(host_info.ptr.get(), host_info.pitch, cuda_info.ptr.get(), cuda_info.pitch, cuda_info.size.width * sizeof(T), cuda_info.size.height, cudaMemcpyDeviceToHost, stream->cuda)); } assert(host_info.ptr != nullptr); return host_info; } image_info_type as_cuda_info(smart_cuda_stream *stream = nullptr) { if (cuda_info.ptr == nullptr) { assert(host_info.ptr != nullptr); cuda_info = create_image_info(host_info.size, MEM_CUDA); CUDA_API_CHECK(cudaMemcpy2DAsync(cuda_info.ptr.get(), cuda_info.pitch, host_info.ptr.get(), host_info.pitch, host_info.size.width * sizeof(T), host_info.size.height, cudaMemcpyHostToDevice, stream->cuda)); } assert(cuda_info.ptr != nullptr); return cuda_info; } image_info_type as_info() { // TODO: select prefer location if (cuda_info.ptr != nullptr) { return cuda_info; } assert(host_info.ptr != nullptr); return host_info; } cv::Mat as_host(smart_cuda_stream *stream = nullptr) { return as_host_info(stream).as_mat(); } cv::cuda::GpuMat as_cuda(smart_cuda_stream *stream = nullptr) { return as_cuda_info(stream).as_gpu_mat(); } cv::Size size() const { if (cuda_info.ptr != nullptr) { return cuda_info.size; } assert(host_info.ptr != nullptr); return host_info.size; } private: image_info_type host_info; image_info_type cuda_info; }; template auto create_image(image_info_type info) { return std::make_shared>(info); } using image_info_u8c1 = image_info_type; using image_info_u8c2 = image_info_type; using image_info_u8c3 = image_info_type; using image_info_u8c4 = image_info_type; using image_u8c1 = std::shared_ptr>; using image_u8c2 = std::shared_ptr>; using image_u8c3 = std::shared_ptr>; using image_u8c4 = std::shared_ptr>; using image_u16c1 = std::shared_ptr>; using image_f32c1 = std::shared_ptr>; #endif //DEPTHGUIDE_IMAGE_UTILITY_HPP