image_utility.hpp

#ifndef DEPTHGUIDE_IMAGE_UTILITY_HPP
#define DEPTHGUIDE_IMAGE_UTILITY_HPP

#include "cuda_helper.hpp"
#include "memory_pool.h"

#include <cassert>
#include <memory>

#include <boost/integer.hpp>
#include <cuda_runtime.h>
#include <opencv2/core/cuda.hpp>
#include <opencv2/core/types.hpp>
enum image_pixel_type {
    PIX_RGBA,
    PIX_RGB
};
template<typename T>
constexpr inline int get_cv_type() {
    // @formatter:off
    if constexpr (std::is_same_v<T, uchar1>)  { return CV_8UC1; }
    if constexpr (std::is_same_v<T, uchar2>)  { return CV_8UC2; }
    if constexpr (std::is_same_v<T, uchar3>)  { return CV_8UC3; }
    if constexpr (std::is_same_v<T, uchar4>)  { return CV_8UC4; }
    if constexpr (std::is_same_v<T, ushort1>) { return CV_16UC1; }
    if constexpr (std::is_same_v<T, float1>)  { return CV_32FC1; }
    // @formatter:on
    return 0; // caution: 0 == CV_8UC1, so unsupported pixel types alias it
}
template<typename T1, typename T2>
constexpr inline auto binary_merge(T1 a, T2 b) {
    constexpr auto bits_a = sizeof(T1) * 8;
    constexpr auto bits_b = sizeof(T2) * 8;
    using ret_type = typename boost::uint_t<bits_a + bits_b>::least;
    return (ret_type(a) << bits_b) | b;
}
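// Worked example (illustrative): with two 8-bit inputs,
// binary_merge(uint8_t(0x12), uint8_t(0x34)) selects the least 16-bit
// unsigned type and packs the values as (0x12 << 8) | 0x34 == 0x1234.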
enum mem_copy_kind {
    COPY_HOST_TO_HOST = binary_merge(MEM_HOST, MEM_HOST),
    COPY_HOST_TO_CUDA = binary_merge(MEM_HOST, MEM_CUDA),
    COPY_CUDA_TO_HOST = binary_merge(MEM_CUDA, MEM_HOST),
    COPY_CUDA_TO_CUDA = binary_merge(MEM_CUDA, MEM_CUDA)
};
inline cudaMemcpyKind get_copy_kind(memory_location src,
                                    memory_location dst) {
    auto flag = binary_merge(src, dst);
    switch (flag) {
        // @formatter:off
        case COPY_HOST_TO_HOST: { return cudaMemcpyHostToHost; }
        case COPY_HOST_TO_CUDA: { return cudaMemcpyHostToDevice; }
        case COPY_CUDA_TO_HOST: { return cudaMemcpyDeviceToHost; }
        case COPY_CUDA_TO_CUDA: { return cudaMemcpyDeviceToDevice; }
        // @formatter:on
        default: {
            RET_ERROR_E;
        }
    }
}
// calculate the buffer height of an NV12 image: the Y plane takes h rows and
// the interleaved UV plane adds h / 2 rows, so the total is h * 3 / 2
inline auto img_height_to_nv12(auto h) {
    assert(h % 2 == 0);
    return h / 2 * 3;
}

inline auto img_size_to_nv12(cv::Size size) {
    return cv::Size(size.width,
                    img_height_to_nv12(size.height));
}

// calculate the image height from an NV12 buffer height;
// inverse of img_height_to_nv12()
inline auto nv12_height_to_img(auto h) {
    assert(h % 3 == 0);
    return h / 3 * 2;
}

inline auto nv12_size_to_img(cv::Size size) {
    return cv::Size(size.width,
                    nv12_height_to_img(size.height));
}
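// Worked example (illustrative): a 1920 x 1080 frame maps to a 1920 x 1620
// NV12 buffer (1080 rows of Y plus 540 rows of interleaved UV), and
// nv12_size_to_img() maps that buffer size back to 1920 x 1080.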
#define ALLOC_IMG(type, size, loc, pitch) \
    ALLOC_PITCH_SHARED(type, size.width, size.height, loc, pitch)

struct image_mem_info {
    std::shared_ptr<void> ptr;
    memory_location loc = MEM_HOST;
    size_t width = 0, pitch = 0; // in bytes
    size_t height = 0;
};
// mutable image storage type
template<typename T>
struct image_info_type {
    using pix_type = T;
    using this_type = image_info_type<T>;

    std::shared_ptr<T> ptr;
    memory_location loc = MEM_HOST;
    cv::Size size = {};
    size_t pitch = 0; // in bytes

    // start pointer of a specific pixel
    void *start_ptr(int row = 0, int col = 0) const {
        // pitch is in bytes, so offset on a byte pointer, not on T *
        return (char *) ptr.get() + row * pitch + col * sizeof(T);
    }

    size_t size_in_bytes() const { return sizeof(T) * size.area(); }
    size_t width_in_bytes() const { return sizeof(T) * size.width; }
    bool is_continuous() const { return sizeof(T) * size.width == pitch; }
    this_type sub_image(int row = 0, int col = 0,
                        int width = -1, int height = -1) const {
        if (width == -1) {
            width = size.width - col;
        }
        if (height == -1) {
            height = size.height - row;
        }
        auto ret_size = cv::Size(width, height);
        // the no-op deleter captures the parent pointer to keep it alive
        auto ret_ptr = (row == 0 && col == 0) ? ptr :
                       std::shared_ptr<T>((T *) start_ptr(row, col), [p = ptr](void *) {});
        return {ret_ptr, loc, ret_size, pitch};
    }
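    // Worked example (illustrative): sub_image(10, 20, 100, 50) views the
    // 100 x 50 region whose top-left pixel sits at row 10, column 20; the
    // view shares the parent's memory and pitch, and no data is copied.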
    template<typename U>
    image_info_type<U> cast() const {
        auto ret_width = size.width * sizeof(T) / sizeof(U);
        assert(size.width * sizeof(T) == ret_width * sizeof(U));
        auto ret_size = cv::Size(ret_width, size.height);
        auto ret_ptr = std::reinterpret_pointer_cast<U>(ptr);
        return {ret_ptr, loc, ret_size, pitch};
    }
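    // Worked example (illustrative): cast<uchar1>() on a 640 x 480 uchar4
    // image yields a 2560 x 480 single-channel view of the same memory,
    // since the row width in bytes (640 * 4 == 2560 * 1) is unchanged.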
    this_type flatten(smart_cuda_stream *stream) const {
        if (is_continuous()) return *this;
        assert(loc == MEM_CUDA); // an image in host memory is always continuous
        return flatten_cuda(stream);
    }

    this_type unflatten(smart_cuda_stream *stream) const {
        assert(is_continuous());
        if (loc == MEM_HOST) return *this; // an image in host memory does not need to be pitched
        return unflatten_cuda(stream);
    }
    // use after create(); forces a memory copy
    void fill_from_async(const this_type &o,
                         smart_cuda_stream *stream) {
        assert(size == o.size);
        assert(ptr != o.ptr);
        auto copy_kind = get_copy_kind(o.loc, loc);
        CUDA_API_CHECK(cudaMemcpy2DAsync(
                start_ptr(), pitch, o.start_ptr(), o.pitch,
                width_in_bytes(), size.height, copy_kind, stream->cuda));
    }

    // use after create(); forces a memory copy
    void fill_from_async(void *data, size_t src_pitch,
                         memory_location src_loc,
                         smart_cuda_stream *stream) {
        if (src_pitch == (size_t) -1) {
            src_pitch = width_in_bytes();
        }
        auto copy_kind = get_copy_kind(src_loc, loc);
        CUDA_API_CHECK(cudaMemcpy2DAsync(
                start_ptr(), pitch, data, src_pitch,
                width_in_bytes(), size.height, copy_kind, stream->cuda));
    }

    void fill_from_async(const cv::cuda::GpuMat &mat,
                         smart_cuda_stream *stream) {
        fill_from_async(mat.data, mat.step, MEM_CUDA, stream);
    }

    // use after create(); forces a memory copy
    void fill_from(void *data, size_t src_pitch = -1,
                   memory_location src_loc = MEM_HOST) {
        if (src_pitch == (size_t) -1) {
            src_pitch = width_in_bytes();
        }
        auto copy_kind = get_copy_kind(src_loc, loc);
        CUDA_API_CHECK(cudaMemcpy2D(
                start_ptr(), pitch, data, src_pitch,
                width_in_bytes(), size.height, copy_kind));
    }
    cv::Mat as_mat() const {
        assert(loc == MEM_HOST);
        return {size, get_cv_type<T>(), ptr.get(), pitch};
    }

    cv::cuda::GpuMat as_gpu_mat() const {
        assert(loc == MEM_CUDA);
        return {size, get_cv_type<T>(), ptr.get(), pitch};
    }

    image_mem_info mem_info() const {
        return {std::static_pointer_cast<void>(ptr),
                loc, sizeof(T) * (size_t) size.width, pitch, (size_t) size.height};
    }
    void create(cv::Size _size, memory_location _loc) {
        if (_size == size && _loc == loc) [[likely]] return;
        loc = _loc;
        size = _size;
        ptr = ALLOC_IMG(T, size, loc, &pitch);
    }
private:
    this_type flatten_cuda(smart_cuda_stream *stream) const {
        assert(loc == MEM_CUDA);
        auto ret = this_type();
        ret.ptr = ALLOC_SHARED(T, size.area(), MEM_CUDA);
        ret.loc = MEM_CUDA;
        ret.size = size;
        ret.pitch = width_in_bytes();
        ret.fill_from_async(*this, stream);
        return ret;
    }

    this_type unflatten_cuda(smart_cuda_stream *stream) const {
        static constexpr auto pitch_align = 32;
        // an already well-aligned pitch needs no re-allocation
        if ((pitch % pitch_align) == 0) return *this;
        auto ret = this_type();
        ret.create(size, MEM_CUDA);
        ret.fill_from_async(*this, stream);
        return ret;
    }
};
template<typename T>
auto create_image_info(cv::Size size, memory_location mem_loc) {
    auto info = image_info_type<T>();
    info.ptr = ALLOC_IMG(T, size, mem_loc, &info.pitch);
    info.loc = mem_loc;
    info.size = size;
    return info;
}
template<typename T>
auto create_image_info(const cv::Mat &img) {
    assert(get_cv_type<T>() == img.type());
    auto info = image_info_type<T>();
    info.ptr = std::shared_ptr<T>( // extend cv::Mat's lifetime
            (T *) img.data, [_img = img](void *) {});
    info.loc = MEM_HOST;
    info.size = img.size();
    info.pitch = img.step;
    return info;
}
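// Illustrative usage (a sketch; `frame` is a hypothetical CV_8UC3 cv::Mat):
//     cv::Mat frame = cv::imread("frame.png");          // 8-bit BGR image
//     auto host = create_image_info<uchar3>(frame);     // wraps, no copy
//     auto gpu  = create_image_info<uchar3>(frame.size(), MEM_CUDA);
//     gpu.fill_from(frame.data, frame.step);            // synchronous upload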
// read-only image wrapper that caches host and CUDA copies
// to avoid redundant host-GPU memory transfers
template<typename T>
class smart_image {
public:
    explicit smart_image(image_info_type<T> info) {
        assert(info.ptr != nullptr);
        if (info.loc == MEM_HOST) {
            host_info = info;
        } else {
            assert(info.loc == MEM_CUDA);
            cuda_info = info;
        }
    }
    // stream must be non-null when a device-to-host copy is required
    image_info_type<T> as_host_info(smart_cuda_stream *stream = nullptr) {
        if (host_info.ptr == nullptr) {
            assert(cuda_info.ptr != nullptr);
            host_info = create_image_info<T>(cuda_info.size, MEM_HOST);
            CUDA_API_CHECK(cudaMemcpy2DAsync(host_info.ptr.get(), host_info.pitch,
                    cuda_info.ptr.get(), cuda_info.pitch,
                    cuda_info.size.width * sizeof(T), cuda_info.size.height,
                    cudaMemcpyDeviceToHost, stream->cuda));
        }
        assert(host_info.ptr != nullptr);
        return host_info;
    }

    // stream must be non-null when a host-to-device copy is required
    image_info_type<T> as_cuda_info(smart_cuda_stream *stream = nullptr) {
        if (cuda_info.ptr == nullptr) {
            assert(host_info.ptr != nullptr);
            cuda_info = create_image_info<T>(host_info.size, MEM_CUDA);
            CUDA_API_CHECK(cudaMemcpy2DAsync(cuda_info.ptr.get(), cuda_info.pitch,
                    host_info.ptr.get(), host_info.pitch,
                    host_info.size.width * sizeof(T), host_info.size.height,
                    cudaMemcpyHostToDevice, stream->cuda));
        }
        assert(cuda_info.ptr != nullptr);
        return cuda_info;
    }
    image_info_type<T> as_info() { // TODO: select preferred location
        if (cuda_info.ptr != nullptr) {
            return cuda_info;
        }
        assert(host_info.ptr != nullptr);
        return host_info;
    }

    cv::Mat as_host(smart_cuda_stream *stream = nullptr) {
        return as_host_info(stream).as_mat();
    }

    cv::cuda::GpuMat as_cuda(smart_cuda_stream *stream = nullptr) {
        return as_cuda_info(stream).as_gpu_mat();
    }

    cv::Size size() const {
        if (cuda_info.ptr != nullptr) {
            return cuda_info.size;
        }
        assert(host_info.ptr != nullptr);
        return host_info.size;
    }

private:
    image_info_type<T> host_info;
    image_info_type<T> cuda_info;
};
template<typename T>
auto create_image(image_info_type<T> info) {
    return std::make_shared<smart_image<T>>(info);
}
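// Illustrative usage (a sketch; `mat` and `stream` are hypothetical, with
// `mat` a CV_8UC3 cv::Mat and `stream` a valid smart_cuda_stream *):
//     auto img = create_image(create_image_info<uchar3>(mat)); // host-only
//     auto gpu = img->as_cuda(stream); // first call uploads, then caches
//     auto cpu = img->as_host(stream); // served from the cached host copy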
using image_info_u8c1 = image_info_type<uchar1>;
using image_info_u8c2 = image_info_type<uchar2>;
using image_info_u8c3 = image_info_type<uchar3>;
using image_info_u8c4 = image_info_type<uchar4>;

using image_u8c1 = std::shared_ptr<smart_image<uchar1>>;
using image_u8c2 = std::shared_ptr<smart_image<uchar2>>;
using image_u8c3 = std::shared_ptr<smart_image<uchar3>>;
using image_u8c4 = std::shared_ptr<smart_image<uchar4>>;
using image_u16c1 = std::shared_ptr<smart_image<ushort1>>;
using image_f32c1 = std::shared_ptr<smart_image<float1>>;

#endif //DEPTHGUIDE_IMAGE_UTILITY_HPP