// image_process.cpp
  1. #include "cuda_helper.hpp"
  2. #include "image_process.h"
  3. #include "image_process/process_kernels.cuh"
  4. #include "utility.hpp"
  5. #include <opencv2/cudaimgproc.hpp>
  6. namespace process_impl {
  7. template<typename T>
  8. struct smart_gpu_buffer {
  9. T *ptr = nullptr;
  10. size_t size = 0;
  11. ~smart_gpu_buffer() {
  12. deallocate();
  13. }
  14. void create(size_t req_size) {
  15. if (req_size > capacity) [[unlikely]] {
  16. deallocate();
  17. CUDA_API_CHECK(cudaMalloc(&ptr, req_size * sizeof(T)));
  18. capacity = req_size;
  19. }
  20. size = req_size;
  21. }
  22. private:
  23. size_t capacity = 0;
  24. void deallocate() {
  25. if (ptr == nullptr) return;
  26. CUDA_API_CHECK(cudaFree(ptr));
  27. ptr = nullptr;
  28. }
  29. };
  30. template<typename T>
  31. void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
  32. assert(in.elemSize() == sizeof(T));
  33. out->create(in.size().area());
  34. auto flatten_pitch = in.cols * in.elemSize();
  35. CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, flatten_pitch, in.cudaPtr(), in.step,
  36. flatten_pitch, in.size().height, cudaMemcpyDeviceToDevice, stream));
  37. }
  38. template<typename T>
  39. void unflatten(const smart_gpu_buffer<T> &in, cv::cuda::GpuMat *out,
  40. cv::Size size, int type, cudaStream_t stream) {
  41. assert(sizeof(T) == CV_ELEM_SIZE(type));
  42. assert(in.size == size.area());
  43. out->create(size, type);
  44. auto flatten_pitch = out->cols * out->elemSize();
  45. CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step, in.ptr, flatten_pitch,
  46. flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream));
  47. }
  48. }
  49. using namespace process_impl;
// Private implementation (pimpl) of monocular_processor.
// Holds reusable device-side scratch buffers; smart_gpu_buffer::create only
// reallocates on growth, so steady-state frames do no GPU allocation.
struct monocular_processor::impl {
cv::cuda::GpuMat raw_dev;                          // raw frame uploaded from host
smart_gpu_buffer<uchar3> rgb_f;                    // flattened RGB pixels, one uchar3 per pixel
smart_gpu_buffer<float> hsv_v_f;                   // per-pixel V channel of HSV
smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;  // per-block partial reductions (max / sum of logs)
smart_gpu_buffer<enhance_coeff> enhance_ext;       // single coefficient record for the enhance kernel
// Demosaic a raw Bayer frame into 3-channel RGB on the GPU.
// Only CV_8UC1 input (RG Bayer pattern) is handled; any other input type
// is a programming error and hits unreachable().
static void debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
cv::cuda::Stream &stream) {
switch (in.type()) {
case CV_8UC1: {
cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2RGB, 3, stream);
return;
}
}
unreachable();
}
// Enhance an 8-bit RGB image on the GPU. `in` and `out` may alias: the
// image is first copied into the internal rgb_f buffer and enhanced there
// in place, then written back into `out`. Pipeline (all launches async on
// `stream`, no sync here):
//   flatten -> extract HSV V -> partial max & sum-of-log reductions
//   -> derive enhancement coefficients -> per-pixel enhance -> unflatten.
void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
assert(in.type() == CV_8UC3);
// flatten image into a line
flatten(in, &rgb_f, stream);
auto line_size = rgb_f.size;
// extract V channel of HSV
constexpr auto block_size = 256;
constexpr auto grid_dim = 512;  // also the number of partial-reduction outputs below
hsv_v_f.create(rgb_f.size);
call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, line_size,
block_size, grid_dim, stream);
// reduce enhance coefficients (one partial result per block -> grid_dim values)
hsv_v_max.create(grid_dim);
call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, line_size,
block_size, grid_dim, stream);
hsv_v_sum_log.create(grid_dim);
call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, line_size,
block_size, grid_dim, stream);
// prepare enhance coefficients
// NOTE(review): presumably this kernel finishes the grid_dim partial
// reductions internally and normalizes by line_size — confirm against
// call_prepare_enhance_coeff in process_kernels.cuh.
enhance_ext.create(1);
call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr,
line_size, enhance_ext.ptr, stream);
// enhance image (in place: source and destination are the same buffer)
call_enhance_image(rgb_f.ptr, rgb_f.ptr, line_size, enhance_ext.ptr,
block_size, grid_dim, stream);
// unflatten image
unflatten(rgb_f, out, in.size(), CV_8UC3, stream);
}
// Full per-frame pipeline: host upload -> debayer -> optional enhancement.
// Entirely asynchronous on `stream`; the caller must synchronize before
// reading `out` on the host.
void process(const cv::Mat &in, cv::cuda::GpuMat *out,
bool enhance, cv::cuda::Stream &stream) {
// upload from host to device
raw_dev.upload(in, stream);
// debayer using OpenCV
debayer(raw_dev, out, stream);
// enhance image
auto cuda_stream = (cudaStream_t) stream.cudaPtr();
if (enhance) {
// *out is both input and output; safe because enhance_image copies
// into an internal buffer before modifying anything
enhance_image(*out, out, cuda_stream);
}
// TODO: un-distort
}
};
// Construct the processor. Device scratch buffers inside impl are grown
// lazily on first use, so no frame-sized GPU allocation happens here.
monocular_processor::monocular_processor()
    : pimpl(std::make_unique<impl>()) {}
// Out-of-line defaulted destructor: required by the pimpl idiom so the
// std::unique_ptr<impl> destructor is instantiated where impl is complete.
monocular_processor::~monocular_processor() = default;
// Public entry point: forwards to impl::process (upload, debayer, optional
// enhancement — all asynchronous on `stream`; caller syncs before reading out).
void monocular_processor::process(const cv::Mat &in, cv::cuda::GpuMat *out,
                                  bool enhance, cv::cuda::Stream &stream) {
    pimpl->process(in, out, enhance, stream);
}