#include "image_process.h"
#include "core/cuda_helper.hpp"
#include "core/image_utility.hpp"
#include "core/memory_pool.h"
#include "cuda_impl/process_kernels.cuh"

#include <opencv2/cudaimgproc.hpp>

#include <boost/noncopyable.hpp>

namespace process_impl {

    template<typename T>
    struct smart_buffer : private boost::noncopyable {
        static_assert(std::is_trivial_v<T>);

        T *ptr = nullptr;
        size_t length = 0;

        smart_buffer() = default;

        template<typename U=T>
        smart_buffer(const smart_buffer<U> &other) = delete;

        ~smart_buffer() {
            MEM_DEALLOC(ptr);
        }

        void create(size_t req_length) {
            if (req_length > capacity) [[unlikely]] {
                MEM_DEALLOC(ptr);
                MEM_ALLOC(T, req_length, MEM_HOST);
                capacity = req_length;
            }
            length = req_length;
        }

        size_t size() const {
            return length * sizeof(T);
        }

    private:
        size_t capacity = 0;
    };

    template<typename T>
    struct smart_gpu_buffer : private boost::noncopyable {
        T *ptr = nullptr;
        size_t size = 0;

        smart_gpu_buffer() = default;

        template<typename U>
        smart_gpu_buffer(const smart_gpu_buffer<T> &other) = delete;

        ~smart_gpu_buffer() {
            deallocate();
        }

        void create(size_t req_size) {
            if (req_size > capacity) [[unlikely]] {
                deallocate();
                ptr = MEM_ALLOC(T, req_size, MEM_CUDA);
                capacity = req_size;
            }
            size = req_size;
        }

        template<typename U=T>
        void upload_from(const smart_buffer<U> &buf, cudaStream_t stream = nullptr) {
            assert(buf.length * sizeof(U) % sizeof(T) == 0);
            create(buf.length * sizeof(U) / sizeof(T));
            if (stream == nullptr) {
                CUDA_API_CHECK(cudaMemcpy(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice));
            } else {
                CUDA_API_CHECK(cudaMemcpyAsync(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice, stream));
            }
        }

        template<typename U=T>
        void upload_from(const U *src_ptr, size_t src_size, cudaStream_t stream = nullptr) {
            assert(src_size * sizeof(U) % sizeof(T) == 0);
            create(src_size * sizeof(U) / sizeof(T));
            if (stream == nullptr) {
                CUDA_API_CHECK(cudaMemcpy(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice));
            } else {
                CUDA_API_CHECK(cudaMemcpyAsync(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice, stream));
            }
        }

        template<typename U=T>
        void download_to(smart_buffer<U> *buf, cudaStream_t stream = nullptr) {
            assert(size * sizeof(T) % sizeof(U) == 0);
            buf->create(size * sizeof(T) / sizeof(U));
            if (stream == nullptr) {
                CUDA_API_CHECK(cudaMemcpy(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost));
            } else {
                CUDA_API_CHECK(cudaMemcpyAsync(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
            }
        }

    private:
        size_t capacity = 0;

        void deallocate() {
            if (ptr == nullptr) return;
            MEM_DEALLOC(ptr);
            ptr = nullptr;
        }
    };

    struct smart_cuda_texture {
        cudaTextureObject_t obj = 0;
        int mat_type = -1;

        ~smart_cuda_texture() {
            deallocate();
        }

        smart_cuda_texture() = default;

        smart_cuda_texture(const smart_cuda_texture &other) = delete;

        void create(const cv::cuda::GpuMat &mat) {
            if (last_ptr != mat.cudaPtr()) [[unlikely]] {
                deallocate();
                allocate(mat);
            }
        }

    private:
        void *last_ptr = nullptr;

        void allocate(const cv::cuda::GpuMat &mat) {
            auto res_desc = cudaResourceDesc{};
            res_desc.resType = cudaResourceTypePitch2D;
            res_desc.res.pitch2D.devPtr = mat.cudaPtr();
            res_desc.res.pitch2D.width = mat.cols;
            res_desc.res.pitch2D.height = mat.rows;
            res_desc.res.pitch2D.pitchInBytes = mat.step;

            auto tex_desc = cudaTextureDesc{};
            tex_desc.addressMode[0] = cudaAddressModeClamp;
            tex_desc.addressMode[1] = cudaAddressModeClamp;
            tex_desc.filterMode = cudaFilterModeLinear;
            tex_desc.readMode = cudaReadModeNormalizedFloat;
            tex_desc.normalizedCoords = true;

            mat_type = mat.type();
            switch (mat_type) {
                case CV_8UC1: {
                    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
                    break;
                }
                case CV_8UC4: {
                    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uchar4>();
                    break;
                }
                default: {
                    RET_ERROR;
                }
            }

            assert(obj == 0);
            CUDA_API_CHECK(cudaCreateTextureObject(&obj, &res_desc, &tex_desc, nullptr));
            last_ptr = mat.cudaPtr();
        }

        void deallocate() {
            if (obj == 0) return;
            CUDA_API_CHECK(cudaDestroyTextureObject(obj));
            last_ptr = nullptr;
            obj = 0;
        }
    };

    camera_info to_camera_info(const camera_intrinsic &cam) {
        camera_info ret{};
        ret.fx = cam.fx / cam.width;
        ret.fy = cam.fy / cam.height;
        ret.cx = cam.cx / cam.width;
        ret.cy = cam.cy / cam.height;
        ret.k[0] = cam.k[0];
        ret.k[1] = cam.k[1];
        return ret;
    }

    void opencv_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
        switch (in.type()) {
            case CV_8UC1: {
                cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2BGR, 3, stream);
                return;
            }
        }
        unreachable();
    }

    void opencv_gray2rgb(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
        switch (in.type()) {
            case CV_8UC1: {
                cv::cuda::cvtColor(in, *out, cv::COLOR_GRAY2BGR, 3, stream);
                return;
            }
        }
        unreachable();
    }

    template<typename T>
    image_type<T> to_image_type(const cv::cuda::GpuMat &mat) {
        assert(sizeof(T) == CV_ELEM_SIZE(mat.type()));
        auto ret = image_type<T>();
        ret.ptr = (T *) mat.cudaPtr();
        ret.pitch = mat.step;
        ret.width = mat.cols;
        ret.height = mat.rows;
        return ret;
    }

    template<typename T>
    void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
        assert(in.elemSize() == sizeof(T));
        out->create(in.size().area());
        auto flatten_pitch = in.cols * in.elemSize();
        CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, flatten_pitch, in.cudaPtr(), in.step,
                                         flatten_pitch, in.size().height, cudaMemcpyDeviceToDevice, stream));
    }

    template<typename T>
    void unflatten(const smart_gpu_buffer<T> &in, cv::cuda::GpuMat *out,
                   cv::Size size, int type, cudaStream_t stream) {
        assert(sizeof(T) == CV_ELEM_SIZE(type));
        assert(in.size == size.area());
        out->create(size, type);
        auto flatten_pitch = out->cols * out->elemSize();
        CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step, in.ptr, flatten_pitch,
                                         flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream));
    }

    void crude_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
                       bool alpha, cudaStream_t stream) {
        constexpr uint2 block_size = {32, 4};
        constexpr uint2 grid_dim = {8, 128};
        auto out_size = cv::Size{in.cols >> 1, in.rows >> 1};
        switch (in.type()) {
            case CV_8UC1: {
                if (alpha) {
                    out->create(out_size, CV_8UC4);
                    call_crude_debayer(to_image_type<uint8_t>(in),
                                       to_image_type<uchar4>(*out),
                                       block_size, grid_dim, stream);
                } else {
                    out->create(out_size, CV_8UC3);
                    call_crude_debayer(to_image_type<uint8_t>(in),
                                       to_image_type<uchar3>(*out),
                                       block_size, grid_dim, stream);
                }
                return;
            }
            default: {
                RET_ERROR;
            }
        }
    }

    // pixel coordinate to undistorted normalized plane
    cv::Point2f undistort_point(const camera_intrinsic &info, cv::Point2f p) {
        auto u = (p.x - info.cx) / info.fx;
        auto v = (p.y - info.cy) / info.fy;
        auto r0 = sqrtf(u * u + v * v);

        // Newton's Method
        constexpr auto SOLVE_ITERATION_CNT = 4;
        auto r = r0;
        for (auto k = 0; k < SOLVE_ITERATION_CNT; ++k) {
            auto r2 = r * r;
            auto r3 = r2 * r;
            auto r4 = r3 * r;
            auto r5 = r4 * r;
            r -= (info.k[1] * r5 + info.k[0] * r3 + r - r0) /
                 (5 * info.k[1] * r4 + 3 * info.k[0] * r2 + 1);
        }

        auto factor = r / r0;
        u *= factor;
        v *= factor;
        return {u, v};
    }

    void resample_image(const smart_cuda_texture &in, cv::cuda::GpuMat *out,
                        cv::Size2f range, const camera_intrinsic &cam, uint32_t height, cudaStream_t stream) {
        constexpr uint2 block_size = {32, 4};
        constexpr uint2 grid_dim = {8, 128};
        float ps = 2 * range.height / height;
        uint32_t width = 2 * range.width / ps;
        width = (width + 3) & (-4); // make OpenGL happy
        resample_info info{};
        info.x = -range.width;
        info.y = -range.height;
        info.ps = ps;
        switch (in.mat_type) {
            case CV_8UC1: {
                out->create(height, width, CV_8UC1);
                call_resample_image(in.obj, to_image_type<uint8_t>(*out), info,
                                    to_camera_info(cam), block_size, grid_dim, stream);
                return;
            }
            case CV_8UC4: {
                out->create(height, width, CV_8UC3);
                call_resample_image(in.obj, to_image_type<uchar3>(*out), info,
                                    to_camera_info(cam), block_size, grid_dim, stream);
                return;
            }
            default: {
                RET_ERROR;
            }
        }
    }

}

using namespace process_impl;

struct monocular_processor::impl {
    cv::cuda::GpuMat rgba_dev;
    cv::cuda::GpuMat resample_dev;
    cv::cuda::GpuMat ugly_out; // TODO: ugly hack
    smart_cuda_texture resample_tex;
    smart_gpu_buffer<uchar3> rgb_f;
    smart_gpu_buffer<float> hsv_v_f;
    smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;
    smart_gpu_buffer<enhance_coeff> enhance_ext;

    void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
        assert(in.type() == CV_8UC3);

        // flatten image into a line
        flatten(in, &rgb_f, stream);
        auto line_size = rgb_f.size;

        // extract V channel of HSV
        constexpr auto block_size = 256;
        constexpr auto grid_dim = 512;
        hsv_v_f.create(rgb_f.size);
        call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, line_size,
                           block_size, grid_dim, stream);

        // reduce enhance coefficients
        hsv_v_max.create(grid_dim);
        call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, line_size,
                        block_size, grid_dim, stream);
        hsv_v_sum_log.create(grid_dim);
        call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, line_size,
                            block_size, grid_dim, stream);

        // prepare enhance coefficients
        enhance_ext.create(1);
        call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr,
                                   line_size, enhance_ext.ptr, stream);

        // enhance image
        call_enhance_image(rgb_f.ptr, rgb_f.ptr, line_size, enhance_ext.ptr,
                           block_size, grid_dim, stream);

        // unflatten image
        unflatten(rgb_f, out, in.size(), CV_8UC3, stream);
    }

    image_u8c3 process(const image_u8c1 &in, process_config conf) {
        auto cuda_stream = conf.stream->cuda;
        auto cv_stream = conf.stream->cv;
        auto in_mat = in->as_cuda(conf.stream);
        if (conf.is_mono) {

            // undistort
            if (conf.undistort) {
                resample_tex.create(in_mat);
                resample_image(resample_tex, &resample_dev, conf.valid_range,
                               conf.camera, conf.resample_height, cuda_stream);
            } else {
                resample_dev = in_mat;
            }

            // Mono -> RGB
            opencv_gray2rgb(resample_dev, &ugly_out, cv_stream);

        } else {

            // debayer
            if (conf.crude_debayer) {
                if (conf.undistort) {
                    crude_debayer(in_mat, &rgba_dev, true, cuda_stream);
                } else {
                    crude_debayer(in_mat, &ugly_out, false, cuda_stream);
                }
            } else {
                assert(!conf.undistort);
                opencv_debayer(in_mat, &ugly_out, cv_stream);
            }

            // undistort
            if (conf.undistort) {
                assert(conf.crude_debayer);
                resample_tex.create(rgba_dev);
                resample_image(resample_tex, &ugly_out, conf.valid_range,
                               conf.camera, conf.resample_height, cuda_stream);
            }
        }

        // enhance image
        if (conf.enhance) {
            enhance_image(ugly_out, &ugly_out, cuda_stream);
        }

        auto out_info = create_image_info<uchar3>(ugly_out.size(), MEM_CUDA);
        out_info.fill_from_async(ugly_out, conf.stream);
        return create_image(out_info);
    }
};

monocular_processor::monocular_processor()
        : pimpl(std::make_unique<impl>()) {}

monocular_processor::~monocular_processor() = default;

image_u8c3 monocular_processor::process(const image_u8c1 &in, process_config conf) {
    return pimpl->process(in, conf);
}

cv::Size2f calc_valid_range(const camera_intrinsic &left, const camera_intrinsic &right, float *angle) {
    auto u_lim = std::min({-undistort_point(left, {0, left.cy}).x,
                           undistort_point(left, {(float) left.width, left.cy}).x,
                           -undistort_point(right, {0, right.cy}).x,
                           undistort_point(right, {(float) right.width, right.cy}).x});
    auto v_lim = std::min({-undistort_point(left, {left.cx, 0}).y,
                           undistort_point(left, {left.cx, (float) left.height}).y,
                           -undistort_point(right, {right.cx, 0}).y,
                           undistort_point(right, {right.cx, (float) right.height}).y});
    if (angle != nullptr) {
        *angle = 2 * atanf(v_lim);
    }
    return {u_lim, v_lim};
}