2 năm trước cách đây · 89e2fcee0e
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -2,17 +2,25 @@ camera:
 
				   names:
			
 
				     left: LeftEye
			
 
				     right: RightEye
			
 
				-  remap:
			
 
				-    width: 2491
			
 
				-    height: 2077
			
 
				-    angle: 32.351172
			
 
				-    #    width: 2458
			
 
				-    #    height: 2048
			
 
				-    #    angle: 32.36803962255573
			
 
				-    data:
			
 
				-      left: ./left_proj.dat
			
 
				-      right: ./right_proj.dat
			
 
				-  #      right: /home/tpx/project/RemoteAR2/data/right_proj.dat
			
 
				+  intrinsic:
			
 
				+    left:
			
 
				+      fx: 3572.10386197042
			
 
				+      fy: 3570.68084102920
			
 
				+      cx: 1229.80492150397
			
 
				+      cy: 1023.20068686429
			
 
				+      k0: -0.0596384363017306
			
 
				+      k1: 0.0479007910422696
			
 
				+      width: 2448
			
 
				+      height: 2048
			
 
				+    right:
			
 
				+      fx: 3581.94172450578
			
 
				+      fy: 3580.63177534790
			
 
				+      cx: 1220.42154859988
			
 
				+      cy: 1030.02704801102
			
 
				+      k0: -0.0766755999523193
			
 
				+      k1: 0.177092451010692
			
 
				+      width: 2448
			
 
				+      height: 2048
			
 
				   capture:
			
 
				     frame_rate: 40
			
 
				     expo_time_ms: 12
			
--- a/src/image_process.cpp
+++ b/src/image_process.cpp
@@ -35,6 +35,93 @@ namespace process_impl {
 
				         }
			
 
				     };
			
 
				 
			
 
				+    // Owns a CUDA texture object that samples a pitched 2D cv::cuda::GpuMat.
				+    // The texture only references the GpuMat's device memory (no copy is made),
				+    // so the GpuMat has to outlive every use of `obj`.
				+    struct smart_cuda_texture {
				+        cudaTextureObject_t obj = 0;
				+
				+        smart_cuda_texture() = default;
				+
				+        // The destructor destroys the handle; a copy would destroy it a second time.
				+        smart_cuda_texture(const smart_cuda_texture &) = delete;
				+
				+        smart_cuda_texture &operator=(const smart_cuda_texture &) = delete;
				+
				+        ~smart_cuda_texture() {
				+            deallocate();
				+        }
				+
				+        // Make `obj` describe `mat`. The texture is rebuilt only when the device
				+        // pointer, size, pitch or element type differ from the previous call.
				+        void create(const cv::cuda::GpuMat &mat) {
				+            if (!matches(mat)) [[unlikely]] {
				+                deallocate();
				+                allocate(mat);
				+            }
				+        }
				+
				+    private:
				+        // Description of the GpuMat the current texture was created from.
				+        void *last_ptr = nullptr;
				+        int last_cols = 0;
				+        int last_rows = 0;
				+        int last_type = 0;
				+        size_t last_step = 0;
				+
				+        // True when `mat` has the same memory layout as the one last bound.
				+        // Comparing the pointer alone is not enough: a GpuMat may be re-created
				+        // with a different size or type at the same device address.
				+        bool matches(const cv::cuda::GpuMat &mat) const {
				+            return last_ptr == mat.cudaPtr() &&
				+                   last_cols == mat.cols &&
				+                   last_rows == mat.rows &&
				+                   last_step == mat.step &&
				+                   last_type == mat.type();
				+        }
				+
				+        void allocate(const cv::cuda::GpuMat &mat) {
				+            auto res_desc = cudaResourceDesc{};
				+            res_desc.resType = cudaResourceTypePitch2D;
				+            res_desc.res.pitch2D.devPtr = mat.cudaPtr();
				+            res_desc.res.pitch2D.width = mat.cols;
				+            res_desc.res.pitch2D.height = mat.rows;
				+            res_desc.res.pitch2D.pitchInBytes = mat.step;
				+
				+            // clamp at the borders, bilinear filtering, texels returned as
				+            // floats, coordinates given in normalized form
				+            auto tex_desc = cudaTextureDesc{};
				+            tex_desc.addressMode[0] = cudaAddressModeClamp;
				+            tex_desc.addressMode[1] = cudaAddressModeClamp;
				+            tex_desc.filterMode = cudaFilterModeLinear;
				+            tex_desc.readMode = cudaReadModeNormalizedFloat;
				+            tex_desc.normalizedCoords = true;
				+
				+            // only 8-bit 4-channel images are supported
				+            switch (mat.type()) {
				+                case CV_8UC4: {
				+                    res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uchar4>();
				+                    break;
				+                }
				+                default: {
				+                    assert(false);
				+                }
				+            }
				+
				+            assert(obj == 0);
				+            CUDA_API_CHECK(cudaCreateTextureObject(&obj, &res_desc, &tex_desc, nullptr));
				+
				+            // remember what the texture was built from
				+            last_ptr = mat.cudaPtr();
				+            last_cols = mat.cols;
				+            last_rows = mat.rows;
				+            last_step = mat.step;
				+            last_type = mat.type();
				+        }
				+
				+        void deallocate() {
				+            if (obj == 0) return;
				+            CUDA_API_CHECK(cudaDestroyTextureObject(obj));
				+            last_ptr = nullptr;
				+            last_cols = 0;
				+            last_rows = 0;
				+            last_step = 0;
				+            last_type = 0;
				+            obj = 0;
				+        }
				+    };
			
 
				+
			
 
				+    // Build a non-owning image_type<T> view over the device memory of `mat`.
				+    // T must have the same size as one element of `mat`.
				+    template<typename T>
				+    image_type<T> to_image_type(const cv::cuda::GpuMat &mat) {
				+        assert(sizeof(T) == CV_ELEM_SIZE(mat.type()));
				+        image_type<T> view;
				+        view.width = mat.cols;
				+        view.height = mat.rows;
				+        view.pitch = mat.step; // row stride in bytes
				+        view.ptr = static_cast<T *>(mat.cudaPtr());
				+        return view;
				+    }
			
 
				+
			
 
				+    // Convert pixel-based intrinsics into the size-relative form used by the
				+    // kernels: focal length and optical center are divided by the image
				+    // width (x) or height (y); the distortion coefficients are copied as is.
				+    camera_info to_camera_info(const camera_intrinsic &cam) {
				+        const auto w = static_cast<float>(cam.width);
				+        const auto h = static_cast<float>(cam.height);
				+
				+        camera_info normalized{};
				+        normalized.fx = cam.fx / w;
				+        normalized.cx = cam.cx / w;
				+        normalized.fy = cam.fy / h;
				+        normalized.cy = cam.cy / h;
				+        for (auto i = 0; i < 2; ++i) {
				+            normalized.k[i] = cam.k[i];
				+        }
				+        return normalized;
				+    }
			
 
				+
			
 
				+    // Debayer `in` into a 3-channel BGR image using OpenCV.
				+    // Only 8-bit single-channel input is handled; any other type ends in unreachable().
				+    void opencv_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
				+        if (in.type() == CV_8UC1) {
				+            cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2BGR, 3, stream);
				+            return;
				+        }
				+        unreachable();
				+    }
			
 
				+
			
 
				     template<typename T>
			
 
				     void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
			
 
				         assert(in.elemSize() == sizeof(T));
			
@@ -55,27 +142,95 @@ namespace process_impl {
 
				                                          flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream));
			
 
				     }
			
 
				 
			
 
				+    // Debayer `in` with the crude kernel, producing an image of half the input
				+    // width and height. With `alpha` the output is CV_8UC4, otherwise CV_8UC3.
				+    // Only CV_8UC1 input is supported.
				+    void crude_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
				+                       bool alpha, cudaStream_t stream) {
				+        constexpr uint2 threads = {32, 4};
				+        constexpr uint2 blocks = {8, 128};
				+        const auto half_size = cv::Size{in.cols >> 1, in.rows >> 1};
				+
				+        if (in.type() != CV_8UC1) {
				+            assert(false);
				+            return;
				+        }
				+
				+        if (!alpha) {
				+            out->create(half_size, CV_8UC3);
				+            call_crude_debayer(to_image_type<uint8_t>(in),
				+                               to_image_type<uchar3>(*out),
				+                               threads, blocks, stream);
				+            return;
				+        }
				+
				+        out->create(half_size, CV_8UC4);
				+        call_crude_debayer(to_image_type<uint8_t>(in),
				+                           to_image_type<uchar4>(*out),
				+                           threads, blocks, stream);
				+    }
			
 
				+
			
 
				+    // Map a pixel coordinate to the undistorted normalized plane.
				+    // The pixel is first converted to (distorted) normalized coordinates with
				+    // radius r0, then k1 * r^5 + k0 * r^3 + r = r0 is solved for the
				+    // undistorted radius r and the coordinates are rescaled by r / r0.
				+    cv::Point2f undistort_point(const camera_intrinsic &info, cv::Point2f p) {
				+        auto u = (p.x - info.cx) / info.fx;
				+        auto v = (p.y - info.cy) / info.fy;
				+        auto r0 = sqrtf(u * u + v * v);
				+
				+        // The optical center maps to itself. Without this guard the scale
				+        // factor below would be 0 / 0 and the result NaN.
				+        if (r0 == 0) {
				+            return {u, v};
				+        }
				+
				+        // Newton's Method, started from the distorted radius
				+        constexpr auto SOLVE_ITERATION_CNT = 4;
				+        auto r = r0;
				+        for (auto k = 0; k < SOLVE_ITERATION_CNT; ++k) {
				+            auto r2 = r * r;
				+            auto r3 = r2 * r;
				+            auto r4 = r3 * r;
				+            auto r5 = r4 * r;
				+            // f(r) = k1 * r^5 + k0 * r^3 + r - r0, f'(r) = 5 * k1 * r^4 + 3 * k0 * r^2 + 1
				+            r -= (info.k[1] * r5 + info.k[0] * r3 + r - r0) /
				+                 (5 * info.k[1] * r4 + 3 * info.k[0] * r2 + 1);
				+        }
				+
				+        auto factor = r / r0;
				+        u *= factor;
				+        v *= factor;
				+        return {u, v};
				+    }
			
 
				+
			
 
				+    // Resample the texture `in` into `out`, an undistorted 3-channel image with
				+    // `height` rows. `range` holds the half extents of the sampled region on the
				+    // normalized plane; the output width follows from using square pixels.
				+    // Only depth CV_8U is supported.
				+    void resample_image(cudaTextureObject_t in, cv::cuda::GpuMat *out, int depth_type,
				+                        cv::Size2f range, camera_intrinsic cam, uint32_t height, cudaStream_t stream) {
				+        constexpr uint2 threads = {32, 4};
				+        constexpr uint2 blocks = {8, 128};
				+
				+        // pixel size such that `height` rows cover [-range.height, range.height)
				+        const float pixel_size = 2 * range.height / height;
				+        const uint32_t width = 2 * range.width / pixel_size;
				+
				+        resample_info info{};
				+        info.ps = pixel_size;
				+        info.x = -range.width;
				+        info.y = -range.height;
				+
				+        out->create(height, width, CV_MAKE_TYPE(depth_type, 3));
				+        if (depth_type != CV_8U) {
				+            assert(false);
				+            return;
				+        }
				+
				+        call_resample_image(in, to_image_type<uchar3>(*out), info,
				+                            to_camera_info(cam), threads, blocks, stream);
				+    }
			
 
				+
			
 
				 }
			
 
				 
			
 
				 using namespace process_impl;
			
 
				 
			
 
				 struct monocular_processor::impl {
			
 
				     cv::cuda::GpuMat raw_dev;
			
 
				+    cv::cuda::GpuMat rgba_dev;
			
 
				+    smart_cuda_texture rgba_tex;
			
 
				     smart_gpu_buffer<uchar3> rgb_f;
			
 
				     smart_gpu_buffer<float> hsv_v_f;
			
 
				     smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;
			
 
				     smart_gpu_buffer<enhance_coeff> enhance_ext;
			
 
				 
			
 
				-    static void debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
			
 
				-                        cv::cuda::Stream &stream) {
			
 
				-        switch (in.type()) {
			
 
				-            case CV_8UC1: {
			
 
				-                cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2RGB, 3, stream);
			
 
				-                return;
			
 
				-            }
			
 
				-        }
			
 
				-        unreachable();
			
 
				-    }
			
 
				 
			
 
				     void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
			
 
				         assert(in.type() == CV_8UC3);
			
@@ -113,20 +268,30 @@ struct monocular_processor::impl {
 
				     }
			
 
				 
			
 
				     void process(const cv::Mat &in, cv::cuda::GpuMat *out,
			
 
				-                 bool enhance, cv::cuda::Stream &stream) {
			
 
				+                 const image_process_config &conf, cv::cuda::Stream &stream) {
			
 
				+        auto cuda_stream = (cudaStream_t) stream.cudaPtr();
			
 
				+
			
 
				         // upload from host to device
			
 
				         raw_dev.upload(in, stream);
			
 
				 
			
 
				-        // debayer using OpenCV
			
 
				-        debayer(raw_dev, out, stream);
			
 
				+        if (conf.undistort) {
			
 
				+            assert(conf.crude_debayer);
			
 
				+            crude_debayer(raw_dev, &rgba_dev, true, cuda_stream);
			
 
				+            rgba_tex.create(rgba_dev);
			
 
				+            resample_image(rgba_tex.obj, out, CV_MAT_DEPTH(in.type()),
			
 
				+                           conf.valid_range, conf.camera, conf.resample_height, cuda_stream);
			
 
				+        } else {
			
 
				+            if (conf.crude_debayer) {
			
 
				+                crude_debayer(raw_dev, out, false, cuda_stream);
			
 
				+            } else {
			
 
				+                opencv_debayer(raw_dev, out, stream);
			
 
				+            }
			
 
				+        }
			
 
				 
			
 
				         // enhance image
			
 
				-        auto cuda_stream = (cudaStream_t) stream.cudaPtr();
			
 
				-        if (enhance) {
			
 
				+        if (conf.enhance) {
			
 
				             enhance_image(*out, out, cuda_stream);
			
 
				         }
			
 
				-
			
 
				-        // TODO: un-distort
			
 
				     }
			
 
				 };
			
 
				 
			
@@ -136,6 +301,21 @@ monocular_processor::monocular_processor()
 
				 monocular_processor::~monocular_processor() = default;
			
 
				 
			
 
				 void monocular_processor::process(const cv::Mat &in, cv::cuda::GpuMat *out,
			
 
				-                                  bool enhance, cv::cuda::Stream &stream) {
			
 
				-    pimpl->process(in, out, enhance, stream);
			
 
				+                                  const image_process_config &conf, cv::cuda::Stream &stream) {
			
 
				+    pimpl->process(in, out, conf, stream);
			
 
				 }
			
 
				+
			
 
				+// Compute the half extents (u, v) of the region on the undistorted normalized
				+// plane that is covered by both cameras. The limits are taken where the image
				+// borders cross the horizontal / vertical line through each optical center.
				+// If `angle` is given it receives 2 * atan(v limit).
				+cv::Size2f calc_valid_range(const camera_intrinsic &left, const camera_intrinsic &right, float *angle) {
				+    // horizontal limits: left and right image borders at the row of the optical center
				+    const float l_west = -undistort_point(left, {0, left.cy}).x;
				+    const float l_east = undistort_point(left, {(float) left.width, left.cy}).x;
				+    const float r_west = -undistort_point(right, {0, right.cy}).x;
				+    const float r_east = undistort_point(right, {(float) right.width, right.cy}).x;
				+    const float u_lim = std::min({l_west, l_east, r_west, r_east});
				+
				+    // vertical limits: top and bottom image borders at the column of the optical center
				+    const float l_north = -undistort_point(left, {left.cx, 0}).y;
				+    const float l_south = undistort_point(left, {left.cx, (float) left.height}).y;
				+    const float r_north = -undistort_point(right, {right.cx, 0}).y;
				+    const float r_south = undistort_point(right, {right.cx, (float) right.height}).y;
				+    const float v_lim = std::min({l_north, l_south, r_north, r_south});
				+
				+    if (angle != nullptr) {
				+        *angle = 2 * atanf(v_lim);
				+    }
				+    return {u_lim, v_lim};
				+}
			
--- a/src/image_process.h
+++ b/src/image_process.h
@@ -6,6 +6,29 @@
 
				 
			
 
				 #include <memory>
			
 
				 
			
 
				+// Intrinsic parameters of one camera, expressed in pixels.
				+struct camera_intrinsic {
				+    // focal length in pixels
				+    float fx;
				+    float fy;
				+    // optical center in pixels
				+    float cx;
				+    float cy;
				+    // distortion coefficients
				+    float k[2];
				+    // image size in pixels
				+    uint32_t width;
				+    uint32_t height;
				+};
			
 
				+
			
 
				+// Options for monocular_processor::process().
				+// Every member has a default so a default-constructed config never holds
				+// indeterminate values.
				+struct image_process_config {
				+    bool crude_debayer = true; // use the crude debayer kernel instead of OpenCV
				+    bool enhance = false; // run image enhancement after debayering
				+
				+    bool undistort = true; // resample into an undistorted image (needs crude_debayer)
				+    cv::Size2f valid_range{}; // used for undistort
				+    camera_intrinsic camera{}; // used for undistort
				+    uint32_t resample_height = 0; // used for undistort
				+};
			
 
				+
			
 
				+// Returns the half extents of the valid range, i.e. one quadrant of it.
			
 
				+// angle: if not null, receives the vertical view angle, 2 * atan(half height).
			
 
				+cv::Size2f calc_valid_range(const camera_intrinsic &left,
			
 
				+                            const camera_intrinsic &right,
			
 
				+                            float *angle = nullptr);
			
 
				+
			
 
				 class monocular_processor {
			
 
				 public:
			
 
				 
			
@@ -13,7 +36,7 @@ public:
 
				 
			
 
				     ~monocular_processor();
			
 
				 
			
 
				-    void process(const cv::Mat &in, cv::cuda::GpuMat *out, bool enhance = false,
			
 
				+    void process(const cv::Mat &in, cv::cuda::GpuMat *out, const image_process_config &conf,
			
 
				                  cv::cuda::Stream &stream = cv::cuda::Stream::Null());
			
 
				 
			
 
				 private:
			
--- a/src/image_process/process_kernels.cu
+++ b/src/image_process/process_kernels.cu
@@ -165,6 +165,25 @@ struct type_min_value {
 
				     static constexpr T value = std::numeric_limits<T>::min();
			
 
				 };
			
 
				 
			
 
				+// Maps an element type and a channel count to the matching CUDA vector type.
				+// The primary template is intentionally empty: unsupported combinations
				+// have no `type` member.
				+template<typename T, uint32_t Len>
				+struct packed_type {
				+};
				+
				+// two 8-bit channels
				+template<>
				+struct packed_type<uint8_t, 2> {
				+    typedef uchar2 type;
				+};
				+
				+// three 8-bit channels
				+template<>
				+struct packed_type<uint8_t, 3> {
				+    typedef uchar3 type;
				+};
				+
				+// four 8-bit channels
				+template<>
				+struct packed_type<uint8_t, 4> {
				+    typedef uchar4 type;
				+};
			
 
				+
			
 
				 template<typename T>
			
 
				 struct reduce_max_func {
			
 
				     static __device__ __forceinline__ void Op(volatile T *out, T val) {
			
@@ -238,7 +257,7 @@ struct enhance_image_func {
 
				         using ImgElemT = decltype(in.x);
			
 
				         static_assert(std::is_integral_v<ImgElemT>,
			
 
				                       "Type of image element must be integer.");
			
 
				-        ImgElemT c_max  = max(max(in.x, in.y), in.z);
			
 
				+        ImgElemT c_max = max(max(in.x, in.y), in.z);
			
 
				         ImgElemT c_min = min(min(in.x, in.y), in.z);
			
 
				         ImgElemT delta = c_max - c_min;
			
 
				 
			
@@ -331,6 +350,34 @@ struct enhance_image_func {
 
				     }
			
 
				 };
			
 
				 
			
 
				+// x index of the calling thread within the whole grid
				+__device__ __forceinline__ uint32_t get_ix() {
				+    const uint32_t block_origin = blockIdx.x * blockDim.x;
				+    return block_origin + threadIdx.x;
				+}
			
 
				+
			
 
				+// y index of the calling thread within the whole grid
				+__device__ __forceinline__ uint32_t get_iy() {
				+    const uint32_t block_origin = blockIdx.y * blockDim.y;
				+    return block_origin + threadIdx.y;
				+}
			
 
				+
			
 
				+// grid width: total number of threads along x
				+__device__ __forceinline__ uint32_t get_gw() {
				+    const uint32_t threads_x = blockDim.x * gridDim.x;
				+    return threads_x;
				+}
			
 
				+
			
 
				+// grid height: total number of threads along y
				+__device__ __forceinline__ uint32_t get_gh() {
				+    const uint32_t threads_y = blockDim.y * gridDim.y;
				+    return threads_y;
				+}
			
 
				+
			
 
				+// Read element `x` of row `y` from a pitched image, interpreting the row as
				+// an array of U (U defaults to the image's own element type, so `x` counts
				+// elements of U, not of T).
				+template<typename T, typename U=T>
				+__device__ __forceinline__ U image_fetch(image_type<T> img, uint32_t x, uint32_t y) {
				+    // compute the byte offset of the row in size_t so it cannot wrap at 32 bits
				+    auto row_ptr = (const uint8_t *) img.ptr + (size_t) y * img.pitch;
				+    return *((const U *) row_ptr + x);
				+}
			
 
				+
			
 
				+// Write `elem` to element `x` of row `y` of a pitched image.
				+template<typename T>
				+__device__ __forceinline__ void image_store(image_type<T> img, uint32_t x, uint32_t y, T elem) {
				+    // compute the byte offset of the row in size_t so it cannot wrap at 32 bits
				+    auto row_ptr = (uint8_t *) img.ptr + (size_t) y * img.pitch;
				+    *((T *) row_ptr + x) = elem;
				+}
			
 
				+
			
 
				 // special kernels
			
 
				 
			
 
				 __global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
			
@@ -343,6 +390,69 @@ __global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32
 
				     p_out->norm_factor = norm_factor;
			
 
				 }
			
 
				 
			
 
				+// Crude debayer: every output pixel is built from one 2x2 cell of the input.
				+// Row 2 * iy of the cell is read as (r, g) and row 2 * iy + 1 as (g, b); the
				+// two green samples are averaged. The loops stride by the launched grid
				+// size, so any 2D launch configuration covers the whole output.
				+// OutT must be the 3- or 4-channel packed type of InT; with 4 channels the
				+// last one is filled with the maximum value of InT.
				+template<typename InT, typename OutT>
				+__global__ void crude_debayer(image_type<InT> in, image_type<OutT> out) {
				+    static_assert(std::is_integral_v<InT>);
				+    static_assert(sizeof(InT) < sizeof(uint32_t));
				+    // reject unsupported outputs at compile time instead of storing nothing
				+    static_assert(std::is_same_v<OutT, typename packed_type<InT, 3>::type> ||
				+                  std::is_same_v<OutT, typename packed_type<InT, 4>::type>,
				+                  "Output must be the 3- or 4-channel packed type of the input.");
				+
				+    uint32_t gw = get_gw(), gh = get_gh();
				+    for (uint32_t iy = get_iy(); iy < out.height; iy += gh)
				+        for (uint32_t ix = get_ix(); ix < out.width; ix += gw) {
				+            // fetch elements, two horizontally adjacent samples at a time
				+            using FetchType = typename packed_type<InT, 2>::type;
				+            auto raw_rg = image_fetch<InT, FetchType>(in, ix, iy << 1);
				+            auto raw_gb = image_fetch<InT, FetchType>(in, ix, (iy << 1) | 1);
				+
				+            // reconstruct; the sum is widened so the average cannot overflow
				+            InT r = raw_rg.x;
				+            InT g = ((uint32_t) raw_rg.y + (uint32_t) raw_gb.x) >> 1;
				+            InT b = raw_gb.y;
				+
				+            // store result
				+            if constexpr (std::is_same_v<OutT, typename packed_type<InT, 3>::type>) {
				+                image_store(out, ix, iy, {r, g, b});
				+            } else {
				+                constexpr InT alpha_val = type_max_value<InT>::value;
				+                image_store(out, ix, iy, {r, g, b, alpha_val});
				+            }
				+        }
				+}
			
 
				+
			
 
				+// Resample the texture `in` into the undistorted image `out`.
				+// Each output pixel is mapped to the normalized plane (info.x / info.y plus
				+// info.ps per pixel), distorted with 1 + k0 * r^2 + k1 * r^4, converted to
				+// size-relative image coordinates with `cam` and sampled from `in`.
				+// The float4 fetch and the scaling below assume a 4-channel texture read as
				+// normalized floats with normalized coordinates.
				+// The loops stride by the launched grid size, so any 2D launch configuration
				+// covers the whole output.
				+template<typename ImgT>
				+__global__ void resample_image(cudaTextureObject_t in, image_type<ImgT> out,
				+                               resample_info info, camera_info cam) {
				+    uint32_t gw = get_gw(), gh = get_gh();
				+    for (uint32_t iy = get_iy(); iy < out.height; iy += gh)
				+        for (uint32_t ix = get_ix(); ix < out.width; ix += gw) {
				+            // undistorted coordinates
				+            float u = info.x + info.ps * ix;
				+            float v = info.y + info.ps * iy;
				+
				+            // distort coordinates
				+            float r2 = u * u + v * v;
				+            float k = 1 + cam.k[0] * r2 + cam.k[1] * r2 * r2;
				+            u *= k;
				+            v *= k;
				+
				+            // convert to normalized pixel plane
				+            u = cam.fx * u + cam.cx;
				+            v = cam.fy * v + cam.cy;
				+
				+            // sample origin image
				+            auto val = tex2D<float4>(in, u, v);
				+            ImgT ret;
				+            using ElemT = decltype(ret.x);
				+            static_assert(std::is_integral_v<ElemT>);
				+            static_assert(std::is_same_v<ImgT, typename packed_type<ElemT, 3>::type>);
				+            constexpr auto factor = type_max_value<ElemT>::value;
				+            // Round to nearest: plain truncation would turn e.g. 253.99998
				+            // into 253 and bias the image towards darker values.
				+            ret.x = static_cast<ElemT>(factor * val.x + 0.5f);
				+            ret.y = static_cast<ElemT>(factor * val.y + 0.5f);
				+            ret.z = static_cast<ElemT>(factor * val.z + 0.5f);
				+
				+            // store result
				+            image_store(out, ix, iy, ret);
				+        }
				+}
			
 
				+
			
 
				 // calling endpoints
			
 
				 
			
 
				 template<typename T>
			
@@ -418,4 +528,28 @@ void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
 
				     kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
			
 
				 }
			
 
				 
			
 
				-template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
			
 
				+template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
			
 
				+
			
 
				+// Host entry point: launch crude_debayer on `stream` with the given 2D
				+// block size and grid size.
				+template<typename InT, typename OutT>
				+void call_crude_debayer(image_type<InT> in, image_type<OutT> out,
				+                        uint2 _block_size, uint2 _grid_dim, cudaStream_t stream) {
				+    const dim3 threads{_block_size.x, _block_size.y, 1};
				+    const dim3 blocks{_grid_dim.x, _grid_dim.y, 1};
				+    crude_debayer<<<blocks, threads, 0, stream>>>(in, out);
				+}
			
 
				+
			
 
				+template void call_crude_debayer(image_type<uint8_t>, image_type<uchar3>, uint2, uint2, cudaStream_t);
			
 
				+
			
 
				+template void call_crude_debayer(image_type<uint8_t>, image_type<uchar4>, uint2, uint2, cudaStream_t);
			
 
				+
			
 
				+// Host entry point: launch resample_image on `stream` with the given 2D
				+// block size and grid size.
				+template<typename ImgT>
				+void call_resample_image(cudaTextureObject_t in, image_type<ImgT> out,
				+                         resample_info info, camera_info cam,
				+                         uint2 _block_size, uint2 _grid_dim, cudaStream_t stream) {
				+    const dim3 threads{_block_size.x, _block_size.y, 1};
				+    const dim3 blocks{_grid_dim.x, _grid_dim.y, 1};
				+    resample_image<<<blocks, threads, 0, stream>>>(in, out, info, cam);
				+}
			
 
				+
			
 
				+template void call_resample_image(cudaTextureObject_t, image_type<uchar3>,
			
 
				+                                  resample_info, camera_info, uint2, uint2, cudaStream_t);
			
--- a/src/image_process/process_kernels.cuh
+++ b/src/image_process/process_kernels.cuh
@@ -33,4 +33,31 @@ template<typename ImgT>
 
				 void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
			
 
				                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream);
			
 
				 
			
 
				+// Non-owning description of a pitched 2D image.
				+template<typename ImgT>
				+struct image_type {
				+    ImgT *ptr;
				+    // row stride in bytes
				+    uint32_t pitch;
				+    // image size in pixels
				+    uint32_t width;
				+    uint32_t height;
				+};
			
 
				+
			
 
				+template<typename InT, typename OutT>
			
 
				+void call_crude_debayer(image_type<InT> in, image_type<OutT> out,
			
 
				+                        uint2 _block_size, uint2 _grid_dim, cudaStream_t stream);
			
 
				+
			
 
				+// Camera intrinsics relative to the image size, as passed to the kernels.
				+struct camera_info {
				+    // focal length in pixels / width (height)
				+    float fx;
				+    float fy;
				+    // optical center in pixels / width (height)
				+    float cx;
				+    float cy;
				+    // distortion coefficients
				+    float k[2];
				+};
			
 
				+
			
 
				+// Placement of the resampled image on the normalized plane.
				+struct resample_info {
				+    // coordinates of the start point in the normalized plane
				+    float x;
				+    float y;
				+    // pixel size in the normalized plane
				+    float ps;
				+};
			
 
				+
			
 
				+template<typename ImgT>
			
 
				+void call_resample_image(cudaTextureObject_t in, image_type<ImgT> out,
			
 
				+                         resample_info info, camera_info cam,
			
 
				+                         uint2 _block_size, uint2 _grid_dim, cudaStream_t stream);
			
 
				+
			
 
				 #endif //IMAGEHDR_PROCESS_KERNELS_CUH
			
--- a/src/main_ext.cpp
+++ b/src/main_ext.cpp
@@ -57,9 +57,9 @@ int preview_camera_index = 0; // 0 for left, 1 for right
 
				 std::unique_ptr<simple_render> opengl_render;
			
 
				 float process_frame_rate = 0;
			
 
				 bool enhance_image = false;
			
 
				+bool use_crude_debayer = true, undistort_image = true; // debug options
			
 
				 bool augment_enable = false;
			
 
				-cv::Size augment_render_size;
			
 
				-double augment_render_angle = 0;
			
 
				+float augment_render_angle = 0;
			
 
				 std::unique_ptr<vtk_viewer> augment_viewer;
			
 
				 
			
 
				 std::unique_ptr<std::thread> encoder_thread;
			
@@ -86,6 +86,7 @@ std::string probe_model_path;
 
				 std::vector<registration_target> reg_targets;
			
 
				 std::unique_ptr<registration> reg;
			
 
				 
			
 
				+bool debug_options = false;
			
 
				 bool show_vtk_debug = false;
			
 
				 bool show_imgui_demo = false;
			
 
				 std::unique_ptr<vtk_viewer> vtk_test1, vtk_test2;
			
@@ -107,13 +108,28 @@ struct camera_related {
 
				     cudaStream_t cuda_stream = nullptr;
			
 
				     std::unique_ptr<monocular_processor> processor;
			
 
				     std::string trans_var;
			
 
				+    image_process_config process_conf;
			
 
				 
			
 
				     // remap related
			
 
				-    std::string remap_data_path;
			
 
				-    std::unique_ptr<smart_texture> remap_tex;
			
 
				+//    std::string remap_data_path;
			
 
				+//    std::unique_ptr<smart_texture> remap_tex;
			
 
				     bool augment_available = false;
			
 
				     std::unique_ptr<smart_texture> augment_tex;
			
 
				 
			
 
				+    // Read fx, fy, cx, cy, k0, k1, width and height from `conf` and store
				+    // them as this camera's intrinsics in process_conf.
				+    void load_intrinsic(YAML::Node conf) {
				+        auto read_float = [&conf](const std::string &key) {
				+            return conf[key].as<float>();
				+        };
				+        auto read_int = [&conf](const std::string &key) {
				+            return conf[key].as<int>();
				+        };
				+
				+        // parse into a local first so process_conf is only touched on success
				+        camera_intrinsic parsed;
				+        parsed.fx = read_float("fx");
				+        parsed.fy = read_float("fy");
				+        parsed.cx = read_float("cx");
				+        parsed.cy = read_float("cy");
				+        parsed.k[0] = read_float("k0");
				+        parsed.k[1] = read_float("k1");
				+        parsed.width = read_int("width");
				+        parsed.height = read_int("height");
				+
				+        process_conf.camera = parsed;
				+    }
			
 
				+
			
 
				     void wait_frame(simple_mq::index_type index) const {
			
 
				         uint64_t cur_cnt;
			
 
				         if (auto ptr = mq().query_variable_ptr<cv::Mat>(index, &cur_cnt);
			
@@ -127,14 +143,26 @@ struct camera_related {
 
				         auto raw_ptr = mq().query_variable_ptr<cv::Mat>(index, &cur_cnt);
			
 
				         assert(cur_cnt > raw_cnt);
			
 
				         raw_cnt = cur_cnt;
			
 
				-        processor->process(*raw_ptr, img_dev.get(), enhance_image, *stream);
			
 
				+
			
 
				+        // OpenCV debayer does not support alpha channel
			
 
				+        if (undistort_image) {
			
 
				+            use_crude_debayer = true;
			
 
				+        }
			
 
				+
			
 
				+        // update process config
			
 
				+        process_conf.crude_debayer = use_crude_debayer;
			
 
				+        process_conf.undistort = undistort_image;
			
 
				+        process_conf.enhance = enhance_image;
			
 
				+
			
 
				+        // process image
			
 
				+        processor->process(*raw_ptr, img_dev.get(), process_conf, *stream);
			
 
				 
			
 
				         if (augment_enable) {
			
 
				             auto trans = sophiar_conn.query_transform_variable(trans_var);
			
 
				             augment_available = trans.has_value();
			
 
				             if (augment_available) {
			
 
				                 augment_viewer->set_camera_pose(trans.value());
			
 
				-                augment_viewer->render(augment_render_size);
			
 
				+                augment_viewer->render(img_dev->size());
			
 
				 
			
 
				                 // copy rendered image
			
 
				                 augment_tex->create(GL_RGBA8, img_dev->size());
			
@@ -146,20 +174,11 @@ struct camera_related {
 
				         }
			
 
				     }
			
 
				 
			
 
				-    void render(const simple_rect &rect) {
			
 
				+    void render(const simple_rect &rect, bool flip_y = false) {
			
 
				         assert(img_dev != nullptr);
			
 
				-        opengl_render->render_rect(*img_dev, rect, false, cuda_stream);
			
 
				+        opengl_render->render_rect(*img_dev, rect, !flip_y, cuda_stream);
			
 
				         if (augment_available) {
			
 
				-            // create remap file if needed
			
 
				-            if (remap_tex == nullptr) [[unlikely]] {
			
 
				-                auto remap_file = mapped_file{remap_data_path, mapped_file::readonly};
			
 
				-                auto size = img_dev->size();
			
 
				-                assert(remap_file.size() == size.area() * CV_ELEM_SIZE(CV_32FC2));
			
 
				-                auto remap_data = cv::Mat{size, CV_32FC2, (void *) remap_file.const_data()};
			
 
				-                remap_tex = std::make_unique<smart_texture>();
			
 
				-                upload_remap_data(remap_tex.get(), remap_data);
			
 
				-            }
			
 
				-            opengl_render->render_rect(augment_tex->id, rect, remap_tex->id);
			
 
				+            opengl_render->render_rect(augment_tex->id, rect, flip_y);
			
 
				         }
			
 
				     }
			
 
				 };
			
@@ -216,13 +235,18 @@ void load_config() {
 
				     main_encoder_conf.frame_rate = capture_conf.frame_rate;
			
 
				     capture_conf.expo_time_ms = capture_param["expo_time_ms"].as<float>();
			
 
				     capture_conf.gain_db = capture_param["gain_db"].as<float>();
			
 
				-    auto remap_conf = camera_conf["remap"];
			
 
				-    augment_render_size = {remap_conf["width"].as<int>(),
			
 
				-                           remap_conf["height"].as<int>()};
			
 
				-    augment_render_angle = remap_conf["angle"].as<double>();
			
 
				-    auto remap_data_conf = remap_conf["data"];
			
 
				-    left.remap_data_path = remap_data_conf["left"].as<std::string>();
			
 
				-    right.remap_data_path = remap_data_conf["right"].as<std::string>();
			
 
				+
			
 
				+    // load camera intrinsics
			
 
				+    auto intrinsic_conf = camera_conf["intrinsic"];
			
 
				+    left.load_intrinsic(intrinsic_conf["left"]);
			
 
				+    right.load_intrinsic(intrinsic_conf["right"]);
			
 
				+
			
 
				+    // calculate valid resample range
			
 
				+    auto range = calc_valid_range(left.process_conf.camera,
			
 
				+                                  right.process_conf.camera,
			
 
				+                                  &augment_render_angle);
			
 
				+    left.process_conf.valid_range = range;
			
 
				+    right.process_conf.valid_range = range;
			
 
				 
			
 
				     // load main window config
			
 
				     auto window_conf = conf["main_window"];
			
@@ -234,6 +258,8 @@ void load_config() {
 
				     output_width = output_conf["width"].as<int>();
			
 
				     output_height = output_conf["height"].as<int>();
			
 
				     main_encoder_conf.bitrate_mbps = output_conf["bitrate"].as<float>();
			
 
				+    left.process_conf.resample_height = output_height; // use output height as resample height
			
 
				+    right.process_conf.resample_height = output_height;
			
 
				 
			
 
				     // load sender config
			
 
				     auto sender_conf = conf["sender"];
			
@@ -620,6 +646,12 @@ void prepare_imgui_frame() {
 
				                 simple_eq.emplace(upload_capture_config);
			
 
				             }
			
 
				             ImGui::Checkbox("Enhance", &enhance_image);
			
 
				+            if (debug_options) {
			
 
				+                ImGui::SameLine();
			
 
				+                ImGui::Checkbox("Crude Debayer", &use_crude_debayer);
			
 
				+                ImGui::SameLine();
			
 
				+                ImGui::Checkbox("Undistort", &undistort_image);
			
 
				+            }
			
 
				 
			
 
				             if (is_capturing()) {
			
 
				                 // preview config
			
@@ -793,6 +825,7 @@ void prepare_imgui_frame() {
 
				             ImGui::PushID("Debug");
			
 
				             ImGui::Checkbox("Debug VTK Viewer", &show_vtk_debug);
			
 
				             ImGui::Checkbox("Show ImGui Demo", &show_imgui_demo);
			
 
				+            ImGui::Checkbox("Debug Options", &debug_options);
			
 
				             ImGui::PopID();
			
 
				         }
			
 
				 
			
@@ -859,7 +892,7 @@ void render_main_window() {
 
				         assert(left.img_dev->size() == right.img_dev->size());
			
 
				         float width_normal = left.img_dev->size().aspectRatio() / frame_size.aspectRatio();
			
 
				         auto render_rect = simple_rect{
			
 
				-                -width_normal, 1, 2 * width_normal, -2
			
 
				+                -width_normal, -1, 2 * width_normal, 2
			
 
				         };
			
 
				         if (preview_camera_index == 0) { // left camera
			
 
				             if (!left.img_dev->empty()) {
			
@@ -897,8 +930,8 @@ void generate_output_frame() {
 
				         left_rect = simple_rect{-0.5f - width_normal / 2, -1, width_normal, 2};
			
 
				         right_rect = simple_rect{0.5f - width_normal / 2, -1, width_normal, 2};
			
 
				     }
			
 
				-    left.render(left_rect);
			
 
				-    right.render(right_rect);
			
 
				+    left.render(left_rect, true);
			
 
				+    right.render(right_rect, true);
			
 
				 
			
 
				     // wait encoder idle
			
 
				     for (uint64_t cur_cnt = 0;;) {
			
--- a/src/simple_opengl.cpp
+++ b/src/simple_opengl.cpp
@@ -207,9 +207,10 @@ struct simple_render::impl {
 
				         glVertexAttribPointer(1, 2, GL_FLOAT, false, 4 * sizeof(GLfloat), (void *) (2 * sizeof(GLfloat)));
			
 
				     }
			
 
				 
			
 
				-    void render_texture(GLuint tex, const simple_rect &rect, GLuint remap_tex) {
			
 
				+    void render_texture(GLuint tex, const simple_rect &rect, bool flip_y) {
			
 
				         // bind buffers
			
 
				-        bool is_remap = (remap_tex != 0);
			
 
				+//        bool is_remap = (remap_tex != 0);
			
 
				+        constexpr bool is_remap = false;
			
 
				         glUseProgram(is_remap ? remap_program : simple_program);
			
 
				         glBindVertexArray(vao);
			
 
				         glBindBuffer(GL_ARRAY_BUFFER, vbo);
			
@@ -217,25 +218,27 @@ struct simple_render::impl {
 
				 
			
 
				         // bind textures
			
 
				         if (is_remap) {
			
 
				-            assert(remap_tex != 0);
			
 
				-            glUniform1i(image_tex_loc, 0);
			
 
				-            glUniform1i(remap_tex_loc, 1);
			
 
				-            glActiveTexture(GL_TEXTURE0 + 0);
			
 
				-            glBindTexture(GL_TEXTURE_2D, tex);
			
 
				-            glActiveTexture(GL_TEXTURE0 + 1);
			
 
				-            glBindTexture(GL_TEXTURE_2D, remap_tex);
			
 
				+//            assert(remap_tex != 0);
			
 
				+//            glUniform1i(image_tex_loc, 0);
			
 
				+//            glUniform1i(remap_tex_loc, 1);
			
 
				+//            glActiveTexture(GL_TEXTURE0 + 0);
			
 
				+//            glBindTexture(GL_TEXTURE_2D, tex);
			
 
				+//            glActiveTexture(GL_TEXTURE0 + 1);
			
 
				+//            glBindTexture(GL_TEXTURE_2D, remap_tex);
			
 
				         } else {
			
 
				             glActiveTexture(GL_TEXTURE0 + 0);
			
 
				             glBindTexture(GL_TEXTURE_2D, tex);
			
 
				         }
			
 
				 
			
 
				         // fill vertex buffer
			
 
				+        GLfloat tex_top = flip_y ? 0 : 1;
			
 
				+        GLfloat tex_bottom = flip_y ? 1 : 0;
			
 
				         GLfloat vertices[] = {
			
 
				                 // 2 for position; 2 for texture
			
 
				-                rect.x + rect.width, rect.y + rect.height, 1, 1, // top right
			
 
				-                rect.x + rect.width, rect.y, 1, 0, // bottom right
			
 
				-                rect.x, rect.y, 0, 0, // bottom left
			
 
				-                rect.x, rect.y + rect.height, 0, 1 // top left
			
 
				+                rect.x + rect.width, rect.y + rect.height, 1, tex_top, // top right
			
 
				+                rect.x + rect.width, rect.y, 1, tex_bottom, // bottom right
			
 
				+                rect.x, rect.y, 0, tex_bottom, // bottom left
			
 
				+                rect.x, rect.y + rect.height, 0, tex_top // top left
			
 
				         };
			
 
				         static_assert(sizeof(vertices) == 16 * sizeof(GLfloat));
			
 
				         glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices);
			
@@ -262,7 +265,7 @@ struct simple_render::impl {
 
				         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, image_pbo.id);
			
 
				         glBindTexture(GL_TEXTURE_2D, image_tex.id);
			
 
				         glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, img.size().width, img.size().height,
			
 
				-                        GL_BGR, GL_UNSIGNED_BYTE, nullptr);
			
 
				+                        GL_RGB, GL_UNSIGNED_BYTE, nullptr);
			
 
				         glBindTexture(GL_TEXTURE_2D, 0);
			
 
				         glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
			
 
				     }
			
@@ -273,14 +276,14 @@ simple_render::simple_render()
 
				 
			
 
				 simple_render::~simple_render() = default;
			
 
				 
			
 
				-void simple_render::render_rect(GLuint tex, const simple_rect &rect, GLuint remap_tex) {
			
 
				-    pimpl->render_texture(tex, rect, remap_tex);
			
 
				+void simple_render::render_rect(GLuint tex, const simple_rect &rect, bool flip_y) {
			
 
				+    pimpl->render_texture(tex, rect, flip_y);
			
 
				 }
			
 
				 
			
 
				 void simple_render::render_rect(const cv::cuda::GpuMat &img, const simple_rect &rect,
			
 
				-                                GLuint remap_tex, cudaStream_t stream) {
			
 
				+                                bool flip_y, cudaStream_t stream) {
			
 
				     pimpl->upload_gpu_mat(img, stream);
			
 
				-    pimpl->render_texture(pimpl->image_tex.id, rect, remap_tex);
			
 
				+    pimpl->render_texture(pimpl->image_tex.id, rect, flip_y);
			
 
				 }
			
 
				 
			
 
				 struct smart_frame_buffer::impl {
			
--- a/src/simple_opengl.h
+++ b/src/simple_opengl.h
@@ -42,11 +42,11 @@ public:
 
				 
			
 
				     void render_rect(GLuint tex,
			
 
				                      const simple_rect &rect,
			
 
				-                     GLuint remap_tex = 0);
			
 
				+                     bool flip_y = false);
			
 
				 
			
 
				     void render_rect(const cv::cuda::GpuMat &img,
			
 
				                      const simple_rect &rect,
			
 
				-                     GLuint remap_tex = 0,
			
 
				+                     bool flip_y = true,
			
 
				                      cudaStream_t stream = nullptr);
			
 
				 
			
 
				 private: