hai 1 ano · e2d1aaa957
--- a/src/image_process/cuda_impl/CMakeLists.txt
+++ b/src/image_process/cuda_impl/CMakeLists.txt
@@ -4,8 +4,9 @@ project(ImageProcessCuda LANGUAGES CXX CUDA)
 
				 set(CMAKE_CXX_STANDARD 20)
			
 
				 
			
 
				 add_library(${PROJECT_NAME}
			
 
				-        pixel_convert.cu
			
 
				-        fake_color.cu)
			
 
				+        fake_color.cu
			
 
				+        image_merge.cu
			
 
				+        pixel_convert.cu)
			
 
				 
			
 
				 # CUDA config
			
 
				 find_package(CUDAToolkit REQUIRED)
			
@@ -13,7 +14,11 @@ target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
 
				 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
			
 
				 set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "75;86")
			
 
				 
			
 
				+# glm config
			
 
				+find_package(glm REQUIRED)
			
 
				+target_link_libraries(${PROJECT_NAME} glm::glm)
			
 
				+
			
 
				 target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
			
 
				         -Xptxas -v # show kernel info
			
 
				-#        -g -G      # debug options
			
 
				+        #        -g -G      # debug options
			
 
				         >)
			
--- a/src/image_process/cuda_impl/fake_color.cu
+++ b/src/image_process/cuda_impl/fake_color.cu
@@ -53,87 +53,95 @@ namespace fake_color {
 
				     // sign and exp part of a f32 value within the range of [1.0, 2.0)
			
 
				     constexpr auto f32_sig_exp_val = (1u << 30) - (1u << 23);
			
 
				 
			
 
				+    /**
				+     * Depth-to-colour encoder shared by every fake-colour layout.
				+     *
				+     * The depth sample is mapped linearly so that ext.lower lands on 1 and
				+     * ext.upper on 2. The bits of that float selected by f32_man_mask are
				+     * then handed to EncFunc::Op, which packs them into a uchar3.
				+     *
				+     * @tparam EncFunc  type exposing `static uchar3 Op(uint32_t bits)`.
				+     */
				+    template<typename EncFunc>
				+    struct encode {
				+        __device__ static uchar3 Op(float1 in, ext_type ext) {
				+            // a depth of exactly 0 is reported with the "too high" sentinel
				+            if (in.x == 0) { return too_high_val; }
				+            // convert depth value to the range [1, 2)
				+            auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower);
				+            if (val < 1) { return too_low_val; }
				+            // Written as !(val < 2) instead of (val >= 2) so that NaN (a NaN
				+            // sample, or 0/0 when ext.upper == ext.lower) is rejected here
				+            // too, rather than having its bit pattern encoded as a depth.
				+            if (!(val < 2)) { return too_high_val; }
				+            // __float_as_uint reinterprets the bits without the aliasing
				+            // violation of dereferencing a float through a uint32_t pointer.
				+            auto bin = __float_as_uint(val) & f32_man_mask;
				+            return EncFunc::Op(bin);
				+        }
				+    };
			
 
				+
			
 
				+    /**
				+     * Colour-to-depth decoder shared by every fake-colour layout.
				+     *
				+     * DecFunc::Op turns the colour back into a bit pattern; OR-ing in
				+     * f32_sig_exp_val (sign/exponent of a float in [1, 2)) rebuilds the
				+     * float, which is then mapped linearly so that 1 becomes ext.lower and
				+     * 2 becomes ext.upper.
				+     *
				+     * @tparam DecFunc  type exposing `static uint32_t Op(uchar3 colour)`.
				+     */
				+    template<typename DecFunc>
				+    struct decode {
				+        __device__ static float1 Op(uchar3 in, ext_type ext) {
				+            auto bin = DecFunc::Op(in);
				+            bin |= f32_sig_exp_val;
				+            // __uint_as_float reinterprets the bits without the aliasing
				+            // violation of dereferencing a uint32_t through a float pointer.
				+            auto val = __uint_as_float(bin);
				+            val = (val - 1) * (ext.upper - ext.lower) + ext.lower;
				+            return float1(val);
				+        }
				+    };
			
 
				+
			
 
				+    // Host-side launcher: applies encode<EncFunc>::Op to every pixel of `in`
				+    // and stores the colour in `out`, enqueued on `stream`.
				+    template<typename EncFunc>
				+    void call_encode(
				+            image_type_v2<float1> in, image_type_v2<uchar3> out,
				+            ext_type ext, cudaStream_t stream) {
				+        using pixel_op = encode<EncFunc>;
				+        call_image_element_wise_unary<float1, uchar3, pixel_op, ext_type>(
				+                in, out, stream, ext);
				+    }
			
 
				+
			
 
				+    // Host-side launcher: applies decode<DecFunc>::Op to every pixel of `in`
				+    // and stores the recovered depth in `out`, enqueued on `stream`.
				+    template<typename DecFunc>
				+    void call_decode(
				+            image_type_v2<uchar3> in, image_type_v2<float1> out,
				+            ext_type ext, cudaStream_t stream) {
				+        using pixel_op = decode<DecFunc>;
				+        call_image_element_wise_unary<uchar3, float1, pixel_op, ext_type>(
				+                in, out, stream, ext);
				+    }
			
 
				+
			
 
				     namespace i888 {
			
 
				 
			
 
				         // @formatter:off
			
 
				-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> r_masks = {
			
 
				+        __device__ __constant__ cuda::std::array<uint32_t, 8> r_masks = {
			
 
				                 1u << 23, 1u << 20, 1u << 17, 1u << 14,
			
 
				                 1u << 11, 1u <<  8, 1u <<  5, 1u <<  2
			
 
				         };
			
 
				-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> g_masks = {
			
 
				+        __device__ __constant__ cuda::std::array<uint32_t, 8> g_masks = {
			
 
				                 1u << 22, 1u << 19, 1u << 16, 1u << 13,
			
 
				                 1u << 10, 1u <<  7, 1u <<  4, 1u <<  1
			
 
				         };
			
 
				-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> b_masks = {
			
 
				+        __device__ __constant__ cuda::std::array<uint32_t, 8> b_masks = {
			
 
				                 1u << 21, 1u << 18, 1u << 15, 1u << 12,
			
 
				                 1u <<  9, 1u <<  6, 1u <<  3,  1u << 0
			
 
				         };
			
 
				         // @formatter:on
			
 
				 
			
 
				-        struct encode {
			
 
				-            __device__ static uchar3 Op(float1 in, ext_type ext) {
			
 
				-                // convert depth value to the range [1, 2)
			
 
				-                auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower);
			
 
				-                if (val < 1) { return too_low_val; }
			
 
				-                if (val >= 2) { return too_high_val; }
			
 
				-
			
 
				-                auto bin = (*(uint32_t *) &val) & f32_man_mask;
			
 
				+        struct encode_func { // i888 layout: every third bit (see r/g/b_masks) goes to one channel
			
 
				+            __device__ static uchar3 Op(uint32_t bin) { // bin: bit pattern passed in by encode<>::Op
			
 
				                 bin <<= (24 - 23); // uchar3 holds 24 bits, the input only 23: shift left by one to pad
			
 
				-
			
 
				                 return uchar3(bit_compress(bin, r_masks), // bit_compress is defined outside this view; presumably gathers the masked bits into one byte - TODO confirm
			
 
				                               bit_compress(bin, g_masks),
			
 
				                               bit_compress(bin, b_masks));
			
 
				             }
			
 
				         };
			
 
				 
			
 
				-        struct decode {
			
 
				-            __device__ static float1 Op(uchar3 in, ext_type ext) {
			
 
				+        struct decode_func {
			
 
				+            __device__ static uint32_t Op(uchar3 in) {
			
 
				                 auto bin = bit_uncompress(in.x, r_masks)
			
 
				                            | bit_uncompress(in.y, g_masks)
			
 
				                            | bit_uncompress(in.z, b_masks);
			
 
				-
			
 
				-                bin = (bin >> (24 - 23)) | f32_sig_exp_val;
			
 
				-
			
 
				-                auto val = *(float *) &bin;
			
 
				-                val = (val - 1) * (ext.upper - ext.lower) + ext.lower;
			
 
				-                return float1(val);
			
 
				+                return bin >> (24 - 23);
			
 
				             }
			
 
				         };
			
 
				 
			
 
				-        void call_encode(
			
 
				-                image_type_v2<float1> in, image_type_v2<uchar3> out,
			
 
				-                ext_type ext, cudaStream_t stream) {
			
 
				-            auto func_type = call_image_element_wise_unary<
			
 
				-                    float1, uchar3, encode, ext_type>;
			
 
				-            func_type(in, out, stream, ext);
			
 
				-        }
			
 
				-
			
 
				-        void call_decode(
			
 
				-                image_type_v2<uchar3> in, image_type_v2<float1> out,
			
 
				-                ext_type ext, cudaStream_t stream) {
			
 
				-            auto func_type = call_image_element_wise_unary<
			
 
				-                    uchar3, float1, decode, ext_type>;
			
 
				-            func_type(in, out, stream, ext);
			
 
				-        }
			
 
				-
			
 
				     }
			
 
				 
			
 
				     namespace p555 {
			
 
				 
			
 
				-        struct encode {
			
 
				-            __device__ static uchar3 Op(float1 in, ext_type ext) {
			
 
				-                // convert depth value to the range [1, 2)
			
 
				-                auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower);
			
 
				-                if (val < 1) { return too_low_val; }
			
 
				-                if (val >= 2) { return too_high_val; }
			
 
				-
			
 
				-                auto bin = (*(uint32_t *) &val) & f32_man_mask;
			
 
				+        struct encode_func {
			
 
				+            __device__ static uchar3 Op(uint32_t bin) {
			
 
				                 bin >>= (23 - 15);
			
 
				 
			
 
				                 // @formatter:off
			
 
				-                static constexpr auto r_mask = (1u << 15) - (1u << 10);
			
 
				-                static constexpr auto g_mask = (1u << 10) - (1u << 5 );
			
 
				-                static constexpr auto b_mask = (1u << 5 ) - (1u << 0 );
			
 
				+                static __constant__ auto r_mask = (1u << 15) - (1u << 10);
			
 
				+                static __constant__ auto g_mask = (1u << 10) - (1u << 5 );
			
 
				+                static __constant__ auto b_mask = (1u << 5 ) - (1u << 0 );
			
 
				 
			
 
				                 uint8_t r = (bin & r_mask) >> 10;
			
 
				                 uint8_t g = (bin & g_mask) >> 5;  if (r & 1) { g = ~g; }
			
@@ -146,9 +154,9 @@ namespace fake_color {
 
				             }
			
 
				         };
			
 
				 
			
 
				-        struct decode {
			
 
				-            __device__ static float1 Op(uchar3 in, ext_type ext) {
			
 
				-                static constexpr auto bit_mask = (1u << 5) - (1u << 0);
			
 
				+        struct decode_func {
			
 
				+            __device__ static uint32_t Op(uchar3 in) {
			
 
				+                static __constant__ auto bit_mask = (1u << 5) - (1u << 0);
			
 
				 
			
 
				                 // @formatter:off
			
 
				                 uint32_t r = in.x >> 3;
			
@@ -157,30 +165,27 @@ namespace fake_color {
 
				                 r <<= 10; g <<= 5; b <<= 0;
			
 
				                 // @formatter:on
			
 
				 
			
 
				-                auto bin = (r | g | b) << (23 - 15);
			
 
				-                bin |= f32_sig_exp_val;
			
 
				-
			
 
				-                auto val = *(float *) &bin;
			
 
				-                val = (val - 1) * (ext.upper - ext.lower) + ext.lower;
			
 
				-                return float1(val);
			
 
				+                return (r | g | b) << (23 - 15);
			
 
				             }
			
 
				         };
			
 
				 
			
 
				-        void call_encode(
			
 
				-                image_type_v2<float1> in, image_type_v2<uchar3> out,
			
 
				-                ext_type ext, cudaStream_t stream) {
			
 
				-            auto func_type = call_image_element_wise_unary<
			
 
				-                    float1, uchar3, encode, ext_type>;
			
 
				-            func_type(in, out, stream, ext);
			
 
				-        }
			
 
				+    }
			
 
				 
			
 
				-        void call_decode(
			
 
				-                image_type_v2<uchar3> in, image_type_v2<float1> out,
			
 
				-                ext_type ext, cudaStream_t stream) {
			
 
				-            auto func_type = call_image_element_wise_unary<
			
 
				-                    uchar3, float1, decode, ext_type>;
			
 
				-            func_type(in, out, stream, ext);
			
 
				-        }
			
 
				+    namespace p800 {
				+
				+        // Grey layout: only the top 8 of the 23 input bits are kept and the
				+        // same byte is written to all three channels.
				+        struct encode_func {
				+            __device__ static uchar3 Op(uint32_t bin) {
				+                const uint32_t level = bin >> (23 - 8);
				+                return uchar3(level, level, level);
				+            }
				+        };
				+
				+        // Inverse of encode_func: the three channels are averaged (integer
				+        // division) and the result is moved back to the top 8 of 23 bits.
				+        struct decode_func {
				+            __device__ static uint32_t Op(uchar3 in) {
				+                const auto channel_sum = 0u + in.x + in.y + in.z;
				+                const auto mean = channel_sum / 3;
				+                return mean << (23 - 8);
				+            }
				+        };
				 
				     }
			
 
				 
			
@@ -196,11 +201,15 @@ void call_fake_color_encode(
 
				 
			
 
				     switch (conf.mode) {
			
 
				         case FAKE_888I: {
			
 
				-            i888::call_encode(in, out, ext, stream);
			
 
				+            call_encode<i888::encode_func>(in, out, ext, stream);
			
 
				             break;
			
 
				         }
			
 
				         case FAKE_555P: {
			
 
				-            p555::call_encode(in, out, ext, stream);
			
 
				+            call_encode<p555::encode_func>(in, out, ext, stream);
			
 
				+            break;
			
 
				+        }
			
 
				+        case FAKE_800P: {
			
 
				+            call_encode<p800::encode_func>(in, out, ext, stream);
			
 
				             break;
			
 
				         }
			
 
				         default: {
			
@@ -217,11 +226,15 @@ void call_fake_color_decode(
 
				 
			
 
				     switch (conf.mode) {
			
 
				         case FAKE_888I: {
			
 
				-            i888::call_decode(in, out, ext, stream);
			
 
				+            call_decode<i888::decode_func>(in, out, ext, stream);
			
 
				             break;
			
 
				         }
			
 
				         case FAKE_555P: {
			
 
				-            p555::call_decode(in, out, ext, stream);
			
 
				+            call_decode<p555::decode_func>(in, out, ext, stream);
			
 
				+            break;
			
 
				+        }
			
 
				+        case FAKE_800P: {
			
 
				+            call_decode<p800::decode_func>(in, out, ext, stream);
			
 
				             break;
			
 
				         }
			
 
				         default: {
			
--- a/src/image_process/cuda_impl/fake_color.cuh
+++ b/src/image_process/cuda_impl/fake_color.cuh
@@ -5,11 +5,12 @@
 
				 
			
 
				 enum fake_color_method : uint8_t { // selects the colour layout used by call_fake_color_encode/decode
			
 
				     FAKE_888I, // handled by namespace i888 (bits interleaved over R, G and B)
			
 
				-    FAKE_555P
			
 
				+    FAKE_555P, // handled by namespace p555 (5 bits per channel)
			
 
				+    FAKE_800P // handled by namespace p800 (8 bits, identical in all three channels)
			
 
				 };
			
 
				 
			
 
				 struct fake_color_config { // parameters for the fake-colour encode/decode entry points
			
 
				-    uint8_t mode = FAKE_888I;
			
 
				+    uint8_t mode = FAKE_555P; // a fake_color_method value; the entry points switch on it
			
 
				     float lower = 0; // presumably forwarded as ext.lower (start of the encodable depth range) - TODO confirm, mapping is outside this view
			
 
				     float upper = 1; // presumably forwarded as ext.upper (end of the encodable depth range) - TODO confirm
			
 
				 };
			
--- a/src/image_process/cuda_impl/image_merge.cu
+++ b/src/image_process/cuda_impl/image_merge.cu
@@ -0,0 +1,36 @@
 
				+#include "image_merge.cuh"
			
 
				+#include "kernel_utility.cuh"
			
 
				+
			
 
				+namespace depth_threshold {
				+
				+    // Depth interval passed by value to the kernel; samples outside
				+    // [lower, upper] are made fully transparent by alpha_mask.
				+    struct ext_type {
				+        float lower;
				+        float upper;
				+    };
				+
				+    // Per-pixel operator: copies the colour and derives alpha from depth.
				+    struct alpha_mask {
				+        __device__ static uchar4 Op(uchar3 img, float1 depth, ext_type ext) {
				+            // both comparisons are false for NaN, so a NaN depth stays opaque
				+            const bool rejected = (depth.x > ext.upper) || (depth.x < ext.lower);
				+            const uint8_t alpha = rejected ? 0 : 255;
				+            return uchar4(img.x, img.y, img.z, alpha);
				+        }
				+    };
				+
				+    // Enqueues alpha_mask over (in1, in2) -> out on `stream`.
				+    void call(
				+            image_type_v2<uchar3> in1, image_type_v2<float1> in2,
				+            image_type_v2<uchar4> out,
				+            ext_type ext, cudaStream_t stream) {
				+        call_image_element_wise_binary_in<
				+                uchar3, float1, uchar4, alpha_mask, ext_type>(
				+                in1, in2, out, stream, ext);
				+    }
				+
				+}
			
 
				+
			
 
				+// Public entry point: converts the config into the kernel's ext_type and
				+// enqueues the depth-threshold alpha mask on `stream`.
				+void call_depth_mask(image_type_v2<uchar3> img, image_type_v2<float1> depth,
				+                     image_type_v2<uchar4> out,
				+                     depth_mask_config conf, cudaStream_t stream) {
				+    depth_threshold::ext_type range{conf.lower, conf.upper};
				+    depth_threshold::call(img, depth, out, range, stream);
				+}
			
--- a/src/image_process/cuda_impl/image_merge.cuh
+++ b/src/image_process/cuda_impl/image_merge.cuh
@@ -0,0 +1,15 @@
 
				+#ifndef DEPTHGUIDE_IMAGE_MERGE_H
			
 
				+#define DEPTHGUIDE_IMAGE_MERGE_H
			
 
				+
			
 
				+#include "image_utility.cuh"
			
 
				+
			
 
				+struct depth_mask_config { // thresholds consumed by call_depth_mask
			
 
				+    float lower = 0; // pixels with depth below this get alpha 0
			
 
				+    float upper = 1; // pixels with depth above this get alpha 0
			
 
				+};
			
 
				+
			
 
				+void call_depth_mask(image_type_v2<uchar3> img, image_type_v2<float1> depth, // colour and depth inputs; the launcher asserts they have equal size
			
 
				+                     image_type_v2<uchar4> out, // colour copied from img, alpha 255 inside [conf.lower, conf.upper] and 0 outside
			
 
				+                     depth_mask_config conf, cudaStream_t stream); // the kernel is enqueued on stream
			
 
				+
			
 
				+#endif //DEPTHGUIDE_IMAGE_MERGE_H
			
--- a/src/image_process/cuda_impl/image_utility.cuh
+++ b/src/image_process/cuda_impl/image_utility.cuh
@@ -1,18 +1,36 @@
 
				 #ifndef DEPTHGUIDE_IMAGE_UTILITY_CUH
			
 
				 #define DEPTHGUIDE_IMAGE_UTILITY_CUH
			
 
				 
			
 
				+#include <cassert>
			
 
				 #include <cstdint>
			
 
				 
			
 
				 template<typename PixelT>
			
 
				 struct image_type_v2 {
			
 
				+    using this_type = image_type_v2<PixelT>;
			
 
				+
			
 
				     PixelT *ptr = nullptr;
			
 
				     ushort width = 0, height = 0; // in pixel
			
 
				     ushort pitch = 0; // in bytes
			
 
				 
			
 
				-    __device__ auto at(ushort y, ushort x = 0) {
			
 
				+    __device__ __host__ auto at(ushort y, ushort x = 0) const {
			
 
				         auto row_ptr = (char *) ptr + y * pitch;
			
 
				         return (PixelT *) row_ptr + x;
			
 
				     }
			
 
				+
			
 
				+    __host__ auto sub_image(int row = 0, int col = 0,
			
 
				+                            int w = -1, int h = -1) const {
			
 
				+        if (w == -1) { w = width - col; }
			
 
				+        if (h == -1) { h = height - row; }
			
 
				+        return this_type(at(row, col), w, h, pitch);
			
 
				+    }
			
 
				+
			
 
				+    template<typename PixelU>
			
 
				+    __host__ auto cast() const {
			
 
				+        using ret_type = image_type_v2<PixelU>;
			
 
				+        auto ret_width = width * sizeof(PixelT) / sizeof(PixelU);
			
 
				+        assert(width * sizeof(PixelT) == ret_width * sizeof(PixelU));
			
 
				+        return ret_type((PixelU *) ptr, ret_width, height, pitch);
			
 
				+    }
			
 
				 };
			
 
				 
			
 
				 #endif //DEPTHGUIDE_IMAGE_UTILITY_CUH
			
--- a/src/image_process/cuda_impl/kernel_utility.cuh
+++ b/src/image_process/cuda_impl/kernel_utility.cuh
@@ -4,6 +4,7 @@
 
				 #include "image_utility.cuh"
			
 
				 
			
 
				 #include <cassert>
			
 
				+#include <tuple>
			
 
				 
			
 
				 template<typename PixIn, typename PixOut, typename Func, typename... Ext>
			
 
				 __global__ void image_elementwise_unary(image_type_v2<PixIn> in,
			
@@ -23,21 +24,59 @@ __global__ void image_elementwise_unary(image_type_v2<PixIn> in,
 
				     }
			
 
				 }
			
 
				 
			
 
				-template<typename PixIn, typename PixOut, typename Func, typename... Ext>
			
 
				-void call_image_element_wise_unary(image_type_v2<PixIn> in, image_type_v2<PixOut> out,
			
 
				-                                   cudaStream_t stream, Ext... ext) {
			
 
				-    assert(out.width >= in.width);
			
 
				-    assert(out.height >= in.height);
			
 
				+// Kernel: out(y, x) = Func::Op(in1(y, x), in2(y, x), ext...) for every pixel
				+// of in1. Both dimensions are walked with grid-stride loops, so any launch
				+// configuration covers the whole image. in2 and out are indexed with in1's
				+// coordinates and must be at least as large as in1.
				+template<typename PixIn1, typename PixIn2,
				+        typename PixOut, typename Func, typename... Ext>
				+__global__ void image_elementwise_binary_in(image_type_v2<PixIn1> in1, image_type_v2<PixIn2> in2,
				+                                            image_type_v2<PixOut> out,
				+                                            Ext... ext) {
				+    const auto first_row = blockDim.y * blockIdx.y + threadIdx.y;
				+    const auto row_step = gridDim.y * blockDim.y;
				+    const auto first_col = blockDim.x * blockIdx.x + threadIdx.x;
				+    const auto col_step = gridDim.x * blockDim.x;
				+
				+    for (auto row = first_row; row < in1.height; row += row_step) {
				+        for (auto col = first_col; col < in1.width; col += col_step) {
				+            *out.at(row, col) = Func::Op(*in1.at(row, col), *in2.at(row, col), ext...);
				+        }
				+    }
				+}
			
 
				+
			
 
				+inline std::tuple<dim3, dim3> get_kernel_size(ushort width, ushort height) { // returns (grid_dim, block_dim), in that order, for an image of width x height pixels
			
 
				     static constexpr auto block_x = 32;
			
 
				     static constexpr auto block_y = 8; // 32 x 8 threads per block
			
 
				     // https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/core/cuda/elementwise.cuh
			
 
				     static constexpr auto max_grids = 4352; // TODO: calculate by hardware at runtime
			
 
				-    auto grid_y = std::max<uint>(1, std::min<uint>(in.height / block_y, max_grids));
			
 
				-    auto grid_x = std::max<uint>(1, std::min<uint>(in.width / block_x, max_grids / grid_y));
			
 
				+    auto grid_y = std::max<uint>(1, std::min<uint>(height / block_y, max_grids)); // floor division: the grid may not tile the whole image, the kernels in this file loop with strides
			
 
				+    auto grid_x = std::max<uint>(1, std::min<uint>(width / block_x, max_grids / grid_y)); // remaining block budget after grid_y, but never fewer than one block
			
 
				     auto block_dim = dim3(block_x, block_y, 1);
			
 
				     auto grid_dim = dim3(grid_x, grid_y, 1);
			
 
				+    return std::make_tuple(grid_dim, block_dim);
			
 
				+}
			
 
				+
			
 
				+template<typename PixIn, typename PixOut, typename Func, typename... Ext> // Func supplies the per-pixel operator, Ext the extra by-value kernel arguments
			
 
				+void call_image_element_wise_unary(image_type_v2<PixIn> in, image_type_v2<PixOut> out, // host launcher for image_elementwise_unary
			
 
				+                                   cudaStream_t stream, Ext... ext) {
			
 
				+    assert(out.width >= in.width); // out must be able to hold every pixel of in
			
 
				+    assert(out.height >= in.height);
			
 
				+    auto [grid_dim, block_dim] = get_kernel_size(in.width, in.height); // launch shape derived from the input size
			
 
				     auto func_type = image_elementwise_unary<PixIn, PixOut, Func, Ext...>; // despite the name this is the instantiated kernel, not a type
			
 
				     func_type<<<grid_dim, block_dim, 0, stream>>>(in, out, ext...); // asynchronous launch on stream; no launch-error check is done here
			
 
				 }
			
 
				 
			
 
				+// Host launcher for image_elementwise_binary_in: both inputs must have the
				+// same size and `out` must be at least that large. The kernel is enqueued
				+// on `stream`; the launch result is not checked here.
				+template<typename PixIn1, typename PixIn2,
				+        typename PixOut, typename Func, typename... Ext>
				+void call_image_element_wise_binary_in(image_type_v2<PixIn1> in1, image_type_v2<PixIn2> in2,
				+                                       image_type_v2<PixOut> out,
				+                                       cudaStream_t stream, Ext... ext) {
				+    // inputs are indexed with the same coordinates
				+    assert(in1.width == in2.width);
				+    assert(in1.height == in2.height);
				+    // the output has room for every input pixel
				+    assert(out.width >= in1.width);
				+    assert(out.height >= in1.height);
				+
				+    auto [grid_dim, block_dim] = get_kernel_size(in1.width, in1.height);
				+    image_elementwise_binary_in<PixIn1, PixIn2, PixOut, Func, Ext...>
				+            <<<grid_dim, block_dim, 0, stream>>>(in1, in2, out, ext...);
				+}
			
 
				+
			
 
				 #endif //DEPTHGUIDE_KERNEL_UTILITY_CUH
			
--- a/src/image_process/cuda_impl/pixel_convert.cu
+++ b/src/image_process/cuda_impl/pixel_convert.cu
@@ -1,11 +1,14 @@
 
				 #include "pixel_convert.cuh"
			
 
				 #include "kernel_utility.cuh"
			
 
				 
			
 
				+#include <glm/glm.hpp>
			
 
				+#include <cuda/std/limits>
			
 
				+
			
 
				 template<typename PixIn, typename PixOut>
			
 
				 struct cvt_rgb_bgra {
			
 
				     __device__ static constexpr PixOut Op(PixIn in) {
			
 
				         auto out = PixOut();
			
 
				-        out.w = 1.0;
			
 
				+        out.w = 255; // TODO: use type traits
			
 
				         out.z = in.x;
			
 
				         out.y = in.y;
			
 
				         out.x = in.z;
			
@@ -22,3 +25,71 @@ void call_cvt_rgb_bgra_u8(image_type_v2<uchar3> in,
 
				             uchar3, uchar4, cvt_rgb_bgra_u8>;
			
 
				     func_type(in, out, stream);
			
 
				 }
			
 
				+
			
 
				+// Converts an 8-bit triple to floats by dividing each channel by 255.
				+__device__ constexpr glm::vec3 to_vec3(uchar3 vec) {
				+    const auto raw = glm::vec3(vec.x, vec.y, vec.z);
				+    return raw / 255.f;
				+}
			
 
				+
			
 
				+// Converts a float triple to 8 bits per channel: each component is clamped
				+// to [0, 1], scaled by 255 and rounded to the nearest integer.
				+__device__ constexpr uchar3 to_uchar3(glm::vec3 vec) {
				+    // The float -> unsigned char conversion below truncates. Adding 0.5
				+    // first turns that into round-to-nearest and removes the downward
				+    // bias of plain truncation (the maximum, 255.5, still yields 255).
				+    auto ret = glm::clamp(vec, 0.f, 1.f) * 255.f + 0.5f;
				+    return uchar3(ret.x, ret.y, ret.z);
				+}
			
 
				+
			
 
				+namespace yuv_to_rgb {
				+
				+    // glm::mat3 takes its arguments column by column, so with the columns
				+    // below cvt_mat * (Y, U, V) gives
				+    //   R = Y + 1.5748 V,  G = Y - 0.1873 U - 0.4681 V,  B = Y + 1.8556 U
				+    // @formatter:off
				+    __device__ __constant__ auto cvt_mat = glm::mat3( // BT.709
				+            1,       1,        1,
				+            0,       -0.1873,  1.8556,
				+            1.5748,  -0.4681,  0);
				+    // @formatter:on
				+
				+    // Per-pixel operator: `in` carries (Y, U, V) as 8-bit values.
				+    struct cvt {
				+        __device__ static constexpr uchar3 Op(uchar3 in) {
				+            // centre the two chroma channels around zero before the matrix
				+            const auto centred = to_vec3(in) - glm::vec3(0, 0.5, 0.5);
				+            return to_uchar3(cvt_mat * centred);
				+        }
				+    };
				+
				+}
			
 
				+
			
 
				+// Kernel: one loop iteration handles one chroma sample and the 2 x 2 luma
				+// pixels that share it. The chroma plane is walked with grid-stride loops.
				+// luma_img and rgb_img are indexed up to (2 * chroma height, 2 * chroma
				+// width) and must be at least that large.
				+__global__ void nv12_to_rgb(image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img,
				+                            image_type_v2<uchar3> rgb_img) {
				+    const auto first_row = blockDim.y * blockIdx.y + threadIdx.y;
				+    const auto row_step = gridDim.y * blockDim.y;
				+    const auto first_col = blockDim.x * blockIdx.x + threadIdx.x;
				+    const auto col_step = gridDim.x * blockDim.x;
				+
				+    for (auto cy = first_row; cy < chroma_img.height; cy += row_step) {
				+        for (auto cx = first_col; cx < chroma_img.width; cx += col_step) {
				+            const auto uv = *chroma_img.at(cy, cx);
				+
				+#pragma unroll
				+            for (auto dy = 0; dy < 2; ++dy) {
				+#pragma unroll
				+                for (auto dx = 0; dx < 2; ++dx) {
				+                    const auto iy = 2 * cy + dy;
				+                    const auto ix = 2 * cx + dx;
				+                    const auto y = *luma_img.at(iy, ix);
				+                    *rgb_img.at(iy, ix) = yuv_to_rgb::cvt::Op(uchar3(y.x, uv.x, uv.y));
				+                }
				+            }
				+        }
				+    }
				+}
			
 
				+
			
 
				+// Host launcher for nv12_to_rgb. `in` is one single-channel buffer whose
				+// first two thirds of rows are luma and whose last third is reinterpreted
				+// as two-byte chroma pairs. `out` must have the luma dimensions.
				+void call_nv12_to_rgb(image_type_v2<uchar1> in,
				+                      image_type_v2<uchar3> out,
				+                      cudaStream_t stream) {
				+    assert(in.height % 3 == 0);
				+    auto picture_rows = in.height / 3 * 2;
				+    assert(out.width == in.width);
				+    assert(out.height == picture_rows);
				+
				+    // split the combined buffer into its two planes
				+    auto y_plane = in.sub_image(0, 0, -1, picture_rows);
				+    auto uv_plane = in.sub_image(picture_rows).cast<uchar2>();
				+
				+    // one thread-loop iteration per chroma sample
				+    auto [grid_dim, block_dim] = get_kernel_size(uv_plane.width, uv_plane.height);
				+    nv12_to_rgb<<<grid_dim, block_dim, 0, stream>>>(y_plane, uv_plane, out);
				+}
			
--- a/src/image_process/cuda_impl/pixel_convert.cuh
+++ b/src/image_process/cuda_impl/pixel_convert.cuh
@@ -7,5 +7,8 @@ void call_cvt_rgb_bgra_u8(image_type_v2<uchar3> in,
 
				                           image_type_v2<uchar4> out,
			
 
				                           cudaStream_t stream);
			
 
				 
			
 
				+void call_nv12_to_rgb(image_type_v2<uchar1> in, // combined buffer: luma rows followed by chroma rows; the implementation asserts height % 3 == 0
			
 
				+                      image_type_v2<uchar3> out, // asserted to be in.width wide and in.height / 3 * 2 high
			
 
				+                      cudaStream_t stream); // the kernel is enqueued on stream
			
 
				 
			
 
				 #endif //DEPTHGUIDE_PIXEL_CONVERT_CUH
			
--- a/src/image_process/impl/versatile_convertor.cpp
+++ b/src/image_process/impl/versatile_convertor.cpp
@@ -50,6 +50,18 @@ void versatile_convertor::impl::cvt_rgb_bgra() {
 
				     OBJ_SAVE(conf.out_name, create_image(img_out));
			
 
				 }
			
 
				 
			
 
				+// Converts the combined NV12 image stored under conf.in_name to RGB and
				+// saves the result under conf.out_name. Does nothing when the input
				+// object is missing.
				+void versatile_convertor::impl::cvt_nv12_rgb() {
				+    auto src = OBJ_QUERY(image_u8c1, conf.in_name);
				+    if (src == nullptr) { return; }
				+
				+    // the output is as wide as the buffer and two thirds as high
				+    auto src_size = src->size();
				+    auto rgb_size = cv::Size(src_size.width, src_size.height / 3 * 2);
				+    auto rgb = create_image_info<uchar3>(rgb_size, MEM_CUDA);
				+
				+    call_nv12_to_rgb(src->as_cuda(conf.stream), rgb.as_cuda(), conf.stream->cuda);
				+    OBJ_SAVE(conf.out_name, create_image(rgb));
				+}
			
 
				+
			
 
				 void versatile_convertor::impl::cvt_fake_encode(fake_color_method mode) {
			
 
				     auto img = OBJ_QUERY(image_f32c1, conf.in_name);
			
 
				     if (img == nullptr) return;
			
@@ -108,10 +120,13 @@ void versatile_convertor::impl::process(obj_name_type name) {
 
				     switch (conf.cvt_opt) {
			
 
				         // @formatter:off
			
 
				         case CVT_RGB_BGRA: { cvt_rgb_bgra(); break; }
			
 
				+        case CVT_NV12_RGB: { cvt_nv12_rgb(); break; }
			
 
				         case CVT_FAKE_ENCODE_888I: { cvt_fake_encode(FAKE_888I); break; }
			
 
				         case CVT_FAKE_ENCODE_555P: { cvt_fake_encode(FAKE_555P); break; }
			
 
				+        case CVT_FAKE_ENCODE_800P: { cvt_fake_encode(FAKE_800P); break; }
			
 
				         case CVT_FAKE_DECODE_888I: { cvt_fake_decode(FAKE_888I); break; }
			
 
				         case CVT_FAKE_DECODE_555P: { cvt_fake_decode(FAKE_555P); break; }
			
 
				+        case CVT_FAKE_DECODE_800P: { cvt_fake_decode(FAKE_800P); break; }
			
 
				         case CVT_HALF_SPLIT: { cvt_half_split(); break; }
			
 
				         // @formatter:on
			
 
				         default: {
			
--- a/src/image_process/impl/versatile_convertor_impl.h
+++ b/src/image_process/impl/versatile_convertor_impl.h
@@ -26,6 +26,8 @@ struct versatile_convertor::impl {
 
				 
			
 
				     void cvt_rgb_bgra();
			
 
				 
			
 
				+    void cvt_nv12_rgb();
			
 
				+
			
 
				     void cvt_fake_encode(fake_color_method mode);
			
 
				 
			
 
				     void cvt_fake_decode(fake_color_method mode);
			
--- a/src/image_process/versatile_convertor.h
+++ b/src/image_process/versatile_convertor.h
@@ -8,11 +8,14 @@
 
				 
			
 
				 enum convert_options { // conversion selected through create_config::cvt_opt; values are implicit, so inserting an enumerator renumbers every later one
			
 
				     CVT_RGB_BGRA,
			
 
				+    CVT_NV12_RGB, // dispatched to cvt_nv12_rgb()
			
 
				 
			
 
				     CVT_FAKE_ENCODE_888I,
			
 
				     CVT_FAKE_ENCODE_555P,
			
 
				+    CVT_FAKE_ENCODE_800P, // dispatched to cvt_fake_encode(FAKE_800P)
			
 
				     CVT_FAKE_DECODE_888I,
			
 
				     CVT_FAKE_DECODE_555P,
			
 
				+    CVT_FAKE_DECODE_800P, // dispatched to cvt_fake_decode(FAKE_800P)
			
 
				 
			
 
				     CVT_HALF_SPLIT
			
 
				 };
			
--- a/src/impl/apps/depth_guide/depth_guide.cpp
+++ b/src/impl/apps/depth_guide/depth_guide.cpp
@@ -10,7 +10,7 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
 
				     OBJ_SAVE(img_color, image_u8c3());
			
 
				     OBJ_SAVE(img_depth, image_f32c1());
			
 
				     OBJ_SAVE(img_depth_fake, image_u8c3());
			
 
				-    auto fake_info = fake_color_config{.mode = FAKE_555P, .lower = 200, .upper = 1000};
			
 
				+    auto fake_info = fake_color_config{.mode = FAKE_800P, .lower = 200, .upper = 1000};
			
 
				     OBJ_SAVE(img_depth_fake_info, versatile_convertor_impl::encode_config(fake_info));
			
 
				     OBJ_SAVE(img_out, image_u8c4());
			
 
				 
			
@@ -23,7 +23,7 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
 
				 
			
 
				     auto fake_conf = versatile_convertor::create_config{
			
 
				             .in_name = img_depth, .ext_in = img_depth_fake_info, .out_name = img_depth_fake,
			
 
				-            .cvt_opt = CVT_FAKE_ENCODE_555P, .stream = default_cuda_stream,
			
 
				+            .cvt_opt = CVT_FAKE_ENCODE_800P, .stream = default_cuda_stream,
			
 
				     };
			
 
				     depth_encode = std::make_unique<versatile_convertor>(fake_conf);
			
 
				 
			
@@ -44,7 +44,9 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
 
				     bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
			
 
				 
			
 
				     auto out_streamer_conf = image_streamer::create_config{
			
 
				-            .img_name = img_out, .asio_ctx = conf.asio_ctx,
			
 
				+            .img_name = img_out,
			
 
				+            .ext_name = img_depth_fake_info, // comment this for tiny player to work
			
 
				+            .asio_ctx = conf.asio_ctx,
			
 
				             .cuda_ctx = conf.cuda_ctx, .stream = default_cuda_stream
			
 
				     };
			
 
				     out_streamer = std::make_unique<image_streamer>(out_streamer_conf);
			
--- a/src/impl/apps/remote_ar/remote_ar.cpp
+++ b/src/impl/apps/remote_ar/remote_ar.cpp
@@ -1,6 +1,7 @@
 
				 #include "remote_ar.h"
			
 
				 #include "core/imgui_utility.hpp"
			
 
				 #include "core/yaml_utility.hpp"
			
 
				+#include "network/binary_utility.hpp"
			
 
				 
			
 
				 #include <boost/asio/post.hpp>
			
 
				 
			
@@ -20,6 +21,13 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
 
				     OBJ_SAVE(aug_right, image_u8c3());
			
 
				     OBJ_SAVE(img_out, image_u8c4()); // ARGB
			
 
				 
			
 
				+    OBJ_SAVE(guide_combine, image_u8c1());
			
 
				+    OBJ_SAVE(guide_info, data_type());
			
 
				+    OBJ_SAVE(guide_combine_rgb, image_u8c3());
			
 
				+    OBJ_SAVE(guide_img, image_u8c3());
			
 
				+    OBJ_SAVE(guide_depth_fake, image_u8c3());
			
 
				+    OBJ_SAVE(guide_depth, image_f32c1());
			
 
				+
			
 
				     // process callbacks caused by OBJ_SAVE
			
 
				     asio_ctx->poll();
			
 
				 
			
@@ -61,6 +69,30 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
 
				         cam_right.img_proc->change_config({.is_mono = info.is_mono});
			
 
				     });
			
 
				 
			
 
				+    auto guide_in_conf = image_player::create_config{
			
 
				+            .img_name = guide_combine, .ext_name = guide_info,
			
 
				+            .ctx = asio_ctx, .stream = default_cuda_stream,
			
 
				+    };
			
 
				+    guide_player = std::make_unique<image_player>(guide_in_conf);
			
 
				+
			
 
				+    auto guide_cvt_conf = versatile_convertor::create_config{
			
 
				+            .in_name = guide_combine, .out_name = guide_combine_rgb,
			
 
				+            .cvt_opt = CVT_NV12_RGB, .stream = default_cuda_stream,
			
 
				+    };
			
 
				+    guide_cvt = std::make_unique<versatile_convertor>(guide_cvt_conf);
			
 
				+
			
 
				+    auto guide_split_conf = versatile_convertor::create_config{
			
 
				+            .in_name = guide_combine_rgb, .out_name = guide_img, .ext_out = guide_depth_fake,
			
 
				+            .cvt_opt = CVT_HALF_SPLIT, .stream = default_cuda_stream,
			
 
				+    };
			
 
				+    guide_split = std::make_unique<versatile_convertor>(guide_split_conf);
			
 
				+
			
 
				+    auto guide_decode_conf = versatile_convertor::create_config{
			
 
				+            .in_name = guide_depth_fake, .ext_in = guide_info, .out_name = guide_depth,
			
 
				+            .cvt_opt = CVT_FAKE_DECODE_800P, .stream = default_cuda_stream,
			
 
				+    };
			
 
				+    guide_decode = std::make_unique<versatile_convertor>(guide_decode_conf);
			
 
				+
			
 
				     auto aug_conf = augment_manager::create_config{
			
 
				             .item_list = augment_manager::item_list_from_yaml(LOAD_LIST("augment_list")),
			
 
				             .sophiar_conn = sophiar_conn.get()
			
@@ -111,14 +143,24 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
 
				         post(*asio_ctx, [=, this] { stereo_aug->resize(size); });
			
 
				     });
			
 
				 
			
 
				+//    auto bg_viewer_conf = image_viewer::create_config{
			
 
				+//            .mode = VIEW_STEREO, .flip_y = false,
			
 
				+//            .stream = default_cuda_stream,
			
 
				+//    };
			
 
				+//    auto &stereo_conf = bg_viewer_conf.extra.stereo;
			
 
				+//    stereo_conf.c_fmt = COLOR_RGB;
			
 
				+//    stereo_conf.left_name = aug_left;
			
 
				+//    stereo_conf.right_name = aug_right;
			
 
				+//    bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
			
 
				+
			
 
				     auto bg_viewer_conf = image_viewer::create_config{
			
 
				-            .mode = VIEW_STEREO, .flip_y = false,
			
 
				+            .mode = VIEW_COLOR_DEPTH, .flip_y = true,
			
 
				             .stream = default_cuda_stream,
			
 
				     };
			
 
				-    auto &stereo_conf = bg_viewer_conf.extra.stereo;
			
 
				-    stereo_conf.c_fmt = COLOR_RGB;
			
 
				-    stereo_conf.left_name = aug_left;
			
 
				-    stereo_conf.right_name = aug_right;
			
 
				+    auto &bg_extra_conf = bg_viewer_conf.extra.color_depth;
			
 
				+    bg_extra_conf.c_fmt = COLOR_RGB;
			
 
				+    bg_extra_conf.c_name = guide_img;
			
 
				+    bg_extra_conf.d_name = guide_depth;
			
 
				     bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
			
 
				 }
			
 
				 
			
@@ -160,6 +202,11 @@ void app_remote_ar::show_ui() {
 
				             }
			
 
				         }
			
 
				 
			
 
				+        if (ImGui::CollapsingHeader("Depth Guide")) {
			
 
				+            auto id_guard = imgui_id_guard("depth_guide");
			
 
				+            guide_player->show();
			
 
				+        }
			
 
				+
			
 
				         if (ImGui::CollapsingHeader("Stereo")) {
			
 
				             auto id_guard = imgui_id_guard("stereo");
			
 
				             stereo_aug->show();
			
--- a/src/impl/apps/remote_ar/remote_ar.h
+++ b/src/impl/apps/remote_ar/remote_ar.h
@@ -7,9 +7,11 @@
 
				 #include "module/augment_manager.h"
			
 
				 #include "module/camera_augment_helper.h"
			
 
				 #include "module/image_augment_helper.h"
			
 
				+#include "module/image_player.h"
			
 
				 #include "module/image_streamer.h"
			
 
				 #include "module/image_viewer.h"
			
 
				 #include "image_process/image_process_ui.h"
			
 
				+#include "image_process/versatile_convertor.h"
			
 
				 #include "impl/app_base.h"
			
 
				 
			
 
				 // sophiar
			
@@ -37,6 +39,11 @@ private:
 
				         rgb_left, rgb_right,
			
 
				         aug_left, aug_right,
			
 
				         img_out,
			
 
				+
			
 
				+        guide_combine, guide_info,
			
 
				+        guide_combine_rgb,
			
 
				+        guide_img, guide_depth_fake,
			
 
				+        guide_depth,
			
 
				     };
			
 
				 
			
 
				     struct camera_module {
			
@@ -63,6 +70,11 @@ private:
 
				     std::unique_ptr<stereo_augment_helper> stereo_aug;
			
 
				     std::unique_ptr<image_streamer> out_streamer;
			
 
				 
			
 
				+    std::unique_ptr<image_player> guide_player;
			
 
				+    std::unique_ptr<versatile_convertor> guide_cvt;
			
 
				+    std::unique_ptr<versatile_convertor> guide_split;
			
 
				+    std::unique_ptr<versatile_convertor> guide_decode;
			
 
				+
			
 
				     camera_module cam_left;
			
 
				     camera_module cam_right;
			
 
				 
			
--- a/src/module/impl/image_player.cpp
+++ b/src/module/impl/image_player.cpp
@@ -106,6 +106,7 @@ void image_player::impl::stop() {
 
				         aux_ctx->stop();
			
 
				         aux_thread->join();
			
 
				         aux_thread = nullptr;
			
 
				+        frame_queue = nullptr;
			
 
				     }
			
 
				 
			
 
				     dec_nvdec = nullptr;
			
--- a/src/network/binary_utility.hpp
+++ b/src/network/binary_utility.hpp
@@ -278,7 +278,7 @@ public:
 
				     }
			
 
				 
			
 
				     void write_data(const data_type &_data) {
			
 
				-        std::copy_n(cur_ptr, _data.size, _data.start_ptr());
			
 
				+        std::copy_n(_data.start_ptr(), _data.size, cur_ptr);
			
 
				         cur_ptr += _data.size;
			
 
				         assert(cur_ptr <= end_ptr());
			
 
				     }
			
--- a/src/network_v3/sender_udp_fec.cpp
+++ b/src/network_v3/sender_udp_fec.cpp
@@ -268,9 +268,11 @@ struct sender_udp_fec::impl {
 
				         // handle request
			
 
				         switch (req.request_type) {
			
 
				             case 'X': {
			
 
				-                SPDLOG_INFO("Client {}:{} left.",
			
 
				-                            remote_ep->address().to_string(), remote_ep->port());
			
 
				-                close_connection();
			
 
				+                if (remote_ep != nullptr) {
			
 
				+                    SPDLOG_INFO("Client {}:{} left.",
			
 
				+                                remote_ep->address().to_string(), remote_ep->port());
			
 
				+                    close_connection();
			
 
				+                }
			
 
				                 return;
			
 
				             }
			
 
				             case 'C': {
			
--- a/src/render/impl/shader/tex_nv12.frag
+++ b/src/render/impl/shader/tex_nv12.frag
@@ -8,10 +8,10 @@ uniform sampler2D chroma_tex;
 
				 in vec2 frag_uv;
			
 
				 
			
 
				 // @formatter:off
			
 
				-const mat3 cvt_mat = mat3(
			
 
				+const mat3 cvt_mat = mat3( // BT.709
			
 
				     1,       1,        1,
			
 
				-    0,       -0.3455,  1.7790,
			
 
				-    1.4075,  -0.7169,  0);
			
 
				+    0,       -0.1873,  1.8556,
			
 
				+    1.5748,  -0.4681,  0);
			
 
				 // @formatter:on
			
 
				 
			
 
				 layout (location = 0) out vec4 frag_color;