Explorar o código

Implemented guide image transfer.

jcsyshc hai 1 ano
pai
achega
e2d1aaa957

+ 8 - 3
src/image_process/cuda_impl/CMakeLists.txt

@@ -4,8 +4,9 @@ project(ImageProcessCuda LANGUAGES CXX CUDA)
 set(CMAKE_CXX_STANDARD 20)
 
 add_library(${PROJECT_NAME}
-        pixel_convert.cu
-        fake_color.cu)
+        fake_color.cu
+        image_merge.cu
+        pixel_convert.cu)
 
 # CUDA config
 find_package(CUDAToolkit REQUIRED)
@@ -13,7 +14,11 @@ target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
 set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "75;86")
 
+# glm config
+find_package(glm REQUIRED)
+target_link_libraries(${PROJECT_NAME} glm::glm)
+
 target_compile_options(${PROJECT_NAME} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
         -Xptxas -v # show kernel info
-#        -g -G      # debug options
+        #        -g -G      # debug options
         >)

+ 87 - 74
src/image_process/cuda_impl/fake_color.cu

@@ -53,87 +53,95 @@ namespace fake_color {
     // sign and exp part of a f32 value within the range of [1.0, 2.0)
     constexpr auto f32_sig_exp_val = (1u << 30) - (1u << 23);
 
+    template<typename EncFunc>
+    struct encode { // shared encoder front end: normalizes the input, then hands the bits to EncFunc::Op
+        __device__ static uchar3 Op(float1 in, ext_type ext) {
+            // convert depth value to the range [1, 2)
+            if (in.x == 0) { return too_high_val; } // an exact zero is reported as too_high_val (presumably zero marks a missing sample - confirm)
+            auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower); // ext.lower maps to 1, ext.upper maps to 2
+            if (val < 1) { return too_low_val; } // input below ext.lower
+            if (val >= 2) { return too_high_val; } // input at or above ext.upper
+            auto bin = (*(uint32_t *) &val) & f32_man_mask; // reinterpret the float's bits and keep those selected by f32_man_mask (defined outside this view)
+            return EncFunc::Op(bin); // NOTE(review): a NaN val fails both range checks above and still reaches this point
+        }
+    };
+
+    template<typename DecFunc>
+    struct decode { // shared decoder back end: DecFunc::Op recovers the bits, this wrapper rebuilds the float
+        __device__ static float1 Op(uchar3 in, ext_type ext) {
+            auto bin = DecFunc::Op(in);
+            bin |= f32_sig_exp_val; // add the sign/exponent bits of a float in [1, 2)
+            auto val = *(float *) &bin; // reinterpret the 32-bit pattern as a float
+            val = (val - 1) * (ext.upper - ext.lower) + ext.lower; // map [1, 2) back onto [ext.lower, ext.upper)
+            return float1(val); // NOTE(review): the too_low_val / too_high_val marker colours are not special-cased here
+        }
+    };
+
+    template<typename EncFunc>
+    void call_encode( // host-side launcher: runs encode<EncFunc> through call_image_element_wise_unary
+            image_type_v2<float1> in, image_type_v2<uchar3> out,
+            ext_type ext, cudaStream_t stream) {
+        auto func_type = call_image_element_wise_unary< // despite its name, this holds a function pointer, not a type
+                float1, uchar3, encode<EncFunc>, ext_type>;
+        func_type(in, out, stream, ext); // ext is passed on to the kernel launch
+    }
+
+    template<typename DecFunc>
+    void call_decode( // host-side launcher: runs decode<DecFunc> through call_image_element_wise_unary
+            image_type_v2<uchar3> in, image_type_v2<float1> out,
+            ext_type ext, cudaStream_t stream) {
+        auto func_type = call_image_element_wise_unary< // despite its name, this holds a function pointer, not a type
+                uchar3, float1, decode<DecFunc>, ext_type>;
+        func_type(in, out, stream, ext); // ext is passed on to the kernel launch
+    }
+
     namespace i888 {
 
         // @formatter:off
-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> r_masks = {
+        __device__ __constant__ cuda::std::array<uint32_t, 8> r_masks = { // single-bit masks for bits 23, 20, ..., 2 (every third bit)
                 1u << 23, 1u << 20, 1u << 17, 1u << 14,
                 1u << 11, 1u <<  8, 1u <<  5, 1u <<  2
         };
-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> g_masks = {
+        __device__ __constant__ cuda::std::array<uint32_t, 8> g_masks = { // single-bit masks for bits 22, 19, ..., 1
                 1u << 22, 1u << 19, 1u << 16, 1u << 13,
                 1u << 10, 1u <<  7, 1u <<  4, 1u <<  1
         };
-        constexpr __device__ __constant__ cuda::std::array<uint32_t, 8> b_masks = {
+        __device__ __constant__ cuda::std::array<uint32_t, 8> b_masks = { // single-bit masks for bits 21, 18, ..., 0
                 1u << 21, 1u << 18, 1u << 15, 1u << 12,
                 1u <<  9, 1u <<  6, 1u <<  3,  1u << 0
         };
         // @formatter:on
 
-        struct encode {
-            __device__ static uchar3 Op(float1 in, ext_type ext) {
-                // convert depth value to the range [1, 2)
-                auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower);
-                if (val < 1) { return too_low_val; }
-                if (val >= 2) { return too_high_val; }
-
-                auto bin = (*(uint32_t *) &val) & f32_man_mask;
+        struct encode_func { // 888I packing: spreads the input bits over three channels using r_masks / g_masks / b_masks
+            __device__ static uchar3 Op(uint32_t bin) {
                 bin <<= (24 - 23); // uchar3 holds 24 bits; shift the input up by one bit to fill them
-
                 return uchar3(bit_compress(bin, r_masks), // bit_compress is defined outside this view; presumably gathers the masked bits into one byte - confirm
                               bit_compress(bin, g_masks),
                               bit_compress(bin, b_masks));
             }
         };
 
-        struct decode {
-            __device__ static float1 Op(uchar3 in, ext_type ext) {
+        struct decode_func { // inverse of the 888I packing: rebuilds the bit word from the three channels
+            __device__ static uint32_t Op(uchar3 in) {
                 auto bin = bit_uncompress(in.x, r_masks)
                            | bit_uncompress(in.y, g_masks)
                            | bit_uncompress(in.z, b_masks);
-
-                bin = (bin >> (24 - 23)) | f32_sig_exp_val;
-
-                auto val = *(float *) &bin;
-                val = (val - 1) * (ext.upper - ext.lower) + ext.lower;
-                return float1(val);
+                return bin >> (24 - 23); // undo the one-bit left shift applied when encoding
             }
         };
 
-        void call_encode(
-                image_type_v2<float1> in, image_type_v2<uchar3> out,
-                ext_type ext, cudaStream_t stream) {
-            auto func_type = call_image_element_wise_unary<
-                    float1, uchar3, encode, ext_type>;
-            func_type(in, out, stream, ext);
-        }
-
-        void call_decode(
-                image_type_v2<uchar3> in, image_type_v2<float1> out,
-                ext_type ext, cudaStream_t stream) {
-            auto func_type = call_image_element_wise_unary<
-                    uchar3, float1, decode, ext_type>;
-            func_type(in, out, stream, ext);
-        }
-
     }
 
     namespace p555 {
 
-        struct encode {
-            __device__ static uchar3 Op(float1 in, ext_type ext) {
-                // convert depth value to the range [1, 2)
-                auto val = 1 + (in.x - ext.lower) / (ext.upper - ext.lower);
-                if (val < 1) { return too_low_val; }
-                if (val >= 2) { return too_high_val; }
-
-                auto bin = (*(uint32_t *) &val) & f32_man_mask;
+        struct encode_func {
+            __device__ static uchar3 Op(uint32_t bin) {
                 bin >>= (23 - 15);
 
                 // @formatter:off
-                static constexpr auto r_mask = (1u << 15) - (1u << 10);
-                static constexpr auto g_mask = (1u << 10) - (1u << 5 );
-                static constexpr auto b_mask = (1u << 5 ) - (1u << 0 );
+                static __constant__ auto r_mask = (1u << 15) - (1u << 10);
+                static __constant__ auto g_mask = (1u << 10) - (1u << 5 );
+                static __constant__ auto b_mask = (1u << 5 ) - (1u << 0 );
 
                 uint8_t r = (bin & r_mask) >> 10;
                 uint8_t g = (bin & g_mask) >> 5;  if (r & 1) { g = ~g; }
@@ -146,9 +154,9 @@ namespace fake_color {
             }
         };
 
-        struct decode {
-            __device__ static float1 Op(uchar3 in, ext_type ext) {
-                static constexpr auto bit_mask = (1u << 5) - (1u << 0);
+        struct decode_func {
+            __device__ static uint32_t Op(uchar3 in) {
+                static __constant__ auto bit_mask = (1u << 5) - (1u << 0);
 
                 // @formatter:off
                 uint32_t r = in.x >> 3;
@@ -157,30 +165,27 @@ namespace fake_color {
                 r <<= 10; g <<= 5; b <<= 0;
                 // @formatter:on
 
-                auto bin = (r | g | b) << (23 - 15);
-                bin |= f32_sig_exp_val;
-
-                auto val = *(float *) &bin;
-                val = (val - 1) * (ext.upper - ext.lower) + ext.lower;
-                return float1(val);
+                return (r | g | b) << (23 - 15);
             }
         };
 
-        void call_encode(
-                image_type_v2<float1> in, image_type_v2<uchar3> out,
-                ext_type ext, cudaStream_t stream) {
-            auto func_type = call_image_element_wise_unary<
-                    float1, uchar3, encode, ext_type>;
-            func_type(in, out, stream, ext);
-        }
+    }
 
-        void call_decode(
-                image_type_v2<uchar3> in, image_type_v2<float1> out,
-                ext_type ext, cudaStream_t stream) {
-            auto func_type = call_image_element_wise_unary<
-                    uchar3, float1, decode, ext_type>;
-            func_type(in, out, stream, ext);
-        }
+    namespace p800 {
+
+        struct encode_func { // 800P packing: one 8-bit value replicated to all three channels
+            __device__ static uchar3 Op(uint32_t bin) {
+                bin >>= (23 - 8); // drop the 15 low bits (keeps the top 8 if bin is a 23-bit value - TODO confirm)
+                return uchar3(bin, bin, bin);
+            }
+        };
+
+        struct decode_func { // inverse of the 800P packing
+            __device__ static uint32_t Op(uchar3 in) {
+                auto ret = 0u + in.x + in.y + in.z; // the leading 0u makes the sum unsigned int, so adding three bytes cannot wrap
+                return (ret / 3) << (23 - 8); // average the channels, then shift back up by 15 bits
+            }
+        };
 
     }
 
@@ -196,11 +201,15 @@ void call_fake_color_encode(
 
     switch (conf.mode) {
         case FAKE_888I: {
-            i888::call_encode(in, out, ext, stream);
+            call_encode<i888::encode_func>(in, out, ext, stream);
             break;
         }
         case FAKE_555P: {
-            p555::call_encode(in, out, ext, stream);
+            call_encode<p555::encode_func>(in, out, ext, stream);
+            break;
+        }
+        case FAKE_800P: {
+            call_encode<p800::encode_func>(in, out, ext, stream);
             break;
         }
         default: {
@@ -217,11 +226,15 @@ void call_fake_color_decode(
 
     switch (conf.mode) {
         case FAKE_888I: {
-            i888::call_decode(in, out, ext, stream);
+            call_decode<i888::decode_func>(in, out, ext, stream);
             break;
         }
         case FAKE_555P: {
-            p555::call_decode(in, out, ext, stream);
+            call_decode<p555::decode_func>(in, out, ext, stream);
+            break;
+        }
+        case FAKE_800P: {
+            call_decode<p800::decode_func>(in, out, ext, stream);
             break;
         }
         default: {

+ 3 - 2
src/image_process/cuda_impl/fake_color.cuh

@@ -5,11 +5,12 @@
 
 enum fake_color_method : uint8_t {
     FAKE_888I, // dispatched to the i888 encode/decode functions
-    FAKE_555P
+    FAKE_555P, // dispatched to the p555 encode/decode functions
+    FAKE_800P // dispatched to the p800 encode/decode functions
 };
 
 struct fake_color_config {
-    uint8_t mode = FAKE_888I;
+    uint8_t mode = FAKE_555P; // one of fake_color_method; selects the encode/decode variant
     float lower = 0; // lower end of the value range (presumably forwarded as ext.lower - confirm)
     float upper = 1; // upper end of the value range (presumably forwarded as ext.upper - confirm)
 };

+ 36 - 0
src/image_process/cuda_impl/image_merge.cu

@@ -0,0 +1,36 @@
+#include "image_merge.cuh"
+#include "kernel_utility.cuh"
+
+namespace depth_threshold {
+
+    struct ext_type { // threshold pair handed to the kernel by value
+        float lower;
+        float upper;
+    };
+
+    struct alpha_mask { // per-pixel op: copies the colour and derives alpha from the depth thresholds
+        __device__ static uchar4 Op(uchar3 img, float1 depth, ext_type ext) {
+            uint8_t alpha = 255; // stays 255 unless one of the threshold tests below fires
+            if (depth.x > ext.upper) alpha = 0; // above the upper threshold: alpha cleared to 0
+            if (depth.x < ext.lower) alpha = 0; // below the lower threshold: alpha cleared to 0
+            return uchar4(img.x, img.y, img.z, alpha); // NOTE(review): a NaN depth fails both tests and keeps alpha 255
+        }
+    };
+
+    void call( // host-side launcher for alpha_mask
+            image_type_v2<uchar3> in1, image_type_v2<float1> in2,
+            image_type_v2<uchar4> out,
+            ext_type ext, cudaStream_t stream) {
+        auto func_type = call_image_element_wise_binary_in< // despite its name, this holds a function pointer, not a type
+                uchar3, float1, uchar4, alpha_mask, ext_type>;
+        func_type(in1, in2, out, stream, ext);
+    }
+
+}
+
+void call_depth_mask(image_type_v2<uchar3> img, image_type_v2<float1> depth, // public entry point: writes img plus a depth-derived alpha channel into out
+                     image_type_v2<uchar4> out,
+                     depth_mask_config conf, cudaStream_t stream) {
+    auto ext = depth_threshold::ext_type{.lower = conf.lower, .upper = conf.upper}; // repack the config into the kernel-side struct
+    return depth_threshold::call(img, depth, out, ext, stream); // call() returns void; this only forwards to it
+}

+ 15 - 0
src/image_process/cuda_impl/image_merge.cuh

@@ -0,0 +1,15 @@
+#ifndef DEPTHGUIDE_IMAGE_MERGE_H
+#define DEPTHGUIDE_IMAGE_MERGE_H
+
+#include "image_utility.cuh"
+
+struct depth_mask_config { // thresholds consumed by call_depth_mask
+    float lower = 0; // depth below this value gets alpha 0
+    float upper = 1; // depth above this value gets alpha 0
+};
+
+void call_depth_mask(image_type_v2<uchar3> img, image_type_v2<float1> depth,
+                     image_type_v2<uchar4> out,
+                     depth_mask_config conf, cudaStream_t stream);
+
+#endif //DEPTHGUIDE_IMAGE_MERGE_H

+ 19 - 1
src/image_process/cuda_impl/image_utility.cuh

@@ -1,18 +1,36 @@
 #ifndef DEPTHGUIDE_IMAGE_UTILITY_CUH
 #define DEPTHGUIDE_IMAGE_UTILITY_CUH
 
+#include <cassert>
 #include <cstdint>
 
 template<typename PixelT>
 struct image_type_v2 { // plain image view: a pointer plus dimensions; nothing in this struct allocates or frees ptr
+    using this_type = image_type_v2<PixelT>;
+
     PixelT *ptr = nullptr;
     ushort width = 0, height = 0; // in pixels
     ushort pitch = 0; // in bytes
 
-    __device__ auto at(ushort y, ushort x = 0) {
+    __device__ __host__ auto at(ushort y, ushort x = 0) const { // address of pixel (row y, column x); callable from host and device
         auto row_ptr = (char *) ptr + y * pitch; // rows are pitch bytes apart; NOTE(review): y * pitch is computed in int after promotion
         return (PixelT *) row_ptr + x;
     }
+
+    __host__ auto sub_image(int row = 0, int col = 0, // view starting at (row, col) that shares this image's memory and pitch
+                            int w = -1, int h = -1) const {
+        if (w == -1) { w = width - col; } // -1 means: extend to the right edge
+        if (h == -1) { h = height - row; } // -1 means: extend to the bottom edge
+        return this_type(at(row, col), w, h, pitch); // NOTE(review): row/col/w/h are not range-checked
+    }
+
+    template<typename PixelU>
+    __host__ auto cast() const { // same memory viewed as PixelU pixels; height and pitch are unchanged
+        using ret_type = image_type_v2<PixelU>;
+        auto ret_width = width * sizeof(PixelT) / sizeof(PixelU); // keep the row's byte width, expressed in PixelU units
+        assert(width * sizeof(PixelT) == ret_width * sizeof(PixelU)); // the byte width must divide evenly by sizeof(PixelU)
+        return ret_type((PixelU *) ptr, ret_width, height, pitch);
+    }
 };
 
 #endif //DEPTHGUIDE_IMAGE_UTILITY_CUH

+ 46 - 7
src/image_process/cuda_impl/kernel_utility.cuh

@@ -4,6 +4,7 @@
 #include "image_utility.cuh"
 
 #include <cassert>
+#include <tuple>
 
 template<typename PixIn, typename PixOut, typename Func, typename... Ext>
 __global__ void image_elementwise_unary(image_type_v2<PixIn> in,
@@ -23,21 +24,59 @@ __global__ void image_elementwise_unary(image_type_v2<PixIn> in,
     }
 }
 
-template<typename PixIn, typename PixOut, typename Func, typename... Ext>
-void call_image_element_wise_unary(image_type_v2<PixIn> in, image_type_v2<PixOut> out,
-                                   cudaStream_t stream, Ext... ext) {
-    assert(out.width >= in.width);
-    assert(out.height >= in.height);
+template<typename PixIn1, typename PixIn2,
+        typename PixOut, typename Func, typename... Ext>
+__global__ void image_elementwise_binary_in(image_type_v2<PixIn1> in1, image_type_v2<PixIn2> in2, // out(y, x) = Func::Op(in1(y, x), in2(y, x), ext...)
+                                            image_type_v2<PixOut> out,
+                                            Ext... ext) {
+
+    for (auto idy = blockDim.y * blockIdx.y + threadIdx.y; // grid-stride loop over rows
+         idy < in1.height; // bounds come from in1 only; in2 and out must be at least as large
+         idy += gridDim.y * blockDim.y) {
+
+        for (auto idx = blockDim.x * blockIdx.x + threadIdx.x; // grid-stride loop over columns
+             idx < in1.width;
+             idx += gridDim.x * blockDim.x) {
+
+            *out.at(idy, idx) = Func::Op(*in1.at(idy, idx), *in2.at(idy, idx), ext...);
+        }
+    }
+}
+
+inline std::tuple<dim3, dim3> get_kernel_size(ushort width, ushort height) { // returns (grid_dim, block_dim) for a width x height image
     static constexpr auto block_x = 32;
     static constexpr auto block_y = 8;
     // https://github.com/Oneflow-Inc/oneflow/blob/master/oneflow/core/cuda/elementwise.cuh
     static constexpr auto max_grids = 4352; // TODO: calculate by hardware at runtime
-    auto grid_y = std::max<uint>(1, std::min<uint>(in.height / block_y, max_grids));
-    auto grid_x = std::max<uint>(1, std::min<uint>(in.width / block_x, max_grids / grid_y));
+    auto grid_y = std::max<uint>(1, std::min<uint>(height / block_y, max_grids)); // floor division, clamped to [1, max_grids]
+    auto grid_x = std::max<uint>(1, std::min<uint>(width / block_x, max_grids / grid_y)); // at least 1, at most max_grids / grid_y
     auto block_dim = dim3(block_x, block_y, 1);
     auto grid_dim = dim3(grid_x, grid_y, 1);
+    return std::make_tuple(grid_dim, block_dim); // the grid may not cover the whole image, so kernels must loop with a grid stride
+}
+
+template<typename PixIn, typename PixOut, typename Func, typename... Ext>
+void call_image_element_wise_unary(image_type_v2<PixIn> in, image_type_v2<PixOut> out, // launches image_elementwise_unary on stream
+                                   cudaStream_t stream, Ext... ext) {
+    assert(out.width >= in.width); // out must be able to hold every pixel of in
+    assert(out.height >= in.height);
+    auto [grid_dim, block_dim] = get_kernel_size(in.width, in.height); // launch geometry is derived from the input size
     auto func_type = image_elementwise_unary<PixIn, PixOut, Func, Ext...>;
     func_type<<<grid_dim, block_dim, 0, stream>>>(in, out, ext...); // NOTE(review): the launch result is not checked here
 }
 
+template<typename PixIn1, typename PixIn2,
+        typename PixOut, typename Func, typename... Ext>
+void call_image_element_wise_binary_in(image_type_v2<PixIn1> in1, image_type_v2<PixIn2> in2, // launches image_elementwise_binary_in on stream
+                                       image_type_v2<PixOut> out,
+                                       cudaStream_t stream, Ext... ext) {
+    assert(in1.width == in2.width); // both inputs must have identical dimensions
+    assert(in1.height == in2.height);
+    assert(out.width >= in1.width); // out must be able to hold every input pixel
+    assert(out.height >= in1.height);
+    auto [grid_dim, block_dim] = get_kernel_size(in1.width, in1.height); // launch geometry is derived from the first input
+    auto func_type = image_elementwise_binary_in<PixIn1, PixIn2, PixOut, Func, Ext...>;
+    func_type<<<grid_dim, block_dim, 0, stream>>>(in1, in2, out, ext...); // NOTE(review): the launch result is not checked here
+}
+
 #endif //DEPTHGUIDE_KERNEL_UTILITY_CUH

+ 72 - 1
src/image_process/cuda_impl/pixel_convert.cu

@@ -1,11 +1,14 @@
 #include "pixel_convert.cuh"
 #include "kernel_utility.cuh"
 
+#include <glm/glm.hpp>
+#include <cuda/std/limits>
+
 template<typename PixIn, typename PixOut>
 struct cvt_rgb_bgra {
     __device__ static constexpr PixOut Op(PixIn in) {
         auto out = PixOut();
-        out.w = 1.0;
+        out.w = 255; // TODO: use type traits
         out.z = in.x;
         out.y = in.y;
         out.x = in.z;
@@ -22,3 +25,71 @@ void call_cvt_rgb_bgra_u8(image_type_v2<uchar3> in,
             uchar3, uchar4, cvt_rgb_bgra_u8>;
     func_type(in, out, stream);
 }
+
+__device__ constexpr glm::vec3 to_vec3(uchar3 vec) { // 8-bit channels -> floats scaled by 1/255
+    return glm::vec3(vec.x, vec.y, vec.z) / 255.f;
+}
+
+__device__ constexpr uchar3 to_uchar3(glm::vec3 vec) { // floats -> 8-bit channels; values outside [0, 1] are clamped first
+    auto ret = glm::clamp(vec, 0.f, 1.f) * 255.f;
+    return uchar3(ret.x, ret.y, ret.z); // the float -> unsigned char conversion drops the fraction (no rounding)
+}
+
+namespace yuv_to_rgb {
+
+    // @formatter:off
+    __device__ __constant__ auto cvt_mat = glm::mat3( // BT.709
+            1,       1,        1, // glm fills column by column: this line is column 0, the Y weights for (R, G, B)
+            0,       -0.1873,  1.8556, // column 1: the U weights for (R, G, B)
+            1.5748,  -0.4681,  0); // column 2: the V weights for (R, G, B)
+    // @formatter:on
+
+    struct cvt { // per-pixel YUV -> RGB conversion on 8-bit triples
+        __device__ static constexpr uchar3 Op(uchar3 in) {
+            auto yuv = to_vec3(in) - glm::vec3(0, 0.5, 0.5); // shift the two chroma channels so they are centred on zero
+            auto rgb = to_uchar3(cvt_mat * yuv); // matrix * column vector, then clamp and convert back to 8 bits
+            return rgb;
+        }
+    };
+
+}
+
+__global__ void nv12_to_rgb(image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img, // each loop iteration handles one chroma sample and its 2x2 block of luma pixels
+                            image_type_v2<uchar3> rgb_img) {
+
+    for (auto idy = blockDim.y * blockIdx.y + threadIdx.y; // grid-stride loop over chroma rows
+         idy < chroma_img.height;
+         idy += gridDim.y * blockDim.y) {
+
+        for (auto idx = blockDim.x * blockIdx.x + threadIdx.x; // grid-stride loop over chroma columns
+             idx < chroma_img.width;
+             idx += gridDim.x * blockDim.x) {
+
+            auto chroma = *chroma_img.at(idy, idx); // one chroma pair shared by the 2x2 block below
+
+#pragma unroll
+            for (auto dy = 0; dy < 2; ++dy)
+#pragma unroll
+                    for (auto dx = 0; dx < 2; ++dx) {
+                        auto iy = 2 * idy + dy, ix = 2 * idx + dx; // luma/output coordinates; not bounds-checked, so both images must span twice the chroma size
+
+                        auto luma = *luma_img.at(iy, ix);
+                        auto yuv = uchar3(luma.x, chroma.x, chroma.y);
+                        *rgb_img.at(iy, ix) = yuv_to_rgb::cvt::Op(yuv);
+                    }
+        }
+    }
+}
+
+void call_nv12_to_rgb(image_type_v2<uchar1> in, // in holds the luma rows followed by the chroma rows, stacked vertically
+                      image_type_v2<uchar3> out,
+                      cudaStream_t stream) {
+    assert(in.height % 3 == 0); // the total row count must split 2:1 into luma and chroma rows
+    auto img_height = in.height / 3 * 2; // row count of the luma part, which is also the output height
+    assert(out.width == in.width);
+    assert(out.height == img_height);
+    auto luma_img = in.sub_image(0, 0, -1, img_height); // top two thirds of in
+    auto chroma_img = in.sub_image(img_height).cast<uchar2>(); // remaining rows viewed as 2-byte pairs, which halves the width
+    auto [grid_dim, block_dim] = get_kernel_size(chroma_img.width, chroma_img.height); // sized by chroma samples, matching the kernel's loops
+    nv12_to_rgb<<<grid_dim, block_dim, 0, stream>>>(luma_img, chroma_img, out); // NOTE(review): the launch result is not checked here
+}

+ 3 - 0
src/image_process/cuda_impl/pixel_convert.cuh

@@ -7,5 +7,8 @@ void call_cvt_rgb_bgra_u8(image_type_v2<uchar3> in,
                           image_type_v2<uchar4> out,
                           cudaStream_t stream);
 
+void call_nv12_to_rgb(image_type_v2<uchar1> in,
+                      image_type_v2<uchar3> out,
+                      cudaStream_t stream);
 
 #endif //DEPTHGUIDE_PIXEL_CONVERT_CUH

+ 15 - 0
src/image_process/impl/versatile_convertor.cpp

@@ -50,6 +50,18 @@ void versatile_convertor::impl::cvt_rgb_bgra() {
     OBJ_SAVE(conf.out_name, create_image(img_out));
 }
 
+void versatile_convertor::impl::cvt_nv12_rgb() { // converts the conf.in_name image with call_nv12_to_rgb and saves the result as conf.out_name
+    auto img = OBJ_QUERY(image_u8c1, conf.in_name);
+    if (img == nullptr) return; // no input image available: nothing to do
+    auto img_size = img->size();
+    auto out_size = cv::Size(img_size.width, img_size.height / 3 * 2); // output keeps the width and two thirds of the input rows
+    auto img_out = create_image_info<uchar3>(out_size, MEM_CUDA);
+    call_nv12_to_rgb(img->as_cuda(conf.stream),
+                     img_out.as_cuda(),
+                     conf.stream->cuda);
+    OBJ_SAVE(conf.out_name, create_image(img_out));
+}
+
 void versatile_convertor::impl::cvt_fake_encode(fake_color_method mode) {
     auto img = OBJ_QUERY(image_f32c1, conf.in_name);
     if (img == nullptr) return;
@@ -108,10 +120,13 @@ void versatile_convertor::impl::process(obj_name_type name) {
     switch (conf.cvt_opt) {
         // @formatter:off
         case CVT_RGB_BGRA: { cvt_rgb_bgra(); break; }
+        case CVT_NV12_RGB: { cvt_nv12_rgb(); break; }
         case CVT_FAKE_ENCODE_888I: { cvt_fake_encode(FAKE_888I); break; }
         case CVT_FAKE_ENCODE_555P: { cvt_fake_encode(FAKE_555P); break; }
+        case CVT_FAKE_ENCODE_800P: { cvt_fake_encode(FAKE_800P); break; }
         case CVT_FAKE_DECODE_888I: { cvt_fake_decode(FAKE_888I); break; }
         case CVT_FAKE_DECODE_555P: { cvt_fake_decode(FAKE_555P); break; }
+        case CVT_FAKE_DECODE_800P: { cvt_fake_decode(FAKE_800P); break; }
         case CVT_HALF_SPLIT: { cvt_half_split(); break; }
         // @formatter:on
         default: {

+ 2 - 0
src/image_process/impl/versatile_convertor_impl.h

@@ -26,6 +26,8 @@ struct versatile_convertor::impl {
 
     void cvt_rgb_bgra();
 
+    void cvt_nv12_rgb();
+
     void cvt_fake_encode(fake_color_method mode);
 
     void cvt_fake_decode(fake_color_method mode);

+ 3 - 0
src/image_process/versatile_convertor.h

@@ -8,11 +8,14 @@
 
 enum convert_options { // NOTE(review): enumerators have no explicit values, so inserting one renumbers all that follow
     CVT_RGB_BGRA,
+    CVT_NV12_RGB, // handled by cvt_nv12_rgb()
 
     CVT_FAKE_ENCODE_888I,
     CVT_FAKE_ENCODE_555P,
+    CVT_FAKE_ENCODE_800P, // handled by cvt_fake_encode(FAKE_800P)
     CVT_FAKE_DECODE_888I,
     CVT_FAKE_DECODE_555P,
+    CVT_FAKE_DECODE_800P, // handled by cvt_fake_decode(FAKE_800P)
 
     CVT_HALF_SPLIT
 };

+ 5 - 3
src/impl/apps/depth_guide/depth_guide.cpp

@@ -10,7 +10,7 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
     OBJ_SAVE(img_color, image_u8c3());
     OBJ_SAVE(img_depth, image_f32c1());
     OBJ_SAVE(img_depth_fake, image_u8c3());
-    auto fake_info = fake_color_config{.mode = FAKE_555P, .lower = 200, .upper = 1000};
+    auto fake_info = fake_color_config{.mode = FAKE_800P, .lower = 200, .upper = 1000};
     OBJ_SAVE(img_depth_fake_info, versatile_convertor_impl::encode_config(fake_info));
     OBJ_SAVE(img_out, image_u8c4());
 
@@ -23,7 +23,7 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
 
     auto fake_conf = versatile_convertor::create_config{
             .in_name = img_depth, .ext_in = img_depth_fake_info, .out_name = img_depth_fake,
-            .cvt_opt = CVT_FAKE_ENCODE_555P, .stream = default_cuda_stream,
+            .cvt_opt = CVT_FAKE_ENCODE_800P, .stream = default_cuda_stream,
     };
     depth_encode = std::make_unique<versatile_convertor>(fake_conf);
 
@@ -44,7 +44,9 @@ app_depth_guide::app_depth_guide(const create_config &_conf) {
     bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
 
     auto out_streamer_conf = image_streamer::create_config{
-            .img_name = img_out, .asio_ctx = conf.asio_ctx,
+            .img_name = img_out,
+            .ext_name = img_depth_fake_info, // comment this for tiny player to work
+            .asio_ctx = conf.asio_ctx,
             .cuda_ctx = conf.cuda_ctx, .stream = default_cuda_stream
     };
     out_streamer = std::make_unique<image_streamer>(out_streamer_conf);

+ 52 - 5
src/impl/apps/remote_ar/remote_ar.cpp

@@ -1,6 +1,7 @@
 #include "remote_ar.h"
 #include "core/imgui_utility.hpp"
 #include "core/yaml_utility.hpp"
+#include "network/binary_utility.hpp"
 
 #include <boost/asio/post.hpp>
 
@@ -20,6 +21,13 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
     OBJ_SAVE(aug_right, image_u8c3());
     OBJ_SAVE(img_out, image_u8c4()); // ARGB
 
+    OBJ_SAVE(guide_combine, image_u8c1());
+    OBJ_SAVE(guide_info, data_type());
+    OBJ_SAVE(guide_combine_rgb, image_u8c3());
+    OBJ_SAVE(guide_img, image_u8c3());
+    OBJ_SAVE(guide_depth_fake, image_u8c3());
+    OBJ_SAVE(guide_depth, image_f32c1());
+
     // process callbacks caused by OBJ_SAVE
     asio_ctx->poll();
 
@@ -61,6 +69,30 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
         cam_right.img_proc->change_config({.is_mono = info.is_mono});
     });
 
+    auto guide_in_conf = image_player::create_config{
+            .img_name = guide_combine, .ext_name = guide_info,
+            .ctx = asio_ctx, .stream = default_cuda_stream,
+    };
+    guide_player = std::make_unique<image_player>(guide_in_conf);
+
+    auto guide_cvt_conf = versatile_convertor::create_config{
+            .in_name = guide_combine, .out_name = guide_combine_rgb,
+            .cvt_opt = CVT_NV12_RGB, .stream = default_cuda_stream,
+    };
+    guide_cvt = std::make_unique<versatile_convertor>(guide_cvt_conf);
+
+    auto guide_split_conf = versatile_convertor::create_config{
+            .in_name = guide_combine_rgb, .out_name = guide_img, .ext_out = guide_depth_fake,
+            .cvt_opt = CVT_HALF_SPLIT, .stream = default_cuda_stream,
+    };
+    guide_split = std::make_unique<versatile_convertor>(guide_split_conf);
+
+    auto guide_decode_conf = versatile_convertor::create_config{
+            .in_name = guide_depth_fake, .ext_in = guide_info, .out_name = guide_depth,
+            .cvt_opt = CVT_FAKE_DECODE_800P, .stream = default_cuda_stream,
+    };
+    guide_decode = std::make_unique<versatile_convertor>(guide_decode_conf);
+
     auto aug_conf = augment_manager::create_config{
             .item_list = augment_manager::item_list_from_yaml(LOAD_LIST("augment_list")),
             .sophiar_conn = sophiar_conn.get()
@@ -111,14 +143,24 @@ app_remote_ar::app_remote_ar(const create_config &_conf) {
         post(*asio_ctx, [=, this] { stereo_aug->resize(size); });
     });
 
+//    auto bg_viewer_conf = image_viewer::create_config{
+//            .mode = VIEW_STEREO, .flip_y = false,
+//            .stream = default_cuda_stream,
+//    };
+//    auto &stereo_conf = bg_viewer_conf.extra.stereo;
+//    stereo_conf.c_fmt = COLOR_RGB;
+//    stereo_conf.left_name = aug_left;
+//    stereo_conf.right_name = aug_right;
+//    bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
+
     auto bg_viewer_conf = image_viewer::create_config{
-            .mode = VIEW_STEREO, .flip_y = false,
+            .mode = VIEW_COLOR_DEPTH, .flip_y = true,
             .stream = default_cuda_stream,
     };
-    auto &stereo_conf = bg_viewer_conf.extra.stereo;
-    stereo_conf.c_fmt = COLOR_RGB;
-    stereo_conf.left_name = aug_left;
-    stereo_conf.right_name = aug_right;
+    auto &bg_extra_conf = bg_viewer_conf.extra.color_depth;
+    bg_extra_conf.c_fmt = COLOR_RGB;
+    bg_extra_conf.c_name = guide_img;
+    bg_extra_conf.d_name = guide_depth;
     bg_viewer = std::make_unique<image_viewer>(bg_viewer_conf);
 }
 
@@ -160,6 +202,11 @@ void app_remote_ar::show_ui() {
             }
         }
 
+        if (ImGui::CollapsingHeader("Depth Guide")) {
+            auto id_guard = imgui_id_guard("depth_guide");
+            guide_player->show();
+        }
+
         if (ImGui::CollapsingHeader("Stereo")) {
             auto id_guard = imgui_id_guard("stereo");
             stereo_aug->show();

+ 12 - 0
src/impl/apps/remote_ar/remote_ar.h

@@ -7,9 +7,11 @@
 #include "module/augment_manager.h"
 #include "module/camera_augment_helper.h"
 #include "module/image_augment_helper.h"
+#include "module/image_player.h"
 #include "module/image_streamer.h"
 #include "module/image_viewer.h"
 #include "image_process/image_process_ui.h"
+#include "image_process/versatile_convertor.h"
 #include "impl/app_base.h"
 
 // sophiar
@@ -37,6 +39,11 @@ private:
         rgb_left, rgb_right,
         aug_left, aug_right,
         img_out,
+
+        guide_combine, guide_info,
+        guide_combine_rgb,
+        guide_img, guide_depth_fake,
+        guide_depth,
     };
 
     struct camera_module {
@@ -63,6 +70,11 @@ private:
     std::unique_ptr<stereo_augment_helper> stereo_aug;
     std::unique_ptr<image_streamer> out_streamer;
 
+    std::unique_ptr<image_player> guide_player;
+    std::unique_ptr<versatile_convertor> guide_cvt;
+    std::unique_ptr<versatile_convertor> guide_split;
+    std::unique_ptr<versatile_convertor> guide_decode;
+
     camera_module cam_left;
     camera_module cam_right;
 

+ 1 - 0
src/module/impl/image_player.cpp

@@ -106,6 +106,7 @@ void image_player::impl::stop() {
         aux_ctx->stop();
         aux_thread->join();
         aux_thread = nullptr;
+        frame_queue = nullptr;
     }
 
     dec_nvdec = nullptr;

+ 1 - 1
src/network/binary_utility.hpp

@@ -278,7 +278,7 @@ public:
     }
 
     void write_data(const data_type &_data) { // appends the bytes of _data at the write cursor and advances it
-        std::copy_n(cur_ptr, _data.size, _data.start_ptr());
+        std::copy_n(_data.start_ptr(), _data.size, cur_ptr); // source is _data, destination is the cursor
         cur_ptr += _data.size;
         assert(cur_ptr <= end_ptr()); // NOTE(review): the bound is checked only after the copy, and only when assertions are enabled
     }

+ 5 - 3
src/network_v3/sender_udp_fec.cpp

@@ -268,9 +268,11 @@ struct sender_udp_fec::impl {
         // handle request
         switch (req.request_type) {
             case 'X': {
-                SPDLOG_INFO("Client {}:{} left.",
-                            remote_ep->address().to_string(), remote_ep->port());
-                close_connection();
+                if (remote_ep != nullptr) {
+                    SPDLOG_INFO("Client {}:{} left.",
+                                remote_ep->address().to_string(), remote_ep->port());
+                    close_connection();
+                }
                 return;
             }
             case 'C': {

+ 3 - 3
src/render/impl/shader/tex_nv12.frag

@@ -8,10 +8,10 @@ uniform sampler2D chroma_tex;
 in vec2 frag_uv;
 
 // @formatter:off
-const mat3 cvt_mat = mat3(
+const mat3 cvt_mat = mat3( // BT.709
     1,       1,        1, // mat3 is filled column by column: this line is column 0, the Y weights for (R, G, B)
-    0,       -0.3455,  1.7790,
-    1.4075,  -0.7169,  0);
+    0,       -0.1873,  1.8556, // column 1: the U weights for (R, G, B)
+    1.5748,  -0.4681,  0); // column 2: the V weights for (R, G, B)
 // @formatter:on
 
 layout (location = 0) out vec4 frag_color;