3 gadi atpakaļ · b7455520c9
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -0,0 +1,91 @@
 
				+cmake_minimum_required(VERSION 3.25)
			
 
				+project(TinyPlayer2 LANGUAGES C CXX)
			
 
				+
			
 
				+set(CMAKE_CXX_STANDARD 20)
			
 
				+
			
 
				+add_executable(TinyPlayer2 src/main.cpp
			
 
				+        src/nv12_renderer.cpp
			
 
				+        src/video_decoder.cpp
			
 
				+        src/frame_receiver.cpp
			
 
				+        src/third_party/rs.c)
			
 
				+
			
 
				+# OpenGL config
			
 
				+find_package(OpenGL REQUIRED)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${OPENGL_INCLUDE_DIR})
			
 
				+target_link_libraries(${PROJECT_NAME} OpenGL::GL)
			
 
				+
			
 
				+# glfw config
			
 
				+if (WIN32)
			
 
				+    set(GLFW_INCLUDE_DIR C:/BuildEssentials/VS2019Libs/include)
			
 
				+    set(GLFW_LIB_DIR C:/BuildEssentials/VS2019Libs/lib)
			
 
				+    find_library(GLFW_LIB glfw3 HINTS ${GLFW_LIB_DIR})
			
 
				+    target_include_directories(${PROJECT_NAME} PRIVATE ${GLFW_INCLUDE_DIR})
			
 
				+    target_link_libraries(${PROJECT_NAME} ${GLFW_LIB})
			
 
				+else ()
			
 
				+    find_package(glfw3 REQUIRED)
			
 
				+    target_link_libraries(${PROJECT_NAME} glfw)
			
 
				+endif ()
			
 
				+
			
 
				+# glad config
			
 
				+if (WIN32)
			
 
				+    set(GLAD_DIR C:/BuildEssentials/Library/glad)
			
 
				+else ()
			
 
				+    set(GLAD_DIR /home/tpx/src/glad)
			
 
				+endif ()
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${GLAD_DIR}/include)
			
 
				+target_sources(${PROJECT_NAME} PRIVATE ${GLAD_DIR}/src/gl.c)
			
 
				+
			
 
				+# imgui config
			
 
				+if (WIN32)
			
 
				+    set(IMGUI_DIR C:/BuildEssentials/Library/imgui-1.89.5)
			
 
				+else ()
			
 
				+    set(IMGUI_DIR /home/tpx/src/imgui-1.89.5)
			
 
				+endif ()
			
 
				+set(IMGUI_BACKENDS_DIR ${IMGUI_DIR}/backends)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${IMGUI_DIR} ${IMGUI_BACKENDS_DIR})
			
 
				+target_sources(${PROJECT_NAME} PRIVATE
			
 
				+        ${IMGUI_DIR}/imgui.cpp
			
 
				+        ${IMGUI_DIR}/imgui_draw.cpp
			
 
				+        ${IMGUI_DIR}/imgui_tables.cpp
			
 
				+        ${IMGUI_DIR}/imgui_widgets.cpp
			
 
				+        ${IMGUI_DIR}/imgui_demo.cpp
			
 
				+        ${IMGUI_BACKENDS_DIR}/imgui_impl_glfw.cpp
			
 
				+        ${IMGUI_BACKENDS_DIR}/imgui_impl_opengl3.cpp)
			
 
				+
			
 
				+# spdlog config
			
 
				+find_package(spdlog REQUIRED)
			
 
				+target_link_libraries(${PROJECT_NAME} spdlog::spdlog)
			
 
				+target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_TRACE)
			
 
				+
			
 
				+# Boost config
			
 
				+find_package(Boost REQUIRED)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
			
 
				+target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
			
 
				+
			
 
				+# CUDA config
			
 
				+find_package(CUDAToolkit REQUIRED)
			
 
				+target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
			
 
				+
			
 
				+# NvDec config
			
 
				+if (WIN32)
			
 
				+    set(NVCODEC_DIR C:/BuildEssentials/CUDA/Video_Codec_SDK_12.0.16)
			
 
				+    find_library(NVDEC_LIB nvcuvid HINTS ${NVCODEC_DIR}/Lib/x64)
			
 
				+else ()
			
 
				+    set(NVCODEC_DIR /home/tpx/src/Video_Codec_SDK_12.0.16)
			
 
				+    find_library(NVDEC_LIB nvcuvid)
			
 
				+endif ()
			
 
				+set(NVCODEC_INCLUDE_DIR ${NVCODEC_DIR}/Interface)
			
 
				+target_include_directories(${PROJECT_NAME} PRIVATE ${NVCODEC_INCLUDE_DIR})
			
 
				+target_link_libraries(${PROJECT_NAME} ${NVDEC_LIB})
			
 
				+
			
 
				+# xxHash config
			
 
				+if (WIN32)
			
 
				+    set(XXHASH_DIR C:/BuildEssentials/Library/xxHash-0.8.1/Source)
			
 
				+    target_include_directories(${PROJECT_NAME} PRIVATE ${XXHASH_DIR})
			
 
				+    target_sources(${PROJECT_NAME} PRIVATE ${XXHASH_DIR}/xxhash.c)
			
 
				+else ()
			
 
				+    set(XXHASH_INCLUDE_DIR /usr/include)
			
 
				+    find_library(XXHASH_LIB xxhash)
			
 
				+    target_include_directories(${PROJECT_NAME} PRIVATE ${XXHASH_INCLUDE_DIR})
			
 
				+    target_link_libraries(${PROJECT_NAME} ${XXHASH_LIB})
			
 
				+endif ()
			
--- a/src/config.h
+++ b/src/config.h
@@ -0,0 +1,29 @@
 
				+#ifndef TINYPLAYER2_CONFIG_H
			
 
				+#define TINYPLAYER2_CONFIG_H
			
 
				+
			
 
				+#include <spdlog/spdlog.h>
			
 
				+
			
 
				+static constexpr auto main_window_width = 800;
			
 
				+static constexpr auto main_window_height = 600;
			
 
				+
			
 
				+static constexpr auto default_cuda_device_id = 0;
			
 
				+
			
 
				+#define RET_ERROR \
			
 
				+    assert(false); \
			
 
				+    return false; \
			
 
				+    (void) 0
			
 
				+
			
 
				+inline bool check_function_call(bool function_ret, unsigned int line_number,
			
 
				+                                const char *file_name, const char *function_call_str) {
			
 
				+    if (function_ret) [[likely]] return true;
			
 
				+    SPDLOG_ERROR("Function call {} failed at {}:{}.",
			
 
				+                 function_call_str, file_name, line_number);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+#define CALL_CHECK(function_call) \
			
 
				+    if (!check_function_call( \
			
 
				+        function_call, __LINE__, __FILE__, #function_call)) [[unlikely]] \
			
 
				+        return false
			
 
				+
			
 
				+#endif //TINYPLAYER2_CONFIG_H
			
--- a/src/cuda_helper.hpp
+++ b/src/cuda_helper.hpp
@@ -0,0 +1,48 @@
 
				+#ifndef TINYPLAYER2_CUDA_HELPER_HPP
			
 
				+#define TINYPLAYER2_CUDA_HELPER_HPP
			
 
				+
			
 
				+#include "config.h"
			
 
				+
			
 
				+#include <cuda.h>
			
 
				+#include <cuda_runtime.h>
			
 
				+
			
 
				+#include <spdlog/spdlog.h>
			
 
				+
			
 
				+inline bool check_cuda_api_call(CUresult api_ret, unsigned int line_number,
			
 
				+                                const char *file_name, const char *api_call_str) {
			
 
				+    if (api_ret == CUDA_SUCCESS) [[likely]] return true;
			
 
				+    const char *error_name, *error_str;
			
 
				+    auto ret = cuGetErrorName(api_ret, &error_name);
			
 
				+    if (ret != CUDA_SUCCESS) [[unlikely]] error_name = "Unknown";
			
 
				+    ret = cuGetErrorString(api_ret, &error_str);
			
 
				+    if (ret != CUDA_SUCCESS) [[unlikely]] error_str = "Unknown";
			
 
				+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}:{}, {}.",
			
 
				+                 api_call_str, file_name, line_number,
			
 
				+                 api_ret, error_name, error_str);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+inline bool check_cuda_api_call(cudaError api_ret, unsigned int line_number,
			
 
				+                                const char *file_name, const char *api_call_str) {
			
 
				+    if (api_ret == cudaSuccess) [[likely]] return true;
			
 
				+    SPDLOG_ERROR("CUDA api call {} failed at {}:{} with error 0x{:x}.",
			
 
				+                 api_call_str, file_name, line_number, api_ret);
			
 
				+    RET_ERROR;
			
 
				+}
			
 
				+
			
 
				+#define CUDA_API_CHECK(api_call) \
			
 
				+    if (!check_cuda_api_call( \
			
 
				+        api_call, __LINE__, __FILE__, #api_call)) [[unlikely]] \
			
 
				+        return false
			
 
				+
			
 
				+inline bool get_cuda_primary_context(CUcontext *ctx) {
			
 
				+    int cuda_device_count;
			
 
				+    CUDA_API_CHECK(cuDeviceGetCount(&cuda_device_count));
			
 
				+    assert(cuda_device_count > default_cuda_device_id);
			
 
				+    CUdevice cuda_device;
			
 
				+    CUDA_API_CHECK(cuDeviceGet(&cuda_device, default_cuda_device_id));
			
 
				+    CUDA_API_CHECK(cuDevicePrimaryCtxRetain(ctx, cuda_device));
			
 
				+    return true;
			
 
				+}
			
 
				+
			
 
				+#endif //TINYPLAYER2_CUDA_HELPER_HPP
			
--- a/src/frame_receiver.cpp
+++ b/src/frame_receiver.cpp
@@ -0,0 +1,300 @@
 
				+#include "config.h"
			
 
				+#include "frame_receiver.h"
			
 
				+#include "third_party/scope_guard.hpp"
			
 
				+#include "video_decoder.h"
			
 
				+
			
 
				+extern "C" {
			
 
				+#include "third_party/rs.h"
			
 
				+}
			
 
				+
			
 
				+#include <boost/asio/awaitable.hpp>
			
 
				+#include <boost/asio/buffer.hpp>
			
 
				+#include <boost/asio/co_spawn.hpp>
			
 
				+#include <boost/asio/detached.hpp>
			
 
				+#include <boost/asio/io_context.hpp>
			
 
				+#include <boost/asio/ip/udp.hpp>
			
 
				+#include <boost/asio/post.hpp>
			
 
				+#include <boost/asio/use_awaitable.hpp>
			
 
				+#include <boost/endian.hpp>
			
 
				+#include <boost/smart_ptr.hpp>
			
 
				+
			
 
				+#include <xxhash.h>
			
 
				+
			
 
				+#include <list>
			
 
				+
			
 
				+using namespace boost::asio::ip;
			
 
				+using boost::asio::awaitable;
			
 
				+using boost::asio::buffer;
			
 
				+using boost::asio::co_spawn;
			
 
				+using boost::asio::detached;
			
 
				+using boost::asio::io_context;
			
 
				+using boost::asio::post;
			
 
				+using boost::asio::use_awaitable;
			
 
				+
			
 
				+#define EXCEPTION_CHECK(api_call) \
			
 
				+    try { \
			
 
				+        api_call; \
			
 
				+    } catch (std::exception &e) { \
			
 
				+        SPDLOG_ERROR("Procedure {} failed at {}:{} with exception {}.", \
			
 
				+                     #api_call, __FILE__, __LINE__, e.what()); \
			
 
				+        return false; \
			
 
				+    } void(0)
			
 
				+
			
 
				+struct frame_receiver::impl {
			
 
				+    static constexpr auto buffer_size = 64 * 1024; // 64KB
			
 
				+    static constexpr auto frag_type_offset = sizeof(uint64_t);
			
 
				+    static constexpr auto frame_salt_offset = frag_type_offset + sizeof(uint8_t);
			
 
				+    static constexpr auto reply_repeat_count = 3; // every reply will repeat 3 times
			
 
				+
			
 
				+    struct frag_header {
			
 
				+        uint64_t frag_checksum;
			
 
				+        uint8_t frame_type;
			
 
				+        uint64_t frame_salt;
			
 
				+        uint32_t frame_id;
			
 
				+        uint32_t frame_length;
			
 
				+        uint32_t block_size;
			
 
				+        uint16_t block_count;
			
 
				+        uint16_t frame_decode_count;
			
 
				+        uint16_t block_id;
			
 
				+    };
			
 
				+
			
 
				+    struct received_frame_info {
			
 
				+        bool is_idr_frame;
			
 
				+        uint64_t frame_salt;
			
 
				+        uint32_t frame_id;
			
 
				+        uint32_t frame_length;
			
 
				+        uint32_t block_size;
			
 
				+        uint16_t block_count;
			
 
				+        uint16_t decode_count;
			
 
				+        uint16_t received_count;
			
 
				+        bool *is_missing;
			
 
				+        char *data;
			
 
				+
			
 
				+        ~received_frame_info() {
			
 
				+            free(is_missing);
			
 
				+            free(data);
			
 
				+        }
			
 
				+    };
			
 
				+
			
 
				+    boost::scoped_ptr<io_context> context;
			
 
				+    boost::scoped_ptr<udp::socket> socket;
			
 
				+
			
 
				+    udp::endpoint server_endpoint;
			
 
				+
			
 
				+    char *in_data = nullptr, *out_data = nullptr;
			
 
				+
			
 
				+    // frame related
			
 
				+    uint32_t last_decoded_frame = 0;
			
 
				+    std::list<received_frame_info> received_list;
			
 
				+    video_decoder *decoder = nullptr;
			
 
				+
			
 
				+    impl() {
			
 
				+        in_data = (char *) malloc(buffer_size);
			
 
				+        out_data = (char *) malloc(buffer_size);
			
 
				+    }
			
 
				+
			
 
				+    template<typename T>
			
 
				+    static char *write_binary_number(char *ptr, T val) {
			
 
				+        static constexpr auto need_swap =
			
 
				+                (boost::endian::order::native != boost::endian::order::big);
			
 
				+        auto real_ptr = (T *) ptr;
			
 
				+        if constexpr (need_swap) {
			
 
				+            *real_ptr = boost::endian::endian_reverse(val);
			
 
				+        } else {
			
 
				+            *real_ptr = val;
			
 
				+        }
			
 
				+        return ptr + sizeof(T);
			
 
				+    }
			
 
				+
			
 
				+    template<typename T>
			
 
				+    static char *read_binary_number(char *ptr, T *val) {
			
 
				+        static constexpr auto need_swap =
			
 
				+                (boost::endian::order::native != boost::endian::order::big);
			
 
				+        *val = *(T *) ptr;
			
 
				+        if constexpr (need_swap) {
			
 
				+            boost::endian::endian_reverse_inplace(*val);
			
 
				+        }
			
 
				+        return ptr + sizeof(T);
			
 
				+    }
			
 
				+
			
 
				+    static char *read_frag_header(char *ptr, frag_header *header) {
			
 
				+#define READ(member) ptr = read_binary_number(ptr, &header->member)
			
 
				+        READ(frag_checksum);
			
 
				+        READ(frame_type);
			
 
				+        READ(frame_salt);
			
 
				+        READ(frame_id);
			
 
				+        READ(frame_length);
			
 
				+        READ(block_size);
			
 
				+        READ(block_count);
			
 
				+        READ(frame_decode_count);
			
 
				+        READ(block_id);
			
 
				+#undef READ
			
 
				+        return ptr;
			
 
				+    }
			
 
				+
			
 
				+    bool check_in_hash(size_t msg_len) const {
			
 
				+        if (msg_len <= sizeof(uint64_t)) return false;
			
 
				+        static auto hash_state = XXH64_createState();
			
 
				+        CALL_CHECK(XXH64_reset(hash_state, 0) != XXH_ERROR);
			
 
				+        uint64_t hash_receive;
			
 
				+        auto in_ptr = read_binary_number(in_data, &hash_receive);
			
 
				+        auto end_ptr = in_data + msg_len;
			
 
				+        CALL_CHECK(XXH64_update(hash_state, in_ptr, end_ptr - in_ptr) != XXH_ERROR);
			
 
				+        return hash_receive == XXH64_digest(hash_state);
			
 
				+    }
			
 
				+
			
 
				+    void handle_test_msg(size_t msg_len) {
			
 
				+        uint64_t frame_salt;
			
 
				+        auto in_ptr = read_binary_number(in_data + frame_salt_offset, &frame_salt);
			
 
				+        uint16_t content_len = msg_len - (in_ptr - in_data);
			
 
				+
			
 
				+        // fill output buffer
			
 
				+        auto out_ptr = out_data;
			
 
				+        *(out_ptr++) = 'R';
			
 
				+        out_ptr = write_binary_number(out_ptr, frame_salt);
			
 
				+        out_ptr = write_binary_number(out_ptr, content_len);
			
 
				+
			
 
				+        // send test reply
			
 
				+        auto out_buf = buffer(out_data, out_ptr - out_data);
			
 
				+        socket->send(out_buf);
			
 
				+        SPDLOG_TRACE("Received test message with length {}.", content_len);
			
 
				+    }
			
 
				+
			
 
				+    void handle_frame_frag(size_t msg_len) {
			
 
				+        frag_header header;
			
 
				+        auto in_ptr = read_frag_header(in_data, &header);
			
 
				+        if (header.frame_id <= last_decoded_frame) return;
			
 
				+        assert((msg_len - (in_ptr - in_data)) == header.block_size);
			
 
				+
			
 
				+        auto iter = received_list.begin();
			
 
				+        while (iter != received_list.end() && header.frame_id > iter->frame_id) ++iter;
			
 
				+        if (iter != received_list.end() && header.frame_id == iter->frame_id) { // existing and pending frame
			
 
				+            // TODO: check consistency
			
 
				+            if (!iter->is_missing[header.block_id]) return; // already received
			
 
				+            auto block_ptr = iter->data + header.block_id * iter->block_size;
			
 
				+            memcpy(block_ptr, in_ptr, iter->block_size);
			
 
				+            iter->is_missing[header.block_id] = false;
			
 
				+            ++(iter->received_count);
			
 
				+        } else { // new frame
			
 
				+            if (header.frame_type == 'I') { // IDR frame
			
 
				+                if (iter != received_list.end()) return; // old IDR frame
			
 
				+                if (!received_list.empty()) {
			
 
				+                    received_list.clear();
			
 
				+                    iter = received_list.end();
			
 
				+                }
			
 
				+            }
			
 
				+            iter = received_list.insert(iter, received_frame_info{});
			
 
				+            SPDLOG_DEBUG("Pending size: {}.", received_list.size());
			
 
				+            iter->is_idr_frame = header.frame_type == 'I';
			
 
				+            iter->frame_salt = header.frame_salt;
			
 
				+            iter->frame_id = header.frame_id;
			
 
				+            iter->frame_length = header.frame_length;
			
 
				+            iter->block_size = header.block_size;
			
 
				+            iter->block_count = header.block_count;
			
 
				+            iter->decode_count = header.frame_decode_count;
			
 
				+            iter->received_count = 1; // this frag
			
 
				+
			
 
				+            static_assert(sizeof(bool) == 1);
			
 
				+            iter->is_missing = (bool *) malloc(iter->block_count);
			
 
				+            memset(iter->is_missing, 0x01, iter->block_count); // set to true
			
 
				+            iter->is_missing[header.block_id] = false;
			
 
				+
			
 
				+            iter->data = (char *) malloc(iter->block_size * iter->block_count);
			
 
				+            auto block_ptr = iter->data + header.block_id * iter->block_size;
			
 
				+            memcpy(block_ptr, in_ptr, iter->block_size);
			
 
				+        }
			
 
				+        SPDLOG_TRACE("Received {} frame {}, block {}.",
			
 
				+                     iter->is_idr_frame ? 'I' : 'P', iter->frame_id, header.block_id);
			
 
				+    }
			
 
				+
			
 
				+    void handle_message(size_t msg_len) {
			
 
				+        char frag_type = in_data[frag_type_offset];
			
 
				+        if (frag_type == 'T') {
			
 
				+            handle_test_msg(msg_len);
			
 
				+        } else if (frag_type == 'I' || frag_type == 'P') {
			
 
				+            handle_frame_frag(msg_len);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    bool try_decode_frame() {
			
 
				+        if (received_list.empty()) return false;
			
 
				+        auto iter = received_list.begin();
			
 
				+        if (iter->received_count < iter->decode_count) return false; // no enough block to decode
			
 
				+        if (iter->frame_id == last_decoded_frame + 1 || iter->is_idr_frame) { // ready to decode
			
 
				+            // FEC correct
			
 
				+            auto data_blocks = iter->decode_count;
			
 
				+            auto parity_blocks = iter->block_count - iter->decode_count;
			
 
				+            auto rs = reed_solomon_new(data_blocks, parity_blocks);
			
 
				+            auto closer = sg::make_scope_guard([&]() {
			
 
				+                reed_solomon_release(rs);
			
 
				+            });
			
 
				+            auto ret = reed_solomon_reconstruct(rs, (unsigned char **) iter->data, (unsigned char *) iter->is_missing,
			
 
				+                                                iter->block_count, iter->block_size);
			
 
				+            assert(ret == 0);
			
 
				+
			
 
				+            decoder->decode_frame(iter->data, iter->frame_length);
			
 
				+            SPDLOG_TRACE("Frame {} decoded.", iter->frame_id);
			
 
				+
			
 
				+            // send confirm reply
			
 
				+            auto out_ptr = out_data;
			
 
				+            *(out_ptr++) = 'C';
			
 
				+            out_ptr = write_binary_number(out_ptr, iter->frame_salt);
			
 
				+            auto out_buf = buffer(out_data, out_ptr - out_data);
			
 
				+            for (int k = 0; k < reply_repeat_count; ++k) {
			
 
				+                socket->send(out_buf);
			
 
				+            }
			
 
				+
			
 
				+            last_decoded_frame = iter->frame_id;
			
 
				+            received_list.erase(iter);
			
 
				+            return true;
			
 
				+        }
			
 
				+        return false;
			
 
				+    }
			
 
				+
			
 
				+    awaitable<void> main_loop() {
			
 
				+        auto in_buf = buffer(in_data, buffer_size);
			
 
				+        for (;;) {
			
 
				+            co_await socket->async_wait(udp::socket::wait_read, use_awaitable);
			
 
				+            while (socket->available() > 0) {
			
 
				+                udp::endpoint sender_endpoint;
			
 
				+                auto msg_len = socket->receive_from(in_buf, sender_endpoint);
			
 
				+                if (sender_endpoint != server_endpoint) continue;
			
 
				+                if (!check_in_hash(msg_len)) continue;
			
 
				+                handle_message(msg_len);
			
 
				+            }
			
 
				+            while (try_decode_frame());
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    void start() {
			
 
				+        context.reset(new io_context{});
			
 
				+        socket.reset(new udp::socket{*context});
			
 
				+        socket->connect(server_endpoint);
			
 
				+        socket->set_option(udp::socket::receive_buffer_size{10 * 1024 * 1024}); // 10MB receive buffer
			
 
				+        assert(socket->is_open());
			
 
				+        reconnect();
			
 
				+        co_spawn(*context, main_loop(), detached);
			
 
				+        context->run(); // TODO: support server timeout
			
 
				+    }
			
 
				+
			
 
				+    void reconnect() {
			
 
				+        out_data[0] = 'R';
			
 
				+        auto out_buf = buffer(out_data, 1);
			
 
				+        socket->send(out_buf);
			
 
				+    }
			
 
				+
			
 
				+};
			
 
				+
			
 
				+frame_receiver::frame_receiver()
			
 
				+        : pimpl(std::make_unique<impl>()) {
			
 
				+    fec_init();
			
 
				+}
			
 
				+
			
 
				+frame_receiver::~frame_receiver() = default;
			
 
				+
			
 
				+bool frame_receiver::start(std::string_view address, uint16_t port, video_decoder *decoder) {
			
 
				+    pimpl->server_endpoint = udp::endpoint{address::from_string(address.data()), port};
			
 
				+    pimpl->decoder = decoder;
			
 
				+    EXCEPTION_CHECK(pimpl->start());
			
 
				+    return true;
			
 
				+}
			
--- a/src/frame_receiver.h
+++ b/src/frame_receiver.h
@@ -0,0 +1,23 @@
 
				+#ifndef TINYPLAYER2_FRAME_RECEIVER_H
			
 
				+#define TINYPLAYER2_FRAME_RECEIVER_H
			
 
				+
			
 
				+#include <memory>
			
 
				+#include <string_view>
			
 
				+
			
 
				+class video_decoder;
			
 
				+
			
 
				+class frame_receiver {
			
 
				+public:
			
 
				+    frame_receiver();
			
 
				+
			
 
				+    ~frame_receiver();
			
 
				+
			
 
				+    bool start(std::string_view address, uint16_t port, video_decoder *decoder);
			
 
				+
			
 
				+private:
			
 
				+    struct impl;
			
 
				+    std::unique_ptr<impl> pimpl;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+#endif //TINYPLAYER2_FRAME_RECEIVER_H
			
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -0,0 +1,206 @@
 
				+#include "config.h"
			
 
				+#include "cuda_helper.hpp"
			
 
				+#include "frame_receiver.h"
			
 
				+#include "nv12_renderer.h"
			
 
				+#include "third_party/scope_guard.hpp"
			
 
				+#include "video_decoder.h"
			
 
				+
			
 
				+#include <imgui.h>
			
 
				+#include <imgui_impl_glfw.h>
			
 
				+#include <imgui_impl_opengl3.h>
			
 
				+
			
 
				+#include <glad/gl.h>
			
 
				+#include <GLFW/glfw3.h>
			
 
				+
			
 
				+#include <spdlog/spdlog.h>
			
 
				+
			
 
				+#include <thread>
			
 
				+
			
 
				+// for renderer
			
 
				+int monitor_index = 0;
			
 
				+char *server_address;
			
 
				+uint16_t server_port = 5277;
			
 
				+GLFWwindow *main_window = nullptr;
			
 
				+
			
 
				+// for receiver
			
 
				+video_decoder decoder;
			
 
				+
			
 
				+void controller_main() {
			
 
				+
			
 
				+    // setup imgui context
			
 
				+    IMGUI_CHECKVERSION();
			
 
				+    ImGui::CreateContext();
			
 
				+    auto io = ImGui::GetIO();
			
 
				+    io.ConfigFlags |= ImGuiConfigFlags_NavEnableKeyboard;
			
 
				+    ImGui::StyleColorsDark();
			
 
				+    ImGui_ImplGlfw_InitForOpenGL(main_window, true);
			
 
				+    ImGui_ImplOpenGL3_Init();
			
 
				+
			
 
				+    static constexpr auto server_address_length = 256;
			
 
				+    server_address = (char *) malloc(server_address_length);
			
 
				+    strcpy(server_address, "127.0.0.1");
			
 
				+    auto closer = sg::make_scope_guard([&]() {
			
 
				+        free(server_address);
			
 
				+    });
			
 
				+
			
 
				+    // main loop
			
 
				+    while (!glfwWindowShouldClose(main_window)) {
			
 
				+
			
 
				+        glfwPollEvents();
			
 
				+
			
 
				+        ImGui_ImplOpenGL3_NewFrame();
			
 
				+        ImGui_ImplGlfw_NewFrame();
			
 
				+        ImGui::NewFrame();
			
 
				+
			
 
				+        ImGui::ShowDemoWindow();
			
 
				+
			
 
				+        if (ImGui::Begin("TinyPlayer Control")) {
			
 
				+            // action
			
 
				+            ImGui::SeparatorText("Actions");
			
 
				+
			
 
				+            // configs
			
 
				+            ImGui::SeparatorText("Configs");
			
 
				+            ImGui::PushItemWidth(200);
			
 
				+
			
 
				+            // monitor
			
 
				+            int monitor_count;
			
 
				+            auto monitors = glfwGetMonitors(&monitor_count);
			
 
				+            if (monitor_index >= monitor_count) {
			
 
				+                monitor_index = 0;
			
 
				+            }
			
 
				+            auto monitor_name_preview = glfwGetMonitorName(monitors[monitor_index]);
			
 
				+            if (ImGui::BeginCombo("Monitor", monitor_name_preview)) { // let user select monitors
			
 
				+                for (int k = 0; k < monitor_count; ++k) {
			
 
				+                    auto is_selected = (monitor_index == k);
			
 
				+                    if (ImGui::Selectable(glfwGetMonitorName(monitors[k]), is_selected)) {
			
 
				+                        monitor_index = k;
			
 
				+                    }
			
 
				+                    if (is_selected) {
			
 
				+                        ImGui::SetItemDefaultFocus();
			
 
				+                    }
			
 
				+                }
			
 
				+                ImGui::EndCombo();
			
 
				+            }
			
 
				+
			
 
				+            // server address
			
 
				+            ImGui::InputText("Server IP", server_address, server_address_length);
			
 
				+
			
 
				+            // server port
			
 
				+            static uint16_t val_one = 1;
			
 
				+            ImGui::InputScalar("Server Port", ImGuiDataType_U16, &server_port, &val_one, nullptr, "%u");
			
 
				+
			
 
				+            ImGui::PopItemWidth();
			
 
				+        }
			
 
				+        ImGui::End();
			
 
				+        ImGui::Render();
			
 
				+
			
 
				+        int frame_width, frame_height;
			
 
				+        glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
			
 
				+        glfwGetFramebufferSize(main_window, &frame_width, &frame_height);
			
 
				+        glViewport(0, 0, frame_width, frame_height);
			
 
				+        glClear(GL_COLOR_BUFFER_BIT);
			
 
				+
			
 
				+        ImGui_ImplOpenGL3_RenderDrawData(ImGui::GetDrawData());
			
 
				+        glfwSwapBuffers(main_window);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void render_main() {
			
 
				+    // setup cuda
			
 
				+    cuInit(0);
			
 
				+    CUcontext cuda_ctx;
			
 
				+    get_cuda_primary_context(&cuda_ctx);
			
 
				+    cuCtxPushCurrent(cuda_ctx);
			
 
				+
			
 
				+    std::thread receiver_thread{[]() {
			
 
				+        // setup cuda
			
 
				+        CUcontext cuda_ctx;
			
 
				+        get_cuda_primary_context(&cuda_ctx);
			
 
				+        cuCtxPushCurrent(cuda_ctx);
			
 
				+
			
 
				+        frame_receiver receiver;
			
 
				+        decoder.start();
			
 
				+        receiver.start(server_address, server_port, &decoder);
			
 
				+    }};
			
 
				+
			
 
				+    nv12_renderer renderer;
			
 
				+    while (!glfwWindowShouldClose(main_window)) {
			
 
				+
			
 
				+        // retrieve new image
			
 
				+        decoder.retrieve_frame(&renderer);
			
 
				+
			
 
				+        // adjust window
			
 
				+        int fbo_width, fbo_height;
			
 
				+        glfwGetFramebufferSize(main_window, &fbo_width, &fbo_height);
			
 
				+        glViewport(0, 0, fbo_width, fbo_height);
			
 
				+        auto fbo_wh_ratio = 1.0f * fbo_width / fbo_height;
			
 
				+
			
 
				+        // draw image
			
 
				+        renderer.render(fbo_wh_ratio);
			
 
				+
			
 
				+        glfwSwapBuffers(main_window);
			
 
				+        glFinish();
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+int main(int argc, char *argv[]) {
			
 
				+
			
 
				+    // config options
			
 
				+    bool is_controller = (argc == 1);
			
 
				+    if (!is_controller) {
			
 
				+        assert(argc == 4);
			
 
				+        monitor_index = std::atoi(argv[1]);
			
 
				+        server_address = argv[2];
			
 
				+        server_port = std::atoi(argv[3]);
			
 
				+    }
			
 
				+
			
 
				+    // setup glfw
			
 
				+    glfwSetErrorCallback([](int error, const char *desc) {
			
 
				+        SPDLOG_ERROR("GLFW error: code = {}, description = {}", error, desc);
			
 
				+        assert(false);
			
 
				+    });
			
 
				+    auto ret = glfwInit();
			
 
				+    assert(ret == GLFW_TRUE);
			
 
				+    glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
			
 
				+    glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 6);
			
 
				+    glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
			
 
				+
			
 
				+    // setup window
			
 
				+    if (is_controller) {
			
 
				+        main_window = glfwCreateWindow(main_window_width, main_window_height, "TinyPlayer Control", nullptr, nullptr);
			
 
				+    } else { // is render
			
 
				+        int monitor_count;
			
 
				+        auto monitors = glfwGetMonitors(&monitor_count);
			
 
				+        assert(monitor_count > monitor_index);
			
 
				+        auto monitor = monitors[monitor_index];
			
 
				+        auto video_mode = glfwGetVideoMode(monitor);
			
 
				+        main_window = glfwCreateWindow(video_mode->width, video_mode->height, "TinyPlayer", monitor, nullptr);
			
 
				+    }
			
 
				+    glfwMakeContextCurrent(main_window);
			
 
				+    glfwSwapInterval(1);
			
 
				+
			
 
				+    // load opengl functions
			
 
				+    auto version = gladLoadGL(glfwGetProcAddress);
			
 
				+    assert(version > 0);
			
 
				+    SPDLOG_INFO("Loaded OpenGL {}.{}", GLAD_VERSION_MAJOR(version), GLAD_VERSION_MINOR(version));
			
 
				+
			
 
				+#ifndef NDEBUG
			
 
				+    // log opengl error
			
 
				+    glEnable(GL_DEBUG_OUTPUT);
			
 
				+    glDebugMessageCallback([](GLenum source, GLenum type, GLuint id, GLenum severity,
			
 
				+                              GLsizei length, const GLchar *message, const void *user_data) {
			
 
				+        if (type == GL_DEBUG_TYPE_ERROR) {
			
 
				+            SPDLOG_ERROR("OpenGL error: type = {}, severity = {}, message = {}", type, severity, message);
			
 
				+            assert(false);
			
 
				+        }
			
 
				+    }, nullptr);
			
 
				+#endif
			
 
				+
			
 
				+    if (is_controller) {
			
 
				+        controller_main();
			
 
				+    } else {
			
 
				+        render_main();
			
 
				+    }
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
--- a/src/nv12_renderer.cpp
+++ b/src/nv12_renderer.cpp
@@ -0,0 +1,266 @@
 
				+#include "nv12_renderer.h"
			
 
				+
			
 
				+#include <glad/gl.h>
			
 
				+
			
 
				+#include <cuda_gl_interop.h>
			
 
				+
			
 
				+static constexpr auto vertex_shader_source = R"(
			
 
				+        #version 460
			
 
				+        layout (location = 0) in vec2 pos_in;
			
 
				+        layout (location = 1) in vec2 tex_coord_in;
			
 
				+        out vec2 tex_coord;
			
 
				+        void main() {
			
 
				+            gl_Position = vec4(pos_in, 0, 1);
			
 
				+            tex_coord = tex_coord_in;
			
 
				+        }
			
 
				+    )";
			
 
				+
			
 
				+static constexpr auto fragment_shader_source = R"(
			
 
				+        #version 460
			
 
				+        layout (location = 0) out vec4 color_out;
			
 
				+        in vec2 tex_coord;
			
 
				+        uniform sampler2D luma_tex;
			
 
				+        uniform sampler2D chroma_tex;
			
 
				+        void main() {
			
 
				+            vec3 yuv, rgb;
			
 
				+            yuv.x = texture(luma_tex, tex_coord).x;
			
 
				+            yuv.yz = texture(chroma_tex, tex_coord).xy - vec2(0.5, 0.5);
			
 
				+            rgb = mat3(1, 1, 1,
			
 
				+                       0, -0.39465, 2.03211,
			
 
				+                       1.13983, -0.5806, 0) * yuv;
			
 
				+            color_out = vec4(rgb, 1.0);
			
 
				+        }
			
 
				+    )";
			
 
				+
			
 
				+static constexpr GLuint indices[] = {
			
 
				+        0, 1, 3, // first triangle
			
 
				+        1, 2, 3 // second triangle
			
 
				+};
			
 
				+
			
 
				+struct nv12_renderer::impl {
			
 
				+
			
 
				+    // program related
			
 
				+    GLuint vertex_array = 0;
			
 
				+    GLuint vertex_buffer = 0, element_buffer = 0;
			
 
				+    GLuint program = 0;
			
 
				+    GLint luma_tex_loc = 0, chroma_tex_loc = 0;
			
 
				+
			
 
				+    // frame related
			
 
				+    int frame_width = 0, frame_height = 0;
			
 
				+    GLuint luma_pbo = 0, chroma_pbo = 0;
			
 
				+    GLuint luma_tex = 0, chroma_tex = 0;
			
 
				+    cudaGraphicsResource *luma_res = nullptr, *chroma_res = nullptr;
			
 
				+
			
 
				+    impl() {
			
 
				+        // build program
			
 
				+        auto vertex_shader = glCreateShader(GL_VERTEX_SHADER);
			
 
				+        auto fragment_shader = glCreateShader(GL_FRAGMENT_SHADER);
			
 
				+        compile_shader(vertex_shader, vertex_shader_source, "vertex");
			
 
				+        compile_shader(fragment_shader, fragment_shader_source, "fragment");
			
 
				+        program = glCreateProgram();
			
 
				+        glAttachShader(program, vertex_shader);
			
 
				+        glAttachShader(program, fragment_shader);
			
 
				+        glLinkProgram(program);
			
 
				+        check_program();
			
 
				+        glDeleteShader(vertex_shader);
			
 
				+        glDeleteShader(fragment_shader);
			
 
				+
			
 
				+        // get texture location
			
 
				+        luma_tex_loc = glGetUniformLocation(program, "luma_tex");
			
 
				+        chroma_tex_loc = glGetUniformLocation(program, "chroma_tex");
			
 
				+
			
 
				+        // create buffers
			
 
				+        static_assert(offsetof(impl, element_buffer) - offsetof(impl, vertex_buffer) == sizeof(GLuint));
			
 
				+        glGenBuffers(2, &vertex_buffer);
			
 
				+
			
 
				+        // config vertex buffer
			
 
				+        glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
			
 
				+        glBufferStorage(GL_ARRAY_BUFFER, 16 * sizeof(GLfloat), nullptr, GL_DYNAMIC_STORAGE_BIT);
			
 
				+
			
 
				+        // fill element buffer
			
 
				+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, element_buffer);
			
 
				+        glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(indices), indices, GL_STATIC_DRAW);
			
 
				+
			
 
				+        // config vertex array
			
 
				+        glGenVertexArrays(1, &vertex_array);
			
 
				+        glBindVertexArray(vertex_array);
			
 
				+        glEnableVertexAttribArray(0);
			
 
				+        glEnableVertexAttribArray(1);
			
 
				+        glVertexAttribPointer(0, 2, GL_FLOAT, false, 4 * sizeof(GLfloat), (void *) 0);
			
 
				+        glVertexAttribPointer(1, 2, GL_FLOAT, false, 4 * sizeof(GLfloat), (void *) (2 * sizeof(GLfloat)));
			
 
				+    }
			
 
				+
			
 
				+    ~impl() {
			
 
				+        glDeleteBuffers(2, &vertex_buffer);
			
 
				+        glDeleteProgram(program);
			
 
				+        if (luma_pbo != 0) {
			
 
				+            cudaGraphicsUnregisterResource(luma_res);
			
 
				+            cudaGraphicsUnregisterResource(chroma_res);
			
 
				+            glDeleteBuffers(2, &luma_pbo);
			
 
				+            glDeleteTextures(2, &luma_tex);
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    static void compile_shader(GLuint shader, const char *source, const char *name) {
			
 
				+        glShaderSource(shader, 1, &source, nullptr);
			
 
				+        glCompileShader(shader);
			
 
				+        GLint status, log_length;
			
 
				+        glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
			
 
				+        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_length);
			
 
				+        auto info_log = (GLchar *) malloc(log_length);
			
 
				+        glGetShaderInfoLog(shader, log_length, nullptr, info_log);
			
 
				+        if (status == GL_TRUE) {
			
 
				+            SPDLOG_INFO("Compile {} shader succeeded: {}", name, info_log);
			
 
				+        } else {
			
 
				+            SPDLOG_ERROR("Compile {} shader failed: {}", name, info_log);
			
 
				+            assert(false);
			
 
				+        }
			
 
				+        free(info_log);
			
 
				+    }
			
 
				+
			
 
				+    void check_program() const {
			
 
				+        GLint status, log_length;
			
 
				+        glGetProgramiv(program, GL_LINK_STATUS, &status);
			
 
				+        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &log_length);
			
 
				+        auto info_log = (GLchar *) malloc(log_length);
			
 
				+        glGetProgramInfoLog(program, log_length, nullptr, info_log);
			
 
				+        if (status == GL_TRUE) {
			
 
				+            SPDLOG_INFO("Link program succeeded: {}", info_log);
			
 
				+        } else {
			
 
				+            SPDLOG_ERROR("Link program failed: {}", info_log);
			
 
				+            assert(false);
			
 
				+        }
			
 
				+        free(info_log);
			
 
				+    }
			
 
				+
			
 
				+    static void config_plane_texture() {
			
 
				+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
			
 
				+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
			
 
				+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_BORDER);
			
 
				+        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_BORDER);
			
 
				+    }
			
 
				+
			
 
				+    bool config_frame() {
			
 
				+        // generate and allocate pixel buffers
			
 
				+        auto luma_size = frame_width * frame_height;
			
 
				+        auto chroma_size = frame_width * (frame_height >> 1);
			
 
				+        static_assert(offsetof(impl, chroma_pbo) - offsetof(impl, luma_pbo) == sizeof(GLuint));
			
 
				+        glGenBuffers(2, &luma_pbo);
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, luma_pbo);
			
 
				+        glBufferStorage(GL_PIXEL_UNPACK_BUFFER, luma_size, nullptr, GL_DYNAMIC_STORAGE_BIT | GL_MAP_WRITE_BIT);
			
 
				+
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, chroma_pbo);
			
 
				+        glBufferStorage(GL_PIXEL_UNPACK_BUFFER, chroma_size, nullptr, GL_DYNAMIC_STORAGE_BIT | GL_MAP_WRITE_BIT);
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
			
 
				+
			
 
				+        // generate and allocate textures
			
 
				+        static_assert(offsetof(impl, chroma_tex) - offsetof(impl, luma_tex) == sizeof(GLuint));
			
 
				+        glGenTextures(2, &luma_tex);
			
 
				+        glBindTexture(GL_TEXTURE_2D, luma_tex);
			
 
				+        config_plane_texture();
			
 
				+        glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, frame_width, frame_height);
			
 
				+
			
 
				+        glBindTexture(GL_TEXTURE_2D, chroma_tex);
			
 
				+        config_plane_texture();
			
 
				+        glTexStorage2D(GL_TEXTURE_2D, 1, GL_RG8, frame_width >> 1, frame_height >> 1);
			
 
				+        static const GLfloat chroma_black[] = {0.5, 0.5, 0, 0};
			
 
				+        glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, chroma_black);
			
 
				+
			
 
				+        // register resources
			
 
				+        CUDA_API_CHECK(cudaGraphicsGLRegisterBuffer(&luma_res, luma_pbo, cudaGraphicsRegisterFlagsWriteDiscard));
			
 
				+        CUDA_API_CHECK(cudaGraphicsGLRegisterBuffer(&chroma_res, chroma_pbo, cudaGraphicsRegisterFlagsWriteDiscard));
			
 
				+
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    void upload_frame() {
			
 
				+        // luma
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, luma_pbo);
			
 
				+        glBindTexture(GL_TEXTURE_2D, luma_tex);
			
 
				+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, frame_width, frame_height, GL_RED, GL_UNSIGNED_BYTE, (void *) 0);
			
 
				+
			
 
				+        // chroma
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, chroma_pbo);
			
 
				+        glBindTexture(GL_TEXTURE_2D, chroma_tex);
			
 
				+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, frame_width >> 1, frame_height >> 1,
			
 
				+                        GL_RG, GL_UNSIGNED_BYTE, (void *) 0);
			
 
				+
			
 
				+        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
			
 
				+    }
			
 
				+
			
 
				+    void render(float factor) {
			
 
				+        // bindings
			
 
				+        glUseProgram(program);
			
 
				+        glBindVertexArray(vertex_array);
			
 
				+        glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
			
 
				+        glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, element_buffer);
			
 
				+
			
 
				+        // setup nv12 textures
			
 
				+        glUniform1i(luma_tex_loc, 0);
			
 
				+        glUniform1i(chroma_tex_loc, 1);
			
 
				+        glActiveTexture(GL_TEXTURE0 + 0);
			
 
				+        glBindTexture(GL_TEXTURE_2D, luma_tex);
			
 
				+        glActiveTexture(GL_TEXTURE0 + 1);
			
 
				+        glBindTexture(GL_TEXTURE_2D, chroma_tex);
			
 
				+
			
 
				+        // fill vertex buffer
			
 
				+        float x_min, x_max, y_min, y_max;
			
 
				+        if (factor >= 1) {
			
 
				+            x_min = 0;
			
 
				+            x_max = 1;
			
 
				+            y_min = -factor / 2 + 0.5f;
			
 
				+            y_max = factor / 2 + 0.5f;
			
 
				+        } else {
			
 
				+            factor = 1 / factor;
			
 
				+            x_min = -factor / 2 + 0.5f;
			
 
				+            x_max = factor / 2 + 0.5f;
			
 
				+            y_min = 0;
			
 
				+            y_max = 1;
			
 
				+        }
			
 
				+        GLfloat vertices[16] = {
			
 
				+                // 2 for position; 2 for texture
			
 
				+                1, -1, x_max, y_max, // top right
			
 
				+                1, 1, x_max, y_min, // bottom right
			
 
				+                -1, 1, x_min, y_min, // bottom left
			
 
				+                -1, -1, x_min, y_max // top left
			
 
				+        };
			
 
				+        static_assert(sizeof(vertices) == 16 * sizeof(GLfloat));
			
 
				+        glBufferSubData(GL_ARRAY_BUFFER, 0, sizeof(vertices), vertices);
			
 
				+
			
 
				+        // draw texture
			
 
				+        glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, nullptr);
			
 
				+    }
			
 
				+};
			
 
				+
			
 
				+nv12_renderer::nv12_renderer()
			
 
				+        : pimpl(std::make_unique<impl>()) {}
			
 
				+
			
 
				+nv12_renderer::~nv12_renderer() = default;
			
 
				+
			
 
				+void nv12_renderer::config_frame(int frame_width, int frame_height) {
			
 
				+    if (pimpl->luma_pbo != 0) [[likely]] {
			
 
				+        assert(pimpl->frame_width == frame_width);
			
 
				+        assert(pimpl->frame_height == frame_height);
			
 
				+        return;
			
 
				+    }
			
 
				+
			
 
				+    // config OpenGL staffs
			
 
				+    assert((frame_width & 1) == 0 && (frame_height & 1) == 0);
			
 
				+    pimpl->frame_width = frame_width;
			
 
				+    pimpl->frame_height = frame_height;
			
 
				+    pimpl->config_frame();
			
 
				+}
			
 
				+
			
 
				+void nv12_renderer::render(float factor) {
			
 
				+    pimpl->render(factor);
			
 
				+}
			
 
				+
			
 
				+void nv12_renderer::get_frame_res(cudaGraphicsResource **luma_res,
			
 
				+                                  cudaGraphicsResource **chroma_res) {
			
 
				+    *luma_res = pimpl->luma_res;
			
 
				+    *chroma_res = pimpl->chroma_res;
			
 
				+}
			
 
				+
			
 
				+void nv12_renderer::upload_frame() {
			
 
				+    pimpl->upload_frame();
			
 
				+}
			
--- a/src/nv12_renderer.h
+++ b/src/nv12_renderer.h
@@ -0,0 +1,30 @@
 
				+#ifndef TINYPLAYER2_NV12_RENDERER_H
			
 
				+#define TINYPLAYER2_NV12_RENDERER_H
			
 
				+
			
 
				+#include "cuda_helper.hpp"
			
 
				+
			
 
				+#include <memory>
			
 
				+
			
 
				+class nv12_renderer {
			
 
				+public:
			
 
				+    nv12_renderer();
			
 
				+
			
 
				+    ~nv12_renderer();
			
 
				+
			
 
				+    void config_frame(int frame_width, int frame_height);
			
 
				+
			
 
				+    void get_frame_res(cudaGraphicsResource **luma_res,
			
 
				+                       cudaGraphicsResource **chroma_res);
			
 
				+
			
 
				+    void upload_frame();
			
 
				+
			
 
				+    void render(float factor);
			
 
				+
			
 
				+private:
			
 
				+    struct impl;
			
 
				+    std::unique_ptr<impl> pimpl;
			
 
				+
			
 
				+};
			
 
				+
			
 
				+
			
 
				+#endif //TINYPLAYER2_NV12_RENDERER_H
			
--- a/src/third_party/rs.c
+++ b/src/third_party/rs.c
@@ -0,0 +1,991 @@
 
				+/*#define PROFILE*/
			
 
				+/*
			
 
				+ * fec.c -- forward error correction based on Vandermonde matrices
			
 
				+ * 980624
			
 
				+ * (C) 1997-98 Luigi Rizzo (luigi@iet.unipi.it)
			
 
				+ * (C) 2001 Alain Knaff (alain@knaff.lu)
			
 
				+ *
			
 
				+ * Portions derived from code by Phil Karn (karn@ka9q.ampr.org),
			
 
				+ * Robert Morelos-Zaragoza (robert@spectra.eng.hawaii.edu) and Hari
			
 
				+ * Thirumoorthy (harit@spectra.eng.hawaii.edu), Aug 1995
			
 
				+ *
			
 
				+ * Redistribution and use in source and binary forms, with or without
			
 
				+ * modification, are permitted provided that the following conditions
			
 
				+ * are met:
			
 
				+ *
			
 
				+ * 1. Redistributions of source code must retain the above copyright
			
 
				+ *    notice, this list of conditions and the following disclaimer.
			
 
				+ * 2. Redistributions in binary form must reproduce the above
			
 
				+ *    copyright notice, this list of conditions and the following
			
 
				+ *    disclaimer in the documentation and/or other materials
			
 
				+ *    provided with the distribution.
			
 
				+ *
			
 
				+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND
			
 
				+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
			
 
				+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
			
 
				+ * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS
			
 
				+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
			
 
				+ * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
			
 
				+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
			
 
				+ * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
			
 
				+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
			
 
				+ * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
			
 
				+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
			
 
				+ * OF SUCH DAMAGE.
			
 
				+ *
			
 
				+ * Reimplement by Jannson (20161018): compatible for golang version of https://github.com/klauspost/reedsolomon
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * The following parameter defines how many bits are used for
			
 
				+ * field elements. The code supports any value from 2 to 16
			
 
				+ * but fastest operation is achieved with 8 bit elements
			
 
				+ * This is the only parameter you may want to change.
			
 
				+ */
			
 
				+#define GF_BITS  8  /* code over GF(2**GF_BITS) - change to suit */
			
 
				+
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <string.h>
			
 
				+
			
 
				+#include <assert.h>
			
 
				+#include "rs.h"
			
 
				+
			
 
				+/*
			
 
				+ * stuff used for testing purposes only
			
 
				+ */
			
 
				+
			
 
				+#ifdef  TEST
			
 
				+#define DEB(x)
			
 
				+#define DDB(x) x
			
 
				+#define DEBUG   0   /* minimal debugging */
			
 
				+
			
 
				+#include <sys/time.h>
			
 
				+#define DIFF_T(a,b) \
			
 
				+    (1+ 1000000*(a.tv_sec - b.tv_sec) + (a.tv_usec - b.tv_usec) )
			
 
				+
			
 
				+#define TICK(t) \
			
 
				+    {struct timeval x ; \
			
 
				+    gettimeofday(&x, NULL) ; \
			
 
				+    t = x.tv_usec + 1000000* (x.tv_sec & 0xff ) ; \
			
 
				+    }
			
 
				+#define TOCK(t) \
			
 
				+    { u_long t1 ; TICK(t1) ; \
			
 
				+      if (t1 < t) t = 256000000 + t1 - t ; \
			
 
				+      else t = t1 - t ; \
			
 
				+      if (t == 0) t = 1 ;}
			
 
				+
			
 
				+u_long ticks[10];   /* vars for timekeeping */
			
 
				+#else
			
 
				+#define DEB(x)
			
 
				+#define DDB(x)
			
 
				+#define TICK(x)
			
 
				+#define TOCK(x)
			
 
				+#endif /* TEST */
			
 
				+
			
 
				+/*
			
 
				+ * You should not need to change anything beyond this point.
			
 
				+ * The first part of the file implements linear algebra in GF.
			
 
				+ *
			
 
				+ * gf is the type used to store an element of the Galois Field.
			
 
				+ * Must constain at least GF_BITS bits.
			
 
				+ *
			
 
				+ * Note: unsigned char will work up to GF(256) but int seems to run
			
 
				+ * faster on the Pentium. We use int whenever have to deal with an
			
 
				+ * index, since they are generally faster.
			
 
				+ */
			
 
				+/*
			
 
				+ * AK: Udpcast only uses GF_BITS=8. Remove other possibilities
			
 
				+ */
			
 
				+#if (GF_BITS != 8)
			
 
				+#error "GF_BITS must be 8"
			
 
				+#endif
			
 
				+typedef unsigned char gf;
			
 
				+
			
 
				+#define GF_SIZE ((1 << GF_BITS) - 1)    /* powers of \alpha */
			
 
				+
			
 
				+/*
			
 
				+ * Primitive polynomials - see Lin & Costello, Appendix A,
			
 
				+ * and  Lee & Messerschmitt, p. 453.
			
 
				+ */
			
 
				+static char *allPp[] = {    /* GF_BITS  polynomial      */
			
 
				+        NULL,           /*  0   no code         */
			
 
				+        NULL,           /*  1   no code         */
			
 
				+        "111",          /*  2   1+x+x^2         */
			
 
				+        "1101",         /*  3   1+x+x^3         */
			
 
				+        "11001",            /*  4   1+x+x^4         */
			
 
				+        "101001",           /*  5   1+x^2+x^5       */
			
 
				+        "1100001",          /*  6   1+x+x^6         */
			
 
				+        "10010001",         /*  7   1 + x^3 + x^7       */
			
 
				+        "101110001",        /*  8   1+x^2+x^3+x^4+x^8   */
			
 
				+        "1000100001",       /*  9   1+x^4+x^9       */
			
 
				+        "10010000001",      /* 10   1+x^3+x^10      */
			
 
				+        "101000000001",     /* 11   1+x^2+x^11      */
			
 
				+        "1100101000001",        /* 12   1+x+x^4+x^6+x^12    */
			
 
				+        "11011000000001",       /* 13   1+x+x^3+x^4+x^13    */
			
 
				+        "110000100010001",      /* 14   1+x+x^6+x^10+x^14   */
			
 
				+        "1100000000000001",     /* 15   1+x+x^15        */
			
 
				+        "11010000000010001"     /* 16   1+x+x^3+x^12+x^16   */
			
 
				+};
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * To speed up computations, we have tables for logarithm, exponent
			
 
				+ * and inverse of a number. If GF_BITS <= 8, we use a table for
			
 
				+ * multiplication as well (it takes 64K, no big deal even on a PDA,
			
 
				+ * especially because it can be pre-initialized an put into a ROM!),
			
 
				+ * otherwhise we use a table of logarithms.
			
 
				+ * In any case the macro gf_mul(x,y) takes care of multiplications.
			
 
				+ */
			
 
				+
			
 
				+static gf gf_exp[2*GF_SIZE];    /* index->poly form conversion table    */
			
 
				+static int gf_log[GF_SIZE + 1]; /* Poly->index form conversion table    */
			
 
				+static gf inverse[GF_SIZE+1];   /* inverse of field elem.       */
			
 
				+/* inv[\alpha**i]=\alpha**(GF_SIZE-i-1) */
			
 
				+
			
 
				+/*
			
 
				+ * modnn(x) computes x % GF_SIZE, where GF_SIZE is 2**GF_BITS - 1,
			
 
				+ * without a slow divide.
			
 
				+ */
			
 
				+static inline gf
			
 
				+modnn(int x)
			
 
				+{
			
 
				+    while (x >= GF_SIZE) {
			
 
				+        x -= GF_SIZE;
			
 
				+        x = (x >> GF_BITS) + (x & GF_SIZE);
			
 
				+    }
			
 
				+    return x;
			
 
				+}
			
 
				+
			
 
				+#define SWAP(a,b,t) {t tmp; tmp=a; a=b; b=tmp;}
			
 
				+
			
 
				+/*
			
 
				+ * gf_mul(x,y) multiplies two numbers. If GF_BITS<=8, it is much
			
 
				+ * faster to use a multiplication table.
			
 
				+ *
			
 
				+ * USE_GF_MULC, GF_MULC0(c) and GF_ADDMULC(x) can be used when multiplying
			
 
				+ * many numbers by the same constant. In this case the first
			
 
				+ * call sets the constant, and others perform the multiplications.
			
 
				+ * A value related to the multiplication is held in a local variable
			
 
				+ * declared with USE_GF_MULC . See usage in addmul1().
			
 
				+ */
			
 
				+static gf gf_mul_table[(GF_SIZE + 1)*(GF_SIZE + 1)]
			
 
				+#ifdef WINDOWS
			
 
				+        __attribute__((aligned (16)))
			
 
				+#else
			
 
				+        __attribute__((aligned (256)))
			
 
				+#endif
			
 
				+;
			
 
				+
			
 
				+#define gf_mul(x,y) gf_mul_table[(x<<8)+y]
			
 
				+
			
 
				+#define USE_GF_MULC register gf * __gf_mulc_
			
 
				+#define GF_MULC0(c) __gf_mulc_ = &gf_mul_table[(c)<<8]
			
 
				+#define GF_ADDMULC(dst, x) dst ^= __gf_mulc_[x]
			
 
				+#define GF_MULC(dst, x) dst = __gf_mulc_[x]
			
 
				+
			
 
				+static void
			
 
				+init_mul_table(void)
			
 
				+{
			
 
				+    int i, j;
			
 
				+    for (i=0; i< GF_SIZE+1; i++)
			
 
				+        for (j=0; j< GF_SIZE+1; j++)
			
 
				+            gf_mul_table[(i<<8)+j] = gf_exp[modnn(gf_log[i] + gf_log[j]) ] ;
			
 
				+
			
 
				+    for (j=0; j< GF_SIZE+1; j++)
			
 
				+        gf_mul_table[j] = gf_mul_table[j<<8] = 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Generate GF(2**m) from the irreducible polynomial p(X) in p[0]..p[m]
			
 
				+ * Lookup tables:
			
 
				+ *     index->polynomial form       gf_exp[] contains j= \alpha^i;
			
 
				+ *     polynomial form -> index form    gf_log[ j = \alpha^i ] = i
			
 
				+ * \alpha=x is the primitive element of GF(2^m)
			
 
				+ *
			
 
				+ * For efficiency, gf_exp[] has size 2*GF_SIZE, so that a simple
			
 
				+ * multiplication of two numbers can be resolved without calling modnn
			
 
				+ */
			
 
				+
			
 
				+
			
 
				+
			
 
				+/*
			
 
				+ * initialize the data structures used for computations in GF.
			
 
				+ */
			
 
				+static void
			
 
				+generate_gf(void)
			
 
				+{
			
 
				+    int i;
			
 
				+    gf mask;
			
 
				+    char *Pp =  allPp[GF_BITS] ;
			
 
				+
			
 
				+    mask = 1;   /* x ** 0 = 1 */
			
 
				+    gf_exp[GF_BITS] = 0; /* will be updated at the end of the 1st loop */
			
 
				+    /*
			
 
				+     * first, generate the (polynomial representation of) powers of \alpha,
			
 
				+     * which are stored in gf_exp[i] = \alpha ** i .
			
 
				+     * At the same time build gf_log[gf_exp[i]] = i .
			
 
				+     * The first GF_BITS powers are simply bits shifted to the left.
			
 
				+     */
			
 
				+    for (i = 0; i < GF_BITS; i++, mask <<= 1 ) {
			
 
				+        gf_exp[i] = mask;
			
 
				+        gf_log[gf_exp[i]] = i;
			
 
				+        /*
			
 
				+         * If Pp[i] == 1 then \alpha ** i occurs in poly-repr
			
 
				+         * gf_exp[GF_BITS] = \alpha ** GF_BITS
			
 
				+         */
			
 
				+        if ( Pp[i] == '1' )
			
 
				+            gf_exp[GF_BITS] ^= mask;
			
 
				+    }
			
 
				+    /*
			
 
				+     * now gf_exp[GF_BITS] = \alpha ** GF_BITS is complete, so can als
			
 
				+     * compute its inverse.
			
 
				+     */
			
 
				+    gf_log[gf_exp[GF_BITS]] = GF_BITS;
			
 
				+    /*
			
 
				+     * Poly-repr of \alpha ** (i+1) is given by poly-repr of
			
 
				+     * \alpha ** i shifted left one-bit and accounting for any
			
 
				+     * \alpha ** GF_BITS term that may occur when poly-repr of
			
 
				+     * \alpha ** i is shifted.
			
 
				+     */
			
 
				+    mask = 1 << (GF_BITS - 1 ) ;
			
 
				+    for (i = GF_BITS + 1; i < GF_SIZE; i++) {
			
 
				+        if (gf_exp[i - 1] >= mask)
			
 
				+            gf_exp[i] = gf_exp[GF_BITS] ^ ((gf_exp[i - 1] ^ mask) << 1);
			
 
				+        else
			
 
				+            gf_exp[i] = gf_exp[i - 1] << 1;
			
 
				+        gf_log[gf_exp[i]] = i;
			
 
				+    }
			
 
				+    /*
			
 
				+     * log(0) is not defined, so use a special value
			
 
				+     */
			
 
				+    gf_log[0] = GF_SIZE ;
			
 
				+    /* set the extended gf_exp values for fast multiply */
			
 
				+    for (i = 0 ; i < GF_SIZE ; i++)
			
 
				+        gf_exp[i + GF_SIZE] = gf_exp[i] ;
			
 
				+
			
 
				+    /*
			
 
				+     * again special cases. 0 has no inverse. This used to
			
 
				+     * be initialized to GF_SIZE, but it should make no difference
			
 
				+     * since noone is supposed to read from here.
			
 
				+     */
			
 
				+    inverse[0] = 0 ;
			
 
				+    inverse[1] = 1;
			
 
				+    for (i=2; i<=GF_SIZE; i++)
			
 
				+        inverse[i] = gf_exp[GF_SIZE-gf_log[i]];
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Various linear algebra operations that i use often.
			
 
				+ */
			
 
				+
			
 
				+/*
			
 
				+ * addmul() computes dst[] = dst[] + c * src[]
			
 
				+ * This is used often, so better optimize it! Currently the loop is
			
 
				+ * unrolled 16 times, a good value for 486 and pentium-class machines.
			
 
				+ * The case c=0 is also optimized, whereas c=1 is not. These
			
 
				+ * calls are unfrequent in my typical apps so I did not bother.
			
 
				+ *
			
 
				+ * Note that gcc on
			
 
				+ */
			
 
				+#if 0
			
 
				+#define addmul(dst, src, c, sz) \
			
 
				+    if (c != 0) addmul1(dst, src, c, sz)
			
 
				+#endif
			
 
				+
			
 
				+
			
 
				+
			
 
				+#define UNROLL 16 /* 1, 4, 8, 16 */
			
 
				+static void
			
 
				+slow_addmul1(gf *dst1, gf *src1, gf c, int sz)
			
 
				+{
			
 
				+    USE_GF_MULC ;
			
 
				+    register gf *dst = dst1, *src = src1 ;
			
 
				+    gf *lim = &dst[sz - UNROLL + 1] ;
			
 
				+
			
 
				+    GF_MULC0(c) ;
			
 
				+
			
 
				+#if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */
			
 
				+    for (; dst < lim ; dst += UNROLL, src += UNROLL ) {
			
 
				+        GF_ADDMULC( dst[0] , src[0] );
			
 
				+        GF_ADDMULC( dst[1] , src[1] );
			
 
				+        GF_ADDMULC( dst[2] , src[2] );
			
 
				+        GF_ADDMULC( dst[3] , src[3] );
			
 
				+#if (UNROLL > 4)
			
 
				+        GF_ADDMULC( dst[4] , src[4] );
			
 
				+        GF_ADDMULC( dst[5] , src[5] );
			
 
				+        GF_ADDMULC( dst[6] , src[6] );
			
 
				+        GF_ADDMULC( dst[7] , src[7] );
			
 
				+#endif
			
 
				+#if (UNROLL > 8)
			
 
				+        GF_ADDMULC( dst[8] , src[8] );
			
 
				+        GF_ADDMULC( dst[9] , src[9] );
			
 
				+        GF_ADDMULC( dst[10] , src[10] );
			
 
				+        GF_ADDMULC( dst[11] , src[11] );
			
 
				+        GF_ADDMULC( dst[12] , src[12] );
			
 
				+        GF_ADDMULC( dst[13] , src[13] );
			
 
				+        GF_ADDMULC( dst[14] , src[14] );
			
 
				+        GF_ADDMULC( dst[15] , src[15] );
			
 
				+#endif
			
 
				+    }
			
 
				+#endif
			
 
				+    lim += UNROLL - 1 ;
			
 
				+    for (; dst < lim; dst++, src++ )        /* final components */
			
 
				+        GF_ADDMULC( *dst , *src );
			
 
				+}
			
 
				+
			
 
				+# define addmul1 slow_addmul1
			
 
				+
			
 
				+static void addmul(gf *dst, gf *src, gf c, int sz) {
			
 
				+    // fprintf(stderr, "Dst=%p Src=%p, gf=%02x sz=%d\n", dst, src, c, sz);
			
 
				+    if (c != 0) addmul1(dst, src, c, sz);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * mul() computes dst[] = c * src[]
			
 
				+ * This is used often, so better optimize it! Currently the loop is
			
 
				+ * unrolled 16 times, a good value for 486 and pentium-class machines.
			
 
				+ * The case c=0 is also optimized, whereas c=1 is not. These
			
 
				+ * calls are unfrequent in my typical apps so I did not bother.
			
 
				+ *
			
 
				+ * Note that gcc on
			
 
				+ */
			
 
				+#if 0
			
 
				+#define mul(dst, src, c, sz) \
			
 
				+    do { if (c != 0) mul1(dst, src, c, sz); else memset(dst, 0, c); } while(0)
			
 
				+#endif
			
 
				+
			
 
				+#define UNROLL 16 /* 1, 4, 8, 16 */
			
 
				+static void
			
 
				+slow_mul1(gf *dst1, gf *src1, gf c, int sz)
			
 
				+{
			
 
				+    USE_GF_MULC ;
			
 
				+    register gf *dst = dst1, *src = src1 ;
			
 
				+    gf *lim = &dst[sz - UNROLL + 1] ;
			
 
				+
			
 
				+    GF_MULC0(c) ;
			
 
				+
			
 
				+#if (UNROLL > 1) /* unrolling by 8/16 is quite effective on the pentium */
			
 
				+    for (; dst < lim ; dst += UNROLL, src += UNROLL ) {
			
 
				+        GF_MULC( dst[0] , src[0] );
			
 
				+        GF_MULC( dst[1] , src[1] );
			
 
				+        GF_MULC( dst[2] , src[2] );
			
 
				+        GF_MULC( dst[3] , src[3] );
			
 
				+#if (UNROLL > 4)
			
 
				+        GF_MULC( dst[4] , src[4] );
			
 
				+        GF_MULC( dst[5] , src[5] );
			
 
				+        GF_MULC( dst[6] , src[6] );
			
 
				+        GF_MULC( dst[7] , src[7] );
			
 
				+#endif
			
 
				+#if (UNROLL > 8)
			
 
				+        GF_MULC( dst[8] , src[8] );
			
 
				+        GF_MULC( dst[9] , src[9] );
			
 
				+        GF_MULC( dst[10] , src[10] );
			
 
				+        GF_MULC( dst[11] , src[11] );
			
 
				+        GF_MULC( dst[12] , src[12] );
			
 
				+        GF_MULC( dst[13] , src[13] );
			
 
				+        GF_MULC( dst[14] , src[14] );
			
 
				+        GF_MULC( dst[15] , src[15] );
			
 
				+#endif
			
 
				+    }
			
 
				+#endif
			
 
				+    lim += UNROLL - 1 ;
			
 
				+    for (; dst < lim; dst++, src++ )        /* final components */
			
 
				+        GF_MULC( *dst , *src );
			
 
				+}
			
 
				+
			
 
				+# define mul1 slow_mul1
			
 
				+
			
 
				+static inline void mul(gf *dst, gf *src, gf c, int sz) {
			
 
				+    /*fprintf(stderr, "%p = %02x * %p\n", dst, c, src);*/
			
 
				+    if (c != 0) mul1(dst, src, c, sz); else memset(dst, 0, c);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * invert_mat() takes a matrix and produces its inverse
			
 
				+ * k is the size of the matrix.
			
 
				+ * (Gauss-Jordan, adapted from Numerical Recipes in C)
			
 
				+ * Return non-zero if singular.
			
 
				+ */
			
 
				+DEB( int pivloops=0; int pivswaps=0 ; /* diagnostic */)
			
 
				+static int
			
 
				+invert_mat(gf *src, int k)
			
 
				+{
			
 
				+    gf c, *p ;
			
 
				+    int irow, icol, row, col, i, ix ;
			
 
				+
			
 
				+    int error = 1 ;
			
 
				+    int indxc[k];
			
 
				+    int indxr[k];
			
 
				+    int ipiv[k];
			
 
				+    gf id_row[k];
			
 
				+
			
 
				+    memset(id_row, 0, k*sizeof(gf));
			
 
				+    DEB( pivloops=0; pivswaps=0 ; /* diagnostic */ )
			
 
				+    /*
			
 
				+     * ipiv marks elements already used as pivots.
			
 
				+     */
			
 
				+    for (i = 0; i < k ; i++)
			
 
				+        ipiv[i] = 0 ;
			
 
				+
			
 
				+    for (col = 0; col < k ; col++) {
			
 
				+        gf *pivot_row ;
			
 
				+        /*
			
 
				+         * Zeroing column 'col', look for a non-zero element.
			
 
				+         * First try on the diagonal, if it fails, look elsewhere.
			
 
				+         */
			
 
				+        irow = icol = -1 ;
			
 
				+        if (ipiv[col] != 1 && src[col*k + col] != 0) {
			
 
				+            irow = col ;
			
 
				+            icol = col ;
			
 
				+            goto found_piv ;
			
 
				+        }
			
 
				+        for (row = 0 ; row < k ; row++) {
			
 
				+            if (ipiv[row] != 1) {
			
 
				+                for (ix = 0 ; ix < k ; ix++) {
			
 
				+                    DEB( pivloops++ ; )
			
 
				+                    if (ipiv[ix] == 0) {
			
 
				+                        if (src[row*k + ix] != 0) {
			
 
				+                            irow = row ;
			
 
				+                            icol = ix ;
			
 
				+                            goto found_piv ;
			
 
				+                        }
			
 
				+                    } else if (ipiv[ix] > 1) {
			
 
				+                        fprintf(stderr, "singular matrix\n");
			
 
				+                        goto fail ;
			
 
				+                    }
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+        if (icol == -1) {
			
 
				+            fprintf(stderr, "XXX pivot not found!\n");
			
 
				+            goto fail ;
			
 
				+        }
			
 
				+        found_piv:
			
 
				+        ++(ipiv[icol]) ;
			
 
				+        /*
			
 
				+         * swap rows irow and icol, so afterwards the diagonal
			
 
				+         * element will be correct. Rarely done, not worth
			
 
				+         * optimizing.
			
 
				+         */
			
 
				+        if (irow != icol) {
			
 
				+            for (ix = 0 ; ix < k ; ix++ ) {
			
 
				+                SWAP( src[irow*k + ix], src[icol*k + ix], gf) ;
			
 
				+            }
			
 
				+        }
			
 
				+        indxr[col] = irow ;
			
 
				+        indxc[col] = icol ;
			
 
				+        pivot_row = &src[icol*k] ;
			
 
				+        c = pivot_row[icol] ;
			
 
				+        if (c == 0) {
			
 
				+            fprintf(stderr, "singular matrix 2\n");
			
 
				+            goto fail ;
			
 
				+        }
			
 
				+        if (c != 1 ) { /* otherwhise this is a NOP */
			
 
				+            /*
			
 
				+             * this is done often , but optimizing is not so
			
 
				+             * fruitful, at least in the obvious ways (unrolling)
			
 
				+             */
			
 
				+            DEB( pivswaps++ ; )
			
 
				+            c = inverse[ c ] ;
			
 
				+            pivot_row[icol] = 1 ;
			
 
				+            for (ix = 0 ; ix < k ; ix++ )
			
 
				+                pivot_row[ix] = gf_mul(c, pivot_row[ix] );
			
 
				+        }
			
 
				+        /*
			
 
				+         * from all rows, remove multiples of the selected row
			
 
				+         * to zero the relevant entry (in fact, the entry is not zero
			
 
				+         * because we know it must be zero).
			
 
				+         * (Here, if we know that the pivot_row is the identity,
			
 
				+         * we can optimize the addmul).
			
 
				+         */
			
 
				+        id_row[icol] = 1;
			
 
				+        if (memcmp(pivot_row, id_row, k*sizeof(gf)) != 0) {
			
 
				+            for (p = src, ix = 0 ; ix < k ; ix++, p += k ) {
			
 
				+                if (ix != icol) {
			
 
				+                    c = p[icol] ;
			
 
				+                    p[icol] = 0 ;
			
 
				+                    addmul(p, pivot_row, c, k );
			
 
				+                }
			
 
				+            }
			
 
				+        }
			
 
				+        id_row[icol] = 0;
			
 
				+    } /* done all columns */
			
 
				+    for (col = k-1 ; col >= 0 ; col-- ) {
			
 
				+        if (indxr[col] <0 || indxr[col] >= k)
			
 
				+            fprintf(stderr, "AARGH, indxr[col] %d\n", indxr[col]);
			
 
				+        else if (indxc[col] <0 || indxc[col] >= k)
			
 
				+            fprintf(stderr, "AARGH, indxc[col] %d\n", indxc[col]);
			
 
				+        else
			
 
				+        if (indxr[col] != indxc[col] ) {
			
 
				+            for (row = 0 ; row < k ; row++ ) {
			
 
				+                SWAP( src[row*k + indxr[col]], src[row*k + indxc[col]], gf) ;
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+    error = 0 ;
			
 
				+    fail:
			
 
				+    return error ;
			
 
				+}
			
 
				+
			
 
				+static int fec_initialized = 0 ;
			
 
				+
			
 
				+void fec_init(void)
			
 
				+{
			
 
				+    TICK(ticks[0]);
			
 
				+    generate_gf();
			
 
				+    TOCK(ticks[0]);
			
 
				+    DDB(fprintf(stderr, "generate_gf took %ldus\n", ticks[0]);)
			
 
				+    TICK(ticks[0]);
			
 
				+    init_mul_table();
			
 
				+    TOCK(ticks[0]);
			
 
				+    DDB(fprintf(stderr, "init_mul_table took %ldus\n", ticks[0]);)
			
 
				+    fec_initialized = 1 ;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+#ifdef PROFILE
			
 
				+#ifdef __x86_64__
			
 
				+static long long rdtsc(void)
			
 
				+{
			
 
				+    unsigned long low, hi;
			
 
				+    asm volatile ("rdtsc" : "=d" (hi), "=a" (low));
			
 
				+    return ( (((long long)hi) << 32) | ((long long) low));
			
 
				+}
			
 
				+#elif __arm__
			
 
				+static long long rdtsc(void)
			
 
				+{
			
 
				+    u64 val;
			
 
				+    asm volatile("mrs %0, cntvct_el0" : "=r" (val));
			
 
				+    return val;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				+void print_matrix1(gf* matrix, int nrows, int ncols) {
			
 
				+    int i, j;
			
 
				+    printf("matrix (%d,%d):\n", nrows, ncols);
			
 
				+    for(i = 0; i < nrows; i++) {
			
 
				+        for(j = 0; j < ncols; j++) {
			
 
				+            printf("%6d ", matrix[i*ncols + j]);
			
 
				+        }
			
 
				+        printf("\n");
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+void print_matrix2(gf** matrix, int nrows, int ncols) {
			
 
				+    int i, j;
			
 
				+    printf("matrix (%d,%d):\n", nrows, ncols);
			
 
				+    for(i = 0; i < nrows; i++) {
			
 
				+        for(j = 0; j < ncols; j++) {
			
 
				+            printf("%6d ", matrix[i][j]);
			
 
				+        }
			
 
				+        printf("\n");
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+#endif
			
 
				+
			
 
				+/* y = a**n */
			
 
				+static gf galExp(gf a, gf n) {
			
 
				+    int logA;
			
 
				+    int logResult;
			
 
				+    if(0 == n) {
			
 
				+        return 1;
			
 
				+    }
			
 
				+    if(0 == a) {
			
 
				+        return 0;
			
 
				+    }
			
 
				+    logA = gf_log[a];
			
 
				+    logResult = logA * n;
			
 
				+    while(logResult >= 255) {
			
 
				+        logResult -= 255;
			
 
				+    }
			
 
				+
			
 
				+    return gf_exp[logResult];
			
 
				+}
			
 
				+
			
 
				+static inline gf galMultiply(gf a, gf b) {
			
 
				+    return gf_mul_table[ ((int)a << 8) + (int)b ];
			
 
				+}
			
 
				+
			
 
				+static gf* vandermonde(int nrows, int ncols) {
			
 
				+    int row, col, ptr;
			
 
				+    gf* matrix = (gf*)RS_MALLOC(nrows * ncols);
			
 
				+    if(NULL != matrix) {
			
 
				+        ptr = 0;
			
 
				+        for(row = 0; row < nrows; row++) {
			
 
				+            for(col = 0; col < ncols; col++) {
			
 
				+                matrix[ptr++] = galExp((gf)row, (gf)col);
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return matrix;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Not check for input params
			
 
				+ * */
			
 
				+static gf* sub_matrix(gf* matrix, int rmin, int cmin, int rmax, int cmax,  int nrows, int ncols) {
			
 
				+    int i, j, ptr = 0;
			
 
				+    gf* new_m = (gf*)RS_MALLOC( (rmax-rmin) * (cmax-cmin) );
			
 
				+    if(NULL != new_m) {
			
 
				+        for(i = rmin; i < rmax; i++) {
			
 
				+            for(j = cmin; j < cmax; j++) {
			
 
				+                new_m[ptr++] = matrix[i*ncols + j];
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return new_m;
			
 
				+}
			
 
				+
			
 
				+/* y = a.dot(b) */
			
 
				+static gf* multiply1(gf *a, int ar, int ac, gf *b, int br, int bc) {
			
 
				+    gf *new_m, tg;
			
 
				+    int r, c, i, ptr = 0;
			
 
				+
			
 
				+    assert(ac == br);
			
 
				+    new_m = (gf*)RS_CALLOC(1, ar*bc);
			
 
				+    if(NULL != new_m) {
			
 
				+
			
 
				+        /* this multiply is slow */
			
 
				+        for(r = 0; r < ar; r++) {
			
 
				+            for(c = 0; c < bc; c++) {
			
 
				+                tg = 0;
			
 
				+                for(i = 0; i < ac; i++) {
			
 
				+                    /* tg ^= gf_mul_table[ ((int)a[r*ac+i] << 8) + (int)b[i*bc+c] ]; */
			
 
				+                    tg ^= galMultiply(a[r*ac+i], b[i*bc+c]);
			
 
				+                }
			
 
				+
			
 
				+                new_m[ptr++] = tg;
			
 
				+            }
			
 
				+        }
			
 
				+
			
 
				+    }
			
 
				+
			
 
				+    return new_m;
			
 
				+}
			
 
				+
			
 
				+/* copy from golang rs version */
			
 
				+static inline int code_some_shards(gf* matrixRows, gf** inputs, gf** outputs,
			
 
				+                                   int dataShards, int outputCount, int byteCount) {
			
 
				+    gf* in;
			
 
				+    int iRow, c;
			
 
				+    for(c = 0; c < dataShards; c++) {
			
 
				+        in = inputs[c];
			
 
				+        for(iRow = 0; iRow < outputCount; iRow++) {
			
 
				+            if(0 == c) {
			
 
				+                mul(outputs[iRow], in, matrixRows[iRow*dataShards+c], byteCount);
			
 
				+            } else {
			
 
				+                addmul(outputs[iRow], in, matrixRows[iRow*dataShards+c], byteCount);
			
 
				+            }
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+reed_solomon* reed_solomon_new(int data_shards, int parity_shards) {
			
 
				+    gf* vm = NULL;
			
 
				+    gf* top = NULL;
			
 
				+    int err = 0;
			
 
				+    reed_solomon* rs = NULL;
			
 
				+
			
 
				+    /* MUST use fec_init once time first */
			
 
				+    assert(fec_initialized);
			
 
				+
			
 
				+    do {
			
 
				+        rs = (reed_solomon*) RS_MALLOC(sizeof(reed_solomon));
			
 
				+        if(NULL == rs) {
			
 
				+            return NULL;
			
 
				+        }
			
 
				+        rs->data_shards = data_shards;
			
 
				+        rs->parity_shards = parity_shards;
			
 
				+        rs->shards = (data_shards + parity_shards);
			
 
				+        rs->m = NULL;
			
 
				+        rs->parity = NULL;
			
 
				+
			
 
				+        if(rs->shards > DATA_SHARDS_MAX || data_shards <= 0 || parity_shards <= 0) {
			
 
				+            err = 1;
			
 
				+            break;
			
 
				+        }
			
 
				+
			
 
				+        vm = vandermonde(rs->shards, rs->data_shards);
			
 
				+        if(NULL == vm) {
			
 
				+            err = 2;
			
 
				+            break;
			
 
				+        }
			
 
				+
			
 
				+        top = sub_matrix(vm, 0, 0, data_shards, data_shards, rs->shards, data_shards);
			
 
				+        if(NULL == top) {
			
 
				+            err = 3;
			
 
				+            break;
			
 
				+        }
			
 
				+
			
 
				+        err = invert_mat(top, data_shards);
			
 
				+        assert(0 == err);
			
 
				+
			
 
				+        rs->m = multiply1(vm, rs->shards, data_shards, top, data_shards, data_shards);
			
 
				+        if(NULL == rs->m) {
			
 
				+            err = 4;
			
 
				+            break;
			
 
				+        }
			
 
				+
			
 
				+        rs->parity = sub_matrix(rs->m, data_shards, 0, rs->shards, data_shards, rs->shards, data_shards);
			
 
				+        if(NULL == rs->parity) {
			
 
				+            err = 5;
			
 
				+            break;
			
 
				+        }
			
 
				+
			
 
				+        RS_FREE(vm);
			
 
				+        RS_FREE(top);
			
 
				+        vm = NULL;
			
 
				+        top = NULL;
			
 
				+        return rs;
			
 
				+
			
 
				+    } while(0);
			
 
				+
			
 
				+    fprintf(stderr, "err=%d\n", err);
			
 
				+    if(NULL != vm) {
			
 
				+        RS_FREE(vm);
			
 
				+    }
			
 
				+    if(NULL != top) {
			
 
				+        RS_FREE(top);
			
 
				+    }
			
 
				+    if(NULL != rs) {
			
 
				+        if(NULL != rs->m) {
			
 
				+            RS_FREE(rs->m);
			
 
				+        }
			
 
				+        if(NULL != rs->parity) {
			
 
				+            RS_FREE(rs->parity);
			
 
				+        }
			
 
				+        RS_FREE(rs);
			
 
				+    }
			
 
				+
			
 
				+    return NULL;
			
 
				+}
			
 
				+
			
 
				+void reed_solomon_release(reed_solomon* rs) {
			
 
				+    if(NULL != rs) {
			
 
				+        if(NULL != rs->m) {
			
 
				+            RS_FREE(rs->m);
			
 
				+        }
			
 
				+        if(NULL != rs->parity) {
			
 
				+            RS_FREE(rs->parity);
			
 
				+        }
			
 
				+        RS_FREE(rs);
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * encode one shard
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * data_blocks[rs->data_shards][block_size]
			
 
				+ * fec_blocks[rs->data_shards][block_size]
			
 
				+ * */
			
 
				+int reed_solomon_encode(reed_solomon* rs,
			
 
				+                        unsigned char** data_blocks,
			
 
				+                        unsigned char** fec_blocks,
			
 
				+                        int block_size) {
			
 
				+    assert(NULL != rs && NULL != rs->parity);
			
 
				+
			
 
				+    return code_some_shards(rs->parity, data_blocks, fec_blocks
			
 
				+            , rs->data_shards, rs->parity_shards, block_size);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * decode one shard
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * original data_blocks[rs->data_shards][block_size]
			
 
				+ * dec_fec_blocks[nr_fec_blocks][block_size]
			
 
				+ * fec_block_nos: fec pos number in original fec_blocks
			
 
				+ * erased_blocks: erased blocks in original data_blocks
			
 
				+ * nr_fec_blocks: the number of erased blocks
			
 
				+ * */
			
 
				+int reed_solomon_decode(reed_solomon* rs,
			
 
				+                        unsigned char **data_blocks,
			
 
				+                        int block_size,
			
 
				+                        unsigned char **dec_fec_blocks,
			
 
				+                        unsigned int *fec_block_nos,
			
 
				+                        unsigned int *erased_blocks,
			
 
				+                        int nr_fec_blocks) {
			
 
				+    /* use stack instead of malloc, define a small number of DATA_SHARDS_MAX to save memory */
			
 
				+    gf dataDecodeMatrix[DATA_SHARDS_MAX*DATA_SHARDS_MAX];
			
 
				+    unsigned char* subShards[DATA_SHARDS_MAX];
			
 
				+    unsigned char* outputs[DATA_SHARDS_MAX];
			
 
				+    gf* m = rs->m;
			
 
				+    int i, j, c, swap, subMatrixRow, dataShards, nos, nshards;
			
 
				+
			
 
				+    /* the erased_blocks should always sorted
			
 
				+     * if sorted, nr_fec_blocks times to check it
			
 
				+     * if not, sort it here
			
 
				+     * */
			
 
				+    for(i = 0; i < nr_fec_blocks; i++) {
			
 
				+        swap = 0;
			
 
				+        for(j = i+1; j < nr_fec_blocks; j++) {
			
 
				+            if(erased_blocks[i] > erased_blocks[j]) {
			
 
				+                /* the prefix is bigger than the following, swap */
			
 
				+                c = erased_blocks[i];
			
 
				+                erased_blocks[i] = erased_blocks[j];
			
 
				+                erased_blocks[j] = c;
			
 
				+
			
 
				+                swap = 1;
			
 
				+            }
			
 
				+        }
			
 
				+        //printf("swap:%d\n", swap);
			
 
				+        if(!swap) {
			
 
				+            //already sorted or sorted ok
			
 
				+            break;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    j = 0;
			
 
				+    subMatrixRow = 0;
			
 
				+    nos = 0;
			
 
				+    nshards = 0;
			
 
				+    dataShards = rs->data_shards;
			
 
				+    for(i = 0; i < dataShards; i++) {
			
 
				+        if(j < nr_fec_blocks && i == erased_blocks[j]) {
			
 
				+            //ignore the invalid block
			
 
				+            j++;
			
 
				+        } else {
			
 
				+            /* this row is ok */
			
 
				+            for(c = 0; c < dataShards; c++) {
			
 
				+                dataDecodeMatrix[subMatrixRow*dataShards + c] = m[i*dataShards + c];
			
 
				+            }
			
 
				+            subShards[subMatrixRow] = data_blocks[i];
			
 
				+            subMatrixRow++;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    for(i = 0; i < nr_fec_blocks && subMatrixRow < dataShards; i++) {
			
 
				+        subShards[subMatrixRow] = dec_fec_blocks[i];
			
 
				+        j = dataShards + fec_block_nos[i];
			
 
				+        for(c = 0; c < dataShards; c++) {
			
 
				+            dataDecodeMatrix[subMatrixRow*dataShards + c] = m[j*dataShards + c]; //use spefic pos of original fec_blocks
			
 
				+        }
			
 
				+        subMatrixRow++;
			
 
				+    }
			
 
				+
			
 
				+    if(subMatrixRow < dataShards) {
			
 
				+        //cannot correct
			
 
				+        return -1;
			
 
				+    }
			
 
				+
			
 
				+    invert_mat(dataDecodeMatrix, dataShards);
			
 
				+    //printf("invert:\n");
			
 
				+    //print_matrix1(dataDecodeMatrix, dataShards, dataShards);
			
 
				+    //printf("nShards:\n");
			
 
				+    //print_matrix2(subShards, dataShards, block_size);
			
 
				+
			
 
				+    for(i = 0; i < nr_fec_blocks; i++) {
			
 
				+        j = erased_blocks[i];
			
 
				+        outputs[i] = data_blocks[j];
			
 
				+        //data_blocks[j][0] = 0;
			
 
				+        memmove(dataDecodeMatrix+i*dataShards, dataDecodeMatrix+j*dataShards, dataShards);
			
 
				+    }
			
 
				+    //printf("subMatrixRow:\n");
			
 
				+    //print_matrix1(dataDecodeMatrix, nr_fec_blocks, dataShards);
			
 
				+
			
 
				+    //printf("outputs:\n");
			
 
				+    //print_matrix2(outputs, nr_fec_blocks, block_size);
			
 
				+
			
 
				+    return code_some_shards(dataDecodeMatrix, subShards, outputs,
			
 
				+                            dataShards, nr_fec_blocks, block_size);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * encode a big size of buffer
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * nr_shards: assert(0 == nr_shards % rs->shards)
			
 
				+ * shards[nr_shards][block_size]
			
 
				+ * */
			
 
				+int reed_solomon_encode2(reed_solomon* rs, unsigned char** shards, int nr_shards, int block_size) {
			
 
				+    unsigned char** data_blocks;
			
 
				+    unsigned char** fec_blocks;
			
 
				+    int i, ds = rs->data_shards, ps = rs->parity_shards, ss = rs->shards;
			
 
				+    i = nr_shards / ss;
			
 
				+    data_blocks = shards;
			
 
				+    fec_blocks = &shards[(i*ds)];
			
 
				+
			
 
				+    for(i = 0; i < nr_shards; i += ss) {
			
 
				+        reed_solomon_encode(rs, data_blocks, fec_blocks, block_size);
			
 
				+        data_blocks += ds;
			
 
				+        fec_blocks += ps;
			
 
				+    }
			
 
				+    return 0;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * reconstruct a big size of buffer
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * nr_shards: assert(0 == nr_shards % rs->data_shards)
			
 
				+ * shards[nr_shards][block_size]
			
 
				+ * marks[nr_shards] marks as errors
			
 
				+ * */
			
 
				+int reed_solomon_reconstruct(reed_solomon* rs,
			
 
				+                             unsigned char** shards,
			
 
				+                             unsigned char* marks,
			
 
				+                             int nr_shards,
			
 
				+                             int block_size) {
			
 
				+    unsigned char *dec_fec_blocks[DATA_SHARDS_MAX];
			
 
				+    unsigned int fec_block_nos[DATA_SHARDS_MAX];
			
 
				+    unsigned int erased_blocks[DATA_SHARDS_MAX];
			
 
				+    unsigned char* fec_marks;
			
 
				+    unsigned char **data_blocks, **fec_blocks;
			
 
				+    int i, j, dn, pn, n;
			
 
				+    int ds = rs->data_shards;
			
 
				+    int ps = rs->parity_shards;
			
 
				+    int err = 0;
			
 
				+
			
 
				+    data_blocks = shards;
			
 
				+    n = nr_shards / rs->shards;
			
 
				+    fec_marks = marks + n*ds; //after all data, is't fec marks
			
 
				+    fec_blocks = shards + n*ds;
			
 
				+
			
 
				+    for(j = 0; j < n; j++) {
			
 
				+        dn = 0;
			
 
				+        for(i = 0; i < ds; i++) {
			
 
				+            if(marks[i]) {
			
 
				+                //errors
			
 
				+                erased_blocks[dn++] = i;
			
 
				+            }
			
 
				+        }
			
 
				+        if(dn > 0) {
			
 
				+            pn = 0;
			
 
				+            for(i = 0; i < ps && pn < dn; i++) {
			
 
				+                if(!fec_marks[i]) {
			
 
				+                    //got valid fec row
			
 
				+                    fec_block_nos[pn] = i;
			
 
				+                    dec_fec_blocks[pn] = fec_blocks[i];
			
 
				+                    pn++;
			
 
				+                }
			
 
				+            }
			
 
				+
			
 
				+            if(dn == pn) {
			
 
				+                reed_solomon_decode(rs
			
 
				+                        , data_blocks
			
 
				+                        , block_size
			
 
				+                        , dec_fec_blocks
			
 
				+                        , fec_block_nos
			
 
				+                        , erased_blocks
			
 
				+                        , dn);
			
 
				+            } else {
			
 
				+                //error but we continue
			
 
				+                err = -1;
			
 
				+            }
			
 
				+        }
			
 
				+        data_blocks += ds;
			
 
				+        marks += ds;
			
 
				+        fec_blocks += ps;
			
 
				+        fec_marks += ps;
			
 
				+    }
			
 
				+
			
 
				+    return err;
			
 
				+}
			
 
				+
			
--- a/src/third_party/rs.h
+++ b/src/third_party/rs.h
@@ -0,0 +1,88 @@
 
				+#ifndef __RS_H_
			
 
				+#define __RS_H_
			
 
				+
			
 
				+/* use small value to save memory */
			
 
				+#ifndef DATA_SHARDS_MAX
			
 
				+#define DATA_SHARDS_MAX (255)
			
 
				+#endif
			
 
				+
			
 
				+/* use other memory allocator */
			
 
				+#ifndef RS_MALLOC
			
 
				+#define RS_MALLOC(x)    malloc(x)
			
 
				+#endif
			
 
				+
			
 
				+#ifndef RS_FREE
			
 
				+#define RS_FREE(x)      free(x)
			
 
				+#endif
			
 
				+
			
 
				+#ifndef RS_CALLOC
			
 
				+#define RS_CALLOC(n, x) calloc(n, x)
			
 
				+#endif
			
 
				+
			
 
				+typedef struct _reed_solomon {
			
 
				+    int data_shards;
			
 
				+    int parity_shards;
			
 
				+    int shards;
			
 
				+    unsigned char* m;
			
 
				+    unsigned char* parity;
			
 
				+} reed_solomon;
			
 
				+
			
 
				+/**
			
 
				+ * MUST initial one time
			
 
				+ * */
			
 
				+void fec_init(void);
			
 
				+
			
 
				+reed_solomon* reed_solomon_new(int data_shards, int parity_shards);
			
 
				+void reed_solomon_release(reed_solomon* rs);
			
 
				+
			
 
				+/**
			
 
				+ * encode one shard
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * data_blocks[rs->data_shards][block_size]
			
 
				+ * fec_blocks[rs->data_shards][block_size]
			
 
				+ * */
			
 
				+int reed_solomon_encode(reed_solomon* rs,
			
 
				+                        unsigned char** data_blocks,
			
 
				+                        unsigned char** fec_blocks,
			
 
				+                        int block_size);
			
 
				+
			
 
				+
			
 
				+/**
			
 
				+ * decode one shard
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * original data_blocks[rs->data_shards][block_size]
			
 
				+ * dec_fec_blocks[nr_fec_blocks][block_size]
			
 
				+ * fec_block_nos: fec pos number in original fec_blocks
			
 
				+ * erased_blocks: erased blocks in original data_blocks
			
 
				+ * nr_fec_blocks: the number of erased blocks
			
 
				+ * */
			
 
				+int reed_solomon_decode(reed_solomon* rs,
			
 
				+                        unsigned char **data_blocks,
			
 
				+                        int block_size,
			
 
				+                        unsigned char **dec_fec_blocks,
			
 
				+                        unsigned int *fec_block_nos,
			
 
				+                        unsigned int *erased_blocks,
			
 
				+                        int nr_fec_blocks);
			
 
				+
			
 
				+/**
			
 
				+ * encode a big size of buffer
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * nr_shards: assert(0 == nr_shards % rs->data_shards)
			
 
				+ * shards[nr_shards][block_size]
			
 
				+ * */
			
 
				+int reed_solomon_encode2(reed_solomon* rs, unsigned char** shards, int nr_shards, int block_size);
			
 
				+
			
 
				+/**
			
 
				+ * reconstruct a big size of buffer
			
 
				+ * input:
			
 
				+ * rs
			
 
				+ * nr_shards: assert(0 == nr_shards % rs->data_shards)
			
 
				+ * shards[nr_shards][block_size]
			
 
				+ * marks[nr_shards] marks as errors
			
 
				+ * */
			
 
				+int reed_solomon_reconstruct(reed_solomon* rs, unsigned char** shards, unsigned char* marks, int nr_shards, int block_size);
			
 
				+#endif
			
 
				+
			
--- a/src/third_party/scope_guard.hpp
+++ b/src/third_party/scope_guard.hpp
@@ -0,0 +1,184 @@
 
				+/*
			
 
				+ *  Created on: 13/02/2018
			
 
				+ *      Author: ricab
			
 
				+ *
			
 
				+ * See README.md for documentation of this header's public interface.
			
 
				+ */
			
 
				+
			
 
				+#ifndef SCOPE_GUARD_HPP_
			
 
				+#define SCOPE_GUARD_HPP_
			
 
				+
			
 
				+#include <type_traits>
			
 
				+#include <utility>
			
 
				+
			
 
				+#if __cplusplus >= 201703L && defined(SG_REQUIRE_NOEXCEPT_IN_CPP17)
			
 
				+#define SG_REQUIRE_NOEXCEPT
			
 
				+#endif
			
 
				+
			
 
				+namespace sg
			
 
				+{
			
 
				+    namespace detail
			
 
				+    {
			
 
				+        /* --- Some custom type traits --- */
			
 
				+
			
 
				+        // Type trait determining whether a type is callable with no arguments
			
 
				+        template<typename T, typename = void>
			
 
				+        struct is_noarg_callable_t
			
 
				+                : public std::false_type
			
 
				+        {}; // in general, false
			
 
				+
			
 
				+        template<typename T>
			
 
				+        struct is_noarg_callable_t<T, decltype(std::declval<T&&>()())>
			
 
				+                : public std::true_type
			
 
				+        {}; // only true when call expression valid
			
 
				+
			
 
				+        // Type trait determining whether a no-argument callable returns void
			
 
				+        template<typename T>
			
 
				+        struct returns_void_t
			
 
				+                : public std::is_same<void, decltype(std::declval<T&&>()())>
			
 
				+        {};
			
 
				+
			
 
				+        /* Type trait determining whether a no-arg callable is nothrow invocable if
			
 
				+        required. This is where SG_REQUIRE_NOEXCEPT logic is encapsulated. */
			
 
				+        template<typename T>
			
 
				+        struct is_nothrow_invocable_if_required_t
			
 
				+                : public
			
 
				+#ifdef SG_REQUIRE_NOEXCEPT
			
 
				+                  std::is_nothrow_invocable<T> /* Note: _r variants not enough to
			
 
				+                                          confirm void return: any return can be
			
 
				+                                          discarded so all returns are
			
 
				+                                          compatible with void */
			
 
				+#else
			
 
				+                  std::true_type
			
 
				+#endif
			
 
				+        {};
			
 
				+
			
 
				+        // logic AND of two or more type traits
			
 
				+        template<typename A, typename B, typename... C>
			
 
				+        struct and_t : public and_t<A, and_t<B, C...>>
			
 
				+        {}; // for more than two arguments
			
 
				+
			
 
				+        template<typename A, typename B>
			
 
				+        struct and_t<A, B> : public std::conditional<A::value, B, A>::type
			
 
				+        {}; // for two arguments
			
 
				+
			
 
				+        // Type trait determining whether a type is a proper scope_guard callback.
			
 
				+        template<typename T>
			
 
				+        struct is_proper_sg_callback_t
			
 
				+                : public and_t<is_noarg_callable_t<T>,
			
 
				+                        returns_void_t<T>,
			
 
				+                        is_nothrow_invocable_if_required_t<T>,
			
 
				+                        std::is_nothrow_destructible<T>>
			
 
				+        {};
			
 
				+
			
 
				+
			
 
				+        /* --- The actual scope_guard template --- */
			
 
				+
			
 
				+        template<typename Callback,
			
 
				+                typename = typename std::enable_if<
			
 
				+                        is_proper_sg_callback_t<Callback>::value>::type>
			
 
				+        class scope_guard;
			
 
				+
			
 
				+
			
 
				+        /* --- Now the friend maker --- */
			
 
				+
			
 
				+        template<typename Callback>
			
 
				+        detail::scope_guard<Callback> make_scope_guard(Callback&& callback)
			
 
				+        noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value); /*
			
 
				+    we need this in the inner namespace due to MSVC bugs preventing
			
 
				+    sg::detail::scope_guard from befriending a sg::make_scope_guard
			
 
				+    template instance in the parent namespace (see https://is.gd/xFfFhE). */
			
 
				+
			
 
				+
			
 
				+        /* --- The template specialization that actually defines the class --- */
			
 
				+
			
 
				+        template<typename Callback>
			
 
				+        class scope_guard<Callback> final
			
 
				+        {
			
 
				+        public:
			
 
				+            typedef Callback callback_type;
			
 
				+
			
 
				+            scope_guard(scope_guard&& other)
			
 
				+            noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value);
			
 
				+
			
 
				+            ~scope_guard() noexcept; // highlight noexcept dtor
			
 
				+
			
 
				+            void dismiss() noexcept;
			
 
				+
			
 
				+        public:
			
 
				+            scope_guard() = delete;
			
 
				+            scope_guard(const scope_guard&) = delete;
			
 
				+            scope_guard& operator=(const scope_guard&) = delete;
			
 
				+            scope_guard& operator=(scope_guard&&) = delete;
			
 
				+
			
 
				+        private:
			
 
				+            explicit scope_guard(Callback&& callback)
			
 
				+            noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value); /*
			
 
				+                                                      meant for friends only */
			
 
				+
			
 
				+            friend scope_guard<Callback> make_scope_guard<Callback>(Callback&&)
			
 
				+            noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value); /*
			
 
				+      only make_scope_guard can create scope_guards from scratch (i.e. non-move)
			
 
				+      */
			
 
				+
			
 
				+        private:
			
 
				+            Callback m_callback;
			
 
				+            bool m_active;
			
 
				+
			
 
				+        };
			
 
				+
			
 
				+    } // namespace detail
			
 
				+
			
 
				+
			
 
				+    /* --- Now the single public maker function --- */
			
 
				+
			
 
				+    using detail::make_scope_guard; // see comment on declaration above
			
 
				+
			
 
				+} // namespace sg
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////
			
 
				+template<typename Callback>
			
 
				+sg::detail::scope_guard<Callback>::scope_guard(Callback&& callback)
			
 
				+noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value)
			
 
				+        : m_callback(std::forward<Callback>(callback)) /* use () instead of {} because
			
 
				+    of DR 1467 (https://is.gd/WHmWuo), which still impacts older compilers
			
 
				+    (e.g. GCC 4.x and clang <=3.6, see https://godbolt.org/g/TE9tPJ and
			
 
				+    https://is.gd/Tsmh8G) */
			
 
				+        , m_active{true}
			
 
				+{}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////
			
 
				+template<typename Callback>
			
 
				+sg::detail::scope_guard<Callback>::~scope_guard() noexcept
			
 
				+{
			
 
				+    if(m_active)
			
 
				+        m_callback();
			
 
				+}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////
			
 
				+template<typename Callback>
			
 
				+sg::detail::scope_guard<Callback>::scope_guard(scope_guard&& other)
			
 
				+noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value)
			
 
				+        : m_callback(std::forward<Callback>(other.m_callback)) // idem
			
 
				+        , m_active{std::move(other.m_active)}
			
 
				+{
			
 
				+    other.m_active = false;
			
 
				+}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////
			
 
				+template<typename Callback>
			
 
				+inline void sg::detail::scope_guard<Callback>::dismiss() noexcept
			
 
				+{
			
 
				+    m_active = false;
			
 
				+}
			
 
				+
			
 
				+////////////////////////////////////////////////////////////////////////////////
			
 
				+template<typename Callback>
			
 
				+inline auto sg::detail::make_scope_guard(Callback&& callback)
			
 
				+noexcept(std::is_nothrow_constructible<Callback, Callback&&>::value)
			
 
				+-> detail::scope_guard<Callback>
			
 
				+{
			
 
				+    return detail::scope_guard<Callback>{std::forward<Callback>(callback)};
			
 
				+}
			
 
				+
			
 
				+#endif /* SCOPE_GUARD_HPP_ */
			
--- a/src/video_decoder.cpp
+++ b/src/video_decoder.cpp
@@ -0,0 +1,219 @@
 
				+#include "video_decoder.h"
			
 
				+#include "config.h"
			
 
				+#include "cuda_helper.hpp"
			
 
				+#include "nv12_renderer.h"
			
 
				+#include "third_party/scope_guard.hpp"
			
 
				+
			
 
				+#include <nvcuvid.h>
			
 
				+
			
 
				+#include <atomic>
			
 
				+
			
 
				+struct video_decoder::impl {
			
 
				+
			
 
				+    CUvideoparser parser = nullptr;
			
 
				+    CUvideodecoder decoder = nullptr;
			
 
				+
			
 
				+    uint32_t image_width = 0, image_height = 0;
			
 
				+    uint32_t decode_surface = 0;
			
 
				+
			
 
				+    nv12_renderer *renderer = nullptr;
			
 
				+    std::atomic_int next_pic_index;
			
 
				+
			
 
				+    impl() {
			
 
				+        next_pic_index.store(-1);
			
 
				+    }
			
 
				+
			
 
				+    ~impl() {
			
 
				+        stop();
			
 
				+    }
			
 
				+
			
 
				+    bool start() {
			
 
				+        // query decoder capability
			
 
				+        CUVIDDECODECAPS caps = {};
			
 
				+        caps.eCodecType = cudaVideoCodec_HEVC;
			
 
				+        caps.eChromaFormat = cudaVideoChromaFormat_420;
			
 
				+        caps.nBitDepthMinus8 = 0; // 8-bit
			
 
				+        CUDA_API_CHECK(cuvidGetDecoderCaps(&caps));
			
 
				+
			
 
				+        // check decoder capability
			
 
				+        CALL_CHECK(caps.bIsSupported == 1);
			
 
				+        CALL_CHECK(caps.nOutputFormatMask & (1 << cudaVideoSurfaceFormat_NV12));
			
 
				+
			
 
				+        // create parser
			
 
				+        CUVIDPARSERPARAMS params = {};
			
 
				+        params.CodecType = cudaVideoCodec_HEVC;
			
 
				+        params.ulMaxNumDecodeSurfaces = 1; // dummy value according to document
			
 
				+        params.ulMaxDisplayDelay = 0; // no delay
			
 
				+        params.pUserData = this;
			
 
				+        params.pfnSequenceCallback = sequence_callback;
			
 
				+        params.pfnDecodePicture = ready_decode;
			
 
				+        assert(parser == nullptr);
			
 
				+        CUDA_API_CHECK(cuvidCreateVideoParser(&parser, &params));
			
 
				+
			
 
				+        next_pic_index.store(-1);
			
 
				+        // decoder will be created on parser callback
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    void stop() {
			
 
				+        if (decoder != nullptr) {
			
 
				+            cuvidDestroyDecoder(decoder);
			
 
				+            decoder = nullptr;
			
 
				+        }
			
 
				+        if (parser != nullptr) {
			
 
				+            cuvidDestroyVideoParser(parser);
			
 
				+            parser = nullptr;
			
 
				+        }
			
 
				+    }
			
 
				+
			
 
				+    bool create_decoder() {
			
 
				+        CUVIDDECODECREATEINFO decoder_info = {};
			
 
				+        decoder_info.ulWidth = image_width;
			
 
				+        decoder_info.ulHeight = image_height;
			
 
				+        decoder_info.ulNumDecodeSurfaces = decode_surface;
			
 
				+        decoder_info.CodecType = cudaVideoCodec_HEVC;
			
 
				+        decoder_info.ChromaFormat = cudaVideoChromaFormat_420;
			
 
				+        decoder_info.ulCreationFlags = cudaVideoCreate_PreferCUVID;
			
 
				+        decoder_info.bitDepthMinus8 = 0; // 8-bit
			
 
				+        decoder_info.OutputFormat = cudaVideoSurfaceFormat_NV12;
			
 
				+        decoder_info.DeinterlaceMode = cudaVideoDeinterlaceMode_Weave;
			
 
				+        decoder_info.ulTargetWidth = image_width;
			
 
				+        decoder_info.ulTargetHeight = image_height;
			
 
				+        decoder_info.ulNumOutputSurfaces = 2; // TODO; learn more about this
			
 
				+//        auto cuda_ctx = get_cuda_primary_context();
			
 
				+//        auto context_guard = smart_cuda_context_guard{cuda_ctx};
			
 
				+
			
 
				+        CUDA_API_CHECK(cuvidCreateDecoder(&decoder, &decoder_info));
			
 
				+        assert(decoder != nullptr);
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+    static int sequence_callback(void *ptr, CUVIDEOFORMAT *format) {
			
 
				+        // ensure consistency
			
 
				+        assert(format->codec == cudaVideoCodec_HEVC);
			
 
				+        assert(format->progressive_sequence == 1); // progressive
			
 
				+        assert(format->bit_depth_luma_minus8 == 0); // 8-bit
			
 
				+        assert(format->bit_depth_chroma_minus8 == 0); // 8-bit
			
 
				+        assert(format->chroma_format == cudaVideoChromaFormat_420);
			
 
				+
			
 
				+        assert(ptr != nullptr);
			
 
				+        auto pimpl = (impl *) ptr;
			
 
				+        if (pimpl->decoder == nullptr) {
			
 
				+            pimpl->image_width = format->coded_width;
			
 
				+            pimpl->image_height = format->coded_height;
			
 
				+            pimpl->decode_surface = format->min_num_decode_surfaces + 4;
			
 
				+            CALL_CHECK(pimpl->create_decoder());
			
 
				+        } else {
			
 
				+            assert(format->coded_width == pimpl->image_width);
			
 
				+            assert(format->coded_height == pimpl->image_height);
			
 
				+        }
			
 
				+
			
 
				+        return format->min_num_decode_surfaces + 4;
			
 
				+    }
			
 
				+
			
 
				+    static int ready_decode(void *ptr, CUVIDPICPARAMS *pic) {
			
 
				+        assert(ptr != nullptr);
			
 
				+        auto pimpl = (impl *) ptr;
			
 
				+
			
 
				+        // decode image
			
 
				+        assert(pimpl->decoder != nullptr);
			
 
				+        CUDA_API_CHECK(cuvidDecodePicture(pimpl->decoder, pic));
			
 
				+
			
 
				+        // commit new image
			
 
				+        pimpl->next_pic_index.store(pic->CurrPicIdx);
			
 
				+        pimpl->next_pic_index.notify_all();
			
 
				+
			
 
				+        return 1; // success
			
 
				+    }
			
 
				+
			
 
				+    bool retrieve_frame() {
			
 
				+        // wait for new frame
			
 
				+        next_pic_index.wait(-1);
			
 
				+        auto pic_index = next_pic_index.load();
			
 
				+
			
 
				+        // setup renderer
			
 
				+        cudaGraphicsResource *luma_res, *chroma_res;
			
 
				+        renderer->config_frame(image_width, image_height);
			
 
				+        renderer->get_frame_res(&luma_res, &chroma_res);
			
 
				+
			
 
				+        // map frame
			
 
				+        CUdeviceptr frame_ptr;
			
 
				+        unsigned int frame_pitch;
			
 
				+        CUVIDPROCPARAMS proc_params = {};
			
 
				+        proc_params.progressive_frame = 1; // progressive frame
			
 
				+        proc_params.second_field = 1;
			
 
				+        assert(decoder != nullptr);
			
 
				+        CUDA_API_CHECK(cuvidMapVideoFrame(decoder, pic_index, &frame_ptr, &frame_pitch, &proc_params));
			
 
				+        auto pic_closer = sg::make_scope_guard([&]() {
			
 
				+            cuvidUnmapVideoFrame(decoder, frame_ptr);
			
 
				+        });
			
 
				+        assert(frame_ptr != 0);
			
 
				+
			
 
				+        // check decode status
			
 
				+        CUVIDGETDECODESTATUS status = {};
			
 
				+        CUDA_API_CHECK(cuvidGetDecodeStatus(decoder, pic_index, &status));
			
 
				+        CALL_CHECK(status.decodeStatus == cuvidDecodeStatus_Success);
			
 
				+
			
 
				+        // upload frame
			
 
				+        void *pbo_ptr;
			
 
				+        size_t pbo_size;
			
 
				+        CUDA_API_CHECK(cudaGraphicsMapResources(2, &luma_res));
			
 
				+        auto res_closer = sg::make_scope_guard([&]() {
			
 
				+            cudaGraphicsUnmapResources(2, &luma_res);
			
 
				+        });
			
 
				+
			
 
				+        auto luma_ptr = (void *) frame_ptr;
			
 
				+        CUDA_API_CHECK(cudaGraphicsResourceGetMappedPointer(&pbo_ptr, &pbo_size, luma_res));
			
 
				+        assert(pbo_size == image_width * image_height);
			
 
				+        CUDA_API_CHECK(cudaMemcpy2D(pbo_ptr, image_width, luma_ptr, frame_pitch,
			
 
				+                                    image_width, image_height, cudaMemcpyDeviceToDevice));
			
 
				+
			
 
				+        auto chroma_ptr = (char *) frame_ptr + frame_pitch * ((image_height + 1) & ~1);
			
 
				+        CUDA_API_CHECK(cudaGraphicsResourceGetMappedPointer(&pbo_ptr, &pbo_size, chroma_res));
			
 
				+        assert(pbo_size == image_width * image_height >> 1);
			
 
				+        CUDA_API_CHECK(cudaMemcpy2D(pbo_ptr, image_width, chroma_ptr, frame_pitch,
			
 
				+                                    image_width, image_height >> 1, cudaMemcpyDeviceToDevice));
			
 
				+
			
 
				+        // pbo to tex
			
 
				+        renderer->upload_frame();
			
 
				+
			
 
				+        return true;
			
 
				+    }
			
 
				+
			
 
				+};
			
 
				+
			
 
				+video_decoder::video_decoder()
			
 
				+        : pimpl(std::make_unique<impl>()) {}
			
 
				+
			
 
				+video_decoder::~video_decoder() = default;
			
 
				+
			
 
				+bool video_decoder::start() {
			
 
				+    return pimpl->start();
			
 
				+}
			
 
				+
			
 
				+void video_decoder::stop() {
			
 
				+    pimpl->stop();
			
 
				+}
			
 
				+
			
 
				+void video_decoder::get_image_size(uint32_t *width, uint32_t *height) {
			
 
				+    *width = pimpl->image_width;
			
 
				+    *height = pimpl->image_height;
			
 
				+}
			
 
				+
			
 
				+bool video_decoder::decode_frame(void *data, size_t length) {
			
 
				+    CUVIDSOURCEDATAPACKET packet = {};
			
 
				+    packet.flags = CUVID_PKT_ENDOFPICTURE;
			
 
				+    packet.payload_size = length;
			
 
				+    packet.payload = (unsigned char *) data;
			
 
				+    CUDA_API_CHECK(cuvidParseVideoData(pimpl->parser, &packet));
			
 
				+    return true;
			
 
				+}
			
 
				+
			
 
				+bool video_decoder::retrieve_frame(nv12_renderer *renderer) {
			
 
				+    if (pimpl->renderer != nullptr) [[likely]] {
			
 
				+        assert(pimpl->renderer == renderer);
			
 
				+    } else {
			
 
				+        pimpl->renderer = renderer;
			
 
				+    }
			
 
				+    return pimpl->retrieve_frame();
			
 
				+}
			
--- a/src/video_decoder.h
+++ b/src/video_decoder.h
@@ -0,0 +1,32 @@
 
				+#ifndef TINYPLAYER2_VIDEO_DECODER_H
			
 
				+#define TINYPLAYER2_VIDEO_DECODER_H
			
 
				+
			
 
				+#include "cuda_helper.hpp"
			
 
				+
			
 
				+#include <memory>
			
 
				+
			
 
				+class nv12_renderer;
			
 
				+
			
 
				+class video_decoder {
			
 
				+public:
			
 
				+    video_decoder();
			
 
				+
			
 
				+    ~video_decoder();
			
 
				+
			
 
				+    bool start();
			
 
				+
			
 
				+    void stop();
			
 
				+
			
 
				+    void get_image_size(uint32_t *width, uint32_t *height);
			
 
				+
			
 
				+    bool decode_frame(void *data, size_t length);
			
 
				+
			
 
				+    bool retrieve_frame(nv12_renderer *renderer);
			
 
				+
			
 
				+private:
			
 
				+    struct impl;
			
 
				+    std::unique_ptr<impl> pimpl;
			
 
				+};
			
 
				+
			
 
				+
			
 
				+#endif //TINYPLAYER2_VIDEO_DECODER_H