@@ -1,13 +1,15 @@
 #include "process_kernels.cuh"
 
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 // kernel templates
 
 template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
 __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
-    static_assert(std::is_fundamental_v<OutT>);
+    static_assert(std::is_fundamental_v<OutT>,
+                  "Only fundamental types can be reduced.");
     if constexpr (BlockSize >= 64) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
     }
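Side note on warp_reduce: it uses the classic warp-synchronous reduction over volatile shared memory. On Volta and newer GPUs, independent thread scheduling makes that pattern unsafe without explicit __syncwarp() between steps; a shuffle-based variant sidesteps shared memory for the last 32 lanes entirely. A minimal sketch under that assumption (warp_reduce_shfl is a hypothetical name, not part of this patch):

template<typename OutT, typename ReduceFunc>
__device__ OutT warp_reduce_shfl(OutT val) {
    // fold the upper half of the active lanes into the lower half each step
    for (int offset = 16; offset > 0; offset >>= 1) {
        OutT other = __shfl_down_sync(0xffffffff, val, offset);
        ReduceFunc::Op(&val, other);
    }
    return val;  // lane 0 ends up holding the warp's reduced value
}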
@@ -28,7 +30,7 @@ __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, uint16_t BlockSize>
 __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     extern __shared__ int shmem[];
     auto s_buf = (OutT *) shmem;
@@ -37,7 +39,7 @@ __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     uint32_t bkx = blockIdx.x;
     uint32_t grid_size = BlockSize * gridDim.x;
 
-    OutT t_out = InitVal;
+    OutT t_out = UpdateFunc::InitVal();
 
     // load per-thread data
     for (uint32_t i = bkx * blockDim.x + tdx;
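The context above cuts off inside the grid-stride load loop. For orientation, the usual shape of that loop is sketched below; this is a reconstruction from the visible pieces (bkx, grid_size, t_out), not the patch's literal body:

// each thread strides across the input, folding the elements it owns
// into a private accumulator before the shared-memory tree reduction
for (uint32_t i = bkx * blockDim.x + tdx; i < n; i += grid_size) {
    UpdateFunc::Op(&t_out, (OutT) in[i]);
}
s_buf[tdx] = t_out;   // publish the per-thread partial
__syncthreads();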
@@ -106,7 +108,7 @@ __global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext)
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
                             uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     assert(n <= std::numeric_limits<uint32_t>::max());
@@ -115,19 +117,19 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
     switch (block_size) {
         case 512: {
             constexpr uint16_t BlockSize = 512;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 256: {
             constexpr uint16_t BlockSize = 256;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 128: {
             constexpr uint16_t BlockSize = 128;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
@@ -138,15 +140,15 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
 }
 
 // result resides in out[0]
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any(InT *in, OutT *out, uint32_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     { // first step
-        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc>;
         helper_func(in, out, n, block_size, grid_dim, stream);
     }
     { // second step
-        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc>;
         helper_func(out, out, grid_dim, block_size, 1, stream);
     }
 }
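A note on the two launches above: the first pass leaves one partial result per block in out[0..grid_dim), and the second pass folds those grid_dim partials in place with a single block, so out must have room for at least grid_dim elements. A hedged host-side sketch; the buffer names, sizes, and the float instantiation are assumptions for illustration:

using SumFunc = reduce_sum_func<float>;
float *d_in = nullptr, *d_out = nullptr;
uint32_t n = 1 << 20;
uint16_t block_size = 256, grid_dim = 64;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMalloc(&d_in, n * sizeof(float));
cudaMalloc(&d_out, grid_dim * sizeof(float));  // step 1 emits grid_dim partials
// ... fill d_in ...
call_reduce_any<float, float, SumFunc, SumFunc>(d_in, d_out, n, block_size, grid_dim, stream);
cudaStreamSynchronize(stream);  // the total now resides in d_out[0]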
@@ -158,11 +160,20 @@ struct type_max_value {
     static constexpr T value = std::numeric_limits<T>::max();
 };
 
+template<typename T>
+struct type_min_value {
+    static constexpr T value = std::numeric_limits<T>::lowest();
+};
+
 template<typename T>
 struct reduce_max_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = max(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_min_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -170,6 +181,10 @@ struct reduce_min_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = min(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_max_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -177,6 +192,10 @@ struct reduce_sum_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = *out + val;
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename T>
@@ -186,6 +205,10 @@ struct update_log_sum_func {
     static __device__ __forceinline__ void Op(T *out, T val) {
         *out += log(val + eps);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename InT, typename OutT>
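With the identity folded into the functors, the implicit contract for a reduction functor is now a combining Op plus a matching InitVal. A hypothetical product reduction, sketched only to show the shape (reduce_prod_func is not part of this patch):

template<typename T>
struct reduce_prod_func {
    // combine: multiply the accumulator by the incoming value
    static __device__ __forceinline__ void Op(volatile T *out, T val) {
        *out = *out * val;
    }

    // identity element for multiplication
    static __device__ __forceinline__ T InitVal() {
        return 1;
    }
};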
@@ -213,8 +236,9 @@ struct enhance_image_func {
         // convert RGB to HSV
         // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
         using ImgElemT = decltype(in.x);
-        static_assert(std::is_integral_v<ImgElemT>);
-        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        static_assert(std::is_integral_v<ImgElemT>,
+                      "Type of image element must be integral.");
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
         ImgElemT c_min = min(min(in.x, in.y), in.z);
         ImgElemT delta = c_max - c_min;
 
@@ -325,8 +349,7 @@ template<typename T>
 void call_reduce_max(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_max_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::min();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -336,8 +359,7 @@ template<typename T>
 void call_reduce_min(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_min_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::max();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -347,7 +369,7 @@ template<typename T>
 void call_reduce_sum(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -358,7 +380,7 @@ void call_reduce_log_sum(T *in, T *out, size_t n,
                          uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using UpdateFuncType = update_log_sum_func<T>;
     using ReduceFuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
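Adding a new reduction after this refactor takes only the functor and a thin wrapper mirroring call_reduce_sum. A hypothetical sketch, assuming the reduce_prod_func sketched earlier:

template<typename T>
void call_reduce_prod(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    using FuncType = reduce_prod_func<T>;
    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}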