
Merge remote-tracking branch 'origin/windows_dev' into dev

# Conflicts:
#	CMakeLists.txt
jcsyshc 2 years ago
parent commit 260621feea
4 changed files with 53 additions and 19 deletions
  1. CMakeLists.txt (+1 -0)
  2. src/image_process/CMakeLists.txt (+2 -1)
  3. src/image_process/process_kernels.cu (+40 -18)
  4. src/video_encoder.cpp (+10 -0)

+ 1 - 0
CMakeLists.txt

@@ -108,6 +108,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
 
 # VTK config
+set(VTK_DIR /home/tpx/src/VTK-9.3.0/Build/lib/cmake/vtk-9.3)
 find_package(VTK REQUIRED)
 target_link_libraries(${PROJECT_NAME} ${VTK_LIBRARIES})
 vtk_module_autoinit(TARGETS ${PROJECT_NAME} MODULES ${VTK_LIBRARIES})
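
Note on this hunk: the added set(VTK_DIR ...) line pins find_package(VTK) to a VTK 9.3 build tree under /home/tpx, so configuration finds VTK without a system-wide install. The path is machine-specific; the same hint could equally be supplied from outside the file (for example as -DVTK_DIR=<path> on the cmake command line) instead of being hardcoded.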

+ 2 - 1
src/image_process/CMakeLists.txt

@@ -10,6 +10,7 @@ add_library(${PROJECT_NAME}
 find_package(CUDAToolkit REQUIRED)
 target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "75;86")
 
 # spdlog config
 find_package(spdlog REQUIRED)
@@ -19,4 +20,4 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LE
 # OpenCV config
 find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
 target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
-target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
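
Note on this hunk: the added CUDA_ARCHITECTURES "75;86" property makes nvcc emit code for compute capabilities 7.5 (Turing) and 8.6 (Ampere); CMake 3.18 and later expect this property, or the CMAKE_CUDA_ARCHITECTURES variable, to be defined for targets that compile CUDA sources.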

+ 40 - 18
src/image_process/process_kernels.cu

@@ -1,13 +1,15 @@
 #include "process_kernels.cuh"
 
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 // kernel templates
 
 template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
 __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
-    static_assert(std::is_fundamental_v<OutT>);
+    static_assert(std::is_fundamental_v<OutT>,
+                  "Only fundamental type can be reduced.");
     if constexpr (BlockSize >= 64) {
         ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
     }
@@ -28,7 +30,7 @@ __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, uint16_t BlockSize>
 __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     extern __shared__ int shmem[];
     auto s_buf = (OutT *) shmem;
@@ -37,7 +39,7 @@ __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     uint32_t bkx = blockIdx.x;
     uint32_t grid_size = BlockSize * gridDim.x;
 
-    OutT t_out = InitVal;
+    OutT t_out = UpdateFunc::InitVal();
 
     // load per-thread data
     for (uint32_t i = bkx * blockDim.x + tdx;
@@ -106,7 +108,7 @@ __global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext)
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
                             uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     assert(n <= std::numeric_limits<uint32_t>::max());
@@ -115,19 +117,19 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
     switch (block_size) {
         case 512: {
             constexpr uint16_t BlockSize = 512;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 256: {
             constexpr uint16_t BlockSize = 256;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 128: {
             constexpr uint16_t BlockSize = 128;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
@@ -138,15 +140,15 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
 }
 
 // result resides in out[0]
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any(InT *in, OutT *out, uint32_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     { // first step
-        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc>;
         helper_func(in, out, n, block_size, grid_dim, stream);
     }
     { // second step
-        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc>;
         helper_func(out, out, grid_dim, block_size, 1, stream);
     }
 }
@@ -158,11 +160,20 @@ struct type_max_value {
     static constexpr T value = std::numeric_limits<T>::max();
 };
 
+template<typename T>
+struct type_min_value {
+    static constexpr T value = std::numeric_limits<T>::min();
+};
+
 template<typename T>
 struct reduce_max_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = max(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_min_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -170,6 +181,10 @@ struct reduce_min_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = min(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_max_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -177,6 +192,10 @@ struct reduce_sum_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = *out + val;
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename T>
@@ -186,6 +205,10 @@ struct update_log_sum_func {
     static __device__ __forceinline__ void Op(T *out, T val) {
         *out += log(val + eps);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename InT, typename OutT>
@@ -213,8 +236,9 @@ struct enhance_image_func {
         // convert RGB to HSV
         // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
         using ImgElemT = decltype(in.x);
-        static_assert(std::is_integral_v<ImgElemT>);
-        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        static_assert(std::is_integral_v<ImgElemT>,
+                      "Type of image element must be integer.");
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
         ImgElemT c_min = min(min(in.x, in.y), in.z);
         ImgElemT delta = c_max - c_min;
 
@@ -325,8 +349,7 @@ template<typename T>
 void call_reduce_max(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_max_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::min();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -336,8 +359,7 @@ template<typename T>
 void call_reduce_min(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_min_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::max();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -347,7 +369,7 @@ template<typename T>
 void call_reduce_sum(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -358,7 +380,7 @@ void call_reduce_log_sum(T *in, T *out, size_t n,
                          uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using UpdateFuncType = update_log_sum_func<T>;
     using ReduceFuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
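
The bulk of the process_kernels.cu change replaces the OutT InitVal non-type template parameter with a static InitVal() member on each update/reduce functor, so the identity element travels with the functor instead of being threaded through reduce_any, call_reduce_any_kernel and call_reduce_any. A minimal, self-contained sketch of that pattern follows; it is a toy single-block reduction for illustration only (toy_reduce and main are not project code), not the project's grid-stride kernel:

    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <cuda_runtime.h>

    template<typename T>
    struct type_min_value {
        // For integral T this is the identity element of max(); the sketch only
        // instantiates it with int, mirroring the trait added in the diff.
        static constexpr T value = std::numeric_limits<T>::min();
    };

    // The functor now carries its combining op and its initial value together.
    template<typename T>
    struct reduce_max_func {
        static __device__ __forceinline__ void Op(volatile T *out, T val) {
            *out = max(*out, val);
        }
        static __device__ __forceinline__ T InitVal() {
            return type_min_value<T>::value;
        }
    };

    // Toy single-block reduction: no InitVal template parameter, the functor
    // supplies the starting value for each thread's accumulator.
    template<typename T, typename ReduceFunc, uint16_t BlockSize>
    __global__ void toy_reduce(const T *in, T *out, uint32_t n) {
        __shared__ T s_buf[BlockSize];
        uint32_t tdx = threadIdx.x;
        T acc = ReduceFunc::InitVal();
        for (uint32_t i = tdx; i < n; i += BlockSize) ReduceFunc::Op(&acc, in[i]);
        s_buf[tdx] = acc;
        __syncthreads();
        for (uint32_t s = BlockSize / 2; s > 0; s >>= 1) {
            if (tdx < s) ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + s]);
            __syncthreads();
        }
        if (tdx == 0) *out = s_buf[0];
    }

    int main() {
        constexpr uint32_t n = 1000;
        int h_in[n];
        for (uint32_t i = 0; i < n; ++i) h_in[i] = (int) i;
        int *d_in, *d_out, h_out = 0;
        cudaMalloc(&d_in, n * sizeof(int));
        cudaMalloc(&d_out, sizeof(int));
        cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);
        toy_reduce<int, reduce_max_func<int>, 256><<<1, 256>>>(d_in, d_out, n);
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        printf("max = %d\n", h_out);  // expected: 999
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }

This is the same simplification the later hunks apply to call_reduce_max, call_reduce_min, call_reduce_sum and call_reduce_log_sum, which now instantiate call_reduce_any without spelling out an initial value at each call site.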
 

+ 10 - 0
src/video_encoder.cpp

@@ -6,8 +6,18 @@
 
 #include <nvEncodeAPI.h>
 
+#ifdef _MSC_VER
+
+#include <format>
+
+#define fmt std
+
+#else
+
 #include <fmt/chrono.h>
 
+#endif
+
 bool check_nvenc_api_call(NVENCSTATUS api_ret, unsigned int line_number,
                           const char *file_name, const char *api_call_str) {
     if (api_ret == NV_ENC_SUCCESS) [[likely]] return true;
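
The video_encoder.cpp hunk makes the string-formatting dependency conditional: under MSVC it includes the standard <format> header and defines the token fmt as std, so existing fmt::format-style calls resolve to std::format, while other platforms keep using fmt/chrono.h. A small stand-alone illustration of the mechanism (describe_status is a hypothetical call site, not taken from the project):

    #include <string>

    #ifdef _MSC_VER
    #include <format>        // std::format, requires C++20
    #define fmt std          // fmt::format(...) now expands to std::format(...)
    #else
    #include <fmt/chrono.h>  // {fmt} library: fmt::format plus chrono formatters
    #endif

    // Hypothetical call site: the same line compiles against std::format on
    // MSVC and against the {fmt} library everywhere else.
    std::string describe_status(int code) {
        return fmt::format("encoder returned status {}", code);
    }

    int main() {
        return describe_status(0).empty() ? 1 : 0;
    }

The trick works because the preprocessor rewrites every fmt token in the translation unit to std, so it relies on fmt appearing only as the namespace name in the affected sources.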