
Merge remote-tracking branch 'origin/windows_dev' into dev

# Conflicts:
#	CMakeLists.txt
jcsyshc 2 years ago
parent commit 260621feea
4 changed files with 53 additions and 19 deletions
  1. CMakeLists.txt (+1 -0)
  2. src/image_process/CMakeLists.txt (+2 -1)
  3. src/image_process/process_kernels.cu (+40 -18)
  4. src/video_encoder.cpp (+10 -0)

+ 1 - 0
CMakeLists.txt

@@ -108,6 +108,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE ${Boost_INCLUDE_DIRS})
 target_link_libraries(${PROJECT_NAME} ${Boost_LIBRARIES})
 
 # VTK config
+set(VTK_DIR /home/tpx/src/VTK-9.3.0/Build/lib/cmake/vtk-9.3)
 find_package(VTK REQUIRED)
 target_link_libraries(${PROJECT_NAME} ${VTK_LIBRARIES})
 vtk_module_autoinit(TARGETS ${PROJECT_NAME} MODULES ${VTK_LIBRARIES})
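
Note on this hunk: the added set(VTK_DIR ...) line pins find_package(VTK) to a VTK 9.3 build tree under /home/tpx, so configuration finds VTK without a system-wide install. The path is machine-specific; the same hint could equally be supplied from outside the file (for example as -DVTK_DIR=<path> on the cmake command line) instead of being hardcoded.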

+ 2 - 1
src/image_process/CMakeLists.txt

@@ -10,6 +10,7 @@ add_library(${PROJECT_NAME}
 find_package(CUDAToolkit REQUIRED)
 target_link_directories(${PROJECT_NAME} PRIVATE /usr/local/cuda/lib64)
 target_link_libraries(${PROJECT_NAME} CUDA::cudart CUDA::cuda_driver)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "75;86")
 
 # spdlog config
 find_package(spdlog REQUIRED)
@@ -19,4 +20,4 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE SPDLOG_ACTIVE_LEVEL=SPDLOG_LE
 # OpenCV config
 find_package(OpenCV REQUIRED COMPONENTS cudaimgproc imgcodecs)
 target_include_directories(${PROJECT_NAME} PRIVATE ${OpenCV_INCLUDE_DIRS})
-target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
+target_link_libraries(${PROJECT_NAME} ${OpenCV_LIBS})
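
Note on this hunk: the added CUDA_ARCHITECTURES "75;86" property makes nvcc emit code for compute capabilities 7.5 (Turing) and 8.6 (Ampere); CMake 3.18 and later expect this property, or the CMAKE_CUDA_ARCHITECTURES variable, to be defined for targets that compile CUDA sources.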

+ 40 - 18
src/image_process/process_kernels.cu

@@ -1,13 +1,15 @@
 #include "process_kernels.cuh"
 
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 // kernel templates
 
 template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
 __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
-    static_assert(std::is_fundamental_v<OutT>);
+    static_assert(std::is_fundamental_v<OutT>,
+                  "Only fundamental type can be reduced.");
     if constexpr (BlockSize >= 64) {
         ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
     }
@@ -28,7 +30,7 @@ __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, uint16_t BlockSize>
 __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     extern __shared__ int shmem[];
     auto s_buf = (OutT *) shmem;
@@ -37,7 +39,7 @@ __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     uint32_t bkx = blockIdx.x;
     uint32_t grid_size = BlockSize * gridDim.x;
 
-    OutT t_out = InitVal;
+    OutT t_out = UpdateFunc::InitVal();
 
     // load per-thread data
     for (uint32_t i = bkx * blockDim.x + tdx;
@@ -106,7 +108,7 @@ __global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext)
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
                             uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     assert(n <= std::numeric_limits<uint32_t>::max());
@@ -115,19 +117,19 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
     switch (block_size) {
         case 512: {
             constexpr uint16_t BlockSize = 512;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 256: {
             constexpr uint16_t BlockSize = 256;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 128: {
             constexpr uint16_t BlockSize = 128;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
@@ -138,15 +140,15 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
 }
 
 // result resides in out[0]
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any(InT *in, OutT *out, uint32_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     { // first step
-        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc>;
         helper_func(in, out, n, block_size, grid_dim, stream);
     }
     { // second step
-        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc>;
         helper_func(out, out, grid_dim, block_size, 1, stream);
     }
 }
@@ -158,11 +160,20 @@ struct type_max_value {
     static constexpr T value = std::numeric_limits<T>::max();
 };
 
+template<typename T>
+struct type_min_value {
+    static constexpr T value = std::numeric_limits<T>::min();
+};
+
 template<typename T>
 struct reduce_max_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = max(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_min_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -170,6 +181,10 @@ struct reduce_min_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = min(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_max_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -177,6 +192,10 @@ struct reduce_sum_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = *out + val;
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename T>
@@ -186,6 +205,10 @@ struct update_log_sum_func {
     static __device__ __forceinline__ void Op(T *out, T val) {
         *out += log(val + eps);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename InT, typename OutT>
@@ -213,8 +236,9 @@ struct enhance_image_func {
         // convert RGB to HSV
         // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
         using ImgElemT = decltype(in.x);
-        static_assert(std::is_integral_v<ImgElemT>);
-        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        static_assert(std::is_integral_v<ImgElemT>,
+                      "Type of image element must be integer.");
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
         ImgElemT c_min = min(min(in.x, in.y), in.z);
         ImgElemT delta = c_max - c_min;
 
@@ -325,8 +349,7 @@ template<typename T>
 void call_reduce_max(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_max_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::min();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -336,8 +359,7 @@ template<typename T>
 void call_reduce_min(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_min_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::max();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -347,7 +369,7 @@ template<typename T>
 void call_reduce_sum(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -358,7 +380,7 @@ void call_reduce_log_sum(T *in, T *out, size_t n,
                          uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using UpdateFuncType = update_log_sum_func<T>;
     using ReduceFuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
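
The bulk of the process_kernels.cu change replaces the OutT InitVal non-type template parameter with a static InitVal() member on each update/reduce functor, so the identity element travels with the functor instead of being threaded through reduce_any, call_reduce_any_kernel and call_reduce_any. A minimal, self-contained sketch of that pattern follows; it is a toy single-block reduction for illustration only (toy_reduce and main are not project code), not the project's grid-stride kernel:

    #include <cstdint>
    #include <cstdio>
    #include <limits>
    #include <cuda_runtime.h>

    template<typename T>
    struct type_min_value {
        // For integral T this is the identity element of max(); the sketch only
        // instantiates it with int, mirroring the trait added in the diff.
        static constexpr T value = std::numeric_limits<T>::min();
    };

    // The functor now carries its combining op and its initial value together.
    template<typename T>
    struct reduce_max_func {
        static __device__ __forceinline__ void Op(volatile T *out, T val) {
            *out = max(*out, val);
        }
        static __device__ __forceinline__ T InitVal() {
            return type_min_value<T>::value;
        }
    };

    // Toy single-block reduction: no InitVal template parameter, the functor
    // supplies the starting value for each thread's accumulator.
    template<typename T, typename ReduceFunc, uint16_t BlockSize>
    __global__ void toy_reduce(const T *in, T *out, uint32_t n) {
        __shared__ T s_buf[BlockSize];
        uint32_t tdx = threadIdx.x;
        T acc = ReduceFunc::InitVal();
        for (uint32_t i = tdx; i < n; i += BlockSize) ReduceFunc::Op(&acc, in[i]);
        s_buf[tdx] = acc;
        __syncthreads();
        for (uint32_t s = BlockSize / 2; s > 0; s >>= 1) {
            if (tdx < s) ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + s]);
            __syncthreads();
        }
        if (tdx == 0) *out = s_buf[0];
    }

    int main() {
        constexpr uint32_t n = 1000;
        int h_in[n];
        for (uint32_t i = 0; i < n; ++i) h_in[i] = (int) i;
        int *d_in, *d_out, h_out = 0;
        cudaMalloc(&d_in, n * sizeof(int));
        cudaMalloc(&d_out, sizeof(int));
        cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);
        toy_reduce<int, reduce_max_func<int>, 256><<<1, 256>>>(d_in, d_out, n);
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        printf("max = %d\n", h_out);  // expected: 999
        cudaFree(d_in);
        cudaFree(d_out);
        return 0;
    }

This is the same simplification the later hunks apply to call_reduce_max, call_reduce_min, call_reduce_sum and call_reduce_log_sum, which now instantiate call_reduce_any without spelling out an initial value at each call site.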
 

+ 10 - 0
src/video_encoder.cpp

@@ -6,8 +6,18 @@
 
 #include <nvEncodeAPI.h>
 
+#ifdef _MSC_VER
+
+#include <format>
+
+#define fmt std
+
+#else
+
 #include <fmt/chrono.h>
 
+#endif
+
 bool check_nvenc_api_call(NVENCSTATUS api_ret, unsigned int line_number,
                           const char *file_name, const char *api_call_str) {
     if (api_ret == NV_ENC_SUCCESS) [[likely]] return true;
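
The video_encoder.cpp hunk makes the string-formatting dependency conditional: under MSVC it includes the standard <format> header and defines the token fmt as std, so existing fmt::format-style calls resolve to std::format, while other platforms keep using fmt/chrono.h. A small stand-alone illustration of the mechanism (describe_status is a hypothetical call site, not taken from the project):

    #include <string>

    #ifdef _MSC_VER
    #include <format>        // std::format, requires C++20
    #define fmt std          // fmt::format(...) now expands to std::format(...)
    #else
    #include <fmt/chrono.h>  // {fmt} library: fmt::format plus chrono formatters
    #endif

    // Hypothetical call site: the same line compiles against std::format on
    // MSVC and against the {fmt} library everywhere else.
    std::string describe_status(int code) {
        return fmt::format("encoder returned status {}", code);
    }

    int main() {
        return describe_status(0).empty() ? 1 : 0;
    }

The trick works because the preprocessor rewrites every fmt token in the translation unit to std, so it relies on fmt appearing only as the namespace name in the affected sources.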