@@ -1,13 +1,15 @@
 #include "process_kernels.cuh"
 
 #include <cassert>
+#include <limits>
 #include <type_traits>
 
 // kernel templates
 
 template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
 __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
-    static_assert(std::is_fundamental_v<OutT>);
+    static_assert(std::is_fundamental_v<OutT>,
+                  "Only fundamental types can be reduced.");
     if constexpr (BlockSize >= 64) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
     }
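Side note on warp_reduce: it uses the classic warp-synchronous reduction over volatile shared memory. On Volta and newer GPUs, independent thread scheduling makes that pattern unsafe without explicit __syncwarp() between steps; a shuffle-based variant sidesteps shared memory for the last 32 lanes entirely. A minimal sketch under that assumption (warp_reduce_shfl is a hypothetical name, not part of this patch):

template<typename OutT, typename ReduceFunc>
__device__ OutT warp_reduce_shfl(OutT val) {
    // fold the upper half of the active lanes into the lower half each step
    for (int offset = 16; offset > 0; offset >>= 1) {
        OutT other = __shfl_down_sync(0xffffffff, val, offset);
        ReduceFunc::Op(&val, other);
    }
    return val;  // lane 0 ends up holding the warp's reduced value
}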
@@ -28,7 +30,7 @@ __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, uint16_t BlockSize>
 __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     extern __shared__ int shmem[];
     auto s_buf = (OutT *) shmem;
@@ -37,7 +39,7 @@ __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
     uint32_t bkx = blockIdx.x;
     uint32_t grid_size = BlockSize * gridDim.x;
 
-    OutT t_out = InitVal;
+    OutT t_out = UpdateFunc::InitVal();
 
     // load per-thread data
     for (uint32_t i = bkx * blockDim.x + tdx;
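The context above cuts off inside the grid-stride load loop. For orientation, the usual shape of that loop is sketched below; this is a reconstruction from the visible pieces (bkx, grid_size, t_out), not the patch's literal body:

// each thread strides across the input, folding the elements it owns
// into a private accumulator before the shared-memory tree reduction
for (uint32_t i = bkx * blockDim.x + tdx; i < n; i += grid_size) {
    UpdateFunc::Op(&t_out, (OutT) in[i]);
}
s_buf[tdx] = t_out;   // publish the per-thread partial
__syncthreads();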
@@ -106,7 +108,7 @@ __global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext)
     }
 }
 
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
                             uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     assert(n <= std::numeric_limits<uint32_t>::max());
@@ -115,19 +117,19 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
     switch (block_size) {
         case 512: {
             constexpr uint16_t BlockSize = 512;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 256: {
             constexpr uint16_t BlockSize = 256;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
         case 128: {
             constexpr uint16_t BlockSize = 128;
-            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, BlockSize>;
             reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
             return;
         }
@@ -138,15 +140,15 @@ void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
 }
 
 // result resides in out[0]
-template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc>
 void call_reduce_any(InT *in, OutT *out, uint32_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     { // first step
-        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc>;
         helper_func(in, out, n, block_size, grid_dim, stream);
     }
     { // second step
-        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc>;
         helper_func(out, out, grid_dim, block_size, 1, stream);
     }
 }
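A note on the two launches above: the first pass leaves one partial result per block in out[0..grid_dim), and the second pass folds those grid_dim partials in place with a single block, so out must have room for at least grid_dim elements. A hedged host-side sketch; the buffer names, sizes, and the float instantiation are assumptions for illustration:

using SumFunc = reduce_sum_func<float>;
float *d_in = nullptr, *d_out = nullptr;
uint32_t n = 1 << 20;
uint16_t block_size = 256, grid_dim = 64;
cudaStream_t stream;
cudaStreamCreate(&stream);
cudaMalloc(&d_in, n * sizeof(float));
cudaMalloc(&d_out, grid_dim * sizeof(float));  // step 1 emits grid_dim partials
// ... fill d_in ...
call_reduce_any<float, float, SumFunc, SumFunc>(d_in, d_out, n, block_size, grid_dim, stream);
cudaStreamSynchronize(stream);  // the total now resides in d_out[0]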
@@ -158,11 +160,20 @@ struct type_max_value {
     static constexpr T value = std::numeric_limits<T>::max();
 };
 
+template<typename T>
+struct type_min_value {
+    static constexpr T value = std::numeric_limits<T>::lowest();
+};
+
 template<typename T>
 struct reduce_max_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = max(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_min_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -170,6 +181,10 @@ struct reduce_min_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = min(*out, val);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return type_max_value<T>::value;
+    }
 };
 
 template<typename T>
@@ -177,6 +192,10 @@ struct reduce_sum_func {
     static __device__ __forceinline__ void Op(volatile T *out, T val) {
         *out = *out + val;
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename T>
@@ -186,6 +205,10 @@ struct update_log_sum_func {
     static __device__ __forceinline__ void Op(T *out, T val) {
         *out += log(val + eps);
     }
+
+    static __device__ __forceinline__ T InitVal() {
+        return 0;
+    }
 };
 
 template<typename InT, typename OutT>
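With the identity folded into the functors, the implicit contract for a reduction functor is now a combining Op plus a matching InitVal. A hypothetical product reduction, sketched only to show the shape (reduce_prod_func is not part of this patch):

template<typename T>
struct reduce_prod_func {
    // combine: multiply the accumulator by the incoming value
    static __device__ __forceinline__ void Op(volatile T *out, T val) {
        *out = *out * val;
    }

    // identity element for multiplication
    static __device__ __forceinline__ T InitVal() {
        return 1;
    }
};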
@@ -213,8 +236,9 @@ struct enhance_image_func {
         // convert RGB to HSV
         // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
         using ImgElemT = decltype(in.x);
-        static_assert(std::is_integral_v<ImgElemT>);
-        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        static_assert(std::is_integral_v<ImgElemT>,
+                      "Type of image element must be integral.");
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
         ImgElemT c_min = min(min(in.x, in.y), in.z);
         ImgElemT delta = c_max - c_min;
 
@@ -325,8 +349,7 @@ template<typename T>
 void call_reduce_max(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_max_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::min();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -336,8 +359,7 @@ template<typename T>
 void call_reduce_min(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_min_func<T>;
-    constexpr T InitVal = std::numeric_limits<T>::max();
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -347,7 +369,7 @@ template<typename T>
 void call_reduce_sum(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using FuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
 
@@ -358,7 +380,7 @@ void call_reduce_log_sum(T *in, T *out, size_t n,
                          uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
     using UpdateFuncType = update_log_sum_func<T>;
     using ReduceFuncType = reduce_sum_func<T>;
-    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType>;
     helper_func(in, out, n, block_size, grid_dim, stream);
 }
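Adding a new reduction after this refactor takes only the functor and a thin wrapper mirroring call_reduce_sum. A hypothetical sketch, assuming the reduce_prod_func sketched earlier:

template<typename T>
void call_reduce_prod(T *in, T *out, size_t n,
                      uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    using FuncType = reduce_prod_func<T>;
    auto helper_func = call_reduce_any<T, T, FuncType, FuncType>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}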