@@ -0,0 +1,399 @@
+#include "process_kernels.cuh"
|
|
|
+
|
|
|
+#include <cassert>
|
|
|
+#include <type_traits>
|
|
|
+
|
|
|
+// kernel templates
+
+template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
+__device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
+    static_assert(std::is_fundamental_v<OutT>);
+    // unrolled tree reduction within a single warp; __syncwarp() keeps the
+    // volatile shared-memory accesses ordered on architectures with
+    // independent thread scheduling (sm_70 and later)
+    if constexpr (BlockSize >= 64) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 32) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 16]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 16) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 8]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 8) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 4]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 4) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 2]);
+        __syncwarp();
+    }
+    if constexpr (BlockSize >= 2) {
+        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 1]);
+    }
+}
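+// note: an equivalent warp-level reduction could also be written with
+// __shfl_down_sync, avoiding shared-memory traffic for the last 32 lanes;
+// the shared-memory form is kept here because it matches ReduceFunc's
+// pointer-based Op interface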
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
+__global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
+    extern __shared__ int shmem[];
+    auto s_buf = (OutT *) shmem;
+
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = BlockSize * gridDim.x; // grid stride; assumes blockDim.x == BlockSize
+
+    OutT t_out = InitVal;
+
+    // accumulate a per-thread partial over a grid-stride loop
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        UpdateFunc::Op(&t_out, in[i]);
+    }
+
+    // write the per-thread partial to shared memory
+    s_buf[tdx] = t_out;
+    __syncthreads();
+
+    // block-level tree reduction, unrolled at compile time
+    if constexpr (BlockSize >= 512) {
+        if (tdx < 256) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 256) {
+        if (tdx < 128) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
+        }
+        __syncthreads();
+    }
+    if constexpr (BlockSize >= 128) {
+        if (tdx < 64) {
+            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
+        }
+        __syncthreads();
+    }
+
+    if (tdx < 32) {
+        warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
+    }
+    if (tdx == 0) {
+        out[bkx] = s_buf[0];
+    }
+}
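+// each block writes exactly one partial result, so out must have room for
+// gridDim.x elements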
+
+template<typename InT, typename OutT, typename Func>
+__global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i]);
+    }
+}
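+// the grid-stride loop decouples launch geometry from n: any grid_dim and
+// block_size combination covers all n elements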
+
+template<typename InT, typename OutT, typename ExtT, typename Func>
+__global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
+    uint32_t tdx = threadIdx.x;
+    uint32_t bkx = blockIdx.x;
+    uint32_t grid_size = blockDim.x * gridDim.x;
+
+    // load extra values
+    ExtT ext = *p_ext;
+
+    for (uint32_t i = bkx * blockDim.x + tdx;
+         i < n;
+         i += grid_size) {
+        Func::Op(&out[i], in[i], ext);
+    }
+}
+
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
+                            uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    // warp_reduce reads past block_size when block_size <= 32, hence the
+    // doubled shared-memory buffer in that case
+    auto shmem_size = block_size * (1 + (block_size <= 32));
+    auto shmem_length = shmem_size * sizeof(OutT);
+    // BlockSize is a template parameter, so the runtime block_size has to be
+    // dispatched to a compile-time constant through this switch
+    switch (block_size) {
+        case 512: {
+            constexpr uint16_t BlockSize = 512;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 256: {
+            constexpr uint16_t BlockSize = 256;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        case 128: {
+            constexpr uint16_t BlockSize = 128;
+            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
+            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
+            return;
+        }
+        default: {
+            assert(false); // only block sizes 128, 256, and 512 are supported
+        }
+    }
+}
+
+// two-pass reduction; the final result resides in out[0]
+template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
+void call_reduce_any(InT *in, OutT *out, uint32_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    { // first step: each block writes one partial result to out[0..grid_dim)
+        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
+        helper_func(in, out, n, block_size, grid_dim, stream);
+    }
+    { // second step: a single block reduces the grid_dim partials in place
+        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
+        helper_func(out, out, grid_dim, block_size, 1, stream);
+    }
+}
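+// e.g. block_size = 256, grid_dim = 64: the first pass writes 64 partials;
+// the second pass reduces those 64 values with one 256-thread block, where
+// threads with no element simply keep InitVal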
+
+// functors used by the kernel templates
+
+template<typename T>
+struct type_max_value {
+    static constexpr T value = std::numeric_limits<T>::max();
+};
+
+template<typename T>
+struct reduce_max_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = max(*out, val);
+    }
+};
+
+template<typename T>
+struct reduce_min_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = min(*out, val);
+    }
+};
+
+template<typename T>
+struct reduce_sum_func {
+    static __device__ __forceinline__ void Op(volatile T *out, T val) {
+        *out = *out + val;
+    }
+};
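+// the volatile pointer parameter matches warp_reduce's volatile buffer and
+// keeps the compiler from caching shared-memory operands in registers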
+
+template<typename T>
+struct update_log_sum_func {
+    static constexpr T eps = (T) 1e-6; // avoids log(0) for zero-valued inputs
+
+    static __device__ __forceinline__ void Op(T *out, T val) {
+        *out += log(val + eps);
+    }
+};
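+// accumulating log(v + eps) feeds the log-average in prepare_enhance_coeff:
+//     log_avg = exp((1 / n) * sum_i log(v_i + eps))
+// i.e. the geometric mean of the (eps-shifted) V channel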
+
+template<typename InT, typename OutT>
+struct rgb_extract_v_func { // extract the V channel of HSV from an RGB pixel
+    static __device__ __forceinline__ void Op(OutT *out, InT in) {
+        if constexpr (std::is_floating_point_v<OutT>) {
+            // normalize to [0, 1] for floating-point outputs
+            using InElemT = decltype(in.x);
+            constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
+            *out = factor * max(max(in.x, in.y), in.z);
+        } else {
+            *out = max(max(in.x, in.y), in.z);
+        }
+    }
+};
+
+struct enhance_v_func {
+    // v' = log(v / log_avg + 1) / log(max_v / log_avg + 1); the denominator is
+    // precomputed as ext.norm_factor in prepare_enhance_coeff
+    static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
+        *out = ext.norm_factor * log(in / ext.log_avg + 1);
+    }
+};
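+// this curve maps v = 0 to 0 and v = max_v to 1, compressing highlights while
+// lifting dark regions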
+
+template<typename ImgT>
+struct enhance_image_func {
+    static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
+        // convert RGB to HSV
+        // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
+        using ImgElemT = decltype(in.x);
+        static_assert(std::is_integral_v<ImgElemT>);
+        ImgElemT c_max = max(max(in.x, in.y), in.z);
+        ImgElemT c_min = min(min(in.x, in.y), in.z);
+        ImgElemT delta = c_max - c_min;
+
+        float h; // hue sector in [0, 6); the usual factor of 60 degrees is omitted
+        if (delta == 0) {
+            h = 0;
+        } else {
+            float delta_inv = 1.0f / delta;
+            if (c_max == in.x) { // c_max == r
+                h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
+                if (h < 0) {
+                    h += 6;
+                }
+            } else if (c_max == in.y) { // c_max == g
+                h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
+            } else { // c_max == b
+                h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 4
+            }
+        }
+
+        float s;
+        if (c_max == 0) {
+            s = 0;
+        } else {
+            s = (float) delta / c_max;
+        }
+
+        constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
+        float v = v_factor * (float) c_max;
+
+        // enhance V channel
+        v = ext.norm_factor * log(v / ext.log_avg + 1);
+
+        // convert HSV to RGB
+        // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
+        float c = v * s;
+        float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
+        float m = v - c;
+        float r, g, b;
+        switch ((uint8_t) h) { // hue sector 0-5
+            case 0: {
+                r = c;
+                g = x;
+                b = 0;
+                break;
+            }
+            case 1: {
+                r = x;
+                g = c;
+                b = 0;
+                break;
+            }
+            case 2: {
+                r = 0;
+                g = c;
+                b = x;
+                break;
+            }
+            case 3: {
+                r = 0;
+                g = x;
+                b = c;
+                break;
+            }
+            case 4: {
+                r = x;
+                g = 0;
+                b = c;
+                break;
+            }
+            case 5: {
+                r = c;
+                g = 0;
+                b = x;
+                break;
+            }
+            default: {
+                assert(false);
+                r = g = b = 0; // unreachable; silences uninitialized-use warnings
+            }
+        }
+
+        constexpr float out_factor = type_max_value<ImgElemT>::value;
+        ImgT out;
+        out.x = out_factor * (r + m);
+        out.y = out_factor * (g + m);
+        out.z = out_factor * (b + m);
+
+        *p_out = out;
+    }
+};
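+// worked example: pure red {255, 0, 0} gives c_max = 255, delta = 255, h = 0,
+// s = 1, v = 1; after enhancement v' = norm_factor * log(1 / log_avg + 1), and
+// the pixel converts back to {255 * v', 0, 0}, i.e. hue and saturation are
+// preserved while brightness is remapped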
+
+// special kernels
+
+// single-thread kernel: derives the enhancement coefficients on the device,
+// avoiding a round trip of the two reduction results to the host
+__global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
+                                      enhance_coeff *p_out) {
+    float max_v = *p_max_v;
+    float sum_log_v = *p_sum_log_v;
+    float log_avg = exp(sum_log_v / n); // geometric mean of the V channel
+    float norm_factor = 1.0f / (log(max_v / log_avg + 1)); // rescales so max_v maps to 1
+    p_out->log_avg = log_avg;
+    p_out->norm_factor = norm_factor;
+}
+
+// calling endpoints
+
+template<typename T>
+void call_reduce_max(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_max_func<T>;
+    // lowest() rather than min(): for floating-point T, min() is the smallest
+    // positive value, which would break the reduction for all-negative inputs
+    constexpr T InitVal = std::numeric_limits<T>::lowest();
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_min(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_min_func<T>;
+    constexpr T InitVal = std::numeric_limits<T>::max();
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_sum(T *in, T *out, size_t n,
+                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename T>
+void call_reduce_log_sum(T *in, T *out, size_t n,
+                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using UpdateFuncType = update_log_sum_func<T>;
+    using ReduceFuncType = reduce_sum_func<T>;
+    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
+    helper_func(in, out, (uint32_t) n, block_size, grid_dim, stream);
+}
+
+template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+template<typename InT, typename OutT>
+void call_rgb_extract_v(InT *in, OutT *out, size_t n,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = rgb_extract_v_func<InT, OutT>;
+    elementwise_any<InT, OutT, FuncType><<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n);
+}
+
+template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
+
+void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
+                                enhance_coeff *out, cudaStream_t stream) {
+    prepare_enhance_coeff<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
+}
+
+void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
+                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    auto kernel_func = elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n, ext);
+}
+
+template<typename ImgT>
+void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
+                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
+    assert(n <= std::numeric_limits<uint32_t>::max());
+    using FuncType = enhance_image_func<ImgT>;
+    auto kernel_func = elementwise_ext_any<ImgT, ImgT, enhance_coeff, FuncType>;
+    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, (uint32_t) n, ext);
+}
+
+template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
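+
+// Illustrative end-to-end pipeline (a sketch, not part of the API; buffer
+// names are hypothetical and error checking is omitted). d_max_v and
+// d_sum_log_v must hold at least grid_dim elements, since the first reduction
+// pass stores one partial per block; the final value lands in element 0.
+//
+//     uint32_t n = width * height;
+//     call_rgb_extract_v(d_img, d_v, n, 256, 64, stream);         // V channel
+//     call_reduce_max(d_v, d_max_v, n, 256, 64, stream);          // max V
+//     call_reduce_log_sum(d_v, d_sum_log_v, n, 256, 64, stream);  // sum of log V
+//     call_prepare_enhance_coeff(d_max_v, d_sum_log_v, n, d_coeff, stream);
+//     call_enhance_image(d_img, d_out, n, d_coeff, 256, 64, stream);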