| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399 |
- #include "process_kernels.cuh"
- #include <cassert>
- #include <type_traits>
- // kernel templates
- template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
- __device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
- static_assert(std::is_fundamental_v<OutT>);
- if constexpr (BlockSize >= 64) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
- }
- if constexpr (BlockSize >= 32) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 16]);
- }
- if constexpr (BlockSize >= 16) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 8]);
- }
- if constexpr (BlockSize >= 8) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 4]);
- }
- if constexpr (BlockSize >= 4) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 2]);
- }
- if constexpr (BlockSize >= 2) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 1]);
- }
- }
- template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
- __global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
- extern __shared__ int shmem[];
- auto s_buf = (OutT *) shmem;
- uint32_t tdx = threadIdx.x;
- uint32_t bkx = blockIdx.x;
- uint32_t grid_size = BlockSize * gridDim.x;
- OutT t_out = InitVal;
- // load per-thread data
- for (uint32_t i = bkx * blockDim.x + tdx;
- i < n;
- i += grid_size) {
- UpdateFunc::Op(&t_out, in[i]);
- }
- // update to shared memory
- s_buf[tdx] = t_out;
- __syncthreads();
- if constexpr (BlockSize >= 512) {
- if (tdx < 256) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
- }
- __syncthreads();
- }
- if constexpr (BlockSize >= 256) {
- if (tdx < 128) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
- }
- __syncthreads();
- }
- if constexpr (BlockSize >= 128) {
- if (tdx < 64) {
- ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
- }
- __syncthreads();
- }
- if (tdx < 32) {
- warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
- }
- if (tdx == 0) {
- out[bkx] = s_buf[0];
- }
- }
- template<typename InT, typename OutT, typename Func>
- __global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
- uint32_t tdx = threadIdx.x;
- uint32_t bkx = blockIdx.x;
- uint32_t grid_size = blockDim.x * gridDim.x;
- for (uint32_t i = bkx * blockDim.x + tdx;
- i < n;
- i += grid_size) {
- Func::Op(&out[i], in[i]);
- }
- }
- template<typename InT, typename OutT, typename ExtT, typename Func>
- __global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
- uint32_t tdx = threadIdx.x;
- uint32_t bkx = blockIdx.x;
- uint32_t grid_size = blockDim.x * gridDim.x;
- // load extra values
- ExtT ext = *p_ext;
- for (uint32_t i = bkx * blockDim.x + tdx;
- i < n;
- i += grid_size) {
- Func::Op(&out[i], in[i], ext);
- }
- }
- template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
- void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- assert(n <= std::numeric_limits<uint32_t>::max());
- auto shmem_size = block_size * (1 + (block_size <= 32));
- auto shmem_length = shmem_size * sizeof(OutT);
- switch (block_size) {
- case 512: {
- constexpr uint16_t BlockSize = 512;
- auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
- reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
- return;
- }
- case 256: {
- constexpr uint16_t BlockSize = 256;
- auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
- reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
- return;
- }
- case 128: {
- constexpr uint16_t BlockSize = 128;
- auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
- reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
- return;
- }
- default: {
- assert(false);
- }
- }
- }
- // result resides in out[0]
- template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
- void call_reduce_any(InT *in, OutT *out, uint32_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- { // first step
- auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
- helper_func(in, out, n, block_size, grid_dim, stream);
- }
- { // second step
- auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
- helper_func(out, out, grid_dim, block_size, 1, stream);
- }
- }
- // working functions
- template<typename T>
- struct type_max_value {
- static constexpr T value = std::numeric_limits<T>::max();
- };
- template<typename T>
- struct reduce_max_func {
- static __device__ __forceinline__ void Op(volatile T *out, T val) {
- *out = max(*out, val);
- }
- };
- template<typename T>
- struct reduce_min_func {
- static __device__ __forceinline__ void Op(volatile T *out, T val) {
- *out = min(*out, val);
- }
- };
- template<typename T>
- struct reduce_sum_func {
- static __device__ __forceinline__ void Op(volatile T *out, T val) {
- *out = *out + val;
- }
- };
- template<typename T>
- struct update_log_sum_func {
- static constexpr T eps = (T) 1e-6;
- static __device__ __forceinline__ void Op(T *out, T val) {
- *out += log(val + eps);
- }
- };
- template<typename InT, typename OutT>
- struct rgb_extract_v_func { // Extract V value of HSV from RGB
- static __device__ __forceinline__ void Op(OutT *out, InT in) {
- if constexpr (std::is_floating_point_v<OutT>) {
- using InElemT = decltype(in.x);
- constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
- *out = factor * max(max(in.x, in.y), in.z);
- } else {
- *out = max(max(in.x, in.y), in.z);
- }
- }
- };
- struct enhance_v_func {
- static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
- *out = ext.norm_factor * log(in / ext.log_avg + 1);
- }
- };
- template<typename ImgT>
- struct enhance_image_func {
- static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
- // convert RGB to HSV
- // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
- using ImgElemT = decltype(in.x);
- static_assert(std::is_integral_v<ImgElemT>);
- ImgElemT c_max = max(max(in.x, in.y), in.z);
- ImgElemT c_min = min(min(in.x, in.y), in.z);
- ImgElemT delta = c_max - c_min;
- float h; // 60 is eliminated
- if (delta == 0) {
- h = 0;
- } else {
- float delta_inv = 1.0f / delta;
- if (c_max == in.x) { // c_max == r
- h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
- if (h < 0) {
- h += 6;
- }
- } else if (c_max == in.y) { // c_max == g
- h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
- } else { // c_max == b
- h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 2
- }
- }
- float s;
- if (c_max == 0) {
- s = 0;
- } else {
- s = (float) delta / c_max;
- }
- constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
- float v = v_factor * (float) c_max;
- // enhance V channel
- v = ext.norm_factor * log(v / ext.log_avg + 1);
- // convert HSV to RGB
- // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
- float c = v * s;
- float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
- float m = v - c;
- float r, g, b;
- switch ((uint8_t) h) {
- case 0: {
- r = c;
- g = x;
- b = 0;
- break;
- }
- case 1: {
- r = x;
- g = c;
- b = 0;
- break;
- }
- case 2: {
- r = 0;
- g = c;
- b = x;
- break;
- }
- case 3: {
- r = 0;
- g = x;
- b = c;
- break;
- }
- case 4: {
- r = x;
- g = 0;
- b = c;
- break;
- }
- case 5: {
- r = c;
- g = 0;
- b = x;
- break;
- }
- default: {
- assert(false);
- }
- }
- constexpr float out_factor = type_max_value<ImgElemT>::value;
- ImgT out;
- out.x = out_factor * (r + m);
- out.y = out_factor * (g + m);
- out.z = out_factor * (b + m);
- *p_out = out;
- }
- };
- // special kernels
- __global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
- enhance_coeff *p_out) {
- float max_v = *p_max_v;
- float sum_log_v = *p_sum_log_v;
- float log_avg = exp(sum_log_v / n);
- float norm_factor = 1.0f / (log(max_v / log_avg + 1));
- p_out->log_avg = log_avg;
- p_out->norm_factor = norm_factor;
- }
- // calling endpoints
- template<typename T>
- void call_reduce_max(T *in, T *out, size_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- using FuncType = reduce_max_func<T>;
- constexpr T InitVal = std::numeric_limits<T>::min();
- auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
- helper_func(in, out, n, block_size, grid_dim, stream);
- }
- template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
- template<typename T>
- void call_reduce_min(T *in, T *out, size_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- using FuncType = reduce_min_func<T>;
- constexpr T InitVal = std::numeric_limits<T>::max();
- auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
- helper_func(in, out, n, block_size, grid_dim, stream);
- }
- template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
- template<typename T>
- void call_reduce_sum(T *in, T *out, size_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- using FuncType = reduce_sum_func<T>;
- auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
- helper_func(in, out, n, block_size, grid_dim, stream);
- }
- template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
- template<typename T>
- void call_reduce_log_sum(T *in, T *out, size_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- using UpdateFuncType = update_log_sum_func<T>;
- using ReduceFuncType = reduce_sum_func<T>;
- auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
- helper_func(in, out, n, block_size, grid_dim, stream);
- }
- template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
- template<typename InT, typename OutT>
- void call_rgb_extract_v(InT *in, OutT *out, size_t n,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- assert(n <= std::numeric_limits<uint32_t>::max());
- using FuncType = rgb_extract_v_func<InT, OutT>;
- elementwise_any<InT, OutT, FuncType><<<grid_dim, block_size, 0, stream>>>(in, out, n);
- }
- template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
- void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
- enhance_coeff *out, cudaStream_t stream) {
- prepare_enhance_coeff<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
- }
- void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- assert(n <= std::numeric_limits<uint32_t>::max());
- auto kernel_func = elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>;
- kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
- }
- template<typename ImgT>
- void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
- uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
- assert(n <= std::numeric_limits<uint32_t>::max());
- using FuncType = enhance_image_func<ImgT>;
- auto kernel_func = elementwise_ext_any<ImgT, ImgT, enhance_coeff, FuncType>;
- kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
- }
- template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
|