// process_kernels.cu

#include "process_kernels.cuh"
#include <cassert>
#include <cstdint>
#include <limits>
#include <type_traits>

// kernel templates
template<typename OutT, typename ReduceFunc, uint16_t BlockSize>
__device__ void warp_reduce(volatile OutT *s_buf, uint32_t tdx) {
    static_assert(std::is_fundamental_v<OutT>);
    if constexpr (BlockSize >= 64) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 32]);
    }
    if constexpr (BlockSize >= 32) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 16]);
    }
    if constexpr (BlockSize >= 16) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 8]);
    }
    if constexpr (BlockSize >= 8) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 4]);
    }
    if constexpr (BlockSize >= 4) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 2]);
    }
    if constexpr (BlockSize >= 2) {
        ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 1]);
    }
}
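// Alternative final-warp step, shown only as an illustrative sketch (it is not
// used by the kernels in this file): warp shuffle intrinsics keep the running
// value in a register instead of volatile shared memory. ReduceFunc::Op's
// volatile pointer parameter binds to &val through the usual qualification
// conversion.
template<typename OutT, typename ReduceFunc>
__device__ OutT warp_reduce_shfl(OutT val) {
    // fold lanes 16, 8, 4, 2, 1 into the current lane's register
    for (int offset = 16; offset > 0; offset >>= 1) {
        OutT other = __shfl_down_sync(0xffffffffu, val, offset);
        ReduceFunc::Op(&val, other);
    }
    return val;
}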
template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal, uint16_t BlockSize>
__global__ void reduce_any(InT *in, OutT *out, uint32_t n) {
    extern __shared__ int shmem[];
    auto s_buf = (OutT *) shmem;
    uint32_t tdx = threadIdx.x;
    uint32_t bkx = blockIdx.x;
    uint32_t grid_size = BlockSize * gridDim.x;
    OutT t_out = InitVal;
    // accumulate this thread's grid-stride slice of the input
    for (uint32_t i = bkx * blockDim.x + tdx; i < n; i += grid_size) {
        UpdateFunc::Op(&t_out, in[i]);
    }
    // write the per-thread partial to shared memory
    s_buf[tdx] = t_out;
    __syncthreads();
    if constexpr (BlockSize >= 512) {
        if (tdx < 256) {
            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 256]);
        }
        __syncthreads();
    }
    if constexpr (BlockSize >= 256) {
        if (tdx < 128) {
            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 128]);
        }
        __syncthreads();
    }
    if constexpr (BlockSize >= 128) {
        if (tdx < 64) {
            ReduceFunc::Op(&s_buf[tdx], s_buf[tdx + 64]);
        }
        __syncthreads();
    }
    if (tdx < 32) {
        warp_reduce<OutT, ReduceFunc, BlockSize>(s_buf, tdx);
    }
    if (tdx == 0) {
        out[bkx] = s_buf[0];
    }
}
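// Illustrative trace (assumed launch, not additional code): with BlockSize = 256
// each thread first folds its grid-stride slice of `in` into t_out, the block then
// halves the shared buffer 256 -> 128 -> 64 with the __syncthreads()-guarded steps,
// warp_reduce finishes 64 -> 1 inside the first warp, and thread 0 writes one
// partial result per block to out[blockIdx.x].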
template<typename InT, typename OutT, typename Func>
__global__ void elementwise_any(InT *in, OutT *out, uint32_t n) {
    uint32_t tdx = threadIdx.x;
    uint32_t bkx = blockIdx.x;
    uint32_t grid_size = blockDim.x * gridDim.x;
    for (uint32_t i = bkx * blockDim.x + tdx; i < n; i += grid_size) {
        Func::Op(&out[i], in[i]);
    }
}
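// Grid-stride indexing example (assumed launch sizes): with blockDim.x = 256 and
// gridDim.x = 4, the thread at blockIdx.x = 1, threadIdx.x = 3 visits
// i = 259, 1283, 2307, ... until i >= n, so a fixed launch covers any n.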
template<typename InT, typename OutT, typename ExtT, typename Func>
__global__ void elementwise_ext_any(InT *in, OutT *out, uint32_t n, ExtT *p_ext) {
    uint32_t tdx = threadIdx.x;
    uint32_t bkx = blockIdx.x;
    uint32_t grid_size = blockDim.x * gridDim.x;
    // load extra values
    ExtT ext = *p_ext;
    for (uint32_t i = bkx * blockDim.x + tdx; i < n; i += grid_size) {
        Func::Op(&out[i], in[i], ext);
    }
}
template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
void call_reduce_any_kernel(InT *in, OutT *out, uint32_t n,
                            uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    auto shmem_size = block_size * (1 + (block_size <= 32));
    auto shmem_length = shmem_size * sizeof(OutT);
    switch (block_size) {
        case 512: {
            constexpr uint16_t BlockSize = 512;
            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
            return;
        }
        case 256: {
            constexpr uint16_t BlockSize = 256;
            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
            return;
        }
        case 128: {
            constexpr uint16_t BlockSize = 128;
            auto reduce_func = reduce_any<InT, OutT, UpdateFunc, ReduceFunc, InitVal, BlockSize>;
            reduce_func<<<grid_dim, BlockSize, shmem_length, stream>>>(in, out, n);
            return;
        }
        default: {
            assert(false);
        }
    }
}
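// The switch above turns the runtime block_size into the compile-time BlockSize
// template argument, so the `if constexpr` branches in reduce_any and warp_reduce
// are resolved at compile time; supporting another block size means adding a case.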
// result resides in out[0]
template<typename InT, typename OutT, typename UpdateFunc, typename ReduceFunc, OutT InitVal>
void call_reduce_any(InT *in, OutT *out, uint32_t n,
                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    { // first step: one partial result per block
        auto helper_func = call_reduce_any_kernel<InT, OutT, UpdateFunc, ReduceFunc, InitVal>;
        helper_func(in, out, n, block_size, grid_dim, stream);
    }
    { // second step: reduce the partials with a single block
        auto helper_func = call_reduce_any_kernel<OutT, OutT, ReduceFunc, ReduceFunc, InitVal>;
        helper_func(out, out, grid_dim, block_size, 1, stream);
    }
}
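// Two-pass data flow (illustrative numbers): with grid_dim = 64 the first pass
// leaves 64 per-block partials in out[0..63]; the second pass runs one block that
// folds them with ReduceFunc, so the final value lands in out[0]. `out` therefore
// needs room for grid_dim elements, not just one.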
// working functions
template<typename T>
struct type_max_value {
    static constexpr T value = std::numeric_limits<T>::max();
};
template<typename T>
struct reduce_max_func {
    static __device__ __forceinline__ void Op(volatile T *out, T val) {
        *out = max(*out, val);
    }
};
template<typename T>
struct reduce_min_func {
    static __device__ __forceinline__ void Op(volatile T *out, T val) {
        *out = min(*out, val);
    }
};
template<typename T>
struct reduce_sum_func {
    static __device__ __forceinline__ void Op(volatile T *out, T val) {
        *out = *out + val;
    }
};
template<typename T>
struct update_log_sum_func {
    static constexpr T eps = (T) 1e-6;
    static __device__ __forceinline__ void Op(T *out, T val) {
        *out += log(val + eps);
    }
};
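// eps keeps log() finite when val == 0; for the normalized V values in [0, 1]
// fed into this reduction below, the bias it adds is negligible.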
template<typename InT, typename OutT>
struct rgb_extract_v_func { // Extract V value of HSV from RGB
    static __device__ __forceinline__ void Op(OutT *out, InT in) {
        if constexpr (std::is_floating_point_v<OutT>) {
            using InElemT = decltype(in.x);
            constexpr OutT factor = (OutT) 1 / type_max_value<InElemT>::value;
            *out = factor * max(max(in.x, in.y), in.z);
        } else {
            *out = max(max(in.x, in.y), in.z);
        }
    }
};
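// Example (illustrative values): for a uchar3 pixel (255, 128, 0) with OutT = float,
// factor = 1/255 and V = 255 * factor = 1.0f; with an integral OutT the raw channel
// maximum (255) is stored instead.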
struct enhance_v_func {
    static __device__ __forceinline__ void Op(float *out, float in, enhance_coeff ext) {
        *out = ext.norm_factor * log(in / ext.log_avg + 1);
    }
};
template<typename ImgT>
struct enhance_image_func {
    static __device__ __forceinline__ void Op(ImgT *p_out, ImgT in, enhance_coeff ext) {
        // convert RGB to HSV
        // https://www.rapidtables.com/convert/color/rgb-to-hsv.html
        using ImgElemT = decltype(in.x);
        static_assert(std::is_integral_v<ImgElemT>);
        ImgElemT c_max = max(max(in.x, in.y), in.z);
        ImgElemT c_min = min(min(in.x, in.y), in.z);
        ImgElemT delta = c_max - c_min;
        float h; // hue in units of 60 degrees (the factor of 60 is never applied)
        if (delta == 0) {
            h = 0;
        } else {
            float delta_inv = 1.0f / delta;
            if (c_max == in.x) { // c_max == r
                h = delta_inv * (in.y - in.z); // (g-b)/delta % 6
                if (h < 0) {
                    h += 6;
                }
            } else if (c_max == in.y) { // c_max == g
                h = delta_inv * (in.z - in.x) + 2; // (b-r)/delta + 2
            } else { // c_max == b
                h = delta_inv * (in.x - in.y) + 4; // (r-g)/delta + 4
            }
        }
        float s;
        if (c_max == 0) {
            s = 0;
        } else {
            s = (float) delta / c_max;
        }
        constexpr float v_factor = 1.0f / type_max_value<ImgElemT>::value;
        float v = v_factor * (float) c_max;
        // enhance V channel
        v = ext.norm_factor * log(v / ext.log_avg + 1);
        // convert HSV to RGB
        // https://www.rapidtables.com/convert/color/hsv-to-rgb.html
        float c = v * s;
        float x = c * (1 - fabsf(fmodf(h, 2) - 1)); // c * (1 - |h % 2 - 1|)
        float m = v - c;
        float r, g, b;
        switch ((uint8_t) h) {
            case 0: {
                r = c;
                g = x;
                b = 0;
                break;
            }
            case 1: {
                r = x;
                g = c;
                b = 0;
                break;
            }
            case 2: {
                r = 0;
                g = c;
                b = x;
                break;
            }
            case 3: {
                r = 0;
                g = x;
                b = c;
                break;
            }
            case 4: {
                r = x;
                g = 0;
                b = c;
                break;
            }
            case 5: {
                r = c;
                g = 0;
                b = x;
                break;
            }
            default: {
                // unreachable for h in [0, 6); keep r/g/b defined if asserts are compiled out
                r = g = b = 0;
                assert(false);
            }
        }
        constexpr float out_factor = type_max_value<ImgElemT>::value;
        ImgT out;
        out.x = out_factor * (r + m);
        out.y = out_factor * (g + m);
        out.z = out_factor * (b + m);
        *p_out = out;
    }
};
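// Worked example (illustrative pixel, ignoring the V enhancement step): for
// in = (200, 100, 50), c_max = 200, c_min = 50, delta = 150, so
// h = (100 - 50) / 150 = 0.33 (sector 0), s = 150 / 200 = 0.75, v = 200 / 255 = 0.78.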
// special kernels
__global__ void prepare_enhance_coeff(float *p_max_v, float *p_sum_log_v, uint32_t n,
                                      enhance_coeff *p_out) {
    float max_v = *p_max_v;
    float sum_log_v = *p_sum_log_v;
    float log_avg = exp(sum_log_v / n);
    float norm_factor = 1.0f / (log(max_v / log_avg + 1));
    p_out->log_avg = log_avg;
    p_out->norm_factor = norm_factor;
}
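// In math form: log_avg = exp((1/n) * sum_i log(V_i + eps)), i.e. the geometric mean
// of V, and norm_factor = 1 / log(max_V / log_avg + 1), so the per-pixel mapping
// V' = norm_factor * log(V / log_avg + 1) sends max_V to exactly 1.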
// calling endpoints
template<typename T>
void call_reduce_max(T *in, T *out, size_t n,
                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using FuncType = reduce_max_func<T>;
    // lowest(), not min(): for floating-point T, min() is the smallest positive value
    constexpr T InitVal = std::numeric_limits<T>::lowest();
    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}
template void call_reduce_max(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
template<typename T>
void call_reduce_min(T *in, T *out, size_t n,
                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using FuncType = reduce_min_func<T>;
    constexpr T InitVal = std::numeric_limits<T>::max();
    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, InitVal>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}
template void call_reduce_min(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
template<typename T>
void call_reduce_sum(T *in, T *out, size_t n,
                     uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using FuncType = reduce_sum_func<T>;
    auto helper_func = call_reduce_any<T, T, FuncType, FuncType, (T) 0>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}
template void call_reduce_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
template<typename T>
void call_reduce_log_sum(T *in, T *out, size_t n,
                         uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using UpdateFuncType = update_log_sum_func<T>;
    using ReduceFuncType = reduce_sum_func<T>;
    auto helper_func = call_reduce_any<T, T, UpdateFuncType, ReduceFuncType, (T) 0>;
    helper_func(in, out, n, block_size, grid_dim, stream);
}
template void call_reduce_log_sum(float *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
template<typename InT, typename OutT>
void call_rgb_extract_v(InT *in, OutT *out, size_t n,
                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using FuncType = rgb_extract_v_func<InT, OutT>;
    elementwise_any<InT, OutT, FuncType><<<grid_dim, block_size, 0, stream>>>(in, out, n);
}
template void call_rgb_extract_v(uchar3 *, float *, size_t, uint16_t, uint16_t, cudaStream_t);
void call_prepare_enhance_coeff(float *max_v, float *sum_log_v, uint32_t n,
                                enhance_coeff *out, cudaStream_t stream) {
    prepare_enhance_coeff<<<1, 1, 0, stream>>>(max_v, sum_log_v, n, out);
}
void call_enhance_v(float *in, float *out, size_t n, enhance_coeff *ext,
                    uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    auto kernel_func = elementwise_ext_any<float, float, enhance_coeff, enhance_v_func>;
    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
}
template<typename ImgT>
void call_enhance_image(ImgT *in, ImgT *out, size_t n, enhance_coeff *ext,
                        uint16_t block_size, uint16_t grid_dim, cudaStream_t stream) {
    assert(n <= std::numeric_limits<uint32_t>::max());
    using FuncType = enhance_image_func<ImgT>;
    auto kernel_func = elementwise_ext_any<ImgT, ImgT, enhance_coeff, FuncType>;
    kernel_func<<<grid_dim, block_size, 0, stream>>>(in, out, n, ext);
}
template void call_enhance_image(uchar3 *, uchar3 *, size_t, enhance_coeff *, uint16_t, uint16_t, cudaStream_t);
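// End-to-end usage sketch (assumed wiring; the function name, buffer layout and
// launch sizes below are hypothetical and not part of this file's API). It chains
// the endpoints above on one stream: extract V, reduce its max and its log-sum,
// build the coefficients on device, then enhance the image in place.
// d_scratch must hold at least grid_dim + 1 floats: index 0 receives max_V and
// indices 1..grid_dim receive the log-sum partials / result.
void example_enhance_pipeline(uchar3 *d_img, float *d_v, float *d_scratch,
                              enhance_coeff *d_coeff, size_t n, cudaStream_t stream) {
    constexpr uint16_t block_size = 256;
    constexpr uint16_t grid_dim = 64;
    call_rgb_extract_v(d_img, d_v, n, block_size, grid_dim, stream);             // V channel
    call_reduce_max(d_v, d_scratch, n, block_size, grid_dim, stream);            // max_V -> d_scratch[0]
    call_reduce_log_sum(d_v, d_scratch + 1, n, block_size, grid_dim, stream);    // sum log V -> d_scratch[1]
    call_prepare_enhance_coeff(d_scratch, d_scratch + 1, (uint32_t) n, d_coeff, stream);
    call_enhance_image(d_img, d_img, n, d_coeff, block_size, grid_dim, stream);  // in-place enhance
}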