@@ -55,6 +55,29 @@ namespace yuv_to_rgb {
 
 }
 
+namespace rgb_to_yuv {
+
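+    // glm::mat3 takes its arguments in column-major order: each source line
+    // below is one column, so the matrix rows are the full-range BT.709 Y/U/V coefficients.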
+    // @formatter:off
+    __device__ __constant__ auto cvt_mat = glm::mat3( // BT.709
+            0.2126, -0.1146, 0.5,
+            0.7152, -0.3854, -0.4542,
+            0.0722, 0.5, -0.0458);
+    // @formatter:on
+
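+    // Converts one full-range RGB pixel to YUV. The matrix leaves U and V
+    // centred on zero; the 0.5 offsets shift them back to mid-range before
+    // the result is converted back to bytes.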
+    struct cvt {
+        __device__ static constexpr uchar3 Op(uchar3 in) {
+            auto yuv = cvt_mat * to_vec3(in);
+            return to_uchar3(yuv + glm::vec3(0, 0.5, 0.5));
+        }
+    };
+
+}
+
 __global__ void nv12_to_rgb(image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img,
                             image_type_v2<uchar3> rgb_img) {
 
@@ -85,15 +108,170 @@ __global__ void nv12_to_rgb(image_type_v2<uchar1> luma_img, image_type_v2<uchar2
     }
 }
 
-void call_nv12_to_rgb(image_type_v2<uchar1> in,
-                      image_type_v2<uchar3> out,
-                      cudaStream_t stream) {
+__global__ void nv12_to_yuv(image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img,
+                            image_type_v2<uchar3> yuv_img) {
+
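+    // Grid-stride loops over the half-resolution chroma plane: each thread
+    // expands one interleaved UV sample into the 2x2 luma block that shares it.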
+    for (auto idy = blockDim.y * blockIdx.y + threadIdx.y;
+         idy < chroma_img.height;
+         idy += gridDim.y * blockDim.y) {
+
+        for (auto idx = blockDim.x * blockIdx.x + threadIdx.x;
+             idx < chroma_img.width;
+             idx += gridDim.x * blockDim.x) {
+
+            auto chroma = *chroma_img.at(idy, idx);
+
+#pragma unroll
+            for (auto dy = 0; dy < 2; ++dy) {
+                auto iy = 2 * idy + dy, ix = 2 * idx;
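+                // Read the two horizontally adjacent luma bytes as one uchar2.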
+                auto luma_pack = *(uchar2 *) luma_img.at(iy, ix);
+                auto yuv_1 = uchar3(luma_pack.x, chroma.x, chroma.y);
+                auto yuv_2 = uchar3(luma_pack.y, chroma.x, chroma.y);
+
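+                // Write both 3-byte output pixels through one packed assignment.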
+                using yuv_pack_type = cuda::std::tuple<uchar3, uchar3>;
+                *(yuv_pack_type *) yuv_img.at(iy, ix) =
+                        cuda::std::make_tuple(yuv_1, yuv_2);
+            }
+        }
+    }
+}
+
+__global__ void rgb_to_nv12(image_type_v2<uchar3> rgb_img,
+                            image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img) {
+
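+    // Walk the chroma grid: each thread converts one 2x2 RGB block, emitting
+    // four luma bytes and the box average of the four chroma samples.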
+    for (auto idy = blockDim.y * blockIdx.y + threadIdx.y;
+         idy < chroma_img.height;
+         idy += gridDim.y * blockDim.y) {
+
+        for (auto idx = blockDim.x * blockIdx.x + threadIdx.x;
+             idx < chroma_img.width;
+             idx += gridDim.x * blockDim.x) {
+
+            ushort2 chroma_sum = {};
+
+#pragma unroll
+            for (auto dy = 0; dy < 2; ++dy) {
+                auto iy = 2 * idy + dy, ix = 2 * idx;
+
+                using rgb_pack_type = cuda::std::tuple<uchar3, uchar3>;
+                auto rgb_pack = *(rgb_pack_type *) rgb_img.at(iy, ix);
+                auto rgb_1 = cuda::std::get<0>(rgb_pack);
+                auto rgb_2 = cuda::std::get<1>(rgb_pack);
+                auto yuv_1 = rgb_to_yuv::cvt::Op(rgb_1);
+                auto yuv_2 = rgb_to_yuv::cvt::Op(rgb_2);
+
+                auto luma_pack = uchar2(yuv_1.x, yuv_2.x);
+                *(uchar2 *) luma_img.at(iy, ix) = luma_pack;
+                chroma_sum.x += yuv_1.y + yuv_2.y;
+                chroma_sum.y += yuv_1.z + yuv_2.z;
+            }
+
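+            // chroma_sum now holds four chroma samples; >> 2 is the 2x2 box average.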
+            auto chroma = uchar2(chroma_sum.x >> 2,
+                                 chroma_sum.y >> 2);
+            *chroma_img.at(idy, idx) = chroma;
+        }
+    }
+}
+
+__global__ void yuv_to_nv12(image_type_v2<uchar3> yuv_img,
+                            image_type_v2<uchar1> luma_img, image_type_v2<uchar2> chroma_img) {
+
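+    // Same 4:2:0 repacking as rgb_to_nv12, but the input is already YUV, so
+    // the pixels are split into planes without any matrix conversion.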
+    for (auto idy = blockDim.y * blockIdx.y + threadIdx.y;
+         idy < chroma_img.height;
+         idy += gridDim.y * blockDim.y) {
+
+        for (auto idx = blockDim.x * blockIdx.x + threadIdx.x;
+             idx < chroma_img.width;
+             idx += gridDim.x * blockDim.x) {
+
+            ushort2 chroma_sum = {};
+
+#pragma unroll
+            for (auto dy = 0; dy < 2; ++dy) {
+                auto iy = 2 * idy + dy, ix = 2 * idx;
+
+                using yuv_pack_type = cuda::std::tuple<uchar3, uchar3>;
+                auto yuv_pack = *(yuv_pack_type *) yuv_img.at(iy, ix);
+                auto yuv_1 = cuda::std::get<0>(yuv_pack);
+                auto yuv_2 = cuda::std::get<1>(yuv_pack);
+
+                auto luma_pack = uchar2(yuv_1.x, yuv_2.x);
+                *(uchar2 *) luma_img.at(iy, ix) = luma_pack;
+                chroma_sum.x += yuv_1.y + yuv_2.y;
+                chroma_sum.y += yuv_1.z + yuv_2.z;
+            }
+
+            auto chroma = uchar2(chroma_sum.x >> 2,
+                                 chroma_sum.y >> 2);
+            *chroma_img.at(idy, idx) = chroma;
+        }
+    }
+}
+
+using luma_chroma_type =
+        std::tuple<image_type_v2<uchar1>, image_type_v2<uchar2>>;
+
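+// NV12 stores the full-resolution luma plane followed by a half-height plane
+// of interleaved UV pairs, so the backing buffer is 3/2 the image height.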
+luma_chroma_type split_chroma_luma(image_type_v2<uchar1> img) {
+    assert(img.height % 3 == 0);
+    auto img_height = img.height / 3 * 2;
+    auto luma_img = img.sub_image(0, 0, -1, img_height);
+    auto chroma_img = img.sub_image(img_height).cast<uchar2>();
+    return std::make_tuple(luma_img, chroma_img);
+}
+
+void ensure_nv12_normal_compatible(image_type_v2<uchar1> in,
+                                   image_type_v2<uchar3> out) {
     assert(in.height % 3 == 0);
     auto img_height = in.height / 3 * 2;
     assert(out.width == in.width);
     assert(out.height == img_height);
-    auto luma_img = in.sub_image(0, 0, -1, img_height);
-    auto chroma_img = in.sub_image(img_height).cast<uchar2>();
+}
+
+void call_nv12_to_rgb(image_type_v2<uchar1> in,
+                      image_type_v2<uchar3> out,
+                      cudaStream_t stream) {
+    ensure_nv12_normal_compatible(in, out);
+    auto [luma_img, chroma_img] = split_chroma_luma(in);
     auto [grid_dim, block_dim] = get_kernel_size(chroma_img.width, chroma_img.height);
     nv12_to_rgb<<<grid_dim, block_dim, 0, stream>>>(luma_img, chroma_img, out);
+}
+
+void call_nv12_to_yuv(image_type_v2<uchar1> in,
+                      image_type_v2<uchar3> out,
+                      cudaStream_t stream) {
+    ensure_nv12_normal_compatible(in, out);
+    auto [luma_img, chroma_img] = split_chroma_luma(in);
+    auto [grid_dim, block_dim] = get_kernel_size(chroma_img.width, chroma_img.height);
+    nv12_to_yuv<<<grid_dim, block_dim, 0, stream>>>(luma_img, chroma_img, out);
+}
+
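+// In the *_to_nv12 direction the NV12 buffer is the output, so the shape
+// check and the plane split are applied to out rather than in.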
+void call_rgb_to_nv12(image_type_v2<uchar3> in,
+                      image_type_v2<uchar1> out,
+                      cudaStream_t stream) {
+    ensure_nv12_normal_compatible(out, in);
+    auto [luma_img, chroma_img] = split_chroma_luma(out);
+    auto [grid_dim, block_dim] = get_kernel_size(chroma_img.width, chroma_img.height);
+    rgb_to_nv12<<<grid_dim, block_dim, 0, stream>>>(in, luma_img, chroma_img);
+}
+
+void call_yuv_to_nv12(image_type_v2<uchar3> in,
+                      image_type_v2<uchar1> out,
+                      cudaStream_t stream) {
+    ensure_nv12_normal_compatible(out, in);
+    auto [luma_img, chroma_img] = split_chroma_luma(out);
+    auto [grid_dim, block_dim] = get_kernel_size(chroma_img.width, chroma_img.height);
+    yuv_to_nv12<<<grid_dim, block_dim, 0, stream>>>(in, luma_img, chroma_img);
 }