#include "image_process.h" #include "core/cuda_helper.hpp" #include "core/image_utility.hpp" #include "core/memory_pool.h" #include "cuda_impl/process_kernels.cuh" #include #include namespace process_impl { template struct smart_buffer : private boost::noncopyable { static_assert(std::is_trivial_v); T *ptr = nullptr; size_t length = 0; smart_buffer() = default; template smart_buffer(const smart_buffer &other) = delete; ~smart_buffer() { MEM_DEALLOC(ptr); } void create(size_t req_length) { if (req_length > capacity) [[unlikely]] { MEM_DEALLOC(ptr); MEM_ALLOC(T, req_length, MEM_HOST); capacity = req_length; } length = req_length; } size_t size() const { return length * sizeof(T); } private: size_t capacity = 0; }; template struct smart_gpu_buffer : private boost::noncopyable { T *ptr = nullptr; size_t size = 0; smart_gpu_buffer() = default; template smart_gpu_buffer(const smart_gpu_buffer &other) = delete; ~smart_gpu_buffer() { deallocate(); } void create(size_t req_size) { if (req_size > capacity) [[unlikely]] { deallocate(); ptr = MEM_ALLOC(T, req_size, MEM_CUDA); capacity = req_size; } size = req_size; } template void upload_from(const smart_buffer &buf, cudaStream_t stream = nullptr) { assert(buf.length * sizeof(U) % sizeof(T) == 0); create(buf.length * sizeof(U) / sizeof(T)); if (stream == nullptr) { CUDA_API_CHECK(cudaMemcpy(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice)); } else { CUDA_API_CHECK(cudaMemcpyAsync(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice, stream)); } } template void upload_from(const U *src_ptr, size_t src_size, cudaStream_t stream = nullptr) { assert(src_size * sizeof(U) % sizeof(T) == 0); create(src_size * sizeof(U) / sizeof(T)); if (stream == nullptr) { CUDA_API_CHECK(cudaMemcpy(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice)); } else { CUDA_API_CHECK(cudaMemcpyAsync(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice, stream)); } } template void download_to(smart_buffer *buf, cudaStream_t stream = nullptr) { assert(size * sizeof(T) % sizeof(U) == 0); buf->create(size * sizeof(T) / sizeof(U)); if (stream == nullptr) { CUDA_API_CHECK(cudaMemcpy(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost)); } else { CUDA_API_CHECK(cudaMemcpyAsync(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream)); } } private: size_t capacity = 0; void deallocate() { if (ptr == nullptr) return; MEM_DEALLOC(ptr); ptr = nullptr; } }; struct smart_cuda_texture { cudaTextureObject_t obj = 0; int mat_type = -1; ~smart_cuda_texture() { deallocate(); } smart_cuda_texture() = default; smart_cuda_texture(const smart_cuda_texture &other) = delete; void create(const cv::cuda::GpuMat &mat) { if (last_ptr != mat.cudaPtr()) [[unlikely]] { deallocate(); allocate(mat); } } private: void *last_ptr = nullptr; void allocate(const cv::cuda::GpuMat &mat) { auto res_desc = cudaResourceDesc{}; res_desc.resType = cudaResourceTypePitch2D; res_desc.res.pitch2D.devPtr = mat.cudaPtr(); res_desc.res.pitch2D.width = mat.cols; res_desc.res.pitch2D.height = mat.rows; res_desc.res.pitch2D.pitchInBytes = mat.step; auto tex_desc = cudaTextureDesc{}; tex_desc.addressMode[0] = cudaAddressModeClamp; tex_desc.addressMode[1] = cudaAddressModeClamp; tex_desc.filterMode = cudaFilterModeLinear; tex_desc.readMode = cudaReadModeNormalizedFloat; tex_desc.normalizedCoords = true; mat_type = mat.type(); switch (mat_type) { case CV_8UC1: { res_desc.res.pitch2D.desc = cudaCreateChannelDesc(); break; } case CV_8UC4: { res_desc.res.pitch2D.desc = cudaCreateChannelDesc(); break; } default: { RET_ERROR; } } assert(obj == 0); CUDA_API_CHECK(cudaCreateTextureObject(&obj, &res_desc, &tex_desc, nullptr)); last_ptr = mat.cudaPtr(); } void deallocate() { if (obj == 0) return; CUDA_API_CHECK(cudaDestroyTextureObject(obj)); last_ptr = nullptr; obj = 0; } }; camera_info to_camera_info(const camera_intrinsic &cam) { camera_info ret{}; ret.fx = cam.fx / cam.width; ret.fy = cam.fy / cam.height; ret.cx = cam.cx / cam.width; ret.cy = cam.cy / cam.height; ret.k[0] = cam.k[0]; ret.k[1] = cam.k[1]; return ret; } void opencv_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) { switch (in.type()) { case CV_8UC1: { cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2BGR, 3, stream); return; } } unreachable(); } void opencv_gray2rgb(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) { switch (in.type()) { case CV_8UC1: { cv::cuda::cvtColor(in, *out, cv::COLOR_GRAY2BGR, 3, stream); return; } } unreachable(); } template image_type to_image_type(const cv::cuda::GpuMat &mat) { assert(sizeof(T) == CV_ELEM_SIZE(mat.type())); auto ret = image_type(); ret.ptr = (T *) mat.cudaPtr(); ret.pitch = mat.step; ret.width = mat.cols; ret.height = mat.rows; return ret; } template void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer *out, cudaStream_t stream) { assert(in.elemSize() == sizeof(T)); out->create(in.size().area()); auto flatten_pitch = in.cols * in.elemSize(); CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, flatten_pitch, in.cudaPtr(), in.step, flatten_pitch, in.size().height, cudaMemcpyDeviceToDevice, stream)); } template void unflatten(const smart_gpu_buffer &in, cv::cuda::GpuMat *out, cv::Size size, int type, cudaStream_t stream) { assert(sizeof(T) == CV_ELEM_SIZE(type)); assert(in.size == size.area()); out->create(size, type); auto flatten_pitch = out->cols * out->elemSize(); CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step, in.ptr, flatten_pitch, flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream)); } void crude_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, bool alpha, cudaStream_t stream) { constexpr uint2 block_size = {32, 4}; constexpr uint2 grid_dim = {8, 128}; auto out_size = cv::Size{in.cols >> 1, in.rows >> 1}; switch (in.type()) { case CV_8UC1: { if (alpha) { out->create(out_size, CV_8UC4); call_crude_debayer(to_image_type(in), to_image_type(*out), block_size, grid_dim, stream); } else { out->create(out_size, CV_8UC3); call_crude_debayer(to_image_type(in), to_image_type(*out), block_size, grid_dim, stream); } return; } default: { RET_ERROR; } } } // pixel coordinate to undistorted normalized plane cv::Point2f undistort_point(const camera_intrinsic &info, cv::Point2f p) { auto u = (p.x - info.cx) / info.fx; auto v = (p.y - info.cy) / info.fy; auto r0 = sqrtf(u * u + v * v); // Newton's Method constexpr auto SOLVE_ITERATION_CNT = 4; auto r = r0; for (auto k = 0; k < SOLVE_ITERATION_CNT; ++k) { auto r2 = r * r; auto r3 = r2 * r; auto r4 = r3 * r; auto r5 = r4 * r; r -= (info.k[1] * r5 + info.k[0] * r3 + r - r0) / (5 * info.k[1] * r4 + 3 * info.k[0] * r2 + 1); } auto factor = r / r0; u *= factor; v *= factor; return {u, v}; } void resample_image(const smart_cuda_texture &in, cv::cuda::GpuMat *out, cv::Size2f range, const camera_intrinsic &cam, uint32_t height, cudaStream_t stream) { constexpr uint2 block_size = {32, 4}; constexpr uint2 grid_dim = {8, 128}; float ps = 2 * range.height / height; uint32_t width = 2 * range.width / ps; width = (width + 3) & (-4); // make OpenGL happy resample_info info{}; info.x = -range.width; info.y = -range.height; info.ps = ps; switch (in.mat_type) { case CV_8UC1: { out->create(height, width, CV_8UC1); call_resample_image(in.obj, to_image_type(*out), info, to_camera_info(cam), block_size, grid_dim, stream); return; } case CV_8UC4: { out->create(height, width, CV_8UC3); call_resample_image(in.obj, to_image_type(*out), info, to_camera_info(cam), block_size, grid_dim, stream); return; } default: { RET_ERROR; } } } } using namespace process_impl; struct monocular_processor::impl { cv::cuda::GpuMat rgba_dev; cv::cuda::GpuMat resample_dev; cv::cuda::GpuMat ugly_out; // TODO: ugly hack smart_cuda_texture resample_tex; smart_gpu_buffer rgb_f; smart_gpu_buffer hsv_v_f; smart_gpu_buffer hsv_v_max, hsv_v_sum_log; smart_gpu_buffer enhance_ext; void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) { assert(in.type() == CV_8UC3); // flatten image into a line flatten(in, &rgb_f, stream); auto line_size = rgb_f.size; // extract V channel of HSV constexpr auto block_size = 256; constexpr auto grid_dim = 512; hsv_v_f.create(rgb_f.size); call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, line_size, block_size, grid_dim, stream); // reduce enhance coefficients hsv_v_max.create(grid_dim); call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, line_size, block_size, grid_dim, stream); hsv_v_sum_log.create(grid_dim); call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, line_size, block_size, grid_dim, stream); // prepare enhance coefficients enhance_ext.create(1); call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr, line_size, enhance_ext.ptr, stream); // enhance image call_enhance_image(rgb_f.ptr, rgb_f.ptr, line_size, enhance_ext.ptr, block_size, grid_dim, stream); // unflatten image unflatten(rgb_f, out, in.size(), CV_8UC3, stream); } image_u8c3 process(const image_u8c1 &in, process_config conf) { auto cuda_stream = conf.stream->cuda; auto cv_stream = conf.stream->cv; auto in_mat = in->as_cuda(conf.stream); if (conf.is_mono) { // undistort if (conf.undistort) { resample_tex.create(in_mat); resample_image(resample_tex, &resample_dev, conf.valid_range, conf.camera, conf.resample_height, cuda_stream); } else { resample_dev = in_mat; } // Mono -> RGB opencv_gray2rgb(resample_dev, &ugly_out, cv_stream); } else { // debayer if (conf.crude_debayer) { if (conf.undistort) { crude_debayer(in_mat, &rgba_dev, true, cuda_stream); } else { crude_debayer(in_mat, &ugly_out, false, cuda_stream); } } else { assert(!conf.undistort); opencv_debayer(in_mat, &ugly_out, cv_stream); } // undistort if (conf.undistort) { assert(conf.crude_debayer); resample_tex.create(rgba_dev); resample_image(resample_tex, &ugly_out, conf.valid_range, conf.camera, conf.resample_height, cuda_stream); } } // enhance image if (conf.enhance) { enhance_image(ugly_out, &ugly_out, cuda_stream); } auto out_info = create_image_info(ugly_out.size(), MEM_CUDA); out_info.fill_from_async(ugly_out, conf.stream); return create_image(out_info); } }; monocular_processor::monocular_processor() : pimpl(std::make_unique()) {} monocular_processor::~monocular_processor() = default; image_u8c3 monocular_processor::process(const image_u8c1 &in, process_config conf) { return pimpl->process(in, conf); } cv::Size2f calc_valid_range(const camera_intrinsic &left, const camera_intrinsic &right, float *angle) { auto u_lim = std::min({-undistort_point(left, {0, left.cy}).x, undistort_point(left, {(float) left.width, left.cy}).x, -undistort_point(right, {0, right.cy}).x, undistort_point(right, {(float) right.width, right.cy}).x}); auto v_lim = std::min({-undistort_point(left, {left.cx, 0}).y, undistort_point(left, {left.cx, (float) left.height}).y, -undistort_point(right, {right.cx, 0}).y, undistort_point(right, {right.cx, (float) right.height}).y}); if (angle != nullptr) { *angle = 2 * atanf(v_lim); } return {u_lim, v_lim}; }