// image_process.cpp

#include "image_process.h"
#include "core/cuda_helper.hpp"
#include "core/image_utility.hpp"
#include "core/memory_pool.h"
#include "cuda_impl/process_kernels.cuh"
#include <opencv2/cudaimgproc.hpp>
#include <boost/noncopyable.hpp>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <memory>
#include <type_traits>

namespace process_impl {
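
// Grow-only host-side (MEM_HOST) staging buffer: create() only reallocates
// when the requested length exceeds the current capacity, so repeated
// per-frame calls with a stable size are allocation-free.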
template<typename T>
struct smart_buffer : private boost::noncopyable {
    static_assert(std::is_trivial_v<T>);
    T *ptr = nullptr;
    size_t length = 0;
    smart_buffer() = default;
    template<typename U = T>
    smart_buffer(const smart_buffer<U> &other) = delete;
    ~smart_buffer() {
        MEM_DEALLOC(ptr);
    }
    void create(size_t req_length) {
        if (req_length > capacity) [[unlikely]] {
            MEM_DEALLOC(ptr);
            ptr = MEM_ALLOC(T, req_length, MEM_HOST);
            capacity = req_length;
        }
        length = req_length;
    }
    size_t size() const {
        return length * sizeof(T);
    }
private:
    size_t capacity = 0;
};
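
// Device-side counterpart of smart_buffer with the same grow-only policy,
// plus upload/download helpers that may reinterpret the element type
// (hence the byte-divisibility asserts).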
template<typename T>
struct smart_gpu_buffer : private boost::noncopyable {
    T *ptr = nullptr;
    size_t size = 0;
    smart_gpu_buffer() = default;
    template<typename U = T>
    smart_gpu_buffer(const smart_gpu_buffer<U> &other) = delete;
    ~smart_gpu_buffer() {
        deallocate();
    }
    void create(size_t req_size) {
        if (req_size > capacity) [[unlikely]] {
            deallocate();
            ptr = MEM_ALLOC(T, req_size, MEM_CUDA);
            capacity = req_size;
        }
        size = req_size;
    }
    template<typename U = T>
    void upload_from(const smart_buffer<U> &buf, cudaStream_t stream = nullptr) {
        assert(buf.length * sizeof(U) % sizeof(T) == 0);
        create(buf.length * sizeof(U) / sizeof(T));
        if (stream == nullptr) {
            CUDA_API_CHECK(cudaMemcpy(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice));
        } else {
            CUDA_API_CHECK(cudaMemcpyAsync(ptr, buf.ptr, buf.length * sizeof(U), cudaMemcpyHostToDevice, stream));
        }
    }
    template<typename U = T>
    void upload_from(const U *src_ptr, size_t src_size, cudaStream_t stream = nullptr) {
        assert(src_size * sizeof(U) % sizeof(T) == 0);
        create(src_size * sizeof(U) / sizeof(T));
        if (stream == nullptr) {
            CUDA_API_CHECK(cudaMemcpy(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice));
        } else {
            CUDA_API_CHECK(cudaMemcpyAsync(ptr, src_ptr, src_size * sizeof(U), cudaMemcpyHostToDevice, stream));
        }
    }
    template<typename U = T>
    void download_to(smart_buffer<U> *buf, cudaStream_t stream = nullptr) {
        assert(size * sizeof(T) % sizeof(U) == 0);
        buf->create(size * sizeof(T) / sizeof(U));
        if (stream == nullptr) {
            CUDA_API_CHECK(cudaMemcpy(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost));
        } else {
            CUDA_API_CHECK(cudaMemcpyAsync(buf->ptr, ptr, size * sizeof(T), cudaMemcpyDeviceToHost, stream));
        }
    }
private:
    size_t capacity = 0;
    void deallocate() {
        if (ptr == nullptr) return;
        MEM_DEALLOC(ptr);
        ptr = nullptr;
    }
};
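
// RAII wrapper for a cudaTextureObject_t backed by a pitched GpuMat.
// create() rebuilds the texture only when the underlying device pointer
// changes; a reallocation that happens to reuse the same address would
// not be detected.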
struct smart_cuda_texture {
    cudaTextureObject_t obj = 0;
    int mat_type = -1;
    ~smart_cuda_texture() {
        deallocate();
    }
    smart_cuda_texture() = default;
    smart_cuda_texture(const smart_cuda_texture &other) = delete;
    void create(const cv::cuda::GpuMat &mat) {
        if (last_ptr != mat.cudaPtr()) [[unlikely]] {
            deallocate();
            allocate(mat);
        }
    }
private:
    void *last_ptr = nullptr;
    void allocate(const cv::cuda::GpuMat &mat) {
        auto res_desc = cudaResourceDesc{};
        res_desc.resType = cudaResourceTypePitch2D;
        res_desc.res.pitch2D.devPtr = mat.cudaPtr();
        res_desc.res.pitch2D.width = mat.cols;
        res_desc.res.pitch2D.height = mat.rows;
        res_desc.res.pitch2D.pitchInBytes = mat.step;
        auto tex_desc = cudaTextureDesc{};
        tex_desc.addressMode[0] = cudaAddressModeClamp;
        tex_desc.addressMode[1] = cudaAddressModeClamp;
        tex_desc.filterMode = cudaFilterModeLinear;
        tex_desc.readMode = cudaReadModeNormalizedFloat;
        tex_desc.normalizedCoords = true;
        mat_type = mat.type();
        switch (mat_type) {
            case CV_8UC1: {
                res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uint8_t>();
                break;
            }
            case CV_8UC4: {
                res_desc.res.pitch2D.desc = cudaCreateChannelDesc<uchar4>();
                break;
            }
            default: {
                RET_ERROR;
            }
        }
        assert(obj == 0);
        CUDA_API_CHECK(cudaCreateTextureObject(&obj, &res_desc, &tex_desc, nullptr));
        last_ptr = mat.cudaPtr();
    }
    void deallocate() {
        if (obj == 0) return;
        CUDA_API_CHECK(cudaDestroyTextureObject(obj));
        last_ptr = nullptr;
        obj = 0;
    }
};
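
// Convert pixel-unit intrinsics to the resolution-normalized form the CUDA
// kernels consume (focal lengths and principal point in [0, 1] units).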
camera_info to_camera_info(const camera_intrinsic &cam) {
    camera_info ret{};
    ret.fx = cam.fx / cam.width;
    ret.fy = cam.fy / cam.height;
    ret.cx = cam.cx / cam.width;
    ret.cy = cam.cy / cam.height;
    ret.k[0] = cam.k[0];
    ret.k[1] = cam.k[1];
    return ret;
}

void opencv_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
    switch (in.type()) {
        case CV_8UC1: {
            cv::cuda::cvtColor(in, *out, cv::COLOR_BayerRG2BGR, 3, stream);
            return;
        }
    }
    unreachable();
}

void opencv_gray2rgb(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cv::cuda::Stream &stream) {
    switch (in.type()) {
        case CV_8UC1: {
            cv::cuda::cvtColor(in, *out, cv::COLOR_GRAY2BGR, 3, stream);
            return;
        }
    }
    unreachable();
}
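
// Zero-copy view of a GpuMat as the plain pitched-image struct passed to
// the CUDA kernels.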
template<typename T>
image_type<T> to_image_type(const cv::cuda::GpuMat &mat) {
    assert(sizeof(T) == CV_ELEM_SIZE(mat.type()));
    auto ret = image_type<T>();
    ret.ptr = (T *) mat.cudaPtr();
    ret.pitch = mat.step;
    ret.width = mat.cols;
    ret.height = mat.rows;
    return ret;
}
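
// flatten() packs a pitched GpuMat into a contiguous 1-D device buffer via
// cudaMemcpy2DAsync; unflatten() performs the inverse copy. The elementwise
// enhancement kernels below operate on the flat layout.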
template<typename T>
void flatten(const cv::cuda::GpuMat &in, smart_gpu_buffer<T> *out, cudaStream_t stream) {
    assert(in.elemSize() == sizeof(T));
    out->create(in.size().area());
    auto flatten_pitch = in.cols * in.elemSize();
    CUDA_API_CHECK(cudaMemcpy2DAsync(out->ptr, flatten_pitch, in.cudaPtr(), in.step,
                                     flatten_pitch, in.size().height, cudaMemcpyDeviceToDevice, stream));
}

template<typename T>
void unflatten(const smart_gpu_buffer<T> &in, cv::cuda::GpuMat *out,
               cv::Size size, int type, cudaStream_t stream) {
    assert(sizeof(T) == CV_ELEM_SIZE(type));
    assert(in.size == size.area());
    out->create(size, type);
    auto flatten_pitch = out->cols * out->elemSize();
    CUDA_API_CHECK(cudaMemcpy2DAsync(out->cudaPtr(), out->step, in.ptr, flatten_pitch,
                                     flatten_pitch, out->size().height, cudaMemcpyDeviceToDevice, stream));
}
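
// Fast half-resolution debayer: each 2x2 Bayer cell collapses into one
// output pixel, so the result is (in.cols / 2) x (in.rows / 2). The alpha
// variant emits CV_8UC4, which can back a CUDA texture directly.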
void crude_debayer(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out,
                   bool alpha, cudaStream_t stream) {
    constexpr uint2 block_size = {32, 4};
    constexpr uint2 grid_dim = {8, 128};
    auto out_size = cv::Size{in.cols >> 1, in.rows >> 1};
    switch (in.type()) {
        case CV_8UC1: {
            if (alpha) {
                out->create(out_size, CV_8UC4);
                call_crude_debayer(to_image_type<uint8_t>(in),
                                   to_image_type<uchar4>(*out),
                                   block_size, grid_dim, stream);
            } else {
                out->create(out_size, CV_8UC3);
                call_crude_debayer(to_image_type<uint8_t>(in),
                                   to_image_type<uchar3>(*out),
                                   block_size, grid_dim, stream);
            }
            return;
        }
        default: {
            RET_ERROR;
        }
    }
}

// pixel coordinate to undistorted normalized plane
cv::Point2f undistort_point(const camera_intrinsic &info, cv::Point2f p) {
    auto u = (p.x - info.cx) / info.fx;
    auto v = (p.y - info.cy) / info.fy;
    auto r0 = sqrtf(u * u + v * v);
    if (r0 == 0) return {u, v}; // principal point: nothing to undistort
    // Invert the radial model r0 = r + k0 * r^3 + k1 * r^5 with Newton's
    // method: f(r) = k1 * r^5 + k0 * r^3 + r - r0, starting from r = r0.
    constexpr auto SOLVE_ITERATION_CNT = 4;
    auto r = r0;
    for (auto k = 0; k < SOLVE_ITERATION_CNT; ++k) {
        auto r2 = r * r;
        auto r3 = r2 * r;
        auto r4 = r3 * r;
        auto r5 = r4 * r;
        r -= (info.k[1] * r5 + info.k[0] * r3 + r - r0) /
             (5 * info.k[1] * r4 + 3 * info.k[0] * r2 + 1);
    }
    auto factor = r / r0;
    u *= factor;
    v *= factor;
    return {u, v};
}
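
// Undistort-and-resample through the CUDA texture: maps the symmetric
// normalized range [-range, +range] onto an output grid with square pixels
// of size 2 * range.height / height.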
void resample_image(const smart_cuda_texture &in, cv::cuda::GpuMat *out,
                    cv::Size2f range, const camera_intrinsic &cam, uint32_t height, cudaStream_t stream) {
    constexpr uint2 block_size = {32, 4};
    constexpr uint2 grid_dim = {8, 128};
    float ps = 2 * range.height / height;
    uint32_t width = 2 * range.width / ps;
    width = (width + 3) & ~3u; // round width up to a multiple of 4 to make OpenGL happy
    resample_info info{};
    info.x = -range.width;
    info.y = -range.height;
    info.ps = ps;
    switch (in.mat_type) {
        case CV_8UC1: {
            out->create(height, width, CV_8UC1);
            call_resample_image(in.obj, to_image_type<uint8_t>(*out), info,
                                to_camera_info(cam), block_size, grid_dim, stream);
            return;
        }
        case CV_8UC4: {
            out->create(height, width, CV_8UC3);
            call_resample_image(in.obj, to_image_type<uchar3>(*out), info,
                                to_camera_info(cam), block_size, grid_dim, stream);
            return;
        }
        default: {
            RET_ERROR;
        }
    }
}

} // namespace process_impl

using namespace process_impl;
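
// All scratch state lives in the pimpl so GpuMats, device buffers and the
// texture object are reused across frames instead of being reallocated.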
struct monocular_processor::impl {
    cv::cuda::GpuMat rgba_dev;
    cv::cuda::GpuMat resample_dev;
    cv::cuda::GpuMat ugly_out; // TODO: ugly hack
    smart_cuda_texture resample_tex;
    smart_gpu_buffer<uchar3> rgb_f;
    smart_gpu_buffer<float> hsv_v_f;
    smart_gpu_buffer<float> hsv_v_max, hsv_v_sum_log;
    smart_gpu_buffer<enhance_coeff> enhance_ext;
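
    // Global enhancement on the flattened RGB buffer: reduce the HSV V
    // channel to per-block max and log-sum partials, fold them into the
    // enhancement coefficients, then apply them to the pixels in place.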
    void enhance_image(const cv::cuda::GpuMat &in, cv::cuda::GpuMat *out, cudaStream_t stream) {
        assert(in.type() == CV_8UC3);
        // flatten image into a line
        flatten(in, &rgb_f, stream);
        auto line_size = rgb_f.size;
        // extract V channel of HSV
        constexpr auto block_size = 256;
        constexpr auto grid_dim = 512;
        hsv_v_f.create(rgb_f.size);
        call_rgb_extract_v(rgb_f.ptr, hsv_v_f.ptr, line_size,
                           block_size, grid_dim, stream);
        // reduce enhance coefficients
        hsv_v_max.create(grid_dim);
        call_reduce_max(hsv_v_f.ptr, hsv_v_max.ptr, line_size,
                        block_size, grid_dim, stream);
        hsv_v_sum_log.create(grid_dim);
        call_reduce_log_sum(hsv_v_f.ptr, hsv_v_sum_log.ptr, line_size,
                            block_size, grid_dim, stream);
        // prepare enhance coefficients
        enhance_ext.create(1);
        call_prepare_enhance_coeff(hsv_v_max.ptr, hsv_v_sum_log.ptr,
                                   line_size, enhance_ext.ptr, stream);
        // enhance image
        call_enhance_image(rgb_f.ptr, rgb_f.ptr, line_size, enhance_ext.ptr,
                           block_size, grid_dim, stream);
        // unflatten image
        unflatten(rgb_f, out, in.size(), CV_8UC3, stream);
    }
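
    // Pipeline: mono input takes optional undistort then gray-to-RGB; Bayer
    // input takes debayer then optional undistort. Undistortion requires the
    // crude debayer, since only its CV_8UC4 output can back the resample
    // texture. Optional enhancement runs last on either path.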
    image_u8c3 process(const image_u8c1 &in, process_config conf) {
        auto cuda_stream = conf.stream->cuda;
        auto cv_stream = conf.stream->cv;
        auto in_mat = in->as_cuda(conf.stream);
        if (conf.is_mono) {
            // undistort
            if (conf.undistort) {
                resample_tex.create(in_mat);
                resample_image(resample_tex, &resample_dev, conf.valid_range,
                               conf.camera, conf.resample_height, cuda_stream);
            } else {
                resample_dev = in_mat;
            }
            // Mono -> RGB
            opencv_gray2rgb(resample_dev, &ugly_out, cv_stream);
        } else {
            // debayer
            if (conf.crude_debayer) {
                if (conf.undistort) {
                    crude_debayer(in_mat, &rgba_dev, true, cuda_stream);
                } else {
                    crude_debayer(in_mat, &ugly_out, false, cuda_stream);
                }
            } else {
                assert(!conf.undistort);
                opencv_debayer(in_mat, &ugly_out, cv_stream);
            }
            // undistort
            if (conf.undistort) {
                assert(conf.crude_debayer);
                resample_tex.create(rgba_dev);
                resample_image(resample_tex, &ugly_out, conf.valid_range,
                               conf.camera, conf.resample_height, cuda_stream);
            }
        }
        // enhance image
        if (conf.enhance) {
            enhance_image(ugly_out, &ugly_out, cuda_stream);
        }
        auto out_info = create_image_info<uchar3>(ugly_out.size(), MEM_CUDA);
        out_info.fill_from_async(ugly_out, conf.stream);
        return create_image(out_info);
    }
};

monocular_processor::monocular_processor()
    : pimpl(std::make_unique<impl>()) {}

monocular_processor::~monocular_processor() = default;

image_u8c3 monocular_processor::process(const image_u8c1 &in, process_config conf) {
    return pimpl->process(in, conf);
}
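
// Largest symmetric (u, v) range on the normalized plane that stays inside
// the undistorted images of both cameras: probe the midpoints of the four
// image borders of each camera and take the tightest limit. When requested,
// `angle` receives the corresponding full vertical field of view.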
cv::Size2f calc_valid_range(const camera_intrinsic &left, const camera_intrinsic &right, float *angle) {
    auto u_lim = std::min({-undistort_point(left, {0, left.cy}).x,
                           undistort_point(left, {(float) left.width, left.cy}).x,
                           -undistort_point(right, {0, right.cy}).x,
                           undistort_point(right, {(float) right.width, right.cy}).x});
    auto v_lim = std::min({-undistort_point(left, {left.cx, 0}).y,
                           undistort_point(left, {left.cx, (float) left.height}).y,
                           -undistort_point(right, {right.cx, 0}).y,
                           undistort_point(right, {right.cx, (float) right.height}).y});
    if (angle != nullptr) {
        *angle = 2 * atanf(v_lim);
    }
    return {u_lim, v_lim};
}