|
@@ -19,21 +19,12 @@ struct smart_buffer {
|
|
|
|
|
|
|
|
smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
|
|
smart_buffer(size_t _width, size_t _height, size_t _elem_cnt)
|
|
|
: width(_width), height(_height), elem_cnt(_elem_cnt) {
|
|
: width(_width), height(_height), elem_cnt(_elem_cnt) {
|
|
|
- malloc_memory();
|
|
|
|
|
|
|
+ auto width_bytes = width * elem_cnt * sizeof(T);
|
|
|
|
|
+ CALL_ASSERT_EXCEPTION(cudaMallocPitch(&ptr, &pitch, width_bytes, height) == cudaSuccess);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
~smart_buffer() {
|
|
~smart_buffer() {
|
|
|
- free_memory();
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- bool malloc_memory() {
|
|
|
|
|
- CUDA_API_CHECK(cudaMallocPitch(&ptr, &pitch, width * elem_cnt * sizeof(T), height));
|
|
|
|
|
- return true;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- bool free_memory() {
|
|
|
|
|
- CUDA_API_CHECK(cudaFree(ptr));
|
|
|
|
|
- return true;
|
|
|
|
|
|
|
+ CALL_ASSERT_EXCEPTION(cudaFree(ptr) == cudaSuccess);
|
|
|
}
|
|
}
|
|
|
};
|
|
};
|
|
|
|
|
|
|
@@ -64,20 +55,23 @@ struct hdr_synthesizer::impl {
|
|
|
NppiSize *pyr_size_arr;
|
|
NppiSize *pyr_size_arr;
|
|
|
void *gaussian_filter_coff_f32;
|
|
void *gaussian_filter_coff_f32;
|
|
|
|
|
|
|
|
- NppStreamContext npp_ctx; // user provided stream
|
|
|
|
|
- NppStreamContext extra_npp_ctx;
|
|
|
|
|
- cudaStream_t extra_stream;
|
|
|
|
|
|
|
+ NppStreamContext npp_ctx, extra_npp_ctx;
|
|
|
|
|
+ cudaStream_t main_stream, extra_stream;
|
|
|
cudaEvent_t sync_event;
|
|
cudaEvent_t sync_event;
|
|
|
|
|
|
|
|
// global temporary memory
|
|
// global temporary memory
|
|
|
- smart_buffer<Npp8u> *raw_u8, *rgb_u8;
|
|
|
|
|
|
|
+ smart_buffer<Npp8u> *rgb_u8;
|
|
|
smart_buffer<Npp32f> *rgb_f32[2];
|
|
smart_buffer<Npp32f> *rgb_f32[2];
|
|
|
|
|
|
|
|
- impl(uint16_t _width, uint16_t _height, uint8_t _level)
|
|
|
|
|
- : width(_width), height(_height), pyr_level(_level) {
|
|
|
|
|
- malloc_global_memory();
|
|
|
|
|
- init_npp_ctx(&npp_ctx);
|
|
|
|
|
- init_npp_ctx(&extra_npp_ctx);
|
|
|
|
|
|
|
+ impl(uint16_t _width, uint16_t _height, uint8_t _level, cudaStream_t stream)
|
|
|
|
|
+ : width(_width), height(_height), pyr_level(_level), main_stream(stream) {
|
|
|
|
|
+ rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
|
|
|
|
|
+ rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
|
|
|
|
|
+ rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
|
|
|
|
|
+ CALL_ASSERT_EXCEPTION(malloc_dev_mem());
|
|
|
|
|
+
|
|
|
|
|
+ init_npp_ctx(&npp_ctx, main_stream);
|
|
|
|
|
+ init_npp_ctx(&extra_npp_ctx, extra_stream);
|
|
|
|
|
|
|
|
pyr_height = height + (height >> 1);
|
|
pyr_height = height + (height >> 1);
|
|
|
full_size = NppiSize{width, height};
|
|
full_size = NppiSize{width, height};
|
|
@@ -98,18 +92,16 @@ struct hdr_synthesizer::impl {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
~impl() {
|
|
~impl() {
|
|
|
- free_global_memory();
|
|
|
|
|
|
|
+ delete rgb_u8;
|
|
|
|
|
+ delete rgb_f32[0];
|
|
|
|
|
+ delete rgb_f32[1];
|
|
|
|
|
+ CALL_ASSERT_EXCEPTION(free_dev_mem());
|
|
|
|
|
|
|
|
delete pyr_offset_arr;
|
|
delete pyr_offset_arr;
|
|
|
delete pyr_size_arr;
|
|
delete pyr_size_arr;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- bool malloc_global_memory() {
|
|
|
|
|
- raw_u8 = new smart_buffer<Npp8u>(width, height, 1);
|
|
|
|
|
- rgb_u8 = new smart_buffer<Npp8u>(width, height, 3);
|
|
|
|
|
- rgb_f32[0] = new smart_buffer<Npp32f>(width, height, 3);
|
|
|
|
|
- rgb_f32[1] = new smart_buffer<Npp32f>(width, height, 3);
|
|
|
|
|
-
|
|
|
|
|
|
|
+ bool malloc_dev_mem() {
|
|
|
// upload gaussian kernel coefficient
|
|
// upload gaussian kernel coefficient
|
|
|
CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
|
|
CUDA_API_CHECK(cudaMalloc(&gaussian_filter_coff_f32, sizeof(gaussian_filter_coff)));
|
|
|
CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
|
|
CUDA_API_CHECK(cudaMemcpy(gaussian_filter_coff_f32, gaussian_filter_coff,
|
|
@@ -121,21 +113,15 @@ struct hdr_synthesizer::impl {
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- bool free_global_memory() {
|
|
|
|
|
|
|
+ bool free_dev_mem() {
|
|
|
CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
|
|
CUDA_API_CHECK(cudaFree(gaussian_filter_coff_f32));
|
|
|
CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
|
|
CUDA_API_CHECK(cudaStreamDestroy(extra_stream));
|
|
|
CUDA_API_CHECK(cudaEventDestroy(sync_event));
|
|
CUDA_API_CHECK(cudaEventDestroy(sync_event));
|
|
|
-
|
|
|
|
|
- delete raw_u8;
|
|
|
|
|
- delete rgb_u8;
|
|
|
|
|
- delete rgb_f32[0];
|
|
|
|
|
- delete rgb_f32[1];
|
|
|
|
|
-
|
|
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- static bool init_npp_ctx(NppStreamContext *ctx) {
|
|
|
|
|
- ctx->hStream = nullptr;
|
|
|
|
|
|
|
+ static bool init_npp_ctx(NppStreamContext *ctx, cudaStream_t stream) {
|
|
|
|
|
+ ctx->hStream = stream;
|
|
|
CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
|
|
CUDA_API_CHECK(cudaGetDevice(&ctx->nCudaDeviceId));
|
|
|
cudaDeviceProp dev_prop = {};
|
|
cudaDeviceProp dev_prop = {};
|
|
|
CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
|
|
CUDA_API_CHECK(cudaGetDeviceProperties(&dev_prop, ctx->nCudaDeviceId));
|
|
@@ -147,13 +133,6 @@ struct hdr_synthesizer::impl {
|
|
|
cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
|
|
cudaDevAttrComputeCapabilityMajor, ctx->nCudaDeviceId));
|
|
|
CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
|
|
CUDA_API_CHECK(cudaDeviceGetAttribute(&ctx->nCudaDevAttrComputeCapabilityMinor,
|
|
|
cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
|
|
cudaDevAttrComputeCapabilityMinor, ctx->nCudaDeviceId));
|
|
|
- CUDA_API_CHECK(cudaStreamGetFlags(nullptr, &ctx->nStreamFlags));
|
|
|
|
|
- return true;
|
|
|
|
|
- }
|
|
|
|
|
-
|
|
|
|
|
- static bool set_npp_stream(NppStreamContext *ctx, cudaStream_t stream) {
|
|
|
|
|
- if (ctx->hStream == stream) [[likely]] return true;
|
|
|
|
|
- ctx->hStream = stream;
|
|
|
|
|
CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
|
|
CUDA_API_CHECK(cudaStreamGetFlags(stream, &ctx->nStreamFlags));
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
@@ -196,15 +175,15 @@ struct hdr_synthesizer::impl {
|
|
|
bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
|
|
bool laplacian_pyramid(Npp32f *ptr, size_t pitch,
|
|
|
NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
|
|
NppStreamContext *ctx, cudaStream_t stream) const { // construct laplacian pyramid
|
|
|
// generate gaussian pyramid first
|
|
// generate gaussian pyramid first
|
|
|
- gaussian_pyramid(ptr, pitch, true, ctx);
|
|
|
|
|
|
|
+ CALL_CHECK(gaussian_pyramid(ptr, pitch, true, ctx));
|
|
|
|
|
|
|
|
// generate laplacian pyramid by up-sampling and subtraction
|
|
// generate laplacian pyramid by up-sampling and subtraction
|
|
|
auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
|
|
auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
|
|
|
- laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
|
|
|
|
|
|
|
+ CALL_CHECK(laplacian_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
|
|
|
for (int i = 0; i < pyr_level - 1; ++i) {
|
|
for (int i = 0; i < pyr_level - 1; ++i) {
|
|
|
auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
|
|
auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i + 1], height, 3);
|
|
|
auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
|
|
auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
|
|
|
- laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream);
|
|
|
|
|
|
|
+ CALL_CHECK(laplacian_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i + 1], pyr_size_arr[i], stream));
|
|
|
}
|
|
}
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
@@ -221,23 +200,16 @@ struct hdr_synthesizer::impl {
|
|
|
for (int i = pyr_level - 1; i > 0; --i) {
|
|
for (int i = pyr_level - 1; i > 0; --i) {
|
|
|
auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
|
|
auto src_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i], height, 3);
|
|
|
auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
|
|
auto dst_ptr = smart_offset(ptr, pitch, pyr_offset_arr[i - 1], height, 3);
|
|
|
- reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream);
|
|
|
|
|
|
|
+ CALL_CHECK(reconstruct_operation(src_ptr, dst_ptr, pitch, pyr_size_arr[i], pyr_size_arr[i - 1], stream));
|
|
|
}
|
|
}
|
|
|
auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
|
|
auto pyr_ptr = smart_offset(ptr, pitch, 0, height, 3);
|
|
|
- reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream);
|
|
|
|
|
|
|
+ CALL_CHECK(reconstruct_operation(pyr_ptr, ptr, pitch, pyr_size_arr[0], full_size, stream));
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- bool preprocess_image(image_buffer *buf, uint8_t *raw, cudaStream_t stream) {
|
|
|
|
|
- // upload image
|
|
|
|
|
- CUDA_API_CHECK(cudaMemcpy2DAsync(raw_u8->ptr, raw_u8->pitch,
|
|
|
|
|
- raw, width * sizeof(uint8_t), width * sizeof(uint8_t),
|
|
|
|
|
- height, cudaMemcpyHostToDevice, stream));
|
|
|
|
|
-
|
|
|
|
|
|
|
+ bool preprocess_image(image_buffer *buf, void *raw_u8, size_t pitch) {
|
|
|
// debayer image
|
|
// debayer image
|
|
|
- set_npp_stream(&npp_ctx, stream);
|
|
|
|
|
- CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx(raw_u8->ptr, raw_u8->pitch,
|
|
|
|
|
- full_size, full_rect,
|
|
|
|
|
|
|
+ CUDA_API_CHECK(nppiCFAToRGB_8u_C1C3R_Ctx((Npp8u *) raw_u8, pitch, full_size, full_rect,
|
|
|
rgb_u8->ptr, rgb_u8->pitch,
|
|
rgb_u8->ptr, rgb_u8->pitch,
|
|
|
NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
|
|
NPPI_BAYER_BGGR, NPPI_INTER_UNDEFINED, npp_ctx));
|
|
|
|
|
|
|
@@ -248,39 +220,37 @@ struct hdr_synthesizer::impl {
|
|
|
CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
|
|
CUDA_API_CHECK(nppiMulC_32f_C3IR_Ctx(u8_to_f32_coff_arr,
|
|
|
buf->image_pyr->ptr, buf->image_pyr->pitch,
|
|
buf->image_pyr->ptr, buf->image_pyr->pitch,
|
|
|
full_size, npp_ctx));
|
|
full_size, npp_ctx));
|
|
|
- CUDA_API_CHECK(cudaEventRecord(sync_event, stream));
|
|
|
|
|
|
|
+ CUDA_API_CHECK(cudaEventRecord(sync_event, main_stream));
|
|
|
|
|
|
|
|
// calc weight and construct pyramid
|
|
// calc weight and construct pyramid
|
|
|
CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
|
|
CUDA_API_CHECK(cudaStreamWaitEvent(extra_stream, sync_event));
|
|
|
call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
|
|
call_hdr_weight(rgb_u8->ptr, rgb_u8->pitch,
|
|
|
buf->weight_pyr->ptr, buf->weight_pyr->pitch,
|
|
buf->weight_pyr->ptr, buf->weight_pyr->pitch,
|
|
|
width, height, extra_stream); // parallel execution for weight related calculation
|
|
width, height, extra_stream); // parallel execution for weight related calculation
|
|
|
- set_npp_stream(&extra_npp_ctx, extra_stream);
|
|
|
|
|
- gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx);
|
|
|
|
|
|
|
+ CALL_CHECK(gaussian_pyramid(buf->weight_pyr->ptr, buf->weight_pyr->pitch, false, &extra_npp_ctx));
|
|
|
CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
|
|
CUDA_API_CHECK(cudaEventRecord(sync_event, extra_stream));
|
|
|
|
|
|
|
|
// construct image pyramid
|
|
// construct image pyramid
|
|
|
- laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, stream);
|
|
|
|
|
- CUDA_API_CHECK(cudaStreamWaitEvent(stream, sync_event));
|
|
|
|
|
|
|
+ CALL_CHECK(laplacian_pyramid(buf->image_pyr->ptr, buf->image_pyr->pitch, &npp_ctx, main_stream));
|
|
|
|
|
+ CUDA_API_CHECK(cudaStreamWaitEvent(main_stream, sync_event));
|
|
|
|
|
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
|
|
bool merge_image(image_buffer *buf_a, image_buffer *buf_b,
|
|
|
- uint8_t *out_ptr, size_t out_pitch, cudaStream_t stream) {
|
|
|
|
|
|
|
+ uint8_t *out_ptr, size_t out_pitch) {
|
|
|
assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
|
|
assert(buf_a->image_pyr->pitch == buf_b->image_pyr->pitch);
|
|
|
assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
|
|
assert(buf_a->weight_pyr->pitch == buf_b->weight_pyr->pitch);
|
|
|
|
|
|
|
|
// merge
|
|
// merge
|
|
|
call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
|
|
call_hdr_merge(buf_a->image_pyr->ptr, buf_b->image_pyr->ptr, buf_a->image_pyr->pitch,
|
|
|
buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
|
|
buf_a->weight_pyr->ptr, buf_b->weight_pyr->ptr, buf_a->weight_pyr->pitch,
|
|
|
- width, pyr_height, stream);
|
|
|
|
|
|
|
+ width, pyr_height, main_stream);
|
|
|
|
|
|
|
|
// reconstruct image from laplacian pyramid
|
|
// reconstruct image from laplacian pyramid
|
|
|
- pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, stream);
|
|
|
|
|
|
|
+ CALL_CHECK(pyramid_reconstruct(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch, main_stream));
|
|
|
|
|
|
|
|
// convert to uint8
|
|
// convert to uint8
|
|
|
- set_npp_stream(&npp_ctx, stream);
|
|
|
|
|
CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
|
|
CUDA_API_CHECK(nppiConvert_32f8u_C3R_Ctx(buf_a->image_pyr->ptr, buf_a->image_pyr->pitch,
|
|
|
out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
|
|
out_ptr, out_pitch, full_size, NPP_RND_NEAR, npp_ctx));
|
|
|
|
|
|
|
@@ -289,8 +259,9 @@ struct hdr_synthesizer::impl {
|
|
|
|
|
|
|
|
};
|
|
};
|
|
|
|
|
|
|
|
-hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height, uint8_t pyramid_level)
|
|
|
|
|
- : pimpl(std::make_unique<impl>(width, height, pyramid_level)) {
|
|
|
|
|
|
|
+hdr_synthesizer::hdr_synthesizer(uint16_t width, uint16_t height,
|
|
|
|
|
+ cudaStream_t stream, uint8_t pyramid_level)
|
|
|
|
|
+ : pimpl(std::make_unique<impl>(width, height, pyramid_level, stream)) {
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
hdr_synthesizer::~hdr_synthesizer() = default;
|
|
hdr_synthesizer::~hdr_synthesizer() = default;
|
|
@@ -308,12 +279,12 @@ bool hdr_synthesizer::free_buffer(void *out_buf) {
|
|
|
return true;
|
|
return true;
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
-bool hdr_synthesizer::preprocess_image(void *img_buf, uint8_t *img_ptr, void *stream) {
|
|
|
|
|
- return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, (cudaStream_t) stream);
|
|
|
|
|
|
|
+bool hdr_synthesizer::preprocess_image(void *img_buf, void *img_ptr, size_t pitch) {
|
|
|
|
|
+ return pimpl->preprocess_image((impl::image_buffer *) img_buf, img_ptr, pitch);
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
|
|
bool hdr_synthesizer::merge_image(void *buf_a, void *buf_b,
|
|
|
- uint8_t *img_ptr, size_t img_pitch, void *stream) {
|
|
|
|
|
|
|
+ uint8_t *img_ptr, size_t img_pitch) {
|
|
|
return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
|
|
return pimpl->merge_image((impl::image_buffer *) buf_a, (impl::image_buffer *) buf_b,
|
|
|
- img_ptr, img_pitch, (cudaStream_t) stream);
|
|
|
|
|
|
|
+ img_ptr, img_pitch);
|
|
|
}
|
|
}
|