@@ -0,0 +1,470 @@
+#include "memory_manager.h"
+#include "memory_utility.h"
+#include "utility.hpp"
+
+#include <cassert>
+#include <cstdlib>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <ranges>
+#include <shared_mutex>
+#include <type_traits>
+
+namespace {
+    // a cached block may be reused only when reuse_length * reuse_threshold <= request_length <= reuse_length
+    constexpr auto reuse_threshold = 0.75;
+    constexpr auto host_alignment = 64;
+    constexpr auto cuda_alignment = 256;
+    constexpr auto pitch_alignment = 32;
+
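+    // Reference to a twin buffer that is either owning (shared) or non-owning
+    // (weak); query() returns whichever twin is still alive, or nullptr.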
+    template<typename T>
+    struct ptr_proxy {
+        using shared_type = std::shared_ptr<T>;
+        using weak_type = typename shared_type::weak_type;
+
+        shared_type shared;
+        weak_type weak;
+
+        [[nodiscard]] shared_type query() const {
+            if (shared != nullptr) return shared;
+            if (auto ret = weak.lock(); ret != nullptr) return ret;
+            return nullptr;
+        }
+    };
+}
+
+// #include <csignal>
+//
+// struct shared_mutex_debug : std::shared_mutex {
+//     void lock() { raise(SIGTRAP); std::shared_mutex::lock(); }
+//     bool try_lock() { raise(SIGTRAP); return std::shared_mutex::try_lock(); }
+//     void unlock() { raise(SIGTRAP); std::shared_mutex::unlock(); }
+//     void lock_shared() { raise(SIGTRAP); std::shared_mutex::lock_shared(); }
+//     bool try_lock_shared() { raise(SIGTRAP); return std::shared_mutex::try_lock_shared(); }
+//     void unlock_shared() { raise(SIGTRAP); std::shared_mutex::unlock_shared(); }
+// };
+
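+// Bookkeeping shared by every pooled buffer: the raw pointer and its aligned
+// allocated size, a reader/writer mutex guarding the data, and a separate
+// mutex guarding the link to the host/cuda twin.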
+struct memory_info_base {
+    void *ptr = nullptr;
+    size_t size = {}; // allocated size
+    std::shared_mutex mu;
+    // shared_mutex_debug mu;
+    std::shared_mutex twin_mu; // used for cuda_twin or host_twin
+
+    using ptr_type = std::shared_ptr<memory_info_base>;
+    using proxy_type = ptr_proxy<memory_info_base>;
+};
+
+template<typename T>
+concept MemoryBaseType = std::is_base_of_v<memory_info_base, T>;
+
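+// Host blocks additionally track their device twin and the events of the last
+// copy-in/copy-out; CUDA blocks track their host twin, the last write event,
+// and all outstanding read events.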
+struct host_memory_info_base : memory_info_base {
+    proxy_type cuda_twin;
+    std::optional<cuda_event_proxy> copy_in_event;
+    std::optional<cuda_event_proxy> copy_out_event;
+
+    using memory_type = host_memory_info;
+};
+
+struct cuda_memory_info_base : memory_info_base {
+    proxy_type host_twin;
+    cuda_event_proxy write_event;
+    cuda_event_list read_events;
+
+    using memory_type = cuda_memory_info;
+};
+
+template<MemoryBaseType T>
+static T *create_memory_base(size_t size);
+
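+// Fresh blocks: host memory comes from aligned_alloc, device memory from
+// cudaMallocAsync on the current stream, with a write event recorded so that
+// later users can order their work after the allocation.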
+template<>
+host_memory_info_base *create_memory_base(size_t size) {
+    size = alignment_round<host_alignment>(size);
+    const auto ret = new host_memory_info_base();
+    ret->ptr = aligned_alloc(host_alignment, size);
+    ret->size = size;
+    return ret;
+}
+
+template<>
+cuda_memory_info_base *create_memory_base(size_t size) {
+    size = alignment_round<cuda_alignment>(size);
+    const auto ret = new cuda_memory_info_base();
+    CUDA_API_CHECK(cudaMallocAsync(&ret->ptr, size, current_cuda_stream()));
+    record_cuda_event(ret->write_event);
+    ret->size = size;
+    return ret;
+}
+
+static void destroy_memory_base(host_memory_info_base *mem);
+
+static void destroy_memory_base(cuda_memory_info_base *mem);
+
+static bool event_finished_helper(const std::optional<cuda_event_proxy> &event) {
+    if (!event) return true;
+    if (is_cuda_event_finished(*event)) return true;
+    return false;
+}
+
+static bool can_immediately_use(const host_memory_info_base *mem) {
+    if (!event_finished_helper(mem->copy_in_event)) return false;
+    if (!event_finished_helper(mem->copy_out_event)) return false;
+    return true;
+}
+
+static bool can_immediately_use(const cuda_memory_info_base *mem) {
+    if (!is_cuda_event_finished(mem->write_event)) return false;
+    if (std::ranges::any_of(mem->read_events,
+                            [](const auto &e) { return !is_cuda_event_finished(e); }))
+        return false;
+    return true;
+}
+
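+// Size-keyed cache of memory blocks. allocate() first tries to reuse a cached
+// block that is large enough, not excessively larger than the request (see
+// reuse_threshold), and whose pending CUDA work has already finished; only
+// then is a fresh block created. deallocate() merely parks the block in the
+// cache, and purify() releases everything that is parked.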
+template<MemoryBaseType T>
+class memory_base_pool {
+public:
+    T *allocate(const size_t size) {
+        auto lock = std::lock_guard(mu);
+        if (auto ret = reuse_allocate(size);
+            ret != nullptr) [[likely]] { return ret; }
+        const auto ret = create_memory_base<T>(size);
+        allocated += ret->size; // track the aligned size so purify() balances
+        return ret;
+    }
+
+    void deallocate(T *mem) {
+        auto lock = std::lock_guard(mu);
+        cached += mem->size;
+        pool.emplace(mem->size, mem);
+    }
+
+    void purify() {
+        auto lock = std::lock_guard(mu);
+        for (auto info: pool | std::views::values) {
+            allocated -= info->size;
+            cached -= info->size;
+            destroy_memory_base(info);
+        }
+        pool.clear();
+    }
+
+    ~memory_base_pool() {
+        purify();
+    }
+
+    size_t allocated = {}, cached = {};
+
+private:
+    using pool_type = std::multimap<size_t, T *>;
+    pool_type pool;
+
+    std::mutex mu;
+
+    T *reuse_allocate(const size_t size) {
+        auto iter = pool.lower_bound(size);
+        for (; iter != pool.end(); ++iter) {
+            const auto ret = iter->second;
+            if (ret->size * reuse_threshold > size) continue;
+            if (!can_immediately_use(ret)) continue;
+            cached -= ret->size;
+            pool.erase(iter);
+            return ret;
+        }
+        return nullptr;
+    }
+};
+
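+// Wrap a pooled base block in the public info value: the caller-visible size
+// is the requested size, while the base keeps the aligned allocation.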
+template<MemoryBaseType T>
+auto create_info(typename T::ptr_type mem, size_t size_req) {
+    auto ret = typename T::memory_type();
+    ret.ptr = mem->ptr;
+    assert(size_req <= mem->size);
+    ret.size = size_req;
+    ret.base = std::static_pointer_cast<T>(mem);
+    return ret;
+}
+
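+// pimpl: one pool per memory space. Blocks are handed out through shared_ptrs
+// whose deleter returns them to the owning pool instead of freeing them.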
+struct memory_manager::impl {
+    memory_base_pool<host_memory_info_base> host_pool;
+    memory_base_pool<cuda_memory_info_base> cuda_pool;
+
+    template<MemoryBaseType T>
+    auto allocate(const size_t size) {
+        const auto mem = get_pool<T>().allocate(size);
+        auto base_ptr = typename T::ptr_type(mem, [this](auto *p) {
+            get_pool<T>().deallocate(p);
+        });
+        return create_info<T>(base_ptr, size);
+    }
+
+    void purify() {
+        host_pool.purify();
+        cuda_pool.purify();
+    }
+
+    status_type status() const {
+        auto ret = status_type();
+        ret.host_allocated = host_pool.allocated;
+        ret.host_cached = host_pool.cached;
+        ret.cuda_allocated = cuda_pool.allocated;
+        ret.cuda_cached = cuda_pool.cached;
+        return ret;
+    }
+
+    ~impl() {
+        purify();
+    }
+
+private:
+    template<MemoryBaseType T>
+    auto &get_pool();
+};
+
+template<>
+auto &memory_manager::impl::get_pool<host_memory_info_base>() {
+    return host_pool;
+}
+
+template<>
+auto &memory_manager::impl::get_pool<cuda_memory_info_base>() {
+    return cuda_pool;
+}
+
+host_memory_info memory_manager::allocate_host(const size_t size) const {
+    return pimpl->allocate<host_memory_info_base>(size);
+}
+
+cuda_memory_info memory_manager::allocate_cuda(const size_t size) const {
+    return pimpl->allocate<cuda_memory_info_base>(size);
+}
+
+void memory_manager::purify() const {
+    pimpl->purify();
+}
+
+memory_manager::status_type memory_manager::status() const {
+    return pimpl->status();
+}
+
+memory_manager::memory_manager()
+    : pimpl(std::make_unique<impl>()) {
+}
+
+memory_manager::~memory_manager() = default;
+
+memory_manager *g_memory_manager = nullptr;
+
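+// Lazily created per-thread stream reserved for host-side waits on copy events.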
+namespace {
+    thread_local std::unique_ptr<cuda_stream_proxy> sync_stream;
+
+    cuda_stream_proxy &get_sync_stream() {
+        if (sync_stream == nullptr) [[unlikely]] {
+            sync_stream = std::make_unique<cuda_stream_proxy>();
+        }
+        return *sync_stream;
+    }
+}
+
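+// Event helpers: record_event_helper creates the optional event on first use;
+// host_sync_helper waits for the event under the per-thread sync stream and
+// then host-synchronizes that stream, leaving the caller's stream untouched.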
+static void record_event_helper(std::optional<cuda_event_proxy> &event) {
+    if (!event) [[unlikely]] { event.emplace(); }
+    record_cuda_event(*event);
+}
+
+static void sync_event_helper(const std::optional<cuda_event_proxy> &event) {
+    if (!event) [[unlikely]] return;
+    sync_cuda_event(*event);
+}
+
+static void host_sync_helper(const std::optional<cuda_event_proxy> &event) {
+    if (event) {
+        auto stream_guard = cuda_stream_guard(get_sync_stream());
+        sync_cuda_event(*event);
+        CUDA_API_CHECK(cudaStreamSynchronize(current_cuda_stream()));
+    }
+}
+
+static void record_write_event(cuda_memory_info_base &mem) {
+    record_cuda_event(mem.write_event);
+}
+
+static void sync_write_event(const cuda_memory_info_base &mem) {
+    sync_cuda_event(mem.write_event);
+}
+
+static void record_read_event(cuda_memory_info_base &mem) {
+    auto lock = std::lock_guard(mem.read_events.mu);
+    auto &event = mem.read_events.emplace_front();
+    record_cuda_event(event);
+}
+
+static void sync_read_event(cuda_memory_info_base &mem) {
+    auto lock = std::lock_guard(mem.read_events.mu); // TODO: may not be necessary
+    for (auto &event: mem.read_events) {
+        sync_cuda_event(event);
+    }
+    mem.read_events.clear();
+}
+
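+// Destruction waits for any outstanding transfers or accesses recorded on the
+// block before its memory is actually released.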
+static void destroy_memory_base(host_memory_info_base *mem) {
+    mem->mu.lock();
+    host_sync_helper(mem->copy_in_event);
+    host_sync_helper(mem->copy_out_event);
+    free(mem->ptr);
+    mem->mu.unlock();
+    delete mem;
+}
+
+static void destroy_memory_base(cuda_memory_info_base *mem) {
+    mem->mu.lock();
+    sync_write_event(*mem);
+    sync_read_event(*mem);
+    CUDA_API_CHECK(cudaFreeAsync(mem->ptr, current_cuda_stream()));
+    mem->mu.unlock();
+    delete mem;
+}
+
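+// Access protocol: readers take the shared lock and wait for the last write
+// (or host copy-in) to finish; writers take the exclusive lock, drain all
+// pending events, and drop the now-stale twin link. CUDA accesses record an
+// event on release so later work can be ordered against them without blocking.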
+static void acquire_read_access(host_memory_info_base &mem) {
+    mem.mu.lock_shared();
+    host_sync_helper(mem.copy_in_event);
+}
+
+static void acquire_read_access(cuda_memory_info_base &mem) {
+    mem.mu.lock_shared();
+    sync_write_event(mem);
+}
+
+static void release_read_access(host_memory_info_base &mem) {
+    mem.mu.unlock_shared();
+}
+
+static void release_read_access(cuda_memory_info_base &mem) {
+    record_read_event(mem);
+    mem.mu.unlock_shared();
+}
+
+static void acquire_write_access(host_memory_info_base &mem) {
+    mem.mu.lock();
+    host_sync_helper(mem.copy_in_event);
+    host_sync_helper(mem.copy_out_event);
+    mem.copy_in_event.reset();
+    mem.copy_out_event.reset();
+    mem.cuda_twin = {};
+}
+
+static void acquire_write_access(cuda_memory_info_base &mem) {
+    mem.mu.lock();
+    sync_write_event(mem);
+    sync_read_event(mem);
+    mem.host_twin = {};
+}
+
+static void release_write_access(host_memory_info_base &mem) {
+    mem.mu.unlock();
+}
+
+static void release_write_access(cuda_memory_info_base &mem) {
+    record_write_event(mem);
+    mem.mu.unlock();
+}
+
+template<typename T>
+void acquire_read_access(T &mem) {
+    acquire_read_access(*mem.base);
+}
+
+template<typename T>
+void release_read_access(T &mem) {
+    release_read_access(*mem.base);
+}
+
+template<typename T>
+void acquire_write_access(T &mem) {
+    acquire_write_access(*mem.base);
+}
+
+template<typename T>
+void release_write_access(T &mem) {
+    release_write_access(*mem.base);
+}
+
+// @formatter:off
+template void acquire_read_access(host_memory_info &);
+template void acquire_read_access(cuda_memory_info &);
+template void release_read_access(host_memory_info &);
+template void release_read_access(cuda_memory_info &);
+template void acquire_write_access(host_memory_info &);
+template void acquire_write_access(cuda_memory_info &);
+template void release_write_access(host_memory_info &);
+template void release_write_access(cuda_memory_info &);
+// @formatter:on
+
+
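+// Twin mirroring: resolve (or create) the device copy of a host buffer, and
+// vice versa below. The twin link is checked twice under twin_mu (shared,
+// then exclusive); on a miss a new buffer is allocated and filled with an
+// asynchronous copy on the current stream. The source holds an owning
+// reference to its twin, while the twin keeps only a weak back-reference.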
+cuda_memory_info acquire_cuda_twin(const host_memory_info &mem) {
+    // first check
+    {
+        auto lock = std::shared_lock(mem.base->twin_mu);
+        if (const auto ret = mem.base->cuda_twin.query(); ret != nullptr) [[likely]] {
+            return create_info<cuda_memory_info_base>(ret, mem.size);
+        }
+    }
+    // second check
+    auto lock = std::unique_lock(mem.base->twin_mu);
+    if (const auto ret = mem.base->cuda_twin.query(); ret != nullptr) [[likely]] {
+        return create_info<cuda_memory_info_base>(ret, mem.size);
+    }
+    // real copy
+    auto ret = CUDA_ALLOC(mem.size);
+    auto ret_copy = ret; // copy: ret may be moved out by the return before the write access is released
+    auto read_lock = read_access_guard(mem);
+    auto write_lock = write_access_guard(ret_copy);
+    CUDA_API_CHECK(cudaMemcpyAsync(ret.ptr, mem.ptr, mem.size,
+                                   cudaMemcpyHostToDevice, current_cuda_stream()));
+    record_event_helper(mem.base->copy_out_event);
+    // twin assignment
+    mem.base->cuda_twin.shared = ret.base;
+    ret.base->host_twin.weak = mem.base;
+    return ret;
+}
+
+host_memory_info acquire_host_twin(const cuda_memory_info &mem) {
+    // first check
+    {
+        auto lock = std::shared_lock(mem.base->twin_mu);
+        if (const auto ret = mem.base->host_twin.query(); ret != nullptr) [[likely]] {
+            return create_info<host_memory_info_base>(ret, mem.size);
+        }
+    }
+    // second check
+    auto lock = std::unique_lock(mem.base->twin_mu);
+    if (const auto ret = mem.base->host_twin.query(); ret != nullptr) [[likely]] {
+        return create_info<host_memory_info_base>(ret, mem.size);
+    }
+    // real copy
+    auto ret = HOST_ALLOC(mem.size);
+    auto ret_copy = ret; // copy: ret may be moved out by the return before the write access is released
+    auto read_lock = read_access_guard(mem);
+    auto write_lock = write_access_guard(ret_copy);
+    CUDA_API_CHECK(cudaMemcpyAsync(ret.ptr, mem.ptr, mem.size,
+                                   cudaMemcpyDeviceToHost, current_cuda_stream()));
+    record_event_helper(ret.base->copy_in_event);
+    // twin assignment
+    mem.base->host_twin.shared = ret.base;
+    ret.base->cuda_twin.weak = mem.base;
+    return ret;
+}
+
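+// Type-erased allocation: the returned deleter frees nothing itself; it only
+// keeps the pooled base block alive until the last user drops the pointer, at
+// which point the block goes back to its pool via the base's own deleter.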
+std::shared_ptr<void> auto_alloc(const size_t size, const memory_location loc) {
+    auto ret = std::shared_ptr<void>();
+    if (loc == MEM_HOST) {
+        auto mem = HOST_ALLOC(size);
+        ret = std::shared_ptr<void>(mem.ptr, [b = mem.base](void *) { (void) 0; });
+    } else if (loc == MEM_CUDA) {
+        auto mem = CUDA_ALLOC(size);
+        ret = std::shared_ptr<void>(mem.ptr, [b = mem.base](void *) { (void) 0; });
+    }
+    return ret;
+}
+
+size_t get_pitch_aligned_size(const size_t pitch) {
+    return alignment_round<pitch_alignment>(pitch);
+}
+
+std::shared_ptr<void> auto_alloc_pitch(const size_t width, const size_t height,
+                                       const memory_location loc, size_t *pitch) {
+    *pitch = get_pitch_aligned_size(width);
+    return auto_alloc(*pitch * height, loc);
+}
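+
+// Rough usage sketch (assumes g_memory_manager is installed by the surrounding
+// project and that read/write_access_guard are the RAII guards declared in the
+// headers above; the variable names are illustrative only):
+//   auto host = g_memory_manager->allocate_host(bytes);
+//   { auto w = write_access_guard(host); /* fill host.ptr on the CPU */ }
+//   auto dev = acquire_cuda_twin(host);   // async H2D copy on the current stream
+//   { auto r = read_access_guard(dev);    /* launch kernels reading dev.ptr */ }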