#ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H
#define CUMO_CUDA_MEMORY_POOL_IMPL_H

#include <algorithm>
#include <cassert>
#include <memory>
#include <mutex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>

#include <cuda_runtime.h>

// CUDA memory pool implementation, closely following CuPy's memory pool.

namespace cumo {
namespace internal {

// cudaMalloc() is aligned to at least 512 bytes
// cf. https://gist.github.com/sonots/41daaa6432b1c8b27ef782cd14064269
constexpr int kRoundSize = 512; // bytes

class CUDARuntimeError : public std::runtime_error {
private:
    cudaError_t status_;

public:
    CUDARuntimeError(cudaError_t status) :
        runtime_error(cudaGetErrorString(status)), status_(status) {}

    cudaError_t status() const { return status_; }
};

class OutOfMemoryError : public std::runtime_error {
public:
    OutOfMemoryError(size_t size, size_t total) :
        runtime_error("out of memory to allocate " + std::to_string(size)
                      + " bytes (total " + std::to_string(total) + " bytes)") {}
};

void CheckStatus(cudaError_t status);

// Memory allocation on a CUDA device.
//
// This class provides an RAII interface to the CUDA memory allocation.
class Memory {
private:
    // Pointer to the place within the buffer.
    void* ptr_ = nullptr;

    // Size of the memory allocation in bytes.
    size_t size_ = 0;

    // GPU device id whose memory the pointer refers to.
    int device_id_ = -1;

public:
    Memory(size_t size);

    ~Memory();

    intptr_t ptr() const { return reinterpret_cast<intptr_t>(ptr_); }

    size_t size() const { return size_; }

    int device_id() const { return device_id_; }
};
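
// Usage sketch (illustrative only, not part of the API): a Memory object owns
// one device memory allocation and releases it when destroyed (RAII, as stated
// above).
//
//     {
//         cumo::internal::Memory mem(1024);  // allocates 1024 bytes of device memory
//         intptr_t p = mem.ptr();            // device address as an integer
//         size_t   n = mem.size();           // 1024
//     }                                      // allocation is released here
//
// Note that cudaMalloc() aligns allocations to at least kRoundSize (512) bytes.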
// A chunk points to device memory.
//
// A chunk might be a memory block split off from a larger allocation.
// The prev/next pointers form a doubly-linked list of chunks sorted by base
// address; neighboring chunks in the list must be contiguous in memory.
class Chunk {
private:
    // The device memory buffer.
    std::shared_ptr<Memory> mem_;

    // Memory address.
    intptr_t ptr_ = 0;

    // Offset in bytes from the head of the buffer.
    size_t offset_ = 0;

    // Chunk size in bytes.
    size_t size_ = 0;

    // GPU device id whose memory the pointer refers to.
    int device_id_;

    // Previous chunk if this chunk was split off from a larger allocation.
    std::shared_ptr<Chunk> prev_;

    // Next chunk if this chunk was split off from a larger allocation.
    std::shared_ptr<Chunk> next_;

    // Raw handle of the CUDA stream.
    cudaStream_t stream_ptr_;

    // Whether the chunk is in use.
    bool in_use_ = false;

public:
    Chunk() {}

    // mem: The device memory buffer.
    // offset: Offset in bytes from the head of the buffer.
    // size: Chunk size in bytes.
    // stream_ptr: Raw handle of the CUDA stream.
    Chunk(const std::shared_ptr<Memory>& mem, size_t offset, size_t size, cudaStream_t stream_ptr = 0) :
        mem_(mem), ptr_(mem->ptr() + offset), offset_(offset), size_(size),
        device_id_(mem->device_id()), stream_ptr_(stream_ptr) {
        assert(mem->ptr() > 0 || offset == 0);
    }

    Chunk(const Chunk&) = default;

    ~Chunk() {
        // std::cout << "Chunk dtor " << (void*)ptr_ << " " << this << std::endl;
    }

    intptr_t ptr() const { return ptr_; }

    size_t offset() const { return offset_; }

    size_t size() const { return size_; }

    int device_id() const { return device_id_; }

    const std::shared_ptr<Chunk>& prev() const { return prev_; }

    std::shared_ptr<Chunk>& prev() { return prev_; }

    const std::shared_ptr<Chunk>& next() const { return next_; }

    std::shared_ptr<Chunk>& next() { return next_; }

    cudaStream_t stream_ptr() const { return stream_ptr_; }

    void set_prev(const std::shared_ptr<Chunk>& prev) { prev_ = prev; }

    void set_next(const std::shared_ptr<Chunk>& next) { next_ = next; }

    bool in_use() const { return in_use_; }

    void set_in_use(bool in_use) { in_use_ = in_use; }

    // Split a contiguous block off a larger allocation.
    friend std::shared_ptr<Chunk> Split(std::shared_ptr<Chunk>& self, size_t size);

    // Merge a previously split block (chunk).
    friend void Merge(std::shared_ptr<Chunk>& self, std::shared_ptr<Chunk> remaining);
};

using FreeList = std::vector<std::shared_ptr<Chunk>>;  // list of free chunks
using Arena = std::vector<FreeList>;                   // free lists, indexed by arena index
using ArenaIndexMap = std::vector<int>;                // arena index <=> bin index

// Memory pool implementation for a single device.
// - The allocator attempts to find the smallest cached block that fits the
//   requested size. If the block is larger than the requested size, it may
//   be split. If no block is found, the allocator delegates to cudaMalloc.
// - If cudaMalloc fails, the allocator frees all cached blocks that are not
//   split and retries the allocation.
class SingleDeviceMemoryPool {
private:
    int device_id_;
    std::unordered_map<intptr_t, std::shared_ptr<Chunk>> in_use_; // ptr => Chunk
    std::unordered_map<cudaStream_t, Arena> free_;
    std::unordered_map<cudaStream_t, ArenaIndexMap> index_;
    std::recursive_mutex mutex_;

public:
    SingleDeviceMemoryPool() {
        CheckStatus(cudaGetDevice(&device_id_));
    }

    intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0);

    void Free(intptr_t ptr, cudaStream_t stream_ptr = 0);

    // Free all **non-split** chunks in all arenas.
    void FreeAllBlocks();

    // Free all **non-split** chunks in the specified arena.
    void FreeAllBlocks(cudaStream_t stream_ptr);

    size_t GetNumFreeBlocks();

    size_t GetUsedBytes();

    size_t GetFreeBytes();

    size_t GetTotalBytes() {
        return GetUsedBytes() + GetFreeBytes();
    }

// private:
    // Rounds up the memory size to fit the memory alignment of cudaMalloc.
    size_t GetRoundedSize(size_t size) {
        return ((size + kRoundSize - 1) / kRoundSize) * kRoundSize;
    }

    // Get the bin index for a given memory size.
    int GetBinIndex(size_t size) {
        return (size - 1) / kRoundSize;
    }

    int GetArenaIndex(size_t size, cudaStream_t stream_ptr = 0) {
        int bin_index = GetBinIndex(size);
        ArenaIndexMap& arena_index_map = GetArenaIndexMap(stream_ptr);
        return std::lower_bound(arena_index_map.begin(), arena_index_map.end(), bin_index) - arena_index_map.begin();
    }

    bool HasArena(cudaStream_t stream_ptr) {
        auto it = free_.find(stream_ptr);
        return it != free_.end();
    }

    // Returns the arena (list of free lists) of the given stream.
    //
    // All free chunks of the stream belong to one of the bins in the arena.
    //
    // The caller is responsible for acquiring the lock.
    Arena& GetArena(cudaStream_t stream_ptr) {
        return free_[stream_ptr];  // find or create
    }
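
    // Worked example (illustrative, for kRoundSize == 512): a request of 700
    // bytes maps to bins and arenas as follows.
    //
    //     GetRoundedSize(700) == 1024  // rounded up to a multiple of 512
    //     GetBinIndex(1024)   == 1     // (1024 - 1) / 512
    //     GetArenaIndex(1024)          // lower_bound over the arena index map;
    //                                  // if the map is [1, 3], this returns 0,
    //                                  // i.e. the free list arena[0]
    //
    // The allocator thus starts its search at the smallest bin whose chunks
    // are large enough, and falls back to cudaMalloc if nothing suitable is
    // cached (see the class comment above).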
    // Returns the sparse arena index map of the given stream.
    //
    // The k-th element of the returned vector is the bin index covered by
    // `arena[k]`. For example, if the arena index map is `[1, 3]`, the arena
    // has two free lists: `arena[0]` is for bin index 1 and `arena[1]` is for
    // bin index 3.
    //
    // The caller is responsible for acquiring the lock.
    ArenaIndexMap& GetArenaIndexMap(cudaStream_t stream_ptr) {
        return index_[stream_ptr];  // find or create
    }

    std::shared_ptr<Chunk> PopFromFreeList(FreeList& free_list) {
        auto data = free_list.back();
        free_list.pop_back();
        return data;
    }

    // std::vector erase-remove idiom
    // http://minus9d.hatenablog.com/entry/20120605/1338896754
    bool EraseFromFreeList(FreeList& free_list, const std::shared_ptr<Chunk>& chunk) {
        assert(!chunk->in_use());
        auto iter = std::find(free_list.begin(), free_list.end(), chunk);
        if (iter == free_list.end()) {
            return false;
        }
        free_list.erase(iter);
        return true;
    }

    void AppendToFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);

    // Removes the chunk from the free list.
    //
    // @return `true` if the chunk was successfully removed from the free
    //     list, `false` otherwise (e.g., the chunk could not be found in the
    //     free list because it is allocated).
    bool RemoveFromFreeList(size_t size, std::shared_ptr<Chunk>& chunk, cudaStream_t stream_ptr = 0);

    void CompactIndex(cudaStream_t stream_ptr, bool free);
};

// Memory pool for all GPU devices on the host.
//
// A memory pool preserves any allocations even if they are freed by the user.
// Freed memory buffers are held by the memory pool as *free blocks*, and they
// are reused for further memory allocations of the same sizes. The allocated
// blocks are managed for each device, so one instance of this class can be
// used for multiple devices.
// .. note::
//    When an allocation is skipped by reusing a pre-allocated block, it does
//    not call ``cudaMalloc`` and therefore no CPU-GPU synchronization occurs.
//    This makes interleaved memory allocations and kernel invocations very
//    fast.
// .. note::
//    The memory pool holds allocated blocks without freeing them as much as
//    possible. This makes the program hold most of the device memory, which
//    may push other CUDA programs running in parallel into an out-of-memory
//    situation.
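//
// Usage sketch (illustrative only): Malloc/Free operate on the pool of the
// current device and the arena of the given stream.
//
//     cumo::internal::MemoryPool pool;
//     intptr_t p = pool.Malloc(4096);      // reuses a cached block if possible
//     // ... launch kernels using the buffer at p ...
//     pool.Free(p);                        // returns the block to the pool
//     size_t used  = pool.GetUsedBytes();  // bytes currently in use
//     size_t total = pool.GetTotalBytes(); // used + cached free bytes
//     pool.FreeAllBlocks();                // release cached non-split blocks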
class MemoryPool {
private:
    int device_id() {
        int device_id = -1;
        CheckStatus(cudaGetDevice(&device_id));
        return device_id;
    }

    std::unordered_map<int, SingleDeviceMemoryPool> pools_;

public:
    MemoryPool() {}

    ~MemoryPool() { pools_.clear(); }

    // Allocates the memory, from the pool if possible.
    //
    // Args:
    //     size (size_t): Size of the memory buffer to allocate in bytes.
    //     stream_ptr (cudaStream_t): Get the memory from the arena of the given stream.
    // Returns:
    //     intptr_t: Pointer address to the allocated buffer.
    intptr_t Malloc(size_t size, cudaStream_t stream_ptr = 0) {
        auto& mp = pools_[device_id()];
        return mp.Malloc(size, stream_ptr);
    }

    // Frees the memory, returning it to the pool.
    //
    // Args:
    //     ptr (intptr_t): Pointer to the memory buffer.
    //     stream_ptr (cudaStream_t): Return the memory to the arena of the given stream.
    void Free(intptr_t ptr, cudaStream_t stream_ptr = 0) {
        auto& mp = pools_[device_id()];
        mp.Free(ptr, stream_ptr);
    }

    // Free all **non-split** chunks in all arenas.
    void FreeAllBlocks() {
        auto& mp = pools_[device_id()];
        return mp.FreeAllBlocks();
    }

    // Free all **non-split** chunks in the specified arena.
    //
    // Args:
    //     stream_ptr (cudaStream_t): Release free blocks in the arena of the given stream.
    void FreeAllBlocks(cudaStream_t stream_ptr) {
        auto& mp = pools_[device_id()];
        return mp.FreeAllBlocks(stream_ptr);
    }

    // Count the total number of free blocks.
    //
    // Returns:
    //     size_t: The total number of free blocks.
    size_t GetNumFreeBlocks() {
        auto& mp = pools_[device_id()];
        return mp.GetNumFreeBlocks();
    }

    // Get the total number of bytes used.
    //
    // Returns:
    //     size_t: The total number of bytes used.
    size_t GetUsedBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetUsedBytes();
    }

    // Get the total number of bytes acquired but not used by the pool.
    //
    // Returns:
    //     size_t: The total number of bytes acquired but not used by the pool.
    size_t GetFreeBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetFreeBytes();
    }

    // Get the total number of bytes acquired by the pool.
    //
    // Returns:
    //     size_t: The total number of bytes acquired by the pool.
    size_t GetTotalBytes() {
        auto& mp = pools_[device_id()];
        return mp.GetTotalBytes();
    }
};

} // namespace internal
} // namespace cumo

#endif /* ifndef CUMO_CUDA_MEMORY_POOL_IMPL_H */