diff --git a/changelog/std-aligned-block-list.dd b/changelog/std-aligned-block-list.dd
new file mode 100644
index 000000000..c705e5ee2
--- /dev/null
+++ b/changelog/std-aligned-block-list.dd
@@ -0,0 +1,10 @@
+Implemented a new allocator, `AlignedBlockList`, and its thread-safe version, `SharedAlignedBlockList`
+
+$(REF AlignedBlockList, std,experimental,allocator,building_blocks, aligned_block_list) represents
+a list of allocators which allows for deallocations in constant time.
+Although allocations are in theory served in linear search time, `deallocate` calls take
+$(BIGOH 1) time, by using aligned allocations. The `ParentAllocator` must implement `alignedAllocate`.
+
+$(REF SharedAlignedBlockList, std,experimental,allocator,building_blocks, aligned_block_list) has the
+same semantics as its single-threaded version; however, the internal allocators must additionally
+be marked as `shared`.
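+
+The example below is a minimal usage sketch; the combination of `BitmappedBlock`
+and `AscendingPageAllocator` is only illustrative (any allocator constructible
+from `ubyte[]` and any parent supporting `alignedAllocate` will do):
+
+-------
+import std.experimental.allocator.building_blocks.aligned_block_list;
+import std.experimental.allocator.building_blocks.ascending_page_allocator;
+import std.experimental.allocator.building_blocks.bitmapped_block;
+
+void main()
+{
+    // Each 4KB-aligned node manages a BitmappedBlock of 64-byte blocks
+    AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12) list;
+    auto pageAlloc = AscendingPageAllocator(1024 * 4096);
+    list.parent = &pageAlloc;
+
+    void[] b = list.allocate(64);
+    assert(b.length == 64);
+
+    // The owning node is found in constant time by rounding b.ptr down
+    assert(list.deallocate(b));
+}
+-------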
diff --git a/posix.mak b/posix.mak
index 2e6f16686..20543efbc 100644
--- a/posix.mak
+++ b/posix.mak
@@ -206,8 +206,8 @@ PACKAGE_std_experimental_logger = core filelogger \
 PACKAGE_std_experimental_allocator = \
     common gc_allocator mallocator mmap_allocator package showcase typed
 PACKAGE_std_experimental_allocator_building_blocks = \
-    affix_allocator allocator_list ascending_page_allocator bucketizer \
-    fallback_allocator free_list free_tree bitmapped_block \
+    affix_allocator aligned_block_list allocator_list ascending_page_allocator \
+    bucketizer fallback_allocator free_list free_tree bitmapped_block \
     kernighan_ritchie null_allocator package quantizer \
     region scoped_allocator segregator stats_collector
 PACKAGE_std_net = curl isemail
diff --git a/std/experimental/allocator/building_blocks/aligned_block_list.d b/std/experimental/allocator/building_blocks/aligned_block_list.d
new file mode 100644
index 000000000..3870dfe5a
--- /dev/null
+++ b/std/experimental/allocator/building_blocks/aligned_block_list.d
@@ -0,0 +1,699 @@
+// Written in the D programming language.
+/**
+`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
+and preserving a low degree of fragmentation by means of aligned allocations.
+
+Source: $(PHOBOSSRC std/experimental/allocator/building_blocks/aligned_block_list.d)
+*/
+module std.experimental.allocator.building_blocks.aligned_block_list;
+
+import std.experimental.allocator.common;
+import std.experimental.allocator.building_blocks.null_allocator;
+
+// Common function implementation for thread-local and shared AlignedBlockList
+private mixin template AlignedBlockListImpl(bool isShared)
+{
+    import std.traits : hasMember;
+    import std.typecons : Ternary;
+
+    static if (isShared)
+    import core.internal.spinlock : SpinLock;
+
+private:
+    // Doubly linked list of 'AlignedBlockNode'.
+    // Each node contains an `Allocator` followed by its payload.
+    static struct AlignedBlockNode
+    {
+        AlignedBlockNode* next, prev;
+        Allocator bAlloc;
+
+        static if (isShared)
+        {
+            shared(size_t) bytesUsed;
+            // Since the lock is not taken when allocating, this acts like a
+            // reference count keeping the node alive
+            uint keepAlive;
+        }
+        else
+        {
+            size_t bytesUsed;
+        }
+    }
+
+    // Root of the internal doubly linked list
+    AlignedBlockNode* root;
+
+    // Number of active nodes
+    uint numNodes;
+
+    // If numNodes exceeds this limit, empty nodes start being deallocated
+    enum uint maxNodes = 64;
+
+    // This lock is always taken when changing the list.
+    // To improve performance, it is not taken when the allocation logic is called.
+    static if (isShared)
+    SpinLock lock = SpinLock(SpinLock.Contention.brief);
+
+    // Moves a node to the front of the list, allowing for quick allocations
+    void moveToFront(AlignedBlockNode* tmp)
+    {
+        auto localRoot = cast(AlignedBlockNode*) root;
+        if (tmp == localRoot)
+            return;
+
+        if (tmp.prev) tmp.prev.next = tmp.next;
+        if (tmp.next) tmp.next.prev = tmp.prev;
+        if (localRoot) localRoot.prev = tmp;
+        tmp.next = localRoot;
+        tmp.prev = null;
+
+        root = cast(typeof(root)) tmp;
+    }
+
+    // Removes a node from the list, including its payload.
+    // The payload is deallocated by calling 'parent.deallocate'.
+    void removeNode(AlignedBlockNode* tmp)
+    {
+        auto next = tmp.next;
+        if (tmp.prev) tmp.prev.next = tmp.next;
+        if (tmp.next) tmp.next.prev = tmp.prev;
+        parent.deallocate((cast(void*) tmp)[0 .. theAlignment]);
+
+        if (tmp == cast(AlignedBlockNode*) root)
+            root = cast(typeof(root)) next;
+
+        static if (isShared)
+        {
+            import core.atomic : atomicOp;
+            atomicOp!"-="(numNodes, 1);
+        }
+        else
+        {
+            numNodes--;
+        }
+    }
+
+    // If the nodes do not have available space, a new node is created
+    // by drawing memory from the parent allocator with aligned allocations.
+    // The new node is inserted at the front of the list.
+    bool insertNewNode()
+    {
+        void[] buf = parent.alignedAllocate(theAlignment, theAlignment);
+        if (buf is null)
+            return false;
+
+        auto localRoot = cast(AlignedBlockNode*) root;
+        auto newNode = cast(AlignedBlockNode*) buf;
+
+        // The first part of the allocation represents the node contents,
+        // followed by the actual payload
+        ubyte[] payload = cast(ubyte[]) buf[AlignedBlockNode.sizeof .. $];
+        newNode.bAlloc = Allocator(payload);
+
+        newNode.next = localRoot;
+        newNode.prev = null;
+        if (localRoot)
+            localRoot.prev = newNode;
+        root = cast(typeof(root)) newNode;
+
+        static if (isShared)
+        {
+            import core.atomic : atomicOp;
+            atomicOp!"+="(numNodes, 1);
+        }
+        else
+        {
+            numNodes++;
+        }
+
+        return true;
+    }
+
+public:
+    static if (stateSize!ParentAllocator) ParentAllocator parent;
+    else alias parent = ParentAllocator.instance;
+
+    enum ulong alignment = Allocator.alignment;
+
+    // Since all memory is drawn from ParentAllocator, we can
+    // forward this to the parent
+    static if (hasMember!(ParentAllocator, "owns"))
+    Ternary owns(void[] b)
+    {
+        return parent.owns(b);
+    }
+
+    // Use `theAlignment` to find the node which allocated this block
+    bool deallocate(void[] b)
+    {
+        if (b is null)
+            return true;
+
+        // Round the buffer address down to the nearest multiple of
+        // `theAlignment` to quickly find the owning `AlignedBlockNode`
+        enum ulong mask = ~(theAlignment - 1);
+        ulong ptr = ((cast(ulong) b.ptr) & mask);
+        AlignedBlockNode* node = cast(AlignedBlockNode*) ptr;
+        if (node.bAlloc.deallocate(b))
+        {
+            static if (isShared)
+            {
+                import core.atomic : atomicOp;
+                atomicOp!"-="(node.bytesUsed, b.length);
+            }
+            else
+            {
+                node.bytesUsed -= b.length;
+            }
+            return true;
+        }
+        return false;
+    }
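+
+    /*
+    Worked illustration of the constant-time lookup in `deallocate` above.
+    The numbers are hypothetical, assuming the default theAlignment of
+    1 << 21 (2MB) on a 64-bit platform:
+
+        b.ptr        = 0x7f3a_0065_4321
+        mask         = ~(0x1F_FFFF) = 0xFFFF_FFFF_FFE0_0000
+        b.ptr & mask = 0x7f3a_0060_0000  // address of the owning node
+
+    Because every node is created via alignedAllocate(theAlignment,
+    theAlignment), rounding any interior pointer down to a multiple of
+    theAlignment always yields the address of the node that owns `b`.
+    */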
+
+    // Allocate works only if memory can be provided via `alignedAllocate` from the parent
+    static if (hasMember!(ParentAllocator, "alignedAllocate"))
+    void[] allocate(size_t n)
+    {
+        static if (isShared)
+        import core.atomic : atomicOp, atomicLoad;
+
+        if (n == 0 || n > theAlignment)
+            return null;
+
+        static if (isShared)
+        {
+            lock.lock();
+            scope(exit) lock.unlock();
+        }
+
+        auto tmp = cast(AlignedBlockNode*) root;
+
+        // Iterate through the list and find the first node which has memory available
+        while (tmp)
+        {
+            auto next = tmp.next;
+            static if (isShared)
+            {
+                // Allocations can happen outside the lock;
+                // make sure nobody deletes this node while it is in use
+                tmp.keepAlive++;
+                if (next) next.keepAlive++;
+                lock.unlock();
+            }
+
+            auto result = tmp.bAlloc.allocate(n);
+            if (result.length == n)
+            {
+                // Success
+                static if (isShared)
+                {
+                    atomicOp!"+="(tmp.bytesUsed, n);
+                    lock.lock();
+                }
+                else
+                {
+                    tmp.bytesUsed += n;
+                }
+
+                // Most likely this node has memory for more allocations;
+                // move it to the front
+                moveToFront(tmp);
+
+                static if (isShared)
+                {
+                    tmp.keepAlive--;
+                    if (next) next.keepAlive--;
+                }
+
+                return result;
+            }
+
+            // This node can now be removed if necessary
+            static if (isShared)
+            {
+                lock.lock();
+                tmp.keepAlive--;
+                if (next) next.keepAlive--;
+            }
+
+            if (!next)
+                break;
+
+            tmp = next;
+            next = tmp.next;
+
+            // If there are too many nodes, free memory by removing empty nodes
+            static if (isShared)
+            {
+                if (atomicLoad(numNodes) > maxNodes &&
+                    atomicLoad(tmp.bytesUsed) == 0 &&
+                    tmp.keepAlive == 0)
+                {
+                    removeNode(tmp);
+                }
+            }
+            else
+            {
+                if (numNodes > maxNodes && tmp.bytesUsed == 0)
+                {
+                    removeNode(tmp);
+                }
+            }
+
+            tmp = next;
+        }
+
+        // Cannot create a new AlignedBlockNode; most likely the
+        // ParentAllocator ran out of resources
+        if (!insertNewNode())
+            return null;
+
+        tmp = cast(typeof(tmp)) root;
+        void[] result = tmp.bAlloc.allocate(n);
+
+        static if (isShared)
+        {
+            atomicOp!"+="(root.bytesUsed, result.length);
+        }
+        else
+        {
+            root.bytesUsed += result.length;
+        }
+
+        return result;
+    }
+
+    // goodAllocSize should not use state
+    size_t goodAllocSize(const size_t n)
+    {
+        Allocator a = null;
+        return a.goodAllocSize(n);
+    }
+}
+
+/**
+`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
+and preserving a low degree of fragmentation.
+The allocator internally holds a doubly linked list of `Allocator` objects, which will serve allocations
+in a most-recently-used fashion. The allocators most recently used for `allocate` calls will be
+moved to the front of the list.
+
+Although allocations are in theory served in linear search time, `deallocate` calls take
+$(BIGOH 1) time, by using aligned allocations. `ParentAllocator` must implement `alignedAllocate`
+and it must be able to allocate `theAlignment` bytes at the same alignment. Each aligned allocation
+done by `ParentAllocator` will contain metadata for an `Allocator`, followed by its payload.
+
+Params:
+    Allocator = the allocator which is used to manage each node; it must have a constructor which receives
+        `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
+    ParentAllocator = each node draws memory from the parent allocator; it must support `alignedAllocate`
+    theAlignment = alignment of each block and at the same time length of each node
+*/
+struct AlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
+{
+    version (StdDdoc)
+    {
+        import std.typecons : Ternary;
+        import std.traits : hasMember;
+
+        /**
+        Returns a chunk of memory of size `n`.
+        It finds the first node in the `AlignedBlockNode` list which has available memory,
+        and moves it to the front of the list.
+
+        All empty nodes which cannot return new memory are removed from the list.
+
+        Params:
+            n = bytes to allocate
+        Returns:
+            A chunk of memory of the required length or `null` on failure
+        */
+        static if (hasMember!(ParentAllocator, "alignedAllocate"))
+        void[] allocate(size_t n);
+
+        /**
+        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
+        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
+        to the nearest multiple of the `alignment` to quickly find the corresponding
+        `AlignedBlockNode`.
+
+        Params:
+            b = buffer candidate for deallocation
+        Returns:
+            `true` on success and `false` on failure
+        */
+        bool deallocate(void[] b);
+
+        /**
+        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
+        `Ternary.no` otherwise.
+
+        Params:
+            b = buffer tested if owned by this allocator
+        Returns:
+            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
+        */
+        static if (hasMember!(ParentAllocator, "owns"))
+        Ternary owns(void[] b);
+    }
+    else
+    {
+        import std.math : isPowerOf2;
+        static assert(isPowerOf2(alignment));
+        mixin AlignedBlockListImpl!false;
+    }
+}
+
+///
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
+    import std.experimental.allocator.building_blocks.segregator : Segregator;
+    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
+    import std.typecons : Ternary;
+
+    /*
+    In this example we use 'AlignedBlockList' in conjunction with other allocators
+    in order to create a more complex allocator.
+
+    The 'SuperAllocator' uses a 'Segregator' to distribute allocations to sub-allocators,
+    based on the requested size.
+
+    Each sub-allocator is represented by an 'AlignedBlockList' of 'BitmappedBlocks'.
+    Each 'AlignedBlockList' draws memory from a root allocator, in this case an 'AscendingPageAllocator'.
+
+    Such an allocator not only provides good performance, but also a low degree of memory fragmentation.
+    */
+    alias SuperAllocator = Segregator!(
+        32,
+        AlignedBlockList!(BitmappedBlock!32, AscendingPageAllocator*, 1 << 12),
+        Segregator!(
+
+        64,
+        AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12),
+        Segregator!(
+
+        128,
+        AlignedBlockList!(BitmappedBlock!128, AscendingPageAllocator*, 1 << 12),
+        AscendingPageAllocator*
+        )));
+
+    SuperAllocator a;
+    auto pageAlloc = AscendingPageAllocator(128 * 4096);
+
+    // Set the parent allocator for all the sub-allocators
+    a.allocatorForSize!256 = &pageAlloc;
+    a.allocatorForSize!128.parent = &pageAlloc;
+    a.allocatorForSize!64.parent = &pageAlloc;
+    a.allocatorForSize!32.parent = &pageAlloc;
+
+    enum testNum = 10;
+    void[][testNum] buf;
+
+    // Allocations of size 32 will go to the first 'AlignedBlockList'
+    foreach (j; 0 .. testNum)
+    {
+        buf[j] = a.allocate(32);
+        assert(buf[j].length == 32);
+
+        // This is owned by the first 'AlignedBlockList'
+        assert(a.allocatorForSize!32.owns(buf[j]) == Ternary.yes);
+    }
+
+    // Free the memory
+    foreach (j; 0 .. testNum)
+        assert(a.deallocate(buf[j]));
+
+    // Allocations of size 64 will go to the second 'AlignedBlockList'
+    foreach (j; 0 .. testNum)
+    {
+        buf[j] = a.allocate(64);
+        assert(buf[j].length == 64);
+
+        // This is owned by the second 'AlignedBlockList'
+        assert(a.allocatorForSize!64.owns(buf[j]) == Ternary.yes);
+    }
+
+    // Free the memory
+    foreach (j; 0 .. testNum)
+        assert(a.deallocate(buf[j]));
+
+    // Allocations of size 128 will go to the third 'AlignedBlockList'
+    foreach (j; 0 .. testNum)
+    {
+        buf[j] = a.allocate(128);
+        assert(buf[j].length == 128);
+
+        // This is owned by the third 'AlignedBlockList'
+        assert(a.allocatorForSize!128.owns(buf[j]) == Ternary.yes);
+    }
+
+    // Free the memory
+    foreach (j; 0 .. testNum)
+        assert(a.deallocate(buf[j]));
+
+    // Allocations which exceed 128 will go to the 'AscendingPageAllocator*'
+    void[] b = a.allocate(256);
+    assert(b.length == 256);
+    a.deallocate(b);
+}
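+
+// A minimal standalone sketch: a single AlignedBlockList used directly,
+// without a Segregator on top. The parameter choices (BitmappedBlock!64,
+// 1 << 16 alignment) are illustrative, not required.
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
+    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
+
+    AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 16) a;
+    auto pageAlloc = AscendingPageAllocator(4096 * 1024);
+    a.parent = &pageAlloc;
+
+    void[] b = a.allocate(64);
+    assert(b.length == 64);
+
+    // deallocate finds the owning node in constant time by masking b.ptr
+    assert(a.deallocate(b));
+}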
+
+/**
+`SharedAlignedBlockList` is the thread-safe version of `AlignedBlockList`.
+The `Allocator` template parameter must refer to a shared allocator.
+Also, `ParentAllocator` must be a shared allocator, supporting `alignedAllocate`.
+
+Params:
+    Allocator = the shared allocator which is used to manage each node; it must have a constructor which receives
+        `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
+    ParentAllocator = each node draws memory from the parent allocator; it must be shared and support `alignedAllocate`
+    theAlignment = alignment of each block and at the same time length of each node
+*/
+shared struct SharedAlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
+{
+    version (StdDdoc)
+    {
+        import std.typecons : Ternary;
+        import std.traits : hasMember;
+
+        /**
+        Returns a chunk of memory of size `n`.
+        It finds the first node in the `AlignedBlockNode` list which has available memory,
+        and moves it to the front of the list.
+
+        All empty nodes which cannot return new memory are removed from the list.
+
+        Params:
+            n = bytes to allocate
+        Returns:
+            A chunk of memory of the required length or `null` on failure
+        */
+        static if (hasMember!(ParentAllocator, "alignedAllocate"))
+        void[] allocate(size_t n);
+
+        /**
+        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
+        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
+        to the nearest multiple of the `alignment` to quickly find the corresponding
+        `AlignedBlockNode`.
+
+        Params:
+            b = buffer candidate for deallocation
+        Returns:
+            `true` on success and `false` on failure
+        */
+        bool deallocate(void[] b);
+
+        /**
+        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
+        `Ternary.no` otherwise.
+
+        Params:
+            b = buffer tested if owned by this allocator
+        Returns:
+            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
+        */
+        static if (hasMember!(ParentAllocator, "owns"))
+        Ternary owns(void[] b);
+    }
+    else
+    {
+        import std.math : isPowerOf2;
+        static assert(isPowerOf2(alignment));
+        mixin AlignedBlockListImpl!true;
+    }
+}
+
+///
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.region : SharedRegion;
+    import std.experimental.allocator.building_blocks.ascending_page_allocator : SharedAscendingPageAllocator;
+    import std.experimental.allocator.building_blocks.null_allocator : NullAllocator;
+    import core.thread : ThreadGroup;
+
+    enum numThreads = 8;
+    enum size = 2048;
+    enum maxIter = 10;
+
+    /*
+    In this example we use 'SharedAlignedBlockList' together with 'SharedRegion',
+    in order to create a fast, thread-safe allocator.
+    */
+    alias SuperAllocator = SharedAlignedBlockList!(
+        SharedRegion!(NullAllocator, 1),
+        SharedAscendingPageAllocator,
+        4096);
+
+    SuperAllocator a;
+    // The 'SuperAllocator' will draw memory from a 'SharedAscendingPageAllocator'
+    a.parent = SharedAscendingPageAllocator(4096 * 1024);
+
+    // Launch 'numThreads' threads, each performing allocations
+    void fun()
+    {
+        foreach (i; 0 .. maxIter)
+        {
+            void[] b = a.allocate(size);
+            assert(b.length == size);
+        }
+    }
+
+    auto tg = new ThreadGroup;
+    foreach (i; 0 .. numThreads)
+    {
+        tg.create(&fun);
+    }
+    tg.joinAll();
+}
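+
+// A minimal single-threaded sketch of the same allocator; the sizes used
+// here (4096-byte nodes, a single 1024-byte allocation) are illustrative only
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.region : SharedRegion;
+    import std.experimental.allocator.building_blocks.ascending_page_allocator : SharedAscendingPageAllocator;
+
+    SharedAlignedBlockList!(SharedRegion!(NullAllocator, 1),
+        SharedAscendingPageAllocator, 4096) a;
+    a.parent = SharedAscendingPageAllocator(4096 * 1024);
+
+    void[] b = a.allocate(1024);
+    assert(b.length == 1024);
+    assert(a.deallocate(b));
+}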
+
+version (unittest)
+{
+    static void testrw(void[] b)
+    {
+        ubyte* buf = cast(ubyte*) b.ptr;
+        size_t len = (b.length).roundUpToMultipleOf(4096);
+        for (int i = 0; i < len; i += 4096)
+        {
+            buf[i] = (cast(ubyte) i % 256);
+            assert(buf[i] == (cast(ubyte) i % 256));
+        }
+    }
+}
+
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.region;
+    import std.experimental.allocator.building_blocks.ascending_page_allocator;
+    import std.random;
+    import std.algorithm.sorting : sort;
+    import core.thread : ThreadGroup;
+    import core.internal.spinlock : SpinLock;
+
+    enum pageSize = 4096;
+    enum numThreads = 10;
+    enum maxIter = 20;
+    enum totalAllocs = maxIter * numThreads;
+    size_t count = 0;
+    SpinLock lock = SpinLock(SpinLock.Contention.brief);
+
+    alias SuperAllocator = SharedAlignedBlockList!(
+        SharedRegion!(NullAllocator, 1),
+        SharedAscendingPageAllocator,
+        1 << 16);
+    void[][totalAllocs] buf;
+
+    SuperAllocator a;
+    a.parent = SharedAscendingPageAllocator(4096 * 1024);
+
+    void fun()
+    {
+        auto rnd = Random(1000);
+
+        foreach (i; 0 .. maxIter)
+        {
+            auto size = uniform(1, pageSize + 1, rnd);
+            void[] b = a.allocate(size);
+            assert(b.length == size);
+            testrw(b);
+
+            lock.lock();
+            buf[count++] = b;
+            lock.unlock();
+        }
+    }
+    auto tg = new ThreadGroup;
+    foreach (i; 0 .. numThreads)
+    {
+        tg.create(&fun);
+    }
+    tg.joinAll();
+
+    sort!((a, b) => a.ptr < b.ptr)(buf[0 .. totalAllocs]);
+    foreach (i; 0 .. totalAllocs - 1)
+    {
+        assert(buf[i].ptr + a.goodAllocSize(buf[i].length) <= buf[i + 1].ptr);
+    }
+
+    foreach (i; 0 .. totalAllocs)
+    {
+        assert(a.deallocate(buf[totalAllocs - 1 - i]));
+    }
+}
+
+@system unittest
+{
+    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
+    import std.experimental.allocator.building_blocks.segregator : Segregator;
+    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
+    import std.random;
+
+    alias SuperAllocator = Segregator!(
+        256,
+        AlignedBlockList!(BitmappedBlock!256, AscendingPageAllocator*, 1 << 16),
+        Segregator!(
+
+        512,
+        AlignedBlockList!(BitmappedBlock!512, AscendingPageAllocator*, 1 << 16),
+        Segregator!(
+
+        1024,
+        AlignedBlockList!(BitmappedBlock!1024, AscendingPageAllocator*, 1 << 16),
+        Segregator!(
+
+        2048,
+        AlignedBlockList!(BitmappedBlock!2048, AscendingPageAllocator*, 1 << 16),
+        AscendingPageAllocator*
+        ))));
+
+    SuperAllocator a;
+    auto pageAlloc = AscendingPageAllocator(4096 * 4096);
+    a.allocatorForSize!4096 = &pageAlloc;
+    a.allocatorForSize!2048.parent = &pageAlloc;
+    a.allocatorForSize!1024.parent = &pageAlloc;
+    a.allocatorForSize!512.parent = &pageAlloc;
+    a.allocatorForSize!256.parent = &pageAlloc;
+
+    auto rnd = Random(1000);
+
+    size_t maxIter = 10;
+    enum testNum = 10;
+    void[][testNum] buf;
+    int maxSize = 8192;
+    foreach (i; 0 .. maxIter)
+    {
+        foreach (j; 0 .. testNum)
+        {
+            auto size = uniform(1, maxSize + 1, rnd);
+            buf[j] = a.allocate(size);
+            assert(buf[j].length == size);
+            testrw(buf[j]);
+        }
+
+        randomShuffle(buf[]);
+
+        foreach (j; 0 .. testNum)
+        {
+            assert(a.deallocate(buf[j]));
+        }
+    }
+}
diff --git a/std/experimental/allocator/building_blocks/package.d b/std/experimental/allocator/building_blocks/package.d
index 33b7e7505..eb086a775 100644
--- a/std/experimental/allocator/building_blocks/package.d
+++ b/std/experimental/allocator/building_blocks/package.d
@@ -225,6 +225,9 @@
 $(HTTP man7.org/linux/man-pages/man3/posix_memalign.3.html, `posix_memalign`) on Posix and
 $(HTTP msdn.microsoft.com/en-us/library/fs9stz4e(v=vs.80).aspx, `__aligned_xxx`) on Windows.))
 
+$(TR $(TDC2 AlignedBlockList, aligned_block_list) $(TD A wrapper around a list of allocators
+which allows for very fast deallocations.))
+
 $(TR $(TDC2 AffixAllocator, affix_allocator) $(TD Allocator that allows
 and manages allocating extra prefix and/or a suffix bytes for each block
 allocated.))
@@ -302,6 +305,7 @@ module std.experimental.allocator.building_blocks;
 
 public import
     std.experimental.allocator.building_blocks.affix_allocator,
+    std.experimental.allocator.building_blocks.aligned_block_list,
     std.experimental.allocator.building_blocks.allocator_list,
     std.experimental.allocator.building_blocks.ascending_page_allocator,
     std.experimental.allocator.building_blocks.bucketizer,
diff --git a/win32.mak b/win32.mak
index f6ef5e5d5..1b18d14e1 100644
--- a/win32.mak
+++ b/win32.mak
@@ -288,6 +288,7 @@ SRC_STD_EXP= \
 
 SRC_STD_EXP_ALLOC_BB= \
     std\experimental\allocator\building_blocks\affix_allocator.d \
+    std\experimental\allocator\building_blocks\aligned_block_list.d \
     std\experimental\allocator\building_blocks\allocator_list.d \
     std\experimental\allocator\building_blocks\ascending_page_allocator.d \
     std\experimental\allocator\building_blocks\bitmapped_block.d \
diff --git a/win64.mak b/win64.mak
index 38375d2e4..147a37918 100644
--- a/win64.mak
+++ b/win64.mak
@@ -313,6 +313,7 @@ SRC_STD_EXP= \
 
 SRC_STD_EXP_ALLOC_BB= \
     std\experimental\allocator\building_blocks\affix_allocator.d \
+    std\experimental\allocator\building_blocks\aligned_block_list.d \
     std\experimental\allocator\building_blocks\allocator_list.d \
     std\experimental\allocator\building_blocks\ascending_page_allocator.d \
     std\experimental\allocator\building_blocks\bitmapped_block.d \