Add AlignedBlockList

This commit is contained in:
Alexandru Jercaianu 2018-04-02 23:08:49 +03:00
parent 813327d51a
commit 0fb44b0d45
6 changed files with 717 additions and 2 deletions

View file

@@ -0,0 +1,10 @@
Implemented a new allocator, `AlignedBlockList`, and its thread-safe version `SharedAlignedBlockList`

$(REF AlignedBlockList, std,experimental,allocator,building_blocks,aligned_block_list) represents
a list of allocators which allows for deallocations in constant time.
Although allocations are in theory served in linear search time, `deallocate` calls take
$(BIGOH 1) time by using aligned allocations. The `ParentAllocator` must implement `alignedAllocate`.

$(REF SharedAlignedBlockList, std,experimental,allocator,building_blocks,aligned_block_list) has the
same semantics as its single-threaded version; however, its internal allocators must additionally be
marked as `shared`.
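A minimal usage sketch, adapted from the module's documented example; the `BitmappedBlock` and
`AscendingPageAllocator` choices and the 4 KB node size are just one possible configuration:

-------
import std.experimental.allocator.building_blocks.aligned_block_list : AlignedBlockList;
import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;

// A list of BitmappedBlock instances serving 64-byte blocks; each node is a
// 4 KB chunk, aligned at 4 KB, drawn from the page allocator
AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12) list;
auto pageAlloc = AscendingPageAllocator(128 * 4096);
list.parent = &pageAlloc;

void[] b = list.allocate(64);
assert(b.length == 64);

// Constant time: b.ptr is rounded down to its 4 KB node to find the owner
assert(list.deallocate(b));
-------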

View file

@@ -206,8 +206,8 @@ PACKAGE_std_experimental_logger = core filelogger \
 PACKAGE_std_experimental_allocator = \
 	common gc_allocator mallocator mmap_allocator package showcase typed
 PACKAGE_std_experimental_allocator_building_blocks = \
-	affix_allocator allocator_list ascending_page_allocator bucketizer \
-	fallback_allocator free_list free_tree bitmapped_block \
+	affix_allocator aligned_block_list allocator_list ascending_page_allocator \
+	bucketizer fallback_allocator free_list free_tree bitmapped_block \
 	kernighan_ritchie null_allocator package quantizer \
 	region scoped_allocator segregator stats_collector
 PACKAGE_std_net = curl isemail

View file

@@ -0,0 +1,699 @@
// Written in the D programming language.
/**
`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
and preserving a low degree of fragmentation by means of aligned allocations.

Source: $(PHOBOSSRC std/experimental/allocator/building_blocks/aligned_block_list.d)
*/
module std.experimental.allocator.building_blocks.aligned_block_list;

import std.experimental.allocator.common;
import std.experimental.allocator.building_blocks.null_allocator;
// Common function implementation for thread local and shared AlignedBlockList
private mixin template AlignedBlockListImpl(bool isShared)
{
    import std.traits : hasMember;
    import std.typecons : Ternary;

    static if (isShared)
        import core.internal.spinlock : SpinLock;

private:
    // Doubly linked list of `AlignedBlockNode`
    // Each node contains an `Allocator` followed by its payload
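    //
    // Layout of one node, as built by `insertNewNode` inside a single chunk of
    // `theAlignment` bytes, aligned at `theAlignment` (illustrative sketch):
    //
    //   |<------------------------ theAlignment bytes ------------------------>|
    //   | AlignedBlockNode (next, prev, bAlloc, bytesUsed) | payload for bAlloc |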
    static struct AlignedBlockNode
    {
        AlignedBlockNode* next, prev;
        Allocator bAlloc;

        static if (isShared)
        {
            shared(size_t) bytesUsed;

            // Since the lock is not taken when allocating, this acts like a refcount
            // keeping the node alive
            uint keepAlive;
        }
        else
        {
            size_t bytesUsed;
        }
    }

    // Root of the internal doubly linked list
    AlignedBlockNode* root;

    // Number of active nodes
    uint numNodes;

    // If `numNodes` exceeds this limit, empty nodes start being deallocated
    enum uint maxNodes = 64;

    // This lock is always taken when changing the list
    // To improve performance, the lock is not taken when the allocation logic is called
    static if (isShared)
        SpinLock lock = SpinLock(SpinLock.Contention.brief);
    // Moves a node to the front of the list, allowing for quick allocations
    void moveToFront(AlignedBlockNode* tmp)
    {
        auto localRoot = cast(AlignedBlockNode*) root;
        if (tmp == localRoot)
            return;

        if (tmp.prev) tmp.prev.next = tmp.next;
        if (tmp.next) tmp.next.prev = tmp.prev;
        if (localRoot) localRoot.prev = tmp;
        tmp.next = localRoot;
        tmp.prev = null;

        root = cast(typeof(root)) tmp;
    }

    // Removes a node from the list, including its payload
    // The payload is deallocated by calling `parent.deallocate`
    void removeNode(AlignedBlockNode* tmp)
    {
        auto next = tmp.next;
        if (tmp.prev) tmp.prev.next = tmp.next;
        if (tmp.next) tmp.next.prev = tmp.prev;
        parent.deallocate((cast(void*) tmp)[0 .. theAlignment]);

        if (tmp == cast(AlignedBlockNode*) root)
            root = cast(typeof(root)) next;

        static if (isShared)
        {
            import core.atomic : atomicOp;
            atomicOp!"-="(numNodes, 1);
        }
        else
        {
            numNodes--;
        }
    }

    // If the nodes do not have available space, a new node is created
    // by drawing memory from the parent allocator with aligned allocations.
    // The new node is inserted at the front of the list
    bool insertNewNode()
    {
        void[] buf = parent.alignedAllocate(theAlignment, theAlignment);
        if (buf is null)
            return false;

        auto localRoot = cast(AlignedBlockNode*) root;
        auto newNode = cast(AlignedBlockNode*) buf;

        // The first part of the allocation represents the node contents,
        // followed by the actual payload
        ubyte[] payload = cast(ubyte[]) buf[AlignedBlockNode.sizeof .. $];

        newNode.bAlloc = Allocator(payload);

        newNode.next = localRoot;
        newNode.prev = null;
        if (localRoot)
            localRoot.prev = newNode;
        root = cast(typeof(root)) newNode;

        static if (isShared)
        {
            import core.atomic : atomicOp;
            atomicOp!"+="(numNodes, 1);
        }
        else
        {
            numNodes++;
        }

        return true;
    }
public:
    static if (stateSize!ParentAllocator) ParentAllocator parent;
    else alias parent = ParentAllocator.instance;
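    // Note: a stateless ParentAllocator is reached through its global `instance`,
    // so no field is stored for it in that case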
    enum ulong alignment = Allocator.alignment;

    // Since all memory is drawn from ParentAllocator, we can
    // forward this to the parent
    static if (hasMember!(ParentAllocator, "owns"))
    Ternary owns(void[] b)
    {
        return parent.owns(b);
    }
    // Use `theAlignment` to find the node which allocated this block
    bool deallocate(void[] b)
    {
        if (b is null)
            return true;

        // Round the buffer down to the nearest `theAlignment` multiple to quickly
        // find the owning `AlignedBlockNode`
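        // (e.g. with the default theAlignment = 2^^21, a block at address
        // 0x7fb4_2a3c_9d40 maps to the node at 0x7fb4_2a20_0000)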
        enum ulong mask = ~(theAlignment - 1);
        ulong ptr = ((cast(ulong) b.ptr) & mask);
        AlignedBlockNode* node = cast(AlignedBlockNode*) ptr;
        if (node.bAlloc.deallocate(b))
        {
            static if (isShared)
            {
                import core.atomic : atomicOp;
                atomicOp!"-="(node.bytesUsed, b.length);
            }
            else
            {
                node.bytesUsed -= b.length;
            }
            return true;
        }
        return false;
    }
    // Allocate works only if memory can be provided via `alignedAllocate` from the parent
    static if (hasMember!(ParentAllocator, "alignedAllocate"))
    void[] allocate(size_t n)
    {
        static if (isShared)
            import core.atomic : atomicOp, atomicLoad;

        if (n == 0 || n > theAlignment)
            return null;

        static if (isShared)
        {
            lock.lock();
            scope(exit) lock.unlock();
        }

        auto tmp = cast(AlignedBlockNode*) root;

        // Iterate through list and find first node which has memory available
        while (tmp)
        {
            auto next = tmp.next;
            static if (isShared)
            {
                // Allocations can happen outside the lock
                // Make sure nobody deletes this node while using it
                tmp.keepAlive++;
                if (next) next.keepAlive++;
                lock.unlock();
            }

            auto result = tmp.bAlloc.allocate(n);
            if (result.length == n)
            {
                // Success
                static if (isShared)
                {
                    atomicOp!"+="(tmp.bytesUsed, n);
                    lock.lock();
                }
                else
                {
                    tmp.bytesUsed += n;
                }

                // Most likely this node has memory for more allocations
                // Move it to the front
                moveToFront(tmp);

                static if (isShared)
                {
                    tmp.keepAlive--;
                    if (next) next.keepAlive--;
                }

                return result;
            }

            // This node can now be removed if necessary
            static if (isShared)
            {
                lock.lock();
                tmp.keepAlive--;
                if (next) next.keepAlive--;
            }

            if (!next)
                break;

            tmp = next;
            next = tmp.next;

            // If there are too many nodes, free memory by removing empty nodes
            static if (isShared)
            {
                if (atomicLoad(numNodes) > maxNodes &&
                    atomicLoad(tmp.bytesUsed) == 0 &&
                    tmp.keepAlive == 0)
                {
                    removeNode(tmp);
                }
            }
            else
            {
                if (numNodes > maxNodes && tmp.bytesUsed == 0)
                {
                    removeNode(tmp);
                }
            }

            tmp = next;
        }

        // Cannot create new AlignedBlockNode. Most likely the ParentAllocator ran out of resources
        if (!insertNewNode())
            return null;

        tmp = cast(typeof(tmp)) root;
        void[] result = tmp.bAlloc.allocate(n);

        static if (isShared)
        {
            atomicOp!"+="(root.bytesUsed, result.length);
        }
        else
        {
            root.bytesUsed += result.length;
        }

        return result;
    }
    // goodAllocSize should not use state
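    // (hence it is computed on a throwaway, null-constructed `Allocator` instance below)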
    size_t goodAllocSize(const size_t n)
    {
        Allocator a = null;
        return a.goodAllocSize(n);
    }
}
/**
`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
and preserving a low degree of fragmentation.
The allocator internally holds a doubly linked list of `Allocator` objects, which serves allocations
in a most-recently-used fashion: allocators that most recently satisfied an `allocate` call are
moved to the front of the list.

Although allocations are in theory served in linear search time, `deallocate` calls take
$(BIGOH 1) time by using aligned allocations. `ParentAllocator` must implement `alignedAllocate`
and it must be able to allocate `theAlignment` bytes at the same alignment. Each aligned allocation
done by `ParentAllocator` will contain metadata for an `Allocator`, followed by its payload.

Params:
    Allocator = the allocator used to manage each node; it must have a constructor which receives
        a `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
    ParentAllocator = each node draws memory from the parent allocator; it must support `alignedAllocate`
    theAlignment = alignment of each block and, at the same time, length of each node
*/
struct AlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
{
    version (StdDdoc)
    {
        import std.typecons : Ternary;
        import std.traits : hasMember;

        /**
        Returns a chunk of memory of size `n`.
        It finds the first node in the `AlignedBlockNode` list which has available memory,
        and moves it to the front of the list.

        All empty nodes which cannot return new memory are removed from the list.

        Params:
            n = bytes to allocate

        Returns:
            A chunk of memory of the required length or `null` on failure
        */
        static if (hasMember!(ParentAllocator, "alignedAllocate"))
        void[] allocate(size_t n);

        /**
        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
        to the nearest multiple of the `alignment` to quickly find the corresponding
        `AlignedBlockNode`.

        Params:
            b = buffer candidate for deallocation

        Returns:
            `true` on success and `false` on failure
        */
        bool deallocate(void[] b);

        /**
        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
        `Ternary.no` otherwise.

        Params:
            b = buffer tested for ownership by this allocator

        Returns:
            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
        */
        static if (hasMember!(ParentAllocator, "owns"))
        Ternary owns(void[] b);
    }
    else
    {
        import std.math : isPowerOf2;
        static assert(isPowerOf2(alignment));
        mixin AlignedBlockListImpl!false;
    }
}
///
@system unittest
{
    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
    import std.experimental.allocator.building_blocks.segregator : Segregator;
    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
    import std.typecons : Ternary;

    /*
    In this example we use `AlignedBlockList` in conjunction with other allocators
    in order to create a more complex allocator.

    The `SuperAllocator` uses a `Segregator` to distribute allocations to sub-allocators,
    based on the requested size.

    Each sub-allocator is represented by an `AlignedBlockList` of `BitmappedBlock`s.
    Each `AlignedBlockList` draws memory from a root allocator, which in this case is an
    `AscendingPageAllocator`.

    Such an allocator not only provides good performance, but also a low degree of
    memory fragmentation.
    */
    alias SuperAllocator = Segregator!(
        32,
        AlignedBlockList!(BitmappedBlock!32, AscendingPageAllocator*, 1 << 12),
        Segregator!(
            64,
            AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12),
            Segregator!(
                128,
                AlignedBlockList!(BitmappedBlock!128, AscendingPageAllocator*, 1 << 12),
                AscendingPageAllocator*
            )));

    SuperAllocator a;
    auto pageAlloc = AscendingPageAllocator(128 * 4096);

    // Set the parent allocator for all the sub-allocators
    a.allocatorForSize!256 = &pageAlloc;
    a.allocatorForSize!128.parent = &pageAlloc;
    a.allocatorForSize!64.parent = &pageAlloc;
    a.allocatorForSize!32.parent = &pageAlloc;

    enum testNum = 10;
    void[][testNum] buf;

    // Allocations of size 32 will go to the first `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(32);
        assert(buf[j].length == 32);

        // This is owned by the first `AlignedBlockList`
        assert(a.allocatorForSize!32.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations of size 64 will go to the second `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(64);
        assert(buf[j].length == 64);

        // This is owned by the second `AlignedBlockList`
        assert(a.allocatorForSize!64.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations of size 128 will go to the third `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(128);
        assert(buf[j].length == 128);

        // This is owned by the third `AlignedBlockList`
        assert(a.allocatorForSize!128.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations which exceed 128 will go to the `AscendingPageAllocator*`
    void[] b = a.allocate(256);
    assert(b.length == 256);
    a.deallocate(b);
}
/**
`SharedAlignedBlockList` is the thread-safe version of `AlignedBlockList`.
The `Allocator` template parameter must refer to a shared allocator.
Also, `ParentAllocator` must be a shared allocator, supporting `alignedAllocate`.

Params:
    Allocator = the shared allocator used to manage each node; it must have a constructor which receives
        a `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
    ParentAllocator = each node draws memory from the parent allocator; it must be shared and support `alignedAllocate`
    theAlignment = alignment of each block and, at the same time, length of each node
*/
shared struct SharedAlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
{
    version (StdDdoc)
    {
        import std.typecons : Ternary;
        import std.traits : hasMember;

        /**
        Returns a chunk of memory of size `n`.
        It finds the first node in the `AlignedBlockNode` list which has available memory,
        and moves it to the front of the list.

        All empty nodes which cannot return new memory are removed from the list.

        Params:
            n = bytes to allocate

        Returns:
            A chunk of memory of the required length or `null` on failure
        */
        static if (hasMember!(ParentAllocator, "alignedAllocate"))
        void[] allocate(size_t n);

        /**
        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
        to the nearest multiple of the `alignment` to quickly find the corresponding
        `AlignedBlockNode`.

        Params:
            b = buffer candidate for deallocation

        Returns:
            `true` on success and `false` on failure
        */
        bool deallocate(void[] b);

        /**
        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
        `Ternary.no` otherwise.

        Params:
            b = buffer tested for ownership by this allocator

        Returns:
            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
        */
        static if (hasMember!(ParentAllocator, "owns"))
        Ternary owns(void[] b);
    }
    else
    {
        import std.math : isPowerOf2;
        static assert(isPowerOf2(alignment));
        mixin AlignedBlockListImpl!true;
    }
}
///
@system unittest
{
    import std.experimental.allocator.building_blocks.region : SharedRegion;
    import std.experimental.allocator.building_blocks.ascending_page_allocator : SharedAscendingPageAllocator;
    import std.experimental.allocator.building_blocks.null_allocator : NullAllocator;
    import core.thread : ThreadGroup;

    enum numThreads = 8;
    enum size = 2048;
    enum maxIter = 10;

    /*
    In this example we use `SharedAlignedBlockList` together with `SharedRegion`,
    in order to create a fast, thread-safe allocator.
    */
    alias SuperAllocator = SharedAlignedBlockList!(
        SharedRegion!(NullAllocator, 1),
        SharedAscendingPageAllocator,
        4096);

    SuperAllocator a;
    // The `SuperAllocator` will draw memory from a `SharedAscendingPageAllocator`
    a.parent = SharedAscendingPageAllocator(4096 * 1024);

    // Launch `numThreads` threads, each performing allocations
    void fun()
    {
        foreach (i; 0 .. maxIter)
        {
            void[] b = a.allocate(size);
            assert(b.length == size);
        }
    }

    auto tg = new ThreadGroup;
    foreach (i; 0 .. numThreads)
    {
        tg.create(&fun);
    }
    tg.joinAll();
}
version (unittest)
{
    static void testrw(void[] b)
    {
        ubyte* buf = cast(ubyte*) b.ptr;
        size_t len = (b.length).roundUpToMultipleOf(4096);
        for (int i = 0; i < len; i += 4096)
        {
            buf[i] = (cast(ubyte) i % 256);
            assert(buf[i] == (cast(ubyte) i % 256));
        }
    }
}
@system unittest
{
    import std.experimental.allocator.building_blocks.region;
    import std.experimental.allocator.building_blocks.ascending_page_allocator;
    import std.random;
    import std.algorithm.sorting : sort;
    import core.thread : ThreadGroup;
    import core.internal.spinlock : SpinLock;

    enum pageSize = 4096;
    enum numThreads = 10;
    enum maxIter = 20;
    enum totalAllocs = maxIter * numThreads;
    size_t count = 0;
    SpinLock lock = SpinLock(SpinLock.Contention.brief);

    alias SuperAllocator = SharedAlignedBlockList!(
        SharedRegion!(NullAllocator, 1),
        SharedAscendingPageAllocator,
        1 << 16);
    void[][totalAllocs] buf;

    SuperAllocator a;
    a.parent = SharedAscendingPageAllocator(4096 * 1024);

    void fun()
    {
        auto rnd = Random(1000);

        foreach (i; 0 .. maxIter)
        {
            auto size = uniform(1, pageSize + 1, rnd);
            void[] b = a.allocate(size);
            assert(b.length == size);
            testrw(b);

            lock.lock();
            buf[count++] = b;
            lock.unlock();
        }
    }

    auto tg = new ThreadGroup;
    foreach (i; 0 .. numThreads)
    {
        tg.create(&fun);
    }
    tg.joinAll();

    sort!((a, b) => a.ptr < b.ptr)(buf[0 .. totalAllocs]);
    foreach (i; 0 .. totalAllocs - 1)
    {
        assert(buf[i].ptr + a.goodAllocSize(buf[i].length) <= buf[i + 1].ptr);
    }

    foreach (i; 0 .. totalAllocs)
    {
        assert(a.deallocate(buf[totalAllocs - 1 - i]));
    }
}
@system unittest
{
    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
    import std.experimental.allocator.building_blocks.segregator : Segregator;
    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
    import std.random;

    alias SuperAllocator = Segregator!(
        256,
        AlignedBlockList!(BitmappedBlock!256, AscendingPageAllocator*, 1 << 16),
        Segregator!(
            512,
            AlignedBlockList!(BitmappedBlock!512, AscendingPageAllocator*, 1 << 16),
            Segregator!(
                1024,
                AlignedBlockList!(BitmappedBlock!1024, AscendingPageAllocator*, 1 << 16),
                Segregator!(
                    2048,
                    AlignedBlockList!(BitmappedBlock!2048, AscendingPageAllocator*, 1 << 16),
                    AscendingPageAllocator*
                ))));

    SuperAllocator a;
    auto pageAlloc = AscendingPageAllocator(4096 * 4096);

    a.allocatorForSize!4096 = &pageAlloc;
    a.allocatorForSize!2048.parent = &pageAlloc;
    a.allocatorForSize!1024.parent = &pageAlloc;
    a.allocatorForSize!512.parent = &pageAlloc;
    a.allocatorForSize!256.parent = &pageAlloc;

    auto rnd = Random(1000);

    size_t maxIter = 10;
    enum testNum = 10;
    void[][testNum] buf;
    int maxSize = 8192;
    foreach (i; 0 .. maxIter)
    {
        foreach (j; 0 .. testNum)
        {
            auto size = uniform(1, maxSize + 1, rnd);
            buf[j] = a.allocate(size);
            assert(buf[j].length == size);
            testrw(buf[j]);
        }

        randomShuffle(buf[]);

        foreach (j; 0 .. testNum)
        {
            assert(a.deallocate(buf[j]));
        }
    }
}

View file

@@ -225,6 +225,9 @@
 $(HTTP man7.org/linux/man-pages/man3/posix_memalign.3.html, `posix_memalign`)
 on Posix and $(HTTP msdn.microsoft.com/en-us/library/fs9stz4e(v=vs.80).aspx,
 `__aligned_xxx`) on Windows.))
+$(TR $(TDC2 AlignedBlockList, aligned_block_list) $(TD A wrapper around a list of allocators
+which allow for very fast deallocations.))
 $(TR $(TDC2 AffixAllocator, affix_allocator) $(TD Allocator that allows and manages allocating
 extra prefix and/or a suffix bytes for each block allocated.))
@@ -302,6 +305,7 @@ module std.experimental.allocator.building_blocks;
 public import
     std.experimental.allocator.building_blocks.affix_allocator,
+    std.experimental.allocator.building_blocks.aligned_block_list,
     std.experimental.allocator.building_blocks.allocator_list,
     std.experimental.allocator.building_blocks.ascending_page_allocator,
     std.experimental.allocator.building_blocks.bucketizer,

View file

@@ -288,6 +288,7 @@ SRC_STD_EXP= \
 SRC_STD_EXP_ALLOC_BB= \
 	std\experimental\allocator\building_blocks\affix_allocator.d \
+	std\experimental\allocator\building_blocks\aligned_block_list.d \
 	std\experimental\allocator\building_blocks\allocator_list.d \
 	std\experimental\allocator\building_blocks\ascending_page_allocator.d \
 	std\experimental\allocator\building_blocks\bitmapped_block.d \

View file

@@ -313,6 +313,7 @@ SRC_STD_EXP= \
 SRC_STD_EXP_ALLOC_BB= \
 	std\experimental\allocator\building_blocks\affix_allocator.d \
+	std\experimental\allocator\building_blocks\aligned_block_list.d \
 	std\experimental\allocator\building_blocks\allocator_list.d \
 	std\experimental\allocator\building_blocks\ascending_page_allocator.d \
 	std\experimental\allocator\building_blocks\bitmapped_block.d \