Add AlignedBlockList

This commit is contained in:
Alexandru Jercaianu 2018-04-02 23:08:49 +03:00
parent 813327d51a
commit 0fb44b0d45
6 changed files with 717 additions and 2 deletions

View file

@@ -0,0 +1,10 @@
Implemented a new allocator, `AlignedBlockList`, and its thread-safe version `SharedAlignedBlockList`

$(REF AlignedBlockList, std,experimental,allocator,building_blocks,aligned_block_list) represents
a list of allocators which allows for deallocations in constant time.
Although allocations are in theory served in linear search time, `deallocate` calls take
$(BIGOH 1) time by using aligned allocations. The `ParentAllocator` must implement `alignedAllocate`.

$(REF SharedAlignedBlockList, std,experimental,allocator,building_blocks,aligned_block_list) has the
same semantics as its single-threaded version; however, its internal allocators must additionally be
marked as `shared`.
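A minimal usage sketch, adapted from the module's documented example; the `BitmappedBlock` and
`AscendingPageAllocator` choices and the 4 KB node size are just one possible configuration:

-------
import std.experimental.allocator.building_blocks.aligned_block_list : AlignedBlockList;
import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;

// A list of BitmappedBlock instances serving 64-byte blocks; each node is a
// 4 KB chunk, aligned at 4 KB, drawn from the page allocator
AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12) list;
auto pageAlloc = AscendingPageAllocator(128 * 4096);
list.parent = &pageAlloc;

void[] b = list.allocate(64);
assert(b.length == 64);

// Constant time: b.ptr is rounded down to its 4 KB node to find the owner
assert(list.deallocate(b));
-------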

View file

@@ -206,8 +206,8 @@ PACKAGE_std_experimental_logger = core filelogger \
 PACKAGE_std_experimental_allocator = \
 	common gc_allocator mallocator mmap_allocator package showcase typed
 PACKAGE_std_experimental_allocator_building_blocks = \
-	affix_allocator allocator_list ascending_page_allocator bucketizer \
-	fallback_allocator free_list free_tree bitmapped_block \
+	affix_allocator aligned_block_list allocator_list ascending_page_allocator \
+	bucketizer fallback_allocator free_list free_tree bitmapped_block \
 	kernighan_ritchie null_allocator package quantizer \
 	region scoped_allocator segregator stats_collector
 PACKAGE_std_net = curl isemail

View file

@@ -0,0 +1,699 @@
// Written in the D programming language.
/**
`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
and preserving a low degree of fragmentation by means of aligned allocations.

Source: $(PHOBOSSRC std/experimental/allocator/building_blocks/aligned_block_list.d)
*/
module std.experimental.allocator.building_blocks.aligned_block_list;

import std.experimental.allocator.common;
import std.experimental.allocator.building_blocks.null_allocator;
// Common function implementation for thread local and shared AlignedBlockList
private mixin template AlignedBlockListImpl(bool isShared)
{
    import std.traits : hasMember;
    import std.typecons : Ternary;

    static if (isShared)
        import core.internal.spinlock : SpinLock;

private:
    // Doubly linked list of `AlignedBlockNode`
    // Each node contains an `Allocator` followed by its payload
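    //
    // Layout of one node, as built by `insertNewNode` inside a single chunk of
    // `theAlignment` bytes, aligned at `theAlignment` (illustrative sketch):
    //
    //   |<------------------------ theAlignment bytes ------------------------>|
    //   | AlignedBlockNode (next, prev, bAlloc, bytesUsed) | payload for bAlloc |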
    static struct AlignedBlockNode
    {
        AlignedBlockNode* next, prev;
        Allocator bAlloc;

        static if (isShared)
        {
            shared(size_t) bytesUsed;

            // Since the lock is not taken when allocating, this acts like a refcount
            // keeping the node alive
            uint keepAlive;
        }
        else
        {
            size_t bytesUsed;
        }
    }

    // Root of the internal doubly linked list
    AlignedBlockNode* root;

    // Number of active nodes
    uint numNodes;

    // If `numNodes` exceeds this limit, empty nodes start being deallocated
    enum uint maxNodes = 64;

    // This lock is always taken when changing the list
    // To improve performance, the lock is not taken when the allocation logic is called
    static if (isShared)
        SpinLock lock = SpinLock(SpinLock.Contention.brief);
    // Moves a node to the front of the list, allowing for quick allocations
    void moveToFront(AlignedBlockNode* tmp)
    {
        auto localRoot = cast(AlignedBlockNode*) root;
        if (tmp == localRoot)
            return;

        if (tmp.prev) tmp.prev.next = tmp.next;
        if (tmp.next) tmp.next.prev = tmp.prev;
        if (localRoot) localRoot.prev = tmp;
        tmp.next = localRoot;
        tmp.prev = null;

        root = cast(typeof(root)) tmp;
    }

    // Removes a node from the list, including its payload
    // The payload is deallocated by calling `parent.deallocate`
    void removeNode(AlignedBlockNode* tmp)
    {
        auto next = tmp.next;
        if (tmp.prev) tmp.prev.next = tmp.next;
        if (tmp.next) tmp.next.prev = tmp.prev;
        parent.deallocate((cast(void*) tmp)[0 .. theAlignment]);

        if (tmp == cast(AlignedBlockNode*) root)
            root = cast(typeof(root)) next;

        static if (isShared)
        {
            import core.atomic : atomicOp;
            atomicOp!"-="(numNodes, 1);
        }
        else
        {
            numNodes--;
        }
    }

    // If the nodes do not have available space, a new node is created
    // by drawing memory from the parent allocator with aligned allocations.
    // The new node is inserted at the front of the list
    bool insertNewNode()
    {
        void[] buf = parent.alignedAllocate(theAlignment, theAlignment);
        if (buf is null)
            return false;

        auto localRoot = cast(AlignedBlockNode*) root;
        auto newNode = cast(AlignedBlockNode*) buf;

        // The first part of the allocation represents the node contents,
        // followed by the actual payload
        ubyte[] payload = cast(ubyte[]) buf[AlignedBlockNode.sizeof .. $];

        newNode.bAlloc = Allocator(payload);

        newNode.next = localRoot;
        newNode.prev = null;
        if (localRoot)
            localRoot.prev = newNode;
        root = cast(typeof(root)) newNode;

        static if (isShared)
        {
            import core.atomic : atomicOp;
            atomicOp!"+="(numNodes, 1);
        }
        else
        {
            numNodes++;
        }

        return true;
    }
public:
    static if (stateSize!ParentAllocator) ParentAllocator parent;
    else alias parent = ParentAllocator.instance;
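    // Note: a stateless ParentAllocator is reached through its global `instance`,
    // so no field is stored for it in that case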
    enum ulong alignment = Allocator.alignment;

    // Since all memory is drawn from ParentAllocator, we can
    // forward this to the parent
    static if (hasMember!(ParentAllocator, "owns"))
    Ternary owns(void[] b)
    {
        return parent.owns(b);
    }
    // Use `theAlignment` to find the node which allocated this block
    bool deallocate(void[] b)
    {
        if (b is null)
            return true;

        // Round the buffer down to the nearest `theAlignment` multiple to quickly
        // find the owning `AlignedBlockNode`
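        // (e.g. with the default theAlignment = 2^^21, a block at address
        // 0x7fb4_2a3c_9d40 maps to the node at 0x7fb4_2a20_0000)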
        enum ulong mask = ~(theAlignment - 1);
        ulong ptr = ((cast(ulong) b.ptr) & mask);
        AlignedBlockNode* node = cast(AlignedBlockNode*) ptr;
        if (node.bAlloc.deallocate(b))
        {
            static if (isShared)
            {
                import core.atomic : atomicOp;
                atomicOp!"-="(node.bytesUsed, b.length);
            }
            else
            {
                node.bytesUsed -= b.length;
            }
            return true;
        }
        return false;
    }
    // Allocate works only if memory can be provided via `alignedAllocate` from the parent
    static if (hasMember!(ParentAllocator, "alignedAllocate"))
    void[] allocate(size_t n)
    {
        static if (isShared)
            import core.atomic : atomicOp, atomicLoad;

        if (n == 0 || n > theAlignment)
            return null;

        static if (isShared)
        {
            lock.lock();
            scope(exit) lock.unlock();
        }

        auto tmp = cast(AlignedBlockNode*) root;

        // Iterate through list and find first node which has memory available
        while (tmp)
        {
            auto next = tmp.next;
            static if (isShared)
            {
                // Allocations can happen outside the lock
                // Make sure nobody deletes this node while using it
                tmp.keepAlive++;
                if (next) next.keepAlive++;
                lock.unlock();
            }

            auto result = tmp.bAlloc.allocate(n);
            if (result.length == n)
            {
                // Success
                static if (isShared)
                {
                    atomicOp!"+="(tmp.bytesUsed, n);
                    lock.lock();
                }
                else
                {
                    tmp.bytesUsed += n;
                }

                // Most likely this node has memory for more allocations
                // Move it to the front
                moveToFront(tmp);

                static if (isShared)
                {
                    tmp.keepAlive--;
                    if (next) next.keepAlive--;
                }

                return result;
            }

            // This node can now be removed if necessary
            static if (isShared)
            {
                lock.lock();
                tmp.keepAlive--;
                if (next) next.keepAlive--;
            }

            if (!next)
                break;

            tmp = next;
            next = tmp.next;

            // If there are too many nodes, free memory by removing empty nodes
            static if (isShared)
            {
                if (atomicLoad(numNodes) > maxNodes &&
                    atomicLoad(tmp.bytesUsed) == 0 &&
                    tmp.keepAlive == 0)
                {
                    removeNode(tmp);
                }
            }
            else
            {
                if (numNodes > maxNodes && tmp.bytesUsed == 0)
                {
                    removeNode(tmp);
                }
            }

            tmp = next;
        }

        // Cannot create new AlignedBlockNode. Most likely the ParentAllocator ran out of resources
        if (!insertNewNode())
            return null;

        tmp = cast(typeof(tmp)) root;
        void[] result = tmp.bAlloc.allocate(n);

        static if (isShared)
        {
            atomicOp!"+="(root.bytesUsed, result.length);
        }
        else
        {
            root.bytesUsed += result.length;
        }

        return result;
    }
    // goodAllocSize should not use state
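    // (hence it is computed on a throwaway, null-constructed `Allocator` instance below)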
    size_t goodAllocSize(const size_t n)
    {
        Allocator a = null;
        return a.goodAllocSize(n);
    }
}
/**
`AlignedBlockList` represents a wrapper around a chain of allocators, allowing for fast deallocations
and preserving a low degree of fragmentation.
The allocator internally holds a doubly linked list of `Allocator` objects, which serves allocations
in a most-recently-used fashion: allocators that most recently satisfied an `allocate` call are
moved to the front of the list.

Although allocations are in theory served in linear search time, `deallocate` calls take
$(BIGOH 1) time by using aligned allocations. `ParentAllocator` must implement `alignedAllocate`
and it must be able to allocate `theAlignment` bytes at the same alignment. Each aligned allocation
done by `ParentAllocator` will contain metadata for an `Allocator`, followed by its payload.

Params:
    Allocator = the allocator used to manage each node; it must have a constructor which receives
        a `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
    ParentAllocator = each node draws memory from the parent allocator; it must support `alignedAllocate`
    theAlignment = alignment of each block and, at the same time, length of each node
*/
struct AlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
{
    version (StdDdoc)
    {
        import std.typecons : Ternary;
        import std.traits : hasMember;

        /**
        Returns a chunk of memory of size `n`.
        It finds the first node in the `AlignedBlockNode` list which has available memory,
        and moves it to the front of the list.

        All empty nodes which cannot return new memory are removed from the list.

        Params:
            n = bytes to allocate

        Returns:
            A chunk of memory of the required length or `null` on failure
        */
        static if (hasMember!(ParentAllocator, "alignedAllocate"))
        void[] allocate(size_t n);

        /**
        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
        to the nearest multiple of the `alignment` to quickly find the corresponding
        `AlignedBlockNode`.

        Params:
            b = buffer candidate for deallocation

        Returns:
            `true` on success and `false` on failure
        */
        bool deallocate(void[] b);

        /**
        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
        `Ternary.no` otherwise.

        Params:
            b = buffer tested for ownership by this allocator

        Returns:
            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
        */
        static if (hasMember!(ParentAllocator, "owns"))
        Ternary owns(void[] b);
    }
    else
    {
        import std.math : isPowerOf2;
        static assert(isPowerOf2(alignment));
        mixin AlignedBlockListImpl!false;
    }
}
///
@system unittest
{
    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
    import std.experimental.allocator.building_blocks.segregator : Segregator;
    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
    import std.typecons : Ternary;

    /*
    In this example we use `AlignedBlockList` in conjunction with other allocators
    in order to create a more complex allocator.

    The `SuperAllocator` uses a `Segregator` to distribute allocations to sub-allocators,
    based on the requested size.

    Each sub-allocator is represented by an `AlignedBlockList` of `BitmappedBlock`s.
    Each `AlignedBlockList` draws memory from a root allocator, which in this case is an
    `AscendingPageAllocator`.

    Such an allocator not only provides good performance, but also a low degree of
    memory fragmentation.
    */
    alias SuperAllocator = Segregator!(
        32,
        AlignedBlockList!(BitmappedBlock!32, AscendingPageAllocator*, 1 << 12),
        Segregator!(
            64,
            AlignedBlockList!(BitmappedBlock!64, AscendingPageAllocator*, 1 << 12),
            Segregator!(
                128,
                AlignedBlockList!(BitmappedBlock!128, AscendingPageAllocator*, 1 << 12),
                AscendingPageAllocator*
            )));

    SuperAllocator a;
    auto pageAlloc = AscendingPageAllocator(128 * 4096);

    // Set the parent allocator for all the sub-allocators
    a.allocatorForSize!256 = &pageAlloc;
    a.allocatorForSize!128.parent = &pageAlloc;
    a.allocatorForSize!64.parent = &pageAlloc;
    a.allocatorForSize!32.parent = &pageAlloc;

    enum testNum = 10;
    void[][testNum] buf;

    // Allocations of size 32 will go to the first `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(32);
        assert(buf[j].length == 32);

        // This is owned by the first `AlignedBlockList`
        assert(a.allocatorForSize!32.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations of size 64 will go to the second `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(64);
        assert(buf[j].length == 64);

        // This is owned by the second `AlignedBlockList`
        assert(a.allocatorForSize!64.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations of size 128 will go to the third `AlignedBlockList`
    foreach (j; 0 .. testNum)
    {
        buf[j] = a.allocate(128);
        assert(buf[j].length == 128);

        // This is owned by the third `AlignedBlockList`
        assert(a.allocatorForSize!128.owns(buf[j]) == Ternary.yes);
    }

    // Free the memory
    foreach (j; 0 .. testNum)
        assert(a.deallocate(buf[j]));

    // Allocations which exceed 128 will go to the `AscendingPageAllocator*`
    void[] b = a.allocate(256);
    assert(b.length == 256);
    a.deallocate(b);
}
/**
`SharedAlignedBlockList` is the thread-safe version of `AlignedBlockList`.
The `Allocator` template parameter must refer to a shared allocator.
Also, `ParentAllocator` must be a shared allocator, supporting `alignedAllocate`.

Params:
    Allocator = the shared allocator used to manage each node; it must have a constructor which receives
        a `ubyte[]` and it must not have any parent allocators, except for the `NullAllocator`
    ParentAllocator = each node draws memory from the parent allocator; it must be shared and support `alignedAllocate`
    theAlignment = alignment of each block and, at the same time, length of each node
*/
shared struct SharedAlignedBlockList(Allocator, ParentAllocator, ulong theAlignment = (1 << 21))
{
    version (StdDdoc)
    {
        import std.typecons : Ternary;
        import std.traits : hasMember;

        /**
        Returns a chunk of memory of size `n`.
        It finds the first node in the `AlignedBlockNode` list which has available memory,
        and moves it to the front of the list.

        All empty nodes which cannot return new memory are removed from the list.

        Params:
            n = bytes to allocate

        Returns:
            A chunk of memory of the required length or `null` on failure
        */
        static if (hasMember!(ParentAllocator, "alignedAllocate"))
        void[] allocate(size_t n);

        /**
        Deallocates the buffer `b` given as parameter. Deallocations take place in constant
        time, regardless of the number of nodes in the list. `b.ptr` is rounded down
        to the nearest multiple of the `alignment` to quickly find the corresponding
        `AlignedBlockNode`.

        Params:
            b = buffer candidate for deallocation

        Returns:
            `true` on success and `false` on failure
        */
        bool deallocate(void[] b);

        /**
        Returns `Ternary.yes` if the buffer belongs to the parent allocator and
        `Ternary.no` otherwise.

        Params:
            b = buffer tested for ownership by this allocator

        Returns:
            `Ternary.yes` if owned by this allocator and `Ternary.no` otherwise
        */
        static if (hasMember!(ParentAllocator, "owns"))
        Ternary owns(void[] b);
    }
    else
    {
        import std.math : isPowerOf2;
        static assert(isPowerOf2(alignment));
        mixin AlignedBlockListImpl!true;
    }
}
///
@system unittest
{
    import std.experimental.allocator.building_blocks.region : SharedRegion;
    import std.experimental.allocator.building_blocks.ascending_page_allocator : SharedAscendingPageAllocator;
    import std.experimental.allocator.building_blocks.null_allocator : NullAllocator;
    import core.thread : ThreadGroup;

    enum numThreads = 8;
    enum size = 2048;
    enum maxIter = 10;

    /*
    In this example we use `SharedAlignedBlockList` together with `SharedRegion`,
    in order to create a fast, thread-safe allocator.
    */
    alias SuperAllocator = SharedAlignedBlockList!(
        SharedRegion!(NullAllocator, 1),
        SharedAscendingPageAllocator,
        4096);

    SuperAllocator a;
    // The `SuperAllocator` will draw memory from a `SharedAscendingPageAllocator`
    a.parent = SharedAscendingPageAllocator(4096 * 1024);

    // Launch `numThreads` threads, each performing allocations
    void fun()
    {
        foreach (i; 0 .. maxIter)
        {
            void[] b = a.allocate(size);
            assert(b.length == size);
        }
    }

    auto tg = new ThreadGroup;
    foreach (i; 0 .. numThreads)
    {
        tg.create(&fun);
    }
    tg.joinAll();
}
version (unittest)
{
    static void testrw(void[] b)
    {
        ubyte* buf = cast(ubyte*) b.ptr;
        size_t len = (b.length).roundUpToMultipleOf(4096);
        for (int i = 0; i < len; i += 4096)
        {
            buf[i] = (cast(ubyte) i % 256);
            assert(buf[i] == (cast(ubyte) i % 256));
        }
    }
}
@system unittest
{
    import std.experimental.allocator.building_blocks.region;
    import std.experimental.allocator.building_blocks.ascending_page_allocator;
    import std.random;
    import std.algorithm.sorting : sort;
    import core.thread : ThreadGroup;
    import core.internal.spinlock : SpinLock;

    enum pageSize = 4096;
    enum numThreads = 10;
    enum maxIter = 20;
    enum totalAllocs = maxIter * numThreads;
    size_t count = 0;
    SpinLock lock = SpinLock(SpinLock.Contention.brief);

    alias SuperAllocator = SharedAlignedBlockList!(
        SharedRegion!(NullAllocator, 1),
        SharedAscendingPageAllocator,
        1 << 16);
    void[][totalAllocs] buf;

    SuperAllocator a;
    a.parent = SharedAscendingPageAllocator(4096 * 1024);

    void fun()
    {
        auto rnd = Random(1000);

        foreach (i; 0 .. maxIter)
        {
            auto size = uniform(1, pageSize + 1, rnd);
            void[] b = a.allocate(size);
            assert(b.length == size);
            testrw(b);

            lock.lock();
            buf[count++] = b;
            lock.unlock();
        }
    }

    auto tg = new ThreadGroup;
    foreach (i; 0 .. numThreads)
    {
        tg.create(&fun);
    }
    tg.joinAll();

    sort!((a, b) => a.ptr < b.ptr)(buf[0 .. totalAllocs]);
    foreach (i; 0 .. totalAllocs - 1)
    {
        assert(buf[i].ptr + a.goodAllocSize(buf[i].length) <= buf[i + 1].ptr);
    }

    foreach (i; 0 .. totalAllocs)
    {
        assert(a.deallocate(buf[totalAllocs - 1 - i]));
    }
}
@system unittest
{
    import std.experimental.allocator.building_blocks.ascending_page_allocator : AscendingPageAllocator;
    import std.experimental.allocator.building_blocks.segregator : Segregator;
    import std.experimental.allocator.building_blocks.bitmapped_block : BitmappedBlock;
    import std.random;

    alias SuperAllocator = Segregator!(
        256,
        AlignedBlockList!(BitmappedBlock!256, AscendingPageAllocator*, 1 << 16),
        Segregator!(
            512,
            AlignedBlockList!(BitmappedBlock!512, AscendingPageAllocator*, 1 << 16),
            Segregator!(
                1024,
                AlignedBlockList!(BitmappedBlock!1024, AscendingPageAllocator*, 1 << 16),
                Segregator!(
                    2048,
                    AlignedBlockList!(BitmappedBlock!2048, AscendingPageAllocator*, 1 << 16),
                    AscendingPageAllocator*
                ))));

    SuperAllocator a;
    auto pageAlloc = AscendingPageAllocator(4096 * 4096);

    a.allocatorForSize!4096 = &pageAlloc;
    a.allocatorForSize!2048.parent = &pageAlloc;
    a.allocatorForSize!1024.parent = &pageAlloc;
    a.allocatorForSize!512.parent = &pageAlloc;
    a.allocatorForSize!256.parent = &pageAlloc;

    auto rnd = Random(1000);

    size_t maxIter = 10;
    enum testNum = 10;
    void[][testNum] buf;
    int maxSize = 8192;
    foreach (i; 0 .. maxIter)
    {
        foreach (j; 0 .. testNum)
        {
            auto size = uniform(1, maxSize + 1, rnd);
            buf[j] = a.allocate(size);
            assert(buf[j].length == size);
            testrw(buf[j]);
        }

        randomShuffle(buf[]);

        foreach (j; 0 .. testNum)
        {
            assert(a.deallocate(buf[j]));
        }
    }
}

View file

@@ -225,6 +225,9 @@
 $(HTTP man7.org/linux/man-pages/man3/posix_memalign.3.html, `posix_memalign`)
 on Posix and $(HTTP msdn.microsoft.com/en-us/library/fs9stz4e(v=vs.80).aspx,
 `__aligned_xxx`) on Windows.))
+$(TR $(TDC2 AlignedBlockList, aligned_block_list) $(TD A wrapper around a list of allocators
+which allow for very fast deallocations.))
 $(TR $(TDC2 AffixAllocator, affix_allocator) $(TD Allocator that allows and manages allocating
 extra prefix and/or a suffix bytes for each block allocated.))
@@ -302,6 +305,7 @@ module std.experimental.allocator.building_blocks;
 public import
     std.experimental.allocator.building_blocks.affix_allocator,
+    std.experimental.allocator.building_blocks.aligned_block_list,
     std.experimental.allocator.building_blocks.allocator_list,
     std.experimental.allocator.building_blocks.ascending_page_allocator,
     std.experimental.allocator.building_blocks.bucketizer,

View file

@@ -288,6 +288,7 @@ SRC_STD_EXP= \
 SRC_STD_EXP_ALLOC_BB= \
 	std\experimental\allocator\building_blocks\affix_allocator.d \
+	std\experimental\allocator\building_blocks\aligned_block_list.d \
 	std\experimental\allocator\building_blocks\allocator_list.d \
 	std\experimental\allocator\building_blocks\ascending_page_allocator.d \
 	std\experimental\allocator\building_blocks\bitmapped_block.d \

View file

@@ -313,6 +313,7 @@ SRC_STD_EXP= \
 SRC_STD_EXP_ALLOC_BB= \
 	std\experimental\allocator\building_blocks\affix_allocator.d \
+	std\experimental\allocator\building_blocks\aligned_block_list.d \
 	std\experimental\allocator\building_blocks\allocator_list.d \
 	std\experimental\allocator\building_blocks\ascending_page_allocator.d \
 	std\experimental\allocator\building_blocks\bitmapped_block.d \