Redo hash table to avoid array append & GC lock/unlock

Also save hash to speed up in case of collisions & allow future re-hashing.
This commit is contained in:
Dmitry Olshansky 2013-02-23 00:22:58 +04:00
parent e4baf99ca4
commit 0c97b2f573
2 changed files with 64 additions and 42 deletions

5
main.d
View File

@ -148,7 +148,10 @@ int main(string[] args)
config.fileName = arg; config.fileName = arg;
uint count; uint count;
auto f = File(arg); auto f = File(arg);
ubyte[] buffer = uninitializedArray!(ubyte[])(f.size); import core.stdc.stdlib;
ubyte[] buffer = (cast(ubyte*)malloc(f.size))[0..f.size];
scope(exit) free(buffer.ptr);
//uninitializedArray!(ubyte[])(f.size);
foreach (t; byToken(f.rawRead(buffer), config)) foreach (t; byToken(f.rawRead(buffer), config))
{ {
if (tokenCount) if (tokenCount)

View File

@ -3051,40 +3051,33 @@ struct StringCache
if(isRandomAccessRange!R if(isRandomAccessRange!R
&& is(Unqual!(ElementType!R) : const(ubyte))) && is(Unqual!(ElementType!R) : const(ubyte)))
{ {
size_t bucket;
hash_t h; uint h = hash(range);
string* val = find(range, bucket, h); uint bucket = h % mapSize;
if (val !is null) Slot *s = &index[bucket];
//1st slot not yet initialized?
if(s.value.ptr == null)
{ {
return *val; *s = Slot(putIntoCache(range), null, h);
return s.value;
} }
else Slot* insSlot = s;
for(;;)
{ {
auto s = putIntoCache(range); if(s.hash == h && s.value.equal(range))
index[bucket] ~= s; return s.value;
return s; insSlot = s;
s = s.next;
if(s == null) break;
} }
string str = putIntoCache(range);
insertIntoSlot(insSlot, str, h);
return str;
} }
private: private:
import core.stdc.string; static uint hash(R)(R data)
string* find(R)(R data, out size_t bucket, out hash_t h)
{
h = hash(data);
bucket = h % mapSize;
foreach (i; 0 .. index[bucket].length)
{
if (equal(index[bucket][i], data))
{
return &index[bucket][i];
}
}
return null;
}
static hash_t hash(R)(R data)
{ {
uint hash = 0; uint hash = 0;
foreach (b; data) foreach (b; data)
@ -3096,28 +3089,54 @@ private:
} }
enum mapSize = 2048; enum mapSize = 2048;
string[][mapSize] index;
/// One entry in a bucket's singly linked collision chain.
struct Slot
{
// The cached string. The lookup code treats a null value.ptr in a
// bucket's first slot as "bucket not yet initialized".
string value;
// Following entry in the same bucket; null marks the end of the chain.
Slot* next;
// Full hash of value, stored so a lookup can compare hashes before
// comparing string contents.
uint hash;
};
/**
 * Links a new chain entry holding val and hash directly after tgt.
 *
 * Storage for the entry is obtained from allocateInCache. tgt.next is
 * overwritten unconditionally, so anything previously linked after tgt
 * is dropped from the chain; the visible caller passes the last entry
 * it reached while walking the bucket.
 *
 * NOTE(review): allocateInCache requests its memory with
 * GC.BlkAttr.NO_SCAN, so the value and next pointers stored in this
 * Slot would presumably not be traced by the GC. Allocations larger
 * than chunkSize/4 are not recorded in chunkS, so a string of that size
 * may be reachable only through such a slot - confirm it cannot be
 * collected.
 *
 * NOTE(review): the returned slice starts at whatever offset earlier
 * allocations (including arbitrary-length strings) left behind, so
 * newSlot may not be aligned for Slot - confirm this is acceptable on
 * the target platforms.
 */
void insertIntoSlot(Slot* tgt, string val, uint hash)
{
// Carve raw bytes for one Slot out of the cache storage.
auto slice = allocateInCache(Slot.sizeof);
auto newSlot = cast(Slot*)slice.ptr;
// The new entry has no successor yet.
*newSlot = Slot(val, null, hash);
tgt.next = newSlot;
}
Slot[mapSize] index;
// leave some slack for allocators/GC meta-data // leave some slack for allocators/GC meta-data
enum chunkSize = 16*1024 - size_t.sizeof*8; enum chunkSize = 16*1024 - size_t.sizeof*8;
ubyte*[] chunkS; ubyte*[] chunkS;
size_t next = chunkSize; size_t next = chunkSize;
string putIntoCache(R)(R data) ubyte[] allocateInCache(size_t size)
{ {
import core.memory; import core.memory;
if(next + size > chunkSize)
if(next + data.length > chunkSize)
{ {
// avoid huge strings // avoid huge allocations
if(data.length > chunkSize/4) if(size> chunkSize/4)
return (cast(char[])data).idup; {
ubyte* p = cast(ubyte*)GC.malloc(size,
GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR);
return p[0..size];
}
chunkS ~= cast(ubyte*)GC.malloc(chunkSize, chunkS ~= cast(ubyte*)GC.malloc(chunkSize,
GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR); GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR);
next = 0; next = 0;
} }
auto slice = chunkS[$-1][next..next+data.length]; auto slice = chunkS[$-1][next..next+size];
next += size;
return slice;
}
string putIntoCache(R)(R data)
{
auto slice = allocateInCache(data.length);
slice[] = data[]; slice[] = data[];
next += data.length;
return cast(string)slice; return cast(string)slice;
} }