std.uni: optimize fullCaseTable

This commit is contained in:
Dennis Korpel 2024-01-06 17:27:14 +01:00
parent e531180fde
commit bd8b943fa2
3 changed files with 605 additions and 831 deletions

File diff suppressed because it is too large Load diff

View file

@ -8073,22 +8073,23 @@ private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
// fullCaseTrie is packed index table // fullCaseTrie is packed index table
if (idx == EMPTY_CASE_TRIE) if (idx == EMPTY_CASE_TRIE)
return lhs; return lhs;
immutable start = idx - fTable[idx].n; immutable start = idx - fTable(idx).n;
immutable end = fTable[idx].size + start; immutable end = fTable(idx).size + start;
assert(fTable[start].entry_len == 1); assert(fTable(start).entry_len == 1);
for (idx=start; idx<end; idx++) for (idx=start; idx<end; idx++)
{ {
auto entryLen = fTable[idx].entry_len; const entryLen = fTable(idx).entry_len;
if (entryLen == 1) if (entryLen == 1)
{ {
if (fTable[idx].seq[0] == rhs) if (fTable(idx).seq[0] == rhs)
{ {
return 0; return 0;
} }
} }
else else
{// OK it's a long chunk, like 'ss' for German {// OK it's a long chunk, like 'ss' for German
dstring seq = fTable[idx].seq[0 .. entryLen]; dchar[3] arr = fTable(idx).seq;
const dchar[] seq = arr[0 .. entryLen];
if (rhs == seq[0] if (rhs == seq[0]
&& rtail.skipOver(seq[1..$])) && rtail.skipOver(seq[1..$]))
{ {
@ -8098,7 +8099,7 @@ private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
} }
} }
} }
return fTable[start].seq[0]; // new remapped character for accurate diffs return fTable(start).seq[0]; // new remapped character for accurate diffs
} }
/++ /++

View file

@ -104,6 +104,19 @@ CodepointSet compExclusions;
//property names to discard //property names to discard
string[] blacklist = []; string[] blacklist = [];
/// One entry of the full case-folding table: a short code point sequence
/// together with its position inside a batch of related entries.
struct FullCaseEntry
{
    /// Storage for up to three code points; unused slots are zero-filled.
    dchar[3] seq = 0;
    /// Number of this entry within its batch.
    ubyte n;
    /// Size of the batch this entry belongs to.
    ubyte size;
    /// How many leading elements of `seq` are in use.
    ubyte entry_len;

    /// Returns: the slice of `seq` covering its first `entry_len` elements.
    auto value() const pure nothrow @nogc @safe return
    {
        const used = entry_len;
        return seq[0 .. used];
    }
}
enum mixedCCEntry = ` enum mixedCCEntry = `
struct SimpleCaseEntry struct SimpleCaseEntry
{ {
@ -126,16 +139,27 @@ pure nothrow @nogc:
} }
} }
struct FullCaseEntry /// Bit backed FullCaseEntry
struct FCE
{ {
dchar[3] seq; ulong x; // bit field sizes: 18, 12, 12, 4, 4, 4
ubyte n, size;// n number in batch, size - size of batch
ubyte entry_len;
@property auto value() const @trusted pure nothrow @nogc return nothrow @nogc pure @safe:
this(ulong x)
{ {
return seq[0 .. entry_len]; this.x = x;
} }
this(dchar[3] seq, ubyte n, ubyte size, ubyte entry_len)
{
this.x = ulong(seq[0]) << 36 | ulong(seq[1]) << 24 | seq[2] << 12 | n << 8 | size << 4 | entry_len << 0;
}
dchar[3] seq() const { return [(x >> 36) & 0x1FFFF, (x >> 24) & 0xFFF, (x >> 12) & 0xFFF]; }
ubyte n() const { return (x >> 8) & 0xF; }
ubyte size() const { return (x >> 4) & 0xF; }
ubyte entry_len() const { return (x >> 0) & 0xF; }
} }
struct CompEntry struct CompEntry
@ -160,7 +184,7 @@ struct TrieEntry(T...)
auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size) auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size)
{ {
dchar[3] val; dchar[3] val = 0;
val[0 .. value.length] = value[]; val[0 .. value.length] = value[];
return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length); return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length);
} }
@ -875,25 +899,30 @@ void writeCaseFolding(File sink)
writeln("];"); writeln("];");
writeln("return t;"); writeln("return t;");
writeln("}"); writeln("}");
static uint maxLen = 0; writeln("@property FCE fullCaseTable(size_t index) nothrow @nogc @safe pure");
writeln("@property immutable(FullCaseEntry[]) fullCaseTable() nothrow @nogc @safe pure");
writeln("{"); writeln("{");
writeln("alias FCE = FullCaseEntry;"); write("static immutable ulong[] t = [");
writeln("static immutable FCE[] t = ["); int[4] maxS = 0;
foreach (i, v; fullTable) foreach (i, v; fullTable)
{ {
maxLen = max(maxLen, v.entry_len); foreach (j; 0 .. v.entry_len)
maxS[j] = max(maxS[j], v.value[j]);
if (v.entry_len > 1) if (v.entry_len > 1)
{ {
assert(v.n >= 1); // meaning that start of bucket is always single char assert(v.n >= 1); // meaning that start of bucket is always single char
} }
writef("FCE(\"%s\", %s, %s, %s),", v.value, v.n, v.size, v.entry_len); if (i % 6 == 0) writeln();
if (i % 4 == 0) writeln(); writef("0x%014X,", FCE(v.seq, v.n, v.size, v.entry_len).x);
} }
writeln("];"); writeln("];");
writeln("return t;"); writeln("return FCE(t[index]);");
writeln("}"); writeln("}");
stderr.writefln("MAX FCF len = %d", maxLen); import core.bitop : bsr;
stderr.writefln("max seq bits: [%d, %d, %d]", 1 + bsr(maxS[0]), 1 + bsr(maxS[1]), 1 + bsr(maxS[2])); //[17, 11, 10]
stderr.writefln("max n = %d", fullTable.map!(x => x.n).maxElement); // 3
stderr.writefln("max size = %d", fullTable.map!(x => x.size).maxElement); // 4
stderr.writefln("max entry_len = %d", fullTable.map!(x => x.entry_len).maxElement); // 3
} }
} }