Refactored the grapheme walker to a proper state machine.

This commit is contained in:
Ate Eskola 2023-01-14 19:24:11 +02:00
parent 57fe5b80e5
commit 6550747c72

View file

@ -712,6 +712,8 @@ import std.traits : isConvertibleToString, isIntegral, isSomeChar,
isSomeString, Unqual, isDynamicArray; isSomeString, Unqual, isDynamicArray;
// debug = std_uni; // debug = std_uni;
import std.internal.unicode_tables; // generated file
debug(std_uni) import std.stdio; // writefln, writeln debug(std_uni) import std.stdio; // writefln, writeln
private: private:
@ -6963,7 +6965,6 @@ private:
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// kill unrolled switches
// Use combined trie instead of checking for '\r' | '\n' | ccTrie, // Use combined trie instead of checking for '\r' | '\n' | ccTrie,
// or extend | '\u200D' separately // or extend | '\u200D' separately
@ -6972,23 +6973,9 @@ private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
} }
template genericDecodeGrapheme(bool getValue) // Our grapheme decoder is a state machine; this is the list of all possible
{ // states it can be in before each code point.
alias graphemeExtend = graphemeExtendTrie; private enum GraphemeState {
alias spacingMark = spacingMarkTrie;
alias prepend = prependTrie;
alias ccTrie = graphemeControlTrie;
alias xpicto = xpictoTrie;
static if (getValue)
alias Value = Grapheme;
else
alias Value = void;
Value genericDecodeGrapheme(Input)(ref Input range)
{
import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
enum GraphemeState {
Start, Start,
CR, CR,
RI, RI,
@ -6997,140 +6984,190 @@ template genericDecodeGrapheme(bool getValue)
LVT, LVT,
Emoji, Emoji,
EmojiZWJ, EmojiZWJ,
Prepend Prepend,
End
}
// Verdict of a single state transition: does the grapheme end at the code
// point that was just examined?
private enum TransformRes
{
    // Not yet: the code point is part of the grapheme, which only ends here
    // if the source range has no more code points.
    goOn,

    // Undecided: feed the same code point to the transition of the new state.
    redo,

    // Yes, and the code point just examined is the last one of the grapheme.
    retInclude,

    // Yes, and the code point just examined is not part of the grapheme.
    retExclude
}
// The logic of the grapheme decoding is all here: one transition function per
// GraphemeState, indexed by the state the decoder is in before it examines the
// code point `ch`.
//
// Each transition may assign the next state through its `ref` parameter and
// returns a TransformRes that tells the caller what to do with `ch`:
//   goOn       - `ch` belongs to the grapheme, continue with the next one
//   redo       - re-evaluate the same `ch` with the (newly assigned) state
//   retInclude - `ch` is the last code point of the grapheme
//   retExclude - the grapheme ended before `ch`
//
// "GB<n>" refers to the grapheme cluster boundary rules of Unicode Standard
// Annex #29.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // A control character or a line feed forms a grapheme on its own.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // Classify the first code point; anything without a dedicated state
        // goes straight to End, where only trailing marks are still accepted.
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3: "\r\n" is a single grapheme; a lone '\r' ends before whatever
    // follows it.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12/GB13: regional indicators join in pairs only. Whether or not a
    // second indicator follows, nothing but trailing marks may come next.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6: L x (L | V | LV | LVT)
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7: (LV | V) x (V | T)
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8: (LVT | T) x T
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // Inside an emoji sequence, right after the pictograph or one of its
    // extend code points.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // The check below would be dead code if the extend trie already
        // contained the zero width joiner.
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end of the grapheme.
        // They are not allowed in the middle of an emoji sequence, so they
        // are handled by the End state rather than here.
        return TransformRes.redo;
    },

    // Right after the zero width joiner of an emoji sequence.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        // GB11: ExtPict Extend* ZWJ x ExtPict
        if (xpictoTrie[ch])
        {
            state = GraphemeState.Emoji;
            return TransformRes.goOn;
        }

        // Anything else terminates the emoji sequence for good: GB11 wants
        // the joiner to be preceded by "ExtPict Extend*" directly, so a later
        // joiner (e.g. in ExtPict ZWJ Extend ZWJ ExtPict) must not attach
        // another pictograph. Going back to the Emoji state would allow
        // exactly that. The End state still accepts the trailing extend
        // code points, spacing marks and joiners of GB9/GB9a.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB9b: a prepend code point attaches to whatever follows it, except
    // for controls.
    GraphemeState.Prepend: (ref state, ch)
    {
        // Control characters need to be special cased
        // because the starting state would include them in
        // the current grapheme.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9/GB9a: the base of the grapheme is complete, only extend code
    // points, spacing marks and zero width joiners may still follow.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];
template genericDecodeGrapheme(bool getValue)
{
static if (getValue)
alias Value = Grapheme;
else
alias Value = void;
Value genericDecodeGrapheme(Input)(ref Input range)
{
static if (getValue) static if (getValue)
Grapheme grapheme; Grapheme grapheme;
auto state = GraphemeState.Start; auto state = GraphemeState.Start;
dchar ch; dchar ch;
void popCodePoint() { assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
outer:
while (!range.empty)
{
ch = range.front;
rerun:
final switch (graphemeTransforms[state](state, ch))
with(TransformRes)
{
case goOn:
static if (getValue) static if (getValue)
grapheme ~= ch; grapheme ~= ch;
range.popFront(); range.popFront();
} continue;
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof); case redo:
while (!range.empty) goto rerun;
{
ch = range.front; case retInclude:
final switch (state) with(GraphemeState) static if (getValue)
{ grapheme ~= ch;
case Start: range.popFront();
popCodePoint(); break outer;
if (ch == '\r')
state = CR; case retExclude:
else if (ccTrie[ch] || ch == '\n') break outer;
goto L_End;
else if (isRegionalIndicator(ch))
state = RI;
else if (isHangL(ch))
state = L;
else if (hangLV[ch] || isHangV(ch))
state = V;
else if (hangLVT[ch])
state = LVT;
else if (isHangT(ch))
state = LVT;
else if (prepend[ch])
state = Prepend;
else if (xpicto[ch])
state = Emoji;
else
goto L_End_Extend;
break;
case CR:
if (ch == '\n')
popCodePoint();
goto L_End;
case Emoji:
if (!graphemeExtend[ch])
{
static assert(!graphemeExtend['\u200D']);
if (ch == '\u200D')
state = EmojiZWJ;
else
{
// We will recheck for extensions since spacing
// marks are allowed at the end, but not at middle of
// emoji sequences, unlike extend code points.
goto L_End_Extend;
} }
} }
popCodePoint();
break;
case EmojiZWJ:
state = Emoji;
if (xpicto[ch])
{
popCodePoint();
break;
}
goto case Emoji;
case RI:
if (isRegionalIndicator(ch))
popCodePoint();
goto L_End_Extend;
case L:
if (isHangL(ch))
popCodePoint();
else if (isHangV(ch) || hangLV[ch])
{
state = V;
popCodePoint();
}
else if (hangLVT[ch])
{
state = LVT;
popCodePoint();
}
else
goto L_End_Extend;
break;
case V:
if (isHangV(ch))
popCodePoint();
else if (isHangT(ch))
{
state = LVT;
popCodePoint();
}
else
goto L_End_Extend;
break;
case LVT:
if (isHangT(ch))
{
popCodePoint();
}
else
goto L_End_Extend;
break;
case Prepend:
// Unlike the starting state, we must not pop control
// characters here.
if (ccTrie[ch] || ch == '\r' || ch == '\n')
goto L_End;
else
goto case Start;
}
}
L_End_Extend:
while (!range.empty)
{
ch = range.front;
// extend & spacing marks
if (!graphemeExtend[ch] && !spacingMark[ch] && ch != '\u200D')
break;
popCodePoint();
}
L_End:
static if (getValue) static if (getValue)
return grapheme; return grapheme;
} }
} }
public: // Public API continues public: // Public API continues
@ -7179,6 +7216,8 @@ if (is(C : dchar))
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
} }
// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
@safe pure nothrow unittest @safe pure nothrow unittest
{ {
// grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
@ -10596,8 +10635,6 @@ private:
@safe pure nothrow @nogc @property @safe pure nothrow @nogc @property
{ {
import std.internal.unicode_tables; // generated file
// It's important to use auto return here, so that the compiler // It's important to use auto return here, so that the compiler
// only runs semantic on the return type if the function gets // only runs semantic on the return type if the function gets
// used. Also these are functions rather than templates to not // used. Also these are functions rather than templates to not