mirror of
https://github.com/dlang/phobos.git
synced 2025-04-26 13:10:35 +03:00
Refactored the grapheme walker to a proper state machine.
This commit is contained in:
parent
57fe5b80e5
commit
6550747c72
1 changed files with 174 additions and 137 deletions
|
@ -712,6 +712,8 @@ import std.traits : isConvertibleToString, isIntegral, isSomeChar,
|
|||
isSomeString, Unqual, isDynamicArray;
|
||||
// debug = std_uni;
|
||||
|
||||
import std.internal.unicode_tables; // generated file
|
||||
|
||||
debug(std_uni) import std.stdio; // writefln, writeln
|
||||
|
||||
private:
|
||||
|
@ -6963,7 +6965,6 @@ private:
|
|||
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
|
||||
|
||||
// TODO: redo most of the Hangul handling algorithmically for Graphemes as well
|
||||
// kill unrolled switches
|
||||
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
|
||||
// or extend | '\u200D' separately
|
||||
|
||||
|
@ -6972,14 +6973,156 @@ private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
|
|||
return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
|
||||
}
|
||||
|
||||
// Our grapheme decoder is a state machine; this is the list of all possible
|
||||
// states before each code point.
// The per-state notes below describe how the graphemeTransforms table
// enters each state.
|
||||
private enum GraphemeState {
|
||||
Start, // no code point classified yet; the first code point is classified here
|
||||
CR, // previous code point was '\r'
|
||||
RI, // previous code point satisfied isRegionalIndicator
|
||||
L, // previous code point satisfied isHangL
|
||||
V, // previous code point matched hangLV or isHangV
|
||||
LVT, // previous code point matched hangLVT or isHangT
|
||||
Emoji, // previous code point matched xpictoTrie (possibly followed by extend code points)
|
||||
EmojiZWJ, // a '\u200D' was seen while in the Emoji state
|
||||
Prepend, // previous code point matched prependTrie
|
||||
End // only extend, spacing mark and '\u200D' code points are still accepted
|
||||
}
|
||||
|
||||
// Result of a single state transform: tells the decode loop whether the
// end of the grapheme has been reached and what to do with the code point
// that was just examined.
|
||||
private enum TransformRes {
|
||||
// The decode loop appends the code point and advances the range on goOn
// and retInclude; it leaves the range untouched on redo and retExclude.
goOn, // No, unless the source range ends here
|
||||
redo, // Run last character again with new state
|
||||
retInclude, // Yes, after the just iterated character
|
||||
retExclude // Yes, before the just iterated character
|
||||
}
|
||||
|
||||
// The logic of the grapheme decoding is all here
// This is a table indexed by GraphemeState. Each entry is a function that
// receives the current state (by reference, so it can switch states) and
// the code point under inspection, and returns a TransformRes telling the
// caller how to proceed.
|
||||
private enum TransformRes
|
||||
function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
|
||||
[
|
||||
// Start: classify the first code point of the grapheme.
GraphemeState.Start: (ref state, ch)
|
||||
{
|
||||
// A control character or '\n' forms a complete grapheme on its own.
if (graphemeControlTrie[ch] || ch == '\n')
|
||||
return TransformRes.retInclude;
|
||||
|
||||
// Pick the follow-up state; the first matching test wins, so the
// order of the tests matters.
with (GraphemeState) state =
|
||||
ch == '\r' ? CR :
|
||||
isRegionalIndicator(ch) ? RI :
|
||||
isHangL(ch) ? L :
|
||||
hangLV[ch] || isHangV(ch) ? V :
|
||||
hangLVT[ch] || isHangT(ch) ? LVT :
|
||||
prependTrie[ch] ? Prepend :
|
||||
xpictoTrie[ch] ? Emoji :
|
||||
End;
|
||||
|
||||
// No matter what we encountered, we always include the
|
||||
// first code point in the grapheme.
|
||||
return TransformRes.goOn;
|
||||
},
|
||||
|
||||
// CR: a following '\n' is included and ends the grapheme; any other
// code point ends the grapheme before it.
GraphemeState.CR: (ref state, ch) => ch == '\n' ?
|
||||
TransformRes.retInclude :
|
||||
TransformRes.retExclude,
|
||||
|
||||
// RI: one regional indicator has been consumed. The state always moves
// to End; a second regional indicator is consumed, anything else is
// re-examined by the End transform.
GraphemeState.RI: (ref state, ch)
|
||||
{
|
||||
state = GraphemeState.End;
|
||||
|
||||
return isRegionalIndicator(ch) ?
|
||||
TransformRes.goOn :
|
||||
TransformRes.redo;
|
||||
},
|
||||
|
||||
// L: stays in L on another isHangL code point, moves to V or LVT on the
// matching Hangul classes, otherwise hands the code point to End.
GraphemeState.L: (ref state, ch)
|
||||
{
|
||||
if (isHangL(ch))
|
||||
return TransformRes.goOn;
|
||||
else if (isHangV(ch) || hangLV[ch])
|
||||
{
|
||||
state = GraphemeState.V;
|
||||
return TransformRes.goOn;
|
||||
}
|
||||
else if (hangLVT[ch])
|
||||
{
|
||||
state = GraphemeState.LVT;
|
||||
return TransformRes.goOn;
|
||||
}
|
||||
|
||||
state = GraphemeState.End;
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// V: stays in V on isHangV, moves to LVT on isHangT, otherwise hands
// the code point to End.
GraphemeState.V: (ref state, ch)
|
||||
{
|
||||
if (isHangV(ch))
|
||||
return TransformRes.goOn;
|
||||
else if (isHangT(ch))
|
||||
{
|
||||
state = GraphemeState.LVT;
|
||||
return TransformRes.goOn;
|
||||
}
|
||||
|
||||
state = GraphemeState.End;
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// LVT: keeps consuming isHangT code points, otherwise hands the code
// point to End.
GraphemeState.LVT: (ref state, ch)
|
||||
{
|
||||
if (isHangT(ch))
|
||||
return TransformRes.goOn;
|
||||
|
||||
state = GraphemeState.End;
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// Emoji: consumes extend code points, switches to EmojiZWJ on
// '\u200D', otherwise hands the code point to End.
GraphemeState.Emoji: (ref state, ch)
|
||||
{
|
||||
if (graphemeExtendTrie[ch])
|
||||
return TransformRes.goOn;
|
||||
|
||||
// Guarantees that '\u200D' is not swallowed by the extend check
// above, so the explicit test below is reachable.
static assert(!graphemeExtendTrie['\u200D']);
|
||||
|
||||
if (ch == '\u200D')
|
||||
{
|
||||
state = GraphemeState.EmojiZWJ;
|
||||
return TransformRes.goOn;
|
||||
}
|
||||
|
||||
state = GraphemeState.End;
|
||||
// There might still be spacing marks
|
||||
// at the end, which are not allowed in
|
||||
// middle of emoji sequences
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// EmojiZWJ: the state always returns to Emoji. A code point matching
// xpictoTrie is consumed; anything else is re-examined by the Emoji
// transform.
GraphemeState.EmojiZWJ: (ref state, ch)
|
||||
{
|
||||
state = GraphemeState.Emoji;
|
||||
if (xpictoTrie[ch])
|
||||
return TransformRes.goOn;
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// Prepend: any code point other than a control, '\r' or '\n' is
// re-examined by the Start transform and thereby joins the grapheme.
GraphemeState.Prepend: (ref state, ch)
|
||||
{
|
||||
// Control characters need to be special cased
|
||||
// because the starting state would include them in
|
||||
// the current grapheme.
|
||||
if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
|
||||
return TransformRes.retExclude;
|
||||
|
||||
state = GraphemeState.Start;
|
||||
return TransformRes.redo;
|
||||
},
|
||||
|
||||
// End: keeps consuming extend code points, spacing marks and '\u200D';
// the first code point that is none of these ends the grapheme before it.
GraphemeState.End: (ref state, ch)
|
||||
=> !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
|
||||
TransformRes.retExclude :
|
||||
TransformRes.goOn
|
||||
];
|
||||
|
||||
template genericDecodeGrapheme(bool getValue)
|
||||
{
|
||||
alias graphemeExtend = graphemeExtendTrie;
|
||||
alias spacingMark = spacingMarkTrie;
|
||||
alias prepend = prependTrie;
|
||||
alias ccTrie = graphemeControlTrie;
|
||||
alias xpicto = xpictoTrie;
|
||||
|
||||
static if (getValue)
|
||||
alias Value = Grapheme;
|
||||
else
|
||||
|
@ -6987,150 +7130,44 @@ template genericDecodeGrapheme(bool getValue)
|
|||
|
||||
Value genericDecodeGrapheme(Input)(ref Input range)
|
||||
{
|
||||
import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
|
||||
enum GraphemeState {
|
||||
Start,
|
||||
CR,
|
||||
RI,
|
||||
L,
|
||||
V,
|
||||
LVT,
|
||||
Emoji,
|
||||
EmojiZWJ,
|
||||
Prepend
|
||||
}
|
||||
static if (getValue)
|
||||
Grapheme grapheme;
|
||||
auto state = GraphemeState.Start;
|
||||
dchar ch;
|
||||
|
||||
void popCodePoint() {
|
||||
static if (getValue)
|
||||
grapheme ~= ch;
|
||||
range.popFront();
|
||||
}
|
||||
|
||||
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
|
||||
outer:
|
||||
while (!range.empty)
|
||||
{
|
||||
ch = range.front;
|
||||
final switch (state) with(GraphemeState)
|
||||
{
|
||||
case Start:
|
||||
popCodePoint();
|
||||
if (ch == '\r')
|
||||
state = CR;
|
||||
else if (ccTrie[ch] || ch == '\n')
|
||||
goto L_End;
|
||||
else if (isRegionalIndicator(ch))
|
||||
state = RI;
|
||||
else if (isHangL(ch))
|
||||
state = L;
|
||||
else if (hangLV[ch] || isHangV(ch))
|
||||
state = V;
|
||||
else if (hangLVT[ch])
|
||||
state = LVT;
|
||||
else if (isHangT(ch))
|
||||
state = LVT;
|
||||
else if (prepend[ch])
|
||||
state = Prepend;
|
||||
else if (xpicto[ch])
|
||||
state = Emoji;
|
||||
else
|
||||
goto L_End_Extend;
|
||||
break;
|
||||
case CR:
|
||||
if (ch == '\n')
|
||||
popCodePoint();
|
||||
goto L_End;
|
||||
case Emoji:
|
||||
if (!graphemeExtend[ch])
|
||||
{
|
||||
static assert(!graphemeExtend['\u200D']);
|
||||
if (ch == '\u200D')
|
||||
state = EmojiZWJ;
|
||||
else
|
||||
{
|
||||
// We will recheck for extensions since spacing
|
||||
// marks are allowed at the end, but not at middle of
|
||||
// emoji sequences, unlike extend code points.
|
||||
goto L_End_Extend;
|
||||
}
|
||||
}
|
||||
|
||||
popCodePoint();
|
||||
break;
|
||||
case EmojiZWJ:
|
||||
state = Emoji;
|
||||
if (xpicto[ch])
|
||||
{
|
||||
popCodePoint();
|
||||
break;
|
||||
}
|
||||
goto case Emoji;
|
||||
case RI:
|
||||
if (isRegionalIndicator(ch))
|
||||
popCodePoint();
|
||||
goto L_End_Extend;
|
||||
case L:
|
||||
if (isHangL(ch))
|
||||
popCodePoint();
|
||||
else if (isHangV(ch) || hangLV[ch])
|
||||
{
|
||||
state = V;
|
||||
popCodePoint();
|
||||
}
|
||||
else if (hangLVT[ch])
|
||||
{
|
||||
state = LVT;
|
||||
popCodePoint();
|
||||
}
|
||||
else
|
||||
goto L_End_Extend;
|
||||
break;
|
||||
case V:
|
||||
if (isHangV(ch))
|
||||
popCodePoint();
|
||||
else if (isHangT(ch))
|
||||
{
|
||||
state = LVT;
|
||||
popCodePoint();
|
||||
}
|
||||
else
|
||||
goto L_End_Extend;
|
||||
break;
|
||||
case LVT:
|
||||
if (isHangT(ch))
|
||||
{
|
||||
popCodePoint();
|
||||
}
|
||||
else
|
||||
goto L_End_Extend;
|
||||
break;
|
||||
case Prepend:
|
||||
// Unlike the starting state, we must not pop control
|
||||
// characters here.
|
||||
if (ccTrie[ch] || ch == '\r' || ch == '\n')
|
||||
goto L_End;
|
||||
else
|
||||
goto case Start;
|
||||
rerun:
|
||||
final switch (graphemeTransforms[state](state, ch))
|
||||
with(TransformRes)
|
||||
{
|
||||
case goOn:
|
||||
static if (getValue)
|
||||
grapheme ~= ch;
|
||||
range.popFront();
|
||||
continue;
|
||||
|
||||
case redo:
|
||||
goto rerun;
|
||||
|
||||
case retInclude:
|
||||
static if (getValue)
|
||||
grapheme ~= ch;
|
||||
range.popFront();
|
||||
break outer;
|
||||
|
||||
case retExclude:
|
||||
break outer;
|
||||
}
|
||||
}
|
||||
L_End_Extend:
|
||||
while (!range.empty)
|
||||
{
|
||||
ch = range.front;
|
||||
// extend & spacing marks
|
||||
if (!graphemeExtend[ch] && !spacingMark[ch] && ch != '\u200D')
|
||||
break;
|
||||
|
||||
popCodePoint();
|
||||
}
|
||||
L_End:
|
||||
static if (getValue)
|
||||
return grapheme;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public: // Public API continues
|
||||
|
@ -7179,6 +7216,8 @@ if (is(C : dchar))
|
|||
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
|
||||
}
|
||||
|
||||
// TODO: make this @nogc. Probably no big deal since the state machine is
|
||||
// already GC-free.
|
||||
@safe pure nothrow unittest
|
||||
{
|
||||
// grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
|
||||
|
@ -10596,8 +10635,6 @@ private:
|
|||
|
||||
@safe pure nothrow @nogc @property
|
||||
{
|
||||
import std.internal.unicode_tables; // generated file
|
||||
|
||||
// It's important to use auto return here, so that the compiler
|
||||
// only runs semantic on the return type if the function gets
|
||||
// used. Also these are functions rather than templates to not
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue