mirror of
https://github.com/dlang/phobos.git
synced 2025-04-27 13:40:20 +03:00
Refactored the grapheme walker to a proper state machine.
This commit is contained in:
parent
57fe5b80e5
commit
6550747c72
1 changed files with 174 additions and 137 deletions
|
@ -712,6 +712,8 @@ import std.traits : isConvertibleToString, isIntegral, isSomeChar,
|
||||||
isSomeString, Unqual, isDynamicArray;
|
isSomeString, Unqual, isDynamicArray;
|
||||||
// debug = std_uni;
|
// debug = std_uni;
|
||||||
|
|
||||||
|
import std.internal.unicode_tables; // generated file
|
||||||
|
|
||||||
debug(std_uni) import std.stdio; // writefln, writeln
|
debug(std_uni) import std.stdio; // writefln, writeln
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
@ -6963,7 +6965,6 @@ private:
|
||||||
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
|
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
|
||||||
|
|
||||||
// TODO: redo most of the Hangul stuff algorithmically in case of Graphemes too
|
// TODO: redo most of the Hangul stuff algorithmically in case of Graphemes too
|
||||||
// kill unrolled switches
|
|
||||||
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
|
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
|
||||||
// or extend | '\u200D' separately
|
// or extend | '\u200D' separately
|
||||||
|
|
||||||
|
@ -6972,23 +6973,9 @@ private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
|
||||||
return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
|
return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
|
||||||
}
|
}
|
||||||
|
|
||||||
template genericDecodeGrapheme(bool getValue)
|
// Our grapheme decoder is a state machine; this is the list of all possible
|
||||||
{
|
// states before each code point.
|
||||||
alias graphemeExtend = graphemeExtendTrie;
|
private enum GraphemeState {
|
||||||
alias spacingMark = spacingMarkTrie;
|
|
||||||
alias prepend = prependTrie;
|
|
||||||
alias ccTrie = graphemeControlTrie;
|
|
||||||
alias xpicto = xpictoTrie;
|
|
||||||
|
|
||||||
static if (getValue)
|
|
||||||
alias Value = Grapheme;
|
|
||||||
else
|
|
||||||
alias Value = void;
|
|
||||||
|
|
||||||
Value genericDecodeGrapheme(Input)(ref Input range)
|
|
||||||
{
|
|
||||||
import std.internal.unicode_tables : isHangL, isHangT, isHangV; // generated file
|
|
||||||
enum GraphemeState {
|
|
||||||
Start,
|
Start,
|
||||||
CR,
|
CR,
|
||||||
RI,
|
RI,
|
||||||
|
@ -6997,140 +6984,190 @@ template genericDecodeGrapheme(bool getValue)
|
||||||
LVT,
|
LVT,
|
||||||
Emoji,
|
Emoji,
|
||||||
EmojiZWJ,
|
EmojiZWJ,
|
||||||
Prepend
|
Prepend,
|
||||||
|
End
|
||||||
|
}
|
||||||
|
|
||||||
|
// Result of a transition: tells whether the end of the grapheme is reached
|
||||||
|
private enum TransformRes {
|
||||||
|
goOn, // No, unless the source range ends here
|
||||||
|
redo, // Run last character again with new state
|
||||||
|
retInclude, // Yes, after the just iterated character
|
||||||
|
retExclude // Yes, before the just iterated character
|
||||||
|
}
|
||||||
|
|
||||||
|
// The logic of the grapheme decoding is all here
|
||||||
|
private enum TransformRes
|
||||||
|
function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
|
||||||
|
[
|
||||||
|
GraphemeState.Start: (ref state, ch)
|
||||||
|
{
|
||||||
|
if (graphemeControlTrie[ch] || ch == '\n')
|
||||||
|
return TransformRes.retInclude;
|
||||||
|
|
||||||
|
with (GraphemeState) state =
|
||||||
|
ch == '\r' ? CR :
|
||||||
|
isRegionalIndicator(ch) ? RI :
|
||||||
|
isHangL(ch) ? L :
|
||||||
|
hangLV[ch] || isHangV(ch) ? V :
|
||||||
|
hangLVT[ch] || isHangT(ch) ? LVT :
|
||||||
|
prependTrie[ch] ? Prepend :
|
||||||
|
xpictoTrie[ch] ? Emoji :
|
||||||
|
End;
|
||||||
|
|
||||||
|
// No matter what we encountered, we always include the
|
||||||
|
// first code point in the grapheme.
|
||||||
|
return TransformRes.goOn;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.CR: (ref state, ch) => ch == '\n' ?
|
||||||
|
TransformRes.retInclude :
|
||||||
|
TransformRes.retExclude,
|
||||||
|
|
||||||
|
GraphemeState.RI: (ref state, ch)
|
||||||
|
{
|
||||||
|
state = GraphemeState.End;
|
||||||
|
|
||||||
|
return isRegionalIndicator(ch) ?
|
||||||
|
TransformRes.goOn :
|
||||||
|
TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.L: (ref state, ch)
|
||||||
|
{
|
||||||
|
if (isHangL(ch))
|
||||||
|
return TransformRes.goOn;
|
||||||
|
else if (isHangV(ch) || hangLV[ch])
|
||||||
|
{
|
||||||
|
state = GraphemeState.V;
|
||||||
|
return TransformRes.goOn;
|
||||||
}
|
}
|
||||||
|
else if (hangLVT[ch])
|
||||||
|
{
|
||||||
|
state = GraphemeState.LVT;
|
||||||
|
return TransformRes.goOn;
|
||||||
|
}
|
||||||
|
|
||||||
|
state = GraphemeState.End;
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.V: (ref state, ch)
|
||||||
|
{
|
||||||
|
if (isHangV(ch))
|
||||||
|
return TransformRes.goOn;
|
||||||
|
else if (isHangT(ch))
|
||||||
|
{
|
||||||
|
state = GraphemeState.LVT;
|
||||||
|
return TransformRes.goOn;
|
||||||
|
}
|
||||||
|
|
||||||
|
state = GraphemeState.End;
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.LVT: (ref state, ch)
|
||||||
|
{
|
||||||
|
if (isHangT(ch))
|
||||||
|
return TransformRes.goOn;
|
||||||
|
|
||||||
|
state = GraphemeState.End;
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.Emoji: (ref state, ch)
|
||||||
|
{
|
||||||
|
if (graphemeExtendTrie[ch])
|
||||||
|
return TransformRes.goOn;
|
||||||
|
|
||||||
|
static assert(!graphemeExtendTrie['\u200D']);
|
||||||
|
|
||||||
|
if (ch == '\u200D')
|
||||||
|
{
|
||||||
|
state = GraphemeState.EmojiZWJ;
|
||||||
|
return TransformRes.goOn;
|
||||||
|
}
|
||||||
|
|
||||||
|
state = GraphemeState.End;
|
||||||
|
// There might still be spacing marks
|
||||||
|
// at the end, which are not allowed in
|
||||||
|
// the middle of emoji sequences
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.EmojiZWJ: (ref state, ch)
|
||||||
|
{
|
||||||
|
state = GraphemeState.Emoji;
|
||||||
|
if (xpictoTrie[ch])
|
||||||
|
return TransformRes.goOn;
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.Prepend: (ref state, ch)
|
||||||
|
{
|
||||||
|
// Control characters need to be special cased
|
||||||
|
// because the starting state would include them in
|
||||||
|
// the current grapheme.
|
||||||
|
if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
|
||||||
|
return TransformRes.retExclude;
|
||||||
|
|
||||||
|
state = GraphemeState.Start;
|
||||||
|
return TransformRes.redo;
|
||||||
|
},
|
||||||
|
|
||||||
|
GraphemeState.End: (ref state, ch)
|
||||||
|
=> !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
|
||||||
|
TransformRes.retExclude :
|
||||||
|
TransformRes.goOn
|
||||||
|
];
|
||||||
|
|
||||||
|
template genericDecodeGrapheme(bool getValue)
|
||||||
|
{
|
||||||
|
static if (getValue)
|
||||||
|
alias Value = Grapheme;
|
||||||
|
else
|
||||||
|
alias Value = void;
|
||||||
|
|
||||||
|
Value genericDecodeGrapheme(Input)(ref Input range)
|
||||||
|
{
|
||||||
static if (getValue)
|
static if (getValue)
|
||||||
Grapheme grapheme;
|
Grapheme grapheme;
|
||||||
auto state = GraphemeState.Start;
|
auto state = GraphemeState.Start;
|
||||||
dchar ch;
|
dchar ch;
|
||||||
|
|
||||||
void popCodePoint() {
|
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
|
||||||
|
outer:
|
||||||
|
while (!range.empty)
|
||||||
|
{
|
||||||
|
ch = range.front;
|
||||||
|
|
||||||
|
rerun:
|
||||||
|
final switch (graphemeTransforms[state](state, ch))
|
||||||
|
with(TransformRes)
|
||||||
|
{
|
||||||
|
case goOn:
|
||||||
static if (getValue)
|
static if (getValue)
|
||||||
grapheme ~= ch;
|
grapheme ~= ch;
|
||||||
range.popFront();
|
range.popFront();
|
||||||
}
|
continue;
|
||||||
|
|
||||||
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
|
case redo:
|
||||||
while (!range.empty)
|
goto rerun;
|
||||||
{
|
|
||||||
ch = range.front;
|
case retInclude:
|
||||||
final switch (state) with(GraphemeState)
|
static if (getValue)
|
||||||
{
|
grapheme ~= ch;
|
||||||
case Start:
|
range.popFront();
|
||||||
popCodePoint();
|
break outer;
|
||||||
if (ch == '\r')
|
|
||||||
state = CR;
|
case retExclude:
|
||||||
else if (ccTrie[ch] || ch == '\n')
|
break outer;
|
||||||
goto L_End;
|
|
||||||
else if (isRegionalIndicator(ch))
|
|
||||||
state = RI;
|
|
||||||
else if (isHangL(ch))
|
|
||||||
state = L;
|
|
||||||
else if (hangLV[ch] || isHangV(ch))
|
|
||||||
state = V;
|
|
||||||
else if (hangLVT[ch])
|
|
||||||
state = LVT;
|
|
||||||
else if (isHangT(ch))
|
|
||||||
state = LVT;
|
|
||||||
else if (prepend[ch])
|
|
||||||
state = Prepend;
|
|
||||||
else if (xpicto[ch])
|
|
||||||
state = Emoji;
|
|
||||||
else
|
|
||||||
goto L_End_Extend;
|
|
||||||
break;
|
|
||||||
case CR:
|
|
||||||
if (ch == '\n')
|
|
||||||
popCodePoint();
|
|
||||||
goto L_End;
|
|
||||||
case Emoji:
|
|
||||||
if (!graphemeExtend[ch])
|
|
||||||
{
|
|
||||||
static assert(!graphemeExtend['\u200D']);
|
|
||||||
if (ch == '\u200D')
|
|
||||||
state = EmojiZWJ;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// We will recheck for extensions since spacing
|
|
||||||
// marks are allowed at the end, but not at middle of
|
|
||||||
// emoji sequences, unlike extend code points.
|
|
||||||
goto L_End_Extend;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
popCodePoint();
|
|
||||||
break;
|
|
||||||
case EmojiZWJ:
|
|
||||||
state = Emoji;
|
|
||||||
if (xpicto[ch])
|
|
||||||
{
|
|
||||||
popCodePoint();
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
goto case Emoji;
|
|
||||||
case RI:
|
|
||||||
if (isRegionalIndicator(ch))
|
|
||||||
popCodePoint();
|
|
||||||
goto L_End_Extend;
|
|
||||||
case L:
|
|
||||||
if (isHangL(ch))
|
|
||||||
popCodePoint();
|
|
||||||
else if (isHangV(ch) || hangLV[ch])
|
|
||||||
{
|
|
||||||
state = V;
|
|
||||||
popCodePoint();
|
|
||||||
}
|
|
||||||
else if (hangLVT[ch])
|
|
||||||
{
|
|
||||||
state = LVT;
|
|
||||||
popCodePoint();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
goto L_End_Extend;
|
|
||||||
break;
|
|
||||||
case V:
|
|
||||||
if (isHangV(ch))
|
|
||||||
popCodePoint();
|
|
||||||
else if (isHangT(ch))
|
|
||||||
{
|
|
||||||
state = LVT;
|
|
||||||
popCodePoint();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
goto L_End_Extend;
|
|
||||||
break;
|
|
||||||
case LVT:
|
|
||||||
if (isHangT(ch))
|
|
||||||
{
|
|
||||||
popCodePoint();
|
|
||||||
}
|
|
||||||
else
|
|
||||||
goto L_End_Extend;
|
|
||||||
break;
|
|
||||||
case Prepend:
|
|
||||||
// Unlike the starting state, we must not pop control
|
|
||||||
// characters here.
|
|
||||||
if (ccTrie[ch] || ch == '\r' || ch == '\n')
|
|
||||||
goto L_End;
|
|
||||||
else
|
|
||||||
goto case Start;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
L_End_Extend:
|
|
||||||
while (!range.empty)
|
|
||||||
{
|
|
||||||
ch = range.front;
|
|
||||||
// extend & spacing marks
|
|
||||||
if (!graphemeExtend[ch] && !spacingMark[ch] && ch != '\u200D')
|
|
||||||
break;
|
|
||||||
|
|
||||||
popCodePoint();
|
|
||||||
}
|
|
||||||
L_End:
|
|
||||||
static if (getValue)
|
static if (getValue)
|
||||||
return grapheme;
|
return grapheme;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public: // Public API continues
|
public: // Public API continues
|
||||||
|
@ -7179,6 +7216,8 @@ if (is(C : dchar))
|
||||||
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
|
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: make this @nogc. Probably no big deal since the state machine is
|
||||||
|
// already GC-free.
|
||||||
@safe pure nothrow unittest
|
@safe pure nothrow unittest
|
||||||
{
|
{
|
||||||
// grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
|
// grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
|
||||||
|
@ -10596,8 +10635,6 @@ private:
|
||||||
|
|
||||||
@safe pure nothrow @nogc @property
|
@safe pure nothrow @nogc @property
|
||||||
{
|
{
|
||||||
import std.internal.unicode_tables; // generated file
|
|
||||||
|
|
||||||
// It's important to use auto return here, so that the compiler
|
// It's important to use auto return here, so that the compiler
|
||||||
// only runs semantic on the return type if the function gets
|
// only runs semantic on the return type if the function gets
|
||||||
// used. Also these are functions rather than templates to not
|
// used. Also these are functions rather than templates to not
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue