Fix issue 23474 - Fixed many issues in grapheme walker

This commit is contained in:
Ate Eskola 2022-12-31 18:10:27 +02:00
parent ad92ea2f8d
commit b334d4ddb2
2 changed files with 132 additions and 33 deletions

View file

@ -6962,13 +6962,10 @@ private:
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
// control - '\r'
// Case labels, kept as a string so they can be pasted into a `switch` with
// `mixin`. They match U+0000-U+0008, U+0009-U+000C, U+000E-U+001F,
// U+007F-U+0084, U+0085 and U+0086-U+009F; U+000D ('\r') is left out.
enum controlSwitch = `
case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..
case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
// TODO: redo most of the Hangul handling algorithmically for Graphemes too
// kill unrolled switches
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
// or extend | '\u200D' separately
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
@ -6977,8 +6974,12 @@ private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
template genericDecodeGrapheme(bool getValue)
{
alias graphemeExtend = graphemeExtendTrie;
alias extend = graphemeExtendTrie;
alias spacingMark = mcTrie;
alias prepend = prependTrie;
alias ccTrie = graphemeControlTrie;
alias xpicto = xpictoTrie;
static if (getValue)
alias Value = Grapheme;
else
@ -6993,7 +6994,10 @@ template genericDecodeGrapheme(bool getValue)
RI,
L,
V,
LVT
LVT,
Emoji,
EmojiZWJ,
Prepend
}
static if (getValue)
Grapheme grapheme;
@ -7015,6 +7019,8 @@ template genericDecodeGrapheme(bool getValue)
mixin(eat);
if (ch == '\r')
state = CR;
else if (ccTrie[ch] || ch == '\n')
goto L_End;
else if (isRegionalIndicator(ch))
state = RI;
else if (isHangL(ch))
@ -7025,21 +7031,42 @@ template genericDecodeGrapheme(bool getValue)
state = LVT;
else if (isHangT(ch))
state = LVT;
else if (prepend[ch])
state = Prepend;
else if (xpicto[ch])
state = Emoji;
else
{
switch (ch)
{
mixin(controlSwitch);
goto L_End;
default:
goto L_End_Extend;
}
}
goto L_End_Extend;
break;
case CR:
if (ch == '\n')
mixin(eat);
goto L_End_Extend;
goto L_End;
case Emoji:
if (!extend[ch])
{
static assert(!extend['\u200D']);
if (ch == '\u200D')
state = EmojiZWJ;
else
{
// We will recheck for extensions since spacing
// marks are allowed at the end, but not at middle of
// emoji sequences, unlike extend code points.
goto L_End_Extend;
}
}
mixin(eat);
break;
case EmojiZWJ:
state = Emoji;
if (xpicto[ch])
{
mixin(eat);
break;
}
goto case Emoji;
case RI:
if (isRegionalIndicator(ch))
mixin(eat);
@ -7079,6 +7106,13 @@ template genericDecodeGrapheme(bool getValue)
else
goto L_End_Extend;
break;
case Prepend:
// Unlike the starting state, we must not eat control
// characters here.
if(ccTrie[ch] || ch == '\r' || ch == '\n')
goto L_End;
else
goto case Start;
}
}
L_End_Extend:
@ -7086,7 +7120,7 @@ template genericDecodeGrapheme(bool getValue)
{
ch = range.front;
// extend & spacing marks
if (!graphemeExtend[ch] && !spacingMark[ch])
if (!extend[ch] && !spacingMark[ch] && ch != '\u200D')
break;
mixin(eat);
}
@ -7143,6 +7177,29 @@ if (is(C : dchar))
static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}
@safe pure nothrow unittest
{
    // All inputs are UTF-32, so every expected stride below is a count of
    // code points in the first grapheme cluster.

    // A Fitzpatrick type-5 modifier stays with the grinning face before it;
    // the second grinning face opens a new cluster.
    assert("\U0001F600\U0001f3FE\U0001F600"d.graphemeStride(0) == 2);

    // Without a joiner, the female sign does not attach to the skier.
    assert("\u26F7\u2640€"d.graphemeStride(0) == 1);

    // Skier plus modifier form one cluster; the female sign is still separate.
    assert("\u26F7\U0001f3FE\u2640€"d.graphemeStride(0) == 2);

    // A zero-width joiner glues the female sign onto the skier; '€' is not
    // part of the cluster.
    assert("\u26F7\u200D\u2640€"d.graphemeStride(0) == 3);

    // Same as above, with a modifier between the skier and the joiner.
    assert("\u26F7\U0001f3FE\u200D\u2640€"d.graphemeStride(0) == 4);

    // The joiner is kept with the skier even when '€' follows it, but '€'
    // itself is left out.
    assert("\u26F7\u200D€"d.graphemeStride(0) == 2);

    // Starting from '€', the joiner is consumed but the skier after it is not.
    assert("€\u200D\u26F7"d.graphemeStride(0) == 2);

    // Kaithi number sign takes the Devanagari digit four that follows it, and
    // the cluster ends before Devanagari digit two.
    assert("\U000110BD\u096A\u0968"d.graphemeStride(0) == 2);

    // Kaithi number sign does not take a following null with it.
    assert("\U000110BD\0"d.graphemeStride(0) == 1);
}
/++
Reads one full grapheme cluster from an
$(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.
@ -7285,6 +7342,13 @@ private static @safe struct InputRangeString
assert(nonForwardRange.walkLength == 4);
}
// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;

    // A carriage return followed by U+0308 must be reported as two
    // graphemes: the second code point may not be folded into the first.
    auto clusters = byGrapheme("\r\u0308");
    assert(walkLength(clusters) == 2);
}
/++
$(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
@ -10580,29 +10644,50 @@ private:
//grapheme breaking algorithm tables
// Trie used for spacing marks; `genericDecodeGrapheme` reaches it through
// its `spacingMark` alias.
// NOTE(review): two entry tables are named below (`mcTrieEntries` and
// `SpacingMarkTrieEntries`), each with its own `res` — confirm which pair is
// the live one.
auto mcTrie()
{
import std.internal.unicode_grapheme : mcTrieEntries;
static immutable res = asTrie(mcTrieEntries);
import std.internal.unicode_grapheme : SpacingMarkTrieEntries;
static immutable res = asTrie(SpacingMarkTrieEntries);
return res;
}
// Trie used for extending code points; `genericDecodeGrapheme` aliases it as
// `graphemeExtend` and as `extend`.
// NOTE(review): two entry tables are named below (`graphemeExtendTrieEntries`
// and `ExtendTrieEntries`), each with its own `res` — confirm which pair is
// the live one.
auto graphemeExtendTrie()
{
import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
static immutable res = asTrie(graphemeExtendTrieEntries);
import std.internal.unicode_grapheme : ExtendTrieEntries;
static immutable res = asTrie(ExtendTrieEntries);
return res;
}
// Trie built from the Hangul LV entries of `std.internal.unicode_grapheme`.
// NOTE(review): two entry tables are named below (`hangulLVTrieEntries` and
// `LVTrieEntries`), each with its own `res` — confirm which pair is the live
// one.
auto hangLV()
{
import std.internal.unicode_grapheme : hangulLVTrieEntries;
static immutable res = asTrie(hangulLVTrieEntries);
import std.internal.unicode_grapheme : LVTrieEntries;
static immutable res = asTrie(LVTrieEntries);
return res;
}
// Trie built from the Hangul LVT entries of `std.internal.unicode_grapheme`.
// NOTE(review): two entry tables are named below (`hangulLVTTrieEntries` and
// `LVTTrieEntries`), each with its own `res` — confirm which pair is the live
// one.
auto hangLVT()
{
import std.internal.unicode_grapheme : hangulLVTTrieEntries;
static immutable res = asTrie(hangulLVTTrieEntries);
import std.internal.unicode_grapheme : LVTTrieEntries;
static immutable res = asTrie(LVTTrieEntries);
return res;
}
// Trie built from `PrependTrieEntries`; `genericDecodeGrapheme` reaches it
// through its `prepend` alias.
auto prependTrie()
{
    import std.internal.unicode_grapheme : entries = PrependTrieEntries;
    static immutable trie = asTrie(entries);
    return trie;
}
// Trie built from `ControlTrieEntries`; `genericDecodeGrapheme` reaches it
// through its `ccTrie` alias.
auto graphemeControlTrie()
{
    import std.internal.unicode_grapheme : entries = ControlTrieEntries;
    static immutable trie = asTrie(entries);
    return trie;
}
// Trie built from `Extended_PictographicTrieEntries`; `genericDecodeGrapheme`
// reaches it through its `xpicto` alias.
auto xpictoTrie()
{
    import std.internal.unicode_grapheme : entries = Extended_PictographicTrieEntries;
    static immutable trie = asTrie(entries);
    return trie;
}

View file

@ -37,6 +37,8 @@ PropertyTable general;
PropertyTable blocks;
PropertyTable scripts;
PropertyTable hangul;
PropertyTable graphemeBreaks;
PropertyTable emojiData;
//quick NO/MAYBE character sets
CodepointSet[string] normalization;
@ -148,6 +150,8 @@ enum {
caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt",
blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt",
propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt",
graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt",
emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt",
propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt",
corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt",
normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt",
@ -231,6 +235,8 @@ void main(string[] argv)
loadProperties(corePropSrc, general);
loadProperties(scriptsSrc, scripts);
loadProperties(hangulSyllableSrc, hangul);
loadProperties(graphemeSrc, graphemeBreaks);
loadProperties(emojiDataSrc, emojiData);
loadPropertyAliases(propertyValueAliases);
loadUnicodeData(unicodeDataSrc);
@ -914,13 +920,21 @@ void writeNormalizationTries(File sink)
// Writes the tables used for grapheme cluster breaking to `sink`, one
// `writeBest3Level` call per table.
void writeGraphemeTries(File sink)
{
//few specifics for grapheme cluster breaking algorithm
//
auto props = general.table;
// Hangul syllable types and two general properties, under fixed names.
writeBest3Level(sink, "hangulLV", hangul.table["LV"]);
writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]);
writeBest3Level(sink, "mc", props["Mc"]);
writeBest3Level(sink, "graphemeExtend", props["Grapheme_Extend"]);
// One table per entry of `graphemeBreaks`, named after the entry's key.
// NOTE(review): the tables come out in whatever order `byKey` yields;
// confirm that order is stable enough for a generated source file.
auto table = graphemeBreaks.table;
foreach(key; table.byKey)
{
writeBest3Level(sink, key, table[key]);
}
sink.writeln();
// The emoji data contributes a single table.
writeBest3Level
(
sink,
"Extended_Pictographic",
emojiData.table["Extended_Pictographic"]
);
}
void writeCaseCoversion(File sink)