diff --git a/std/uni/package.d b/std/uni/package.d index 5c0659ec7..b08f8c66a 100644 --- a/std/uni/package.d +++ b/std/uni/package.d @@ -6962,13 +6962,10 @@ private: enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally -// control - '\r' -enum controlSwitch = ` - case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':.. - case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085': -`; // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too // kill unrolled switches +// Use combined trie instead of checking for '\r' | '\n' | ccTrie, +// or extend | '\u200D' separately private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow { @@ -6977,8 +6974,12 @@ private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow template genericDecodeGrapheme(bool getValue) { - alias graphemeExtend = graphemeExtendTrie; + alias extend = graphemeExtendTrie; alias spacingMark = mcTrie; + alias prepend = prependTrie; + alias ccTrie = graphemeControlTrie; + alias xpicto = xpictoTrie; + static if (getValue) alias Value = Grapheme; else @@ -6993,7 +6994,10 @@ template genericDecodeGrapheme(bool getValue) RI, L, V, - LVT + LVT, + Emoji, + EmojiZWJ, + Prepend } static if (getValue) Grapheme grapheme; @@ -7015,6 +7019,8 @@ template genericDecodeGrapheme(bool getValue) mixin(eat); if (ch == '\r') state = CR; + else if (ccTrie[ch] || ch == '\n') + goto L_End; else if (isRegionalIndicator(ch)) state = RI; else if (isHangL(ch)) @@ -7025,21 +7031,42 @@ template genericDecodeGrapheme(bool getValue) state = LVT; else if (isHangT(ch)) state = LVT; + else if (prepend[ch]) + state = Prepend; + else if (xpicto[ch]) + state = Emoji; else - { - switch (ch) - { - mixin(controlSwitch); - goto L_End; - default: - goto L_End_Extend; - } - } + goto L_End_Extend; break; case CR: if (ch == '\n') mixin(eat); - goto L_End_Extend; + goto L_End; + case Emoji: + if (!extend[ch]) + { + static assert(!extend['\u200D']); + if (ch == '\u200D') + state = EmojiZWJ; + else + { + // We will recheck for extensions since spacing + // marks are allowed at the end, but not at middle of + // emoji sequences, unlike extend code points. + goto L_End_Extend; + } + } + + mixin(eat); + break; + case EmojiZWJ: + state = Emoji; + if (xpicto[ch]) + { + mixin(eat); + break; + } + goto case Emoji; case RI: if (isRegionalIndicator(ch)) mixin(eat); @@ -7079,6 +7106,13 @@ template genericDecodeGrapheme(bool getValue) else goto L_End_Extend; break; + case Prepend: + // Unlike the starting state, we must not eat control + // characters here. + if(ccTrie[ch] || ch == '\r' || ch == '\n') + goto L_End; + else + goto case Start; } } L_End_Extend: @@ -7086,7 +7120,7 @@ template genericDecodeGrapheme(bool getValue) { ch = range.front; // extend & spacing marks - if (!graphemeExtend[ch] && !spacingMark[ch]) + if (!extend[ch] && !spacingMark[ch] && ch != '\u200D') break; mixin(eat); } @@ -7143,6 +7177,29 @@ if (is(C : dchar)) static assert(c2 == 3); // \u0301 has 2 UTF-8 code units } +@safe pure nothrow unittest +{ + // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face + assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2); + // skier ~ female sign ~ '€' + assert(graphemeStride("\u26F7\u2640€"d, 0) == 1); + // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€' + assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2); + // skier ~ zero-width joiner ~ female sign ~ '€' + assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3); + // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner + // ~ female sign ~ '€' + assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4); + // skier ~ zero-width joiner ~ '€' + assert(graphemeStride("\u26F7\u200D€"d, 0) == 2); + //'€' ~ zero-width joiner ~ skier + assert(graphemeStride("€\u200D\u26F7"d, 0) == 2); + // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two + assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2); + // Kaithi number sign ~ null + assert(graphemeStride("\U000110BD\0"d, 0) == 1); +} + /++ Reads one full grapheme cluster from an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`. @@ -7285,6 +7342,13 @@ private static @safe struct InputRangeString assert(nonForwardRange.walkLength == 4); } +// Issue 23474 +@safe pure unittest +{ + import std.range.primitives : walkLength; + assert(byGrapheme("\r\u0308").walkLength == 2); +} + /++ $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.) @@ -10580,29 +10644,50 @@ private: //grapheme breaking algorithm tables auto mcTrie() { - import std.internal.unicode_grapheme : mcTrieEntries; - static immutable res = asTrie(mcTrieEntries); + import std.internal.unicode_grapheme : SpacingMarkTrieEntries; + static immutable res = asTrie(SpacingMarkTrieEntries); return res; } auto graphemeExtendTrie() { - import std.internal.unicode_grapheme : graphemeExtendTrieEntries; - static immutable res = asTrie(graphemeExtendTrieEntries); + import std.internal.unicode_grapheme : ExtendTrieEntries; + static immutable res = asTrie(ExtendTrieEntries); return res; } auto hangLV() { - import std.internal.unicode_grapheme : hangulLVTrieEntries; - static immutable res = asTrie(hangulLVTrieEntries); + import std.internal.unicode_grapheme : LVTrieEntries; + static immutable res = asTrie(LVTrieEntries); return res; } auto hangLVT() { - import std.internal.unicode_grapheme : hangulLVTTrieEntries; - static immutable res = asTrie(hangulLVTTrieEntries); + import std.internal.unicode_grapheme : LVTTrieEntries; + static immutable res = asTrie(LVTTrieEntries); + return res; + } + + auto prependTrie() + { + import std.internal.unicode_grapheme : PrependTrieEntries; + static immutable res = asTrie(PrependTrieEntries); + return res; + } + + auto graphemeControlTrie() + { + import std.internal.unicode_grapheme : ControlTrieEntries; + static immutable res = asTrie(ControlTrieEntries); + return res; + } + + auto xpictoTrie() + { + import std.internal.unicode_grapheme : Extended_PictographicTrieEntries; + static immutable res = asTrie(Extended_PictographicTrieEntries); return res; } diff --git a/tools/unicode_table_generator.d b/tools/unicode_table_generator.d index d93f889cd..f91b760ec 100644 --- a/tools/unicode_table_generator.d +++ b/tools/unicode_table_generator.d @@ -37,6 +37,8 @@ PropertyTable general; PropertyTable blocks; PropertyTable scripts; PropertyTable hangul; +PropertyTable graphemeBreaks; +PropertyTable emojiData; //quick NO/MAYBE charaсter sets CodepointSet[string] normalization; @@ -148,6 +150,8 @@ enum { caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt", blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt", propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt", + graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt", + emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt", propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt", corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt", normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt", @@ -231,6 +235,8 @@ void main(string[] argv) loadProperties(corePropSrc, general); loadProperties(scriptsSrc, scripts); loadProperties(hangulSyllableSrc, hangul); + loadProperties(graphemeSrc, graphemeBreaks); + loadProperties(emojiDataSrc, emojiData); loadPropertyAliases(propertyValueAliases); loadUnicodeData(unicodeDataSrc); @@ -914,13 +920,21 @@ void writeNormalizationTries(File sink) void writeGraphemeTries(File sink) { - //few specifics for grapheme cluster breaking algorithm - // - auto props = general.table; - writeBest3Level(sink, "hangulLV", hangul.table["LV"]); - writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]); - writeBest3Level(sink, "mc", props["Mc"]); - writeBest3Level(sink, "graphemeExtend", props["Grapheme_Extend"]); + auto table = graphemeBreaks.table; + + foreach(key; table.byKey) + { + writeBest3Level(sink, key, table[key]); + } + + sink.writeln(); + + writeBest3Level + ( + sink, + "Extended_Pictographic", + emojiData.table["Extended_Pictographic"] + ); } void writeCaseCoversion(File sink)