Upstream Unicode table generator and update tables to v15

2025-04-26 13:10:35 +03:00 · 2022-10-31 23:49:04 +13:00 · 2022-10-31 23:49:04 +13:00 · 122df9272a
commit 122df9272a
parent 08be61029e
10 changed files with 3820 additions and 19443 deletions
--- a/.dscanner.ini
+++ b/.dscanner.ini
@ -512,3 +512,5 @@ trust_too_much="-std.regex,-std.stdio,-std.uni,-std.internal.cstring"
 ; Checks for if statements whose 'then' block is the same as the 'else' block
 ; Temporarily disable until https://github.com/dlang-community/D-Scanner/issues/593 is fixed
 if_else_same_check="-std.typecons"
+; Disable checks for generated unicode tables
+long_line_check="-std.internal.unicode_decomp,-std.internal.unicode_comp,-std.internal.unicode_grapheme,-std.internal.unicode_norm,-std.internal.unicode_tables"
--- a/1
+++ b/1
@ -14,6 +14,7 @@
 circleci.sh @CyberShadow @MartinNowak @wilzbach
 etc/c/* @CyberShadow
 posix.mak @CyberShadow @MartinNowak @wilzbach
+# tools/unicode_table_generator.d
 std/* @andralex
 std/algorithm/* @andralex @JackStouffer @wilzbach @PetarKirov
 std/array.d @JackStouffer @wilzbach @PetarKirov
--- a/changelog/unicode_table_generator.dd
+++ b/changelog/unicode_table_generator.dd
@ -0,0 +1,5 @@
+The Unicode table generator is now in Phobos, and the tables are updated to version 15.
+
+It is likely that this change will result in breakage in code and program usage.
+This is due to a number of factors: the tables have been updated significantly, and the table generator has not had all of its changes committed over the years.
+
--- a/std/internal/unicode_comp.d
+++ b/std/internal/unicode_comp.d
--- a/std/internal/unicode_decomp.d
+++ b/std/internal/unicode_decomp.d
--- a/std/internal/unicode_grapheme.d
+++ b/std/internal/unicode_grapheme.d
--- a/std/internal/unicode_norm.d
+++ b/std/internal/unicode_norm.d
--- a/std/internal/unicode_tables.d
+++ b/std/internal/unicode_tables.d
--- a/std/uni/package.d
+++ b/std/uni/package.d
@ -1528,7 +1528,7 @@ if (is(Unqual!T == T))
    return SliceOverIndexed!T(a, b, x);
 }

-@safe unittest
+@system unittest
 {
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);
@ -2472,19 +2472,19 @@ public:
        import std.format : format;
        import std.uni : unicode;

-        assert(unicode.Cyrillic.to!string ==
-            "[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
+        // This was originally using Cyrillic script.
+        // Unfortunately this is a pretty active range for changes,
+        // and hence broke in an update.
+        // Therefore the Basic Latin range is used instead, as it is
+        // unlikely to ever change.
+
+        assert(unicode.InBasic_latin.to!string == "[0..128)");

        // The specs '%s' and '%d' are equivalent to the to!string call above.
-        assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
+        assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);

-        assert(format("%#x", unicode.Cyrillic) ==
-            "[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) "
-            ~"[0xa640..0xa698) [0xa69f..0xa6a0)");
-
-        assert(format("%#X", unicode.Cyrillic) ==
-            "[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) "
-            ~"[0XA640..0XA698) [0XA69F..0XA6A0)");
+        assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
+        assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
    }

    pure @safe unittest
@ -4872,6 +4872,7 @@ template Utf8Matcher()
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
+
            static if (hasASCII)
            {
                if (ch < 0x80)
@ -4970,6 +4971,7 @@ template Utf8Matcher()
            else
            {
                static assert(mode == Mode.skipOnMatch);
+
                if (tab!size[needle])
                {
                    inp.popFrontN(size);
@ -5312,23 +5314,31 @@ pure @safe unittest
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);
+
+    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

-    assert(!uni.match(codec));
+    // i
    assert(asc.test(codec));
+    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);
-    assert(!asc.match(codec));

+    // !
+    assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
+    assert(codec.idx == 3);

+    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
+    assert(codec.idx == 4);
+
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
@ -5338,6 +5348,7 @@ pure @safe unittest
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
+
    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
@ -5360,7 +5371,7 @@ pure @safe unittest
    assert(codec.idx == i);
 }

-pure @safe unittest
+pure @system unittest
 {
    import std.range : stride;
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
--- a/tools/unicode_table_generator.d
+++ b/tools/unicode_table_generator.d