new std.uni module

2025-04-29 14:40:30 +03:00 · 2013-07-20 22:30:10 +04:00 · 2013-07-20 22:30:10 +04:00 · 9a053d97c2
commit 9a053d97c2
parent 1bd22b2e8b
8 changed files with 12979 additions and 2681 deletions
--- a/posix.mak
+++ b/posix.mak
@ -202,7 +202,7 @@ EXTRA_MODULES += $(EXTRA_DOCUMENTABLES) $(addprefix			\
 	std/internal/digest/, sha_SSSE3 ) $(addprefix \
 	std/internal/math/, biguintcore biguintnoasm biguintx86	\
 	gammafunction errorfunction) $(addprefix std/internal/, \
-	processinit uni uni_tab)
+	processinit uni uni_tab unicode_tables)

 # Aggregate all D modules relevant to this build
 D_MODULES = crc32 $(STD_MODULES) $(EXTRA_MODULES) $(STD_NET_MODULES) $(STD_DIGEST_MODULES)
--- a/std/internal/unicode_tables.d
+++ b/std/internal/unicode_tables.d
--- a/std/json.d
+++ b/std/json.d
@ -596,7 +596,8 @@ unittest
    val = parseJSON(`"\u2660\u2666"`);
    assert(toJSON(&val) == "\"\&spades;\&diams;\"");

-    assertNotThrown(parseJSON(`{ "foo": "` ~ "\u007F" ~ `"}`));
+    //0x7F is a control character (see Unicode spec)
+    assertThrown(parseJSON(`{ "foo": "` ~ "\u007F" ~ `"}`));

    with(parseJSON(`""`))
        assert(str == "" && str !is null);
--- a/std/regex.d
+++ b/std/regex.d
@ -225,8 +225,9 @@ module std.regex;
 import std.internal.uni, std.internal.uni_tab;//unicode property tables
 import std.array, std.algorithm, std.range,
       std.conv, std.exception, std.traits, std.typetuple,
-       std.uni, std.utf, std.format, std.typecons, std.bitmanip,
+       std.utf, std.format, std.typecons, std.bitmanip,
       std.functional, std.exception;
+
 import core.bitop, core.stdc.string, core.stdc.stdlib;
 static import ascii = std.ascii;
 import std.string : representation;
@ -234,6 +235,9 @@ import std.string : representation;
 debug import std.stdio;

 private:
+
+import std.uni : isAlpha, isWhite;
+
@safe:

 //uncomment to get a barrage of debug info
--- a/std/string.d
+++ b/std/string.d
@ -71,85 +71,15 @@ class StringException : Exception
 /++
    Compares two ranges of characters lexicographically. The comparison is
    case insensitive. Use $(XREF algorithm, cmp) for a case sensitive
-    comparison. icmp works like $(XREF algorithm, cmp) except that it
-    converts characters to lowercase prior to applying $(D pred). Technically,
-    $(D icmp(r1, r2)) is equivalent to
-    $(D cmp!"std.uni.toLower(a) < std.uni.toLower(b)"(r1, r2)).
+    comparison. For details see $(XREF uni, icmp).

    $(BOOKTABLE,
        $(TR $(TD $(D < 0))  $(TD $(D s1 < s2) ))
        $(TR $(TD $(D = 0))  $(TD $(D s1 == s2)))
        $(TR $(TD $(D > 0))  $(TD $(D s1 > s2)))
     )
-  +/
-int icmp(alias pred = "a < b", S1, S2)(S1 s1, S2 s2)
-    if (isSomeString!S1 && isSomeString!S2)
-{
-    static if (is(typeof(pred) : string))
-        enum isLessThan = pred == "a < b";
-    else
-        enum isLessThan = false;
-
-    size_t i, j;
-    while (i < s1.length && j < s2.length)
-    {
-        immutable c1 = std.uni.toLower(decode(s1, i));
-        immutable c2 = std.uni.toLower(decode(s2, j));
-
-        static if (isLessThan)
-        {
-            if (c1 != c2)
-            {
-                if (c1 < c2) return -1;
-                if (c1 > c2) return 1;
-            }
-        }
-        else
-        {
-            if (binaryFun!pred(c1, c2)) return -1;
-            if (binaryFun!pred(c2, c1)) return 1;
-        }
-    }
-
-    if (i < s1.length) return 1;
-    if (j < s2.length) return -1;
-
-    return 0;
-}
-
-int icmp(alias pred = "a < b", S1, S2)(S1 s1, S2 s2)
-    if (!(isSomeString!S1 && isSomeString!S2) &&
-        isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) &&
-        isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
-{
-    static if (is(typeof(pred) : string))
-        enum isLessThan = pred == "a < b";
-    else
-        enum isLessThan = false;
-
-    for (;; s1.popFront(), s2.popFront())
-    {
-        if (s1.empty) return s2.empty ? 0 : -1;
-        if (s2.empty) return 1;
-
-        immutable c1 = std.uni.toLower(s1.front);
-        immutable c2 = std.uni.toLower(s2.front);
-
-        static if (isLessThan)
-        {
-            if (c1 != c2)
-            {
-                if(c1 < c2) return -1;
-                if(c1 > c2) return 1;
-            }
-        }
-        else
-        {
-            if (binaryFun!pred(c1, c2)) return -1;
-            if (binaryFun!pred(c2, c1)) return 1;
-        }
-    }
-}
+/
+alias icmp = std.uni.icmp;

 unittest
 {
@ -786,318 +716,35 @@ unittest

 /++
    Returns a string which is identical to $(D s) except that all of its
-    characters are lowercase (in unicode, not just ASCII). If $(D s) does not
-    have any uppercase characters, then $(D s) is returned.
+    characters are converted to lowercase (by performing Unicode lowercase mapping).
+    If none of the characters in $(D s) were affected, then $(D s) itself is returned.
  +/
-S toLower(S)(S s) @trusted pure
-    if (isSomeString!S)
-{
-    foreach (i, dchar cOuter; s)
-    {
-        if (!std.uni.isUpper(cOuter))
-            continue;
-        auto result = s[0.. i].dup;
-        foreach (dchar c; s[i .. $])
-        {
-            if (std.uni.isUpper(c))
-            {
-                c = std.uni.toLower(c);
-            }
-            result ~= c;
-        }
-        return cast(S) result;
-    }
-    return s;
-}
-
-unittest
-{
-    debug(string) printf("string.toLower.unittest\n");
-
-    assertCTFEable!(
-    {
-    foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
-    {
-        S s = cast(S)"hello world\u0101";
-        assert(toLower(s) is s);
-        const S sc = "hello world\u0101";
-        assert(toLower(sc) is sc);
-        immutable S si = "hello world\u0101";
-        assert(toLower(si) is si);
-
-        S t = cast(S)"Hello World\u0100";
-        assert(toLower(t) == s);
-        const S tc = "hello world\u0101";
-        assert(toLower(tc) == s);
-        immutable S ti = "hello world\u0101";
-        assert(toLower(ti) == s);
-    }
-    });
-}
-
+alias toLower = std.uni.toLower;
 /++
-    Converts $(D s) to lowercase (in unicode, not just ASCII) in place.
+    Converts $(D s) to lowercase (by performing Unicode lowercase mapping) in place.
+    For a few characters the string length may increase after the transformation;
+    in such a case the function reallocates exactly once.
    If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
 +/
-void toLowerInPlace(C)(ref C[] s)
-    if (is(C == char) || is(C == wchar))
-{
-    for (size_t i = 0; i < s.length; )
-    {
-        immutable c = s[i];
-        if (std.ascii.isUpper(c))
-        {
-            s[i++] = cast(C) (c + (cast(C)'a' - 'A'));
-        }
-        else if (!std.ascii.isASCII(c))
-        {
-            // wide character
-            size_t j = i;
-            dchar dc = decode(s, j);
-            assert(j > i);
-            if (!std.uni.isUpper(dc))
-            {
-                i = j;
-                continue;
-            }
-            auto toAdd = to!(C[])(std.uni.toLower(dc));
-            s = s[0 .. i] ~ toAdd  ~ s[j .. $];
-            i += toAdd.length;
-        }
-        else
-        {
-            ++i;
-        }
-    }
-}
-
-void toLowerInPlace(C)(ref C[] s) @safe pure nothrow
-    if (is(C == dchar))
-{
-    foreach (ref c; s)
-    {
-        if (std.uni.isUpper(c))
-            c = std.uni.toLower(c);
-    }
-}
-
-unittest
-{
-    debug(string) printf("string.toLowerInPlace.unittest\n");
-
-    assertCTFEable!(
-    {
-    foreach (S; TypeTuple!(char[], wchar[], dchar[]))
-    {
-        S s = to!S("hello world\u0101");
-        toLowerInPlace(s);
-        assert(s == "hello world\u0101");
-
-        S t = to!S("Hello World\u0100");
-        toLowerInPlace(t);
-        assert(t == "hello world\u0101");
-    }
-    });
-}
-
-unittest
-{
-    debug(string) printf("string.toLower/toLowerInPlace.unittest\n");
-
-    assertCTFEable!(
-    {
-    string s1 = "FoL";
-    string s2 = toLower(s1);
-    assert(cmp(s2, "fol") == 0, s2);
-    assert(s2 != s1);
-
-    char[] s3 = s1.dup;
-    toLowerInPlace(s3);
-    assert(s3 == s2, s3);
-
-    s1 = "A\u0100B\u0101d";
-    s2 = toLower(s1);
-    s3 = s1.dup;
-    assert(cmp(s2, "a\u0101b\u0101d") == 0);
-    assert(s2 !is s1);
-    toLowerInPlace(s3);
-    assert(s3 == s2, s3);
-
-    s1 = "A\u0460B\u0461d";
-    s2 = toLower(s1);
-    s3 = s1.dup;
-    assert(cmp(s2, "a\u0461b\u0461d") == 0);
-    assert(s2 !is s1);
-    toLowerInPlace(s3);
-    assert(s3 == s2, s3);
-
-    s1 = "\u0130";
-    s2 = toLower(s1);
-    s3 = s1.dup;
-    assert(s2 == "i");
-    assert(s2 !is s1);
-    toLowerInPlace(s3);
-    assert(s3 == s2, s3);
-
-    // Test on wchar and dchar strings.
-    assert(toLower("Some String"w) == "some string"w);
-    assert(toLower("Some String"d) == "some string"d);
-    });
-}
-
+alias toLowerInPlace = std.uni.toLowerInPlace;

 /++
    Returns a string which is identical to $(D s) except that all of its
-    characters are uppercase (in unicode, not just ASCII). If $(D s) does not
-    have any lowercase characters, then $(D s) is returned.
+    characters are converted to uppercase (by performing Unicode uppercase mapping).
+    If none of the characters in $(D s) were affected, then $(D s) itself is returned.
  +/
-S toUpper(S)(S s) @trusted pure
-    if (isSomeString!S)
-{
-    foreach (i, dchar cOuter; s)
-    {
-        if (!std.uni.isLower(cOuter))
-            continue;
-        auto result = s[0.. i].dup;
-        foreach (dchar c; s[i .. $])
-        {
-            if (std.uni.isLower(c))
-            {
-                c = std.uni.toUpper(c);
-            }
-            result ~= c;
-        }
-        return cast(S) result;
-    }
-    return s;
-}
-
-unittest
-{
-    debug(string) printf("string.toUpper.unittest\n");
-
-    assertCTFEable!(
-    {
-    foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
-    {
-        S s = cast(S)"HELLO WORLD\u0100";
-        assert(toUpper(s) is s);
-        const S sc = "HELLO WORLD\u0100";
-        assert(toUpper(sc) is sc);
-        immutable S si = "HELLO WORLD\u0100";
-        assert(toUpper(si) is si);
-
-        S t = cast(S)"hello world\u0101";
-        assert(toUpper(t) == s);
-        const S tc = "HELLO WORLD\u0100";
-        assert(toUpper(tc) == s);
-        immutable S ti = "HELLO WORLD\u0100";
-        assert(toUpper(ti) == s);
-    }
-    });
-}
+alias toUpper = std.uni.toUpper;

 /++
-    Converts $(D s) to uppercase (in unicode, not just ASCII) in place.
+    Converts $(D s) to uppercase (by performing Unicode uppercase mapping) in place.
+    For a few characters the string length may increase after the transformation;
+    in such a case the function reallocates exactly once.
    If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
 +/
-void toUpperInPlace(C)(ref C[] s)
-    if (isSomeChar!C &&
-        (is(C == char) || is(C == wchar)))
-{
-    for (size_t i = 0; i < s.length; )
-    {
-        immutable c = s[i];
-        if ('a' <= c && c <= 'z')
-        {
-            s[i++] = cast(C) (c - (cast(C)'a' - 'A'));
-        }
-        else if (!std.ascii.isASCII(c))
-        {
-            // wide character
-            size_t j = i;
-            dchar dc = decode(s, j);
-            assert(j > i);
-            if (!std.uni.isLower(dc))
-            {
-                i = j;
-                continue;
-            }
-            auto toAdd = to!(C[])(std.uni.toUpper(dc));
-            s = s[0 .. i] ~ toAdd  ~ s[j .. $];
-            i += toAdd.length;
-        }
-        else
-        {
-            ++i;
-        }
-    }
-}
-
-void toUpperInPlace(C)(ref C[] s) @safe pure nothrow
-    if (is(C == dchar))
-{
-    foreach (ref c; s)
-    {
-        if (std.uni.isLower(c))
-            c = std.uni.toUpper(c);
-    }
-}
-
-unittest
-{
-    debug(string) printf("string.toUpperInPlace.unittest\n");
-
-    assertCTFEable!(
-    {
-    foreach (S; TypeTuple!(char[], wchar[], dchar[]))
-    {
-        S s = to!S("HELLO WORLD\u0100");
-        toUpperInPlace(s);
-        assert(s == "HELLO WORLD\u0100");
-
-        S t = to!S("Hello World\u0101");
-        toUpperInPlace(t);
-        assert(t == "HELLO WORLD\u0100");
-    }
-    });
-}
-
-unittest
-{
-    debug(string) printf("string.toUpper/toUpperInPlace.unittest\n");
-
-    assertCTFEable!(
-    {
-    string s1 = "FoL";
-    string s2;
-    char[] s3;
-
-    s2 = toUpper(s1);
-    s3 = s1.dup; toUpperInPlace(s3);
-    assert(s3 == s2, s3);
-    assert(cmp(s2, "FOL") == 0);
-    assert(s2 !is s1);
-
-    s1 = "a\u0100B\u0101d";
-    s2 = toUpper(s1);
-    s3 = s1.dup; toUpperInPlace(s3);
-    assert(s3 == s2);
-    assert(cmp(s2, "A\u0100B\u0100D") == 0);
-    assert(s2 !is s1);
-
-    s1 = "a\u0460B\u0461d";
-    s2 = toUpper(s1);
-    s3 = s1.dup; toUpperInPlace(s3);
-    assert(s3 == s2);
-    assert(cmp(s2, "A\u0460B\u0460D") == 0);
-    assert(s2 !is s1);
-    });
-}
-
+alias toUpperInPlace = std.uni.toUpperInPlace;

 /++
-    Capitalize the first character of $(D s) and conver the rest of $(D s)
+    Capitalize the first character of $(D s) and convert the rest of $(D s)
    to lowercase.
 +/
 S capitalize(S)(S s) @trusted pure
@ -1138,8 +785,6 @@ S capitalize(S)(S s) @trusted pure

 unittest
 {
-    debug(string) printf("string.capitalize.unittest\n");
-
    assertCTFEable!(
    {
    foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
@ -1159,10 +804,9 @@ unittest
        s2 = capitalize(s1);
        assert(cmp(s2, "Fol") == 0);
        assert(s2 !is s1);
-
        s1 = to!S("\u0131 \u0130");
        s2 = capitalize(s1);
-        assert(cmp(s2, "\u0049 \u0069") == 0);
+        assert(cmp(s2, "I \u0130") == 0);
        assert(s2 !is s1);

        s1 = to!S("\u017F \u0049");
@ -1173,7 +817,6 @@ unittest
    });
 }

-
 /++
    Split $(D s) into an array of lines using $(D '\r'), $(D '\n'),
    $(D "\r\n"), $(XREF uni, lineSep), and $(XREF uni, paraSep) as delimiters.
--- a/std/uni.d
+++ b/std/uni.d
--- a/win32.mak
+++ b/win32.mak
@ -191,7 +191,7 @@ SRC_STD_C_OSX= std\c\osx\socket.d

 SRC_STD_C_FREEBSD= std\c\freebsd\socket.d

-SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
+SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d std\internal\unicode_tables.d

 SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d

--- a/win64.mak
+++ b/win64.mak
@ -210,7 +210,7 @@ SRC_STD_C_OSX= std\c\osx\socket.d

 SRC_STD_C_FREEBSD= std\c\freebsd\socket.d

-SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
+SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d std\internal\unicode_tables.d

 SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d