new std.uni module

This commit is contained in:
Dmitry Olshansky 2013-07-20 22:30:10 +04:00
parent 1bd22b2e8b
commit 9a053d97c2
8 changed files with 12979 additions and 2681 deletions

View file

@ -202,7 +202,7 @@ EXTRA_MODULES += $(EXTRA_DOCUMENTABLES) $(addprefix \
std/internal/digest/, sha_SSSE3 ) $(addprefix \
std/internal/math/, biguintcore biguintnoasm biguintx86 \
gammafunction errorfunction) $(addprefix std/internal/, \
processinit uni uni_tab)
processinit uni uni_tab unicode_tables)
# Aggregate all D modules relevant to this build
D_MODULES = crc32 $(STD_MODULES) $(EXTRA_MODULES) $(STD_NET_MODULES) $(STD_DIGEST_MODULES)

File diff suppressed because one or more lines are too long

View file

@ -596,7 +596,8 @@ unittest
val = parseJSON(`"\u2660\u2666"`);
assert(toJSON(&val) == "\"\♠\♦\"");
assertNotThrown(parseJSON(`{ "foo": "` ~ "\u007F" ~ `"}`));
//0x7F is a control character (see Unicode spec)
assertThrown(parseJSON(`{ "foo": "` ~ "\u007F" ~ `"}`));
with(parseJSON(`""`))
assert(str == "" && str !is null);

View file

@ -225,8 +225,9 @@ module std.regex;
import std.internal.uni, std.internal.uni_tab;//unicode property tables
import std.array, std.algorithm, std.range,
std.conv, std.exception, std.traits, std.typetuple,
std.uni, std.utf, std.format, std.typecons, std.bitmanip,
std.utf, std.format, std.typecons, std.bitmanip,
std.functional, std.exception;
import core.bitop, core.stdc.string, core.stdc.stdlib;
static import ascii = std.ascii;
import std.string : representation;
@ -234,6 +235,9 @@ import std.string : representation;
debug import std.stdio;
private:
import std.uni : isAlpha, isWhite;
@safe:
//uncomment to get a barrage of debug info

View file

@ -71,85 +71,15 @@ class StringException : Exception
/++
Compares two ranges of characters lexicographically. The comparison is
case insensitive. Use $(XREF algorithm, cmp) for a case sensitive
comparison. icmp works like $(XREF algorithm, cmp) except that it
converts characters to lowercase prior to applying $(D pred). Technically,
$(D icmp(r1, r2)) is equivalent to
$(D cmp!"std.uni.toLower(a) < std.uni.toLower(b)"(r1, r2)).
comparison. For details see $(XREF uni, icmp).
$(BOOKTABLE,
$(TR $(TD $(D < 0)) $(TD $(D s1 < s2) ))
$(TR $(TD $(D = 0)) $(TD $(D s1 == s2)))
$(TR $(TD $(D > 0)) $(TD $(D s1 > s2)))
)
+/
// Case-insensitive lexicographic comparison of two strings.
// Both inputs are decoded one code point at a time, each code point is
// lowercased with std.uni.toLower, and the first pair that differs under
// `pred` decides the result: -1 when s1 orders first, 1 when s2 orders first.
// If one string is a prefix of the other the shorter one orders first;
// 0 is returned when both are exhausted together.
int icmp(alias pred = "a < b", S1, S2)(S1 s1, S2 s2)
    if (isSomeString!S1 && isSomeString!S2)
{
    // The stock "a < b" predicate is compared with the built-in operators
    // instead of going through binaryFun.
    static if (is(typeof(pred) : string))
        enum usesDefaultOrder = pred == "a < b";
    else
        enum usesDefaultOrder = false;

    size_t pos1 = 0;
    size_t pos2 = 0;

    while (pos1 < s1.length && pos2 < s2.length)
    {
        // decode advances the position past the code point it returns.
        immutable lhs = std.uni.toLower(decode(s1, pos1));
        immutable rhs = std.uni.toLower(decode(s2, pos2));

        static if (usesDefaultOrder)
        {
            if (lhs < rhs) return -1;
            if (lhs > rhs) return 1;
        }
        else
        {
            if (binaryFun!pred(lhs, rhs)) return -1;
            if (binaryFun!pred(rhs, lhs)) return 1;
        }
    }

    // At least one string ran out; whichever still has input is the greater.
    if (pos1 < s1.length) return 1;
    return pos2 < s2.length ? -1 : 0;
}
// Case-insensitive lexicographic comparison for non-string forward ranges
// whose elements are dchar. Elements are lowercased with std.uni.toLower
// before being compared with `pred`. Returns -1, 0 or 1; a range that is
// exhausted first orders before the longer one.
int icmp(alias pred = "a < b", S1, S2)(S1 s1, S2 s2)
    if (!(isSomeString!S1 && isSomeString!S2) &&
        isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar) &&
        isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
{
    // The stock "a < b" predicate is compared with the built-in operators
    // instead of going through binaryFun.
    static if (is(typeof(pred) : string))
        enum usesDefaultOrder = pred == "a < b";
    else
        enum usesDefaultOrder = false;

    while (true)
    {
        // Exhaustion checks come first so that `front` is never read
        // on an empty range.
        if (s1.empty)
            return s2.empty ? 0 : -1;
        if (s2.empty)
            return 1;

        immutable lhs = std.uni.toLower(s1.front);
        immutable rhs = std.uni.toLower(s2.front);

        static if (usesDefaultOrder)
        {
            if (lhs < rhs) return -1;
            if (lhs > rhs) return 1;
        }
        else
        {
            if (binaryFun!pred(lhs, rhs)) return -1;
            if (binaryFun!pred(rhs, lhs)) return 1;
        }

        s1.popFront();
        s2.popFront();
    }
}
+/
alias icmp = std.uni.icmp;
unittest
{
@ -786,318 +716,35 @@ unittest
/++
Returns a string which is identical to $(D s) except that all of its
characters are lowercase (in unicode, not just ASCII). If $(D s) does not
have any uppercase characters, then $(D s) is returned.
characters are converted to lowercase (by performing Unicode lowercase mapping).
If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
// Returns a lowercased version of `s`.
// The input is scanned for the first code point that std.uni.isUpper
// reports as uppercase. If there is none, `s` itself is returned without
// copying. Otherwise the untouched prefix is duplicated and the remainder
// is appended code point by code point, lowercasing the uppercase ones.
S toLower(S)(S s) @trusted pure
    if (isSomeString!S)
{
    foreach (idx, dchar probe; s)
    {
        if (std.uni.isUpper(probe))
        {
            // Everything before idx is already lowercase: copy it verbatim.
            auto buffer = s[0 .. idx].dup;
            foreach (dchar ch; s[idx .. $])
                buffer ~= std.uni.isUpper(ch) ? std.uni.toLower(ch) : ch;
            // buffer is freshly allocated and unshared, so casting it to the
            // caller's (possibly immutable) string type is safe.
            return cast(S) buffer;
        }
    }
    return s;
}
// Exercises toLower over every string type and qualifier combination,
// both at run time and under CTFE.
unittest
{
    debug(string) printf("string.toLower.unittest\n");

    assertCTFEable!(
    {
    foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
    {
        // Input that is already lowercase must be returned as the very
        // same slice (no copy), whatever its qualifier.
        S s = cast(S)"hello world\u0101";
        assert(toLower(s) is s);
        const S sc = "hello world\u0101";
        assert(toLower(sc) is sc);
        immutable S si = "hello world\u0101";
        assert(toLower(si) is si);

        // Mixed-case input must actually be converted. The const and
        // immutable variants use mixed-case text as well, so that the
        // conversion path is exercised for every qualifier rather than
        // only for the mutable one.
        S t = cast(S)"Hello World\u0100";
        assert(toLower(t) == s);
        const S tc = "Hello World\u0100";
        assert(toLower(tc) == s);
        immutable S ti = "Hello World\u0100";
        assert(toLower(ti) == s);
    }
    });
}
alias toLower = std.uni.toLower;
/++
Converts $(D s) to lowercase (in unicode, not just ASCII) in place.
Converts $(D s) to lowercase (by performing Unicode lowercase mapping) in place.
For a few characters the string length may increase after the transformation;
in such a case the function reallocates exactly once.
If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
+/
// Lowercases the UTF-8 / UTF-16 buffer `s`, mutating it in place whenever
// possible.
//
// ASCII uppercase letters are rewritten directly. For a non-ASCII uppercase
// code point the lowercase replacement is encoded into a small stack buffer;
// when it occupies the same number of code units as the original it is
// written over it, so no allocation happens and `s` keeps referring to the
// caller's buffer. Only when the encoded length differs is `s` rebound to a
// newly built array.
//
// Params:
//     s = buffer to convert; may be rebound to a new array if a replacement
//         has a different encoded length than the code point it replaces.
void toLowerInPlace(C)(ref C[] s)
    if (is(C == char) || is(C == wchar))
{
    for (size_t i = 0; i < s.length; )
    {
        immutable c = s[i];
        if (std.ascii.isUpper(c))
        {
            s[i++] = cast(C) (c + (cast(C)'a' - 'A'));
        }
        else if (!std.ascii.isASCII(c))
        {
            // Multi-unit (or non-ASCII single-unit) code point.
            size_t j = i;
            dchar dc = decode(s, j);
            assert(j > i);
            if (!std.uni.isUpper(dc))
            {
                i = j;
                continue;
            }

            // 4 UTF-8 units or 2 UTF-16 units hold any code point.
            C[4 / C.sizeof] buf;
            immutable len = encode(buf, std.uni.toLower(dc));

            if (len == j - i)
            {
                // Same width: overwrite the original units, no allocation.
                foreach (k; 0 .. len)
                    s[i + k] = buf[k];
            }
            else
            {
                // Width changed: the tail has to move, so rebuild the array.
                s = s[0 .. i] ~ buf[0 .. len] ~ s[j .. $];
            }
            i += len;
        }
        else
        {
            ++i;
        }
    }
}
// UTF-32 variant: every element is a whole code point, so each uppercase
// element is simply overwritten with its lowercase counterpart. The array
// is never resized or reallocated.
void toLowerInPlace(C)(ref C[] s) @safe pure nothrow
    if (is(C == dchar))
{
    foreach (idx; 0 .. s.length)
    {
        immutable ch = s[idx];
        if (std.uni.isUpper(ch))
            s[idx] = std.uni.toLower(ch);
    }
}
// Checks toLowerInPlace for each mutable character array type, both at
// run time and under CTFE (via assertCTFEable).
unittest
{
debug(string) printf("string.toLowerInPlace.unittest\n");
assertCTFEable!(
{
foreach (S; TypeTuple!(char[], wchar[], dchar[]))
{
// Already-lowercase text must come out unchanged.
S s = to!S("hello world\u0101");
toLowerInPlace(s);
assert(s == "hello world\u0101");
// Mixed-case text, including the non-ASCII U+0100, must be lowercased.
S t = to!S("Hello World\u0100");
toLowerInPlace(t);
assert(t == "hello world\u0101");
}
});
}
// Cross-checks toLower against toLowerInPlace: for each sample the copying
// and the in-place variant must produce the same text, and toLower must
// return a different slice than its input whenever something was converted.
unittest
{
debug(string) printf("string.toLower/toLowerInPlace.unittest\n");
assertCTFEable!(
{
// Pure ASCII input.
string s1 = "FoL";
string s2 = toLower(s1);
assert(cmp(s2, "fol") == 0, s2);
assert(s2 != s1);
char[] s3 = s1.dup;
toLowerInPlace(s3);
assert(s3 == s2, s3);
// ASCII mixed with U+0100 / U+0101.
s1 = "A\u0100B\u0101d";
s2 = toLower(s1);
s3 = s1.dup;
assert(cmp(s2, "a\u0101b\u0101d") == 0);
assert(s2 !is s1);
toLowerInPlace(s3);
assert(s3 == s2, s3);
// ASCII mixed with U+0460 / U+0461.
s1 = "A\u0460B\u0461d";
s2 = toLower(s1);
s3 = s1.dup;
assert(cmp(s2, "a\u0461b\u0461d") == 0);
assert(s2 !is s1);
toLowerInPlace(s3);
assert(s3 == s2, s3);
// U+0130 is expected to lowercase to plain ASCII "i", i.e. the result
// is shorter (in UTF-8 code units) than the input.
s1 = "\u0130";
s2 = toLower(s1);
s3 = s1.dup;
assert(s2 == "i");
assert(s2 !is s1);
toLowerInPlace(s3);
assert(s3 == s2, s3);
// Test on wchar and dchar strings.
assert(toLower("Some String"w) == "some string"w);
assert(toLower("Some String"d) == "some string"d);
});
}
alias toLowerInPlace = std.uni.toLowerInPlace;
/++
Returns a string which is identical to $(D s) except that all of its
characters are uppercase (in unicode, not just ASCII). If $(D s) does not
have any lowercase characters, then $(D s) is returned.
characters are converted to uppercase (by performing Unicode uppercase mapping).
If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
// Returns an uppercased version of `s`.
// The input is scanned for the first code point that std.uni.isLower
// reports as lowercase. If there is none, `s` itself is returned without
// copying. Otherwise the untouched prefix is duplicated and the remainder
// is appended code point by code point, uppercasing the lowercase ones.
S toUpper(S)(S s) @trusted pure
    if (isSomeString!S)
{
    foreach (idx, dchar probe; s)
    {
        if (std.uni.isLower(probe))
        {
            // Everything before idx is already uppercase: copy it verbatim.
            auto buffer = s[0 .. idx].dup;
            foreach (dchar ch; s[idx .. $])
                buffer ~= std.uni.isLower(ch) ? std.uni.toUpper(ch) : ch;
            // buffer is freshly allocated and unshared, so casting it to the
            // caller's (possibly immutable) string type is safe.
            return cast(S) buffer;
        }
    }
    return s;
}
// Exercises toUpper over every string type and qualifier combination,
// both at run time and under CTFE.
unittest
{
    debug(string) printf("string.toUpper.unittest\n");

    assertCTFEable!(
    {
    foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
    {
        // Input that is already uppercase must be returned as the very
        // same slice (no copy), whatever its qualifier.
        S s = cast(S)"HELLO WORLD\u0100";
        assert(toUpper(s) is s);
        const S sc = "HELLO WORLD\u0100";
        assert(toUpper(sc) is sc);
        immutable S si = "HELLO WORLD\u0100";
        assert(toUpper(si) is si);

        // Lowercase input must actually be converted. The const and
        // immutable variants use lowercase text as well, so that the
        // conversion path is exercised for every qualifier rather than
        // only for the mutable one.
        S t = cast(S)"hello world\u0101";
        assert(toUpper(t) == s);
        const S tc = "hello world\u0101";
        assert(toUpper(tc) == s);
        immutable S ti = "hello world\u0101";
        assert(toUpper(ti) == s);
    }
    });
}
alias toUpper = std.uni.toUpper;
/++
Converts $(D s) to uppercase (in unicode, not just ASCII) in place.
Converts $(D s) to uppercase (by performing Unicode uppercase mapping) in place.
For a few characters the string length may increase after the transformation;
in such a case the function reallocates exactly once.
If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
+/
// Uppercases the UTF-8 / UTF-16 buffer `s`, mutating it in place whenever
// possible.
//
// ASCII lowercase letters are rewritten directly. For a non-ASCII lowercase
// code point the uppercase replacement is encoded into a small stack buffer;
// when it occupies the same number of code units as the original it is
// written over it, so no allocation happens and `s` keeps referring to the
// caller's buffer. Only when the encoded length differs is `s` rebound to a
// newly built array.
//
// Params:
//     s = buffer to convert; may be rebound to a new array if a replacement
//         has a different encoded length than the code point it replaces.
void toUpperInPlace(C)(ref C[] s)
    if (isSomeChar!C &&
        (is(C == char) || is(C == wchar)))
{
    for (size_t i = 0; i < s.length; )
    {
        immutable c = s[i];
        if ('a' <= c && c <= 'z')
        {
            s[i++] = cast(C) (c - (cast(C)'a' - 'A'));
        }
        else if (!std.ascii.isASCII(c))
        {
            // Multi-unit (or non-ASCII single-unit) code point.
            size_t j = i;
            dchar dc = decode(s, j);
            assert(j > i);
            if (!std.uni.isLower(dc))
            {
                i = j;
                continue;
            }

            // 4 UTF-8 units or 2 UTF-16 units hold any code point.
            C[4 / C.sizeof] buf;
            immutable len = encode(buf, std.uni.toUpper(dc));

            if (len == j - i)
            {
                // Same width: overwrite the original units, no allocation.
                foreach (k; 0 .. len)
                    s[i + k] = buf[k];
            }
            else
            {
                // Width changed: the tail has to move, so rebuild the array.
                s = s[0 .. i] ~ buf[0 .. len] ~ s[j .. $];
            }
            i += len;
        }
        else
        {
            ++i;
        }
    }
}
// UTF-32 variant: every element is a whole code point, so each lowercase
// element is simply overwritten with its uppercase counterpart. The array
// is never resized or reallocated.
void toUpperInPlace(C)(ref C[] s) @safe pure nothrow
    if (is(C == dchar))
{
    foreach (idx; 0 .. s.length)
    {
        immutable ch = s[idx];
        if (std.uni.isLower(ch))
            s[idx] = std.uni.toUpper(ch);
    }
}
// Checks toUpperInPlace for each mutable character array type, both at
// run time and under CTFE (via assertCTFEable).
unittest
{
debug(string) printf("string.toUpperInPlace.unittest\n");
assertCTFEable!(
{
foreach (S; TypeTuple!(char[], wchar[], dchar[]))
{
// Already-uppercase text must come out unchanged.
S s = to!S("HELLO WORLD\u0100");
toUpperInPlace(s);
assert(s == "HELLO WORLD\u0100");
// Mixed-case text, including the non-ASCII U+0101, must be uppercased.
S t = to!S("Hello World\u0101");
toUpperInPlace(t);
assert(t == "HELLO WORLD\u0100");
}
});
}
// Cross-checks toUpper against toUpperInPlace: for each sample the copying
// and the in-place variant must produce the same text, and toUpper must
// return a different slice than its input whenever something was converted.
unittest
{
debug(string) printf("string.toUpper/toUpperInPlace.unittest\n");
assertCTFEable!(
{
// Pure ASCII input.
string s1 = "FoL";
string s2;
char[] s3;
s2 = toUpper(s1);
s3 = s1.dup; toUpperInPlace(s3);
assert(s3 == s2, s3);
assert(cmp(s2, "FOL") == 0);
assert(s2 !is s1);
// ASCII mixed with U+0100 / U+0101.
s1 = "a\u0100B\u0101d";
s2 = toUpper(s1);
s3 = s1.dup; toUpperInPlace(s3);
assert(s3 == s2);
assert(cmp(s2, "A\u0100B\u0100D") == 0);
assert(s2 !is s1);
// ASCII mixed with U+0460 / U+0461.
s1 = "a\u0460B\u0461d";
s2 = toUpper(s1);
s3 = s1.dup; toUpperInPlace(s3);
assert(s3 == s2);
assert(cmp(s2, "A\u0460B\u0460D") == 0);
assert(s2 !is s1);
});
}
alias toUpperInPlace = std.uni.toUpperInPlace;
/++
Capitalize the first character of $(D s) and conver the rest of $(D s)
Capitalize the first character of $(D s) and convert the rest of $(D s)
to lowercase.
+/
S capitalize(S)(S s) @trusted pure
@ -1138,8 +785,6 @@ S capitalize(S)(S s) @trusted pure
unittest
{
debug(string) printf("string.capitalize.unittest\n");
assertCTFEable!(
{
foreach (S; TypeTuple!(string, wstring, dstring, char[], wchar[], dchar[]))
@ -1159,10 +804,9 @@ unittest
s2 = capitalize(s1);
assert(cmp(s2, "Fol") == 0);
assert(s2 !is s1);
s1 = to!S("\u0131 \u0130");
s2 = capitalize(s1);
assert(cmp(s2, "\u0049 \u0069") == 0);
assert(cmp(s2, "I \u0130") == 0);
assert(s2 !is s1);
s1 = to!S("\u017F \u0049");
@ -1173,7 +817,6 @@ unittest
});
}
/++
Split $(D s) into an array of lines using $(D '\r'), $(D '\n'),
$(D "\r\n"), $(XREF uni, lineSep), and $(XREF uni, paraSep) as delimiters.

9473
std/uni.d

File diff suppressed because it is too large Load diff

View file

@ -191,7 +191,7 @@ SRC_STD_C_OSX= std\c\osx\socket.d
SRC_STD_C_FREEBSD= std\c\freebsd\socket.d
SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d std\internal\unicode_tables.d
SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d

View file

@ -210,7 +210,7 @@ SRC_STD_C_OSX= std\c\osx\socket.d
SRC_STD_C_FREEBSD= std\c\freebsd\socket.d
SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d
SRC_STD_INTERNAL= std\internal\processinit.d std\internal\uni.d std\internal\uni_tab.d std\internal\unicode_tables.d
SRC_STD_INTERNAL_DIGEST= std\internal\digest\sha_SSSE3.d