phobos/std/utf.d
jmdavis 256976dddd Removed "scheduled for deprecation" pragmas.
The pragmas have not been as effective as we might have liked, since
they only work with templates and can't tell you where in your code you
need to make changes, and they seemed to have been more annoying to
programmers than helpful, so we're going to discontinue them. We'll
leave them in for stuff that's actually been deprecated until deprecated
has been improved enough to take a message, but we'll leave "scheduled
for deprecation" messages to the documentation and changelog.
2011-10-23 23:11:17 -07:00

1746 lines
48 KiB
D

// Written in the D programming language.
/++
Encode and decode UTF-8, UTF-16 and UTF-32 strings.
UTF character support is restricted to
$(D '\u0000' <= character <= '\U0010FFFF').
See_Also:
$(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
$(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
$(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
Macros:
WIKI = Phobos/StdUtf
Copyright: Copyright Digital Mars 2000 - 2010.
License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
Authors: $(WEB digitalmars.com, Walter Bright) and Jonathan M Davis
Source: $(PHOBOSSRC std/_utf.d)
+/
module std.utf;
import std.conv; // to, assumeUnique
import std.exception; // enforce, assumeUnique
import std.range; // walkLength
import std.traits; // isSomeChar, isSomeString
//debug=utf; // uncomment to turn on debugging printf's
debug (utf) import core.stdc.stdio : printf;
version(unittest)
{
import core.exception;
import std.string;
}
/++
    Exception thrown on errors in std.utf functions.

    An instance can optionally carry up to four code units of the offending
    sequence (see $(D setSequence)); when it does, $(D toString) reports them
    in hexadecimal together with the message.
  +/
class UTFException : Exception
{
    /// Recorded code units; only the first $(D len) entries are meaningful.
    uint[4] sequence;

    /// Number of entries of $(D sequence) that have been recorded.
    size_t len;

    /++
        Records the given code units (at most four) as the offending sequence.

        Returns:
            This exception, so that the call can be chained onto $(D new).
      +/
    UTFException setSequence(uint[] data...) @safe pure nothrow
    {
        assert(data.length <= 4);

        // Clamp to the capacity of `sequence` even when asserts are compiled out.
        len = data.length < 4 ? data.length : 4;
        sequence[0 .. len] = data[0 .. len];

        return this;
    }

    /// Creates the exception with the message $(D msg).
    this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
    {
        super(msg, file, line, next);
    }

    /// Creates the exception with $(D msg) followed by the index at which the error occurred.
    this(string msg, size_t index, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
    {
        super(msg ~ " (at index " ~ to!string(index) ~ ")", file, line, next);
    }

    /++
        Returns the default exception text when no sequence has been recorded;
        otherwise returns the recorded code units in base 16, followed by the
        message if there is one.
      +/
    override string toString()
    {
        if (len != 0)
        {
            string text = "Invalid UTF sequence:";

            foreach (unit; sequence[0 .. len])
            {
                text ~= " ";
                text ~= to!string(unit, 16);
            }

            if (super.msg.length > 0)
                text ~= " - " ~ super.msg;

            return text;
        }

        return super.toString();
    }
}
/++
$(RED Scheduled for deprecation in December 2012.
Please use $(LREF UTFException) instead.)

$(D UtfException) is an alias of $(LREF UTFException), so both names refer to
the same class.
+/
alias UTFException UtfException;
/++
    Returns whether $(D c) is a valid UTF-32 character.

    A value is accepted when it is no greater than $(D 0x10FFFF) and does not
    lie in the surrogate range $(D 0xD800) - $(D 0xDFFF).

    $(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
    as they are permitted for internal use by an application, but they are
    not allowed for interchange by the Unicode standard.
  +/
@safe
pure nothrow bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange, so they are deliberately not rejected here.
     * (thanks to Arcane Jill)
     */
    immutable bool withinRange = c <= 0x10FFFF;
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;

    return withinRange && !isSurrogate;
}
// Checks isValidDchar on an ASCII character, on the four boundary values of
// the surrogate range, on U+FFFE/U+FFFF, and on values around U+10FFFF.
unittest
{
debug(utf) printf("utf.isValidDchar.unittest\n");
assert(isValidDchar(cast(dchar)'a') == true);
assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
// Surrogate range boundaries are rejected.
assert(!isValidDchar(cast(dchar)0x00D800));
assert(!isValidDchar(cast(dchar)0x00DBFF));
assert(!isValidDchar(cast(dchar)0x00DC00));
assert(!isValidDchar(cast(dchar)0x00DFFF));
// U+FFFE and U+FFFF are accepted.
assert(isValidDchar(cast(dchar)0x00FFFE));
assert(isValidDchar(cast(dchar)0x00FFFF));
assert(isValidDchar(cast(dchar)0x01FFFF));
// Upper limit: 0x10FFFF is accepted, 0x110000 is not.
assert(isValidDchar(cast(dchar)0x10FFFF));
assert(!isValidDchar(cast(dchar)0x110000));
}
// Lookup table indexed by the value of a UTF-8 code unit. Each entry is the
// stride (sequence length in code units) for a sequence starting with that
// code unit; 0xFF marks a code unit for which the users of this table
// (stride and toUTFindex) throw a UTFException.
private immutable ubyte[256] utf8Stride =
[
// 0x00 - 0x7F: stride 1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// 0x80 - 0xBF: marked 0xFF (bit pattern 10xxxxxx)
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
// 0xC0 - 0xDF: stride 2
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// 0xE0 - 0xEF: stride 3
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
// 0xF0 - 0xF7: stride 4; 0xF8 - 0xFB: 5; 0xFC - 0xFD: 6; 0xFE - 0xFF: marked 0xFF
4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
/++
    $(D stride) returns the length of the UTF-8 sequence starting at $(D index)
    in $(D str).

    The length is looked up from the first code unit only; the code units
    that follow it are not examined.

    Returns:
        The number of bytes in the UTF-8 sequence.

    Throws:
        $(D UTFException) if $(D str[index]) is not the start of a valid UTF-8
        sequence.
  +/
uint stride(in char[] str, size_t index) @safe pure
{
    immutable units = utf8Stride[str[index]];

    // 0xFF is the table's marker for a code unit that cannot begin a sequence.
    if (units == 0xFF)
        throw new UTFException("Not the start of the UTF-8 sequence", index);

    return units;
}
// Checks that stride on UTF-8 strings agrees with codeLength!char for the
// code point located at the given index.
@trusted unittest
{
// Fails with an AssertError carrying the caller's line number when
// stride(s, i) differs from the UTF-8 code length of c.
static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
{
enforce(stride(s, i) == codeLength!char(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'a');
// Indices below are code unit offsets into the UTF-8 encoded string.
test("hello\U00010143\u0100\U00010143", 'h', 0);
test("hello\U00010143\u0100\U00010143", 'e', 1);
test("hello\U00010143\u0100\U00010143", 'l', 2);
test("hello\U00010143\u0100\U00010143", 'l', 3);
test("hello\U00010143\u0100\U00010143", 'o', 4);
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
test("hello\U00010143\u0100\U00010143", '\u0100', 9);
test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
}
/++
    $(D strideBack) returns the length of the UTF-8 sequence ending one code
    unit before $(D index) in $(D str).

    The search walks backwards from $(D index) over at most four code units
    and stops at the first one that is not of the form $(D 10xxxxxx).

    Returns:
        The number of bytes in the UTF-8 sequence.

    Throws:
        $(D UTFException) if $(D str[index]) is not one past the end of a valid
        UTF-8 sequence, i.e. if no such code unit is found within the four
        code units preceding $(D index) (which includes $(D index == 0)).
  +/
uint strideBack(in char[] str, size_t index) @safe pure
{
    foreach (uint distance; 1 .. 5)
    {
        // A code unit whose top two bits are not 10 starts the sequence.
        if (index >= distance && (str[index - distance] & 0b1100_0000) != 0b1000_0000)
            return distance;
    }

    throw new UTFException("Not the end of the UTF sequence", index);
}
// Checks that strideBack on UTF-8 strings agrees with codeLength!char for the
// code point that ends just before the given index.
unittest
{
// When i is left at size_t.max, the end of the string is used as the index.
// Fails with an AssertError carrying the caller's line number on mismatch.
static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
{
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'f');
// Indices below are code unit offsets into the UTF-8 encoded string.
test("\U00010143\u0100\U00010143hello", 'o', 15);
test("\U00010143\u0100\U00010143hello", 'l', 14);
test("\U00010143\u0100\U00010143hello", 'l', 13);
test("\U00010143\u0100\U00010143hello", 'e', 12);
test("\U00010143\u0100\U00010143hello", 'h', 11);
test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
test("\U00010143\u0100\U00010143hello", '\u0100', 6);
test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
}
/++
    $(D stride) returns the length of the UTF-16 sequence starting at $(D index)
    in $(D str).

    Only $(D str[index]) is examined: the result is $(D 2) when it lies in the
    range $(D 0xD800) - $(D 0xDBFF) and $(D 1) otherwise.

    Returns:
        The number of code units ($(D wchar)s) in the UTF-16 sequence.
  +/
uint stride(in wchar[] str, size_t index) @safe pure nothrow
{
    immutable uint first = str[index];
    immutable bool startsPair = first >= 0xD800 && first <= 0xDBFF;

    return startsPair ? 2 : 1;
}
// Checks that stride on UTF-16 strings agrees with codeLength!wchar for the
// code point located at the given index.
@trusted unittest
{
// Fails with an AssertError carrying the caller's line number when
// stride(s, i) differs from the UTF-16 code length of c.
static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
{
enforce(stride(s, i) == codeLength!wchar(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'a');
// Indices below are code unit offsets into the UTF-16 encoded string.
test("hello\U00010143\u0100\U00010143", 'h', 0);
test("hello\U00010143\u0100\U00010143", 'e', 1);
test("hello\U00010143\u0100\U00010143", 'l', 2);
test("hello\U00010143\u0100\U00010143", 'l', 3);
test("hello\U00010143\u0100\U00010143", 'o', 4);
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
test("hello\U00010143\u0100\U00010143", '\u0100', 7);
test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
}
/++
    $(D strideBack) returns the length of the UTF-16 sequence ending one code
    unit before $(D index) in $(D str).

    Returns:
        The number of code units ($(D wchar)s) in the UTF-16 sequence: $(D 2)
        when $(D str[index - 2]) lies in the range $(D 0xD800) - $(D 0xDBFF),
        $(D 1) otherwise.

    Throws:
        $(D UTFException) if $(D str[index]) is not one past the end of a valid
        UTF-16 sequence, i.e. if $(D index) is $(D 0) or $(D str[index - 1])
        lies in the range $(D 0xD800) - $(D 0xDBFF).
  +/
uint strideBack(in wchar[] str, size_t index) @safe pure
{
    // A sequence can end neither before the string starts nor on a code unit
    // in the range 0xD800 - 0xDBFF.
    if (index == 0 || (str[index - 1] >= 0xD800 && str[index - 1] <= 0xDBFF))
        throw new UTFException("Not the end of the UTF-16 sequence", index);

    // With only one code unit before index there is nothing further to inspect.
    if (index == 1)
        return 1;

    immutable previous = str[index - 2];
    return (previous >= 0xD800 && previous <= 0xDBFF) ? 2 : 1;
}
// Checks that strideBack on UTF-16 strings agrees with codeLength!wchar for
// the code point that ends just before the given index.
unittest
{
// When i is left at size_t.max, the end of the string is used as the index.
// Fails with an AssertError carrying the caller's line number on mismatch.
static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
{
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'f');
// Indices below are code unit offsets into the UTF-16 encoded string.
test("\U00010143\u0100\U00010143hello", 'o', 10);
test("\U00010143\u0100\U00010143hello", 'l', 9);
test("\U00010143\u0100\U00010143hello", 'l', 8);
test("\U00010143\u0100\U00010143hello", 'e', 7);
test("\U00010143\u0100\U00010143hello", 'h', 6);
test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
test("\U00010143\u0100\U00010143hello", '\u0100', 3);
test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
}
/++
    $(D stride) returns the length of the UTF-32 sequence starting at $(D index)
    in $(D str).

    $(D index) must be less than $(D str.length); this is checked with an
    $(D assert).

    Returns:
        The number of code units ($(D dchar)s) in the UTF-32 sequence
        (always $(D 1)).
  +/
uint stride(in dchar[] str, size_t index) @safe pure nothrow
{
    assert(index < str.length);

    // This overload always reports a length of one code unit.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
// Checks that stride on UTF-32 strings agrees with codeLength!dchar for the
// code point located at the given index.
unittest
{
// Fails with an AssertError carrying the caller's line number when
// stride(s, i) differs from the UTF-32 code length of c.
static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
{
enforce(stride(s, i) == codeLength!dchar(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'a');
test("hello\U00010143\u0100\U00010143", 'h', 0);
test("hello\U00010143\u0100\U00010143", 'e', 1);
test("hello\U00010143\u0100\U00010143", 'l', 2);
test("hello\U00010143\u0100\U00010143", 'l', 3);
test("hello\U00010143\u0100\U00010143", 'o', 4);
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
test("hello\U00010143\u0100\U00010143", '\u0100', 6);
test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
}
/++
    $(D strideBack) returns the length of the UTF-32 sequence ending one code
    unit before $(D index) in $(D str).

    $(D index) must not exceed $(D str.length); this is checked with an
    $(D assert).

    Returns:
        The number of code units ($(D dchar)s) in the UTF-32 sequence
        (always $(D 1)).
  +/
uint strideBack(in dchar[] str, size_t index) @safe pure nothrow
{
    assert(index <= str.length);

    // This overload always reports a length of one code unit.
    enum uint unitsPerCodePoint = 1;
    return unitsPerCodePoint;
}
// Checks that strideBack on UTF-32 strings agrees with codeLength!dchar for
// the code point that ends just before the given index.
unittest
{
// When i is left at size_t.max, the end of the string is used as the index.
// Fails with an AssertError carrying the caller's line number on mismatch.
static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
{
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
}
test("a", 'a');
test(" ", ' ');
test("\u2029", '\u2029'); //paraSep
test("\u0100", '\u0100');
test("\u0430", '\u0430');
test("\U00010143", '\U00010143');
test("abcdefcdef", 'f');
test("\U00010143\u0100\U00010143hello", 'o', 8);
test("\U00010143\u0100\U00010143hello", 'l', 7);
test("\U00010143\u0100\U00010143hello", 'l', 6);
test("\U00010143\u0100\U00010143hello", 'e', 5);
test("\U00010143\u0100\U00010143hello", 'h', 4);
test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
test("\U00010143\u0100\U00010143hello", '\u0100', 2);
test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
}
/++
    Given $(D index) into $(D str) and assuming that $(D index) is at the start
    of a UTF sequence, $(D toUCSindex) determines the number of UCS characters
    up to $(D index). So, $(D index) is the index of a code unit at the
    beginning of a code point, and the return value is how many code points into
    the string that that code point is.

    For $(D dchar) strings $(D index) is returned unchanged.

    Throws:
        $(D UTFException) if stepping through $(D str) one sequence at a time
        overshoots $(D index), i.e. $(D index) is not at the start of a
        sequence.

    Examples:
--------------------
assert(toUCSindex(`hello world`, 7) == 7);
assert(toUCSindex(`hello world`w, 7) == 7);
assert(toUCSindex(`hello world`d, 7) == 7);
assert(toUCSindex(`Ma Chérie`, 7) == 6);
assert(toUCSindex(`Ma Chérie`w, 7) == 7);
assert(toUCSindex(`Ma Chérie`d, 7) == 7);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
--------------------
  +/
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
    if(isSomeChar!C)
{
    static if(is(Unqual!C == dchar))
    {
        return index;
    }
    else
    {
        size_t codePoints = 0;
        size_t position = 0;

        // Hop from sequence start to sequence start, counting each hop.
        while (position < index)
        {
            position += stride(str, position);
            ++codePoints;
        }

        // Landing beyond index means index pointed into the middle of a sequence.
        if (position > index)
        {
            static if(is(Unqual!C == char))
                enum message = "Invalid UTF-8 sequence";
            else
                enum message = "Invalid UTF-16 sequence";

            throw new UTFException(message, index);
        }

        return codePoints;
    }
}
// Verifies the examples given in the documentation of toUCSindex for
// char, wchar and dchar strings.
unittest
{
assert(toUCSindex(`hello world`, 7) == 7);
assert(toUCSindex(`hello world`w, 7) == 7);
assert(toUCSindex(`hello world`d, 7) == 7);
assert(toUCSindex(`Ma Chérie`, 7) == 6);
assert(toUCSindex(`Ma Chérie`w, 7) == 7);
assert(toUCSindex(`Ma Chérie`d, 7) == 7);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
}
/++
    Given a UCS index $(D n) into $(D str), returns the UTF index.
    So, $(D n) is how many code points into the string the code point is, and
    the array index of the code unit is returned.

    Throws:
        The $(D char) overload throws $(D UTFException) when it reaches a code
        unit that cannot start a UTF-8 sequence.

    Examples:
--------------------
assert(toUTFindex(`hello world`, 7) == 7);
assert(toUTFindex(`hello world`w, 7) == 7);
assert(toUTFindex(`hello world`d, 7) == 7);
assert(toUTFindex(`Ma Chérie`, 6) == 7);
assert(toUTFindex(`Ma Chérie`w, 7) == 7);
assert(toUTFindex(`Ma Chérie`d, 7) == 7);
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
--------------------
  +/
size_t toUTFindex(in char[] str, size_t n) @safe pure
{
    size_t position = 0;

    // Skip over n sequences, using the stride of each lead code unit.
    for (; n != 0; --n)
    {
        immutable uint units = utf8Stride[str[position]];

        if (units == 0xFF)
            throw (new UTFException("Invalid UTF-8 sequence")).setSequence(str[position]);

        position += units;
    }

    return position;
}
/// ditto
size_t toUTFindex(in wchar[] str, size_t n) @safe pure nothrow
{
    size_t position = 0;

    // Skip over n sequences; a code unit in 0xD800 - 0xDBFF counts as the
    // first of two code units, anything else as a single code unit.
    for (; n != 0; --n)
    {
        immutable wchar first = str[position];
        position += (first >= 0xD800 && first <= 0xDBFF) ? 2 : 1;
    }

    return position;
}
/// ditto
size_t toUTFindex(in dchar[] str, size_t n) @safe pure nothrow
{
// For dchar strings the index is returned as given; str is not inspected.
return n;
}
/* =================== Decode ======================= */
/++
    Decodes and returns the character starting at $(D str[index]). $(D index)
    is advanced to one past the decoded character. If the character is not
    well-formed, then a $(D UTFException) is thrown and $(D index) remains
    unchanged.

    When an invalid UTF-8 sequence is found, the thrown exception records the
    offending code unit at $(D str[index]) together with the continuation
    bytes ($(D 10xxxxxx)) that directly follow it, up to four code units in
    total.

    Throws:
        $(D UTFException) if $(D index) is not less than $(D str.length) or if
        $(D str[index]) is not the start of a valid UTF sequence.
  +/
dchar decode(in char[] str, ref size_t index) @safe pure
out (result)
{
    assert(isValidDchar(result));
}
body
{
    enforceEx!UTFException(index < str.length, "Attempted to decode past the end of a string");

    immutable len = str.length;
    dchar V;
    size_t i = index;   // working position; index is only updated on success
    char u = str[i];

    if (u & 0x80)
    {
        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte; that is the sequence length.
        uint n = 1;
        for (; ; n++)
        {
            if (n > 4)
                goto Lerr;      // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                if (n == 1)
                    goto Lerr;  // 10xxxxxx cannot start a sequence
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = cast(dchar)(u & ((1 << (7 - n)) - 1));

        if (i + n > len)
            goto Lerr;          // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         * The overlong 5 and 6 byte forms need no check here because lead
         * bytes of 0xF8 and above were already rejected by the n > 4 test.
         */
        auto u2 = str[i + 1];
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80))
            goto Lerr;          // overlong combination

        foreach (j; 1 .. n)
        {
            u = str[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;      // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }

        if (!isValidDchar(V))
            goto Lerr;

        i += n;
    }
    else
    {
        V = cast(dchar)u;
        i++;
    }

    index = i;
    return V;

  Lerr:
    // Collect the offending code units for the exception: the code unit at
    // index followed by any continuation bytes, at most four in total. The
    // buffer is filled from slot 0 regardless of where index lies in str.
    // index < len is guaranteed by the enforceEx at the top.
    uint[4] sequence;
    size_t seqLen = 0;
    do
    {
        sequence[seqLen] = str[index + seqLen];
        ++seqLen;
    } while (seqLen < 4 && index + seqLen < len && (str[index + seqLen] & 0xC0) == 0x80);

    throw (new UTFException("Invalid UTF-8 sequence", index)).setSequence(sequence[0 .. seqLen]);
}
// Checks decode on UTF-8 input: one, two and three byte sequences decode to
// the expected code point and advance the index, and a set of truncated or
// overlong inputs each throw UTFException.
unittest
{
size_t i;
dchar c;
debug(utf) printf("utf.decode.unittest\n");
static string s1 = "abcd";
i = 0;
c = decode(s1, i);
assert(c == cast(dchar)'a');
assert(i == 1);
c = decode(s1, i);
assert(c == cast(dchar)'b');
assert(i == 2);
// Two byte sequence.
static string s2 = "\xC2\xA9";
i = 0;
c = decode(s2, i);
assert(c == cast(dchar)'\u00A9');
assert(i == 2);
// Three byte sequence.
static string s3 = "\xE2\x89\xA0";
i = 0;
c = decode(s3, i);
assert(c == cast(dchar)'\u2260');
assert(i == 3);
// Inputs that must be rejected.
static string[] s4 = [
"\xE2\x89", // too short
"\xC0\x8A",
"\xE0\x80\x8A",
"\xF0\x80\x80\x8A",
"\xF8\x80\x80\x80\x8A",
"\xFC\x80\x80\x80\x80\x8A",
];
for (int j = 0; j < s4.length; j++)
{
try
{
i = 0;
c = decode(s4[j], i);
assert(0);
}
catch (UTFException u)
{
// i is set to a sentinel so that the assert below proves the catch ran.
i = 23;
delete u;
}
assert(i == 23);
}
}
// Checks that the UTF-8 encodings of U+FFFE and U+FFFF decode successfully
// and that each of the listed three byte sequences starting with 0xED throws
// UTFException.
unittest
{
size_t i;
i = 0; assert(decode("\xEF\xBF\xBE"c, i) == cast(dchar)0xFFFE);
i = 0; assert(decode("\xEF\xBF\xBF"c, i) == cast(dchar)0xFFFF);
i = 0;
assertThrown!UTFException(decode("\xED\xA0\x80"c, i));
assertThrown!UTFException(decode("\xED\xAD\xBF"c, i));
assertThrown!UTFException(decode("\xED\xAE\x80"c, i));
assertThrown!UTFException(decode("\xED\xAF\xBF"c, i));
assertThrown!UTFException(decode("\xED\xB0\x80"c, i));
assertThrown!UTFException(decode("\xED\xBE\x80"c, i));
assertThrown!UTFException(decode("\xED\xBF\xBF"c, i));
}
/// ditto
dchar decode(in wchar[] str, ref size_t index) @safe pure
out (result)
{
    assert(isValidDchar(result));
}
body
{
    enforceEx!UTFException(index < str.length, "Attempted to decode past the end of a string");

    // index itself is only written on success, so it stays unchanged on error.
    immutable size_t start = index;
    immutable uint first = str[start];

    if (first >= 0xD800 && first <= 0xDBFF)
    {
        // High surrogate: a low surrogate must follow.
        if (start + 1 == str.length)
            throw (new UTFException("surrogate UTF-16 high value past end of string")).setSequence(str[start]);

        immutable uint second = str[start + 1];
        if (second < 0xDC00 || second > 0xDFFF)
            throw (new UTFException("surrogate UTF-16 low value out of range")).setSequence(str[start]);

        index = start + 2;
        return cast(dchar)(((first - 0xD7C0) << 10) + (second - 0xDC00));
    }

    if (first >= 0xDC00 && first <= 0xDFFF)
        throw (new UTFException("unpaired surrogate UTF-16 value")).setSequence(str[start]);

    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)
    index = start + 1;
    return cast(dchar)first;
}
// Checks that decode on UTF-16 input returns 0xFFFE and 0xFFFF unchanged and
// advances the index by one code unit.
unittest
{
size_t i;
i = 0; assert(decode([ cast(wchar)0xFFFE ], i) == cast(dchar)0xFFFE && i == 1);
i = 0; assert(decode([ cast(wchar)0xFFFF ], i) == cast(dchar)0xFFFF && i == 1);
}
/// ditto
dchar decode(in dchar[] str, ref size_t index) @safe pure
{
    enforceEx!UTFException(index < str.length, "Attempted to decode past the end of a string");

    immutable dchar value = str[index];

    // The offending value is attached to the exception; index is left untouched.
    if (!isValidDchar(value))
        throw (new UTFException("Invalid UTF-32 value")).setSequence(value);

    ++index;
    return value;
}
/* =================== Encode ======================= */
/++
    Encodes $(D c) into the static array, $(D buf), and returns the actual
    length of the encoded character (a number between $(D 1) and $(D 4) for
    $(D char[4]) buffers and a number between $(D 1) and $(D 2) for
    $(D wchar[2]) buffers).

    Only the first returned-length elements of $(D buf) are written.

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point, i.e. if it
        is a surrogate ($(D 0xD800) - $(D 0xDFFF)) or greater than
        $(D 0x10FFFF).
  +/
size_t encode(ref char[4] buf, dchar c) @safe pure
{
    // Reject everything that cannot be encoded before touching buf.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw (new UTFException("Encoding an invalid code point in UTF-8")).setSequence(c);
    }

    if (c >= 0xD800 && c <= 0xDFFF)
        throw (new UTFException("Encoding a surrogate code point in UTF-8")).setSequence(c);

    assert(isValidDchar(c));

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char)c;
        return 1;
    }

    if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }

    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}
// Checks encode into a char[4] buffer at the boundaries between the one, two,
// three and four byte forms, and that surrogates and 0x110000 throw.
unittest
{
char[4] buf;
assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");
// Values that must be rejected.
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
}
/// Ditto
size_t encode(ref wchar[2] buf, dchar c) @safe pure
{
    // Reject everything that cannot be encoded before touching buf.
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw (new UTFException("Encoding an invalid code point in UTF-16")).setSequence(c);
    }

    if (c >= 0xD800 && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);

    assert(isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Split the value above 0x10000 into a surrogate pair.
        buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return 2;
    }

    // Fits in a single code unit.
    buf[0] = cast(wchar)c;
    return 1;
}
// Checks encode into a wchar[2] buffer for single code unit values, for
// surrogate pairs, and that surrogates and 0x110000 throw.
unittest
{
wchar[2] buf;
assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
// Values that must be rejected.
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
}
/++
    Encodes $(D c) in $(D str)'s encoding and appends it to $(D str).

    $(D str) is left unmodified when an exception is thrown.

    Throws:
        $(D UTFException) if $(D c) is not a valid UTF code point.
  +/
void encode(ref char[] str, dchar c) @safe pure
{
    // Encode into a scratch buffer with the fixed-size overload (which performs
    // the validity checks and throws), then append only the code units written.
    char[4] scratch;
    immutable size_t written = encode(scratch, c);

    str ~= scratch[0 .. written];
}
// Checks that encode appends the one, two and three byte UTF-8 forms to an
// existing char[].
unittest
{
debug(utf) printf("utf.encode.unittest\n");
char[] s = "abcd".dup;
encode(s, cast(dchar)'a');
assert(s.length == 5);
assert(s == "abcda");
encode(s, cast(dchar)'\u00A9');
assert(s.length == 7);
assert(s == "abcda\xC2\xA9");
//assert(s == "abcda\u00A9"); // BUG: fix compiler
encode(s, cast(dchar)'\u2260');
assert(s.length == 10);
assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}
// Checks appending encode on char[] at the boundaries between the one, two,
// three and four byte forms. Each assert compares the tail of buf starting at
// the offset where the newly appended sequence begins. Surrogates and
// 0x110000 must throw.
unittest
{
char[] buf;
encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
}
/// ditto
void encode(ref wchar[] str, dchar c) @safe pure
{
    // Encode into a scratch buffer with the fixed-size overload (which performs
    // the validity checks and throws), then append only the code units written.
    wchar[2] scratch;
    immutable size_t written = encode(scratch, c);

    str ~= scratch[0 .. written];
}
// Checks appending encode on wchar[] for single code unit values and for
// surrogate pairs, and that surrogates and 0x110000 throw.
unittest
{
wchar[] buf;
encode(buf, '\u0000'); assert(buf[0] == '\u0000');
encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
encode(buf, '\uE000'); assert(buf[2] == '\uE000');
encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
// Each of the following appends two code units.
encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
}
/// ditto
void encode(ref dchar[] str, dchar c) @safe pure
{
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    immutable bool aboveLimit = c > 0x10FFFF;

    if (isSurrogate || aboveLimit)
        throw (new UTFException("Encoding an invalid code point in UTF-32")).setSequence(c);

    assert(isValidDchar(c));

    // A valid code point is stored as a single code unit.
    str ~= c;
}
// Checks appending encode on dchar[]: valid values are appended as single
// elements, and surrogates and 0x110000 throw.
unittest
{
dchar[] buf;
encode(buf, '\u0000'); assert(buf[0] == '\u0000');
encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
encode(buf, '\uE000'); assert(buf[2] == '\uE000');
encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
}
/++
    Returns the number of code units that are required to encode the code point
    $(D c) when $(D C) is the character type used to encode it.

    The encoding is selected by $(D C.sizeof): $(D 1) for UTF-8, $(D 2) for
    UTF-16 and $(D 4) for UTF-32. For UTF-8, a value of $(D c) greater than
    $(D 0x10FFFF) triggers $(D assert(false)).

    Examples:
------
assert(codeLength!char('a') == 1);
assert(codeLength!wchar('a') == 1);
assert(codeLength!dchar('a') == 1);
assert(codeLength!char('\U0010FFFF') == 4);
assert(codeLength!wchar('\U0010FFFF') == 2);
assert(codeLength!dchar('\U0010FFFF') == 1);
------
  +/
ubyte codeLength(C)(dchar c) @safe pure nothrow
{
    static if (C.sizeof == 1)
    {
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;

        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        if (c <= 0xFFFF)
            return 1;

        return 2;
    }
    else
    {
        static assert(C.sizeof == 4);
        return 1;
    }
}
//Verify Examples.
// These asserts mirror the examples in the documentation of codeLength.
unittest
{
assert(codeLength!char('a') == 1);
assert(codeLength!wchar('a') == 1);
assert(codeLength!dchar('a') == 1);
assert(codeLength!char('\U0010FFFF') == 4);
assert(codeLength!wchar('\U0010FFFF') == 2);
assert(codeLength!dchar('\U0010FFFF') == 1);
}
/* =================== Validation ======================= */
/++
    Checks to see if $(D str) is well-formed unicode or not.

    The string is decoded from start to end with $(D decode); nothing is
    returned, so success is signalled by the absence of an exception.

    Throws:
        $(D UTFException) if $(D str) is not well-formed.
  +/
void validate(S)(in S str) @safe pure
    if(isSomeString!S)
{
    immutable total = str.length;
    size_t position = 0;

    // decode advances position past each well-formed sequence and throws otherwise.
    while (position < total)
        decode(str, position);
}
/* =================== Conversion to UTF8 ======================= */
@trusted
{
/++
    Encodes $(D c) as UTF-8 into $(D buf) and returns the slice of $(D buf)
    that holds the encoded code units.

    $(D c) must satisfy $(D isValidDchar); this is checked by the $(D in)
    contract. Values above $(D 0x10FFFF) end in $(D assert(0)).
  +/
char[] toUTF8(out char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char)c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
    {
        assert(0);
    }

    return buf[0 .. used];
}
/*******************
 * Encodes string $(D_PARAM s) into UTF-8 and returns the encoded string.
 *
 * For $(D char) input the string is validated and an immutable copy of it
 * is returned.
 */
string toUTF8(in char[] s)
{
    // validate throws if s is not well-formed; only then is the copy made.
    validate(s);

    auto copy = s.idup;
    return copy;
}
/// ditto
string toUTF8(in wchar[] s)
{
    immutable size_t total = s.length;

    char[] result;
    result.length = total;

    // Fast path: copy the leading run of ASCII code units one for one.
    size_t position = 0;
    while (position < total && s[position] <= 0x7F)
    {
        result[position] = cast(char)s[position];
        ++position;
    }

    // Slow path: once a non-ASCII code unit is met, trim the result to what
    // has been copied so far and transcode the rest code point by code point.
    if (position < total)
    {
        result.length = position;
        while (position < total)
            encode(result, decode(s, position));
    }

    return result.assumeUnique();
}
/// ditto
pure string toUTF8(in dchar[] s)
{
    immutable size_t total = s.length;

    char[] result;
    result.length = total;

    // Fast path: copy the leading run of ASCII code points one for one.
    size_t position = 0;
    while (position < total && s[position] <= 0x7F)
    {
        result[position] = cast(char)s[position];
        ++position;
    }

    // Slow path: once a non-ASCII code point is met, trim the result to what
    // has been copied so far and encode every remaining code point.
    if (position < total)
    {
        result.length = position;
        foreach (dchar d; s[position .. total])
            encode(result, d);
    }

    return result.assumeUnique();
}
/* =================== Conversion to UTF16 ======================= */
/++
    Encodes $(D c) as UTF-16 into $(D buf) and returns the slice of $(D buf)
    that holds the encoded code units (one or two).

    $(D c) must satisfy $(D isValidDchar); this is checked by the $(D in)
    contract.
  +/
pure wchar[] toUTF16(ref wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Split the value above 0x10000 into a surrogate pair.
        buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    // Fits in a single code unit.
    buf[0] = cast(wchar)c;
    return buf[0 .. 1];
}
/****************
 * Encodes string $(D s) into UTF-16 and returns the encoded string.
 *
 * ASCII code units are copied directly; everything else is decoded and
 * re-encoded, so invalid input results in a $(D UTFException).
 */
wstring toUTF16(in char[] s)
{
    immutable size_t total = s.length;

    wchar[] result;
    // The length is set and then reset to zero before appending
    // (presumably intended to pre-allocate storage).
    result.length = total;
    result.length = 0;

    size_t position = 0;
    while (position < total)
    {
        if (s[position] <= 0x7F)
        {
            // ASCII maps to the same value in UTF-16.
            result ~= cast(wchar)s[position];
            ++position;
        }
        else
        {
            // decode advances position past the whole sequence.
            immutable dchar c = decode(s, position);
            encode(result, c);
        }
    }

    return result.assumeUnique();  // ok because result is unique
}
/// ditto
wstring toUTF16(in wchar[] s)
{
    // validate throws if s is not well-formed; only then is the copy made.
    validate(s);

    auto copy = s.idup;
    return copy;
}
/// ditto
pure wstring toUTF16(in dchar[] s)
{
    wchar[] encoded;

    // Grow to the source length, then reset to empty before appending.
    // NOTE(review): this looks like a pre-allocation attempt; confirm
    // whether the runtime really reuses the block once the length has been
    // set back to zero.
    encoded.length = s.length;
    encoded.length = 0;

    // Append the UTF-16 encoding of each code point in turn.
    foreach (dchar codePoint; s)
        encode(encoded, codePoint);

    return encoded.assumeUnique();  // ok because encoded is unique
}
/++
    Encodes string $(D s) into UTF-16 and returns the encoded string.
    $(D toUTF16z) is suitable for calling the 'W' functions in the Win32 API
    that take an $(D LPWSTR) or $(D LPCWSTR) argument.

    The returned pointer addresses the encoded code units followed by a
    single zero code unit.
  +/
const(wchar)* toUTF16z(in char[] s)
{
    wchar[] r;
    immutable slen = s.length;

    // A UTF-8 sequence never turns into more UTF-16 code units than it has
    // bytes, so slen units plus one for the terminator is an upper bound.
    // reserve leaves r empty but appendable in place; growing the length
    // and then resetting it to zero does not, because appending to a shrunk
    // slice reallocates.
    r.reserve(slen + 1);

    for (size_t i = 0; i < slen; )
    {
        immutable char unit = s[i];
        if (unit <= 0x7F)
        {
            // ASCII: widen directly, no decoding needed.
            r ~= cast(wchar)unit;
            ++i;
        }
        else
        {
            // decode advances i past the whole multi-byte sequence.
            encode(r, decode(s, i));
        }
    }

    // Terminating zero expected by C-style APIs.
    r ~= '\0';
    return r.ptr;
}
/* =================== Conversion to UTF32 ======================= */
/*****
 * Encodes string $(D_PARAM s) into UTF-32 and returns the encoded string.
 *
 * The argument itself is never modified; the result is a separate immutable
 * string.
 */
dstring toUTF32(in char[] s)
{
    immutable srcLen = s.length;

    // One output slot per input code unit is always enough, because each
    // code point consumes at least one unit of the source.
    dchar[] decoded;
    decoded.length = srcLen;

    size_t written = 0;
    size_t pos = 0;
    while (pos < srcLen)
    {
        dchar codePoint = s[pos];
        if (codePoint < 0x80)
            ++pos;                          // ASCII, nothing to decode
        else
            codePoint = decode(s, pos);     // advances pos itself
        decoded[written] = codePoint;
        ++written;
    }

    // Only the filled prefix is returned.
    return decoded[0 .. written].assumeUnique(); // legit because it's unique
}
/// ditto
dstring toUTF32(in wchar[] s)
{
    immutable srcLen = s.length;

    // One output slot per input code unit is always enough, because each
    // code point consumes at least one unit of the source.
    dchar[] decoded;
    decoded.length = srcLen;

    size_t written = 0;
    size_t pos = 0;
    while (pos < srcLen)
    {
        dchar codePoint = s[pos];
        if (codePoint < 0x80)
            ++pos;                          // ASCII, nothing to decode
        else
            codePoint = decode(s, pos);     // advances pos itself
        decoded[written] = codePoint;
        ++written;
    }

    // Only the filled prefix is returned.
    return decoded[0 .. written].assumeUnique(); // legit because it's unique
}
/// ditto
dstring toUTF32(in dchar[] s)
{
    // The input is already made of UTF-32 code units: run it through
    // validate, then hand back an immutable duplicate.
    validate(s);
    dstring duplicate = s.idup;
    return duplicate;
}
} // Convert functions are @safe
/* =================== toUTFz ======================= */
/++
Returns a C-style zero-terminated string equivalent to $(D str). $(D str)
must not contain embedded $(D '\0')'s as any C function will treat the first
$(D '\0') that it sees as the end of the string. If $(D str.empty) is
$(D true), then a string containing only $(D '\0') is returned.
$(D toUTFz) accepts any type of string and is templated on the type of
character pointer that you wish to convert to. It will avoid allocating a
new string if it can, but there's a decent chance that it will end up having
to allocate a new string - particularly when dealing with character types
other than $(D char).
$(RED Warning 1:) If the result of $(D toUTFz) equals $(D str.ptr), then if
anything alters the character one past the end of $(D str) (which is the
$(D '\0') character terminating the string), then the string won't be
zero-terminated anymore. The most likely scenarios for that are if you
append to $(D str) and no reallocation takes place or when $(D str) is a
slice of a larger array, and you alter the character in the larger array
which is one character past the end of $(D str). Another case where it could
occur would be if you had a mutable character array immediately after
$(D str) in memory (for example, if they're member variables in a
user-defined type with one declared right after the other) and that
character array happened to start with $(D '\0'). Such scenarios will never
occur if you immediately use the zero-terminated string after calling
$(D toUTFz) and the C function using it doesn't keep a reference to it.
Also, they are unlikely to occur even if you save the zero-terminated string
(the cases above would be among the few examples of where it could happen).
However, if you save the zero-terminated string and want to be absolutely
certain that the string stays zero-terminated, then simply append a
$(D '\0') to the string and use its $(D ptr) property rather than calling
$(D toUTFz).
$(RED Warning 2:) When passing a character pointer to a C function, and the
C function keeps it around for any reason, make sure that you keep a
reference to it in your D code. Otherwise, it may go away during a garbage
collection cycle and cause a nasty bug when the C code tries to use it.
Examples:
--------------------
auto p1 = toUTFz!(char*)("hello world");
auto p2 = toUTFz!(const(char)*)("hello world");
auto p3 = toUTFz!(immutable(char)*)("hello world");
auto p4 = toUTFz!(char*)("hello world"d);
auto p5 = toUTFz!(const(wchar)*)("hello world");
auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
--------------------
+/
P toUTFz(P, S)(S str) @system
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
       is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias typeof(*P.init) OutChar;
    alias Unqual!(ElementEncodingType!S) C;

    // An empty string becomes a fresh one-element array holding only the
    // terminator.
    if(str.empty)
    {
        OutChar[] terminatorOnly = ['\0'];
        return terminatorOnly.ptr;
    }

    // The immutable data can only be handed out directly when the requested
    // pointer is const or immutable; a mutable target always gets a copy.
    static if(!is(Unqual!OutChar == OutChar))
    {
        // Peek at the element one past the end of str: if it is already 0,
        // no conversion is necessary. The compiler puts a 0 past the end of
        // static strings, and the storage allocator puts a 0 past the end
        // of newly allocated char[]'s.
        //
        // Whether that element may be read at all is decided by a simple
        // heuristic: an address that is a multiple of 4 is conservatively
        // treated as the possible start of a new, maybe unreadable, block
        // of memory and is never dereferenced; any other address is taken
        // to lie inside valid memory.
        immutable pastEnd = str.ptr + str.length;

        if((cast(size_t)pastEnd & 3) && *pastEnd == '\0')
            return str.ptr;
    }

    // Otherwise defer to the overload for const(C)[] sources.
    return toUTFz!(P, const(C)[])(cast(const(C)[])str);
}
P toUTFz(P, S)(S str) @system
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
       !is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
{
    alias ElementEncodingType!S InChar;
    alias typeof(*P.init) OutChar;

    enum bool inIsConst = is(const(Unqual!InChar) == InChar);
    enum bool outIsConst = is(const(Unqual!OutChar) == OutChar);
    enum bool outIsImmutable = is(immutable(Unqual!OutChar) == OutChar);

    //const(C)[] -> const(C)* or
    //C[] -> C* or const(C)*
    static if((inIsConst && outIsConst) || (!inIsConst && !outIsImmutable))
    {
        // The qualifiers are compatible, so the source's own pointer can be
        // returned. Peek at the element one past the end (only when its
        // address is not a multiple of 4); if it is not already a
        // terminator, append one to the local slice first.
        auto pastEnd = str.ptr + str.length;
        immutable alreadyTerminated = (cast(size_t)pastEnd & 3) && *pastEnd == '\0';

        if(!alreadyTerminated)
            str ~= '\0';

        return str.ptr;
    }
    //const(C)[] -> C* or immutable(C)* or
    //C[] -> immutable(C)*
    else
    {
        // The qualifiers are incompatible, so the characters are copied
        // into a new array with room for the terminator, and the pointer to
        // that array is cast to the requested type.
        immutable len = str.length;
        auto buffer = uninitializedArray!(Unqual!OutChar[])(len + 1);

        buffer[0 .. len] = str[];
        buffer[len] = '\0';

        return cast(P)buffer.ptr;
    }
}
P toUTFz(P, S)(S str)
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       !is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)))
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
{
    alias typeof(*P.init) OutChar;

    // Source and target use different character types, so the string is
    // walked code point by code point and each one is put into an appender
    // of the target character type, followed by the terminator.
    auto sink = appender!(OutChar[])();

    foreach(dchar codePoint; str)
    {
        sink.put(codePoint);
    }
    sink.put('\0');

    return cast(P)sink.data.ptr;
}
//Verify Examples.
// These calls repeat, line for line, the snippet shown in the Examples
// section of the toUTFz documentation, so that the documented usage is
// compiled and run with the unit tests. The returned pointers are not
// inspected here.
unittest
{
auto p1 = toUTFz!(char*)("hello world");
auto p2 = toUTFz!(const(char)*)("hello world");
auto p3 = toUTFz!(immutable(char)*)("hello world");
auto p4 = toUTFz!(char*)("hello world"d);
auto p5 = toUTFz!(const(wchar)*)("hello world");
auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}
// Exercises toUTFz over combinations of source string type and target
// pointer type.
unittest
{
import core.exception;
import std.algorithm;
import std.metastrings;
import std.typetuple;
// Counts the elements in front of the first '\0' reachable from ptr.
size_t zeroLen(C)(const(C)* ptr)
{
size_t len = 0;
while(*ptr != '\0')
{
++ptr;
++len;
}
return len;
}
// Same character type on both sides: immutable sources to mutable, const
// and immutable pointers.
foreach(S; TypeTuple!(string, wstring, dstring))
{
alias Unqual!(typeof(S.init[0])) C;
auto s1 = to!S("hello\U00010143\u0100\U00010143");
// s2 has the same contents as s1, but it is built from an array with one
// extra element which is set to '\n' before the length is shrunk again, so
// the element right behind s2 is '\n' rather than '\0' - presumably to
// make sure toUTFz cannot rely on a terminator that is already in place.
auto temp = new C[](s1.length + 1);
temp[0 .. $ - 1] = s1[0 .. $];
temp[$ - 1] = '\n';
--temp.length;
auto s2 = assumeUnique(temp);
assert(s1 == s2);
foreach(P; TypeTuple!(C*, const(C)*, immutable(C)*))
{
// The result must hold the original characters followed by '\0'.
auto p1 = toUTFz!P(s1);
assert(p1[0 .. s1.length] == s1);
assert(p1[s1.length] == '\0');
auto p2 = toUTFz!P(s2);
assert(p2[0 .. s2.length] == s2);
assert(p2[s2.length] == '\0');
}
}
// Converts s to a P, measures the result with zeroLen and compares it with
// s using cmp. On a mismatch an AssertError naming both types is thrown,
// carrying the line of the calling test rather than the line of this helper.
void test(P, S)(S s, size_t line = __LINE__)
{
auto p = toUTFz!P(s);
immutable len = zeroLen(p);
enforce(cmp(s, p[0 .. len]) == 0,
new AssertError(Format!("Unit test failed: %s %s", P.stringof, S.stringof),
__FILE__, line));
}
// Different character types: string source to wchar and dchar pointers.
foreach(P; TypeTuple!(wchar*, const(wchar)*, immutable(wchar)*,
dchar*, const(dchar)*, immutable(dchar)*))
{
test!P("hello\U00010143\u0100\U00010143");
}
// wstring source to char and dchar pointers.
foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
dchar*, const(dchar)*, immutable(dchar)*))
{
test!P("hello\U00010143\u0100\U00010143"w);
}
// dstring source to char and wchar pointers.
foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
wchar*, const(wchar)*, immutable(wchar)*))
{
test!P("hello\U00010143\u0100\U00010143"d);
}
// Mutable and const sources of every character type to every pointer type.
foreach(S; TypeTuple!(char[], wchar[], dchar[],
const(char)[], const(wchar)[], const(dchar)[]))
{
auto s = to!S("hello\U00010143\u0100\U00010143");
foreach(P; TypeTuple!(char*, wchar*, dchar*,
const(char)*, const(wchar)*, const(dchar)*,
immutable(char)*, immutable(wchar)*, immutable(dchar)*))
{
test!P(s);
}
}
}
/* ================================ tests ================================== */
unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Takes one text in its UTF-8, UTF-16 and UTF-32 spelling and pushes it
    // through the conversions in a fixed order, checking after every step
    // that the result equals the spelling expected for that encoding.
    void checkConversions(string expected8, wstring expected16, dstring expected32)
    {
        string c = expected8;

        wstring w = toUTF16(c);
        assert(w == expected16);

        dstring d = toUTF32(c);
        assert(d == expected32);

        c = toUTF8(w);
        assert(c == expected8);

        d = toUTF32(w);
        assert(d == expected32);

        c = toUTF8(d);
        assert(c == expected8);

        w = toUTF16(d);
        assert(w == expected16);
    }

    // Plain ASCII.
    checkConversions("hello", "hello", "hello");

    // A code point that fits in a single UTF-16 code unit.
    checkConversions("hel\u1234o", "hel\u1234o", "hel\u1234o");

    // A code point above 0xFFFF.
    checkConversions("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}
/++
    Returns the total number of code points encoded in $(D str).

    Supersedes: This function supersedes $(LREF toUCSindex).

    Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

    Throws:
        $(D UTFException) if $(D str) is not well-formed.
  +/
size_t count(C)(const(C)[] str) @trusted pure
    if(isSomeChar!C)
{
    // The counting itself is delegated to walkLength.
    immutable size_t codePoints = str.walkLength();
    return codePoints;
}
unittest
{
    // Inputs and the number of code points expected for each of them,
    // paired index for index.
    string[] inputs = ["", "a", "abc", "\u20AC100"];
    size_t[] expected = [0, 1, 3, 4];

    foreach(idx, text; inputs)
        assert(count(text) == expected[idx]);
}