mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 14:10:30 +03:00

It's easier to define aliases this way. I created two templates though so that you can still pass both template arguments if you want to, and it'll avoid breaking any code.
1781 lines
48 KiB
D
1781 lines
48 KiB
D
// Written in the D programming language.
|
|
|
|
/++
|
|
Encode and decode UTF-8, UTF-16 and UTF-32 strings.
|
|
|
|
UTF character support is restricted to
|
|
$(D '\u0000' <= character <= '\U0010FFFF').
|
|
|
|
See_Also:
|
|
$(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
|
|
$(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
|
|
$(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
|
|
Macros:
|
|
WIKI = Phobos/StdUtf
|
|
|
|
Copyright: Copyright Digital Mars 2000 - 2010.
|
|
License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
|
|
Authors: $(WEB digitalmars.com, Walter Bright) and Jonathan M Davis
|
|
Source: $(PHOBOSSRC std/_utf.d)
|
|
+/
|
|
module std.utf;
|
|
|
|
import std.conv; // to, assumeUnique
|
|
import std.exception; // enforce, assumeUnique
|
|
import std.range; // walkLength
|
|
import std.traits; // isSomeChar, isSomeString
|
|
import std.typetuple; // TypeTuple
|
|
|
|
//debug=utf; // uncomment to turn on debugging printf's
|
|
|
|
debug (utf) import core.stdc.stdio : printf;
|
|
|
|
version(unittest)
|
|
{
|
|
import core.exception;
|
|
import std.string;
|
|
}
|
|
|
|
|
|
/++
|
|
Exception thrown on errors in std.utf functions.
|
|
+/
|
|
class UTFException : Exception
|
|
{
|
|
uint[4] sequence;
|
|
size_t len;
|
|
|
|
|
|
UTFException setSequence(uint[] data...) @safe pure nothrow
|
|
{
|
|
import std.algorithm;
|
|
|
|
assert(data.length <= 4);
|
|
|
|
len = min(data.length, 4);
|
|
sequence[0 .. len] = data[0 .. len];
|
|
|
|
return this;
|
|
}
|
|
|
|
|
|
this(string msg, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
|
|
{
|
|
super(msg, file, line, next);
|
|
}
|
|
|
|
|
|
this(string msg, size_t index, string file = __FILE__, size_t line = __LINE__, Throwable next = null)
|
|
{
|
|
import std.string;
|
|
super(msg ~ format(" (at index %s)", index), file, line, next);
|
|
}
|
|
|
|
|
|
override string toString()
|
|
{
|
|
import std.string;
|
|
if(len == 0)
|
|
return super.toString();
|
|
|
|
string result = "Invalid UTF sequence:";
|
|
|
|
foreach(i; sequence[0 .. len])
|
|
result ~= format(" %02x", i);
|
|
|
|
if(super.msg.length > 0)
|
|
{
|
|
result ~= " - ";
|
|
result ~= super.msg;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
}
|
|
|
|
|
|
/++
|
|
$(RED Scheduled for deprecation in December 2012.
|
|
Please use $(LREF UTFException) instead.)
|
|
+/
|
|
alias UTFException UtfException;
|
|
|
|
|
|
/++
|
|
Returns whether $(D c) is a valid UTF-32 character.
|
|
|
|
$(D '\uFFFE') and $(D '\uFFFF') are considered valid by $(D isValidDchar),
|
|
as they are permitted for internal use by an application, but they are
|
|
not allowed for interchange by the Unicode standard.
|
|
+/
|
|
@safe
|
|
pure nothrow bool isValidDchar(dchar c)
|
|
{
|
|
/* Note: FFFE and FFFF are specifically permitted by the
|
|
* Unicode standard for application internal use, but are not
|
|
* allowed for interchange.
|
|
* (thanks to Arcane Jill)
|
|
*/
|
|
|
|
return c < 0xD800 ||
|
|
(c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(utf) printf("utf.isValidDchar.unittest\n");
|
|
assert(isValidDchar(cast(dchar)'a') == true);
|
|
assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
|
|
|
|
assert(!isValidDchar(cast(dchar)0x00D800));
|
|
assert(!isValidDchar(cast(dchar)0x00DBFF));
|
|
assert(!isValidDchar(cast(dchar)0x00DC00));
|
|
assert(!isValidDchar(cast(dchar)0x00DFFF));
|
|
assert(isValidDchar(cast(dchar)0x00FFFE));
|
|
assert(isValidDchar(cast(dchar)0x00FFFF));
|
|
assert(isValidDchar(cast(dchar)0x01FFFF));
|
|
assert(isValidDchar(cast(dchar)0x10FFFF));
|
|
assert(!isValidDchar(cast(dchar)0x110000));
|
|
}
|
|
|
|
|
|
/++
|
|
$(D stride) returns the length of the UTF-8 sequence starting at $(D index)
|
|
in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-8 sequence.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str[index]) is not the start of a valid UTF-8
|
|
sequence.
|
|
+/
|
|
uint stride(S)(in S str, size_t index) @safe pure
|
|
if (is(S : const(char[])))
|
|
{
|
|
immutable c = str[index];
|
|
if (c < 0x80)
|
|
return 1;
|
|
else
|
|
return strideImpl(c, index);
|
|
}
|
|
|
|
private uint strideImpl(char c, size_t index) @trusted pure
|
|
in { assert(c & 0x80); }
|
|
body
|
|
{
|
|
static if (__traits(compiles, {import core.bitop; bsr(1);}))
|
|
{
|
|
import core.bitop;
|
|
immutable msbs = 7 - bsr(~c);
|
|
if (msbs >= 2 && msbs <= 6) return msbs;
|
|
}
|
|
else
|
|
{
|
|
if (!(c & 0x40)) goto Lerr;
|
|
if (!(c & 0x20)) return 2;
|
|
if (!(c & 0x10)) return 3;
|
|
if (!(c & 0x08)) return 4;
|
|
if (!(c & 0x04)) return 5;
|
|
if (!(c & 0x02)) return 6;
|
|
}
|
|
Lerr:
|
|
throw new UTFException("Invalid UTF-8 sequence", index);
|
|
}
|
|
|
|
@trusted unittest
|
|
{
|
|
static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__)
|
|
{
|
|
enforce(stride(s, i) == codeLength!char(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'a');
|
|
test("hello\U00010143\u0100\U00010143", 'h', 0);
|
|
test("hello\U00010143\u0100\U00010143", 'e', 1);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 2);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 3);
|
|
test("hello\U00010143\u0100\U00010143", 'o', 4);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
|
|
test("hello\U00010143\u0100\U00010143", '\u0100', 9);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 11);
|
|
}
|
|
|
|
|
|
/++
|
|
$(D strideBack) returns the length of the UTF-8 sequence ending one code
|
|
unit before $(D index) in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-8 sequence.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str[index]) is not one past the end of a valid
|
|
UTF-8 sequence.
|
|
+/
|
|
uint strideBack(in char[] str, size_t index) @safe pure
|
|
{
|
|
if (index >= 1 && (str[index-1] & 0b1100_0000) != 0b1000_0000)
|
|
return 1;
|
|
else if (index >= 2 && (str[index-2] & 0b1100_0000) != 0b1000_0000)
|
|
return 2;
|
|
else if (index >= 3 && (str[index-3] & 0b1100_0000) != 0b1000_0000)
|
|
return 3;
|
|
else if (index >= 4 && (str[index-4] & 0b1100_0000) != 0b1000_0000)
|
|
return 4;
|
|
else
|
|
throw new UTFException("Not the end of the UTF sequence", index);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
|
|
{
|
|
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'f');
|
|
test("\U00010143\u0100\U00010143hello", 'o', 15);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 14);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 13);
|
|
test("\U00010143\u0100\U00010143hello", 'e', 12);
|
|
test("\U00010143\u0100\U00010143hello", 'h', 11);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 10);
|
|
test("\U00010143\u0100\U00010143hello", '\u0100', 6);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 4);
|
|
}
|
|
|
|
|
|
/++
|
|
$(D stride) returns the length of the UTF-16 sequence starting at $(D index)
|
|
in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-16 sequence.
|
|
+/
|
|
uint stride(S)(in S str, size_t index) @safe pure nothrow
|
|
if (is(S : const(wchar[])))
|
|
{
|
|
immutable uint u = str[index];
|
|
return 1 + (u >= 0xD800 && u <= 0xDBFF);
|
|
}
|
|
|
|
@trusted unittest
|
|
{
|
|
static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__)
|
|
{
|
|
enforce(stride(s, i) == codeLength!wchar(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'a');
|
|
test("hello\U00010143\u0100\U00010143", 'h', 0);
|
|
test("hello\U00010143\u0100\U00010143", 'e', 1);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 2);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 3);
|
|
test("hello\U00010143\u0100\U00010143", 'o', 4);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
|
|
test("hello\U00010143\u0100\U00010143", '\u0100', 7);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 8);
|
|
}
|
|
|
|
|
|
/++
|
|
$(D strideBack) returns the length of the UTF-16 sequence ending one code
|
|
unit before $(D index) in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-16 sequence.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str[index]) is not one past the end of a valid
|
|
UTF-16 sequence.
|
|
+/
|
|
uint strideBack(in wchar[] str, size_t index) @safe pure
|
|
{
|
|
enforce(index != 0 && (str[index-1] < 0xD800 || str[index-1] > 0xDBFF),
|
|
new UTFException("Not the end of the UTF-16 sequence", index));
|
|
if (index <= 1)
|
|
return 1;
|
|
immutable c = str[index - 2];
|
|
return 1 + (c >= 0xD800 && c <= 0xDBFF);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
|
|
{
|
|
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'f');
|
|
test("\U00010143\u0100\U00010143hello", 'o', 10);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 9);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 8);
|
|
test("\U00010143\u0100\U00010143hello", 'e', 7);
|
|
test("\U00010143\u0100\U00010143hello", 'h', 6);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 5);
|
|
test("\U00010143\u0100\U00010143hello", '\u0100', 3);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 2);
|
|
}
|
|
|
|
|
|
/++
|
|
$(D stride) returns the length of the UTF-32 sequence starting at $(D index)
|
|
in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-32 sequence (always $(D 1)).
|
|
+/
|
|
uint stride(S)(in S str, size_t index) @safe pure nothrow
|
|
if (is(S : const(dchar[])))
|
|
{
|
|
assert(index < str.length);
|
|
return 1;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__)
|
|
{
|
|
enforce(stride(s, i) == codeLength!dchar(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'a');
|
|
test("hello\U00010143\u0100\U00010143", 'h', 0);
|
|
test("hello\U00010143\u0100\U00010143", 'e', 1);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 2);
|
|
test("hello\U00010143\u0100\U00010143", 'l', 3);
|
|
test("hello\U00010143\u0100\U00010143", 'o', 4);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 5);
|
|
test("hello\U00010143\u0100\U00010143", '\u0100', 6);
|
|
test("hello\U00010143\u0100\U00010143", '\U00010143', 7);
|
|
}
|
|
|
|
|
|
/++
|
|
$(D strideBack) returns the length of the UTF-32 sequence ending one code
|
|
unit before $(D index) in $(D str).
|
|
|
|
Returns:
|
|
The number of bytes in the UTF-32 sequence (always $(D 1)).
|
|
+/
|
|
uint strideBack(in dchar[] str, size_t index) @safe pure nothrow
|
|
{
|
|
assert(index <= str.length);
|
|
return 1;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__)
|
|
{
|
|
enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c),
|
|
new AssertError(format("Unit test failure: %s", s), __FILE__, line));
|
|
}
|
|
|
|
test("a", 'a');
|
|
test(" ", ' ');
|
|
test("\u2029", '\u2029'); //paraSep
|
|
test("\u0100", '\u0100');
|
|
test("\u0430", '\u0430');
|
|
test("\U00010143", '\U00010143');
|
|
test("abcdefcdef", 'f');
|
|
test("\U00010143\u0100\U00010143hello", 'o', 8);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 7);
|
|
test("\U00010143\u0100\U00010143hello", 'l', 6);
|
|
test("\U00010143\u0100\U00010143hello", 'e', 5);
|
|
test("\U00010143\u0100\U00010143hello", 'h', 4);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 3);
|
|
test("\U00010143\u0100\U00010143hello", '\u0100', 2);
|
|
test("\U00010143\u0100\U00010143hello", '\U00010143', 1);
|
|
}
|
|
|
|
|
|
/++
|
|
Given $(D index) into $(D str) and assuming that $(D index) is at the start
|
|
of a UTF sequence, $(D toUCSindex) determines the number of UCS characters
|
|
up to $(D index). So, $(D index) is the index of a code unit at the
|
|
beginning of a code point, and the return value is how many code points into
|
|
the string that that code point is.
|
|
|
|
Examples:
|
|
--------------------
|
|
assert(toUCSindex(`hello world`, 7) == 7);
|
|
assert(toUCSindex(`hello world`w, 7) == 7);
|
|
assert(toUCSindex(`hello world`d, 7) == 7);
|
|
|
|
assert(toUCSindex(`Ma Chérie`, 7) == 6);
|
|
assert(toUCSindex(`Ma Chérie`w, 7) == 7);
|
|
assert(toUCSindex(`Ma Chérie`d, 7) == 7);
|
|
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
|
|
--------------------
|
|
+/
|
|
size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure
|
|
if(isSomeChar!C)
|
|
{
|
|
static if(is(Unqual!C == dchar))
|
|
return index;
|
|
else
|
|
{
|
|
size_t n = 0;
|
|
size_t j = 0;
|
|
|
|
for(; j < index; ++n)
|
|
j += stride(str, j);
|
|
|
|
if(j > index)
|
|
{
|
|
static if(is(Unqual!C == char))
|
|
throw new UTFException("Invalid UTF-8 sequence", index);
|
|
else
|
|
throw new UTFException("Invalid UTF-16 sequence", index);
|
|
}
|
|
|
|
return n;
|
|
}
|
|
}
|
|
|
|
unittest
|
|
{
|
|
assert(toUCSindex(`hello world`, 7) == 7);
|
|
assert(toUCSindex(`hello world`w, 7) == 7);
|
|
assert(toUCSindex(`hello world`d, 7) == 7);
|
|
|
|
assert(toUCSindex(`Ma Chérie`, 7) == 6);
|
|
assert(toUCSindex(`Ma Chérie`w, 7) == 7);
|
|
assert(toUCSindex(`Ma Chérie`d, 7) == 7);
|
|
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3);
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
|
|
assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
|
|
}
|
|
|
|
|
|
/++
|
|
Given a UCS index $(D n) into $(D str), returns the UTF index.
|
|
So, $(D n) is how many code points into the string the code point is, and
|
|
the array index of the code unit is returned.
|
|
|
|
Examples:
|
|
--------------------
|
|
assert(toUTFindex(`hello world`, 7) == 7);
|
|
assert(toUTFindex(`hello world`w, 7) == 7);
|
|
assert(toUTFindex(`hello world`d, 7) == 7);
|
|
|
|
assert(toUTFindex(`Ma Chérie`, 6) == 7);
|
|
assert(toUTFindex(`Ma Chérie`w, 7) == 7);
|
|
assert(toUTFindex(`Ma Chérie`d, 7) == 7);
|
|
|
|
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9);
|
|
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9);
|
|
assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9);
|
|
--------------------
|
|
+/
|
|
size_t toUTFindex(in char[] str, size_t n) @safe pure
|
|
{
|
|
size_t i;
|
|
while (n--)
|
|
i += stride(str, i);
|
|
return i;
|
|
}
|
|
|
|
/// ditto
|
|
size_t toUTFindex(in wchar[] str, size_t n) @safe pure nothrow
|
|
{
|
|
size_t i;
|
|
|
|
while (n--)
|
|
{
|
|
wchar u = str[i];
|
|
|
|
i += 1 + (u >= 0xD800 && u <= 0xDBFF);
|
|
}
|
|
|
|
return i;
|
|
}
|
|
|
|
/// ditto
|
|
size_t toUTFindex(in dchar[] str, size_t n) @safe pure nothrow
|
|
{
|
|
return n;
|
|
}
|
|
|
|
|
|
/* =================== Decode ======================= */
|
|
|
|
/++
|
|
Decodes and returns the character starting at $(D str[index]). $(D index)
|
|
is advanced to one past the decoded character. If the character is not
|
|
well-formed, then a $(D UTFException) is thrown and $(D index) remains
|
|
unchanged.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str[index]) is not the start of a valid UTF
|
|
sequence.
|
|
+/
|
|
dchar decode(S)(in S str, ref size_t index) @trusted pure
|
|
if(is(S : const(char[])))
|
|
in
|
|
{
|
|
assert(index < str.length, "Attempted to decode past the end of a string");
|
|
}
|
|
out (result)
|
|
{
|
|
assert(isValidDchar(result));
|
|
}
|
|
body
|
|
{
|
|
if (str[index] < 0x80)
|
|
return str[index++];
|
|
else
|
|
return decodeImpl(str.ptr + index, str.length - index, index);
|
|
}
|
|
|
|
/*
|
|
* This function does it's own bounds checking to give a more useful
|
|
* error message when attempting to decode past the end of a string.
|
|
* Subsequently it uses a pointer instead of an array to avoid
|
|
* redundant bounds checking.
|
|
*/
|
|
private dchar decodeImpl(const(char)* pstr, size_t length, ref size_t index) @trusted pure
|
|
in
|
|
{
|
|
assert(pstr[0] & 0x80);
|
|
}
|
|
body
|
|
{
|
|
/* The following encodings are valid, except for the 5 and 6 byte
|
|
* combinations:
|
|
* 0xxxxxxx
|
|
* 110xxxxx 10xxxxxx
|
|
* 1110xxxx 10xxxxxx 10xxxxxx
|
|
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
*/
|
|
|
|
/* Dchar bitmask for different numbers of UTF-8 code units.
|
|
*/
|
|
enum bitMask = [(1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1];
|
|
|
|
ubyte fst = pstr[0], tmp=void;
|
|
dchar d = fst; // upper control bits are masked out later
|
|
fst <<= 1;
|
|
|
|
foreach(i; TypeTuple!(1, 2, 3))
|
|
{
|
|
if (i == length)
|
|
goto Ebounds;
|
|
|
|
tmp = pstr[i];
|
|
|
|
if ((tmp & 0xC0) != 0x80)
|
|
goto Eutf;
|
|
|
|
d = (d << 6) | (tmp & 0x3F);
|
|
fst <<= 1;
|
|
|
|
if (!(fst & 0x80)) // no more bytes
|
|
{
|
|
d &= bitMask[i]; // mask out control bits
|
|
|
|
// overlong, could have been encoded with i bytes
|
|
if ((d & ~bitMask[i - 1]) == 0)
|
|
goto Eutf;
|
|
|
|
// check for surrogates only needed for 3 bytes
|
|
static if (i == 2)
|
|
{
|
|
if (!isValidDchar(d))
|
|
goto Eutf;
|
|
}
|
|
|
|
index += i + 1;
|
|
return d;
|
|
}
|
|
}
|
|
|
|
static UTFException exception(in char[] str, string msg)
|
|
{
|
|
uint[4] sequence = void;
|
|
size_t i;
|
|
do
|
|
{
|
|
sequence[i] = str[i];
|
|
} while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);
|
|
|
|
return (new UTFException(msg, i)).setSequence(sequence[0 .. i]);
|
|
}
|
|
|
|
Eutf:
|
|
throw exception(pstr[0 .. length], "Invalid UTF-8 sequence");
|
|
Ebounds:
|
|
throw exception(pstr[0 .. length], "Attempted to decode past the end of a string");
|
|
}
|
|
|
|
unittest
|
|
{
|
|
size_t i;
|
|
dchar c;
|
|
|
|
debug(utf) printf("utf.decode.unittest\n");
|
|
|
|
static string s1 = "abcd";
|
|
i = 0;
|
|
c = decode(s1, i);
|
|
assert(c == cast(dchar)'a');
|
|
assert(i == 1);
|
|
c = decode(s1, i);
|
|
assert(c == cast(dchar)'b');
|
|
assert(i == 2);
|
|
|
|
static string s2 = "\xC2\xA9";
|
|
i = 0;
|
|
c = decode(s2, i);
|
|
assert(c == cast(dchar)'\u00A9');
|
|
assert(i == 2);
|
|
|
|
static string s3 = "\xE2\x89\xA0";
|
|
i = 0;
|
|
c = decode(s3, i);
|
|
assert(c == cast(dchar)'\u2260');
|
|
assert(i == 3);
|
|
|
|
static string[] s4 = [
|
|
"\xE2\x89", // too short
|
|
"\xC0\x8A",
|
|
"\xE0\x80\x8A",
|
|
"\xF0\x80\x80\x8A",
|
|
"\xF8\x80\x80\x80\x8A",
|
|
"\xFC\x80\x80\x80\x80\x8A",
|
|
];
|
|
|
|
for (int j = 0; j < s4.length; j++)
|
|
{
|
|
try
|
|
{
|
|
i = 0;
|
|
c = decode(s4[j], i);
|
|
assert(0);
|
|
}
|
|
catch (UTFException u)
|
|
{
|
|
i = 23;
|
|
delete u;
|
|
}
|
|
|
|
assert(i == 23);
|
|
}
|
|
}
|
|
|
|
unittest
|
|
{
|
|
size_t i;
|
|
|
|
i = 0; assert(decode("\xEF\xBF\xBE"c, i) == cast(dchar)0xFFFE);
|
|
i = 0; assert(decode("\xEF\xBF\xBF"c, i) == cast(dchar)0xFFFF);
|
|
i = 0;
|
|
assertThrown!UTFException(decode("\xED\xA0\x80"c, i));
|
|
assertThrown!UTFException(decode("\xED\xAD\xBF"c, i));
|
|
assertThrown!UTFException(decode("\xED\xAE\x80"c, i));
|
|
assertThrown!UTFException(decode("\xED\xAF\xBF"c, i));
|
|
assertThrown!UTFException(decode("\xED\xB0\x80"c, i));
|
|
assertThrown!UTFException(decode("\xED\xBE\x80"c, i));
|
|
assertThrown!UTFException(decode("\xED\xBF\xBF"c, i));
|
|
}
|
|
|
|
/// ditto
|
|
dchar decode(S)(in S str, ref size_t index) @trusted pure
|
|
if(is(S : const(wchar[])))
|
|
in
|
|
{
|
|
assert(index < str.length, "Attempted to decode past the end of a string");
|
|
}
|
|
out (result)
|
|
{
|
|
assert(isValidDchar(result));
|
|
}
|
|
body
|
|
{
|
|
if (str[index] < 0xD800)
|
|
return str[index++];
|
|
else
|
|
return decodeImpl(str.ptr + index, str.length - index, index);
|
|
}
|
|
|
|
/// ditto
|
|
private dchar decodeImpl(const(wchar)* pstr, size_t length, ref size_t index) @trusted pure
|
|
in
|
|
{
|
|
assert(pstr[0] >= 0xD800);
|
|
}
|
|
body
|
|
{
|
|
string msg;
|
|
uint u = pstr[0];
|
|
|
|
if (u >= 0xD800 && u <= 0xDBFF)
|
|
{
|
|
if (length == 1)
|
|
{
|
|
msg = "surrogate UTF-16 high value past end of string";
|
|
goto Lerr;
|
|
}
|
|
immutable uint u2 = pstr[1];
|
|
if (u2 < 0xDC00 || u2 > 0xDFFF)
|
|
{
|
|
msg = "surrogate UTF-16 low value out of range";
|
|
goto Lerr;
|
|
}
|
|
u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
|
|
index += 2;
|
|
}
|
|
else if (u >= 0xDC00 && u <= 0xDFFF)
|
|
{
|
|
msg = "unpaired surrogate UTF-16 value";
|
|
goto Lerr;
|
|
}
|
|
else
|
|
++index;
|
|
// Note: u+FFFE and u+FFFF are specifically permitted by the
|
|
// Unicode standard for application internal use (see isValidDchar)
|
|
|
|
return cast(dchar)u;
|
|
|
|
Lerr:
|
|
throw (new UTFException(msg)).setSequence(pstr[0]);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
size_t i;
|
|
|
|
i = 0; assert(decode([ cast(wchar)0xFFFE ], i) == cast(dchar)0xFFFE && i == 1);
|
|
i = 0; assert(decode([ cast(wchar)0xFFFF ], i) == cast(dchar)0xFFFF && i == 1);
|
|
}
|
|
|
|
|
|
/// ditto
|
|
dchar decode(S)(in S str, ref size_t index) @safe pure
|
|
if(is(S : const(dchar[])))
|
|
in
|
|
{
|
|
assert(index < str.length, "Attempted to decode past the end of a string");
|
|
}
|
|
body
|
|
{
|
|
if (str[index] < 0xD800)
|
|
return str[index++];
|
|
else
|
|
return decodeImpl(str, index);
|
|
}
|
|
|
|
/// ditto
|
|
private dchar decodeImpl(in dchar[] str, ref size_t index) @safe pure
|
|
{
|
|
if (!isValidDchar(str[index]))
|
|
throw (new UTFException("Invalid UTF-32 value")).setSequence(str[index]);
|
|
return str[index++];
|
|
}
|
|
|
|
/* =================== Encode ======================= */
|
|
|
|
/++
|
|
Encodes $(D c) into the static array, $(D buf), and returns the actual
|
|
length of the encoded character (a number between $(D 1) and $(D 4) for
|
|
$(D char[4]) buffers and a number between $(D 1) and $(D 2) for
|
|
$(D wchar[2]) buffers.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D c) is not a valid UTF code point.
|
|
+/
|
|
size_t encode(ref char[4] buf, dchar c) @safe pure
|
|
{
|
|
if (c <= 0x7F)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)c;
|
|
return 1;
|
|
}
|
|
if (c <= 0x7FF)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xC0 | (c >> 6));
|
|
buf[1] = cast(char)(0x80 | (c & 0x3F));
|
|
return 2;
|
|
}
|
|
if (c <= 0xFFFF)
|
|
{
|
|
if (0xD800 <= c && c <= 0xDFFF)
|
|
throw (new UTFException("Encoding a surrogate code point in UTF-8")).setSequence(c);
|
|
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xE0 | (c >> 12));
|
|
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | (c & 0x3F));
|
|
return 3;
|
|
}
|
|
if (c <= 0x10FFFF)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xF0 | (c >> 18));
|
|
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[3] = cast(char)(0x80 | (c & 0x3F));
|
|
return 4;
|
|
}
|
|
|
|
assert(!isValidDchar(c));
|
|
throw (new UTFException("Encoding an invalid code point in UTF-8")).setSequence(c);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
char[4] buf;
|
|
|
|
assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
|
|
assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
|
|
assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
|
|
assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
|
|
assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
|
|
assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
|
|
assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
|
|
assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
|
|
assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
|
|
assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
|
|
assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");
|
|
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
|
|
}
|
|
|
|
|
|
/// Ditto
|
|
size_t encode(ref wchar[2] buf, dchar c) @safe pure
|
|
{
|
|
if (c <= 0xFFFF)
|
|
{
|
|
if (0xD800 <= c && c <= 0xDFFF)
|
|
throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
|
|
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(wchar)c;
|
|
return 1;
|
|
}
|
|
if (c <= 0x10FFFF)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
|
|
buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
|
|
return 2;
|
|
}
|
|
|
|
assert(!isValidDchar(c));
|
|
throw (new UTFException("Encoding an invalid code point in UTF-16")).setSequence(c);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
wchar[2] buf;
|
|
|
|
assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
|
|
assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
|
|
assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
|
|
assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
|
|
assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
|
|
assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
|
|
assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");
|
|
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
|
|
}
|
|
|
|
|
|
/++
|
|
Encodes $(D c) in $(D str)'s encoding and appends it to $(D str).
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D c) is not a valid UTF code point.
|
|
+/
|
|
void encode(ref char[] str, dchar c) @safe pure
|
|
{
|
|
char[] r = str;
|
|
|
|
if (c <= 0x7F)
|
|
{
|
|
assert(isValidDchar(c));
|
|
r ~= cast(char)c;
|
|
}
|
|
else
|
|
{
|
|
char[4] buf;
|
|
uint L;
|
|
|
|
if (c <= 0x7FF)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xC0 | (c >> 6));
|
|
buf[1] = cast(char)(0x80 | (c & 0x3F));
|
|
L = 2;
|
|
}
|
|
else if (c <= 0xFFFF)
|
|
{
|
|
if (0xD800 <= c && c <= 0xDFFF)
|
|
throw (new UTFException("Encoding a surrogate code point in UTF-8")).setSequence(c);
|
|
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xE0 | (c >> 12));
|
|
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | (c & 0x3F));
|
|
L = 3;
|
|
}
|
|
else if (c <= 0x10FFFF)
|
|
{
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(char)(0xF0 | (c >> 18));
|
|
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[3] = cast(char)(0x80 | (c & 0x3F));
|
|
L = 4;
|
|
}
|
|
else
|
|
{
|
|
assert(!isValidDchar(c));
|
|
throw (new UTFException("Encoding an invalid code point in UTF-8")).setSequence(c);
|
|
}
|
|
r ~= buf[0 .. L];
|
|
}
|
|
str = r;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
debug(utf) printf("utf.encode.unittest\n");
|
|
|
|
char[] s = "abcd".dup;
|
|
encode(s, cast(dchar)'a');
|
|
assert(s.length == 5);
|
|
assert(s == "abcda");
|
|
|
|
encode(s, cast(dchar)'\u00A9');
|
|
assert(s.length == 7);
|
|
assert(s == "abcda\xC2\xA9");
|
|
//assert(s == "abcda\u00A9"); // BUG: fix compiler
|
|
|
|
encode(s, cast(dchar)'\u2260');
|
|
assert(s.length == 10);
|
|
assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
|
|
}
|
|
|
|
unittest
|
|
{
|
|
char[] buf;
|
|
|
|
encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
|
|
encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
|
|
encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
|
|
encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
|
|
encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
|
|
encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
|
|
encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
|
|
encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
|
|
encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
|
|
encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
|
|
encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");
|
|
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
|
|
}
|
|
|
|
/// ditto
|
|
void encode(ref wchar[] str, dchar c) @safe pure
|
|
{
|
|
wchar[] r = str;
|
|
|
|
if (c <= 0xFFFF)
|
|
{
|
|
if (0xD800 <= c && c <= 0xDFFF)
|
|
throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
|
|
|
|
assert(isValidDchar(c));
|
|
r ~= cast(wchar)c;
|
|
}
|
|
else if (c <= 0x10FFFF)
|
|
{
|
|
wchar[2] buf;
|
|
|
|
assert(isValidDchar(c));
|
|
buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
|
|
buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
|
|
r ~= buf;
|
|
}
|
|
else
|
|
{
|
|
assert(!isValidDchar(c));
|
|
throw (new UTFException("Encoding an invalid code point in UTF-16")).setSequence(c);
|
|
}
|
|
|
|
str = r;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
wchar[] buf;
|
|
|
|
encode(buf, '\u0000'); assert(buf[0] == '\u0000');
|
|
encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
|
|
encode(buf, '\uE000'); assert(buf[2] == '\uE000');
|
|
encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
|
|
encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
|
|
encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
|
|
encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");
|
|
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
|
|
}
|
|
|
|
/// ditto
|
|
void encode(ref dchar[] str, dchar c) @safe pure
|
|
{
|
|
if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c)
|
|
throw (new UTFException("Encoding an invalid code point in UTF-32")).setSequence(c);
|
|
|
|
assert(isValidDchar(c));
|
|
str ~= c;
|
|
}
|
|
|
|
unittest
|
|
{
|
|
dchar[] buf;
|
|
|
|
encode(buf, '\u0000'); assert(buf[0] == '\u0000');
|
|
encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
|
|
encode(buf, '\uE000'); assert(buf[2] == '\uE000');
|
|
encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
|
|
encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
|
|
encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');
|
|
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xD800));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDBFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDC00));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0xDFFF));
|
|
assertThrown!UTFException(encode(buf, cast(dchar)0x110000));
|
|
}
|
|
|
|
|
|
/++
|
|
Returns the number of code units that are required to encode the code point
|
|
$(D c) when $(D C) is the character type used to encode it.
|
|
|
|
Examples:
|
|
------
|
|
assert(codeLength!char('a') == 1);
|
|
assert(codeLength!wchar('a') == 1);
|
|
assert(codeLength!dchar('a') == 1);
|
|
|
|
assert(codeLength!char('\U0010FFFF') == 4);
|
|
assert(codeLength!wchar('\U0010FFFF') == 2);
|
|
assert(codeLength!dchar('\U0010FFFF') == 1);
|
|
------
|
|
+/
|
|
ubyte codeLength(C)(dchar c) @safe pure nothrow
|
|
{
|
|
static if (C.sizeof == 1)
|
|
{
|
|
return
|
|
c <= 0x7F ? 1
|
|
: c <= 0x7FF ? 2
|
|
: c <= 0xFFFF ? 3
|
|
: c <= 0x10FFFF ? 4
|
|
: (assert(false), 6);
|
|
}
|
|
else static if (C.sizeof == 2)
|
|
{
|
|
return c <= 0xFFFF ? 1 : 2;
|
|
}
|
|
else
|
|
{
|
|
static assert(C.sizeof == 4);
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
//Verify Examples.
|
|
unittest
|
|
{
|
|
assert(codeLength!char('a') == 1);
|
|
assert(codeLength!wchar('a') == 1);
|
|
assert(codeLength!dchar('a') == 1);
|
|
|
|
assert(codeLength!char('\U0010FFFF') == 4);
|
|
assert(codeLength!wchar('\U0010FFFF') == 2);
|
|
assert(codeLength!dchar('\U0010FFFF') == 1);
|
|
}
|
|
|
|
|
|
/* =================== Validation ======================= */
|
|
|
|
/++
|
|
Checks to see if $(D str) is well-formed unicode or not.
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str) is not well-formed.
|
|
+/
|
|
void validate(S)(in S str) @safe pure
|
|
if(isSomeString!S)
|
|
{
|
|
immutable len = str.length;
|
|
for (size_t i = 0; i < len; )
|
|
{
|
|
decode(str, i);
|
|
}
|
|
}
|
|
|
|
|
|
/* =================== Conversion to UTF8 ======================= */
|
|
|
|
pure
|
|
{
|
|
|
|
char[] toUTF8(out char[4] buf, dchar c) nothrow @safe
|
|
in
|
|
{
|
|
assert(isValidDchar(c));
|
|
}
|
|
body
|
|
{
|
|
if (c <= 0x7F)
|
|
{
|
|
buf[0] = cast(char)c;
|
|
return buf[0 .. 1];
|
|
}
|
|
else if (c <= 0x7FF)
|
|
{
|
|
buf[0] = cast(char)(0xC0 | (c >> 6));
|
|
buf[1] = cast(char)(0x80 | (c & 0x3F));
|
|
return buf[0 .. 2];
|
|
}
|
|
else if (c <= 0xFFFF)
|
|
{
|
|
buf[0] = cast(char)(0xE0 | (c >> 12));
|
|
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | (c & 0x3F));
|
|
return buf[0 .. 3];
|
|
}
|
|
else if (c <= 0x10FFFF)
|
|
{
|
|
buf[0] = cast(char)(0xF0 | (c >> 18));
|
|
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
|
|
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
buf[3] = cast(char)(0x80 | (c & 0x3F));
|
|
return buf[0 .. 4];
|
|
}
|
|
|
|
assert(0);
|
|
}
|
|
|
|
|
|
/*******************
|
|
* Encodes string $(D_PARAM s) into UTF-8 and returns the encoded string.
|
|
*/
|
|
string toUTF8(in char[] s) @safe
|
|
{
|
|
validate(s);
|
|
return s.idup;
|
|
}
|
|
|
|
/// ditto
|
|
string toUTF8(in wchar[] s) @trusted
|
|
{
|
|
char[] r;
|
|
size_t i;
|
|
size_t slen = s.length;
|
|
|
|
r.length = slen;
|
|
for (i = 0; i < slen; i++)
|
|
{
|
|
wchar c = s[i];
|
|
|
|
if (c <= 0x7F)
|
|
r[i] = cast(char)c; // fast path for ascii
|
|
else
|
|
{
|
|
r.length = i;
|
|
while (i < slen)
|
|
encode(r, decode(s, i));
|
|
break;
|
|
}
|
|
}
|
|
|
|
return r.assumeUnique();
|
|
}
|
|
|
|
/// ditto
|
|
string toUTF8(in dchar[] s) @trusted
|
|
{
|
|
char[] r;
|
|
size_t i;
|
|
size_t slen = s.length;
|
|
|
|
r.length = slen;
|
|
for (i = 0; i < slen; i++)
|
|
{
|
|
dchar c = s[i];
|
|
|
|
if (c <= 0x7F)
|
|
r[i] = cast(char)c; // fast path for ascii
|
|
else
|
|
{
|
|
r.length = i;
|
|
foreach (dchar d; s[i .. slen])
|
|
{
|
|
encode(r, d);
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
return r.assumeUnique();
|
|
}
|
|
|
|
|
|
/* =================== Conversion to UTF16 ======================= */
|
|
|
|
wchar[] toUTF16(ref wchar[2] buf, dchar c) nothrow @safe
|
|
in
|
|
{
|
|
assert(isValidDchar(c));
|
|
}
|
|
body
|
|
{
|
|
if (c <= 0xFFFF)
|
|
{
|
|
buf[0] = cast(wchar)c;
|
|
return buf[0 .. 1];
|
|
}
|
|
else
|
|
{
|
|
buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
|
|
buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
|
|
return buf[0 .. 2];
|
|
}
|
|
}
|
|
|
|
/****************
|
|
* Encodes string $(D s) into UTF-16 and returns the encoded string.
|
|
*/
|
|
wstring toUTF16(in char[] s) @trusted
|
|
{
|
|
wchar[] r;
|
|
size_t slen = s.length;
|
|
|
|
r.length = slen;
|
|
r.length = 0;
|
|
for (size_t i = 0; i < slen; )
|
|
{
|
|
dchar c = s[i];
|
|
if (c <= 0x7F)
|
|
{
|
|
i++;
|
|
r ~= cast(wchar)c;
|
|
}
|
|
else
|
|
{
|
|
c = decode(s, i);
|
|
encode(r, c);
|
|
}
|
|
}
|
|
|
|
return r.assumeUnique(); // ok because r is unique
|
|
}
|
|
|
|
/// ditto
|
|
wstring toUTF16(in wchar[] s) @safe
|
|
{
|
|
validate(s);
|
|
return s.idup;
|
|
}
|
|
|
|
/// ditto
|
|
pure wstring toUTF16(in dchar[] s) @trusted
|
|
{
|
|
wchar[] r;
|
|
size_t slen = s.length;
|
|
|
|
r.length = slen;
|
|
r.length = 0;
|
|
for (size_t i = 0; i < slen; i++)
|
|
{
|
|
encode(r, s[i]);
|
|
}
|
|
|
|
return r.assumeUnique(); // ok because r is unique
|
|
}
|
|
|
|
|
|
/* =================== Conversion to UTF32 ======================= */
|
|
|
|
/*****
|
|
* Encodes string $(D_PARAM s) into UTF-32 and returns the encoded string.
|
|
*/
|
|
dstring toUTF32(in char[] s) @trusted
|
|
{
|
|
dchar[] r;
|
|
size_t slen = s.length;
|
|
size_t j = 0;
|
|
|
|
r.length = slen; // r[] will never be longer than s[]
|
|
for (size_t i = 0; i < slen; )
|
|
{
|
|
dchar c = s[i];
|
|
if (c >= 0x80)
|
|
c = decode(s, i);
|
|
else
|
|
i++; // c is ascii, no need for decode
|
|
r[j++] = c;
|
|
}
|
|
|
|
return r[0 .. j].assumeUnique(); // legit because it's unique
|
|
}
|
|
|
|
/// ditto
|
|
dstring toUTF32(in wchar[] s) @trusted
|
|
{
|
|
dchar[] r;
|
|
size_t slen = s.length;
|
|
size_t j = 0;
|
|
|
|
r.length = slen; // r[] will never be longer than s[]
|
|
for (size_t i = 0; i < slen; )
|
|
{
|
|
dchar c = s[i];
|
|
if (c >= 0x80)
|
|
c = decode(s, i);
|
|
else
|
|
i++; // c is ascii, no need for decode
|
|
r[j++] = c;
|
|
}
|
|
|
|
return r[0 .. j].assumeUnique(); // legit because it's unique
|
|
}
|
|
|
|
/// ditto
|
|
dstring toUTF32(in dchar[] s) @safe
|
|
{
|
|
validate(s);
|
|
return s.idup;
|
|
}
|
|
|
|
} // Convert functions are @safe
|
|
|
|
|
|
/* =================== toUTFz ======================= */
|
|
|
|
/++
|
|
Returns a C-style zero-terminated string equivalent to $(D str). $(D str)
|
|
must not contain embedded $(D '\0')'s as any C function will treat the first
|
|
$(D '\0') that it sees a the end of the string. If $(D str.empty) is
|
|
$(D true), then a string containing only $(D '\0') is returned.
|
|
|
|
$(D toUTFz) accepts any type of string and is templated on the type of
|
|
character pointer that you wish to convert to. It will avoid allocating a
|
|
new string if it can, but there's a decent chance that it will end up having
|
|
to allocate a new string - particularly when dealing with character types
|
|
other than $(D char).
|
|
|
|
$(RED Warning 1:) If the result of $(D toUTFz) equals $(D str.ptr), then if
|
|
anything alters the character one past the end of $(D str) (which is the
|
|
$(D '\0') character terminating the string), then the string won't be
|
|
zero-terminated anymore. The most likely scenarios for that are if you
|
|
append to $(D str) and no reallocation takes place or when $(D str) is a
|
|
slice of a larger array, and you alter the character in the larger array
|
|
which is one character past the end of $(D str). Another case where it could
|
|
occur would be if you had a mutable character array immediately after
|
|
$(D str) in memory (for example, if they're member variables in a
|
|
user-defined type with one declared right after the other) and that
|
|
character array happened to start with $(D '\0'). Such scenarios will never
|
|
occur if you immediately use the zero-terminated string after calling
|
|
$(D toUTFz) and the C function using it doesn't keep a reference to it.
|
|
Also, they are unlikely to occur even if you save the zero-terminated string
|
|
(the cases above would be among the few examples of where it could happen).
|
|
However, if you save the zero-terminate string and want to be absolutely
|
|
certain that the string stays zero-terminated, then simply append a
|
|
$(D '\0') to the string and use its $(D ptr) property rather than calling
|
|
$(D toUTFz).
|
|
|
|
$(RED Warning 2:) When passing a character pointer to a C function, and the
|
|
C function keeps it around for any reason, make sure that you keep a
|
|
reference to it in your D code. Otherwise, it may go away during a garbage
|
|
collection cycle and cause a nasty bug when the C code tries to use it.
|
|
|
|
Examples:
|
|
--------------------
|
|
auto p1 = toUTFz!(char*)("hello world");
|
|
auto p2 = toUTFz!(const(char)*)("hello world");
|
|
auto p3 = toUTFz!(immutable(char)*)("hello world");
|
|
auto p4 = toUTFz!(char*)("hello world"d);
|
|
auto p5 = toUTFz!(const(wchar)*)("hello world");
|
|
auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
|
|
--------------------
|
|
+/
|
|
template toUTFz(P)
|
|
{
|
|
P toUTFz(S)(S str) @system
|
|
{
|
|
return toUTFzImpl!(P, S)(str);
|
|
}
|
|
}
|
|
|
|
/++ Ditto +/
|
|
template toUTFz(P, S)
|
|
{
|
|
P toUTFz(S str) @system
|
|
{
|
|
return toUTFzImpl!(P, S)(str);
|
|
}
|
|
}
|
|
|
|
private P toUTFzImpl(P, S)(S str) @system
|
|
if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
|
|
is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
|
|
is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
|
|
//immutable(C)[] -> C*, const(C)*, or immutable(C)*
|
|
{
|
|
if(str.empty)
|
|
{
|
|
typeof(*P.init)[] retval = ['\0'];
|
|
|
|
return retval.ptr;
|
|
}
|
|
|
|
alias Unqual!(ElementEncodingType!S) C;
|
|
|
|
//If the P is mutable, then we have to make a copy.
|
|
static if(is(Unqual!(typeof(*P.init)) == typeof(*P.init)))
|
|
return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
|
|
else
|
|
{
|
|
immutable p = str.ptr + str.length;
|
|
|
|
// Peek past end of str, if it's 0, no conversion necessary.
|
|
// Note that the compiler will put a 0 past the end of static
|
|
// strings, and the storage allocator will put a 0 past the end
|
|
// of newly allocated char[]'s.
|
|
// Is p dereferenceable? A simple test: if the p points to an
|
|
// address multiple of 4, then conservatively assume the pointer
|
|
// might be pointing to a new block of memory, which might be
|
|
// unreadable. Otherwise, it's definitely pointing to valid
|
|
// memory.
|
|
if((cast(size_t)p & 3) && *p == '\0')
|
|
return str.ptr;
|
|
|
|
return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str);
|
|
}
|
|
}
|
|
|
|
private P toUTFzImpl(P, S)(S str) @system
|
|
if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
|
|
is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)) &&
|
|
!is(immutable(Unqual!(ElementEncodingType!S)) == ElementEncodingType!S))
|
|
//C[] or const(C)[] -> C*, const(C)*, or immutable(C)*
|
|
{
|
|
alias ElementEncodingType!S InChar;
|
|
alias typeof(*P.init) OutChar;
|
|
|
|
//const(C)[] -> const(C)* or
|
|
//C[] -> C* or const(C)*
|
|
static if((is(const(Unqual!InChar) == InChar) && is(const(Unqual!OutChar) == OutChar)) ||
|
|
(!is(const(Unqual!InChar) == InChar) && !is(immutable(Unqual!OutChar) == OutChar)))
|
|
{
|
|
auto p = str.ptr + str.length;
|
|
|
|
if((cast(size_t)p & 3) && *p == '\0')
|
|
return str.ptr;
|
|
|
|
str ~= '\0';
|
|
return str.ptr;
|
|
}
|
|
//const(C)[] -> C* or immutable(C)* or
|
|
//C[] -> immutable(C)*
|
|
else
|
|
{
|
|
auto copy = uninitializedArray!(Unqual!OutChar[])(str.length + 1);
|
|
copy[0 .. $ - 1] = str[];
|
|
copy[$ - 1] = '\0';
|
|
|
|
return cast(P)copy.ptr;
|
|
}
|
|
}
|
|
|
|
private P toUTFzImpl(P, S)(S str)
|
|
if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
|
|
!is(Unqual!(typeof(*P.init)) == Unqual!(ElementEncodingType!S)))
|
|
//C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)*
|
|
{
|
|
auto retval = appender!(typeof(*P.init)[])();
|
|
|
|
foreach(dchar c; str)
|
|
retval.put(c);
|
|
retval.put('\0');
|
|
|
|
return cast(P)retval.data.ptr;
|
|
}
|
|
|
|
//Verify Examples.
|
|
unittest
|
|
{
|
|
auto p1 = toUTFz!(char*)("hello world");
|
|
auto p2 = toUTFz!(const(char)*)("hello world");
|
|
auto p3 = toUTFz!(immutable(char)*)("hello world");
|
|
auto p4 = toUTFz!(char*)("hello world"d);
|
|
auto p5 = toUTFz!(const(wchar)*)("hello world");
|
|
auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
import core.exception;
|
|
import std.algorithm;
|
|
import std.metastrings;
|
|
import std.typetuple;
|
|
|
|
size_t zeroLen(C)(const(C)* ptr)
|
|
{
|
|
size_t len = 0;
|
|
|
|
while(*ptr != '\0')
|
|
{
|
|
++ptr;
|
|
++len;
|
|
}
|
|
|
|
return len;
|
|
}
|
|
|
|
foreach(S; TypeTuple!(string, wstring, dstring))
|
|
{
|
|
alias Unqual!(typeof(S.init[0])) C;
|
|
|
|
auto s1 = to!S("hello\U00010143\u0100\U00010143");
|
|
auto temp = new C[](s1.length + 1);
|
|
temp[0 .. $ - 1] = s1[0 .. $];
|
|
temp[$ - 1] = '\n';
|
|
--temp.length;
|
|
auto s2 = assumeUnique(temp);
|
|
assert(s1 == s2);
|
|
|
|
foreach(P; TypeTuple!(C*, const(C)*, immutable(C)*))
|
|
{
|
|
auto p1 = toUTFz!P(s1);
|
|
assert(p1[0 .. s1.length] == s1);
|
|
assert(p1[s1.length] == '\0');
|
|
|
|
auto p2 = toUTFz!P(s2);
|
|
assert(p2[0 .. s2.length] == s2);
|
|
assert(p2[s2.length] == '\0');
|
|
}
|
|
}
|
|
|
|
void test(P, S)(S s, size_t line = __LINE__)
|
|
{
|
|
auto p = toUTFz!P(s);
|
|
immutable len = zeroLen(p);
|
|
enforce(cmp(s, p[0 .. len]) == 0,
|
|
new AssertError(Format!("Unit test failed: %s %s", P.stringof, S.stringof),
|
|
__FILE__, line));
|
|
}
|
|
|
|
foreach(P; TypeTuple!(wchar*, const(wchar)*, immutable(wchar)*,
|
|
dchar*, const(dchar)*, immutable(dchar)*))
|
|
{
|
|
test!P("hello\U00010143\u0100\U00010143");
|
|
}
|
|
|
|
foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
|
|
dchar*, const(dchar)*, immutable(dchar)*))
|
|
{
|
|
test!P("hello\U00010143\u0100\U00010143"w);
|
|
}
|
|
|
|
foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
|
|
wchar*, const(wchar)*, immutable(wchar)*))
|
|
{
|
|
test!P("hello\U00010143\u0100\U00010143"d);
|
|
}
|
|
|
|
foreach(S; TypeTuple!(char[], wchar[], dchar[],
|
|
const(char)[], const(wchar)[], const(dchar)[]))
|
|
{
|
|
auto s = to!S("hello\U00010143\u0100\U00010143");
|
|
|
|
foreach(P; TypeTuple!(char*, wchar*, dchar*,
|
|
const(char)*, const(wchar)*, const(dchar)*,
|
|
immutable(char)*, immutable(wchar)*, immutable(dchar)*))
|
|
{
|
|
test!P(s);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/++
|
|
$(D toUTF16z) is a convenience function for $(D toUTFz!(const(wchar)*)).
|
|
|
|
Encodes string $(D s) into UTF-16 and returns the encoded string.
|
|
$(D toUTF16z) is suitable for calling the 'W' functions in the Win32 API
|
|
that take an $(D LPWSTR) or $(D LPCWSTR) argument.
|
|
+/
|
|
const(wchar)* toUTF16z(C)(const(C)[] str)
|
|
if(isSomeChar!C)
|
|
{
|
|
return toUTFz!(const(wchar)*)(str);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
import std.typetuple;
|
|
|
|
//toUTFz is already thoroughly tested, so this will just verify that
|
|
//toUTF16z compiles properly for the various string types.
|
|
foreach(S; TypeTuple!(string, wstring, dstring))
|
|
static assert(__traits(compiles, toUTF16z(to!S("hello world"))));
|
|
}
|
|
|
|
|
|
/* ================================ tests ================================== */
|
|
|
|
unittest
|
|
{
|
|
debug(utf) printf("utf.toUTF.unittest\n");
|
|
|
|
string c;
|
|
wstring w;
|
|
dstring d;
|
|
|
|
c = "hello";
|
|
w = toUTF16(c);
|
|
assert(w == "hello");
|
|
d = toUTF32(c);
|
|
assert(d == "hello");
|
|
c = toUTF8(w);
|
|
assert(c == "hello");
|
|
d = toUTF32(w);
|
|
assert(d == "hello");
|
|
|
|
c = toUTF8(d);
|
|
assert(c == "hello");
|
|
w = toUTF16(d);
|
|
assert(w == "hello");
|
|
|
|
|
|
c = "hel\u1234o";
|
|
w = toUTF16(c);
|
|
assert(w == "hel\u1234o");
|
|
d = toUTF32(c);
|
|
assert(d == "hel\u1234o");
|
|
|
|
c = toUTF8(w);
|
|
assert(c == "hel\u1234o");
|
|
d = toUTF32(w);
|
|
assert(d == "hel\u1234o");
|
|
|
|
c = toUTF8(d);
|
|
assert(c == "hel\u1234o");
|
|
w = toUTF16(d);
|
|
assert(w == "hel\u1234o");
|
|
|
|
|
|
c = "he\U0010AAAAllo";
|
|
w = toUTF16(c);
|
|
//foreach (wchar c; w) printf("c = x%x\n", c);
|
|
//foreach (wchar c; cast(wstring)"he\U0010AAAAllo") printf("c = x%x\n", c);
|
|
assert(w == "he\U0010AAAAllo");
|
|
d = toUTF32(c);
|
|
assert(d == "he\U0010AAAAllo");
|
|
|
|
c = toUTF8(w);
|
|
assert(c == "he\U0010AAAAllo");
|
|
d = toUTF32(w);
|
|
assert(d == "he\U0010AAAAllo");
|
|
|
|
c = toUTF8(d);
|
|
assert(c == "he\U0010AAAAllo");
|
|
w = toUTF16(d);
|
|
assert(w == "he\U0010AAAAllo");
|
|
}
|
|
|
|
|
|
/++
|
|
Returns the total number of code points encoded in $(D str).
|
|
|
|
Supercedes: This function supercedes $(LREF toUCSindex).
|
|
|
|
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
|
|
|
|
Throws:
|
|
$(D UTFException) if $(D str) is not well-formed.
|
|
+/
|
|
size_t count(C)(const(C)[] str) @trusted pure
|
|
if(isSomeChar!C)
|
|
{
|
|
return walkLength(str);
|
|
}
|
|
|
|
unittest
|
|
{
|
|
assert(count("") == 0);
|
|
assert(count("a") == 1);
|
|
assert(count("abc") == 3);
|
|
assert(count("\u20AC100") == 4);
|
|
}
|
|
|