mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 22:21:09 +03:00
504 lines
8.2 KiB
D
504 lines
8.2 KiB
D
// utf.d
|
|
// Written by Walter Bright
|
|
// Copyright (c) 2003 Digital Mars
|
|
// All Rights Reserved
|
|
// www.digitalmars.com
|
|
|
|
// Description of UTF-8 at:
|
|
// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
|
|
|
|
module std.utf;
|
|
|
|
//debug=utf; // uncomment to turn on debugging printf's
|
|
|
|
/*
 * Error thrown by the decode routines in this module when a string
 * does not contain a well-formed UTF sequence.
 */
class UtfError : Error
{
    uint idx;	// index in string of where error occurred

    /*
     * s = description of what is wrong with the sequence
     * i = index into the offending string at which the error was found
     */
    this(char[] s, uint i)
    {
	idx = i;
	// Forward the caller's message; passing the literal "s" here
	// would throw away every diagnostic the decoders produce.
	super(s);
    }
}
|
|
|
|
|
|
//alias uint dchar;
|
|
|
|
/*
 * Returns true if c may appear in a UTF-32 string as handled by this
 * module: it must not lie in 0xD800..0xDFFF, must not exceed 0x10FFFF,
 * and must not be 0xFFFE or 0xFFFF.
 */
bit isValidDchar(dchar c)
{
    // Everything below the surrogate block is acceptable.
    if (c < 0xD800)
	return true;

    // 0xD800..0xDFFF is reserved for UTF-16 surrogates.
    if (c <= 0xDFFF)
	return false;

    // Nothing beyond 0x10FFFF is accepted.
    if (c > 0x10FFFF)
	return false;

    return c != 0xFFFE && c != 0xFFFF;
}
|
|
|
|
// Exercises isValidDchar on ordinary values and on every boundary of
// the ranges it rejects.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");

    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);

    // edges of the surrogate block
    assert(isValidDchar(cast(dchar)0xD7FF) == true);
    assert(isValidDchar(cast(dchar)0xD800) == false);
    assert(isValidDchar(cast(dchar)0xDFFF) == false);
    assert(isValidDchar(cast(dchar)0xE000) == true);

    // the two rejected values at the top of the 16 bit range
    assert(isValidDchar(cast(dchar)0xFFFD) == true);
    assert(isValidDchar(cast(dchar)0xFFFE) == false);
    assert(isValidDchar(cast(dchar)0xFFFF) == false);
    assert(isValidDchar(cast(dchar)0x10000) == true);

    // upper limit
    assert(isValidDchar(cast(dchar)0x10FFFF) == true);
    assert(isValidDchar(cast(dchar)0x110000) == false);
}
|
|
|
|
/* =================== Decode ======================= */
|
|
|
|
/*
 * Decodes the UTF-8 sequence that starts at s[idx].
 *
 * On success the decoded character is returned and idx is advanced
 * past the bytes that were consumed. On failure a UtfError carrying
 * the starting index is thrown and idx is left unchanged.
 */
dchar decode(char[] s, inout uint idx)
    in
    {
	assert(idx >= 0 && idx < s.length);
    }
    out (result)
    {
	assert(isValidDchar(result));
    }
    body
    {
	uint start = idx;
	uint total = s.length;
	char lead = s[start];
	char second;
	char trail;
	dchar value;
	uint count;		// number of bytes in the sequence
	uint k;

	// A byte with the top bit clear stands for itself.
	if ((lead & 0x80) == 0)
	{
	    idx = start + 1;
	    return cast(dchar) lead;
	}

	/* The lead byte announces the sequence length through its run
	 * of leading 1 bits:
	 *	0xxxxxxx
	 *	110xxxxx 10xxxxxx
	 *	1110xxxx 10xxxxxx 10xxxxxx
	 *	11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	 *	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	 *	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
	 * Only the first four forms are accepted.
	 */
	count = 1;
	while (((lead << count) & 0x80) != 0)
	{
	    count++;
	    if (count > 4)
		goto Lerr;	// 5 and 6 byte forms are rejected
	}
	if (count == 1)
	    goto Lerr;		// 10xxxxxx cannot start a sequence

	if (start + (count - 1) >= total)
	    goto Lerr;		// off end of string

	/* Overlong forms are illegal:
	 *	1100000x (10xxxxxx)
	 *	11100000 100xxxxx (10xxxxxx)
	 *	11110000 1000xxxx (10xxxxxx 10xxxxxx)
	 *	11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
	 *	11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
	 * The last two cannot be reached, since lead bytes 0xF8 and
	 * 0xFC were already rejected by the length scan above.
	 */
	second = s[start + 1];
	if ((lead & 0xFE) == 0xC0)
	    goto Lerr;
	if (lead == 0xE0 && (second & 0xE0) == 0x80)
	    goto Lerr;
	if (lead == 0xF0 && (second & 0xF0) == 0x80)
	    goto Lerr;
	if (lead == 0xF8 && (second & 0xF8) == 0x80)
	    goto Lerr;
	if (lead == 0xFC && (second & 0xFC) == 0x80)
	    goto Lerr;

	// The lead byte supplies its low (7 - count) bits ...
	value = cast(dchar)(lead & ((1 << (7 - count)) - 1));

	// ... and each trailing byte supplies six more.
	for (k = 1; k < count; k++)
	{
	    trail = s[start + k];
	    if ((trail & 0xC0) != 0x80)
		goto Lerr;	// trailing bytes are 10xxxxxx
	    value = (value << 6) | (trail & 0x3F);
	}

	if (!isValidDchar(value))
	    goto Lerr;

	idx = start + count;
	return value;

      Lerr:
	throw new UtfError("invalid UTF-8 sequence", start);
    }
|
|
|
|
// Checks decode(char[]) on 1, 2 and 3 byte sequences and verifies that
// truncated and overlong sequences raise UtfError.
unittest
{
    uint pos;
    dchar ch;

    debug(utf) printf("utf.decode.unittest\n");

    // one byte per character
    static char[] ascii = "abcd";
    pos = 0;
    ch = decode(ascii, pos);
    assert(pos == 1);
    assert(ch == cast(dchar)'a');
    ch = decode(ascii, pos);
    assert(pos == 2);
    assert(ch == cast(dchar)'b');

    // two byte sequence
    static char[] twoBytes = "\xC2\xA9";
    pos = 0;
    ch = decode(twoBytes, pos);
    assert(pos == 2);
    assert(ch == cast(dchar)'\u00A9');

    // three byte sequence
    static char[] threeBytes = "\xE2\x89\xA0";
    pos = 0;
    ch = decode(threeBytes, pos);
    assert(pos == 3);
    assert(ch == cast(dchar)'\u2260');

    // every one of these must be rejected
    static char[][] bad =
    [
	"\xE2\x89",		// too short
	"\xC0\x8A",
	"\xE0\x80\x8A",
	"\xF0\x80\x80\x8A",
	"\xF8\x80\x80\x80\x8A",
	"\xFC\x80\x80\x80\x80\x8A"
    ];

    int n = 0;
    while (n < bad.length)
    {
	try
	{
	    pos = 0;
	    ch = decode(bad[n], pos);
	    assert(0);		// decode should have thrown
	}
	catch (UtfError e)
	{
	    pos = 23;		// marker: the handler ran
	}
	assert(pos == 23);
	n++;
    }
}
|
|
|
|
/********************************************************/
|
|
|
|
/*
 * Decodes the UTF-16 sequence that starts at s[idx].
 *
 * A high surrogate (0xD800..0xDBFF) must be followed by a low
 * surrogate (0xDC00..0xDFFF); the pair is combined into one character
 * and idx advances by 2. Any other accepted code unit is returned as
 * is and idx advances by 1.
 *
 * Throws UtfError, with the starting index, for a high surrogate at
 * the end of the string, a high surrogate not followed by a low one,
 * a lone low surrogate, or the values 0xFFFE and 0xFFFF. idx is left
 * unchanged when an error is thrown.
 */
dchar decode(wchar[] s, inout uint idx)
    in
    {
	assert(idx < s.length);
    }
    out (result)
    {
	assert(isValidDchar(result));
    }
    body
    {
	char[] msg;
	uint i = idx;
	uint u = s[i];

	if (u >= 0xD800 && u <= 0xDBFF)
	{   uint u2;

	    if (i + 1 == s.length)
	    {	msg = "surrogate UTF-16 high value past end of string";
		goto Lerr;
	    }
	    u2 = s[i + 1];
	    if (u2 < 0xDC00 || u2 > 0xDFFF)
	    {	msg = "surrogate UTF-16 low value out of range";
		goto Lerr;
	    }
	    // 0xD7C0 is 0xD800 - (0x10000 >> 10), so this both strips the
	    // surrogate tag and adds the 0x10000 offset of the pair.
	    u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
	    i += 2;
	}
	else if (u >= 0xDC00 && u <= 0xDFFF)
	{   msg = "unpaired surrogate UTF-16 value";
	    goto Lerr;
	}
	else if (u == 0xFFFE || u == 0xFFFF)
	{   msg = "illegal UTF-16 value";
	    goto Lerr;
	}
	else
	{
	    // Ordinary single code unit, ASCII or not. The index must
	    // advance here as well, otherwise callers that loop on
	    // decode never make progress on non-ASCII text.
	    i++;
	}

	idx = i;
	return cast(dchar)u;

      Lerr:
	throw new UtfError(msg, i);
    }
|
|
|
|
/********************************************************/
|
|
|
|
/*
 * Fetches the character at s[idx] from a UTF-32 string.
 *
 * If isValidDchar accepts it, the character is returned and idx is
 * advanced by one. Otherwise a UtfError carrying idx is thrown and
 * idx is left unchanged.
 */
dchar decode(dchar[] s, inout uint idx)
    in
    {
	assert(idx >= 0 && idx < s.length);
    }
    body
    {
	uint at = idx;
	dchar ch = s[at];

	if (isValidDchar(ch))
	{
	    idx = at + 1;
	    return ch;
	}

	throw new UtfError("invalid UTF-32 value", at);
    }
|
|
|
|
|
|
/* =================== Encode ======================= */
|
|
|
|
/*
 * Appends the UTF-8 encoding of c to s.
 *
 * c must satisfy isValidDchar. One byte is appended for c <= 0x7F,
 * two for c <= 0x7FF, three for c <= 0xFFFF and four for
 * c <= 0x10FFFF.
 */
void encode(inout char[] s, dchar c)
    in
    {
	assert(isValidDchar(c));
    }
    body
    {
	char[4] bytes;
	uint count;		// length of the encoded sequence
	uint marker;		// length tag placed in the lead byte
	uint rest;
	uint k;

	// single byte: the character stands for itself
	if (c <= 0x7F)
	{
	    s ~= cast(char) c;
	    return;
	}

	if (c <= 0x7FF)
	{
	    count = 2;
	    marker = 0xC0;
	}
	else if (c <= 0xFFFF)
	{
	    count = 3;
	    marker = 0xE0;
	}
	else if (c <= 0x10FFFF)
	{
	    count = 4;
	    marker = 0xF0;
	}
	else
	{
	    assert(0);
	}

	// Fill the trailing bytes from the end, six bits at a time ...
	rest = c;
	for (k = count - 1; k > 0; k--)
	{
	    bytes[k] = cast(char)(0x80 | (rest & 0x3F));
	    rest >>= 6;
	}
	// ... and put what is left into the lead byte.
	bytes[0] = cast(char)(marker | rest);

	s ~= bytes[0 .. count];
    }
|
|
|
|
// Checks that encode(char[]) appends 1, 2 and 3 byte sequences.
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] text = "abcd";

    // one byte
    encode(text, cast(dchar)'a');
    assert(text.length == 5);
    assert(text == "abcda");

    // two bytes
    encode(text, cast(dchar)'\u00A9');
    assert(text.length == 7);
    assert(text == "abcda\xC2\xA9");
    //assert(text == "abcda\u00A9");	// BUG: fix compiler

    // three bytes
    encode(text, cast(dchar)'\u2260');
    assert(text.length == 10);
    assert(text == "abcda\xC2\xA9\xE2\x89\xA0");
}
|
|
|
|
/********************************************************/
|
|
|
|
/*
 * Appends the UTF-16 encoding of c to s.
 *
 * c must satisfy isValidDchar. Characters up to 0xFFFF are appended
 * as one code unit; larger ones are appended as a high/low surrogate
 * pair.
 */
void encode(inout wchar[] s, dchar c)
    in
    {
	assert(isValidDchar(c));
    }
    body
    {
	// fits in a single code unit
	if (c <= 0xFFFF)
	{
	    s ~= cast(wchar) c;
	    return;
	}

	// Split the offset above 0x10000 into two 10 bit halves.
	uint offset = c - 0x10000;
	wchar[2] pair;

	pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
	pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
	s ~= pair;
    }
|
|
|
|
/*
 * Appends c to the UTF-32 string s.
 *
 * c must satisfy isValidDchar; it is stored unchanged.
 */
void encode(inout dchar[] s, dchar c)
    in
    {
	assert(isValidDchar(c));
    }
    body
    {
	s ~= c;
    }
|
|
|
|
/* =================== Validation ======================= */
|
|
|
|
/*
 * Walks the whole of s with decode, discarding the decoded
 * characters. Returns normally if every sequence decodes; otherwise
 * the UtfError thrown by decode propagates to the caller.
 */
void validate(char[] s)
{
    uint pos = 0;

    // decode advances pos past each sequence it accepts
    while (pos < s.length)
	decode(s, pos);
}
|
|
|
|
/*
 * Walks the whole of s with decode, discarding the decoded
 * characters. Any UtfError thrown by decode propagates to the caller.
 */
void validate(wchar[] s)
{
    uint pos = 0;

    // decode is responsible for moving pos forward
    while (pos < s.length)
	decode(s, pos);
}
|
|
|
|
/*
 * Walks the whole of s with decode, discarding the decoded
 * characters. Returns normally if decode accepts every element;
 * otherwise the UtfError thrown by decode propagates to the caller.
 */
void validate(dchar[] s)
{
    uint pos = 0;

    // decode advances pos by one for each element it accepts
    while (pos < s.length)
	decode(s, pos);
}
|
|
|
|
/* =================== Conversion to UTF8 ======================= */
|
|
|
|
|
|
/*
 * Identity conversion: s is already UTF-8, so the same array is
 * handed back without copying. The in contract runs validate on s.
 */
char[] toUTF8(char[] s)
    in
    {
	validate(s);
    }
    body
    {
	return s;
    }
|
|
|
|
/*
 * Converts the UTF-16 string s to a newly built UTF-8 string.
 *
 * Surrogate pairs are combined into a single character before being
 * encoded, so characters above 0xFFFF come out as one 4 byte UTF-8
 * sequence. A lone or truncated surrogate makes decode throw
 * UtfError.
 */
char[] toUTF8(wchar[] s)
{
    char[] r;
    uint i = 0;

    while (i < s.length)
    {
	wchar w = s[i];

	if (w >= 0xD800 && w <= 0xDFFF)
	{
	    // Surrogates cannot be encoded one code unit at a time:
	    // decode checks the pairing, merges the two halves and
	    // moves i past both of them.
	    dchar c = decode(s, i);
	    encode(r, c);
	}
	else
	{
	    // Every other code unit is a complete character.
	    encode(r, cast(dchar) w);
	    i++;
	}
    }
    return r;
}
|
|
|
|
/*
 * Converts the UTF-32 string s to a newly built UTF-8 string by
 * passing every element of s to encode in order.
 */
char[] toUTF8(dchar[] s)
{
    char[] result;
    int k = 0;

    while (k < s.length)
    {
	encode(result, s[k]);
	k++;
    }
    return result;
}
|
|
|
|
/* =================== Conversion to UTF16 ======================= */
|
|
|
|
/*
 * Converts the UTF-8 string s to a newly built UTF-16 string.
 * Each character is read with decode and appended with encode; a
 * UtfError thrown by decode propagates to the caller.
 */
wchar[] toUTF16(char[] s)
{
    wchar[] result;
    uint pos = 0;

    // decode moves pos past each sequence it consumes
    while (pos < s.length)
    {
	dchar ch = decode(s, pos);
	encode(result, ch);
    }
    return result;
}
|
|
|
|
/*
 * Identity conversion: s is already UTF-16, so the same array is
 * handed back without copying. The in contract runs validate on s.
 */
wchar[] toUTF16(wchar[] s)
    in
    {
	validate(s);
    }
    body
    {
	return s;
    }
|
|
|
|
/*
 * Converts the UTF-32 string s to a newly built UTF-16 string by
 * passing every element of s to encode in order.
 */
wchar[] toUTF16(dchar[] s)
{
    wchar[] result;
    uint k = 0;

    while (k < s.length)
    {
	encode(result, s[k]);
	k++;
    }
    return result;
}
|
|
|
|
/* =================== Conversion to UTF32 ======================= */
|
|
|
|
/*
 * Converts the UTF-8 string s to a newly built UTF-32 string.
 * Each character is read with decode and appended to the result; a
 * UtfError thrown by decode propagates to the caller.
 */
dchar[] toUTF32(char[] s)
{
    dchar[] result;
    uint pos = 0;

    // decode moves pos past each sequence it consumes
    while (pos < s.length)
    {
	dchar ch = decode(s, pos);
	result ~= ch;
    }
    return result;
}
|
|
|
|
/*
 * Converts the UTF-16 string s to a newly built UTF-32 string.
 * Each character is read with decode and appended to the result; a
 * UtfError thrown by decode propagates to the caller.
 */
dchar[] toUTF32(wchar[] s)
{
    dchar[] result;
    uint pos = 0;

    // decode is responsible for moving pos forward
    while (pos < s.length)
    {
	dchar ch = decode(s, pos);
	result ~= ch;
    }
    return result;
}
|
|
|
|
/*
 * Identity conversion: s is already UTF-32, so the same array is
 * handed back without copying. The in contract runs validate on s.
 */
dchar[] toUTF32(dchar[] s)
    in
    {
	validate(s);
    }
    body
    {
	return s;
    }
|
|
|
|
|