mirror of
https://github.com/dlang/phobos.git
synced 2025-05-01 15:40:36 +03:00
1867 lines
41 KiB
D
Executable file
1867 lines
41 KiB
D
Executable file
// Written in the D programming language.
|
|
|
|
/**
|
|
Classes and functions for handling and transcoding between various encodings.
|
|
Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
|
|
(also known as LATIN-1), and WINDOWS-1252
|
|
|
|
Functions are provided for arbitrary encoding and decoding of single
|
|
characters, arbitrary transcoding between strings of different type, as well as
|
|
validation and sanitization.
|
|
|
|
The type EString!(Ascii) represents an ASCII string; the type EString!(Latin1)
|
|
represents an ISO-8859-1 string, and so on. In general, EString!(E) is the
|
|
string type for encoding E, and EString!(Utf8), EString!(Utf16) and
|
|
EString!(Utf32) are aliases for string, wstring and dstring respectively.
|
|
|
|
Future directions for this module include the ability to handle arbitrary
|
|
encodings.
|
|
|
|
Authors: Janice Caron
|
|
|
|
Date: 2006.02.21
|
|
|
|
License: Public Domain
|
|
|
|
$(BIG $(B A Brief Tutorial))
|
|
|
|
There are many character sets (or more properly, character repertoires) on the
|
|
planet. Unicode is the superset of all other legacy character sets. Therefore,
|
|
$(I every) character which exists in any character repertoire, also exists in
|
|
Unicode. Every character in Unicode has an integer associated with it. That
|
|
integer is called the character's code point. For example, the code point
|
|
of the letter 'A' is 65, or in hex, 0x41. It is important to know that a
|
|
character's code point is unchangeable. It is a permanent property of the
|
|
character, and it does not depend on how you encode it. The code point of 'A'
|
|
is 65, period.
|
|
|
|
Most character repertoires consist of 256 characters or fewer. This is because
|
|
it is convenient to use single-byte encoding schemes. In such repertoires,
|
|
every character will have an integer in the range 0 to 255 associated with it,
|
|
denoting its position within that repertoire. That number is called a code
|
|
unit. Note that, in general, code unit != code point.
|
|
|
|
For example, the Euro currency symbol has code point 0x20AC. This is a
|
|
permanent property of the character. That character does not exist in the
|
|
ASCII repertoire, and so cannot be encoded in ASCII. It also does not exist in
|
|
the Latin-1 character repertoire, and likewise cannot be encoded in Latin-1. It
|
|
$(I does) exist in the Windows-1252 character repertoire though. In that
|
|
encoding, it is represented by the byte 0x80. So in that encoding, its code
|
|
UNIT is 0x80, but its code POINT is still 0x20AC. Code points are always
|
|
measured in Unicode.
|
|
|
|
Some character repertoires contain more than 256 characters. Yet it is still
|
|
desirable to be able to store those characters as a byte sequence, so
|
|
multi-byte encodings got invented to allow that. In such encodings a single
|
|
character may require more than one byte to represent it.
|
|
|
|
The process of converting a single code point into one or more code units is
|
|
called ENCODING. The reverse process, that of converting multiple code units
|
|
into a single code point is called DECODING.
|
|
|
|
Almost all encodings use 8-bit bytes as the storage type for a single code
|
|
unit - but there are exceptions. UTF-16, for example, uses 16-bit wide code
|
|
units. (The character repertoire which it represents contains more than 2^16
|
|
characters, so some of those characters need to be expressed as multiple code
|
|
units, even in UTF-16). UTF-32 uses a 32-bit wide code unit, which means, just
|
|
like in the good old days of ASCII, one code unit == one code point. UTF-32,
|
|
however, is the $(I only) encoding for which this is true.
|
|
|
|
*/
|
|
|
|
module std.encoding;
|
|
|
|
unittest
|
|
{
|
|
ubyte[][] validStrings =
[
    // Plain ASCII
    cast(ubyte[])"hello",

    // The Greek word 'kosme'
    cast(ubyte[])"κόσμε",

    // First possible sequence of a certain length
    [ 0x00 ],                       // U+00000000   one byte
    [ 0xC2, 0x80 ],                 // U+00000080   two bytes
    [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
    [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

    // Last possible sequence of a certain length
    [ 0x7F ],                       // U+0000007F   one byte
    [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
    [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

    // Other boundary conditions
    [ 0xED, 0x9F, 0xBF ],           // U+0000D7FF   Last character before surrogates
    [ 0xEE, 0x80, 0x80 ],           // U+0000E000   First character after surrogates
    [ 0xEF, 0xBF, 0xBD ],           // U+0000FFFD   Unicode replacement character
    [ 0xF4, 0x8F, 0xBF, 0xBF ],     // U+0010FFFF   Very last character

    // Non-character code points
    /*  NOTE: These are legal in UTF, and may be converted from one UTF to
        another, however they do not represent Unicode characters. These
        code points have been reserved by Unicode as non-character code
        points. They are permissible for data exchange within an
        application, but they are not permitted to be used as
        characters. Since this module deals with UTF, and not with Unicode
        per se, we choose to accept them here. */
    // Both are three-byte sequences: 0xEF 0xBF followed by 0xBE / 0xBF.
    [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
    [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
];
|
|
|
|
ubyte[][] invalidStrings =
|
|
[
|
|
// First possible sequence of a certain length, but greater than U+10FFFF
|
|
[ 0xF8, 0x88, 0x80, 0x80, 0x80 ], // U+00200000 five bytes
|
|
[ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U+04000000 six bytes
|
|
|
|
// Last possible sequence of a certain length, but greater than U+10FFFF
|
|
[ 0xF7, 0xBF, 0xBF, 0xBF ], // U+001FFFFF four bytes
|
|
[ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF five bytes
|
|
[ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U+7FFFFFFF six bytes
|
|
|
|
// Other boundary conditions
|
|
[ 0xF4, 0x90, 0x80, 0x80 ], // U+00110000 First code
|
|
// point after last character
|
|
|
|
// Unexpected continuation bytes
|
|
[ 0x80 ],
|
|
[ 0xBF ],
|
|
[ 0x20, 0x80, 0x20 ],
|
|
[ 0x20, 0xBF, 0x20 ],
|
|
|
|
// Lonely start bytes
|
|
[ 0xC0 ],
|
|
[ 0xCF ],
|
|
[ 0x20, 0xC0, 0x20 ],
|
|
[ 0x20, 0xCF, 0x20 ],
|
|
[ 0xD0 ],
|
|
[ 0xDF ],
|
|
[ 0x20, 0xD0, 0x20 ],
|
|
[ 0x20, 0xDF, 0x20 ],
|
|
[ 0xE0 ],
|
|
[ 0xEF ],
|
|
[ 0x20, 0xE0, 0x20 ],
|
|
[ 0x20, 0xEF, 0x20 ],
|
|
[ 0xF0 ],
|
|
[ 0xF1 ],
|
|
[ 0xF2 ],
|
|
[ 0xF3 ],
|
|
[ 0xF4 ],
|
|
[ 0xF5 ], // If this were legal it would start a character > U+10FFFF
|
|
[ 0xF6 ], // If this were legal it would start a character > U+10FFFF
|
|
[ 0xF7 ], // If this were legal it would start a character > U+10FFFF
|
|
|
|
[ 0xEF, 0xBF ], // Three byte sequence with third byte missing
|
|
[ 0xF7, 0xBF, 0xBF ], // Four byte sequence with fourth byte missing
|
|
|
|
// Impossible bytes
|
|
[ 0xF8 ],
|
|
[ 0xF9 ],
|
|
[ 0xFA ],
|
|
[ 0xFB ],
|
|
[ 0xFC ],
|
|
[ 0xFD ],
|
|
[ 0xFE ],
|
|
[ 0xFF ],
|
|
[ 0x20, 0xF8, 0x20 ],
|
|
[ 0x20, 0xF9, 0x20 ],
|
|
[ 0x20, 0xFA, 0x20 ],
|
|
[ 0x20, 0xFB, 0x20 ],
|
|
[ 0x20, 0xFC, 0x20 ],
|
|
[ 0x20, 0xFD, 0x20 ],
|
|
[ 0x20, 0xFE, 0x20 ],
|
|
[ 0x20, 0xFF, 0x20 ],
|
|
|
|
// Overlong sequences, all representing U+002F
|
|
/* With a safe UTF-8 decoder, all of the following five overlong
|
|
representations of the ASCII character slash ("/") should be
|
|
rejected like a malformed UTF-8 sequence */
|
|
[ 0xC0, 0xAF ],
|
|
[ 0xE0, 0x80, 0xAF ],
|
|
[ 0xF0, 0x80, 0x80, 0xAF ],
|
|
[ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
|
|
[ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
|
|
|
|
// Maximum overlong sequences
|
|
/* Below you see the highest Unicode value that is still resulting in
|
|
an overlong sequence if represented with the given number of bytes.
|
|
This is a boundary test for safe UTF-8 decoders. All five
|
|
characters should be rejected like malformed UTF-8 sequences. */
|
|
[ 0xC1, 0xBF ], // U+0000007F
|
|
[ 0xE0, 0x9F, 0xBF ], // U+000007FF
|
|
[ 0xF0, 0x8F, 0xBF, 0xBF ], // U+0000FFFF
|
|
[ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U+001FFFFF
|
|
[ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF
|
|
|
|
// Overlong representation of the NUL character
|
|
/* The following five sequences should also be rejected like malformed
|
|
UTF-8 sequences and should not be treated like the ASCII NUL
|
|
character. */
|
|
[ 0xC0, 0x80 ],
|
|
[ 0xE0, 0x80, 0x80 ],
|
|
[ 0xF0, 0x80, 0x80, 0x80 ],
|
|
[ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
|
|
[ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
|
|
|
|
// Illegal code positions
|
|
/* The following UTF-8 sequences should be rejected like malformed
|
|
sequences, because they never represent valid ISO 10646 characters
|
|
and a UTF-8 decoder that accepts them might introduce security
|
|
problems comparable to overlong UTF-8 sequences. */
|
|
[ 0xED, 0xA0, 0x80 ], // U+D800
|
|
[ 0xED, 0xAD, 0xBF ], // U+DB7F
|
|
[ 0xED, 0xAE, 0x80 ], // U+DB80
|
|
[ 0xED, 0xAF, 0xBF ], // U+DBFF
|
|
[ 0xED, 0xB0, 0x80 ], // U+DC00
|
|
[ 0xED, 0xBE, 0x80 ], // U+DF80
|
|
[ 0xED, 0xBF, 0xBF ], // U+DFFF
|
|
];
|
|
|
|
// Make sure everything that should be valid, is
|
|
foreach(a;validStrings)
|
|
{
|
|
string s = cast(string)a;
|
|
assert(isValid(s),"Failed to validate: "~makeReadable(s));
|
|
}
|
|
|
|
// Make sure everything that shouldn't be valid, isn't
|
|
foreach(a;invalidStrings)
|
|
{
|
|
string s = cast(string)a;
|
|
assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
|
|
}
|
|
|
|
// Make sure we can sanitize everything bad
|
|
foreach(a;invalidStrings)
|
|
{
|
|
string s = cast(string)a;
|
|
string t = sanitize(s);
|
|
assert(isValid(t));
|
|
ubyte[] u = cast(ubyte[])t;
|
|
validStrings ~= u;
|
|
}
|
|
|
|
// Make sure all transcodings work in both directions, using both forward
|
|
// and reverse iteration
|
|
foreach(a;validStrings)
|
|
{
|
|
string s = cast(string)a;
|
|
string s2;
|
|
wstring ws, ws2;
|
|
dstring ds, ds2;
|
|
|
|
transcode(s,ws);
|
|
assert(isValid(ws));
|
|
transcode(ws,s2);
|
|
assert(s == s2);
|
|
|
|
transcode(s,ds);
|
|
assert(isValid(ds));
|
|
transcode(ds,s2);
|
|
assert(s == s2);
|
|
|
|
transcode(ws,s);
|
|
assert(isValid(s));
|
|
transcode(s,ws2);
|
|
assert(ws == ws2);
|
|
|
|
transcode(ws,ds);
|
|
assert(isValid(ds));
|
|
transcode(ds,ws2);
|
|
assert(ws == ws2);
|
|
|
|
transcode(ds,s);
|
|
assert(isValid(s));
|
|
transcode(s,ds2);
|
|
assert(ds == ds2);
|
|
|
|
transcode(ds,ws);
|
|
assert(isValid(ws));
|
|
transcode(ws,ds2);
|
|
assert(ds == ds2);
|
|
|
|
transcodeReverse(s,ws);
|
|
assert(isValid(ws));
|
|
transcodeReverse(ws,s2);
|
|
assert(s == s2);
|
|
|
|
transcodeReverse(s,ds);
|
|
assert(isValid(ds));
|
|
transcodeReverse(ds,s2);
|
|
assert(s == s2);
|
|
|
|
transcodeReverse(ws,s);
|
|
assert(isValid(s));
|
|
transcodeReverse(s,ws2);
|
|
assert(ws == ws2);
|
|
|
|
transcodeReverse(ws,ds);
|
|
assert(isValid(ds));
|
|
transcodeReverse(ds,ws2);
|
|
assert(ws == ws2);
|
|
|
|
transcodeReverse(ds,s);
|
|
assert(isValid(s));
|
|
transcodeReverse(s,ds2);
|
|
assert(ds == ds2);
|
|
|
|
transcodeReverse(ds,ws);
|
|
assert(isValid(ws));
|
|
transcodeReverse(ws,ds2);
|
|
assert(ds == ds2);
|
|
}
|
|
|
|
// Make sure the non-UTF encodings work too
|
|
{
|
|
auto s = "\u20AC100";
|
|
auto t = to!(Windows1252)(s);
|
|
assert(t == [cast(Windows1252)0x80, '1', '0', '0']);
|
|
auto u = to!(Utf8)(s);
|
|
assert(s == u);
|
|
auto v = to!(Latin1)(s);
|
|
assert(cast(string)v == "?100");
|
|
auto w = to!(Ascii)(v);
|
|
assert(cast(string)w == "?100");
|
|
}
|
|
}
|
|
|
|
// Unit tests over. Now for the code...
|
|
|
|
template Encoding(T)
|
|
{
|
|
static if (is(T==char))
|
|
{
|
|
enum MAX_SEQUENCE_LENGTH = 4;
|
|
|
|
invariant(char)[] encodingName = "UTF-8";
|
|
|
|
byte[256] tailTable =
|
|
[
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
|
-1,-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
3,3,3,3,3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
|
|
];
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return c < 0x80 || tails(c) >= 0;
|
|
}
|
|
|
|
int tails(T c)
|
|
{
|
|
return tailTable[c];
|
|
}
|
|
|
|
bool isSingle(T c)
|
|
{
|
|
return c < 0x80;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return tails(c) > 0;
|
|
}
|
|
|
|
bool isTail(T c)
|
|
{
|
|
return tails(c) == 0;
|
|
}
|
|
|
|
// Reports whether the first n code units of s fail to form a run of
// tail (continuation) units. Returns true when s holds fewer than n
// units or when any of the first n units is not a tail; false otherwise.
bool badTail(const(T)[] s, uint n)
{
    if (s.length < n)
    {
        return true;
    }
    foreach (T unit; s[0..n])
    {
        if (!isTail(unit))
        {
            return true;
        }
    }
    return false;
}
|
|
|
|
// Reports whether head unit c followed by first tail unit d is one of the
// combinations that must be rejected even though both units are
// individually plausible. Only four head values have such restrictions;
// every other head accepts any tail here.
bool isInvalidHeadTail(T c, T d)
{
    // 0xE0 followed by 0x80..0x9F: overlong three-unit sequence
    if (c == 0xE0) return (d & 0xE0) == 0x80;

    // 0xED followed by 0xA0..0xBF: encodes a surrogate
    if (c == 0xED) return (d & 0xE0) == 0xA0;

    // 0xF0 followed by 0x80..0x8F: overlong four-unit sequence
    if (c == 0xF0) return (d & 0xF0) == 0x80;

    // 0xF4 followed by 0x90 or above: code point beyond 0x10FFFF
    if (c == 0xF4) return (d & 0xF0) >= 0x90;

    return false;
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
if (c < 0x80)
|
|
{
|
|
buffer[0] = c;
|
|
return 1;
|
|
}
|
|
else if (c < 0x800)
|
|
{
|
|
buffer[0] = (c >> 6) + 0xC0;
|
|
buffer[1] = (c & 0x3F) + 0x80;
|
|
return 2;
|
|
}
|
|
else if (c < 0x10000)
|
|
{
|
|
buffer[0] = (c >> 12) + 0xE0;
|
|
buffer[1] = ((c >> 6) & 0x3F) + 0x80;
|
|
buffer[2] = (c & 0x3F) + 0x80;
|
|
return 3;
|
|
}
|
|
else
|
|
{
|
|
buffer[0] = (c >> 18) + 0xF0;
|
|
buffer[1] = ((c >> 12) & 0x3F) + 0x80;
|
|
buffer[2] = ((c >> 6) & 0x3F) + 0x80;
|
|
buffer[3] = (c & 0x3F) + 0x80;
|
|
return 4;
|
|
}
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(s.length != 0);
|
|
assert(!isSingle(s[0]));
|
|
assert(isValid(firstSequence(s)));
|
|
}
|
|
body
|
|
{
|
|
uint c = s[0];
|
|
int n = tails(c);
|
|
assert(n > 0);
|
|
c &= (1 << (6 - n)) - 1;
|
|
foreach(T d;s[1..n+1])
|
|
{
|
|
c = (c << 6) + (d & 0x3F);
|
|
}
|
|
s = s[n+1..$];
|
|
return c;
|
|
}
|
|
|
|
// Decodes the multi-unit sequence that ends s, walking backwards from the
// last code unit until the head unit is reached, and removes that
// sequence from s.
//
// Params:
//    s = the string to decode from; on return it no longer contains the
//        decoded sequence
//
// Returns: the decoded code point.
//
// The in-contract requires s to be non-empty and to end in a valid
// sequence of more than one code unit.
dchar decodeSingleSequenceReverse(ref string s)
in
{
    assert(s.length != 0);
    assert(!isSingle(s[$-1]));
    assert(isValid(lastSequence(s)));
}
body
{
    uint c = s[$-1] & 0x3F; // payload bits of the final tail unit
    uint shift = 0;

    // The index is unsigned, so "i >= 0" could never end the loop. Stop
    // explicitly once the start of the string has been consumed instead.
    size_t i = s.length - 1;
    while (i != 0)
    {
        --i;
        shift += 6;
        assert(shift < 24);
        uint d = s[i];
        uint n = tails(d);
        // A tail unit (n == 0) contributes six bits; the head contributes
        // however many payload bits remain for a sequence with n tails.
        uint mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
        c = c + ((d & mask) << shift);
        if (n != 0) break; // that was the head unit
    }
    s = s[0..i];
    return c;
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= "\xEF\xBF\xBD";
|
|
}
|
|
}
|
|
else static if (is(T==wchar))
|
|
{
|
|
alias wstring string;
|
|
|
|
invariant(char)[] encodingName = "UTF-16";
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
enum MAX_SEQUENCE_LENGTH = 2;
|
|
|
|
bool isSingle(T c)
|
|
{
|
|
return c < 0xD800 || c >= 0xE000;
|
|
}
|
|
|
|
int tails(T c)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return c >= 0xD800 && c < 0xDC00;
|
|
}
|
|
|
|
bool isTail(T c)
|
|
{
|
|
return c >= 0xDC00 && c < 0xE000;
|
|
}
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
return (!isTail(s[0]));
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
if (c < 0x10000)
|
|
{
|
|
buffer[0] = c;
|
|
return 1;
|
|
}
|
|
else
|
|
{
|
|
c -= 0x10000;
|
|
buffer[0] = (c >> 10) + 0xD800;
|
|
buffer[1] = (c & 0x3FF) + 0xDC00;
|
|
return 2;
|
|
}
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(s.length != 0);
|
|
assert(!isSingle(s[0]));
|
|
assert(isValid(firstSequence(s)));
|
|
}
|
|
body
|
|
{
|
|
uint c = s[0];
|
|
uint d = s[1];
|
|
s = s[2..$];
|
|
return ((c & 0x3FF) << 10) + (d & 0x3FF) + 0x10000;
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
in
|
|
{
|
|
assert(s.length != 0);
|
|
assert(!isSingle(s[$-1]));
|
|
assert(isValid(lastSequence(s)));
|
|
}
|
|
body
|
|
{
|
|
uint c = s[$-1];
|
|
uint d = s[$-2];
|
|
s = s[0..$-2];
|
|
return ((d & 0x3FF) << 10) + (c & 0x3FF) + 0x10000;
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= cast(T)0xFFFD;
|
|
}
|
|
}
|
|
else static if (is(T==dchar))
|
|
{
|
|
alias dstring string;
|
|
|
|
enum MAX_SEQUENCE_LENGTH = 1;
|
|
|
|
invariant(char)[] encodingName = "UTF-32";
|
|
|
|
alias isValidCodePoint isValidCodeUnit;
|
|
|
|
alias isValidCodePoint isSingle;
|
|
|
|
int tails(T c)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
alias isHead isTail;
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
buffer[0] = c;
|
|
return 1;
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= cast(T)0xFFFD;
|
|
}
|
|
}
|
|
else static if (is(T:Ascii))
|
|
{
|
|
alias invariant(Ascii)[] string;
|
|
|
|
enum MAX_SEQUENCE_LENGTH = 1;
|
|
|
|
invariant(char)[] encodingName = "ASCII";
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return c < 0x80;
|
|
}
|
|
|
|
alias isValidCodeUnit isSingle;
|
|
|
|
int tails(T c)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
alias isHead isTail;
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
buffer[0] = cast(T)(c < 0x80 ? c : '?');
|
|
return 1;
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= cast(T)'?';
|
|
}
|
|
}
|
|
else static if (is(T:Latin1))
|
|
{
|
|
alias invariant(Latin1)[] string;
|
|
|
|
enum MAX_SEQUENCE_LENGTH = 1;
|
|
|
|
invariant(char)[] encodingName = "ISO-8859-1";
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
alias isValidCodeUnit isSingle;
|
|
|
|
int tails(T c)
|
|
{
|
|
return -1;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
alias isHead isTail;
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
buffer[0] = cast(T)(c < 0x100 ? c : '?');
|
|
return 1;
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
in
|
|
{
|
|
assert(false);
|
|
}
|
|
body
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= cast(T)'?';
|
|
}
|
|
}
|
|
else static if (is(T:Windows1252))
|
|
{
|
|
alias invariant(Windows1252)[] string;
|
|
|
|
enum MAX_SEQUENCE_LENGTH = 1;
|
|
|
|
invariant(char)[] encodingName = "WINDOWS-1252";
|
|
|
|
// Unicode code points for the WINDOWS-1252 code units 0x80..0x9F, indexed
// by (code unit - 0x80). U+FFFD marks the code units the encoding leaves
// undefined (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
// Code unit 0x96 is the en dash, U+2013.
wstring charMap =
    "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"
    "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"
    "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"
    "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178"
;
|
|
|
|
dchar win2uni(T c)
|
|
{
|
|
return isSingle(c) ? c : charMap[c-0x80];
|
|
}
|
|
|
|
T uni2win(dchar c)
|
|
{
|
|
if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return cast(T)c;
|
|
if (c != 0xFFFD)
|
|
{
|
|
foreach(n,d;charMap)
|
|
{
|
|
if (c == d) return cast(T)(n + 0x80);
|
|
}
|
|
}
|
|
return '?';
|
|
}
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return(win2uni(c) != 0xFFFD);
|
|
}
|
|
|
|
bool isSingle(T c)
|
|
{
|
|
return c < 0x80 || c >= 0xA0;
|
|
}
|
|
|
|
int tails(T c)
|
|
{
|
|
return isSingle(c) ? -1 : 0;
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
return isSingle(c) ? false : isValidCodeUnit(c);
|
|
}
|
|
|
|
bool isTail(T c)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
buffer[0] = uni2win(c);
|
|
return 1;
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
in
|
|
{
|
|
assert(s.length != 0);
|
|
}
|
|
body
|
|
{
|
|
dchar c = win2uni(s[0]);
|
|
s = s[1..$];
|
|
return c;
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
in
|
|
{
|
|
assert(s.length != 0);
|
|
}
|
|
body
|
|
{
|
|
dchar c = win2uni(s[$-1]);
|
|
s = s[0..$-1];
|
|
return c;
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
s ~= cast(T)'?';
|
|
}
|
|
}
|
|
// NOTE: The "else" case is commented out because it doesn't work (yet?)
|
|
// because of some template issues which have yet to be resolved. Expect
|
|
// this to work in some future release.
|
|
/+
|
|
else // The generic case. All other encodings.
|
|
{
|
|
alias invariant(T)[] string;
|
|
|
|
static if(is(T.MAX_SEQUENCE_LENGTH))
|
|
{
|
|
enum MAX_SEQUENCE_LENGTH = T.MAX_SEQUENCE_LENGTH;
|
|
}
|
|
else
|
|
{
|
|
enum MAX_SEQUENCE_LENGTH = 1;
|
|
}
|
|
|
|
invariant(char)[] encodingName()
|
|
{
|
|
return T.encodingName;
|
|
}
|
|
|
|
bool isValidCodeUnit(T c)
|
|
{
|
|
return c.isValidCodeUnit;
|
|
}
|
|
|
|
bool isSingle(T c)
|
|
{
|
|
static if (is(T.isSingle == function))
|
|
{
|
|
return c.isSingle;
|
|
}
|
|
else
|
|
{
|
|
return c.isValidCodeUnit;
|
|
}
|
|
}
|
|
|
|
int tails(T c)
|
|
{
|
|
static if (is(T.tails == function))
|
|
{
|
|
return c.tails();
|
|
}
|
|
else
|
|
{
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
bool isHead(T c)
|
|
{
|
|
static if(is(T.isHead == function))
|
|
{
|
|
return c.isHead;
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool isTail(T c)
|
|
{
|
|
static if(is(T.isTail == function))
|
|
{
|
|
return c.isTail;
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool badTail(const(T)[] s, uint n)
|
|
{
|
|
static if(is(T.badTail == function))
|
|
{
|
|
return T.badTail(s,n);
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
bool isInvalidHeadTail(T c, T d)
|
|
{
|
|
static if(is(T.isInvalidHeadTail == function))
|
|
{
|
|
return c.isInvalidHeadTail(d);
|
|
}
|
|
else
|
|
{
|
|
return false;
|
|
}
|
|
}
|
|
|
|
uint encode(dchar c,T[] buffer)
|
|
in
|
|
{
|
|
assert(isValidCodePoint(c));
|
|
assert(buffer.length >= MAX_SEQUENCE_LENGTH);
|
|
}
|
|
body
|
|
{
|
|
return encode(c,buffer);
|
|
}
|
|
|
|
dchar decodeSingleSequence(ref string s)
|
|
{
|
|
static if(is(T.decodeSingleSequence == function))
|
|
{
|
|
return T.decodeSingleSequence(s);
|
|
}
|
|
else
|
|
{
|
|
assert(false);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
dchar decodeSingleSequenceReverse(ref string s)
|
|
{
|
|
static if(is(T.decodeSingleSequenceReverse == function))
|
|
{
|
|
return T.decodeSingleSequenceReverse(s);
|
|
}
|
|
else
|
|
{
|
|
assert(false);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
void appendReplacementChar(ref string s)
|
|
{
|
|
if (is(T.replacementChar == function))
|
|
{
|
|
s ~= T.replacementChar();
|
|
}
|
|
else
|
|
{
|
|
s ~= cast(T)'?';
|
|
}
|
|
}
|
|
}
|
|
+/
|
|
|
|
// Measures how many code units at the start of s should be treated as one
// (possibly malformed) sequence: a single unit when s[0] is not a valid
// code unit, otherwise an optional head unit followed by every tail unit
// that comes directly after it. s must not be empty.
uint pseudoSequenceLength(string s)
{
    assert(s.length != 0);

    T first = s[0];
    if (!isValidCodeUnit(first))
        return 1;

    // Step over the head, if there is one, then over the run of tails.
    int len = 0;
    if (isHead(first))
        len = 1;
    while (len < s.length && isTail(s[len]))
        ++len;

    assert(len != 0);
    return len;
}
|
|
|
|
// Returns the slice of s that runs from its first code unit up to, but not
// including, the next code unit that is not a tail. When every unit after
// the first is a tail the whole of s is returned. An empty string yields
// an empty string.
string firstSequence(string s)
{
    // s[1..$] below is out of range for an empty string.
    if (s.length == 0) return s;

    foreach(i,T c;s[1..$])
    {
        // i indexes s[1..$], so the unit found sits at s[i+1].
        if (!isTail(c)) return s[0..i+1];
    }
    return s;
}
|
|
|
|
// Returns the slice of s that starts at the last code unit which is not a
// tail and runs to the end of the string. When s is empty, or consists of
// tail units only, s itself is returned.
string lastSequence(string s)
{
    auto start = s.length;
    while (start != 0)
    {
        --start;
        if (!isTail(s[start]))
            return s[start..$];
    }
    return s;
}
|
|
|
|
// Find the first invalid code unit.
//
// Returns the index of the first code unit that does not belong to a
// well-formed sequence, or s.length when every sequence in s is well formed.
uint validatePartial(const(T)[] s)
{
    uint i;
    for (i=0; i<s.length; ++i)
    {
        T c = s[i];
        if (isSingle(c)) continue;

        // Whatever is not a single must open a sequence. This rejects
        // illegal code units as well as tail units found where a sequence
        // should start (a UTF-8 continuation byte, a UTF-16 low surrogate).
        if (!isHead(c)) return i;

        // tails() returns a signed count (it can be -1), so keep it signed
        // for the comparison below to be meaningful.
        int n = tails(c);

        // A head may stand alone, as the WINDOWS-1252 code units in
        // 0x80..0x9F do; there is then nothing further to check.
        if (n <= 0) continue;

        if (i + n >= s.length) return i; // fail if we exceed the length of the string
        if (isInvalidHeadTail(c,s[i+1])) return i; // fail with invalid head/tail combinations
        if (badTail(s[i+1..$],n)) return i; // fail incomplete sequences
        i += n; // step over whole sequence
    }
    return i;
}
|
|
|
|
bool isValid(const(T)[] s)
|
|
{
|
|
return validatePartial(s) == s.length;
|
|
}
|
|
|
|
// Returns s unchanged when it is already valid. Otherwise builds a new
// string in which each malformed sequence is replaced by the encoding's
// replacement character and every valid stretch is copied as is. The
// out-contract checks that the result is valid.
string sanitize(string s)
out(r)
{
    assert(isValid(r));
}
body
{
    uint pos = validatePartial(s);
    if (pos == s.length) return s; // nothing to repair

    // Start from a copy of the valid prefix. pos is below s.length here,
    // so at least one malformed sequence remains to be handled.
    string cleaned = s[0..pos].idup;
    do
    {
        // Swap the malformed sequence for one replacement character...
        appendReplacementChar(cleaned);
        pos += pseudoSequenceLength(s[pos..$]);

        // ...then copy everything up to the next problem.
        string rest = s[pos..$];
        uint good = validatePartial(rest);
        cleaned ~= rest[0..good];
        pos += good;
    }
    while (pos < s.length);
    return cleaned;
}
|
|
|
|
// Counts the code units of s that are not tail units, which for a valid
// string is one per encoded sequence. The in-contract requires s to be
// valid.
uint count(string s)
in
{
    assert(isValid(s));
}
body
{
    uint total = 0;
    foreach(T unit;s)
    {
        if (isTail(unit)) continue;
        ++total;
    }
    return total;
}
|
|
|
|
// Returns the code unit offset at which the n-th (zero-based) sequence of
// s begins. When s holds exactly n sequences the result is s.length; when
// it holds fewer, the result is -1. The in-contract requires s to be valid
// and n to be non-negative.
int index(string s, int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    uint seen = 0; // sequences started so far
    foreach(offset,T unit;s)
    {
        if (isTail(unit)) continue;
        if (seen == n) return offset;
        ++seen;
    }
    if (seen == n) return s.length;
    return -1;
}
|
|
|
|
// Decodes the sequence at the front of s and removes it from s.
// A single unit is returned directly; anything else is handed to
// decodeSingleSequence. The in-contract requires s to be non-empty and to
// start with a valid sequence.
dchar decode(ref string s)
in
{
    assert(s.length != 0);
    assert(isValid(firstSequence(s)));
}
body
{
    T head = s[0];
    if (!isSingle(head))
        return decodeSingleSequence(s);

    s = s[1..$];
    return head;
}
|
|
|
|
// Decodes the sequence at the back of s and removes it from s.
// A single unit is returned directly; anything else is handed to
// decodeSingleSequenceReverse. The in-contract requires s to be non-empty
// and to end with a valid sequence.
dchar decodeSingleReverse(ref string s)
in
{
    assert(s.length != 0);
    assert(isValid(lastSequence(s)));
}
body
{
    T last = s[$-1];
    if (!isSingle(last))
        return decodeSingleSequenceReverse(s);

    s = s[0..$-1];
    return last;
}
|
|
|
|
// Foreach-able view of a string that yields decoded code points, forwards
// or backwards, optionally together with a code unit offset. Iterating
// consumes the stored string: s shrinks as each sequence is decoded.
struct CodePoints
{
    string s;

    // Forward iteration over the code points.
    int opApply(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decode(s);
            int status = dg(point);
            if (status != 0) return status;
        }
        return 0;
    }

    // Forward iteration; the first delegate argument is the offset, in
    // code units, at which the current sequence started.
    int opApply(int delegate(ref uint, ref dchar) dg)
    {
        uint offset = 0;
        while (s.length != 0)
        {
            uint before = s.length;
            dchar point = decode(s);
            uint reported = offset; // the delegate gets a copy it may freely modify
            int status = dg(reported,point);
            if (status != 0) return status;
            offset += before - s.length;
        }
        return 0;
    }

    // Backward iteration over the code points.
    int opApplyReverse(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeSingleReverse(s);
            int status = dg(point);
            if (status != 0) return status;
        }
        return 0;
    }

    // Backward iteration; the first delegate argument is the number of
    // code units left in front of the sequence just decoded.
    int opApplyReverse(int delegate(ref uint, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeSingleReverse(s);
            uint remaining = s.length;
            int status = dg(remaining,point);
            if (status != 0) return status;
        }
        return 0;
    }
}
|
|
|
|
// Wraps s in a CodePoints value so that it can be iterated code point by
// code point.
CodePoints codePoints(string s)
{
    return CodePoints(s);
}
|
|
|
|
// Foreach-able holder for the code units of one encoded code point:
// the first len entries of buffer are the units, in order.
struct CodeUnits
{
    T[MAX_SEQUENCE_LENGTH] buffer;
    uint len;

    // Visits the stored code units from first to last.
    int opApply(int delegate(ref T) dg)
    {
        T[] units = buffer[0..len];
        for (size_t k = 0; k < units.length; ++k)
        {
            T unit = units[k]; // the delegate works on a copy
            int status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }

    // Visits the stored code units from last to first.
    int opApplyReverse(int delegate(ref T) dg)
    {
        T[] units = buffer[0..len];
        for (size_t k = units.length; k != 0; --k)
        {
            T unit = units[k-1]; // the delegate works on a copy
            int status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }
}
|
|
|
|
// Encodes d and returns the resulting code units as a CodeUnits value.
// The in-contract requires d to be a valid code point.
CodeUnits codeUnits(dchar d)
in
{
    assert(isValidCodePoint(d));
}
body
{
    CodeUnits units;
    units.len = encode(d,units.buffer);
    return units;
}
|
|
}
|
|
|
|
alias char Utf8; /// A type representing the UTF-8 encoding (an alias of char)
|
|
alias wchar Utf16; /// A type representing the UTF-16 encoding (an alias of wchar)
|
|
alias dchar Utf32; /// A type representing the UTF-32 encoding (an alias of dchar)
|
|
typedef char Ascii; /// A type representing the ASCII encoding (a typedef of char)
|
|
typedef ubyte Latin1; /// A type representing the ISO-8859-1 (aka Latin-1) encoding (a typedef of ubyte)
|
|
typedef ubyte Windows1252; /// A type representing the WINDOWS-1252 encoding (a typedef of ubyte)
|
|
|
|
/**
|
|
* A type representing a string of some specified encoding. The encoding is specified by the template parameter.
|
|
*/
|
|
template EString(T)
|
|
{
|
|
alias invariant(T)[] EString;
|
|
}
|
|
|
|
/**
|
|
* Returns the name of an encoding.
|
|
*
|
|
* The type of the output cannot be deduced. Therefore, it is necessary to
|
|
* explicitly specify the encoding type.
|
|
*
|
|
* Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
|
|
*
|
|
* Examples:
|
|
* -----------------------------------
|
|
* writefln(encodingName!(Latin1));
|
|
* // writes ISO-8859-1
|
|
* -----------------------------------
|
|
*/
|
|
string encodingName(T)()
|
|
{
|
|
return Encoding!(T).encodingName;
|
|
}
|
|
|
|
// Checks the reported name of every supported encoding.
unittest
{
    assert(encodingName!(Utf8) == "UTF-8");
    assert(encodingName!(Utf16) == "UTF-16");
    assert(encodingName!(Utf32) == "UTF-32");
    assert(encodingName!(Ascii) == "ASCII");
    assert(encodingName!(Latin1) == "ISO-8859-1");
    assert(encodingName!(Windows1252) == "WINDOWS-1252");
}
|
|
|
|
/**
 * Tests whether c is a valid code point.
 *
 * A code point is valid when it is below 0x110000 and does not lie in the
 * surrogate range 0xD800 to 0xDFFF. The non-character code points U+FFFE
 * and U+FFFF therefore count as valid: they are valid code points even
 * though they are not valid characters.
 *
 * Supersedes:
 * This function supersedes std.utf.isValidDchar().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    c = the code point to be tested
 *
 * Returns: true if c is a valid code point, false otherwise.
 */
bool isValidCodePoint(dchar c)
{
    if (c >= 0x110000)
        return false;                   // above the last code point
    if (c >= 0xD800 && c < 0xE000)
        return false;                   // surrogate range
    return true;
}
|
|
|
|
/**
|
|
* Returns true if the code unit is legal. For example, the byte 0x80 would
|
|
* not be legal in ASCII, because ASCII code units must always be in the range
|
|
* 0x00 to 0x7F.
|
|
*
|
|
* Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
|
|
*
|
|
* Params:
|
|
* c = the code unit to be tested
|
|
*/
|
|
bool isValidCodeUnit(T)(T c)
|
|
{
|
|
return Encoding!(T).isValidCodeUnit(c);
|
|
}
|
|
|
|
/**
|
|
* Returns true if the string is encoded correctly
|
|
*
|
|
* Supercedes:
|
|
* This function supersedes std.utf.validate(), however note that this
|
|
* function returns a bool indicating whether the input was valid or not,
|
|
* whereas the older function would throw an exception.
|
|
*
|
|
* Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
|
|
*
|
|
* Params:
|
|
* s = the string to be tested
|
|
*/
|
|
bool isValid(T)(const(T)[] s)
|
|
{
|
|
return Encoding!(T).isValid(s);
|
|
}
|
|
|
|
/**
 * Sanitizes a string by replacing malformed code unit sequences with valid
 * code unit sequences. The result is guaranteed to be valid for this encoding.
 *
 * If the input string is already valid, this function returns the original;
 * otherwise it constructs a new string in which every illegal code unit
 * sequence is replaced with the encoding's replacement character. That
 * replacement is the Unicode replacement character (U+FFFD) if the character
 * repertoire contains it, and '?' otherwise.
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be sanitized
 *
 * Returns: the result of Encoding!(T).sanitize(s)
 */
invariant(T)[] sanitize(T)(invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.sanitize(s);
}
|
|
|
|
unittest
{
    // The malformed bytes F0 80 are expected to be replaced by EF BF BD,
    // which is U+FFFD encoded as UTF-8.
    string malformed = "hello \xF0\x80world";
    string expected = "hello \xEF\xBF\xBDworld";
    assert(sanitize(malformed) == expected);
}
|
|
|
|
/**
 * Returns the slice of the input string that runs from the first code unit to
 * the end of the first encoded sequence. The resulting string may consist of
 * multiple code units, but it will always represent at most one character.
 * If the input is the empty string, the return value will be the empty string.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be sliced
 *
 * Returns: the result of Encoding!(T).firstSequence(s)
 */
invariant(T)[] firstSequence(T)(invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.firstSequence(s);
}
|
|
|
|
/**
 * Returns the slice of the input string that runs from the start of the last
 * encoded sequence to the end of the string. The resulting string may consist
 * of multiple code units, but it will always represent at most one character.
 * If the input is the empty string, the return value will be the empty string.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be sliced
 *
 * Returns: the result of Encoding!(T).lastSequence(s)
 */
invariant(T)[] lastSequence(T)(invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.lastSequence(s);
}
|
|
|
|
/**
 * Returns the total number of code points encoded in a string.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * Supersedes:
 * This function supersedes std.utf.toUCSindex().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be counted
 *
 * Returns: the result of Encoding!(T).count(s)
 */
uint count(T)(invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.count(s);
}
|
|
|
|
/**
 * Returns the array index at which the (n+1)th code point begins.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * Supersedes:
 * This function supersedes std.utf.toUTFindex().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be indexed
 *    n = selects the code point: the index returned is where the (n+1)th
 *        code point of s begins
 *
 * Returns: the result of Encoding!(T).index(s,n)
 */
int index(T)(invariant(T)[] s,int n)
{
    alias Encoding!(T) Impl;
    return Impl.index(s, n);
}
|
|
|
|
/**
 * Decodes a single code point.
 *
 * This function removes one or more code units from the start of a string,
 * and returns the decoded code point which those code units represent. The
 * string is taken by reference so that the callee can advance it.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * Supersedes:
 * This function supersedes std.utf.decode(); note, however, that the
 * function codePoints() supersedes it more conveniently.
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string whose first code point is to be decoded
 *
 * Returns: the result of Encoding!(T).decode(s)
 */
dchar decode(T)(ref invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.decode(s);
}
|
|
|
|
/**
 * Encodes a single code point.
 *
 * This function encodes a single code point into one or more code units.
 * It returns a string containing those code units.
 *
 * The input to this function MUST be a valid code point.
 *
 * The type of the output cannot be deduced. Therefore, it is necessary to
 * explicitly specify the encoding as a template parameter.
 *
 * Supersedes:
 * This function supersedes std.utf.encode(); note, however, that the
 * function codeUnits() supersedes it more conveniently.
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    c = the code point to be encoded
 *
 * Returns: a slice of a freshly allocated four-element buffer, holding the
 * code units written by the buffer-taking overload of encode()
 */
invariant(T)[] encode(T)(dchar c)
{
    // Scratch space handed to the buffer-taking overload.
    auto scratch = new T[4];
    auto used = encode(c, scratch);

    // Keep only the code units actually written; the cast hands the buffer
    // to the caller as invariant data.
    return cast(invariant(T)[])(scratch[0 .. used]);
}
|
|
|
|
/**
 * Encodes a single code point into a user-supplied buffer.
 *
 * The input to this function MUST be a valid code point.
 *
 * The user-supplied buffer needs to be of type T[], where T is the encoding
 * type (currently one of Utf8, Utf16, Utf32, Ascii, Latin1 or Windows1252).
 * Note that Utf8, Utf16 and Utf32 are aliases for char, wchar and dchar
 * respectively. The in-contract requires the buffer to hold at least four
 * elements.
 *
 * Supersedes:
 * This function supersedes std.utf.encode(); note, however, that the
 * function codeUnits() supersedes it more conveniently.
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    c = the code point to be encoded
 *    buffer = where to store the output
 *
 * Returns: the result of Encoding!(T).encode(); the single-argument overload
 * of encode() uses it as the number of code units written to buffer
 */
uint encode(T)(dchar c,T[] buffer)
in
{
    // Room for four code units is demanded whatever the encoding.
    assert(buffer.length >= 4);
}
body
{
    alias Encoding!(T) Impl;
    return Impl.encode(cast(uint) c, buffer);
}
|
|
|
|
// CodePoints!(T) is shorthand for the type Encoding!(T).CodePoints.
// It is the declared return type of codePoints!(T)().
template CodePoints(T)
|
|
{
|
|
alias Encoding!(T).CodePoints CodePoints;
|
|
}
|
|
|
|
// CodeUnits!(T) is shorthand for the type Encoding!(T).CodeUnits.
// It is the declared return type of codeUnits!(T)().
template CodeUnits(T)
|
|
{
|
|
alias Encoding!(T).CodeUnits CodeUnits;
|
|
}
|
|
|
|
/**
 * Returns a foreachable struct which can bidirectionally iterate over all
 * code points in a string.
 *
 * The input to this function MUST be validly encoded. No contract is declared
 * on this wrapper itself; any check is left to Encoding!(T).
 *
 * You can foreach either with or without an index. If an index is specified,
 * it will be initialized at each iteration with the offset into the string at
 * which the code point begins.
 *
 * Supersedes:
 * This function supersedes std.utf.decode().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the string to be decoded
 *
 * Returns: the result of Encoding!(T).codePoints(s)
 *
 * Examples:
 * --------------------------------------------------------
 * string s = "hello world";
 * foreach(c;codePoints(s))
 * {
 *     // do something with c (which will always be a dchar)
 * }
 * --------------------------------------------------------
 *
 * Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
 * in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(T) codePoints(T)(invariant(T)[] s)
{
    alias Encoding!(T) Impl;
    return Impl.codePoints(s);
}
|
|
|
|
/**
 * Returns a foreachable struct which can bidirectionally iterate over all
 * code units in a code point.
 *
 * The input to this function MUST be a valid code point.
 *
 * The type of the output cannot be deduced. Therefore, it is necessary to
 * explicitly specify the encoding type in the template parameter.
 *
 * Supersedes:
 * This function supersedes std.utf.encode().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    d = the code point to be encoded
 *
 * Returns: the result of Encoding!(T).codeUnits(d)
 *
 * Examples:
 * --------------------------------------------------------
 * dchar d = '\u20AC';
 * foreach(c;codeUnits!(Utf8)(d))
 * {
 *     writefln("%X",c);
 * }
 * // will print
 * // E2
 * // 82
 * // AC
 * --------------------------------------------------------
 */
CodeUnits!(T) codeUnits(T)(dchar d)
{
    alias Encoding!(T) Impl;
    return Impl.codeUnits(d);
}
|
|
|
|
/**
 * Convert a string from one encoding to another. (See also to!() below).
 *
 * The input to this function MUST be validly encoded. No in-contract is
 * declared on this function itself.
 *
 * When source and destination encodings are the same type, r is simply set to
 * s. An Ascii source is reinterpreted as a char string and transcoded from
 * there. In every other case each code point of s is re-encoded and its code
 * units are appended to r one at a time.
 *
 * Supersedes:
 * This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 * std.utf.toUTF32()
 * (but note that to!() supersedes it more conveniently).
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    s = the source string
 *    r = the destination string (an out parameter, so whatever it held
 *        before the call is discarded)
 *
 * Examples:
 * --------------------------------------------------------
 * wstring ws;
 * transcode("hello world",ws);
 *     // transcode from UTF-8 to UTF-16
 *
 * EString!(Latin1) ls;
 * transcode(ws, ls);
 *     // transcode from UTF-16 to ISO-8859-1
 * --------------------------------------------------------
 */
void transcode(T,U)(invariant(T)[] s,out invariant(U)[] r)
{
    static if (is(T == U))
    {
        // Same encoding on both sides: hand back the input slice as is.
        r = s;
    }
    else static if (is(T == Ascii))
    {
        // Treat the Ascii data as a char string and transcode from there.
        transcode!(char, U)(cast(string) s, r);
    }
    else
    {
        // General case: decode every code point, re-encode it for U and
        // append the resulting code units in order.
        foreach (codePoint; codePoints(s))
            foreach (unit; codeUnits!(U)(codePoint))
                r ~= unit;
    }
}
|
|
|
|
/**
 * Convert a string from one encoding to another. (See also transcode() above).
 *
 * This is a convenience form of transcode() that returns the destination
 * string instead of writing it through an out parameter.
 *
 * The input to this function MUST be validly encoded. No in-contract is
 * declared on this function itself.
 *
 * Supersedes:
 * This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 * std.utf.toUTF32().
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *    U = the destination encoding type
 *    s = the source string
 *
 * Returns: the string produced by transcode(s, ...)
 *
 * Examples:
 * -----------------------------------------------------------------------------
 * auto ws = to!(Utf16)("hello world");  // transcode from UTF-8 to UTF-16
 * auto ls = to!(Latin1)(ws);            // transcode from UTF-16 to ISO-8859-1
 * -----------------------------------------------------------------------------
 */
invariant(U)[] to(U,T)(invariant(T)[] s)
{
    invariant(U)[] converted;
    transcode(s, converted);
    return converted;
}
|
|
|
|
// Helper functions
|
|
debug
|
|
{
|
|
// Debug-only counterpart of transcode() that walks the source backwards.
//
// The code points of s are visited with foreach_reverse, the code units of
// each code point are visited with foreach_reverse as well, and every code
// unit is prepended to r. The destination is therefore built from its last
// code unit towards its first.
//
// Params:
//    s = the source string
//    r = the destination string (an out parameter, so whatever it held
//        before the call is discarded)
void transcodeReverse(T,U)(invariant(T)[] s, out invariant(U)[] r)
{
    static if (is(T == U))
    {
        // Same encoding on both sides: the result is the input itself.
        // This must be an assignment to the out parameter (as transcode()
        // does); a "return s;" in this void function would leave r empty.
        r = s;
    }
    else static if (is(T == Ascii))
    {
        // Treat the Ascii data as a char string and transcode from there.
        transcodeReverse!(char, U)(cast(string) s, r);
    }
    else
    {
        foreach_reverse (codePoint; codePoints(s))
        {
            foreach_reverse (unit; codeUnits!(U)(codePoint))
            {
                // Prepend, because we are travelling from back to front.
                r = unit ~ r;
            }
        }
    }
}
|
|
|
|
// Renders a string as a double-quoted, ASCII-only literal for debug output.
//
// Code units from 0x20 up to (but not including) 0x80 are copied unchanged;
// every other code unit is written as \x followed by two upper-case hex
// digits. Quote and backslash characters in the input are not escaped.
string makeReadable(string s)
{
    string text = "\"";
    foreach (char unit; s)
    {
        bool copyVerbatim = unit >= 0x20 && unit < 0x80;
        if (!copyVerbatim)
        {
            // High nibble first, then low nibble.
            text ~= "\\x";
            text ~= toHexDigit(unit >> 4);
            text ~= toHexDigit(unit);
            continue;
        }
        text ~= unit;
    }
    text ~= "\"";
    return text;
}
|
|
|
|
// Renders a wstring as a double-quoted, ASCII-only literal with a trailing
// 'w' suffix, for debug output.
//
// Code units from 0x20 up to (but not including) 0x80 are copied unchanged;
// every other code unit is written as \u followed by four upper-case hex
// digits. Quote and backslash characters in the input are not escaped.
string makeReadable(wstring s)
{
    string text = "\"";
    foreach (wchar unit; s)
    {
        bool copyVerbatim = unit >= 0x20 && unit < 0x80;
        if (!copyVerbatim)
        {
            text ~= "\\u";
            // Four nibbles, most significant first.
            for (int shift = 12; shift >= 0; shift -= 4)
            {
                text ~= toHexDigit(unit >> shift);
            }
            continue;
        }
        text ~= unit;
    }
    text ~= "\"w";
    return text;
}
|
|
|
|
// Renders a dstring as a double-quoted, ASCII-only literal with a trailing
// 'd' suffix, for debug output.
//
// Code points from 0x20 up to (but not including) 0x80 are copied unchanged.
// Any other code point below 0x10000 is written as \u followed by four
// upper-case hex digits; everything else is written as \U00 followed by six
// upper-case hex digits. Quote and backslash characters in the input are not
// escaped.
string makeReadable(dstring s)
{
    string text = "\"";
    foreach (dchar point; s)
    {
        if (point >= 0x20 && point < 0x80)
        {
            text ~= point;
            continue;
        }

        // Pick the escape prefix and the position of the first nibble.
        int topShift;
        if (point < 0x10000)
        {
            text ~= "\\u";
            topShift = 12;      // four hex digits
        }
        else
        {
            text ~= "\\U00";
            topShift = 20;      // six hex digits after the fixed "00"
        }

        // Emit the nibbles, most significant first.
        for (int shift = topShift; shift >= 0; shift -= 4)
        {
            text ~= toHexDigit(point >> shift);
        }
    }
    text ~= "\"d";
    return text;
}
|
|
|
|
// Returns the upper-case hexadecimal digit for the low four bits of n.
// All higher bits of n are ignored.
char toHexDigit(int n)
{
    int nibble = n & 0xF;
    if (nibble < 10)
    {
        return cast(char)('0' + nibble);
    }
    return cast(char)('A' + (nibble - 10));
}
|
|
}
|