// Written in the D programming language. /** Classes and functions for handling and transcoding between various encodings. For cases where the _encoding is known at compile-time, functions are provided for arbitrary _encoding and decoding of characters, arbitrary transcoding between strings of different type, as well as validation and sanitization. Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1 (also known as LATIN-1), and WINDOWS-1252. $(UL $(LI The type $(D AsciiChar) represents an ASCII character.) $(LI The type $(D AsciiString) represents an ASCII string.) $(LI The type $(D Latin1Char) represents an ISO-8859-1 character.) $(LI The type $(D Latin1String) represents an ISO-8859-1 string.) $(LI The type $(D Windows1252Char) represents a Windows-1252 character.) $(LI The type $(D Windows1252String) represents a Windows-1252 string.)) For cases where the _encoding is not known at compile-time, but is known at run-time, we provide the abstract class $(D EncodingScheme) and its subclasses. To construct a run-time encoder/decoder, one does e.g. ---------------------------------------------------- auto e = EncodingScheme.create("utf-8"); ---------------------------------------------------- This library supplies $(D EncodingScheme) subclasses for ASCII, ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE. This library provides a mechanism whereby other modules may add $(D EncodingScheme) subclasses for any other _encoding. Macros: WIKI=Phobos/StdEncoding Copyright: Copyright Janice Caron 2008 - 2009. License: Boost License 1.0. Authors: Janice Caron Source: $(PHOBOSSRC std/_encoding.d) */ /* Copyright Janice Caron 2008 - 2009. Distributed under the Boost Software License, Version 1.0. 
(See accompanying file LICENSE_1_0.txt or copy at
         http://www.boost.org/LICENSE_1_0.txt)
*/
module std.encoding;

import std.string;
import std.traits;
import std.range;

// UTF-8 conformance tables: byte sequences that must validate, byte sequences
// that must be rejected, and (further down) the expected sanitized form of
// every rejected sequence.
unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are not
            permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here.
*/
        // The three-byte forms below are the actual encodings of U+FFFE and
        // U+FFFF (0xDF 0xBE / 0xDF 0xBF would be U+07FE / U+07FF).
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                    // First code
                                                    // point after
                                                    // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0,
0x80, 0x80, 0xAF ], [ 0xF8, 0x80, 0x80, 0x80, 0xAF ], [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ], // Maximum overlong sequences /* Below you see the highest Unicode value that is still resulting in an overlong sequence if represented with the given number of bytes. This is a boundary test for safe UTF-8 decoders. All five characters should be rejected like malformed UTF-8 sequences. */ [ 0xC1, 0xBF ], // U+0000007F [ 0xE0, 0x9F, 0xBF ], // U+000007FF [ 0xF0, 0x8F, 0xBF, 0xBF ], // U+0000FFFF [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U+001FFFFF [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF // Overlong representation of the NUL character /* The following five sequences should also be rejected like malformed UTF-8 sequences and should not be treated like the ASCII NUL character. */ [ 0xC0, 0x80 ], [ 0xE0, 0x80, 0x80 ], [ 0xF0, 0x80, 0x80, 0x80 ], [ 0xF8, 0x80, 0x80, 0x80, 0x80 ], [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ], // Illegal code positions /* The following UTF-8 sequences should be rejected like malformed sequences, because they never represent valid ISO 10646 characters and a UTF-8 decoder that accepts them might introduce security problems comparable to overlong UTF-8 sequences. 
*/ [ 0xED, 0xA0, 0x80 ], // U+D800 [ 0xED, 0xAD, 0xBF ], // U+DB7F [ 0xED, 0xAE, 0x80 ], // U+DB80 [ 0xED, 0xAF, 0xBF ], // U+DBFF [ 0xED, 0xB0, 0x80 ], // U+DC00 [ 0xED, 0xBE, 0x80 ], // U+DF80 [ 0xED, 0xBF, 0xBF ], // U+DFFF ]; static string[] sanitizedStrings = [ "\uFFFD","\uFFFD", "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ", " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ", "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ", " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD", "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD", "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ", " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ", " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD", "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD", "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD", ]; // Make sure everything that should be valid, is foreach(a;validStrings) { string s = cast(string)a; assert(isValid(s),"Failed to validate: "~makeReadable(s)); } // Make sure everything that shouldn't be valid, isn't foreach(a;invalidStrings) { string s = cast(string)a; assert(!isValid(s),"Incorrectly validated: "~makeReadable(s)); } // Make sure we can sanitize everything bad assert(invalidStrings.length == sanitizedStrings.length); for(int i=0; i= 0xA0 && c < 0x100)) return true; if (c >= 0xFFFD) return false; foreach(wchar d;charMap) { if (c == d) return true; } return false; } bool isValidCodeUnit(Windows1252Char c) { if (c < 0x80 || c >= 0xA0) return true; return (charMap[c-0x80] != 0xFFFD); } size_t encodedLength(dchar c) in { assert(canEncode(c)); } body { return 1; } void encodeViaWrite()(dchar c) { if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {} else if (c >= 0xFFFD) { c = '?'; } else { sizediff_t n = -1; foreach (i, wchar d; charMap) { if (c == d) { n = i; break; } } c = n == -1 ? '?' 
: 0x80 + cast(dchar) n; } write(cast(Windows1252Char)c); } void skipViaRead()() { read(); } dchar decodeViaRead()() { Windows1252Char c = read; return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c; } dchar safeDecodeViaRead()() { Windows1252Char c = read; dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c; return d == 0xFFFD ? INVALID_SEQUENCE : d; } dchar decodeReverseViaRead()() { Windows1252Char c = read; return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c; } EString replacementSequence() { return cast(EString)("?"); } mixin EncoderFunctions; } //============================================================================= // UTF-8 //============================================================================= template EncoderInstance(CharType : char) { alias char E; alias immutable(char)[] EString; string encodingName() { return "UTF-8"; } bool canEncode(dchar c) { return isValidCodePoint(c); } bool isValidCodeUnit(char c) { return (c < 0xC0 || (c >= 0xC2 && c < 0xF5)); } immutable ubyte[128] tailTable = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0, ]; private int tails(char c) in { assert(c >= 0x80); } body { return tailTable[c-0x80]; } size_t encodedLength(dchar c) in { assert(canEncode(c)); } body { if (c < 0x80) return 1; if (c < 0x800) return 2; if (c < 0x10000) return 3; return 4; } void encodeViaWrite()(dchar c) { if (c < 0x80) { write(cast(char)c); } else if (c < 0x800) { write(cast(char)((c >> 6) + 0xC0)); write(cast(char)((c & 0x3F) + 0x80)); } else if (c < 0x10000) { write(cast(char)((c >> 12) + 0xE0)); write(cast(char)(((c >> 6) & 0x3F) + 0x80)); write(cast(char)((c & 0x3F) + 0x80)); } else { write(cast(char)((c >> 18) + 0xF0)); write(cast(char)(((c >> 12) & 0x3F) + 0x80)); write(cast(char)(((c >> 6) & 0x3F) + 0x80)); 
write(cast(char)((c & 0x3F) + 0x80)); } } void skipViaRead()() { auto c = read; if (c < 0xC0) return; int n = tails(cast(char) c); for (size_t i=0; i 0xF4) // fail overlong 4-6-byte sequences || (c == 0xE0 && ((d & 0xE0) == 0x80)) // fail overlong 3-byte sequences || (c == 0xED && ((d & 0xE0) == 0xA0)) // fail surrogates || (c == 0xF0 && ((d & 0xF0) == 0x80)) // fail overlong 4-byte sequences || (c == 0xF4 && ((d & 0xF0) >= 0x90)) // fail code points > 0x10FFFF ); c &= (1 << (6 - n)) - 1; for (size_t i=0; i> 10))); write(cast(wchar)(0xDC00 + (n & 0x3FF))); } } void skipViaRead()() { wchar c = read; if (c < 0xD800 || c >= 0xE000) return; read(); } dchar decodeViaRead()() { wchar c = read; if (c < 0xD800 || c >= 0xE000) return cast(dchar)c; wchar d = read; c &= 0x3FF; d &= 0x3FF; return 0x10000 + (c << 10) + d; } dchar safeDecodeViaRead()() { wchar c = read; if (c < 0xD800 || c >= 0xE000) return cast(dchar)c; if (c >= 0xDC00) return INVALID_SEQUENCE; if (!canRead) return INVALID_SEQUENCE; wchar d = peek; if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE; d = read; c &= 0x3FF; d &= 0x3FF; return 0x10000 + (c << 10) + d; } dchar decodeReverseViaRead()() { wchar c = read; if (c < 0xD800 || c >= 0xE000) return cast(dchar)c; wchar d = read; c &= 0x3FF; d &= 0x3FF; return 0x10000 + (d << 10) + c; } EString replacementSequence() { return "\uFFFD"w; } mixin EncoderFunctions; } //============================================================================= // UTF-32 //============================================================================= template EncoderInstance(CharType : dchar) { alias dchar E; alias immutable(dchar)[] EString; string encodingName() { return "UTF-32"; } bool canEncode(dchar c) { return isValidCodePoint(c); } bool isValidCodeUnit(dchar c) { return isValidCodePoint(c); } size_t encodedLength(dchar c) in { assert(canEncode(c)); } body { return 1; } void encodeViaWrite()(dchar c) { write(c); } void skipViaRead()() { read(); } dchar decodeViaRead()() 
{
        return cast(dchar)read;
    }

    // Validating decode: a unit for which isValidCodePoint() is false is
    // reported as INVALID_SEQUENCE instead of being returned.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read;
        if (!isValidCodePoint(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Returns the single code unit that read yields, converted to dchar.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar)read;
    }

    // U+FFFD, as a UTF-32 string.
    EString replacementSequence()
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}

//=============================================================================
// Below are forwarding functions which expose the function to the user

/**
 Returns true if c is a valid code point

 Note that this includes the non-character code points U+FFFE and U+FFFF,
 since these are valid code points (even though they are not valid
 characters).

 Supersedes:
 This function supersedes $(D std.utf.startsValidDchar()).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested

 Returns: false for the surrogate range U+D800..U+DFFF and for any value of
 0x110000 or above; true otherwise.
 */
bool isValidCodePoint(dchar c)
{
    bool isSurrogate = c >= 0xD800 && c < 0xE000;
    return !isSurrogate && c < 0x110000;
}

/**
 Returns the name of an encoding.

 The type of encoding cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Examples:
 -----------------------------------
 assert(encodingName!(Latin1Char) == "ISO-8859-1");
 -----------------------------------
 */
string encodingName(T)()
{
    return EncoderInstance!(T).encodingName;
}

unittest
{
    // The three UTF encodings
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
    // The single-byte encodings
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Windows1252Char) == "windows-1252");
}

/**
 Returns true iff it is possible to represent the specified codepoint
 in the encoding.

 The type of encoding cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type.
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be tested

 Examples:
 -----------------------------------
 assert(canEncode!(Latin1Char)('A'));
 -----------------------------------
 */
bool canEncode(E)(dchar c)
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.canEncode(c);
}

unittest
{
    // U+00A0 is outside ASCII but inside ISO-8859-1
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert(canEncode!(Latin1Char)('\u00A0'));
    // Windows-1252: U+20AC is representable, U+20AD and U+FFFD are not
    assert(canEncode!(Windows1252Char)('\u20AC'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));
    // 0x110000 is not a valid code point
    assert(!canEncode!(char)(cast(dchar)0x110000));
}

/**
 Returns true if the code unit is legal. For example, the byte 0x80 would
 not be legal in ASCII, because ASCII code units must always be in the range
 0x00 to 0x7F.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.isValidCodeUnit(c);
}

unittest
{
    assert(!isValidCodeUnit(cast(AsciiChar)0xA0));
    assert( isValidCodeUnit(cast(Windows1252Char)0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char)0x81));
    assert(!isValidCodeUnit(cast(char)0xC0));
    assert(!isValidCodeUnit(cast(char)0xFF));
    // 0xD800 is accepted as a wchar code unit but rejected as a dchar one
    assert( isValidCodeUnit(cast(wchar)0xD800));
    assert(!isValidCodeUnit(cast(dchar)0xD800));
}

/**
 Returns true if the string is encoded correctly

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    // Valid exactly when the longest valid prefix is the whole string.
    return validLength(s) == s.length;
}

unittest
{
    assert(isValid("\u20AC100"));
}

/**
 Returns the length of the longest possible substring, starting from
 the first code unit, which is validly encoded.
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be tested

 Returns: the number of leading code units of s that decode without
 producing INVALID_SEQUENCE.
 */
size_t validLength(E)(const(E)[] s)
{
    size_t consumed = 0;
    while (s.length > 0)
    {
        // safeDecode advances s; the difference in length is the size of
        // the sequence it just removed.
        size_t remainingBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;
        consumed += remainingBefore - s.length;
    }
    return consumed;
}

/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original,
 otherwise it constructs a new string by replacing all illegal code unit
 sequences with the encoding's replacement character. Invalid sequences will
 be replaced with the Unicode replacement character (U+FFFD) if the
 character repertoire contains it, otherwise invalid sequences will be
 replaced with '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sanitized

 Returns: s itself when it is entirely valid, otherwise a newly allocated
 string.
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    size_t validPrefix = validLength(s);
    if (validPrefix == s.length)
        return s;

    auto replacement = EncoderInstance!(E).replacementSequence;

    // Pass 1: compute an upper bound for the size of the result.
    // Overestimating is not a problem
    size_t capacity = s.length;
    const(E)[] rest = s[validPrefix..$];
    while (rest.length != 0)
    {
        // rest always starts on an invalid sequence here; safeDecode
        // removes it.
        dchar c = EncoderInstance!(E).safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        capacity += replacement.length;
        rest = rest[validLength(rest)..$];
    }

    // Pass 2: copy the valid runs, writing the replacement sequence in place
    // of every invalid sequence.
    E[] buffer = new E[capacity];
    buffer[0..validPrefix] = s[0..validPrefix];
    size_t pos = validPrefix;
    rest = s[validPrefix..$];
    while (rest.length != 0)
    {
        dchar c = EncoderInstance!(E).safeDecode(rest);
        assert(c == INVALID_SEQUENCE);
        buffer[pos..pos+replacement.length] = replacement[];
        pos += replacement.length;

        size_t run = validLength(rest);
        buffer[pos..pos+run] = rest[0..run];
        pos += run;
        rest = rest[run..$];
    }
    return cast(immutable(E)[])buffer[0..pos];
}

unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}

/**
 Returns the length of the first encoded sequence.

 The input to this function MUST be validly encoded.
This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced

 Returns: the number of code units that skipping one sequence removes from
 the front of s.
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Decode a copy so that the check does not consume s itself.
    const(E)[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    const(E)[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}

unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
}

/**
 Returns the length of the last encoded sequence.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be sliced

 Returns: the number of code units that one reverse decode removes from s.
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    size_t lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}

unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
}

/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 Params: s = the string to be counted */ sizediff_t index(E)(const(E)[] s,int n) in { assert(isValid(s)); assert(n >= 0); } body { const(E)[] t = s; for (size_t i=0; i> 6))); range.put(cast(char)(0x80 | (c & 0x3F))); return 2; } if (c <= 0xFFFF) { range.put(cast(char)(0xE0 | (c >> 12))); range.put(cast(char)(0x80 | ((c >> 6) & 0x3F))); range.put(cast(char)(0x80 | (c & 0x3F))); return 3; } if (c <= 0x10FFFF) { range.put(cast(char)(0xF0 | (c >> 18))); range.put(cast(char)(0x80 | ((c >> 12) & 0x3F))); range.put(cast(char)(0x80 | ((c >> 6) & 0x3F))); range.put(cast(char)(0x80 | (c & 0x3F))); return 4; } else { assert(0); } } else static if (is(Unqual!E == wchar)) { if (c <= 0xFFFF) { r.put(cast(wchar) c); return 1; } r.put(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800)); r.put(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00)); return 2; } else static if (is(Unqual!E == dchar)) { r.put(c); return 1; } else { assert(0); } } /** Encodes a single code point to a delegate. This function encodes a single code point into one or more code units. The code units are passed one at a time to the supplied delegate. The input to this function MUST be a valid code point. This is enforced by the function's in-contract. The type of the output cannot be deduced. Therefore, it is necessary to explicitly specify the encoding as a template parameter. Supercedes: This function supercedes std.utf.encode(), however, note that the function codeUnits() supercedes it more conveniently. Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 Params: c = the code point to be encoded */ void encode(E)(dchar c, void delegate(E) dg) in { assert(isValidCodePoint(c)); } body { EncoderInstance!(E).encode(c,dg); } /** Returns a foreachable struct which can bidirectionally iterate over all code points in a string. The input to this function MUST be validly encoded. This is enforced by the function's in-contract. 
You can foreach either
 with or without an index. If an index is specified, it will be initialized
 at each iteration with the offset into the string at which the code point
 begins.

 Supersedes:
 This function supersedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be decoded

 Examples:
 --------------------------------------------------------
 string s = "hello world";
 foreach(c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    assert(isValid(s));
}
body
{
    return CodePoints!(E)(s);
}

unittest
{
    // Iterating the code points of an ASCII string and re-assembling them
    // must reproduce the string.
    string source = "hello";
    string rebuilt;
    foreach (codePoint; codePoints(source))
    {
        rebuilt ~= cast(char)codePoint;
    }
    assert(source == rebuilt);
}

/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding type in the template parameter.

 Supersedes:
 This function supersedes std.utf.encode().
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Examples:
 --------------------------------------------------------
 dchar d = '\u20AC';
 foreach(c;codeUnits!(char)(d))
 {
     writefln("%X",c);
 }
 // will print
 // E2
 // 82
 // AC
 --------------------------------------------------------
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    return CodeUnits!(E)(c);
}

unittest
{
    // U+20AC is E2 82 AC in UTF-8
    char[] a;
    foreach(c;codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        a ~= c;
    }
    assert(a.length == 3);
    assert(a[0] == 0xE2);
    assert(a[1] == 0x82);
    assert(a[2] == 0xAC);
}

/**
 Transcodes the string $(D s) into code units of type $(D Tgt) and writes
 them to the output range $(D range).

 Each code point is decoded from $(D s) before it is re-encoded, so a source
 in which one code point occupies several code units (UTF-8, UTF-16) or in
 which code unit values differ from code point values is converted
 correctly. As with transcode(), the input is expected to be validly
 encoded.

 Params:
    s = the source string
    range = the output range which receives the encoded code units

 Returns: the number of $(D Tgt) code units written.
 */
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;
    // decode() advances the slice it is given, so work on a local copy.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}

/**
 Convert a string from one encoding to another. (See also to!() below).

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supersedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the source string
    r = the destination string

 Examples:
 --------------------------------------------------------
 wstring ws;
 transcode("hello world",ws);
     // transcode from UTF-8 to UTF-16

 Latin1String ls;
 transcode(ws, ls);
     // transcode from UTF-16 to ISO-8859-1
 --------------------------------------------------------
 */
void transcode(Src,Dst)(immutable(Src)[] s,out immutable(Dst)[] r)
in
{
    assert(isValid(s));
}
body
{
    static if(is(Src==Dst))
    {
        // Same encoding on both sides: hand back the source unchanged.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        // An ASCII source is reinterpreted as a UTF-8 string and
        // transcoded from there.
        transcode!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // General case: decode one code point at a time and append its
        // encoding in the destination type.
        const(Src)[] t = s;
        while (t.length != 0)
        {
            r ~= encode!(Dst)(decode(t));
        }
    }
}

/*
 Convert a string from one encoding to another. (See also transcode() above).
The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    Dst = the destination encoding type
    s = the source string

 Examples:
 -----------------------------------------------------------------------------
 auto ws = to!(wchar)("hello world");  // transcode from UTF-8 to UTF-16
 auto ls = to!(Latin1Char)(ws);            // transcode from UTF-16 to ISO-8859-1
 -----------------------------------------------------------------------------
 */
// TODO: Commented out for now - to be moved to std.conv
// Dst to(Dst,Src)(immutable(Src)[] s)
// in
// {
//  assert(isValid(s));
// }
// body
// {
//  Dst r;
//  transcode(s,r);
//  return r;
// }

//=============================================================================

/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Constructs the exception with the message msg.
    this(string msg)
    {
        super(msg);
    }
}

// Specialization of EncodingException. Its constructor is private, so it can
// only be constructed from inside this module.
// NOTE(review): no throw site is visible in this part of the file.
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg)
    {
        super(msg);
    }
}

/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
*
     * Params:
     *    className = the fully qualified name of the subclass
     *
     * Throws: EncodingException if no class of that name can be found or
     * an instance of it cannot be created as an EncodingScheme.
     *
     * Examples:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register("path.to.Amiga1251");
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(string className)
    {
        auto scheme = instantiateScheme(className);
        // Make the class reachable under every name it reports, ignoring case.
        foreach(encodingName;scheme.names())
        {
            supported[tolower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
     *
     * Params:
     *    encodingName = the name of the encoding; looked up
     *                   case-insensitively
     *
     * Throws: EncodingException if the name has not been registered or the
     * registered class cannot be instantiated.
     *
     * Examples:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        auto p = std.string.tolower(encodingName) in supported;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        string className = *p;
        return instantiateScheme(className);
    }

    // Creates an instance of the class called className and returns it as an
    // EncodingScheme. ClassInfo.find() yields null for an unknown class name,
    // so that case is checked explicitly rather than dereferenced; both
    // failure modes are reported as an EncodingException.
    private static EncodingScheme instantiateScheme(string className)
    {
        auto info = ClassInfo.find(className);
        if (info is null)
            throw new EncodingException("Unable to create class "~className);
        auto scheme = cast(EncodingScheme)info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
*
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *    buffer = the user-supplied buffer which receives the ubytes
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     *
     * Returns: false as soon as safeDecode() reports INVALID_SEQUENCE,
     * true if the whole array is consumed without that happening.
     */
    bool isValid(const(ubyte)[] s)
    {
        // safeDecode() consumes the front of s on every call.
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
*
     * Params:
     *    s = the array to be tested
     */
    size_t validLength(const(ubyte)[] s)
    {
        size_t total = s.length;
        // Number of ubytes still unread after the last successful decode.
        size_t unreadAfterLastGood = total;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                break;
            unreadAfterLastGood = s.length;
        }
        return total - unreadAfterLastGood;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize(immutable(ubyte)[] s)
    {
        auto validPrefix = validLength(s);
        if (validPrefix == s.length)
            return s;

        auto replacement = replacementSequence;

        // Pass 1: compute an upper bound for the size of the result.
        // Overestimating is not a problem
        auto capacity = s.length;
        const(ubyte)[] rest = s[validPrefix..$];
        while (rest.length != 0)
        {
            // rest always starts on an invalid sequence here; safeDecode
            // removes it.
            dchar c = safeDecode(rest);
            assert(c == INVALID_SEQUENCE);
            capacity += replacement.length;
            rest = rest[validLength(rest)..$];
        }

        // Pass 2: copy the valid runs, writing the replacement sequence in
        // place of every invalid sequence.
        ubyte[] buffer = new ubyte[capacity];
        buffer[0..validPrefix] = s[0..validPrefix];
        auto pos = validPrefix;
        rest = s[validPrefix..$];
        while (rest.length != 0)
        {
            dchar c = safeDecode(rest);
            assert(c == INVALID_SEQUENCE);
            buffer[pos..pos+replacement.length] = replacement[];
            pos += replacement.length;

            auto run = validLength(rest);
            buffer[pos..pos+run] = rest[0..run];
            pos += run;
            rest = rest[run..$];
        }
        return cast(immutable(ubyte)[])buffer[0..pos];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        // Decode a copy so that the check does not consume s itself.
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        auto lengthBefore = s.length;
        decode(s);
        return lengthBefore - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
* This is enforced by the function's in-contract. * * Params: * s = the string to be counted */ size_t count(const(ubyte)[] s) in { assert(isValid(s)); } body { size_t n = 0; while (s.length != 0) { decode(s); ++n; } return n; } /** * Returns the array index at which the (n+1)th code point begins. * * The input to this function MUST be validly encoded. * This is enforced by the function's in-contract. * * Params: * s = the string to be counted */ sizediff_t index(const(ubyte)[] s, size_t n) in { assert(isValid(s)); assert(n >= 0); } body { const(ubyte)[] t = s; for (size_t i=0; i= 0x20 && c < 0x80) { r ~= c; } else { r ~= "\\x"; r ~= toHexDigit(c >> 4); r ~= toHexDigit(c); } } r ~= "\""; return r; } string makeReadable(wstring s) { string r = "\""; foreach(wchar c;s) { if (c >= 0x20 && c < 0x80) { r ~= cast(char) c; } else { r ~= "\\u"; r ~= toHexDigit(c >> 12); r ~= toHexDigit(c >> 8); r ~= toHexDigit(c >> 4); r ~= toHexDigit(c); } } r ~= "\"w"; return r; } string makeReadable(dstring s) { string r = "\""; foreach(dchar c; s) { if (c >= 0x20 && c < 0x80) { r ~= cast(char) c; } else if (c < 0x10000) { r ~= "\\u"; r ~= toHexDigit(c >> 12); r ~= toHexDigit(c >> 8); r ~= toHexDigit(c >> 4); r ~= toHexDigit(c); } else { r ~= "\\U00"; r ~= toHexDigit(c >> 20); r ~= toHexDigit(c >> 16); r ~= toHexDigit(c >> 12); r ~= toHexDigit(c >> 8); r ~= toHexDigit(c >> 4); r ~= toHexDigit(c); } } r ~= "\"d"; return r; } char toHexDigit(int n) { return "0123456789ABCDEF"[n & 0xF]; } }