phobos/std/utf.d

// Written in the D programming language.

/**
 * Encode and decode UTF-8, UTF-16 and UTF-32 strings.
 *
 * For Win32 systems, the C wchar_t type is UTF-16 and corresponds to the D
 * wchar type.
 * For linux systems, the C wchar_t type is UTF-32 and corresponds to
 * the D dchar type.
 *
 * UTF character support is restricted to (\u0000 &lt;= character &lt;= \U0010FFFF).
 *
 * See_Also:
 *  $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br>
 *  $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br>
 *  $(LINK http://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335)
 * Macros:
 *  WIKI = Phobos/StdUtf
 *
 * Copyright: Copyright Digital Mars 2000 - 2010.
 * License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
 * Authors:   $(WEB digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/_utf.d)
 */
/*          Copyright Digital Mars 2000 - 2010.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.utf;

import std.conv;       // to, assumeUnique
import std.exception;  // enforce, assumeUnique
import std.range;      // walkLength
import std.traits;     // isSomeChar, isSomeString

//debug=utf;           // uncomment to turn on debugging printf's

debug (utf) import core.stdc.stdio : printf;


/**********************************
 * Exception class that is thrown upon any errors.
 *
 * In addition to the message, an instance can record up to four values of
 * the offending sequence; toString() renders them in hexadecimal in front
 * of the message.
 */
class UtfException : Exception
{
    //size_t idx;   /// index in string of where error occurred
    uint[4] sequence;   // recorded values; only the first len are meaningful
    size_t len;         // number of values stored in sequence

    /* Params:
     *  s    = the error message
     *  data = values of the offending sequence; at most sequence.length
     *         of them are recorded, any surplus is ignored
     */
    this(string s, dchar[] data...)
    {
        // Clamp to the capacity of the fixed-size buffer so that an
        // over-long argument list can never index past the end of sequence
        // (and so that toString() never reads past it either).
        len = data.length < sequence.length ? data.length : sequence.length;
        foreach (i, e; data[0 .. len]) sequence[i] = e;
        super(s);
    }

    /* Builds "Invalid UTF sequence: XX YY - message"; the sequence part is
     * omitted when nothing was recorded, the message part when it is empty.
     */
    override string toString()
    {
        string result;

        if (len > 0)
        {
            result = "Invalid UTF sequence:";
            foreach (i; 0 .. len)
                result ~= " " ~ to!string(sequence[i], 16);
        }
        if (super.msg.length > 0)
        {
            if (result.length > 0)
                result ~= " - ";
            result ~= super.msg;
        }

        return result;
    }
}

// For unittests
version (unittest) private
{
    /* Evaluates the lazy expression and reports whether doing so raised a
     * UtfException. Anything else that is thrown propagates to the caller.
     */
    @trusted
    bool expectError_(lazy void expr)
    {
        bool caught = false;

        try
            expr();
        catch (UtfException)
            caught = true;

        return caught;
    }
}


/*******************************
 * Checks whether c is a valid UTF-32 character: it must not exceed
 * 0x10FFFF and must not lie in the surrogate range 0xD800 .. 0xDFFF.
 *
 * \uFFFE and \uFFFF are considered valid by this function,
 * as they are permitted for internal use by an application,
 * but they are not allowed for interchange by the Unicode standard.
 *
 * Returns: true if c is valid, false if not.
 */
@safe
pure nothrow bool isValidDchar(dchar c)
{
    /* Note: FFFE and FFFF are specifically permitted by the
     * Unicode standard for application internal use, but are not
     * allowed for interchange, hence they are not excluded here.
     * (thanks to Arcane Jill)
     */
    immutable bool isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    immutable bool inRange = c <= 0x10FFFF;

    return inRange && !isSurrogate;
}

// Exercises isValidDchar() at the boundaries of the valid ranges.
unittest
{
    debug(utf) printf("utf.isValidDchar.unittest\n");
    // An ASCII character is valid, a value above 0x10FFFF is not
    assert(isValidDchar(cast(dchar)'a') == true);
    assert(isValidDchar(cast(dchar)0x1FFFFF) == false);

    // First/last high surrogate and first/last low surrogate are rejected
    assert(!isValidDchar(cast(dchar)0x00D800));
    assert(!isValidDchar(cast(dchar)0x00DBFF));
    assert(!isValidDchar(cast(dchar)0x00DC00));
    assert(!isValidDchar(cast(dchar)0x00DFFF));
    // xFFFE / xFFFF are accepted
    assert(isValidDchar(cast(dchar)0x00FFFE));
    assert(isValidDchar(cast(dchar)0x00FFFF));
    assert(isValidDchar(cast(dchar)0x01FFFF));
    // Upper limit of the accepted range and the first value beyond it
    assert(isValidDchar(cast(dchar)0x10FFFF));
    assert(!isValidDchar(cast(dchar)0x110000));
}


@safe pure
{

// Lookup table indexed by the first byte of a UTF-8 sequence; each entry is
// the length of that sequence in bytes. The value 0xFF marks bytes that
// cannot start a sequence: 0x80 .. 0xBF and the bytes 0xFE and 0xFF.
// Layout: 0x00 .. 0x7F -> 1, 0xC0 .. 0xDF -> 2, 0xE0 .. 0xEF -> 3,
// 0xF0 .. 0xF7 -> 4, 0xF8 .. 0xFB -> 5, 0xFC .. 0xFD -> 6.
private immutable ubyte[256] UTF8stride =
[
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];

/**
 * stride() returns the length of a UTF-8 sequence starting at index $(D_PARAM i)
 * in string $(D_PARAM s). Only the byte $(D s[i]) is inspected; the
 * length is taken from a lookup table keyed by that byte.
 * Returns:
 *  The number of bytes in the UTF-8 sequence.
 * Throws:
 *  UtfException if s[i] is not the start of the UTF-8 sequence.
 */
uint stride(in char[] s, size_t i)
{
    // A table entry of 0xFF flags a byte that cannot begin a sequence
    immutable width = UTF8stride[s[i]];
    if (width != 0xFF)
        return width;

    throw new UtfException("Not the start of the UTF-8 sequence");
}

/**
 * strideBack() returns the length of a UTF-8 sequence ending before index $(D_PARAM i)
 * in string $(D_PARAM s). It walks backwards over at most four bytes
 * looking for the first one that is not a continuation byte (10xxxxxx).
 * Returns:
 *  The number of bytes in the UTF-8 sequence.
 * Throws:
 *  UtfException if s[i-1] is not the end of the UTF-8 sequence.
 */
uint strideBack(in char[] s, size_t i)
{
    foreach (uint back; 1 .. 5)
    {
        // Stop at the nearest byte that is not of the form 10xxxxxx
        if (i >= back && (s[i - back] & 0b1100_0000) != 0b1000_0000)
            return back;
    }

    throw new UtfException("Not the end of the UTF sequence");
}

/**
 * stride() returns the length of a UTF-16 sequence starting at index $(D_PARAM i)
 * in string $(D_PARAM s).
 * Returns: 2 if $(D s[i]) is a high surrogate (0xD800 .. 0xDBFF), otherwise 1.
 */
nothrow uint stride(in wchar[] s, size_t i)
{
    immutable wchar unit = s[i];
    immutable bool isHighSurrogate = 0xD800 <= unit && unit <= 0xDBFF;

    return isHighSurrogate ? 2 : 1;
}

/**
 * strideBack() returns the length of a UTF-16 sequence ending before index $(D_PARAM i)
 * in string $(D_PARAM s).
 * Returns:
 *  2 if $(D s[i-1]) is a low surrogate that is preceded by a high
 *  surrogate, otherwise 1.
 * Throws:
 *  UtfException if $(D_PARAM i) is 0 or if $(D s[i-1]) is a high surrogate,
 *  which can never end a sequence.
 */
uint strideBack(in wchar[] s, size_t i)
{
    if (i == 0)
        throw new UtfException("Not the end of the UTF-16 sequence");

    immutable last = s[i - 1];
    if (last >= 0xD800 && last <= 0xDBFF)
        throw new UtfException("Not the end of the UTF-16 sequence");

    // Only a low surrogate can be the second half of a pair. Without this
    // check an ordinary code unit that merely follows a stray high
    // surrogate would be reported as a two-unit sequence.
    if (i <= 1 || last < 0xDC00 || last > 0xDFFF)
        return 1;

    immutable c = s[i - 2];
    return 1 + (c >= 0xD800 && c <= 0xDBFF);
}

/**
 * stride() returns the length of a UTF-32 sequence starting at index $(D_PARAM i)
 * in string $(D_PARAM s). Neither argument is inspected.
 * Returns: The return value will always be 1.
 */
nothrow uint stride(in dchar[] s, size_t i)
{
    enum uint unitsPerSequence = 1;
    return unitsPerSequence;
}

/**
 * strideBack() returns the length of a UTF-32 sequence ending before index $(D_PARAM i)
 * in string $(D_PARAM s). Neither argument is inspected.
 * Returns: The return value will always be 1.
 */
nothrow uint strideBack(in dchar[] s, size_t i)
{
    enum uint unitsPerSequence = 1;
    return unitsPerSequence;
}

}  // stride functions are @safe and pure


@safe pure
{

/*******************************************
 * Given an index $(D_PARAM i) into an array of characters $(D_PARAM s[]),
 * and assuming that index $(D_PARAM i) is at the start of a UTF character,
 * determine the number of UCS characters up to that index $(D_PARAM i).
 *
 * Throws: UtfException if stepping through the sequences overshoots
 * $(D_PARAM i), or if stride() rejects a byte on the way.
 */
size_t toUCSindex(in char[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    // Hop from sequence start to sequence start, counting the hops
    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Landing beyond i means i was not on a sequence boundary
    if (pos > i)
        throw new UtfException("1invalid UTF-8 sequence");

    return count;
}

/// ditto
size_t toUCSindex(in wchar[] s, size_t i)
{
    size_t count = 0;
    size_t pos = 0;

    // Hop from sequence start to sequence start, counting the hops
    while (pos < i)
    {
        pos += stride(s, pos);
        ++count;
    }

    // Landing beyond i means i pointed into the middle of a surrogate pair
    if (pos > i)
        throw new UtfException("2invalid UTF-16 sequence");

    return count;
}

/// ditto
nothrow size_t toUCSindex(in dchar[] s, size_t i)
{
    // The index is returned as is; s is not inspected
    immutable size_t ucsIndex = i;
    return ucsIndex;
}


/******************************************
 * Given a UCS index $(D_PARAM n) into an array of characters $(D_PARAM s[]),
 * return the UTF index.
 *
 * Throws: UtfException if a byte that cannot start a UTF-8 sequence is
 * found where a sequence start is expected.
 */
size_t toUTFindex(in char[] s, size_t n)
{
    size_t offset = 0;

    // Skip n sequences, using the width implied by each lead byte
    for (; n != 0; --n)
    {
        immutable uint width = UTF8stride[s[offset]];
        if (width == 0xFF)
            throw new UtfException("3invalid UTF-8 sequence ", s[offset]);
        offset += width;
    }

    return offset;
}

/// ditto
nothrow size_t toUTFindex(in wchar[] s, size_t n)
{
    size_t offset = 0;

    // Skip n sequences; a high surrogate starts a two-unit sequence
    for (; n != 0; --n)
    {
        immutable wchar unit = s[offset];
        immutable bool isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;

        offset += isHighSurrogate ? 2 : 1;
    }

    return offset;
}

/// ditto
nothrow size_t toUTFindex(in dchar[] s, size_t n)
{
    // The index is returned as is; s is not inspected
    immutable size_t utfIndex = n;
    return utfIndex;
}

}  // toUTF and toUCS index functions are @safe and pure


/* =================== Decode ======================= */

@trusted  // I think those functions should be @safe and pure.
{

/***************
 * Decodes and returns character starting at s[idx]. $(D_PARAM idx) is
 * advanced past the decoded character. If the character is not well formed,
 * a $(D UtfException) is thrown and $(D_PARAM idx) remains unchanged.
 */
// UTF-8 overload. Works on a local copy of the index (i) and only writes
// idx back on success, so idx is untouched when an error is thrown.
// An out-of-range idx is reported through enforce() rather than through a
// UtfException.
dchar decode(in char[] s, ref size_t idx)
out (result)
{
    assert(isValidDchar(result));
}
body
{
    enforce(idx < s.length, "Attempted to decode past the end of a string");

    size_t len = s.length;
    dchar V;
    size_t i = idx;
    char u = s[i];

    if (u & 0x80)
    {
        /* The following encodings are valid, except for the 5 and 6 byte
         * combinations:
         *  0xxxxxxx
         *  110xxxxx 10xxxxxx
         *  1110xxxx 10xxxxxx 10xxxxxx
         *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
         */
        // Count the leading 1 bits of the first byte; that count (n) is the
        // total length of the sequence in bytes.
        uint n = 1;
        for (; ; n++)
        {
            if (n > 4)
                goto Lerr;      // only do the first 4 of 6 encodings
            if (((u << n) & 0x80) == 0)
            {
                // n == 1 means the byte is 10xxxxxx, i.e. a continuation
                // byte, which cannot start a sequence
                if (n == 1)
                    goto Lerr;
                break;
            }
        }

        // Pick off (7 - n) significant bits of B from first byte of octet
        V = cast(dchar)(u & ((1 << (7 - n)) - 1));

        if (i + n > len)
            goto Lerr;          // off end of string

        /* The following combinations are overlong, and illegal:
         *  1100000x (10xxxxxx)
         *  11100000 100xxxxx (10xxxxxx)
         *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
         *  11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
         *  11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
         */
        // s[i + 1] is in bounds here: n >= 2 and i + n <= len was checked above
        auto u2 = s[i + 1];
        if ((u & 0xFE) == 0xC0 ||
            (u == 0xE0 && (u2 & 0xE0) == 0x80) ||
            (u == 0xF0 && (u2 & 0xF0) == 0x80) ||
            (u == 0xF8 && (u2 & 0xF8) == 0x80) ||
            (u == 0xFC && (u2 & 0xFC) == 0x80))
            goto Lerr;          // overlong combination

        // Fold the 6 payload bits of every continuation byte into V
        foreach (j; 1 .. n)
        {
            u = s[i + j];
            if ((u & 0xC0) != 0x80)
                goto Lerr;          // trailing bytes are 10xxxxxx
            V = (V << 6) | (u & 0x3F);
        }
        // Rejects surrogates and values above 0x10FFFF
        if (!isValidDchar(V))
            goto Lerr;
        i += n;
    }
    else
    {
        // Single byte with the high bit clear: the byte is the code point
        V = cast(dchar)u;
        i++;
    }

    // Success: only now is the caller's index advanced
    idx = i;
    return V;

  Lerr:
    //printf("\ndecode: idx = %d, i = %d, length = %d s = \n'%.*s'\n%x\n"
    //"'%.*s'\n", idx, i, s.length, s, s[i], s[i .. $]);
    // The message contains the whole input as bytes plus the index at which
    // decoding started (i is never advanced before reaching this label).
    throw new UtfException(text("dchar decode(in char[], ref size_t): "
                    "Invalid UTF-8 sequence ", cast(const ubyte[])s,
                    " around index ", i));
}

// Exercises the UTF-8 decode(): 1-, 2- and 3-byte sequences decode to the
// expected code point and advance the index, while truncated and overlong
// inputs throw a UtfException.
unittest
{
    size_t i;
    dchar c;

    debug(utf) printf("utf.decode.unittest\n");

    // Plain ASCII: one byte per character
    static string s1 = "abcd";
    i = 0;
    c = decode(s1, i);
    assert(c == cast(dchar)'a');
    assert(i == 1);
    c = decode(s1, i);
    assert(c == cast(dchar)'b');
    assert(i == 2);

    // Two-byte sequence
    static string s2 = "\xC2\xA9";
    i = 0;
    c = decode(s2, i);
    assert(c == cast(dchar)'\u00A9');
    assert(i == 2);

    // Three-byte sequence
    static string s3 = "\xE2\x89\xA0";
    i = 0;
    c = decode(s3, i);
    assert(c == cast(dchar)'\u2260');
    assert(i == 3);

    // Inputs that must all be rejected
    static string[] s4 = [
        "\xE2\x89",     // too short
        "\xC0\x8A",
        "\xE0\x80\x8A",
        "\xF0\x80\x80\x8A",
        "\xF8\x80\x80\x80\x8A",
        "\xFC\x80\x80\x80\x80\x8A",
    ];

    for (int j = 0; j < s4.length; j++)
    {
        try
        {
            i = 0;
            c = decode(s4[j], i);
            assert(0);
        }
        catch (UtfException u)
        {
            // 23 is just a marker proving that the catch block ran
            i = 23;
            delete u;
        }

        assert(i == 23);
    }
}

// UTF-8 decode(): xFFFE and xFFFF are accepted, whereas three-byte
// sequences that would decode to a surrogate (xD800 .. xDFFF) are rejected.
unittest
{
    size_t i;

    i = 0; assert(decode("\xEF\xBF\xBE"c, i) == cast(dchar)0xFFFE);
    i = 0; assert(decode("\xEF\xBF\xBF"c, i) == cast(dchar)0xFFFF);
    // i is reset once and then reused for every call below
    i = 0;
    assert(expectError_( decode("\xED\xA0\x80"c, i) ));
    assert(expectError_( decode("\xED\xAD\xBF"c, i) ));
    assert(expectError_( decode("\xED\xAE\x80"c, i) ));
    assert(expectError_( decode("\xED\xAF\xBF"c, i) ));
    assert(expectError_( decode("\xED\xB0\x80"c, i) ));
    assert(expectError_( decode("\xED\xBE\x80"c, i) ));
    assert(expectError_( decode("\xED\xBF\xBF"c, i) ));
}

/// ditto
dchar decode(in wchar[] s, ref size_t idx)
out (result)
{
    assert(isValidDchar(result));
}
body
{
    enforce(idx < s.length, "Attempted to decode past the end of a string");

    // idx is only written on success, so it is unchanged when we throw
    immutable size_t start = idx;
    immutable uint first = s[start];

    // Anything outside the surrogate block is a complete code point.
    // Note: u+FFFE and u+FFFF are specifically permitted by the
    // Unicode standard for application internal use (see isValidDchar)
    if (first < 0xD800 || first > 0xDFFF)
    {
        idx = start + 1;
        return cast(dchar)first;
    }

    // A low surrogate must not appear without a preceding high surrogate
    if (first >= 0xDC00)
        throw new UtfException("unpaired surrogate UTF-16 value", s[start]);

    // From here on first is a high surrogate and needs a partner
    if (start + 1 == s.length)
        throw new UtfException("surrogate UTF-16 high value past end of string", s[start]);

    immutable uint second = s[start + 1];
    if (second < 0xDC00 || second > 0xDFFF)
        throw new UtfException("surrogate UTF-16 low value out of range", s[start]);

    // Combine the pair: 0xD7C0 is 0xD800 - (0x10000 >> 10)
    immutable uint combined = ((first - 0xD7C0) << 10) + (second - 0xDC00);

    idx = start + 2;
    return cast(dchar)combined;
}

// UTF-16 decode(): xFFFE and xFFFF are accepted as single code units and
// advance the index by one.
unittest
{
    size_t i;

    i = 0; assert(decode([ cast(wchar)0xFFFE ], i) == cast(dchar)0xFFFE && i == 1);
    i = 0; assert(decode([ cast(wchar)0xFFFF ], i) == cast(dchar)0xFFFF && i == 1);
}


/// ditto
dchar decode(in dchar[] s, ref size_t idx)
{
    enforce(idx < s.length, "Attempted to decode past the end of a string");

    immutable dchar c = s[idx];

    // UTF-32 needs no decoding, only validation of the single element
    if (isValidDchar(c))
    {
        ++idx;
        return c;
    }

    // idx has not been modified at this point
    throw new UtfException("5invalid UTF-32 value", c);
}

}  // Decode functions are @trusted


/* =================== Encode ======================= */

@safe // pure  @@@NOTE@@@ unittest is a function. Currently, unittest is affected by applying attributes.
{

/*******************************
 * Encodes character $(D_PARAM c) into fixed-size array $(D_PARAM s).
 * Returns the actual length of the encoded character (a number between 1 and
 * 4 for $(D char[4]) buffers, and between 1 and 2 for $(D wchar[2]) buffers).
 *
 * Throws: UtfException if $(D_PARAM c) is a surrogate code point or is
 * greater than 0x10FFFF.
 */
pure size_t encode(ref char[4] buf, dchar c)
{
    // Weed out everything that cannot be encoded before touching buf
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw new UtfException("encoding an invalid code point in UTF-8", c);
    }
    if (c >= 0xD800 && c <= 0xDFFF)
        throw new UtfException("encoding a surrogate code point in UTF-8", c);

    assert(isValidDchar(c));

    size_t count;
    if (c < 0x80)
    {
        // 0xxxxxxx
        buf[0] = cast(char)c;
        count = 1;
    }
    else if (c < 0x800)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c < 0x10000)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }

    return count;
}

// encode() into a char[4] buffer: checks the returned length and the bytes
// written at each length boundary, then checks that surrogates and a value
// above x10FFFF are rejected.
unittest
{
    char[4] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F");
    assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080");
    assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF");
    assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800");
    assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF");
    assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000");
    assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE");
    assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF");
    assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF");

    // Values that must be rejected
    assert(expectError_( encode(buf, cast(dchar)0xD800) ));
    assert(expectError_( encode(buf, cast(dchar)0xDBFF) ));
    assert(expectError_( encode(buf, cast(dchar)0xDC00) ));
    assert(expectError_( encode(buf, cast(dchar)0xDFFF) ));
    assert(expectError_( encode(buf, cast(dchar)0x110000) ));
}


/// Ditto
pure size_t encode(ref wchar[2] buf, dchar c)
{
    // Weed out everything that cannot be encoded before touching buf
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw new UtfException("encoding an invalid code point in UTF-16", c);
    }
    if (c >= 0xD800 && c <= 0xDFFF)
        throw new UtfException("encoding an isolated surrogate code point in UTF-16", c);

    assert(isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Above 0xFFFF: split the 20-bit offset into a surrogate pair
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return 2;
    }

    // Up to 0xFFFF: the code point is stored as a single code unit
    buf[0] = cast(wchar)c;
    return 1;
}

// encode() into a wchar[2] buffer: single-unit values, surrogate pairs, and
// rejection of surrogates and of a value above x10FFFF.
unittest
{
    wchar[2] buf;

    assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000");
    assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF");
    assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000");
    assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE);
    assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF);
    assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000");
    assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF");

    // Values that must be rejected
    assert(expectError_( encode(buf, cast(dchar)0xD800) ));
    assert(expectError_( encode(buf, cast(dchar)0xDBFF) ));
    assert(expectError_( encode(buf, cast(dchar)0xDC00) ));
    assert(expectError_( encode(buf, cast(dchar)0xDFFF) ));
    assert(expectError_( encode(buf, cast(dchar)0x110000) ));
}


/*******************************
 * Encodes character $(D_PARAM c) and appends it to array $(D_PARAM s[]).
 *
 * Throws: UtfException if $(D_PARAM c) is a surrogate code point or is
 * greater than 0x10FFFF; $(D_PARAM s) is left unchanged in that case.
 */
pure void encode(ref char[] s, dchar c)
{
    // Fast path: ASCII is appended directly
    if (c <= 0x7F)
    {
        assert(isValidDchar(c));
        s ~= cast(char)c;
        return;
    }

    // Validation happens before anything is appended
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw new UtfException("encoding an invalid code point in UTF-8", c);
    }
    if (c >= 0xD800 && c <= 0xDFFF)
        throw new UtfException("encoding a surrogate code point in UTF-8", c);

    assert(isValidDchar(c));

    char[4] units;
    size_t count;

    if (c < 0x800)
    {
        // 110xxxxx 10xxxxxx
        units[0] = cast(char)(0xC0 | (c >> 6));
        units[1] = cast(char)(0x80 | (c & 0x3F));
        count = 2;
    }
    else if (c < 0x10000)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xE0 | (c >> 12));
        units[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[2] = cast(char)(0x80 | (c & 0x3F));
        count = 3;
    }
    else
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        units[0] = cast(char)(0xF0 | (c >> 18));
        units[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        units[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        units[3] = cast(char)(0x80 | (c & 0x3F));
        count = 4;
    }

    s ~= units[0 .. count];
}

// Appending encode() for char[]: each call grows the array by the encoded
// length of the character (1, 2 and 3 bytes here).
unittest
{
    debug(utf) printf("utf.encode.unittest\n");

    char[] s = "abcd".dup;
    encode(s, cast(dchar)'a');
    assert(s.length == 5);
    assert(s == "abcda");

    encode(s, cast(dchar)'\u00A9');
    assert(s.length == 7);
    assert(s == "abcda\xC2\xA9");
    //assert(s == "abcda\u00A9");   // BUG: fix compiler

    encode(s, cast(dchar)'\u2260');
    assert(s.length == 10);
    assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
}

// Appending encode() for char[] at the length boundaries. The slice offsets
// are cumulative: every call appends to the same buffer, so each assertion
// starts where the previous character ended.
unittest
{
    char[] buf;

    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");

    // Values that must be rejected
    assert(expectError_( encode(buf, cast(dchar)0xD800) ));
    assert(expectError_( encode(buf, cast(dchar)0xDBFF) ));
    assert(expectError_( encode(buf, cast(dchar)0xDC00) ));
    assert(expectError_( encode(buf, cast(dchar)0xDFFF) ));
    assert(expectError_( encode(buf, cast(dchar)0x110000) ));
}

/// ditto
pure void encode(ref wchar[] s, dchar c)
{
    // Validation happens before anything is appended, so s is unchanged
    // whenever an exception is thrown
    if (c > 0x10FFFF)
    {
        assert(!isValidDchar(c));
        throw new UtfException("encoding an invalid code point in UTF-16", c);
    }
    if (c >= 0xD800 && c <= 0xDFFF)
        throw new UtfException("encoding an isolated surrogate code point in UTF-16", c);

    assert(isValidDchar(c));

    if (c <= 0xFFFF)
    {
        // Up to 0xFFFF: a single code unit
        s ~= cast(wchar)c;
        return;
    }

    // Above 0xFFFF: append a surrogate pair built from the 20-bit offset
    immutable uint offset = c - 0x10000;
    wchar[2] pair;
    pair[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
    pair[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
    s ~= pair;
}

// Appending encode() for wchar[]: single units first, then two surrogate
// pairs; the indices are cumulative because every call appends to buf.
unittest
{
    wchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");

    // Values that must be rejected
    assert(expectError_( encode(buf, cast(dchar)0xD800) ));
    assert(expectError_( encode(buf, cast(dchar)0xDBFF) ));
    assert(expectError_( encode(buf, cast(dchar)0xDC00) ));
    assert(expectError_( encode(buf, cast(dchar)0xDFFF) ));
    assert(expectError_( encode(buf, cast(dchar)0x110000) ));
}

/// ditto
pure void encode(ref dchar[] s, dchar c)
{
    // UTF-32 stores the code point itself, so only validation is needed
    immutable bool isSurrogate = c >= 0xD800 && c <= 0xDFFF;
    immutable bool tooLarge = c > 0x10FFFF;

    if (isSurrogate || tooLarge)
        throw new UtfException("encoding an invalid code point in UTF-32", c);

    assert(isValidDchar(c));
    s ~= c;
}

// Appending encode() for dchar[]: every accepted value occupies exactly one
// element; surrogates and a value above x10FFFF are rejected.
unittest
{
    dchar[] buf;

    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');

    // Values that must be rejected
    assert(expectError_( encode(buf, cast(dchar)0xD800) ));
    assert(expectError_( encode(buf, cast(dchar)0xDBFF) ));
    assert(expectError_( encode(buf, cast(dchar)0xDC00) ));
    assert(expectError_( encode(buf, cast(dchar)0xDFFF) ));
    assert(expectError_( encode(buf, cast(dchar)0x110000) ));
}

}  // Encode functions are @safe and pure


/++
    Returns the number of code units that are required to encode the code point
    $(D c) when $(D C) is the character type used to encode it.

Examples:
------
assert(codeLength!char('a') == 1);
assert(codeLength!wchar('a') == 1);
assert(codeLength!dchar('a') == 1);

assert(codeLength!char('\U0010FFFF') == 4);
assert(codeLength!wchar('\U0010FFFF') == 2);
assert(codeLength!dchar('\U0010FFFF') == 1);
------
  +/
@safe
pure nothrow ubyte codeLength(C)(dchar c)
{
    // The encoding is selected by the size of the code unit type C
    static if (C.sizeof == 1)
    {
        // UTF-8: 1 to 4 code units depending on the magnitude of c
        if (c <= 0x7F)
            return 1;
        if (c <= 0x7FF)
            return 2;
        if (c <= 0xFFFF)
            return 3;
        if (c <= 0x10FFFF)
            return 4;

        // Values above 0x10FFFF are not supported
        assert(false);
    }
    else static if (C.sizeof == 2)
    {
        // UTF-16: values above 0xFFFF take two code units
        if (c <= 0xFFFF)
            return 1;
        return 2;
    }
    else
    {
        // UTF-32: always a single code unit
        static assert(C.sizeof == 4);
        return 1;
    }
}

//Verify Examples.
// Mirrors the examples in the documentation of codeLength() so that they
// are checked whenever the unit tests run.
unittest
{
    assert(codeLength!char('a') == 1);
    assert(codeLength!wchar('a') == 1);
    assert(codeLength!dchar('a') == 1);

    assert(codeLength!char('\U0010FFFF') == 4);
    assert(codeLength!wchar('\U0010FFFF') == 2);
    assert(codeLength!dchar('\U0010FFFF') == 1);
}


/* =================== Validation ======================= */

/***********************************
 * Checks to see if string is well formed or not. $(D S) can be an array
 * of $(D char), $(D wchar), or $(D dchar). Throws a $(D UtfException)
 * if it is not. Use to check all untrusted input for correctness.
 *
 * The check is performed by decoding the string from start to end and
 * discarding the decoded characters.
 */
@safe
void validate(S)(in S s) if (isSomeString!S)
{
    immutable end = s.length;
    size_t pos = 0;

    // decode() either advances pos past one character or throws
    while (pos < end)
        decode(s, pos);
}


/* =================== Conversion to UTF8 ======================= */

@trusted
{

// Encodes the single character c as UTF-8 into buf and returns the slice
// of buf that holds the encoded bytes (1 to 4 of them). c is expected to
// satisfy isValidDchar(); this is only checked by the in contract.
char[] toUTF8(out char[4] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    size_t used;

    if (c <= 0x7F)
    {
        // 0xxxxxxx
        buf[0] = cast(char)c;
        used = 1;
    }
    else if (c <= 0x7FF)
    {
        // 110xxxxx 10xxxxxx
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        used = 2;
    }
    else if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        used = 3;
    }
    else if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[3] = cast(char)(0x80 | (c & 0x3F));
        used = 4;
    }
    else
        assert(0);      // excluded by the in contract

    return buf[0 .. used];
}


/*******************
 * Encodes string $(D_PARAM s) into UTF-8 and returns the encoded string.
 * For UTF-8 input this amounts to validating $(D_PARAM s) and returning
 * an immutable copy of it.
 */
string toUTF8(in char[] s)
{
    // Throws if s is not well formed
    validate(s);

    string copy = s.idup;
    return copy;
}

/// ditto
string toUTF8(in wchar[] s)
{
    immutable size_t total = s.length;
    char[] result;
    result.length = total;

    // Fast path: copy the leading run of ASCII code units one to one
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char)s[pos];
        ++pos;
    }

    // Slow path: from the first non-ASCII unit on, decode and re-encode
    if (pos < total)
    {
        result.length = pos;
        while (pos < total)
            encode(result, decode(s, pos));
    }

    return result.assumeUnique();
}

/// ditto
pure string toUTF8(in dchar[] s)
{
    immutable size_t total = s.length;
    char[] result;
    result.length = total;

    // Fast path: copy the leading run of ASCII characters one to one
    size_t pos = 0;
    while (pos < total && s[pos] <= 0x7F)
    {
        result[pos] = cast(char)s[pos];
        ++pos;
    }

    // Slow path: from the first non-ASCII character on, encode each one
    if (pos < total)
    {
        result.length = pos;
        foreach (dchar d; s[pos .. total])
            encode(result, d);
    }

    return result.assumeUnique();
}


/* =================== Conversion to UTF16 ======================= */

// Encodes the single character c as UTF-16 into buf and returns the slice
// of buf that holds the encoded code units (1 or 2 of them). c is expected
// to satisfy isValidDchar(); this is only checked by the in contract.
pure wchar[] toUTF16(ref wchar[2] buf, dchar c)
in
{
    assert(isValidDchar(c));
}
body
{
    if (c > 0xFFFF)
    {
        // Split the offset from 0x10000 into a high/low surrogate pair
        immutable uint offset = c - 0x10000;
        buf[0] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return buf[0 .. 2];
    }

    buf[0] = cast(wchar)c;
    return buf[0 .. 1];
}

/****************
 * Encodes string $(D_PARAM s) into UTF-16 and returns the encoded string.
 * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
 * an LPWSTR or LPCWSTR argument; it appends a terminating zero and returns
 * a pointer to the first code unit.
 */
wstring toUTF16(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    result.length = total;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];
        if (lead <= 0x7F)
        {
            // ASCII maps to a single code unit of the same value
            result ~= cast(wchar)lead;
            ++pos;
        }
        else
        {
            // decode() advances pos past the whole sequence
            encode(result, decode(s, pos));
        }
    }

    return result.assumeUnique();  // ok because result is unique
}

/// ditto
const(wchar)* toUTF16z(in char[] s)
{
    immutable size_t total = s.length;
    wchar[] result;

    result.length = total + 1;
    result.length = 0;

    size_t pos = 0;
    while (pos < total)
    {
        immutable char lead = s[pos];
        if (lead <= 0x7F)
        {
            // ASCII maps to a single code unit of the same value
            result ~= cast(wchar)lead;
            ++pos;
        }
        else
        {
            // decode() advances pos past the whole sequence
            encode(result, decode(s, pos));
        }
    }

    // Terminating zero expected by C-style wide string APIs
    result ~= "\000";

    return result.ptr;
}

/// ditto
wstring toUTF16(in wchar[] s)
{
    // Already UTF-16: validate (throws if malformed) and hand back a copy
    validate(s);

    wstring copy = s.idup;
    return copy;
}

/// ditto
pure wstring toUTF16(in dchar[] s)
{
    wchar[] result;

    result.length = s.length;
    result.length = 0;

    // encode() validates each character and appends one or two code units
    foreach (dchar c; s)
        encode(result, c);

    return result.assumeUnique();  // ok because result is unique
}


/* =================== Conversion to UTF32 ======================= */

/*****
 * Encodes string $(D_PARAM s) into UTF-32 and returns the encoded string.
 * Malformed input is reported by the decode() call used for every
 * non-ASCII code unit.
 */
dstring toUTF32(in char[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    result.length = total;      // the result never has more elements than s

    size_t written = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];
        if (c < 0x80)
            ++pos;                  // ASCII, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence
        result[written] = c;
        ++written;
    }

    return result[0 .. written].assumeUnique(); // legit because it's unique
}

/// ditto
dstring toUTF32(in wchar[] s)
{
    immutable size_t total = s.length;
    dchar[] result;
    result.length = total;      // the result never has more elements than s

    size_t written = 0;
    size_t pos = 0;
    while (pos < total)
    {
        dchar c = s[pos];
        if (c < 0x80)
            ++pos;                  // ASCII, no need for decode
        else
            c = decode(s, pos);     // advances pos past the sequence
        result[written] = c;
        ++written;
    }

    return result[0 .. written].assumeUnique();  // legit because it's unique
}

/// ditto
dstring toUTF32(in dchar[] s)
{
    // Already UTF-32: validate (throws if malformed) and hand back a copy
    validate(s);

    dstring copy = s.idup;
    return copy;
}

} // Convert functions are @trusted


/* =================== toUTFz ======================= */

/++
    Returns a C-style 0-terminated string equivalent to $(D s). $(D s) must not
    contain embedded $(D '\0')'s as any C function will treat the first
    $(D '\0') that it sees as the end of the string. If $(D s) is $(D null) or
    empty, then a string containing only $(D '\0') is returned.

    $(D toUTFz) accepts any type of string and is templated on the type of
    character pointer that you wish to convert to. It will avoid allocating a
    new string if it can, but there's a decent chance that it will end up having
    to allocate a new string - particularly when dealing with character types
    other than $(D char).

    $(RED Warning:) When passing a character pointer to a C function, and the C
    function keeps it around for any reason, make sure that you keep a reference
    to it in your D code. Otherwise, it may go away during a garbage collection
    cycle and cause a nasty bug when the C code tries to use it.

    Examples:
--------------------
auto p1 = toUTFz!(char*)("hello world");
auto p2 = toUTFz!(const(char)*)("hello world");
auto p3 = toUTFz!(immutable(char)*)("hello world");
auto p4 = toUTFz!(char*)("hello world"d);
auto p5 = toUTFz!(const(wchar)*)("hello world");
auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
--------------------
  +/
P toUTFz(P, S)(S str)
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       is(Unqual!(typeof(*P.init)) == Unqual!(typeof(str[0]))) &&
       is(immutable(Unqual!(typeof(str[0]))) == typeof(str[0])))
{
    // Overload for immutable input whose character type already matches the
    // requested pointer's character type (string -> char*, wstring ->
    // const(wchar)*, ...). This is the only overload that may return str's
    // own storage instead of a copy.
    alias Unqual!(typeof(str[0])) C;

    // Empty or null input: always hand back a freshly allocated terminator
    // of the right character width. Returning a pointer to the char literal
    // "" is not good enough: for wchar/dchar targets only a single zero
    // byte is guaranteed there, and for a mutable P the caller would be
    // given a writable pointer into literal storage.
    if(str.empty)
        return toUTFz!(P, const(C)[])(cast(const(C)[])str);

    //If the P is mutable, then we have to make a copy.
    static if(is(Unqual!(typeof(*P.init)) == typeof(*P.init)))
        return toUTFz!(P, const(C)[])(cast(const(C)[])str);
    else
    {
        immutable p = str.ptr + str.length;

        // Peek past end of str, if it's 0, no conversion necessary.
        // Note that the compiler will put a 0 past the end of static
        // strings, and the storage allocator will put a 0 past the end
        // of newly allocated char[]'s.
        // Is p dereferenceable? A simple test: if the p points to an
        // address multiple of 4, then conservatively assume the pointer
        // might be pointing to a new block of memory, which might be
        // unreadable. Otherwise, it's definitely pointing to valid
        // memory.
        if((cast(size_t)p & 3) && *p == '\0')
            return str.ptr;

        // No terminator could be confirmed in place: fall back to copying.
        return toUTFz!(P, const(C)[])(cast(const(C)[])str);
    }
}

// Overload for mutable or const input whose character type matches the
// requested pointer's character type. The input is always copied into a new
// array one element longer than str, and the extra element is set to '\0'.
// The other same-width overload delegates here whenever it needs a copy.
P toUTFz(P, S)(S str)
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       is(Unqual!(typeof(*P.init)) == Unqual!(typeof(str[0]))) &&
       !is(immutable(Unqual!(typeof(str[0]))) == typeof(str[0])))
{
    alias Unqual!(typeof(*P.init)) OutChar;

    auto copy = new OutChar[](str.length + 1);
    copy[0 .. $ - 1] = str[];
    copy[$ - 1] = '\0';

    // copy is freshly allocated and referenced nowhere else, so casting its
    // pointer to whatever qualifier P asks for is harmless.
    return cast(P)copy.ptr;
}

// Overload for input whose character type differs from the requested
// pointer's character type. str is iterated as dchar and each one is put
// into an appender of the target character type, followed by a '\0'.
P toUTFz(P, S)(S str)
    if(isSomeString!S && isPointer!P && isSomeChar!(typeof(*P.init)) &&
       !is(Unqual!(typeof(*P.init)) == Unqual!(typeof(str[0]))))
{
    auto retval = appender!(typeof(*P.init)[])();

    foreach(dchar c; str)
        retval.put(c);
    retval.put('\0');

    return cast(P)retval.data.ptr;
}

// Verify Examples.
// The statements below repeat, line for line, the Examples section of
// toUTFz's documentation comment. The test only checks that every
// instantiation shown there compiles and runs; the returned pointers are
// not inspected. Keep the two lists in sync when editing either one.
unittest
{
    auto p1 = toUTFz!(char*)("hello world");
    auto p2 = toUTFz!(const(char)*)("hello world");
    auto p3 = toUTFz!(immutable(char)*)("hello world");
    auto p4 = toUTFz!(char*)("hello world"d);
    auto p5 = toUTFz!(const(wchar)*)("hello world");
    auto p6 = toUTFz!(immutable(dchar)*)("hello world"w);
}

// Exercises toUTFz across combinations of source string type and target
// pointer type, checking both the converted contents and the terminator.
unittest
{
    import core.exception;
    import std.algorithm;
    import std.metastrings;
    import std.typetuple;

    // Counts the code units in front of the first '\0' found at ptr
    // (a strlen that works for any character width).
    size_t zeroLen(C)(const(C)* ptr)
    {
        size_t len = 0;

        while(*ptr != '\0')
        {
            ++ptr;
            ++len;
        }

        return len;
    }

    // Same-width conversions: immutable source, every qualifier of the
    // matching pointer type.
    foreach(S; TypeTuple!(string, wstring, dstring))
    {
        alias Unqual!(typeof(S.init[0])) C;

        auto s1 = to!S("hello\U00010143\u0100\U00010143");

        // s2 has the same contents as s1, but lives in a buffer whose slot
        // just past the end was written with '\n' before the length was
        // reduced by one, so s2 is not followed by '\0'.
        // NOTE(review): presumably this is meant to force toUTFz down its
        // copying path rather than returning s2.ptr directly - confirm.
        auto temp = new C[](s1.length + 1);
        temp[0 .. $ - 1] = s1[0 .. $];
        temp[$ - 1] = '\n';
        --temp.length;
        auto s2 = assumeUnique(temp);
        assert(s1 == s2);

        foreach(P; TypeTuple!(C*, const(C)*, immutable(C)*))
        {
            // The result must reproduce the source and be followed
            // directly by the terminator.
            auto p1 = toUTFz!P(s1);
            assert(p1[0 .. s1.length] == s1);
            assert(p1[s1.length] == '\0');

            auto p2 = toUTFz!P(s2);
            assert(p2[0 .. s2.length] == s2);
            assert(p2[s2.length] == '\0');
        }
    }

    // Converts s with toUTFz!P, measures the result up to its terminator
    // with zeroLen, and throws an AssertError carrying the caller's line
    // number when cmp reports that the result differs from s.
    void test(P, S)(S s, size_t line = __LINE__)
    {
        auto p = toUTFz!P(s);
        immutable len = zeroLen(p);
        enforce(cmp(s, p[0 .. len]) == 0,
                new AssertError(Format!("Unit test failed: %s %s", P.stringof, S.stringof),
                                __FILE__, line));
    }

    // string source -> wchar and dchar pointers.
    foreach(P; TypeTuple!(wchar*, const(wchar)*, immutable(wchar)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143");
    }

    // wstring source -> char and dchar pointers.
    foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
                          dchar*, const(dchar)*, immutable(dchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"w);
    }

    // dstring source -> char and wchar pointers.
    foreach(P; TypeTuple!(char*, const(char)*, immutable(char)*,
                          wchar*, const(wchar)*, immutable(wchar)*))
    {
        test!P("hello\U00010143\u0100\U00010143"d);
    }

    // Mutable and const sources of every width -> every pointer type.
    foreach(S; TypeTuple!(char[], wchar[], dchar[],
                          const(char)[], const(wchar)[], const(dchar)[]))
    {
        auto s = to!S("hello\U00010143\u0100\U00010143");

        foreach(P; TypeTuple!(char*, wchar*, dchar*,
                              const(char)*, const(wchar)*, const(dchar)*,
                              immutable(char)*, immutable(wchar)*, immutable(dchar)*))
        {
            test!P(s);
        }
    }
}


/* ================================ tests ================================== */

unittest
{
    debug(utf) printf("utf.toUTF.unittest\n");

    // Converts between all three encodings, starting from the UTF-8 form,
    // and checks every intermediate result against the expected text in the
    // matching encoding. Later steps reuse the output of earlier ones, so a
    // wrong conversion also shows up downstream.
    void roundTrip(string narrow, wstring wide, dstring quad)
    {
        // From UTF-8.
        wstring w = toUTF16(narrow);
        assert(w == wide);
        dstring d = toUTF32(narrow);
        assert(d == quad);

        // From UTF-16.
        string c = toUTF8(w);
        assert(c == narrow);
        d = toUTF32(w);
        assert(d == quad);

        // From UTF-32.
        c = toUTF8(d);
        assert(c == narrow);
        w = toUTF16(d);
        assert(w == wide);
    }

    // ASCII only.
    roundTrip("hello", "hello", "hello");

    // A code point inside the Basic Multilingual Plane.
    roundTrip("hel\u1234o", "hel\u1234o", "hel\u1234o");

    // A code point beyond the Basic Multilingual Plane.
    roundTrip("he\U0010AAAAllo", "he\U0010AAAAllo", "he\U0010AAAAllo");
}


/**
 * Counts the code points encoded in a string.
 *
 * The string must already be validly encoded; this function does not
 * check it beforehand.
 *
 * Supersedes: This function supersedes $(D std.utf.toUCSindex()).
 *
 * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
 *
 * Params:
 *  s = the string whose code points are counted
 *
 * Returns:
 *  the number of code points in $(D s)
 */
@trusted
size_t count(E)(const(E)[] s) if (isSomeChar!E)
{
    static if (E.sizeof >= 4)
    {
        // Four-byte characters: the array length is the answer.
        return s.length;
    }
    else
    {
        // Narrower characters: walk s as a range and count its elements.
        return walkLength(s);
    }
}

unittest
{
    // Each input is paired with the code point total expected for it; the
    // last entry starts with a character outside the ASCII range.
    string[] inputs = ["", "a", "abc", "\u20AC100"];
    size_t[] totals = [0, 1, 3, 4];

    foreach (idx, text; inputs)
        assert(count(text) == totals[idx]);
}