From 7f2e55613f13ce8450e716a1632bd20b88b7f85c Mon Sep 17 00:00:00 2001
From: Shin Fujishiro <rsinfu@gmail.com>
Date: Wed, 23 Jun 2010 03:02:15 +0000
Subject: [PATCH] [1.x] Fixed bugzilla 978: std.utf's toUTF* functions accept
 some invalid and reject some valid UTF.

* Fixed decode() to accept U+FFFE and U+FFFF.
* Changed some assert contracts (which check input for validity) to if-throw.
---
 std/utf.d | 245 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 201 insertions(+), 44 deletions(-)

diff --git a/std/utf.d b/std/utf.d
index b295ee5a9..f566c8c01 100644
--- a/std/utf.d
+++ b/std/utf.d
@@ -46,6 +46,8 @@ private import std.stdio;
 
 //debug=utf;            // uncomment to turn on debugging printf's
 
+
+
 deprecated class UtfError : Error
 {
     size_t idx; // index in string of where error occurred
@@ -72,6 +74,20 @@ class UtfException : Exception
     }
 }
 
+// For unittests
+private bool expectError_(lazy void expr)
+{
+    try
+    {
+        expr;
+    }
+    catch (UtfException e)
+    {
+        return true;
+    }
+    return false;
+}
+
 /*******************************
  * Test if c is a valid UTF-32 character.
  *
@@ -99,6 +115,16 @@ unittest
     debug(utf) printf("utf.isValidDchar.unittest\n");
     assert(isValidDchar(cast(dchar)'a') == true);
     assert(isValidDchar(cast(dchar)0x1FFFFF) == false);
+
+    assert(!isValidDchar(cast(dchar)0x00D800));
+    assert(!isValidDchar(cast(dchar)0x00DBFF));
+    assert(!isValidDchar(cast(dchar)0x00DC00));
+    assert(!isValidDchar(cast(dchar)0x00DFFF));
+    assert(isValidDchar(cast(dchar)0x00FFFE));
+    assert(isValidDchar(cast(dchar)0x00FFFF));
+    assert(isValidDchar(cast(dchar)0x01FFFF));
+    assert(isValidDchar(cast(dchar)0x10FFFF));
+    assert(!isValidDchar(cast(dchar)0x110000));
 }
 
 
@@ -398,6 +424,21 @@ unittest
     }
 }
 
+unittest
+{
+    size_t i;
+
+    i = 0; assert(decode("\xEF\xBF\xBE"c, i) == cast(dchar) 0xFFFE);
+    i = 0; assert(decode("\xEF\xBF\xBF"c, i) == cast(dchar) 0xFFFF);
+    assert(expectError_( decode("\xED\xA0\x80"c, i) ));
+    assert(expectError_( decode("\xED\xAD\xBF"c, i) ));
+    assert(expectError_( decode("\xED\xAE\x80"c, i) ));
+    assert(expectError_( decode("\xED\xAF\xBF"c, i) ));
+    assert(expectError_( decode("\xED\xB0\x80"c, i) ));
+    assert(expectError_( decode("\xED\xBE\x80"c, i) ));
+    assert(expectError_( decode("\xED\xBF\xBF"c, i) ));
+}
+
 /** ditto */
 
 dchar decode(wchar[] s, inout size_t idx)
@@ -436,12 +477,10 @@ dchar decode(wchar[] s, inout size_t idx)
             {   msg = "unpaired surrogate UTF-16 value";
                 goto Lerr;
             }
-            else if (u == 0xFFFE || u == 0xFFFF)
-            {   msg = "illegal UTF-16 value";
-                goto Lerr;
-            }
             else
                 i++;
+            // Note: u+FFFE and u+FFFF are specifically permitted by the
+            // Unicode standard for application internal use (see isValidDchar)
         }
         else
         {
@@ -455,6 +494,14 @@ dchar decode(wchar[] s, inout size_t idx)
         throw new UtfException(msg, i);
     }
 
+unittest
+{
+    size_t i;
+
+    i = 0; assert(decode([ cast(wchar) 0xFFFE ], i) == cast(dchar) 0xFFFE && i == 1);
+    i = 0; assert(decode([ cast(wchar) 0xFFFF ], i) == cast(dchar) 0xFFFF && i == 1);
+}
+
 /** ditto */
 
 dchar decode(dchar[] s, inout size_t idx)
@@ -484,16 +531,12 @@ dchar decode(dchar[] s, inout size_t idx)
  */
 
 void encode(inout char[] s, dchar c)
-    in
-    {
-        assert(isValidDchar(c));
-    }
-    body
     {
         char[] r = s;
 
         if (c <= 0x7F)
         {
+            assert(isValidDchar(c));
             r ~= cast(char) c;
         }
         else
@@ -503,12 +546,18 @@ void encode(inout char[] s, dchar c)
 
             if (c <= 0x7FF)
             {
+                assert(isValidDchar(c));
                 buf[0] = cast(char)(0xC0 | (c >> 6));
                 buf[1] = cast(char)(0x80 | (c & 0x3F));
                 L = 2;
             }
             else if (c <= 0xFFFF)
             {
+                if (0xD800 <= c && c <= 0xDFFF)
+                    throw new UtfException(
+                        "encoding a surrogate code point in UTF-8",
+                        s.length);
+                assert(isValidDchar(c));
                 buf[0] = cast(char)(0xE0 | (c >> 12));
                 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
                 buf[2] = cast(char)(0x80 | (c & 0x3F));
@@ -516,6 +565,7 @@ void encode(inout char[] s, dchar c)
             }
             else if (c <= 0x10FFFF)
             {
+                assert(isValidDchar(c));
                 buf[0] = cast(char)(0xF0 | (c >> 18));
                 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
                 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
@@ -524,7 +574,10 @@ void encode(inout char[] s, dchar c)
             }
             else
             {
-                assert(0);
+                assert(!isValidDchar(c));
+                throw new UtfException(
+                    "encoding an invalid code point in UTF-8",
+                    s.length);
             }
             r ~= buf[0 .. L];
         }
@@ -550,44 +603,112 @@ unittest
     assert(s == "abcda\xC2\xA9\xE2\x89\xA0");
 }
 
+unittest
+{
+    char[] buf;
+
+    encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000");
+    encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F");
+    encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080");
+    encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF");
+    encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800");
+    encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF");
+    encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000");
+    encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE");
+    encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF");
+    encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000");
+    encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF");
+
+    assert(expectError_( encode(buf, cast(dchar) 0xD800) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDBFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDC00) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDFFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0x110000) ));
+}
+
 /** ditto */
 
 void encode(inout wchar[] s, dchar c)
-    in
-    {
-        assert(isValidDchar(c));
-    }
-    body
     {
         wchar[] r = s;
 
         if (c <= 0xFFFF)
         {
+            if (0xD800 <= c && c <= 0xDFFF)
+                throw new UtfException(
+                    "encoding an isolated surrogate code point in UTF-16",
+                    s.length);
+            assert(isValidDchar(c));
             r ~= cast(wchar) c;
         }
-        else
+        else if (c <= 0x10FFFF)
         {
             wchar[2] buf;
 
+            assert(isValidDchar(c));
             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
             r ~= buf;
         }
+        else
+        {
+            assert(!isValidDchar(c));
+            throw new UtfException(
+                "encoding an invalid code point in UTF-16",
+                s.length);
+        }
         s = r;
     }
 
+unittest
+{
+    wchar[] buf;
+
+    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
+    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
+    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
+    encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE);
+    encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF);
+    encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000");
+    encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF");
+
+    assert(expectError_( encode(buf, cast(dchar) 0xD800) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDBFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDC00) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDFFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0x110000) ));
+}
+
 /** ditto */
 
 void encode(inout dchar[] s, dchar c)
-    in
     {
+        if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c)
+            throw new UtfException(
+                "encoding an invalid code point in UTF-32",
+                s.length);
         assert(isValidDchar(c));
-    }
-    body
-    {
         s ~= c;
     }
 
+unittest
+{
+    dchar[] buf;
+
+    encode(buf, '\u0000'); assert(buf[0] == '\u0000');
+    encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF');
+    encode(buf, '\uE000'); assert(buf[2] == '\uE000');
+    encode(buf, 0xFFFE ); assert(buf[3] == 0xFFFE);
+    encode(buf, 0xFFFF ); assert(buf[4] == 0xFFFF);
+    encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF');
+
+    assert(expectError_( encode(buf, cast(dchar) 0xD800) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDBFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDC00) ));
+    assert(expectError_( encode(buf, cast(dchar) 0xDFFF) ));
+    assert(expectError_( encode(buf, cast(dchar) 0x110000) ));
+}
+
 /* =================== Validation ======================= */
 
 /***********************************
@@ -635,25 +756,26 @@ void validate(dchar[] s)
 /* =================== Conversion to UTF8 ======================= */
 
 char[] toUTF8(char[4] buf, dchar c)
-    in
-    {
-        assert(isValidDchar(c));
-    }
-    body
     {
         if (c <= 0x7F)
         {
+            assert(isValidDchar(c));
             buf[0] = cast(char) c;
             return buf[0 .. 1];
         }
         else if (c <= 0x7FF)
         {
+            assert(isValidDchar(c));
             buf[0] = cast(char)(0xC0 | (c >> 6));
             buf[1] = cast(char)(0x80 | (c & 0x3F));
             return buf[0 .. 2];
         }
         else if (c <= 0xFFFF)
         {
+            if (0xD800 <= c && c <= 0xDFFF)
+                throw new UtfException(
+                    "encoding a surrogate code point in UTF-8", 0);
+            assert(isValidDchar(c));
             buf[0] = cast(char)(0xE0 | (c >> 12));
             buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             buf[2] = cast(char)(0x80 | (c & 0x3F));
@@ -661,26 +783,48 @@ char[] toUTF8(char[4] buf, dchar c)
         }
         else if (c <= 0x10FFFF)
         {
+            assert(isValidDchar(c));
             buf[0] = cast(char)(0xF0 | (c >> 18));
             buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
             buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
             buf[3] = cast(char)(0x80 | (c & 0x3F));
             return buf[0 .. 4];
         }
-        assert(0);
+        assert(!isValidDchar(c));
+        throw new UtfException(
+            "encoding an invalid code point in UTF-8", 0);
     }
 
+unittest
+{
+    char[4] buf;
+
+    assert(toUTF8(buf, '\u0000') == "\u0000");
+    assert(toUTF8(buf, '\u007F') == "\u007F");
+    assert(toUTF8(buf, '\u0080') == "\u0080");
+    assert(toUTF8(buf, '\u07FF') == "\u07FF");
+    assert(toUTF8(buf, '\u0800') == "\u0800");
+    assert(toUTF8(buf, '\uD7FF') == "\uD7FF");
+    assert(toUTF8(buf, '\uE000') == "\uE000");
+    assert(toUTF8(buf, 0xFFFE) == "\xEF\xBF\xBE");
+    assert(toUTF8(buf, 0xFFFF) == "\xEF\xBF\xBF");
+    assert(toUTF8(buf, '\U00010000') == "\U00010000");
+    assert(toUTF8(buf, '\U0010FFFF') == "\U0010FFFF");
+
+    assert(expectError_( toUTF8(buf, cast(dchar) 0xD800) ));
+    assert(expectError_( toUTF8(buf, cast(dchar) 0xDBFF) ));
+    assert(expectError_( toUTF8(buf, cast(dchar) 0xDC00) ));
+    assert(expectError_( toUTF8(buf, cast(dchar) 0xDFFF) ));
+    assert(expectError_( toUTF8(buf, cast(dchar) 0x110000) ));
+}
+
 /*******************
  * Encodes string s into UTF-8 and returns the encoded string.
  */
 
 char[] toUTF8(char[] s)
-    in
     {
         validate(s);
-    }
-    body
-    {
         return s;
     }
 
@@ -743,25 +887,46 @@ char[] toUTF8(dchar[] s)
 /* =================== Conversion to UTF16 ======================= */
 
 wchar[] toUTF16(wchar[2] buf, dchar c)
-    in
-    {
-        assert(isValidDchar(c));
-    }
-    body
     {
         if (c <= 0xFFFF)
         {
+            if (0xD800 <= c && c <= 0xDFFF)
+                throw new UtfException(
+                    "encoding an isolated surrogate code point in UTF-16", 0);
+            assert(isValidDchar(c));
             buf[0] = cast(wchar) c;
             return buf[0 .. 1];
         }
-        else
+        else if (c <= 0x10FFFF)
         {
             buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
             buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);
             return buf[0 .. 2];
         }
+        assert(!isValidDchar(c));
+        throw new UtfException(
+            "encoding an invalid code point in UTF-16", 0);
     }
 
+unittest
+{
+    wchar[2] buf;
+
+    assert(toUTF16(buf, '\u0000') == "\u0000");
+    assert(toUTF16(buf, '\uD7FF') == "\uD7FF");
+    assert(toUTF16(buf, '\uE000') == "\uE000");
+    assert(toUTF16(buf, 0xFFFE)[0] == 0xFFFE);
+    assert(toUTF16(buf, 0xFFFF)[0] == 0xFFFF);
+    assert(toUTF16(buf, '\U00010000') == "\U00010000");
+    assert(toUTF16(buf, '\U0010FFFF') == "\U0010FFFF");
+
+    assert(expectError_( toUTF16(buf, cast(dchar) 0xD800) ));
+    assert(expectError_( toUTF16(buf, cast(dchar) 0xDBFF) ));
+    assert(expectError_( toUTF16(buf, cast(dchar) 0xDC00) ));
+    assert(expectError_( toUTF16(buf, cast(dchar) 0xDFFF) ));
+    assert(expectError_( toUTF16(buf, cast(dchar) 0x110000) ));
+}
+
 /****************
  * Encodes string s into UTF-16 and returns the encoded string.
  * toUTF16z() is suitable for calling the 'W' functions in the Win32 API that take
@@ -822,12 +987,8 @@ wchar* toUTF16z(char[] s)
 /** ditto */
 
 wchar[] toUTF16(wchar[] s)
-    in
     {
         validate(s);
-    }
-    body
-    {
         return s;
     }
 
@@ -896,12 +1057,8 @@ dchar[] toUTF32(wchar[] s)
 /** ditto */
 
 dchar[] toUTF32(dchar[] s)
-    in
     {
         validate(s);
-    }
-    body
-    {
         return s;
     }