fix utf8-decode

+ fixed decoding of 3 bytes unicode codepoints + ((ch1 & 0x1F) << 12) to ((ch1 & 0x3F) << 12) + refactored code to be able to make simple unittest + added unittests for utf8 decoding
2015-12-30 15:28:02 +01:00 · 2015-12-30 15:28:02 +01:00 · 72953d0cdc
parent 483780ac96
commit 72953d0cdc
1 changed files with 166 additions and 82 deletions
--- a/src/dlangui/core/linestream.d
+++ b/src/dlangui/core/linestream.d
@ -309,6 +309,11 @@ class LineStream {
        _streamEof = _stream.eof;
    }
    /// this constructor was created for unittests only
    protected this(){
        _encoding = EncodingType.UTF8;
    }
    /// returns slice of bytes available in buffer
    protected uint readBytes() {
        uint bytesLeft = _len - _pos;
@ -565,103 +570,182 @@ private class Utf8LineStream : LineStream {
    this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) {
        super(stream, filename, EncodingType.UTF8, buf, skip, len);
    }
-    override uint decodeText() {
+
-        if (invalidCharFlag) {
+    uint decodeBytes(ubyte* b,in uint bleft, out uint ch, out bool needMoreFlag){
            invalidCharError();
            return 0;
        }
        uint bytesAvailable = readBytes();
        ubyte * bytes = _buf.ptr + _pos;
        if (bytesAvailable == 0)
            return 0; // nothing to decode
        uint len = bytesAvailable;
        uint chars = 0;
        ubyte* b = bytes;
        uint maxResultingBytes = len*2; //len*2 because worst case is if all input chars are singelbyte and resulting in two bytes
        dchar* text = reserveTextBuf(maxResultingBytes);
        uint i = 0;
        for (; i < len; i++) {
            uint ch = 0;
            uint ch0 = b[i];
            uint bleft = len - i;
        uint bread = 0;
        uint ch0 = b[0];
        if (!(ch0 & 0x80)) {
            // 0x00..0x7F single byte
            // 0x80 == 10000000
            // !(ch0 & 0x80) => ch0 < 10000000
            ch = ch0;
            bread = 1;
-            } if ((ch0 & 0xE0) == 0xC0) {
+        } else if ((ch0 & 0xE0) == 0xC0) {
            // two bytes 110xxxxx 10xxxxxx
-                if (bleft < 2)
+            if (bleft < 2) {
-                    break;
+                needMoreFlag = true;
-                uint ch1 = b[i + 1];
+                return 0;
            }
            uint ch1 = b[1];
            if ((ch1 & 0xC0) != 0x80) {
-                    invalidCharFlag = true;
+                return 0;
                    break;
            }
            ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F);
            bread = 2;
-            } if ((ch0 & 0xF0) == 0xE0) {
+        } else if ((ch0 & 0xF0) == 0xE0) {
            // three bytes 1110xxxx 10xxxxxx 10xxxxxx
-                if (bleft < 3)
+            if (bleft < 3) {
-                    break;
+                needMoreFlag = true;
-                uint ch1 = b[i + 1];
+                return 0;
                uint ch2 = b[i + 2];
                if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
                    invalidCharFlag = true;
                    break;
            }
-                ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x1F) << 6) | (ch2 & 0x3F);
+            uint ch1 = b[1];
            uint ch2 = b[2];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
                return 0;
            }
            ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F);
            bread = 3;
-            } if ((ch0 & 0xF8) == 0xF0) {
+        } else if ((ch0 & 0xF8) == 0xF0) {
            // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-                if (bleft < 4)
+            if (bleft < 4) {
-                    break;
+                needMoreFlag = true;
-                uint ch1 = b[i + 1];
+                return 0;
-                uint ch2 = b[i + 2];
+            }
-                uint ch3 = b[i + 3];
+            uint ch1 = b[1];
            uint ch2 = b[2];
            uint ch3 = b[3];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) {
-                    invalidCharFlag = true;
+                return 0;
                    break;
            }
            ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F);
            bread = 4;
-            } if ((ch0 & 0xFC) == 0xF8) {
+        } else if ((ch0 & 0xFC) == 0xF8) {
            // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-                if (bleft < 5)
+            if (bleft < 5) {
-                    break;
+                needMoreFlag = true;
-                uint ch1 = b[i + 1];
+                return 0;
-                uint ch2 = b[i + 2];
+            }
-                uint ch3 = b[i + 3];
+            uint ch1 = b[1];
-                uint ch4 = b[i + 4];
+            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) {
-                    invalidCharFlag = true;
+                return 0;
                    break;
            }
            ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F);
            bread = 5;
-            } if ((ch0 & 0xFE) == 0xFC) {
+        } else if ((ch0 & 0xFE) == 0xFC) {
            // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-                if (bleft < 6)
+            if (bleft < 6){
-                    break;
+                needMoreFlag = true;
-                uint ch1 = b[i + 1];
+                return 0;
-                uint ch2 = b[i + 2];
+            }
-                uint ch3 = b[i + 3];
+
-                uint ch4 = b[i + 4];
+            uint ch1 = b[1];
-                uint ch5 = b[i + 5];
+            uint ch2 = b[2];
            uint ch3 = b[3];
            uint ch4 = b[4];
            uint ch5 = b[5];
            if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) {
-                    invalidCharFlag = true;
+                return 0;
                    break;
            }
            ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F);
            bread = 5;
        }
        if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
-                invalidCharFlag = true;
+            return 0;
        }
        return bread;
    }
    /// this constructor was created for unittests only
    protected this(){
    }
    unittest {
        auto o = new Utf8LineStream();
        ubyte[] buffer =  new ubyte[4];
        ubyte * bytes  = buffer.ptr;
        uint ch;
        bool needMoreFlag;
        uint bread;
        //convert simple character
        buffer[0] = '/';
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 1);
        assert(ch == '/');
        //writefln("/ as hex: 0x%32x,0x%32x", ch,'/');
        //convert 2byte character
        buffer[0] = 0xc3;
        buffer[1] = 0x84;
        bread = o.decodeBytes(bytes,1,ch,needMoreFlag);
        assert(needMoreFlag);
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 2);
        assert(ch == 'Ä');
        //writefln("Ä as hex: 0x%32x,0x%32x", ch,'Ä');
        //convert 3byte character
        buffer[0] = 0xe0;
        buffer[1] = 0xa4;
        buffer[2] = 0xb4;
        bread = o.decodeBytes(bytes,2,ch,needMoreFlag);
        assert(needMoreFlag);
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("ऴ as hex: 0x%32x,0x%32x", ch,'ऴ');
        assert(ch == 'ऴ');
        //regression test for https://github.com/buggins/dlangide/issues/65
        buffer[0] = 0xEB;
        buffer[1] = 0xB8;
        buffer[2] = 0x94;
        bread = o.decodeBytes(bytes,3,ch,needMoreFlag);
        assert(!needMoreFlag);
        assert(bread == 3);
        //writefln("블 as hex: 0x%32x,0x%32x", ch,'블');
        assert(ch == '블');
    }
    override uint decodeText() {
        //number of bytesAvailable
        uint len = readBytes();
        if (len == 0)
            return 0; // nothing to decode
        if (invalidCharFlag) {
            invalidCharError();
            return 0;
        }
        ubyte * bytes = _buf.ptr + _pos;
        ubyte* b = bytes;
        uint chars = 0;
        uint maxResultingBytes = len*2; //len*2 because worst case is if all input chars are singelbyte and resulting in two bytes
        dchar* text = reserveTextBuf(maxResultingBytes);
        uint i = 0;
        bool needMoreFlag = false;
        for (; i < len; i++) {
            uint ch = 0;
            uint bleft = len - i;
            uint bread = decodeBytes(b+i,bleft,ch,needMoreFlag);
            if(needMoreFlag){
                //decodeBytes needs more bytes, but nore more bytes left in the buffer
                break;
            }
            //if the code above could not read any charater stop procesing
            if (bread == 0) {
                //decodeBytes could not read any charater. stop procesing
                invalidCharFlag = true;
                break;
            }