From 72953d0cdceb86174968d52cbab17f5f707f1491 Mon Sep 17 00:00:00 2001 From: Keywan Ghadami Date: Wed, 30 Dec 2015 15:28:02 +0100 Subject: [PATCH] fix utf8-decode + fixed decoding of 3 bytes unicode codepoints + ((ch1 & 0x1F) << 12) to ((ch1 & 0x3F) << 12) + refactored code to be able to make simple unittest + added unittests for utf8 decoding --- src/dlangui/core/linestream.d | 248 +++++++++++++++++++++++----------- 1 file changed, 166 insertions(+), 82 deletions(-) diff --git a/src/dlangui/core/linestream.d b/src/dlangui/core/linestream.d index 67267830..12f44e38 100644 --- a/src/dlangui/core/linestream.d +++ b/src/dlangui/core/linestream.d @@ -308,6 +308,11 @@ class LineStream { _pos = offset; _streamEof = _stream.eof; } + + /// this constructor was created for unittests only + protected this(){ + _encoding = EncodingType.UTF8; + } /// returns slice of bytes available in buffer protected uint readBytes() { @@ -565,105 +570,184 @@ private class Utf8LineStream : LineStream { this(InputStream stream, string filename, ubyte[] buf, uint len, int skip) { super(stream, filename, EncodingType.UTF8, buf, skip, len); } + + uint decodeBytes(ubyte* b,in uint bleft, out uint ch, out bool needMoreFlag){ + uint bread = 0; + uint ch0 = b[0]; + if (!(ch0 & 0x80)) { + // 0x00..0x7F single byte + // 0x80 == 10000000 + // !(ch0 & 0x80) => ch0 < 10000000 + ch = ch0; + bread = 1; + } else if ((ch0 & 0xE0) == 0xC0) { + // two bytes 110xxxxx 10xxxxxx + if (bleft < 2) { + needMoreFlag = true; + return 0; + } + uint ch1 = b[1]; + if ((ch1 & 0xC0) != 0x80) { + return 0; + } + ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F); + bread = 2; + } else if ((ch0 & 0xF0) == 0xE0) { + // three bytes 1110xxxx 10xxxxxx 10xxxxxx + if (bleft < 3) { + needMoreFlag = true; + return 0; + } + uint ch1 = b[1]; + uint ch2 = b[2]; + if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) { + return 0; + } + ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | (ch2 & 0x3F); + bread = 3; + } else if ((ch0 & 0xF8) == 0xF0) { + // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + if (bleft < 4) { + needMoreFlag = true; + return 0; + } + uint ch1 = b[1]; + uint ch2 = b[2]; + uint ch3 = b[3]; + if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) { + return 0; + } + ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); + bread = 4; + } else if ((ch0 & 0xFC) == 0xF8) { + // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + if (bleft < 5) { + needMoreFlag = true; + return 0; + } + uint ch1 = b[1]; + uint ch2 = b[2]; + uint ch3 = b[3]; + uint ch4 = b[4]; + if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) { + return 0; + } + ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F); + bread = 5; + } else if ((ch0 & 0xFE) == 0xFC) { + // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + if (bleft < 6){ + needMoreFlag = true; + return 0; + } + + uint ch1 = b[1]; + uint ch2 = b[2]; + uint ch3 = b[3]; + uint ch4 = b[4]; + uint ch5 = b[5]; + if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) { + return 0; + } + ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F); + bread = 5; + } + if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) { + return 0; + } + return bread; + } + + /// this constructor was created for unittests only + protected this(){ + + } + + unittest { + auto o = new Utf8LineStream(); + ubyte[] buffer = new ubyte[4]; + ubyte * bytes = buffer.ptr; + uint ch; + bool needMoreFlag; + uint bread; + + //convert simple character + buffer[0] = '/'; + bread = o.decodeBytes(bytes,1,ch,needMoreFlag); + assert(!needMoreFlag); + assert(bread == 1); + assert(ch == '/'); + //writefln("/ as hex: 0x%32x,0x%32x", ch,'/'); + + + //convert 2byte character + buffer[0] = 0xc3; + buffer[1] = 0x84; + bread = o.decodeBytes(bytes,1,ch,needMoreFlag); + assert(needMoreFlag); + + bread = o.decodeBytes(bytes,2,ch,needMoreFlag); + assert(!needMoreFlag); + assert(bread == 2); + assert(ch == 'Ä'); + //writefln("Ä as hex: 0x%32x,0x%32x", ch,'Ä'); + + //convert 3byte character + buffer[0] = 0xe0; + buffer[1] = 0xa4; + buffer[2] = 0xb4; + bread = o.decodeBytes(bytes,2,ch,needMoreFlag); + assert(needMoreFlag); + + bread = o.decodeBytes(bytes,3,ch,needMoreFlag); + assert(!needMoreFlag); + assert(bread == 3); + //writefln("ऴ as hex: 0x%32x,0x%32x", ch,'ऴ'); + assert(ch == 'ऴ'); + + //regression test for https://github.com/buggins/dlangide/issues/65 + buffer[0] = 0xEB; + buffer[1] = 0xB8; + buffer[2] = 0x94; + bread = o.decodeBytes(bytes,3,ch,needMoreFlag); + assert(!needMoreFlag); + assert(bread == 3); + //writefln("블 as hex: 0x%32x,0x%32x", ch,'블'); + assert(ch == '블'); + } + override uint decodeText() { + //number of bytesAvailable + uint len = readBytes(); + if (len == 0) + return 0; // nothing to decode + if (invalidCharFlag) { invalidCharError(); return 0; } - uint bytesAvailable = readBytes(); ubyte * bytes = _buf.ptr + _pos; - if (bytesAvailable == 0) - return 0; // nothing to decode - uint len = bytesAvailable; - uint chars = 0; ubyte* b = bytes; + uint chars = 0; uint maxResultingBytes = len*2; //len*2 because worst case is if all input chars are singelbyte and resulting in two bytes dchar* text = reserveTextBuf(maxResultingBytes); uint i = 0; + + bool needMoreFlag = false; for (; i < len; i++) { uint ch = 0; - uint ch0 = b[i]; uint bleft = len - i; - uint bread = 0; - if (!(ch0 & 0x80)) { - // 0x00..0x7F single byte - ch = ch0; - bread = 1; - } if ((ch0 & 0xE0) == 0xC0) { - // two bytes 110xxxxx 10xxxxxx - if (bleft < 2) - break; - uint ch1 = b[i + 1]; - if ((ch1 & 0xC0) != 0x80) { - invalidCharFlag = true; - break; - } - ch = ((ch0 & 0x1F) << 6) | (ch1 & 0x3F); - bread = 2; - } if ((ch0 & 0xF0) == 0xE0) { - // three bytes 1110xxxx 10xxxxxx 10xxxxxx - if (bleft < 3) - break; - uint ch1 = b[i + 1]; - uint ch2 = b[i + 2]; - if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) { - invalidCharFlag = true; - break; - } - ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x1F) << 6) | (ch2 & 0x3F); - bread = 3; - } if ((ch0 & 0xF8) == 0xF0) { - // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - if (bleft < 4) - break; - uint ch1 = b[i + 1]; - uint ch2 = b[i + 2]; - uint ch3 = b[i + 3]; - if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) { - invalidCharFlag = true; - break; - } - ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F); - bread = 4; - } if ((ch0 & 0xFC) == 0xF8) { - // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - if (bleft < 5) - break; - uint ch1 = b[i + 1]; - uint ch2 = b[i + 2]; - uint ch3 = b[i + 3]; - uint ch4 = b[i + 4]; - if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) { - invalidCharFlag = true; - break; - } - ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | (ch4 & 0x3F); - bread = 5; - } if ((ch0 & 0xFE) == 0xFC) { - // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - if (bleft < 6) - break; - uint ch1 = b[i + 1]; - uint ch2 = b[i + 2]; - uint ch3 = b[i + 3]; - uint ch4 = b[i + 4]; - uint ch5 = b[i + 5]; - if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) { - invalidCharFlag = true; - break; - } - ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | (ch5 & 0x3F); - bread = 5; - } - if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) { - invalidCharFlag = true; + uint bread = decodeBytes(b+i,bleft,ch,needMoreFlag); + + if(needMoreFlag){ + //decodeBytes needs more bytes, but nore more bytes left in the buffer break; } - //if the code above could not read any charater stop procesing if (bread == 0) { - invalidCharFlag = true; - break; + //decodeBytes could not read any charater. stop procesing + invalidCharFlag = true; + break; } if (ch < 0x10000) {