diff --git a/std/uni.d b/std/uni.d index 04587820a..6124f3266 100644 --- a/std/uni.d +++ b/std/uni.d @@ -4583,7 +4583,6 @@ template Utf8Matcher() char truncate()(char ch) pure @safe { - assert((ch & 0b1100_0000) == 0x80); ch -= 0x80; return ch < 0x40 ? ch : (badEncoding(), cast(char)0); } @@ -4632,7 +4631,10 @@ template Utf8Matcher() (ch & ~leadMask!%d) == encMask!(%d) ? lookup!(%d, mode)(inp) : }, size, size, size); - code ~= "false"; + static if (Sizes.length == 4) //covers all code unit cases + code ~= "(badEncoding(), false)"; + else + code ~= "false"; //may be just fine but not covered return code; } enum dispatch = genDispatch(); @@ -4727,6 +4729,28 @@ template Utf8Matcher() { needle[i] = truncate(inp[i]); } + //overlong encoding checks + static if(size == 2) + { + //0x80-0x7FF + //got 6 bits in needle[1], must use at least 8 bits + //must use at least 2 bits in needle[1] + if(needle[0] < 2) badEncoding(); + } + else static if(size == 3) + { + //0x800-0xFFFF + //got 6 bits in needle[2], must use at least 12bits + //must use 6 bits in needle[1] or anything in needle[0] + if(needle[0] == 0 && needle[1] < 0x20) badEncoding(); + } + else static if(size == 4) + { + //0x800-0xFFFF + //got 2x6=12 bits in needle[2..3] must use at least 17bits + //must use 5 bits (or above) in needle[1] or anything in needle[0] + if(needle[0] == 0 && needle[1] < 0x10) badEncoding(); + } static if(mode == Mode.alwaysSkip) { inp.popFrontN(size); @@ -4908,6 +4932,8 @@ template Utf16Matcher() //not a high surrogate if(x > 0x3FF) { + //low surrogate + if(x <= 0x7FF) badEncoding(); static if(sizeFlags & 1) { auto ch = inp[0]; @@ -5156,6 +5182,35 @@ package auto units(C)(C[] s) } } +// cover decode fail cases of Matcher +unittest +{ + import std.string : format; + auto utf16 = utfMatcher!wchar(unicode.L); + auto utf8 = utfMatcher!char(unicode.L); + //decode failure cases UTF-8 + alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79", + "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00", + "\xCF\x00\0x00\0x00\x00"); + foreach(msg; fails8){ + assert(collectException((){ + auto s = msg; + import std.utf; + size_t idx = 0; + //decode(s, idx); + utf8.test(s); + }()), format("%( %2x %)", cast(ubyte[])msg)); + } + //decode failure cases UTF-16 + alias fails16 = TypeTuple!([0xD811], [0xDC02]); + foreach(msg; fails16){ + assert(collectException((){ + auto s = msg.map!(x => cast(wchar)x); + utf16.test(s); + }())); + } +} + /++ Convenience function to construct optimal configurations for packed Trie from any $(D set) of $(CODEPOINTS).