more checking of UTF errors

Overlong sequences, wrong continuation  for UTF-8.
Lone high surrogate for UTf-16/.
This commit is contained in:
Dmitry Olshansky 2014-03-11 00:40:37 +04:00
parent 8073f8371d
commit fff184b1cc

View file

@ -4583,7 +4583,6 @@ template Utf8Matcher()
char truncate()(char ch) pure @safe
{
assert((ch & 0b1100_0000) == 0x80);
ch -= 0x80;
return ch < 0x40 ? ch : (badEncoding(), cast(char)0);
}
@ -4632,7 +4631,10 @@ template Utf8Matcher()
(ch & ~leadMask!%d) == encMask!(%d)
? lookup!(%d, mode)(inp) :
}, size, size, size);
code ~= "false";
static if (Sizes.length == 4) //covers all code unit cases
code ~= "(badEncoding(), false)";
else
code ~= "false"; //may be just fine but not covered
return code;
}
enum dispatch = genDispatch();
@ -4727,6 +4729,28 @@ template Utf8Matcher()
{
needle[i] = truncate(inp[i]);
}
//overlong encoding checks
static if(size == 2)
{
//0x80-0x7FF
//got 6 bits in needle[1], must use at least 8 bits
//must use at least 2 bits in needle[1]
if(needle[0] < 2) badEncoding();
}
else static if(size == 3)
{
//0x800-0xFFFF
//got 6 bits in needle[2], must use at least 12bits
//must use 6 bits in needle[1] or anything in needle[0]
if(needle[0] == 0 && needle[1] < 0x20) badEncoding();
}
else static if(size == 4)
{
//0x800-0xFFFF
//got 2x6=12 bits in needle[2..3] must use at least 17bits
//must use 5 bits (or above) in needle[1] or anything in needle[0]
if(needle[0] == 0 && needle[1] < 0x10) badEncoding();
}
static if(mode == Mode.alwaysSkip)
{
inp.popFrontN(size);
@ -4908,6 +4932,8 @@ template Utf16Matcher()
//not a high surrogate
if(x > 0x3FF)
{
//low surrogate
if(x <= 0x7FF) badEncoding();
static if(sizeFlags & 1)
{
auto ch = inp[0];
@ -5156,6 +5182,35 @@ package auto units(C)(C[] s)
}
}
// cover decode fail cases of Matcher
unittest
{
import std.string : format;
auto utf16 = utfMatcher!wchar(unicode.L);
auto utf8 = utfMatcher!char(unicode.L);
//decode failure cases UTF-8
alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
"\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
"\xCF\x00\0x00\0x00\x00");
foreach(msg; fails8){
assert(collectException((){
auto s = msg;
import std.utf;
size_t idx = 0;
//decode(s, idx);
utf8.test(s);
}()), format("%( %2x %)", cast(ubyte[])msg));
}
//decode failure cases UTF-16
alias fails16 = TypeTuple!([0xD811], [0xDC02]);
foreach(msg; fails16){
assert(collectException((){
auto s = msg.map!(x => cast(wchar)x);
utf16.test(s);
}()));
}
}
/++
Convenience function to construct optimal configurations for
packed Trie from any $(D set) of $(CODEPOINTS).