mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 22:21:09 +03:00
more checking of UTF errors
Overlong sequences, wrong continuation for UTF-8. Lone high surrogate for UTF-16.
This commit is contained in:
parent
8073f8371d
commit
fff184b1cc
1 changed files with 57 additions and 2 deletions
59
std/uni.d
59
std/uni.d
|
@ -4583,7 +4583,6 @@ template Utf8Matcher()
|
|||
|
||||
/*
    Extracts the payload of a UTF-8 continuation code unit.

    Params:
        ch = code unit expected to have the form 0b10xx_xxxx

    Returns:
        The low 6 payload bits of ch (a value in 0 .. 0x3F).

    If ch is not a continuation unit, badEncoding() is invoked
    (presumably it throws - confirm at its definition); the trailing
    cast(char)0 only exists to give the expression a value.
*/
char truncate()(char ch) pure @safe
{
    // No assert on the bit pattern here: malformed input is a runtime
    // condition that has to be reported through badEncoding(), not a
    // programming error that aborts in assert-enabled builds.
    // Subtracting 0x80 maps 0x80 .. 0xBF onto 0x00 .. 0x3F; every other
    // value ends up >= 0x40 (char arithmetic wraps modulo 256).
    ch -= 0x80;
    return ch < 0x40 ? ch : (badEncoding(), cast(char)0);
}
|
||||
|
@ -4632,7 +4631,10 @@ template Utf8Matcher()
|
|||
(ch & ~leadMask!%d) == encMask!(%d)
|
||||
? lookup!(%d, mode)(inp) :
|
||||
}, size, size, size);
|
||||
code ~= "false";
|
||||
static if (Sizes.length == 4) //covers all code unit cases
|
||||
code ~= "(badEncoding(), false)";
|
||||
else
|
||||
code ~= "false"; //may be just fine but not covered
|
||||
return code;
|
||||
}
|
||||
enum dispatch = genDispatch();
|
||||
|
@ -4727,6 +4729,28 @@ template Utf8Matcher()
|
|||
{
|
||||
needle[i] = truncate(inp[i]);
|
||||
}
|
||||
//overlong encoding checks
|
||||
static if(size == 2)
|
||||
{
|
||||
//0x80-0x7FF
|
||||
//got 6 bits in needle[1], must use at least 8 bits
|
||||
//must use at least 2 bits in needle[0]
|
||||
if(needle[0] < 2) badEncoding();
|
||||
}
|
||||
else static if(size == 3)
|
||||
{
|
||||
//0x800-0xFFFF
|
||||
//got 6 bits in needle[2], must use at least 12bits
|
||||
//must use 6 bits in needle[1] or anything in needle[0]
|
||||
if(needle[0] == 0 && needle[1] < 0x20) badEncoding();
|
||||
}
|
||||
else static if(size == 4)
|
||||
{
|
||||
//0x10000-0x10FFFF
|
||||
//got 2x6=12 bits in needle[2..3] must use at least 17bits
|
||||
//must use 5 bits (or above) in needle[1] or anything in needle[0]
|
||||
if(needle[0] == 0 && needle[1] < 0x10) badEncoding();
|
||||
}
|
||||
static if(mode == Mode.alwaysSkip)
|
||||
{
|
||||
inp.popFrontN(size);
|
||||
|
@ -4908,6 +4932,8 @@ template Utf16Matcher()
|
|||
//not a high surrogate
|
||||
if(x > 0x3FF)
|
||||
{
|
||||
//low surrogate
|
||||
if(x <= 0x7FF) badEncoding();
|
||||
static if(sizeFlags & 1)
|
||||
{
|
||||
auto ch = inp[0];
|
||||
|
@ -5156,6 +5182,35 @@ package auto units(C)(C[] s)
|
|||
}
|
||||
}
|
||||
|
||||
// cover decode fail cases of Matcher
|
||||
unittest
{
    import std.string : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    // Decode failure cases, UTF-8: every input below must make
    // utf8.test report an error (collectException returns non-null).
    // Escapes are spelled \xNN throughout; "\0x00" would be a NUL
    // followed by the literal characters "x00", not the intended byte.
    alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\x00\x00\x00", "\xC0\x80\x80\x80", "\x80\x00\x00\x00",
        "\xCF\x00\x00\x00\x00");
    foreach(msg; fails8){
        // On failure the assert message is a hex dump of the offending input.
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(ubyte[])msg));
    }
    // Decode failure cases, UTF-16: single code units from the
    // surrogate range with no partner.
    alias fails16 = TypeTuple!([0xD811], [0xDC02]);
    foreach(msg; fails16){
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar)x);
            utf16.test(s);
        }()));
    }
}
|
||||
|
||||
/++
|
||||
Convenience function to construct optimal configurations for
|
||||
packed Trie from any $(D set) of $(CODEPOINTS).
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue