mirror of
https://github.com/dlang/phobos.git
synced 2025-04-29 06:30:28 +03:00
more checking of UTF errors
Overlong sequences, wrong continuation for UTF-8. Lone high surrogate for UTF-16.
This commit is contained in:
parent
8073f8371d
commit
fff184b1cc
1 changed files with 57 additions and 2 deletions
59
std/uni.d
59
std/uni.d
|
@ -4583,7 +4583,6 @@ template Utf8Matcher()
|
||||||
|
|
||||||
char truncate()(char ch) pure @safe
|
char truncate()(char ch) pure @safe
|
||||||
{
|
{
|
||||||
assert((ch & 0b1100_0000) == 0x80);
|
|
||||||
ch -= 0x80;
|
ch -= 0x80;
|
||||||
return ch < 0x40 ? ch : (badEncoding(), cast(char)0);
|
return ch < 0x40 ? ch : (badEncoding(), cast(char)0);
|
||||||
}
|
}
|
||||||
|
@ -4632,7 +4631,10 @@ template Utf8Matcher()
|
||||||
(ch & ~leadMask!%d) == encMask!(%d)
|
(ch & ~leadMask!%d) == encMask!(%d)
|
||||||
? lookup!(%d, mode)(inp) :
|
? lookup!(%d, mode)(inp) :
|
||||||
}, size, size, size);
|
}, size, size, size);
|
||||||
code ~= "false";
|
static if (Sizes.length == 4) //covers all code unit cases
|
||||||
|
code ~= "(badEncoding(), false)";
|
||||||
|
else
|
||||||
|
code ~= "false"; //may be just fine but not covered
|
||||||
return code;
|
return code;
|
||||||
}
|
}
|
||||||
enum dispatch = genDispatch();
|
enum dispatch = genDispatch();
|
||||||
|
@ -4727,6 +4729,28 @@ template Utf8Matcher()
|
||||||
{
|
{
|
||||||
needle[i] = truncate(inp[i]);
|
needle[i] = truncate(inp[i]);
|
||||||
}
|
}
|
||||||
|
//overlong encoding checks
|
||||||
|
static if(size == 2)
|
||||||
|
{
|
||||||
|
//0x80-0x7FF
|
||||||
|
//got 6 bits in needle[1], must use at least 8 bits
|
||||||
|
//must use at least 2 bits in needle[0]
|
||||||
|
if(needle[0] < 2) badEncoding();
|
||||||
|
}
|
||||||
|
else static if(size == 3)
|
||||||
|
{
|
||||||
|
//0x800-0xFFFF
|
||||||
|
//got 6 bits in needle[2], must use at least 12bits
|
||||||
|
//must use 6 bits in needle[1] or anything in needle[0]
|
||||||
|
if(needle[0] == 0 && needle[1] < 0x20) badEncoding();
|
||||||
|
}
|
||||||
|
else static if(size == 4)
|
||||||
|
{
|
||||||
|
//0x10000-0x10FFFF
|
||||||
|
//got 2x6=12 bits in needle[2..3] must use at least 17bits
|
||||||
|
//must use 5 bits (or above) in needle[1] or anything in needle[0]
|
||||||
|
if(needle[0] == 0 && needle[1] < 0x10) badEncoding();
|
||||||
|
}
|
||||||
static if(mode == Mode.alwaysSkip)
|
static if(mode == Mode.alwaysSkip)
|
||||||
{
|
{
|
||||||
inp.popFrontN(size);
|
inp.popFrontN(size);
|
||||||
|
@ -4908,6 +4932,8 @@ template Utf16Matcher()
|
||||||
//not a high surrogate
|
//not a high surrogate
|
||||||
if(x > 0x3FF)
|
if(x > 0x3FF)
|
||||||
{
|
{
|
||||||
|
//low surrogate
|
||||||
|
if(x <= 0x7FF) badEncoding();
|
||||||
static if(sizeFlags & 1)
|
static if(sizeFlags & 1)
|
||||||
{
|
{
|
||||||
auto ch = inp[0];
|
auto ch = inp[0];
|
||||||
|
@ -5156,6 +5182,35 @@ package auto units(C)(C[] s)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cover decode fail cases of Matcher
|
||||||
|
unittest
|
||||||
|
{
|
||||||
|
import std.string : format;
|
||||||
|
auto utf16 = utfMatcher!wchar(unicode.L);
|
||||||
|
auto utf8 = utfMatcher!char(unicode.L);
|
||||||
|
//decode failure cases UTF-8
|
||||||
|
alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
|
||||||
|
"\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
|
||||||
|
"\xCF\x00\0x00\0x00\x00");
|
||||||
|
foreach(msg; fails8){
|
||||||
|
assert(collectException((){
|
||||||
|
auto s = msg;
|
||||||
|
import std.utf;
|
||||||
|
size_t idx = 0;
|
||||||
|
//decode(s, idx);
|
||||||
|
utf8.test(s);
|
||||||
|
}()), format("%( %2x %)", cast(ubyte[])msg));
|
||||||
|
}
|
||||||
|
//decode failure cases UTF-16
|
||||||
|
alias fails16 = TypeTuple!([0xD811], [0xDC02]);
|
||||||
|
foreach(msg; fails16){
|
||||||
|
assert(collectException((){
|
||||||
|
auto s = msg.map!(x => cast(wchar)x);
|
||||||
|
utf16.test(s);
|
||||||
|
}()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/++
|
/++
|
||||||
Convenience function to construct optimal configurations for
|
Convenience function to construct optimal configurations for
|
||||||
packed Trie from any $(D set) of $(CODEPOINTS).
|
packed Trie from any $(D set) of $(CODEPOINTS).
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue