mirror of
https://github.com/dlang/phobos.git
New primitive - Tiny UTF Binary Search Table
This builds on the notion of a UTF word: a 32-bit integer that holds a UTF-encoded codepoint. The top non-zero byte is the UTF-8 starter, so the word is big-endian when read directly from strings; the same applies to a UTF-16 word. It also introduces a simpler API for matching, UtfLookup, which packs into one 32-bit machine word:
- the length of the code point matched
- the result of the match (true/false, i.e. whether it belongs to this set)
- a bad-encoding flag
This struct is going to be reused for all of the "directly on UTF" matchers in std.uni.
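To make the packing concrete, here is a minimal sketch (illustration only, not part of the commit) of the UTF word for a two-byte UTF-8 sequence and of the bit layout UtfLookup packs its result into; the bit positions mirror the MASK/MATCH/BROKEN constants introduced in the patch below.

    // 'я' (U+044F) is 0xD1 0x8F in UTF-8; reading it big-endian gives the UTF word 0xD18F
    unittest
    {
        string s = "я";
        uint word = (cast(uint) s[0] << 8) | s[1];
        assert(word == 0xD18F);

        // UtfLookup layout: bits 0..3 = stride, bit 4 = matched, bit 5 = broken encoding
        uint packed = 2 | (1 << 4);        // stride 2, matched, well-formed
        assert((packed & 0xF) == 2);       // stride
        assert((packed & (1 << 4)) != 0);  // matched
        assert((packed & (1 << 5)) == 0);  // not broken
    }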
parent 06506e407a
commit aba3fe84a3
1 changed file with 744 additions and 2 deletions
std/uni.d (746 lines changed)
@@ -74,6 +74,13 @@
    significant performance improvements. See $(LREF MatcherConcept) for
    the common interface of UTF matchers.
    )
    $(LI
    For even more flexibility in character classification without decoding
    there is $(LREF tinyUtfBst). TinyUTF tables achieve a much smaller footprint
    than a Matcher (~ 8 bytes * number of intervals) while still operating on UTF
    directly, avoiding decoding. Each match operation is $(BIGOH log(N)),
    where N is the number of intervals in the given codepoint set.
    )
    $(LI
    Generally useful building blocks for customized normalization:
    $(LREF combiningClass) for querying combining class
@@ -4868,7 +4875,6 @@ template Utf16Matcher()
            import std.utf;
            throw new UTFException("Invalid UTF-16 sequence");
        }

        // 1-stage ASCII
        alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
        // 2-stage BMP
@@ -5134,6 +5140,738 @@ public auto utfMatcher(Char, Set)(Set set) @trusted
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}

// helper for assert/debug prints
string utfWordAsHex()(uint val, size_t sz=0)
{
    import std.algorithm : reverse;
    import core.bitop;
    import std.format;
    char[] res;
    if(sz == 0)
        sz = (bsr(val)+8)/8;
    while(sz)
    {
        res ~= format("%x", val & 0xFF);
        val >>= 8;
        sz--;
        if(sz)
            res ~= "_";
    }
    return res.idup;
}

// just encode as UTF-8, don't try to validate
size_t encodeNoCheck_(ref char[4] buf, dchar c) @safe pure nothrow
{
    if (c <= 0x7F)
    {
        buf[0] = cast(char)c;
        return 1;
    }
    if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    if (c <= 0x3FFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    // +1 on top of the usual dchar range so as to allow half-open intervals [A,B)
    assert(c <= 0x11_0000);
    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}

// just encode as UTF-16, don't try to validate
size_t encodeNoCheck_(ref wchar[2] buf, dchar c) @safe pure nothrow
{
    if(c < 0xD800 || (c >= 0xE000 && c < 0x1_0000))
    {
        buf[0] = cast(wchar)c;
        return 1;
    }
    else
    {
        buf[0] = cast(wchar)(((c - 0x1_0000) >> 10) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x1_0000) & 0x3FF) + 0xDC00);
        return 2;
    }
}

// compatible interface for dchars
size_t encodeNoCheck_(ref dchar[1] buf, dchar c) @safe pure nothrow
{
    buf[0] = c;
    return 1;
}

// internal helper
static T bigEndian_(T)(T val) pure nothrow
{
    version(LittleEndian)
    {
        import core.bitop;
        static if(T.sizeof == 2)
            return cast(ushort)(val << 8) | cast(ushort)(val >> 8);
        else static if(T.sizeof == 4)
            return bswap(val); // works for uints
    }
    else
        return val;
}

// Load a UTF word from a string-ish range.
// The UTF code units are represented as a 32-bit integer
// where the highest non-zero byte is the first (starter) code unit.
uint toUtfWord(size_t sz, Range)(Range buf) pure nothrow
if(is(Range : const(char)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : char)))
{
    pragma(inline, true);
    static if(isArray!Range) // can generalize if there is .ptr but not Array
    {
        static if(sz == 1)
            return buf[0];
        else static if(sz == 2)
            return bigEndian_(*cast(ushort*)buf.ptr);
        else static if(sz == 3)
        {
            auto val = bigEndian_(*cast(ushort*)buf.ptr) << 8;
            val |= cast(uint)buf[2]; // add the last 8 bits
            return val;
        }
        else
            return bigEndian_(*cast(uint*)buf.ptr);
    }
    else // don't have .ptr
    {
        static if(sz == 1)
            return buf[0];
        else static if(sz == 2)
            return (cast(uint)buf[0]<<8) | buf[1];
        else static if(sz == 3)
            return (cast(uint)buf[0]<<16) | (cast(uint)buf[1]<<8) | buf[2];
        else
            return (cast(uint)buf[0]<<24) | (cast(uint)buf[1]<<16) | (cast(uint)buf[2]<<8) | buf[3];
    }
}

// same for UTF-16; the resulting words (UTF-16 vs UTF-8) are (of course) not interchangeable
uint toUtf16Word(Range)(Range buf) pure nothrow
if(is(Range : const(wchar)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : wchar)))
{
    immutable c = buf[0];
    if(c < 0xD800)
        return c;
    else if(c <= 0xFFFF)
        return c + 0x1_0000;
    else // simply take as is
    {
        static if(isArray!Range) // can generalize if there is .ptr but not Array
            return bigEndian_(*cast(uint*)buf.ptr);
        else
            return (cast(uint)c<<16) | buf[1];
    }
}

// ditto, for an already decoded codepoint
uint toUtf16Word()(dchar ch) pure nothrow
{
    wchar[2] buf;
    auto sz = encodeNoCheck_(buf, ch);
    return toUtf16Word(buf[]);
}

// clean UTF-32 codepoint from a UTF-8 word
dchar fromUtfWord(uint val) @safe pure nothrow
{
    // fast path goes first - ASCII
    if(val <= 0xFF)
        return val;
    else if(val <= 0xFF_FF)
        return ((val & 0x1F_00)>>2) | (val & 0x3F);
    else if(val <= 0xFF_FF_FF)
        return ((val & 0x0F_00_00)>>4) | ((val & 0x3F_00)>>2) | (val & 0x3F);
    else // 4 bytes
        return ((val & 0x07_00_00_00)>>6) | ((val & 0x3F_00_00)>>4) |
            ((val & 0x3F_00)>>2) | (val & 0x3F);
}

unittest
{
    import std.format;
    // enum STEP = 1; // exhaustive but too expensive
    enum STEP = 7; // some small prime as the step
    for(dchar ch=0; ch < 0x11_0000; ch+=STEP)
    {
        char[4] buf;
        uint val = toUtfWord(buf[], encodeNoCheck_(buf, ch));
        assert(fromUtfWord(val) == ch, format("0x%x - buf %(0x%x, %) uw %s", ch, buf, utfWordAsHex(val)));
    }
}

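// Illustrative sketch (not part of this commit): the bit shuffling above worked
// through by hand for U+044F ('я'), whose UTF-8 bytes are 0xD1 0x8F. The UTF word
// is 0xD1_8F; masking off the 110..... / 10...... markers and re-joining the
// payload bits (5 from the starter, 6 from the continuation byte) restores the codepoint.
unittest
{
    enum uint word = 0xD18F;
    enum uint decoded = ((word & 0x1F_00) >> 2) | (word & 0x3F);
    static assert(decoded == 0x44F);
}
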
// size in bytes of the UTF sequence in this UTF word
uint utfWordSize(uint utfWord) @safe pure nothrow
{
    import core.bitop;
    return utfWord ? (bsr(utfWord)+8)/8 : 1;
}

// ditto with the size as a run-time parameter
uint toUtfWord(Range)(Range r, size_t sz)
{
    switch(sz)
    {
        foreach(c; AliasSeq!(1,2,3,4))
        {
        case c:
            return toUtfWord!c(r);
        }
    default:
        assert(0);
    }
}

// another overload for convenience
uint toUtfWord(dchar ch) pure nothrow
{
    char[4] buf;
    size_t sz = encodeNoCheck_(buf, ch);
    return toUtfWord(buf[], sz);
}

// Slice up the codepoint interval [a,b) into jagged UTF word intervals
struct UtfChunks
{
    uint[8] chunks;
    size_t len=0;
    //
    this(Interval)(Interval i)
    {
        uint start = toUtfWord(i.a);
        uint end = toUtfWord(i.b);
        chunks[len++] = start;
        foreach(brk; AliasSeq!(0x80, 0x800, 0x4000))
        {
            if(i.a < brk && i.b >= brk)
            {
                chunks[len++] = toUtfWord(brk-1)+1; // encode the max and add 1
                chunks[len++] = toUtfWord(brk);
            }
        }
        chunks[len++] = end;
        assert(len % 2 == 0);
    }
    //
    uint[] opIndex() return
    {
        return chunks[0..len];
    }
}

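// Illustrative sketch (not part of this commit): how UtfChunks slices an interval
// that crosses the 1-byte/2-byte UTF-8 boundary at 0x80. ['0', 0x100) becomes two
// [start, end) pairs of UTF words, one per encoding length: an ASCII pair and a
// 2-byte pair (U+0080 is 0xC2 0x80, U+0100 is 0xC4 0x80). CodepointInterval is the
// (a, b) pair type std.uni already uses for byInterval.
unittest
{
    auto chunks = UtfChunks(CodepointInterval('0', 0x100));
    assert(chunks[] == [0x30u, 0x80, 0xC2_80, 0xC4_80]);
}
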
/++
    Encapsulates the result of matching a range of UTF code units against
    some UTF matcher, e.g. $(LREF TinyUtfBst) or $(LREF UtfMatcher).

    Packed in a single machine word, this struct contains info about the length
    of the codepoint tested, whether there was a match and whether the UTF
    encoding was broken.
+/
struct UtfLookup
{
    enum {
        MASK = 0xF,
        MATCH_BIT = 4,
        MATCH = 1<<MATCH_BIT,
        BROKEN = MATCH<<1,
        BROKEN_BIT = MATCH_BIT+1
    }
    private uint word;
    const pure nothrow @safe @nogc:
    /++
        Length of the codepoint tested, in code units of the UTF range.

        See also: $(XREF utf, stride).
    +/
    @property uint stride()(){ return word & MASK; }
    /// True if the codepoint was recognized by the matcher.
    @property bool matched()(){ return (word & MATCH) != 0; }
    /++
        Indicates if the encoding of the UTF range was broken.
        In such cases $(D stride) always equals 1 and $(D matched) is false.
    +/
    @property bool broken()(){ return (word & BROKEN) != 0; }
    /// Short-hand for $(D matched), to use in conditional statements.
    bool opCast(T:bool)(){ return matched; }
}

mixin template SegmentedTable(Char)
{
    @property auto badEncoding()(){ return UtfLookup(1 | UtfLookup.BROKEN); }
    // get the proper section of the array by UTF-8 code length
    @property auto segment(size_t sz)()
    {
        static if(sz == 1)
        {
            static if(!is(typeof(segs))) // single segment
                return store;
            else
                return store[0..segs[0].start]; // up to the start of the 2nd segment
        }
        else static if(sz > 1)
        {
            auto ptr = cast(Value!sz*)(store.ptr + segs[sz-2].start);
            return ptr[0..segs[sz-2].len]; // .len is measured in Values
        }
    }

    // val is a UTF word
    UtfLookup lookup(size_t sz)(uint val)
    {
        pragma(inline, true);
        import std.stdio;
        // import std.range : assumeSorted;
        auto seg = segment!sz;
        // UTF-8 specific continuation bit validation
        static if(is(Char : char))
        {
            uint bad1 = 0x80, bad2 = 0x0;
            foreach(i; Sequence!(0, sz-1))
            {
                enum mask1 = 0x80<<(8*i);
                enum mask2 = 0x40<<(8*i);
                bad1 &= (val & mask1)>>(8*i); // 0 if any of the bits is 0
                bad2 |= (val & mask2)>>(8*i); // 1 if any of the bits is 1
            }
            // continuation bytes must match the binary pattern 10......
            // check for a missing leading '1'
            bad1 = (~bad1 & 0x80) >> (7 - UtfLookup.BROKEN_BIT);
            // check for a '1' where the '0' should be
            bad2 = bad2 >> (6 - UtfLookup.BROKEN_BIT);
            immutable bad = bad1 | bad2;
        }
        else
        {
            enum bad = 0;
        }
        // the trick is:
        // if the lower bound is even, all intervals below val are closed pairs [A,B)
        // else we are inside of some [A,B)
        uint ret = sharLowerBound!"a<=b"(seg, cast(Value!sz)val) & 1;
        // ret == 1 - matched
        return UtfLookup(sz | (ret<<UtfLookup.MATCH_BIT) | bad);
    }
}

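// Illustrative sketch (not part of this commit): the even/odd trick used in
// lookup() above. Interval bounds are stored flat and sorted as A1, B1, A2, B2, ...
// and a value belongs to the set iff the number of bounds <= v is odd, i.e. v fell
// inside some half-open [A, B) pair. Shown here with std.range.assumeSorted in
// place of the branch-optimized lower-bound helper the real code calls.
unittest
{
    import std.range : assumeSorted;
    uint[] bounds = [0x30, 0x3A, 0x61, 0x7B]; // ['0'..'9'] and ['a'..'z'] as [A, B) pairs
    bool inSet(uint v)
    {
        auto cnt = bounds.assumeSorted.lowerBound(v + 1).length; // bounds <= v
        return (cnt & 1) == 1;
    }
    assert(inSet('5') && inSet('z'));
    assert(!inSet(' ') && !inSet('@'));
}
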
/+
    A Tiny Binary Search Table for decode-less codepoint range lookup.
    Breaks ranges up into 4 segments - one per UTF encoding length.
+/
// nullary template so that it is inlined into the user's code and not just dead weight in phobos.lib
struct TinyUtf8BST()
{
private:
    ubyte[] store; // all arrays in one, aligned to 1-2-4-4 bytes respectively
    alias SegLen = uint; // may try ushort to pack it tighter
    struct Segment
    {
        SegLen start; // offset in bytes from the start of the store
        SegLen len; // in 2 or 4 byte increments
    }
    Segment[3] segs; // for the 2-byte, 3-byte and 4-byte segments
    template Value(size_t sz)
    {
        static if(sz == 1)
            alias Value = ubyte;
        else static if(sz == 2)
            alias Value = ushort;
        else
            alias Value = uint;
    }
    mixin SegmentedTable!char; // reusable table lookup logic

    // From a range of [a,b) intervals
    this(Range)(Range pairs)
    {
        bool[3] seenSize; // whether we have seen 2, 3, 4 byte code points
        // encode and store with the given size
        void append(uint utfWord)
        {
            uint sz = utfWordSize(utfWord);
            if(sz == 1)
            {
                store ~= cast(ubyte)utfWord;
                return;
            }
            static union Place
            {
                uint val;
                ubyte[4] bytes;
            }
            Place p;
            p.val = utfWord;
            immutable padSize = 1<<(sz+1)/2; // 2 -> 2, 3-4 -> 4
            immutable mask = padSize - 1;
            if(!seenSize[sz-2])
            {
                // first of the sz-sized code points
                seenSize[sz-2] = true;
                auto rem = store.length & mask;
                if(rem) // need to align
                {
                    foreach(_; 0..padSize-rem)
                        store ~= 0;
                    segs[sz-2].start = cast(SegLen)store.length;
                }
                // mark the start of this segment
                segs[sz-2].start = cast(SegLen)store.length;
            }
            version(LittleEndian)
            {
                foreach(i; 0..padSize)
                    store ~= p.bytes[i];
            }
            else // TODO: test on BigEndian iron
            {
                foreach(i; 0..padSize)
                    store ~= p.bytes[sz-1-i];
            }
        }
        foreach(v; pairs)
        {
            auto chunks = UtfChunks(v);
            auto arr = chunks[];
            for(size_t i=0; i<arr.length; i+=2)
            {
                append(arr[i]);
                append(arr[i+1]);
            }
        }
        size_t total = store.length;
        foreach_reverse(i; 0..segs.length)
        {
            if(seenSize[i])
            {
                segs[i].len = cast(SegLen)(total - segs[i].start);
                total -= segs[i].len;
            }
        }
        // this sets the length of the first segment as well,
        // in case we don't have a segment of size 2;
        // trim the padding of the first (implicit) segment
        while(total && store[total-1] == 0) total--;
        segs[0].start = cast(SegLen)total;
        // scale to Value!sz size
        segs[0].len /= 2;
        segs[0].len &= ~1; // trim padding for the second segment
        segs[1].len /= 4;
        segs[2].len /= 4;
    }

    // slow path that assumes 1-element cases are already handled
    UtfLookup longMatch(Range)(Range r)
    {
        pragma(inline, false); // no point, it's a slow path
        immutable c = r[0];
        UtfLookup ret = badEncoding(); // default on fallthrough
        if(r.length >= 4) // start/middle of input - fast path
        {
            if(!(c & 0x20))
                ret = lookup!2(toUtfWord!2(r));
            else if(!(c & 0x10))
                ret = lookup!3(toUtfWord!3(r));
            else if(!(c & 0x08))
                ret = lookup!4(toUtfWord!4(r));
        }
        else // handle the tail of the input - an even slower (and rarer) path
        {
            switch(r.length)
            {
            case 3:
                if(!(c & 0x10))
                    ret = lookup!3(toUtfWord!3(r));
                break;
            case 2:
                if(!(c & 0x20))
                    ret = lookup!2(toUtfWord!2(r));
                break;
            default:
                break; // fall through - bad encoding
            }
        }
        return ret;
    }

    /++
        Match against the front of the $(D r) range of char. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(char)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : char)))
    {
        pragma(inline, true);
        immutable c = r[0];
        // no UTF validation - just test where the zero is
        if(c < 0x80) // fast path for ASCII
        {
            return lookup!1(c);
        }
        else
            return longMatch(r);
    }
}

// Analog of TinyUtf8BST for UTF-16
struct TinyUtf16BST()
{
private:
    ushort[] store; // all arrays in one, aligned to 1-2 ushorts respectively
    alias SegLen = uint; // may try ushort to pack it tighter
    struct Segment
    {
        SegLen start; // offset in ushorts from the start of the store
        SegLen len; // in ushort or uint increments
    }
    Segment[1] segs; // for the uint-sized portion
    template Value(size_t sz)
    {
        static if(sz == 1)
            alias Value = ushort;
        else
            alias Value = uint;
    }
    mixin SegmentedTable!wchar; // reusable table lookup logic

    this(Range)(Range pairs)
    {
        bool seenLong = false;
        void appendLong(dchar val)
        {
            if(!seenLong)
            {
                seenLong = true;
                // pad to uint size
                store.length += store.length % 4;
                segs[0].start = cast(SegLen)store.length;
            }
            union Place
            {
                ushort[2] us;
                uint ui;
            }
            Place place;
            place.ui = toUtf16Word(val);
            store ~= place.us[];
        }
        foreach(i; pairs)
        {
            if(i.b <= 0xFFFF)
            {
                store ~= cast(ushort)i.a;
                store ~= cast(ushort)i.b;
            }
            else if(i.a > 0xFFFF) // both are beyond 2 bytes
            {
                appendLong(i.a);
                appendLong(i.b);
            }
            else // cross-cutting: a < 16 bit, b > 16 bit
            {
                store ~= cast(ushort)i.a;
                store ~= cast(ushort)0xFFFF;
                appendLong(0xFFFF);
                appendLong(i.b);
            }
        }
        if(!seenLong)
            segs[0].start = cast(SegLen)store.length;
    }

    // assumes 1-element cases below the surrogates are already handled
    auto longMatch(Range)(Range r)
    {
        pragma(inline, false); // slow path - don't inline
        immutable c = r[0];
        auto ret = badEncoding();
        if(r.length >= 2)
        {
            immutable c2 = r[1];
            if(c2 >= 0xDC00 && c2 <= 0xDFFF)
                return lookup!2(toUtf16Word(r));
        }
        else
        {
            if(c >= 0xE000)
                return lookup!2(toUtf16Word(r)); // [0xE000, 0x1_0000)
        }
        return ret;
    }

    /++
        Match against the front of the $(D r) range of wchar. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(wchar)[]) ||
        (isRandomAccessRange!Range && is(ElementEncodingType!Range : wchar)))
    {
        pragma(inline, true);
        immutable c = r[0];
        // fast path for the BMP outside of the surrogate area
        if(c < 0xD800 || (c > 0xDFFF && c <= 0xFFFF))
            return lookup!1(c);
        else
            return longMatch(r);
    }
}

// Analog of TinyUtf8BST for plain UTF-32
struct TinyUtf32BST()
{
private:
    uint[] store; // a plain sorted array searched by bisection
    alias Value(size_t sz) = uint; // always 32 bit
    mixin SegmentedTable!dchar; // reusable table lookup logic

    this(Range)(Range pairs)
    {
        store.length = pairs.length * 2;
        size_t idx = 0;
        foreach(p; pairs)
        {
            store[idx] = p.a;
            store[idx+1] = p.b;
            idx += 2;
        }
    }

    /++
        Match against the front of the $(D r) range of dchar. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(dchar)[]) ||
        (isForwardRange!Range && is(ElementEncodingType!Range : dchar)))
    {
        pragma(inline, true);
        return lookup!1(r.front);
    }
}

/++
    Get the UTF-8, UTF-16 or UTF-32 Tiny UTF BST type
    suitable for the passed-in $(D Char) parameter.

    Use $(LREF tinyUtfBst) to construct these functors.

    See also: $(LREF UtfLookup).
+/
public template TinyUtfBst(Char)
{
    static if(is(Char : char))
        alias TinyUtfBst = TinyUtf8BST!();
    else static if(is(Char : wchar))
        alias TinyUtfBst = TinyUtf16BST!();
    else static if(is(Char : dchar))
        alias TinyUtfBst = TinyUtf32BST!();
    else
        static assert(0, "Only built-in character types are supported.");
}

/++
    Convenience factory function to create a $(LREF TinyUtfBst)
    for $(D Char) from any $(D CodepointSet).

    Returns:
    A $(LREF TinyUtfBst) matcher for the given set,
    applicable as a callable to any range of $(D Char),
    including built-in strings.

    See also: $(LREF UtfLookup).
+/
public auto tinyUtfBst(Char, CS)(CS set)
if(isCodepointSet!CS)
{
    return TinyUtfBst!Char(set.byInterval);
}

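// Illustrative usage sketch (not part of this commit): classify a UTF-8 string
// without decoding it, advancing by the stride reported in each UtfLookup result.
unittest
{
    auto letters = tinyUtfBst!char(unicode.L);
    string s = "aд1"; // 'д' is a 2-byte (Cyrillic) letter, '1' is not a letter
    uint[] strides;
    bool[] matches;
    for(size_t i = 0; i < s.length; )
    {
        auto res = letters(s[i .. $]);
        assert(!res.broken);
        strides ~= res.stride;
        matches ~= res.matched;
        i += res.stride;
    }
    assert(strides == [1u, 2, 1]);
    assert(matches == [true, true, false]);
}
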
unittest
{
    import std.conv, std.stdio, std.format, std.utf;
    // enum TOP = 0x10_FFFF; // exhaustive range
    enum TOP = 0x1_0000; // limit the range somewhat for everyday tests
    void testSet(Char, Set)(Set set)
    {
        static int abs(int a){ return a > 0 ? a : -a; }
        auto tab = tinyUtfBst!Char(set);
        for(dchar ch=0; ch<=TOP; ch++)
        {
            Char[4/Char.sizeof] buf;
            static if(Char.sizeof == 2)
            {
                if(ch >= 0xD800 && ch <= 0xDFFF)
                    continue; // skip surrogates - can't encode them in UTF-16
            }
            auto sz = encodeNoCheck_(buf, ch);
            auto r = tab(buf[]); // length >= 4 cases
            auto r2 = tab(buf[0..sz]); // tail cases
            auto r3 = tab(buf[0..sz].byUTF!Char);
            assert(r.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r2.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r3.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r2.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r3.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
        }
    }
    void testRange(Char)(uint[] range...)
    {
        return testSet!Char(CodepointSet(range));
    }
    // trivial sanity checks
    testRange!char('0', '9', 'a', 'z', 'Ф', 'Я');
    testRange!char(0x0, 0x10);
    testRange!char(0x80, 0x100);
    // walk between break-points
    testRange!char(0x0, 0x7F, 0x801, 0x3FFF, 0x4001, 0x7FFF, 0x8001, 0x10_FFFF);
    // across break-points
    testRange!char(0x7F, 0x81, 0x3FFF, 0x4010, 0x7FF0, 0x10_FFFF);
    // special case - hits the padding of the second segment
    testRange!char(0x0, 0x80, 0x100, 0x702, 0x800, 0x801);
    // some more tests
    testRange!char(0x0, 0x80, 0x100, 0x702, 0x810, 0x3FFF, 0x4010, 0x10_FFFF);
    testSet!char(unicode.L);
    testSet!char(unicode.L.inverted);

    testRange!wchar('A', 'Z', 'a', 'z', 'й', 'я');
    testRange!wchar(0, 0xFFFF);
    testRange!wchar(0xF000, 0x1_0000, 0x1_0000, 0x09_0000);
    testRange!wchar(0, 0xD800, 0xDC00, 0xFEFF);
    testRange!wchar(0xD7FF, 0xD800, 0xD800, 0xD801, 0xDBFF, 0xDC01, 0xE000, 0x1FFFF);

    testRange!dchar(0x0, 0x80, 0x100, 0x702, 0x810, 0x3FFF, 0x4010, 0x10_FFFF);
    testRange!dchar(0xD7FF, 0xD800, 0xD800, 0xD801, 0xDBFF, 0xDC01, 0xE000, 0x1FFFF);
}

//a range of code units, packed with an index to speed up forward iteration
package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc
@@ -5302,11 +6040,14 @@ unittest
    import std.algorithm;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    auto tbst16 = tinyUtfBst!wchar(unicode.L);
    auto tbst8 = tinyUtfBst!char(unicode.L);
    //decode failure cases UTF-8
-    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
+    alias fails8 = AliasSeq!("\xC1", "\xC0\xC0", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach(msg; fails8){
        assert(tbst8(msg).broken, format("%( %2x %)", cast(ubyte[])msg));
        assert(collectException((){
            auto s = msg;
            import std.utf;

@@ -5318,6 +6059,7 @@ unittest
    //decode failure cases UTF-16
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach(msg; fails16){
        assert(tbst16(msg.map!(x=>cast(wchar)x)).broken, format("%( %2x %)", cast(ushort[])msg));
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar)x);
            utf16.test(s);