mirror of
https://github.com/dlang/phobos.git
New primitive - Tiny UTF Binary Search Table
This builds on the notion of a UTF word: a 32-bit integer that holds a UTF-encoded codepoint. The top non-zero byte is the UTF-8 starter, so the word is big-endian when read directly from strings; the same applies to a UTF-16 word. It also introduces a simpler API for matching, UtfLookup, which packs into one 32-bit machine word:
- the length of the code point matched
- the result of the match (true/false, i.e. whether it belongs to this set)
- a bad-encoding flag
This struct is going to be reused for all of the "directly on UTF" matchers in std.uni.
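To make the packing concrete, here is a minimal sketch (illustration only, not part of the commit) of the UTF word for a two-byte UTF-8 sequence and of the bit layout UtfLookup packs its result into; the bit positions mirror the MASK/MATCH/BROKEN constants introduced in the patch below.

    // 'я' (U+044F) is 0xD1 0x8F in UTF-8; reading it big-endian gives the UTF word 0xD18F
    unittest
    {
        string s = "я";
        uint word = (cast(uint) s[0] << 8) | s[1];
        assert(word == 0xD18F);

        // UtfLookup layout: bits 0..3 = stride, bit 4 = matched, bit 5 = broken encoding
        uint packed = 2 | (1 << 4);        // stride 2, matched, well-formed
        assert((packed & 0xF) == 2);       // stride
        assert((packed & (1 << 4)) != 0);  // matched
        assert((packed & (1 << 5)) == 0);  // not broken
    }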
parent 06506e407a
commit aba3fe84a3
1 changed file with 744 additions and 2 deletions
std/uni.d (746 lines changed)
@@ -74,6 +74,13 @@
    significant performance improvements. See $(LREF MatcherConcept) for
    the common interface of UTF matchers.
    )
    $(LI
    For even more flexibility in character classification without decoding
    there is $(LREF tinyUtfBst). TinyUTF tables achieve a much smaller footprint
    than a Matcher (~ 8 bytes * number of intervals) while still operating on UTF
    directly, avoiding decoding. Each match operation is $(BIGOH log(N)),
    where N is the number of intervals in the given codepoint set.
    )
    $(LI
    Generally useful building blocks for customized normalization:
    $(LREF combiningClass) for querying combining class
@@ -4868,7 +4875,6 @@ template Utf16Matcher()
            import std.utf;
            throw new UTFException("Invalid UTF-16 sequence");
        }

        // 1-stage ASCII
        alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
        // 2-stage BMP
@@ -5134,6 +5140,738 @@ public auto utfMatcher(Char, Set)(Set set) @trusted
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}

// helper for assert/debug prints
string utfWordAsHex()(uint val, size_t sz=0)
{
    import std.algorithm : reverse;
    import core.bitop;
    import std.format;
    char[] res;
    if(sz == 0)
        sz = (bsr(val)+8)/8;
    while(sz)
    {
        res ~= format("%x", val & 0xFF);
        val >>= 8;
        sz--;
        if(sz)
            res ~= "_";
    }
    return res.idup;
}

// just encode as UTF-8, don't try to validate
size_t encodeNoCheck_(ref char[4] buf, dchar c) @safe pure nothrow
{
    if (c <= 0x7F)
    {
        buf[0] = cast(char)c;
        return 1;
    }
    if (c <= 0x7FF)
    {
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = cast(char)(0x80 | (c & 0x3F));
        return 2;
    }
    if (c <= 0x3FFF)
    {
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[2] = cast(char)(0x80 | (c & 0x3F));
        return 3;
    }
    // +1 on top of the usual dchar range so as to allow half-open intervals [A,B)
    assert(c <= 0x11_0000);
    buf[0] = cast(char)(0xF0 | (c >> 18));
    buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[3] = cast(char)(0x80 | (c & 0x3F));
    return 4;
}

// just encode as UTF-16, don't try to validate
size_t encodeNoCheck_(ref wchar[2] buf, dchar c) @safe pure nothrow
{
    if(c < 0xD800 || (c >= 0xE000 && c < 0x1_0000))
    {
        buf[0] = cast(wchar)c;
        return 1;
    }
    else
    {
        buf[0] = cast(wchar)(((c - 0x1_0000) >> 10) + 0xD800);
        buf[1] = cast(wchar)(((c - 0x1_0000) & 0x3FF) + 0xDC00);
        return 2;
    }
}

// compatible interface for dchars
size_t encodeNoCheck_(ref dchar[1] buf, dchar c) @safe pure nothrow
{
    buf[0] = c;
    return 1;
}

// internal helper
static T bigEndian_(T)(T val) pure nothrow
{
    version(LittleEndian)
    {
        import core.bitop;
        static if(T.sizeof == 2)
            return cast(ushort)(val << 8) | cast(ushort)(val >> 8);
        else static if(T.sizeof == 4)
            return bswap(val); // works for uints
    }
    else
        return val;
}

// Load a UTF word from a string-ish range.
// The UTF code units are represented as a 32-bit integer
// where the highest non-zero byte is the first (starter) code unit.
uint toUtfWord(size_t sz, Range)(Range buf) pure nothrow
if(is(Range : const(char)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : char)))
{
    pragma(inline, true);
    static if(isArray!Range) // can generalize if there is .ptr but not Array
    {
        static if(sz == 1)
            return buf[0];
        else static if(sz == 2)
            return bigEndian_(*cast(ushort*)buf.ptr);
        else static if(sz == 3)
        {
            auto val = bigEndian_(*cast(ushort*)buf.ptr) << 8;
            val |= cast(uint)buf[2]; // add the last 8 bits
            return val;
        }
        else
            return bigEndian_(*cast(uint*)buf.ptr);
    }
    else // don't have .ptr
    {
        static if(sz == 1)
            return buf[0];
        else static if(sz == 2)
            return (cast(uint)buf[0]<<8) | buf[1];
        else static if(sz == 3)
            return (cast(uint)buf[0]<<16) | (cast(uint)buf[1]<<8) | buf[2];
        else
            return (cast(uint)buf[0]<<24) | (cast(uint)buf[1]<<16) | (cast(uint)buf[2]<<8) | buf[3];
    }
}

// same for UTF-16; the resulting words (UTF-16 vs UTF-8) are (of course) not interchangeable
uint toUtf16Word(Range)(Range buf) pure nothrow
if(is(Range : const(wchar)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : wchar)))
{
    immutable c = buf[0];
    if(c < 0xD800)
        return c;
    else if(c <= 0xFFFF)
        return c + 0x1_0000;
    else // simply take as is
    {
        static if(isArray!Range) // can generalize if there is .ptr but not Array
            return bigEndian_(*cast(uint*)buf.ptr);
        else
            return (cast(uint)c<<16) | buf[1];
    }
}

// ditto, for an already decoded codepoint
uint toUtf16Word()(dchar ch) pure nothrow
{
    wchar[2] buf;
    auto sz = encodeNoCheck_(buf, ch);
    return toUtf16Word(buf[]);
}

// clean UTF-32 codepoint from a UTF-8 word
dchar fromUtfWord(uint val) @safe pure nothrow
{
    // fast path goes first - ASCII
    if(val <= 0xFF)
        return val;
    else if(val <= 0xFF_FF)
        return ((val & 0x1F_00)>>2) | (val & 0x3F);
    else if(val <= 0xFF_FF_FF)
        return ((val & 0x0F_00_00)>>4) | ((val & 0x3F_00)>>2) | (val & 0x3F);
    else // 4 bytes
        return ((val & 0x07_00_00_00)>>6) | ((val & 0x3F_00_00)>>4) |
            ((val & 0x3F_00)>>2) | (val & 0x3F);
}

unittest
{
    import std.format;
    // enum STEP = 1; // exhaustive but too expensive
    enum STEP = 7; // some small prime as the step
    for(dchar ch=0; ch < 0x11_0000; ch+=STEP)
    {
        char[4] buf;
        uint val = toUtfWord(buf[], encodeNoCheck_(buf, ch));
        assert(fromUtfWord(val) == ch, format("0x%x - buf %(0x%x, %) uw %s", ch, buf, utfWordAsHex(val)));
    }
}

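// Illustrative sketch (not part of this commit): the bit shuffling above worked
// through by hand for U+044F ('я'), whose UTF-8 bytes are 0xD1 0x8F. The UTF word
// is 0xD1_8F; masking off the 110..... / 10...... markers and re-joining the
// payload bits (5 from the starter, 6 from the continuation byte) restores the codepoint.
unittest
{
    enum uint word = 0xD18F;
    enum uint decoded = ((word & 0x1F_00) >> 2) | (word & 0x3F);
    static assert(decoded == 0x44F);
}
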
// size in bytes of the UTF sequence in this UTF word
uint utfWordSize(uint utfWord) @safe pure nothrow
{
    import core.bitop;
    return utfWord ? (bsr(utfWord)+8)/8 : 1;
}

// ditto with the size as a run-time parameter
uint toUtfWord(Range)(Range r, size_t sz)
{
    switch(sz)
    {
        foreach(c; AliasSeq!(1,2,3,4))
        {
        case c:
            return toUtfWord!c(r);
        }
    default:
        assert(0);
    }
}

// another overload for convenience
uint toUtfWord(dchar ch) pure nothrow
{
    char[4] buf;
    size_t sz = encodeNoCheck_(buf, ch);
    return toUtfWord(buf[], sz);
}

// Slice up the codepoint interval [a,b) into jagged UTF word intervals
struct UtfChunks
{
    uint[8] chunks;
    size_t len=0;
    //
    this(Interval)(Interval i)
    {
        uint start = toUtfWord(i.a);
        uint end = toUtfWord(i.b);
        chunks[len++] = start;
        foreach(brk; AliasSeq!(0x80, 0x800, 0x4000))
        {
            if(i.a < brk && i.b >= brk)
            {
                chunks[len++] = toUtfWord(brk-1)+1; // encode the max and add 1
                chunks[len++] = toUtfWord(brk);
            }
        }
        chunks[len++] = end;
        assert(len % 2 == 0);
    }
    //
    uint[] opIndex() return
    {
        return chunks[0..len];
    }
}

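// Illustrative sketch (not part of this commit): how UtfChunks slices an interval
// that crosses the 1-byte/2-byte UTF-8 boundary at 0x80. ['0', 0x100) becomes two
// [start, end) pairs of UTF words, one per encoding length: an ASCII pair and a
// 2-byte pair (U+0080 is 0xC2 0x80, U+0100 is 0xC4 0x80). CodepointInterval is the
// (a, b) pair type std.uni already uses for byInterval.
unittest
{
    auto chunks = UtfChunks(CodepointInterval('0', 0x100));
    assert(chunks[] == [0x30u, 0x80, 0xC2_80, 0xC4_80]);
}
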
/++
    Encapsulates the result of matching a range of UTF code units against
    some UTF matcher, e.g. $(LREF TinyUtfBst) or $(LREF UtfMatcher).

    Packed in a single machine word, this struct contains info about the length
    of the codepoint tested, whether there was a match and whether the UTF
    encoding was broken.
+/
struct UtfLookup
{
    enum {
        MASK = 0xF,
        MATCH_BIT = 4,
        MATCH = 1<<MATCH_BIT,
        BROKEN = MATCH<<1,
        BROKEN_BIT = MATCH_BIT+1
    }
    private uint word;
    const pure nothrow @safe @nogc:
    /++
        Length of the codepoint tested, in code units of the UTF range.

        See also: $(XREF utf, stride).
    +/
    @property uint stride()(){ return word & MASK; }
    /// True if the codepoint was recognized by the matcher.
    @property bool matched()(){ return (word & MATCH) != 0; }
    /++
        Indicates if the encoding of the UTF range was broken.
        In such cases $(D stride) always equals 1 and $(D matched) is false.
    +/
    @property bool broken()(){ return (word & BROKEN) != 0; }
    /// Short-hand for $(D matched), to use in conditional statements.
    bool opCast(T:bool)(){ return matched; }
}

mixin template SegmentedTable(Char)
{
    @property auto badEncoding()(){ return UtfLookup(1 | UtfLookup.BROKEN); }
    // get the proper section of the array by UTF-8 code length
    @property auto segment(size_t sz)()
    {
        static if(sz == 1)
        {
            static if(!is(typeof(segs))) // single segment
                return store;
            else
                return store[0..segs[0].start]; // up to the start of the 2nd segment
        }
        else static if(sz > 1)
        {
            auto ptr = cast(Value!sz*)(store.ptr + segs[sz-2].start);
            return ptr[0..segs[sz-2].len]; // .len is measured in Values
        }
    }

    // val is a UTF word
    UtfLookup lookup(size_t sz)(uint val)
    {
        pragma(inline, true);
        import std.stdio;
        // import std.range : assumeSorted;
        auto seg = segment!sz;
        // UTF-8 specific continuation bit validation
        static if(is(Char : char))
        {
            uint bad1 = 0x80, bad2 = 0x0;
            foreach(i; Sequence!(0, sz-1))
            {
                enum mask1 = 0x80<<(8*i);
                enum mask2 = 0x40<<(8*i);
                bad1 &= (val & mask1)>>(8*i); // 0 if any of the bits is 0
                bad2 |= (val & mask2)>>(8*i); // 1 if any of the bits is 1
            }
            // continuation bytes must match the binary pattern 10......
            // check for a missing leading '1'
            bad1 = (~bad1 & 0x80) >> (7 - UtfLookup.BROKEN_BIT);
            // check for a '1' where the '0' should be
            bad2 = bad2 >> (6 - UtfLookup.BROKEN_BIT);
            immutable bad = bad1 | bad2;
        }
        else
        {
            enum bad = 0;
        }
        // the trick is:
        // if the lower bound is even, all intervals below val are closed pairs [A,B)
        // else we are inside of some [A,B)
        uint ret = sharLowerBound!"a<=b"(seg, cast(Value!sz)val) & 1;
        // ret == 1 - matched
        return UtfLookup(sz | (ret<<UtfLookup.MATCH_BIT) | bad);
    }
}

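// Illustrative sketch (not part of this commit): the even/odd trick used in
// lookup() above. Interval bounds are stored flat and sorted as A1, B1, A2, B2, ...
// and a value belongs to the set iff the number of bounds <= v is odd, i.e. v fell
// inside some half-open [A, B) pair. Shown here with std.range.assumeSorted in
// place of the branch-optimized lower-bound helper the real code calls.
unittest
{
    import std.range : assumeSorted;
    uint[] bounds = [0x30, 0x3A, 0x61, 0x7B]; // ['0'..'9'] and ['a'..'z'] as [A, B) pairs
    bool inSet(uint v)
    {
        auto cnt = bounds.assumeSorted.lowerBound(v + 1).length; // bounds <= v
        return (cnt & 1) == 1;
    }
    assert(inSet('5') && inSet('z'));
    assert(!inSet(' ') && !inSet('@'));
}
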
/+
    A Tiny Binary Search Table for decode-less codepoint range lookup.
    Breaks ranges up into 4 segments - one per UTF encoding length.
+/
// nullary template so that it is inlined into the user's code and not just dead weight in phobos.lib
struct TinyUtf8BST()
{
private:
    ubyte[] store; // all arrays in one, aligned to 1-2-4-4 bytes respectively
    alias SegLen = uint; // may try ushort to pack it tighter
    struct Segment
    {
        SegLen start; // offset in bytes from the start of the store
        SegLen len; // in 2 or 4 byte increments
    }
    Segment[3] segs; // for the 2-byte, 3-byte and 4-byte segments
    template Value(size_t sz)
    {
        static if(sz == 1)
            alias Value = ubyte;
        else static if(sz == 2)
            alias Value = ushort;
        else
            alias Value = uint;
    }
    mixin SegmentedTable!char; // reusable table lookup logic

    // From a range of [a,b) intervals
    this(Range)(Range pairs)
    {
        bool[3] seenSize; // whether we have seen 2, 3, 4 byte code points
        // encode and store with the given size
        void append(uint utfWord)
        {
            uint sz = utfWordSize(utfWord);
            if(sz == 1)
            {
                store ~= cast(ubyte)utfWord;
                return;
            }
            static union Place
            {
                uint val;
                ubyte[4] bytes;
            }
            Place p;
            p.val = utfWord;
            immutable padSize = 1<<(sz+1)/2; // 2 -> 2, 3-4 -> 4
            immutable mask = padSize - 1;
            if(!seenSize[sz-2])
            {
                // first of the sz-sized code points
                seenSize[sz-2] = true;
                auto rem = store.length & mask;
                if(rem) // need to align
                {
                    foreach(_; 0..padSize-rem)
                        store ~= 0;
                    segs[sz-2].start = cast(SegLen)store.length;
                }
                // mark the start of this segment
                segs[sz-2].start = cast(SegLen)store.length;
            }
            version(LittleEndian)
            {
                foreach(i; 0..padSize)
                    store ~= p.bytes[i];
            }
            else // TODO: test on BigEndian iron
            {
                foreach(i; 0..padSize)
                    store ~= p.bytes[sz-1-i];
            }
        }
        foreach(v; pairs)
        {
            auto chunks = UtfChunks(v);
            auto arr = chunks[];
            for(size_t i=0; i<arr.length; i+=2)
            {
                append(arr[i]);
                append(arr[i+1]);
            }
        }
        size_t total = store.length;
        foreach_reverse(i; 0..segs.length)
        {
            if(seenSize[i])
            {
                segs[i].len = cast(SegLen)(total - segs[i].start);
                total -= segs[i].len;
            }
        }
        // this sets the length of the first segment as well,
        // in case we don't have a segment of size 2;
        // trim the padding of the first (implicit) segment
        while(total && store[total-1] == 0) total--;
        segs[0].start = cast(SegLen)total;
        // scale to Value!sz size
        segs[0].len /= 2;
        segs[0].len &= ~1; // trim padding for the second segment
        segs[1].len /= 4;
        segs[2].len /= 4;
    }

    // slow path that assumes 1-element cases are already handled
    UtfLookup longMatch(Range)(Range r)
    {
        pragma(inline, false); // no point, it's a slow path
        immutable c = r[0];
        UtfLookup ret = badEncoding(); // default on fallthrough
        if(r.length >= 4) // start/middle of input - fast path
        {
            if(!(c & 0x20))
                ret = lookup!2(toUtfWord!2(r));
            else if(!(c & 0x10))
                ret = lookup!3(toUtfWord!3(r));
            else if(!(c & 0x08))
                ret = lookup!4(toUtfWord!4(r));
        }
        else // handle the tail of the input - an even slower (and rarer) path
        {
            switch(r.length)
            {
            case 3:
                if(!(c & 0x10))
                    ret = lookup!3(toUtfWord!3(r));
                break;
            case 2:
                if(!(c & 0x20))
                    ret = lookup!2(toUtfWord!2(r));
                break;
            default:
                break; // fall through - bad encoding
            }
        }
        return ret;
    }

    /++
        Match against the front of the $(D r) range of char. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(char)[]) || (isRandomAccessRange!Range && is(ElementEncodingType!Range : char)))
    {
        pragma(inline, true);
        immutable c = r[0];
        // no UTF validation - just test where the zero is
        if(c < 0x80) // fast path for ASCII
        {
            return lookup!1(c);
        }
        else
            return longMatch(r);
    }
}

// Analog of TinyUtf8BST for UTF-16
struct TinyUtf16BST()
{
private:
    ushort[] store; // all arrays in one, aligned to 1-2 ushorts respectively
    alias SegLen = uint; // may try ushort to pack it tighter
    struct Segment
    {
        SegLen start; // offset in ushorts from the start of the store
        SegLen len; // in ushort or uint increments
    }
    Segment[1] segs; // for the uint-sized portion
    template Value(size_t sz)
    {
        static if(sz == 1)
            alias Value = ushort;
        else
            alias Value = uint;
    }
    mixin SegmentedTable!wchar; // reusable table lookup logic

    this(Range)(Range pairs)
    {
        bool seenLong = false;
        void appendLong(dchar val)
        {
            if(!seenLong)
            {
                seenLong = true;
                // pad to uint size
                store.length += store.length % 4;
                segs[0].start = cast(SegLen)store.length;
            }
            union Place
            {
                ushort[2] us;
                uint ui;
            }
            Place place;
            place.ui = toUtf16Word(val);
            store ~= place.us[];
        }
        foreach(i; pairs)
        {
            if(i.b <= 0xFFFF)
            {
                store ~= cast(ushort)i.a;
                store ~= cast(ushort)i.b;
            }
            else if(i.a > 0xFFFF) // both are beyond 2 bytes
            {
                appendLong(i.a);
                appendLong(i.b);
            }
            else // cross-cutting: a < 16 bit, b > 16 bit
            {
                store ~= cast(ushort)i.a;
                store ~= cast(ushort)0xFFFF;
                appendLong(0xFFFF);
                appendLong(i.b);
            }
        }
        if(!seenLong)
            segs[0].start = cast(SegLen)store.length;
    }

    // assumes 1-element cases below the surrogates are already handled
    auto longMatch(Range)(Range r)
    {
        pragma(inline, false); // slow path - don't inline
        immutable c = r[0];
        auto ret = badEncoding();
        if(r.length >= 2)
        {
            immutable c2 = r[1];
            if(c2 >= 0xDC00 && c2 <= 0xDFFF)
                return lookup!2(toUtf16Word(r));
        }
        else
        {
            if(c >= 0xE000)
                return lookup!2(toUtf16Word(r)); // [0xE000, 0x1_0000)
        }
        return ret;
    }

    /++
        Match against the front of the $(D r) range of wchar. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(wchar)[]) ||
        (isRandomAccessRange!Range && is(ElementEncodingType!Range : wchar)))
    {
        pragma(inline, true);
        immutable c = r[0];
        // fast path for the BMP outside of the surrogate area
        if(c < 0xD800 || (c > 0xDFFF && c <= 0xFFFF))
            return lookup!1(c);
        else
            return longMatch(r);
    }
}

// Analog of TinyUtf8BST for plain UTF-32
struct TinyUtf32BST()
{
private:
    uint[] store; // a plain sorted array searched by bisection
    alias Value(size_t sz) = uint; // always 32 bit
    mixin SegmentedTable!dchar; // reusable table lookup logic

    this(Range)(Range pairs)
    {
        store.length = pairs.length * 2;
        size_t idx = 0;
        foreach(p; pairs)
        {
            store[idx] = p.a;
            store[idx+1] = p.b;
            idx += 2;
        }
    }

    /++
        Match against the front of the $(D r) range of dchar. This doesn't throw
        on bad encoding. Instead the result includes flags for the match
        and broken encoding along with the codepoint length (stride).

        Returns:
        $(LREF UtfLookup) with the results of the test against this matcher.
    +/
    public auto opCall(Range)(Range r)
    if(is(Range : const(dchar)[]) ||
        (isForwardRange!Range && is(ElementEncodingType!Range : dchar)))
    {
        pragma(inline, true);
        return lookup!1(r.front);
    }
}

/++
    Get the UTF-8, UTF-16 or UTF-32 Tiny UTF BST type
    suitable for the passed-in $(D Char) parameter.

    Use $(LREF tinyUtfBst) to construct these functors.

    See also: $(LREF UtfLookup).
+/
public template TinyUtfBst(Char)
{
    static if(is(Char : char))
        alias TinyUtfBst = TinyUtf8BST!();
    else static if(is(Char : wchar))
        alias TinyUtfBst = TinyUtf16BST!();
    else static if(is(Char : dchar))
        alias TinyUtfBst = TinyUtf32BST!();
    else
        static assert(0, "Only built-in character types are supported.");
}

/++
    Convenience factory function to create a $(LREF TinyUtfBst)
    for $(D Char) from any $(D CodepointSet).

    Returns:
    A $(LREF TinyUtfBst) matcher for the given set,
    applicable as a callable to any range of $(D Char),
    including built-in strings.

    See also: $(LREF UtfLookup).
+/
public auto tinyUtfBst(Char, CS)(CS set)
if(isCodepointSet!CS)
{
    return TinyUtfBst!Char(set.byInterval);
}

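// Illustrative usage sketch (not part of this commit): classify a UTF-8 string
// without decoding it, advancing by the stride reported in each UtfLookup result.
unittest
{
    auto letters = tinyUtfBst!char(unicode.L);
    string s = "aд1"; // 'д' is a 2-byte (Cyrillic) letter, '1' is not a letter
    uint[] strides;
    bool[] matches;
    for(size_t i = 0; i < s.length; )
    {
        auto res = letters(s[i .. $]);
        assert(!res.broken);
        strides ~= res.stride;
        matches ~= res.matched;
        i += res.stride;
    }
    assert(strides == [1u, 2, 1]);
    assert(matches == [true, true, false]);
}
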
unittest
{
    import std.conv, std.stdio, std.format, std.utf;
    // enum TOP = 0x10_FFFF; // exhaustive range
    enum TOP = 0x1_0000; // limit the range somewhat for everyday tests
    void testSet(Char, Set)(Set set)
    {
        static int abs(int a){ return a > 0 ? a : -a; }
        auto tab = tinyUtfBst!Char(set);
        for(dchar ch=0; ch<=TOP; ch++)
        {
            Char[4/Char.sizeof] buf;
            static if(Char.sizeof == 2)
            {
                if(ch >= 0xD800 && ch <= 0xDFFF)
                    continue; // skip surrogates - can't encode them in UTF-16
            }
            auto sz = encodeNoCheck_(buf, ch);
            auto r = tab(buf[]); // length >= 4 cases
            auto r2 = tab(buf[0..sz]); // tail cases
            auto r3 = tab(buf[0..sz].byUTF!Char);
            assert(r.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r2.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r3.stride == sz, format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r2.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
            assert(r3.matched == set[ch], format("%s 0x%x - %s", Char.stringof, ch, r));
        }
    }
    void testRange(Char)(uint[] range...)
    {
        return testSet!Char(CodepointSet(range));
    }
    // trivial sanity checks
    testRange!char('0', '9', 'a', 'z', 'Ф', 'Я');
    testRange!char(0x0, 0x10);
    testRange!char(0x80, 0x100);
    // walk between break-points
    testRange!char(0x0, 0x7F, 0x801, 0x3FFF, 0x4001, 0x7FFF, 0x8001, 0x10_FFFF);
    // across break-points
    testRange!char(0x7F, 0x81, 0x3FFF, 0x4010, 0x7FF0, 0x10_FFFF);
    // special case - hits the padding of the second segment
    testRange!char(0x0, 0x80, 0x100, 0x702, 0x800, 0x801);
    // some more tests
    testRange!char(0x0, 0x80, 0x100, 0x702, 0x810, 0x3FFF, 0x4010, 0x10_FFFF);
    testSet!char(unicode.L);
    testSet!char(unicode.L.inverted);

    testRange!wchar('A', 'Z', 'a', 'z', 'й', 'я');
    testRange!wchar(0, 0xFFFF);
    testRange!wchar(0xF000, 0x1_0000, 0x1_0000, 0x09_0000);
    testRange!wchar(0, 0xD800, 0xDC00, 0xFEFF);
    testRange!wchar(0xD7FF, 0xD800, 0xD800, 0xD801, 0xDBFF, 0xDC01, 0xE000, 0x1FFFF);

    testRange!dchar(0x0, 0x80, 0x100, 0x702, 0x810, 0x3FFF, 0x4010, 0x10_FFFF);
    testRange!dchar(0xD7FF, 0xD800, 0xD800, 0xD801, 0xDBFF, 0xDC01, 0xE000, 0x1FFFF);
}

//a range of code units, packed with an index to speed up forward iteration
package auto decoder(C)(C[] s, size_t offset=0) @safe pure nothrow @nogc
@@ -5302,11 +6040,14 @@ unittest
    import std.algorithm;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    auto tbst16 = tinyUtfBst!wchar(unicode.L);
    auto tbst8 = tinyUtfBst!char(unicode.L);
    //decode failure cases UTF-8
-    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
+    alias fails8 = AliasSeq!("\xC1", "\xC0\xC0", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach(msg; fails8){
        assert(tbst8(msg).broken, format("%( %2x %)", cast(ubyte[])msg));
        assert(collectException((){
            auto s = msg;
            import std.utf;

@@ -5318,6 +6059,7 @@ unittest
    //decode failure cases UTF-16
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach(msg; fails16){
        assert(tbst16(msg.map!(x=>cast(wchar)x)).broken, format("%( %2x %)", cast(ushort[])msg));
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar)x);
            utf16.test(s);