phobos/std/regex/internal/bitnfa.d

//Written in the D programming language
/*
    Implementation of a concept "NFA in a word" which is
    bit-parallel impementation of regex where each bit represents
    a state in an NFA. Execution is Thompson-style achieved via bit tricks.

    There is a great number of limitations inlcuding not tracking any state (captures)
    and not supporting even basic assertions such as ^, $  or \b.
*/
module std.regex.internal.bitnfa;

package(std.regex):

import std.regex.internal.ir;

debug(std_regex_bitnfa) import std.stdio;
import std.algorithm;


struct HashTab
{
    @disable this(this);

    uint opIndex()(uint key)
    {
        auto p = locate(key, table);
        assert(p.occupied);
        return p.value;
    }

    void opIndexAssign(uint value, uint key)
    {
        if(table.length == 0) grow();
        auto p = locate(key, table);
        if(!p.occupied)
        {
            items++;
            if(4*items >= table.length*3)
            {
                grow();
                p = locate(key, table);
            }
            p.key_ = key;
            p.setOccupied();
        }
        p.value = value;
    }

    auto keys()
    {
        auto app = appender!(uint[])();
        foreach(i, v; table)
        {
            if(v.occupied)
                app.put(v.key);
        }
        return app.data;
    }

    auto values()
    {
        auto app = appender!(uint[])();
        foreach(i, v; table)
        {
            if(v.occupied)
                app.put(v.value);
        }
        return app.data;
    }

private:
    static uint hashOf()(uint val)
    {
        return (val >> 20) ^ (val>>8) ^ val;
    }

    struct Node
    {
        uint key_;
        uint value;
        @property uint key()(){ return key_ & 0x7fff_ffff; }
        @property bool occupied()(){ return (key_ & 0x8000_0000) != 0; }
        void setOccupied(){ key_ |= 0x8000_0000; }

    }
    Node[] table;
    size_t items;

    static Node* locate()(uint key, Node[] table)
    {
        size_t slot = hashOf(key) & (table.length-1);
        while(table[slot].occupied)
        {
            if(table[slot].key == key)
                break;
            slot += 1;
            if(slot == table.length)
                slot = 0;
        }
        return table.ptr+slot;
    }

    void grow()
    {
        Node[] newTable = new Node[table.length ? table.length*2 : 4];
        foreach(i, v; table)
        {
            if(v.occupied)
            {
                auto p = locate(v.key, newTable);
                *p = v;
            }
        }
        table = newTable;
    }
}


// Specialized 2-level trie of uint masks for BitNfa.
// Uses the concept of CoW: a page gets modified in place
// if the block's ref-count is 1, else a newblock is allocated
// and ref count is decreased
struct UIntTrie2
{
    ushort[] index;                       // pages --> blocks
    ushort[] refCounts;                   // ref counts for each block
    uint[]   hashes;                      // hashes of blocks
    uint[]   blocks;                      // linear array with blocks
    uint[]   scratch;                     // temporary block
    enum     blockBits = 8;               // size of block in bits
    enum     blockSize = 1<<blockBits;    // size of block


    static uint hash(uint[] data)
    {
        uint h = 5183;
        foreach(v; data)
        {
            h = 31*h + v;
        }
        return h;
    }

    static UIntTrie2 opCall()
    {
        UIntTrie2 ut;
        ut.index.length = 2<<13;
        ut.blocks = new uint[blockSize];
        ut.blocks[] = uint.max; // all ones
        ut.scratch = new uint[blockSize];
        ut.refCounts = new ushort[1];
        ut.refCounts[0] = 2<<13;
        ut.hashes = new uint[1];
        ut.hashes[0] = hash(ut.blocks);
        return ut;
    }

    uint opIndex(dchar ch)
    {
        immutable blk = index[ch>>blockBits];
        //writeln(">blk = ", blk);
        return blocks.ptr[blk*blockSize + (ch & (blockSize-1))];
    }

    void setPageRange(string op)(uint val, uint low, uint high)
    {
        immutable blk = index[low>>blockBits];
        //writeln("<blk = ", blk);
        if(refCounts[blk] == 1) // modify in-place
        {
            immutable lowIdx = blk*blockSize + (low & (blockSize-1));
            immutable highIdx = high - low + lowIdx;
            mixin("blocks[lowIdx..highIdx] "~op~"= val;");
        }
        else
        {
            // create a new page
            refCounts[blk]--;
            immutable lowIdx = low & (blockSize-1);
            immutable highIdx = high - low + lowIdx;
            scratch[] = blocks[blk*blockSize..(blk+1)*blockSize];
            mixin("scratch[lowIdx..highIdx] "~op~"= val;");
            uint h = hash(scratch);
            bool found = false;
            foreach(i,x; hashes)
            {
                if(x != h) continue;
                if(scratch[] == blocks[i*blockSize .. (i+1)*blockSize])
                {
                    // re-route to existing page
                    index[low>>blockBits] = cast(ushort)i;
                    refCounts[i]++; // inc refs
                    found = true;
                    break;
                }
            }
            if(!found)
            {
                index[low>>blockBits] = cast(ushort)hashes.length;
                blocks ~= scratch[];
                refCounts ~= 1;
                hashes ~= h;
            }
        }
    }

    void opIndexOpAssign(string op)(uint val, dchar ch)
    {
        setPageRange!op(val, ch, ch+1);
    }

    void opSliceOpAssign(string op)(uint val, uint start, uint end)
    {
        uint startBlk  = start >> blockBits;
        uint endBlk = end >> blockBits;
        uint first = min(startBlk*blockSize+blockSize, end);
        setPageRange!op(val, start, first);
        foreach(blk; startBlk..endBlk)
            setPageRange!op(val, blk*blockSize, (blk+1)*blockSize);
        if(first != end)
        {
            setPageRange!op(val, endBlk*blockSize, end);
        }
    }
}

unittest
{
    UIntTrie2 trie = UIntTrie2();
    trie['d'] &= 3;
    assert(trie['d'] == 3);
    trie['\u0280'] &= 1;
    assert(trie['\u0280'] == 1);
    import std.uni;
    UIntTrie2 trie2 = UIntTrie2();
    auto letters = unicode("L");
    foreach(r; letters.byInterval)
        trie2[r.a..r.b] &= 1;
    foreach(ch; letters.byCodepoint)
        assert(trie2[ch] == 1);
    auto space = unicode("WhiteSpace");
    auto trie3 = UIntTrie2();
    foreach(r; space.byInterval)
        trie3[r.a..r.b] &= 2;
    foreach(ch; space.byCodepoint)
        assert(trie3[ch] == 2);
}

// Since there is no way to mark a starting position
// we need 2 instances of BitNfa: one to find the end, and the other
// to run backwards to find the start.
struct BitNfa
{
    uint[128]   asciiTab;         // state mask for ascii characters
    UIntTrie2   uniTab;           // state mask for unicode characters
    HashTab     controlFlow;      // maps each bit pattern to resulting jumps pattern
    uint        controlFlowMask;  // masks all control flow bits
    uint        finalMask;        // marks final states terminating the NFA
    uint        length;            // if this engine is empty

    @property bool empty() const { return length == 0; }

    void combineControlFlow()
    {
        uint[] keys = controlFlow.keys;
        uint[] values = controlFlow.values;
        auto selection = new bool[keys.length];
        bool nextChoice()
        {
            uint i;
            for(i=0;i<selection.length; i++)
            {
                selection[i] ^= true;
                if(selection[i])
                    break;
            }
            return i != selection.length;
        }
        // first prepare full mask
        foreach(k; keys) controlFlowMask |= k;
        // next set all combinations in cf
        while(nextChoice())
        {
            uint kmask = 0, vmask = 0;
            foreach(i,v; selection)
                if(v)
                {
                    kmask |= keys[i];
                    vmask |= values[i];
                }
            controlFlow[kmask] = vmask;
        }
    }

    uint[] collectControlFlow(Bytecode[] ir, uint i)
    {
        uint[] result;
        Stack!uint paths;
        paths.push(i);
        while(!paths.empty())
        {
            uint j = paths.pop();
            switch(ir[j].code) with(IR)
            {
            case OrStart:
                j += IRL!OrStart;
                assert(ir[j].code == Option);
                while(ir[j].code == Option)
                {
                    //import std.stdio;
                    //writefln("> %d %s", j, ir[j].mnemonic);
                    paths.push(j+IRL!Option);
                    //writefln(">> %d", j+IRL!Option);
                    j = j + ir[j].data + IRL!Option;
                }
                break;
            case GotoEndOr:
                paths.push(j+IRL!GotoEndOr+ir[j].data);
                break;
            case OrEnd, Wordboundary, Notwordboundary, Bol, Eol, Nop, GroupStart, GroupEnd:
                paths.push(j+ir[j].length);
                break;
            case LookaheadStart, NeglookaheadStart, LookbehindStart,
                NeglookbehindStart:
                paths.push(j + IRL!LookaheadStart + ir[j].data + IRL!LookaheadEnd);
                break;
            case InfiniteStart, InfiniteQStart:
                paths.push(j+IRL!InfiniteStart);
                paths.push(j+ir[j].data+IRL!InfiniteEnd);
                break;
            case InfiniteBloomStart:
                paths.push(j+IRL!InfiniteStart);
                paths.push(j+ir[j].data+IRL!InfiniteBloomEnd);
                break;
            case InfiniteEnd, InfiniteQEnd:
                paths.push(j-ir[j].data);
                paths.push(j+IRL!InfiniteEnd);
                break;
            case InfiniteBloomEnd:
                paths.push(j-ir[j].data);
                paths.push(j+IRL!InfiniteBloomEnd);
                break;
            default:
                result ~= j;
            }
        }
        return result;
    }

    this(Char)(auto ref Regex!Char re)
    {
        asciiTab[] = uint.max; // all ones
        uniTab = UIntTrie2();
        controlFlow[0] = 0;
        // pc -> bit number
        uint[] bitMapping = new uint[re.ir.length];
        uint bitCount = 0, nesting=0, lastNonnested=0;
        with(re)
outer:  for(uint i=0; i<ir.length; i += ir[i].length) with(IR)
        {
            if(nesting == 0) lastNonnested = i;
            if(ir[i].isStart) nesting++;
            if(ir[i].isEnd) nesting--;
            switch(ir[i].code)
            {
            case Option, OrEnd, Nop, Bol,
            GroupStart, GroupEnd,
            Eol, Wordboundary, Notwordboundary:
                bitMapping[i] = bitCount;
                break;
            // skipover complex assertions
            case LookaheadStart, NeglookaheadStart, LookbehindStart,
                NeglookbehindStart:
                bitMapping[i] = bitCount;
                nesting--;
                i += IRL!LookbehindStart + ir[i].data; // IRL end gets skiped by 'for'
                break;
            // unsupported instructions
            case RepeatStart, RepeatQStart, Backref:
                bitMapping[i] = bitCount;
                break outer;
            case OrChar:
                uint s = ir[i].sequence;
                for(uint j=i; j<i+s; j++)
                    bitMapping[j] = bitCount;
                i += (s-1)*IRL!OrChar;
                bitCount++;
                if(bitCount == 32)
                    break outer;
                break;
            default:
                bitMapping[i] = bitCount++;
                if(bitCount == 32)
                    break outer;
            }
        }
        debug(std_regex_bitnfa) writeln("LEN:", lastNonnested);
        // the total processable length
        finalMask |= 1u<<bitMapping[lastNonnested];
        length = lastNonnested;
        with(re)
        for(uint i=0; i<length; i += ir[i].length)
        {
            switch(ir[i].code) with (IR)
            {
            case OrStart,GotoEndOr, InfiniteStart,
            InfiniteBloomStart, InfiniteBloomEnd,
            InfiniteEnd, InfiniteQEnd, InfiniteQStart:
                // collect stops across all paths
                auto rets = collectControlFlow(ir, i);
                uint mask = 0;
                debug(std_regex_bitnfa) writeln(rets);
                foreach(pc; rets) mask |= 1u<<bitMapping[pc];
                // map this individual c-f to all possible stops
                controlFlow[1u<<bitMapping[i]] = mask;
                break;
            case Option, OrEnd, Nop, Bol,
                GroupStart, GroupEnd,
                Eol, Wordboundary, Notwordboundary:
                break;
            case LookaheadStart, NeglookaheadStart, LookbehindStart,
                NeglookbehindStart:
                i += IRL!LookaheadStart + ir[i].data;
                break;
            case End:
                finalMask |= 1u<<bitMapping[i];
                break;
            case Any:
                uint mask = 1u<<bitMapping[i];
                asciiTab[0..0x80] &= ~mask;
                uniTab[0..0x11_0000] &= ~mask;
                break;
            case Char:
                uint mask = 1u<<bitMapping[i];
                auto ch = ir[i].data;
                //import std.stdio;
                //writefln("Char %c - %b", cast(dchar)ch, mask);
                if(ch < 0x80)
                    asciiTab[ch] &= ~mask;
                else
                    uniTab[ch] &= ~mask;
                break;
            case OrChar:
                uint s = ir[i].sequence;
                for(size_t j=i; j<i+s; j++)
                {
                    uint mask = 1u<<bitMapping[i];
                    auto ch = ir[j].data;
                    //import std.stdio;
                    //writefln("OrChar %c - %b", cast(dchar)ch, mask);
                    if(ch < 0x80)
                        asciiTab[ch] &= ~mask;
                    else
                        uniTab[ch] &= ~mask;
                }
                i += s-1;
                break;
            case CodepointSet, Trie:
                auto cset = charsets[ir[i].data];
                uint mask = 1u<<bitMapping[i];
                foreach(ival; cset.byInterval)
                {
                    if(ival.b < 0x80)
                        asciiTab[ival.a..ival.b] &= ~mask;
                    else
                    {
                        if(ival.a < 0x80)
                            asciiTab[ival.a..0x80] &= ~mask;
                        uniTab[ival.a..ival.b] &= ~mask;
                    }
                }
                break;
            default:
                assert(0, "Unexpected instruction in BitNFA: "~ir[i].mnemonic);
            }
        }
        length += re.ir[lastNonnested].length;
        combineControlFlow();
    }

    bool opCall(Input)(ref Input r)
    {
        bool matched = false;
        size_t mIdx = 0;
        dchar ch;
        size_t idx;
        uint word = ~0u;
        for(;;)
        {
            word <<= 1; // shift - create a state
            // cfMask has 1 for each control-flow op
            uint cflow = ~word  & controlFlowMask;
            word = word | controlFlowMask; // kill cflow
            word &= ~controlFlow[cflow]; // map normal ops
            debug(std_regex_bitnfa) writefln("%b %b %b %b", word, finalMask, cflow, controlFlowMask);
            if((word & finalMask) != finalMask)
            {
                matched = true; // keep running to see if there is longer match
                mIdx = r._index;
            }
            else if(matched)
                break;
            if(!r.nextChar(ch, idx))
                break;
            // mask away failing states
            if(ch < 0x80)
                word |= asciiTab[ch];
            else
                word |= uniTab[ch];
        }
        if(matched)
        {
            r.reset(mIdx);
        }
        return matched;
    }
}

final class BitMatcher(Char) : Kickstart!(Char)
    if(is(Char : dchar))
{
@trusted:
    BitNfa forward, backward;

    this()(auto ref Regex!Char re)
    {
        forward = BitNfa(re);
        //reverse Bytecode
        auto re2 = re;
        re2.ir = re2.ir.dup;
        // keep the end where it belongs
        uint len = forward.length - 1;
        reverseBytecode(re2.ir[0..len]);
        // check for the case of multiple patterns as one alternation
        if(len == re2.ir.length-IRL!(IR.End))
        with(IR) with(re2) if(ir[0].code == OrStart)
        {
            size_t pc = IRL!OrStart;
            while(ir[pc].code == Option)
            {
                size_t size = ir[pc].data;
                if(ir[pc+size-IRL!GotoEndOr].code == GotoEndOr)
                    size -= IRL!GotoEndOr;
                size_t j = pc + IRL!Option;
                if(ir[j].code == End)
                {
                    auto save = ir[j];
                    foreach(k; j+1..j+size)
                        ir[k-1] = ir[k];
                    ir[j+size-1] = save;
                }
                pc = j + ir[pc].data;
            }
        }
        backward = BitNfa(re2);
    }

    final bool opCall(ref Input!Char r)
    {
        bool res = forward(r);
        if(res){
            auto back = r.loopBack(r._index);
            assert(backward(back));
            r.reset(back._index);
        }
        return res;
    }

    final @property bool empty() const{ return forward.empty; }
}

version(unittest)
{
    template check(alias make)
    {
        private void check(T)(string input, T re, size_t idx=uint.max)
        {
            import std.regex, std.conv;
            import std.stdio;
            auto rex = regex(re);
            auto m = make(rex);
            auto s = Input!char(input);
            assert(m(s), "Failed "~input~" with "~to!string(re));
            assert(s._index == idx || (idx ==uint.max && s._index == input.length));
        }
    }

    template checkFail(alias make)
    {
        private void checkFail(T)(string input, T re, size_t idx=uint.max)
        {
            import std.regex, std.conv;
            import std.stdio;
            auto rex = regex(re);
            auto m = make(rex);
            auto s = Input!char(input);
            assert(!m(s), "Should have failed "~input~" with "~to!string(re));
            assert(s._index == idx || (idx ==uint.max && s._index == input.length));
        }
    }

    alias checkBit = check!BitNfa;
    alias checkBitFail = checkFail!BitNfa;
    auto makeMatcher(Char)(Regex!Char regex){ return new BitMatcher!(Char)(regex); }
    alias checkM = check!makeMatcher;
    alias checkMFail = checkFail!makeMatcher;
}

unittest
{
    "xabcd".checkBit("abc", 4);
    "xabbbcdyy".checkBit("a[b-c]*c", 6);
    "abc1".checkBit("([a-zA-Z_0-9]*)1");
    "abbabc".checkBit("(a|b)*",5);
    "abd".checkBitFail("abc");
    // check truncation
    "0123456789_0123456789_0123456789_012"
        .checkBit("0123456789_0123456789_0123456789_0123456789", 31);
    "0123456789_0123456789_0123456789_012"
        .checkBit("0123456789(0123456789_0123456789_0123456789_0123456789|01234)",10);
    // assertions ignored
    "0abc1".checkBit("(?<![0-9])[a-c]*$", 4);
    // stop on repetition
    "abcdef1".checkBit("a[a-z]{5}", 1);
    "ads@email.com".checkBit(`\S+@\S+`);
    "abc@email.com".checkBit(`\S+@\S?1`, 4);
}

unittest
{
    "xxabcy".checkM("abc", 2);
    "_10bcy".checkM([`\d+`, `[a-z]+`], 1);
    "abc@email.com".checkM(`\S+@\S?1`, 0);
}