//Written in the D programming language

/*
    Implementation of a concept "NFA in a word" which is
    bit-parallel implementation of regex where each bit represents
    a state in an NFA. Execution is Thompson-style achieved via bit tricks.

    There is a great number of limitations including not tracking any state (captures)
    and not supporting even basic assertions such as ^, $ or \b.
*/
module std.regex.internal.bitnfa;

package(std.regex):

import std.regex.internal.ir;

debug(std_regex_bitnfa) import std.stdio;
import std.algorithm;

// Hash table mapping uint keys to uint values.
// Collisions are resolved by probing consecutive slots (with wrap-around).
// The table length is either 0 or a power of two (it starts at 4 and doubles),
// which is what lets the slot be computed with a mask instead of a modulo.
// The top bit of the stored key doubles as the "slot is occupied" flag, hence
// only the low 31 bits of a key are preserved.
// NOTE(review): keys with bit 31 set would alias the key without it - confirm
// that callers never pass such keys.
struct HashTab
{
pure:
    @disable this(this);

    // Returns the value stored under `key`.
    // The key must be present: lookup probes until it finds the key and
    // does not stop at free slots.
    uint opIndex()(uint key) const
    {
        auto p = locateExisting(key, table);
        assert(p.occupied);
        return p.value;
    }

    // `key in table` - true if a value was stored under `key`.
    bool opBinaryRight(string op:"in")(uint key) const
    {
        // A table that was never written to has no slots at all; probing it
        // would mask with (0 - 1) and index past the end of an empty array.
        if (table.length == 0)
            return false;
        auto p = locate(key, table);
        return p.occupied;
    }

    // `table[key] = value` - inserts a new entry or overwrites an existing one.
    void opIndexAssign(uint value, uint key)
    {
        if (table.length == 0)
            grow();
        auto p = locate(key, table);
        if (!p.occupied)
        {
            items++;
            // keep the load factor below 3/4
            if (4 * items >= table.length * 3)
            {
                grow();
                // the old slot pointer refers to the previous array
                p = locate(key, table);
            }
            p.key_ = key;
            p.setOccupied();
        }
        p.value = value;
    }

    // All stored keys, in slot order (same order as `values`).
    auto keys() const
    {
        import std.array : appender;
        auto app = appender!(uint[])();
        foreach (i, v; table)
        {
            if (v.occupied)
                app.put(v.key);
        }
        return app.data;
    }

    // All stored values, in slot order (same order as `keys`).
    auto values() const
    {
        import std.array : appender;
        auto app = appender!(uint[])();
        foreach (i, v; table)
        {
            if (v.occupied)
                app.put(v.value);
        }
        return app.data;
    }

private:
    // Folds the higher bits of `val` into the low ones used for slot selection.
    static uint hashOf()(uint val)
    {
        return (val >> 20) ^ (val>>8) ^ val;
    }

    struct Node
    {
    pure:
        uint key_;   // bit 31: occupied flag, bits 0..30: the key
        uint value;

        @property uint key()() const { return key_ & 0x7fff_ffff; }

        @property bool occupied()() const { return (key_ & 0x8000_0000) != 0; }

        void setOccupied(){ key_ |= 0x8000_0000; }
    }

    Node[] table;
    size_t items; // number of occupied slots

    // Finds the slot of a key that is known to be present.
    // Compares against the raw stored key (flag bit included), so free slots
    // never match; does not terminate if the key is absent.
    static N* locateExisting(N)(uint key, N[] table)
    {
        size_t slot = hashOf(key) & (table.length-1);
        key |= 0x8000_0000;
        while (table[slot].key_ != key)
        {
            slot += 1;
            if (slot == table.length)
                slot = 0;
        }
        return table.ptr + slot;
    }

    // Finds either the slot holding `key` or the first free slot
    // in its probe sequence. `table` must not be empty.
    static N* locate(N)(uint key, N[] table)
    {
        size_t slot = hashOf(key) & (table.length-1);
        while (table[slot].occupied)
        {
            if (table[slot].key == key)
                break;
            slot += 1;
            if (slot == table.length)
                slot = 0;
        }
        return table.ptr + slot;
    }

    // Doubles the table (or allocates the initial 4 slots) and re-inserts
    // every occupied node at its position in the new array.
    void grow()
    {
        Node[] newTable = new Node[table.length ? table.length*2 : 4];
        foreach (i, v; table)
        {
            if (v.occupied)
            {
                auto p = locate(v.key, newTable);
                *p = v;
            }
        }
        table = newTable;
    }
}

unittest
{
    HashTab tab;
    // membership test on a table that was never written to
    assert(!(3 in tab));
    tab[3] = 1;
    tab[7] = 2;
    tab[11] = 3;
    assert(tab[3] == 1);
    assert(tab[7] == 2);
    assert(tab[11] == 3);
    assert(3 in tab);
    assert(!(5 in tab));
}

// Specialized 2-level trie of uint masks for BitNfa.
// Uses the concept of CoW: a page gets modified in place
// if the block's ref-count is 1, else a new block is allocated
// and ref count is decreased
struct UIntTrie2
{
pure:
    ushort[] index;       // pages --> blocks
    ushort[] refCounts;   // ref counts for each block
    uint[]   hashes;      // hashes of blocks
    uint[]   blocks;      // linear array with blocks
    uint[]   scratch;     // temporary block
    enum blockBits = 8;   // size of block in bits
    enum blockSize = 1<<blockBits; // number of entries per block

    // NOTE(review): the source text between the `blockSize` initializer and
    // the body of `opIndex` was lost in extraction. `hash`, `opCall` and the
    // signature of `opIndex` below are reconstructed from their visible call
    // sites (`hash(scratch)`, `UIntTrie2()`, `trie[ch]`) and from the unittest,
    // which requires every entry to start out as all ones.
    // Verify against the upstream file before merging.

    // Hash of a block's contents; only used to pre-filter candidates
    // before a full comparison in setPageRange.
    static uint hash(const(uint)[] data)
    {
        uint h = 5381;
        foreach (v; data)
            h = h * 33 + v;
        return h;
    }

    // `UIntTrie2()` - a trie where every code point maps to uint.max:
    // all pages share block 0, which is filled with ones.
    static UIntTrie2 opCall()
    {
        // NOTE(review): page count assumed to cover 21-bit code points - confirm.
        enum pages = (1 << 21) / blockSize;
        UIntTrie2 ut;
        ut.index = new ushort[pages]; // zero-initialized: every page --> block 0
        ut.blocks = new uint[blockSize];
        ut.blocks[] = uint.max;
        ut.scratch = new uint[blockSize];
        ut.refCounts = [cast(ushort) pages];
        ut.hashes = [hash(ut.blocks)];
        return ut;
    }

    // Mask stored for code point `ch`.
    uint opIndex(dchar ch)
    {
        immutable blk = index[ch>>blockBits];
        return blocks.ptr[blk*blockSize + (ch & (blockSize-1))];
    }

    // Applies `entry op= val` to entries [low, high).
    // The whole range must lie within a single page (the one containing `low`).
    void setPageRange(string op)(uint val, uint low, uint high)
    {
        immutable blk = index[low>>blockBits];
        if (refCounts[blk] == 1) // modify in-place
        {
            // hashes[blk] is not refreshed here; a stale hash can only make
            // the search below miss a match, the full comparison still decides.
            immutable lowIdx = blk*blockSize + (low & (blockSize-1));
            immutable highIdx = high - low + lowIdx;
            mixin("blocks[lowIdx..highIdx] "~op~"= val;");
        }
        else
        {
            // create a new page
            refCounts[blk]--;
            immutable lowIdx = low & (blockSize-1);
            immutable highIdx = high - low + lowIdx;
            scratch[] = blocks[blk*blockSize..(blk+1)*blockSize];
            mixin("scratch[lowIdx..highIdx] "~op~"= val;");
            uint h = hash(scratch);
            bool found = false;
            foreach (i,x; hashes)
            {
                if (x != h)
                    continue;
                if (scratch[] == blocks[i*blockSize .. (i+1)*blockSize])
                {
                    // re-route to existing page
                    index[low>>blockBits] = cast(ushort)i;
                    refCounts[i]++; // inc refs
                    found = true;
                    break;
                }
            }
            if (!found)
            {
                // no identical block yet - append the scratch copy as a new one
                index[low>>blockBits] = cast(ushort)hashes.length;
                blocks ~= scratch[];
                refCounts ~= 1;
                hashes ~= h;
            }
        }
    }

    // `trie[ch] op= val`
    void opIndexOpAssign(string op)(uint val, dchar ch)
    {
        setPageRange!op(val, ch, ch+1);
    }

    // `trie[start..end] op= val` - applies the operation to every entry
    // in [start, end) and to nothing else.
    void opSliceOpAssign(string op)(uint val, uint start, uint end)
    {
        assert(start <= end);
        if (start == end)
            return;
        immutable uint startBlk = start >> blockBits;
        immutable uint endBlk = end >> blockBits;
        if (startBlk == endBlk)
        {
            setPageRange!op(val, start, end);
            return;
        }
        // head: from `start` up to the end of its block
        setPageRange!op(val, start, (startBlk+1)*blockSize);
        // whole blocks strictly between the first and the last one; starting
        // at startBlk itself would also touch [startBlk*blockSize, start)
        foreach (blk; startBlk+1 .. endBlk)
            setPageRange!op(val, blk*blockSize, (blk+1)*blockSize);
        // tail: the partial last block, empty when `end` is block-aligned
        immutable uint tailStart = endBlk*blockSize;
        if (tailStart != end)
            setPageRange!op(val, tailStart, end);
    }
}

unittest
{
    UIntTrie2 trie = UIntTrie2();
    trie['d'] &= 3;
    assert(trie['d'] == 3);
    trie['\u0280'] &= 1;
    assert(trie['\u0280'] == 1);
    import std.uni;
    UIntTrie2 trie2 = UIntTrie2();
    auto letters = unicode("L");
    foreach (r; letters.byInterval)
        trie2[r.a..r.b] &= 1;
    foreach (ch; letters.byCodepoint)
        assert(trie2[ch] == 1);
    auto space = unicode("WhiteSpace");
    auto trie3 = UIntTrie2();
    foreach (r; space.byInterval)
        trie3[r.a..r.b] &= 2;
    foreach (ch; space.byCodepoint)
        assert(trie3[ch] == 2);

    // a slice that starts mid-block and crosses block boundaries
    // must not touch entries in front of `start`
    auto trie4 = UIntTrie2();
    trie4[0x1F0 .. 0x310] &= 1;
    assert(trie4['\u0100'] == uint.max);
    assert(trie4['\u01EF'] == uint.max);
    assert(trie4['\u01F0'] == 1);
    assert(trie4['\u0200'] == 1);
    assert(trie4['\u02FF'] == 1);
    assert(trie4['\u030F'] == 1);
    assert(trie4['\u0310'] == uint.max);

    // same with a block-aligned end
    auto trie5 = UIntTrie2();
    trie5[0x1F0 .. 0x200] &= 1;
    assert(trie5['\u01EF'] == uint.max);
    assert(trie5['\u01FF'] == 1);
    assert(trie5['\u0200'] == uint.max);
}

// Since there is no way to mark a starting position
// we need 2 instances of BitNfa: one to find the end, and the other
// to run backwards to find the start.
struct BitNfa { pure: uint[128] asciiTab; // state mask for ascii characters UIntTrie2 uniTab; // state mask for unicode characters HashTab controlFlow; // maps each bit pattern to resulting jumps pattern uint controlFlowMask; // masks all control flow bits uint finalMask; // marks final states terminating the NFA uint length; // if this engine is empty @property bool empty() const { return length == 0; } void combineControlFlow() { uint[] keys = controlFlow.keys; uint[] values = controlFlow.values; auto selection = new bool[keys.length]; bool nextChoice() { uint i; for (i=0;i %d %s", j, ir[j].mnemonic); paths.push(j+IRL!Option); //writefln(">> %d", j+IRL!Option); j = j + ir[j].data + IRL!Option; } break; case GotoEndOr: paths.push(j+IRL!GotoEndOr+ir[j].data); break; case OrEnd, Wordboundary, Notwordboundary, Bof, Bol, Eol, Eof, Nop, GroupStart, GroupEnd: paths.push(j+ir[j].length); break; case LookaheadStart, NeglookaheadStart, LookbehindStart, NeglookbehindStart: paths.push(j + IRL!LookaheadStart + ir[j].data + IRL!LookaheadEnd); break; case InfiniteStart, InfiniteQStart: paths.push(j+IRL!InfiniteStart); paths.push(j+IRL!InfiniteStart+ir[j].data+IRL!InfiniteEnd); break; case InfiniteBloomStart: paths.push(j+IRL!InfiniteStart); paths.push(j+IRL!InfiniteBloomStart+ir[j].data+IRL!InfiniteBloomEnd); break; case InfiniteEnd, InfiniteQEnd: paths.push(j-ir[j].data); paths.push(j+IRL!InfiniteEnd); break; case InfiniteBloomEnd: paths.push(j-ir[j].data); paths.push(j+IRL!InfiniteBloomEnd); break; default: result ~= j; } } return result; } this(Char)(auto ref Regex!Char re) { asciiTab[] = uint.max; // all ones uniTab = UIntTrie2(); controlFlow[0] = 0; // pc -> bit number uint[] bitMapping = new uint[re.ir.length]; uint bitCount = 0, nesting=0, lastNonnested=0; with(re) outer: for (uint i=0; i