phobos/std/regex/internal/ir.d

/*
    Implementation of std.regex IR, an intermediate representation
    of a regular expression pattern.

    This is a common ground between frontend regex component (parser)
    and backend components - generators, matchers and other "filters".
*/
module std.regex.internal.ir;

package(std.regex):

import std.exception, std.meta, std.range.primitives, std.traits, std.uni;

debug(std_regex_parser) import std.stdio;
// just a common trait, may be moved elsewhere
// Unqualified code-unit type of a range (ElementEncodingType with qualifiers stripped).
alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);

// Bounds of a private-use code point range (U+F0000 .. U+FFFFD).
// NOTE(review): how the regex engine uses this range is not visible in this file.
enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';

// heuristic value determines maximum CodepointSet length suitable for linear search
enum maxCharsetUsed = 6;

// another variable to tweak behavior of caching generated Tries for character classes
// (getMatcher flushes matcherCache once it holds this many entries; 0 disables caching)
enum maxCachedMatchers = 8;

// Trie type and its factory function used by CharMatcher, instantiated with (13, 8).
alias Trie = CodepointSetTrie!(13, 8);
alias makeTrie = codepointSetTrie!(13, 8);

// Cache of matchers keyed by the character set they were built from; see getMatcher.
CharMatcher[CodepointSet] matcherCache;

// Returns a CharMatcher for `set`, going through matcherCache.
// During CTFE, or when caching is disabled (maxCachedMatchers == 0),
// a fresh matcher is built on every call and nothing is cached.
@trusted CharMatcher getMatcher(CodepointSet set)
{
    // almost all properties of AA are not @safe
    // https://issues.dlang.org/show_bug.cgi?id=6357
    if (__ctfe || maxCachedMatchers == 0)
        return CharMatcher(set);
    else
    {
        if (auto cached = set in matcherCache)
            return *cached;
        // the cache is full: drop every cached matcher before inserting the new one
        if (matcherCache.length == maxCachedMatchers)
            matcherCache = null;
        auto fresh = CharMatcher(set);
        matcherCache[set] = fresh;
        return fresh;
    }
}

// Returns (by reference) a shared immutable CharMatcher built from `wordCharacter`.
// The matcher is a function-local `static immutable`, so it is initialized once
// from a compile-time evaluated initializer rather than on each call.
@property ref wordMatcher()()
{
    static immutable CharMatcher matcher = CharMatcher(wordCharacter);
    return matcher;
}

// some special Unicode white space characters:
// NEL = U+0085, LS = U+2028, PS = U+2029
// (all three are treated as line terminators by endOfLine/startOfLine)
private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';

//Regular expression engine/parser options:
// global - search all non-overlapping matches in input
// casefold - case insensitive matching, do casefolding on match in unicode mode
// freeform - ignore whitespace in pattern, to match space use [ ] or \s
// multiline - switch ^, $ to detect start and end of lines instead of just start and end of input
// nonunicode, singleline - NOTE(review): not described by the original comment;
//   their flag letters are 'U' and 's' respectively (see RegexOptionNames)
enum RegexOption: uint {
    global = 0x1,
    casefold = 0x2,
    freeform = 0x4,
    nonunicode = 0x8,
    multiline = 0x10,
    singleline = 0x20
}
//do not reorder this list
// (regexOptionsToString pairs the i-th member of RegexOption with the i-th letter here)
alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
// option bits must stay below RegexInfo.oneShot (0x80), which shares the same flags word
static assert( RegexOption.max < 0x80);

// Converts a set of RegexOption bits into its flag-letter string (e.g. "gi").
// Bits outside the RegexOption range are ignored; an empty string is returned
// when no known option bit is set. Letters appear in RegexOption declaration order.
package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
{
    // mask covering every bit up to and including the highest option
    enum uint knownBits = (RegexOption.max << 1) - 1;
    flags &= knownBits;
    if (flags == 0)
        return "";

    // at most one letter per option, so the buffer can never overflow
    char[RegexOptionNames.length] letters = void;
    size_t used = 0;
    foreach (idx, memberName; __traits(allMembers, RegexOption))
    {
        if (flags & __traits(getMember, RegexOption, memberName))
        {
            letters[used] = RegexOptionNames[idx];
            ++used;
        }
    }
    return letters[0 .. used].idup;
}

// flags that guide the execution of the engine; stored in the same `flags`
// word as RegexOption bits (oneShot is set by Regex.checkIfOneShot)
enum RegexInfo : uint { oneShot = 0x80 }

// IR bit pattern: 0b1_xxxxx_yy
// where yy indicates class of instruction, xxxxx for actual operation code
//     00: atom, a normal instruction
//     01: open, opening of a group, has length of contained IR in the low bits
//     10: close, closing of a group, has length of contained IR in the low bits
//     11 unused
//
// Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
// Possible changes:
//* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
//* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
//  (like lookaround), or make it easier to identify hotspots.

// Opcodes of the regex intermediate representation.
// The two low bits select the instruction class (00 atom, 01 group start,
// 10 group end); a start and its matching end differ only in those two bits,
// which is what pairedIR relies on.
enum IR:uint {
    Char              = 0b1_00000_00, //a character
    Any               = 0b1_00001_00, //any character
    CodepointSet      = 0b1_00010_00, //the most generic CodepointSet [...]
    Trie              = 0b1_00011_00, //CodepointSet implemented as Trie
    //match with any of a consecutive OrChar's in this sequence
    //(used for case insensitive match)
    //OrChar holds in upper two bits of data total number of OrChars in this _sequence_
    //the drawback of this representation is that it is difficult
    // to detect a jump in the middle of it
    OrChar             = 0b1_00100_00,
    Nop                = 0b1_00101_00, //no operation (padding)
    End                = 0b1_00110_00, //end of program
    Bol                = 0b1_00111_00, //beginning of a line ^
    Eol                = 0b1_01000_00, //end of a line $
    Wordboundary       = 0b1_01001_00, //boundary of a word
    Notwordboundary    = 0b1_01010_00, //not a word boundary
    Backref            = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
    GroupStart         = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
    GroupEnd           = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
    Option             = 0b1_01110_00, //start of an option within an alternation x | y (length)
    GotoEndOr          = 0b1_01111_00, //end of an option (length of the rest)
    Bof                = 0b1_10000_00, //beginning of "file" (string) ^
    Eof                = 0b1_10001_00, //end of "file" (string) $
    //... any additional atoms here

    OrStart            = 0b1_00000_01, //start of alternation group  (length)
    OrEnd              = 0b1_00000_10, //end of the or group (length,mergeIndex)
    //with this instruction order
    //bit mask 0b1_00001_00 could be used to test/set greediness
    InfiniteStart      = 0b1_00001_01, //start of an infinite repetition x* (length)
    InfiniteEnd        = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
    InfiniteQStart     = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
    InfiniteQEnd       = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
    InfiniteBloomStart = 0b1_00011_01, //start of a filtered infinite repetition x* (length)
    InfiniteBloomEnd   = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
    RepeatStart        = 0b1_00100_01, //start of a {n,m} repetition (length)
    RepeatEnd          = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
    RepeatQStart       = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
    RepeatQEnd         = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)

    // lookaround groups; none of their End opcodes has a merge point
    // (hasMerge stops at RepeatQEnd)
    LookaheadStart     = 0b1_00110_01, //begin of the lookahead group (length)
    LookaheadEnd       = 0b1_00110_10, //end of a lookahead group (length)
    NeglookaheadStart  = 0b1_00111_01, //start of a negative lookahead (length)
    NeglookaheadEnd    = 0b1_00111_10, //end of a negative lookahead (length)
    LookbehindStart    = 0b1_01000_01, //start of a lookbehind (length)
    LookbehindEnd      = 0b1_01000_10, //end of a lookbehind (length)
    NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
    NeglookbehindEnd   = 0b1_01001_10, //end of negative lookbehind (length)
}

// Shorthand for IR length: IRL!(code) is the full length of `code`
// (the opcode word plus its immediate parameters), evaluated at compile time.
enum uint IRL(IR code) = lengthOfIR(code);
static assert(IRL!(IR.LookaheadStart) == 3);

// Number of parameter words that follow opcode `i` in the bytecode stream.
// (should be optimized by fixing some IR bits)
int immediateParamsIR(IR i) @safe pure nothrow @nogc
{
    // merge table index only
    if (i == IR.OrEnd || i == IR.InfiniteEnd || i == IR.InfiniteQEnd)
        return 1;
    // bloom filter index + merge table index
    if (i == IR.InfiniteBloomEnd)
        return 2;
    // counted repetitions carry four words
    if (i == IR.RepeatEnd || i == IR.RepeatQEnd)
        return 4;
    // start-end of captures used
    if (i == IR.LookaheadStart || i == IR.NeglookaheadStart
        || i == IR.LookbehindStart || i == IR.NeglookbehindStart)
        return 2;
    // everything else is a single word
    return 0;
}

// Full length of IR instruction `i`: the opcode word itself plus all
// parameter words that follow it.
int lengthOfIR(IR i) @safe pure nothrow @nogc
{
    immutable params = immediateParamsIR(i);
    return params + 1;
}

// Full length (opcode word plus parameters) of the instruction paired with `i`,
// i.e. of the matching end for a start opcode and vice versa.
int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
{
    return lengthOfIR(pairedIR(i));
}

// Whether the operation has a merge point: true for group-end opcodes up to
// and including RepeatQEnd (this relies on the numeric order of the opcodes).
bool hasMerge(IR i) @safe pure nothrow @nogc
{
    return isEndIR(i) && i <= IR.RepeatQEnd;
}

// True when `i` opens a "group" (class bits 01).
bool isStartIR(IR i) @safe pure nothrow @nogc
{
    immutable uint classBits = i & 0b11;
    return classBits == 0b01;
}

// True when `i` ends a "group" (class bits 10).
bool isEndIR(IR i) @safe pure nothrow @nogc
{
    immutable uint classBits = i & 0b11;
    return classBits == 0b10;
}

// True when `i` is a standalone instruction (class bits 00).
bool isAtomIR(IR i) @safe pure nothrow @nogc
{
    immutable uint classBits = i & 0b11;
    return classBits == 0b00;
}

// Produces the opcode paired with `i` by flipping both class bits,
// turning a start (01) into the matching end (10) and vice versa.
// `i` must be a start or an end opcode.
IR pairedIR(IR i) @safe pure nothrow @nogc
{
    assert(isStartIR(i) || isEndIR(i));
    immutable uint swapped = i ^ 0b11;
    return cast(IR) swapped;
}

// Encoded IR instruction: a single 32-bit word.
// Layout as used by the accessors below:
//   bits 24..31 - opcode (IR)
//   bits 22..23 - sequence length minus 2 (OrChar); bit 23 doubles as the
//                 "referenced" / "local ref" flag for groups and backrefs
//   bits  0..21 - data
@safe pure
struct Bytecode
{
    uint raw;
    //natural constraints
    enum maxSequence = 2+4;
    enum maxData = 1 << 22;
    enum maxRaw = 1 << 31;

@safe pure:
    // builds an instruction from an opcode and its data field
    this(IR code, uint data)
    {
        assert(code < 256);
        assert(data < maxData);
        raw = (code << 24) | data;
    }

    // builds an instruction that also records a sequence length (2 .. maxSequence-1)
    this(IR code, uint data, uint seq)
    {
        assert(code < 256);
        assert(data < maxData);
        assert(seq >= 2 && seq < maxSequence);
        raw = (code << 24) | ((seq - 2) << 22) | data;
    }

    //store raw data
    static Bytecode fromRaw(uint data)
    {
        Bytecode result;
        result.raw = data;
        return result;
    }

    // bit twiddling helpers
    // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
    // low 22 bits of the word
    @property uint data()() const
    {
        return raw & 0x003f_ffff;
    }

    // replaces the low 22 bits, leaving opcode and flag bits untouched
    @property void data()(uint val)
    {
        immutable kept = raw & ~0x003f_ffff;
        raw = kept | (val & 0x003f_ffff);
    }

    // sequence length stored in bits 22..23 (biased by 2)
    // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
    @property uint sequence()() const
    {
        return ((raw >> 22) & 0x3) + 2;
    }

    // opcode stored in the top byte
    // 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
    @property IR code()() const
    {
        return cast(IR)(raw >> 24);
    }

    //whether this instruction has a merge point (see hasMerge)
    @property bool hotspot() const { return hasMerge(code); }

    //test the class of this instruction
    @property bool isAtom() const { return isAtomIR(code); }

    //ditto
    @property bool isStart() const { return isStartIR(code); }

    //ditto
    @property bool isEnd() const { return isEndIR(code); }

    //number of arguments for this instruction
    @property int args() const { return immediateParamsIR(code); }

    //mark this GroupStart or GroupEnd as referenced in backreference
    void setBackrefence()
    {
        assert(code == IR.GroupStart || code == IR.GroupEnd);
        raw |= 1 << 23;
    }

    //is referenced
    @property bool backreference() const
    {
        assert(code == IR.GroupStart || code == IR.GroupEnd);
        return (raw & (1 << 23)) != 0;
    }

    //mark as local reference (for backrefs in lookarounds)
    void setLocalRef()
    {
        assert(code == IR.Backref);
        raw |= 1 << 23;
    }

    //is a local ref
    @property bool localRef() const
    {
        assert(code == IR.Backref);
        return (raw & (1 << 23)) != 0;
    }

    //human readable name of instruction
    @trusted @property string mnemonic()() const
    {//@@@BUG@@@ to is @system
        import std.conv : to;
        immutable IR op = code;
        return to!string(op);
    }

    //full length of instruction
    @property uint length() const
    {
        return lengthOfIR(code);
    }

    //full length of respective start/end of this instruction
    @property uint pairedLength() const
    {
        return lengthOfPairedIR(code);
    }

    //returns bytecode of paired instruction (assuming this one is start or end)
    @property Bytecode paired() const
    {//depends on bit and struct layout order
        assert(isStart || isEnd);
        // flip the two class bits of the opcode, which live at bits 24..25
        return Bytecode.fromRaw(raw ^ (0b11 << 24));
    }

    //gets an index into IR block of the respective pair
    uint indexOfPair(uint pc) const
    {
        assert(isStart || isEnd);
        if (isStart)
            return pc + data + length;
        return pc - data - pairedLength;
    }
}

static assert(Bytecode.sizeof == 4);


//index entry structure for name --> number of submatch
struct NamedGroup
{
    string name;  // name of the capture group as written in the pattern
    uint group;   // number of the submatch this name refers to
}

// Holds a pair of start-end markers for a submatch.
// A default-initialized Group has begin == DataIndex.max and
// end == DataIndex.min, i.e. begin > end, which encodes "did not match".
struct Group(DataIndex)
{
    DataIndex begin = DataIndex.max;
    DataIndex end   = DataIndex.min;

    // true when the group holds a (possibly empty) match
    bool opCast(T : bool)() const
    {
        return begin <= end;
    }

    // Returns "(unmatched)" for a group that did not match,
    // otherwise the range formatted as "begin..end".
    @trusted string toString()() const
    {
        // must agree with opCast: a group is unmatched exactly when begin > end
        // (the previous `begin < end` test reported every non-empty match as unmatched)
        if (begin > end)
            return "(unmatched)";
        import std.array : appender;
        import std.format.write : formattedWrite;
        auto a = appender!string();
        formattedWrite(a, "%s..%s", begin, end);
        return a.data;
    }
}

// Debugging tool: renders the instruction at `irb[pc]` as text, i.e. its
// mnemonic followed by the operands relevant to that opcode.
// Params:
//   irb  = the bytecode program
//   pc   = index of the instruction to print
//   dict = optional name table used to label GroupStart/GroupEnd
// Returns: the textual form of the instruction.
@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
{
    import std.array : appender;
    import std.format.write : formattedWrite;
    auto output = appender!string();
    formattedWrite(output,"%s", irb[pc].mnemonic);
    switch (irb[pc].code)
    {
    case IR.Char:
        formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
        break;
    case IR.OrChar:
        formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
        break;
    // the non-eager (Q) variants carry a length exactly like their greedy
    // counterparts, so they are listed here as well
    case IR.RepeatStart, IR.RepeatQStart,
    IR.InfiniteStart, IR.InfiniteQStart, IR.InfiniteBloomStart,
    IR.Option, IR.GotoEndOr, IR.OrStart:
        //forward-jump instructions
        uint len = irb[pc].data;
        formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
        break;
    case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
        uint len = irb[pc].data;
        // parameter words: pc+1 is the merge index (printed as Hotspot below),
        // pc+2 step, pc+3 min, pc+4 max
        formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
            pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
        break;
    case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
        uint len = irb[pc].data;
        formattedWrite(output, " pc=>%u", pc-len);
        break;
    // lookbehind ends store a length the same way lookahead ends do
    case IR.LookaheadEnd, IR.NeglookaheadEnd,
    IR.LookbehindEnd, IR.NeglookbehindEnd: //ditto
        uint len = irb[pc].data;
        formattedWrite(output, " pc=>%u", pc-len);
        break;
    case IR.GroupStart, IR.GroupEnd:
        uint n = irb[pc].data;
        string name;
        // label the group with its name, if the dictionary has one for it
        foreach (v;dict)
            if (v.group == n)
            {
                name = "'"~v.name~"'";
                break;
            }
        formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
                name, n);
        break;
    case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
        uint len = irb[pc].data;
        uint start = irb[pc+1].raw, end = irb[pc+2].raw;
        formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
        break;
    case IR.Backref: case IR.CodepointSet: case IR.Trie:
        uint n = irb[pc].data;
        formattedWrite(output, " %u",  n);
        if (irb[pc].code == IR.Backref)
            formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
        break;
    default://all data-free instructions
    }
    // instructions with a merge point keep its table index in the next word
    if (irb[pc].hotspot)
        formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
    return output.data;
}

// Disassembles the whole chunk: writes one tab-indented line per instruction
// to stdout, stepping by each instruction's full length.
@trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
{
    import std.stdio : writeln;
    uint pc = 0;
    while (pc < slice.length)
    {
        writeln("\t", disassemble(slice, pc, dict));
        pc += slice[pc].length;
    }
}

// Encapsulates memory management, explicit ref counting
// and the exact type of engine created
// there is a single instance per engine combination type x Char
// In future may also maintain a (TLS?) cache of memory
interface MatcherFactory(Char)
{
@safe:
    // creates a new matcher for pattern `re` over `input`
    Matcher!Char create(const ref Regex!Char, in Char[] input) const;
    // creates a copy of matcher `m` working on `input`
    Matcher!Char dup(Matcher!Char m, in Char[] input) const;
    // increments the reference count of `m`, returning the new count
    size_t incRef(Matcher!Char m) const;
    // decrements the reference count of `m`, returning the new count
    size_t decRef(Matcher!Char m) const;
}

// Only memory management, no compile-time vs run-time specialities.
// Each engine lives in a single malloc'ed block: the class instance occupies
// the first `classSize` bytes and the rest is handed to the engine as its
// working memory. Only the instance part is registered with the GC for scanning.
abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
{
    import core.memory : GC, pureFree;
    import std.internal.memory : enforceMalloc;
    // round up to next multiple of size_t for alignment purposes
    enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);

    // Constructs the engine in place at the start of `memory`; supplied by subclasses.
    EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;

    // Allocates and constructs a fresh engine for `re` over `input`.
    // The returned engine starts with a reference count of 1.
    override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(re) + classSize;
        auto memory = enforceMalloc(size)[0 .. size];
        scope(failure) pureFree(memory.ptr);
        GC.addRange(memory.ptr, classSize);
        // scope guards run in reverse order: on failure the range is unregistered
        // first and only then is the block freed, so the GC never keeps a range
        // that points into freed memory
        scope(failure) GC.removeRange(memory.ptr);
        auto engine = construct(re, input, memory);
        assert(engine.refCount == 1);
        assert(cast(void*) engine == memory.ptr);
        return engine;
    }

    // Allocates a new engine for the same pattern as `engine`, working on
    // `input`, and copies the state of `engine` into it via dupTo.
    // The returned copy starts with a reference count of 1.
    override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
    {
        immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
        auto memory = enforceMalloc(size)[0 .. size];
        scope(failure) pureFree(memory.ptr);
        auto copy = construct(engine.pattern, input, memory);
        GC.addRange(memory.ptr, classSize);
        // see create(): unregister the range before the block is freed on failure
        scope(failure) GC.removeRange(memory.ptr);
        engine.dupTo(copy, memory[classSize .. size]);
        assert(copy.refCount == 1);
        return copy;
    }

    // Increments the reference count of `m` and returns the new value.
    override size_t incRef(Matcher!Char m) const
    {
        return ++m.refCount;
    }

    // Decrements the reference count of `m` and returns the new value.
    // When it reaches zero the GC range is removed and the block is freed;
    // `m` must not be used afterwards.
    override size_t decRef(Matcher!Char m) const  @trusted
    {
        assert(m.refCount != 0);
        auto cnt = --m.refCount;
        if (cnt == 0)
        {
            void* ptr = cast(void*) m;
            GC.removeRange(ptr);
            pureFree(ptr);
        }
        return cnt;
    }
}

// A factory for run-time engines: emplaces the engine at the start of the
// supplied block and gives it the remainder as its working memory.
class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
{
    override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
    {
        import core.lifetime : emplace;
        alias Engine = EngineType!Char;
        return emplace!Engine(
            memory[0 .. classSize],     // storage for the class instance
            re,
            Input!Char(input),
            memory[classSize .. $]);    // the rest of the block, handed to the engine
    }
}

// A factory for compile-time engine: same as the run-time factory, but the
// engine additionally receives the address of `func`.
class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
{
    override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
    {
        import core.lifetime : emplace;
        alias Engine = EngineType!Char;
        return emplace!Engine(
            memory[0 .. classSize],     // storage for the class instance
            re,
            &func,
            Input!Char(input),
            memory[classSize .. $]);    // the rest of the block, handed to the engine
    }
}

// A workaround for R-T enum re = regex(...)
// Picks the factory to use for `re`: the backtracking engine when the pattern
// has any back-referenced group, the Thompson engine otherwise.
template defaultFactory(Char)
{
    @property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
    {
        auto defaultFactoryImpl = () {
            import std.regex.internal.backtracking : BacktrackingMatcher;
            import std.regex.internal.thompson : ThompsonMatcher;
            import std.algorithm.searching : canFind;
            // lazily created factory instances, reused on later calls
            static MatcherFactory!Char backtrackingFactory;
            static MatcherFactory!Char thompsonFactory;
            // any non-zero word in the bit array means some group is back-referenced
            if (re.backrefed.canFind!"a != 0")
            {
                if (backtrackingFactory is null)
                    backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
                return backtrackingFactory;
            }
            else
            {
                if (thompsonFactory is null)
                    thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
                return thompsonFactory;
            }
        };

        // this should be faked as pure because the static mutable variables are
        // used to cache the created instance, like memoize
        alias T = typeof(defaultFactoryImpl);
        enum attrs = functionAttributes!T | FunctionAttribute.pure_;
        // cast the delegate to the same type with `pure` added, then call it
        return (() @trusted =>
            (cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) defaultFactoryImpl))()();
    }
}

// Base class of all matching engines.
// Defining it as an interface has the undesired side-effect:
// casting any class to an interface silently adjusts pointer to point to a nested vtbl
// (the factories cast a Matcher to void* and expect the start of the allocation)
abstract class Matcher(Char)
{
abstract:
    // Get a (next) match
    // NOTE(review): the meaning of the int result is defined by the engines, not visible here
    int match(Group!size_t[] matches) pure;
    // This only maintains internal ref-count,
    // deallocation happens inside MatcherFactory
    @property ref size_t refCount() @safe;
    // Copy internal state to another engine, using memory arena 'memory'
    void dupTo(Matcher!Char m, void[] memory);
    // The pattern loaded
    @property ref const(Regex!Char) pattern() @safe;
    // Re-arm the engine with new Input
    Matcher rearm(in Char[] stream);
}

/++
    `Regex` object holds regular expression pattern in compiled form.
    Instances of this object are constructed via calls to `regex`.
    This is an intended form for caching and storage of frequently
    used regular expressions.
+/
struct Regex(Char)
{
    //temporary workaround for identifier lookup
    CodepointSet[] charsets; //
    Bytecode[] ir;      //compiled bytecode of pattern


    // true when no bytecode has been compiled into this object
    @safe @property bool empty() const nothrow {  return ir is null; }

    // Returns a random-access range over the names of the named capture
    // groups, in the order they are stored in `dict`.
    @safe @property auto namedCaptures()
    {
        static struct NamedGroupRange
        {
        private:
            const(NamedGroup)[] groups;
            size_t start;
            size_t end;
        public:
            // view over groups[s .. e]
            this(const(NamedGroup)[] g, size_t s, size_t e)
            {
                assert(s <= e);
                assert(e <= g.length);
                groups = g;
                start = s;
                end = e;
            }

            @property string front() { return groups[start].name; }
            @property string back() { return groups[end-1].name; }
            @property bool empty() { return start >= end; }
            @property size_t length() { return end - start; }
            alias opDollar = length;
            @property NamedGroupRange save()
            {
                return NamedGroupRange(groups, start, end);
            }
            void popFront() { assert(!empty); start++; }
            void popBack() { assert(!empty); end--; }
            // name of the i-th remaining group
            string opIndex()(size_t i)
            {
                assert(start + i < end,
                       "Requested named group is out of range.");
                return groups[start+i].name;
            }
            // sub-range [low, high), relative to the current front
            NamedGroupRange opSlice(size_t low, size_t high) {
                assert(low <= high);
                assert(start + high <= end);
                return NamedGroupRange(groups, start + low, start + high);
            }
            NamedGroupRange opSlice() { return this.save; }
        }
        return NamedGroupRange(dict, 0, dict.length);
    }

package(std.regex):
    import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
    const(NamedGroup)[] dict;              // maps name -> user group number
    uint ngroup;                           // number of internal groups
    uint maxCounterDepth;                  // max depth of nested {n,m} repetitions
    uint hotspotTableSize;                 // number of entries in merge table
    uint threadCount;                      // upper bound on number of Thompson VM threads
    uint flags;                            // global regex flags
    public const(CharMatcher)[]  matchers; // tables that represent character sets
    public const(BitTable)[] filters;      // bloom filters for conditional loops
    uint[] backrefed;                      // bit array of backreferenced submatches
    Kickstart!Char kickstart;
    MatcherFactory!Char factory;           // produces optimal matcher for this pattern
    immutable(Char)[] pattern;             // copy of pattern to serve as cache key

    // The with* helpers below return a copy of this object with one field
    // replaced; the original is left untouched.

    const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
    {
        auto r = cast() this;
        r.factory = factory;
        return r;
    }

    const(Regex) withFlags(uint newFlags) pure const @trusted
    {
        auto r = cast() this;
        r.flags = newFlags;
        return r;
    }

    const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
    {
        auto r = cast() this;
        r.ir = code.dup; // TODO: sidestep const instead?
        return r;
    }

    const(Regex) withNGroup(uint nGroup) pure const @trusted
    {
        auto r = cast() this;
        r.ngroup = nGroup;
        return r;
    }

    //bit access helper: non-zero when submatch `n` is back-referenced,
    //0 otherwise (including when `n` lies beyond the bit array)
    uint isBackref(uint n)
    {
        if (n/32 >= backrefed.length)
            return 0;
        return backrefed[n / 32] & (1 << (n & 31));
    }

    //check if searching is not needed: sets RegexInfo.oneShot when the
    //program reaches a Bof after passing only group markers and other
    //zero-width assertions
    void checkIfOneShot()
    {
    L_CheckLoop:
        for (uint i = 0; i < ir.length; i += ir[i].length)
        {
            switch (ir[i].code)
            {
                case IR.Bof:
                    flags |= RegexInfo.oneShot;
                    break L_CheckLoop;
                case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
                IR.Wordboundary, IR.Notwordboundary:
                    // these consume no input: keep looking
                    break;
                default:
                    break L_CheckLoop;
            }
        }
    }

    //print out disassembly of a program's IR
    @trusted debug(std_regex_parser) void print() const
    {//@@@BUG@@@ write is system
        for (uint i = 0; i < ir.length; i += ir[i].length)
        {
            writefln("%d\t%s ", i, disassemble(ir, i, dict));
        }
        writeln("Total merge table size: ", hotspotTableSize);
        writeln("Max counter nesting depth: ", maxCounterDepth);
    }

    // Renders this object as `Regex!Char("pattern", "flags")`, with the
    // pattern quoted and escaped and the flags given as option letters.
    public string toString()() const
    {
        import std.format : format;
        static if (is(typeof(pattern) : string))
            alias patternString = pattern;
        else
        {
            // wide patterns are transcoded to a UTF-8 string first.
            // `to` is brought in by a selective import, so it has to be called
            // unqualified: neither `conv` nor `std.conv` is a visible name here
            import std.conv : to;
            auto patternString = to!string(pattern);
        }
        auto quotedEscapedPattern = format("%(%s %)", [patternString]);
        auto flagString = regexOptionsToString(flags);
        return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
    }
}

// The stuff below this point is temporarily part of the IR module
// but may need a better place in the future (all internals)
package(std.regex):

// Simple UTF-string abstraction compatible with the stream interface.
// Walks `_origin` forward, decoding one code point at a time starting at `_index`.
struct Input(Char)
if (is(Char :dchar))
{
    import std.utf : decode;
    alias DataIndex = size_t;
    enum bool isLoopback = false;
    alias String = const(Char)[];
    String _origin;
    size_t _index;

    //constructs Input object out of plain string, optionally starting at `idx`
    this(String input, size_t idx = 0)
    {
        _index = idx;
        _origin = input;
    }

    //Stores the current position in `pos`; if input remains, decodes the
    //code point there into `res`, advances past it and returns true
    pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
    {
        pos = _index;
        // DMD's inliner hates multiple return functions
        // but can live with single statement if/else bodies
        bool hasMore = _index != _origin.length;
        if (hasMore)
            res = decode(_origin, _index);
        return hasMore;
    }

    //true once every code unit has been consumed
    @property bool atEnd()
    {
        return _origin.length == _index;
    }

    //jumps to the position found by `kick.search`, then reads a code point from there
    bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
    {
        _index = kick.search(_origin, _index);
        return nextChar(res, pos);
    }

    //index of at End position
    @property size_t lastIndex()
    {
        return _origin.length;
    }

    //support for backtracker engine, might not be present
    void reset(size_t index)
    {
        _index = index;
    }

    //slice of the underlying string
    String opSlice(size_t start, size_t end)
    {
        return _origin[start .. end];
    }

    //a stream over the same string that walks backwards from `index`
    auto loopBack(size_t index)
    {
        return BackLooper!Input(this, index);
    }
}

// Counterpart of Input that walks the same string backwards:
// nextChar yields the code point that ends at `_index` and moves `_index`
// to the start of that code point.
struct BackLooperImpl(Input)
{
    import std.utf : strideBack;
    alias DataIndex = size_t;
    alias String = Input.String;
    enum bool isLoopback = true;
    String _origin;
    size_t _index;

    // starts walking backwards from `index` in the string of `input`
    this(Input input, size_t index)
    {
        _index = index;
        _origin = input._origin;
    }

    // starts walking backwards from the end of `input`
    this(String input)
    {
        _index = input.length;
        _origin = input;
    }

    // Stores the current position in `pos`; returns false at the start of
    // the string, otherwise puts the preceding code point into `res` and steps back.
    @trusted bool nextChar(ref dchar res,ref size_t pos)
    {
        pos = _index;
        if (_index != 0)
        {
            res = _origin[0.._index].back;
            _index -= strideBack(_origin, _index);
            return true;
        }
        return false;
    }

    @property atEnd()
    {
        if (_index == 0)
            return true;
        return _index == strideBack(_origin, _index);
    }

    // a forward Input over the same string, positioned at `index`
    auto loopBack(size_t index)
    {
        return Input(_origin, index);
    }

    //support for backtracker engine, might not be present
    //void reset(size_t index){   _index = index ? index-std.utf.strideBack(_origin, index) : 0;  }
    void reset(size_t index)
    {
        _index = index;
    }

    // the bounds arrive in walking order, so they are swapped for the underlying string
    String opSlice(size_t start, size_t end)
    {
        return _origin[end .. start];
    }

    //index of at End position
    @property size_t lastIndex()
    {
        return 0;
    }
}

// Maps a stream type to the type that walks it in the opposite direction:
// for a BackLooperImpl!U the result is U itself, for any other type E it is
// BackLooperImpl!E. Reversing twice therefore gives back the original type.
template BackLooper(E)
{
    static if (is(E : BackLooperImpl!U, U))
    {
        alias BackLooper = U;
    }
    else
    {
        alias BackLooper = BackLooperImpl!E;
    }
}

//both helpers below are internal, on its own are quite "explosive"
//unsafe, no initialization of elements
// Allocates an array of `len` elements of T with malloc; the caller owns the
// memory and must release it with free. Signals an out-of-memory error when
// the byte size overflows size_t or the allocation fails, instead of handing
// back a slice over null or over a block that is too small.
@system pure T[] mallocArray(T)(size_t len)
{
    import core.checkedint : mulu;
    import core.exception : onOutOfMemoryError;
    import core.memory : pureMalloc;

    bool overflow = false;
    immutable size_t bytes = mulu(len, cast(size_t) T.sizeof, overflow);
    if (overflow)
        onOutOfMemoryError();
    auto p = cast(T*) pureMalloc(bytes);
    // malloc(0) may legitimately return null, so only a failed non-empty request is an error
    if (p is null && bytes != 0)
        onOutOfMemoryError();
    return p[0 .. len];
}

//very unsafe, no initialization
// Carves an array of `len` elements of T off the front of `chunk` and
// advances `chunk` past the bytes taken. The elements are left as they are.
@system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
{
    auto head = cast(T*) chunk.ptr;
    immutable taken = len * T.sizeof;
    chunk = chunk[taken .. $];
    return head[0 .. len];
}

// Looks up the submatch number registered under `name`.
// `dict` is searched with a binary search, so it is expected to be sorted by name.
// Throws (via enforce) with "no submatch named ..." when the name is absent.
@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
{//equal is @system?
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : assumeSorted;

    auto names = map!"a.name"(dict);
    auto sortedNames = assumeSorted!"cmp(a,b) < 0"(names);
    // the number of names ordered before `name` is the index where it would sit
    immutable fnd = sortedNames.lowerBound(name).length;
    immutable bool present = fnd < dict.length && equal(dict[fnd].name, name);
    enforce(present, text("no submatch named ", name));
    return dict[fnd].group;
}

// Whether `front` is one of the Unicode newline sequences.
// '\n' counts only when it does not follow a '\r' (`seenCr` false); for any
// other character a preceding '\r' alone makes the result true.
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
bool endOfLine()(dchar front, bool seenCr)
{
    immutable bool isLf = front == '\n';
    if (isLf != seenCr)
        return true;
    return front == '\r' || front == NEL || front == LS || front == PS;
}

// Mirror image of endOfLine for the character before the current position:
// '\r' counts only when no '\n' was seen after it (`seenNl` false); for any
// other character `seenNl` alone makes the result true.
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
bool startOfLine()(dchar back, bool seenNl)
{
    immutable bool isCr = back == '\r';
    if (isCr != seenNl)
        return true;
    return back == '\n' || back == NEL || back == LS || back == PS;
}

///Exception object thrown in case of errors during regex compilation.
public class RegexException : Exception
{
    // constructors are generated by std.exception.basicExceptionCtors
    mixin basicExceptionCtors;
}

// Simple 128-entry bit-table used with a hash function.
// Every code point is hashed to one of 128 bits, so distinct characters
// may share a bit.
struct BitTable {
    uint[4] filter;

    // marks every code point contained in `set`
    this(CodepointSet set){
        foreach (iv; set.byInterval)
            foreach (v; iv.a .. iv.b)
                add(v);
    }

    // sets the bit that `ch` hashes to
    void add()(dchar ch){
        immutable slot = index(ch);
        immutable word = slot >> 5;
        immutable bit = slot & 31;
        filter[word] |= 1u << bit;
    }

    // non-zero -> might be present, 0 -> absent
    bool opIndex()(dchar ch) const{
        immutable slot = index(ch);
        immutable word = slot >> 5;
        immutable bit = slot & 31;
        return ((filter[word] >> bit) & 1) != 0;
    }

    // hash: folds bits 7 and up of the code point onto its low 7 bits
    static uint index()(dchar ch){
        immutable uint folded = (ch >> 7) ^ ch;
        return folded & 0x7F;
    }
}

// Membership test for a character set: a bit table answers for ASCII,
// a trie answers for everything else.
struct CharMatcher {
    BitTable ascii; // fast path for ASCII
    Trie trie;      // slow path for Unicode

    // builds both lookup structures from `set`
    this(CodepointSet set)
    {
        trie = makeTrie(set);
        // only the ASCII part of the set goes into the bit table
        ascii = BitTable(set & unicode.ASCII);
    }

    // whether `ch` belongs to the set
    bool opIndex()(dchar ch) const
    {
        return ch < 0x80 ? ascii[ch] : trie[ch];
    }
}

// Internal non-resizable array, switches between inline storage and CoW
// POD-only
struct SmallFixedArray(T, uint SMALL=3)
if (!hasElaborateDestructor!T)
{
    import std.internal.memory : enforceMalloc;
    import core.memory : pureFree;
    static struct Payload
    {
        size_t refcount;
        T[0] placeholder;
        inout(T)* ptr() inout { return placeholder.ptr; }
    }
    static assert(Payload.sizeof == size_t.sizeof);
    union
    {
        Payload* big;
        T[SMALL] small;
    }
    size_t _sizeMask;
    enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
    enum SIZE_MASK = ~BIG_MASK;

    @property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
    @property size_t length() const { return _sizeMask & SIZE_MASK; }

    this(size_t size)
    {
        if (size <= SMALL)
        {
            small[] = T.init;
            _sizeMask = size;
        }
        else
        {
            big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
            big.refcount = 1;
            _sizeMask = size | BIG_MASK;
        }
    }

    private @trusted @property inout(T)[] internalSlice() inout
    {
        return isBig ? big.ptr[0 .. length] : small[0 .. length];
    }

    this(this)
    {
        if (isBig)
        {
            big.refcount++;
        }
    }

    bool opEquals(SmallFixedArray a)
    {
        return internalSlice[] == a.internalSlice[];
    }

    size_t toHash() const
    {
        return hashOf(internalSlice[]);
    }

    ref inout(T) opIndex(size_t idx) inout
    {
        return internalSlice[idx];
    }

    // accesses big to test self-referencing so not @safe
    @trusted ref opAssign(SmallFixedArray arr)
    {
        if (isBig)
        {
            if (arr.isBig)
            {
                if (big is arr.big) return this; // self-assign
                else
                {
                    abandonRef();
                    _sizeMask = arr._sizeMask;
                    big = arr.big;
                    big.refcount++;
                }
            }
            else
            {
                abandonRef();
                _sizeMask = arr._sizeMask;
                small = arr.small;
            }
        }
        else
        {
            if (arr.isBig)
            {
                _sizeMask = arr._sizeMask;
                big = arr.big;
                big.refcount++;
            }
            else
            {
                _sizeMask = arr._sizeMask;
                small = arr.small;
            }
        }
        return this;
    }

    void mutate(scope void delegate(T[]) pure filler)
    {
        if (isBig && big.refcount != 1) // copy on write
        {
            auto oldSizeMask = _sizeMask;
            auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
            newbig.refcount = 1;
            abandonRef();
            big = newbig;
            _sizeMask = oldSizeMask;
        }
        filler(internalSlice);
    }

    ~this()
    {
        if (isBig)
        {
            abandonRef();
        }
    }

    @trusted private void abandonRef()
    {
        assert(isBig);
        if (--big.refcount == 0)
        {
            pureFree(big);
            _sizeMask = 0;
            assert(!isBig);
        }
    }
}

// Exercises SmallFixedArray with SMALL == 2: construction, assignment between
// inline and heap-backed instances, reference counts and copy-on-write.
@system unittest
{
    alias SA = SmallFixedArray!(int, 2);
    // builds an SA of data.length elements, fills it from `data` via mutate
    // and checks the resulting contents
    SA create(int[] data)
    {
        SA a = SA(data.length);
        a.mutate((slice) { slice[] = data[]; });
        assert(a.internalSlice == data);
        return a;
    }

    // assignment to and from inline-sized values
    {
        SA a;
        a = SA(1);
        assert(a.length == 1);
        a = SA.init;
        assert(a.length == 0);
    }

    {
        SA a, b, c, d;
        // default-initialized instances are empty and compare equal
        assert(a.length == 0);
        assert(a.internalSlice == b.internalSlice);
        a = create([1]);
        assert(a.internalSlice == [1]);
        b = create([2, 3]);
        assert(b.internalSlice == [2, 3]);
        // more than 2 elements -> heap payload
        c = create([3, 4, 5]);
        d = create([5, 6, 7, 8]);
        assert(c.isBig);
        // inline <- big: a now shares c's payload
        a = c;
        assert(a.isBig);
        assert(a.big is c.big);
        assert(a.big.refcount == 2);
        assert(a.internalSlice == [3, 4, 5]);
        assert(c.internalSlice == [3, 4, 5]);
        // big <- inline: a gives up its reference to c's payload
        a = b;
        assert(!a.isBig);
        assert(a.internalSlice == [2, 3]);
        assert(c.big.refcount == 1);
        a = c;
        assert(c.big.refcount == 2);

        // mutate copies on write if ref-count is not 1
        a.mutate((slice){ slice[] = 1; });
        assert(a.internalSlice == [1, 1, 1]);
        assert(c.internalSlice == [3, 4, 5]);
        assert(a.isBig && c.isBig);
        assert(a.big.refcount == 1);
        assert(c.big.refcount == 1);

        // copy construction shares the payload
        auto e = d;
        assert(e.big.refcount == 2);
        auto f = d;
        // big <- different big: f leaves d's payload and joins a's
        f = a;
        assert(f.isBig);
        assert(f.internalSlice == [1, 1, 1]);
        assert(f.big.refcount == 2); // a & f
        assert(e.big.refcount == 2); // d & e
        a = c;
        assert(f.big.refcount == 1); // f
        assert(e.big.refcount == 2); // d & e
        // repeated self-assignment must leave the ref-count unchanged
        a = a;
        a = a;
        a = a;
        assert(a.big.refcount == 2); // a & c
    }
}