//Written in the D programming language /++ $(LUCKY Regular expressions) are a commonly used method of pattern matching on strings, with $(I regex) being a catchy word for a pattern in this domain specific language. Typical problems usually solved by regular expressions include validation of user input and ubiquitous find & replace in text processing utilities. Synopsis: --- import std.regex; import std.stdio; void main() { //print out all possible dd/mm/yy(yy) dates found in user input //g - global, find all matches auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b", "g"); foreach(line; stdin.byLine) { //match returns a range that can be iterated //to get all of subsequent matches foreach(c; match(line, r)) writeln(c.hit); } } ... //create static regex at compile-time, contains fast native code enum ctr = ctRegex!(`^.*/([^/]+)/?$`); //works just like normal regex: auto m2 = match("foo/bar", ctr); //first match found here if any assert(m2); // be sure to check if there is a match, before examining contents! assert(m2.captures[1] == "bar");//captures is a range of submatches, 0 - full match ... //result of match is directly testable with if/assert/while //e.g. test if a string consists of letters: assert(match("Letter", `^\p{L}+$`)); --- The general usage guideline is keeping regex complexity on the side of simplicity, as its capabilities reside in purely character-level manipulation, and as such are ill suited for tasks involving higher level invariants like matching an integer number $(U bounded) in [a,b] interval. Checks of this sort are better addressed by additional post-processing. The basic syntax shouldn't surprise experienced users of regular expressions. Thankfully, nowadays the web is bustling with resources to help newcomers, and a good $(WEB www.regular-expressions.info, reference with tutorial ) on regular expressions could be found. 
This library uses ECMAScript syntax flavor with the following extensions: $(UL $(LI Named subexpressions, with Python syntax. ) $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.) $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.) ) $(REG_START Pattern syntax ) $(I std.regex operates on codepoint level, 'character' in this table denotes single unicode codepoint.) $(REG_TABLE $(REG_TITLE Pattern element, Semantics ) $(REG_TITLE Atoms, Match single characters ) $(REG_ROW any character except [|*+?(), Matches the character itself. ) $(REG_ROW ., In single line mode matches any character. Otherwise it matches any character except '\n' and '\r'. ) $(REG_ROW [class], Matches single character that belongs to this character class. ) $(REG_ROW [^class], Matches single character that does $(U not) belong to this character class.) $(REG_ROW \cC, Matches the control character corresponding to letter C) $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. ) $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. ) $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. ) $(REG_ROW \f, Matches a formfeed character. ) $(REG_ROW \n, Matches a linefeed character. ) $(REG_ROW \r, Matches a carriage return character. ) $(REG_ROW \t, Matches a tab character. ) $(REG_ROW \v, Matches a vertical tab character. ) $(REG_ROW \d, Matches any unicode digit. ) $(REG_ROW \D, Matches any character but unicode digit. ) $(REG_ROW \w, Matches any word character (note: this includes numbers).) $(REG_ROW \W, Matches any non-word character.) $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.) $(REG_ROW \S, Matches any character but those recognized as $(I \s ). ) $(REG_ROW \\, Matches \ character. ) $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. 
) $(REG_ROW \p{PropertyName}, Matches character that belongs to unicode PropertyName set. Single letter abbreviations could be used without surrounding {,}. ) $(REG_ROW \P{PropertyName}, Matches character that does not belong to unicode PropertyName set. Single letter abbreviations could be used without surrounding {,}. ) $(REG_ROW \p{InBasicLatin}, Matches any character that is part of BasicLatin unicode $(U block).) $(REG_ROW \P{InBasicLatin}, Matches any character except ones in BasicLatin unicode $(U block).) $(REG_ROW \p{Cyrillic}, Matches any character that is part of Cyrillic $(U script).) $(REG_ROW \P{Cyrillic}, Matches any character except ones in Cyrillic $(U script).) $(REG_TITLE Quantifiers, Specify repetition of other elements) $(REG_ROW *, Matches previous character/subexpression 0 or more times. Greedy version - tries as many times as possible.) $(REG_ROW *?, Matches previous character/subexpression 0 or more times. Lazy version - stops as early as possible.) $(REG_ROW +, Matches previous character/subexpression 1 or more times. Greedy version - tries as many times as possible.) $(REG_ROW +?, Matches previous character/subexpression 1 or more times. Lazy version - stops as early as possible.) $(REG_ROW {n}, Matches previous character/subexpression exactly n times. ) $(REG_ROW {n,}, Matches previous character/subexpression n times or more. Greedy version - tries as many times as possible. ) $(REG_ROW {n,}?, Matches previous character/subexpression n times or more. Lazy version - stops as early as possible.) $(REG_ROW {n,m}, Matches previous character/subexpression n to m times. Greedy version - tries as many times as possible. ) $(REG_ROW {n,m}?, Matches previous character/subexpression n to m times. Lazy version - stops as early as possible, but no less than n times.) $(REG_TITLE Other, Subexpressions & alternations ) $(REG_ROW (regex), Matches subexpression regex, saving matched portion of text for later retrieval. 
) $(REG_ROW (?:regex), Matches subexpression regex, $(U not) saving matched portion of text. Useful to speed up matching. ) $(REG_ROW A|B, Matches subexpression A, failing that matches B. ) $(REG_ROW (?P<name>regex), Matches named subexpression regex labeling it with name 'name'. When referring to matched portion of text, names work like aliases in addition to direct numbers. ) $(REG_TITLE Assertions, Match position rather than character ) $(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).) $(REG_ROW $, Matches at the end of input or line (in multiline mode). ) $(REG_ROW \b, Matches at word boundary. ) $(REG_ROW \B, Matches when $(U not) at word boundary. ) $(REG_ROW (?=regex), Zero-width lookahead assertion. Matches at a point where the subexpression regex could be matched starting from current position. ) $(REG_ROW (?!regex), Zero-width negative lookahead assertion. Matches at a point where the subexpression regex could $(U not ) be matched starting from current position. ) $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point where the subexpression regex could be matched ending at current position (matching goes backwards). ) $(REG_ROW (? $0 REG_START =

$0

+/ module std.regex; import std.internal.uni, std.internal.uni_tab;//unicode property tables import std.array, std.algorithm, std.range, std.conv, std.exception, std.traits, std.typetuple, std.uni, std.utf, std.format, std.typecons, std.bitmanip, std.functional, std.exception; import core.bitop, core.stdc.string, core.stdc.stdlib; import ascii = std.ascii; import std.string : representation; version(unittest) debug import std.stdio; private: @safe: //uncomment to get a barrage of debug info //debug = fred_parser; //debug = fred_matching; //debug = fred_charset; // IR bit pattern: 0b1_xxxxx_yy // where yy indicates class of instruction, xxxxx for actual operation code // 00: atom, a normal instruction // 01: open, opening of a group, has length of contained IR in the low bits // 10: close, closing of a group, has length of contained IR in the low bits // 11 unused // // Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version // Possible changes: //* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2}) //* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops // (like lookaround), or make it easier to identify hotspots. enum IR:uint { Char = 0b1_00000_00, //a character Any = 0b1_00001_00, //any character CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...] 
Trie = 0b1_00011_00, //CodepointSet implemented as Trie //match with any of a consecutive OrChar's in this sequence //(used for case insensitive match) //OrChar holds in upper two bits of data total number of OrChars in this _sequence_ //the drawback of this representation is that it is difficult // to detect a jump in the middle of it OrChar = 0b1_00100_00, Nop = 0b1_00101_00, //no operation (padding) End = 0b1_00110_00, //end of program Bol = 0b1_00111_00, //beginning of a string ^ Eol = 0b1_01000_00, //end of a string $ Wordboundary = 0b1_01001_00, //boundary of a word Notwordboundary = 0b1_01010_00, //not a word boundary Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index) GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit)) GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit)) Option = 0b1_01110_00, //start of an option within an alternation x | y (length) GotoEndOr = 0b1_01111_00, //end of an option (length of the rest) //... any additional atoms here OrStart = 0b1_00000_01, //start of alternation group (length) OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex) //with this instruction order //bit mask 0b1_00001_00 could be used to test/set greediness InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length) InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex) InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length) InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex) RepeatStart = 0b1_00011_01, //start of a {n,m} repetition (length) RepeatEnd = 0b1_00011_10, //end of x{n,m} repetition (length,step,minRep,maxRep) RepeatQStart = 0b1_00100_01, //start of a non eager x{n,m}? repetition (length) RepeatQEnd = 0b1_00100_10, //end of non eager x{n,m}? 
repetition (length,step,minRep,maxRep) // LookaheadStart = 0b1_00101_01, //begin of the lookahead group (length) LookaheadEnd = 0b1_00101_10, //end of a lookahead group (length) NeglookaheadStart = 0b1_00110_01, //start of a negative lookahead (length) NeglookaheadEnd = 0b1_00110_10, //end of a negative lookahead (length) LookbehindStart = 0b1_00111_01, //start of a lookbehind (length) LookbehindEnd = 0b1_00111_10, //end of a lookbehind (length) NeglookbehindStart= 0b1_01000_01, //start of a negative lookbehind (length) NeglookbehindEnd = 0b1_01000_10, //end of negative lookbehind (length) } //a shorthand for IR length - full length of specific opcode evaluated at compile time template IRL(IR code) { enum uint IRL = lengthOfIR(code); } static assert (IRL!(IR.LookaheadStart) == 3); //how many parameters follow the IR, should be optimized fixing some IR bits int immediateParamsIR(IR i){ switch (i){ case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd: return 1; case IR.RepeatEnd, IR.RepeatQEnd: return 4; case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: return 2; default: return 0; } } //full length of IR instruction inlcuding all parameters that might follow it int lengthOfIR(IR i) { return 1 + immediateParamsIR(i); } //full length of the paired IR instruction inlcuding all parameters that might follow it int lengthOfPairedIR(IR i) { return 1 + immediateParamsIR(pairedIR(i)); } //if the operation has a merge point (this relies on the order of the ops) bool hasMerge(IR i) { return (i&0b11)==0b10 && i<=IR.RepeatQEnd; } //is an IR that opens a "group" bool isStartIR(IR i) { return (i&0b11)==0b01; } //is an IR that ends a "group" bool isEndIR(IR i) { return (i&0b11)==0b10; } //is a standalone IR bool isAtomIR(IR i) { return (i&0b11)==0b00; } //makes respective pair out of IR i, swapping start/end bits of instruction IR pairedIR(IR i) { assert(isStartIR(i) || isEndIR(i)); return cast(IR)(i ^ 0b11); } //encoded IR instruction struct Bytecode 
{ uint raw; //natural constraints enum maxSequence = 2+4; enum maxData = 1<<22; enum maxRaw = 1<<31; this(IR code, uint data) { assert(data < (1<<22) && code < 256); raw = code<<24 | data; } this(IR code, uint data, uint seq) { assert(data < (1<<22) && code < 256 ); assert(seq >= 2 && seq < maxSequence); raw = code<<24 | ((seq-2)<<22) | data; } //store raw data static Bytecode fromRaw(uint data) { Bytecode t; t.raw = data; return t; } //bit twiddling helpers @property uint data() const { return raw & 0x003f_ffff; } //ditto @property uint sequence() const { return 2+((raw >>22) & 0x3); } //ditto @property IR code() const { return cast(IR)(raw>>24); } //ditto @property bool hotspot() const { return hasMerge(code); } //test the class of this instruction @property bool isAtom() const { return isAtomIR(code); } //ditto @property bool isStart() const { return isStartIR(code); } //ditto @property bool isEnd() const { return isEndIR(code); } //number of arguments for this instruction @property int args() const { return immediateParamsIR(code); } //mark this GroupStart or GroupEnd as referenced in backreference void setBackrefence() { assert(code == IR.GroupStart || code == IR.GroupEnd); raw = raw | (1<<23); } //is referenced @property bool backreference() const { assert(code == IR.GroupStart || code == IR.GroupEnd); return cast(bool)(raw & (1<<23)); } //mark as local reference (for backrefs in lookarounds) void setLocalRef() { assert(code == IR.Backref); raw = raw | (1<<23); } //is a local ref @property bool localRef() const { assert(code == IR.Backref); return cast(bool)(raw & (1<<23)); } //human readable name of instruction @trusted @property string mnemonic() const {//@@@BUG@@@ to is @system return to!string(code); } //full length of instruction @property uint length() const { return lengthOfIR(code); } //full length of respective start/end of this instruction @property uint pairedLength() const { return lengthOfPairedIR(code); } //returns bytecode of paired instruction 
(assuming this one is start or end) @property Bytecode paired() const {//depends on bit and struct layout order assert(isStart || isEnd); return Bytecode.fromRaw(raw ^ (0b11<<24)); } //gets an index into IR block of the respective pair uint indexOfPair(uint pc) const { assert(isStart || isEnd); return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code); } } static assert(Bytecode.sizeof == 4); //debugging tool, prints out instruction along with opcodes @trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[]) { auto output = appender!string(); formattedWrite(output,"%s", irb[pc].mnemonic); switch(irb[pc].code) { case IR.Char: formattedWrite(output, " %s (0x%x)",cast(dchar)irb[pc].data, irb[pc].data); break; case IR.OrChar: formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar)irb[pc].data, irb[pc].data, irb[pc].sequence); break; case IR.RepeatStart, IR.InfiniteStart, IR.Option, IR.GotoEndOr, IR.OrStart: //forward-jump instructions uint len = irb[pc].data; formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart)); break; case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions uint len = irb[pc].data; formattedWrite(output, " pc=>%u min=%u max=%u step=%u" , pc-len, irb[pc+3].raw, irb[pc+4].raw, irb[pc+2].raw); break; case IR.InfiniteEnd, IR.InfiniteQEnd, IR.OrEnd: //ditto uint len = irb[pc].data; formattedWrite(output, " pc=>%u", pc-len); break; case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto uint len = irb[pc].data; formattedWrite(output, " pc=>%u", pc-len); break; case IR.GroupStart, IR.GroupEnd: uint n = irb[pc].data; string name; foreach(v;dict) if(v.group == n) { name = "'"~v.name~"'"; break; } formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? 
"referenced" : ""), name, n); break; case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: uint len = irb[pc].data; uint start = irb[pc+1].raw, end = irb[pc+2].raw; formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end); break; case IR.Backref: case IR.CodepointSet: case IR.Trie: uint n = irb[pc].data; formattedWrite(output, " %u", n); if(irb[pc].code == IR.Backref) formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global"); break; default://all data-free instructions } if(irb[pc].hotspot) formattedWrite(output, " Hotspot %u", irb[pc+1].raw); return output.data; } //another pretty printer, writes out the bytecode of a regex and where the pc is @trusted void prettyPrint(Sink,Char=const(char)) (Sink sink, const(Bytecode)[] irb, uint pc=uint.max, int indent=3, size_t index=0) if (isOutputRange!(Sink,Char)) {//formattedWrite is @system while(irb.length>0) { formattedWrite(sink,"%3d",index); if(pc==0 && irb[0].code!=IR.Char) { for (int i=0;i "); } else { if(isEndIR(irb[0].code)) { indent-=2; } if(indent>0) { string spaces=" "; put(sink,spaces[0..(indent%spaces.length)]); for (size_t i=indent/spaces.length;i>0;--i) put(sink,spaces); } } if(irb[0].code==IR.Char) { put(sink,`"`); int i=0; do{ put(sink,cast(char[])([cast(dchar)irb[i].data])); ++i; } while(i0;++ii) put(sink,"="); put(sink,"^"); } index+=i; irb=irb[i..$]; } else { put(sink,irb[0].mnemonic); put(sink,"("); formattedWrite(sink,"%d",irb[0].data); int nArgs= irb[0].args; for(int iarg=0;iarg number of submatch struct NamedGroup { string name; uint group; } //holds pair of start-end markers for a submatch struct Group(DataIndex) { DataIndex begin, end; @trusted string toString() const { auto a = appender!string(); formattedWrite(a, "%s..%s", begin, end); return a.data; } } //Regular expression engine/parser options: // global - search all nonoverlapping matches in input // casefold - case insensitive matching, do casefolding on match 
in unicode mode // freeform - ignore whitespace in pattern, to match space use [ ] or \s // multiline - switch ^, $ detect start and end of linesinstead of just start and end of input enum RegexOption: uint { global = 0x1, casefold = 0x2, freeform = 0x4, nonunicode = 0x8, multiline = 0x10, singleline = 0x20 }; alias TypeTuple!('g', 'i', 'x', 'U', 'm', 's') RegexOptionNames;//do not reorder this list static assert( RegexOption.max < 0x80); enum RegexInfo : uint { oneShot = 0x80 }; private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029'; //test if a given string starts with hex number of maxDigit that's a valid codepoint //returns it's value and skips these maxDigit chars on success, throws on failure dchar parseUniHex(Char)(ref Char[] str, uint maxDigit) { enforce(str.length >= maxDigit,"incomplete escape sequence"); uint val; for(int k=0;k= PHASHNKEYS || ucmp(name,unicodeProperties[key].name) != 0) enforce(0, "invalid property name"); s = cast(CodepointSet)unicodeProperties[key].set; } else { auto range = assumeSorted!((x,y){ return ucmp(x.name, y.name) < 0; })(unicodeProperties); //creating empty Codepointset is a workaround auto eq = range.lowerBound(UnicodeProperty(cast(string)name,CodepointSet.init)).length; enforce(eq!=range.length && ucmp(name,range[eq].name)==0,"invalid property name"); s = range[eq].set.dup; } } if(casefold) s = caseEnclose(s); if(negated) s.negate(); return cast(const CodepointSet)s; } //basic stack, just in case it gets used anywhere else then Parser @trusted struct Stack(T, bool CTFE=false) { static if(!CTFE) Appender!(T[]) stack;//compiles but bogus at CTFE else { struct Proxy { T[] data; void put(T val) { data ~= val; } void shrinkTo(size_t sz){ data = data[0..sz]; } } Proxy stack; } @property bool empty(){ return stack.data.empty; } void push(T item) { stack.put(item); } @property ref T top() { assert(!empty); return stack.data[$-1]; } @property size_t length() { return stack.data.length; } T pop() { assert(!empty); auto t = 
stack.data[$-1]; stack.shrinkTo(stack.data.length-1); return t; } } //safety limits enum maxGroupNumber = 2^^19; enum maxLookaroundDepth = 16; // *Bytecode.sizeof, i.e. 1Mb of bytecode alone enum maxCompiledLength = 2^^18; //amounts to up to 4 Mb of auxilary table for matching enum maxCumulativeRepetitionLength = 2^^20; template BasicElementOf(Range) { alias Unqual!(ElementEncodingType!Range) BasicElementOf; } struct Parser(R, bool CTFE=false) if (isForwardRange!R && is(ElementType!R : dchar)) { enum infinite = ~0u; dchar _current; bool empty; R pat, origin; //keep full pattern for pretty printing error messages Bytecode[] ir; //resulting bytecode uint re_flags = 0; //global flags e.g. multiline + internal ones Stack!(uint, CTFE) fixupStack; //stack of opened start instructions NamedGroup[] dict; //maps name -> user group number //current num of group, group nesting level and repetitions step Stack!(uint, CTFE) groupStack; uint nesting = 0; uint lookaroundNest = 0; uint counterDepth = 0; //current depth of nested counted repetitions const(CodepointSet)[] charsets; // const(Trie)[] tries; // uint[] backrefed; //bitarray for groups @trusted this(S)(R pattern, S flags) if(isSomeString!S) { pat = origin = pattern; if(!__ctfe) ir.reserve(pat.length); parseFlags(flags); _current = ' ';//a safe default for freeform parsing next(); if(__ctfe) parseRegex(); else { try { parseRegex(); } catch(Exception e) { error(e.msg);//also adds pattern location } } put(Bytecode(IR.End, 0)); } //mark referenced groups for latter processing void markBackref(uint n) { if(n/32 >= backrefed.length) backrefed.length = n/32 + 1; backrefed[n/32] |= 1<<(n & 31); } @property dchar current(){ return _current; } bool _next() { if(pat.empty) { empty = true; return false; } //for CTFEability size_t idx=0; _current = decode(pat, idx); pat = pat[idx..$]; return true; } void skipSpace() { while(isWhite(current) && _next()){ } } bool next() { if(re_flags & RegexOption.freeform) { bool r = _next(); 
skipSpace(); return r; } else return _next(); }

    //append one encoded instruction to the IR
    //throws if the compiled pattern would exceed maxCompiledLength
    void put(Bytecode code)
    {
        enforce(ir.length < maxCompiledLength
            , "maximum compiled pattern length is exceeded");
        if(__ctfe)
        {
            ir = ir ~ code;
        }
        else
            ir ~= code;
    }

    //append a raw 32-bit word (an immediate parameter of an instruction) to the IR
    //throws if the compiled pattern would exceed maxCompiledLength
    void putRaw(uint number)
    {
        enforce(ir.length < maxCompiledLength
            , "maximum compiled pattern length is exceeded");
        ir ~= Bytecode.fromRaw(number);
    }

    //parsing number with basic overflow check
    //consumes ASCII digits starting at the current symbol, returns their decimal value
    uint parseDecimal()
    {
        uint r=0;
        while(ascii.isDigit(current))
        {
            if(r >= (uint.max/10))
                error("Overflow in decimal number");
            r = 10*r + cast(uint)(current-'0');
            if(!next())
                break;
        }
        return r;
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    //advances to the following symbol, which must be an ASCII letter,
    //and returns its low 5 bits
    dchar parseControlCode()
    {
        enforce(next(), "Unfinished escape sequence");
        enforce(('a' <= current && current <= 'z') || ('A' <= current && current <= 'Z'),
            "Only letters are allowed after \\c");
        return current & 0x1f;
    }

    //parse the option string, OR-ing the RegexOption bit of every flag into re_flags
    //(flag letters are listed in RegexOptionNames, in RegexOption member order)
    //throws RegexException if a flag is repeated or is not a known flag;
    //during CTFE an unknown flag fails via assert(0) instead
    @trusted void parseFlags(S)(S flags)
    {//@@@BUG@@@ text is @system
        foreach(ch; flags)//flags are ASCII anyway
        {
            L_FlagSwitch:
            switch(ch)
            {
                //compile-time foreach: emits one case label per RegexOption member
                foreach(i, op; __traits(allMembers, RegexOption))
                {
                    case RegexOptionNames[i]:
                            if(re_flags & mixin("RegexOption."~op))
                                throw new RegexException(text("redundant flag specified: ",ch));
                            re_flags |= mixin("RegexOption."~op);
                            break L_FlagSwitch;
                }
                default:
                    //assert(0, msg) so the check really fires: asserting the message
                    //string alone is always true; and the exception must be thrown,
                    //not merely constructed
                    if(__ctfe)
                       assert(0, text("unknown regex flag '",ch,"'"));
                    else
                        throw new RegexException(text("unknown regex flag '",ch,"'"));
            }
        }
    }

    //parse and store IR for regex pattern
    @trusted void parseRegex()
    {
        fixupStack.push(0);
        groupStack.push(1);//0 - whole match
        auto maxCounterDepth = counterDepth;
        uint fix;//fixup pointer

        while(!empty)
        {
            debug(fred_parser)
                writeln("*LR*\nSource: ", pat, "\nStack: ",fixupStack.stack.data);
            switch(current)
            {
            case '(':
                next();
                nesting++;
                uint nglob;
                fixupStack.push(cast(uint)ir.length);
                if(current == '?')
                {
                    next();
                    switch(current)
                    {
                    case ':':
                        put(Bytecode(IR.Nop, 0));
                        next();
                        break;
                    case '=':
                        genLookaround(IR.LookaheadStart);
                        next();
                        break;
                    case '!':
                        genLookaround(IR.NeglookaheadStart);
next(); break; case 'P': next(); if(current != '<') error("Expected '<' in named group"); string name; while(next() && isAlpha(current)) { name ~= current; } if(current != '>') error("Expected '>' closing named group"); next(); nglob = groupStack.top++; enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded"); auto t = NamedGroup(name, nglob); if(__ctfe) { size_t ind; for(ind=0; ind = dict[ind].name) break; insertInPlaceAlt(dict, ind, t); } else { auto d = assumeSorted!"a.name < b.name"(dict); auto ind = d.lowerBound(t).length; insertInPlaceAlt(dict, ind, t); } put(Bytecode(IR.GroupStart, nglob)); break; case '<': next(); if(current == '=') genLookaround(IR.LookbehindStart); else if(current == '!') genLookaround(IR.NeglookbehindStart); else error("'!' or '=' expected after '<'"); next(); break; default: error(" ':', '=', '<', 'P' or '!' expected after '(?' "); } } else { nglob = groupStack.top++; enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded"); put(Bytecode(IR.GroupStart, nglob)); } break; case ')': enforce(nesting, "Unmatched ')'"); nesting--; next(); fix = fixupStack.pop(); switch(ir[fix].code) { case IR.GroupStart: put(Bytecode(IR.GroupEnd,ir[fix].data)); parseQuantifier(fix); break; case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: assert(lookaroundNest); fixLookaround(fix); lookaroundNest--; put(ir[fix].paired); break; case IR.Option: //| xxx ) //two fixups: last option + full OR finishAlternation(fix); fix = fixupStack.top; switch(ir[fix].code) { case IR.GroupStart: fixupStack.pop(); put(Bytecode(IR.GroupEnd,ir[fix].data)); parseQuantifier(fix); break; case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart: assert(lookaroundNest); lookaroundNest--; fix = fixupStack.pop(); fixLookaround(fix); put(ir[fix].paired); break; default://(?:xxx) fixupStack.pop(); parseQuantifier(fix); } break; default://(?:xxx) parseQuantifier(fix); } 
break; case '|': next(); fix = fixupStack.top; if(ir.length > fix && ir[fix].code == IR.Option) { ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix); put(Bytecode(IR.GotoEndOr, 0)); fixupStack.top = cast(uint)ir.length; //replace latest fixup for Option put(Bytecode(IR.Option, 0)); break; } //start a new option if(fixupStack.length == 1)//only root entry fix = -1; uint len = cast(uint)ir.length - fix; insertInPlaceAlt(ir, fix+1, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len)); assert(ir[fix+1].code == IR.OrStart); put(Bytecode(IR.GotoEndOr, 0)); fixupStack.push(fix+1); //fixup for StartOR fixupStack.push(cast(uint)ir.length); //for Option put(Bytecode(IR.Option, 0)); break; default://no groups or whatever uint start = cast(uint)ir.length; parseAtom(); parseQuantifier(start); } } if(fixupStack.length != 1) { fix = fixupStack.pop(); enforce(ir[fix].code == IR.Option, "no matching ')'"); finishAlternation(fix); enforce(fixupStack.length == 1, "no matching ')'"); } } //helper function, finalizes IR.Option, fix points to the first option of sequence void finishAlternation(uint fix) { enforce(ir[fix].code == IR.Option, "no matching ')'"); ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix - IRL!(IR.OrStart)); fix = fixupStack.pop(); enforce(ir[fix].code == IR.OrStart, "no matching ')'"); ir[fix] = Bytecode(IR.OrStart, cast(uint)ir.length - fix - IRL!(IR.OrStart)); put(Bytecode(IR.OrEnd, cast(uint)ir.length - fix - IRL!(IR.OrStart))); uint pc = fix + IRL!(IR.OrStart); while(ir[pc].code == IR.Option) { pc = pc + ir[pc].data; if(ir[pc].code != IR.GotoEndOr) break; ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd))); pc += IRL!(IR.GotoEndOr); } put(Bytecode.fromRaw(0)); } //parse and store IR for atom-quantifier pair @trusted void parseQuantifier(uint offset) {//moveAll is @system uint replace = ir[offset].code == IR.Nop; if(empty && !replace) return; uint min, max; switch(current) { case '*': min = 0; max = infinite; break; case 
'?': min = 0; max = 1; break; case '+': min = 1; max = infinite; break; case '{': enforce(next(), "Unexpected end of regex pattern"); enforce(ascii.isDigit(current), "First number required in repetition"); min = parseDecimal(); if(current == '}') max = min; else if(current == ',') { next(); if(ascii.isDigit(current)) max = parseDecimal(); else if(current == '}') max = infinite; else error("Unexpected symbol in regex pattern"); skipSpace(); if(current != '}') error("Unmatched '{' in regex pattern"); } else error("Unexpected symbol in regex pattern"); break; default: if(replace) { moveAllAlt(ir[offset+1..$],ir[offset..$-1]); ir.length -= 1; } return; } uint len = cast(uint)ir.length - offset - replace; bool greedy = true; //check only if we managed to get new symbol if(next() && current == '?') { greedy = false; next(); } if(max != infinite) { if(min != 1 || max != 1) { Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); if(replace) ir[offset] = op; else insertInPlaceAlt(ir, offset, op); put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); put(Bytecode.init); //hotspot putRaw(1); putRaw(min); putRaw(max); counterDepth = std.algorithm.max(counterDepth, nesting+1); } } else if(min) //&& max is infinite { if(min != 1) { Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len); if(replace) ir[offset] = op; else insertInPlaceAlt(ir, offset, op); offset += 1;//so it still points to the repeated block put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len)); put(Bytecode.init); //hotspot putRaw(1); putRaw(min); putRaw(min); counterDepth = std.algorithm.max(counterDepth, nesting+1); } else if(replace) { if(__ctfe)//CTFE workaround: no moveAll and length -= x; { ir = ir[0..offset] ~ ir[offset+1..$]; } else { moveAll(ir[offset+1 .. $],ir[offset .. $-1]); ir.length -= 1; } } put(Bytecode(greedy ? 
IR.InfiniteStart : IR.InfiniteQStart, len)); enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded"); ir ~= ir[offset .. offset+len]; //IR.InfinteX is always a hotspot put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); put(Bytecode.init); //merge index } else//vanila {0,inf} { Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len); if(replace) ir[offset] = op; else insertInPlaceAlt(ir, offset, op); //IR.InfinteX is always a hotspot put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len)); put(Bytecode.init); //merge index } } //parse and store IR for atom void parseAtom() { if(empty) return; switch(current) { case '*', '?', '+', '|', '{', '}': error("'*', '+', '?', '{', '}' not allowed in atom"); break; case '.': put(Bytecode(IR.Any, 0)); next(); break; case '[': parseCharset(); break; case '\\': enforce(_next(), "Unfinished escape sequence"); parseEscape(); break; case '^': put(Bytecode(IR.Bol, 0)); next(); break; case '$': put(Bytecode(IR.Eol, 0)); next(); break; default: if(re_flags & RegexOption.casefold) { dchar[5] data; auto range = getCommonCasing(current, data); assert(range.length <= 5); if(range.length == 1) put(Bytecode(IR.Char, range[0])); else foreach(v; range) put(Bytecode(IR.OrChar, v, cast(uint)range.length)); } else put(Bytecode(IR.Char, current)); next(); } } //generate code for start of lookaround: (?= (?! 
(?<= (?','`' ,'*','+','(',')','{','}', '~': last = current; state = State.Char; break; case 'p': set.add(parseUnicodePropertySpec(false)); state = State.Start; continue L_CharTermLoop; //next char already fetched case 'P': set.add(parseUnicodePropertySpec(true)); state = State.Start; continue L_CharTermLoop; //next char already fetched case 'x': last = parseUniHex(pat, 2); state = State.Char; break; case 'u': last = parseUniHex(pat, 4); state = State.Char; break; case 'U': last = parseUniHex(pat, 8); state = State.Char; break; case 'd': set.add(unicodeNd); state = State.Start; break; case 'D': set.add(unicodeNd.dup.negate()); state = State.Start; break; case 's': set.add(unicodeWhite_Space); state = State.Start; break; case 'S': set.add(unicodeWhite_Space.dup.negate()); state = State.Start; break; case 'w': set.add(wordCharacter); state = State.Start; break; case 'W': set.add(wordCharacter.dup.negate()); state = State.Start; break; default: enforce(false, "invalid escape sequence"); } break; case State.PotentialTwinSymbolOperatorAtStart: if(current == twinSymbol) { op = twinSymbolOperator(twinSymbol); next();//skip second twin break L_CharTermLoop; } else { set.add(twinSymbol); last = current; state = State.Char; } break; case State.PotentialTwinSymbolOperator: if(current == twinSymbol) { addWithFlags(set, last, re_flags); op = twinSymbolOperator(twinSymbol); next();//skip second twin break L_CharTermLoop; } else if(twinSymbol == '-') goto case State.Dash; else { addWithFlags(set, last, re_flags); set.add(twinSymbol); last = current; state = State.Char; } break; case State.Dash: switch(current) { case '[': op = Operator.Union; goto case; case ']': //means dash is a single char not an interval specifier addWithFlags(set, last, re_flags); set.add('-'); break L_CharTermLoop; case '\\': state = State.DashEscape; break; default: enforce(last <= current, "inverted range"); if(re_flags & RegexOption.casefold) { for(uint ch = last; ch <= current; ch++) addWithFlags(set, 
ch, re_flags);
                    }
                    else
                        set.add(Interval(last, current));
                    state = State.Start;
                }
                break;
            case State.DashEscape:  //xxxx-\yyyy
                //upper bound of a range given as an escape sequence
                uint end;
                switch(current)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                case '[',']','\\','^','$','.','|','?',',','-',';',':'
                ,'#','&','%','/','<','>','`'
                ,'*','+','(',')','{','}', '~':
                    end = current;
                    break;
                case 'c':
                    end = parseControlCode();
                    break;
                case 'x':
                    end = parseUniHex(pat, 2);
                    break;
                case 'u':
                    end = parseUniHex(pat, 4);
                    break;
                case 'U':
                    end = parseUniHex(pat, 8);
                    break;
                default:
                    error("invalid escape sequence");
                }
                enforce(last <= end,"inverted range");
                //honour casefold exactly like the unescaped range path
                //(State.Dash): add every code point through addWithFlags
                //instead of inserting the raw interval
                if(re_flags & RegexOption.casefold)
                {
                    for(uint ch = last; ch <= end; ch++)
                        addWithFlags(set, ch, re_flags);
                }
                else
                    set.add(Interval(last,end));
                state = State.Start;
                break;
            }
            enforce(next(), "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    //stack of partially built sets (operands)
    alias Stack!(CodepointSet, CTFE) ValStack;
    //stack of pending set operators
    alias Stack!(Operator, CTFE) OpStack;

    /*
        Parse a bracketed character set starting at '[' and store its IR.

        Works with two stacks: parseCharTerm() yields (set, operator) pairs;
        sets are pushed on the value stack, operators on the operator stack.
        A closing ']' unrolls operators down to the matching Operator.Open.
        Once the outermost ']' is consumed the single remaining set is
        handed to charsetToIr().
    */
    void parseCharset()
    {
        ValStack vstack;
        OpStack opstack;
        //
        //apply one operator to the top of the value stack;
        //returns false for operators it does not handle (e.g. Open)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch(op)
            {
            case Operator.Negate:
                stack.top.negate();
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top.symmetricSub(s);
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        //pop and apply operators while cond holds for the top operator;
        //returns false if an operator could not be applied or the
        //operator stack ran dry before cond became false
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while(cond(opstack.top))
            {
                debug(fred_charset)
                    writeln(opstack.stack.data);
                if(!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if(opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch(current)
            {
            case '[':
                opstack.push(Operator.Open);
                enforce(next(), "unexpected end of CodepointSet");
                if(current == '^')
                {
                    opstack.push(Operator.Negate);
                    enforce(next(), "unexpected end of CodepointSet");
                }
                //[] is prohibited
                enforce(current != ']', "wrong CodepointSet");
                goto default;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack)
                        , "CodepointSet syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                next();
                //outermost bracket closed - the set is complete
                if(opstack.empty)
                    break L_CharsetLoop;
                auto pair = parseCharTerm();
                if(!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if(pair[1] != Operator.None)
                {
                    if(opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if(pair[1] != Operator.None)
                {
                    if(opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while(!empty || !opstack.empty);
        while(!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        charsetToIr(vstack.top);
    }

    /*
        Emit IR for a finished CodepointSet, choosing the encoding by size:
          - exactly one code point: a plain IR.Char;
          - fewer than Bytecode.maxSequence code points: a run of IR.OrChar;
          - otherwise the set is appended to charsets and referenced by
            index, as IR.Trie (with a generated trie) when it has more than
            maxCharsetUsed intervals, or as IR.CodepointSet with a
            placeholder Trie.init so that tries and charsets stay parallel.
        An empty set is reported through error().
    */
    @trusted void charsetToIr(in CodepointSet set)
    {//@@@BUG@@@ writeln is @system
        uint chars = set.chars();
        if(chars < Bytecode.maxSequence)
        {
            switch(chars)
            {
            case 1:
                put(Bytecode(IR.Char, set.ivals[0]));
                break;
            case 0:
                error("empty CodepointSet not allowed");
                break;
            default:
                foreach(ch; set[])
                    put(Bytecode(IR.OrChar, ch, chars));
            }
        }
        else
        {
            if(set.ivals.length > maxCharsetUsed)
            {
                auto t = getTrie(set);
                put(Bytecode(IR.Trie, cast(uint)tries.length));
                tries ~= t;
                debug(fred_allocation) writeln("Trie generated");
            }
            else
            {
                put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
                tries ~= Trie.init;
            }
            charsets ~= set;
            assert(charsets.length == tries.length);
        }
    }

    //parse and generate IR for escape stand alone escape sequence
    @trusted void parseEscape()
    {//accesses array of appender

        switch(current)
        {
        case 'f':   next(); put(Bytecode(IR.Char, '\f')); break;
        case 'n':   next(); put(Bytecode(IR.Char, '\n')); break;
        case 'r':   next(); put(Bytecode(IR.Char, '\r')); break;
        case 't':   next(); put(Bytecode(IR.Char, '\t')); break;
        case 'v':   next(); put(Bytecode(IR.Char, '\v')); break;

        case 'd':
            next();
            charsetToIr(unicodeNd);
            break;
        case 'D':
            next();
            charsetToIr(unicodeNd.dup.negate());
            break;
        case 'b':   next(); put(Bytecode(IR.Wordboundary, 0)); break;
        case 'B':   next(); put(Bytecode(IR.Notwordboundary, 0)); break;
        case 's':
            next();
            charsetToIr(unicodeWhite_Space);
            break;
        case 'S':
            next();
            charsetToIr(unicodeWhite_Space.dup.negate());
            break;
        case 'w':
            next();
            charsetToIr(wordCharacter);
            break;
        case 'W':
            next();
            charsetToIr(wordCharacter.dup.negate());
            break;
        case 'p': case 'P':
            auto CodepointSet = parseUnicodePropertySpec(current == 'P');
            charsetToIr(CodepointSet);
            break;
        case 'x':
            uint code = parseUniHex(pat, 2);
            next();
            put(Bytecode(IR.Char,code));
            break;
        case 'u': case 'U':
            uint code = parseUniHex(pat, current == 'u' ? 4 : 8);
            next();
            put(Bytecode(IR.Char, code));
            break;
        case 'c': //control codes
            Bytecode code = Bytecode(IR.Char, parseControlCode());
            next();
            put(code);
            break;
        case '0':
            next();
            put(Bytecode(IR.Char, 0));//NUL character
            break;
        case '1': .. case '9':
            uint nref = cast(uint)current - '0';
            //sum of the group counters currently on the group stack
            uint maxBackref;
            foreach(v; groupStack.stack.data)
                maxBackref += v;
            uint localLimit = maxBackref - groupStack.top;
            enforce(nref < maxBackref, "Backref to unseen group");
            //perl's disambiguation rule i.e.
//get next digit only if there is such group number while(nref < maxBackref && next() && ascii.isDigit(current)) { nref = nref * 10 + current - '0'; } if(nref >= maxBackref) nref /= 10; if(nref >= localLimit) { put(Bytecode(IR.Backref, nref-localLimit)); ir[$-1].setLocalRef(); } else put(Bytecode(IR.Backref, nref)); markBackref(nref); break; default: auto op = Bytecode(IR.Char, current); next(); put(op); } } //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, //\ - assumed to be processed, p - is current const(CodepointSet) parseUnicodePropertySpec(bool negated) { alias comparePropertyName ucmp; enum MAX_PROPERTY = 128; char[MAX_PROPERTY] result; uint k=0; enforce(next()); if(current == '{') { while(k user group number uint ngroup; //number of internal groups uint maxCounterDepth; //max depth of nested {n,m} repetitions uint hotspotTableSize; //number of entries in merge table uint threadCount; uint flags; //global regex flags const(Trie)[] tries; // uint[] backrefed; //bit array of backreferenced submatches Kickstart!Char kickstart; //bit access helper uint isBackref(uint n) { if(n/32 >= backrefed.length) return 0; return backrefed[n/32] & (1<<(n&31)); } //check if searching is not needed void checkIfOneShot() { if(flags & RegexOption.multiline) return; L_CheckLoop: for(uint i=0; i 0) { pc -= len; counter += step; } else { counter = counter%step; pc += IRL!(IR.RepeatEnd); } } else { counter = counter%step; pc += IRL!(IR.RepeatEnd); } break; case IR.InfiniteStart, IR.InfiniteQStart: pc += re.ir[pc].data + IRL!(IR.InfiniteStart); goto case IR.InfiniteEnd; //both Q and non-Q case IR.InfiniteEnd: case IR.InfiniteQEnd: uint len = re.ir[pc].data; if(app.data.length == dataLenOld) { pc += IRL!(IR.InfiniteEnd); break; } dataLenOld = app.data.length; if(app.data.length < limit && rand(3) > 0) pc = pc - len; else pc = pc + IRL!(IR.InfiniteEnd); break; case IR.GroupStart, IR.GroupEnd: pc += IRL!(IR.GroupStart); break; case IR.Bol, IR.Wordboundary, 
IR.Notwordboundary:
                case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                default:
                    return;
            }
        }
    }

    @property Char[] front()
    {
        return app.data;
    }

    @property empty(){  return false; }

    void popFront()
    {
        app.shrinkTo(0);
        compose();
    }
}

/++
    $(D StaticRegex) wraps a $(D Regex) object together with a specially
    generated native matching function that speeds up matching.
    It converts implicitly to a plain $(D Regex); note that after such a
    conversion the additional native-code capability is lost.
+/
public struct StaticRegex(Char)
{
private:
    alias BacktrackingMatcher!(true) Matcher;
    //signature of the generated matching routine
    alias bool function(ref Matcher!Char) MatchFn;
    MatchFn nativeFn;
public:
    Regex!Char _regex;
    //forward everything else to the wrapped Regex
    alias _regex this;
    //pair a compiled pattern with its generated matching function
    this(Regex!Char re, MatchFn fn)
    {
        nativeFn = fn;
        _regex = re;
    }

}

//helper for ShiftOr: number of bytes to examine per code unit of type Char
//(1 for char, 2 for wchar, 3 for dchar); other types are rejected at compile time
uint effectiveSize(Char)()
{
    static if(is(Char == dchar))
        enum uint bytesToTest = 3;
    else static if(is(Char == wchar))
        enum uint bytesToTest = 2;
    else static if(is(Char == char))
        enum uint bytesToTest = 1;
    else
        static assert(0);
    return bytesToTest;
}

/*
    Kickstart engine using ShiftOr algorithm,
    a bit parallel technique for inexact string searching.
*/ struct ShiftOr(Char) { private: uint[] table; uint fChar; uint n_length; enum charSize = effectiveSize!Char(); //maximum number of chars in CodepointSet to process enum uint charsetThreshold = 32_000; static struct ShiftThread { uint[] tab; uint mask; uint idx; uint pc, counter, hops; this(uint newPc, uint newCounter, uint[] table) { pc = newPc; counter = newCounter; mask = 1; idx = 0; hops = 0; tab = table; } void setMask(uint idx, uint mask) { tab[idx] |= mask; } void setInvMask(uint idx, uint mask) { tab[idx] &= ~mask; } void set(alias setBits=setInvMask)(dchar ch) { static if(charSize == 3) { uint val = ch, tmask = mask; setBits(val&0xFF, tmask); tmask <<= 1; val >>= 8; setBits(val&0xFF, tmask); tmask <<= 1; val >>= 8; assert(val <= 0x10); setBits(val, tmask); tmask <<= 1; } else { Char[dchar.sizeof/Char.sizeof] buf; uint tmask = mask; size_t total = encode(buf, ch); for(size_t i=0; i>8, tmask); } } } } void add(dchar ch){ return set!setInvMask(ch); } void advance(uint s) { mask <<= s; idx += s; } @property bool full(){ return !mask; } } static ShiftThread fork(ShiftThread t, uint newPc, uint newCounter) { ShiftThread nt = t; nt.pc = newPc; nt.counter = newCounter; return nt; } @trusted static ShiftThread fetch(ref ShiftThread[] worklist) { auto t = worklist[$-1]; worklist.length -= 1; if(!__ctfe) worklist.assumeSafeAppend(); return t; } static uint charLen(uint ch) { assert(ch <= 0x10FFFF); return codeLength!Char(cast(dchar)ch)*charSize; } public: @trusted this(const ref Regex!Char re, uint[] memory) { assert(memory.length == 256); fChar = uint.max; L_FindChar: for(size_t i = 0;;) { switch(re.ir[i].code) { case IR.Char: fChar = re.ir[i].data; static if(charSize != 3) { Char buf[dchar.sizeof/Char.sizeof]; encode(buf, fChar); fChar = buf[0]; } fChar = fChar & 0xFF; break L_FindChar; case IR.GroupStart, IR.GroupEnd: i += IRL!(IR.GroupStart); break; case IR.Bol, IR.Wordboundary, IR.Notwordboundary: i += IRL!(IR.Bol); break; default: break L_FindChar; } } table 
= memory; table[] = uint.max; ShiftThread[] trs; ShiftThread t = ShiftThread(0, 0, table); //locate first fixed char if any n_length = 32; for(;;) { L_Eval_Thread: for(;;) { switch(re.ir[t.pc].code) { case IR.Char: uint s = charLen(re.ir[t.pc].data); if(t.idx+s > n_length) goto L_StopThread; t.add(re.ir[t.pc].data); t.advance(s); t.pc += IRL!(IR.Char); break; case IR.OrChar://assumes IRL!(OrChar) == 1 uint len = re.ir[t.pc].sequence; uint end = t.pc + len; uint[Bytecode.maxSequence] s; uint numS; for(uint i = 0; i start || (end == start && (end & 1))) s[numS++] = (i+1)*charSize; } } if(numS == 0 || t.idx + s[numS-1] > n_length) goto L_StopThread; auto chars = set.chars; if(chars > charsetThreshold) goto L_StopThread; foreach(ch; set[]) { //avoid surrogate pairs if(0xD800 <= ch && ch <= 0xDFFF) continue; t.add(ch); } for(uint i=0; i= 0x80); debug (fred_search) writeln("ShiftOr stumbled on ",re.ir[t.pc].mnemonic); n_length = min(t.idx, n_length); break L_Eval_Thread; } } if(trs.empty) break; t = fetch(trs); } debug(fred_search) { writeln("Min length: ", n_length); } } @property bool empty() const { return n_length == 0; } @property uint length() const{ return n_length/charSize; } // lookup compatible bit pattern in haystack, return starting index // has a useful trait: if supplied with valid UTF indexes, // returns only valid UTF indexes // (that given the haystack in question is valid UTF string) @trusted size_t search(const(Char)[] haystack, size_t idx) { assert(!empty); auto p = cast(const(ubyte)*)(haystack.ptr+idx); uint state = uint.max; uint limit = 1u<<(n_length - 1u); debug(fred_search) writefln("Limit: %32b",limit); if(fChar != uint.max) { const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); const orginalAlign = cast(size_t)p & (Char.sizeof-1); while(p != end) { if(!~state) { for(;;) { p = cast(ubyte*)memchr(p, fChar, end - p); if(!p) return haystack.length; if((cast(size_t)p & (Char.sizeof-1)) == orginalAlign) break; if(++p == end) return 
haystack.length; } state = ~1u; assert((cast(size_t)p & (Char.sizeof-1)) == orginalAlign); static if(charSize == 3) { state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[2]]; p += 3; } } //first char is already tested, see if that's all if(!(state & limit))//division rounds down for dchar return (p-cast(ubyte*)haystack.ptr)/Char.sizeof -length+1; static if(charSize == 3) { state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[2]]; state = (state<<1) | table[p[3]]; p+=4; } else { state = (state<<1) | table[p[1]]; p++; } debug(fred_search) writefln("State: %32b", state); } } else { //in this path we have to shift first static if(charSize == 3) { const(ubyte)* end = cast(ubyte*)(haystack.ptr + haystack.length); while(p != end) { state = (state<<1) | table[p[0]]; state = (state<<1) | table[p[1]]; state = (state<<1) | table[p[2]]; p += 4; if(!(state & limit))//division rounds down for dchar return (p-cast(ubyte*)haystack.ptr)/Char.sizeof -length; } } else { auto len = cast(ubyte*)(haystack.ptr + haystack.length) - p; size_t i = 0; if(len & 1) { state = (state<<1) | table[p[i++]]; if(!(state & limit)) return idx+i/Char.sizeof-length; } while(i