mirror of
https://github.com/dlang/phobos.git
synced 2025-04-27 13:40:20 +03:00
1661 lines
53 KiB
D
1661 lines
53 KiB
D
//Written in the D programming language
|
|
/*
|
|
Regular expression pattern parser.
|
|
*/
|
|
module std.regex.internal.parser;
|
|
|
|
import std.regex.internal.ir;
|
|
import std.algorithm, std.range, std.uni, std.meta,
|
|
std.traits, std.typecons, std.exception;
|
|
static import std.ascii;
|
|
|
|
// Builds a Regex out of a finished parser: the bytecode and its companion
// tables are copied from the parser's generator (p.g), the flags from
// p.re_flags, and the result is run through postprocess().
auto makeRegex(S, CG)(Parser!(S, CG) p)
{
    Regex!(BasicElementOf!S) re;
    auto gen = p.g;

    re.ir = gen.ir;
    re.dict = gen.dict;
    re.ngroup = gen.ngroup;
    re.maxCounterDepth = gen.counterDepth;
    re.flags = p.re_flags;
    re.charsets = gen.charsets;
    re.matchers = gen.matchers;
    re.backrefed = gen.backrefed;
    re.postprocess();

    // dump the program when parser debugging is on (never during CTFE)
    debug(std_regex_parser)
    {
        __ctfe || re.print();
    }
    //@@@BUG@@@ (not reduced)
    //somehow just using validate _collides_ with std.utf.validate (!)
    version(assert) re.validateRe();

    return re;
}
|
|
|
|
// Unittest convenience: parses the string `arg` with the default CodeGen
// generator and no flags, and returns the resulting Regex.
auto makeRegex(S)(S arg)
    if (isSomeString!S)
{
    auto parser = Parser!(S, CodeGen)(arg, "");
    return makeRegex(parser);
}
|
|
|
|
// namedCaptures: range interface and contents for several patterns
unittest
{
    // two named groups
    auto rx = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
    auto names = rx.namedCaptures;
    static assert(isRandomAccessRange!(typeof(names)));
    assert(!names.empty);
    assert(names.length == 2);
    assert(names.equal(["name", "var"]));
    assert(names[0] == "name");
    assert(names[1..$].equal(["var"]));

    // a single named group between two unnamed ones
    rx = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
    names = rx.namedCaptures;
    assert(names.length == 1);
    assert(names[0] == "named");
    assert(names.front == "named");
    assert(names.back == "named");

    // no named groups at all
    rx = makeRegex(`(\w+) (\w+)`);
    names = rx.namedCaptures;
    assert(names.empty);

    // save / popFront / popBack against an untouched copy
    rx = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
    names = rx.namedCaptures;
    auto untouched = names.save;
    assert(names.equal(untouched));
    names.popFront();
    assert(names.equal(untouched[1..$]));
    names.popBack();
    assert(names.equal(untouched[1 .. $ - 1]));
}
|
|
|
|
|
|
// Rewrites `code` in place with its instructions laid out in reverse order.
// The result is assembled in a scratch buffer that is filled from the back
// and copied over `code` at the end. Alternations are handled branch by
// branch: the Option/GotoEndOr skeleton is copied first and each branch
// body is queued on a work stack to be reversed on its own. Embedded
// lookbehind blocks are copied verbatim.
@trusted void reverseBytecode()(Bytecode[] code)
{
    Bytecode[] reversed = new Bytecode[code.length];
    // pending alternation branches: (first pc, end pc, write position)
    Stack!(Tuple!(uint, uint, uint)) pending;
    uint writePos = cast(uint)reversed.length;
    uint from = 0;
    uint till = cast(uint)code.length;
    for (;;)
    {
        for (uint pc = from; pc < till; )
        {
            Bytecode insn = code[pc];
            uint len = insn.length;
            if (insn.code == IR.GotoEndOr)
                break; //pick next alternation branch
            if (insn.isAtom)
            {
                // plain instruction: place it right before what is already written
                reversed[writePos - len .. writePos] = code[pc .. pc + len];
                writePos -= len;
                pc += len;
            }
            else if (insn.isStart || insn.isEnd)
            {
                //skip over other embedded lookbehinds they are reversed
                if (insn.code == IR.LookbehindStart
                    || insn.code == IR.NeglookbehindStart)
                {
                    uint blockLen = len + insn.data + insn.pairedLength;
                    reversed[writePos - blockLen .. writePos] = code[pc .. pc + blockLen];
                    pc += blockLen;
                    writePos -= blockLen;
                    continue;
                }
                // the paired instruction takes this one's place in the reversed layout
                uint pairPc = insn.indexOfPair(pc);
                uint pairLen = code[pairPc].length;
                reversed[writePos - pairLen .. writePos] = code[pairPc .. pairPc + pairLen];
                writePos -= pairLen;
                if (insn.code == IR.OrStart)
                {
                    //we pass len bytes forward, but pairLen in reverse
                    uint revStart = writePos - (pairPc + len - pairLen - pc);
                    uint dst = revStart;
                    uint src = pc + IRL!(IR.OrStart);
                    while (code[src].code == IR.Option)
                    {
                        // every Option except the first is preceded by a GotoEndOr
                        if (code[src - 1].code != IR.OrStart)
                        {
                            assert(code[src - 1].code == IR.GotoEndOr);
                            reversed[dst - 1] = code[src - 1];
                        }
                        reversed[dst] = code[src];
                        // queue the branch body for a later pass
                        auto branchStart = src + IRL!(IR.Option);
                        auto branchEnd = branchStart + code[src].data;
                        auto branchWritePos = dst + code[src].data + IRL!(IR.Option);
                        if (code[branchEnd].code != IR.OrEnd)
                        {
                            branchWritePos--;
                        }
                        pending.push(tuple(branchStart, branchEnd, branchWritePos));
                        dst += code[src].data + IRL!(IR.Option);
                        src += code[src].data + IRL!(IR.Option);
                    }
                    pc = src;
                    writePos = revStart;
                    assert(code[pc].code == IR.OrEnd);
                }
                else
                    pc += len;
            }
        }
        if (pending.empty)
            break;
        // continue with the most recently queued branch
        auto branch = pending.pop();
        from = branch[0];
        till = branch[1];
        writePos = branch[2];
    }
    code[] = reversed[];
}
|
|
|
|
// Reads exactly maxDigit hexadecimal digits from the front of str and
// returns the code point they spell.
// On success str is advanced past those digits. Throws (leaving str
// untouched) when str is shorter than maxDigit, when a non-hex character
// is met, or when the value is above 0x10FFFF.
dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
{
    //std.conv.parse is both @system and bogus
    enforce(str.length >= maxDigit,"incomplete escape sequence");
    uint codepoint;
    foreach (idx; 0 .. maxDigit)
    {
        //only ASCII is accepted, so indexing code units directly is OK
        auto unit = str[idx];
        uint digit;
        if (unit >= '0' && unit <= '9')
            digit = unit - '0';
        else if (unit >= 'a' && unit <= 'f')
            digit = unit - 'a' + 10;
        else if (unit >= 'A' && unit <= 'F')
            digit = unit - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        codepoint = codepoint * 16 + digit;
    }
    enforce(codepoint <= 0x10FFFF, "invalid codepoint");
    str = str[maxDigit..$];
    return codepoint;
}
|
|
|
|
@system unittest //BUG canFind is system
{
    // runs parseUniHex over the whole input and hands back the error text
    static string failureOf(string input)
    {
        return collectException(parseUniHex(input, input.length)).msg;
    }

    // a non-hex character anywhere is rejected
    foreach (bad; [ "000j", "000z", "FffG", "0Z"])
        assert(failureOf(bad).canFind("invalid escape sequence"));

    // well-formed input of various lengths and letter cases
    string[] hex = [ "01", "ff", "00af", "10FFFF" ];
    int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (pair; zip(hex, expected))
    {
        string digits = pair[0];
        assert(parseUniHex(digits, digits.length) == pair[1]);
    }

    // valid hex, but above the last code point
    assert(failureOf("0011FFFF").canFind("invalid codepoint"));
}
|
|
|
|
// Returns `set` extended with the simple case foldings of each of its
// members that belongs to unicode.LC.
auto caseEnclose(CodepointSet set)
{
    // snapshot the cased members first: `set` grows while we iterate
    auto casedPart = set & unicode.LC;
    foreach (dchar member; casedPart.byCodepoint)
    {
        auto variants = simpleCaseFoldings(member);
        foreach (variant; variants)
            set |= variant;
    }
    return set;
}
|
|
|
|
/+
    Looks up the codepoint set registered under `name` via unicode(name),
    e.g. an InBlock or a binary property.
    casefold: widen the set with caseEnclose first.
    negated:  return the complement of the (possibly widened) set.
+/
@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold)
{
    CodepointSet found = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        found = caseEnclose(found);
    return negated ? found.inverted : found;
}
|
|
|
|
// Minimal array-backed LIFO stack; kept generic in case something other
// than the Parser needs one.
@trusted struct Stack(T)
{
    T[] data; // elements, top of the stack is the last one

    @property bool empty()
    {
        return data.length == 0;
    }

    @property size_t length()
    {
        return data.length;
    }

    void push(T val)
    {
        data ~= val;
    }

    // Removes the top element and returns it; the stack must not be empty.
    T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto removed = data[last];
        data = data[0 .. last];
        // let later pushes reuse the freed slot (not available during CTFE)
        if (!__ctfe)
            cast(void)data.assumeSafeAppend();
        return removed;
    }

    // Reference to the top element; the stack must not be empty.
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
|
|
|
|
// Bytecode generator driven by Parser: collects the IR together with the
// tables (named groups, char classes, backreference marks) that makeRegex
// later copies into a Regex.
struct CodeGen
{
    Bytecode[] ir;                 // resulting bytecode
    Stack!(uint) fixupStack;       // stack of opened start instructions
    NamedGroup[] dict;             // maps name -> user group number
    Stack!(uint) groupStack;       // stack of current number of group
    uint nesting = 0;              // group nesting level and repetitions step
    uint lookaroundNest = 0;       // nesting of lookaround
    uint counterDepth = 0;         // current depth of nested counted repetitions
    CodepointSet[] charsets;       // sets for char classes
    const(CharMatcher)[] matchers; // matchers for char classes
    uint[] backrefed;              // bitarray for groups refered by backref
    uint ngroup;                   // final number of groups (of all patterns)

    // Prepares for a pattern of `length` code units: reserves IR space and
    // seeds both stacks with their root entries.
    void start(uint length)
    {
        if (!__ctfe)
            ir.reserve((length*5+2)/4);
        fixupStack.push(0);
        groupStack.push(1);//0 - whole match
    }

    //mark referenced groups for latter processing
    void markBackref(uint n)
    {
        if (n/32 >= backrefed.length)
            backrefed.length = n/32 + 1;
        // unsigned literal: bit 31 must not go through a signed shift
        backrefed[n / 32] |= 1u << (n & 31);
    }

    // true if group number `n` has been opened but not closed yet
    bool isOpenGroup(uint n)
    {
        // walk the fixup stack and see if there are groups labeled 'n'
        // fixup '0' is reserved for alternations
        return fixupStack.data[1..$].
            canFind!(fix => ir[fix].code == IR.GroupStart && ir[fix].data == n)();
    }

    // Appends one instruction; throws once maxCompiledLength is reached.
    void put(Bytecode code)
    {
        enforce(ir.length < maxCompiledLength,
            "maximum compiled pattern length is exceeded");
        ir ~= code;
    }

    // Appends a raw 32-bit word (instruction payload), same limit as put.
    void putRaw(uint number)
    {
        put(Bytecode.fromRaw(number));
    }

    //try to generate optimal IR code for this CodepointSet
    @trusted void charsetToIr(CodepointSet set)
    {//@@@BUG@@@ writeln is @system
        uint chars = cast(uint)set.length;
        if (chars < Bytecode.maxSequence)
        {
            // small set: spell the members out as Char / OrChar
            switch (chars)
            {
                case 1:
                    put(Bytecode(IR.Char, set.byCodepoint.front));
                    break;
                case 0:
                    throw new RegexException("empty CodepointSet not allowed");
                default:
                    foreach (ch; set.byCodepoint)
                        put(Bytecode(IR.OrChar, ch, chars));
            }
        }
        else
        {
            import std.algorithm : countUntil;
            auto ivals = set.byInterval;
            // sets with many intervals go through a Trie matcher
            immutable viaTrie = ivals.length*2 > maxCharsetUsed;
            auto n = charsets.countUntil(set);
            if (n >= 0)
            {
                // identical set seen before: reuse its table slot
                put(Bytecode(viaTrie ? IR.Trie : IR.CodepointSet, cast(uint)n));
                return;
            }
            if (viaTrie)
            {
                auto t = getMatcher(set);
                put(Bytecode(IR.Trie, cast(uint)matchers.length));
                matchers ~= t;
                debug(std_regex_allocation)
                {
                    import std.stdio : writeln;
                    writeln("Trie generated");
                }
            }
            else
            {
                put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
                // keep matchers index-aligned with charsets
                matchers ~= CharMatcher.init;
            }
            charsets ~= set;
            assert(charsets.length == matchers.length);
        }
    }

    // opens a non-capturing group (?: ; the Nop is a placeholder that
    // fixRepetition later overwrites or removes
    void genLogicGroup()
    {
        nesting++;
        pushFixup(length);
        put(Bytecode(IR.Nop, 0));
    }

    // opens a numbered capturing group
    void genGroup()
    {
        nesting++;
        pushFixup(length);
        uint nglob = groupStack.top++;
        enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
        put(Bytecode(IR.GroupStart, nglob));
    }

    // opens a capturing group and records `name` for it in dict,
    // which is kept sorted by name
    void genNamedGroup(string name)
    {
        nesting++;
        pushFixup(length);
        uint nglob = groupStack.top++;
        enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
        auto t = NamedGroup(name, nglob);
        auto d = assumeSorted!"a.name < b.name"(dict);
        auto ind = d.lowerBound(t).length;
        insertInPlace(dict, ind, t);
        put(Bytecode(IR.GroupStart, nglob));
    }

    //generate code for start of lookaround: (?= (?! (?<= (?<!
    void genLookaround(IR opcode)
    {
        nesting++;
        pushFixup(length);
        put(Bytecode(opcode, 0));
        // two payload words, filled in by fixLookaround
        put(Bytecode.fromRaw(0));
        put(Bytecode.fromRaw(0));
        groupStack.push(0);
        lookaroundNest++;
        enforce(lookaroundNest <= maxLookaroundDepth,
            "maximum lookaround depth is exceeded");
    }

    // terminates the current pattern with End(num) and folds its group
    // count into ngroup
    void endPattern(uint num)
    {
        put(Bytecode(IR.End, num));
        ngroup = max(ngroup, groupStack.top);
        groupStack.top = 1; // reset group counter
    }

    //fixup lookaround with start at offset fix and append a proper *-End opcode
    void fixLookaround(uint fix)
    {
        lookaroundNest--;
        ir[fix] = Bytecode(ir[fix].code,
            cast(uint)ir.length - fix - IRL!(IR.LookaheadStart));
        auto g = groupStack.pop();
        assert(!groupStack.empty);
        ir[fix+1] = Bytecode.fromRaw(groupStack.top);
        //groups are cumulative across lookarounds
        ir[fix+2] = Bytecode.fromRaw(groupStack.top+g);
        groupStack.top += g;
        if (ir[fix].code == IR.LookbehindStart || ir[fix].code == IR.NeglookbehindStart)
        {
            reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
        }
        put(ir[fix].paired);
    }

    // repetition of {1,1}: nothing to emit, just drop a Nop placeholder
    void fixRepetition(uint offset)
    {
        bool replace = ir[offset].code == IR.Nop;
        if (replace)
        {
            copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
            ir.length -= 1;
        }
    }

    // repetition of {x,y}
    // offset: start of the repeated block; if a Nop placeholder sits there
    //         it is overwritten or removed instead of inserting before it
    // max:    `infinite` for an open-ended repetition
    void fixRepetition(uint offset, uint min, uint max, bool greedy)
    {
        bool replace = ir[offset].code == IR.Nop;
        uint len = cast(uint)ir.length - offset - replace;
        if (max != infinite)
        {
            if (min != 1 || max != 1)
            {
                Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
                if (replace)
                    ir[offset] = op;
                else
                    insertInPlace(ir, offset, op);
                put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
                put(Bytecode.init); //hotspot
                putRaw(1);
                putRaw(min);
                putRaw(max);
                counterDepth = std.algorithm.max(counterDepth, nesting+1);
            }
            else
            {
                // explicit {1} / {1,1}: treat exactly like no quantifier,
                // so a Nop placeholder does not linger in the IR
                fixRepetition(offset);
            }
        }
        else if (min) //&& max is infinite
        {
            if (min != 1)
            {
                Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
                if (replace)
                    ir[offset] = op;
                else
                    insertInPlace(ir, offset, op);
                offset += 1;//so it still points to the repeated block
                put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
                put(Bytecode.init); //hotspot
                putRaw(1);
                putRaw(min);
                putRaw(min);
                counterDepth = std.algorithm.max(counterDepth, nesting+1);
            }
            else if (replace)
            {
                copy(ir[offset+1 .. $], ir[offset .. $-1]);
                ir.length -= 1;
            }
            // the mandatory part is followed by a copy of the block
            // wrapped in an infinite loop
            put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
            enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
            ir ~= ir[offset .. offset+len];
            //IR.InfinteX is always a hotspot
            put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
            put(Bytecode.init); //merge index
        }
        else//vanila {0,inf}
        {
            Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
            if (replace)
                ir[offset] = op;
            else
                insertInPlace(ir, offset, op);
            //IR.InfinteX is always a hotspot
            put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
            put(Bytecode.init); //merge index
        }
    }

    // called on '|': closes the current option and opens the next one,
    // creating the OrStart/Option header on the first '|' of a sequence
    void fixAlternation()
    {
        uint fix = fixupStack.top;
        if (ir.length > fix && ir[fix].code == IR.Option)
        {
            // already inside an alternation: finish this option, start another
            ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix);
            put(Bytecode(IR.GotoEndOr, 0));
            fixupStack.top = cast(uint)ir.length; //replace latest fixup for Option
            put(Bytecode(IR.Option, 0));
            return;
        }
        uint len, orStart;
        //start a new option
        if (fixupStack.length == 1)
        {//only root entry, effectively no fixup
            len = cast(uint)ir.length + IRL!(IR.GotoEndOr);
            orStart = 0;
        }
        else
        {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
            len = cast(uint)ir.length - fix - (ir[fix].length - 1);
            orStart = fix + ir[fix].length;
        }
        insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
        assert(ir[orStart].code == IR.OrStart);
        put(Bytecode(IR.GotoEndOr, 0));
        fixupStack.push(orStart); //fixup for StartOR
        fixupStack.push(cast(uint)ir.length); //for second Option
        put(Bytecode(IR.Option, 0));
    }

    // finalizes IR.Option, fix points to the first option of sequence
    void finishAlternation(uint fix)
    {
        enforce(ir[fix].code == IR.Option, "no matching ')'");
        ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix - IRL!(IR.OrStart));
        fix = fixupStack.pop();
        enforce(ir[fix].code == IR.OrStart, "no matching ')'");
        ir[fix] = Bytecode(IR.OrStart, cast(uint)ir.length - fix - IRL!(IR.OrStart));
        put(Bytecode(IR.OrEnd, cast(uint)ir.length - fix - IRL!(IR.OrStart)));
        // point every GotoEndOr of the sequence at the OrEnd just emitted
        uint pc = fix + IRL!(IR.OrStart);
        while (ir[pc].code == IR.Option)
        {
            pc = pc + ir[pc].data;
            if (ir[pc].code != IR.GotoEndOr)
                break;
            ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
            pc += IRL!(IR.GotoEndOr);
        }
        put(Bytecode.fromRaw(0));
    }

    // called on ')'
    // returns: (flag - repetition possible?, fixup of the start of this "group")
    Tuple!(bool, uint) onClose()
    {
        nesting--;
        uint fix = popFixup();
        switch (ir[fix].code)
        {
        case IR.GroupStart:
            put(Bytecode(IR.GroupEnd, ir[fix].data));
            return tuple(true, fix);
        case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
            assert(lookaroundNest);
            fixLookaround(fix);
            return tuple(false, 0u);
        case IR.Option: //| xxx )
            //two fixups: last option + full OR
            finishAlternation(fix);
            fix = topFixup;
            switch (ir[fix].code)
            {
            case IR.GroupStart:
                popFixup();
                put(Bytecode(IR.GroupEnd, ir[fix].data));
                return tuple(true, fix);
            case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                assert(lookaroundNest);
                fix = popFixup();
                fixLookaround(fix);
                return tuple(false, 0u);
            default://(?:xxx)
                popFixup();
                return tuple(true, fix);
            }
        default://(?:xxx)
            return tuple(true, fix);
        }
    }

    uint popFixup(){ return fixupStack.pop(); }

    void pushFixup(uint val){ return fixupStack.push(val); }

    @property uint topFixup(){ return fixupStack.top; }

    @property size_t fixupLength(){ return fixupStack.data.length; }

    // number of IR words emitted so far
    @property uint length(){ return cast(uint)ir.length; }
}
|
|
|
|
// safety limits
enum maxGroupNumber = 1 << 19;
enum maxLookaroundDepth = 16;
// counted in Bytecode words, i.e. 1Mb of bytecode alone
enum maxCompiledLength = 1 << 18;
// amounts to up to 4 Mb of auxiliary table for matching
enum maxCumulativeRepetitionLength = 1 << 20;
// marker to indicate infinite repetition (all bits set)
enum infinite = uint.max;
|
|
|
|
struct Parser(R, Generator)
|
|
if (isForwardRange!R && is(ElementType!R : dchar))
|
|
{
|
|
dchar _current;
|
|
bool empty;
|
|
R pat, origin; //keep full pattern for pretty printing error messages
|
|
uint re_flags = 0; //global flags e.g. multiline + internal ones
|
|
Generator g;
|
|
|
|
// Parses `pattern` under the options spelled out in `flags`, emitting IR
// through the generator `g`. Any Exception raised while parsing is passed
// to error() by message.
@trusted this(S)(R pattern, S flags)
    if (isSomeString!S)
{
    origin = pattern;
    pat = origin;
    parseFlags(flags);
    _current = ' ';//a safe default for freeform parsing
    next();
    // hand the generator the length of what is left of the pattern
    g.start(cast(uint)pat.length);
    try
    {
        parseRegex();
    }
    catch(Exception e)
    {
        error(e.msg);//also adds pattern location
    }
    g.endPattern(1);
}
|
|
|
|
// the code point the parser is currently looking at
@property dchar current()
{
    return _current;
}
|
|
|
|
// Moves to the next code point of the pattern without any whitespace
// skipping. Returns false (and raises the `empty` flag) when the pattern
// is exhausted; `_current` is left unchanged in that case.
bool _next()
{
    if (!pat.empty)
    {
        _current = pat.front;
        pat.popFront();
        return true;
    }
    empty = true;
    return false;
}
|
|
|
|
// advances past whitespace; stops at the first non-white code point or
// at the end of the pattern
void skipSpace()
{
    while (isWhite(current))
    {
        if (!_next())
            return;
    }
}
|
|
|
|
// Advances one code point; in freeform mode any whitespace that follows
// is skipped as well. Returns whether the initial step succeeded.
bool next()
{
    if (!(re_flags & RegexOption.freeform))
        return _next();
    immutable advanced = _next();
    skipSpace();
    return advanced;
}
|
|
|
|
// Parses a run of ASCII digits starting at the current symbol.
// The overflow check is deliberately basic: it reports an error as soon
// as the accumulated value reaches uint.max/10.
uint parseDecimal()
{
    enum uint ceiling = uint.max/10;
    uint number = 0;
    while (std.ascii.isDigit(current))
    {
        if (number >= ceiling)
            error("Overflow in decimal number");
        number = number*10 + cast(uint)(current-'0');
        if (!next())
            break;
    }
    return number;
}
|
|
|
|
// Parses a control code of the form \cX, with 'c' being the current
// symbol. X must be an ASCII letter; the result is its low five bits.
dchar parseControlCode()
{
    enforce(next(), "Unfinished escape sequence");
    immutable isLetter = std.ascii.isAlpha(current);
    enforce(isLetter, "Only letters are allowed after \\c");
    return current & 0x1f;
}
|
|
|
|
// Translates the characters of `flags` into RegexOption bits of re_flags.
// Throws RegexException for a flag given twice and for an unknown flag.
@trusted void parseFlags(S)(S flags)
{//@@@BUG@@@ text is @system
    import std.conv;
    foreach (flagChar; flags)//flags are ASCII anyway
    {
    L_FlagSwitch:
        switch (flagChar)
        {
            // one case per RegexOption member, unrolled at compile time;
            // the label is the matching entry of RegexOptionNames
            foreach (i, op; __traits(allMembers, RegexOption))
            {
                case RegexOptionNames[i]:
                    if (re_flags & mixin("RegexOption."~op))
                    {
                        throw new RegexException(
                            text("redundant flag specified: ",flagChar));
                    }
                    re_flags |= mixin("RegexOption."~op);
                    break L_FlagSwitch;
            }
            default:
                throw new RegexException(text("unknown regex flag '",flagChar,"'"));
        }
    }
}
|
|
|
|
// Parses the whole pattern and stores its IR in the generator.
// Groups, ')' and '|' are dispatched here; everything else is an atom
// optionally followed by a quantifier.
@trusted void parseRegex()
{
    uint fix;//fixup pointer

    while (!empty)
    {
        debug(std_regex_parser)
        {
            // the fixup stack lives in the generator, and writeln needs
            // std.stdio which this module does not import
            import std.stdio : writeln;
            __ctfe || writeln("*LR*\nSource: ", pat, "\nStack: ", g.fixupStack.data);
        }
        switch (current)
        {
        case '(':
            next();
            if (current == '?')
            {
                next();
                switch (current)
                {
                case '#':
                    // (?#...) comment: skip up to and including ')'
                    for (;;)
                    {
                        if (!next())
                            error("Unexpected end of pattern");
                        if (current == ')')
                        {
                            next();
                            break;
                        }
                    }
                    break;
                case ':':
                    g.genLogicGroup();
                    next();
                    break;
                case '=':
                    g.genLookaround(IR.LookaheadStart);
                    next();
                    break;
                case '!':
                    g.genLookaround(IR.NeglookaheadStart);
                    next();
                    break;
                case 'P':
                    // (?P<name>...) named group
                    next();
                    if (current != '<')
                        error("Expected '<' in named group");
                    string name;
                    if (!next() || !(isAlpha(current) || current == '_'))
                        error("Expected alpha starting a named group");
                    name ~= current;
                    while (next() && (isAlpha(current) ||
                        current == '_' || std.ascii.isDigit(current)))
                    {
                        name ~= current;
                    }
                    if (current != '>')
                        error("Expected '>' closing named group");
                    next();
                    g.genNamedGroup(name);
                    break;
                case '<':
                    // (?<= or (?<! lookbehind
                    next();
                    if (current == '=')
                        g.genLookaround(IR.LookbehindStart);
                    else if (current == '!')
                        g.genLookaround(IR.NeglookbehindStart);
                    else
                        error("'!' or '=' expected after '<'");
                    next();
                    break;
                default:
                    error(" ':', '=', '<', 'P' or '!' expected after '(?' ");
                }
            }
            else
            {
                g.genGroup();
            }
            break;
        case ')':
            enforce(g.nesting, "Unmatched ')'");
            next();
            auto pair = g.onClose();
            // pair[0]: whether what was just closed may take a quantifier
            if (pair[0])
                parseQuantifier(pair[1]);
            break;
        case '|':
            next();
            g.fixAlternation();
            break;
        default://no groups or whatever
            uint start = g.length;
            parseAtom();
            parseQuantifier(start);
        }
    }

    // more than the root fixup left: a top-level alternation is still open
    if (g.fixupLength != 1)
    {
        fix = g.popFixup();
        g.finishAlternation(fix);
        enforce(g.fixupLength == 1, "no matching ')'");
    }
}
|
|
|
|
|
|
// Parses the quantifier (if any) at the current symbol and applies it to
// the IR that starts at `offset`: one of * ? + {n} {n,} {n,m}, each
// optionally followed by '?' to make it non-greedy.
@trusted void parseQuantifier(uint offset)
{//copy is @system
    if (empty)
    {
        g.fixRepetition(offset);
        return;
    }
    uint lo, hi;
    switch (current)
    {
    case '*':
        lo = 0;
        hi = infinite;
        break;
    case '?':
        lo = 0;
        hi = 1;
        break;
    case '+':
        lo = 1;
        hi = infinite;
        break;
    case '{':
        enforce(next(), "Unexpected end of regex pattern");
        enforce(std.ascii.isDigit(current), "First number required in repetition");
        lo = parseDecimal();
        if (current == ',')
        {
            // {n,m} or the open-ended {n,}
            next();
            if (std.ascii.isDigit(current))
                hi = parseDecimal();
            else if (current == '}')
                hi = infinite;
            else
                error("Unexpected symbol in regex pattern");
            skipSpace();
            if (current != '}')
                error("Unmatched '{' in regex pattern");
        }
        else if (current == '}')
            hi = lo;
        else
            error("Unexpected symbol in regex pattern");
        if (lo > hi)
            error("Illegal {n,m} quantifier");
        break;
    default:
        // no quantifier here
        g.fixRepetition(offset);
        return;
    }
    //look for the lazy marker only if we managed to get new symbol
    immutable lazyMarker = next() && current == '?';
    if (lazyMarker)
        next();
    g.fixRepetition(offset, lo, hi, !lazyMarker);
}
|
|
|
|
// Parses one atom at the current symbol and emits its IR: '.', a
// character class, an escape, '^', '$' or a literal character.
void parseAtom()
{
    if (empty)
        return;
    switch (current)
    {
    case '*', '?', '+', '|', '{', '}':
        error("'*', '+', '?', '{', '}' not allowed in atom");
        break;
    case '.':
        g.put(Bytecode(IR.Any, 0));
        next();
        break;
    case '[':
        parseCharset();
        break;
    case '\\':
        // raw step: the escaped symbol is taken as is, no whitespace skipping
        enforce(_next(), "Unfinished escape sequence");
        parseEscape();
        break;
    case '^':
        g.put(Bytecode(IR.Bol, 0));
        next();
        break;
    case '$':
        g.put(Bytecode(IR.Eol, 0));
        next();
        break;
    default:
        //FIXME: getCommonCasing in new std uni
        if (!(re_flags & RegexOption.casefold))
        {
            g.put(Bytecode(IR.Char, current));
        }
        else
        {
            // case-insensitive literal: one OrChar per case variant
            auto variants = simpleCaseFoldings(current);
            assert(variants.length <= 5);
            if (variants.length == 1)
                g.put(Bytecode(IR.Char, variants.front));
            else
                foreach (v; variants)
                    g.put(Bytecode(IR.OrChar, v, cast(uint)variants.length));
        }
        next();
    }
}
|
|
|
|
|
|
|
|
//CodepointSet operations relatively in order of priority
enum Operator:uint
{
    Open = 0,       // '[' marker on the operator stack
    Negate,         // [^...]
    Difference,     // --
    SymDifference,  // ~~
    Intersection,   // &&
    Union,          // ||
    None            // no operator follows the term
}
|
|
|
|
//parse unit of CodepointSet spec, most notably escape sequences and char ranges
//also fetches next set operation
// Returns the set collected for this term together with the operator that
// follows it (Operator.None if the term ended at ']'). A '[' ending the
// term is reported as an implicit Union.
// With RegexOption.casefold every literal and every range is added with
// its simple case foldings, whether or not it was written as an escape.
Tuple!(CodepointSet,Operator) parseCharTerm()
{
    enum State{ Start, Char, Escape, CharDash, CharDashEscape,
        PotentialTwinSymbolOperator }
    Operator op = Operator.None;
    dchar last;
    CodepointSet set;
    State state = State.Start;

    // adds ch, or all of its case variants when case folding is on
    static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags)
    {
        if (re_flags & RegexOption.casefold)
        {
            auto range = simpleCaseFoldings(ch);
            foreach (v; range)
                set |= v;
        }
        else
            set |= ch;
    }

    // adds the inclusive range [first, lastIncl], honouring case folding
    static void addRangeWithFlags(ref CodepointSet set, uint first,
        uint lastIncl, uint re_flags)
    {
        if (re_flags & RegexOption.casefold)
        {
            for (uint ch = first; ch <= lastIncl; ch++)
                addWithFlags(set, ch, re_flags);
        }
        else
            set.add(first, lastIncl + 1);
    }

    // maps the symbol of a doubled operator (||, --, ~~, &&) to its Operator
    static Operator twinSymbolOperator(dchar symbol)
    {
        switch (symbol)
        {
        case '|':
            return Operator.Union;
        case '-':
            return Operator.Difference;
        case '~':
            return Operator.SymDifference;
        case '&':
            return Operator.Intersection;
        default:
            assert(false);
        }
    }

    L_CharTermLoop:
    for (;;)
    {
        final switch (state)
        {
        case State.Start:
            switch (current)
            {
            case '|':
            case '-':
            case '~':
            case '&':
                state = State.PotentialTwinSymbolOperator;
                last = current;
                break;
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                break L_CharTermLoop;
            case '\\':
                state = State.Escape;
                break;
            default:
                state = State.Char;
                last = current;
            }
            break;
        case State.Char:
            // xxx last current xxx
            switch (current)
            {
            case '|':
            case '~':
            case '&':
                // then last is treated as normal char and added as implicit union
                state = State.PotentialTwinSymbolOperator;
                addWithFlags(set, last, re_flags);
                last = current;
                break;
            case '-': // still need more info
                state = State.CharDash;
                break;
            case '\\':
                // was a plain `set |= last`, which skipped case folding
                addWithFlags(set, last, re_flags);
                state = State.Escape;
                break;
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                addWithFlags(set, last, re_flags);
                break L_CharTermLoop;
            default:
                state = State.Char;
                addWithFlags(set, last, re_flags);
                last = current;
            }
            break;
        case State.PotentialTwinSymbolOperator:
            // xxx last current xxxx
            // where last = [|-&~]
            if (current == last)
            {
                op = twinSymbolOperator(last);
                next();//skip second twin char
                break L_CharTermLoop;
            }
            goto case State.Char;
        case State.Escape:
            // xxx \ current xxx
            switch (current)
            {
            case 'f':
                last = '\f';
                state = State.Char;
                break;
            case 'n':
                last = '\n';
                state = State.Char;
                break;
            case 'r':
                last = '\r';
                state = State.Char;
                break;
            case 't':
                last = '\t';
                state = State.Char;
                break;
            case 'v':
                last = '\v';
                state = State.Char;
                break;
            case 'c':
                last = parseControlCode();
                state = State.Char;
                break;
            // every escapable symbol stands for itself
            foreach (val; Escapables)
            {
            case val:
            }
                last = current;
                state = State.Char;
                break;
            case 'p':
                set.add(parseUnicodePropertySpec(false));
                state = State.Start;
                continue L_CharTermLoop; //next char already fetched
            case 'P':
                set.add(parseUnicodePropertySpec(true));
                state = State.Start;
                continue L_CharTermLoop; //next char already fetched
            case 'x':
                last = parseUniHex(pat, 2);
                state = State.Char;
                break;
            case 'u':
                last = parseUniHex(pat, 4);
                state = State.Char;
                break;
            case 'U':
                last = parseUniHex(pat, 8);
                state = State.Char;
                break;
            case 'd':
                set.add(unicode.Nd);
                state = State.Start;
                break;
            case 'D':
                set.add(unicode.Nd.inverted);
                state = State.Start;
                break;
            case 's':
                set.add(unicode.White_Space);
                state = State.Start;
                break;
            case 'S':
                set.add(unicode.White_Space.inverted);
                state = State.Start;
                break;
            case 'w':
                set.add(wordCharacter);
                state = State.Start;
                break;
            case 'W':
                set.add(wordCharacter.inverted);
                state = State.Start;
                break;
            default:
                enforce(false, "invalid escape sequence");
            }
            break;
        case State.CharDash:
            // xxx last - current xxx
            switch (current)
            {
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                //means dash is a single char not an interval specifier
                addWithFlags(set, last, re_flags);
                addWithFlags(set, '-', re_flags);
                break L_CharTermLoop;
            case '-'://set Difference again
                addWithFlags(set, last, re_flags);
                op = Operator.Difference;
                next();//skip '-'
                break L_CharTermLoop;
            case '\\':
                state = State.CharDashEscape;
                break;
            default:
                enforce(last <= current, "inverted range");
                addRangeWithFlags(set, last, current, re_flags);
                state = State.Start;
            }
            break;
        case State.CharDashEscape:
            //xxx last - \ current xxx
            uint end;
            switch (current)
            {
            case 'f':
                end = '\f';
                break;
            case 'n':
                end = '\n';
                break;
            case 'r':
                end = '\r';
                break;
            case 't':
                end = '\t';
                break;
            case 'v':
                end = '\v';
                break;
            foreach (val; Escapables)
            {
            case val:
            }
                end = current;
                break;
            case 'c':
                end = parseControlCode();
                break;
            case 'x':
                end = parseUniHex(pat, 2);
                break;
            case 'u':
                end = parseUniHex(pat, 4);
                break;
            case 'U':
                end = parseUniHex(pat, 8);
                break;
            default:
                error("invalid escape sequence");
            }
            enforce(last <= end,"inverted range");
            // an escaped upper bound gets the same case folding as a literal one
            addRangeWithFlags(set, last, end, re_flags);
            state = State.Start;
            break;
        }
        enforce(next(), "unexpected end of CodepointSet");
    }
    return tuple(set, op);
}
|
|
|
|
// operand and operator stacks used while evaluating a character class
alias ValStack = Stack!CodepointSet;
alias OpStack = Stack!Operator;
|
|
|
|
//parse and store IR for CodepointSet
// Whitespace is significant inside [...], so freeform mode is switched
// off for the duration and the previous flags are put back after a
// successful parse.
void parseCharset()
{
    immutable savedFlags = re_flags;
    scope(success) re_flags = savedFlags;
    re_flags &= ~RegexOption.freeform; // stop ignoring whitespace if we did
    parseCharsetImpl();
}
|
|
|
|
// Parses a (possibly nested) character class starting at '[' and emits
// IR for the resulting set. Terms come from parseCharTerm; operators are
// evaluated with an operand stack and an operator stack.
void parseCharsetImpl()
{
    ValStack vstack;
    OpStack opstack;
    import std.functional : unaryFun;

    // Applies `op` to the operand stack.
    // Returns false for anything that is not an applicable operator.
    static bool apply(Operator op, ref ValStack stack)
    {
        switch (op)
        {
        case Operator.Negate:
            stack.top = stack.top.inverted;
            return true;
        case Operator.Union:
            auto rhs = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '||'");
            stack.top.add(rhs);
            return true;
        case Operator.Difference:
            auto rhs = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '--'");
            stack.top.sub(rhs);
            return true;
        case Operator.SymDifference:
            auto rhs = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '~~'");
            stack.top ~= rhs;
            return true;
        case Operator.Intersection:
            auto rhs = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '&&'");
            stack.top.intersect(rhs);
            return true;
        default:
            return false;
        }
    }

    // Pops and applies operators for as long as `cond` holds for the top one.
    // Returns false on a syntax error or if the operator stack runs dry.
    static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
    {
        while (cond(opstack.top))
        {
            if (!apply(opstack.pop(),vstack))
                return false;//syntax error
            if (opstack.empty)
                return false;
        }
        return true;
    }

    // Records the operator that followed a term; pending unions are
    // applied first.
    void pushOperator(Operator following)
    {
        if (following == Operator.None)
            return;
        if (opstack.top == Operator.Union)
            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
        opstack.push(following);
    }

    L_CharsetLoop:
    do
    {
        switch (current)
        {
        case '[':
            opstack.push(Operator.Open);
            enforce(next(), "unexpected end of character class");
            if (current == '^')
            {
                opstack.push(Operator.Negate);
                enforce(next(), "unexpected end of character class");
            }
            else if (current == ']') // []...] is special cased
            {
                enforce(next(), "wrong character set");
                auto term = parseCharTerm();
                term[0].add(']', ']'+1);
                pushOperator(term[1]);
                vstack.push(term[0]);
            }
            break;
        case ']':
            enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                "character class syntax error");
            enforce(!opstack.empty, "unmatched ']'");
            opstack.pop();
            next();
            // outermost class closed: done
            if (opstack.empty)
                break L_CharsetLoop;
            auto term = parseCharTerm();
            if (!term[0].empty)//not only operator e.g. -- or ~~
            {
                vstack.top.add(term[0]);//apply union
            }
            pushOperator(term[1]);
            break;
        default://yet another pair of term(op)?
            auto term = parseCharTerm();
            pushOperator(term[1]);
            vstack.push(term[0]);
        }
    }while (!empty || !opstack.empty);

    // apply whatever operators are still pending
    while (!opstack.empty)
        apply(opstack.pop(),vstack);
    assert(vstack.length == 1);
    g.charsetToIr(vstack.top);
}
|
|
|
|
//parse and generate IR for a stand-alone escape sequence
|
|
@trusted void parseEscape()
{//accesses array of appender

    // `current` is the character following the backslash; every branch
    // emits IR through `g` and steps past what it has consumed.
    switch (current)
    {
    // single-character control escapes
    case 'f':
        next();
        g.put(Bytecode(IR.Char, '\f'));
        break;
    case 'n':
        next();
        g.put(Bytecode(IR.Char, '\n'));
        break;
    case 'r':
        next();
        g.put(Bytecode(IR.Char, '\r'));
        break;
    case 't':
        next();
        g.put(Bytecode(IR.Char, '\t'));
        break;
    case 'v':
        next();
        g.put(Bytecode(IR.Char, '\v'));
        break;

    // predefined character classes and their complements
    case 'd':
        next();
        g.charsetToIr(unicode.Nd);
        break;
    case 'D':
        next();
        g.charsetToIr(unicode.Nd.inverted);
        break;
    case 's':
        next();
        g.charsetToIr(unicode.White_Space);
        break;
    case 'S':
        next();
        g.charsetToIr(unicode.White_Space.inverted);
        break;
    case 'w':
        next();
        g.charsetToIr(wordCharacter);
        break;
    case 'W':
        next();
        g.charsetToIr(wordCharacter.inverted);
        break;

    // word boundary assertions
    case 'b':
        next();
        g.put(Bytecode(IR.Wordboundary, 0));
        break;
    case 'B':
        next();
        g.put(Bytecode(IR.Notwordboundary, 0));
        break;

    // unicode property; the helper advances past the spec on its own
    case 'p': case 'P':
        auto propSet = parseUnicodePropertySpec(current == 'P');
        g.charsetToIr(propSet);
        break;

    // hexadecimal code point: \x takes 2 digits, \u 4 and \U 8
    case 'x': case 'u': case 'U':
        immutable digits = current == 'x' ? 2 : (current == 'u' ? 4 : 8);
        uint cp = parseUniHex(pat, digits);
        next();
        g.put(Bytecode(IR.Char, cp));
        break;

    case 'c': //control codes
        Bytecode ctrl = Bytecode(IR.Char, parseControlCode());
        next();
        g.put(ctrl);
        break;

    case '0':
        next();
        g.put(Bytecode(IR.Char, 0));//NUL character
        break;

    // back-reference to a capture group
    case '1': .. case '9':
        uint refNum = cast(uint)current - '0';
        uint groupTotal = sum(g.groupStack.data);
        enforce(refNum < groupTotal, "Backref to unseen group");
        //perl's disambiguation rule i.e.
        //get next digit only if there is such group number
        while (refNum < groupTotal && next() && std.ascii.isDigit(current))
        {
            refNum = refNum * 10 + current - '0';
        }
        // the last digit overshot the group count: drop it again
        if (refNum >= groupTotal)
            refNum /= 10;
        enforce(!g.isOpenGroup(refNum), "Backref to open group");
        uint firstLocal = groupTotal - g.groupStack.top;
        if (refNum >= firstLocal)
        {
            // stored relative to firstLocal and flagged as a local reference
            g.put(Bytecode(IR.Backref, refNum-firstLocal));
            g.ir[$-1].setLocalRef();
        }
        else
            g.put(Bytecode(IR.Backref, refNum));
        g.markBackref(refNum);
        break;

    default:
        // code points within [privateUseStart, privateUseEnd] are handed
        // to g.endPattern rather than emitted as a literal
        if (current >= privateUseStart && current <= privateUseEnd)
        {
            g.endPattern(current - privateUseStart + 1);
            break;
        }
        // anything else stands for itself
        auto literal = Bytecode(IR.Char, current);
        next();
        g.put(literal);
    }
}
|
|
|
|
//parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
|
|
//\ - assumed to be processed, p - is current
|
|
// Reads the property name that follows \p or \P and returns the matching set.
// Two forms are accepted: a braced name ("{...}") and a single character.
// `negated` is forwarded to getUnicodeSet together with the casefold flag.
// Throws (via enforce) on end of input, on a name that is too long or not
// ASCII, on a missing '}' and on a name getUnicodeSet yields an empty set for.
CodepointSet parseUnicodePropertySpec(bool negated)
{
    enum MAX_PROPERTY = 128;
    char[MAX_PROPERTY] result;
    uint k = 0;
    enforce(next(), "eof parsing unicode property spec");
    if (current == '{')
    {
        // collect the name, lower-cased and with '-', ' ' and '_' dropped
        while (k < MAX_PROPERTY && next() && current !='}' && current !=':')
        {
            if (current != '-' && current != ' ' && current != '_')
            {
                // the buffer stores chars: a non-ASCII code point would be
                // truncated by the cast and could alias a valid name
                enforce(current < 0x80, "invalid property name");
                result[k++] = cast(char)std.ascii.toLower(current);
            }
        }
        enforce(k != MAX_PROPERTY, "invalid property name");
        enforce(current == '}', "} expected ");
    }
    else
    {//single char properties e.g.: \pL, \pN ...
        enforce(current < 0x80, "invalid property name");
        result[k++] = cast(char)current;
    }
    auto s = getUnicodeSet(result[0..k], negated,
        cast(bool)(re_flags & RegexOption.casefold));
    enforce(!s.empty, "unrecognized unicode property spec");
    next(); // step past the closing '}' or the single-letter name
    return s;
}
|
|
|
|
//
|
|
// Throws a RegexException carrying `msg` plus the pattern, split at the
// point the parser has reached.
@trusted void error(string msg)
{
    import std.format : format;
    // the part of the original pattern that precedes the unparsed remainder
    auto consumed = origin[0..$-pat.length];
    throw new RegexException(
        format("%s\nPattern with error: `%s` <--HERE-- `%s`", msg, consumed, pat));
}
|
|
|
|
// NOTE(review): R is declared outside this chunk - presumably the pattern type parameter of the enclosing struct; confirm
alias Char = BasicElementOf!R;
|
|
|
|
// The regex object built by makeRegex from this parser's current state.
@property auto program()
{
    auto compiled = makeRegex(this);
    return compiled;
}
|
|
}
|
|
|
|
/+
|
|
Postprocess the IR, then optimize.
|
|
+/
|
|
@trusted void postprocess(Char)(ref Regex!Char zis)
{//@@@BUG@@@ write is @system
    with(zis)
    {
        // Minimal stack on top of a caller-provided array.
        // `_top` is the index of the top element; uint.max means "empty".
        struct FixedStack(T)
        {
            T[] arr;
            uint _top;
            @property bool empty(){ return _top == uint.max; }
            @property ref T top(){ assert(!empty); return arr[_top]; }
            void push(T x){ arr[++_top] = x; }
            T pop() { assert(!empty); return arr[_top--]; }
        }
        // one slot per counter nesting level, plus the base entry pushed below
        auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], uint.max);
        counterRange.push(1);
        ulong totalRange = 0;
        for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
        {
            if (ir[pc].hotspot)
            {
                assert(pc + 1 < ir.length,
                    "unexpected end of IR while looking for hotspot");
                // the word after a hotspot receives its offset in the hotspot table
                ir[pc+1] = Bytecode.fromRaw(hotspotTableSize);
                hotspotTableSize += counterRange.top;
            }
            switch (ir[pc].code)
            {
            case IR.RepeatStart, IR.RepeatQStart:
                immutable endPc = cast(uint)(pc + ir[pc].data + IRL!(IR.RepeatStart));
                assert(ir[endPc].code == ir[pc].paired.code);
                // taken before the word at endPc+4 is rescaled below
                immutable upper = ir[endPc + 4].raw;
                immutable scale = counterRange.top;
                ir[endPc+2].raw = scale;
                ir[endPc+3].raw *= scale;
                ir[endPc+4].raw *= scale;
                immutable ulong thisRange = cast(ulong)(upper)*scale;
                totalRange += thisRange;
                enforce(totalRange < maxCumulativeRepetitionLength,
                    "repetition length limit is exceeded");
                counterRange.push(cast(uint)thisRange + scale);
                threadCount += counterRange.top;
                break;
            case IR.RepeatEnd, IR.RepeatQEnd:
                threadCount += counterRange.top;
                counterRange.pop();
                break;
            case IR.GroupStart, IR.GroupEnd:
                // flag group markers whose group is the target of a back-reference
                if (isBackref(ir[pc].data))
                    ir[pc].setBackrefence();
                threadCount += counterRange.top;
                break;
            default:
                threadCount += counterRange.top;
            }
        }
        checkIfOneShot();
        if ((flags & RegexInfo.oneShot) == 0)
            kickstart = Kickstart!Char(zis, new uint[](256));
        debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
        optimize(zis);
    }
}
|
|
|
|
// Recomputes the `data` field (relative offsets) of paired and alternation
// instructions in `ir`, in place.
void fixupBytecode()(Bytecode[] ir)
{
    // positions of instructions still waiting for their closing counterpart
    Stack!uint pending;

    with(IR) for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
    {
        if (ir[pc].isStart || ir[pc].code == Option)
            pending.push(pc);
        else if (ir[pc].code == OrEnd)
        {
            // Alternatives need more care
            auto lastOpt = pending.pop(); // last Option
            ir[lastOpt].data = pc - lastOpt - ir[lastOpt].length;
            auto orStart = pending.pop(); // OrStart
            ir[orStart].data = pc - orStart - ir[orStart].length;
            ir[pc].data = ir[orStart].data;

            // fixup all GotoEndOrs: walk Option to Option until OrEnd is hit
            auto opt = orStart + IRL!(OrStart);
            assert(ir[opt].code == Option);
            auto following = opt + ir[opt].data + IRL!(Option);
            while (ir[following].code != OrEnd)
            {
                // the GotoEndOr sits directly in front of the next Option
                ir[following - IRL!(GotoEndOr)].data = pc - following;
                opt = following;
                following = opt + ir[opt].data + IRL!(Option);
            }
        }
        else if (ir[pc].code == GotoEndOr)
        {
            auto optPc = pending.pop(); // Option
            ir[optPc].data = pc - optPc + IRL!(GotoEndOr)- IRL!(Option); // to the next option
        }
        else if (ir[pc].isEnd)
        {
            // start and end of a pair carry the same offset
            auto startPc = pending.pop();
            ir[pc].data = pc - startPc - ir[startPc].length;
            ir[startPc].data = ir[pc].data;
        }
    }
    assert(pending.empty);
}
|
|
|
|
// Rewrites InfiniteStart/InfiniteEnd pairs into their "Bloom" variants when
// the set of characters that may follow the loop is known and small enough,
// registering a BitTable filter for each rewritten loop.
void optimize(Char)(ref Regex!Char zis)
{
    // Set of code points accepted by the first Char/Trie/CodepointSet
    // instruction at or after `idx`, skipping group markers only.
    // Empty when some other instruction comes first.
    CodepointSet nextSet(uint idx)
    {
        CodepointSet set;
        for (uint pos = idx; pos < zis.ir.length; pos += zis.ir[pos].length)
        {
            immutable op = zis.ir[pos].code;
            if (op == IR.GroupStart || op == IR.GroupEnd)
                continue;
            if (op == IR.Char)
                set.add(zis.ir[pos].data, zis.ir[pos].data+1);
            //TODO: OrChar
            else if (op == IR.Trie || op == IR.CodepointSet)
                set = zis.charsets[zis.ir[pos].data];
            break;
        }
        return set;
    }

    for (uint pc = 0; pc < zis.ir.length; pc += zis.ir[pc].length)
    {
        if (zis.ir[pc].code != IR.InfiniteEnd)
            continue;
        auto set = nextSet(pc+IRL!(IR.InfiniteEnd));
        if (set.empty || set.length >= 10_000)
            continue;
        // both ends of the loop carry the same data value
        immutable bodyLen = zis.ir[pc].data;
        zis.ir[pc] = Bytecode(IR.InfiniteBloomEnd, bodyLen);
        zis.ir[pc - bodyLen - IRL!(IR.InfiniteStart)] =
            Bytecode(IR.InfiniteBloomStart, bodyLen);
        // extra word after the end instruction: index of the filter added below
        zis.ir.insertInPlace(pc+IRL!(IR.InfiniteEnd),
            Bytecode.fromRaw(cast(uint)zis.filters.length));
        zis.filters ~= BitTable(set);
        // the insertion shifted code around, so offsets are recomputed
        fixupBytecode(zis.ir);
    }
}
|
|
|
|
//IR code validator - proper nesting, illegal instructions, etc.
|
|
// Walks the IR and asserts that every start/end instruction points at a
// partner inside the program which pairs back with it, and that every other
// instruction is an atom.
@trusted void validateRe(Char)(ref Regex!Char zis)
{//@@@BUG@@@ text is @system
    import std.conv;
    with(zis)
    {
        for (uint pc = 0; pc < ir.length; pc += ir[pc].length)
        {
            if (ir[pc].isStart || ir[pc].isEnd)
            {
                uint dest = ir[pc].indexOfPair(pc);
                assert(dest < ir.length, text("Wrong length in opcode at pc=",
                    pc, " ", dest, " vs ", ir.length));
                assert(ir[dest].paired == ir[pc],
                    text("Wrong pairing of opcodes at pc=", pc, " and pc=", dest));
            }
            else if (ir[pc].isAtom)
            {
                // atoms have nothing to cross-check
            }
            else
                assert(0, text("Unknown type of instruction at pc=", pc));
        }
    }
}
|