phobos/std/regex/internal/parser.d
David Soria Parra 46831a485e std.regex: correctly add last character of a set to regex (bugzilla 14529)
When encountering the end of a character set ']' we have to correctly add the last
encountered valid character to the regex and respect flags. This bug caused the
last character to not be correctly case folded if case folding was requested.

This fixes https://issues.dlang.org/show_bug.cgi?id=14529.
2015-07-07 01:32:11 +02:00

1529 lines
50 KiB
D

//Written in the D programming language
/*
Regular expression pattern parser.
*/
module std.regex.internal.parser;
import std.regex.internal.ir;
import std.algorithm, std.range, std.uni, std.typetuple,
std.traits, std.typecons, std.exception;
/*
    Packages the results of a finished parse into a Regex object:
    bytecode, named-group dictionary, group count, counter depth, flags,
    character sets, tries and the back-reference bitmap are copied over,
    then the light post-processing pass is run on the result.
*/
auto makeRegex(S)(Parser!S p)
{
    Regex!(BasicElementOf!S) re;
    // hand over everything the parser has accumulated
    re.ir = p.ir;
    re.dict = p.dict;
    re.ngroup = p.groupStack.top;
    re.maxCounterDepth = p.counterDepth;
    re.flags = p.re_flags;
    re.charsets = p.charsets;
    re.tries = p.tries;
    re.backrefed = p.backrefed;
    re.lightPostprocess();
    debug(std_regex_parser)
    {
        re.print();
    }
    //@@@BUG@@@ (not reduced)
    //somehow just using validate _collides_ with std.utf.validate (!)
    version(assert) re.validateRe();
    return re;
}
// unittest convenience: parse a pattern string with no flags and build the Regex
auto makeRegex(S)(S arg)
    if(isSomeString!S)
{
    auto parser = Parser!S(arg, "");
    return makeRegex(parser);
}
// Exercises the namedCaptures range of a compiled regex: random access,
// slicing, front/back, emptiness and save/popFront/popBack.
unittest
{
// two named groups; the assertions below expect them in the order "name", "var"
auto re = makeRegex(`(?P<name>\w+) = (?P<var>\d+)`);
auto nc = re.namedCaptures;
static assert(isRandomAccessRange!(typeof(nc)));
assert(!nc.empty);
assert(nc.length == 2);
assert(nc.equal(["name", "var"]));
assert(nc[0] == "name");
assert(nc[1..$].equal(["var"]));
// unnamed groups around a single named one do not show up in namedCaptures
re = makeRegex(`(\w+) (?P<named>\w+) (\w+)`);
nc = re.namedCaptures;
assert(nc.length == 1);
assert(nc[0] == "named");
assert(nc.front == "named");
assert(nc.back == "named");
// no named groups at all -> empty range
re = makeRegex(`(\w+) (\w+)`);
nc = re.namedCaptures;
assert(nc.empty);
// save must produce an independent copy: popping nc leaves cp untouched
re = makeRegex(`(?P<year>\d{4})/(?P<month>\d{2})/(?P<day>\d{2})/`);
nc = re.namedCaptures;
auto cp = nc.save;
assert(nc.equal(cp));
nc.popFront();
assert(nc.equal(cp[1..$]));
nc.popBack();
assert(nc.equal(cp[1 .. $ - 1]));
}
/*
    Reverses the order of instructions in `code` in place (used by
    fixLookaround for lookbehind bodies).

    A scratch array `rev` of the same length is filled from its end towards
    its start while `code` is walked forward; at the very end `rev` is copied
    back over `code`. Alternation branches are not walked inline: each branch
    is queued on `stack` as a (start, end, revPc) triple and processed by a
    later iteration of the outer loop.
*/
@trusted void reverseBytecode()(Bytecode[] code)
{
Bytecode[] rev = new Bytecode[code.length];
// write cursor into rev; always points one past the next slot to fill
uint revPc = cast(uint)rev.length;
// pending alternation branches: (start in code, end in code, revPc to use)
Stack!(Tuple!(uint, uint, uint)) stack;
uint start = 0;
uint end = cast(uint)code.length;
for(;;)
{
for(uint pc = start; pc < end; )
{
uint len = code[pc].length;
if(code[pc].code == IR.GotoEndOr)
break; //pick next alternation branch
if(code[pc].isAtom)
{
// atoms are copied as a unit (all `len` words) to the back of rev
rev[revPc - len .. revPc] = code[pc .. pc + len];
revPc -= len;
pc += len;
}
else if(code[pc].isStart || code[pc].isEnd)
{
//skip over other embedded lookbehinds they are reversed
if(code[pc].code == IR.LookbehindStart
|| code[pc].code == IR.NeglookbehindStart)
{
// whole block (start + body + paired end) is copied verbatim
uint blockLen = len + code[pc].data
+ code[pc].pairedLength;
rev[revPc - blockLen .. revPc] = code[pc .. pc + blockLen];
pc += blockLen;
revPc -= blockLen;
continue;
}
// place the paired instruction first, so the pair swaps positions in rev
uint second = code[pc].indexOfPair(pc);
uint secLen = code[second].length;
rev[revPc - secLen .. revPc] = code[second .. second + secLen];
revPc -= secLen;
if(code[pc].code == IR.OrStart)
{
//we pass len bytes forward, but secLen in reverse
uint revStart = revPc - (second + len - secLen - pc);
uint r = revStart;
uint i = pc + IRL!(IR.OrStart);
// walk every Option of this alternation, keeping Option/GotoEndOr
// markers in place and queueing the branch bodies for later reversal
while(code[i].code == IR.Option)
{
if(code[i - 1].code != IR.OrStart)
{
assert(code[i - 1].code == IR.GotoEndOr);
rev[r - 1] = code[i - 1];
}
rev[r] = code[i];
auto newStart = i + IRL!(IR.Option);
auto newEnd = newStart + code[i].data;
auto newRpc = r + code[i].data + IRL!(IR.Option);
// branch is not the last one: one slot less is available for its body
// NOTE(review): presumably that slot is the GotoEndOr written above - confirm
if(code[newEnd].code != IR.OrEnd)
{
newRpc--;
}
stack.push(tuple(newStart, newEnd, newRpc));
r += code[i].data + IRL!(IR.Option);
i += code[i].data + IRL!(IR.Option);
}
pc = i;
revPc = revStart;
assert(code[pc].code == IR.OrEnd);
}
else
pc += len;
}
}
// continue with the next queued alternation branch, if any
if(stack.empty)
break;
start = stack.top[0];
end = stack.top[1];
revPc = stack.top[2];
stack.pop();
}
code[] = rev[];
}
// Characters that stand for themselves when preceded by a backslash inside a
// character class; expanded into `case` labels by parseCharTerm.
alias Escapables = TypeTuple!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~');
/*
    Reads exactly maxDigit hexadecimal digits from the front of str and
    returns the code point they denote.

    Params:
        str      = input slice; on success it is advanced past the digits,
                   on failure it is left untouched
        maxDigit = number of hex digits to consume

    Returns: the decoded code point (<= 0x10FFFF).

    Throws: Exception with message
        "incomplete escape sequence" if str is shorter than maxDigit,
        "invalid escape sequence" if a non-hex character is met,
        "invalid codepoint" if the value exceeds 0x10FFFF.
*/
dchar parseUniHex(Char)(ref Char[] str, size_t maxDigit)
{
    //std.conv.parse is both @system and bogus
    enforce(str.length >= maxDigit,"incomplete escape sequence");
    enum uint maxCodepoint = 0x10FFFF;
    uint val;
    foreach(k; 0 .. maxDigit)
    {
        auto current = str[k];//accepts ascii only, so it's OK to index directly
        uint digit;
        if('0' <= current && current <= '9')
            digit = cast(uint)(current - '0');
        else if('a' <= current && current <= 'f')
            digit = cast(uint)(current - 'a') + 10;
        else if('A' <= current && current <= 'F')
            digit = cast(uint)(current - 'A') + 10;
        else
            throw new Exception("invalid escape sequence");
        val = val * 16 + digit;
        // saturate just above the valid range: with more than 8 digits the
        // accumulator would otherwise wrap around and could look valid again
        if(val > maxCodepoint)
            val = maxCodepoint + 1;
    }
    enforce(val <= maxCodepoint, "invalid codepoint");
    str = str[maxDigit..$];
    return cast(dchar)val;
}
// Checks parseUniHex on malformed digits, valid values and an out-of-range value.
@system unittest //BUG canFind is system
{
// each of these contains at least one non-hex character
string[] non_hex = [ "000j", "000z", "FffG", "0Z"];
// hex[i] must decode to value[i]
string[] hex = [ "01", "ff", "00af", "10FFFF" ];
int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ];
foreach(v; non_hex)
assert(collectException(parseUniHex(v, v.length)).msg
.canFind("invalid escape sequence"));
foreach(i, v; hex)
assert(parseUniHex(v, v.length) == value[i]);
// well-formed hex, but above 0x10FFFF
string over = "0011FFFF";
assert(collectException(parseUniHex(over, over.length)).msg
.canFind("invalid codepoint"));
}
//heuristic value determines maximum CodepointSet length suitable for linear search
//(charsetToIr compares it against twice the number of intervals in a set)
enum maxCharsetUsed = 6;
//number of entries at which getTrie flushes trieCache; 0 disables caching
enum maxCachedTries = 8;
//trie type used for large character sets, and the function that builds it
alias Trie = CodepointSetTrie!(13, 8);
alias makeTrie = codepointSetTrie!(13, 8);
//cache of already built tries, keyed by the set they were built from
Trie[CodepointSet] trieCache;
//cached accessor: returns the Trie for `set`, building it only on a cache miss
@trusted Trie getTrie(CodepointSet set)
{// @@@BUG@@@ 6357 almost all properties of AA are not @safe
    // no caching during CTFE or when the cache is disabled
    if(__ctfe || maxCachedTries == 0)
        return makeTrie(set);
    if(auto cached = set in trieCache)
        return *cached;
    // cache is full: flush entries in trieCache before adding a new one
    if(trieCache.length == maxCachedTries)
        trieCache = null;
    auto fresh = makeTrie(set);
    trieCache[set] = fresh;
    return fresh;
}
/*
    Returns `set` extended with every simple case folding of each of its
    members that belongs to unicode.LC.
*/
auto caseEnclose(CodepointSet set)
{
    // snapshot of the cased members; `set` itself grows while we iterate
    auto letters = set & unicode.LC;
    foreach (dchar letter; letters.byCodepoint)
        foreach(variant; simpleCaseFoldings(letter))
            set |= variant;
    return set;
}
/+
    Look up the codepoint set for a property/block name via std.uni's
    `unicode`, optionally closing it over case foldings and/or inverting it
    (in that order).
+/
@trusted CodepointSet getUnicodeSet(in char[] name, bool negated, bool casefold)
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if(casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
//minimal array-backed LIFO stack; kept generic in case it is needed outside Parser
@trusted struct Stack(T)
{
    T[] data; // elements, bottom first

    // true when nothing is stored
    @property bool empty()
    {
        return data.empty;
    }

    // number of stored elements
    @property size_t length()
    {
        return data.length;
    }

    // appends val on top
    void push(T val)
    {
        data ~= val;
    }

    // removes the top element and returns it; the stack must not be empty
    T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto removed = data[last];
        data = data[0 .. last];
        // let later pushes reuse the freed slot (not available in CTFE)
        if(!__ctfe)
            cast(void)data.assumeSafeAppend();
        return removed;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[$ - 1];
    }
}
//safety limits
//upper bound on the group counter, enforced when a capturing group is opened
enum maxGroupNumber = 2^^19;
//maximum nesting depth of lookaround constructs
enum maxLookaroundDepth = 16;
// *Bytecode.sizeof, i.e. 1Mb of bytecode alone
enum maxCompiledLength = 2^^18;
//amounts to up to 4 Mb of auxiliary table for matching
enum maxCumulativeRepetitionLength = 2^^20;
struct Parser(R)
if (isForwardRange!R && is(ElementType!R : dchar))
{
//marker used by parseQuantifier for "no upper bound"
enum infinite = ~0u;
//most recently fetched pattern character (see _next)
dchar _current;
//set by _next once the pattern is exhausted
bool empty;
R pat, origin; //keep full pattern for pretty printing error messages
Bytecode[] ir; //resulting bytecode
uint re_flags = 0; //global flags e.g. multiline + internal ones
Stack!(uint) fixupStack; //stack of opened start instructions
NamedGroup[] dict; //maps name -> user group number
//current num of group, group nesting level and repetitions step
Stack!(uint) groupStack;
uint nesting = 0;
uint lookaroundNest = 0;
uint counterDepth = 0; //current depth of nested counted repetitions
CodepointSet[] charsets; //sets referenced by index from the bytecode (filled by charsetToIr)
const(Trie)[] tries; //kept the same length as charsets by charsetToIr
uint[] backrefed; //bitarray for groups
/*
    Parses `pattern` under the given `flags` string and leaves the resulting
    bytecode, terminated by IR.End, in `ir`.

    Any Exception raised while parsing is re-raised through error(), which
    adds the pattern location to the message.
*/
@trusted this(S)(R pattern, S flags)
if(isSomeString!S)
{
pat = origin = pattern;
//reserve slightly more then avg as sampled from unittests
if(!__ctfe)
ir.reserve((pat.length*5+2)/4);
// flags must be known before the first next(): next() consults the freeform flag
parseFlags(flags);
_current = ' ';//a safe default for freeform parsing
next();
try
{
parseRegex();
}
catch(Exception e)
{
error(e.msg);//also adds pattern location
}
put(Bytecode(IR.End, 0));
}
//record that group n is the target of a back-reference (one bit per group)
void markBackref(uint n)
{
    immutable word = n / 32;
    immutable bit = n & 31;
    // grow the bit array on demand
    if(backrefed.length <= word)
        backrefed.length = word + 1;
    backrefed[word] |= 1 << bit;
}
// tells whether capturing group n has been opened but not yet closed
bool isOpenGroup(uint n)
{
    // walk the fixup stack and see if there are groups labeled 'n'
    // fixup '0' is reserved for alternations
    foreach(fix; fixupStack.data[1..$])
    {
        if(ir[fix].code == IR.GroupStart && ir[fix].data == n)
            return true;
    }
    return false;
}
// the pattern character most recently fetched by _next
@property dchar current()
{
    return _current;
}
// Fetches the next raw pattern character into _current.
// Returns false (and sets `empty`) when the pattern is exhausted;
// _current is left unchanged in that case.
bool _next()
{
    immutable hasMore = !pat.empty;
    if(hasMore)
    {
        _current = pat.front;
        pat.popFront();
    }
    else
        empty = true;
    return hasMore;
}
// advances past whitespace, stopping at the first non-white character
// or at the end of the pattern
void skipSpace()
{
    for(;;)
    {
        if(!isWhite(current))
            break;
        if(!_next())
            break;
    }
}
// Fetches the next pattern character; in freeform mode whitespace that
// follows it is skipped as well. Returns the result of the first fetch.
bool next()
{
    if(!(re_flags & RegexOption.freeform))
        return _next();
    immutable advanced = _next();
    skipSpace();
    return advanced;
}
// appends one instruction to the bytecode, refusing to grow past maxCompiledLength
void put(Bytecode code)
{
    immutable hasRoom = ir.length < maxCompiledLength;
    enforce(hasRoom, "maximum compiled pattern length is exceeded");
    ir ~= code;
}
// appends a raw 32-bit word to the bytecode under the same length limit as put
void putRaw(uint number)
{
    put(Bytecode.fromRaw(number));
}
//reads a run of decimal digits starting at the current character,
//with a basic (conservative) overflow check; returns 0 if there is no digit
uint parseDecimal()
{
    uint value = 0;
    for(;;)
    {
        if(!std.ascii.isDigit(current))
            break;
        if(value >= (uint.max/10))
            error("Overflow in decimal number");
        value = 10*value + cast(uint)(current-'0');
        // end of pattern: stop, leaving the last digit as current
        if(!next())
            break;
    }
    return value;
}
//parse control code of form \cXXX, c assumed to be the current symbol;
//returns the low five bits of the letter that follows
dchar parseControlCode()
{
    enforce(next(), "Unfinished escape sequence");
    immutable isLower = 'a' <= current && current <= 'z';
    immutable isUpper = 'A' <= current && current <= 'Z';
    enforce(isLower || isUpper, "Only letters are allowed after \\c");
    return current & 0x1f;
}
/*
    Translates a flags string into bits of re_flags.

    Each character is matched against RegexOptionNames; the RegexOption
    member at the same index is the bit that gets set.

    Throws: RegexException if a flag is repeated or is not recognized.
*/
@trusted void parseFlags(S)(S flags)
{//@@@BUG@@@ text is @system
import std.conv;
foreach(ch; flags)//flags are ASCII anyway
{
L_FlagSwitch:
switch(ch)
{
// unrolled at compile time: one `case` per member of RegexOption
foreach(i, op; __traits(allMembers, RegexOption))
{
case RegexOptionNames[i]:
if(re_flags & mixin("RegexOption."~op))
throw new RegexException(text("redundant flag specified: ",ch));
re_flags |= mixin("RegexOption."~op);
break L_FlagSwitch;
}
default:
throw new RegexException(text("unknown regex flag '",ch,"'"));
}
}
}
/*
    Parses the whole pattern and appends the resulting bytecode to `ir`.

    Handles grouping constructs ('(' ... ')', including (?:, (?=, (?!, (?<=,
    (?<! and (?P<name>), alternation ('|') and delegates everything else to
    parseAtom/parseQuantifier. Open constructs are tracked on fixupStack;
    entry 0 of that stack is a sentinel for the top level.

    Throws: Exception/RegexException (via enforce/error) on malformed input
    such as unmatched parentheses or exceeded limits.
*/
@trusted void parseRegex()
{
    fixupStack.push(0);
    groupStack.push(1);//0 - whole match
    uint fix;//fixup pointer
    while(!empty)
    {
        debug(std_regex_parser)
        {
            import std.stdio : writeln;
            writeln("*LR*\nSource: ", pat, "\nStack: ", fixupStack.data);
        }
        switch(current)
        {
        case '(':
            next();
            nesting++;
            uint nglob;
            // remember where this group starts so ')' can find it
            fixupStack.push(cast(uint)ir.length);
            if(current == '?')
            {
                next();
                switch(current)
                {
                case ':':
                    // placeholder, replaced or removed by parseQuantifier
                    put(Bytecode(IR.Nop, 0));
                    next();
                    break;
                case '=':
                    genLookaround(IR.LookaheadStart);
                    next();
                    break;
                case '!':
                    genLookaround(IR.NeglookaheadStart);
                    next();
                    break;
                case 'P':
                    next();
                    if(current != '<')
                        error("Expected '<' in named group");
                    string name;
                    if(!next() || !(isAlpha(current) || current == '_'))
                        error("Expected alpha starting a named group");
                    name ~= current;
                    while(next() && (isAlpha(current) ||
                        current == '_' || std.ascii.isDigit(current)))
                    {
                        name ~= current;
                    }
                    if(current != '>')
                        error("Expected '>' closing named group");
                    next();
                    nglob = groupStack.top++;
                    enforce(groupStack.top <= maxGroupNumber, "limit on submatches is exceeded");
                    // keep dict sorted by name: insert at the lower bound
                    auto t = NamedGroup(name, nglob);
                    auto d = assumeSorted!"a.name < b.name"(dict);
                    auto ind = d.lowerBound(t).length;
                    insertInPlace(dict, ind, t);
                    put(Bytecode(IR.GroupStart, nglob));
                    break;
                case '<':
                    next();
                    if(current == '=')
                        genLookaround(IR.LookbehindStart);
                    else if(current == '!')
                        genLookaround(IR.NeglookbehindStart);
                    else
                        error("'!' or '=' expected after '<'");
                    next();
                    break;
                default:
                    error(" ':', '=', '<', 'P' or '!' expected after '(?' ");
                }
            }
            else
            {
                nglob = groupStack.top++;
                enforce(groupStack.top <= maxGroupNumber, "limit on number of submatches is exceeded");
                put(Bytecode(IR.GroupStart, nglob));
            }
            break;
        case ')':
            enforce(nesting, "Unmatched ')'");
            nesting--;
            next();
            fix = fixupStack.pop();
            switch(ir[fix].code)
            {
            case IR.GroupStart:
                put(Bytecode(IR.GroupEnd,ir[fix].data));
                parseQuantifier(fix);
                break;
            case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                assert(lookaroundNest);
                fixLookaround(fix);
                lookaroundNest--;
                break;
            case IR.Option: //| xxx )
                //two fixups: last option + full OR
                finishAlternation(fix);
                fix = fixupStack.top;
                switch(ir[fix].code)
                {
                case IR.GroupStart:
                    fixupStack.pop();
                    put(Bytecode(IR.GroupEnd,ir[fix].data));
                    parseQuantifier(fix);
                    break;
                case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
                    assert(lookaroundNest);
                    lookaroundNest--;
                    fix = fixupStack.pop();
                    fixLookaround(fix);
                    break;
                default://(?:xxx)
                    fixupStack.pop();
                    parseQuantifier(fix);
                }
                break;
            default://(?:xxx)
                parseQuantifier(fix);
            }
            break;
        case '|':
            next();
            fix = fixupStack.top;
            // already inside an alternation: close the previous option, open a new one
            if(ir.length > fix && ir[fix].code == IR.Option)
            {
                ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix);
                put(Bytecode(IR.GotoEndOr, 0));
                fixupStack.top = cast(uint)ir.length; //replace latest fixup for Option
                put(Bytecode(IR.Option, 0));
                break;
            }
            uint len, orStart;
            //start a new option
            if(fixupStack.length == 1)
            {//only root entry, effectively no fixup
                len = cast(uint)ir.length + IRL!(IR.GotoEndOr);
                orStart = 0;
            }
            else
            {//IR.lookahead, etc. fixups that have length > 1, thus check ir[x].length
                len = cast(uint)ir.length - fix - (ir[fix].length - 1);
                orStart = fix + ir[fix].length;
            }
            // wrap everything parsed so far at this level as the first option
            insertInPlace(ir, orStart, Bytecode(IR.OrStart, 0), Bytecode(IR.Option, len));
            assert(ir[orStart].code == IR.OrStart);
            put(Bytecode(IR.GotoEndOr, 0));
            fixupStack.push(orStart); //fixup for StartOR
            fixupStack.push(cast(uint)ir.length); //for second Option
            put(Bytecode(IR.Option, 0));
            break;
        default://no groups or whatever
            uint start = cast(uint)ir.length;
            parseAtom();
            parseQuantifier(start);
        }
    }
    // anything still open at the end must be a top-level alternation
    if(fixupStack.length != 1)
    {
        fix = fixupStack.pop();
        enforce(ir[fix].code == IR.Option, "no matching ')'");
        finishAlternation(fix);
        enforce(fixupStack.length == 1, "no matching ')'");
    }
}
//helper function, finalizes IR.Option, fix points to the first option of sequence
//Patches the lengths of the pending Option and of the enclosing OrStart
//(popped from fixupStack), appends OrEnd, then rewrites every GotoEndOr of
//the alternation so that it carries its distance to the OrEnd.
void finishAlternation(uint fix)
{
enforce(ir[fix].code == IR.Option, "no matching ')'");
ir[fix] = Bytecode(ir[fix].code, cast(uint)ir.length - fix - IRL!(IR.OrStart));
// next entry on the fixup stack must be the OrStart of this alternation
fix = fixupStack.pop();
enforce(ir[fix].code == IR.OrStart, "no matching ')'");
ir[fix] = Bytecode(IR.OrStart, cast(uint)ir.length - fix - IRL!(IR.OrStart));
put(Bytecode(IR.OrEnd, cast(uint)ir.length - fix - IRL!(IR.OrStart)));
// walk option by option; the last option has no GotoEndOr and ends the loop
uint pc = fix + IRL!(IR.OrStart);
while(ir[pc].code == IR.Option)
{
pc = pc + ir[pc].data;
if(ir[pc].code != IR.GotoEndOr)
break;
ir[pc] = Bytecode(IR.GotoEndOr, cast(uint)(ir.length - pc - IRL!(IR.OrEnd)));
pc += IRL!(IR.GotoEndOr);
}
// extra raw word following OrEnd
// NOTE(review): presumably the slot later filled by lightPostprocess for hotspots - confirm
put(Bytecode.fromRaw(0));
}
//parse and store IR for atom-quantifier pair
//`offset` is the index in `ir` where the quantified block starts. If the
//instruction there is IR.Nop (emitted for "(?:"), it is either overwritten
//by the repetition start instruction or removed when no quantifier follows.
@trusted void parseQuantifier(uint offset)
{//copy is @system
// 1 when ir[offset] is a Nop placeholder, 0 otherwise; also used in arithmetic below
uint replace = ir[offset].code == IR.Nop;
if(empty && !replace)
return;
uint min, max;
switch(current)
{
case '*':
min = 0;
max = infinite;
break;
case '?':
min = 0;
max = 1;
break;
case '+':
min = 1;
max = infinite;
break;
case '{':
// {n}, {n,} or {n,m}
enforce(next(), "Unexpected end of regex pattern");
enforce(std.ascii.isDigit(current), "First number required in repetition");
min = parseDecimal();
if(current == '}')
max = min;
else if(current == ',')
{
next();
if(std.ascii.isDigit(current))
max = parseDecimal();
else if(current == '}')
max = infinite;
else
error("Unexpected symbol in regex pattern");
skipSpace();
if(current != '}')
error("Unmatched '{' in regex pattern");
}
else
error("Unexpected symbol in regex pattern");
if(min > max)
error("Illegal {n,m} quantifier");
break;
default:
// no quantifier: just drop the Nop placeholder, if there is one
if(replace)
{
copy(ir[offset + 1 .. $], ir[offset .. $ - 1]);
ir.length -= 1;
}
return;
}
// length of the quantified block, not counting the Nop placeholder
uint len = cast(uint)ir.length - offset - replace;
bool greedy = true;
//check only if we managed to get new symbol
if(next() && current == '?')
{
greedy = false;
next();
}
if(max != infinite)
{
// bounded repetition; {1,1} needs no extra code at all
if(min != 1 || max != 1)
{
Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
if(replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
put(Bytecode.init); //hotspot
putRaw(1);
putRaw(min);
putRaw(max);
counterDepth = std.algorithm.max(counterDepth, nesting+1);
}
}
else if(min) //&& max is infinite
{
// {min,}: emitted as exactly-min repetition followed by an infinite loop
// over a second copy of the block
if(min != 1)
{
Bytecode op = Bytecode(greedy ? IR.RepeatStart : IR.RepeatQStart, len);
if(replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
offset += 1;//so it still points to the repeated block
put(Bytecode(greedy ? IR.RepeatEnd : IR.RepeatQEnd, len));
put(Bytecode.init); //hotspot
putRaw(1);
putRaw(min);
putRaw(min);
counterDepth = std.algorithm.max(counterDepth, nesting+1);
}
else if(replace)
{
copy(ir[offset+1 .. $], ir[offset .. $-1]);
ir.length -= 1;
}
put(Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len));
enforce(ir.length + len < maxCompiledLength, "maximum compiled pattern length is exceeded");
// duplicate the block as the body of the infinite loop
ir ~= ir[offset .. offset+len];
//IR.InfinteX is always a hotspot
put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
put(Bytecode.init); //merge index
}
else//vanila {0,inf}
{
Bytecode op = Bytecode(greedy ? IR.InfiniteStart : IR.InfiniteQStart, len);
if(replace)
ir[offset] = op;
else
insertInPlace(ir, offset, op);
//IR.InfinteX is always a hotspot
put(Bytecode(greedy ? IR.InfiniteEnd : IR.InfiniteQEnd, len));
put(Bytecode.init); //merge index
}
}
//parse a single atom at the current position and emit its bytecode;
//does nothing when the pattern is exhausted
void parseAtom()
{
    if(empty)
        return;
    switch(current)
    {
    case '*', '?', '+', '|', '{', '}':
        error("'*', '+', '?', '{', '}' not allowed in atom");
        break;
    case '.':
        put(Bytecode(IR.Any, 0));
        next();
        break;
    case '[':
        parseCharset();
        break;
    case '\\':
        // raw fetch: the escaped character must not be skipped as whitespace
        enforce(_next(), "Unfinished escape sequence");
        parseEscape();
        break;
    case '^':
        put(Bytecode(IR.Bol, 0));
        next();
        break;
    case '$':
        put(Bytecode(IR.Eol, 0));
        next();
        break;
    default:
        //FIXME: getCommonCasing in new std uni
        immutable foldCase = (re_flags & RegexOption.casefold) != 0;
        if(!foldCase)
            put(Bytecode(IR.Char, current));
        else
        {
            auto foldings = simpleCaseFoldings(current);
            assert(foldings.length <= 5);
            immutable count = cast(uint)foldings.length;
            // a single folding is a plain Char, several become an OrChar sequence
            if(count == 1)
                put(Bytecode(IR.Char, foldings.front));
            else
            {
                foreach(v; foldings)
                    put(Bytecode(IR.OrChar, v, count));
            }
        }
        next();
    }
}
//emit the opening of a lookaround: (?= (?! (?<= (?<!
//the start opcode is followed by two raw words that fixLookaround fills in
void genLookaround(IR opcode)
{
    put(Bytecode(opcode, 0));
    putRaw(0);
    putRaw(0);
    // groups inside the lookaround are counted from zero
    groupStack.push(0);
    ++lookaroundNest;
    enforce(lookaroundNest <= maxLookaroundDepth,
        "maximum lookaround depth is exceeded");
}
//patch the lookaround that starts at `fix` (body length and group range)
//and append the matching *-End opcode
void fixLookaround(uint fix)
{
    immutable bodyLen = cast(uint)ir.length - fix - IRL!(IR.LookaheadStart);
    ir[fix] = Bytecode(ir[fix].code, bodyLen);
    auto inner = groupStack.pop();
    assert(!groupStack.empty);
    //groups are cumulative across lookarounds
    immutable firstGroup = groupStack.top;
    immutable lastGroup = firstGroup + inner;
    ir[fix+1] = Bytecode.fromRaw(firstGroup);
    ir[fix+2] = Bytecode.fromRaw(lastGroup);
    groupStack.top = lastGroup;
    immutable isBehind = ir[fix].code == IR.LookbehindStart
        || ir[fix].code == IR.NeglookbehindStart;
    // lookbehind bodies are stored reversed
    if(isBehind)
        reverseBytecode(ir[fix + IRL!(IR.LookbehindStart) .. $]);
    put(ir[fix].paired);
}
//CodepointSet operations relatively in order of priority
//Open marks a '[' on the operator stack, Negate a leading '^';
//None means "no operator follows this term".
enum Operator:uint {
Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
}
/*
    Parses one term of a character class: a run of literal characters,
    escapes and ranges, up to the next '[', ']' or twin-symbol operator
    (||, --, ~~, &&).

    Returns: the set collected for the term together with the operator that
    follows it (Operator.None if the term was ended by ']').

    Every literal character and every range is added through addWithFlags,
    so case folding is applied uniformly when RegexOption.casefold is set.

    Throws: Exception on an invalid escape, an inverted range or a
    premature end of the pattern.
*/
Tuple!(CodepointSet,Operator) parseCharTerm()
{
    enum State{ Start, Char, Escape, CharDash, CharDashEscape,
        PotentialTwinSymbolOperator }
    Operator op = Operator.None;
    dchar last;
    CodepointSet set;
    State state = State.Start;
    // adds ch to set, together with its simple case foldings if requested
    static void addWithFlags(ref CodepointSet set, uint ch, uint re_flags)
    {
        if(re_flags & RegexOption.casefold)
        {
            auto range = simpleCaseFoldings(ch);
            foreach(v; range)
                set |= v;
        }
        else
            set |= ch;
    }
    // maps the doubled symbol to the set operation it denotes
    static Operator twinSymbolOperator(dchar symbol)
    {
        switch(symbol)
        {
        case '|':
            return Operator.Union;
        case '-':
            return Operator.Difference;
        case '~':
            return Operator.SymDifference;
        case '&':
            return Operator.Intersection;
        default:
            assert(false);
        }
    }
    L_CharTermLoop:
    for(;;)
    {
        final switch(state)
        {
        case State.Start:
            switch(current)
            {
            case '|':
            case '-':
            case '~':
            case '&':
                state = State.PotentialTwinSymbolOperator;
                last = current;
                break;
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                break L_CharTermLoop;
            case '\\':
                state = State.Escape;
                break;
            default:
                state = State.Char;
                last = current;
            }
            break;
        case State.Char:
            // xxx last current xxx
            switch(current)
            {
            case '|':
            case '~':
            case '&':
                // then last is treated as normal char and added as implicit union
                state = State.PotentialTwinSymbolOperator;
                addWithFlags(set, last, re_flags);
                last = current;
                break;
            case '-': // still need more info
                state = State.CharDash;
                break;
            case '\\':
                // last is a complete literal: add it with case folding honoured
                addWithFlags(set, last, re_flags);
                state = State.Escape;
                break;
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                addWithFlags(set, last, re_flags);
                break L_CharTermLoop;
            default:
                addWithFlags(set, last, re_flags);
                last = current;
            }
            break;
        case State.PotentialTwinSymbolOperator:
            // xxx last current xxxx
            // where last = [|-&~]
            if(current == last)
            {
                op = twinSymbolOperator(last);
                next();//skip second twin char
                break L_CharTermLoop;
            }
            //~~~WORKAROUND~~~
            //It's a copy of State.Char, should be goto case but see @@@BUG12603
            switch(current)
            {
            case '|':
            case '~':
            case '&':
                // then last is treated as normal char and added as implicit union
                state = State.PotentialTwinSymbolOperator;
                addWithFlags(set, last, re_flags);
                last = current;
                break;
            case '-': // still need more info
                state = State.CharDash;
                break;
            case '\\':
                // kept in sync with State.Char
                addWithFlags(set, last, re_flags);
                state = State.Escape;
                break;
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                addWithFlags(set, last, re_flags);
                break L_CharTermLoop;
            default:
                addWithFlags(set, last, re_flags);
                state = State.Char;
                last = current;
            }
            break;
            //~~~END OF WORKAROUND~~~
            //goto case State.Char;// it's not a twin lets re-run normal logic
        case State.Escape:
            // xxx \ current xxx
            switch(current)
            {
            case 'f':
                last = '\f';
                state = State.Char;
                break;
            case 'n':
                last = '\n';
                state = State.Char;
                break;
            case 'r':
                last = '\r';
                state = State.Char;
                break;
            case 't':
                last = '\t';
                state = State.Char;
                break;
            case 'v':
                last = '\v';
                state = State.Char;
                break;
            case 'c':
                last = parseControlCode();
                state = State.Char;
                break;
            // one empty `case` per escapable character, all falling into the code below
            foreach(val; Escapables)
            {
            case val:
            }
                last = current;
                state = State.Char;
                break;
            case 'p':
                set.add(parseUnicodePropertySpec(false));
                state = State.Start;
                continue L_CharTermLoop; //next char already fetched
            case 'P':
                set.add(parseUnicodePropertySpec(true));
                state = State.Start;
                continue L_CharTermLoop; //next char already fetched
            case 'x':
                last = parseUniHex(pat, 2);
                state = State.Char;
                break;
            case 'u':
                last = parseUniHex(pat, 4);
                state = State.Char;
                break;
            case 'U':
                last = parseUniHex(pat, 8);
                state = State.Char;
                break;
            case 'd':
                set.add(unicode.Nd);
                state = State.Start;
                break;
            case 'D':
                set.add(unicode.Nd.inverted);
                state = State.Start;
                break;
            case 's':
                set.add(unicode.White_Space);
                state = State.Start;
                break;
            case 'S':
                set.add(unicode.White_Space.inverted);
                state = State.Start;
                break;
            case 'w':
                set.add(wordCharacter);
                state = State.Start;
                break;
            case 'W':
                set.add(wordCharacter.inverted);
                state = State.Start;
                break;
            default:
                enforce(false, "invalid escape sequence");
            }
            break;
        case State.CharDash:
            // xxx last - current xxx
            switch(current)
            {
            case '[':
                op = Operator.Union;
                goto case;
            case ']':
                //means dash is a single char not an interval specifier
                addWithFlags(set, last, re_flags);
                addWithFlags(set, '-', re_flags);
                break L_CharTermLoop;
            case '-'://set Difference again
                addWithFlags(set, last, re_flags);
                op = Operator.Difference;
                next();//skip '-'
                break L_CharTermLoop;
            case '\\':
                state = State.CharDashEscape;
                break;
            default:
                enforce(last <= current, "inverted range");
                if(re_flags & RegexOption.casefold)
                {
                    for(uint ch = last; ch <= current; ch++)
                        addWithFlags(set, ch, re_flags);
                }
                else
                    set.add(last, current + 1);
                state = State.Start;
            }
            break;
        case State.CharDashEscape:
            //xxx last - \ current xxx
            uint end;
            switch(current)
            {
            case 'f':
                end = '\f';
                break;
            case 'n':
                end = '\n';
                break;
            case 'r':
                end = '\r';
                break;
            case 't':
                end = '\t';
                break;
            case 'v':
                end = '\v';
                break;
            foreach(val; Escapables)
            {
            case val:
            }
                end = current;
                break;
            case 'c':
                end = parseControlCode();
                break;
            case 'x':
                end = parseUniHex(pat, 2);
                break;
            case 'u':
                end = parseUniHex(pat, 4);
                break;
            case 'U':
                end = parseUniHex(pat, 8);
                break;
            default:
                error("invalid escape sequence");
            }
            enforce(last <= end,"inverted range");
            // same treatment as a range with a literal end (State.CharDash)
            if(re_flags & RegexOption.casefold)
            {
                for(uint ch = last; ch <= end; ch++)
                    addWithFlags(set, ch, re_flags);
            }
            else
                set.add(last, end + 1);
            state = State.Start;
            break;
        }
        enforce(next(), "unexpected end of CodepointSet");
    }
    return tuple(set, op);
}
//operand and operator stacks used by parseCharset
alias ValStack = Stack!(CodepointSet);
alias OpStack = Stack!(Operator);
/*
    Parses a complete character class, starting at '[' and including nested
    classes and set operators, then emits bytecode for the resulting set
    through charsetToIr.

    Terms are evaluated with two stacks: vstack holds operand sets, opstack
    holds pending operators; Operator.Open marks each '['.

    Throws: Exception on syntax errors, including a pattern that ends while
    a '[' is still open.
*/
void parseCharset()
{
    ValStack vstack;
    OpStack opstack;
    import std.functional : unaryFun;
    // applies op to the top of the operand stack;
    // returns false for anything that is not an applicable operator
    static bool apply(Operator op, ref ValStack stack)
    {
        switch(op)
        {
        case Operator.Negate:
            stack.top = stack.top.inverted;
            break;
        case Operator.Union:
            auto s = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '||'");
            stack.top.add(s);
            break;
        case Operator.Difference:
            auto s = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '--'");
            stack.top.sub(s);
            break;
        case Operator.SymDifference:
            auto s = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '~~'");
            stack.top ~= s;
            break;
        case Operator.Intersection:
            auto s = stack.pop();//2nd operand
            enforce(!stack.empty, "no operand for '&&'");
            stack.top.intersect(s);
            break;
        default:
            return false;
        }
        return true;
    }
    // pops and applies operators while cond holds for the top one;
    // returns false on a syntax error or if the operator stack runs dry
    static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
    {
        while(cond(opstack.top))
        {
            if(!apply(opstack.pop(),vstack))
                return false;//syntax error
            if(opstack.empty)
                return false;
        }
        return true;
    }

    L_CharsetLoop:
    do
    {
        switch(current)
        {
        case '[':
            opstack.push(Operator.Open);
            enforce(next(), "unexpected end of character class");
            if(current == '^')
            {
                opstack.push(Operator.Negate);
                enforce(next(), "unexpected end of character class");
            }
            //[] is prohibited
            enforce(current != ']', "wrong character class");
            goto default;
        case ']':
            enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                "character class syntax error");
            enforce(!opstack.empty, "unmatched ']'");
            opstack.pop();
            // Running out of pattern is only fine once the outermost '[' is
            // closed; otherwise the stale ']' would be processed again and an
            // unterminated class such as "[[a]" would be accepted.
            enforce(next() || opstack.empty, "unexpected end of character class");
            if(opstack.empty)
                break L_CharsetLoop;
            auto pair = parseCharTerm();
            if(!pair[0].empty)//not only operator e.g. -- or ~~
            {
                vstack.top.add(pair[0]);//apply union
            }
            if(pair[1] != Operator.None)
            {
                if(opstack.top == Operator.Union)
                    unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                opstack.push(pair[1]);
            }
            break;
        //
        default://yet another pair of term(op)?
            auto pair = parseCharTerm();
            if(pair[1] != Operator.None)
            {
                if(opstack.top == Operator.Union)
                    unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                opstack.push(pair[1]);
            }
            vstack.push(pair[0]);
        }
    }while(!empty || !opstack.empty);
    while(!opstack.empty)
        apply(opstack.pop(),vstack);
    assert(vstack.length == 1);
    charsetToIr(vstack.top);
}
//emit the most compact bytecode for `set`:
//a Char / OrChar sequence for very small sets, otherwise a reference to an
//entry of `charsets` (linear search) or `tries` (trie lookup)
@trusted void charsetToIr(CodepointSet set)
{//@@@BUG@@@ writeln is @system
    immutable chars = cast(uint)set.length;
    if(chars < Bytecode.maxSequence)
    {
        switch(chars)
        {
        case 0:
            error("empty CodepointSet not allowed");
            break;
        case 1:
            put(Bytecode(IR.Char, set.byCodepoint.front));
            break;
        default:
            foreach(ch; set.byCodepoint)
                put(Bytecode(IR.OrChar, ch, chars));
        }
        return;
    }
    import std.algorithm : countUntil;
    auto ivals = set.byInterval;
    // too many intervals for a linear scan -> use a trie
    immutable useTrie = ivals.length*2 > maxCharsetUsed;
    // reuse an identical set that was registered earlier
    immutable known = charsets.countUntil(set);
    if(known >= 0)
    {
        put(Bytecode(useTrie ? IR.Trie : IR.CodepointSet, cast(uint)known));
        return;
    }
    if(useTrie)
    {
        auto t = getTrie(set);
        put(Bytecode(IR.Trie, cast(uint)tries.length));
        tries ~= t;
        debug(std_regex_allocation) writeln("Trie generated");
    }
    else
    {
        put(Bytecode(IR.CodepointSet, cast(uint)charsets.length));
        // placeholder keeps tries and charsets index-aligned
        tries ~= Trie.init;
    }
    charsets ~= set;
    assert(charsets.length == tries.length);
}
//parse a stand-alone escape sequence (outside of a character class) and
//emit its bytecode; the backslash is already consumed, `current` is the
//character that follows it
@trusted void parseEscape()
{//accesses array of appender
    switch(current)
    {
    // single-character escapes
    case 'f':
        next();
        put(Bytecode(IR.Char, '\f'));
        break;
    case 'n':
        next();
        put(Bytecode(IR.Char, '\n'));
        break;
    case 'r':
        next();
        put(Bytecode(IR.Char, '\r'));
        break;
    case 't':
        next();
        put(Bytecode(IR.Char, '\t'));
        break;
    case 'v':
        next();
        put(Bytecode(IR.Char, '\v'));
        break;
    // predefined character classes
    case 'd':
        next();
        charsetToIr(unicode.Nd);
        break;
    case 'D':
        next();
        charsetToIr(unicode.Nd.inverted);
        break;
    case 's':
        next();
        charsetToIr(unicode.White_Space);
        break;
    case 'S':
        next();
        charsetToIr(unicode.White_Space.inverted);
        break;
    case 'w':
        next();
        charsetToIr(wordCharacter);
        break;
    case 'W':
        next();
        charsetToIr(wordCharacter.inverted);
        break;
    // word boundary assertions
    case 'b':
        next();
        put(Bytecode(IR.Wordboundary, 0));
        break;
    case 'B':
        next();
        put(Bytecode(IR.Notwordboundary, 0));
        break;
    case 'p': case 'P':
        immutable negated = current == 'P';
        auto propertySet = parseUnicodePropertySpec(negated);
        charsetToIr(propertySet);
        break;
    // hexadecimal code point: \xXX, \uXXXX, \UXXXXXXXX
    case 'x': case 'u': case 'U':
        immutable digits = current == 'x' ? 2 : (current == 'u' ? 4 : 8);
        uint code = parseUniHex(pat, digits);
        next();
        put(Bytecode(IR.Char, code));
        break;
    case 'c': //control codes
        Bytecode code = Bytecode(IR.Char, parseControlCode());
        next();
        put(code);
        break;
    case '0':
        next();
        put(Bytecode(IR.Char, 0));//NUL character
        break;
    case '1': .. case '9':
        uint groupRef = cast(uint)current - '0';
        // total number of groups seen so far, over all lookaround levels
        uint seenGroups = sum(groupStack.data);
        enforce(groupRef < seenGroups, "Backref to unseen group");
        //perl's disambiguation rule i.e.
        //get next digit only if there is such group number
        while(groupRef < seenGroups && next() && std.ascii.isDigit(current))
        {
            groupRef = groupRef * 10 + current - '0';
        }
        // the last digit overshot: it is not part of the reference
        if(groupRef >= seenGroups)
            groupRef /= 10;
        enforce(!isOpenGroup(groupRef), "Backref to open group");
        uint localLimit = seenGroups - groupStack.top;
        if(groupRef < localLimit)
            put(Bytecode(IR.Backref, groupRef));
        else
        {
            put(Bytecode(IR.Backref, groupRef-localLimit));
            ir[$-1].setLocalRef();
        }
        markBackref(groupRef);
        break;
    default:
        // any other character stands for itself
        auto literal = Bytecode(IR.Char, current);
        next();
        put(literal);
    }
}
//parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
//\ - assumed to be processed, p - is current
//the name is lower-cased and stripped of '-', ' ' and '_' before lookup;
//a single character without braces (e.g. \pL) is accepted as well
CodepointSet parseUnicodePropertySpec(bool negated)
{
    enum MAX_PROPERTY = 128;
    char[MAX_PROPERTY] name;
    uint used = 0;
    enforce(next());
    if(current != '{')
    {//single char properties e.g.: \pL, \pN ...
        enforce(current < 0x80, "invalid property name");
        name[used++] = cast(char)current;
    }
    else
    {
        for(;;)
        {
            if(used >= MAX_PROPERTY)
                break;
            if(!next())
                break;
            if(current == '}' || current == ':')
                break;
            immutable isSeparator = current == '-' || current == ' ' || current == '_';
            if(!isSeparator)
                name[used++] = cast(char)std.ascii.toLower(current);
        }
        enforce(used != MAX_PROPERTY, "invalid property name");
        enforce(current == '}', "} expected ");
    }
    immutable foldCase = cast(bool)(re_flags & RegexOption.casefold);
    auto s = getUnicodeSet(name[0..used], negated, foldCase);
    enforce(!s.empty, "unrecognized unicode property spec");
    next();
    return s;
}
//discard the bytecode built so far and throw a RegexException whose message
//shows where in the pattern parsing stopped
@trusted void error(string msg)
{
    import std.format;
    ir = null;
    // part of the pattern that has already been consumed
    auto consumed = origin[0..$-pat.length];
    auto sink = appender!string();
    formattedWrite(sink, "%s\nPattern with error: `%s` <--HERE-- `%s`",
        msg, consumed, pat);
    throw new RegexException(sink.data);
}
//basic element type of the pattern range R
alias Char = BasicElementOf!R;
//the compiled Regex built from the current state of this parser
@property program()
{
    auto compiled = makeRegex(this);
    return compiled;
}
}
/+
lightweight post process step,
only essentials

Walks the bytecode once and:
- assigns every hotspot instruction an index into the hotspot table,
- scales the step/min/max words of counted repetitions by the range of the
enclosing counters,
- flags GroupStart/GroupEnd of back-referenced groups,
- accumulates threadCount,
then calls checkIfOneShot and builds the kickstart engine unless the
pattern is flagged one-shot.

Throws: Exception if the cumulative repetition length reaches
maxCumulativeRepetitionLength.
+/
@trusted void lightPostprocess(Char)(ref Regex!Char zis)
{//@@@BUG@@@ write is @system
with(zis)
{
// fixed-capacity stack over a preallocated array;
// _top is unsigned, so "-1" (uint.max) is the empty marker
struct FixedStack(T)
{
T[] arr;
uint _top;
//this(T[] storage){ arr = storage; _top = -1; }
@property ref T top(){ assert(!empty); return arr[_top]; }
void push(T x){ arr[++_top] = x; }
T pop() { assert(!empty); return arr[_top--]; }
@property bool empty(){ return _top == -1; }
}
// top of this stack is the multiplier applied inside the current counted loops
auto counterRange = FixedStack!uint(new uint[maxCounterDepth+1], -1);
counterRange.push(1);
ulong cumRange = 0;
for(uint i = 0; i < ir.length; i += ir[i].length)
{
if(ir[i].hotspot)
{
assert(i + 1 < ir.length,
"unexpected end of IR while looking for hotspot");
// the word after a hotspot instruction receives its table offset
ir[i+1] = Bytecode.fromRaw(hotspotTableSize);
hotspotTableSize += counterRange.top;
}
switch(ir[i].code)
{
case IR.RepeatStart, IR.RepeatQStart:
uint repEnd = cast(uint)(i + ir[i].data + IRL!(IR.RepeatStart));
assert(ir[repEnd].code == ir[i].paired.code);
// words at repEnd+2 .. repEnd+4 were emitted by parseQuantifier as 1, min, max
uint max = ir[repEnd + 4].raw;
ir[repEnd+2].raw = counterRange.top;
ir[repEnd+3].raw *= counterRange.top;
ir[repEnd+4].raw *= counterRange.top;
ulong cntRange = cast(ulong)(max)*counterRange.top;
cumRange += cntRange;
enforce(cumRange < maxCumulativeRepetitionLength,
"repetition length limit is exceeded");
counterRange.push(cast(uint)cntRange + counterRange.top);
threadCount += counterRange.top;
break;
case IR.RepeatEnd, IR.RepeatQEnd:
threadCount += counterRange.top;
counterRange.pop();
break;
case IR.GroupStart:
if(isBackref(ir[i].data))
ir[i].setBackrefence();
threadCount += counterRange.top;
break;
case IR.GroupEnd:
if(isBackref(ir[i].data))
ir[i].setBackrefence();
threadCount += counterRange.top;
break;
default:
threadCount += counterRange.top;
}
}
checkIfOneShot();
if(!(flags & RegexInfo.oneShot))
kickstart = Kickstart!Char(zis, new uint[](256));
debug(std_regex_allocation) writefln("IR processed, max threads: %d", threadCount);
}
}
//IR code validator - proper nesting, illegal instructions, etc.
//every start/end instruction must point at an in-range partner that pairs
//back with it; everything else must be an atom
@trusted void validateRe(Char)(ref Regex!Char zis)
{//@@@BUG@@@ text is @system
    import std.conv;
    with(zis)
    {
        for(uint pc = 0; pc < ir.length; pc += ir[pc].length)
        {
            immutable isPaired = ir[pc].isStart || ir[pc].isEnd;
            if(!isPaired)
            {
                // atoms need no further checks
                if(ir[pc].isAtom)
                    continue;
                assert(0, text("Unknown type of instruction at pc=", pc));
            }
            uint dest = ir[pc].indexOfPair(pc);
            assert(dest < ir.length, text("Wrong length in opcode at pc=",
                pc, " ", dest, " vs ", ir.length));
            assert(ir[dest].paired == ir[pc],
                text("Wrong pairing of opcodes at pc=", pc, "and pc=", dest));
        }
    }
}