mirror of
https://github.com/dlang/phobos.git
synced 2025-04-28 14:10:30 +03:00
1172 lines
36 KiB
D
1172 lines
36 KiB
D
/*
|
|
Implementation of std.regex IR, an intermediate representation
|
|
of a regular expression pattern.
|
|
|
|
This is a common ground between frontend regex component (parser)
|
|
and backend components - generators, matchers and other "filters".
|
|
*/
|
|
module std.regex.internal.ir;
|
|
|
|
package(std.regex):
|
|
|
|
import std.exception, std.meta, std.range.primitives, std.traits, std.uni;
|
|
|
|
debug(std_regex_parser) import std.stdio;
|
|
// just a common trait, may be moved elsewhere
|
|
alias BasicElementOf(Range) = Unqual!(ElementEncodingType!Range);
|
|
|
|
enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
|
|
|
|
// heuristic value determines maximum CodepointSet length suitable for linear search
|
|
enum maxCharsetUsed = 6;
|
|
|
|
// another variable to tweak behavior of caching generated Tries for character classes
|
|
enum maxCachedMatchers = 8;
|
|
|
|
alias Trie = CodepointSetTrie!(13, 8);
|
|
alias makeTrie = codepointSetTrie!(13, 8);
|
|
|
|
CharMatcher[CodepointSet] matcherCache;
|
|
|
|
//accessor with caching
|
|
@trusted CharMatcher getMatcher(CodepointSet set)
|
|
{
|
|
// almost all properties of AA are not @safe
|
|
// https://issues.dlang.org/show_bug.cgi?id=6357
|
|
if (__ctfe || maxCachedMatchers == 0)
|
|
return CharMatcher(set);
|
|
else
|
|
{
|
|
auto p = set in matcherCache;
|
|
if (p)
|
|
return *p;
|
|
if (matcherCache.length == maxCachedMatchers)
|
|
{
|
|
// flush enmatchers in trieCache
|
|
matcherCache = null;
|
|
}
|
|
return (matcherCache[set] = CharMatcher(set));
|
|
}
|
|
}
|
|
|
|
@property ref wordMatcher()()
|
|
{
|
|
static immutable CharMatcher matcher = CharMatcher(wordCharacter);
|
|
return matcher;
|
|
}
|
|
|
|
// some special Unicode white space characters
|
|
private enum NEL = '\u0085', LS = '\u2028', PS = '\u2029';
|
|
|
|
//Regular expression engine/parser options:
|
|
// global - search all nonoverlapping matches in input
|
|
// casefold - case insensitive matching, do casefolding on match in unicode mode
|
|
// freeform - ignore whitespace in pattern, to match space use [ ] or \s
|
|
// multiline - switch ^, $ detect start and end of linesinstead of just start and end of input
|
|
enum RegexOption: uint {
|
|
global = 0x1,
|
|
casefold = 0x2,
|
|
freeform = 0x4,
|
|
nonunicode = 0x8,
|
|
multiline = 0x10,
|
|
singleline = 0x20
|
|
}
|
|
//do not reorder this list
|
|
alias RegexOptionNames = AliasSeq!('g', 'i', 'x', 'U', 'm', 's');
|
|
static assert( RegexOption.max < 0x80);
|
|
|
|
package(std) string regexOptionsToString()(uint flags) nothrow pure @safe
|
|
{
|
|
flags &= (RegexOption.max << 1) - 1;
|
|
if (!flags)
|
|
return "";
|
|
char[RegexOptionNames.length] buffer = void;
|
|
size_t pos = 0;
|
|
foreach (i, flag; __traits(allMembers, RegexOption))
|
|
if (flags & __traits(getMember, RegexOption, flag))
|
|
buffer[pos++] = RegexOptionNames[i];
|
|
return buffer[0 .. pos].idup;
|
|
}
|
|
|
|
// flags that allow guide execution of engine
|
|
enum RegexInfo : uint { oneShot = 0x80 }
|
|
|
|
// IR bit pattern: 0b1_xxxxx_yy
|
|
// where yy indicates class of instruction, xxxxx for actual operation code
|
|
// 00: atom, a normal instruction
|
|
// 01: open, opening of a group, has length of contained IR in the low bits
|
|
// 10: close, closing of a group, has length of contained IR in the low bits
|
|
// 11 unused
|
|
//
|
|
// Loops with Q (non-greedy, with ? mark) must have the same size / other properties as non Q version
|
|
// Possible changes:
|
|
//* merge group, option, infinite/repeat start (to never copy during parsing of (a|b){1,2})
|
|
//* reorganize groups to make n args easier to find, or simplify the check for groups of similar ops
|
|
// (like lookaround), or make it easier to identify hotspots.
|
|
|
|
enum IR:uint {
|
|
Char = 0b1_00000_00, //a character
|
|
Any = 0b1_00001_00, //any character
|
|
CodepointSet = 0b1_00010_00, //a most generic CodepointSet [...]
|
|
Trie = 0b1_00011_00, //CodepointSet implemented as Trie
|
|
//match with any of a consecutive OrChar's in this sequence
|
|
//(used for case insensitive match)
|
|
//OrChar holds in upper two bits of data total number of OrChars in this _sequence_
|
|
//the drawback of this representation is that it is difficult
|
|
// to detect a jump in the middle of it
|
|
OrChar = 0b1_00100_00,
|
|
Nop = 0b1_00101_00, //no operation (padding)
|
|
End = 0b1_00110_00, //end of program
|
|
Bol = 0b1_00111_00, //beginning of a line ^
|
|
Eol = 0b1_01000_00, //end of a line $
|
|
Wordboundary = 0b1_01001_00, //boundary of a word
|
|
Notwordboundary = 0b1_01010_00, //not a word boundary
|
|
Backref = 0b1_01011_00, //backreference to a group (that has to be pinned, i.e. locally unique) (group index)
|
|
GroupStart = 0b1_01100_00, //start of a group (x) (groupIndex+groupPinning(1bit))
|
|
GroupEnd = 0b1_01101_00, //end of a group (x) (groupIndex+groupPinning(1bit))
|
|
Option = 0b1_01110_00, //start of an option within an alternation x | y (length)
|
|
GotoEndOr = 0b1_01111_00, //end of an option (length of the rest)
|
|
Bof = 0b1_10000_00, //begining of "file" (string) ^
|
|
Eof = 0b1_10001_00, //end of "file" (string) $
|
|
//... any additional atoms here
|
|
|
|
OrStart = 0b1_00000_01, //start of alternation group (length)
|
|
OrEnd = 0b1_00000_10, //end of the or group (length,mergeIndex)
|
|
//with this instruction order
|
|
//bit mask 0b1_00001_00 could be used to test/set greediness
|
|
InfiniteStart = 0b1_00001_01, //start of an infinite repetition x* (length)
|
|
InfiniteEnd = 0b1_00001_10, //end of infinite repetition x* (length,mergeIndex)
|
|
InfiniteQStart = 0b1_00010_01, //start of a non eager infinite repetition x*? (length)
|
|
InfiniteQEnd = 0b1_00010_10, //end of non eager infinite repetition x*? (length,mergeIndex)
|
|
InfiniteBloomStart = 0b1_00011_01, //start of an filtered infinite repetition x* (length)
|
|
InfiniteBloomEnd = 0b1_00011_10, //end of filtered infinite repetition x* (length,mergeIndex)
|
|
RepeatStart = 0b1_00100_01, //start of a {n,m} repetition (length)
|
|
RepeatEnd = 0b1_00100_10, //end of x{n,m} repetition (length,step,minRep,maxRep)
|
|
RepeatQStart = 0b1_00101_01, //start of a non eager x{n,m}? repetition (length)
|
|
RepeatQEnd = 0b1_00101_10, //end of non eager x{n,m}? repetition (length,step,minRep,maxRep)
|
|
|
|
//
|
|
LookaheadStart = 0b1_00110_01, //begin of the lookahead group (length)
|
|
LookaheadEnd = 0b1_00110_10, //end of a lookahead group (length)
|
|
NeglookaheadStart = 0b1_00111_01, //start of a negative lookahead (length)
|
|
NeglookaheadEnd = 0b1_00111_10, //end of a negative lookahead (length)
|
|
LookbehindStart = 0b1_01000_01, //start of a lookbehind (length)
|
|
LookbehindEnd = 0b1_01000_10, //end of a lookbehind (length)
|
|
NeglookbehindStart = 0b1_01001_01, //start of a negative lookbehind (length)
|
|
NeglookbehindEnd = 0b1_01001_10, //end of negative lookbehind (length)
|
|
}
|
|
|
|
//a shorthand for IR length - full length of specific opcode evaluated at compile time
|
|
template IRL(IR code)
|
|
{
|
|
enum uint IRL = lengthOfIR(code);
|
|
}
|
|
static assert(IRL!(IR.LookaheadStart) == 3);
|
|
|
|
//how many parameters follow the IR, should be optimized fixing some IR bits
|
|
int immediateParamsIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
switch (i)
|
|
{
|
|
case IR.OrEnd,IR.InfiniteEnd,IR.InfiniteQEnd:
|
|
return 1; // merge table index
|
|
case IR.InfiniteBloomEnd:
|
|
return 2; // bloom filter index + merge table index
|
|
case IR.RepeatEnd, IR.RepeatQEnd:
|
|
return 4;
|
|
case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
|
|
return 2; // start-end of captures used
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
//full length of IR instruction inlcuding all parameters that might follow it
|
|
int lengthOfIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return 1 + immediateParamsIR(i);
|
|
}
|
|
|
|
//full length of the paired IR instruction inlcuding all parameters that might follow it
|
|
int lengthOfPairedIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return 1 + immediateParamsIR(pairedIR(i));
|
|
}
|
|
|
|
//if the operation has a merge point (this relies on the order of the ops)
|
|
bool hasMerge(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return (i&0b11)==0b10 && i <= IR.RepeatQEnd;
|
|
}
|
|
|
|
//is an IR that opens a "group"
|
|
bool isStartIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return (i&0b11)==0b01;
|
|
}
|
|
|
|
//is an IR that ends a "group"
|
|
bool isEndIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return (i&0b11)==0b10;
|
|
}
|
|
|
|
//is a standalone IR
|
|
bool isAtomIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
return (i&0b11)==0b00;
|
|
}
|
|
|
|
//makes respective pair out of IR i, swapping start/end bits of instruction
|
|
IR pairedIR(IR i) @safe pure nothrow @nogc
|
|
{
|
|
assert(isStartIR(i) || isEndIR(i));
|
|
return cast(IR) (i ^ 0b11);
|
|
}
|
|
|
|
//encoded IR instruction
|
|
@safe pure
|
|
struct Bytecode
|
|
{
|
|
uint raw;
|
|
//natural constraints
|
|
enum maxSequence = 2+4;
|
|
enum maxData = 1 << 22;
|
|
enum maxRaw = 1 << 31;
|
|
|
|
@safe pure:
|
|
this(IR code, uint data)
|
|
{
|
|
assert(data < (1 << 22) && code < 256);
|
|
raw = code << 24 | data;
|
|
}
|
|
|
|
this(IR code, uint data, uint seq)
|
|
{
|
|
assert(data < (1 << 22) && code < 256 );
|
|
assert(seq >= 2 && seq < maxSequence);
|
|
raw = code << 24 | (seq - 2)<<22 | data;
|
|
}
|
|
|
|
//store raw data
|
|
static Bytecode fromRaw(uint data)
|
|
{
|
|
Bytecode t;
|
|
t.raw = data;
|
|
return t;
|
|
}
|
|
|
|
// bit twiddling helpers
|
|
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
|
|
@property uint data()() const { return raw & 0x003f_ffff; }
|
|
|
|
@property void data()(uint val)
|
|
{
|
|
raw = (raw & ~0x003f_ffff) | (val & 0x003f_ffff);
|
|
}
|
|
|
|
// ditto
|
|
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
|
|
@property uint sequence()() const { return 2 + (raw >> 22 & 0x3); }
|
|
|
|
// ditto
|
|
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
|
|
@property IR code()() const { return cast(IR)(raw >> 24); }
|
|
|
|
//ditto
|
|
@property bool hotspot() const { return hasMerge(code); }
|
|
|
|
//test the class of this instruction
|
|
@property bool isAtom() const { return isAtomIR(code); }
|
|
|
|
//ditto
|
|
@property bool isStart() const { return isStartIR(code); }
|
|
|
|
//ditto
|
|
@property bool isEnd() const { return isEndIR(code); }
|
|
|
|
//number of arguments for this instruction
|
|
@property int args() const { return immediateParamsIR(code); }
|
|
|
|
//mark this GroupStart or GroupEnd as referenced in backreference
|
|
void setBackrefence()
|
|
{
|
|
assert(code == IR.GroupStart || code == IR.GroupEnd);
|
|
raw = raw | 1 << 23;
|
|
}
|
|
|
|
//is referenced
|
|
@property bool backreference() const
|
|
{
|
|
assert(code == IR.GroupStart || code == IR.GroupEnd);
|
|
return cast(bool)(raw & 1 << 23);
|
|
}
|
|
|
|
//mark as local reference (for backrefs in lookarounds)
|
|
void setLocalRef()
|
|
{
|
|
assert(code == IR.Backref);
|
|
raw = raw | 1 << 23;
|
|
}
|
|
|
|
//is a local ref
|
|
@property bool localRef() const
|
|
{
|
|
assert(code == IR.Backref);
|
|
return cast(bool)(raw & 1 << 23);
|
|
}
|
|
|
|
//human readable name of instruction
|
|
@trusted @property string mnemonic()() const
|
|
{//@@@BUG@@@ to is @system
|
|
import std.conv : to;
|
|
return to!string(code);
|
|
}
|
|
|
|
//full length of instruction
|
|
@property uint length() const
|
|
{
|
|
return lengthOfIR(code);
|
|
}
|
|
|
|
//full length of respective start/end of this instruction
|
|
@property uint pairedLength() const
|
|
{
|
|
return lengthOfPairedIR(code);
|
|
}
|
|
|
|
//returns bytecode of paired instruction (assuming this one is start or end)
|
|
@property Bytecode paired() const
|
|
{//depends on bit and struct layout order
|
|
assert(isStart || isEnd);
|
|
return Bytecode.fromRaw(raw ^ 0b11 << 24);
|
|
}
|
|
|
|
//gets an index into IR block of the respective pair
|
|
uint indexOfPair(uint pc) const
|
|
{
|
|
assert(isStart || isEnd);
|
|
return isStart ? pc + data + length : pc - data - lengthOfPairedIR(code);
|
|
}
|
|
}
|
|
|
|
static assert(Bytecode.sizeof == 4);
|
|
|
|
|
|
//index entry structure for name --> number of submatch
|
|
struct NamedGroup
|
|
{
|
|
string name;
|
|
uint group;
|
|
}
|
|
|
|
//holds pair of start-end markers for a submatch
|
|
struct Group(DataIndex)
|
|
{
|
|
DataIndex begin = DataIndex.max;
|
|
DataIndex end = DataIndex.min;
|
|
|
|
bool opCast(T : bool)() const
|
|
{
|
|
return begin <= end;
|
|
}
|
|
|
|
@trusted string toString()() const
|
|
{
|
|
if (begin < end)
|
|
return "(unmatched)";
|
|
import std.array : appender;
|
|
import std.format.write : formattedWrite;
|
|
auto a = appender!string();
|
|
formattedWrite(a, "%s..%s", begin, end);
|
|
return a.data;
|
|
}
|
|
}
|
|
|
|
//debugging tool, prints out instruction along with opcodes
|
|
@trusted string disassemble(in Bytecode[] irb, uint pc, in NamedGroup[] dict=[])
|
|
{
|
|
import std.array : appender;
|
|
import std.format.write : formattedWrite;
|
|
auto output = appender!string();
|
|
formattedWrite(output,"%s", irb[pc].mnemonic);
|
|
switch (irb[pc].code)
|
|
{
|
|
case IR.Char:
|
|
formattedWrite(output, " %s (0x%x)",cast(dchar) irb[pc].data, irb[pc].data);
|
|
break;
|
|
case IR.OrChar:
|
|
formattedWrite(output, " %s (0x%x) seq=%d", cast(dchar) irb[pc].data, irb[pc].data, irb[pc].sequence);
|
|
break;
|
|
case IR.RepeatStart, IR.InfiniteStart, IR.InfiniteBloomStart,
|
|
IR.Option, IR.GotoEndOr, IR.OrStart:
|
|
//forward-jump instructions
|
|
uint len = irb[pc].data;
|
|
formattedWrite(output, " pc=>%u", pc+len+IRL!(IR.RepeatStart));
|
|
break;
|
|
case IR.RepeatEnd, IR.RepeatQEnd: //backward-jump instructions
|
|
uint len = irb[pc].data;
|
|
formattedWrite(output, " pc=>%u min=%u max=%u step=%u",
|
|
pc - len, irb[pc + 3].raw, irb[pc + 4].raw, irb[pc + 2].raw);
|
|
break;
|
|
case IR.InfiniteEnd, IR.InfiniteQEnd, IR.InfiniteBloomEnd, IR.OrEnd: //ditto
|
|
uint len = irb[pc].data;
|
|
formattedWrite(output, " pc=>%u", pc-len);
|
|
break;
|
|
case IR.LookaheadEnd, IR.NeglookaheadEnd: //ditto
|
|
uint len = irb[pc].data;
|
|
formattedWrite(output, " pc=>%u", pc-len);
|
|
break;
|
|
case IR.GroupStart, IR.GroupEnd:
|
|
uint n = irb[pc].data;
|
|
string name;
|
|
foreach (v;dict)
|
|
if (v.group == n)
|
|
{
|
|
name = "'"~v.name~"'";
|
|
break;
|
|
}
|
|
formattedWrite(output, " %s #%u " ~ (irb[pc].backreference ? "referenced" : ""),
|
|
name, n);
|
|
break;
|
|
case IR.LookaheadStart, IR.NeglookaheadStart, IR.LookbehindStart, IR.NeglookbehindStart:
|
|
uint len = irb[pc].data;
|
|
uint start = irb[pc+1].raw, end = irb[pc+2].raw;
|
|
formattedWrite(output, " pc=>%u [%u..%u]", pc + len + IRL!(IR.LookaheadStart), start, end);
|
|
break;
|
|
case IR.Backref: case IR.CodepointSet: case IR.Trie:
|
|
uint n = irb[pc].data;
|
|
formattedWrite(output, " %u", n);
|
|
if (irb[pc].code == IR.Backref)
|
|
formattedWrite(output, " %s", irb[pc].localRef ? "local" : "global");
|
|
break;
|
|
default://all data-free instructions
|
|
}
|
|
if (irb[pc].hotspot)
|
|
formattedWrite(output, " Hotspot %u", irb[pc+1].raw);
|
|
return output.data;
|
|
}
|
|
|
|
//disassemble the whole chunk
|
|
@trusted void printBytecode()(in Bytecode[] slice, in NamedGroup[] dict=[])
|
|
{
|
|
import std.stdio : writeln;
|
|
for (uint pc=0; pc<slice.length; pc += slice[pc].length)
|
|
writeln("\t", disassemble(slice, pc, dict));
|
|
}
|
|
|
|
// Encapsulates memory management, explicit ref counting
|
|
// and the exact type of engine created
|
|
// there is a single instance per engine combination type x Char
|
|
// In future may also maintain a (TLS?) cache of memory
|
|
interface MatcherFactory(Char)
|
|
{
|
|
@safe:
|
|
Matcher!Char create(const ref Regex!Char, in Char[] input) const;
|
|
Matcher!Char dup(Matcher!Char m, in Char[] input) const;
|
|
size_t incRef(Matcher!Char m) const;
|
|
size_t decRef(Matcher!Char m) const;
|
|
}
|
|
|
|
// Only memory management, no compile-time vs run-time specialities
|
|
abstract class GenericFactory(alias EngineType, Char) : MatcherFactory!Char
|
|
{
|
|
import core.memory : pureFree;
|
|
import std.internal.memory : enforceMalloc;
|
|
import core.memory : GC;
|
|
// round up to next multiple of size_t for alignment purposes
|
|
enum classSize = (__traits(classInstanceSize, EngineType!Char) + size_t.sizeof - 1) & ~(size_t.sizeof - 1);
|
|
|
|
EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const;
|
|
|
|
override EngineType!Char create(const ref Regex!Char re, in Char[] input) const @trusted
|
|
{
|
|
immutable size = EngineType!Char.initialMemory(re) + classSize;
|
|
auto memory = enforceMalloc(size)[0 .. size];
|
|
scope(failure) pureFree(memory.ptr);
|
|
GC.addRange(memory.ptr, classSize);
|
|
auto engine = construct(re, input, memory);
|
|
assert(engine.refCount == 1);
|
|
assert(cast(void*) engine == memory.ptr);
|
|
return engine;
|
|
}
|
|
|
|
override EngineType!Char dup(Matcher!Char engine, in Char[] input) const @trusted
|
|
{
|
|
immutable size = EngineType!Char.initialMemory(engine.pattern) + classSize;
|
|
auto memory = enforceMalloc(size)[0 .. size];
|
|
scope(failure) pureFree(memory.ptr);
|
|
auto copy = construct(engine.pattern, input, memory);
|
|
GC.addRange(memory.ptr, classSize);
|
|
engine.dupTo(copy, memory[classSize .. size]);
|
|
assert(copy.refCount == 1);
|
|
return copy;
|
|
}
|
|
|
|
override size_t incRef(Matcher!Char m) const
|
|
{
|
|
return ++m.refCount;
|
|
}
|
|
|
|
override size_t decRef(Matcher!Char m) const @trusted
|
|
{
|
|
assert(m.refCount != 0);
|
|
auto cnt = --m.refCount;
|
|
if (cnt == 0)
|
|
{
|
|
void* ptr = cast(void*) m;
|
|
GC.removeRange(ptr);
|
|
pureFree(ptr);
|
|
}
|
|
return cnt;
|
|
}
|
|
}
|
|
|
|
// A factory for run-time engines
|
|
class RuntimeFactory(alias EngineType, Char) : GenericFactory!(EngineType, Char)
|
|
{
|
|
override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
|
|
{
|
|
import core.lifetime : emplace;
|
|
return emplace!(EngineType!Char)(memory[0 .. classSize],
|
|
re, Input!Char(input), memory[classSize .. $]);
|
|
}
|
|
}
|
|
|
|
// A factory for compile-time engine
|
|
class CtfeFactory(alias EngineType, Char, alias func) : GenericFactory!(EngineType, Char)
|
|
{
|
|
override EngineType!Char construct(const ref Regex!Char re, in Char[] input, void[] memory) const
|
|
{
|
|
import core.lifetime : emplace;
|
|
return emplace!(EngineType!Char)(memory[0 .. classSize],
|
|
re, &func, Input!Char(input), memory[classSize .. $]);
|
|
}
|
|
}
|
|
|
|
// A workaround for R-T enum re = regex(...)
|
|
template defaultFactory(Char)
|
|
{
|
|
@property MatcherFactory!Char defaultFactory(const ref Regex!Char re) @safe pure
|
|
{
|
|
auto defaultFactoryImpl = () {
|
|
import std.regex.internal.backtracking : BacktrackingMatcher;
|
|
import std.regex.internal.thompson : ThompsonMatcher;
|
|
import std.algorithm.searching : canFind;
|
|
static MatcherFactory!Char backtrackingFactory;
|
|
static MatcherFactory!Char thompsonFactory;
|
|
if (re.backrefed.canFind!"a != 0")
|
|
{
|
|
if (backtrackingFactory is null)
|
|
backtrackingFactory = new RuntimeFactory!(BacktrackingMatcher, Char);
|
|
return backtrackingFactory;
|
|
}
|
|
else
|
|
{
|
|
if (thompsonFactory is null)
|
|
thompsonFactory = new RuntimeFactory!(ThompsonMatcher, Char);
|
|
return thompsonFactory;
|
|
}
|
|
};
|
|
|
|
// this should be faked as pure because the static mutable variables are
|
|
// used to cache the created instance, like memoize
|
|
alias T = typeof(defaultFactoryImpl);
|
|
enum attrs = functionAttributes!T | FunctionAttribute.pure_;
|
|
return (() @trusted =>
|
|
(cast(SetFunctionAttributes!(T, functionLinkage!T, attrs)) defaultFactoryImpl))()();
|
|
}
|
|
}
|
|
|
|
// Defining it as an interface has the undesired side-effect:
|
|
// casting any class to an interface silently adjusts pointer to point to a nested vtbl
|
|
abstract class Matcher(Char)
|
|
{
|
|
abstract:
|
|
// Get a (next) match
|
|
int match(Group!size_t[] matches) pure;
|
|
// This only maintains internal ref-count,
|
|
// deallocation happens inside MatcherFactory
|
|
@property ref size_t refCount() @safe;
|
|
// Copy internal state to another engine, using memory arena 'memory'
|
|
void dupTo(Matcher!Char m, void[] memory);
|
|
// The pattern loaded
|
|
@property ref const(Regex!Char) pattern() @safe;
|
|
// Re-arm the engine with new Input
|
|
Matcher rearm(in Char[] stream);
|
|
}
|
|
|
|
/++
|
|
`Regex` object holds regular expression pattern in compiled form.
|
|
Instances of this object are constructed via calls to `regex`.
|
|
This is an intended form for caching and storage of frequently
|
|
used regular expressions.
|
|
+/
|
|
struct Regex(Char)
|
|
{
|
|
//temporary workaround for identifier lookup
|
|
CodepointSet[] charsets; //
|
|
Bytecode[] ir; //compiled bytecode of pattern
|
|
|
|
|
|
@safe @property bool empty() const nothrow { return ir is null; }
|
|
|
|
@safe @property auto namedCaptures()
|
|
{
|
|
static struct NamedGroupRange
|
|
{
|
|
private:
|
|
const(NamedGroup)[] groups;
|
|
size_t start;
|
|
size_t end;
|
|
public:
|
|
this(const(NamedGroup)[] g, size_t s, size_t e)
|
|
{
|
|
assert(s <= e);
|
|
assert(e <= g.length);
|
|
groups = g;
|
|
start = s;
|
|
end = e;
|
|
}
|
|
|
|
@property string front() { return groups[start].name; }
|
|
@property string back() { return groups[end-1].name; }
|
|
@property bool empty() { return start >= end; }
|
|
@property size_t length() { return end - start; }
|
|
alias opDollar = length;
|
|
@property NamedGroupRange save()
|
|
{
|
|
return NamedGroupRange(groups, start, end);
|
|
}
|
|
void popFront() { assert(!empty); start++; }
|
|
void popBack() { assert(!empty); end--; }
|
|
string opIndex()(size_t i)
|
|
{
|
|
assert(start + i < end,
|
|
"Requested named group is out of range.");
|
|
return groups[start+i].name;
|
|
}
|
|
NamedGroupRange opSlice(size_t low, size_t high) {
|
|
assert(low <= high);
|
|
assert(start + high <= end);
|
|
return NamedGroupRange(groups, start + low, start + high);
|
|
}
|
|
NamedGroupRange opSlice() { return this.save; }
|
|
}
|
|
return NamedGroupRange(dict, 0, dict.length);
|
|
}
|
|
|
|
package(std.regex):
|
|
import std.regex.internal.kickstart : Kickstart; //TODO: get rid of this dependency
|
|
const(NamedGroup)[] dict; // maps name -> user group number
|
|
uint ngroup; // number of internal groups
|
|
uint maxCounterDepth; // max depth of nested {n,m} repetitions
|
|
uint hotspotTableSize; // number of entries in merge table
|
|
uint threadCount; // upper bound on number of Thompson VM threads
|
|
uint flags; // global regex flags
|
|
public const(CharMatcher)[] matchers; // tables that represent character sets
|
|
public const(BitTable)[] filters; // bloom filters for conditional loops
|
|
uint[] backrefed; // bit array of backreferenced submatches
|
|
Kickstart!Char kickstart;
|
|
MatcherFactory!Char factory; // produces optimal matcher for this pattern
|
|
immutable(Char)[] pattern; // copy of pattern to serve as cache key
|
|
|
|
const(Regex) withFactory(MatcherFactory!Char factory) pure const @trusted
|
|
{
|
|
auto r = cast() this;
|
|
r.factory = factory;
|
|
return r;
|
|
}
|
|
|
|
const(Regex) withFlags(uint newFlags) pure const @trusted
|
|
{
|
|
auto r = cast() this;
|
|
r.flags = newFlags;
|
|
return r;
|
|
}
|
|
|
|
const(Regex) withCode(const(Bytecode)[] code) pure const @trusted
|
|
{
|
|
auto r = cast() this;
|
|
r.ir = code.dup; // TODO: sidestep const instead?
|
|
return r;
|
|
}
|
|
|
|
const(Regex) withNGroup(uint nGroup) pure const @trusted
|
|
{
|
|
auto r = cast() this;
|
|
r.ngroup = nGroup;
|
|
return r;
|
|
}
|
|
|
|
//bit access helper
|
|
uint isBackref(uint n)
|
|
{
|
|
if (n/32 >= backrefed.length)
|
|
return 0;
|
|
return backrefed[n / 32] & (1 << (n & 31));
|
|
}
|
|
|
|
//check if searching is not needed
|
|
void checkIfOneShot()
|
|
{
|
|
L_CheckLoop:
|
|
for (uint i = 0; i < ir.length; i += ir[i].length)
|
|
{
|
|
switch (ir[i].code)
|
|
{
|
|
case IR.Bof:
|
|
flags |= RegexInfo.oneShot;
|
|
break L_CheckLoop;
|
|
case IR.GroupStart, IR.GroupEnd, IR.Bol, IR.Eol, IR.Eof,
|
|
IR.Wordboundary, IR.Notwordboundary:
|
|
break;
|
|
default:
|
|
break L_CheckLoop;
|
|
}
|
|
}
|
|
}
|
|
|
|
//print out disassembly a program's IR
|
|
@trusted debug(std_regex_parser) void print() const
|
|
{//@@@BUG@@@ write is system
|
|
for (uint i = 0; i < ir.length; i += ir[i].length)
|
|
{
|
|
writefln("%d\t%s ", i, disassemble(ir, i, dict));
|
|
}
|
|
writeln("Total merge table size: ", hotspotTableSize);
|
|
writeln("Max counter nesting depth: ", maxCounterDepth);
|
|
}
|
|
|
|
public string toString()() const
|
|
{
|
|
import std.format : format;
|
|
static if (is(typeof(pattern) : string))
|
|
alias patternString = pattern;
|
|
else
|
|
{
|
|
import std.conv : to;
|
|
auto patternString = conv.to!string(pattern);
|
|
}
|
|
auto quotedEscapedPattern = format("%(%s %)", [patternString]);
|
|
auto flagString = regexOptionsToString(flags);
|
|
return "Regex!" ~ Char.stringof ~ "(" ~ quotedEscapedPattern ~ ", \"" ~ flagString ~ "\")";
|
|
}
|
|
}
|
|
|
|
// The stuff below this point is temporarrily part of IR module
|
|
// but may need better place in the future (all internals)
|
|
package(std.regex):
|
|
|
|
//Simple UTF-string abstraction compatible with stream interface
|
|
struct Input(Char)
|
|
if (is(Char :dchar))
|
|
{
|
|
import std.utf : decode;
|
|
alias DataIndex = size_t;
|
|
enum bool isLoopback = false;
|
|
alias String = const(Char)[];
|
|
String _origin;
|
|
size_t _index;
|
|
|
|
//constructs Input object out of plain string
|
|
this(String input, size_t idx = 0)
|
|
{
|
|
_origin = input;
|
|
_index = idx;
|
|
}
|
|
|
|
//codepoint at current stream position
|
|
pragma(inline, true) bool nextChar(ref dchar res, ref size_t pos)
|
|
{
|
|
pos = _index;
|
|
// DMD's inliner hates multiple return functions
|
|
// but can live with single statement if/else bodies
|
|
bool n = !(_index == _origin.length);
|
|
if (n)
|
|
res = decode(_origin, _index);
|
|
return n;
|
|
}
|
|
@property bool atEnd(){
|
|
return _index == _origin.length;
|
|
}
|
|
bool search(Kickstart)(ref const Kickstart kick, ref dchar res, ref size_t pos)
|
|
{
|
|
size_t idx = kick.search(_origin, _index);
|
|
_index = idx;
|
|
return nextChar(res, pos);
|
|
}
|
|
|
|
//index of at End position
|
|
@property size_t lastIndex(){ return _origin.length; }
|
|
|
|
//support for backtracker engine, might not be present
|
|
void reset(size_t index){ _index = index; }
|
|
|
|
String opSlice(size_t start, size_t end){ return _origin[start .. end]; }
|
|
|
|
auto loopBack(size_t index){ return BackLooper!Input(this, index); }
|
|
}
|
|
|
|
struct BackLooperImpl(Input)
|
|
{
|
|
import std.utf : strideBack;
|
|
alias DataIndex = size_t;
|
|
alias String = Input.String;
|
|
enum bool isLoopback = true;
|
|
String _origin;
|
|
size_t _index;
|
|
this(Input input, size_t index)
|
|
{
|
|
_origin = input._origin;
|
|
_index = index;
|
|
}
|
|
this(String input)
|
|
{
|
|
_origin = input;
|
|
_index = input.length;
|
|
}
|
|
@trusted bool nextChar(ref dchar res,ref size_t pos)
|
|
{
|
|
pos = _index;
|
|
if (_index == 0)
|
|
return false;
|
|
|
|
res = _origin[0.._index].back;
|
|
_index -= strideBack(_origin, _index);
|
|
|
|
return true;
|
|
}
|
|
@property atEnd(){ return _index == 0 || _index == strideBack(_origin, _index); }
|
|
auto loopBack(size_t index){ return Input(_origin, index); }
|
|
|
|
//support for backtracker engine, might not be present
|
|
//void reset(size_t index){ _index = index ? index-std.utf.strideBack(_origin, index) : 0; }
|
|
void reset(size_t index){ _index = index; }
|
|
|
|
String opSlice(size_t start, size_t end){ return _origin[end .. start]; }
|
|
//index of at End position
|
|
@property size_t lastIndex(){ return 0; }
|
|
}
|
|
|
|
template BackLooper(E)
|
|
{
|
|
static if (is(E : BackLooperImpl!U, U))
|
|
{
|
|
alias BackLooper = U;
|
|
}
|
|
else
|
|
{
|
|
alias BackLooper = BackLooperImpl!E;
|
|
}
|
|
}
|
|
|
|
//both helpers below are internal, on its own are quite "explosive"
|
|
//unsafe, no initialization of elements
|
|
@system pure T[] mallocArray(T)(size_t len)
|
|
{
|
|
import core.memory : pureMalloc;
|
|
return (cast(T*) pureMalloc(len * T.sizeof))[0 .. len];
|
|
}
|
|
|
|
//very unsafe, no initialization
|
|
@system T[] arrayInChunk(T)(size_t len, ref void[] chunk)
|
|
{
|
|
auto ret = (cast(T*) chunk.ptr)[0 .. len];
|
|
chunk = chunk[len * T.sizeof .. $];
|
|
return ret;
|
|
}
|
|
|
|
//
|
|
@trusted uint lookupNamedGroup(String)(const(NamedGroup)[] dict, String name)
|
|
{//equal is @system?
|
|
import std.algorithm.comparison : equal;
|
|
import std.algorithm.iteration : map;
|
|
import std.conv : text;
|
|
import std.range : assumeSorted;
|
|
|
|
auto fnd = assumeSorted!"cmp(a,b) < 0"(map!"a.name"(dict)).lowerBound(name).length;
|
|
enforce(fnd < dict.length && equal(dict[fnd].name, name),
|
|
text("no submatch named ", name));
|
|
return dict[fnd].group;
|
|
}
|
|
|
|
// whether ch is one of unicode newline sequences
|
|
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
|
|
bool endOfLine()(dchar front, bool seenCr)
|
|
{
|
|
return ((front == '\n') ^ seenCr) || front == '\r'
|
|
|| front == NEL || front == LS || front == PS;
|
|
}
|
|
|
|
// 0-arg template due to https://issues.dlang.org/show_bug.cgi?id=10985
|
|
bool startOfLine()(dchar back, bool seenNl)
|
|
{
|
|
return ((back == '\r') ^ seenNl) || back == '\n'
|
|
|| back == NEL || back == LS || back == PS;
|
|
}
|
|
|
|
///Exception object thrown in case of errors during regex compilation.
|
|
public class RegexException : Exception
|
|
{
|
|
mixin basicExceptionCtors;
|
|
}
|
|
|
|
// simple 128-entry bit-table used with a hash function
|
|
struct BitTable {
|
|
uint[4] filter;
|
|
|
|
this(CodepointSet set){
|
|
foreach (iv; set.byInterval)
|
|
{
|
|
foreach (v; iv.a .. iv.b)
|
|
add(v);
|
|
}
|
|
}
|
|
|
|
void add()(dchar ch){
|
|
immutable i = index(ch);
|
|
filter[i >> 5] |= 1<<(i & 31);
|
|
}
|
|
// non-zero -> might be present, 0 -> absent
|
|
bool opIndex()(dchar ch) const{
|
|
immutable i = index(ch);
|
|
return (filter[i >> 5]>>(i & 31)) & 1;
|
|
}
|
|
|
|
static uint index()(dchar ch){
|
|
return ((ch >> 7) ^ ch) & 0x7F;
|
|
}
|
|
}
|
|
|
|
struct CharMatcher {
|
|
BitTable ascii; // fast path for ASCII
|
|
Trie trie; // slow path for Unicode
|
|
|
|
this(CodepointSet set)
|
|
{
|
|
auto asciiSet = set & unicode.ASCII;
|
|
ascii = BitTable(asciiSet);
|
|
trie = makeTrie(set);
|
|
}
|
|
|
|
bool opIndex()(dchar ch) const
|
|
{
|
|
if (ch < 0x80)
|
|
return ascii[ch];
|
|
else
|
|
return trie[ch];
|
|
}
|
|
}
|
|
|
|
// Internal non-resizeble array, switches between inline storage and CoW
|
|
// POD-only
|
|
struct SmallFixedArray(T, uint SMALL=3)
|
|
if (!hasElaborateDestructor!T)
|
|
{
|
|
import std.internal.memory : enforceMalloc;
|
|
import core.memory : pureFree;
|
|
static struct Payload
|
|
{
|
|
size_t refcount;
|
|
T[0] placeholder;
|
|
inout(T)* ptr() inout { return placeholder.ptr; }
|
|
}
|
|
static assert(Payload.sizeof == size_t.sizeof);
|
|
union
|
|
{
|
|
Payload* big;
|
|
T[SMALL] small;
|
|
}
|
|
size_t _sizeMask;
|
|
enum BIG_MASK = size_t(1)<<(8*size_t.sizeof-1);
|
|
enum SIZE_MASK = ~BIG_MASK;
|
|
|
|
@property bool isBig() const { return (_sizeMask & BIG_MASK) != 0; }
|
|
@property size_t length() const { return _sizeMask & SIZE_MASK; }
|
|
|
|
this(size_t size)
|
|
{
|
|
if (size <= SMALL)
|
|
{
|
|
small[] = T.init;
|
|
_sizeMask = size;
|
|
}
|
|
else
|
|
{
|
|
big = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*size);
|
|
big.refcount = 1;
|
|
_sizeMask = size | BIG_MASK;
|
|
}
|
|
}
|
|
|
|
private @trusted @property inout(T)[] internalSlice() inout
|
|
{
|
|
return isBig ? big.ptr[0 .. length] : small[0 .. length];
|
|
}
|
|
|
|
this(this)
|
|
{
|
|
if (isBig)
|
|
{
|
|
big.refcount++;
|
|
}
|
|
}
|
|
|
|
bool opEquals(SmallFixedArray a)
|
|
{
|
|
return internalSlice[] == a.internalSlice[];
|
|
}
|
|
|
|
size_t toHash() const
|
|
{
|
|
return hashOf(internalSlice[]);
|
|
}
|
|
|
|
ref inout(T) opIndex(size_t idx) inout
|
|
{
|
|
return internalSlice[idx];
|
|
}
|
|
|
|
// accesses big to test self-referencing so not @safe
|
|
@trusted ref opAssign(SmallFixedArray arr)
|
|
{
|
|
if (isBig)
|
|
{
|
|
if (arr.isBig)
|
|
{
|
|
if (big is arr.big) return this; // self-assign
|
|
else
|
|
{
|
|
abandonRef();
|
|
_sizeMask = arr._sizeMask;
|
|
big = arr.big;
|
|
big.refcount++;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
abandonRef();
|
|
_sizeMask = arr._sizeMask;
|
|
small = arr.small;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
if (arr.isBig)
|
|
{
|
|
_sizeMask = arr._sizeMask;
|
|
big = arr.big;
|
|
big.refcount++;
|
|
}
|
|
else
|
|
{
|
|
_sizeMask = arr._sizeMask;
|
|
small = arr.small;
|
|
}
|
|
}
|
|
return this;
|
|
}
|
|
|
|
void mutate(scope void delegate(T[]) pure filler)
|
|
{
|
|
if (isBig && big.refcount != 1) // copy on write
|
|
{
|
|
auto oldSizeMask = _sizeMask;
|
|
auto newbig = cast(Payload*) enforceMalloc(Payload.sizeof + T.sizeof*length);
|
|
newbig.refcount = 1;
|
|
abandonRef();
|
|
big = newbig;
|
|
_sizeMask = oldSizeMask;
|
|
}
|
|
filler(internalSlice);
|
|
}
|
|
|
|
~this()
|
|
{
|
|
if (isBig)
|
|
{
|
|
abandonRef();
|
|
}
|
|
}
|
|
|
|
@trusted private void abandonRef()
|
|
{
|
|
assert(isBig);
|
|
if (--big.refcount == 0)
|
|
{
|
|
pureFree(big);
|
|
_sizeMask = 0;
|
|
assert(!isBig);
|
|
}
|
|
}
|
|
}
|
|
|
|
@system unittest
|
|
{
|
|
alias SA = SmallFixedArray!(int, 2);
|
|
SA create(int[] data)
|
|
{
|
|
SA a = SA(data.length);
|
|
a.mutate((slice) { slice[] = data[]; });
|
|
assert(a.internalSlice == data);
|
|
return a;
|
|
}
|
|
|
|
{
|
|
SA a;
|
|
a = SA(1);
|
|
assert(a.length == 1);
|
|
a = SA.init;
|
|
assert(a.length == 0);
|
|
}
|
|
|
|
{
|
|
SA a, b, c, d;
|
|
assert(a.length == 0);
|
|
assert(a.internalSlice == b.internalSlice);
|
|
a = create([1]);
|
|
assert(a.internalSlice == [1]);
|
|
b = create([2, 3]);
|
|
assert(b.internalSlice == [2, 3]);
|
|
c = create([3, 4, 5]);
|
|
d = create([5, 6, 7, 8]);
|
|
assert(c.isBig);
|
|
a = c;
|
|
assert(a.isBig);
|
|
assert(a.big is c.big);
|
|
assert(a.big.refcount == 2);
|
|
assert(a.internalSlice == [3, 4, 5]);
|
|
assert(c.internalSlice == [3, 4, 5]);
|
|
a = b;
|
|
assert(!a.isBig);
|
|
assert(a.internalSlice == [2, 3]);
|
|
assert(c.big.refcount == 1);
|
|
a = c;
|
|
assert(c.big.refcount == 2);
|
|
|
|
// mutate copies on write if ref-count is not 1
|
|
a.mutate((slice){ slice[] = 1; });
|
|
assert(a.internalSlice == [1, 1, 1]);
|
|
assert(c.internalSlice == [3, 4, 5]);
|
|
assert(a.isBig && c.isBig);
|
|
assert(a.big.refcount == 1);
|
|
assert(c.big.refcount == 1);
|
|
|
|
auto e = d;
|
|
assert(e.big.refcount == 2);
|
|
auto f = d;
|
|
f = a;
|
|
assert(f.isBig);
|
|
assert(f.internalSlice == [1, 1, 1]);
|
|
assert(f.big.refcount == 2); // a & f
|
|
assert(e.big.refcount == 2); // d & e
|
|
a = c;
|
|
assert(f.big.refcount == 1); // f
|
|
assert(e.big.refcount == 2); // d & e
|
|
a = a;
|
|
a = a;
|
|
a = a;
|
|
assert(a.big.refcount == 2); // a & c
|
|
}
|
|
}
|