This commit is contained in:
Hackerpilot 2013-03-04 02:05:18 +00:00
commit c1fcef1873
2 changed files with 3368 additions and 3381 deletions

View File

@ -102,7 +102,7 @@
*
* Copyright: Brian Schott 2013
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
* Authors: Brian Schott
* Authors: Brian Schott, Dmitry Olshansky
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
@ -203,7 +203,7 @@ enum IterationStyle
includeSpecialTokens = 0b0100,
/// Do not stop iteration on reaching the ___EOF__ token
ignoreEOF = 0b1000,
/// Include everything
/// Include _everything
everything = includeComments | includeWhitespace | ignoreEOF
}
@ -266,7 +266,7 @@ struct LexerConfig
TokenStyle tokenStyle = tokenStyle.default_;
/**
* Replacement for the ___VERSION__ token. Defaults to 1.
* Replacement for the ___VERSION__ token. Defaults to 100.
*/
uint versionNumber = 100;
@ -289,12 +289,6 @@ struct LexerConfig
* and error message.
*/
void delegate(string, size_t, uint, uint, string) errorFunc;
/**
* Initial size of the lexer's internal token buffer in bytes. The lexer
* will grow this buffer if necessary.
*/
size_t bufferSize = 1024 * 4;
}
/**
@ -331,287 +325,6 @@ auto byToken(R)(R range, LexerConfig config)
return r;
}
// For now a private helper that is tailored to the way the lexer works.
// It hides away the forwardness of the range by buffering.
// The RA version is a straightforward thin wrapper.
// ATM it is byte-oriented.
/**
 * Buffering adapter that lets the lexer mark a position in a forward
 * (non-random-access) range of bytes and later take a slice from that mark
 * up to the current position.
 *
 * Bytes are stored in a circular accumulator whose length is kept a power of
 * two so that wrap-around can be done with a bit mask. The accumulator is
 * doubled whenever the write position catches up with the marked position.
 */
private struct LexSource(R)
    if(isForwardRange!R && !isRandomAccessRange!R)
{
    /// True once the underlying range has been exhausted.
    bool empty() const { return _empty; }

    /// The byte stored at the current accumulator position.
    auto ref front() const
    {
        return accum[accumIdx];
    }

    // NOTE(review): popFront() only ever stores the current byte at accumIdx,
    // so the slot read here does not appear to contain the next input byte,
    // and the index is not wrapped with mask. Confirm before relying on it.
    auto ref peek() const
    in
    {
        assert (accumIdx + 1 < accum.length);
    }
    body
    {
        return accum[accumIdx + 1];
    }

    /// Advances to the next byte, growing the accumulator when it is full.
    void popFront()
    {
        ++_index;
        range.popFront();
        // Advance even if that was the last byte, so that the open-right
        // slice [savedAccumIdx, accumIdx) still covers it.
        accumIdx = (accumIdx+1) & mask;
        // The write position caught up with the mark: the buffer is full.
        // This must be handled before the end-of-input check below, otherwise
        // a token that exactly fills the buffer on the last byte would leave
        // accumIdx == savedAccumIdx and slice() would report it as empty.
        if(accumIdx == savedAccumIdx)
        {
            auto oldLen = accum.length;
            auto toCopy = oldLen - accumIdx;
            accum.length *= 2; // keep pow of 2
            // Move the tail [accumIdx, oldLen) to the end of the enlarged
            // buffer. Copy starting with the last item because the source
            // and destination may overlap.
            copy(retro(accum[accumIdx..oldLen]),
                retro(accum[$-toCopy..$]));
            savedAccumIdx = accum.length - toCopy;
        }
        if(range.empty)
        {
            _empty = true;
            return;
        }
        accum[accumIdx] = range.front;
    }

    /// Returns an independent copy of this source.
    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        // sadly need to dup circular buffer, as it overwrites items
        copy.accum = copy.accum.dup;
        return copy;
    }

    /// Marks the current position to slice from later on and returns it.
    size_t mark()
    {
        savedAccumIdx = accumIdx;
        return accumIdx;
    }

    /// Slice from the previously marked position to the current position.
    auto slice() @property
    {
        // it's an open right range as usual
        return CircularRange(accum, savedAccumIdx, accumIdx);
    }

    /// Number of popFront() calls made so far.
    size_t index() const @property
    {
        return _index;
    }

private:
    this(R src, size_t bufferSize)
    {
        range = src;
        assert(bufferSize > 0);
        assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
        accum = new ubyte[bufferSize];
        if(range.empty)
            _empty = true;
        else
            accum[accumIdx] = range.front; // load front
    }

    // A view of [start, end) over the circular buffer; indices wrap via mask.
    struct CircularRange
    {
        this(ubyte[] buf, size_t s, size_t e)
        {
            // Check the incoming buffer: the member has not been assigned
            // yet, so asserting on it would always pass.
            assert((buf.length & (buf.length-1)) == 0);
            buffer = buf;
            start = s;
            end = e;
        }
        //Forward range primitives
        @property bool empty() const { return start == end; }
        @property auto ref front() const { return buffer[start]; }
        void popFront() { start = (start + 1) & mask; }
        @property auto save() { return this; }
        //Backwards is a bit slower, but should be rarely used (if at all)
        @property ref back(){ return buffer[(end-1) & mask]; }
        void popBack() { end = (end - 1) & mask; }
        // RA range primitives
        ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
        @property size_t length()
        {
            // end < start means the view wraps around the end of the buffer
            return end < start ? end + buffer.length -start : end - start;
        }
        alias length opDollar;

        auto opSlice(size_t newStart, size_t newEnd)
        {
            size_t maskedStart = (start+newStart) & mask;
            size_t maskedEnd = (start+newEnd) & mask;
            return typeof(this)(buffer, maskedStart, maskedEnd);
        }
        // @@@bug fwd-ref in ldc0.10 (if placed above previous one)
        auto opSlice(){ return opSlice(0, length); }
    private:
        @property auto mask(){ return buffer.length-1; }
        size_t start, end;
        ubyte[] buffer;
    }

    @property auto mask(){ return accum.length-1; }
    R range;
    bool _empty;
    ubyte[] accum; // accumulator buffer for non-RA ranges
    size_t savedAccumIdx; // accumulator index set by the last mark()
    size_t accumIdx; // current index in accumulator
    size_t _index; // index of current element in original range
}
// TODO: make sure it's RandomAccess later
/*static assert(isRandomAccessRange!(
LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
.CircularRange)
);*/
//trivial pass-through for RA ranges
/**
 * Thin wrapper over a random-access range that gives it the same
 * mark/slice interface as the buffering variant. No bytes are copied:
 * positions are plain indices into the wrapped range.
 */
private struct LexSource(R)
    if(isRandomAccessRange!R)
{
    /// True when the current index is at or past the end of the range.
    bool empty() const @property { return cur >= range.length; }

    /// True when there is an element after the current one.
    bool canPeek() const { return cur + 1 < range.length; }

    /// The element at the current index.
    auto ref front() const @property { return range[cur]; }

    /// Advances the current index by one; no bounds check is made here.
    void popFront(){ cur++; }

    /// The element after the current one; requires canPeek().
    auto ref peek() const
    in
    {
        assert (canPeek());
    }
    body
    {
        return range[cur + 1];
    }

    /// Returns a copy of this source with the wrapped range saved.
    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        return copy;
    }

    /// Marks the current position to slice from later on and returns it,
    /// matching the signature of the forward-range variant's mark().
    size_t mark()
    {
        saved = cur;
        return cur;
    }

    /// Slice from the marked position to the current position, using the
    /// underlying range's own slicing capability.
    auto slice() @property
    {
        return range[saved..cur];
    }

    /// Index of the current element in the wrapped range.
    size_t index() const @property
    {
        return cur;
    }

private:
    this(R src)
    {
        range = src;
    }
    size_t cur, saved; // current index and index set by the last mark()
    R range;
}
/**
 * Wraps a forward, non-random-access range of bytes in a buffering
 * LexSource.
 * Params:
 *     range = the byte range to read from
 *     bufSize = initial accumulator size passed to LexSource, whose
 *         constructor asserts that it is a non-zero power of two
 * Returns: a LexSource!Range reading from range
 */
auto lexerSource(Range)(Range range, size_t bufSize=8)
    if(isForwardRange!Range && !isRandomAccessRange!Range
        && is(ElementType!Range : const(ubyte)))
{
    alias LexSource!(Range) Source;
    auto wrapped = Source(range, bufSize);
    return wrapped;
}
/**
 * Wraps a random-access range of bytes in the pass-through LexSource.
 * Params:
 *     range = the byte range to read from
 * Returns: a LexSource!Range reading from range
 */
auto lexerSource(Range)(Range range)
    if(isRandomAccessRange!Range
        && is(ElementType!Range : const(ubyte)))
{
    alias LexSource!(Range) Source;
    auto wrapped = Source(range);
    return wrapped;
}
unittest
{
// test the basic functionality of a "mark-slice" range
import std.string, std.stdio;
// Walks a source over the bytes of "Hello,world!" and checks front, mark
// and slice at each stage. The body runs twice: the second pass starts from
// the copy taken with save() to check that the copy was not disturbed.
static void test_hello(T)(T lexs)
{
assert(lexs.front == 'H');
lexs.popFront();
assert(lexs.front == 'e');
foreach(i; 0..2)
{
auto saved = lexs.save;
// a slice taken right after mark() is empty
lexs.mark();
assert(lexs.slice.equal(""));
lexs.popFront();
assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
lexs.popFrontN(4);
auto bytes = lexs.slice.map!"cast(char)a".array();
assert(bytes.equal("ello,"), bytes.to!string);
// re-mark at 'w' and consume the rest of the input
lexs.mark();
assert(lexs.slice.equal(""));
assert(lexs.front == 'w');
lexs.popFrontN(6);
assert(lexs.empty);
// the slice stays usable after the source is exhausted
auto s = lexs.slice();
auto msg = s.save.map!"cast(char)a".array;
// exercise slicing, indexing and the bidirectional primitives of the slice
assert(s[].equal("world!"), msg);
assert(s[2..$-1].equal("rld"), msg);
assert(s[0] == 'w' && s[$-1] == '!');
s.popFront();
assert(s.front == 'o' && s.back == '!');
s.popBack();
assert(s.front == 'o' && s.back == 'd');
//restore and repeat again
lexs = saved;
}
}
// An empty source reports empty and yields an empty slice after mark().
static void test_empty(T)(T lexs)
{
assert(lexs.empty);
lexs.mark();
assert(lexs.slice().equal(""));
}
// The filter drops the space, so the source yields the bytes of
// "Hello,world!". Its result is expected to select the buffering
// (non-random-access) lexerSource overload.
auto fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 16 // and the one that is more than enough
);
test_hello(fwdLex);
fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 1 // try the smallest initial buffer
);
test_hello(fwdLex);
// empty input, through both overloads
fwdLex = lexerSource("".representation.filter!"a != ' '");
auto raLex = lexerSource("".representation);
test_empty(raLex);
test_empty(fwdLex);
// same walk over a plain array, which takes the random-access overload
raLex = lexerSource("Hello,world!".representation);
test_hello(raLex);
}
/**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/
@ -718,10 +431,10 @@ L_advance:
"=", "TokenType.assign",
"@", "TokenType.at",
"&", "TokenType.bitAnd",
"&=", "TokenType.bitAndEqual",
"&=", "TokenType.bitAndEquals",
"|", "TokenType.bitOr",
"|=", "TokenType.bitOrEqual",
"~=", "TokenType.catEqual",
"|=", "TokenType.bitOrEquals",
"~=", "TokenType.catEquals",
":", "TokenType.colon",
",", "TokenType.comma",
"--", "TokenType.decrement",
@ -741,21 +454,21 @@ L_advance:
"||", "TokenType.logicOr",
"(", "TokenType.lParen",
"-", "TokenType.minus",
"-=", "TokenType.minusEqual",
"-=", "TokenType.minusEquals",
"%", "TokenType.mod",
"%=", "TokenType.modEqual",
"*=", "TokenType.mulEqual",
"%=", "TokenType.modEquals",
"*=", "TokenType.mulEquals",
"!", "TokenType.not",
"!=", "TokenType.notEqual",
"!=", "TokenType.notEquals",
"!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus",
"+=", "TokenType.plusEqual",
"+=", "TokenType.plusEquals",
"^^", "TokenType.pow",
"^^=", "TokenType.powEqual",
"^^=", "TokenType.powEquals",
"}", "TokenType.rBrace",
"]", "TokenType.rBracket",
")", "TokenType.rParen",
@ -771,7 +484,7 @@ L_advance:
">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual",
"^", "TokenType.xor",
"^=", "TokenType.xorEqual",
"^=", "TokenType.xorEquals",
));
case '/':
nextCharNonLF();
@ -792,7 +505,7 @@ L_advance:
goto L_advance; // tail-recursion
case '=':
current.type = TokenType.divEqual;
current.type = TokenType.divEquals;
current.value = "/=";
src.popFront();
return;
@ -933,7 +646,7 @@ L_advance:
do
{
nextChar();
}while (!isEoF() && isWhite());
} while (!isEoF() && isWhite());
static if (keep) setTokenValue();
}
@ -2168,7 +1881,7 @@ L_advance:
*/
pure nothrow bool isOperator(const TokenType t)
{
return t >= TokenType.assign && t <= TokenType.xorEqual;
return t >= TokenType.assign && t <= TokenType.xorEquals;
}
/**
@ -2332,15 +2045,15 @@ enum TokenType: ushort
assign, /// =
at, /// @
bitAnd, /// &
bitAndEqual, /// &=
bitAndEquals, /// &=
bitOr, /// |
bitOrEqual, /// |=
catEqual, /// ~=
bitOrEquals, /// |=
catEquals, /// ~=
colon, /// :
comma, /// ,
decrement, /// --
div, /// /
divEqual, /// /=
divEquals, /// /=
dollar, /// $
dot, /// .
equals, /// ==
@ -2359,21 +2072,21 @@ enum TokenType: ushort
logicOr, /// ||
lParen, /// $(LPAREN)
minus, /// -
minusEqual, /// -=
minusEquals, /// -=
mod, /// %
modEqual, /// %=
mulEqual, /// *=
modEquals, /// %=
mulEquals, /// *=
not, /// !
notEqual, /// !=
notEquals, /// !=
notGreater, /// !>
notGreaterEqual, /// !>=
notLess, /// !<
notLessEqual, /// !<=
notLessEqualGreater, /// !<>
plus, /// +
plusEqual, /// +=
plusEquals, /// +=
pow, /// ^^
powEqual, /// ^^=
powEquals, /// ^^=
rBrace, /// }
rBracket, /// ]
rParen, /// $(RPAREN)
@ -2391,7 +2104,7 @@ enum TokenType: ushort
unsignedShiftRightEqual, /// >>>=
vararg, /// ...
xor, /// ^
xorEqual, /// ^=
xorEquals, /// ^=
bool_, /// $(D_KEYWORD bool)
byte_, /// $(D_KEYWORD byte)
@ -2401,7 +2114,6 @@ enum TokenType: ushort
char_, /// $(D_KEYWORD char)
creal_, /// $(D_KEYWORD creal)
dchar_, /// $(D_KEYWORD dchar)
delegate_, /// $(D_KEYWORD delegate)
double_, /// $(D_KEYWORD double)
float_, /// $(D_KEYWORD float)
function_, /// $(D_KEYWORD function)
@ -2453,6 +2165,7 @@ enum TokenType: ushort
continue_, /// $(D_KEYWORD continue)
debug_, /// $(D_KEYWORD debug)
default_, /// $(D_KEYWORD default)
delegate_, /// $(D_KEYWORD delegate)
delete_, /// $(D_KEYWORD delete)
do_, /// $(D_KEYWORD do)
else_, /// $(D_KEYWORD else)
@ -2529,22 +2242,298 @@ enum TokenType: ushort
dstringLiteral, /// $(D_STRING "32-bit character string"d)
stringLiteral, /// $(D_STRING "an 8-bit string")
wstringLiteral, /// $(D_STRING "16-bit character string"w)
invalid, /// Not a valid token type
}
// Implementation details follow
private:
// For now a private helper that is tailored to the way the lexer works.
// It hides away the forwardness of the range by buffering.
// The RA version is a straightforward thin wrapper.
// ATM it is byte-oriented.
/**
 * Buffering adapter that lets the lexer mark a position in a forward
 * (non-random-access) range of bytes and later take a slice from that mark
 * up to the current position.
 *
 * Bytes are stored in a circular accumulator whose length is kept a power of
 * two so that wrap-around can be done with a bit mask. The accumulator is
 * doubled whenever the write position catches up with the marked position.
 */
private struct LexSource(R)
    if(isForwardRange!R && !isRandomAccessRange!R)
{
    /// True once the underlying range has been exhausted.
    bool empty() const { return _empty; }

    /// The byte stored at the current accumulator position.
    auto ref front() const
    {
        return accum[accumIdx];
    }

    // NOTE(review): popFront() only ever stores the current byte at accumIdx,
    // so the slot read here does not appear to contain the next input byte,
    // and the index is not wrapped with mask. Confirm before relying on it.
    auto ref peek() const
    in
    {
        assert (accumIdx + 1 < accum.length);
    }
    body
    {
        return accum[accumIdx + 1];
    }

    /// Advances to the next byte, growing the accumulator when it is full.
    void popFront()
    {
        ++_index;
        range.popFront();
        // Advance even if that was the last byte, so that the open-right
        // slice [savedAccumIdx, accumIdx) still covers it.
        accumIdx = (accumIdx+1) & mask;
        // The write position caught up with the mark: the buffer is full.
        // This must be handled before the end-of-input check below, otherwise
        // a token that exactly fills the buffer on the last byte would leave
        // accumIdx == savedAccumIdx and slice() would report it as empty.
        if(accumIdx == savedAccumIdx)
        {
            auto oldLen = accum.length;
            auto toCopy = oldLen - accumIdx;
            accum.length *= 2; // keep pow of 2
            // Move the tail [accumIdx, oldLen) to the end of the enlarged
            // buffer. Copy starting with the last item because the source
            // and destination may overlap.
            copy(retro(accum[accumIdx..oldLen]),
                retro(accum[$-toCopy..$]));
            savedAccumIdx = accum.length - toCopy;
        }
        if(range.empty)
        {
            _empty = true;
            return;
        }
        accum[accumIdx] = range.front;
    }

    /// Returns an independent copy of this source.
    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        // sadly need to dup circular buffer, as it overwrites items
        copy.accum = copy.accum.dup;
        return copy;
    }

    /// Marks the current position to slice from later on and returns it.
    size_t mark()
    {
        savedAccumIdx = accumIdx;
        return accumIdx;
    }

    /// Slice from the previously marked position to the current position.
    auto slice() @property
    {
        // it's an open right range as usual
        return CircularRange(accum, savedAccumIdx, accumIdx);
    }

    /// Number of popFront() calls made so far.
    size_t index() const @property
    {
        return _index;
    }

private:
    this(R src, size_t bufferSize)
    {
        range = src;
        assert(bufferSize > 0);
        assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
        accum = new ubyte[bufferSize];
        if(range.empty)
            _empty = true;
        else
            accum[accumIdx] = range.front; // load front
    }

    // A view of [start, end) over the circular buffer; indices wrap via mask.
    struct CircularRange
    {
        this(ubyte[] buf, size_t s, size_t e)
        {
            // Check the incoming buffer: the member has not been assigned
            // yet, so asserting on it would always pass.
            assert((buf.length & (buf.length-1)) == 0);
            buffer = buf;
            start = s;
            end = e;
        }
        //Forward range primitives
        @property bool empty() const { return start == end; }
        @property auto ref front() const { return buffer[start]; }
        void popFront() { start = (start + 1) & mask; }
        @property auto save() { return this; }
        //Backwards is a bit slower, but should be rarely used (if at all)
        @property ref back(){ return buffer[(end-1) & mask]; }
        void popBack() { end = (end - 1) & mask; }
        // RA range primitives
        ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
        @property size_t length()
        {
            // end < start means the view wraps around the end of the buffer
            return end < start ? end + buffer.length -start : end - start;
        }
        alias length opDollar;

        auto opSlice(size_t newStart, size_t newEnd)
        {
            size_t maskedStart = (start+newStart) & mask;
            size_t maskedEnd = (start+newEnd) & mask;
            return typeof(this)(buffer, maskedStart, maskedEnd);
        }
        // @@@bug fwd-ref in ldc0.10 (if placed above previous one)
        auto opSlice(){ return opSlice(0, length); }
    private:
        @property auto mask(){ return buffer.length-1; }
        size_t start, end;
        ubyte[] buffer;
    }

    @property auto mask(){ return accum.length-1; }
    R range;
    bool _empty;
    ubyte[] accum; // accumulator buffer for non-RA ranges
    size_t savedAccumIdx; // accumulator index set by the last mark()
    size_t accumIdx; // current index in accumulator
    size_t _index; // index of current element in original range
}
// TODO: make sure it's RandomAccess later
/*static assert(isRandomAccessRange!(
LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
.CircularRange)
);*/
//trivial pass-through for RA ranges
/**
 * Thin wrapper over a random-access range that gives it the same
 * mark/slice interface as the buffering variant. No bytes are copied:
 * positions are plain indices into the wrapped range.
 */
private struct LexSource(R)
    if(isRandomAccessRange!R)
{
    /// True when the current index is at or past the end of the range.
    bool empty() const @property { return cur >= range.length; }

    /// True when there is an element after the current one.
    bool canPeek() const { return cur + 1 < range.length; }

    /// The element at the current index.
    auto ref front() const @property { return range[cur]; }

    /// Advances the current index by one; no bounds check is made here.
    void popFront(){ cur++; }

    /// The element after the current one; requires canPeek().
    auto ref peek() const
    in
    {
        assert (canPeek());
    }
    body
    {
        return range[cur + 1];
    }

    /// Returns a copy of this source with the wrapped range saved.
    auto save()
    {
        typeof(this) copy = this;
        copy.range = range.save;
        return copy;
    }

    /// Marks the current position to slice from later on and returns it,
    /// matching the signature of the forward-range variant's mark().
    size_t mark()
    {
        saved = cur;
        return cur;
    }

    /// Slice from the marked position to the current position, using the
    /// underlying range's own slicing capability.
    auto slice() @property
    {
        return range[saved..cur];
    }

    /// Index of the current element in the wrapped range.
    size_t index() const @property
    {
        return cur;
    }

private:
    this(R src)
    {
        range = src;
    }
    size_t cur, saved; // current index and index set by the last mark()
    R range;
}
/**
 * Wraps a forward, non-random-access range of bytes in a buffering
 * LexSource.
 * Params:
 *     range = the byte range to read from
 *     bufSize = initial accumulator size passed to LexSource, whose
 *         constructor asserts that it is a non-zero power of two
 * Returns: a LexSource!Range reading from range
 */
auto lexerSource(Range)(Range range, size_t bufSize=8)
    if(isForwardRange!Range && !isRandomAccessRange!Range
        && is(ElementType!Range : const(ubyte)))
{
    alias LexSource!(Range) Source;
    auto wrapped = Source(range, bufSize);
    return wrapped;
}
/**
 * Wraps a random-access range of bytes in the pass-through LexSource.
 * Params:
 *     range = the byte range to read from
 * Returns: a LexSource!Range reading from range
 */
auto lexerSource(Range)(Range range)
    if(isRandomAccessRange!Range
        && is(ElementType!Range : const(ubyte)))
{
    alias LexSource!(Range) Source;
    auto wrapped = Source(range);
    return wrapped;
}
unittest
{
// test the basic functionality of a "mark-slice" range
import std.string, std.stdio;
// Walks a source over the bytes of "Hello,world!" and checks front, mark
// and slice at each stage. The body runs twice: the second pass starts from
// the copy taken with save() to check that the copy was not disturbed.
static void test_hello(T)(T lexs)
{
assert(lexs.front == 'H');
lexs.popFront();
assert(lexs.front == 'e');
foreach(i; 0..2)
{
auto saved = lexs.save;
// a slice taken right after mark() is empty
lexs.mark();
assert(lexs.slice.equal(""));
lexs.popFront();
assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
lexs.popFrontN(4);
auto bytes = lexs.slice.map!"cast(char)a".array();
assert(bytes.equal("ello,"), bytes.to!string);
// re-mark at 'w' and consume the rest of the input
lexs.mark();
assert(lexs.slice.equal(""));
assert(lexs.front == 'w');
lexs.popFrontN(6);
assert(lexs.empty);
// the slice stays usable after the source is exhausted
auto s = lexs.slice();
auto msg = s.save.map!"cast(char)a".array;
// exercise slicing, indexing and the bidirectional primitives of the slice
assert(s[].equal("world!"), msg);
assert(s[2..$-1].equal("rld"), msg);
assert(s[0] == 'w' && s[$-1] == '!');
s.popFront();
assert(s.front == 'o' && s.back == '!');
s.popBack();
assert(s.front == 'o' && s.back == 'd');
//restore and repeat again
lexs = saved;
}
}
// An empty source reports empty and yields an empty slice after mark().
static void test_empty(T)(T lexs)
{
assert(lexs.empty);
lexs.mark();
assert(lexs.slice().equal(""));
}
// The filter drops the space, so the source yields the bytes of
// "Hello,world!". Its result is expected to select the buffering
// (non-random-access) lexerSource overload.
auto fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 16 // and the one that is more than enough
);
test_hello(fwdLex);
fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 1 // try the smallest initial buffer
);
test_hello(fwdLex);
// empty input, through both overloads
fwdLex = lexerSource("".representation.filter!"a != ' '");
auto raLex = lexerSource("".representation);
test_empty(raLex);
test_empty(fwdLex);
// same walk over a plain array, which takes the random-access overload
raLex = lexerSource("Hello,world!".representation);
test_hello(raLex);
}
/*
 * Tells whether the range is at end of input: either it is empty or its
 * front element is one of the two end-of-file bytes (0 or 0x1a).
 * Written as a template so that pure, @safe and nothrow are inferred.
 */
bool isRangeEoF(R)(ref R range)
{
    if (range.empty)
        return true;
    if (range.front == 0)
        return true;
    return range.front == 0x1a;
}
/*
* Slices of the above string to save memory. This array is automatically
* generated.
*/
// Lookup table for token values
immutable(string[TokenType.max + 1]) tokenValues = [
"=",
"@",
@ -2617,7 +2606,6 @@ immutable(string[TokenType.max + 1]) tokenValues = [
"char",
"creal",
"dchar",
"delegate",
"double",
"float",
"function",
@ -2667,6 +2655,7 @@ immutable(string[TokenType.max + 1]) tokenValues = [
"continue",
"debug",
"default",
"delegate",
"delete",
"do",
"else",
@ -2742,7 +2731,6 @@ immutable(string[TokenType.max + 1]) tokenValues = [
null,
null,
null,
null,
];
pure string getTokenValue(const TokenType type)
@ -3062,7 +3050,7 @@ struct StringCache
uint bucket = h % mapSize;
Slot *s = &index[bucket];
//1st slot not yet initialized?
if(s.value.ptr == null)
if (s.value.ptr == null)
{
*s = Slot(putIntoCache(range), null, h);
return s.value;
@ -3396,5 +3384,4 @@ unittest
assert (tokenCount == 16);
}
//void main(string[] args){}