Hackerpilot 2013-03-04 02:05:18 +00:00
commit c1fcef1873
2 changed files with 3368 additions and 3381 deletions


@@ -102,7 +102,7 @@
*
* Copyright: Brian Schott 2013
* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
-* Authors: Brian Schott
+* Authors: Brian Schott, Dmitry Olshansky
* Source: $(PHOBOSSRC std/d/_lexer.d)
*/
@@ -203,7 +203,7 @@ enum IterationStyle
includeSpecialTokens = 0b0100,
/// Do not stop iteration on reaching the ___EOF__ token
ignoreEOF = 0b1000,
-/// Include everything
+/// Include _everything
everything = includeComments | includeWhitespace | ignoreEOF
}
@@ -266,7 +266,7 @@ struct LexerConfig
TokenStyle tokenStyle = tokenStyle.default_;
/**
-* Replacement for the ___VERSION__ token. Defaults to 1.
+* Replacement for the ___VERSION__ token. Defaults to 100.
*/
uint versionNumber = 100;
@@ -289,12 +289,6 @@ struct LexerConfig
* and error messsage.
*/
void delegate(string, size_t, uint, uint, string) errorFunc;
-/**
-* Initial size of the lexer's internal token buffer in bytes. The lexer
-* will grow this buffer if necessary.
-*/
-size_t bufferSize = 1024 * 4;
}
/**
@@ -331,287 +325,6 @@ auto byToken(R)(R range, LexerConfig config)
return r;
}
// For now a private helper that is tailored to the way the lexer works.
// It hides away the forwardness of the range by buffering;
// the RA version is a straightforward thin wrapper.
// At the moment it is byte-oriented.
private struct LexSource(R)
if(isForwardRange!R && !isRandomAccessRange!R)
{
bool empty() const { return _empty; }
auto ref front() const
{
return accum[accumIdx];
}
auto ref peek() const
in
{
assert (accumIdx + 1 < accum.length);
}
body
{
return accum[accumIdx + 1];
}
void popFront()
{
++_index;
range.popFront();
// if that was the last byte,
// just advance so that the right-open slice still works
accumIdx = (accumIdx+1) & mask;
if(range.empty)
{
_empty = true;
return;
}
if(accumIdx == savedAccumIdx)
{
// and move stuff around
auto oldLen = accum.length;
auto toCopy = oldLen - accumIdx;
accum.length *= 2; // keep pow of 2
// copy starting with last item
copy(retro(accum[accumIdx..oldLen]),
retro(accum[$-toCopy..$]));
savedAccumIdx = accum.length - toCopy;
}
accum[accumIdx] = range.front;
}
auto save()
{
typeof(this) copy = this;
copy.range = range.save;
// sadly need to dup circular buffer, as it overwrites items
copy.accum = copy.accum.dup;
return copy;
}
// mark a position to slice from later on
size_t mark()
{
savedAccumIdx = accumIdx;
return accumIdx;
}
// slice to current position from previously marked position
auto slice() @property
{
// it's an open right range as usual
return CircularRange(accum, savedAccumIdx, accumIdx);
}
size_t index() const @property
{
return _index;
}
private:
this(R src, size_t bufferSize)
{
range = src;
assert(bufferSize > 0);
assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
accum = new ubyte[bufferSize];
if(range.empty)
_empty = true;
else
accum[accumIdx] = range.front; // load front
}
// a true RA-range of ubyte
struct CircularRange
{
this(ubyte[] buf, size_t s, size_t e)
{
assert((buffer.length & (buffer.length-1)) == 0);
buffer = buf;
start = s;
end = e;
}
//Forward range primitives
@property bool empty() const { return start == end; }
@property auto ref front() const { return buffer[start]; }
void popFront() { start = (start + 1) & mask; }
@property auto save() { return this; }
//Backwards is a bit slower, but should be rarely used (if at all)
@property ref back(){ return buffer[(end-1) & mask]; }
void popBack() { end = (end - 1) & mask; }
// RA range primitives
ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
@property size_t length()
{
return end < start ? end + buffer.length -start : end - start;
}
alias length opDollar;
auto opSlice(size_t newStart, size_t newEnd)
{
size_t maskedStart = (start+newStart) & mask;
size_t maskedEnd = (start+newEnd) & mask;
return typeof(this)(buffer, maskedStart, maskedEnd);
}
// @@@bug fwd-ref in ldc0.10 (if placed above previous one)
auto opSlice(){ return opSlice(0, length); }
private:
@property auto mask(){ return buffer.length-1; }
size_t start, end;
ubyte[] buffer;
}
@property auto mask(){ return accum.length-1; }
R range;
bool _empty;
ubyte[] accum; // accumulator buffer for non-RA ranges
size_t savedAccumIdx;
size_t accumIdx; // current index in accumulator
size_t _index; // index of current element in original range
}
// TODO: make sure it's RandomAccess later
/*static assert(isRandomAccessRange!(
LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
.CircularRange)
);*/
//trivial pass-through for RA ranges
private struct LexSource(R)
if(isRandomAccessRange!R)
{
bool empty() const @property { return cur >= range.length; }
bool canPeek() const { return cur + 1 < range.length; }
auto ref front() const @property { return range[cur]; }
void popFront(){ cur++; }
auto ref peek() const
in
{
assert (canPeek());
}
body
{
return range[cur + 1];
}
auto save()
{
typeof(this) copy = this;
copy.range = range.save;
return copy;
}
auto mark()
{
saved = cur;
}
// use the underlying range's slicing capability
auto slice() @property
{
return range[saved..cur];
}
size_t index() const @property
{
return cur;
}
private:
this(R src)
{
range = src;
}
size_t cur, saved;
R range;
}
auto lexerSource(Range)(Range range, size_t bufSize=8)
if(isForwardRange!Range && !isRandomAccessRange!Range
&& is(ElementType!Range : const(ubyte)))
{
return LexSource!(Range)(range, bufSize);
}
auto lexerSource(Range)(Range range)
if(isRandomAccessRange!Range
&& is(ElementType!Range : const(ubyte)))
{
return LexSource!(Range)(range);
}
unittest
{
// test the basic functionality of a "mark-slice" range
import std.string, std.stdio;
static void test_hello(T)(T lexs)
{
assert(lexs.front == 'H');
lexs.popFront();
assert(lexs.front == 'e');
foreach(i; 0..2)
{
auto saved = lexs.save;
lexs.mark();
assert(lexs.slice.equal(""));
lexs.popFront();
assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
lexs.popFrontN(4);
auto bytes = lexs.slice.map!"cast(char)a".array();
assert(bytes.equal("ello,"), bytes.to!string);
lexs.mark();
assert(lexs.slice.equal(""));
assert(lexs.front == 'w');
lexs.popFrontN(6);
assert(lexs.empty);
auto s = lexs.slice();
auto msg = s.save.map!"cast(char)a".array;
assert(s[].equal("world!"), msg);
assert(s[2..$-1].equal("rld"), msg);
assert(s[0] == 'w' && s[$-1] == '!');
s.popFront();
assert(s.front == 'o' && s.back == '!');
s.popBack();
assert(s.front == 'o' && s.back == 'd');
//restore and repeat again
lexs = saved;
}
}
static void test_empty(T)(T lexs)
{
assert(lexs.empty);
lexs.mark();
assert(lexs.slice().equal(""));
}
auto fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 16 // and the one that is more then enough
);
test_hello(fwdLex);
fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 1 // try the smallest initial buffer
);
test_hello(fwdLex);
fwdLex = lexerSource("".representation.filter!"a != ' '");
auto raLex = lexerSource("".representation);
test_empty(raLex);
test_empty(fwdLex);
raLex = lexerSource("Hello,world!".representation);
test_hello(raLex);
}
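Editorial aside, not part of this commit: a rough sketch of how the byToken entry point and the LexerConfig fields seen earlier in this diff fit together. The error delegate's parameter names and the token fields referenced in the comments are assumptions based on the surrounding code.
unittest
{
    import std.string : representation;
    LexerConfig config;                  // versionNumber already defaults to 100
    config.errorFunc = delegate(string fileName, size_t index, uint line, uint col, string message)
    {
        // collect or log lexing errors here instead of aborting
    };
    foreach (t; byToken("int x = 1;".representation, config))
    {
        // each token is assumed to carry at least a type (TokenType) and a value string
    }
}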
/**
* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
*/
@@ -718,10 +431,10 @@ L_advance:
"=", "TokenType.assign",
"@", "TokenType.at",
"&", "TokenType.bitAnd",
-"&=", "TokenType.bitAndEqual",
+"&=", "TokenType.bitAndEquals",
"|", "TokenType.bitOr",
-"|=", "TokenType.bitOrEqual",
+"|=", "TokenType.bitOrEquals",
-"~=", "TokenType.catEqual",
+"~=", "TokenType.catEquals",
":", "TokenType.colon",
",", "TokenType.comma",
"--", "TokenType.decrement",
@@ -741,21 +454,21 @@ L_advance:
"||", "TokenType.logicOr",
"(", "TokenType.lParen",
"-", "TokenType.minus",
-"-=", "TokenType.minusEqual",
+"-=", "TokenType.minusEquals",
"%", "TokenType.mod",
-"%=", "TokenType.modEqual",
+"%=", "TokenType.modEquals",
-"*=", "TokenType.mulEqual",
+"*=", "TokenType.mulEquals",
"!", "TokenType.not",
-"!=", "TokenType.notEqual",
+"!=", "TokenType.notEquals",
"!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus",
-"+=", "TokenType.plusEqual",
+"+=", "TokenType.plusEquals",
"^^", "TokenType.pow",
-"^^=", "TokenType.powEqual",
+"^^=", "TokenType.powEquals",
"}", "TokenType.rBrace",
"]", "TokenType.rBracket",
")", "TokenType.rParen",
@@ -771,7 +484,7 @@ L_advance:
">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual",
"^", "TokenType.xor",
-"^=", "TokenType.xorEqual",
+"^=", "TokenType.xorEquals",
));
case '/':
nextCharNonLF();
@@ -792,7 +505,7 @@ L_advance:
goto L_advance; // tail-recursion
case '=':
-current.type = TokenType.divEqual;
+current.type = TokenType.divEquals;
current.value = "/=";
src.popFront();
return;
@@ -2168,7 +1881,7 @@ L_advance:
*/
pure nothrow bool isOperator(const TokenType t)
{
-return t >= TokenType.assign && t <= TokenType.xorEquals;
+return t >= TokenType.assign && t <= TokenType.xorEquals;
}
/**
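Editorial aside, not part of this commit: the range check in isOperator is only valid because the operator members of TokenType are declared contiguously from assign through xorEquals (see the enum below). A minimal sketch of that invariant:
unittest
{
    // the operator members form one contiguous run, assign .. xorEquals
    assert(isOperator(TokenType.assign));
    assert(isOperator(TokenType.plusEquals));
    assert(isOperator(TokenType.xorEquals));
    // bool_ is declared immediately after xorEquals, so it falls outside the run
    assert(!isOperator(TokenType.bool_));
}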
@@ -2332,15 +2045,15 @@ enum TokenType: ushort
assign, /// =
at, /// @
bitAnd, /// &
-bitAndEqual, /// &=
+bitAndEquals, /// &=
bitOr, /// |
-bitOrEqual, /// |=
+bitOrEquals, /// |=
-catEqual, /// ~=
+catEquals, /// ~=
colon, /// :
comma, /// ,
decrement, /// --
div, /// /
-divEqual, /// /=
+divEquals, /// /=
dollar, /// $
dot, /// .
equals, /// ==
@@ -2359,21 +2072,21 @@ enum TokenType: ushort
logicOr, /// ||
lParen, /// $(LPAREN)
minus, /// -
-minusEqual, /// -=
+minusEquals, /// -=
mod, /// %
-modEqual, /// %=
+modEquals, /// %=
-mulEqual, /// *=
+mulEquals, /// *=
not, /// !
-notEqual, /// !=
+notEquals, /// !=
notGreater, /// !>
notGreaterEqual, /// !>=
notLess, /// !<
notLessEqual, /// !<=
notLessEqualGreater, /// !<>
plus, /// +
-plusEqual, /// +=
+plusEquals, /// +=
pow, /// ^^
-powEqual, /// ^^=
+powEquals, /// ^^=
rBrace, /// }
rBracket, /// ]
rParen, /// $(RPAREN)
@@ -2391,7 +2104,7 @@ enum TokenType: ushort
unsignedShiftRightEqual, /// >>>=
vararg, /// ...
xor, /// ^
-xorEqual, /// ^=
+xorEquals, /// ^=
bool_, /// $(D_KEYWORD bool)
byte_, /// $(D_KEYWORD byte)
@@ -2401,7 +2114,6 @@ enum TokenType: ushort
char_, /// $(D_KEYWORD char)
creal_, /// $(D_KEYWORD creal)
dchar_, /// $(D_KEYWORD dchar)
-delegate_, /// $(D_KEYWORD delegate)
double_, /// $(D_KEYWORD double)
float_, /// $(D_KEYWORD float)
function_, /// $(D_KEYWORD function)
@@ -2453,6 +2165,7 @@ enum TokenType: ushort
continue_, /// $(D_KEYWORD continue)
debug_, /// $(D_KEYWORD debug)
default_, /// $(D_KEYWORD default)
+delegate_, /// $(D_KEYWORD delegate)
delete_, /// $(D_KEYWORD delete)
do_, /// $(D_KEYWORD do)
else_, /// $(D_KEYWORD else)
@@ -2529,22 +2242,298 @@ enum TokenType: ushort
dstringLiteral, /// $(D_STRING "32-bit character string"d)
stringLiteral, /// $(D_STRING "an 8-bit string")
wstringLiteral, /// $(D_STRING "16-bit character string"w)
+invalid, /// Not a valid token type
}
// Implementation details follow
private:
// For now a private helper that is tailored to the way the lexer works.
// It hides away the forwardness of the range by buffering;
// the RA version is a straightforward thin wrapper.
// At the moment it is byte-oriented.
private struct LexSource(R)
if(isForwardRange!R && !isRandomAccessRange!R)
{
bool empty() const { return _empty; }
auto ref front() const
{
return accum[accumIdx];
}
auto ref peek() const
in
{
assert (accumIdx + 1 < accum.length);
}
body
{
return accum[accumIdx + 1];
}
void popFront()
{
++_index;
range.popFront();
// if that was the last byte,
// just advance so that the right-open slice still works
accumIdx = (accumIdx+1) & mask;
if(range.empty)
{
_empty = true;
return;
}
if(accumIdx == savedAccumIdx)
{
// and move stuff around
auto oldLen = accum.length;
auto toCopy = oldLen - accumIdx;
accum.length *= 2; // keep pow of 2
// copy starting with last item
copy(retro(accum[accumIdx..oldLen]),
retro(accum[$-toCopy..$]));
savedAccumIdx = accum.length - toCopy;
}
accum[accumIdx] = range.front;
}
auto save()
{
typeof(this) copy = this;
copy.range = range.save;
// sadly need to dup circular buffer, as it overwrites items
copy.accum = copy.accum.dup;
return copy;
}
// mark a position to slice from later on
size_t mark()
{
savedAccumIdx = accumIdx;
return accumIdx;
}
// slice to current position from previously marked position
auto slice() @property
{
// it's an open right range as usual
return CircularRange(accum, savedAccumIdx, accumIdx);
}
size_t index() const @property
{
return _index;
}
private:
this(R src, size_t bufferSize)
{
range = src;
assert(bufferSize > 0);
assert((bufferSize & (bufferSize-1)) == 0); //is power of 2
accum = new ubyte[bufferSize];
if(range.empty)
_empty = true;
else
accum[accumIdx] = range.front; // load front
}
// a true RA-range of ubyte
struct CircularRange
{
this(ubyte[] buf, size_t s, size_t e)
{
assert((buffer.length & (buffer.length-1)) == 0);
buffer = buf;
start = s;
end = e;
}
//Forward range primitives
@property bool empty() const { return start == end; }
@property auto ref front() const { return buffer[start]; }
void popFront() { start = (start + 1) & mask; }
@property auto save() { return this; }
//Backwards is a bit slower, but should be rarely used (if at all)
@property ref back(){ return buffer[(end-1) & mask]; }
void popBack() { end = (end - 1) & mask; }
// RA range primitives
ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; }
@property size_t length()
{
return end < start ? end + buffer.length -start : end - start;
}
alias length opDollar;
auto opSlice(size_t newStart, size_t newEnd)
{
size_t maskedStart = (start+newStart) & mask;
size_t maskedEnd = (start+newEnd) & mask;
return typeof(this)(buffer, maskedStart, maskedEnd);
}
// @@@bug fwd-ref in ldc0.10 (if placed above previous one)
auto opSlice(){ return opSlice(0, length); }
private:
@property auto mask(){ return buffer.length-1; }
size_t start, end;
ubyte[] buffer;
}
@property auto mask(){ return accum.length-1; }
R range;
bool _empty;
ubyte[] accum; // accumulator buffer for non-RA ranges
size_t savedAccumIdx;
size_t accumIdx; // current index in accumulator
size_t _index; // index of current element in original range
}
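Editorial aside, not part of this commit: the buffer above grows by doubling and wraps its indices with mask = length - 1, which is why the constructor asserts a power-of-two size. A minimal sketch of why the masking works:
unittest
{
    enum len = 8;              // any power of two, like the lexer's buffer sizes
    enum mask = len - 1;
    foreach (i; 0 .. 3 * len)
        assert((i & mask) == (i % len)); // masking equals modulo only for power-of-two lengths
}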
// TODO: make sure it's RandomAccess later
/*static assert(isRandomAccessRange!(
LexSource!(typeof(filter!"true"(cast(ubyte[])null)))
.CircularRange)
);*/
//trivial pass-through for RA ranges
private struct LexSource(R)
if(isRandomAccessRange!R)
{
bool empty() const @property { return cur >= range.length; }
bool canPeek() const { return cur + 1 < range.length; }
auto ref front() const @property { return range[cur]; }
void popFront(){ cur++; }
auto ref peek() const
in
{
assert (canPeek());
}
body
{
return range[cur + 1];
}
auto save()
{
typeof(this) copy = this;
copy.range = range.save;
return copy;
}
auto mark()
{
saved = cur;
}
// use the underlying range's slicing capability
auto slice() @property
{
return range[saved..cur];
}
size_t index() const @property
{
return cur;
}
private:
this(R src)
{
range = src;
}
size_t cur, saved;
R range;
}
auto lexerSource(Range)(Range range, size_t bufSize=8)
if(isForwardRange!Range && !isRandomAccessRange!Range
&& is(ElementType!Range : const(ubyte)))
{
return LexSource!(Range)(range, bufSize);
}
auto lexerSource(Range)(Range range)
if(isRandomAccessRange!Range
&& is(ElementType!Range : const(ubyte)))
{
return LexSource!(Range)(range);
}
unittest
{
// test the basic functionality of a "mark-slice" range
import std.string, std.stdio;
static void test_hello(T)(T lexs)
{
assert(lexs.front == 'H');
lexs.popFront();
assert(lexs.front == 'e');
foreach(i; 0..2)
{
auto saved = lexs.save;
lexs.mark();
assert(lexs.slice.equal(""));
lexs.popFront();
assert(lexs.slice.equal("e"), text(cast(char)lexs.front));
lexs.popFrontN(4);
auto bytes = lexs.slice.map!"cast(char)a".array();
assert(bytes.equal("ello,"), bytes.to!string);
lexs.mark();
assert(lexs.slice.equal(""));
assert(lexs.front == 'w');
lexs.popFrontN(6);
assert(lexs.empty);
auto s = lexs.slice();
auto msg = s.save.map!"cast(char)a".array;
assert(s[].equal("world!"), msg);
assert(s[2..$-1].equal("rld"), msg);
assert(s[0] == 'w' && s[$-1] == '!');
s.popFront();
assert(s.front == 'o' && s.back == '!');
s.popBack();
assert(s.front == 'o' && s.back == 'd');
//restore and repeat again
lexs = saved;
}
}
static void test_empty(T)(T lexs)
{
assert(lexs.empty);
lexs.mark();
assert(lexs.slice().equal(""));
}
auto fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 16 // and the one that is more then enough
);
test_hello(fwdLex);
fwdLex = lexerSource(
"Hello, world!"
.representation
.filter!"a != ' '", 1 // try the smallest initial buffer
);
test_hello(fwdLex);
fwdLex = lexerSource("".representation.filter!"a != ' '");
auto raLex = lexerSource("".representation);
test_empty(raLex);
test_empty(fwdLex);
raLex = lexerSource("Hello,world!".representation);
test_hello(raLex);
}
// uses auto-detection for pure, safe nothrow
bool isRangeEoF(R)(ref R range)
{
return range.empty || range.front == 0 || range.front == 0x1a;
}
-/*
-* Slices of the above string to save memory. This array is automatically
-* generated.
-*/
+// Lookup table for token values
immutable(string[TokenType.max + 1]) tokenValues = [
"=",
"@",
@@ -2617,7 +2606,6 @@ immutable(string[TokenType.max + 1]) tokenValues = [
"char",
"creal",
"dchar",
-"delegate",
"double",
"float",
"function",
@@ -2667,6 +2655,7 @@ immutable(string[TokenType.max + 1]) tokenValues = [
"continue",
"debug",
"default",
+"delegate",
"delete",
"do",
"else",
@@ -2742,7 +2731,6 @@ immutable(string[TokenType.max + 1]) tokenValues = [
null,
null,
null,
-null,
];
pure string getTokenValue(const TokenType type)
@@ -3396,5 +3384,4 @@ unittest
assert (tokenCount == 16);
}
//void main(string[] args){}
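Editorial aside, not part of this commit: a sketch of what the relabelled lookup table provides, assuming getTokenValue simply indexes tokenValues by the token type, so operators and keywords yield their spelling while literal and identifier types yield null.
unittest
{
    assert(getTokenValue(TokenType.plusEquals) == "+=");
    assert(getTokenValue(TokenType.default_) == "default");
    assert(getTokenValue(TokenType.stringLiteral) is null); // literals carry their own text
}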