diff --git a/std/d/lexer.d b/std/d/lexer.d index df9e4e9..5ecef6d 100644 --- a/std/d/lexer.d +++ b/std/d/lexer.d @@ -1,110 +1,110 @@ // Written in the D programming language /** -* This module contains a range-based _lexer for the D programming language. -* -* For performance reasons the _lexer contained in this module operates only on -* ASCII and UTF-8 encoded source code. If the use of other encodings is -* desired, the source code must be converted to UTF-8 before passing it to this -* _lexer. -* -* To use the _lexer, create a LexerConfig struct -* --- -* LexerConfig config; -* config.iterStyle = IterationStyle.everything; -* config.tokenStyle = IterationStyle.source; -* config.versionNumber = 2061; -* config.vendorString = "Lexer Example"; -* --- -* Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your -* source code, passing in the configuration. -* --- -* auto source = "import std.stdio;"c; -* auto tokens = byToken(source, config); -* --- -* The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can -* be used easily with the algorithms from std.algorithm or iterated over with -* $(D_KEYWORD foreach) -* --- -* assert (tokens.front.type == TokenType.import_); -* assert (tokens.front.value == "import"); -* assert (tokens.front.line == 1); -* assert (tokens.front.startIndex == 0); -* --- -* -* Examples: -* -* Generate HTML markup of D code. -* --- -* module highlighter; -* -* import std.stdio; -* import std.array; -* import std.d.lexer; -* -* void writeSpan(string cssClass, string value) -* { -* stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); -* } -* -* -* // http://ethanschoonover.com/solarized -* void highlight(R)(R tokens) -* { -* stdout.writeln(q"[ -* -* -* -* -* -* -*
]");
-*
-*     foreach (Token t; tokens)
-*     {
-*         if (isType(t.type))
-*             writeSpan("type", t.value);
-*         else if (isKeyword(t.type))
-*             writeSpan("kwrd", t.value);
-*         else if (t.type == TokenType.comment)
-*             writeSpan("com", t.value);
-*         else if (isStringLiteral(t.type))
-*             writeSpan("str", t.value);
-*         else if (isNumberLiteral(t.type))
-*             writeSpan("num", t.value);
-*         else if (isOperator(t.type))
-*             writeSpan("op", t.value);
-*         else
-*             stdout.write(t.value.replace("<", "&lt;"));
-*     }
-*     stdout.writeln("
\n"); -* } -* -* void main(string[] args) -* { -* LexerConfig config; -* config.tokenStyle = TokenStyle.source; -* config.iterStyle = IterationStyle.everything; -* config.fileName = args[1]; -* auto f = File(args[1]); -* (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); -* } -* --- -* -* Copyright: Brian Schott 2013 -* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) -* Authors: Brian Schott -* Source: $(PHOBOSSRC std/d/_lexer.d) -*/ + * This module contains a range-based _lexer for the D programming language. + * + * For performance reasons the _lexer contained in this module operates only on + * ASCII and UTF-8 encoded source code. If the use of other encodings is + * desired, the source code must be converted to UTF-8 before passing it to this + * _lexer. + * + * To use the _lexer, create a LexerConfig struct + * --- + * LexerConfig config; + * config.iterStyle = IterationStyle.everything; + * config.tokenStyle = IterationStyle.source; + * config.versionNumber = 2061; + * config.vendorString = "Lexer Example"; + * --- + * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your + * source code, passing in the configuration. + * --- + * auto source = "import std.stdio;"c; + * auto tokens = byToken(source, config); + * --- + * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can + * be used easily with the algorithms from std.algorithm or iterated over with + * $(D_KEYWORD foreach) + * --- + * assert (tokens.front.type == TokenType.import_); + * assert (tokens.front.value == "import"); + * assert (tokens.front.line == 1); + * assert (tokens.front.startIndex == 0); + * --- + * + * Examples: + * + * Generate HTML markup of D code. 
+ * --- + * module highlighter; + * + * import std.stdio; + * import std.array; + * import std.d.lexer; + * + * void writeSpan(string cssClass, string value) + * { + * stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); + * } + * + * + * // http://ethanschoonover.com/solarized + * void highlight(R)(R tokens) + * { + * stdout.writeln(q"[ + * + * + * + * + * + * + *
]");
+ *
+ *     foreach (Token t; tokens)
+ *     {
+ *         if (isType(t.type))
+ *             writeSpan("type", t.value);
+ *         else if (isKeyword(t.type))
+ *             writeSpan("kwrd", t.value);
+ *         else if (t.type == TokenType.comment)
+ *             writeSpan("com", t.value);
+ *         else if (isStringLiteral(t.type))
+ *             writeSpan("str", t.value);
+ *         else if (isNumberLiteral(t.type))
+ *             writeSpan("num", t.value);
+ *         else if (isOperator(t.type))
+ *             writeSpan("op", t.value);
+ *         else
+ *             stdout.write(t.value.replace("<", "&lt;"));
+ *     }
+ *     stdout.writeln("
\n"); + * } + * + * void main(string[] args) + * { + * LexerConfig config; + * config.tokenStyle = TokenStyle.source; + * config.iterStyle = IterationStyle.everything; + * config.fileName = args[1]; + * auto f = File(args[1]); + * (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); + * } + * --- + * + * Copyright: Brian Schott 2013 + * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) + * Authors: Brian Schott, Dmitry Olshansky + * Source: $(PHOBOSSRC std/d/_lexer.d) + */ module std.d.lexer; @@ -125,574 +125,287 @@ version (unittest) import std.stdio; public: /** -* Represents a D token -*/ + * Represents a D token + */ struct Token { - /** - * The token type. - */ - TokenType type; + /** + * The token type. + */ + TokenType type; - /** - * The representation of the token in the original source code. - */ - string value; + /** + * The representation of the token in the original source code. + */ + string value; - /** - * The number of the line the token is on. - */ - uint line; + /** + * The number of the line the token is on. + */ + uint line; - /** - * The column number of the start of the token in the original source. - * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) - */ - uint column; + /** + * The column number of the start of the token in the original source. + * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) + */ + uint column; - /** - * The index of the start of the token in the original source. - * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) - */ - size_t startIndex; + /** + * The index of the start of the token in the original source. + * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) + */ + size_t startIndex; - /** - * Check to see if the token is of the same type and has the same string - * representation as the given token. 
- */ - bool opEquals(ref const(Token) other) const - { - return other.type == type && other.value == value; - } + /** + * Check to see if the token is of the same type and has the same string + * representation as the given token. + */ + bool opEquals(ref const(Token) other) const + { + return other.type == type && other.value == value; + } - /** - * Checks to see if the token's string representation is equal to the given - * string. - */ - bool opEquals(string value) const { return this.value == value; } + /** + * Checks to see if the token's string representation is equal to the given + * string. + */ + bool opEquals(string value) const { return this.value == value; } - /** - * Checks to see if the token is of the given type. - */ - bool opEquals(TokenType type) const { return type == type; } + /** + * Checks to see if the token is of the given type. + */ + bool opEquals(TokenType type) const { return this.type == type; } - /** - * Comparison operator orders tokens by start index. - */ - int opCmp(ref const(Token) other) const - { - if (startIndex < other.startIndex) return -1; - if (startIndex > other.startIndex) return 1; - return 0; - } + /** + * Comparison operator orders tokens by start index. + */ + int opCmp(ref const(Token) other) const + { + if (startIndex < other.startIndex) return -1; + if (startIndex > other.startIndex) return 1; + return 0; + } } /** -* Configure the behavior of the byToken() function. These flags may be -* combined using a bitwise or. -*/ + * Configure the behavior of the byToken() function. These flags may be + * combined using a bitwise or. 
+ */ enum IterationStyle { - /// Only include code, not whitespace or comments - codeOnly = 0, - /// Includes comments - includeComments = 0b0001, - /// Includes whitespace - includeWhitespace = 0b0010, - /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens) - includeSpecialTokens = 0b0100, - /// Do not stop iteration on reaching the ___EOF__ token - ignoreEOF = 0b1000, - /// Include everything - everything = includeComments | includeWhitespace | ignoreEOF + /// Only include code, not whitespace or comments + codeOnly = 0, + /// Includes comments + includeComments = 0b0001, + /// Includes whitespace + includeWhitespace = 0b0010, + /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens) + includeSpecialTokens = 0b0100, + /// Do not stop iteration on reaching the ___EOF__ token + ignoreEOF = 0b1000, + /// Include _everything + everything = includeComments | includeWhitespace | ignoreEOF } /** -* Configuration of the token lexing style. These flags may be combined with a -* bitwise or. -*/ + * Configuration of the token lexing style. These flags may be combined with a + * bitwise or. + */ enum TokenStyle : uint { - /** - * Escape sequences will be replaced with their equivalent characters, - * enclosing quote characters will not be included. Special tokens such as - * __VENDOR__ will be replaced with their equivalent strings. Useful for - * creating a compiler or interpreter. - */ - default_ = 0b0000, + /** + * Escape sequences will be replaced with their equivalent characters, + * enclosing quote characters will not be included. Special tokens such as + * __VENDOR__ will be replaced with their equivalent strings. Useful for + * creating a compiler or interpreter. + */ + default_ = 0b0000, - /** - * Escape sequences will not be processed. An escaped quote character will - * not terminate string lexing, but it will not be replaced with the quote - * character in the token. 
- */ - notEscaped = 0b0001, + /** + * Escape sequences will not be processed. An escaped quote character will + * not terminate string lexing, but it will not be replaced with the quote + * character in the token. + */ + notEscaped = 0b0001, - /** - * Strings will include their opening and closing quote characters as well - * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will - * include the $(D_STRING 'w') character as well as the opening and closing - * quotes$(RPAREN) - */ - includeQuotes = 0b0010, + /** + * Strings will include their opening and closing quote characters as well + * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will + * include the $(D_STRING 'w') character as well as the opening and closing + * quotes$(RPAREN) + */ + includeQuotes = 0b0010, - /** - * Do not replace the value field of the special tokens such as ___DATE__ - * with their string equivalents. - */ - doNotReplaceSpecial = 0b0100, + /** + * Do not replace the value field of the special tokens such as ___DATE__ + * with their string equivalents. + */ + doNotReplaceSpecial = 0b0100, - /** - * Strings will be read exactly as they appeared in the source, including - * their opening and closing quote characters. Useful for syntax - * highlighting. - */ - source = notEscaped | includeQuotes | doNotReplaceSpecial + /** + * Strings will be read exactly as they appeared in the source, including + * their opening and closing quote characters. Useful for syntax + * highlighting. 
+ */ + source = notEscaped | includeQuotes | doNotReplaceSpecial } /** -* Lexer configuration -*/ + * Lexer configuration + */ struct LexerConfig { - /** - * Iteration style - */ - IterationStyle iterStyle = IterationStyle.codeOnly; + /** + * Iteration style + */ + IterationStyle iterStyle = IterationStyle.codeOnly; - /** - * Token style - */ - TokenStyle tokenStyle = tokenStyle.default_; + /** + * Token style + */ + TokenStyle tokenStyle = TokenStyle.default_; - /** - * Replacement for the ___VERSION__ token. Defaults to 1. - */ - uint versionNumber = 100; + /** + * Replacement for the ___VERSION__ token. Defaults to 100. + */ + uint versionNumber = 100; - /** - * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") - */ - string vendorString = "std.d.lexer"; + /** + * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") + */ + string vendorString = "std.d.lexer"; - /** - * Name used when creating error messages that are sent to errorFunc. This - * is needed because the lexer operates on any forwarad range of ASCII - * characters or UTF-8 code units and does not know what to call its input - * source. Defaults to the empty string. - */ - string fileName = ""; + /** + * Name used when creating error messages that are sent to errorFunc. This + * is needed because the lexer operates on any forward range of ASCII + * characters or UTF-8 code units and does not know what to call its input + * source. Defaults to the empty string. + */ + string fileName = ""; - /** - * This function is called when an error is encountered during lexing. - * Parameters are file name, code uint index, line number, column, - * and error messsage. - */ - void delegate(string, size_t, uint, uint, string) errorFunc; - - /** - * Initial size of the lexer's internal token buffer in bytes. The lexer - * will grow this buffer if necessary. 
- */ - size_t bufferSize = 1024 * 4; + /** + * This function is called when an error is encountered during lexing. + * Parameters are file name, code uint index, line number, column, + * and error messsage. + */ + void delegate(string, size_t, uint, uint, string) errorFunc; } /** -* Iterate over the given range of characters by D tokens. -* Params: -* range = the range of characters -* config = the lexer configuration -* bufferSize = initial size of internal circular buffer -* Returns: -* an input range of tokens -*/ + * Iterate over the given range of characters by D tokens. + * Params: + * range = the range of characters + * config = the lexer configuration + * bufferSize = initial size of internal circular buffer + * Returns: + * an input range of tokens + */ auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024) - if (isForwardRange!(R) && !isRandomAccessRange!(R) - && is(ElementType!R : const(ubyte))) + if (isForwardRange!(R) && !isRandomAccessRange!(R) + && is(ElementType!R : const(ubyte))) { - // 4K of circular buffer by default - auto r = TokenRange!(typeof(lexerSource(range))) - (lexerSource(range, bufferSize), config); - r.config = config; - r.lineNumber = 1; - r.popFront(); - return r; + // 4K of circular buffer by default + auto r = TokenRange!(typeof(lexerSource(range))) + (lexerSource(range, bufferSize), config); + r.config = config; + r.lineNumber = 1; + r.popFront(); + return r; } ///ditto auto byToken(R)(R range, LexerConfig config) - if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte))) + if (isRandomAccessRange!(R) && is(ElementType!R : const(ubyte))) { - auto r = TokenRange!(typeof(lexerSource(range))) - (lexerSource(range), config); - r.config = config; - r.lineNumber = 1; - r.popFront(); - return r; + auto r = TokenRange!(typeof(lexerSource(range))) + (lexerSource(range), config); + r.config = config; + r.lineNumber = 1; + r.popFront(); + return r; } -// For now a private helper that is tailored to the way lexer 
works -// hides away forwardness of range by buffering -// RA-version is strightforward thin wrapping -// ATM it is byte-oriented -private struct LexSource(R) - if(isForwardRange!R && !isRandomAccessRange!R) - { - bool empty() const { return _empty; } - - auto ref front() const - { - return accum[accumIdx]; - } - - auto ref peek() const - in - { - assert (accumIdx + 1 < accum.length); - } - body - { - return accum[accumIdx + 1]; - } - - void popFront() - { - ++_index; - range.popFront(); - // if that was last byte - // just advance so that open-righted slice just works - accumIdx = (accumIdx+1) & mask; - if(range.empty) - { - _empty = true; - return; - } - if(accumIdx == savedAccumIdx) - { - // and move stuff around - auto oldLen = accum.length; - auto toCopy = oldLen - accumIdx; - accum.length *= 2; // keep pow of 2 - // copy starting with last item - copy(retro(accum[accumIdx..oldLen]), - retro(accum[$-toCopy..$])); - savedAccumIdx = accum.length - toCopy; - } - accum[accumIdx] = range.front; - } - - auto save() - { - typeof(this) copy = this; - copy.range = range.save; - // sadly need to dup circular buffer, as it overwrites items - copy.accum = copy.accum.dup; - return copy; - } - - // mark a position to slice from later on - size_t mark() - { - savedAccumIdx = accumIdx; - return accumIdx; - } - - // slice to current position from previously marked position - auto slice() @property - { - // it's an open right range as usual - return CircularRange(accum, savedAccumIdx, accumIdx); - } - - size_t index() const @property - { - return _index; - } - -private: - this(R src, size_t bufferSize) - { - range = src; - assert(bufferSize > 0); - assert((bufferSize & (bufferSize-1)) == 0); //is power of 2 - accum = new ubyte[bufferSize]; - if(range.empty) - _empty = true; - else - accum[accumIdx] = range.front; // load front - } - - // a true RA-range of ubyte - struct CircularRange - { - this(ubyte[] buf, size_t s, size_t e) - { - assert((buffer.length & (buffer.length-1)) 
== 0); - buffer = buf; - start = s; - end = e; - } - //Forward range primitives - @property bool empty() const { return start == end; } - @property auto ref front() const { return buffer[start]; } - void popFront() { start = (start + 1) & mask; } - @property auto save() { return this; } - - //Backwards is a bit slower, but should be rarely used (if at all) - @property ref back(){ return buffer[(end-1) & mask]; } - void popBack() { end = (end - 1) & mask; } - - // RA range primitives - ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; } - @property size_t length() - { - return end < start ? end + buffer.length -start : end - start; - } - alias length opDollar; - - auto opSlice(size_t newStart, size_t newEnd) - { - size_t maskedStart = (start+newStart) & mask; - size_t maskedEnd = (start+newEnd) & mask; - return typeof(this)(buffer, maskedStart, maskedEnd); - } - // @@@bug fwd-ref in ldc0.10 (if placed above previous one) - auto opSlice(){ return opSlice(0, length); } - private: - @property auto mask(){ return buffer.length-1; } - size_t start, end; - ubyte[] buffer; - } - - @property auto mask(){ return accum.length-1; } - - R range; - bool _empty; - ubyte[] accum; // accumulator buffer for non-RA ranges - size_t savedAccumIdx; - size_t accumIdx; // current index in accumulator - size_t _index; // index of current element in original range -} - -// TODO: make sure it's RandomAccess later -/*static assert(isRandomAccessRange!( - LexSource!(typeof(filter!"true"(cast(ubyte[])null))) - .CircularRange) -);*/ - -//trivial pass-through for RA ranges -private struct LexSource(R) - if(isRandomAccessRange!R) -{ - bool empty() const @property { return cur >= range.length; } - bool canPeek() const { return cur + 1 < range.length; } - auto ref front() const @property { return range[cur]; } - void popFront(){ cur++; } - - auto ref peek() const - in - { - assert (canPeek()); - } - body - { - return range[cur + 1]; - } - - auto save() - { - typeof(this) copy = this; - 
copy.range = range.save; - return copy; - } - - auto mark() - { - saved = cur; - } - - // use the underliying range slicing capability - auto slice() @property - { - return range[saved..cur]; - } - - size_t index() const @property - { - return cur; - } - -private: - this(R src) - { - range = src; - } - size_t cur, saved; - R range; -} - -auto lexerSource(Range)(Range range, size_t bufSize=8) - if(isForwardRange!Range && !isRandomAccessRange!Range - && is(ElementType!Range : const(ubyte))) -{ - return LexSource!(Range)(range, bufSize); -} - -auto lexerSource(Range)(Range range) - if(isRandomAccessRange!Range - && is(ElementType!Range : const(ubyte))) -{ - return LexSource!(Range)(range); -} - -unittest -{ - // test the basic functionality of a "mark-slice" range - import std.string, std.stdio; - - static void test_hello(T)(T lexs) - { - assert(lexs.front == 'H'); - lexs.popFront(); - assert(lexs.front == 'e'); - foreach(i; 0..2) - { - auto saved = lexs.save; - lexs.mark(); - assert(lexs.slice.equal("")); - lexs.popFront(); - assert(lexs.slice.equal("e"), text(cast(char)lexs.front)); - lexs.popFrontN(4); - auto bytes = lexs.slice.map!"cast(char)a".array(); - assert(bytes.equal("ello,"), bytes.to!string); - lexs.mark(); - assert(lexs.slice.equal("")); - assert(lexs.front == 'w'); - lexs.popFrontN(6); - assert(lexs.empty); - auto s = lexs.slice(); - auto msg = s.save.map!"cast(char)a".array; - assert(s[].equal("world!"), msg); - assert(s[2..$-1].equal("rld"), msg); - assert(s[0] == 'w' && s[$-1] == '!'); - s.popFront(); - assert(s.front == 'o' && s.back == '!'); - s.popBack(); - assert(s.front == 'o' && s.back == 'd'); - //restore and repeat again - lexs = saved; - } - } - - static void test_empty(T)(T lexs) - { - assert(lexs.empty); - lexs.mark(); - assert(lexs.slice().equal("")); - } - - auto fwdLex = lexerSource( - "Hello, world!" 
- .representation - .filter!"a != ' '", 16 // and the one that is more then enough - ); - test_hello(fwdLex); - fwdLex = lexerSource( - "Hello, world!" - .representation - .filter!"a != ' '", 1 // try the smallest initial buffer - ); - test_hello(fwdLex); - fwdLex = lexerSource("".representation.filter!"a != ' '"); - auto raLex = lexerSource("".representation); - test_empty(raLex); - test_empty(fwdLex); - raLex = lexerSource("Hello,world!".representation); - test_hello(raLex); -} - - /** -* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. -*/ + * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. + */ struct TokenRange(LexSrc) - //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource + //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource { - /** - * Returns: true if the range is empty - */ - bool empty() const @property - { - return _empty; - } + /** + * Returns: true if the range is empty + */ + bool empty() const @property + { + return _empty; + } - /** - * Returns: the current token - */ - ref const(Token) front() const @property - { - assert(!empty, "trying to get front of an empty token range"); - return current; - } + /** + * Returns: the current token + */ + ref const(Token) front() const @property + { + assert(!empty, "trying to get front of an empty token range"); + return current; + } - /** - * Returns the current token and then removes it from the range - */ - Token moveFront() - { - auto r = move(current); - popFront(); - return r; - } + /** + * Returns the current token and then removes it from the range + */ + Token moveFront() + { + auto r = move(current); + popFront(); + return r; + } - /** - * Foreach operation - */ - int opApply(int delegate(Token) dg) - { - int result = 0; - while (!empty) - { - result = dg(front); - if (result) - break; - popFront(); - } - return result; - } + /** + * Foreach operation + */ + int opApply(int delegate(Token) dg) + { + int result = 0; + while (!empty) + { + result = 
dg(front); + if (result) + break; + popFront(); + } + return result; + } - /** - * Foreach operation - */ - int opApply(int delegate(size_t, Token) dg) - { - int result = 0; - int i = 0; - while (!empty) - { - result = dg(i, front); - if (result) - break; - popFront(); - } - return result; - } + /** + * Foreach operation + */ + int opApply(int delegate(size_t, Token) dg) + { + int result = 0; + size_t i = 0; + while (!empty) + { + result = dg(i++, front); + if (result) + break; + popFront(); + } + return result; + } - /** - * Removes the current token from the range - */ - void popFront() - { + /** + * Removes the current token from the range + */ + void popFront() + { advance(); - } + } private: - /* - * Advances the range to the next token - */ - void advance() - { + /* + * Advances the range to the next token + */ + void advance() + { L_advance: if (src.empty) { @@ -700,185 +413,185 @@ L_advance: return; } src.mark(); // mark a start of a lexing "frame" - current.line = lineNumber; - current.startIndex = src.index; - current.column = column; - current.value = null; - switch (src.front) - { + current.line = lineNumber; + current.startIndex = src.index; + current.column = column; + current.value = null; + switch (src.front) + { // handle sentenels for end of input - case 0: + case 0: case 0x1a: - // TODO: check config flags, it's cheap - // since this branch at most is taken once per file + // TODO: check config flags, it's cheap + // since this branch at most is taken once per file _empty = true; - return; + return; // pragma(msg, generateCaseTrie( - mixin(generateCaseTrie( - "=", "TokenType.assign", - "@", "TokenType.at", - "&", "TokenType.bitAnd", - "&=", "TokenType.bitAndEquals", - "|", "TokenType.bitOr", - "|=", "TokenType.bitOrEquals", - "~=", "TokenType.catEquals", - ":", "TokenType.colon", - ",", "TokenType.comma", - "--", "TokenType.decrement", - "$", "TokenType.dollar", - "==", "TokenType.equals", - "=>", "TokenType.goesTo", - ">", "TokenType.greater", - 
">=", "TokenType.greaterEqual", - "++", "TokenType.increment", - "{", "TokenType.lBrace", - "[", "TokenType.lBracket", - "<", "TokenType.less", - "<=", "TokenType.lessEqual", - "<>=", "TokenType.lessEqualGreater", - "<>", "TokenType.lessOrGreater", - "&&", "TokenType.logicAnd", - "||", "TokenType.logicOr", - "(", "TokenType.lParen", - "-", "TokenType.minus", - "-=", "TokenType.minusEquals", - "%", "TokenType.mod", - "%=", "TokenType.modEquals", - "*=", "TokenType.mulEquals", - "!", "TokenType.not", - "!=", "TokenType.notEquals", - "!>", "TokenType.notGreater", - "!>=", "TokenType.notGreaterEqual", - "!<", "TokenType.notLess", - "!<=", "TokenType.notLessEqual", - "!<>", "TokenType.notLessEqualGreater", - "+", "TokenType.plus", - "+=", "TokenType.plusEquals", - "^^", "TokenType.pow", - "^^=", "TokenType.powEquals", - "}", "TokenType.rBrace", - "]", "TokenType.rBracket", - ")", "TokenType.rParen", - ";", "TokenType.semicolon", - "<<", "TokenType.shiftLeft", - "<<=", "TokenType.shiftLeftEqual", - ">>", "TokenType.shiftRight", - ">>=", "TokenType.shiftRightEqual", - "*", "TokenType.star", - "?", "TokenType.ternary", - "~", "TokenType.tilde", - "!<>=", "TokenType.unordered", - ">>>", "TokenType.unsignedShiftRight", - ">>>=", "TokenType.unsignedShiftRightEqual", - "^", "TokenType.xor", - "^=", "TokenType.xorEquals", - )); - case '/': - nextCharNonLF(); - if (isEoF()) - { - current.type = TokenType.div; - current.value = "/"; - return; - } - switch (src.front) - { - case '/': - case '*': - case '+': - if (config.iterStyle & IterationStyle.includeComments) - return lexComment!true(); + mixin(generateCaseTrie( + "=", "TokenType.assign", + "@", "TokenType.at", + "&", "TokenType.bitAnd", + "&=", "TokenType.bitAndEquals", + "|", "TokenType.bitOr", + "|=", "TokenType.bitOrEquals", + "~=", "TokenType.catEquals", + ":", "TokenType.colon", + ",", "TokenType.comma", + "--", "TokenType.decrement", + "$", "TokenType.dollar", + "==", "TokenType.equals", + "=>", "TokenType.goesTo", + 
">", "TokenType.greater", + ">=", "TokenType.greaterEqual", + "++", "TokenType.increment", + "{", "TokenType.lBrace", + "[", "TokenType.lBracket", + "<", "TokenType.less", + "<=", "TokenType.lessEqual", + "<>=", "TokenType.lessEqualGreater", + "<>", "TokenType.lessOrGreater", + "&&", "TokenType.logicAnd", + "||", "TokenType.logicOr", + "(", "TokenType.lParen", + "-", "TokenType.minus", + "-=", "TokenType.minusEquals", + "%", "TokenType.mod", + "%=", "TokenType.modEquals", + "*=", "TokenType.mulEquals", + "!", "TokenType.not", + "!=", "TokenType.notEquals", + "!>", "TokenType.notGreater", + "!>=", "TokenType.notGreaterEqual", + "!<", "TokenType.notLess", + "!<=", "TokenType.notLessEqual", + "!<>", "TokenType.notLessEqualGreater", + "+", "TokenType.plus", + "+=", "TokenType.plusEquals", + "^^", "TokenType.pow", + "^^=", "TokenType.powEquals", + "}", "TokenType.rBrace", + "]", "TokenType.rBracket", + ")", "TokenType.rParen", + ";", "TokenType.semicolon", + "<<", "TokenType.shiftLeft", + "<<=", "TokenType.shiftLeftEqual", + ">>", "TokenType.shiftRight", + ">>=", "TokenType.shiftRightEqual", + "*", "TokenType.star", + "?", "TokenType.ternary", + "~", "TokenType.tilde", + "!<>=", "TokenType.unordered", + ">>>", "TokenType.unsignedShiftRight", + ">>>=", "TokenType.unsignedShiftRightEqual", + "^", "TokenType.xor", + "^=", "TokenType.xorEquals", + )); + case '/': + nextCharNonLF(); + if (isEoF()) + { + current.type = TokenType.div; + current.value = "/"; + return; + } + switch (src.front) + { + case '/': + case '*': + case '+': + if (config.iterStyle & IterationStyle.includeComments) + return lexComment!true(); lexComment!false(); goto L_advance; // tail-recursion - - case '=': - current.type = TokenType.divEquals; - current.value = "/="; - src.popFront(); - return; - default: - current.type = TokenType.div; - current.value = "/"; - return; - } - case '.': - if (!src.canPeek()) - { - current.type = TokenType.dot; - current.value = getTokenValue(TokenType.dot); - return; - } 
- switch (src.peek()) - { - case '0': .. case '9': - lexNumber(); - return; - case '.': - nextCharNonLF(); - nextCharNonLF(); - current.type = TokenType.slice; - if (src.front == '.') - { - current.type = TokenType.vararg; - nextCharNonLF(); - } - current.value = getTokenValue(current.type); - return; - default: - nextCharNonLF(); - current.type = TokenType.dot; - current.value = getTokenValue(TokenType.dot); - return; - } - case '0': .. case '9': - lexNumber(); - return; - case '\'': - lexCharacterLiteral(); - return; - case '"': - case '`': - lexString(); - return; - case 'q': - nextCharNonLF(); - if (isEoF()) - goto default; - switch (src.front) - { - case '{': - lexTokenString(); - return; - case '"': - lexDelimitedString(); - return; - default: - break; - } - goto default; - case 'r': - nextCharNonLF(); - if (isEoF()) - goto default; - else if (src.front == '"') - { - lexString(); - return; - } - else - goto default; - case 'x': - nextCharNonLF(); - if (isEoF()) - goto default; - else if (src.front == '"') - { - lexHexString(); - return; - } - else - goto default; - case '#': + + case '=': + current.type = TokenType.divEquals; + current.value = "/="; + src.popFront(); + return; + default: + current.type = TokenType.div; + current.value = "/"; + return; + } + case '.': + if (!src.canPeek()) + { + current.type = TokenType.dot; + current.value = getTokenValue(TokenType.dot); + return; + } + switch (src.peek()) + { + case '0': .. case '9': + lexNumber(); + return; + case '.': + nextCharNonLF(); + nextCharNonLF(); + current.type = TokenType.slice; + if (src.front == '.') + { + current.type = TokenType.vararg; + nextCharNonLF(); + } + current.value = getTokenValue(current.type); + return; + default: + nextCharNonLF(); + current.type = TokenType.dot; + current.value = getTokenValue(TokenType.dot); + return; + } + case '0': .. 
case '9': + lexNumber(); + return; + case '\'': + lexCharacterLiteral(); + return; + case '"': + case '`': + lexString(); + return; + case 'q': + nextCharNonLF(); + if (isEoF()) + goto default; + switch (src.front) + { + case '{': + lexTokenString(); + return; + case '"': + lexDelimitedString(); + return; + default: + break; + } + goto default; + case 'r': + nextCharNonLF(); + if (isEoF()) + goto default; + else if (src.front == '"') + { + lexString(); + return; + } + else + goto default; + case 'x': + nextCharNonLF(); + if (isEoF()) + goto default; + else if (src.front == '"') + { + lexHexString(); + return; + } + else + goto default; + case '#': lexSpecialTokenSequence(); if(config.iterStyle & IterationStyle.includeSpecialTokens) return; @@ -889,1189 +602,1189 @@ L_advance: if (config.iterStyle & IterationStyle.includeWhitespace) return lexWhitespace!true(); lexWhitespace!false(); - goto L_advance; // tail-recursion - default: + goto L_advance; // tail-recursion + default: if ((src.front & 0x80) && isLongWhite()) - { + { if (config.iterStyle & IterationStyle.includeWhitespace) return lexWhitespace!true(); lexWhitespace!false(); goto L_advance; // tail-recursion } - for(;;) - { + for(;;) + { if(isSeparating()) break; - nextCharNonLF(); + nextCharNonLF(); if(isEoF()) break; - } + } - current.type = lookupTokenType(src.slice); - current.value = getTokenValue(current.type); - if (current.value is null) - setTokenValue(); + current.type = lookupTokenType(src.slice); + current.value = getTokenValue(current.type); + if (current.value is null) + setTokenValue(); - if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof) - { - _empty = true; - return; - } + if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof) + { + _empty = true; + return; + } - if (config.iterStyle & TokenStyle.doNotReplaceSpecial) - return; + if (config.iterStyle & TokenStyle.doNotReplaceSpecial) + return; expandSpecialToken(); - } - } + } + } 
- // TODO: LexSource could be improved for forward ranges - // to avoid buffering at all (by disabling it for a moment) - // so keep the 'keep' parameter here and elsewhere - void lexWhitespace(bool keep)() - { - current.type = TokenType.whitespace; - do - { - nextChar(); - }while (!isEoF() && isWhite()); - static if (keep) setTokenValue(); - } + // TODO: LexSource could be improved for forward ranges + // to avoid buffering at all (by disabling it for a moment) + // so keep the 'keep' parameter here and elsewhere + void lexWhitespace(bool keep)() + { + current.type = TokenType.whitespace; + do + { + nextChar(); + } while (!isEoF() && isWhite()); + static if (keep) setTokenValue(); + } - void lexComment(bool keep)() - in - { - assert (src.front == '/' || src.front == '*' || src.front == '+'); - } - body - { - current.type = TokenType.comment; - switch(src.front) - { - case '/': - while (!isEoF() && !isNewline(src.front)) - { - nextCharNonLF(); - } - break; - case '*': - while (!isEoF()) - { - if (src.front == '*') - { - static if (keep) nextCharNonLF(); - else src.popFront(); - if (src.front == '/') - { - nextCharNonLF(); - break; - } - } - else - nextChar(); - } - break; - case '+': - int depth = 1; - while (depth > 0 && !isEoF()) - { - if (src.front == '+') - { - nextCharNonLF(); - if (src.front == '/') - { - nextCharNonLF(); - --depth; - } - } - else if (src.front == '/') - { - nextCharNonLF(); - if (src.front == '+') - { - nextCharNonLF(); - ++depth; - } - } - else - nextChar(); - } - break; - default: - assert(false); - } - static if (keep) - setTokenValue(); - } + void lexComment(bool keep)() + in + { + assert (src.front == '/' || src.front == '*' || src.front == '+'); + } + body + { + current.type = TokenType.comment; + switch(src.front) + { + case '/': + while (!isEoF() && !isNewline(src.front)) + { + nextCharNonLF(); + } + break; + case '*': + while (!isEoF()) + { + if (src.front == '*') + { + static if (keep) nextCharNonLF(); + else src.popFront(); + if 
(src.front == '/') + { + nextCharNonLF(); + break; + } + } + else + nextChar(); + } + break; + case '+': + int depth = 1; + while (depth > 0 && !isEoF()) + { + if (src.front == '+') + { + nextCharNonLF(); + if (src.front == '/') + { + nextCharNonLF(); + --depth; + } + } + else if (src.front == '/') + { + nextCharNonLF(); + if (src.front == '+') + { + nextCharNonLF(); + ++depth; + } + } + else + nextChar(); + } + break; + default: + assert(false); + } + static if (keep) + setTokenValue(); + } - void lexHexString() - in - { - assert (src.front == '"'); - } - body - { - current.type = TokenType.stringLiteral; - nextChar(); - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated hex string literal"); - return; - } - else if (isHexDigit(src.front)) - { - nextCharNonLF(); - } - else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped)) - { - nextChar(); - } - else if (src.front == '"') - { - nextCharNonLF(); - break; - } - else - { - errorMessage(format("Invalid character '%s' in hex string literal", - cast(char) src.front)); - return; - } - } - bool hasSuffix = lexStringSuffix(); - if (config.tokenStyle & TokenStyle.notEscaped) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(2, hasSuffix ? -2 : -1); - } - else - { - // TODO: appender is an allocation happy fat pig - // remove it later - auto a = appender!(char[])(); - foreach (b; std.range.chunks(src.slice[2 .. 
$ - 1], 2)) - { - auto s = cast(char[])b; - ubyte ch = cast(ubyte)parse!uint(s, 16); - a.put(ch); - } - // can safely assume ownership of data - current.value = cast(string)a.data; - } - } + void lexHexString() + in + { + assert (src.front == '"'); + } + body + { + current.type = TokenType.stringLiteral; + nextChar(); + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated hex string literal"); + return; + } + else if (isHexDigit(src.front)) + { + nextCharNonLF(); + } + else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped)) + { + nextChar(); + } + else if (src.front == '"') + { + nextCharNonLF(); + break; + } + else + { + errorMessage(format("Invalid character '%s' in hex string literal", + cast(char) src.front)); + return; + } + } + bool hasSuffix = lexStringSuffix(); + if (config.tokenStyle & TokenStyle.notEscaped) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(2, hasSuffix ? -2 : -1); + } + else + { + // TODO: appender is an allocation happy fat pig + // remove it later + auto a = appender!(char[])(); + foreach (b; std.range.chunks(src.slice[2 .. 
$ - 1], 2)) + { + auto s = cast(char[])b; + ubyte ch = cast(ubyte)parse!uint(s, 16); + a.put(ch); + } + // can safely assume ownership of data + current.value = cast(string)a.data; + } + } - void lexNumber() - in - { - assert(isDigit(src.front) || src.front == '.'); - } - body - { - if (src.front != '0') - { - lexDecimal(); - return; - } - else - { - switch (src.peek()) - { - case 'x': - case 'X': - nextCharNonLF(); - nextCharNonLF(); - lexHex(); - break; - case 'b': - case 'B': - nextCharNonLF(); - nextCharNonLF(); - lexBinary(); - break; - default: - lexDecimal(); - break; - } - } - } + void lexNumber() + in + { + assert(isDigit(src.front) || src.front == '.'); + } + body + { + if (src.front != '0') + { + lexDecimal(); + return; + } + else + { + switch (src.peek()) + { + case 'x': + case 'X': + nextCharNonLF(); + nextCharNonLF(); + lexHex(); + break; + case 'b': + case 'B': + nextCharNonLF(); + nextCharNonLF(); + lexBinary(); + break; + default: + lexDecimal(); + break; + } + } + } - void lexFloatSuffix() - { - switch (src.front) - { - case 'L': - nextCharNonLF(); - current.type = TokenType.doubleLiteral; - break; - case 'f': - case 'F': - nextCharNonLF(); - current.type = TokenType.floatLiteral; - break; - default: - break; - } - if (!isEoF() && src.front == 'i') - { - nextCharNonLF(); - if (current.type == TokenType.floatLiteral) - current.type = TokenType.ifloatLiteral; - else - current.type = TokenType.idoubleLiteral; - } - } + void lexFloatSuffix() + { + switch (src.front) + { + case 'L': + nextCharNonLF(); + current.type = TokenType.doubleLiteral; + break; + case 'f': + case 'F': + nextCharNonLF(); + current.type = TokenType.floatLiteral; + break; + default: + break; + } + if (!isEoF() && src.front == 'i') + { + nextCharNonLF(); + if (current.type == TokenType.floatLiteral) + current.type = TokenType.ifloatLiteral; + else + current.type = TokenType.idoubleLiteral; + } + } - void lexIntSuffix() - { - bool foundU; - bool foundL; - while (!isEoF()) - { - 
switch (src.front) - { - case 'u': - case 'U': - if (foundU) - return; - switch (current.type) - { - case TokenType.intLiteral: - current.type = TokenType.uintLiteral; - nextCharNonLF(); - break; - case TokenType.longLiteral: - current.type = TokenType.ulongLiteral; - nextCharNonLF(); - break; - default: - assert (false); - } - foundU = true; - break; - case 'L': - if (foundL) - return; - switch (current.type) - { - case TokenType.intLiteral: - current.type = TokenType.longLiteral; - nextCharNonLF(); - break; - case TokenType.uintLiteral: - current.type = TokenType.ulongLiteral; - nextCharNonLF(); - break; - default: - assert (false); - } - foundL = true; - break; - default: - return; - } - } - } + void lexIntSuffix() + { + bool foundU; + bool foundL; + while (!isEoF()) + { + switch (src.front) + { + case 'u': + case 'U': + if (foundU) + return; + switch (current.type) + { + case TokenType.intLiteral: + current.type = TokenType.uintLiteral; + nextCharNonLF(); + break; + case TokenType.longLiteral: + current.type = TokenType.ulongLiteral; + nextCharNonLF(); + break; + default: + assert (false); + } + foundU = true; + break; + case 'L': + if (foundL) + return; + switch (current.type) + { + case TokenType.intLiteral: + current.type = TokenType.longLiteral; + nextCharNonLF(); + break; + case TokenType.uintLiteral: + current.type = TokenType.ulongLiteral; + nextCharNonLF(); + break; + default: + assert (false); + } + foundL = true; + break; + default: + return; + } + } + } - void lexExponent() - in - { - assert (src.front == 'e' || src.front == 'E' || src.front == 'p' - || src.front == 'P'); - } - body - { - nextCharNonLF(); - bool foundSign = false; - bool foundDigit = false; - while (!isEoF()) - { - switch (src.front) - { - case '-': - case '+': - if (foundSign) - { - if (!foundDigit) - errorMessage("Expected an exponent"); - return; - } - foundSign = true; - nextCharNonLF(); - break; - case '0': .. 
case '9': - case '_': - foundDigit = true; - nextCharNonLF(); - break; - case 'L': - case 'f': - case 'F': - case 'i': - lexFloatSuffix(); - return; - default: - if (!foundDigit) - errorMessage("Expected an exponent"); - return; - } - } - } + void lexExponent() + in + { + assert (src.front == 'e' || src.front == 'E' || src.front == 'p' + || src.front == 'P'); + } + body + { + nextCharNonLF(); + bool foundSign = false; + bool foundDigit = false; + while (!isEoF()) + { + switch (src.front) + { + case '-': + case '+': + if (foundSign) + { + if (!foundDigit) + errorMessage("Expected an exponent"); + return; + } + foundSign = true; + nextCharNonLF(); + break; + case '0': .. case '9': + case '_': + foundDigit = true; + nextCharNonLF(); + break; + case 'L': + case 'f': + case 'F': + case 'i': + lexFloatSuffix(); + return; + default: + if (!foundDigit) + errorMessage("Expected an exponent"); + return; + } + } + } - void lexDecimal() - in - { - assert (isDigit(src.front) || src.front == '.'); - } - body - { - bool foundDot = src.front == '.'; - if (foundDot) - nextCharNonLF(); - current.type = TokenType.intLiteral; - decimalLoop: while (!isEoF()) - { - switch (src.front) - { - case '0': .. 
case '9': - case '_': - nextCharNonLF(); - break; - case 'u': - case 'U': - if (!foundDot) - lexIntSuffix(); - break decimalLoop; - case 'i': - lexFloatSuffix(); - break decimalLoop; - case 'L': - if (foundDot) - lexFloatSuffix(); - else - lexIntSuffix(); - break decimalLoop; - case 'f': - case 'F': - lexFloatSuffix(); - break decimalLoop; - case 'e': - case 'E': - lexExponent(); - break decimalLoop; - case '.': - if (foundDot) - break decimalLoop; - if (src.canPeek() && src.peek() == '.') - break decimalLoop; - nextCharNonLF(); - foundDot = true; - current.type = TokenType.doubleLiteral; - break; - default: - break decimalLoop; - } - } - setTokenValue(); - } + void lexDecimal() + in + { + assert (isDigit(src.front) || src.front == '.'); + } + body + { + bool foundDot = src.front == '.'; + if (foundDot) + nextCharNonLF(); + current.type = TokenType.intLiteral; + decimalLoop: while (!isEoF()) + { + switch (src.front) + { + case '0': .. case '9': + case '_': + nextCharNonLF(); + break; + case 'u': + case 'U': + if (!foundDot) + lexIntSuffix(); + break decimalLoop; + case 'i': + lexFloatSuffix(); + break decimalLoop; + case 'L': + if (foundDot) + lexFloatSuffix(); + else + lexIntSuffix(); + break decimalLoop; + case 'f': + case 'F': + lexFloatSuffix(); + break decimalLoop; + case 'e': + case 'E': + lexExponent(); + break decimalLoop; + case '.': + if (foundDot) + break decimalLoop; + if (src.canPeek() && src.peek() == '.') + break decimalLoop; + nextCharNonLF(); + foundDot = true; + current.type = TokenType.doubleLiteral; + break; + default: + break decimalLoop; + } + } + setTokenValue(); + } - void lexBinary() - { - current.type = TokenType.intLiteral; - binaryLoop: while (!isEoF()) - { - switch (src.front) - { - case '0': - case '1': - case '_': - nextCharNonLF(); - break; - case 'u': - case 'U': - case 'L': - lexIntSuffix(); - break binaryLoop; - default: - break binaryLoop; - } - } - setTokenValue(); - } + void lexBinary() + { + current.type = 
TokenType.intLiteral; + binaryLoop: while (!isEoF()) + { + switch (src.front) + { + case '0': + case '1': + case '_': + nextCharNonLF(); + break; + case 'u': + case 'U': + case 'L': + lexIntSuffix(); + break binaryLoop; + default: + break binaryLoop; + } + } + setTokenValue(); + } - void lexHex() - { - current.type = TokenType.intLiteral; - bool foundDot; - hexLoop: while (!isEoF()) - { - switch (src.front) - { - case 'a': .. case 'f': - case 'A': .. case 'F': - case '0': .. case '9': - case '_': - nextCharNonLF(); - break; - case 'u': - case 'U': - lexIntSuffix(); - break hexLoop; - case 'i': - if (foundDot) - lexFloatSuffix(); - break hexLoop; - case 'L': - if (foundDot) - { - lexFloatSuffix(); - break hexLoop; - } - else - { - lexIntSuffix(); - break hexLoop; - } - case 'p': - case 'P': - lexExponent(); - break hexLoop; - case '.': - if (foundDot) - break hexLoop; - if (src.canPeek() && src.peek() == '.') - break hexLoop; - nextCharNonLF(); - foundDot = true; - current.type = TokenType.doubleLiteral; - break; - default: - break hexLoop; - } - } - setTokenValue(); - } + void lexHex() + { + current.type = TokenType.intLiteral; + bool foundDot; + hexLoop: while (!isEoF()) + { + switch (src.front) + { + case 'a': .. case 'f': + case 'A': .. case 'F': + case '0': .. 
case '9': + case '_': + nextCharNonLF(); + break; + case 'u': + case 'U': + lexIntSuffix(); + break hexLoop; + case 'i': + if (foundDot) + lexFloatSuffix(); + break hexLoop; + case 'L': + if (foundDot) + { + lexFloatSuffix(); + break hexLoop; + } + else + { + lexIntSuffix(); + break hexLoop; + } + case 'p': + case 'P': + lexExponent(); + break hexLoop; + case '.': + if (foundDot) + break hexLoop; + if (src.canPeek() && src.peek() == '.') + break hexLoop; + nextCharNonLF(); + foundDot = true; + current.type = TokenType.doubleLiteral; + break; + default: + break hexLoop; + } + } + setTokenValue(); + } - bool lexStringSuffix() - { - current.type = TokenType.stringLiteral; - bool foundSuffix = false; - if (!isEoF()) - { - switch (src.front) - { - case 'w': - current.type = TokenType.wstringLiteral; - goto case 'c'; - case 'd': - current.type = TokenType.dstringLiteral; - goto case 'c'; - case 'c': - foundSuffix = true; - nextCharNonLF(); - break; - default: - break; - } - } - return foundSuffix; - } + bool lexStringSuffix() + { + current.type = TokenType.stringLiteral; + bool foundSuffix = false; + if (!isEoF()) + { + switch (src.front) + { + case 'w': + current.type = TokenType.wstringLiteral; + goto case 'c'; + case 'd': + current.type = TokenType.dstringLiteral; + goto case 'c'; + case 'c': + foundSuffix = true; + nextCharNonLF(); + break; + default: + break; + } + } + return foundSuffix; + } - void lexCharacterLiteral() - in - { - assert (src.front == '\''); - } - body - { - current.type = TokenType.characterLiteral; - nextChar(); - if (isEoF()) - { - errorMessage("Unterminated character literal"); - return; - } - switch (src.front) - { - case '\'': - break; - case '\\': - if (config.tokenStyle & TokenStyle.notEscaped) - skipEscapeSequence(); - else - { - // the only special path - // 40 bytes is enough for 2 quotes - // and the longest character entity - ubyte[40] utf8; - size_t len; - if (config.tokenStyle & TokenStyle.includeQuotes) - { - utf8[0] = '\''; - len = 
decodeEscapeSequence(utf8[1..$]); - utf8[len++] = '\''; - } - else - len = decodeEscapeSequence(utf8[]); - if (src.front != '\'') - { - errorMessage("Expected \"'\" to end character literal"); - } - // skip over last "'" - nextChar(); - setTokenValue(utf8[0..len]); - return; - } - break; - default: - if (src.front & 0x80) - { - while (src.front & 0x80) - nextChar(); - break; - } - else - { - nextChar(); - break; - } - } - if (src.front != '\'') - errorMessage("Expected \"'\" to end character literal"); - nextChar(); - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(1, -1); - } + void lexCharacterLiteral() + in + { + assert (src.front == '\''); + } + body + { + current.type = TokenType.characterLiteral; + nextChar(); + if (isEoF()) + { + errorMessage("Unterminated character literal"); + return; + } + switch (src.front) + { + case '\'': + break; + case '\\': + if (config.tokenStyle & TokenStyle.notEscaped) + skipEscapeSequence(); + else + { + // the only special path + // 40 bytes is enough for 2 quotes + // and the longest character entity + ubyte[40] utf8; + size_t len; + if (config.tokenStyle & TokenStyle.includeQuotes) + { + utf8[0] = '\''; + len = decodeEscapeSequence(utf8[1..$]); + utf8[len++] = '\''; + } + else + len = decodeEscapeSequence(utf8[]); + if (src.front != '\'') + { + errorMessage("Expected \"'\" to end character literal"); + } + // skip over last "'" + nextChar(); + setTokenValue(utf8[0..len]); + return; + } + break; + default: + if (src.front & 0x80) + { + while (src.front & 0x80) + nextChar(); + break; + } + else + { + nextChar(); + break; + } + } + if (src.front != '\'') + errorMessage("Expected \"'\" to end character literal"); + nextChar(); + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(1, -1); + } - void lexString() - in - { - assert (src.front == '"'); - } - body - { - current.type = TokenType.stringLiteral; - bool longWysiwg = src.slice.length > 0 && 
src.slice[0] == 'r'; // 2 chars : r" - bool isWysiwyg = src.front == '`'; - // in case we need to unescape string - Appender!(ubyte[]) unescaped; - auto quote = src.front; - nextChar(); - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - return; - } - else if (src.front == '\\') - { - if (isWysiwyg || longWysiwg) - nextChar(); - else if(config.tokenStyle & TokenStyle.notEscaped) - { - skipEscapeSequence(); - } - else - { - if(unescaped == Appender!(ubyte[]).init) - unescaped = appender!(ubyte[])(); - unescaped.put(src.slice()); - decodeEscapeSequence(unescaped); - src.mark(); //start next slice after escape sequence - } - } - else if (src.front == quote) - { - nextCharNonLF(); - break; - } - else - nextChar(); - } - lexStringSuffix(); - // helper to handle quotes - void setData(R)(R range) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(range); - else if (longWysiwg) - setTokenValue(range[2..$-1]); - else - setTokenValue(range[1..$-1]); - } - import std.stdio; - if(unescaped != Appender!(ubyte[]).init) - { - //stuff in the last slice and use buffered data - unescaped.put(src.slice); - setData(unescaped.data); - } - else - { - setData(src.slice); //slice directly - } - } + void lexString() + in + { + assert (src.front == '"'); + } + body + { + current.type = TokenType.stringLiteral; + bool longWysiwg = src.slice.length > 0 && src.slice[0] == 'r'; // 2 chars : r" + bool isWysiwyg = src.front == '`'; + // in case we need to unescape string + Appender!(ubyte[]) unescaped; + auto quote = src.front; + nextChar(); + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + return; + } + else if (src.front == '\\') + { + if (isWysiwyg || longWysiwg) + nextChar(); + else if(config.tokenStyle & TokenStyle.notEscaped) + { + skipEscapeSequence(); + } + else + { + if(unescaped == Appender!(ubyte[]).init) + unescaped = appender!(ubyte[])(); + unescaped.put(src.slice()); + 
decodeEscapeSequence(unescaped); + src.mark(); //start next slice after escape sequence + } + } + else if (src.front == quote) + { + nextCharNonLF(); + break; + } + else + nextChar(); + } + lexStringSuffix(); + // helper to handle quotes + void setData(R)(R range) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(range); + else if (longWysiwg) + setTokenValue(range[2..$-1]); + else + setTokenValue(range[1..$-1]); + } + import std.stdio; + if(unescaped != Appender!(ubyte[]).init) + { + //stuff in the last slice and use buffered data + unescaped.put(src.slice); + setData(unescaped.data); + } + else + { + setData(src.slice); //slice directly + } + } - void lexDelimitedString() - in - { - assert(src.front == '"'); - } - body - { - current.type = TokenType.stringLiteral; + void lexDelimitedString() + in + { + assert(src.front == '"'); + } + body + { + current.type = TokenType.stringLiteral; - nextChar(); + nextChar(); - bool heredoc; - ubyte open; - ubyte close; + bool heredoc; + ubyte open; + ubyte close; - switch (src.front) - { - case '[': open = '['; close = ']'; break; - case '{': open = '{'; close = '}'; break; - case '(': open = '('; close = ')'; break; - case '<': open = '<'; close = '>'; break; - default: heredoc = true; break; - } - if (heredoc) - lexHeredocString(); - else - lexNormalDelimitedString(open, close); - } + switch (src.front) + { + case '[': open = '['; close = ']'; break; + case '{': open = '{'; close = '}'; break; + case '(': open = '('; close = ')'; break; + case '<': open = '<'; close = '>'; break; + default: heredoc = true; break; + } + if (heredoc) + lexHeredocString(); + else + lexNormalDelimitedString(open, close); + } - void lexNormalDelimitedString(ubyte open, ubyte close) - in - { - assert(src.slice[0 .. 
2] == `q"`); - } - body - { - current.type = TokenType.stringLiteral; - int depth = 1; - nextChar(); - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - break; - } - if (src.front == open) - { - nextChar(); - ++depth; - } - else if (src.front == close) - { - nextChar(); - --depth; - if (depth <= 0) - { - auto r = src.save(); //TODO: allocates for Fwd range - if (r.front == '"') - { - nextChar(); - break; - } - else - { - errorMessage("Expected \" after balanced " - ~ cast(char) close ~ " but found " - ~ cast(char) r.front ~ " instead."); - break; - } - } - } - else - nextChar(); - } - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(3, -2); - } + void lexNormalDelimitedString(ubyte open, ubyte close) + in + { + assert(src.slice[0 .. 2] == `q"`); + } + body + { + current.type = TokenType.stringLiteral; + int depth = 1; + nextChar(); + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + break; + } + if (src.front == open) + { + nextChar(); + ++depth; + } + else if (src.front == close) + { + nextChar(); + --depth; + if (depth <= 0) + { + auto r = src.save(); //TODO: allocates for Fwd range + if (r.front == '"') + { + nextChar(); + break; + } + else + { + errorMessage("Expected \" after balanced " + ~ cast(char) close ~ " but found " + ~ cast(char) r.front ~ " instead."); + break; + } + } + } + else + nextChar(); + } + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(3, -2); + } - void lexHeredocString() - in - { - assert (src.slice.equal("q\"")); - } - body - { - typeof(src.slice) ident; - uint newlineBytes; - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - return; - } - else if (isNewline(src.front)) - { - ident = src.slice[2..$]; - nextChar(); - newlineBytes = cast(uint) (src.slice.length - 2 - ident.length); - break; - } - else if (isSeparating()) - { - nextChar(); - ident = 
src.slice[2..$]; - nextChar(); - newlineBytes = 0; - break; - } - else - { - nextChar(); - } - } - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - break; - } - else if (src.slice.length > ident.length - && src.slice[$-ident.length .. $].equal(ident)) - { - if (src.front == '"') - { - nextChar(); - lexStringSuffix(); - break; - } - else - { - errorMessage("Unterminated string literal: " ~ cast(string) src.slice); - break; - } - } - else - nextChar(); - } + void lexHeredocString() + in + { + assert (src.slice.equal("q\"")); + } + body + { + typeof(src.slice) ident; + uint newlineBytes; + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + return; + } + else if (isNewline(src.front)) + { + ident = src.slice[2..$]; + nextChar(); + newlineBytes = cast(uint) (src.slice.length - 2 - ident.length); + break; + } + else if (isSeparating()) + { + nextChar(); + ident = src.slice[2..$]; + nextChar(); + newlineBytes = 0; + break; + } + else + { + nextChar(); + } + } + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + break; + } + else if (src.slice.length > ident.length + && src.slice[$-ident.length .. $].equal(ident)) + { + if (src.front == '"') + { + nextChar(); + lexStringSuffix(); + break; + } + else + { + errorMessage("Unterminated string literal: " ~ cast(string) src.slice); + break; + } + } + else + nextChar(); + } - bool hasSuffix = lexStringSuffix(); + bool hasSuffix = lexStringSuffix(); - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - { - setTokenValue(cast(int) (2 + newlineBytes + ident.length), - cast(int) (-(ident.length + (hasSuffix ? 2 : 1)))); - } - } + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + { + setTokenValue(cast(int) (2 + newlineBytes + ident.length), + cast(int) (-(ident.length + (hasSuffix ? 
2 : 1)))); + } + } - void lexTokenString() - in - { - assert (src.front == '{'); - } - body - { - current.type = TokenType.stringLiteral; - nextChar(); - auto app = appender!(ubyte[])(); - if (config.tokenStyle & TokenStyle.includeQuotes) - { - app.put('q'); - app.put('{'); - } - LexerConfig c = config; - scope (exit) config = c; - config.iterStyle = IterationStyle.everything; - config.tokenStyle = TokenStyle.source; - int depth = 1; + void lexTokenString() + in + { + assert (src.front == '{'); + } + body + { + current.type = TokenType.stringLiteral; + nextChar(); + auto app = appender!(ubyte[])(); + if (config.tokenStyle & TokenStyle.includeQuotes) + { + app.put('q'); + app.put('{'); + } + LexerConfig c = config; + scope (exit) config = c; + config.iterStyle = IterationStyle.everything; + config.tokenStyle = TokenStyle.source; + int depth = 1; - while (!isEoF()) - { - advance(); - if (current.type == TokenType.lBrace) - ++depth; - else if (current.type == TokenType.rBrace) - { - --depth; - if (depth <= 0) - break; - } - app.put(representation(current.value)); - } - config = c; - if (config.tokenStyle & TokenStyle.includeQuotes) - { - app.put('}'); - } - if (src.empty) - current.type = TokenType.stringLiteral; - else - { - switch (src.front) - { - case 'd': - if (config.tokenStyle & TokenStyle.includeQuotes) - app.put('d'); - current.type = TokenType.dstringLiteral; - src.popFront(); - break; - case 'w': - if (config.tokenStyle & TokenStyle.includeQuotes) - app.put('w'); - current.type = TokenType.wstringLiteral; - src.popFront(); - break; - case 'c': - if (config.tokenStyle & TokenStyle.includeQuotes) - app.put('c'); - src.popFront(); - goto default; - default: - current.type = TokenType.stringLiteral; - break; - } - } - current.value = cast(string) app.data; - } + while (!isEoF()) + { + advance(); + if (current.type == TokenType.lBrace) + ++depth; + else if (current.type == TokenType.rBrace) + { + --depth; + if (depth <= 0) + break; + } + 
app.put(representation(current.value)); + } + config = c; + if (config.tokenStyle & TokenStyle.includeQuotes) + { + app.put('}'); + } + if (src.empty) + current.type = TokenType.stringLiteral; + else + { + switch (src.front) + { + case 'd': + if (config.tokenStyle & TokenStyle.includeQuotes) + app.put('d'); + current.type = TokenType.dstringLiteral; + src.popFront(); + break; + case 'w': + if (config.tokenStyle & TokenStyle.includeQuotes) + app.put('w'); + current.type = TokenType.wstringLiteral; + src.popFront(); + break; + case 'c': + if (config.tokenStyle & TokenStyle.includeQuotes) + app.put('c'); + src.popFront(); + goto default; + default: + current.type = TokenType.stringLiteral; + break; + } + } + current.value = cast(string) app.data; + } - void lexSpecialTokenSequence() - in - { - assert (src.front == '#'); - } - body - { - nextChar(); - auto r = src.save(); - auto app = appender!(ubyte[])(); - app.put('#'); - while (true) - { - if (r.isRangeEoF()) - { - errorMessage("Found EOF when interpreting special token sequence"); - return; - } - else if (isNewline(r.front)) - break; - else - { - app.put(r.front); - r.popFront(); - } - } - auto m = match((cast(char[]) app.data), - `#line\s+(?P\d+)\s*(?P".+")*?`); - if (m) - { - current.type = TokenType.specialTokenSequence; - current.value = (cast(char[]) app.data).idup; - column += app.data.length; - foreach (i; 0 .. 
app.data.length) - src.popFront(); - auto c = m.captures; - if (c["filespec"]) - config.fileName = c["filespec"].idup; - auto l = c["line"]; - lineNumber = parse!uint(l); - } - else - { - current.type = TokenType.hash; - current.value = getTokenValue(TokenType.hash); - } - } + void lexSpecialTokenSequence() + in + { + assert (src.front == '#'); + } + body + { + nextChar(); + auto r = src.save(); + auto app = appender!(ubyte[])(); + app.put('#'); + while (true) + { + if (r.isRangeEoF()) + { + errorMessage("Found EOF when interpreting special token sequence"); + return; + } + else if (isNewline(r.front)) + break; + else + { + app.put(r.front); + r.popFront(); + } + } + auto m = match((cast(char[]) app.data), + `#line\s+(?P\d+)\s*(?P".+")*?`); + if (m) + { + current.type = TokenType.specialTokenSequence; + current.value = (cast(char[]) app.data).idup; + column += app.data.length; + foreach (i; 0 .. app.data.length) + src.popFront(); + auto c = m.captures; + if (c["filespec"]) + config.fileName = c["filespec"].idup; + auto l = c["line"]; + lineNumber = parse!uint(l); + } + else + { + current.type = TokenType.hash; + current.value = getTokenValue(TokenType.hash); + } + } //===================================================================== // Helpers for lexXYZ functions //===================================================================== - void skipEscapeSequence() - { - // no decoding, just minor sanity checks - nextChar(); - switch (src.front) - { - case '\'': - case '"': - case '?': - case '\\': - case 'a': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - case 0x0a: - case 0x00: - nextChar(); - return; - case '0': .. case '7': - foreach(i; 0 .. 3) - { - nextChar(); - if (src.front < '0' || src.front > '7') return; - } - return; - case 'x': - nextChar(); - foreach(i; 0 .. 
2) - { - if (!isHexDigit(src.front)) - { - errorMessage("Expected hex digit"); - return; - } - nextChar(); - } - return; - case 'u': - case 'U': - uint digits = src.front == 'u' ? 4 : 8; - nextChar(); - foreach (i; 0 .. digits) - { - if (!isHexDigit(src.front)) - { - errorMessage("Expected hex digit instead of %s".format( - cast(char) src.front)); - return; - } - nextChar(); - } - return; - case '&': - while (!isEoF()) - { - nextChar(); - if (src.front == ';') - break; - } - return; - default: - errorMessage("Invalid escape sequence"); - return; - } - } + void skipEscapeSequence() + { + // no decoding, just minor sanity checks + nextChar(); + switch (src.front) + { + case '\'': + case '"': + case '?': + case '\\': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case 0x0a: + case 0x00: + nextChar(); + return; + case '0': .. case '7': + foreach(i; 0 .. 3) + { + nextChar(); + if (src.front < '0' || src.front > '7') return; + } + return; + case 'x': + nextChar(); + foreach(i; 0 .. 2) + { + if (!isHexDigit(src.front)) + { + errorMessage("Expected hex digit"); + return; + } + nextChar(); + } + return; + case 'u': + case 'U': + uint digits = src.front == 'u' ? 4 : 8; + nextChar(); + foreach (i; 0 .. 
digits) + { + if (!isHexDigit(src.front)) + { + errorMessage("Expected hex digit instead of %s".format( + cast(char) src.front)); + return; + } + nextChar(); + } + return; + case '&': + while (!isEoF()) + { + nextChar(); + if (src.front == ';') + break; + } + return; + default: + errorMessage("Invalid escape sequence"); + return; + } + } - size_t decodeEscapeSequence(OutputRange)(OutputRange dest) - in - { - assert (src.front == '\\'); - } - body - { - size_t reencodeNumeric(ubyte[] src, int radix, OutputRange dest) - { - char[] chunk = cast(char[])src; - char[4] utfBuf; - uint codepoint = parse!uint(chunk, radix); - size_t len; - try - len = encode(utfBuf, codepoint); - catch (UTFException ex) - { - errorMessage(ex.msg); - return 0; - } - dest.put(cast(ubyte[]) utfBuf[0..len]); - return len; - } + size_t decodeEscapeSequence(OutputRange)(OutputRange dest) + in + { + assert (src.front == '\\'); + } + body + { + size_t reencodeNumeric(ubyte[] src, int radix, OutputRange dest) + { + char[] chunk = cast(char[])src; + char[4] utfBuf; + uint codepoint = parse!uint(chunk, radix); + size_t len; + try + len = encode(utfBuf, codepoint); + catch (UTFException ex) + { + errorMessage(ex.msg); + return 0; + } + dest.put(cast(ubyte[]) utfBuf[0..len]); + return len; + } - ubyte[40] buffer; - src.popFront(); - switch (src.front) - { - case '\'': - case '"': - case '?': - case '\\': - buffer[0] = src.front; - src.popFront(); - return 1; - case 'a': dest.put('\a'); src.popFront(); return 1; - case 'b': dest.put('\b'); src.popFront(); return 1; - case 'f': dest.put('\f'); src.popFront(); return 1; - case 'n': dest.put('\n'); src.popFront(); return 1; - case 'r': dest.put('\r'); src.popFront(); return 1; - case 't': dest.put('\t'); src.popFront(); return 1; - case 'v': dest.put('\v'); src.popFront(); return 1; - case 0x0a: dest.put(cast(ubyte)0x0a); src.popFront(); return 1; - case 0x00: dest.put(cast(ubyte)0x00); src.popFront(); return 1; - case '0': .. 
case '7': - size_t idx = 0; - while(idx < 3 && !isEoF()) - { - buffer[idx++] = src.front; - src.popFront(); - if (src.front < '0' || src.front > '7') break; - } - return reencodeNumeric(buffer[0..idx], 8, dest); - case 'x': - src.popFront(); - foreach(i; 0 .. 2) - { - if (!isHexDigit(src.front)) - { - errorMessage("Expected hex digit"); - return 1; - } - buffer[i] = src.front; - src.popFront(); - } - return reencodeNumeric(buffer[0..2], 16, dest); - case 'u': - case 'U': - uint digitCount = src.front == 'u' ? 4 : 8; - src.popFront(); - foreach (i; 0 .. digitCount) - { - if (!isHexDigit(src.front)) - { - errorMessage("Expected hex digit"); - return 1; - } - buffer[i] = src.front; - src.popFront(); - } - return reencodeNumeric(buffer[0..digitCount], 16, dest); - case '&': - src.popFront(); - size_t idx = 0; - while (!isEoF()) - { - if (isAlpha(src.front)) - { - buffer[idx++] = src.front; - if(idx == buffer.length) // way over maximum length - errorMessage("Invalid character entity"); - src.popFront(); - } - else if (src.front == ';') - { - src.popFront(); - break; - } - else - { - errorMessage("Invalid character entity"); - return idx; - } - } + ubyte[40] buffer; + src.popFront(); + switch (src.front) + { + case '\'': + case '"': + case '?': + case '\\': + buffer[0] = src.front; + src.popFront(); + return 1; + case 'a': dest.put('\a'); src.popFront(); return 1; + case 'b': dest.put('\b'); src.popFront(); return 1; + case 'f': dest.put('\f'); src.popFront(); return 1; + case 'n': dest.put('\n'); src.popFront(); return 1; + case 'r': dest.put('\r'); src.popFront(); return 1; + case 't': dest.put('\t'); src.popFront(); return 1; + case 'v': dest.put('\v'); src.popFront(); return 1; + case 0x0a: dest.put(cast(ubyte)0x0a); src.popFront(); return 1; + case 0x00: dest.put(cast(ubyte)0x00); src.popFront(); return 1; + case '0': .. 
case '7': + size_t idx = 0; + while(idx < 3 && !isEoF()) + { + buffer[idx++] = src.front; + src.popFront(); + if (src.front < '0' || src.front > '7') break; + } + return reencodeNumeric(buffer[0..idx], 8, dest); + case 'x': + src.popFront(); + foreach(i; 0 .. 2) + { + if (!isHexDigit(src.front)) + { + errorMessage("Expected hex digit"); + return 1; + } + buffer[i] = src.front; + src.popFront(); + } + return reencodeNumeric(buffer[0..2], 16, dest); + case 'u': + case 'U': + uint digitCount = src.front == 'u' ? 4 : 8; + src.popFront(); + foreach (i; 0 .. digitCount) + { + if (!isHexDigit(src.front)) + { + errorMessage("Expected hex digit"); + return 1; + } + buffer[i] = src.front; + src.popFront(); + } + return reencodeNumeric(buffer[0..digitCount], 16, dest); + case '&': + src.popFront(); + size_t idx = 0; + while (!isEoF()) + { + if (isAlpha(src.front)) + { + buffer[idx++] = src.front; + if(idx == buffer.length) // way over maximum length + errorMessage("Invalid character entity"); + src.popFront(); + } + else if (src.front == ';') + { + src.popFront(); + break; + } + else + { + errorMessage("Invalid character entity"); + return idx; + } + } //TODO: avoid looking up as UTF string, use raw bytes - string chunk = cast(string)buffer[0..idx]; + string chunk = cast(string)buffer[0..idx]; auto names = assumeSorted(map!"a.name"(characterEntities)); auto place = names.lowerBound(chunk).length; - if (place == names.length || names[place] != chunk) - { - errorMessage("Invalid character entity \"&%s;\"" - .format(cast(string) chunk)); - return 1; - } + if (place == names.length || names[place] != chunk) + { + errorMessage("Invalid character entity \"&%s;\"" + .format(cast(string) chunk)); + return 1; + } auto entity = characterEntities[place].value; - dest.put(cast(ubyte[]) entity); - return entity.length; - default: - errorMessage("Invalid escape sequence"); - return 1; - } - } + dest.put(cast(ubyte[]) entity); + return entity.length; + default: + errorMessage("Invalid 
escape sequence"); + return 1; + } + } - // advances underlying mark-slice range and counts lines, cols - void nextChar() - { - bool foundNewline; - if (src.front == '\r') - { - src.popFront(); - foundNewline = true; - } - if (src.front == '\n') - { - src.popFront(); - foundNewline = true; - } - else - { - src.popFront(); - } - if (foundNewline) - { - ++lineNumber; - column = 0; - } - else - ++column; + // advances underlying mark-slice range and counts lines, cols + void nextChar() + { + bool foundNewline; + if (src.front == '\r') + { + src.popFront(); + foundNewline = true; + } + if (src.front == '\n') + { + src.popFront(); + foundNewline = true; + } + else + { + src.popFront(); + } + if (foundNewline) + { + ++lineNumber; + column = 0; + } + else + ++column; - } + } - //same but don't bother for LF sequences - void nextCharNonLF() - { - src.popFront(); - ++column; - } + //same but don't bother for LF sequences + void nextCharNonLF() + { + src.popFront(); + ++column; + } - void setTokenValue()() - { - current.value = cache.get(src.slice); - } + void setTokenValue()() + { + current.value = cache.get(src.slice); + } - void setTokenValue()(int startOffset, int endOffset) - in - { - assert(startOffset >= 0); - assert(endOffset <= 0); - } - body - { - auto piece = src.slice; - // avoid unsigned arithmetic as endOffset is negative - int end = cast(int)piece.length + endOffset; - current.value = cache.get(src.slice[startOffset .. end]); - } + void setTokenValue()(int startOffset, int endOffset) + in + { + assert(startOffset >= 0); + assert(endOffset <= 0); + } + body + { + auto piece = src.slice; + // avoid unsigned arithmetic as endOffset is negative + int end = cast(int)piece.length + endOffset; + current.value = cache.get(src.slice[startOffset .. 
end]); + } - void setTokenValue(R)(R range) - if(isRandomAccessRange!R && is(ElementType!R : const(ubyte))) - { - current.value = cache.get(range); - } + void setTokenValue(R)(R range) + if(isRandomAccessRange!R && is(ElementType!R : const(ubyte))) + { + current.value = cache.get(range); + } - bool isEoF() const - { - return src.empty || src.front == 0 || src.front == 0x1a; - } + bool isEoF() const + { + return src.empty || src.front == 0 || src.front == 0x1a; + } - bool isSeparating() - { - auto ch = src.front; - if (ch <= 0x2f) return true; - if (ch >= ':' && ch <= '@') return true; - if (ch >= '[' && ch <= '^') return true; - if (ch >= '{' && ch <= '~') return true; - if (ch == '`') return true; - if ((ch & 0x80) && isLongWhite()) return true; - return false; - } + bool isSeparating() + { + auto ch = src.front; + if (ch <= 0x2f) return true; + if (ch >= ':' && ch <= '@') return true; + if (ch >= '[' && ch <= '^') return true; + if (ch >= '{' && ch <= '~') return true; + if (ch == '`') return true; + if ((ch & 0x80) && isLongWhite()) return true; + return false; + } + + bool isWhite() + { + auto c = src.front; + if (c & 0x80) // multi-byte utf-8 + { + return isLongWhite(); + } + else + return c == 0x20 || (c >= 0x09 && c <= 0x0d); + } - bool isWhite() - { - auto c = src.front; - if (c & 0x80) // multi-byte utf-8 - { - return isLongWhite(); - } - else - return c == 0x20 || (c >= 0x09 && c <= 0x0d); - } - bool isLongWhite() { assert(src.front & 0x80); // only non-ascii @@ -2090,7 +1803,7 @@ L_advance: return false; return true; } - + void expandSpecialToken() { switch (current.type) @@ -2134,973 +1847,1250 @@ L_advance: } } - void errorMessage(string s) - { - import std.string: format; - if (config.errorFunc !is null) - config.errorFunc(config.fileName, current.startIndex, - current.line, current.column, s); - else - throw new Exception(format("%s(%d:%d): %s", - config.fileName, current.line, current.column, s)); - } + void errorMessage(string s) + { + import 
std.string: format; + if (config.errorFunc !is null) + config.errorFunc(config.fileName, current.startIndex, + current.line, current.column, s); + else + throw new Exception(format("%s(%d:%d): %s", + config.fileName, current.line, current.column, s)); + } - this(LexSrc lex, LexerConfig cfg) - { - src = move(lex); // lex is rhs - lineNumber = 1; - column = 0; - _empty = false; - config = move(cfg); - } + this(LexSrc lex, LexerConfig cfg) + { + src = move(lex); // lex is rhs + lineNumber = 1; + column = 0; + _empty = false; + config = move(cfg); + } - Token current; - uint lineNumber; - uint column; - LexSrc src; - bool _empty; - LexerConfig config; - StringCache cache; + Token current; + uint lineNumber; + uint column; + LexSrc src; + bool _empty; + LexerConfig config; + StringCache cache; } /** -* Returns: true if the token is an operator -*/ + * Returns: true if the token is an operator + */ pure nothrow bool isOperator(const TokenType t) { - return t >= TokenType.assign && t <= TokenType.xorEquals; + return t >= TokenType.assign && t <= TokenType.xorEquals; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isOperator(ref const Token t) { - return isOperator(t.type); + return isOperator(t.type); } /** -* Returns: true if the token is a keyword -*/ + * Returns: true if the token is a keyword + */ pure nothrow bool isKeyword(const TokenType t) { - return t >= TokenType.bool_ && t <= TokenType.with_; + return t >= TokenType.bool_ && t <= TokenType.with_; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isKeyword(ref const Token t) { - return isKeyword(t.type); + return isKeyword(t.type); } /** -* Returns: true if the token is a built-in type -*/ + * Returns: true if the token is a built-in type + */ pure nothrow bool isType(const TokenType t) { - return t >= TokenType.bool_ && t <= TokenType.wchar_; + return t >= TokenType.bool_ && t <= TokenType.wchar_; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isType(ref const Token t) { - return isType(t.type); + 
return isType(t.type); } /** -* Returns: true if the token is an attribute -*/ + * Returns: true if the token is an attribute + */ pure nothrow bool isAttribute(const TokenType t) { - return t >= TokenType.align_ && t <= TokenType.static_; + return t >= TokenType.align_ && t <= TokenType.static_; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isAttribute(ref const Token t) { - return isAttribute(t.type); + return isAttribute(t.type); } /** -* Returns: true if the token is a protection attribute -*/ + * Returns: true if the token is a protection attribute + */ pure nothrow bool isProtection(const TokenType t) { - return t >= TokenType.export_ && t <= TokenType.public_; + return t >= TokenType.export_ && t <= TokenType.public_; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isProtection(ref const Token t) { - return isProtection(t.type); + return isProtection(t.type); } /** -* Returns: true if the token is a compile-time constant such as ___DATE__ -*/ + * Returns: true if the token is a compile-time constant such as ___DATE__ + */ pure nothrow bool isConstant(const TokenType t) { - return t >= TokenType.date && t <= TokenType.traits; + return t >= TokenType.date && t <= TokenType.traits; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isConstant(ref const Token t) { - return isConstant(t.type); + return isConstant(t.type); } /** -* Returns: true if the token is a string or number literal -*/ + * Returns: true if the token is a string or number literal + */ pure nothrow bool isLiteral(const TokenType t) { - return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral; + return t >= TokenType.doubleLiteral && t <= TokenType.wstringLiteral; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isLiteral(ref const Token t) { - return isLiteral(t.type); + return isLiteral(t.type); } /** -* Returns: true if the token is a number literal -*/ + * Returns: true if the token is a number literal + */ pure nothrow bool isNumberLiteral(const TokenType t) { - 
return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral; + return t >= TokenType.doubleLiteral && t <= TokenType.ulongLiteral; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isNumberLiteral(ref const Token t) { - return isNumberLiteral(t.type); + return isNumberLiteral(t.type); } /** -* Returns: true if the token is a string literal -*/ + * Returns: true if the token is a string literal + */ pure nothrow bool isStringLiteral(const TokenType t) { - return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral; + return t >= TokenType.dstringLiteral && t <= TokenType.wstringLiteral; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isStringLiteral(ref const Token t) { - return isStringLiteral(t.type); + return isStringLiteral(t.type); } /** -* Returns: true if the token is whitespace, a commemnt, a special token -* sequence, or an identifier -*/ + * Returns: true if the token is whitespace, a commemnt, a special token + * sequence, or an identifier + */ pure nothrow bool isMisc(const TokenType t) { - return t >= TokenType.comment && t <= TokenType.specialTokenSequence; + return t >= TokenType.comment && t <= TokenType.specialTokenSequence; } /** -* ditto -*/ + * ditto + */ pure nothrow bool isMisc(ref const Token t) { - return isMisc(t.type); + return isMisc(t.type); } /** -* Listing of all the tokens in the D language. -*/ + * Listing of all the tokens in the D language. + */ enum TokenType: ushort { - assign, /// = - at, /// @ - bitAnd, /// & - bitAndEquals, /// &= - bitOr, /// | - bitOrEquals, /// |= - catEquals, /// ~= - colon, /// : - comma, /// , - decrement, /// -- - div, /// / - divEquals, /// /= - dollar, /// $ - dot, /// . 
- equals, /// == - goesTo, /// => - greater, /// > - greaterEqual, /// >= - hash, /// # - increment, /// ++ - lBrace, /// { - lBracket, /// [ - less, /// < - lessEqual, /// <= - lessEqualGreater, /// <>= - lessOrGreater, /// <> - logicAnd, /// && - logicOr, /// || - lParen, /// $(LPAREN) - minus, /// - - minusEquals, /// -= - mod, /// % - modEquals, /// %= - mulEquals, /// *= - not, /// ! - notEquals, /// != - notGreater, /// !> - notGreaterEqual, /// !>= - notLess, /// !< - notLessEqual, /// !<= - notLessEqualGreater, /// !<> - plus, /// + - plusEquals, /// += - pow, /// ^^ - powEquals, /// ^^= - rBrace, /// } - rBracket, /// ] - rParen, /// $(RPAREN) - semicolon, /// ; - shiftLeft, /// << - shiftLeftEqual, /// <<= - shiftRight, /// >> - shiftRightEqual, /// >>= - slice, /// .. - star, /// * - ternary, /// ? - tilde, /// ~ - unordered, /// !<>= - unsignedShiftRight, /// >>> - unsignedShiftRightEqual, /// >>>= - vararg, /// ... - xor, /// ^ - xorEquals, /// ^= + assign, /// = + at, /// @ + bitAnd, /// & + bitAndEquals, /// &= + bitOr, /// | + bitOrEquals, /// |= + catEquals, /// ~= + colon, /// : + comma, /// , + decrement, /// -- + div, /// / + divEquals, /// /= + dollar, /// $ + dot, /// . + equals, /// == + goesTo, /// => + greater, /// > + greaterEqual, /// >= + hash, /// # + increment, /// ++ + lBrace, /// { + lBracket, /// [ + less, /// < + lessEqual, /// <= + lessEqualGreater, /// <>= + lessOrGreater, /// <> + logicAnd, /// && + logicOr, /// || + lParen, /// $(LPAREN) + minus, /// - + minusEquals, /// -= + mod, /// % + modEquals, /// %= + mulEquals, /// *= + not, /// ! 
+ notEquals, /// != + notGreater, /// !> + notGreaterEqual, /// !>= + notLess, /// !< + notLessEqual, /// !<= + notLessEqualGreater, /// !<> + plus, /// + + plusEquals, /// += + pow, /// ^^ + powEquals, /// ^^= + rBrace, /// } + rBracket, /// ] + rParen, /// $(RPAREN) + semicolon, /// ; + shiftLeft, /// << + shiftLeftEqual, /// <<= + shiftRight, /// >> + shiftRightEqual, /// >>= + slice, /// .. + star, /// * + ternary, /// ? + tilde, /// ~ + unordered, /// !<>= + unsignedShiftRight, /// >>> + unsignedShiftRightEqual, /// >>>= + vararg, /// ... + xor, /// ^ + xorEquals, /// ^= - bool_, /// $(D_KEYWORD bool) - byte_, /// $(D_KEYWORD byte) - cdouble_, /// $(D_KEYWORD cdouble) - cent_, /// $(D_KEYWORD cent) - cfloat_, /// $(D_KEYWORD cfloat) - char_, /// $(D_KEYWORD char) - creal_, /// $(D_KEYWORD creal) - dchar_, /// $(D_KEYWORD dchar) - double_, /// $(D_KEYWORD double) - float_, /// $(D_KEYWORD float) - function_, /// $(D_KEYWORD function) - idouble_, /// $(D_KEYWORD idouble) - ifloat_, /// $(D_KEYWORD ifloat) - int_, /// $(D_KEYWORD int) - ireal_, /// $(D_KEYWORD ireal) - long_, /// $(D_KEYWORD long) - real_, /// $(D_KEYWORD real) - short_, /// $(D_KEYWORD short) - ubyte_, /// $(D_KEYWORD ubyte) - ucent_, /// $(D_KEYWORD ucent) - uint_, /// $(D_KEYWORD uint) - ulong_, /// $(D_KEYWORD ulong) - ushort_, /// $(D_KEYWORD ushort) - void_, /// $(D_KEYWORD void) - wchar_, /// $(D_KEYWORD wchar) + bool_, /// $(D_KEYWORD bool) + byte_, /// $(D_KEYWORD byte) + cdouble_, /// $(D_KEYWORD cdouble) + cent_, /// $(D_KEYWORD cent) + cfloat_, /// $(D_KEYWORD cfloat) + char_, /// $(D_KEYWORD char) + creal_, /// $(D_KEYWORD creal) + dchar_, /// $(D_KEYWORD dchar) + double_, /// $(D_KEYWORD double) + float_, /// $(D_KEYWORD float) + function_, /// $(D_KEYWORD function) + idouble_, /// $(D_KEYWORD idouble) + ifloat_, /// $(D_KEYWORD ifloat) + int_, /// $(D_KEYWORD int) + ireal_, /// $(D_KEYWORD ireal) + long_, /// $(D_KEYWORD long) + real_, /// $(D_KEYWORD real) + short_, /// 
$(D_KEYWORD short) + ubyte_, /// $(D_KEYWORD ubyte) + ucent_, /// $(D_KEYWORD ucent) + uint_, /// $(D_KEYWORD uint) + ulong_, /// $(D_KEYWORD ulong) + ushort_, /// $(D_KEYWORD ushort) + void_, /// $(D_KEYWORD void) + wchar_, /// $(D_KEYWORD wchar) - align_, /// $(D_KEYWORD align) - deprecated_, /// $(D_KEYWORD deprecated) - extern_, /// $(D_KEYWORD extern) - pragma_, /// $(D_KEYWORD pragma) - export_, /// $(D_KEYWORD export) - package_, /// $(D_KEYWORD package) - private_, /// $(D_KEYWORD private) - protected_, /// $(D_KEYWORD protected) - public_, /// $(D_KEYWORD public) - abstract_, /// $(D_KEYWORD abstract) - auto_, /// $(D_KEYWORD auto) - const_, /// $(D_KEYWORD const) - final_, /// $(D_KEYWORD final) - gshared, /// $(D_KEYWORD __gshared) - immutable_, // immutable - inout_, // inout - scope_, /// $(D_KEYWORD scope) - shared_, // shared - static_, /// $(D_KEYWORD static) + align_, /// $(D_KEYWORD align) + deprecated_, /// $(D_KEYWORD deprecated) + extern_, /// $(D_KEYWORD extern) + pragma_, /// $(D_KEYWORD pragma) + export_, /// $(D_KEYWORD export) + package_, /// $(D_KEYWORD package) + private_, /// $(D_KEYWORD private) + protected_, /// $(D_KEYWORD protected) + public_, /// $(D_KEYWORD public) + abstract_, /// $(D_KEYWORD abstract) + auto_, /// $(D_KEYWORD auto) + const_, /// $(D_KEYWORD const) + final_, /// $(D_KEYWORD final) + gshared, /// $(D_KEYWORD __gshared) + immutable_, // immutable + inout_, // inout + scope_, /// $(D_KEYWORD scope) + shared_, // shared + static_, /// $(D_KEYWORD static) - synchronized_, /// $(D_KEYWORD synchronized) - alias_, /// $(D_KEYWORD alias) - asm_, /// $(D_KEYWORD asm) - assert_, /// $(D_KEYWORD assert) - body_, /// $(D_KEYWORD body) - break_, /// $(D_KEYWORD break) - case_, /// $(D_KEYWORD case) - cast_, /// $(D_KEYWORD cast) - catch_, /// $(D_KEYWORD catch) - class_, /// $(D_KEYWORD class) - continue_, /// $(D_KEYWORD continue) - debug_, /// $(D_KEYWORD debug) - default_, /// $(D_KEYWORD default) - delegate_, /// 
$(D_KEYWORD delegate) - delete_, /// $(D_KEYWORD delete) - do_, /// $(D_KEYWORD do) - else_, /// $(D_KEYWORD else) - enum_, /// $(D_KEYWORD enum) - false_, /// $(D_KEYWORD false) - finally_, /// $(D_KEYWORD finally) - foreach_, /// $(D_KEYWORD foreach) - foreach_reverse_, /// $(D_KEYWORD foreach_reverse) - for_, /// $(D_KEYWORD for) - goto_, /// $(D_KEYWORD goto) - if_, /// $(D_KEYWORD if) - import_, /// $(D_KEYWORD import) - in_, /// $(D_KEYWORD in) - interface_, /// $(D_KEYWORD interface) - invariant_, /// $(D_KEYWORD invariant) - is_, /// $(D_KEYWORD is) - lazy_, /// $(D_KEYWORD lazy) - macro_, /// $(D_KEYWORD macro) - mixin_, /// $(D_KEYWORD mixin) - module_, /// $(D_KEYWORD module) - new_, /// $(D_KEYWORD new) - nothrow_, /// $(D_KEYWORD nothrow) - null_, /// $(D_KEYWORD null) - out_, /// $(D_KEYWORD out) - override_, /// $(D_KEYWORD override) - pure_, /// $(D_KEYWORD pure) - ref_, /// $(D_KEYWORD ref) - return_, /// $(D_KEYWORD return) - struct_, /// $(D_KEYWORD struct) - super_, /// $(D_KEYWORD super) - switch_, /// $(D_KEYWORD switch) - template_, /// $(D_KEYWORD template) - this_, /// $(D_KEYWORD this) - throw_, /// $(D_KEYWORD throw) - true_, /// $(D_KEYWORD true) - try_, /// $(D_KEYWORD try) - typedef_, /// $(D_KEYWORD typedef) - typeid_, /// $(D_KEYWORD typeid) - typeof_, /// $(D_KEYWORD typeof) - union_, /// $(D_KEYWORD union) - unittest_, /// $(D_KEYWORD unittest) - version_, /// $(D_KEYWORD version) - volatile_, /// $(D_KEYWORD volatile) - while_, /// $(D_KEYWORD while) - with_, /// $(D_KEYWORD with) + synchronized_, /// $(D_KEYWORD synchronized) + alias_, /// $(D_KEYWORD alias) + asm_, /// $(D_KEYWORD asm) + assert_, /// $(D_KEYWORD assert) + body_, /// $(D_KEYWORD body) + break_, /// $(D_KEYWORD break) + case_, /// $(D_KEYWORD case) + cast_, /// $(D_KEYWORD cast) + catch_, /// $(D_KEYWORD catch) + class_, /// $(D_KEYWORD class) + continue_, /// $(D_KEYWORD continue) + debug_, /// $(D_KEYWORD debug) + default_, /// $(D_KEYWORD default) + delegate_, 
/// $(D_KEYWORD delegate) + delete_, /// $(D_KEYWORD delete) + do_, /// $(D_KEYWORD do) + else_, /// $(D_KEYWORD else) + enum_, /// $(D_KEYWORD enum) + false_, /// $(D_KEYWORD false) + finally_, /// $(D_KEYWORD finally) + foreach_, /// $(D_KEYWORD foreach) + foreach_reverse_, /// $(D_KEYWORD foreach_reverse) + for_, /// $(D_KEYWORD for) + goto_, /// $(D_KEYWORD goto) + if_, /// $(D_KEYWORD if) + import_, /// $(D_KEYWORD import) + in_, /// $(D_KEYWORD in) + interface_, /// $(D_KEYWORD interface) + invariant_, /// $(D_KEYWORD invariant) + is_, /// $(D_KEYWORD is) + lazy_, /// $(D_KEYWORD lazy) + macro_, /// $(D_KEYWORD macro) + mixin_, /// $(D_KEYWORD mixin) + module_, /// $(D_KEYWORD module) + new_, /// $(D_KEYWORD new) + nothrow_, /// $(D_KEYWORD nothrow) + null_, /// $(D_KEYWORD null) + out_, /// $(D_KEYWORD out) + override_, /// $(D_KEYWORD override) + pure_, /// $(D_KEYWORD pure) + ref_, /// $(D_KEYWORD ref) + return_, /// $(D_KEYWORD return) + struct_, /// $(D_KEYWORD struct) + super_, /// $(D_KEYWORD super) + switch_, /// $(D_KEYWORD switch) + template_, /// $(D_KEYWORD template) + this_, /// $(D_KEYWORD this) + throw_, /// $(D_KEYWORD throw) + true_, /// $(D_KEYWORD true) + try_, /// $(D_KEYWORD try) + typedef_, /// $(D_KEYWORD typedef) + typeid_, /// $(D_KEYWORD typeid) + typeof_, /// $(D_KEYWORD typeof) + union_, /// $(D_KEYWORD union) + unittest_, /// $(D_KEYWORD unittest) + version_, /// $(D_KEYWORD version) + volatile_, /// $(D_KEYWORD volatile) + while_, /// $(D_KEYWORD while) + with_, /// $(D_KEYWORD with) - date, /// ___DATE__ - eof, /// ___EOF__ - time, /// ___TIME__ - timestamp, /// ___TIMESTAMP__ - vendor, /// ___VENDOR__ - compilerVersion, /// ___VERSION__ - file, /// $(D_KEYWORD ___FILE__) - line, /// $(D_KEYWORD ___LINE__) - comment, /// $(D_COMMENT /** comment */) or $(D_COMMENT // comment) or $(D_COMMENT ///comment) - identifier, /// anything else - scriptLine, // Line at the beginning of source file that starts from #! 
- traits, /// $(D_KEYWORD ___traits) - parameters, /// $(D_KEYWORD ___parameters) - vector, /// $(D_KEYWORD ___vector) - whitespace, /// whitespace - specialTokenSequence, /// #line 10 "file.d" - doubleLiteral, /// 123.456 - floatLiteral, /// 123.456f or 0x123_45p-3 - idoubleLiteral, /// 123.456i - ifloatLiteral, /// 123.456fi - intLiteral, /// 123 or 0b1101010101 - longLiteral, /// 123L - realLiteral, /// 123.456L - irealLiteral, /// 123.456Li - uintLiteral, /// 123u - ulongLiteral, /// 123uL - characterLiteral, /// 'a' - dstringLiteral, /// $(D_STRING "32-bit character string"d) - stringLiteral, /// $(D_STRING "an 8-bit string") - wstringLiteral, /// $(D_STRING "16-bit character string"w) + date, /// ___DATE__ + eof, /// ___EOF__ + time, /// ___TIME__ + timestamp, /// ___TIMESTAMP__ + vendor, /// ___VENDOR__ + compilerVersion, /// ___VERSION__ + file, /// $(D_KEYWORD ___FILE__) + line, /// $(D_KEYWORD ___LINE__) + comment, /// $(D_COMMENT /** comment */) or $(D_COMMENT // comment) or $(D_COMMENT ///comment) + identifier, /// anything else + scriptLine, // Line at the beginning of source file that starts from #! 
+ traits, /// $(D_KEYWORD ___traits) + parameters, /// $(D_KEYWORD ___parameters) + vector, /// $(D_KEYWORD ___vector) + whitespace, /// whitespace + specialTokenSequence, /// #line 10 "file.d" + doubleLiteral, /// 123.456 + floatLiteral, /// 123.456f or 0x123_45p-3 + idoubleLiteral, /// 123.456i + ifloatLiteral, /// 123.456fi + intLiteral, /// 123 or 0b1101010101 + longLiteral, /// 123L + realLiteral, /// 123.456L + irealLiteral, /// 123.456Li + uintLiteral, /// 123u + ulongLiteral, /// 123uL + characterLiteral, /// 'a' + dstringLiteral, /// $(D_STRING "32-bit character string"d) + stringLiteral, /// $(D_STRING "an 8-bit string") + wstringLiteral, /// $(D_STRING "16-bit character string"w) } // Implementation details follow private: +// For now a private helper that is tailored to the way lexer works +// hides away forwardness of range by buffering +// RA-version is strightforward thin wrapping +// ATM it is byte-oriented +private struct LexSource(R) + if(isForwardRange!R && !isRandomAccessRange!R) + { + bool empty() const { return _empty; } + + auto ref front() const + { + return accum[accumIdx]; + } + + auto ref peek() const + in + { + assert (accumIdx + 1 < accum.length); + } + body + { + return accum[accumIdx + 1]; + } + + void popFront() + { + ++_index; + range.popFront(); + // if that was last byte + // just advance so that open-righted slice just works + accumIdx = (accumIdx+1) & mask; + if(range.empty) + { + _empty = true; + return; + } + if(accumIdx == savedAccumIdx) + { + // and move stuff around + auto oldLen = accum.length; + auto toCopy = oldLen - accumIdx; + accum.length *= 2; // keep pow of 2 + // copy starting with last item + copy(retro(accum[accumIdx..oldLen]), + retro(accum[$-toCopy..$])); + savedAccumIdx = accum.length - toCopy; + } + accum[accumIdx] = range.front; + } + + auto save() + { + typeof(this) copy = this; + copy.range = range.save; + // sadly need to dup circular buffer, as it overwrites items + copy.accum = copy.accum.dup; + return 
copy; + } + + // mark a position to slice from later on + size_t mark() + { + savedAccumIdx = accumIdx; + return accumIdx; + } + + // slice to current position from previously marked position + auto slice() @property + { + // it's an open right range as usual + return CircularRange(accum, savedAccumIdx, accumIdx); + } + + size_t index() const @property + { + return _index; + } + +private: + this(R src, size_t bufferSize) + { + range = src; + assert(bufferSize > 0); + assert((bufferSize & (bufferSize-1)) == 0); //is power of 2 + accum = new ubyte[bufferSize]; + if(range.empty) + _empty = true; + else + accum[accumIdx] = range.front; // load front + } + + // a true RA-range of ubyte + struct CircularRange + { + this(ubyte[] buf, size_t s, size_t e) + { + assert((buffer.length & (buffer.length-1)) == 0); + buffer = buf; + start = s; + end = e; + } + //Forward range primitives + @property bool empty() const { return start == end; } + @property auto ref front() const { return buffer[start]; } + void popFront() { start = (start + 1) & mask; } + @property auto save() { return this; } + + //Backwards is a bit slower, but should be rarely used (if at all) + @property ref back(){ return buffer[(end-1) & mask]; } + void popBack() { end = (end - 1) & mask; } + + // RA range primitives + ref opIndex(size_t idx){ return buffer[(start+idx) & mask]; } + @property size_t length() + { + return end < start ? 
end + buffer.length -start : end - start; + } + alias length opDollar; + + auto opSlice(size_t newStart, size_t newEnd) + { + size_t maskedStart = (start+newStart) & mask; + size_t maskedEnd = (start+newEnd) & mask; + return typeof(this)(buffer, maskedStart, maskedEnd); + } + // @@@bug fwd-ref in ldc0.10 (if placed above previous one) + auto opSlice(){ return opSlice(0, length); } + private: + @property auto mask(){ return buffer.length-1; } + size_t start, end; + ubyte[] buffer; + } + + @property auto mask(){ return accum.length-1; } + + R range; + bool _empty; + ubyte[] accum; // accumulator buffer for non-RA ranges + size_t savedAccumIdx; + size_t accumIdx; // current index in accumulator + size_t _index; // index of current element in original range +} + +// TODO: make sure it's RandomAccess later +/*static assert(isRandomAccessRange!( + LexSource!(typeof(filter!"true"(cast(ubyte[])null))) + .CircularRange) +);*/ + +//trivial pass-through for RA ranges +private struct LexSource(R) + if(isRandomAccessRange!R) +{ + bool empty() const @property { return cur >= range.length; } + bool canPeek() const { return cur + 1 < range.length; } + auto ref front() const @property { return range[cur]; } + void popFront(){ cur++; } + + auto ref peek() const + in + { + assert (canPeek()); + } + body + { + return range[cur + 1]; + } + + auto save() + { + typeof(this) copy = this; + copy.range = range.save; + return copy; + } + + auto mark() + { + saved = cur; + } + + // use the underliying range slicing capability + auto slice() @property + { + return range[saved..cur]; + } + + size_t index() const @property + { + return cur; + } + +private: + this(R src) + { + range = src; + } + size_t cur, saved; + R range; +} + +auto lexerSource(Range)(Range range, size_t bufSize=8) + if(isForwardRange!Range && !isRandomAccessRange!Range + && is(ElementType!Range : const(ubyte))) +{ + return LexSource!(Range)(range, bufSize); +} + +auto lexerSource(Range)(Range range) + 
if(isRandomAccessRange!Range + && is(ElementType!Range : const(ubyte))) +{ + return LexSource!(Range)(range); +} + +unittest +{ + // test the basic functionality of a "mark-slice" range + import std.string, std.stdio; + + static void test_hello(T)(T lexs) + { + assert(lexs.front == 'H'); + lexs.popFront(); + assert(lexs.front == 'e'); + foreach(i; 0..2) + { + auto saved = lexs.save; + lexs.mark(); + assert(lexs.slice.equal("")); + lexs.popFront(); + assert(lexs.slice.equal("e"), text(cast(char)lexs.front)); + lexs.popFrontN(4); + auto bytes = lexs.slice.map!"cast(char)a".array(); + assert(bytes.equal("ello,"), bytes.to!string); + lexs.mark(); + assert(lexs.slice.equal("")); + assert(lexs.front == 'w'); + lexs.popFrontN(6); + assert(lexs.empty); + auto s = lexs.slice(); + auto msg = s.save.map!"cast(char)a".array; + assert(s[].equal("world!"), msg); + assert(s[2..$-1].equal("rld"), msg); + assert(s[0] == 'w' && s[$-1] == '!'); + s.popFront(); + assert(s.front == 'o' && s.back == '!'); + s.popBack(); + assert(s.front == 'o' && s.back == 'd'); + //restore and repeat again + lexs = saved; + } + } + + static void test_empty(T)(T lexs) + { + assert(lexs.empty); + lexs.mark(); + assert(lexs.slice().equal("")); + } + + auto fwdLex = lexerSource( + "Hello, world!" + .representation + .filter!"a != ' '", 16 // and the one that is more then enough + ); + test_hello(fwdLex); + fwdLex = lexerSource( + "Hello, world!" 
+ .representation + .filter!"a != ' '", 1 // try the smallest initial buffer + ); + test_hello(fwdLex); + fwdLex = lexerSource("".representation.filter!"a != ' '"); + auto raLex = lexerSource("".representation); + test_empty(raLex); + test_empty(fwdLex); + raLex = lexerSource("Hello,world!".representation); + test_hello(raLex); +} + // uses auto-detection for pure, safe nothrow bool isRangeEoF(R)(ref R range) { - return range.empty || range.front == 0 || range.front == 0x1a; + return range.empty || range.front == 0 || range.front == 0x1a; } -/* -* Slices of the above string to save memory. This array is automatically -* generated. -*/ +// Lookup table for token values immutable(string[TokenType.max + 1]) tokenValues = [ - "=", - "@", - "&", - "&=", - "|", - "|=", - "~=", - ":", - ",", - "--", - "/", - "/=", - "$", - ".", - "==", - "=>", - ">", - ">=", - "#", - "++", - "{", - "[", - "<", - "<=", - "<>=", - "<>", - "&&", - "||", - "(", - "-", - "-=", - "%", - "%=", - "*=", - "!", - "!=", - "!>", - "!>=", - "!<", - "!<=", - "!<>", - "+", - "+=", - "^^", - "^^=", - "}", - "]", - ")", - ";", - "<<", - "<<=", - ">>", - ">>=", - "..", - "*", - "?", - "~", - "!<>=", - ">>>", - ">>>=", - "...", - "^", - "^=", - "bool", - "byte", - "cdouble", - "cent", - "cfloat", - "char", - "creal", - "dchar", - "double", - "float", - "function", - "idouble", - "ifloat", - "int", - "ireal", - "long", - "real", - "short", - "ubyte", - "ucent", - "uint", - "ulong", - "ushort", - "void", - "wchar", - "align", - "deprecated", - "extern", - "pragma", - "export", - "package", - "private", - "protected", - "public", - "abstract", - "auto", - "const", - "final", - "__gshared", - "immutable", - "inout", - "scope", - "shared", - "static", - "synchronized", - "alias", - "asm", - "assert", - "body", - "break", - "case", - "cast", - "catch", - "class", - "continue", - "debug", - "default", - "delegate", - "delete", - "do", - "else", - "enum", - "false", - "finally", - "foreach", - "foreach_reverse", - 
"for", - "goto", - "if", - "import", - "in", - "interface", - "invariant", - "is", - "lazy", - "macro", - "mixin", - "module", - "new", - "nothrow", - "null", - "out", - "override", - "pure", - "ref", - "return", - "struct", - "super", - "switch", - "template", - "this", - "throw", - "true", - "try", - "typedef", - "typeid", - "typeof", - "union", - "unittest", - "version", - "volatile", - "while", - "with", - "__DATE__", - "__EOF__", - "__TIME__", - "__TIMESTAMP__", - "__VENDOR__", - "__VERSION__", - "__FILE__", - "__LINE__", - null, - null, - null, - "__traits", - "__parameters", - "__vector", - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, - null, + "=", + "@", + "&", + "&=", + "|", + "|=", + "~=", + ":", + ",", + "--", + "/", + "/=", + "$", + ".", + "==", + "=>", + ">", + ">=", + "#", + "++", + "{", + "[", + "<", + "<=", + "<>=", + "<>", + "&&", + "||", + "(", + "-", + "-=", + "%", + "%=", + "*=", + "!", + "!=", + "!>", + "!>=", + "!<", + "!<=", + "!<>", + "+", + "+=", + "^^", + "^^=", + "}", + "]", + ")", + ";", + "<<", + "<<=", + ">>", + ">>=", + "..", + "*", + "?", + "~", + "!<>=", + ">>>", + ">>>=", + "...", + "^", + "^=", + "bool", + "byte", + "cdouble", + "cent", + "cfloat", + "char", + "creal", + "dchar", + "double", + "float", + "function", + "idouble", + "ifloat", + "int", + "ireal", + "long", + "real", + "short", + "ubyte", + "ucent", + "uint", + "ulong", + "ushort", + "void", + "wchar", + "align", + "deprecated", + "extern", + "pragma", + "export", + "package", + "private", + "protected", + "public", + "abstract", + "auto", + "const", + "final", + "__gshared", + "immutable", + "inout", + "scope", + "shared", + "static", + "synchronized", + "alias", + "asm", + "assert", + "body", + "break", + "case", + "cast", + "catch", + "class", + "continue", + "debug", + "default", + "delegate", + "delete", + "do", + "else", + "enum", + "false", + "finally", + "foreach", + "foreach_reverse", + 
"for", + "goto", + "if", + "import", + "in", + "interface", + "invariant", + "is", + "lazy", + "macro", + "mixin", + "module", + "new", + "nothrow", + "null", + "out", + "override", + "pure", + "ref", + "return", + "struct", + "super", + "switch", + "template", + "this", + "throw", + "true", + "try", + "typedef", + "typeid", + "typeof", + "union", + "unittest", + "version", + "volatile", + "while", + "with", + "__DATE__", + "__EOF__", + "__TIME__", + "__TIMESTAMP__", + "__VENDOR__", + "__VERSION__", + "__FILE__", + "__LINE__", + null, + null, + null, + "__traits", + "__parameters", + "__vector", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, ]; pure string getTokenValue(const TokenType type) { - return tokenValues[type]; + return tokenValues[type]; } private pure bool isNewline(ubyte ch) { - return ch == '\n' || ch == '\r'; + return ch == '\n' || ch == '\r'; } pure TokenType lookupTokenType(R)(R input) { - switch(input.length) - { - case 2: - switch (input[0]) - { - case 'd': if (input[1] == 'o') return TokenType.do_; else break; - case 'i': - if (input[1] == 'f') return TokenType.if_; - else if (input[1] == 'n') return TokenType.in_; - else if (input[1] == 's') return TokenType.is_; - else break; - default: break; - } - break; - case 3: - switch (input[0]) - { - case 'a': if (input[1..$].equal("sm")) return TokenType.asm_; else break; - case 'f': if (input[1..$].equal("or")) return TokenType.for_; else break; - case 'i': if (input[1..$].equal("nt")) return TokenType.int_; else break; - case 'n': if (input[1..$].equal("ew")) return TokenType.new_; else break; - case 'o': if (input[1..$].equal("ut")) return TokenType.out_; else break; - case 'r': if (input[1..$].equal("ef")) return TokenType.ref_; else break; - case 't': if (input[1..$].equal("ry")) return TokenType.try_; else break; - default: break; - } - break; - case 4: - switch (input[0]) - { - case 'a': if 
(input[1..$].equal("uto")) return TokenType.auto_; else break; - case 'b': if (input[1..$].equal("ody")) return TokenType.body_; - else if (input[1..$].equal("ool")) return TokenType.bool_; - else if (input[1..$].equal("yte")) return TokenType.byte_; - else break; - case 'c': if (input[1..$].equal("ase")) return TokenType.case_; - else if (input[1..$].equal("ast")) return TokenType.cast_; - else if (input[1..$].equal("ent")) return TokenType.cent_; - else if (input[1..$].equal("har")) return TokenType.char_; - else break; - case 'e': if (input[1..$].equal("lse")) return TokenType.else_; - else if (input[1..$].equal("num")) return TokenType.enum_; - else break; - case 'g': if (input[1..$].equal("oto")) return TokenType.goto_; else break; - case 'l': if (input[1..$].equal("azy")) return TokenType.lazy_; - else if (input[1..$].equal("ong")) return TokenType.long_; - else break; - case 'n': if (input[1..$].equal("ull")) return TokenType.null_; else break; - case 'p': if (input[1..$].equal("ure")) return TokenType.pure_; else break; - case 'r': if (input[1..$].equal("eal")) return TokenType.real_; else break; - case 't': if (input[1..$].equal("his")) return TokenType.this_; - else if (input[1..$].equal("rue")) return TokenType.true_; - else break; - case 'u': if (input[1..$].equal("int")) return TokenType.uint_; else break; - case 'v': if (input[1..$].equal("oid")) return TokenType.void_; else break; - case 'w': if (input[1..$].equal("ith")) return TokenType.with_; else break; - default: break; - } - break; - case 5: - switch (input[0]) - { - case 'a': if (input[1..$].equal("lias")) return TokenType.alias_; - else if (input[1..$].equal("lign")) return TokenType.align_; else break; - case 'b': if (input[1..$].equal("reak")) return TokenType.break_; else break; - case 'c': if (input[1..$].equal("atch")) return TokenType.catch_; - else if (input[1..$].equal("lass")) return TokenType.class_; - else if (input[1..$].equal("onst")) return TokenType.const_; - else if 
(input[1..$].equal("real")) return TokenType.creal_; - else break; - case 'd': if (input[1..$].equal("char")) return TokenType.dchar_; - else if (input[1..$].equal("ebug")) return TokenType.debug_; else break; - case 'f': if (input[1..$].equal("alse")) return TokenType.false_; - else if (input[1..$].equal("inal")) return TokenType.final_; - else if (input[1..$].equal("loat")) return TokenType.float_; - else break; - case 'i': if (input[1..$].equal("nout")) return TokenType.inout_; - else if (input[1..$].equal("real")) return TokenType.ireal_; else break; - case 'm': if (input[1..$].equal("acro")) return TokenType.macro_; - else if (input[1..$].equal("ixin")) return TokenType.mixin_; else break; - case 's': if (input[1..$].equal("cope")) return TokenType.scope_; - else if (input[1..$].equal("hort")) return TokenType.short_; - else if (input[1..$].equal("uper")) return TokenType.super_; else break; - case 't': if (input[1..$].equal("hrow")) return TokenType.throw_; else break; - case 'u': if (input[1..$].equal("byte")) return TokenType.ubyte_; - else if (input[1..$].equal("cent")) return TokenType.ucent_; - else if (input[1..$].equal("long")) return TokenType.ulong_; - else if (input[1..$].equal("nion")) return TokenType.union_; - else break; - case 'w': if (input[1..$].equal("char")) return TokenType.wchar_; - else if (input[1..$].equal("hile")) return TokenType.while_; - else break; - default: break; - } - break; - case 6: - switch (input[0]) - { - case 'a': if (input[1..$].equal("ssert")) return TokenType.assert_; else break; - case 'c': if (input[1..$].equal("float")) return TokenType.cfloat_; else break; - case 'd': if (input[1..$].equal("elete")) return TokenType.delete_; - else if (input[1..$].equal("ouble")) return TokenType.double_; else break; - case 'e': if (input[1..$].equal("xport")) return TokenType.export_; - else if (input[1..$].equal("xtern")) return TokenType.extern_; else break; - case 'i': if (input[1..$].equal("float")) return TokenType.ifloat_; 
- else if (input[1..$].equal("mport")) return TokenType.import_; else break; - case 'm': if (input[1..$].equal("odule")) return TokenType.module_; else break; - case 'p': if (input[1..$].equal("ragma")) return TokenType.pragma_; - else if (input[1..$].equal("ublic")) return TokenType.public_; else break; - case 'r': if (input[1..$].equal("eturn")) return TokenType.return_; else break; - case 's': if (input[1..$].equal("hared")) return TokenType.shared_; - else if (input[1..$].equal("tatic")) return TokenType.static_; - else if (input[1..$].equal("truct")) return TokenType.struct_; - else if (input[1..$].equal("witch")) return TokenType.switch_; else break; - case 't': if (input[1..$].equal("ypeid")) return TokenType.typeid_; - else if (input[1..$].equal("ypeof")) return TokenType.typeof_; else break; - case 'u': if (input[1..$].equal("short")) return TokenType.ushort_; else break; - default: break; - } - break; - case 7: - switch (input[0]) - { - case '_': if (input[1..$].equal("_EOF__")) return TokenType.eof; else break; - case 'c': if (input[1..$].equal("double")) return TokenType.cdouble_; else break; - case 'd': if (input[1..$].equal("efault")) return TokenType.default_; else break; - case 'f': if (input[1..$].equal("inally")) return TokenType.finally_; - else if (input[1..$].equal("oreach")) return TokenType.foreach_; else break; - case 'i': if (input[1..$].equal("double")) return TokenType.idouble_; else break; - case 'n': if (input[1..$].equal("othrow")) return TokenType.nothrow_; else break; - case 'p': if (input[1..$].equal("ackage")) return TokenType.package_; - else if (input[1..$].equal("rivate")) return TokenType.private_; else break; - case 't': if (input[1..$].equal("ypedef")) return TokenType.typedef_; else break; - case 'v': if (input[1..$].equal("ersion")) return TokenType.version_; else break; - default: break; - } - break; - case 8: - switch (input[0]) - { - case '_': if (input[1..$].equal("_DATE__")) return TokenType.date; - else if 
(input[1..$].equal("_FILE__")) return TokenType.file; - else if (input[1..$].equal("_LINE__")) return TokenType.line; - else if (input[1..$].equal("_TIME__")) return TokenType.time; - else if (input[1..$].equal("_traits")) return TokenType.traits; else break; - case 'a': if (input[1..$].equal("bstract")) return TokenType.abstract_; else break; - case 'c': if (input[1..$].equal("ontinue")) return TokenType.continue_; else break; - case 'd': if (input[1..$].equal("elegate")) return TokenType.delegate_; else break; - case 'f': if (input[1..$].equal("unction")) return TokenType.function_; else break; - case 'o': if (input[1..$].equal("verride")) return TokenType.override_; else break; - case 't': if (input[1..$].equal("emplate")) return TokenType.template_; else break; - case 'u': if (input[1..$].equal("nittest")) return TokenType.unittest_; else break; - case 'v': if (input[1..$].equal("olatile")) return TokenType.volatile_; else break; - default: break; - } - break; - case 9: - switch (input[0]) - { - case '_': if (input[1..$].equal("_gshared")) return TokenType.gshared; else break; - case 'i': if (input[1..$].equal("mmutable")) return TokenType.immutable_; - else if (input[1..$].equal("nterface")) return TokenType.interface_; - else if (input[1..$].equal("nvariant")) return TokenType.invariant_; else break; - case 'p': if (input[1..$].equal("rotected")) return TokenType.protected_; else break; - default: break; - } - break; - case 10: - switch (input[0]) - { - case 'd': if (input[1..$].equal("eprecated")) return TokenType.deprecated_; else break; - case '_': if (input[1..$].equal("_VENDOR__")) return TokenType.vendor; else break; - default: break; - } - break; - case 11: - if (input[1..$].equal("_VERSION__")) - return TokenType.compilerVersion; - break; - case 12: - if (input[1..$].equal("ynchronized")) - return TokenType.synchronized_; - break; - case 13: - if (input[1..$].equal("_TIMESTAMP__")) - return TokenType.timestamp; - break; - case 15: - if 
(input[1..$].equal("oreach_reverse")) - return TokenType.foreach_reverse_; - break; - default: break; - } - return TokenType.identifier; + switch(input.length) + { + case 2: + switch (input[0]) + { + case 'd': if (input[1] == 'o') return TokenType.do_; else break; + case 'i': + if (input[1] == 'f') return TokenType.if_; + else if (input[1] == 'n') return TokenType.in_; + else if (input[1] == 's') return TokenType.is_; + else break; + default: break; + } + break; + case 3: + switch (input[0]) + { + case 'a': if (input[1..$].equal("sm")) return TokenType.asm_; else break; + case 'f': if (input[1..$].equal("or")) return TokenType.for_; else break; + case 'i': if (input[1..$].equal("nt")) return TokenType.int_; else break; + case 'n': if (input[1..$].equal("ew")) return TokenType.new_; else break; + case 'o': if (input[1..$].equal("ut")) return TokenType.out_; else break; + case 'r': if (input[1..$].equal("ef")) return TokenType.ref_; else break; + case 't': if (input[1..$].equal("ry")) return TokenType.try_; else break; + default: break; + } + break; + case 4: + switch (input[0]) + { + case 'a': if (input[1..$].equal("uto")) return TokenType.auto_; else break; + case 'b': if (input[1..$].equal("ody")) return TokenType.body_; + else if (input[1..$].equal("ool")) return TokenType.bool_; + else if (input[1..$].equal("yte")) return TokenType.byte_; + else break; + case 'c': if (input[1..$].equal("ase")) return TokenType.case_; + else if (input[1..$].equal("ast")) return TokenType.cast_; + else if (input[1..$].equal("ent")) return TokenType.cent_; + else if (input[1..$].equal("har")) return TokenType.char_; + else break; + case 'e': if (input[1..$].equal("lse")) return TokenType.else_; + else if (input[1..$].equal("num")) return TokenType.enum_; + else break; + case 'g': if (input[1..$].equal("oto")) return TokenType.goto_; else break; + case 'l': if (input[1..$].equal("azy")) return TokenType.lazy_; + else if (input[1..$].equal("ong")) return TokenType.long_; + else 
break; + case 'n': if (input[1..$].equal("ull")) return TokenType.null_; else break; + case 'p': if (input[1..$].equal("ure")) return TokenType.pure_; else break; + case 'r': if (input[1..$].equal("eal")) return TokenType.real_; else break; + case 't': if (input[1..$].equal("his")) return TokenType.this_; + else if (input[1..$].equal("rue")) return TokenType.true_; + else break; + case 'u': if (input[1..$].equal("int")) return TokenType.uint_; else break; + case 'v': if (input[1..$].equal("oid")) return TokenType.void_; else break; + case 'w': if (input[1..$].equal("ith")) return TokenType.with_; else break; + default: break; + } + break; + case 5: + switch (input[0]) + { + case 'a': if (input[1..$].equal("lias")) return TokenType.alias_; + else if (input[1..$].equal("lign")) return TokenType.align_; else break; + case 'b': if (input[1..$].equal("reak")) return TokenType.break_; else break; + case 'c': if (input[1..$].equal("atch")) return TokenType.catch_; + else if (input[1..$].equal("lass")) return TokenType.class_; + else if (input[1..$].equal("onst")) return TokenType.const_; + else if (input[1..$].equal("real")) return TokenType.creal_; + else break; + case 'd': if (input[1..$].equal("char")) return TokenType.dchar_; + else if (input[1..$].equal("ebug")) return TokenType.debug_; else break; + case 'f': if (input[1..$].equal("alse")) return TokenType.false_; + else if (input[1..$].equal("inal")) return TokenType.final_; + else if (input[1..$].equal("loat")) return TokenType.float_; + else break; + case 'i': if (input[1..$].equal("nout")) return TokenType.inout_; + else if (input[1..$].equal("real")) return TokenType.ireal_; else break; + case 'm': if (input[1..$].equal("acro")) return TokenType.macro_; + else if (input[1..$].equal("ixin")) return TokenType.mixin_; else break; + case 's': if (input[1..$].equal("cope")) return TokenType.scope_; + else if (input[1..$].equal("hort")) return TokenType.short_; + else if (input[1..$].equal("uper")) return 
TokenType.super_; else break; + case 't': if (input[1..$].equal("hrow")) return TokenType.throw_; else break; + case 'u': if (input[1..$].equal("byte")) return TokenType.ubyte_; + else if (input[1..$].equal("cent")) return TokenType.ucent_; + else if (input[1..$].equal("long")) return TokenType.ulong_; + else if (input[1..$].equal("nion")) return TokenType.union_; + else break; + case 'w': if (input[1..$].equal("char")) return TokenType.wchar_; + else if (input[1..$].equal("hile")) return TokenType.while_; + else break; + default: break; + } + break; + case 6: + switch (input[0]) + { + case 'a': if (input[1..$].equal("ssert")) return TokenType.assert_; else break; + case 'c': if (input[1..$].equal("float")) return TokenType.cfloat_; else break; + case 'd': if (input[1..$].equal("elete")) return TokenType.delete_; + else if (input[1..$].equal("ouble")) return TokenType.double_; else break; + case 'e': if (input[1..$].equal("xport")) return TokenType.export_; + else if (input[1..$].equal("xtern")) return TokenType.extern_; else break; + case 'i': if (input[1..$].equal("float")) return TokenType.ifloat_; + else if (input[1..$].equal("mport")) return TokenType.import_; else break; + case 'm': if (input[1..$].equal("odule")) return TokenType.module_; else break; + case 'p': if (input[1..$].equal("ragma")) return TokenType.pragma_; + else if (input[1..$].equal("ublic")) return TokenType.public_; else break; + case 'r': if (input[1..$].equal("eturn")) return TokenType.return_; else break; + case 's': if (input[1..$].equal("hared")) return TokenType.shared_; + else if (input[1..$].equal("tatic")) return TokenType.static_; + else if (input[1..$].equal("truct")) return TokenType.struct_; + else if (input[1..$].equal("witch")) return TokenType.switch_; else break; + case 't': if (input[1..$].equal("ypeid")) return TokenType.typeid_; + else if (input[1..$].equal("ypeof")) return TokenType.typeof_; else break; + case 'u': if (input[1..$].equal("short")) return 
TokenType.ushort_; else break; + default: break; + } + break; + case 7: + switch (input[0]) + { + case '_': if (input[1..$].equal("_EOF__")) return TokenType.eof; else break; + case 'c': if (input[1..$].equal("double")) return TokenType.cdouble_; else break; + case 'd': if (input[1..$].equal("efault")) return TokenType.default_; else break; + case 'f': if (input[1..$].equal("inally")) return TokenType.finally_; + else if (input[1..$].equal("oreach")) return TokenType.foreach_; else break; + case 'i': if (input[1..$].equal("double")) return TokenType.idouble_; else break; + case 'n': if (input[1..$].equal("othrow")) return TokenType.nothrow_; else break; + case 'p': if (input[1..$].equal("ackage")) return TokenType.package_; + else if (input[1..$].equal("rivate")) return TokenType.private_; else break; + case 't': if (input[1..$].equal("ypedef")) return TokenType.typedef_; else break; + case 'v': if (input[1..$].equal("ersion")) return TokenType.version_; else break; + default: break; + } + break; + case 8: + switch (input[0]) + { + case '_': if (input[1..$].equal("_DATE__")) return TokenType.date; + else if (input[1..$].equal("_FILE__")) return TokenType.file; + else if (input[1..$].equal("_LINE__")) return TokenType.line; + else if (input[1..$].equal("_TIME__")) return TokenType.time; + else if (input[1..$].equal("_traits")) return TokenType.traits; else break; + case 'a': if (input[1..$].equal("bstract")) return TokenType.abstract_; else break; + case 'c': if (input[1..$].equal("ontinue")) return TokenType.continue_; else break; + case 'd': if (input[1..$].equal("elegate")) return TokenType.delegate_; else break; + case 'f': if (input[1..$].equal("unction")) return TokenType.function_; else break; + case 'o': if (input[1..$].equal("verride")) return TokenType.override_; else break; + case 't': if (input[1..$].equal("emplate")) return TokenType.template_; else break; + case 'u': if (input[1..$].equal("nittest")) return TokenType.unittest_; else break; + case 'v': 
if (input[1..$].equal("olatile")) return TokenType.volatile_; else break; + default: break; + } + break; + case 9: + switch (input[0]) + { + case '_': if (input[1..$].equal("_gshared")) return TokenType.gshared; else break; + case 'i': if (input[1..$].equal("mmutable")) return TokenType.immutable_; + else if (input[1..$].equal("nterface")) return TokenType.interface_; + else if (input[1..$].equal("nvariant")) return TokenType.invariant_; else break; + case 'p': if (input[1..$].equal("rotected")) return TokenType.protected_; else break; + default: break; + } + break; + case 10: + switch (input[0]) + { + case 'd': if (input[1..$].equal("eprecated")) return TokenType.deprecated_; else break; + case '_': if (input[1..$].equal("_VENDOR__")) return TokenType.vendor; else break; + default: break; + } + break; + case 11: + if (input[1..$].equal("_VERSION__")) + return TokenType.compilerVersion; + break; + case 12: + if (input[1..$].equal("ynchronized")) + return TokenType.synchronized_; + break; + case 13: + if (input[1..$].equal("_TIMESTAMP__")) + return TokenType.timestamp; + break; + case 15: + if (input[1..$].equal("oreach_reverse")) + return TokenType.foreach_reverse_; + break; + default: break; + } + return TokenType.identifier; } class Trie(K, V) if (isInputRange!K): TrieNode!(K, V) { - /** - * Adds the given value to the trie with the given key - */ - void add(K key, V value) pure - { - TrieNode!(K,V) current = this; - foreach(keyPart; key) - { - if ((keyPart in current.children) is null) - { - auto node = new TrieNode!(K, V); - current.children[keyPart] = node; - current = node; - } - else - current = current.children[keyPart]; - } - current.value = value; - } + /** + * Adds the given value to the trie with the given key + */ + void add(K key, V value) pure + { + TrieNode!(K,V) current = this; + foreach(keyPart; key) + { + if ((keyPart in current.children) is null) + { + auto node = new TrieNode!(K, V); + current.children[keyPart] = node; + current = node; + } + 
else + current = current.children[keyPart]; + } + current.value = value; + } } class TrieNode(K, V) if (isInputRange!K) { - V value; - TrieNode!(K,V)[ElementType!K] children; + V value; + TrieNode!(K,V)[ElementType!K] children; } string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString) { - string caseStatement = ""; - foreach(dchar k, TrieNode!(K,V) v; node.children) - { - caseStatement ~= indentString; - caseStatement ~= "case '"; - caseStatement ~= k; - caseStatement ~= "':\n"; - caseStatement ~= indentString; - caseStatement ~= "\tnextCharNonLF();\n"; - if (v.children.length > 0) - { - caseStatement ~= indentString; - caseStatement ~= "\tif (isEoF())\n"; - caseStatement ~= indentString; - caseStatement ~= "\t{\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\tcurrent.value = getTokenValue(current.type);\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\tcurrent.type = " ~ node.children[k].value; - caseStatement ~= ";\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\treturn;\n"; - caseStatement ~= indentString; - caseStatement ~= "\t}\n"; - caseStatement ~= indentString; - caseStatement ~= "\tswitch (src.front)\n"; - caseStatement ~= indentString; - caseStatement ~= "\t{\n"; - caseStatement ~= printCaseStatements(v, indentString ~ "\t"); - caseStatement ~= indentString; - caseStatement ~= "\tdefault:\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\tcurrent.type = "; - caseStatement ~= v.value; - caseStatement ~= ";\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\tcurrent.value = getTokenValue(current.type);\n"; - caseStatement ~= indentString; - caseStatement ~= "\t\treturn;\n"; - caseStatement ~= indentString; - caseStatement ~= "\t}\n"; - } - else - { - caseStatement ~= indentString; - caseStatement ~= "\tcurrent.type = "; - caseStatement ~= v.value; - caseStatement ~= ";\n"; - caseStatement ~= indentString; - caseStatement ~= "\tcurrent.value = getTokenValue(current.type);\n"; - 
caseStatement ~= indentString; - caseStatement ~= "\treturn;\n"; - } - } - return caseStatement; + string caseStatement = ""; + foreach(dchar k, TrieNode!(K,V) v; node.children) + { + caseStatement ~= indentString; + caseStatement ~= "case '"; + caseStatement ~= k; + caseStatement ~= "':\n"; + caseStatement ~= indentString; + caseStatement ~= "\tnextCharNonLF();\n"; + if (v.children.length > 0) + { + caseStatement ~= indentString; + caseStatement ~= "\tif (isEoF())\n"; + caseStatement ~= indentString; + caseStatement ~= "\t{\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\tcurrent.value = getTokenValue(current.type);\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\tcurrent.type = " ~ node.children[k].value; + caseStatement ~= ";\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\treturn;\n"; + caseStatement ~= indentString; + caseStatement ~= "\t}\n"; + caseStatement ~= indentString; + caseStatement ~= "\tswitch (src.front)\n"; + caseStatement ~= indentString; + caseStatement ~= "\t{\n"; + caseStatement ~= printCaseStatements(v, indentString ~ "\t"); + caseStatement ~= indentString; + caseStatement ~= "\tdefault:\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\tcurrent.type = "; + caseStatement ~= v.value; + caseStatement ~= ";\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\tcurrent.value = getTokenValue(current.type);\n"; + caseStatement ~= indentString; + caseStatement ~= "\t\treturn;\n"; + caseStatement ~= indentString; + caseStatement ~= "\t}\n"; + } + else + { + caseStatement ~= indentString; + caseStatement ~= "\tcurrent.type = "; + caseStatement ~= v.value; + caseStatement ~= ";\n"; + caseStatement ~= indentString; + caseStatement ~= "\tcurrent.value = getTokenValue(current.type);\n"; + caseStatement ~= indentString; + caseStatement ~= "\treturn;\n"; + } + } + return caseStatement; } string generateCaseTrie(string[] args ...) 
{ - auto t = new Trie!(string, string); - for(int i = 0; i < args.length; i+=2) - { - t.add(args[i], args[i+1]); - } - return printCaseStatements(t, ""); + auto t = new Trie!(string, string); + for(int i = 0; i < args.length; i+=2) + { + t.add(args[i], args[i+1]); + } + return printCaseStatements(t, ""); } struct StringCache { - string get(R)(R range) - if(isRandomAccessRange!R - && is(Unqual!(ElementType!R) : const(ubyte))) - { - - uint h = hash(range); - uint bucket = h % mapSize; + string get(R)(R range) + if(isRandomAccessRange!R + && is(Unqual!(ElementType!R) : const(ubyte))) + { + + uint h = hash(range); + uint bucket = h % mapSize; Slot *s = &index[bucket]; //1st slot not yet initialized? - if(s.value.ptr == null) + if(s.value.ptr == null) { *s = Slot(putIntoCache(range), null, h); return s.value; } Slot* insSlot = s; - for(;;) - { - if(s.hash == h && s.value.equal(range)) + for(;;) + { + if(s.hash == h && s.value.equal(range)) return s.value; insSlot = s; s = s.next; if(s == null) break; - } + } string str = putIntoCache(range); insertIntoSlot(insSlot, str, h); return str; - } + } private: - static uint hash(R)(R data) - { - uint hash = 0; - foreach (b; data) - { - hash ^= sbox[b]; - hash *= 3; - } - return hash; - } + static uint hash(R)(R data) + { + uint hash = 0; + foreach (b; data) + { + hash ^= sbox[b]; + hash *= 3; + } + return hash; + } + + enum mapSize = 2048; - enum mapSize = 2048; - struct Slot { string value; Slot* next; uint hash; }; - + void insertIntoSlot(Slot* tgt, string val, uint hash) { auto slice = allocateInCache(Slot.sizeof); @@ -3108,291 +3098,290 @@ private: *newSlot = Slot(val, null, hash); tgt.next = newSlot; } - + Slot[mapSize] index; - - // leave some slack for alloctors/GC meta-data - enum chunkSize = 16*1024 - size_t.sizeof*8; - ubyte*[] chunkS; - size_t next = chunkSize; - + + // leave some slack for alloctors/GC meta-data + enum chunkSize = 16*1024 - size_t.sizeof*8; + ubyte*[] chunkS; + size_t next = chunkSize; + ubyte[] 
allocateInCache(size_t size) { import core.memory; if(next + size > chunkSize) - { - // avoid huge allocations - if(size> chunkSize/4) + { + // avoid huge allocations + if(size> chunkSize/4) { ubyte* p = cast(ubyte*)GC.malloc(size, GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR); return p[0..size]; } - chunkS ~= cast(ubyte*)GC.malloc(chunkSize, - GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR); - next = 0; - } + chunkS ~= cast(ubyte*)GC.malloc(chunkSize, + GC.BlkAttr.NO_SCAN | GC.BlkAttr.NO_INTERIOR); + next = 0; + } auto slice = chunkS[$-1][next..next+size]; - next += size; + next += size; return slice; } - string putIntoCache(R)(R data) - { + string putIntoCache(R)(R data) + { auto slice = allocateInCache(data.length); - slice[] = data[]; - return cast(string)slice; - } + slice[] = data[]; + return cast(string)slice; + } } immutable uint[] sbox = [ - 0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53, - 0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982, - 0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56, - 0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300, - 0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991, - 0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8, - 0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8, - 0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7, - 0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0, - 0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26, - 0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595, - 0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB, - 0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1, - 0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8, - 0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03, - 0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C, - 0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6, - 0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B, - 0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3, - 0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B, - 0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A, - 0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292, - 0xB658588D, 0x33EBEFA9, 0x938CBF06, 
0x06B67381, - 0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3, - 0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154, - 0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761, - 0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1, - 0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F, - 0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7, - 0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE, - 0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1, - 0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81, - 0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F, - 0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D, - 0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF, - 0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0, - 0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF, - 0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94, - 0x52B7CCDE, 0x0E33E801, 0x664455EA, 0xF2C70414, - 0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA, - 0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089, - 0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F, - 0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802, - 0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5, - 0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74, - 0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629, - 0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843, - 0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27, - 0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B, - 0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C, - 0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F, - 0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6, - 0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933, - 0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F, - 0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5, - 0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E, - 0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2, - 0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA, - 0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE, - 0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64, - 0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B, - 0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4, - 0xA0B38F96, 0x51D39199, 0x37A6AD75, 
0xDF84EE41, - 0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A, + 0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53, + 0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982, + 0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56, + 0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300, + 0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991, + 0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8, + 0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8, + 0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7, + 0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0, + 0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26, + 0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595, + 0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB, + 0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1, + 0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8, + 0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03, + 0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C, + 0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6, + 0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B, + 0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3, + 0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B, + 0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A, + 0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292, + 0xB658588D, 0x33EBEFA9, 0x938CBF06, 0x06B67381, + 0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3, + 0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154, + 0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761, + 0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1, + 0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F, + 0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7, + 0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE, + 0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1, + 0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81, + 0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F, + 0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D, + 0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF, + 0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0, + 0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF, + 0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94, + 0x52B7CCDE, 0x0E33E801, 0x664455EA, 
0xF2C70414, + 0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA, + 0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089, + 0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F, + 0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802, + 0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5, + 0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74, + 0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629, + 0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843, + 0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27, + 0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B, + 0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C, + 0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F, + 0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6, + 0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933, + 0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F, + 0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5, + 0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E, + 0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2, + 0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA, + 0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE, + 0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64, + 0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B, + 0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4, + 0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41, + 0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A, ]; unittest { - LexerConfig cfg; - auto tkr = "void main(){ }".representation.byToken(cfg); - assert(tkr.map!"a.value".equal(["void", "main", "(", ")", "{", "}"])); - tkr = "1234 54.23232".representation.byToken(cfg); - assert(tkr.equal(["1234", "54.23232"])); - auto str = r"0 0. 
.0 1 0x3 0b102 007"; - cfg.iterStyle = IterationStyle.everything; - tkr = str.representation.byToken(cfg); - assert(tkr.map!"a.value".equal(["0", " ", "0.", " ", - ".0", " ", "1", " ", "0x3", " ", "0b10", - "2", " ", "007"] - ), text(tkr.map!"a.value")); + LexerConfig cfg; + auto tkr = "void main(){ }".representation.byToken(cfg); + assert(tkr.map!"a.value".equal(["void", "main", "(", ")", "{", "}"])); + tkr = "1234 54.23232".representation.byToken(cfg); + assert(tkr.equal(["1234", "54.23232"])); + auto str = r"0 0. .0 1 0x3 0b102 007"; + cfg.iterStyle = IterationStyle.everything; + tkr = str.representation.byToken(cfg); + assert(tkr.map!"a.value".equal(["0", " ", "0.", " ", + ".0", " ", "1", " ", "0x3", " ", "0b10", + "2", " ", "007"] + ), text(tkr.map!"a.value")); } unittest { - import std.stdio; - auto source = cast(ubyte[]) ( - " bool byte cdouble cent cfloat char creal dchar double float function" - ~ " idouble ifloat int ireal long real short ubyte ucent uint ulong" - ~ " ushort void wchar align deprecated extern pragma export package private" - ~ " protected public abstract auto const final __gshared immutable inout" - ~ " scope shared static synchronized alias asm assert body break case" - ~ " cast catch class continue debug default delegate delete do else" - ~ " enum false finally foreach foreach_reverse for goto if import in" - ~ " interface invariant is lazy macro mixin module new nothrow null" - ~ " out override pure ref return struct super switch template this" - ~ " throw true try typedef typeid typeof union unittest version volatile" - ~ " while with __traits __parameters __vector"); - auto expected = ["bool", "byte", "cdouble", - "cent", "cfloat", "char", "creal", - "dchar", "double", "float", "function", - "idouble", "ifloat", "int", "ireal", "long", - "real", "short", "ubyte", "ucent", "uint", - "ulong", "ushort", "void", "wchar", "align", - "deprecated", "extern", "pragma", "export", - "package", "private", "protected", "public", - "abstract", 
"auto", "const", "final", "__gshared", - "immutable", "inout", "scope", "shared", - "static", "synchronized", "alias", "asm", "assert", - "body", "break", "case", "cast", "catch", - "class", "continue", "debug", "default", "delegate", - "delete", "do", "else", "enum", "false", - "finally", "foreach", "foreach_reverse", "for", - "goto", "if", "import", "in", "interface", - "invariant", "is", "lazy","macro", "mixin", - "module", "new", "nothrow", "null", "out", - "override", "pure", "ref", "return", "struct", - "super", "switch", "template", "this", "throw", - "true", "try", "typedef", "typeid", "typeof", - "union", "unittest", "version", "volatile", - "while", "with", "__traits", "__parameters", "__vector"]; - LexerConfig config; - auto tokens = byToken(source, config); - //writeln(tokens.map!"a.value"().array()); - assert (equal(map!"a.value"(tokens), expected)); + import std.stdio; + auto source = cast(ubyte[]) ( + " bool byte cdouble cent cfloat char creal dchar double float function" + ~ " idouble ifloat int ireal long real short ubyte ucent uint ulong" + ~ " ushort void wchar align deprecated extern pragma export package private" + ~ " protected public abstract auto const final __gshared immutable inout" + ~ " scope shared static synchronized alias asm assert body break case" + ~ " cast catch class continue debug default delegate delete do else" + ~ " enum false finally foreach foreach_reverse for goto if import in" + ~ " interface invariant is lazy macro mixin module new nothrow null" + ~ " out override pure ref return struct super switch template this" + ~ " throw true try typedef typeid typeof union unittest version volatile" + ~ " while with __traits __parameters __vector"); + auto expected = ["bool", "byte", "cdouble", + "cent", "cfloat", "char", "creal", + "dchar", "double", "float", "function", + "idouble", "ifloat", "int", "ireal", "long", + "real", "short", "ubyte", "ucent", "uint", + "ulong", "ushort", "void", "wchar", "align", + "deprecated", 
"extern", "pragma", "export", + "package", "private", "protected", "public", + "abstract", "auto", "const", "final", "__gshared", + "immutable", "inout", "scope", "shared", + "static", "synchronized", "alias", "asm", "assert", + "body", "break", "case", "cast", "catch", + "class", "continue", "debug", "default", "delegate", + "delete", "do", "else", "enum", "false", + "finally", "foreach", "foreach_reverse", "for", + "goto", "if", "import", "in", "interface", + "invariant", "is", "lazy","macro", "mixin", + "module", "new", "nothrow", "null", "out", + "override", "pure", "ref", "return", "struct", + "super", "switch", "template", "this", "throw", + "true", "try", "typedef", "typeid", "typeof", + "union", "unittest", "version", "volatile", + "while", "with", "__traits", "__parameters", "__vector"]; + LexerConfig config; + auto tokens = byToken(source, config); + //writeln(tokens.map!"a.value"().array()); + assert (equal(map!"a.value"(tokens), expected)); } unittest { - auto source = cast(ubyte[]) ("=@& &=| |=~=:,--/ /=$.===>> >=++{[< <=<>=<>&&||(- -=%%=*=!!=!>!>=!+ +=^^^^=}]);<< <<=>> >>=..*?~!<>=>>>>>>=...^ ^="); - auto expected = ["=", "@", "&", "&=", "|", "|=", "~=", - ":", ",", "--", "/", "/=", "$", ".", "==", - "=>", ">", ">=", "++", "{", "[", "<", - "<=", "<>=", "<>", "&&", "||", "(", "-", "-=", "%", - "%=", "*=", "!", "!=", "!>", "!>=", "!<", - "!<=", "!<>", "+", "+=", "^^", "^^=", - "}", "]", ")", ";", "<<", "<<=", ">>", - ">>=", "..", "*", "?", "~", "!<>=", - ">>>", ">>>=", "...", "^", "^="]; - LexerConfig config; - auto tokens = byToken(source, config); - //writeln(tokens.map!"a.value"().array()); - assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text()); + auto source = cast(ubyte[]) ("=@& &=| |=~=:,--/ /=$.===>> >=++{[< <=<>=<>&&||(- -=%%=*=!!=!>!>=!+ +=^^^^=}]);<< <<=>> >>=..*?~!<>=>>>>>>=...^ ^="); + auto expected = ["=", "@", "&", "&=", "|", "|=", "~=", + ":", ",", "--", "/", "/=", "$", ".", "==", + "=>", ">", ">=", "++", "{", 
"[", "<", + "<=", "<>=", "<>", "&&", "||", "(", "-", "-=", "%", + "%=", "*=", "!", "!=", "!>", "!>=", "!<", + "!<=", "!<>", "+", "+=", "^^", "^^=", + "}", "]", ")", ";", "<<", "<<=", ">>", + ">>=", "..", "*", "?", "~", "!<>=", + ">>>", ">>>=", "...", "^", "^="]; + LexerConfig config; + auto tokens = byToken(source, config); + //writeln(tokens.map!"a.value"().array()); + assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text()); } unittest { - auto source = cast(ubyte[]) (` - 1 1.2 //comment - 1.2f 1u 1uL 0b011 0b1uu 0b1 /+abc/+def+/+/0x11001uL - 123e1L 123e+1f 123e-1i 15e++ 4ea 1.2u 4i 1337L 4.2L 1..2 4.3.5.8 - 0xabc 0xabcp4 0x1P-10 0x40u 0x29L 0x4Lu 0xdeadbeef - `); - auto expected = ["1", "1.2", "1.2f", "1u", "1uL", "0b011", "0b1u", "u", "0b1", - "0x11001uL", "123e1L", "123e+1f", "123e-1i", "15e+", "+", "4e", "a", - "1.2", "u", "4i", "1337L", "4.2L", "1", "..", "2", "4.3", ".5", ".8", - "0xabc", "0xabcp4", "0x1P-10", "0x40u", "0x29L", "0x4Lu", "0xdeadbeef"]; - int errCount = 0; - void errorFunction(string file, size_t index, uint line, uint col, string msg) - { - ++errCount; - } - LexerConfig config; - config.errorFunc = &errorFunction; - auto tokens = byToken(source, config); - //writeln(tokens.map!"a.value"()); - assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text()); - assert (errCount == 2); + auto source = cast(ubyte[]) (` + 1 1.2 //comment + 1.2f 1u 1uL 0b011 0b1uu 0b1 /+abc/+def+/+/0x11001uL + 123e1L 123e+1f 123e-1i 15e++ 4ea 1.2u 4i 1337L 4.2L 1..2 4.3.5.8 + 0xabc 0xabcp4 0x1P-10 0x40u 0x29L 0x4Lu 0xdeadbeef + `); + auto expected = ["1", "1.2", "1.2f", "1u", "1uL", "0b011", "0b1u", "u", "0b1", + "0x11001uL", "123e1L", "123e+1f", "123e-1i", "15e+", "+", "4e", "a", + "1.2", "u", "4i", "1337L", "4.2L", "1", "..", "2", "4.3", ".5", ".8", + "0xabc", "0xabcp4", "0x1P-10", "0x40u", "0x29L", "0x4Lu", "0xdeadbeef"]; + int errCount = 0; + void errorFunction(string file, size_t index, uint line, uint col, string msg) + { + 
++errCount; + } + LexerConfig config; + config.errorFunc = &errorFunction; + auto tokens = byToken(source, config); + //writeln(tokens.map!"a.value"()); + assert (equal(map!"a.value"(tokens), expected), map!"a.value"(tokens).text()); + assert (errCount == 2); } unittest { - auto source = cast(ubyte[]) ("int #line 4\n double q{abcde (a + b) == 0} '\\u0020' q\"HEREDOC\r\nabcde\r\nHEREDOC\""); - LexerConfig config; - auto tokens = byToken(source, config); - assert (tokens.front.line == 1); - assert (tokens.moveFront() == TokenType.int_); - assert (tokens.front.line == 4); - assert (isType(tokens.front)); - assert (tokens.front.value == "double"); - tokens.popFront(); - assert (tokens.front.value == "abcde (a + b) == 0", tokens.front.value); - assert (isStringLiteral(tokens.front), tokens.front.type.text()); - tokens.popFront(); - assert (tokens.front.value == " "); - assert (tokens.front.type == TokenType.characterLiteral); - tokens.popFront(); - assert (tokens.front.value == "abcde\r\n", "[%s]".format(tokens.front.value)); + auto source = cast(ubyte[]) ("int #line 4\n double q{abcde (a + b) == 0} '\\u0020' q\"HEREDOC\r\nabcde\r\nHEREDOC\""); + LexerConfig config; + auto tokens = byToken(source, config); + assert (tokens.front.line == 1); + assert (tokens.moveFront() == TokenType.int_); + assert (tokens.front.line == 4); + assert (isType(tokens.front)); + assert (tokens.front.value == "double"); + tokens.popFront(); + assert (tokens.front.value == "abcde (a + b) == 0", tokens.front.value); + assert (isStringLiteral(tokens.front), tokens.front.type.text()); + tokens.popFront(); + assert (tokens.front.value == " "); + assert (tokens.front.type == TokenType.characterLiteral); + tokens.popFront(); + assert (tokens.front.value == "abcde\r\n", "[%s]".format(tokens.front.value)); } unittest { - auto source = cast(ubyte[]) "q{(a & 1) == 0} q\"/foo]/\" q\"HEREDOC\r\nabcde\r\nHEREDOC\""; - LexerConfig config; - config.tokenStyle = TokenStyle.includeQuotes; - auto tokens = 
byToken(source, config); - assert (tokens.front.value == "q{(a & 1) == 0}", tokens.front.value); - tokens.popFront(); - assert (tokens.front.value == "q\"/foo]/\"", tokens.front.value); - tokens.popFront(); - assert (tokens.front.value == "q\"HEREDOC\r\nabcde\r\nHEREDOC\"", tokens.front.value); + auto source = cast(ubyte[]) "q{(a & 1) == 0} q\"/foo]/\" q\"HEREDOC\r\nabcde\r\nHEREDOC\""; + LexerConfig config; + config.tokenStyle = TokenStyle.includeQuotes; + auto tokens = byToken(source, config); + assert (tokens.front.value == "q{(a & 1) == 0}", tokens.front.value); + tokens.popFront(); + assert (tokens.front.value == "q\"/foo]/\"", tokens.front.value); + tokens.popFront(); + assert (tokens.front.value == "q\"HEREDOC\r\nabcde\r\nHEREDOC\"", tokens.front.value); } unittest { - auto source = cast(ubyte[]) (`"string`); - int errCount = 0; - void errorFunction(string file, size_t index, uint line, uint col, string msg) - { - ++errCount; - } - LexerConfig config; - config.errorFunc = &errorFunction; - auto tokens = byToken(source, config); - assert (errCount == 1); + auto source = cast(ubyte[]) (`"string`); + int errCount = 0; + void errorFunction(string file, size_t index, uint line, uint col, string msg) + { + ++errCount; + } + LexerConfig config; + config.errorFunc = &errorFunction; + auto tokens = byToken(source, config); + assert (errCount == 1); } unittest { - auto source = cast(ubyte[]) ("import foo"); - LexerConfig config; - auto tokens = byToken(source, config); - Token a = tokens.moveFront(); - assert (a.type == TokenType.import_); - Token b = tokens.moveFront(); - assert (b.type == TokenType.identifier); - assert (a != b); - assert (a != "foo"); - assert (a < b); - assert (b == "foo"); - assert (b > a); - assert (!(a > a)); - assert (tokens.empty); + auto source = cast(ubyte[]) ("import foo"); + LexerConfig config; + auto tokens = byToken(source, config); + Token a = tokens.moveFront(); + assert (a.type == TokenType.import_); + Token b = tokens.moveFront(); + 
assert (b.type == TokenType.identifier); + assert (a != b); + assert (a != "foo"); + assert (a < b); + assert (b == "foo"); + assert (b > a); + assert (!(a > a)); + assert (tokens.empty); } unittest { - auto source = cast(ubyte[]) ("import std.stdio; void main(){writeln(\"hello world\");}"); - LexerConfig config; - auto tokens = byToken(source, config); - int tokenCount = 0; - foreach (t; tokens) - { - ++tokenCount; - } - assert (tokenCount == 16); + auto source = cast(ubyte[]) ("import std.stdio; void main(){writeln(\"hello world\");}"); + LexerConfig config; + auto tokens = byToken(source, config); + int tokenCount = 0; + foreach (t; tokens) + { + ++tokenCount; + } + assert (tokenCount == 16); } - //void main(string[] args){}