diff --git a/build.sh b/build.sh index 02774dd..1aac63a 100755 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ #dmd *.d std/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner -L-lsqlite3 #-inline #dmd *.d std/d/*.d -g -m64 -w -wi -property -ofdscanner -unittest -#ldc2 -O3 *.d std/d/*.d -of=dscanner -release -vectorize -m64 -ldc2 *.d std/d/*.d -of=dscanner -unittest -m64 -g +ldc2 -O2 *.d std/d/*.d -of=dscanner -release -vectorize -m64 +#ldc2 *.d std/d/*.d -of=dscanner -unittest -m64 -g diff --git a/std/d/lexer.d b/std/d/lexer.d index b9664c4..addb4af 100644 --- a/std/d/lexer.d +++ b/std/d/lexer.d @@ -1,110 +1,110 @@ // Written in the D programming language /** - * This module contains a range-based _lexer for the D programming language. - * - * For performance reasons the _lexer contained in this module operates only on - * ASCII and UTF-8 encoded source code. If the use of other encodings is - * desired, the source code must be converted to UTF-8 before passing it to this - * _lexer. - * - * To use the _lexer, create a LexerConfig struct - * --- - * LexerConfig config; - * config.iterStyle = IterationStyle.everything; - * config.tokenStyle = IterationStyle.source; - * config.versionNumber = 2061; - * config.vendorString = "Lexer Example"; - * --- - * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your - * source code, passing in the configuration. - * --- - * auto source = "import std.stdio;"c; - * auto tokens = byToken(source, config); - * --- - * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can - * be used easily with the algorithms from std.algorithm or iterated over with - * $(D_KEYWORD foreach) - * --- - * assert (tokens.front.type == TokenType.import_); - * assert (tokens.front.value == "import"); - * assert (tokens.front.line == 1); - * assert (tokens.front.startIndex == 0); - * --- - * - * Examples: - * - * Generate HTML markup of D code. - * --- - * module highlighter; - * - * import std.stdio; - * import std.array; - * import std.d.lexer; - * - * void writeSpan(string cssClass, string value) - * { - * stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); - * } - * - * - * // http://ethanschoonover.com/solarized - * void highlight(R)(R tokens) - * { - * stdout.writeln(q"[ - * - *
- * - * - * - * - *]"); - * - * foreach (Token t; tokens) - * { - * if (isType(t.type)) - * writeSpan("type", t.value); - * else if (isKeyword(t.type)) - * writeSpan("kwrd", t.value); - * else if (t.type == TokenType.comment) - * writeSpan("com", t.value); - * else if (isStringLiteral(t.type)) - * writeSpan("str", t.value); - * else if (isNumberLiteral(t.type)) - * writeSpan("num", t.value); - * else if (isOperator(t.type)) - * writeSpan("op", t.value); - * else - * stdout.write(t.value.replace("<", "<")); - * } - * stdout.writeln("\n"); - * } - * - * void main(string[] args) - * { - * LexerConfig config; - * config.tokenStyle = TokenStyle.source; - * config.iterStyle = IterationStyle.everything; - * config.fileName = args[1]; - * auto f = File(args[1]); - * (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); - * } - * --- - * - * Copyright: Brian Schott 2013 - * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) - * Authors: Brian Schott - * Source: $(PHOBOSSRC std/d/_lexer.d) - */ +* This module contains a range-based _lexer for the D programming language. +* +* For performance reasons the _lexer contained in this module operates only on +* ASCII and UTF-8 encoded source code. If the use of other encodings is +* desired, the source code must be converted to UTF-8 before passing it to this +* _lexer. +* +* To use the _lexer, create a LexerConfig struct +* --- +* LexerConfig config; +* config.iterStyle = IterationStyle.everything; +* config.tokenStyle = IterationStyle.source; +* config.versionNumber = 2061; +* config.vendorString = "Lexer Example"; +* --- +* Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your +* source code, passing in the configuration. +* --- +* auto source = "import std.stdio;"c; +* auto tokens = byToken(source, config); +* --- +* The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can +* be used easily with the algorithms from std.algorithm or iterated over with +* $(D_KEYWORD foreach) +* --- +* assert (tokens.front.type == TokenType.import_); +* assert (tokens.front.value == "import"); +* assert (tokens.front.line == 1); +* assert (tokens.front.startIndex == 0); +* --- +* +* Examples: +* +* Generate HTML markup of D code. +* --- +* module highlighter; +* +* import std.stdio; +* import std.array; +* import std.d.lexer; +* +* void writeSpan(string cssClass, string value) +* { +* stdout.write(``, value.replace("&", "&").replace("<", "<"), ``); +* } +* +* +* // http://ethanschoonover.com/solarized +* void highlight(R)(R tokens) +* { +* stdout.writeln(q"[ +* +* +* +* +* +* +*
]"); +* +* foreach (Token t; tokens) +* { +* if (isType(t.type)) +* writeSpan("type", t.value); +* else if (isKeyword(t.type)) +* writeSpan("kwrd", t.value); +* else if (t.type == TokenType.comment) +* writeSpan("com", t.value); +* else if (isStringLiteral(t.type)) +* writeSpan("str", t.value); +* else if (isNumberLiteral(t.type)) +* writeSpan("num", t.value); +* else if (isOperator(t.type)) +* writeSpan("op", t.value); +* else +* stdout.write(t.value.replace("<", "<")); +* } +* stdout.writeln("\n"); +* } +* +* void main(string[] args) +* { +* LexerConfig config; +* config.tokenStyle = TokenStyle.source; +* config.iterStyle = IterationStyle.everything; +* config.fileName = args[1]; +* auto f = File(args[1]); +* (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); +* } +* --- +* +* Copyright: Brian Schott 2013 +* License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) +* Authors: Brian Schott +* Source: $(PHOBOSSRC std/d/_lexer.d) +*/ module std.d.lexer; @@ -123,844 +123,847 @@ import std.utf; public: /** - * Represents a D token - */ +* Represents a D token +*/ struct Token { - /** - * The token type. - */ - TokenType type; + /** + * The token type. + */ + TokenType type; - /** - * The representation of the token in the original source code. - */ - string value; + /** + * The representation of the token in the original source code. + */ + string value; - /** - * The number of the line the token is on. - */ - uint line; + /** + * The number of the line the token is on. + */ + uint line; - /** - * The column number of the start of the token in the original source. - * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) - */ - uint column; + /** + * The column number of the start of the token in the original source. + * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) + */ + uint column; - /** - * The index of the start of the token in the original source. - * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) - */ - size_t startIndex; + /** + * The index of the start of the token in the original source. + * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) + */ + size_t startIndex; - /** - * Check to see if the token is of the same type and has the same string - * representation as the given token. - */ - bool opEquals(ref const(Token) other) const - { - return other.type == type && other.value == value; - } + /** + * Check to see if the token is of the same type and has the same string + * representation as the given token. + */ + bool opEquals(ref const(Token) other) const + { + return other.type == type && other.value == value; + } - /** - * Checks to see if the token's string representation is equal to the given - * string. - */ - bool opEquals(string value) const { return this.value == value; } + /** + * Checks to see if the token's string representation is equal to the given + * string. + */ + bool opEquals(string value) const { return this.value == value; } - /** - * Checks to see if the token is of the given type. - */ - bool opEquals(TokenType type) const { return type == type; } + /** + * Checks to see if the token is of the given type. + */ + bool opEquals(TokenType type) const { return type == type; } - /** - * Comparison operator orders tokens by start index. - */ - int opCmp(size_t i) const - { - if (startIndex < i) return -1; - if (startIndex > i) return 1; - return 0; - } + /** + * Comparison operator orders tokens by start index. 
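+     * A small illustration of the ordering, using only the fields and the
+     * return convention defined in this struct:
+     * ---
+     * Token t;
+     * t.startIndex = 42;
+     * assert (t.opCmp(10) > 0 && t.opCmp(42) == 0 && t.opCmp(50) < 0);
+     * ---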
+ */ + int opCmp(size_t i) const + { + if (startIndex < i) return -1; + if (startIndex > i) return 1; + return 0; + } } /** - * Configure the behavior of the byToken() function. These flags may be - * combined using a bitwise or. - */ +* Configure the behavior of the byToken() function. These flags may be +* combined using a bitwise or. +*/ enum IterationStyle { - /// Only include code, not whitespace or comments - codeOnly = 0, - /// Includes comments - includeComments = 0b0001, - /// Includes whitespace - includeWhitespace = 0b0010, - /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens) - includeSpecialTokens = 0b0100, - /// Do not stop iteration on reaching the ___EOF__ token - ignoreEOF = 0b1000, - /// Include everything - everything = includeComments | includeWhitespace | ignoreEOF + /// Only include code, not whitespace or comments + codeOnly = 0, + /// Includes comments + includeComments = 0b0001, + /// Includes whitespace + includeWhitespace = 0b0010, + /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens) + includeSpecialTokens = 0b0100, + /// Do not stop iteration on reaching the ___EOF__ token + ignoreEOF = 0b1000, + /// Include everything + everything = includeComments | includeWhitespace | ignoreEOF } /** - * Configuration of the token lexing style. These flags may be combined with a - * bitwise or. - */ +* Configuration of the token lexing style. These flags may be combined with a +* bitwise or. +*/ enum TokenStyle : uint { - /** - * Escape sequences will be replaced with their equivalent characters, - * enclosing quote characters will not be included. Special tokens such as - * __VENDOR__ will be replaced with their equivalent strings. Useful for - * creating a compiler or interpreter. - */ - default_ = 0b0000, + /** + * Escape sequences will be replaced with their equivalent characters, + * enclosing quote characters will not be included. Special tokens such as + * __VENDOR__ will be replaced with their equivalent strings. Useful for + * creating a compiler or interpreter. + */ + default_ = 0b0000, - /** - * Escape sequences will not be processed. An escaped quote character will - * not terminate string lexing, but it will not be replaced with the quote - * character in the token. - */ - notEscaped = 0b0001, + /** + * Escape sequences will not be processed. An escaped quote character will + * not terminate string lexing, but it will not be replaced with the quote + * character in the token. + */ + notEscaped = 0b0001, - /** - * Strings will include their opening and closing quote characters as well - * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will - * include the $(D_STRING 'w') character as well as the opening and closing - * quotes$(RPAREN) - */ - includeQuotes = 0b0010, + /** + * Strings will include their opening and closing quote characters as well + * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will + * include the $(D_STRING 'w') character as well as the opening and closing + * quotes$(RPAREN) + */ + includeQuotes = 0b0010, - /** - * Do not replace the value field of the special tokens such as ___DATE__ - * with their string equivalents. - */ - doNotReplaceSpecial = 0b0100, + /** + * Do not replace the value field of the special tokens such as ___DATE__ + * with their string equivalents. + */ + doNotReplaceSpecial = 0b0100, - /** - * Strings will be read exactly as they appeared in the source, including - * their opening and closing quote characters. 
Useful for syntax - * highlighting. - */ - source = notEscaped | includeQuotes | doNotReplaceSpecial + /** + * Strings will be read exactly as they appeared in the source, including + * their opening and closing quote characters. Useful for syntax + * highlighting. + */ + source = notEscaped | includeQuotes | doNotReplaceSpecial } /** - * Lexer configuration - */ +* Lexer configuration +*/ struct LexerConfig { - /** - * Iteration style - */ - IterationStyle iterStyle = IterationStyle.codeOnly; + /** + * Iteration style + */ + IterationStyle iterStyle = IterationStyle.codeOnly; - /** - * Token style - */ - TokenStyle tokenStyle = tokenStyle.default_; + /** + * Token style + */ + TokenStyle tokenStyle = tokenStyle.default_; - /** - * Replacement for the ___VERSION__ token. Defaults to 1. - */ - uint versionNumber = 100; + /** + * Replacement for the ___VERSION__ token. Defaults to 1. + */ + uint versionNumber = 100; - /** - * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") - */ - string vendorString = "std.d.lexer"; + /** + * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") + */ + string vendorString = "std.d.lexer"; - /** - * Name used when creating error messages that are sent to errorFunc. This - * is needed because the lexer operates on any forwarad range of ASCII - * characters or UTF-8 code units and does not know what to call its input - * source. Defaults to the empty string. - */ - string fileName = ""; + /** + * Name used when creating error messages that are sent to errorFunc. This + * is needed because the lexer operates on any forwarad range of ASCII + * characters or UTF-8 code units and does not know what to call its input + * source. Defaults to the empty string. + */ + string fileName = ""; - /** - * This function is called when an error is encountered during lexing. - * Parameters are file name, code uint index, line number, column, - * and error messsage. - */ - void delegate(string, size_t, uint, uint, string) errorFunc; + /** + * This function is called when an error is encountered during lexing. + * Parameters are file name, code uint index, line number, column, + * and error messsage. + */ + void delegate(string, size_t, uint, uint, string) errorFunc; - /** - * Initial size of the lexer's internal token buffer in bytes. The lexer - * will grow this buffer if necessary. - */ - size_t bufferSize = 1024 * 4; + /** + * Initial size of the lexer's internal token buffer in bytes. The lexer + * will grow this buffer if necessary. + */ + size_t bufferSize = 1024 * 4; } /** - * Iterate over the given range of characters by D tokens. - * Params: - * range = the range of characters - * config = the lexer configuration - * Returns: - * an input range of tokens - */ +* Iterate over the given range of characters by D tokens. +* Params: +* range = the range of characters +* config = the lexer configuration +* Returns: +* an input range of tokens +*/ TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R)) { - auto r = TokenRange!(R)(range); - r.config = config; - r.lineNumber = 1; - r.popFront(); - return r; + auto r = TokenRange!(R)(range); + r.config = config; + r.lineNumber = 1; + r.popFront(); + return r; } /** - * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. - */ +* Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. 
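+*
+* A minimal usage sketch, reusing only the names already shown in the module
+* documentation above:
+* ---
+* LexerConfig config;
+* auto tokens = byToken("import std.stdio;"c, config);
+* assert (tokens.front.type == TokenType.import_);
+* assert (tokens.front.value == "import");
+* ---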
+*/ struct TokenRange(R) if (isForwardRange!(R)) { - /** - * Returns: true if the range is empty - */ - bool empty() const @property - { - return _empty; - } + /** + * Returns: true if the range is empty + */ + bool empty() const @property + { + return _empty; + } - /** - * Returns: the current token - */ - ref const(Token) front() const @property - { - enforce(!_empty, "Cannot call front() on empty token range"); - return current; - } + /** + * Returns: the current token + */ + ref const(Token) front() const @property + { + enforce(!_empty, "Cannot call front() on empty token range"); + return current; + } - /** - * Returns the current token and then removes it from the range - */ - Token moveFront() - { - auto r = front(); - popFront(); - return r; - } + /** + * Returns the current token and then removes it from the range + */ + Token moveFront() + { + auto r = front(); + popFront(); + return r; + } - /** - * Range operation - */ - int opApply(int delegate(Token) dg) - { - int result = 0; - while (!empty) - { - result = dg(front); - if (result) - break; - popFront(); - } - return result; - } + /** + * Range operation + */ + int opApply(int delegate(Token) dg) + { + int result = 0; + while (!empty) + { + result = dg(front); + if (result) + break; + popFront(); + } + return result; + } - /** - * Range operation - */ - int opApply(int delegate(size_t, Token) dg) - { - int result = 0; - int i = 0; - while (!empty) - { - result = dg(i, front); - if (result) - break; - popFront(); - } - return result; - } + /** + * Range operation + */ + int opApply(int delegate(size_t, Token) dg) + { + int result = 0; + int i = 0; + while (!empty) + { + result = dg(i, front); + if (result) + break; + popFront(); + } + return result; + } - /** - * Removes the current token from the range - */ - void popFront() - { - // Filter out tokens we don't care about - loop: do - { - advance(); - switch (current.type) - { - case TokenType.whitespace: - if (config.iterStyle & IterationStyle.includeWhitespace) - break loop; - break; - case TokenType.comment: - if (config.iterStyle & IterationStyle.includeComments) - break loop; - break; - case TokenType.specialTokenSequence: - if (config.iterStyle & IterationStyle.includeSpecialTokens) - break loop; - break; - default: - break loop; - } - } - while (!empty()); - } + /** + * Removes the current token from the range + */ + void popFront() + { + // Filter out tokens we don't care about + loop: do + { + advance(); + switch (current.type) + { + case TokenType.whitespace: + if (config.iterStyle & IterationStyle.includeWhitespace) + break loop; + break; + case TokenType.comment: + if (config.iterStyle & IterationStyle.includeComments) + break loop; + break; + case TokenType.specialTokenSequence: + if (config.iterStyle & IterationStyle.includeSpecialTokens) + break loop; + break; + default: + break loop; + } + } + while (!empty()); + } private: - this(ref R range) - { - this.range = range; - buffer = uninitializedArray!(ubyte[])(bufferSize); - } + this(ref R range) + { + this.range = range; + buffer = uninitializedArray!(ubyte[])(bufferSize); + } - /* - * Advances the range to the next token - */ - void advance() - { - if (isEoF()) - { - _empty = true; - return; - } + /* + * Advances the range to the next token + */ + void advance() + { + if (isEoF()) + { + _empty = true; + return; + } - bufferIndex = 0; - current.line = lineNumber; - current.startIndex = index; - current.column = column; - current.value = null; + bufferIndex = 0; + current.line = lineNumber; + current.startIndex 
= index; + current.column = column; + current.value = null; - if (isWhite()) - { - lexWhitespace(); - return; - } + if (isWhite()) + { + if (config.iterStyle & IterationStyle.includeWhitespace) + lexWhitespace!true(); + else + lexWhitespace!false(); + return; + } - switch (currentElement()) - { + switch (currentElement()) + { // pragma(msg, generateCaseTrie( - mixin(generateCaseTrie( - "=", "TokenType.assign", - "@", "TokenType.at", - "&", "TokenType.bitAnd", - "&=", "TokenType.bitAndEquals", - "|", "TokenType.bitOr", - "|=", "TokenType.bitOrEquals", - "~=", "TokenType.catEquals", - ":", "TokenType.colon", - ",", "TokenType.comma", - "--", "TokenType.decrement", - "$", "TokenType.dollar", - "==", "TokenType.equals", - "=>", "TokenType.goesTo", - ">", "TokenType.greater", - ">=", "TokenType.greaterEqual", - "++", "TokenType.increment", - "{", "TokenType.lBrace", - "[", "TokenType.lBracket", - "<", "TokenType.less", - "<=", "TokenType.lessEqual", - "<>=", "TokenType.lessEqualGreater", - "<>", "TokenType.lessOrGreater", - "&&", "TokenType.logicAnd", - "||", "TokenType.logicOr", - "(", "TokenType.lParen", - "-", "TokenType.minus", - "-=", "TokenType.minusEquals", - "%", "TokenType.mod", - "%=", "TokenType.modEquals", - "*=", "TokenType.mulEquals", - "!", "TokenType.not", - "!=", "TokenType.notEquals", - "!>", "TokenType.notGreater", - "!>=", "TokenType.notGreaterEqual", - "!<", "TokenType.notLess", - "!<=", "TokenType.notLessEqual", - "!<>", "TokenType.notLessEqualGreater", - "+", "TokenType.plus", - "+=", "TokenType.plusEquals", - "^^", "TokenType.pow", - "^^=", "TokenType.powEquals", - "}", "TokenType.rBrace", - "]", "TokenType.rBracket", - ")", "TokenType.rParen", - ";", "TokenType.semicolon", - "<<", "TokenType.shiftLeft", - "<<=", "TokenType.shiftLeftEqual", - ">>", "TokenType.shiftRight", - ">>=", "TokenType.shiftRightEqual", - "*", "TokenType.star", - "?", "TokenType.ternary", - "~", "TokenType.tilde", - "!<>=", "TokenType.unordered", - ">>>", "TokenType.unsignedShiftRight", - ">>>=", "TokenType.unsignedShiftRightEqual", - "^", "TokenType.xor", - "^=", "TokenType.xorEquals", - )); - case '/': - keepNonNewlineChar(); - if (isEoF()) - { - current.type = TokenType.div; - current.value = "/"; - return; - } - switch (currentElement()) - { - case '/': - case '*': - case '+': - lexComment(); - return; - case '=': - current.type = TokenType.divEquals; - current.value = "/="; - advanceRange(); - return; - default: - current.type = TokenType.div; - current.value = "/"; - return; - } - case '.': - keepNonNewlineChar(); - if (isEoF()) - { - current.type = TokenType.dot; - current.value = getTokenValue(TokenType.dot); - return; - } - switch (currentElement()) - { - case '0': .. case '9': - lexNumber(); - return; - case '.': - current.type = TokenType.slice; - keepNonNewlineChar(); - if (currentElement() == '.') - { - current.type = TokenType.vararg; - keepNonNewlineChar(); - } - current.value = getTokenValue(current.type); - return; - default: - current.type = TokenType.dot; - current.value = getTokenValue(TokenType.dot); - return; - } - case '0': .. 
case '9': - keepNonNewlineChar(); - lexNumber(); - return; - case '\'': + mixin(generateCaseTrie( + "=", "TokenType.assign", + "@", "TokenType.at", + "&", "TokenType.bitAnd", + "&=", "TokenType.bitAndEquals", + "|", "TokenType.bitOr", + "|=", "TokenType.bitOrEquals", + "~=", "TokenType.catEquals", + ":", "TokenType.colon", + ",", "TokenType.comma", + "--", "TokenType.decrement", + "$", "TokenType.dollar", + "==", "TokenType.equals", + "=>", "TokenType.goesTo", + ">", "TokenType.greater", + ">=", "TokenType.greaterEqual", + "++", "TokenType.increment", + "{", "TokenType.lBrace", + "[", "TokenType.lBracket", + "<", "TokenType.less", + "<=", "TokenType.lessEqual", + "<>=", "TokenType.lessEqualGreater", + "<>", "TokenType.lessOrGreater", + "&&", "TokenType.logicAnd", + "||", "TokenType.logicOr", + "(", "TokenType.lParen", + "-", "TokenType.minus", + "-=", "TokenType.minusEquals", + "%", "TokenType.mod", + "%=", "TokenType.modEquals", + "*=", "TokenType.mulEquals", + "!", "TokenType.not", + "!=", "TokenType.notEquals", + "!>", "TokenType.notGreater", + "!>=", "TokenType.notGreaterEqual", + "!<", "TokenType.notLess", + "!<=", "TokenType.notLessEqual", + "!<>", "TokenType.notLessEqualGreater", + "+", "TokenType.plus", + "+=", "TokenType.plusEquals", + "^^", "TokenType.pow", + "^^=", "TokenType.powEquals", + "}", "TokenType.rBrace", + "]", "TokenType.rBracket", + ")", "TokenType.rParen", + ";", "TokenType.semicolon", + "<<", "TokenType.shiftLeft", + "<<=", "TokenType.shiftLeftEqual", + ">>", "TokenType.shiftRight", + ">>=", "TokenType.shiftRightEqual", + "*", "TokenType.star", + "?", "TokenType.ternary", + "~", "TokenType.tilde", + "!<>=", "TokenType.unordered", + ">>>", "TokenType.unsignedShiftRight", + ">>>=", "TokenType.unsignedShiftRightEqual", + "^", "TokenType.xor", + "^=", "TokenType.xorEquals", + )); + case '/': + keepNonNewlineChar(); + if (isEoF()) + { + current.type = TokenType.div; + current.value = "/"; + return; + } + switch (currentElement()) + { + case '/': + case '*': + case '+': + lexComment(); + return; + case '=': + current.type = TokenType.divEquals; + current.value = "/="; + advanceRange(); + return; + default: + current.type = TokenType.div; + current.value = "/"; + return; + } + case '.': + keepNonNewlineChar(); + if (isEoF()) + { + current.type = TokenType.dot; + current.value = getTokenValue(TokenType.dot); + return; + } + switch (currentElement()) + { + case '0': .. case '9': + lexNumber(); + return; + case '.': + current.type = TokenType.slice; + keepNonNewlineChar(); + if (currentElement() == '.') + { + current.type = TokenType.vararg; + keepNonNewlineChar(); + } + current.value = getTokenValue(current.type); + return; + default: + current.type = TokenType.dot; + current.value = getTokenValue(TokenType.dot); + return; + } + case '0': .. 
case '9': + keepNonNewlineChar(); + lexNumber(); + return; + case '\'': lexCharacterLiteral(); return; - case '"': - case '`': - lexString(); - return; - case 'q': - keepNonNewlineChar(); - if (isEoF()) - goto default; - switch (currentElement()) - { - case '{': - lexTokenString(); - return; - case '"': - lexDelimitedString(); - return; - default: - break; - } - goto default; - case 'r': - keepNonNewlineChar(); - if (isEoF()) - goto default; - else if (currentElement() == '"') - { - lexString(); - return; - } - else - goto default; - case 'x': - keepNonNewlineChar(); - if (isEoF()) - goto default; - else if (currentElement() == '"') - { - lexHexString(); - return; - } - else - goto default; - case '#': - lexSpecialTokenSequence(); - return; - default: - while(!isEoF() && !isSeparating()) - { - keepNonNewlineChar(); - } + case '"': + case '`': + lexString(); + return; + case 'q': + keepNonNewlineChar(); + if (isEoF()) + goto default; + switch (currentElement()) + { + case '{': + lexTokenString(); + return; + case '"': + lexDelimitedString(); + return; + default: + break; + } + goto default; + case 'r': + keepNonNewlineChar(); + if (isEoF()) + goto default; + else if (currentElement() == '"') + { + lexString(); + return; + } + else + goto default; + case 'x': + keepNonNewlineChar(); + if (isEoF()) + goto default; + else if (currentElement() == '"') + { + lexHexString(); + return; + } + else + goto default; + case '#': + lexSpecialTokenSequence(); + return; + default: + while(!isEoF() && !isSeparating()) + { + keepNonNewlineChar(); + } - current.type = lookupTokenType(cast(char[]) buffer[0 .. bufferIndex]); - current.value = getTokenValue(current.type); - if (current.value is null) - setTokenValue(); + current.type = lookupTokenType(cast(char[]) buffer[0 .. 
bufferIndex]); + current.value = getTokenValue(current.type); + if (current.value is null) + setTokenValue(); - if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof) - { - _empty = true; - return; - } + if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof) + { + _empty = true; + return; + } - if (!(config.iterStyle & TokenStyle.doNotReplaceSpecial)) - return; + if (!(config.iterStyle & TokenStyle.doNotReplaceSpecial)) + return; - switch (current.type) - { - case TokenType.date: - current.type = TokenType.stringLiteral; - auto time = Clock.currTime(); - current.value = format("%s %02d %04d", time.month, time.day, time.year); - return; - case TokenType.time: - auto time = Clock.currTime(); - current.type = TokenType.stringLiteral; - current.value = (cast(TimeOfDay)(time)).toISOExtString(); - return; - case TokenType.timestamp: - auto time = Clock.currTime(); - auto dt = cast(DateTime) time; - current.type = TokenType.stringLiteral; - current.value = format("%s %s %02d %02d:%02d:%02d %04d", - dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute, - dt.second, dt.year); - return; - case TokenType.vendor: - current.type = TokenType.stringLiteral; - current.value = config.vendorString; - return; - case TokenType.compilerVersion: - current.type = TokenType.stringLiteral; - current.value = format("%d", config.versionNumber); - return; - case TokenType.line: - current.type = TokenType.intLiteral; - current.value = format("%d", current.line); - return; - case TokenType.file: - current.type = TokenType.stringLiteral; - current.value = config.fileName; - return; - default: - return; - } - } - } + switch (current.type) + { + case TokenType.date: + current.type = TokenType.stringLiteral; + auto time = Clock.currTime(); + current.value = format("%s %02d %04d", time.month, time.day, time.year); + return; + case TokenType.time: + auto time = Clock.currTime(); + current.type = TokenType.stringLiteral; + current.value = (cast(TimeOfDay)(time)).toISOExtString(); + return; + case TokenType.timestamp: + auto time = Clock.currTime(); + auto dt = cast(DateTime) time; + current.type = TokenType.stringLiteral; + current.value = format("%s %s %02d %02d:%02d:%02d %04d", + dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute, + dt.second, dt.year); + return; + case TokenType.vendor: + current.type = TokenType.stringLiteral; + current.value = config.vendorString; + return; + case TokenType.compilerVersion: + current.type = TokenType.stringLiteral; + current.value = format("%d", config.versionNumber); + return; + case TokenType.line: + current.type = TokenType.intLiteral; + current.value = format("%d", current.line); + return; + case TokenType.file: + current.type = TokenType.stringLiteral; + current.value = config.fileName; + return; + default: + return; + } + } + } - void lexWhitespace() - { - current.type = TokenType.whitespace; - while (!isEoF() && isWhite()) - { - keepChar(); - } - if (config.iterStyle & IterationStyle.includeWhitespace) - setTokenValue(); - } + void lexWhitespace(bool keep)() + { + current.type = TokenType.whitespace; + while (!isEoF() && isWhite()) + { + static if (keep) keepChar(); + else advanceRange(); + } + static if (keep) setTokenValue(); + } - void lexComment() - in - { - assert (currentElement() == '/' || currentElement() == '*' || currentElement() == '+'); - } - body - { - current.type = TokenType.comment; - switch(currentElement()) - { - case '/': - while (!isEoF() && !isNewline(currentElement())) - { - keepNonNewlineChar(); - } - 
break; - case '*': - while (!isEoF()) - { - if (currentElement() == '*') - { - keepNonNewlineChar(); - if (currentElement() == '/') - { - keepNonNewlineChar(); - break; - } - } - else - keepChar(); - } - break; - case '+': - int depth = 1; - while (depth > 0 && !isEoF()) - { - if (currentElement() == '+') - { - keepNonNewlineChar(); - if (currentElement() == '/') - { - keepNonNewlineChar(); - --depth; - } - } - else if (currentElement() == '/') - { - keepNonNewlineChar(); - if (currentElement() == '+') - { - keepNonNewlineChar(); - ++depth; - } - } - else - keepChar(); - } - break; - default: - assert(false); - } - if (config.iterStyle & IterationStyle.includeComments) - setTokenValue(); - } + void lexComment() + in + { + assert (currentElement() == '/' || currentElement() == '*' || currentElement() == '+'); + } + body + { + current.type = TokenType.comment; + switch(currentElement()) + { + case '/': + while (!isEoF() && !isNewline(currentElement())) + { + keepNonNewlineChar(); + } + break; + case '*': + while (!isEoF()) + { + if (currentElement() == '*') + { + keepNonNewlineChar(); + if (currentElement() == '/') + { + keepNonNewlineChar(); + break; + } + } + else + keepChar(); + } + break; + case '+': + int depth = 1; + while (depth > 0 && !isEoF()) + { + if (currentElement() == '+') + { + keepNonNewlineChar(); + if (currentElement() == '/') + { + keepNonNewlineChar(); + --depth; + } + } + else if (currentElement() == '/') + { + keepNonNewlineChar(); + if (currentElement() == '+') + { + keepNonNewlineChar(); + ++depth; + } + } + else + keepChar(); + } + break; + default: + assert(false); + } + if (config.iterStyle & IterationStyle.includeComments) + setTokenValue(); + } - void lexHexString() - in - { - assert (currentElement() == '"' && buffer[0] == 'x'); - } - body - { - current.type = TokenType.stringLiteral; - keepChar(); - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated hex string literal"); - return; - } - else if (isHexDigit(currentElement())) - { - keepNonNewlineChar(); - } - else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped)) - { - keepChar(); - } - else if (currentElement() == '"') - { - keepNonNewlineChar(); - break; - } - else - { - errorMessage(format("Invalid character '%s' in hex string literal", - cast(char) currentElement())); - return; - } - } - lexStringSuffix(); - if (config.tokenStyle & TokenStyle.notEscaped) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(2, bufferIndex - 1); - } - else - { - auto a = appender!(ubyte[])(); - foreach (b; std.range.chunks(buffer[2 .. 
bufferIndex - 1], 2)) - { - string s = to!string(cast(char[]) b); - a.put(cast(ubyte[]) to!string(cast(dchar) parse!uint(s, 16))); - } - current.value = to!string(cast(char[]) a.data); - } - } + void lexHexString() + in + { + assert (currentElement() == '"' && buffer[0] == 'x'); + } + body + { + current.type = TokenType.stringLiteral; + keepChar(); + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated hex string literal"); + return; + } + else if (isHexDigit(currentElement())) + { + keepNonNewlineChar(); + } + else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped)) + { + keepChar(); + } + else if (currentElement() == '"') + { + keepNonNewlineChar(); + break; + } + else + { + errorMessage(format("Invalid character '%s' in hex string literal", + cast(char) currentElement())); + return; + } + } + lexStringSuffix(); + if (config.tokenStyle & TokenStyle.notEscaped) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(2, bufferIndex - 1); + } + else + { + auto a = appender!(ubyte[])(); + foreach (b; std.range.chunks(buffer[2 .. bufferIndex - 1], 2)) + { + string s = to!string(cast(char[]) b); + a.put(cast(ubyte[]) to!string(cast(dchar) parse!uint(s, 16))); + } + current.value = to!string(cast(char[]) a.data); + } + } - void lexNumber() - in - { - assert(buffer[0] || buffer[0] == '.'); - } - body - { - // hex and binary can start with zero, anything else is decimal - if (currentElement() != '0') - lexDecimal(); - else - { - switch (currentElement()) - { - case 'x': - case 'X': - keepNonNewlineChar(); - lexHex(); - break; - case 'b': - case 'B': - keepNonNewlineChar(); - lexBinary(); - break; - default: - lexDecimal(); - return; - } - } - } + void lexNumber() + in + { + assert(buffer[0] || buffer[0] == '.'); + } + body + { + // hex and binary can start with zero, anything else is decimal + if (currentElement() != '0') + lexDecimal(); + else + { + switch (currentElement()) + { + case 'x': + case 'X': + keepNonNewlineChar(); + lexHex(); + break; + case 'b': + case 'B': + keepNonNewlineChar(); + lexBinary(); + break; + default: + lexDecimal(); + return; + } + } + } - void lexFloatSuffix() - { - switch (currentElement()) - { - case 'L': - keepNonNewlineChar(); - current.type = TokenType.doubleLiteral; - break; - case 'f': - case 'F': - keepNonNewlineChar(); - current.type = TokenType.floatLiteral; - break; - default: - break; - } - if (!isEoF() && currentElement() == 'i') - { - keepNonNewlineChar(); - if (current.type == TokenType.floatLiteral) - current.type = TokenType.ifloatLiteral; - else - current.type = TokenType.idoubleLiteral; - } - } + void lexFloatSuffix() + { + switch (currentElement()) + { + case 'L': + keepNonNewlineChar(); + current.type = TokenType.doubleLiteral; + break; + case 'f': + case 'F': + keepNonNewlineChar(); + current.type = TokenType.floatLiteral; + break; + default: + break; + } + if (!isEoF() && currentElement() == 'i') + { + keepNonNewlineChar(); + if (current.type == TokenType.floatLiteral) + current.type = TokenType.ifloatLiteral; + else + current.type = TokenType.idoubleLiteral; + } + } - void lexIntSuffix() - { - bool foundU; - bool foundL; - while (!isEoF()) - { - switch (currentElement()) - { - case 'u': - case 'U': - if (foundU) - return; - switch (current.type) - { - case TokenType.intLiteral: - current.type = TokenType.uintLiteral; - keepNonNewlineChar(); - break; - case TokenType.longLiteral: - current.type = TokenType.ulongLiteral; - keepNonNewlineChar(); - break; - default: - return; - } - 
foundU = true; - break; - case 'L': - if (foundL) - return; - switch (current.type) - { - case TokenType.intLiteral: - current.type = TokenType.longLiteral; - keepNonNewlineChar(); - break; - case TokenType.uintLiteral: - current.type = TokenType.ulongLiteral; - keepNonNewlineChar(); - break; - default: - return; - } - foundL = true; - break; - default: - return; - } - } - } + void lexIntSuffix() + { + bool foundU; + bool foundL; + while (!isEoF()) + { + switch (currentElement()) + { + case 'u': + case 'U': + if (foundU) + return; + switch (current.type) + { + case TokenType.intLiteral: + current.type = TokenType.uintLiteral; + keepNonNewlineChar(); + break; + case TokenType.longLiteral: + current.type = TokenType.ulongLiteral; + keepNonNewlineChar(); + break; + default: + return; + } + foundU = true; + break; + case 'L': + if (foundL) + return; + switch (current.type) + { + case TokenType.intLiteral: + current.type = TokenType.longLiteral; + keepNonNewlineChar(); + break; + case TokenType.uintLiteral: + current.type = TokenType.ulongLiteral; + keepNonNewlineChar(); + break; + default: + return; + } + foundL = true; + break; + default: + return; + } + } + } - void lexExponent() - in - { - assert (currentElement() == 'e' || currentElement() == 'E' || currentElement() == 'p' - || currentElement() == 'P'); - } - body - { - keepNonNewlineChar(); - bool foundSign = false; - while (!isEoF()) - { - switch (currentElement()) - { - case '-': - case '+': - if (foundSign) - return; - foundSign = true; - keepNonNewlineChar(); - case '0': .. case '9': - case '_': - keepNonNewlineChar(); - break; - case 'L': - case 'f': - case 'F': - case 'i': - lexFloatSuffix(); - return; - default: - return; - } - } - } + void lexExponent() + in + { + assert (currentElement() == 'e' || currentElement() == 'E' || currentElement() == 'p' + || currentElement() == 'P'); + } + body + { + keepNonNewlineChar(); + bool foundSign = false; + while (!isEoF()) + { + switch (currentElement()) + { + case '-': + case '+': + if (foundSign) + return; + foundSign = true; + keepNonNewlineChar(); + case '0': .. case '9': + case '_': + keepNonNewlineChar(); + break; + case 'L': + case 'f': + case 'F': + case 'i': + lexFloatSuffix(); + return; + default: + return; + } + } + } - void lexDecimal() - in - { - assert ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.'); - } - body - { - bool foundDot = false; - current.type = TokenType.intLiteral; - scope(exit) setTokenValue(); - decimalLoop: while (!isEoF()) - { - switch (currentElement()) - { - case '0': .. case '9': - case '_': - keepNonNewlineChar(); - break; + void lexDecimal() + in + { + assert ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.'); + } + body + { + bool foundDot = false; + current.type = TokenType.intLiteral; + scope(exit) setTokenValue(); + decimalLoop: while (!isEoF()) + { + switch (currentElement()) + { + case '0': .. case '9': + case '_': + keepNonNewlineChar(); + break; case 'u': case 'U': if (foundDot) @@ -971,173 +974,173 @@ private: } else lexIntSuffix(); - case 'i': - case 'L': - if (foundDot) - { - lexFloatSuffix(); - return; - } - else - { - lexIntSuffix(); - return; - } - case 'f': - case 'F': - lexFloatSuffix(); - return; - case 'e': - case 'E': - lexExponent(); - return; - case '.': - static if (isArray!R) - auto r = range[index .. 
$]; - else - auto r = range.save(); - r.popFront(); - if (!r.isRangeEoF() && r.front == '.') - break decimalLoop; // possibly slice expression - if (foundDot) - break decimalLoop; // two dots with other characters between them - keepNonNewlineChar(); - foundDot = true; - current.type = TokenType.doubleLiteral; - break; - default: - break decimalLoop; - } - } + case 'i': + case 'L': + if (foundDot) + { + lexFloatSuffix(); + return; + } + else + { + lexIntSuffix(); + return; + } + case 'f': + case 'F': + lexFloatSuffix(); + return; + case 'e': + case 'E': + lexExponent(); + return; + case '.': + static if (isArray!R) + auto r = range[index .. $]; + else + auto r = range.save(); + r.popFront(); + if (!r.isRangeEoF() && r.front == '.') + break decimalLoop; // possibly slice expression + if (foundDot) + break decimalLoop; // two dots with other characters between them + keepNonNewlineChar(); + foundDot = true; + current.type = TokenType.doubleLiteral; + break; + default: + break decimalLoop; + } + } - } + } - void lexBinary() - { - current.type = TokenType.intLiteral; - scope(exit) setTokenValue(); - binaryLoop: while (!isEoF()) - { - switch (currentElement()) - { - case '0': - case '1': - case '_': - keepNonNewlineChar(); - break; - case 'u': - case 'U': - case 'L': - lexIntSuffix(); - return; - default: - break binaryLoop; - } - } - } + void lexBinary() + { + current.type = TokenType.intLiteral; + scope(exit) setTokenValue(); + binaryLoop: while (!isEoF()) + { + switch (currentElement()) + { + case '0': + case '1': + case '_': + keepNonNewlineChar(); + break; + case 'u': + case 'U': + case 'L': + lexIntSuffix(); + return; + default: + break binaryLoop; + } + } + } - void lexHex() - { - current.type = TokenType.intLiteral; - scope(exit) setTokenValue(); - bool foundDot; - hexLoop: while (!isEoF()) - { - switch (currentElement()) - { - case 'a': .. case 'f': - case 'A': .. case 'F': - case '0': .. case '9': - case '_': - keepNonNewlineChar(); - break; - case 'i': - case 'L': - if (foundDot) - { - lexFloatSuffix(); - return; - } - else - { - lexIntSuffix(); - return; - } - case 'p': - case 'P': - lexExponent(); - return; - case '.': - static if (isArray!R) - auto r = range[index .. $]; - else - auto r = range.save(); - r.popFront(); - if (!r.isRangeEoF() && r.front == '.') - break hexLoop; // slice expression - if (foundDot) - break hexLoop; // two dots with other characters between them - keepNonNewlineChar(); - foundDot = true; - current.type = TokenType.doubleLiteral; - break; - default: - break hexLoop; - } - } - } + void lexHex() + { + current.type = TokenType.intLiteral; + scope(exit) setTokenValue(); + bool foundDot; + hexLoop: while (!isEoF()) + { + switch (currentElement()) + { + case 'a': .. case 'f': + case 'A': .. case 'F': + case '0': .. case '9': + case '_': + keepNonNewlineChar(); + break; + case 'i': + case 'L': + if (foundDot) + { + lexFloatSuffix(); + return; + } + else + { + lexIntSuffix(); + return; + } + case 'p': + case 'P': + lexExponent(); + return; + case '.': + static if (isArray!R) + auto r = range[index .. 
$]; + else + auto r = range.save(); + r.popFront(); + if (!r.isRangeEoF() && r.front == '.') + break hexLoop; // slice expression + if (foundDot) + break hexLoop; // two dots with other characters between them + keepNonNewlineChar(); + foundDot = true; + current.type = TokenType.doubleLiteral; + break; + default: + break hexLoop; + } + } + } - void lexStringSuffix() - { - current.type = TokenType.stringLiteral; - if (!isEoF()) - { - switch (currentElement()) - { - case 'w': - current.type = TokenType.wstringLiteral; - goto case 'c'; - case 'd': - current.type = TokenType.dstringLiteral; - goto case 'c'; - case 'c': - keepNonNewlineChar(); - break; - default: - break; - } - } - } + void lexStringSuffix() + { + current.type = TokenType.stringLiteral; + if (!isEoF()) + { + switch (currentElement()) + { + case 'w': + current.type = TokenType.wstringLiteral; + goto case 'c'; + case 'd': + current.type = TokenType.dstringLiteral; + goto case 'c'; + case 'c': + keepNonNewlineChar(); + break; + default: + break; + } + } + } - void lexCharacterLiteral() - in - { - assert (currentElement() == '\''); - } - body - { - current.type = TokenType.characterLiteral; - scope (exit) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(1, bufferIndex - 1); - } - keepChar(); - if (isEoF()) - { - errorMessage("Unterminated character literal"); - return; - } - switch (currentElement()) - { - case '\'': - return; - case '\\': - lexEscapeSequence(); - break; - default: + void lexCharacterLiteral() + in + { + assert (currentElement() == '\''); + } + body + { + current.type = TokenType.characterLiteral; + scope (exit) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(1, bufferIndex - 1); + } + keepChar(); + if (isEoF()) + { + errorMessage("Unterminated character literal"); + return; + } + switch (currentElement()) + { + case '\'': + return; + case '\\': + lexEscapeSequence(); + break; + default: if (currentElement() & 0x80) { while (currentElement() & 0x80) @@ -1149,117 +1152,117 @@ private: keepChar(); break; } - } - if (currentElement() != '\'') - { - errorMessage("Expected \"'\" to end character literal"); - return; - } - keepChar(); - } + } + if (currentElement() != '\'') + { + errorMessage("Expected \"'\" to end character literal"); + return; + } + keepChar(); + } - void lexString() - in - { - assert (currentElement() == '"' || currentElement() == '`'); - } - body - { - current.type = TokenType.stringLiteral; - bool isWysiwyg = buffer[0] == 'r' || currentElement() == '`'; + void lexString() + in + { + assert (currentElement() == '"' || currentElement() == '`'); + } + body + { + current.type = TokenType.stringLiteral; + bool isWysiwyg = buffer[0] == 'r' || currentElement() == '`'; - scope (exit) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - { - if (buffer[0] == 'r') - setTokenValue(2, bufferIndex - 1); - else - setTokenValue(1, bufferIndex - 1); - } - } + scope (exit) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + { + if (buffer[0] == 'r') + setTokenValue(2, bufferIndex - 1); + else + setTokenValue(1, bufferIndex - 1); + } + } - auto quote = currentElement(); - keepChar(); - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - return; - } - else if (currentElement() == '\\') - { - if (isWysiwyg) - keepChar(); - else - lexEscapeSequence(); - } - else if (currentElement() == quote) - { - keepNonNewlineChar(); - break; 
- } - else - keepChar(); - } - lexStringSuffix(); - } + auto quote = currentElement(); + keepChar(); + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + return; + } + else if (currentElement() == '\\') + { + if (isWysiwyg) + keepChar(); + else + lexEscapeSequence(); + } + else if (currentElement() == quote) + { + keepNonNewlineChar(); + break; + } + else + keepChar(); + } + lexStringSuffix(); + } - void lexEscapeSequence() - in - { - assert (currentElement() == '\\'); - } - body - { - if (config.tokenStyle & TokenStyle.notEscaped) - { - keepChar(); - switch (currentElement()) - { - case '\'': - case '"': - case '?': - case '\\': - case 'a': - case 'b': - case 'f': - case 'n': - case 'r': - case 't': - case 'v': - case 0x0a: - case 0x00: - keepChar(); - return; - case '0': .. case '7': - foreach(i; 0 .. 3) - { - keepChar(); - if (currentElement() < '0' || currentElement() > '7') return; - } - return; - case 'x': - keepChar(); - foreach(i; 0 .. 4) - { - if (!isHexDigit(currentElement())) - { - errorMessage("Expected hex digit"); - return; - } - keepChar(); - } - return; - case 'u': - case 'U': + void lexEscapeSequence() + in + { + assert (currentElement() == '\\'); + } + body + { + if (config.tokenStyle & TokenStyle.notEscaped) + { + keepChar(); + switch (currentElement()) + { + case '\'': + case '"': + case '?': + case '\\': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case 0x0a: + case 0x00: + keepChar(); + return; + case '0': .. case '7': + foreach(i; 0 .. 3) + { + keepChar(); + if (currentElement() < '0' || currentElement() > '7') return; + } + return; + case 'x': + keepChar(); + foreach(i; 0 .. 4) + { + if (!isHexDigit(currentElement())) + { + errorMessage("Expected hex digit"); + return; + } + keepChar(); + } + return; + case 'u': + case 'U': uint digits = currentElement == 'u' ? 4 : 8; - keepChar(); - foreach (i; 0 .. digits) - { + keepChar(); + foreach (i; 0 .. digits) + { if (!isHexDigit(currentElement())) { errorMessage("Expected hex digit instead of %s".format( @@ -1268,7 +1271,7 @@ private: } keepChar(); } - return; + return; case '&': while (!isEoF()) { @@ -1277,60 +1280,60 @@ private: break; } return; - default: - errorMessage("Invalid escape sequence"); - return; - } - } - else - { - advanceRange(); - switch (currentElement()) - { - case '\'': bufferChar('\''); advanceRange(); return; - case '"': bufferChar('"'); advanceRange(); return; - case '?': bufferChar('\?'); advanceRange(); return; - case '\\': bufferChar('\\'); advanceRange(); return; - case 'a': bufferChar('\a'); advanceRange(); return; - case 'b': bufferChar('\b'); advanceRange(); return; - case 'f': bufferChar('\f'); advanceRange(); return; - case 'n': bufferChar('\n'); advanceRange(); return; - case 'r': bufferChar('\r'); advanceRange(); return; - case 't': bufferChar('\t'); advanceRange(); return; - case 'v': bufferChar('\v'); advanceRange(); return; - case 0x0a: bufferChar(0x0a); advanceRange(); return; - case 0x00: bufferChar(0x00); advanceRange(); return; - case '0': .. case '7': - ubyte[3] digits; - size_t i; - while(i < 3 && !isEoF()) - { - digits[i++] = currentElement(); - advanceRange(); - if (currentElement() < '0' || currentElement() > '7') break; - } - decodeAndStore(digits, i, 8); - return; - case 'x': - ubyte[2] digits; - advanceRange(); - foreach(i; 0 .. 
2) - { - if (!isHexDigit(currentElement())) - { - errorMessage("Expected hex digit"); - return; - } - digits[i] = currentElement(); + default: + errorMessage("Invalid escape sequence"); + return; + } + } + else + { + advanceRange(); + switch (currentElement()) + { + case '\'': bufferChar('\''); advanceRange(); return; + case '"': bufferChar('"'); advanceRange(); return; + case '?': bufferChar('\?'); advanceRange(); return; + case '\\': bufferChar('\\'); advanceRange(); return; + case 'a': bufferChar('\a'); advanceRange(); return; + case 'b': bufferChar('\b'); advanceRange(); return; + case 'f': bufferChar('\f'); advanceRange(); return; + case 'n': bufferChar('\n'); advanceRange(); return; + case 'r': bufferChar('\r'); advanceRange(); return; + case 't': bufferChar('\t'); advanceRange(); return; + case 'v': bufferChar('\v'); advanceRange(); return; + case 0x0a: bufferChar(0x0a); advanceRange(); return; + case 0x00: bufferChar(0x00); advanceRange(); return; + case '0': .. case '7': + ubyte[3] digits; + size_t i; + while(i < 3 && !isEoF()) + { + digits[i++] = currentElement(); advanceRange(); - } - decodeAndStore(digits, 2, 16); - return; - case 'u': - case 'U': + if (currentElement() < '0' || currentElement() > '7') break; + } + decodeAndStore(digits, i, 8); + return; + case 'x': + ubyte[2] digits; + advanceRange(); + foreach(i; 0 .. 2) + { + if (!isHexDigit(currentElement())) + { + errorMessage("Expected hex digit"); + return; + } + digits[i] = currentElement(); + advanceRange(); + } + decodeAndStore(digits, 2, 16); + return; + case 'u': + case 'U': uint digitCount = currentElement == 'u' ? 4 : 8; - advanceRange(); - ubyte[8] digits; + advanceRange(); + ubyte[8] digits; foreach (i; 0 .. digitCount) { if (!isHexDigit(currentElement())) @@ -1341,8 +1344,8 @@ private: digits[i] = currentElement(); advanceRange(); } - decodeAndStore(digits, digitCount, 16); - return; + decodeAndStore(digits, digitCount, 16); + return; case '&': advanceRange(); ubyte[] b; @@ -1377,742 +1380,742 @@ private: bufferChar(cast(ubyte) (*entity)[i]); } return; - default: - errorMessage("Invalid escape sequence"); - return; - } - } - } + default: + errorMessage("Invalid escape sequence"); + return; + } + } + } - void decodeAndStore(ubyte[] digits, size_t maxIndex, uint base) - { + void decodeAndStore(ubyte[] digits, size_t maxIndex, uint base) + { scope(failure) { import std.stdio; stderr.writeln("Failed on line ", lineNumber, " of file ", config.fileName); } - char[4] codeUnits; - auto source = cast(char[]) digits[0 .. maxIndex]; - uint codePoint = parse!uint(source, base); - ulong unitCount = encode(codeUnits, codePoint); - foreach (i; 0 .. unitCount) - bufferChar(codeUnits[i]); - } + char[4] codeUnits; + auto source = cast(char[]) digits[0 .. maxIndex]; + uint codePoint = parse!uint(source, base); + ulong unitCount = encode(codeUnits, codePoint); + foreach (i; 0 .. 
unitCount) + bufferChar(codeUnits[i]); + } - void lexDelimitedString() - in - { - assert(currentElement() == '"'); - } - body - { - current.type = TokenType.stringLiteral; + void lexDelimitedString() + in + { + assert(currentElement() == '"'); + } + body + { + current.type = TokenType.stringLiteral; - keepChar(); + keepChar(); - bool heredoc; - ubyte open; - ubyte close; + bool heredoc; + ubyte open; + ubyte close; - switch (currentElement()) - { - case '[': open = '['; close = ']'; break; - case '{': open = '{'; close = '}'; break; - case '(': open = '('; close = ')'; break; - case '<': open = '<'; close = '>'; break; - default: heredoc = true; break; - } - if (heredoc) - lexHeredocString(); - else - lexNormalDelimitedString(open, close); - } + switch (currentElement()) + { + case '[': open = '['; close = ']'; break; + case '{': open = '{'; close = '}'; break; + case '(': open = '('; close = ')'; break; + case '<': open = '<'; close = '>'; break; + default: heredoc = true; break; + } + if (heredoc) + lexHeredocString(); + else + lexNormalDelimitedString(open, close); + } - void lexNormalDelimitedString(ubyte open, ubyte close) - in - { - assert(buffer[0 .. 2] == `q"`); - } - body - { - current.type = TokenType.stringLiteral; - int depth = 1; - keepChar(); - scope (exit) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(3, bufferIndex - 2); - } - while (true) - { - if (isEoF()) - errorMessage("Unterminated string literal"); - if (currentElement() == open) - { - keepChar(); - ++depth; - } - else if (currentElement() == close) - { - keepChar(); - --depth; - if (depth <= 0) - { - static if (isArray!R) - auto r = range[index .. $]; - else - auto r = range.save(); - if (r.front == '"') - { - keepChar(); - return; - } - else - { - errorMessage("Expected \" after balanced " - ~ cast(char) close ~ " but found " - ~ cast(char) r.front ~ " instead."); - return; - } - } - } - else - keepChar(); - } + void lexNormalDelimitedString(ubyte open, ubyte close) + in + { + assert(buffer[0 .. 2] == `q"`); + } + body + { + current.type = TokenType.stringLiteral; + int depth = 1; + keepChar(); + scope (exit) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(3, bufferIndex - 2); + } + while (true) + { + if (isEoF()) + errorMessage("Unterminated string literal"); + if (currentElement() == open) + { + keepChar(); + ++depth; + } + else if (currentElement() == close) + { + keepChar(); + --depth; + if (depth <= 0) + { + static if (isArray!R) + auto r = range[index .. $]; + else + auto r = range.save(); + if (r.front == '"') + { + keepChar(); + return; + } + else + { + errorMessage("Expected \" after balanced " + ~ cast(char) close ~ " but found " + ~ cast(char) r.front ~ " instead."); + return; + } + } + } + else + keepChar(); + } - } + } - void lexHeredocString() - in - { - assert (buffer[0 .. bufferIndex] == "q\""); - } - body - { - auto i = bufferIndex; - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - return; - } - else if (isNewline(currentElement())) - { - keepChar(); - break; - } - else if (isSeparating()) - { - errorMessage("Unterminated string literal - Separating"); - return; - } - else - keepChar(); - } - auto ident = buffer[i .. bufferIndex - 1]; + void lexHeredocString() + in + { + assert (buffer[0 .. 
bufferIndex] == "q\""); + } + body + { + auto i = bufferIndex; + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + return; + } + else if (isNewline(currentElement())) + { + keepChar(); + break; + } + else if (isSeparating()) + { + errorMessage("Unterminated string literal - Separating"); + return; + } + else + keepChar(); + } + auto ident = buffer[i .. bufferIndex - 1]; - scope(exit) - { - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - { - size_t b = 2 + ident.length; - if (buffer[b] == '\r') ++b; - if (buffer[b] == '\n') ++b; - size_t e = bufferIndex; - if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w') - --e; - setTokenValue(b, e); - } - } + scope(exit) + { + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + { + size_t b = 2 + ident.length; + if (buffer[b] == '\r') ++b; + if (buffer[b] == '\n') ++b; + size_t e = bufferIndex; + if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w') + --e; + setTokenValue(b, e); + } + } - while (true) - { - if (isEoF()) - { - errorMessage("Unterminated string literal"); - return; - } - else if (buffer[bufferIndex - ident.length .. bufferIndex] == ident) - { - if (currentElement() == '"') - { - keepChar(); - lexStringSuffix(); - return; - } - else - { - errorMessage("Unterminated string literal"); - return; - } - } - else - keepChar(); - } - } + while (true) + { + if (isEoF()) + { + errorMessage("Unterminated string literal"); + return; + } + else if (buffer[bufferIndex - ident.length .. bufferIndex] == ident) + { + if (currentElement() == '"') + { + keepChar(); + lexStringSuffix(); + return; + } + else + { + errorMessage("Unterminated string literal"); + return; + } + } + else + keepChar(); + } + } - void lexTokenString() - in - { - assert (currentElement() == '{'); - } - body - { - current.type = TokenType.stringLiteral; - keepChar(); - LexerConfig c = config; - config.iterStyle = IterationStyle.everything; - config.tokenStyle = TokenStyle.source; - size_t bi; - ubyte[] b = uninitializedArray!(ubyte[])(1024 * 4); - int depth = 1; - while (!isEoF()) - { - advance(); - while (bi + current.value.length >= b.length) - b.length += 1024 * 4; - b[bi .. bi + current.value.length] = cast(ubyte[]) current.value; - bi += current.value.length; - if (current.type == TokenType.lBrace) - ++depth; - else if (current.type == TokenType.rBrace) - { - --depth; - if (depth <= 0) - break; - } - } - config = c; - buffer[0] = 'q'; - buffer[1] = '{'; - buffer[2 .. bi + 2] = b[0 .. bi]; - buffer[bi++] = '}'; - bufferIndex = bi; - if (config.tokenStyle & TokenStyle.includeQuotes) - setTokenValue(); - else - setTokenValue(2, bufferIndex - 1); - lexStringSuffix(); - } + void lexTokenString() + in + { + assert (currentElement() == '{'); + } + body + { + current.type = TokenType.stringLiteral; + keepChar(); + LexerConfig c = config; + config.iterStyle = IterationStyle.everything; + config.tokenStyle = TokenStyle.source; + size_t bi; + ubyte[] b = uninitializedArray!(ubyte[])(1024 * 4); + int depth = 1; + while (!isEoF()) + { + advance(); + while (bi + current.value.length >= b.length) + b.length += 1024 * 4; + b[bi .. bi + current.value.length] = cast(ubyte[]) current.value; + bi += current.value.length; + if (current.type == TokenType.lBrace) + ++depth; + else if (current.type == TokenType.rBrace) + { + --depth; + if (depth <= 0) + break; + } + } + config = c; + buffer[0] = 'q'; + buffer[1] = '{'; + buffer[2 .. bi + 2] = b[0 .. 
bi]; + buffer[bi++] = '}'; + bufferIndex = bi; + if (config.tokenStyle & TokenStyle.includeQuotes) + setTokenValue(); + else + setTokenValue(2, bufferIndex - 1); + lexStringSuffix(); + } - void lexSpecialTokenSequence() - in - { - assert (currentElement() == '#'); - } - body - { - keepChar(); - static if (isArray!R) - auto r = range[index .. $]; - else - auto r = range.save(); - auto app = appender!(ubyte[])(); - app.put('#'); - while (true) - { - if (r.isRangeEoF()) - { - errorMessage("Found EOF when interpreting special token sequence"); - return; - } - else if (isNewline(r.front)) - break; - else - { - app.put(r.front); - r.popFront(); - } - } - auto m = match((cast(char[]) app.data), - `#line\s+(?P