// Written in the D programming language

/**
 * This module contains a range-based _lexer for the D programming language.
 *
 * For performance reasons the _lexer contained in this module operates only on
 * ASCII and UTF-8 encoded source code. If the use of other encodings is
 * desired, the source code must be converted to UTF-8 before passing it to this
 * _lexer.
 *
 * To use the _lexer, create a LexerConfig struct
 * ---
 * LexerConfig config;
 * config.iterStyle = IterationStyle.everything;
 * config.tokenStyle = TokenStyle.source;
 * config.versionNumber = 2061;
 * config.vendorString = "Lexer Example";
 * ---
 * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
 * source code, passing in the configuration.
 * ---
 * auto source = "import std.stdio;"c;
 * auto tokens = byToken(source, config);
 * ---
 * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can
 * be used easily with the algorithms from std.algorithm or iterated over with
 * $(D_KEYWORD foreach)
 * ---
 * assert (tokens.front.type == TokenType.import_);
 * assert (tokens.front.value == "import");
 * assert (tokens.front.line == 1);
 * assert (tokens.front.startIndex == 0);
 * ---
 *
 * Examples:
 *
 * Generate HTML markup of D code.
 * ---
 * module highlighter;
 *
 * import std.stdio;
 * import std.array;
 * import std.d.lexer;
 *
 * void writeSpan(string cssClass, string value)
 * {
 *     // NOTE(review): the original HTML tags in this example were lost during
 *     // extraction; reconstructed as a <span> wrapper with HTML-escaped text.
 *     stdout.write(`<span class="`, cssClass, `">`, value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
 * }
 *
 *
 * // http://ethanschoonover.com/solarized
 * void highlight(R)(R tokens)
 * {
 *     stdout.writeln(q"[
* * * * *]"); * * foreach (Token t; tokens) * { * if (isType(t.type)) * writeSpan("type", t.value); * else if (isKeyword(t.type)) * writeSpan("kwrd", t.value); * else if (t.type == TokenType.comment) * writeSpan("com", t.value); * else if (isStringLiteral(t.type)) * writeSpan("str", t.value); * else if (isNumberLiteral(t.type)) * writeSpan("num", t.value); * else if (isOperator(t.type)) * writeSpan("op", t.value); * else * stdout.write(t.value.replace("<", "<")); * } * stdout.writeln("\n"); * } * * void main(string[] args) * { * LexerConfig config; * config.tokenStyle = TokenStyle.source; * config.iterStyle = IterationStyle.everything; * config.fileName = args[1]; * auto f = File(args[1]); * (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); * } * --- * * Copyright: Brian Schott 2013 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0) * Authors: Brian Schott * Source: $(PHOBOSSRC std/d/_lexer.d) */ module std.d.lexer; import std.algorithm; import std.ascii; import std.conv; import std.d.entities; import std.datetime; import std.exception; import std.range; import std.string; import std.traits; import std.uni; import std.utf; import std.regex; import std.container; public: /** * Represents a D token */ struct Token { /** * The token type. */ TokenType type; /** * The representation of the token in the original source code. */ string value; /** * The number of the line the token is on. */ uint line; /** * The column number of the start of the token in the original source. * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) */ uint column; /** * The index of the start of the token in the original source. * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) */ uint startIndex; /** * Check to see if the token is of the same type and has the same string * representation as the given token. 
*/ bool opEquals(ref const(Token) other) const { return other.type == type && other.value == value; } /** * Checks to see if the token's string representation is equal to the given * string. */ bool opEquals(string value) const { return this.value == value; } /** * Checks to see if the token is of the given type. */ bool opEquals(TokenType type) const { return type == type; } /** * Comparison operator orders tokens by start index. */ int opCmp(size_t i) const { if (startIndex < i) return -1; if (startIndex > i) return 1; return 0; } } /** * Configure the behavior of the byToken() function. These flags may be * combined using a bitwise or. */ enum IterationStyle { /// Only include code, not whitespace or comments codeOnly = 0, /// Includes comments includeComments = 0b0001, /// Includes whitespace includeWhitespace = 0b0010, /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens) includeSpecialTokens = 0b0100, /// Do not stop iteration on reaching the ___EOF__ token ignoreEOF = 0b1000, /// Include everything everything = includeComments | includeWhitespace | ignoreEOF } /** * Configuration of the token lexing style. These flags may be combined with a * bitwise or. */ enum TokenStyle : uint { /** * Escape sequences will be replaced with their equivalent characters, * enclosing quote characters will not be included. Special tokens such as * __VENDOR__ will be replaced with their equivalent strings. Useful for * creating a compiler or interpreter. */ default_ = 0b0000, /** * Escape sequences will not be processed. An escaped quote character will * not terminate string lexing, but it will not be replaced with the quote * character in the token. 
*/ notEscaped = 0b0001, /** * Strings will include their opening and closing quote characters as well * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will * include the $(D_STRING 'w') character as well as the opening and closing * quotes$(RPAREN) */ includeQuotes = 0b0010, /** * Do not replace the value field of the special tokens such as ___DATE__ * with their string equivalents. */ doNotReplaceSpecial = 0b0100, /** * Strings will be read exactly as they appeared in the source, including * their opening and closing quote characters. Useful for syntax * highlighting. */ source = notEscaped | includeQuotes | doNotReplaceSpecial } /** * Lexer configuration */ struct LexerConfig { /** * Iteration style */ IterationStyle iterStyle = IterationStyle.codeOnly; /** * Token style */ TokenStyle tokenStyle = tokenStyle.default_; /** * Replacement for the ___VERSION__ token. Defaults to 1. */ uint versionNumber = 100; /** * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer") */ string vendorString = "std.d.lexer"; /** * Name used when creating error messages that are sent to errorFunc. This * is needed because the lexer operates on any forwarad range of ASCII * characters or UTF-8 code units and does not know what to call its input * source. Defaults to the empty string. */ string fileName = ""; /** * This function is called when an error is encountered during lexing. * Parameters are file name, code uint index, line number, column, * and error messsage. */ void delegate(string, uint, uint, uint, string) errorFunc; /** * Initial size of the lexer's internal token buffer in bytes. The lexer * will grow this buffer if necessary. */ size_t bufferSize = 1024 * 4; } /** * Iterate over the given range of characters by D tokens. 
* Params: * range = the range of characters * config = the lexer configuration * Returns: * an input range of tokens */ TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R)) { auto r = TokenRange!(R)(range); r.config = config; r.lineNumber = 1; r.popFront(); return r; } /** * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. */ struct TokenRange(R) if (isForwardRange!(R)) { /** * Returns: true if the range is empty */ bool empty() const @property { return _empty; } /** * Returns: the current token */ Token front() const @property { enforce(!_empty, "Cannot call front() on empty token range"); return current; } /** * Returns the current token and then removes it from the range */ Token moveFront() { auto r = front(); popFront(); return r; } /** * Range operation */ int opApply(int delegate(Token) dg) { int result = 0; while (!empty) { result = dg(front); if (result) break; popFront(); } return result; } /** * Range operation */ int opApply(int delegate(size_t, Token) dg) { int result = 0; int i = 0; while (!empty) { result = dg(i, front); if (result) break; popFront(); } return result; } /** * Removes the current token from the range */ void popFront() { // Filter out tokens we don't care about loop: do { advance(); switch (current.type) { case TokenType.whitespace: if (config.iterStyle & IterationStyle.includeWhitespace) break loop; break; case TokenType.comment: if (config.iterStyle & IterationStyle.includeComments) break loop; break; case TokenType.specialTokenSequence: if (config.iterStyle & IterationStyle.includeSpecialTokens) break loop; break; default: break loop; } } while (!empty()); } private: this(ref R range) { this.range = range; buffer = new ubyte[config.bufferSize]; cache.initialize(); } /* * Advances the range to the next token */ void advance() { if (range.empty) { _empty = true; return; } bufferIndex = 0; current.line = lineNumber; current.startIndex = index; current.column = column; current.value = null; if 
(std.ascii.isWhite(range.front)) { lexWhitespace(); return; } outer: switch (range.front) { // pragma(msg, generateCaseTrie( mixin(generateCaseTrie( "=", "TokenType.assign", "@", "TokenType.at", "&", "TokenType.bitAnd", "&=", "TokenType.bitAndEquals", "|", "TokenType.bitOr", "|=", "TokenType.bitOrEquals", "~=", "TokenType.catEquals", ":", "TokenType.colon", ",", "TokenType.comma", "--", "TokenType.decrement", "$", "TokenType.dollar", "==", "TokenType.equals", "=>", "TokenType.goesTo", ">", "TokenType.greater", ">=", "TokenType.greaterEqual", "++", "TokenType.increment", "{", "TokenType.lBrace", "[", "TokenType.lBracket", "<", "TokenType.less", "<=", "TokenType.lessEqual", "<>=", "TokenType.lessEqualGreater", "<>", "TokenType.lessOrGreater", "&&", "TokenType.logicAnd", "||", "TokenType.logicOr", "(", "TokenType.lParen", "-", "TokenType.minus", "-=", "TokenType.minusEquals", "%", "TokenType.mod", "%=", "TokenType.modEquals", "*=", "TokenType.mulEquals", "!", "TokenType.not", "!=", "TokenType.notEquals", "!>", "TokenType.notGreater", "!>=", "TokenType.notGreaterEqual", "!<", "TokenType.notLess", "!<=", "TokenType.notLessEqual", "!<>", "TokenType.notLessEqualGreater", "+", "TokenType.plus", "+=", "TokenType.plusEquals", "^^", "TokenType.pow", "^^=", "TokenType.powEquals", "}", "TokenType.rBrace", "]", "TokenType.rBracket", ")", "TokenType.rParen", ";", "TokenType.semicolon", "<<", "TokenType.shiftLeft", "<<=", "TokenType.shiftLeftEqual", ">>", "TokenType.shiftRight", ">>=", "TokenType.shiftRightEqual", "*", "TokenType.star", "?", "TokenType.ternary", "~", "TokenType.tilde", "!<>=", "TokenType.unordered", ">>>", "TokenType.unsignedShiftRight", ">>>=", "TokenType.unsignedShiftRightEqual", "^", "TokenType.xor", "^=", "TokenType.xorEquals", // "bool", "TokenType.bool_", // "byte", "TokenType.byte_", // "cdouble", "TokenType.cdouble_", // "cent", "TokenType.cent_", // "cfloat", "TokenType.cfloat_", // "char", "TokenType.char_", // "creal", "TokenType.creal_", // "dchar", 
"TokenType.dchar_", // "double", "TokenType.double_", // "dstring", "TokenType.dstring_", // "float", "TokenType.float_", // "function", "TokenType.function_", // "idouble", "TokenType.idouble_", // "ifloat", "TokenType.ifloat_", // "int", "TokenType.int_", // "ireal", "TokenType.ireal_", // "long", "TokenType.long_", // "real", "TokenType.real_", // "short", "TokenType.short_", // "string", "TokenType.string_", // "ubyte", "TokenType.ubyte_", // "ucent", "TokenType.ucent_", // "uint", "TokenType.uint_", // "ulong", "TokenType.ulong_", // "ushort", "TokenType.ushort_", // "void", "TokenType.void_", // "wchar", "TokenType.wchar_", // "wstring", "TokenType.wstring_", // "align", "TokenType.align_", // "deprecated", "TokenType.deprecated_", // "extern", "TokenType.extern_", // "pragma", "TokenType.pragma_", // "export", "TokenType.export_", // "package", "TokenType.package_", // "private", "TokenType.private_", // "protected", "TokenType.protected_", // "public", "TokenType.public_", // "abstract", "TokenType.abstract_", // "auto", "TokenType.auto_", // "const", "TokenType.const_", // "final", "TokenType.final_", // "__gshared", "TokenType.gshared", // "immutable", "TokenType.immutable_", // "inout", "TokenType.inout_", // "scope", "TokenType.scope_", // "shared", "TokenType.shared_", // "static", "TokenType.static_", // "synchronized", "TokenType.synchronized_", // "alias", "TokenType.alias_", // "asm", "TokenType.asm_", // "assert", "TokenType.assert_", // "body", "TokenType.body_", // "break", "TokenType.break_", // "case", "TokenType.case_", // "cast", "TokenType.cast_", // "catch", "TokenType.catch_", // "class", "TokenType.class_", // "continue", "TokenType.continue_", // "debug", "TokenType.debug_", // "default", "TokenType.default_", // "delegate", "TokenType.delegate_", // "delete", "TokenType.delete_", // "do", "TokenType.do_", // "else", "TokenType.else_", // "enum", "TokenType.enum_", // "false", "TokenType.false_", // "finally", "TokenType.finally_", // 
"foreach", "TokenType.foreach_", // "foreach_reverse", "TokenType.foreach_reverse_", // "for", "TokenType.for_", // "goto", "TokenType.goto_", // "if", "TokenType.if_", // "import", "TokenType.import_", // "in", "TokenType.in_", // "interface", "TokenType.interface_", // "invariant", "TokenType.invariant_", // "is", "TokenType.is_", // "lazy", "TokenType.lazy_", // "macro", "TokenType.macro_", // "mixin", "TokenType.mixin_", // "module", "TokenType.module_", // "new", "TokenType.new_", // "nothrow", "TokenType.nothrow_", // "null", "TokenType.null_", // "out", "TokenType.out_", // "override", "TokenType.override_", // "pure", "TokenType.pure_", // "ref", "TokenType.ref_", // "return", "TokenType.return_", // "struct", "TokenType.struct_", // "super", "TokenType.super_", // "switch", "TokenType.switch_", // "template", "TokenType.template_", // "this", "TokenType.this_", // "throw", "TokenType.throw_", // "true", "TokenType.true_", // "try", "TokenType.try_", // "typedef", "TokenType.typedef_", // "typeid", "TokenType.typeid_", // "typeof", "TokenType.typeof_", // "union", "TokenType.union_", // "unittest", "TokenType.unittest_", // "version", "TokenType.version_", // "volatile", "TokenType.volatile_", // "while", "TokenType.while_", // "with", "TokenType.with_", // "__DATE__", "TokenType.date", // "__EOF__", "TokenType.eof", // "__TIME__", "TokenType.time", // "__TIMESTAMP__", "TokenType.timestamp", // "__VENDOR__", "TokenType.vendor", // "__VERSION__", "TokenType.compilerVersion", // "__FILE__", "TokenType.file", // "__LINE__", "TokenType.line", // "__traits", "TokenType.traits", // "__parameters", "TokenType.parameters", // "__vector", "TokenType.vector", )); case '/': auto r = range.save(); r.popFront(); if (r.isEoF()) { current.type = TokenType.div; current.value = "/"; range.popFront(); ++index; break; } switch (r.front) { case '/': case '*': case '+': lexComment(); break outer; case '=': current.type = TokenType.divEquals; current.value = "/="; 
range.popFront(); range.popFront(); index += 2; break outer; default: current.type = TokenType.div; current.value = "/"; ++index; range.popFront(); break outer; } case '.': auto r = range.save(); r.popFront(); if (r.isEoF()) { current.type = TokenType.dot; current.value = getTokenValue(TokenType.dot); range.popFront(); ++index; break outer; } else if (r.front >= '0' && r.front <= '9') { lexNumber(); break outer; } else if (r.front == '.') { current.type = TokenType.slice; r.popFront(); if (r.front == '.') { current.type = TokenType.vararg; range.popFront(); range.popFront(); range.popFront(); index += 3; } else { range.popFront(); range.popFront(); index += 2; } current.value = getTokenValue(current.type); } else { range.popFront(); current.type = TokenType.dot; current.value = getTokenValue(TokenType.dot); } break; case '0': .. case '9': lexNumber(); break; case '\'': case '"': case '`': lexString(); break; case 'q': auto r = range.save; r.popFront(); if (!r.isEoF() && r.front == '{') { lexTokenString(); break; } else if (!r.isEoF() && r.front == '"') { lexDelimitedString(); break; } else goto default; case 'r': auto r = range.save(); r.popFront(); if (!r.isEoF() && r.front == '"') { lexString(); break; } else goto default; case 'x': auto r = range.save(); r.popFront(); if (!r.isEoF() && r.front == '"') { lexHexString(); break; } else goto default; case '#': lexSpecialTokenSequence(); break; default: while(!range.isEoF() && !isSeparating(range.front)) { keepChar(); } current.type = lookupTokenType(cast(char[]) buffer[0 .. 
bufferIndex]); current.value = getTokenValue(current.type); if (current.value is null) setTokenValue(); if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof) { _empty = true; return; } if (!(config.iterStyle & TokenStyle.doNotReplaceSpecial)) break; switch (current.type) { case TokenType.date: current.type = TokenType.stringLiteral; auto time = Clock.currTime(); current.value = format("%s %02d %04d", time.month, time.day, time.year); break; case TokenType.time: auto time = Clock.currTime(); current.type = TokenType.stringLiteral; current.value = (cast(TimeOfDay)(time)).toISOExtString(); break; case TokenType.timestamp: auto time = Clock.currTime(); auto dt = cast(DateTime) time; current.type = TokenType.stringLiteral; current.value = format("%s %s %02d %02d:%02d:%02d %04d", dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute, dt.second, dt.year); break; case TokenType.vendor: current.type = TokenType.stringLiteral; current.value = config.vendorString; break; case TokenType.compilerVersion: current.type = TokenType.stringLiteral; current.value = format("%d", config.versionNumber); break; case TokenType.line: current.type = TokenType.intLiteral; current.value = format("%d", current.line); break; case TokenType.file: current.type = TokenType.stringLiteral; current.value = config.fileName; break; default: break; } break; } } void lexWhitespace() { current.type = TokenType.whitespace; while (!isEoF(range) && std.ascii.isWhite(range.front)) { keepChar(); } if (config.iterStyle & IterationStyle.includeWhitespace) setTokenValue(); } void lexComment() in { assert (range.front == '/'); } body { current.type = TokenType.comment; keepChar(); switch(range.front) { case '/': while (!isEoF(range) && !isNewline(range)) { keepChar(); } break; case '*': while (!isEoF(range)) { if (range.front == '*') { keepChar(); if (range.front == '/') { keepChar(); break; } } else keepChar(); } break; case '+': int depth = 1; while (depth > 0 && !isEoF(range)) { if 
(range.front == '+') { keepChar(); if (range.front == '/') { keepChar(); --depth; } } else if (range.front == '/') { keepChar(); if (range.front == '+') { keepChar(); ++depth; } } else keepChar(); } break; default: assert(false); } if (config.iterStyle & IterationStyle.includeComments) setTokenValue(); } void lexHexString() in { assert (range.front == 'x'); } body { current.type = TokenType.stringLiteral; keepChar(); keepChar(); while (true) { if (range.isEoF()) { errorMessage("Unterminated hex string literal"); return; } else if (isHexDigit(range.front)) { keepChar(); } else if (std.ascii.isWhite(range.front) && (config.tokenStyle & TokenStyle.notEscaped)) { keepChar(); } else if (range.front == '"') { keepChar(); break; } else { errorMessage(format("Invalid character '%s' in hex string literal", cast(char) range.front)); return; } } lexStringSuffix(); if (config.tokenStyle & TokenStyle.notEscaped) { if (config.tokenStyle & TokenStyle.includeQuotes) setTokenValue(); else setTokenValue(bufferIndex - 1, 2); } else { auto a = appender!(ubyte[])(); foreach (b; std.range.chunks(buffer[2 .. 
bufferIndex - 1], 2)) { string s = to!string(cast(char[]) b); a.put(cast(ubyte[]) to!string(cast(dchar) parse!uint(s, 16))); } current.value = to!string(cast(char[]) a.data); } } void lexNumber() in { assert(isDigit(cast(char) range.front) || range.front == '.'); } body { // hex and binary can start with zero, anything else is decimal if (range.front != '0') lexDecimal(); else { auto r = range.save(); r.popFront(); switch (r.front) { case 'x': case 'X': keepChar(); keepChar(); lexHex(); break; case 'b': case 'B': keepChar(); keepChar(); lexBinary(); break; default: lexDecimal(); return; } } } void lexFloatSuffix() { switch (range.front) { case 'L': keepChar(); current.type = TokenType.doubleLiteral; break; case 'f': case 'F': keepChar(); current.type = TokenType.floatLiteral; break; default: break; } if (!range.isEoF() && range.front == 'i') { keepChar(); if (current.type == TokenType.floatLiteral) current.type = TokenType.ifloatLiteral; else current.type = TokenType.idoubleLiteral; } } void lexIntSuffix() { bool foundU; bool foundL; while (!range.isEoF()) { switch (range.front) { case 'u': case 'U': if (foundU) return; switch (current.type) { case TokenType.intLiteral: current.type = TokenType.uintLiteral; keepChar(); break; case TokenType.longLiteral: current.type = TokenType.ulongLiteral; keepChar(); break; default: return; } foundU = true; break; case 'L': if (foundL) return; switch (current.type) { case TokenType.intLiteral: current.type = TokenType.longLiteral; keepChar(); break; case TokenType.uintLiteral: current.type = TokenType.ulongLiteral; keepChar(); break; default: return; } foundL = true; break; default: return; } } } void lexExponent() in { assert (range.front == 'e' || range.front == 'E' || range.front == 'p' || range.front == 'P'); } body { keepChar(); bool foundSign = false; while (!range.isEoF()) { switch (range.front) { case '-': case '+': if (foundSign) return; foundSign = true; keepChar(); case '0': .. 
case '9': case '_': keepChar(); break; case 'L': case 'f': case 'F': case 'i': lexFloatSuffix(); return; default: return; } } } void lexDecimal() in { assert ((range.front >= '0' && range.front <= '9') || range.front == '.'); } body { bool foundDot = false; current.type = TokenType.intLiteral; scope(exit) setTokenValue(); decimalLoop: while (!range.isEoF()) { switch (range.front) { case '0': .. case '9': case '_': keepChar(); break; case 'i': case 'L': if (foundDot) { lexFloatSuffix(); return; } else { lexIntSuffix(); return; } case 'f': case 'F': lexFloatSuffix(); return; case 'e': case 'E': lexExponent(); return; case '.': auto r = range.save(); r.popFront(); if (!r.isEoF() && r.front == '.') break decimalLoop; // possibly slice expression if (foundDot) break decimalLoop; // two dots with other characters between them keepChar(); foundDot = true; current.type = TokenType.doubleLiteral; break; default: break decimalLoop; } } } void lexBinary() { current.type = TokenType.intLiteral; scope(exit) setTokenValue(); binaryLoop: while (!range.isEoF()) { switch (range.front) { case '0': case '1': case '_': keepChar(); break; case 'u': case 'U': case 'L': lexIntSuffix(); return; default: break binaryLoop; } } } void lexHex() { current.type = TokenType.intLiteral; scope(exit) setTokenValue(); bool foundDot; hexLoop: while (!range.isEoF()) { switch (range.front) { case 'a': .. case 'f': case 'A': .. case 'F': case '0': .. 
case '9': case '_': keepChar(); break; case 'i': case 'L': if (foundDot) { lexFloatSuffix(); return; } else { lexIntSuffix(); return; } case 'p': case 'P': lexExponent(); return; case '.': auto r = range.save(); r.popFront(); if (!r.isEoF() && r.front == '.') break hexLoop; // slice expression if (foundDot) break hexLoop; // two dots with other characters between them keepChar(); foundDot = true; current.type = TokenType.doubleLiteral; break; default: break hexLoop; } } } void lexStringSuffix() { current.type = TokenType.stringLiteral; if (!range.isEoF()) { switch (range.front) { case 'w': current.type = TokenType.wstringLiteral; goto case 'c'; case 'd': current.type = TokenType.dstringLiteral; goto case 'c'; case 'c': keepChar(); break; default: break; } } } void lexString() in { assert (range.front == '\'' || range.front == '"' || range.front == '`' || range.front == 'r'); } body { current.type = TokenType.stringLiteral; bool isWysiwyg = range.front == 'r' || range.front == '`'; if (range.front == 'r') keepChar(); scope (exit) { if (config.tokenStyle & TokenStyle.includeQuotes) setTokenValue(); else { if (buffer[0] == 'r') setTokenValue(bufferIndex - 1, 2); else setTokenValue(bufferIndex - 1, 1); } } auto quote = range.front; keepChar(); while (true) { if (range.isEoF()) { errorMessage("Unterminated string literal"); return; } else if (range.front == '\\' && !isWysiwyg) { if (config.tokenStyle & TokenStyle.notEscaped) { auto r = range.save(); r.popFront(); if (r.front == quote && !isWysiwyg) { keepChar(); keepChar(); } else if (r.front == '\\' && !isWysiwyg) { keepChar(); keepChar(); } else keepChar(); } else interpretEscapeSequence(range, index, buffer, bufferIndex); } else if (range.front == quote) { keepChar(); break; } else keepChar(); } lexStringSuffix(); } void lexDelimitedString() in { assert(range.front == 'q'); } body { current.type = TokenType.stringLiteral; keepChar(); keepChar(); bool heredoc; ubyte open; ubyte close; switch (range.front) { case '[': 
open = '['; close = ']'; break; case '{': open = '{'; close = '}'; break; case '(': open = '('; close = ')'; break; case '<': open = '<'; close = '>'; break; default: heredoc = true; break; } if (heredoc) lexHeredocString(); else lexNormalDelimitedString(open, close); } void lexNormalDelimitedString(ubyte open, ubyte close) in { assert(buffer[0 .. bufferIndex] == "q\""); } body { current.type = TokenType.stringLiteral; int depth = 1; keepChar(); scope (exit) { if (config.tokenStyle & TokenStyle.includeQuotes) setTokenValue(); else setTokenValue(bufferIndex - 2, 3); } while (true) { if (range.isEoF()) errorMessage("Unterminated string literal"); if (range.front == open) { keepChar(); ++depth; } else if (range.front == close) { keepChar(); --depth; if (depth <= 0) { auto r = range.save(); if (r.front == '"') { keepChar(); return; } else { errorMessage("Expected \" after balanced " ~ cast(char) close ~ " but found " ~ cast(char) r.front ~ " instead."); return; } } } else keepChar(); } } void lexHeredocString() in { assert (buffer[0 .. bufferIndex] == "q\""); } body { auto i = bufferIndex; while (true) { if (range.isEoF()) { errorMessage("Unterminated string literal"); return; } else if (isNewline(range)) { keepChar(); break; } else if (isSeparating(range.front)) { errorMessage("Unterminated string literal - Separating"); return; } else keepChar(); } auto ident = buffer[i .. bufferIndex - 1]; scope(exit) { if (config.tokenStyle & TokenStyle.includeQuotes) setTokenValue(); else { size_t b = 2 + ident.length; if (buffer[b] == '\r') ++b; if (buffer[b] == '\n') ++b; size_t e = bufferIndex; if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w') --e; setTokenValue(e, b); } } while (true) { if (range.isEoF()) { errorMessage("Unterminated string literal -- a"); return; } else if (buffer[bufferIndex - ident.length .. 
bufferIndex] == ident) { if (range.front == '"') { keepChar(); lexStringSuffix(); return; } else { errorMessage("Unterminated string literal -- b"); return; } } else keepChar(); } } void lexTokenString() in { assert (range.front == 'q'); } body { current.type = TokenType.stringLiteral; size_t i; scope (exit) { if (config.tokenStyle & TokenStyle.includeQuotes) setTokenValue(); else setTokenValue(bufferIndex - 1, 2); } keepChar(); keepChar(); LexerConfig c; c.iterStyle = IterationStyle.everything; c.tokenStyle = TokenStyle.source; auto r = byToken(range, c); r.index = index; int depth = 1; while (!r.empty) { if (r.front.type == TokenType.lBrace) { ++depth; } else if (r.front.type == TokenType.rBrace) { --depth; if (depth <= 0) { if (config.tokenStyle & TokenStyle.includeQuotes) { if (bufferIndex >= buffer.length) buffer.length += 1024; buffer[bufferIndex++] = '}'; } r.popFront(); break; } } if (bufferIndex + r.front.value.length > buffer.length) buffer.length += 1024; buffer[bufferIndex .. bufferIndex + r.front.value.length] = cast(ubyte[]) r.front.value; bufferIndex += r.front.value.length; r.popFront(); } lexStringSuffix(); } void lexSpecialTokenSequence() in { assert (range.front == '#'); } body { keepChar(); auto r = range.save(); auto app = appender!(ubyte[])(); app.put('#'); while (true) { if (r.isEoF()) { errorMessage("Found EOF when interpreting special token sequence"); return; } else if (isNewline(r)) break; else { app.put(r.front); r.popFront(); } } auto m = match((cast(char[]) app.data), `#line\s+(?P