// Written in the D programming language

/**
 * This module contains a range-based _lexer for the D programming language.
 *
 * For performance reasons the _lexer contained in this module operates only on
 * ASCII and UTF-8 encoded source code. If the use of other encodings is
 * desired, the source code must be converted to UTF-8 before passing it to this
 * _lexer.
 *
 * To use the _lexer, create a LexerConfig struct:
 * ---
 * LexerConfig config;
 * config.iterStyle = IterationStyle.everything;
 * config.tokenStyle = TokenStyle.source;
 * config.versionNumber = 2061;
 * config.vendorString = "Lexer Example";
 * ---
 * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your
 * source code, passing in the configuration.
 * ---
 * auto source = "import std.stdio;"c;
 * auto tokens = byToken(source, config);
 * ---
 * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can
 * be used easily with the algorithms from std.algorithm or iterated over with
 * $(D_KEYWORD foreach).
 * ---
 * assert (tokens.front.type == TokenType.import_);
 * assert (tokens.front.value == "import");
 * assert (tokens.front.line == 1);
 * assert (tokens.front.startIndex == 0);
 * ---
 *
 * Examples:
 *
 * Generate HTML markup of D code.
 * ---
 * module highlighter;
 *
 * import std.stdio;
 * import std.array;
 * import std.d.lexer;
 *
 * void writeSpan(string cssClass, string value)
 * {
 *     stdout.write(`<span class="`, cssClass, `">`,
 *         value.replace("&", "&amp;").replace("<", "&lt;"), `</span>`);
 * }
 *
 *
 * // http://ethanschoonover.com/solarized
 * void highlight(R)(R tokens)
 * {
 *     stdout.writeln(q"[
 *
 *
*
 *
 *
 *
 *]");
 *
 *     foreach (Token t; tokens)
 *     {
 *         if (isType(t.type))
 *             writeSpan("type", t.value);
 *         else if (isKeyword(t.type))
 *             writeSpan("kwrd", t.value);
 *         else if (t.type == TokenType.comment)
 *             writeSpan("com", t.value);
 *         else if (isStringLiteral(t.type))
 *             writeSpan("str", t.value);
 *         else if (isNumberLiteral(t.type))
 *             writeSpan("num", t.value);
 *         else if (isOperator(t.type))
 *             writeSpan("op", t.value);
 *         else
 *             stdout.write(t.value.replace("<", "&lt;"));
 *     }
 *     stdout.writeln("\n");
 * }
 *
 * void main(string[] args)
 * {
 *     LexerConfig config;
 *     config.tokenStyle = TokenStyle.source;
 *     config.iterStyle = IterationStyle.everything;
 *     config.fileName = args[1];
 *     auto f = File(args[1]);
 *     (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight();
 * }
 * ---
 *
 * Copyright: Brian Schott 2013
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt Boost, License 1.0)
 * Authors: Brian Schott
 * Source: $(PHOBOSSRC std/d/_lexer.d)
 */

module std.d.lexer;

import std.algorithm;
import std.ascii;
import std.conv;
import std.datetime;
import std.d.entities;
import std.exception;
import std.range;
import std.regex;
import std.string;
import std.traits;
import std.utf;

public:

/**
 * Represents a D token
 */
struct Token
{
    /**
     * The token type.
     */
    TokenType type;

    /**
     * The representation of the token in the original source code.
     */
    string value;

    /**
     * The number of the line the token is on.
     */
    uint line;

    /**
     * The column number of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    uint column;

    /**
     * The index of the start of the token in the original source.
     * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN)
     */
    size_t startIndex;

    /**
     * Check to see if the token is of the same type and has the same string
     * representation as the given token. Position information (line, column,
     * startIndex) is not compared.
     */
    bool opEquals(ref const(Token) other) const
    {
        return other.type == type && other.value == value;
    }

    /**
     * Checks to see if the token's string representation is equal to the given
     * string.
     */
    bool opEquals(string value) const
    {
        return this.value == value;
    }

    /**
     * Checks to see if the token is of the given type.
     */
    bool opEquals(TokenType type) const
    {
        // The parameter shadows the member, so the member must be qualified
        // with `this`; comparing `type == type` is always true.
        return this.type == type;
    }

    /**
     * Comparison operator orders tokens by start index.
     * Params:
     *     i = the source index to compare this token's startIndex against
     * Returns:
     *     -1, 0 or 1 if startIndex is less than, equal to, or greater than i
     */
    int opCmp(size_t i) const
    {
        if (startIndex < i) return -1;
        if (startIndex > i) return 1;
        return 0;
    }
}

/**
 * Configure the behavior of the byToken() function. These flags may be
 * combined using a bitwise or.
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    codeOnly = 0,
    /// Includes comments
    includeComments = 0b0001,
    /// Includes whitespace
    includeWhitespace = 0b0010,
    /// Include $(LINK2 http://dlang.org/lex.html#specialtokens, special tokens)
    includeSpecialTokens = 0b0100,
    /// Do not stop iteration on reaching the ___EOF__ token
    ignoreEOF = 0b1000,
    /// Include comments and whitespace and do not stop at ___EOF__. Note that
    /// this value does not contain the includeSpecialTokens flag.
    everything = includeComments | includeWhitespace | ignoreEOF
}

/**
 * Configuration of the token lexing style. These flags may be combined with a
 * bitwise or.
 */
enum TokenStyle : uint
{
    /**
     * Escape sequences will be replaced with their equivalent characters,
     * enclosing quote characters will not be included. Special tokens such as
     * __VENDOR__ will be replaced with their equivalent strings. Useful for
     * creating a compiler or interpreter.
     */
    default_ = 0b0000,

    /**
     * Escape sequences will not be processed. An escaped quote character will
     * not terminate string lexing, but it will not be replaced with the quote
     * character in the token.
     */
    notEscaped = 0b0001,

    /**
     * Strings will include their opening and closing quote characters as well
     * as any prefixes or suffixes $(LPAREN)e.g.: $(D_STRING "abcde"w) will
     * include the $(D_STRING 'w') character as well as the opening and closing
     * quotes$(RPAREN)
     */
    includeQuotes = 0b0010,

    /**
     * Do not replace the value field of the special tokens such as ___DATE__
     * with their string equivalents.
     */
    doNotReplaceSpecial = 0b0100,

    /**
     * Strings will be read exactly as they appeared in the source, including
     * their opening and closing quote characters. Useful for syntax
     * highlighting.
     */
    source = notEscaped | includeQuotes | doNotReplaceSpecial
}

/**
 * Lexer configuration
 */
struct LexerConfig
{
    /**
     * Iteration style
     */
    IterationStyle iterStyle = IterationStyle.codeOnly;

    /**
     * Token style
     */
    TokenStyle tokenStyle = TokenStyle.default_;

    /**
     * Replacement for the ___VERSION__ token. Defaults to 100.
     */
    uint versionNumber = 100;

    /**
     * Replacement for the ___VENDOR__ token. Defaults to $(D_STRING "std.d.lexer")
     */
    string vendorString = "std.d.lexer";

    /**
     * Name used when creating error messages that are sent to errorFunc. This
     * is needed because the lexer operates on any forward range of ASCII
     * characters or UTF-8 code units and does not know what to call its input
     * source. Defaults to the empty string.
     */
    string fileName = "";

    /**
     * This function is called when an error is encountered during lexing.
     * Parameters are file name, code unit index, line number, column,
     * and error message.
     */
    void delegate(string, size_t, uint, uint, string) errorFunc;

    /**
     * Initial size of the lexer's internal token buffer in bytes. The lexer
     * will grow this buffer if necessary.
     */
    size_t bufferSize = 1024 * 4;
}

/**
 * Iterate over the given range of characters by D tokens.
* Params:
 *     range = the range of characters
 *     config = the lexer configuration
 * Returns:
 *     an input range of tokens
 */
TokenRange!(R) byToken(R)(R range, LexerConfig config) if (isForwardRange!(R))
{
    auto r = TokenRange!(R)(range);
    r.config = config;
    r.lineNumber = 1;
    // Prime the range so that front is valid immediately.
    r.popFront();
    return r;
}

/**
 * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate.
 */
struct TokenRange(R) if (isForwardRange!(R))
{
    /**
     * Returns: true if the range is empty
     */
    bool empty() const @property
    {
        return _empty;
    }

    /**
     * Returns: the current token
     * Throws: an exception (via enforce) if the range is empty
     */
    ref const(Token) front() const @property
    {
        enforce(!_empty, "Cannot call front() on empty token range");
        return current;
    }

    /**
     * Returns the current token and then removes it from the range
     */
    Token moveFront()
    {
        auto r = front();
        popFront();
        return r;
    }

    /**
     * Range operation: foreach over the tokens. Iteration stops early when the
     * delegate returns a non-zero value.
     */
    int opApply(int delegate(Token) dg)
    {
        int result = 0;
        while (!empty)
        {
            result = dg(front);
            if (result)
                break;
            popFront();
        }
        return result;
    }

    /**
     * Range operation: foreach over the tokens with a zero-based counter.
     * Iteration stops early when the delegate returns a non-zero value.
     */
    int opApply(int delegate(size_t, Token) dg)
    {
        int result = 0;
        size_t i = 0;
        while (!empty)
        {
            // The counter must advance with every token handed to the delegate.
            result = dg(i++, front);
            if (result)
                break;
            popFront();
        }
        return result;
    }

    /**
     * Removes the current token from the range
     */
    void popFront()
    {
        // Filter out tokens we don't care about
        loop: do
        {
            advance();
            switch (current.type)
            {
            case TokenType.whitespace:
                if (config.iterStyle & IterationStyle.includeWhitespace)
                    break loop;
                break;
            case TokenType.comment:
                if (config.iterStyle & IterationStyle.includeComments)
                    break loop;
                break;
            case TokenType.specialTokenSequence:
                if (config.iterStyle & IterationStyle.includeSpecialTokens)
                    break loop;
                break;
            default:
                break loop;
            }
        }
        while (!empty());
    }

private:

    /*
     * Stores the source range and allocates the token buffer.
     */
    this(ref R range)
    {
        this.range = range;
        buffer = uninitializedArray!(ubyte[])(bufferSize);
    }

    /*
     * Advances the range to the next token
     */
    void advance()
    {
        if (isEoF())
        {
            _empty = true;
            return;
        }

        bufferIndex = 0;
        current.line = lineNumber;
        current.startIndex = index;
        current.column = column;
        current.value = null;

        if (isWhite())
        {
            lexWhitespace();
            return;
        }

        switch (currentElement())
        {
//      pragma(msg, generateCaseTrie(
        mixin(generateCaseTrie(
            "=", "TokenType.assign",
            "@", "TokenType.at",
            "&", "TokenType.bitAnd",
            "&=", "TokenType.bitAndEquals",
            "|", "TokenType.bitOr",
            "|=", "TokenType.bitOrEquals",
            "~=", "TokenType.catEquals",
            ":", "TokenType.colon",
            ",", "TokenType.comma",
            "--", "TokenType.decrement",
            "$", "TokenType.dollar",
            "==", "TokenType.equals",
            "=>", "TokenType.goesTo",
            ">", "TokenType.greater",
            ">=", "TokenType.greaterEqual",
            "++", "TokenType.increment",
            "{", "TokenType.lBrace",
            "[", "TokenType.lBracket",
            "<", "TokenType.less",
            "<=", "TokenType.lessEqual",
            "<>=", "TokenType.lessEqualGreater",
            "<>", "TokenType.lessOrGreater",
            "&&", "TokenType.logicAnd",
            "||", "TokenType.logicOr",
            "(", "TokenType.lParen",
            "-", "TokenType.minus",
            "-=", "TokenType.minusEquals",
            "%", "TokenType.mod",
            "%=", "TokenType.modEquals",
            "*=", "TokenType.mulEquals",
            "!", "TokenType.not",
            "!=", "TokenType.notEquals",
            "!>", "TokenType.notGreater",
            "!>=", "TokenType.notGreaterEqual",
            "!<", "TokenType.notLess",
            "!<=", "TokenType.notLessEqual",
            "!<>", "TokenType.notLessEqualGreater",
            "+", "TokenType.plus",
            "+=", "TokenType.plusEquals",
            "^^", "TokenType.pow",
            "^^=", "TokenType.powEquals",
            "}", "TokenType.rBrace",
            "]", "TokenType.rBracket",
            ")", "TokenType.rParen",
            ";", "TokenType.semicolon",
            "<<", "TokenType.shiftLeft",
            "<<=", "TokenType.shiftLeftEqual",
            ">>", "TokenType.shiftRight",
            ">>=", "TokenType.shiftRightEqual",
            "*", "TokenType.star",
            "?", "TokenType.ternary",
            "~", "TokenType.tilde",
            "!<>=", "TokenType.unordered",
            ">>>", "TokenType.unsignedShiftRight",
            ">>>=", "TokenType.unsignedShiftRightEqual",
            "^", "TokenType.xor",
            "^=", "TokenType.xorEquals",
        ));
        case '/':
            keepNonNewlineChar();
            if (isEoF())
            {
                current.type = TokenType.div;
                current.value = "/";
                return;
            }
            switch (currentElement())
            {
            case '/':
            case '*':
            case '+':
                lexComment();
                return;
            case '=':
                current.type = TokenType.divEquals;
                current.value = "/=";
                advanceRange();
                return;
            default:
                current.type = TokenType.div;
                current.value = "/";
                return;
            }
        case '.':
            keepNonNewlineChar();
            if (isEoF())
            {
                current.type = TokenType.dot;
                current.value = getTokenValue(TokenType.dot);
                return;
            }
            switch (currentElement())
            {
            case '0': .. case '9':
                lexNumber();
                return;
            case '.':
                current.type = TokenType.slice;
                keepNonNewlineChar();
                // ".." may be the last thing in the input
                if (!isEoF() && currentElement() == '.')
                {
                    current.type = TokenType.vararg;
                    keepNonNewlineChar();
                }
                current.value = getTokenValue(current.type);
                return;
            default:
                current.type = TokenType.dot;
                current.value = getTokenValue(TokenType.dot);
                return;
            }
        case '0': .. case '9':
            keepNonNewlineChar();
            lexNumber();
            return;
        case '\'':
            lexCharacterLiteral();
            return;
        case '"':
        case '`':
            lexString();
            return;
        case 'q':
            keepNonNewlineChar();
            if (isEoF())
                goto default;
            switch (currentElement())
            {
            case '{':
                lexTokenString();
                return;
            case '"':
                lexDelimitedString();
                return;
            default:
                break;
            }
            goto default;
        case 'r':
            keepNonNewlineChar();
            if (isEoF())
                goto default;
            else if (currentElement() == '"')
            {
                lexString();
                return;
            }
            else
                goto default;
        case 'x':
            keepNonNewlineChar();
            if (isEoF())
                goto default;
            else if (currentElement() == '"')
            {
                lexHexString();
                return;
            }
            else
                goto default;
        case '#':
            lexSpecialTokenSequence();
            return;
        default:
            // Identifier, keyword or special token: read up to the next separator
            while(!isEoF() && !isSeparating())
            {
                keepNonNewlineChar();
            }

            current.type = lookupTokenType(cast(char[]) buffer[0 .. bufferIndex]);
            current.value = getTokenValue(current.type);
            if (current.value is null)
                setTokenValue();

            if (!(config.iterStyle & IterationStyle.ignoreEOF) && current.type == TokenType.eof)
            {
                _empty = true;
                return;
            }

            // Special tokens are replaced unless the *token style* asks for
            // them to be left alone. (doNotReplaceSpecial is a TokenStyle
            // flag; testing it against iterStyle checks an unrelated bit.)
            if (config.tokenStyle & TokenStyle.doNotReplaceSpecial)
                return;

            switch (current.type)
            {
            case TokenType.date:
                current.type = TokenType.stringLiteral;
                auto time = Clock.currTime();
                current.value = format("%s %02d %04d", time.month, time.day, time.year);
                return;
            case TokenType.time:
                auto time = Clock.currTime();
                current.type = TokenType.stringLiteral;
                current.value = (cast(TimeOfDay)(time)).toISOExtString();
                return;
            case TokenType.timestamp:
                auto time = Clock.currTime();
                auto dt = cast(DateTime) time;
                current.type = TokenType.stringLiteral;
                current.value = format("%s %s %02d %02d:%02d:%02d %04d",
                    dt.dayOfWeek, dt.month, dt.day, dt.hour, dt.minute,
                    dt.second, dt.year);
                return;
            case TokenType.vendor:
                current.type = TokenType.stringLiteral;
                current.value = config.vendorString;
                return;
            case TokenType.compilerVersion:
                current.type = TokenType.stringLiteral;
                current.value = format("%d", config.versionNumber);
                return;
            case TokenType.line:
                current.type = TokenType.intLiteral;
                current.value = format("%d", current.line);
                return;
            case TokenType.file:
                current.type = TokenType.stringLiteral;
                current.value = config.fileName;
                return;
            default:
                return;
            }
        }
    }

    /*
     * Consumes a run of whitespace. The token value is only stored when
     * whitespace tokens are requested by the iteration style.
     */
    void lexWhitespace()
    {
        current.type = TokenType.whitespace;
        while (!isEoF() && isWhite())
        {
            keepChar();
        }
        if (config.iterStyle & IterationStyle.includeWhitespace)
            setTokenValue();
    }

    /*
     * Lexes a line, block or nesting block comment. On entry the leading '/'
     * has already been consumed and the current element is the second
     * character of the opening sequence. The token value is only stored when
     * comments are requested by the iteration style.
     */
    void lexComment()
    in
    {
        assert (currentElement() == '/' || currentElement() == '*' || currentElement() == '+');
    }
    body
    {
        current.type = TokenType.comment;
        switch(currentElement())
        {
        case '/':
            while (!isEoF() && !isNewline(currentElement()))
            {
                keepNonNewlineChar();
            }
            break;
        case '*':
            // Consume the '*' of the opening "/*" first so that it cannot be
            // reused as part of the closing "*/" (e.g. in "/*/").
            keepNonNewlineChar();
            while (!isEoF())
            {
                if (currentElement() == '*')
                {
                    keepNonNewlineChar();
                    if (!isEoF() && currentElement() == '/')
                    {
                        keepNonNewlineChar();
                        break;
                    }
                }
                else
                    keepChar();
            }
            break;
        case '+':
            // Consume the '+' of the opening "/+" first so that it cannot be
            // reused as part of a closing "+/" (e.g. in "/+/").
            keepNonNewlineChar();
            int depth = 1;
            while (depth > 0 && !isEoF())
            {
                if (currentElement() == '+')
                {
                    keepNonNewlineChar();
                    if (!isEoF() && currentElement() == '/')
                    {
                        keepNonNewlineChar();
                        --depth;
                    }
                }
                else if (currentElement() == '/')
                {
                    keepNonNewlineChar();
                    if (!isEoF() && currentElement() == '+')
                    {
                        keepNonNewlineChar();
                        ++depth;
                    }
                }
                else
                    keepChar();
            }
            break;
        default:
            assert(false);
        }
        if (config.iterStyle & IterationStyle.includeComments)
            setTokenValue();
    }

    /*
     * Lexes a hex string literal. On entry the 'x' prefix is in the buffer and
     * the current element is the opening quote.
     */
    void lexHexString()
    in
    {
        assert (currentElement() == '"' && buffer[0] == 'x');
    }
    body
    {
        current.type = TokenType.stringLiteral;
        keepChar();
        while (true)
        {
            if (isEoF())
            {
                errorMessage("Unterminated hex string literal");
                return;
            }
            else if (isHexDigit(currentElement()))
            {
                keepNonNewlineChar();
            }
            else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
            {
                keepChar();
            }
            else if (currentElement() == '"')
            {
                keepNonNewlineChar();
                break;
            }
            else
            {
                errorMessage(format("Invalid character '%s' in hex string literal",
                    cast(char) currentElement()));
                return;
            }
        }
        lexStringSuffix();
        if (config.tokenStyle & TokenStyle.notEscaped)
        {
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
                setTokenValue(2, bufferIndex - 1);
        }
        else
        {
            // Decode each pair of hex digits into the character it denotes
            auto a = appender!(ubyte[])();
            foreach (b; std.range.chunks(buffer[2 .. bufferIndex - 1], 2))
            {
                string s = to!string(cast(char[]) b);
                a.put(cast(ubyte[]) to!string(cast(dchar) parse!uint(s, 16)));
            }
            current.value = to!string(cast(char[]) a.data);
        }
    }

    /*
     * Dispatches to the hex, binary or decimal number lexer. On entry the
     * first character of the literal (a digit or '.') is already in the
     * buffer and the current element is the character that follows it.
     */
    void lexNumber()
    in
    {
        assert ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.');
    }
    body
    {
        // hex and binary can start with zero, anything else is decimal.
        // The leading digit has already been buffered, so it is buffer[0]
        // that decides, and the current element that selects the radix.
        if (buffer[0] != '0' || isEoF())
        {
            lexDecimal();
            return;
        }

        switch (currentElement())
        {
        case 'x':
        case 'X':
            keepNonNewlineChar();
            lexHex();
            break;
        case 'b':
        case 'B':
            keepNonNewlineChar();
            lexBinary();
            break;
        default:
            lexDecimal();
            return;
        }
    }

    /*
     * Consumes an optional 'L', 'f' or 'F' suffix followed by an optional 'i'
     * suffix and adjusts the token type accordingly. Must not be called at
     * the end of the input.
     */
    void lexFloatSuffix()
    {
        switch (currentElement())
        {
        case 'L':
            keepNonNewlineChar();
            current.type = TokenType.doubleLiteral;
            break;
        case 'f':
        case 'F':
            keepNonNewlineChar();
            current.type = TokenType.floatLiteral;
            break;
        default:
            break;
        }
        if (!isEoF() && currentElement() == 'i')
        {
            keepNonNewlineChar();
            if (current.type == TokenType.floatLiteral)
                current.type = TokenType.ifloatLiteral;
            else
                current.type = TokenType.idoubleLiteral;
        }
    }

    /*
     * Consumes at most one 'u'/'U' and at most one 'L' suffix, in either
     * order, and adjusts the integer token type accordingly.
     */
    void lexIntSuffix()
    {
        bool foundU;
        bool foundL;
        while (!isEoF())
        {
            switch (currentElement())
            {
            case 'u':
            case 'U':
                if (foundU)
                    return;
                switch (current.type)
                {
                case TokenType.intLiteral:
                    current.type = TokenType.uintLiteral;
                    keepNonNewlineChar();
                    break;
                case TokenType.longLiteral:
                    current.type = TokenType.ulongLiteral;
                    keepNonNewlineChar();
                    break;
                default:
                    return;
                }
                foundU = true;
                break;
            case 'L':
                if (foundL)
                    return;
                switch (current.type)
                {
                case TokenType.intLiteral:
                    current.type = TokenType.longLiteral;
                    keepNonNewlineChar();
                    break;
                case TokenType.uintLiteral:
                    current.type = TokenType.ulongLiteral;
                    keepNonNewlineChar();
                    break;
                default:
                    return;
                }
                foundL = true;
                break;
            default:
                return;
            }
        }
    }

    /*
     * Lexes the exponent part of a floating-point literal, starting at the
     * 'e', 'E', 'p' or 'P' character, including any float suffix.
     */
    void lexExponent()
    in
    {
        assert (currentElement() == 'e' || currentElement() == 'E'
            || currentElement() == 'p' || currentElement() == 'P');
    }
    body
    {
        keepNonNewlineChar();
        // A literal with an exponent is a floating-point literal even when it
        // contains no decimal point.
        if (current.type == TokenType.intLiteral)
            current.type = TokenType.doubleLiteral;
        bool foundSign = false;
        while (!isEoF())
        {
            switch (currentElement())
            {
            case '-':
            case '+':
                if (foundSign)
                    return;
                foundSign = true;
                keepNonNewlineChar();
                // Do not fall through into the digit case: that would consume
                // one more character without looking at it.
                break;
            case '0': .. case '9':
            case '_':
                keepNonNewlineChar();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                lexFloatSuffix();
                return;
            default:
                return;
            }
        }
    }

    /*
     * Lexes the remainder of a decimal integer or floating-point literal.
     * On entry the first character (a digit, or '.' for literals such as
     * ".5") is already in the buffer.
     */
    void lexDecimal()
    in
    {
        assert ((buffer[0] >= '0' && buffer[0] <= '9') || buffer[0] == '.');
    }
    body
    {
        // A literal that begins with '.' has already used its decimal point
        bool foundDot = buffer[0] == '.';
        current.type = foundDot ? TokenType.doubleLiteral : TokenType.intLiteral;
        scope(exit) setTokenValue();
        decimalLoop: while (!isEoF())
        {
            switch (currentElement())
            {
            case '0': .. case '9':
            case '_':
                keepNonNewlineChar();
                break;
            case 'u':
            case 'U':
                if (foundDot)
                {
                    errorMessage("Floating-point literal cannot have %s suffix".format(
                        cast(char) currentElement()));
                    return;
                }
                lexIntSuffix();
                return;
            case 'i':
            case 'L':
                if (foundDot)
                {
                    lexFloatSuffix();
                    return;
                }
                else
                {
                    lexIntSuffix();
                    return;
                }
            case 'f':
            case 'F':
                lexFloatSuffix();
                return;
            case 'e':
            case 'E':
                lexExponent();
                return;
            case '.':
                // Look one character past the '.' without consuming anything
                static if (isArray!R)
                    auto r = range[index .. $];
                else
                    auto r = range.save();
                r.popFront();
                if (!r.isRangeEoF() && r.front == '.')
                    break decimalLoop; // possibly slice expression
                if (foundDot)
                    break decimalLoop; // two dots with other characters between them
                keepNonNewlineChar();
                foundDot = true;
                current.type = TokenType.doubleLiteral;
                break;
            default:
                break decimalLoop;
            }
        }
    }

    /*
     * Lexes the digits and integer suffix of a binary literal. The "0b"
     * prefix has already been consumed by lexNumber.
     */
    void lexBinary()
    {
        current.type = TokenType.intLiteral;
        scope(exit) setTokenValue();
        binaryLoop: while (!isEoF())
        {
            switch (currentElement())
            {
            case '0':
            case '1':
            case '_':
                keepNonNewlineChar();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix();
                return;
            default:
                break binaryLoop;
            }
        }
    }

    /*
     * Lexes the digits, optional fraction/exponent and suffix of a
     * hexadecimal literal. The "0x" prefix has already been consumed by
     * lexNumber.
     */
    void lexHex()
    {
        current.type = TokenType.intLiteral;
        scope(exit) setTokenValue();
        bool foundDot;
        hexLoop: while (!isEoF())
        {
            switch (currentElement())
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                keepNonNewlineChar();
                break;
            case 'i':
            case 'L':
                if (foundDot)
                {
                    lexFloatSuffix();
                    return;
                }
                else
                {
                    lexIntSuffix();
                    return;
                }
            case 'p':
            case 'P':
                lexExponent();
                return;
            case '.':
                // Look one character past the '.' without consuming anything
                static if (isArray!R)
                    auto r = range[index .. $];
                else
                    auto r = range.save();
                r.popFront();
                if (!r.isRangeEoF() && r.front == '.')
                    break hexLoop; // slice expression
                if (foundDot)
                    break hexLoop; // two dots with other characters between them
                keepNonNewlineChar();
                foundDot = true;
                current.type = TokenType.doubleLiteral;
                break;
            default:
                break hexLoop;
            }
        }
    }

    /*
     * Sets the token type to stringLiteral and then consumes an optional
     * 'c', 'w' or 'd' suffix, switching the type for 'w' and 'd'.
     */
    void lexStringSuffix()
    {
        current.type = TokenType.stringLiteral;
        if (!isEoF())
        {
            switch (currentElement())
            {
            case 'w':
                current.type = TokenType.wstringLiteral;
                goto case 'c';
            case 'd':
                current.type = TokenType.dstringLiteral;
                goto case 'c';
            case 'c':
                keepNonNewlineChar();
                break;
            default:
                break;
            }
        }
    }

    /*
     * Lexes a character literal, starting at the opening single quote.
     */
    void lexCharacterLiteral()
    in
    {
        assert (currentElement() == '\'');
    }
    body
    {
        current.type = TokenType.characterLiteral;
        scope (exit)
        {
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
                setTokenValue(1, bufferIndex - 1);
        }
        keepChar();
        if (isEoF())
        {
            errorMessage("Unterminated character literal");
            return;
        }
        switch (currentElement())
        {
        case '\'':
            return;
        case '\\':
            lexEscapeSequence();
            break;
        default:
            if (currentElement() & 0x80)
            {
                // Multi-byte UTF-8 sequence: keep every code unit with the
                // high bit set
                while (!isEoF() && (currentElement() & 0x80))
                    keepChar();
                break;
            }
            else
            {
                keepChar();
                break;
            }
        }
        if (isEoF() || currentElement() != '\'')
        {
            errorMessage("Expected \"'\" to end character literal");
            return;
        }
        keepChar();
    }

    /*
     * Lexes a double-quoted, backtick-quoted or r"" string literal, starting
     * at the opening quote character.
     */
    void lexString()
    in
    {
        assert (currentElement() == '"' || currentElement() == '`');
    }
    body
    {
        current.type = TokenType.stringLiteral;
        bool isWysiwyg = buffer[0] == 'r' || currentElement() == '`';

        scope (exit)
        {
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
            {
                if (buffer[0] == 'r')
                    setTokenValue(2, bufferIndex - 1);
                else
                    setTokenValue(1, bufferIndex - 1);
            }
        }

        auto quote = currentElement();
        keepChar();
        while (true)
        {
            if (isEoF())
            {
                errorMessage("Unterminated string literal");
                return;
            }
            else if (currentElement() == '\\')
            {
                if (isWysiwyg)
                    keepChar();
                else
                    lexEscapeSequence();
            }
            else if (currentElement() == quote)
            {
                keepNonNewlineChar();
                break;
            }
            else
                keepChar();
        }
        lexStringSuffix();
    }

    /*
     * Lexes one escape sequence, starting at the backslash. With
     * TokenStyle.notEscaped the source text of the sequence is kept as is,
     * otherwise the character(s) it denotes are placed in the buffer.
     */
    void lexEscapeSequence()
    in
    {
        assert (currentElement() == '\\');
    }
    body
    {
        if (config.tokenStyle & TokenStyle.notEscaped)
        {
            keepChar();
            switch (currentElement())
            {
            case '\'':
            case '"':
            case '?':
            case '\\':
            case 'a':
            case 'b':
            case 'f':
            case 'n':
            case 'r':
            case 't':
            case 'v':
            case 0x0a:
            case 0x00:
                keepChar();
                return;
            case '0': .. case '7':
                foreach(i; 0 .. 3)
                {
                    keepChar();
                    if (currentElement() < '0' || currentElement() > '7') return;
                }
                return;
            case 'x':
                keepChar();
                // \x is followed by exactly two hex digits, as in the
                // decoding branch below
                foreach(i; 0 .. 2)
                {
                    if (!isHexDigit(currentElement()))
                    {
                        errorMessage("Expected hex digit");
                        return;
                    }
                    keepChar();
                }
                return;
            case 'u':
            case 'U':
                uint digits = currentElement == 'u' ? 4 : 8;
                keepChar();
                foreach (i; 0 .. digits)
                {
                    if (!isHexDigit(currentElement()))
                    {
                        errorMessage("Expected hex digit instead of %s".format(
                            cast(char) currentElement()));
                        return;
                    }
                    keepChar();
                }
                return;
            case '&':
                while (!isEoF())
                {
                    keepChar();
                    if (currentElement() == ';')
                        break;
                }
                return;
            default:
                errorMessage("Invalid escape sequence");
                return;
            }
        }
        else
        {
            advanceRange();
            switch (currentElement())
            {
            case '\'': bufferChar('\''); advanceRange(); return;
            case '"': bufferChar('"'); advanceRange(); return;
            case '?': bufferChar('\?'); advanceRange(); return;
            case '\\': bufferChar('\\'); advanceRange(); return;
            case 'a': bufferChar('\a'); advanceRange(); return;
            case 'b': bufferChar('\b'); advanceRange(); return;
            case 'f': bufferChar('\f'); advanceRange(); return;
            case 'n': bufferChar('\n'); advanceRange(); return;
            case 'r': bufferChar('\r'); advanceRange(); return;
            case 't': bufferChar('\t'); advanceRange(); return;
            case 'v': bufferChar('\v'); advanceRange(); return;
            case 0x0a: bufferChar(0x0a); advanceRange(); return;
            case 0x00: bufferChar(0x00); advanceRange(); return;
            case '0': .. case '7':
                ubyte[3] digits;
                size_t i;
                while(i < 3 && !isEoF())
                {
                    digits[i++] = currentElement();
                    advanceRange();
                    if (currentElement() < '0' || currentElement() > '7') break;
                }
                decodeAndStore(digits, i, 8);
                return;
            case 'x':
                ubyte[2] digits;
                advanceRange();
                foreach(i; 0 .. 2)
                {
                    if (!isHexDigit(currentElement()))
                    {
                        errorMessage("Expected hex digit");
                        return;
                    }
                    digits[i] = currentElement();
                    advanceRange();
                }
                decodeAndStore(digits, 2, 16);
                return;
            case 'u':
            case 'U':
                uint digitCount = currentElement == 'u' ? 4 : 8;
                advanceRange();
                ubyte[8] digits;
                foreach (i; 0 .. digitCount)
                {
                    if (!isHexDigit(currentElement()))
                    {
                        errorMessage("Expected hex digit");
                        return;
                    }
                    digits[i] = currentElement();
                    advanceRange();
                }
                decodeAndStore(digits, digitCount, 16);
                return;
            case '&':
                advanceRange();
                ubyte[] b;
                while (!isEoF())
                {
                    if (isAlpha(currentElement()))
                    {
                        b ~= currentElement();
                        advanceRange();
                    }
                    else if (currentElement() == ';')
                    {
                        advanceRange();
                        break;
                    }
                    else
                    {
                        errorMessage("Invalid character entity");
                        return;
                    }
                }
                auto entity = (cast(string) b) in characterEntities;
                if (entity is null)
                {
                    errorMessage("Invalid character entity \"&%s;\"".format(
                        cast(char[]) b));
                    return;
                }
                else
                {
                    for (size_t i = 0; i < (*entity).length; i++)
                        bufferChar(cast(ubyte) (*entity)[i]);
                }
                return;
            default:
                errorMessage("Invalid escape sequence");
                return;
            }
        }
    }

    /*
     * Parses digits[0 .. maxIndex] as a number in the given base, encodes the
     * resulting code point as UTF-8 and appends the code units to the buffer.
     */
    void decodeAndStore(ubyte[] digits, size_t maxIndex, uint base)
    {
        scope(failure)
        {
            import std.stdio;
            stderr.writeln("Failed on line ", lineNumber, " of file ",
                config.fileName);
        }
        char[4] codeUnits;
        auto source = cast(char[]) digits[0 .. maxIndex];
        uint codePoint = parse!uint(source, base);
        ulong unitCount = encode(codeUnits, codePoint);
        foreach (i; 0 .. unitCount)
            bufferChar(codeUnits[i]);
    }

    /*
     * Lexes a q"..." delimited string, starting at the opening quote. Strings
     * opened with one of [ { ( < use nesting delimiters, anything else is
     * treated as a heredoc identifier.
     */
    void lexDelimitedString()
    in
    {
        assert(currentElement() == '"');
    }
    body
    {
        current.type = TokenType.stringLiteral;

        keepChar();

        bool heredoc;
        ubyte open;
        ubyte close;

        switch (currentElement())
        {
        case '[': open = '['; close = ']'; break;
        case '{': open = '{'; close = '}'; break;
        case '(': open = '('; close = ')'; break;
        case '<': open = '<'; close = '>'; break;
        default: heredoc = true; break;
        }

        if (heredoc)
            lexHeredocString();
        else
            lexNormalDelimitedString(open, close);
    }

    /*
     * Lexes the body of a delimited string that uses nesting delimiters.
     * Params:
     *     open = the opening delimiter character
     *     close = the matching closing delimiter character
     */
    void lexNormalDelimitedString(ubyte open, ubyte close)
    in
    {
        assert(buffer[0 .. 2] == `q"`);
    }
    body
    {
        current.type = TokenType.stringLiteral;
        int depth = 1;
        keepChar();
        scope (exit)
        {
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
                setTokenValue(3, bufferIndex - 2);
        }
        while (true)
        {
            if (isEoF())
            {
                errorMessage("Unterminated string literal");
                // Stop here: there is nothing left to read, so continuing
                // would never terminate.
                return;
            }
            if (currentElement() == open)
            {
                keepChar();
                ++depth;
            }
            else if (currentElement() == close)
            {
                keepChar();
                --depth;
                if (depth <= 0)
                {
                    if (isEoF())
                    {
                        errorMessage("Unterminated string literal");
                        return;
                    }
                    static if (isArray!R)
                        auto r = range[index .. $];
                    else
                        auto r = range.save();
                    if (r.front == '"')
                    {
                        keepChar();
                        return;
                    }
                    else
                    {
                        errorMessage("Expected \" after balanced "
                            ~ cast(char) close ~ " but found "
                            ~ cast(char) r.front ~ " instead.");
                        return;
                    }
                }
            }
            else
                keepChar();
        }
    }

    /*
     * Lexes a heredoc-style delimited string. On entry the buffer holds `q"`
     * and the current element is the first character of the identifier.
     */
    void lexHeredocString()
    in
    {
        assert (buffer[0 .. bufferIndex] == "q\"");
    }
    body
    {
        auto i = bufferIndex;
        // Read the delimiter identifier, which ends at the first newline
        while (true)
        {
            if (isEoF())
            {
                errorMessage("Unterminated string literal");
                return;
            }
            else if (isNewline(currentElement()))
            {
                keepChar();
                break;
            }
            else if (isSeparating())
            {
                errorMessage("Unterminated string literal - Separating");
                return;
            }
            else
                keepChar();
        }
        auto ident = buffer[i .. bufferIndex - 1];

        scope(exit)
        {
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
            {
                // Skip `q"`, the identifier and the line break that follows it
                size_t b = 2 + ident.length;
                if (buffer[b] == '\r') ++b;
                if (buffer[b] == '\n') ++b;
                size_t e = bufferIndex;
                if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w')
                    --e;
                setTokenValue(b, e);
            }
        }

        while (true)
        {
            if (isEoF())
            {
                errorMessage("Unterminated string literal");
                return;
            }
            else if (buffer[bufferIndex - ident.length .. bufferIndex] == ident)
            {
                if (currentElement() == '"')
                {
                    keepChar();
                    lexStringSuffix();
                    return;
                }
                else
                {
                    errorMessage("Unterminated string literal");
                    return;
                }
            }
            else
                keepChar();
        }
    }

    /*
     * Lexes a q{...} token string, starting at the opening brace. The
     * contents are lexed token by token so that nested braces are matched.
     */
    void lexTokenString()
    in
    {
        assert (currentElement() == '{');
    }
    body
    {
        current.type = TokenType.stringLiteral;
        keepChar();

        // advance() overwrites the position fields of `current`; remember
        // where the token string itself started.
        immutable startLine = current.line;
        immutable startColumn = current.column;
        immutable startOffset = current.startIndex;

        // Lex the contents with every token kept exactly as written
        LexerConfig c = config;
        config.iterStyle = IterationStyle.everything;
        config.tokenStyle = TokenStyle.source;
        size_t bi;
        ubyte[] b = uninitializedArray!(ubyte[])(1024 * 4);
        int depth = 1;
        bool closed = false;
        while (!isEoF())
        {
            advance();
            while (bi + current.value.length >= b.length)
                b.length += 1024 * 4;
            b[bi .. bi + current.value.length] = cast(ubyte[]) current.value;
            bi += current.value.length;
            if (current.type == TokenType.lBrace)
                ++depth;
            else if (current.type == TokenType.rBrace)
            {
                --depth;
                if (depth <= 0)
                {
                    closed = true;
                    break;
                }
            }
        }
        config = c;

        current.type = TokenType.stringLiteral;
        current.line = startLine;
        current.column = startColumn;
        current.startIndex = startOffset;

        // Rebuild the token text as "q{" followed by the collected contents.
        // When the string was closed, the contents already end with the
        // matching '}'.
        immutable total = bi + 2;
        if (buffer.length < total + 1)
            buffer.length = total + 1; // keep room for a string suffix
        buffer[0] = 'q';
        buffer[1] = '{';
        buffer[2 .. total] = b[0 .. bi];
        bufferIndex = total;

        if (!closed)
        {
            errorMessage("Unterminated token string literal");
            if (config.tokenStyle & TokenStyle.includeQuotes)
                setTokenValue();
            else
                setTokenValue(2, total);
            return;
        }

        // Read the suffix before storing the value so that it is part of the
        // token text when quotes are included.
        lexStringSuffix();
        if (config.tokenStyle & TokenStyle.includeQuotes)
            setTokenValue();
        else
            setTokenValue(2, total - 1);
    }

    void lexSpecialTokenSequence()
    in
    {
        assert (currentElement() == '#');
    }
    body
    {
        keepChar();
        static if (isArray!R)
            auto r = range[index .. $];
        else
            auto r = range.save();
        auto app = appender!(ubyte[])();
        app.put('#');
        while (true)
        {
            if (r.isRangeEoF())
            {
                errorMessage("Found EOF when interpreting special token sequence");
                return;
            }
            else if (isNewline(r.front))
                break;
            else
            {
                app.put(r.front);
                r.popFront();
            }
        }
        auto m = match((cast(char[]) app.data), `#line\s+(?P