module std.d.lexer;

import std.typecons;
import std.typetuple;
import std.array;
import std.algorithm;
import std.range;
import std.lexer;
public import std.lexer : StringCache;

private enum operators = [
    ",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
    "!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+",
    "++", "+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>",
    "<>=", "=", "==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@",
    "[", "]", "^", "^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
];

private enum keywords = [
    "abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
    "break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
    "char", "class", "const", "continue", "creal", "dchar", "debug",
    "default", "delegate", "delete", "deprecated", "do", "double", "else",
    "enum", "export", "extern", "false", "final", "finally", "float", "for",
    "foreach", "foreach_reverse", "function", "goto", "idouble", "if",
    "ifloat", "immutable", "import", "in", "inout", "int", "interface",
    "invariant", "ireal", "is", "lazy", "long", "macro", "mixin", "module",
    "new", "nothrow", "null", "out", "override", "package", "pragma",
    "private", "protected", "public", "pure", "real", "ref", "return",
    "scope", "shared", "short", "static", "struct", "super", "switch",
    "synchronized", "template", "this", "throw", "true", "try", "typedef",
    "typeid", "typeof", "ubyte", "ucent", "uint", "ulong", "union",
    "unittest", "ushort", "version", "void", "volatile", "wchar", "while",
    "with", "__DATE__", "__EOF__", "__FILE__", "__FUNCTION__", "__gshared",
    "__LINE__", "__MODULE__", "__parameters", "__PRETTY_FUNCTION__",
    "__TIME__", "__TIMESTAMP__", "__traits", "__vector", "__VENDOR__",
    "__VERSION__"
];

private enum dynamicTokens = [
    "specialTokenSequence", "comment", "identifier", "scriptLine",
    "whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
    "ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
    "irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
    "dstringLiteral", "stringLiteral", "wstringLiteral"
];

private enum pseudoTokenHandlers = [
    "\"", "lexStringLiteral",
    "`", "lexWysiwygString",
    "//", "lexSlashSlashComment",
    "/*", "lexSlashStarComment",
    "/+", "lexSlashPlusComment",
    ".", "lexDot",
    "'", "lexCharacterLiteral",
    "0", "lexNumber",
    "1", "lexDecimal",
    "2", "lexDecimal",
    "3", "lexDecimal",
    "4", "lexDecimal",
    "5", "lexDecimal",
    "6", "lexDecimal",
    "7", "lexDecimal",
    "8", "lexDecimal",
    "9", "lexDecimal",
    "q\"", "lexDelimitedString",
    "q{", "lexTokenString",
    "r\"", "lexWysiwygString",
    "x\"", "lexHexString",
    " ", "lexWhitespace",
    "\t", "lexWhitespace",
    "\r", "lexWhitespace",
    "\n", "lexWhitespace",
    "\u2028", "lexLongNewline",
    "\u2029", "lexLongNewline",
    "#!", "lexScriptLine",
    "#line", "lexSpecialTokenSequence"
];

public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);

public template tok(string token)
{
    alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
}

private enum extraFields = q{
    string comment;

    int opCmp(size_t i) const pure nothrow @safe
    {
        if (index < i) return -1;
        if (index > i) return 1;
        return 0;
    }
};

public alias Token = std.lexer.TokenStructure!(IdType, extraFields);
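// Added illustration (not part of the original module): `tok!"..."` yields the
// numeric ID stored in `Token.type`, and `str` maps an ID back to its textual
// representation. This sketch assumes `str` returns the keyword/operator text
// for fixed tokens, which is how it is used in lexTokenString below.
unittest
{
    IdType id = tok!"for";
    assert (str(id) == "for");
    assert (tok!"identifier" != tok!"for");
}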
/**
 * Configure string lexing behavior
 */
public enum StringBehavior : ubyte
{
    /// Do not include quote characters, process escape sequences
    compiler = 0b0000_0000,
    /// Opening quotes, closing quotes, and string suffixes are included in
    /// the string token
    includeQuoteChars = 0b0000_0001,
    /// String escape sequences are not replaced
    notEscaped = 0b0000_0010,
    /// Not modified at all. Useful for formatters or highlighters
    source = includeQuoteChars | notEscaped
}

/**
 * Configure whitespace handling behavior
 */
public enum WhitespaceBehavior : ubyte
{
    /// Whitespace is skipped
    skip,
    /// Whitespace is treated as a token
    include
}

/**
 * Configure comment handling behavior
 */
public enum CommentBehavior : ubyte
{
    /// Comments are attached to the non-whitespace token that follows them
    attach,
    /// Comments are tokens, and can be returned by calls to the token range's front()
    include
}

public struct LexerConfig
{
    string fileName;
    StringBehavior stringBehavior;
    WhitespaceBehavior whitespaceBehavior;
    CommentBehavior commentBehavior;
}
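// Added usage sketch (not from the original module): a configuration that a
// formatter or syntax highlighter might use, keeping strings verbatim and
// turning whitespace and comments into tokens. The file name and source text
// below are arbitrary examples.
unittest
{
    LexerConfig config;
    config.fileName = "example.d";
    config.stringBehavior = StringBehavior.source;
    config.whitespaceBehavior = WhitespaceBehavior.include;
    config.commentBehavior = CommentBehavior.include;

    auto cache = new shared StringCache(StringCache.defaultBucketCount);
    auto tokens = byToken(cast(ubyte[]) "int x; // done", config, cache);
    assert (!tokens.empty);
    assert (tokens.front.type == tok!"int");
}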
public bool isBasicType(IdType type) nothrow pure @safe
{
    switch (type)
    {
    case tok!"int": case tok!"uint": case tok!"double": case tok!"idouble":
    case tok!"float": case tok!"ifloat": case tok!"short": case tok!"ushort":
    case tok!"long": case tok!"ulong": case tok!"char": case tok!"wchar":
    case tok!"dchar": case tok!"bool": case tok!"void": case tok!"cent":
    case tok!"ucent": case tok!"real": case tok!"ireal": case tok!"byte":
    case tok!"ubyte": case tok!"cdouble": case tok!"cfloat": case tok!"creal":
        return true;
    default:
        return false;
    }
}

public bool isNumberLiteral(IdType type) nothrow pure @safe
{
    switch (type)
    {
    case tok!"doubleLiteral": case tok!"floatLiteral": case tok!"idoubleLiteral":
    case tok!"ifloatLiteral": case tok!"intLiteral": case tok!"longLiteral":
    case tok!"realLiteral": case tok!"irealLiteral": case tok!"uintLiteral":
    case tok!"ulongLiteral":
        return true;
    default:
        return false;
    }
}

public bool isOperator(IdType type) nothrow pure @safe
{
    switch (type)
    {
    case tok!",": case tok!".": case tok!"..": case tok!"...": case tok!"/":
    case tok!"/=": case tok!"!": case tok!"!<": case tok!"!<=": case tok!"!<>":
    case tok!"!<>=": case tok!"!=": case tok!"!>": case tok!"!>=": case tok!"$":
    case tok!"%": case tok!"%=": case tok!"&": case tok!"&&": case tok!"&=":
    case tok!"(": case tok!")": case tok!"*": case tok!"*=": case tok!"+":
    case tok!"++": case tok!"+=": case tok!"-": case tok!"--": case tok!"-=":
    case tok!":": case tok!";": case tok!"<": case tok!"<<": case tok!"<<=":
    case tok!"<=": case tok!"<>": case tok!"<>=": case tok!"=": case tok!"==":
    case tok!"=>": case tok!">": case tok!">=": case tok!">>": case tok!">>=":
    case tok!">>>": case tok!">>>=": case tok!"?": case tok!"@": case tok!"[":
    case tok!"]": case tok!"^": case tok!"^=": case tok!"^^": case tok!"^^=":
    case tok!"{": case tok!"|": case tok!"|=": case tok!"||": case tok!"}":
    case tok!"~": case tok!"~=":
        return true;
    default:
        return false;
    }
}

public bool isKeyword(IdType type) pure nothrow @safe
{
    switch (type)
    {
    case tok!"abstract": case tok!"alias": case tok!"align": case tok!"asm":
    case tok!"assert": case tok!"auto": case tok!"body": case tok!"break":
    case tok!"case": case tok!"cast": case tok!"catch": case tok!"class":
    case tok!"const": case tok!"continue": case tok!"debug": case tok!"default":
    case tok!"delegate": case tok!"delete": case tok!"deprecated": case tok!"do":
    case tok!"else": case tok!"enum": case tok!"export": case tok!"extern":
    case tok!"false": case tok!"final": case tok!"finally": case tok!"for":
    case tok!"foreach": case tok!"foreach_reverse": case tok!"function":
    case tok!"goto": case tok!"if": case tok!"immutable": case tok!"import":
    case tok!"in": case tok!"inout": case tok!"interface": case tok!"invariant":
    case tok!"is": case tok!"lazy": case tok!"macro": case tok!"mixin":
    case tok!"module": case tok!"new": case tok!"nothrow": case tok!"null":
    case tok!"out": case tok!"override": case tok!"package": case tok!"pragma":
    case tok!"private": case tok!"protected": case tok!"public": case tok!"pure":
    case tok!"ref": case tok!"return": case tok!"scope": case tok!"shared":
    case tok!"static": case tok!"struct": case tok!"super": case tok!"switch":
    case tok!"synchronized": case tok!"template": case tok!"this": case tok!"throw":
    case tok!"true": case tok!"try": case tok!"typedef": case tok!"typeid":
    case tok!"typeof": case tok!"union": case tok!"unittest": case tok!"version":
    case tok!"volatile": case tok!"while": case tok!"with": case tok!"__DATE__":
    case tok!"__EOF__": case tok!"__FILE__": case tok!"__FUNCTION__":
    case tok!"__gshared": case tok!"__LINE__": case tok!"__MODULE__":
    case tok!"__parameters": case tok!"__PRETTY_FUNCTION__": case tok!"__TIME__":
    case tok!"__TIMESTAMP__": case tok!"__traits": case tok!"__vector":
    case tok!"__VENDOR__": case tok!"__VERSION__":
        return true;
    default:
        return false;
    }
}

public bool isStringLiteral(IdType type) pure nothrow @safe
{
    switch (type)
    {
    case tok!"dstringLiteral": case tok!"stringLiteral": case tok!"wstringLiteral":
        return true;
    default:
        return false;
    }
}

public bool isProtection(IdType type) pure nothrow @safe
{
    switch (type)
    {
    case tok!"export": case tok!"package": case tok!"private": case tok!"public":
    case tok!"protected":
        return true;
    default:
        return false;
    }
}
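// Added examples for the classification helpers above; each assertion follows
// directly from the corresponding switch statement in this module.
unittest
{
    assert (isBasicType(tok!"int"));
    assert (isNumberLiteral(tok!"floatLiteral"));
    assert (isOperator(tok!"+="));
    assert (isKeyword(tok!"return"));
    assert (isStringLiteral(tok!"wstringLiteral"));
    assert (isProtection(tok!"private"));
    assert (!isProtection(tok!"static"));
}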
public struct DLexer
{
    import core.vararg;

    mixin Lexer!(Token, lexIdentifier, isSeparating, operators,
        dynamicTokens, keywords, pseudoTokenHandlers);

    this(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
    {
        this.range = LexerRange(range);
        this.config = config;
        this.cache = cache;
        popFront();
    }

    private static bool isDocComment(string comment) pure nothrow @safe
    {
        return comment.length >= 3 && (comment[0 .. 3] == "///"
            || comment[0 .. 3] == "/++" || comment[0 .. 3] == "/**");
    }

    public void popFront() pure
    {
        _popFront();
        string comment = null;
        switch (front.type)
        {
        case tok!"comment":
            if (config.commentBehavior == CommentBehavior.attach)
            {
                import std.string;
                if (isDocComment(front.text))
                {
                    comment = comment is null
                        ? front.text
                        : format("%s\n%s", comment, front.text);
                }
                do _popFront(); while (front == tok!"comment");
                if (front == tok!"whitespace") goto case tok!"whitespace";
            }
            break;
        case tok!"whitespace":
            if (config.whitespaceBehavior == WhitespaceBehavior.skip)
            {
                do _popFront(); while (front == tok!"whitespace");
                if (front == tok!"comment") goto case tok!"comment";
            }
            break;
        default:
            break;
        }
        _front.comment = comment;
    }

    bool isWhitespace() pure /*const*/ nothrow
    {
        switch (range.front)
        {
        case ' ':
        case '\r':
        case '\n':
        case '\t':
            return true;
        case 0xe2:
            auto peek = range.peek(2);
            return peek.length == 2
                && peek[0] == 0x80
                && (peek[1] == 0xa8 || peek[1] == 0xa9);
        default:
            return false;
        }
    }

    void popFrontWhitespaceAware() pure nothrow
    {
        switch (range.front)
        {
        case '\r':
            range.popFront();
            if (!range.empty && range.front == '\n')
            {
                range.popFront();
                range.incrementLine();
            }
            else
                range.incrementLine();
            return;
        case '\n':
            range.popFront();
            range.incrementLine();
            return;
        case 0xe2:
            auto lookahead = range.peek(3);
            if (lookahead.length == 3 && lookahead[1] == 0x80
                && (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
            {
                range.popFront();
                range.popFront();
                range.popFront();
                range.incrementLine();
                return;
            }
            else
            {
                range.popFront();
                return;
            }
        default:
            range.popFront();
            return;
        }
    }

    Token lexWhitespace() pure nothrow
    {
        mixin (tokenStart);
        loop: do
        {
            switch (range.front)
            {
            case '\r':
                range.popFront();
                if (!range.empty && range.front == '\n')
                    range.popFront();
                range.incrementLine();
                break;
            case '\n':
                range.popFront();
                range.incrementLine();
                break;
            case ' ':
            case '\t':
                range.popFront();
                break;
            case 0xe2:
                auto lookahead = range.peek(3);
                if (lookahead.length != 3)
                    break loop;
                if (lookahead[1] != 0x80)
                    break loop;
                if (lookahead[2] == 0xa8 || lookahead[2] == 0xa9)
                {
                    range.popFront();
                    range.popFront();
                    range.popFront();
                    range.incrementLine();
                    break;
                }
                break loop;
            default:
                break loop;
            }
        } while (!range.empty);
        string text = config.whitespaceBehavior == WhitespaceBehavior.skip
            ? null : cache.intern(range.slice(mark));
        return Token(tok!"whitespace", text, line, column, index);
    }

    Token lexNumber() pure nothrow
    {
        mixin (tokenStart);
        if (range.front == '0' && range.canPeek(1))
        {
            auto ahead = range.peek(1)[1];
            switch (ahead)
            {
            case 'x':
            case 'X':
                range.popFront();
                range.popFront();
                return lexHex(mark, line, column, index);
            case 'b':
            case 'B':
                range.popFront();
                range.popFront();
                return lexBinary(mark, line, column, index);
            default:
                return lexDecimal(mark, line, column, index);
            }
        }
        else
            return lexDecimal(mark, line, column, index);
    }
    Token lexHex() pure nothrow
    {
        mixin (tokenStart);
        return lexHex(mark, line, column, index);
    }

    Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
    {
        IdType type = tok!"intLiteral";
        bool foundDot;
        hexLoop: while (!range.empty)
        {
            switch (range.front)
            {
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '0': .. case '9':
            case '_':
                range.popFront();
                break;
            case 'u':
            case 'U':
                lexIntSuffix(type);
                break hexLoop;
            case 'i':
                if (foundDot)
                    lexFloatSuffix(type);
                break hexLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break hexLoop;
            case 'p':
            case 'P':
                lexExponent(type);
                break hexLoop;
            case '.':
                if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
                    break hexLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.canPeek(1))
                    {
                        switch (range.peekAt(1))
                        {
                        case '0': .. case '9':
                        case 'A': .. case 'F':
                        case 'a': .. case 'f':
                            goto doubleLiteral;
                        default:
                            break hexLoop;
                        }
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break hexLoop;
            }
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexBinary() pure nothrow
    {
        mixin (tokenStart);
        return lexBinary(mark, line, column, index);
    }

    Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
    {
        IdType type = tok!"intLiteral";
        binaryLoop: while (!range.empty)
        {
            switch (range.front)
            {
            case '0':
            case '1':
            case '_':
                range.popFront();
                break;
            case 'u':
            case 'U':
            case 'L':
                lexIntSuffix(type);
                break binaryLoop;
            default:
                break binaryLoop;
            }
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexDecimal() pure nothrow
    {
        mixin (tokenStart);
        return lexDecimal(mark, line, column, index);
    }

    Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
    {
        bool foundDot = range.front == '.';
        IdType type = tok!"intLiteral";
        if (foundDot)
        {
            range.popFront();
            type = tok!"doubleLiteral";
        }
        decimalLoop: while (!range.empty)
        {
            switch (range.front)
            {
            case '0': .. case '9':
            case '_':
                range.popFront();
                break;
            case 'u':
            case 'U':
                if (!foundDot)
                    lexIntSuffix(type);
                break decimalLoop;
            case 'i':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'L':
                if (foundDot)
                    lexFloatSuffix(type);
                else
                    lexIntSuffix(type);
                break decimalLoop;
            case 'f':
            case 'F':
                lexFloatSuffix(type);
                break decimalLoop;
            case 'e':
            case 'E':
                lexExponent(type);
                break decimalLoop;
            case '.':
                if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
                    break decimalLoop;
                else
                {
                    // The following bit of silliness tries to tell the
                    // difference between "int dot identifier" and
                    // "double identifier".
                    if (range.canPeek(1))
                    {
                        auto ch = range.peekAt(1);
                        if (ch <= 0x2f
                            || (ch >= '0' && ch <= '9')
                            || (ch >= ':' && ch <= '@')
                            || (ch >= '[' && ch <= '^')
                            || (ch >= '{' && ch <= '~')
                            || ch == '`' || ch == '_')
                        {
                            goto doubleLiteral;
                        }
                        else
                            break decimalLoop;
                    }
                    else
                    {
                    doubleLiteral:
                        range.popFront();
                        foundDot = true;
                        type = tok!"doubleLiteral";
                    }
                }
                break;
            default:
                break decimalLoop;
            }
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    void lexIntSuffix(ref IdType type) pure nothrow @safe
    {
        bool secondPass;
        if (range.front == 'u' || range.front == 'U')
        {
    U:
            if (type == tok!"intLiteral")
                type = tok!"uintLiteral";
            else
                type = tok!"ulongLiteral";
            range.popFront();
            if (secondPass)
                return;
            if (range.front == 'L' || range.front == 'l')
                goto L;
            return;
        }
        if (range.front == 'L' || range.front == 'l')
        {
    L:
            if (type == tok!"uintLiteral")
                type = tok!"ulongLiteral";
            else
                type = tok!"longLiteral";
            range.popFront();
            if (range.front == 'U' || range.front == 'u')
            {
                secondPass = true;
                goto U;
            }
            return;
        }
    }

    void lexFloatSuffix(ref IdType type) pure nothrow @safe
    {
        switch (range.front)
        {
        case 'L':
            range.popFront();
            type = tok!"doubleLiteral";
            break;
        case 'f':
        case 'F':
            range.popFront();
            type = tok!"floatLiteral";
            break;
        default:
            break;
        }
        if (!range.empty && range.front == 'i')
        {
            warning("Complex number literals are deprecated");
            range.popFront();
            if (type == tok!"floatLiteral")
                type = tok!"ifloatLiteral";
            else
                type = tok!"idoubleLiteral";
        }
    }

    void lexExponent(ref IdType type) pure nothrow @safe
    {
        range.popFront();
        bool foundSign = false;
        bool foundDigit = false;
        while (!range.empty)
        {
            switch (range.front)
            {
            case '-':
            case '+':
                if (foundSign)
                {
                    if (!foundDigit)
                        error("Expected an exponent");
                    return;
                }
                foundSign = true;
                range.popFront();
                break;
            case '0': .. case '9':
            case '_':
                foundDigit = true;
                range.popFront();
                break;
            case 'L':
            case 'f':
            case 'F':
            case 'i':
                lexFloatSuffix(type);
                return;
            default:
                if (!foundDigit)
                    error("Expected an exponent");
                return;
            }
        }
    }
    Token lexScriptLine() pure
    {
        mixin (tokenStart);
        while (!range.empty && !isNewline) range.popFront();
        return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
            line, column, index);
    }

    Token lexSpecialTokenSequence() pure
    {
        mixin (tokenStart);
        while (!range.empty && !isNewline) range.popFront();
        return Token(tok!"specialTokenSequence",
            cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexSlashStarComment() pure
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFront();
        range.popFront();
        while (!range.empty)
        {
            if (range.front == '*')
            {
                range.popFront();
                if (!range.empty && range.front == '/')
                {
                    range.popFront();
                    break;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexSlashSlashComment() pure nothrow
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFront();
        range.popFront();
        while (!range.empty)
        {
            if (range.front == '\r' || range.front == '\n')
                break;
            range.popFront();
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexSlashPlusComment() pure nothrow
    {
        mixin (tokenStart);
        IdType type = tok!"comment";
        range.popFront();
        range.popFront();
        int depth = 1;
        while (depth > 0 && !range.empty)
        {
            if (range.front == '+')
            {
                range.popFront();
                if (!range.empty && range.front == '/')
                {
                    range.popFront();
                    depth--;
                }
            }
            else if (range.front == '/')
            {
                range.popFront();
                if (!range.empty && range.front == '+')
                {
                    range.popFront();
                    depth++;
                }
            }
            else
                popFrontWhitespaceAware();
        }
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexStringLiteral() pure nothrow
    {
        mixin (tokenStart);
        range.popFront();
        while (true)
        {
            if (range.empty)
            {
                error("Error: unterminated string literal");
                return Token();
            }
            else if (range.front == '"')
            {
                range.popFront();
                break;
            }
            else if (range.front == '\\')
            {
                lexEscapeSequence();
            }
            else
                range.popFront();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexWysiwygString() pure nothrow
    {
        mixin (tokenStart);
        IdType type = tok!"stringLiteral";
        bool backtick = range.front == '`';
        if (backtick)
        {
            range.popFront();
            while (true)
            {
                if (range.empty)
                {
                    error("Error: unterminated string literal");
                    return Token(tok!"");
                }
                else if (range.front == '`')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        else
        {
            range.popFront();
            if (range.empty)
            {
                error("Error: unterminated string literal");
                return Token(tok!"");
            }
            range.popFront();
            while (true)
            {
                if (range.empty)
                {
                    error("Error: unterminated string literal");
                    return Token(tok!"");
                }
                else if (range.front == '"')
                {
                    range.popFront();
                    break;
                }
                else
                    popFrontWhitespaceAware();
            }
        }
        lexStringSuffix(type);
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    void lexStringSuffix(ref IdType type) pure nothrow
    {
        if (range.empty)
            type = tok!"stringLiteral";
        else
        {
            switch (range.front)
            {
            case 'w': range.popFront(); type = tok!"wstringLiteral"; break;
            case 'd': range.popFront(); type = tok!"dstringLiteral"; break;
            case 'c': range.popFront(); type = tok!"stringLiteral"; break;
            default: type = tok!"stringLiteral"; break;
            }
        }
    }
    Token lexDelimitedString() pure nothrow
    {
        import std.traits;
        mixin (tokenStart);
        range.popFront();
        range.popFront();
        ubyte open;
        ubyte close;
        switch (range.front)
        {
        case '<':
            open = '<';
            close = '>';
            range.popFront();
            return lexNormalDelimitedString(mark, line, column, index, open, close);
        case '{':
            open = '{';
            close = '}';
            range.popFront();
            return lexNormalDelimitedString(mark, line, column, index, open, close);
        case '[':
            open = '[';
            close = ']';
            range.popFront();
            return lexNormalDelimitedString(mark, line, column, index, open, close);
        case '(':
            open = '(';
            close = ')';
            range.popFront();
            return lexNormalDelimitedString(mark, line, column, index, open, close);
        default:
            return lexHeredocString(mark, line, column, index);
        }
    }

    Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
        size_t index, ubyte open, ubyte close) pure nothrow
    {
        int depth = 1;
        while (!range.empty && depth > 0)
        {
            if (range.front == open)
            {
                depth++;
                range.popFront();
            }
            else if (range.front == close)
            {
                depth--;
                range.popFront();
                if (depth <= 0)
                {
                    if (range.front == '"')
                        range.popFront();
                    else
                    {
                        error("Error: \" expected to end delimited string literal");
                        return Token(tok!"");
                    }
                }
            }
            else
                popFrontWhitespaceAware();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
        pure nothrow
    {
        import std.regex;
        Token ident = lexIdentifier();
        if (isNewline())
            popFrontWhitespaceAware();
        else
            error("Newline expected");
        while (!range.empty)
        {
            if (isNewline())
            {
                popFrontWhitespaceAware();
                if (!range.canPeek(ident.text.length))
                {
                    error(ident.text ~ " expected");
                    break;
                }
                if (range.peek(ident.text.length - 1) == ident.text)
                {
                    range.popFrontN(ident.text.length);
                    break;
                }
            }
            else
                range.popFront();
        }
        if (!range.empty() && range.front == '"')
            range.popFront();
        else
            error(`" expected`);
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    Token lexTokenString() pure
    {
        mixin (tokenStart);
        assert(range.front == 'q');
        range.popFront();
        assert(range.front == '{');
        range.popFront();
        auto app = appender!string();
        app.put("q{");
        int depth = 1;
        LexerConfig c = config;
        scope(exit) config = c;
        config.whitespaceBehavior = WhitespaceBehavior.include;
        config.stringBehavior = StringBehavior.source;
        config.commentBehavior = CommentBehavior.include;
        _front = advance();
        while (depth > 0 && !empty)
        {
            auto t = front();
            if (t.text is null)
                app.put(str(t.type));
            else
                app.put(t.text);
            if (t.type == tok!"}")
            {
                depth--;
                if (depth > 0)
                    popFront();
            }
            else if (t.type == tok!"{")
            {
                depth++;
                popFront();
            }
            else
                popFront();
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        return Token(type, cache.intern(cast(const(ubyte)[]) app.data),
            line, column, index);
    }
    Token lexHexString() pure nothrow
    {
        mixin (tokenStart);
        range.popFront();
        range.popFront();
        loop: while (true)
        {
            if (range.empty)
            {
                error("Error: unterminated hex string literal");
                return Token();
            }
            else if (isWhitespace())
                popFrontWhitespaceAware();
            else switch (range.front)
            {
            case '0': .. case '9':
            case 'A': .. case 'F':
            case 'a': .. case 'f':
                range.popFront();
                break;
            case '"':
                range.popFront();
                break loop;
            default:
                error("Error: invalid character in hex string");
                return Token();
            }
        }
        IdType type = tok!"stringLiteral";
        lexStringSuffix(type);
        return Token(type, cache.intern(range.slice(mark)), line, column, index);
    }

    bool lexEscapeSequence() pure nothrow
    {
        range.popFront();
        if (range.empty)
        {
            error("Error: non-terminated character escape sequence.");
            return false;
        }
        switch (range.front)
        {
        case '\'':
        case '"':
        case '?':
        case '\\':
        case '0':
        case 'a':
        case 'b':
        case 'f':
        case 'n':
        case 'r':
        case 't':
        case 'v':
            range.popFront();
            break;
        case 'x':
            range.popFront();
            foreach (i; 0 .. 2)
            {
                if (range.empty)
                {
                    error("Error: 2 hex digits expected.");
                    return false;
                }
                switch (range.front)
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': .. case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: 2 hex digits expected.");
                    return false;
                }
            }
            break;
        case '1': .. case '7':
            for (size_t i = 0; i < 3 && !range.empty && range.front >= '0' && range.front <= '7'; i++)
                range.popFront();
            break;
        case 'u':
            range.popFront();
            foreach (i; 0 .. 4)
            {
                if (range.empty)
                {
                    error("Error: at least 4 hex digits expected.");
                    return false;
                }
                switch (range.front)
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': .. case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: at least 4 hex digits expected.");
                    return false;
                }
            }
            break;
        case 'U':
            range.popFront();
            foreach (i; 0 .. 8)
            {
                if (range.empty)
                {
                    error("Error: at least 8 hex digits expected.");
                    return false;
                }
                switch (range.front)
                {
                case '0': .. case '9':
                case 'a': .. case 'f':
                case 'A': .. case 'F':
                    range.popFront();
                    break;
                default:
                    error("Error: at least 8 hex digits expected.");
                    return false;
                }
            }
            break;
        default:
            while (true)
            {
                if (range.empty)
                {
                    error("Error: non-terminated character escape sequence.");
                    return false;
                }
                if (range.front == ';')
                {
                    range.popFront();
                    break;
                }
                else
                    range.popFront();
            }
        }
        return true;
    }

    Token lexCharacterLiteral() pure nothrow
    {
        mixin (tokenStart);
        range.popFront();
        if (range.front == '\\')
        {
            lexEscapeSequence();
            goto close;
        }
        else if (range.front == '\'')
        {
            range.popFront();
            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else if (range.front & 0x80)
        {
            while (range.front & 0x80)
                range.popFront();
            goto close;
        }
        else
        {
            popFrontWhitespaceAware();
            goto close;
        }
    close:
        if (range.front == '\'')
        {
            range.popFront();
            return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
                line, column, index);
        }
        else
        {
            error("Error: Expected ' to end character literal");
            return Token();
        }
    }

    Token lexIdentifier() pure nothrow
    {
        import std.stdio;
        mixin (tokenStart);
        uint hash = 0;
        if (isSeparating(0) || range.empty)
        {
            error("Invalid identifier");
            range.popFront();
        }
        while (!range.empty && !isSeparating(0))
        {
            hash = StringCache.hashStep(range.front, hash);
            range.popFront();
        }
        return Token(tok!"identifier", cache.intern(range.slice(mark), hash),
            line, column, index);
    }
    Token lexDot() pure nothrow
    {
        mixin (tokenStart);
        if (!range.canPeek(1))
        {
            range.popFront();
            return Token(tok!".", null, line, column, index);
        }
        switch (range.peekAt(1))
        {
        case '0': .. case '9':
            return lexNumber();
        case '.':
            range.popFront();
            range.popFront();
            if (!range.empty && range.front == '.')
            {
                range.popFront();
                return Token(tok!"...", null, line, column, index);
            }
            else
                return Token(tok!"..", null, line, column, index);
        default:
            range.popFront();
            return Token(tok!".", null, line, column, index);
        }
    }

    Token lexLongNewline() pure nothrow
    {
        mixin (tokenStart);
        range.popFront();
        range.popFront();
        range.popFront();
        range.incrementLine();
        return Token(tok!"whitespace", cache.intern(range.slice(mark)),
            line, column, index);
    }

    bool isNewline() pure @safe nothrow
    {
        if (range.front == '\n') return true;
        if (range.front == '\r') return true;
        return (range.front & 0x80) && range.canPeek(2)
            && (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
    }

    bool isSeparating(size_t offset) pure nothrow @safe
    {
        if (!range.canPeek(offset)) return true;
        auto c = range.peekAt(offset);
        if (c >= 'A' && c <= 'Z') return false;
        if (c >= 'a' && c <= 'z') return false;
        if (c <= 0x2f) return true;
        if (c >= ':' && c <= '@') return true;
        if (c >= '[' && c <= '^') return true;
        if (c >= '{' && c <= '~') return true;
        if (c == '`') return true;
        if (c & 0x80)
        {
            auto r = range;
            range.popFrontN(offset);
            return (r.canPeek(2) && (r.peek(2) == "\u2028" || r.peek(2) == "\u2029"));
        }
        return false;
    }

    enum tokenStart = q{
        size_t index = range.index;
        size_t column = range.column;
        size_t line = range.line;
        auto mark = range.mark();
    };

    void error(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, true);
    }

    void warning(string message) pure nothrow @safe
    {
        messages ~= Message(range.line, range.column, message, false);
        assert (messages.length > 0);
    }

    struct Message
    {
        size_t line;
        size_t column;
        string message;
        bool isError;
    }

    Message[] messages;
    shared(StringCache)* cache;
    LexerConfig config;
}

public auto byToken(ubyte[] range)
{
    LexerConfig config;
    shared(StringCache)* cache = new shared StringCache(StringCache.defaultBucketCount);
    return DLexer(range, config, cache);
}

public auto byToken(ubyte[] range, shared(StringCache)* cache)
{
    LexerConfig config;
    return DLexer(range, config, cache);
}

public auto byToken(ubyte[] range, const LexerConfig config, shared(StringCache)* cache)
{
    return DLexer(range, config, cache);
}

unittest
{
    import std.stdio;
    auto source = cast(ubyte[]) q{ import std.stdio;}c;
    auto tokens = byToken(source);
    assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier",
        tok!".", tok!"identifier", tok!";"]));
}

/// Test \x char sequence
unittest
{
    auto toks = (string s) => byToken(cast(ubyte[]) s);

    // valid
    enum hex = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'a', 'b', 'c', 'd', 'e', 'f', 'A', 'B', 'C', 'D', 'E', 'F'];
    auto source = "";
    foreach (h1; hex)
        foreach (h2; hex)
            source ~= "'\\x" ~ h1 ~ h2 ~ "'";
    assert(toks(source).filter!(t => t.type != tok!"characterLiteral").empty);

    // invalid
    assert(toks(`'\x'`).messages[0] == DLexer.Message(1, 4, "Error: 2 hex digits expected.", true));
    assert(toks(`'\x_'`).messages[0] == DLexer.Message(1, 4, "Error: 2 hex digits expected.", true));
    assert(toks(`'\xA'`).messages[0] == DLexer.Message(1, 5, "Error: 2 hex digits expected.", true));
    assert(toks(`'\xAY'`).messages[0] == DLexer.Message(1, 5, "Error: 2 hex digits expected.", true));
    assert(toks(`'\xXX'`).messages[0] == DLexer.Message(1, 4, "Error: 2 hex digits expected.", true));
}
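// Additional illustrative tests (added as examples, not part of the original
// test suite). They sketch the expected behaviour of the default configuration
// (whitespace skipped, doc comments attached) and of numeric literal
// classification, based on a reading of popFront, lexDecimal and lexHex above.
unittest
{
    // Under CommentBehavior.attach (the default), a doc comment is stored in
    // the `comment` field of the next non-whitespace token.
    auto tokens = byToken(cast(ubyte[]) "/// summary\nint x;");
    assert (tokens.front.type == tok!"int");
    assert (tokens.front.comment == "/// summary");
}

unittest
{
    // Numeric literal token types reflect radix and suffix.
    auto types = byToken(cast(ubyte[]) "0x2A 10UL 1_000 3.14f").map!"a.type"();
    assert (types.equal([tok!"intLiteral", tok!"ulongLiteral",
        tok!"intLiteral", tok!"floatLiteral"]));
}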