// Copyright Brian Schott (Sir Alaran) 2012. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) module tokenizer; import std.range; import std.file; import std.traits; import std.algorithm; import std.conv; import std.uni; import std.stdio; import langutils; import codegen; pure bool isNewline(R)(R range) { return range.front == '\n' || range.front == '\r'; } pure bool isEoF(R)(R range) { return range.empty || range.front == 0 || range.front == 0x1a; } char[] popNewline(R)(ref R range) { char[] chars; if (range.front == '\r') { chars ~= range.front; range.popFront(); } if (range.front == '\n') { chars ~= range.front; range.popFront(); } return chars; } unittest { auto s = "\r\ntest"; assert (popNewline(s) == "\r\n"); assert (s == "test"); } /** * Returns: */ string lexWhitespace(R)(ref R range, ref uint lineNumber) { auto app = appender!(char[])(); while (!isEoF(range) && isWhite(range.front)) { if (isNewline(range)) { ++lineNumber; app.put(popNewline(range)); } else { app.put(range.front); range.popFront(); } } return to!string(app.data); } unittest { import std.stdio; uint lineNum = 1; auto chars = " \n \r\n \tabcde"; auto r = lexWhitespace(chars, lineNum); assert (r == " \n \r\n \t"); assert (chars == "abcde"); assert (lineNum == 3); } /** * Increments endIndex until it indexes a character directly after a comment * Params: * inputString = the source code to examine * endIndex = an index into inputString at the second character of a * comment, i.e. points at the second slash in a // comment. * lineNumber = the line number that corresponds to endIndex * Returns: The comment */ string lexComment(R)(ref R input, ref uint lineNumber) in { assert (input.front == '/'); } body { auto app = appender!(char[])(); app.put(input.front); input.popFront(); switch(input.front) { case '/': while (!isEoF(input) && !isNewline(input)) { app.put(input.front); input.popFront(); } break; case '*': while (!isEoF(input)) { if (isNewline(input)) { app.put(popNewline(input)); ++lineNumber; } else if (input.front == '*') { app.put(input.front); input.popFront(); if (input.front == '/') { app.put(input.front); input.popFront(); break; } } else { app.put(input.front); input.popFront(); } } break; case '+': int depth = 1; while (depth > 0 && !isEoF(input)) { if (isNewline(input)) { app.put(popNewline(input)); lineNumber++; } else if (input.front == '+') { app.put(input.front); input.popFront(); if (input.front == '/') { app.put(input.front); input.popFront(); --depth; } } else if (input.front == '/') { app.put(input.front); input.popFront(); if (input.front == '+') { app.put(input.front); input.popFront(); ++depth; } } else { app.put(input.front); input.popFront(); } } break; default: break; } return to!string(app.data); } unittest { uint lineNumber = 1; auto chars = "//this is a comment\r\nthis is not"; auto comment = lexComment(chars, lineNumber); assert (chars == "\r\nthis is not"); assert (comment == "//this is a comment"); } unittest { uint lineNumber = 1; auto chars = "/* this is a\n\tcomment\r\n */this is not"; auto comment = lexComment(chars, lineNumber); assert (chars == "this is not"); assert (comment == "/* this is a\n\tcomment\r\n */"); assert (lineNumber == 3); } unittest { uint lineNumber = 1; auto chars = "/+this is a /+c/+omm+/ent+/ \r\nthis+/ is not"; auto comment = lexComment(chars, lineNumber); assert (chars == " is not"); assert (comment == "/+this is a /+c/+omm+/ent+/ \r\nthis+/"); assert (lineNumber == 2); } /** * Params: * inputString = the source code to examine * endIndex = an index into inputString at the opening quote * lineNumber = the line number that corresponds to endIndex * quote = the opening (and closing) quote character for the string to be * lexed * Returns: a string literal, including its opening and closing quote characters */ pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber, C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C) in { assert (inputString[endIndex] == quote); assert (quote == '\'' || quote == '"' || quote == '`'); } body { if (inputString[endIndex] != quote) return ""; auto startIndex = endIndex; ++endIndex; bool escape = false; while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape)) { if (escape) escape = false; else escape = (canEscape && inputString[endIndex] == '\\'); if (inputString[endIndex] == '\n') lineNumber++; ++endIndex; } ++endIndex; if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w' || inputString[endIndex] == 'd' || inputString[endIndex] == 'c')) { ++endIndex; } auto e = endIndex > inputString.length ? inputString.length : endIndex; return inputString[startIndex .. e]; } /** * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF", * and q"<>". * Params: * inputString = the source code to examine * endIndex = an index into inputString at the opening quote * lineNumber = the line number that corresponds to endIndex * Returns: a string literal, including its opening and closing quote characters */ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber) if (isSomeString!S) { auto startIndex = endIndex; ++endIndex; assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF? string open = inputString[endIndex .. endIndex + 1]; string close; bool nesting = false; switch (open[0]) { case '[': close = "]"; ++endIndex; nesting = true; break; case '<': close = ">"; ++endIndex; nesting = true; break; case '{': close = "}"; ++endIndex; nesting = true; break; case '(': close = ")"; ++endIndex; nesting = true; break; default: while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex])) endIndex++; close = open = inputString[startIndex + 1 .. endIndex]; break; } int depth = 1; while (!isEoF(inputString, endIndex) && depth > 0) { if (inputString[endIndex] == '\n') { lineNumber++; endIndex++; } else if (inputString[endIndex..$].startsWith(open)) { endIndex += open.length; if (!nesting && !isEoF(inputString, endIndex)) { if (inputString[endIndex] == '"') ++endIndex; break; } depth++; } else if (inputString[endIndex..$].startsWith(close)) { endIndex += close.length; depth--; if (depth <= 0) break; } else ++endIndex; } if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"') ++endIndex; return inputString[startIndex .. endIndex]; } /** * TODO: Fix this */ string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber) { /+auto r = byDToken(range, IterationStyle.EVERYTHING); string s = getBraceContent(r); range.popFrontN(s.length); return s;+/ return ""; } pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex) if (isSomeString!S) { Token token; token.startIndex = endIndex; size_t startIndex = endIndex; if (inputString[endIndex] == '0') { endIndex++; if (isEoF(inputString, endIndex)) { token.type = TokenType.IntLiteral; token.value = inputString[startIndex .. endIndex]; return token; } switch (inputString[endIndex]) { case '0': .. case '9': // The current language spec doesn't cover octal literals, so this // is decimal. lexDecimal(inputString, startIndex, endIndex, token); return token; case 'b': case 'B': lexBinary(inputString, startIndex, ++endIndex, token); return token; case 'x': case 'X': lexHex(inputString, startIndex, ++endIndex, token); return token; default: token.type = TokenType.IntLiteral; token.value = inputString[startIndex .. endIndex]; return token; } } else { lexDecimal(inputString, startIndex, endIndex, token); return token; } } pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex, ref size_t endIndex, ref Token token) if (isSomeString!S) { bool lexingSuffix = false; bool isLong = false; bool isUnsigned = false; token.type = TokenType.IntLiteral; binaryLoop: while (!isEoF(inputString, endIndex)) { switch (inputString[endIndex]) { case '0': case '1': case '_': if (lexingSuffix) break binaryLoop; ++endIndex; break; case 'u': case 'U': if (isUnsigned) break; ++endIndex; lexingSuffix = true; if (isLong) { token.type = TokenType.UnsignedLongLiteral; break binaryLoop; } else token.type = TokenType.UnsignedIntLiteral; isUnsigned = true; break; case 'L': if (isLong) break binaryLoop; ++endIndex; lexingSuffix = true; if (isUnsigned) { token.type = TokenType.UnsignedLongLiteral; break binaryLoop; } else token.type = TokenType.LongLiteral; isLong = true; break; default: break binaryLoop; } } token.value = inputString[startIndex .. endIndex]; } pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex, ref size_t endIndex, ref Token token) if (isSomeString!S) { bool lexingSuffix = false; bool isLong = false; bool isUnsigned = false; bool isFloat = false; bool isReal = false; bool isDouble = false; bool foundDot = false; bool foundE = false; bool foundPlusMinus = false; token.type = TokenType.IntLiteral; decimalLoop: while (!isEoF(inputString, endIndex)) { switch (inputString[endIndex]) { case '0': .. case '9': case '_': if (lexingSuffix) break decimalLoop; ++endIndex; break; case 'e': case 'E': // For this to be a valid exponent, the next character must be a // decimal character or a sign if (foundE || isEoF(inputString, endIndex + 1)) break decimalLoop; switch (inputString[endIndex + 1]) { case '+': case '-': if (isEoF(inputString, endIndex + 2) || inputString[endIndex + 2] < '0' || inputString[endIndex + 2] > '9') { break decimalLoop; } break; case '0': .. case '9': break; default: break decimalLoop; } ++endIndex; foundE = true; isDouble = true; token.type = TokenType.DoubleLiteral; break; case '+': case '-': if (foundPlusMinus || !foundE) break decimalLoop; foundPlusMinus = true; ++endIndex; break; case '.': if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') break decimalLoop; // possibly slice expression if (foundDot) break decimalLoop; // two dots with other characters between them ++endIndex; foundDot = true; token.type = TokenType.DoubleLiteral; isDouble = true; break; case 'u': case 'U': if (isUnsigned) break decimalLoop; ++endIndex; lexingSuffix = true; if (isLong) token.type = TokenType.UnsignedLongLiteral; else token.type = TokenType.UnsignedIntLiteral; isUnsigned = true; break; case 'L': if (isLong) break decimalLoop; if (isReal) break decimalLoop; ++endIndex; lexingSuffix = true; if (isDouble) token.type = TokenType.RealLiteral; else if (isUnsigned) token.type = TokenType.UnsignedLongLiteral; else token.type = TokenType.LongLiteral; isLong = true; break; case 'f': case 'F': lexingSuffix = true; if (isUnsigned || isLong) break decimalLoop; ++endIndex; token.type = TokenType.FloatLiteral; break decimalLoop; case 'i': ++endIndex; // Spec says that this is the last suffix, so all cases break the // loop. if (isDouble) { token.type = TokenType.Idouble; break decimalLoop; } else if (isFloat) { token.type = TokenType.Ifloat; break decimalLoop; } else if (isReal) { token.type = TokenType.Ireal; break decimalLoop; } else { // There is no imaginary int --endIndex; break decimalLoop; } default: break decimalLoop; } } token.value = inputString[startIndex .. endIndex]; } unittest { Token t; size_t start, end; lexDecimal!string("55e-4", start, end, t); assert(t.value == "55e-4"); assert(t.type == TokenType.DoubleLiteral); start = end = 0; lexDecimal!string("123.45f", start, end, t); assert(t.value == "123.45f"); assert(t.type == TokenType.FloatLiteral); start = end = 0; lexDecimal!string("3e+f", start, end, t); assert(t.value == "3"); assert(t.type == TokenType.IntLiteral); start = end = 0; lexDecimal!string("3e++f", start, end, t); assert(t.value == "3"); assert(t.type == TokenType.IntLiteral); start = end = 0; lexDecimal!string("1234..1237", start, end, t); assert(t.value == "1234"); assert(t.type == TokenType.IntLiteral); } nothrow void lexHex(S)(ref S inputString, ref size_t startIndex, ref size_t endIndex, ref Token token) if (isSomeString!S) { bool lexingSuffix = false; bool isLong = false; bool isUnsigned = false; bool isFloat = false; bool isReal = false; bool isDouble = false; bool foundDot = false; bool foundE = false; bool foundPlusMinus = false; token.type = TokenType.IntLiteral; hexLoop: while (!isEoF(inputString, endIndex)) { switch (inputString[endIndex]) { case '0': .. case '9': case 'a': .. case 'f': case 'A': .. case 'F': case '_': if (lexingSuffix) break hexLoop; ++endIndex; break; case 'p': case 'P': if (foundE) break hexLoop; ++endIndex; foundE = true; break; case '+': case '-': if (foundPlusMinus || !foundE) break hexLoop; foundPlusMinus = true; ++endIndex; break; case '.': if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') break hexLoop; // possibly slice expression if (foundDot) break hexLoop; // two dots with other characters between them ++endIndex; foundDot = true; token.type = TokenType.DoubleLiteral; isDouble = true; break; default: break hexLoop; } } token.value = inputString[startIndex .. endIndex]; } unittest { Token t; size_t start, end; start = 0; end = 2; lexHex!string("0x193abfq", start, end, t); assert(t.value == "0x193abf", t.value); assert(t.type == TokenType.IntLiteral); start = 0; end = 2; lexHex!string("0x2130xabc", start, end, t); assert(t.value == "0x2130"); assert(t.type == TokenType.IntLiteral); } /** * Returns: true if ch marks the ending of one token and the beginning of * another, false otherwise */ pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C) { switch (ch) { case '!': .. case '/': case ':': .. case '@': case '[': .. case '^': case '{': .. case '~': case 0x20: // space case 0x09: // tab case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return return true; default: return false; } } /** * Configure the tokenize() function */ enum IterationStyle { /// Only include code, not whitespace or comments CODE_ONLY, /// Include everything EVERYTHING } struct TokenRange(R) if (isInputRange(R)) { bool empty() const @property { return _empty; } private: R range; bool _empty; } Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY) if (isSomeString!S) { auto tokenAppender = appender!(Token[])(); // This is very likely a local maximum, but it does seem to take a few // milliseconds off of the run time tokenAppender.reserve(inputString.length / 4); size_t endIndex = 0; uint lineNumber = 1; if (inputString.length > 1 && inputString[0..2] == "#!") { Token currentToken; currentToken.lineNumber = lineNumber; // lineNumber is always 1 currentToken.value = lexScriptLine(inputString, endIndex, lineNumber); currentToken.type = TokenType.ScriptLine; } while (!isEoF(inputString, endIndex)) { size_t prevIndex = endIndex; Token currentToken; auto startIndex = endIndex; if (isWhite(inputString[endIndex])) { if (iterationStyle == IterationStyle.EVERYTHING) { currentToken.lineNumber = lineNumber; currentToken.value = lexWhitespace(inputString, endIndex, lineNumber); currentToken.type = TokenType.Whitespace; tokenAppender.put(currentToken); } else lexWhitespace(inputString, endIndex, lineNumber); continue; } currentToken.startIndex = endIndex; outerSwitch: switch(inputString[endIndex]) { mixin(generateCaseTrie( "=", "TokenType.Assign", "&", "TokenType.BitAnd", "&=", "TokenType.BitAndEquals", "|", "TokenType.BitOr", "|=", "TokenType.BitOrEquals", "~=", "TokenType.CatEquals", ":", "TokenType.Colon", ",", "TokenType.Comma", "$", "TokenType.Dollar", ".", "TokenType.Dot", "==", "TokenType.Equals", "=>", "TokenType.GoesTo", ">", "TokenType.Greater", ">=", "TokenType.GreaterEqual", "#", "TokenType.Hash", "&&", "TokenType.LogicAnd", "{", "TokenType.LBrace", "[", "TokenType.LBracket", "<", "TokenType.Less", "<=", "TokenType.LessEqual", "<>=", "TokenType.LessEqualGreater", "<>", "TokenType.LessOrGreater", "||", "TokenType.LogicOr", "(", "TokenType.LParen", "-", "TokenType.Minus", "-=", "TokenType.MinusEquals", "%", "TokenType.Mod", "%=", "TokenType.ModEquals", "*=", "TokenType.MulEquals", "!", "TokenType.Not", "!=", "TokenType.NotEquals", "!>", "TokenType.NotGreater", "!>=", "TokenType.NotGreaterEqual", "!<", "TokenType.NotLess", "!<=", "TokenType.NotLessEqual", "!<>", "TokenType.NotLessEqualGreater", "+", "TokenType.Plus", "+=", "TokenType.PlusEquals", "^^", "TokenType.Pow", "^^=", "TokenType.PowEquals", "}", "TokenType.RBrace", "]", "TokenType.RBracket", ")", "TokenType.RParen", ";", "TokenType.Semicolon", "<<", "TokenType.ShiftLeft", "<<=", "TokenType.ShiftLeftEqual", ">>", "TokenType.ShiftRight", ">>=", "TokenType.ShiftRightEqual", "..", "TokenType.Slice", "*", "TokenType.Star", "?", "TokenType.Ternary", "~", "TokenType.Tilde", "--", "TokenType.Decrement", "!<>=", "TokenType.Unordered", ">>>", "TokenType.UnsignedShiftRight", ">>>=", "TokenType.UnsignedShiftRightEqual", "++", "TokenType.Increment", "...", "TokenType.Vararg", "^", "TokenType.Xor", "^=", "TokenType.XorEquals", )); case '0': .. case '9': currentToken = lexNumber(inputString, endIndex); break; case '/': ++endIndex; if (isEoF(inputString, endIndex)) { currentToken.value = "/"; currentToken.type = TokenType.Div; currentToken.lineNumber = lineNumber; break; } currentToken.lineNumber = lineNumber; switch (inputString[endIndex]) { case '/': case '+': case '*': if (iterationStyle == IterationStyle.CODE_ONLY) { lexComment(inputString, endIndex, lineNumber); continue; } else { currentToken.value = lexComment(inputString, endIndex, lineNumber); currentToken.type = TokenType.Comment; break; } case '=': currentToken.value = "/="; currentToken.type = TokenType.DivEquals; ++endIndex; break; default: currentToken.value = "/"; currentToken.type = TokenType.Div; break; } break; case 'r': ++endIndex; if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') goto default; currentToken.lineNumber = lineNumber; currentToken.value = lexString(inputString, endIndex, lineNumber, inputString[endIndex], false); currentToken.type = TokenType.StringLiteral; break; case '`': currentToken.lineNumber = lineNumber; currentToken.value = lexString(inputString, endIndex, lineNumber, inputString[endIndex], false); currentToken.type = TokenType.StringLiteral; break; case 'x': ++endIndex; if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') goto default; else goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings case '\'': case '"': currentToken.lineNumber = lineNumber; currentToken.value = lexString(inputString, endIndex, lineNumber, inputString[endIndex]); currentToken.type = TokenType.StringLiteral; break; case 'q': currentToken.value = "q"; ++endIndex; if (!isEoF(inputString, endIndex)) { switch (inputString[endIndex]) { case '"': currentToken.lineNumber = lineNumber; currentToken.value ~= lexDelimitedString(inputString, endIndex, lineNumber); currentToken.type = TokenType.StringLiteral; break outerSwitch; case '{': currentToken.lineNumber = lineNumber; currentToken.value ~= lexTokenString(inputString, endIndex, lineNumber); currentToken.type = TokenType.StringLiteral; break outerSwitch; default: break; } } goto default; case '@': ++endIndex; goto default; default: while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex])) ++endIndex; currentToken.value = inputString[startIndex .. endIndex]; currentToken.type = lookupTokenTypeOptimized(currentToken.value); //currentToken.type = lookupTokenType(currentToken.value); currentToken.lineNumber = lineNumber; break; } //stderr.writeln(currentToken); tokenAppender.put(currentToken); // This should never happen. if (endIndex <= prevIndex) { stderr.writeln("FAIL"); return []; } } return tokenAppender.data; }