/*******************************************************************************
 * Boost Software License - Version 1.0 - August 17th, 2003
 *
 * Permission is hereby granted, free of charge, to any person or organization
 * obtaining a copy of the software and accompanying documentation covered by
 * this license (the "Software") to use, reproduce, display, distribute,
 * execute, and transmit the Software, and to prepare derivative works of the
 * Software, and to permit third-parties to whom the Software is furnished to
 * do so, all subject to the following:
 *
 * The copyright notices in the Software and this entire statement, including
 * the above license grant, this restriction and the following disclaimer,
 * must be included in all copies of the Software, in whole or in part, and
 * all derivative works of the Software, unless such copies or derivative
 * works are solely in the form of machine-executable object code generated by
 * a source language processor.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ******************************************************************************/

module tokenizer;

import std.range;
import std.file;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.stdio;

import langutils;
import codegen;


/**
 * Increments endIndex until it indexes a non-whitespace character in
 * inputString.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         once for every '\n' that is skipped
 *     style = the code iteration style
 * Returns: The whitespace, or null if style was CODE_ONLY
 */
pure nothrow string lexWhitespace(S)(S inputString, ref size_t endIndex,
	ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY)
	if (isSomeString!S)
{
	immutable startIndex = endIndex;
	while (endIndex < inputString.length && isWhite(inputString[endIndex]))
	{
		if (inputString[endIndex] == '\n')
			lineNumber++;
		++endIndex;
	}
	final switch (style)
	{
	case IterationStyle.EVERYTHING:
		return inputString[startIndex .. endIndex];
	case IterationStyle.CODE_ONLY:
		return null;
	}
}


/**
 * Increments endIndex until it indexes a character directly after a comment
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the second character of a
 *         comment, i.e. points at the second slash in a // comment.
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         once for every '\n' inside the comment
 * Returns: The comment, including the leading slash. A line comment does not
 *     include its terminating newline. An unterminated block or nesting
 *     comment extends to the end of inputString. An empty string is returned
 *     if endIndex is 0 or is not a valid index into inputString.
 */
pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	// There must be a character before endIndex (the leading '/') and
	// endIndex itself must be in range.
	if (endIndex == 0 || endIndex >= inputString.length)
		return "";
	immutable startIndex = endIndex - 1;
	switch (inputString[endIndex])
	{
	case '/':
		// The newline is not part of the comment, so lineNumber is untouched.
		while (endIndex < inputString.length && inputString[endIndex] != '\n')
			++endIndex;
		break;
	case '*':
		// Skip the opening '*' so that "/*/" is not mistaken for a complete
		// comment.
		++endIndex;
		while (endIndex < inputString.length)
		{
			if (inputString[endIndex] == '*'
				&& endIndex + 1 < inputString.length
				&& inputString[endIndex + 1] == '/')
			{
				endIndex += 2;
				break;
			}
			if (inputString[endIndex] == '\n')
				++lineNumber;
			++endIndex;
		}
		break;
	case '+':
		// Skip the opening '+'.
		++endIndex;
		int depth = 1;
		while (depth > 0 && endIndex < inputString.length)
		{
			immutable bool hasNext = endIndex + 1 < inputString.length;
			if (inputString[endIndex] == '\n')
			{
				++lineNumber;
				++endIndex;
			}
			else if (hasNext && inputString[endIndex] == '+'
				&& inputString[endIndex + 1] == '/')
			{
				// Consume both characters so that the '/' can not also be
				// read as the start of a "/+".
				--depth;
				endIndex += 2;
			}
			else if (hasNext && inputString[endIndex] == '/'
				&& inputString[endIndex + 1] == '+')
			{
				++depth;
				endIndex += 2;
			}
			else
				++endIndex;
		}
		break;
	default:
		break;
	}
	return inputString[startIndex .. endIndex];
}


/**
 * Lexes a quoted literal.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex
 *     quote = the opening (and closing) quote character for the string to be
 *         lexed
 *     canEscape = true if a backslash escapes the character that follows it
 * Returns: a string literal, including its opening and closing quote
 *     characters. If the literal is unterminated, everything up to the end of
 *     inputString is returned. An empty string is returned if endIndex does
 *     not index a quote character.
 * Bugs: Does not handle string suffixes
 */
pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex,
	ref uint lineNumber, C quote, bool canEscape = true)
	if (isSomeString!S && isSomeChar!C)
in
{
	assert (endIndex < inputString.length);
	assert (inputString[endIndex] == quote);
	assert (quote == '\'' || quote == '\"' || quote == '`');
}
body
{
	// Contracts are compiled out of release builds, so check again here.
	if (endIndex >= inputString.length || inputString[endIndex] != quote)
		return "";
	immutable startIndex = endIndex;
	++endIndex;
	bool escape = false;
	while (endIndex < inputString.length
		&& (inputString[endIndex] != quote || escape))
	{
		if (escape)
			escape = false;
		else
			escape = (canEscape && inputString[endIndex] == '\\');
		if (inputString[endIndex] == '\n')
			lineNumber++;
		++endIndex;
	}
	// Step over the closing quote, unless the input ended first.
	++endIndex;
	endIndex = min(endIndex, inputString.length);
	return inputString[startIndex .. endIndex];
}


/**
 * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
 * and q"<>".
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex
 * Returns: a string literal, including its opening and closing quote
 *     characters. If the literal is unterminated, everything up to the end of
 *     inputString is returned.
 * Bugs: For identifier-delimited strings the first occurrence of the
 *     identifier ends the literal, even if it is not at the start of a line.
 */
string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	if (endIndex >= inputString.length)
		return "";
	immutable startIndex = endIndex;
	++endIndex;
	// Nothing follows the opening quote.
	if (endIndex >= inputString.length)
		return inputString[startIndex .. endIndex];

	string open = to!string(inputString[endIndex]);
	string close;
	bool nesting = false;
	switch (open)
	{
	case "[":
		close = "]";
		++endIndex;
		nesting = true;
		break;
	case "<":
		close = ">";
		++endIndex;
		nesting = true;
		break;
	case "{":
		close = "}";
		++endIndex;
		nesting = true;
		break;
	case "(":
		close = ")";
		++endIndex;
		nesting = true;
		break;
	default:
		// The delimiter is everything up to the next whitespace character.
		while (endIndex < inputString.length && !isWhite(inputString[endIndex]))
			endIndex++;
		close = open = inputString[startIndex + 1 .. endIndex];
		break;
	}

	int depth = 1;
	while (endIndex < inputString.length && depth > 0)
	{
		if (inputString[endIndex] == '\n')
		{
			lineNumber++;
			endIndex++;
		}
		else if (inputString[endIndex .. $].startsWith(open))
		{
			endIndex += open.length;
			// For non-nesting delimiters open == close, so this is the end
			// of the literal. The closing quote is consumed below.
			if (!nesting)
				break;
			depth++;
		}
		else if (inputString[endIndex .. $].startsWith(close))
		{
			endIndex += close.length;
			depth--;
		}
		else
			++endIndex;
	}
	if (endIndex < inputString.length && inputString[endIndex] == '\"')
		++endIndex;
	return inputString[startIndex .. endIndex];
}


/**
 * Placeholder for lexing token strings (q{ ... }). Not implemented: no input
 * is consumed, endIndex and lineNumber are left untouched.
 * Returns: an empty string
 */
string lexTokenString(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber)
{
	/+auto r = byDToken(range, IterationStyle.EVERYTHING);
	string s = getBraceContent(r);
	range.popFrontN(s.length);
	return s;+/
	return "";
}


/**
 * Lexes a numeric literal.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the first character of the
 *         number. On return it indexes the first character after the number.
 * Returns: the text of the number, including any suffix characters
 */
pure nothrow string lexNumber(S)(ref S inputString, ref size_t endIndex)
	if (isSomeString!S)
{
	immutable startIndex = endIndex;
	bool foundDot = false;
	bool foundX = false;
	bool foundB = false;
	bool foundE = false;

	// A hexadecimal prefix is only valid at the very start of the literal.
	if (endIndex + 1 < inputString.length && inputString[endIndex] == '0'
		&& (inputString[endIndex + 1] == 'x' || inputString[endIndex + 1] == 'X'))
	{
		endIndex += 2;
		foundX = true;
	}

	numberLoop: while (endIndex < inputString.length)
	{
		immutable c = inputString[endIndex];

		// Inside the mantissa of a hex literal the letters a-f are digits,
		// not binary prefixes, exponents or suffixes.
		if (foundX && !foundE
			&& ((c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')))
		{
			++endIndex;
			continue;
		}

		switch (c)
		{
		case 'b':
		case 'B':
			if (foundB)
				break numberLoop;
			foundB = true;
			++endIndex;
			break;
		case '.':
			if (foundDot || foundX || foundE)
				break numberLoop;
			// "1..2" is the number 1 followed by the slice operator
			if (endIndex + 1 < inputString.length
				&& inputString[endIndex + 1] == '.')
			{
				break numberLoop;
			}
			foundDot = true;
			++endIndex;
			break;
		case '+':
		case '-':
			// A sign belongs to the number only directly after the exponent
			// character. foundE implies endIndex > startIndex.
			if (!foundE)
				break numberLoop;
			switch (inputString[endIndex - 1])
			{
			case 'e':
			case 'E':
			case 'p':
			case 'P':
				break;
			default:
				break numberLoop;
			}
			++endIndex;
			break;
		case 'p':
		case 'P':
			if (!foundX || foundE)
				break numberLoop;
			foundE = true;
			goto case '_';
		case 'e':
		case 'E':
			if (foundE || foundX)
				break numberLoop;
			foundE = true;
			goto case '_';
		case '0': .. case '9':
		case '_':
			++endIndex;
			break;
		case 'u':
		case 'U':
		case 'F':
		case 'f':
		case 'L':
		case 'i':
			// Consume the whole suffix (e.g. "UL", "Lu", "fi") and stop.
			suffixLoop: while (endIndex < inputString.length)
			{
				switch (inputString[endIndex])
				{
				case 'u':
				case 'U':
				case 'F':
				case 'f':
				case 'L':
				case 'i':
					++endIndex;
					break;
				default:
					break suffixLoop;
				}
			}
			break numberLoop;
		default:
			break numberLoop;
		}
	}
	return inputString[startIndex .. endIndex];
}


/**
 * Returns: true if ch marks the ending of one token and the beginning of
 * another, false otherwise
 */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
	switch (ch)
	{
	case '!': .. case '/':
	case ':': .. case '@':
	case '[': .. case '^':
	case '{': .. case '~':
	case 0x20: // space
	case 0x09: // tab
	case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return
		return true;
	default:
		return false;
	}
}


/**
 * Configure the tokenize() function
 */
enum IterationStyle
{
	/// Only include code, not whitespace or comments
	CODE_ONLY,
	/// Include everything
	EVERYTHING
}


/**
 * Splits D source code into tokens.
 * Params:
 *     inputString = the source code to tokenize
 *     iterationStyle = CODE_ONLY to drop whitespace and comments, EVERYTHING
 *         to emit them as tokens as well
 * Returns: the tokens in source order. String literal tokens include their
 *     prefix character (r, x or q) in their value.
 */
Token[] tokenize(S)(S inputString,
	IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
	if (isSomeString!S)
{
	auto tokenAppender = appender!(Token[])();

	// This is very likely a local maximum, but it does seem to take a few
	// milliseconds off of the run time
	tokenAppender.reserve(inputString.length / 4);

	size_t endIndex = 0;
	uint lineNumber = 1;

	// NOTE(review): the local names used below are kept exactly as they were
	// because the code produced by generateCaseTrie is mixed into this scope
	// and may refer to them - confirm against codegen before renaming.
	while (endIndex < inputString.length)
	{
		Token currentToken;
		auto startIndex = endIndex;
		if (isWhite(inputString[endIndex]))
		{
			if (iterationStyle == IterationStyle.EVERYTHING)
			{
				currentToken.lineNumber = lineNumber;
				currentToken.value = lexWhitespace(inputString, endIndex,
					lineNumber, IterationStyle.EVERYTHING);
				currentToken.type = TokenType.whitespace;
				tokenAppender.put(currentToken);
			}
			else
				lexWhitespace(inputString, endIndex, lineNumber);
			continue;
		}
		currentToken.startIndex = endIndex;

		outerSwitch: switch(inputString[endIndex])
		{
		mixin(generateCaseTrie(
			"=",    "TokenType.assign",
			"&",    "TokenType.bitAnd",
			"&=",   "TokenType.bitAndEquals",
			"|",    "TokenType.bitOr",
			"|=",   "TokenType.bitOrEquals",
			"~=",   "TokenType.catEquals",
			":",    "TokenType.colon",
			",",    "TokenType.comma",
			"$",    "TokenType.dollar",
			".",    "TokenType.dot",
			"==",   "TokenType.equals",
			"=>",   "TokenType.goesTo",
			">",    "TokenType.greater",
			">=",   "TokenType.greaterEqual",
			"#",    "TokenType.hash",
			"&&",   "TokenType.lAnd",
			"{",    "TokenType.lBrace",
			"[",    "TokenType.lBracket",
			"<",    "TokenType.less",
			"<=",   "TokenType.lessEqual",
			"<>=",  "TokenType.lessEqualGreater",
			"<>",   "TokenType.lessOrGreater",
			"||",   "TokenType.lOr",
			"(",    "TokenType.lParen",
			"-",    "TokenType.minus",
			"-=",   "TokenType.minusEquals",
			"%",    "TokenType.mod",
			"%=",   "TokenType.modEquals",
			"*=",   "TokenType.mulEquals",
			"!",    "TokenType.not",
			"!=",   "TokenType.notEquals",
			"!>",   "TokenType.notGreater",
			"!>=",  "TokenType.notGreaterEqual",
			"!<",   "TokenType.notLess",
			"!<=",  "TokenType.notLessEqual",
			"!<>",  "TokenType.notLessEqualGreater",
			"+",    "TokenType.plus",
			"+=",   "TokenType.plusEquals",
			"^^",   "TokenType.pow",
			"^^=",  "TokenType.powEquals",
			"}",    "TokenType.rBrace",
			"]",    "TokenType.rBracket",
			")",    "TokenType.rParen",
			";",    "TokenType.semicolon",
			"<<",   "TokenType.shiftLeft",
			"<<=",  "TokenType.shiftLeftEqual",
			">>",   "TokenType.shiftRight",
			">>=",  "TokenType.shiftRightEqual",
			"..",   "TokenType.slice",
			"*",    "TokenType.star",
			"?",    "TokenType.ternary",
			"~",    "TokenType.tilde",
			"--",   "TokenType.uMinus",
			"!<>=", "TokenType.unordered",
			">>>",  "TokenType.unsignedShiftRight",
			">>>=", "TokenType.unsignedShiftRightEqual",
			"++",   "TokenType.uPlus",
			"...",  "TokenType.vararg",
			"^",    "TokenType.xor",
			"^=",   "TokenType.xorEquals",
		));
		case '0': .. case '9':
			currentToken.value = lexNumber(inputString, endIndex);
			currentToken.type = TokenType.numberLiteral;
			currentToken.lineNumber = lineNumber;
			break;
		case '/':
			++endIndex;
			if (endIndex >= inputString.length)
			{
				currentToken.value = "/";
				currentToken.type = TokenType.div;
				currentToken.lineNumber = lineNumber;
				break;
			}
			currentToken.lineNumber = lineNumber;
			switch (inputString[endIndex])
			{
			case '/':
			case '+':
			case '*':
				if (iterationStyle == IterationStyle.CODE_ONLY)
				{
					// Skip the comment without emitting a token
					lexComment(inputString, endIndex, lineNumber);
					continue;
				}
				currentToken.value = lexComment(inputString, endIndex,
					lineNumber);
				currentToken.type = TokenType.comment;
				break;
			case '=':
				currentToken.value = "/=";
				currentToken.type = TokenType.divEquals;
				++endIndex;
				break;
			default:
				currentToken.value = "/";
				currentToken.type = TokenType.div;
				break;
			}
			break;
		case 'r':
			++endIndex;
			if (endIndex < inputString.length && inputString[endIndex] == '\"')
			{
				currentToken.lineNumber = lineNumber;
				lexString(inputString, endIndex, lineNumber,
					inputString[endIndex], false);
				// Slice from startIndex so that the 'r' prefix is kept
				currentToken.value = inputString[startIndex .. endIndex];
				currentToken.type = TokenType.stringLiteral;
				break;
			}
			else
				goto default;
		case '`':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex], false);
			currentToken.type = TokenType.stringLiteral;
			break;
		case 'x':
			++endIndex;
			if (endIndex < inputString.length && inputString[endIndex] == '\"')
			{
				currentToken.lineNumber = lineNumber;
				lexString(inputString, endIndex, lineNumber,
					inputString[endIndex]);
				// Slice from startIndex so that the 'x' prefix is kept
				currentToken.value = inputString[startIndex .. endIndex];
				currentToken.type = TokenType.stringLiteral;
				break;
			}
			else
				goto default;
		case '\'':
		case '"':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex]);
			currentToken.type = TokenType.stringLiteral;
			break;
		case 'q':
			++endIndex;
			if (endIndex < inputString.length)
			{
				switch (inputString[endIndex])
				{
				case '\"':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= "q" ~ lexDelimitedString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.stringLiteral;
					break outerSwitch;
				case '{':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= "q" ~ lexTokenString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.stringLiteral;
					break outerSwitch;
				default:
					break;
				}
			}
			goto default;
		case '@':
			++endIndex;
			goto default;
		default:
			while (endIndex < inputString.length
				&& !isSeparating(inputString[endIndex]))
			{
				++endIndex;
			}
			// A separating character that no case above handles (a stray
			// backslash, for example) would never be consumed and the outer
			// loop would spin forever. Always make progress.
			if (endIndex == startIndex)
				++endIndex;
			currentToken.value = inputString[startIndex .. endIndex];
			currentToken.type = lookupTokenType(currentToken.value);
			currentToken.lineNumber = lineNumber;
			break;
		}
		// writeln(currentToken);
		tokenAppender.put(currentToken);
	}
	return tokenAppender.data;
}