From 09240c67cf0c966645796ee80b925d3b840589f4 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Mon, 14 Jan 2013 16:46:03 -0800 Subject: [PATCH] more work on the tokenizer --- tokenizer.d | 1443 ++++++++++++++++++++++++++------------------------- 1 file changed, 739 insertions(+), 704 deletions(-) diff --git a/tokenizer.d b/tokenizer.d index 893399f..fdb9b5c 100644 --- a/tokenizer.d +++ b/tokenizer.d @@ -62,7 +62,7 @@ string lexWhitespace(R)(ref R range, ref uint lineNumber) { ++lineNumber; app.put(popNewline(range)); - } + } else { app.put(range.front); @@ -211,6 +211,53 @@ unittest } +string interpretEscapeSequence(R)(ref R input) +in +{ + assert(input.front == '\\'); +} +body +{ + input.popFront(); + auto app = appender!(char[])(); + loop: while (!isEoF(input)) + { + switch (input.front) + { + case '\'': + case '\"': + case '?': + case '\\': + case 0: + case 0x1a: + app.put(input.front); + input.popFront(); + break loop; + case 'a': input.popFront(); app.put('\a'); break loop; + case 'b': input.popFront(); app.put('\b'); break loop; + case 'f': input.popFront(); app.put('\f'); break loop; + case 'n': input.popFront(); app.put('\n'); break loop; + case 'r': input.popFront(); app.put('\r'); break loop; + case 't': input.popFront(); app.put('\t'); break loop; + case 'v': input.popFront(); app.put('\v'); break loop; + case 'x': + break; + case '0' .. case '7': + break; + case 'u': + break; + case 'U': + break; + case '&': + // http://www.w3.org/TR/html5/entities.json + default: + // This is an error + break; + } + } + return app.data; +} + /** * Params: * inputString = the source code to examine @@ -220,718 +267,706 @@ unittest * lexed * Returns: a string literal, including its opening and closing quote characters */ -pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber, - C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C) +string lexString(R, C)(ref R input, ref uint lineNumber, + C quote, bool canEscape = true) if (is (ElementType!(R) == C)) in { - assert (inputString[endIndex] == quote); + assert (input.front == quote); assert (quote == '\'' || quote == '"' || quote == '`'); } body { - if (inputString[endIndex] != quote) - return ""; - auto startIndex = endIndex; - ++endIndex; - bool escape = false; - while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape)) + auto app = appender!(char[])(); + while (!isEoF(input) && (input.front != quote || escape)) { - if (escape) - escape = false; - else - escape = (canEscape && inputString[endIndex] == '\\'); - if (inputString[endIndex] == '\n') + if (canEscape && ) + else if (isNewline(input)) + { + app.put(popNewline(input)); lineNumber++; - ++endIndex; - } - ++endIndex; - if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w' - || inputString[endIndex] == 'd' || inputString[endIndex] == 'c')) - { - ++endIndex; - } - auto e = endIndex > inputString.length ? inputString.length : endIndex; - return inputString[startIndex .. e]; -} - -/** - * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF", - * and q"<>". - * Params: - * inputString = the source code to examine - * endIndex = an index into inputString at the opening quote - * lineNumber = the line number that corresponds to endIndex - * Returns: a string literal, including its opening and closing quote characters - */ -string lexDelimitedString(S)(ref S inputString, ref size_t endIndex, - ref uint lineNumber) if (isSomeString!S) -{ - auto startIndex = endIndex; - ++endIndex; - assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF? - string open = inputString[endIndex .. endIndex + 1]; - string close; - bool nesting = false; - switch (open[0]) - { - case '[': close = "]"; ++endIndex; nesting = true; break; - case '<': close = ">"; ++endIndex; nesting = true; break; - case '{': close = "}"; ++endIndex; nesting = true; break; - case '(': close = ")"; ++endIndex; nesting = true; break; - default: - while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex])) - endIndex++; - close = open = inputString[startIndex + 1 .. endIndex]; - break; - } - int depth = 1; - while (!isEoF(inputString, endIndex) && depth > 0) - { - if (inputString[endIndex] == '\n') - { - lineNumber++; - endIndex++; - } - else if (inputString[endIndex..$].startsWith(open)) - { - endIndex += open.length; - if (!nesting && !isEoF(inputString, endIndex)) - { - if (inputString[endIndex] == '"') - ++endIndex; - break; - } - depth++; - } - else if (inputString[endIndex..$].startsWith(close)) - { - endIndex += close.length; - depth--; - if (depth <= 0) - break; - } - else - ++endIndex; - } - if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"') - ++endIndex; - return inputString[startIndex .. endIndex]; -} - - -/** - * TODO: Fix this - */ -string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber) -{ - /+auto r = byDToken(range, IterationStyle.EVERYTHING); - string s = getBraceContent(r); - range.popFrontN(s.length); - return s;+/ - return ""; -} - -pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex) - if (isSomeString!S) -{ - Token token; - token.startIndex = endIndex; - size_t startIndex = endIndex; - if (inputString[endIndex] == '0') - { - endIndex++; - if (isEoF(inputString, endIndex)) - { - token.type = TokenType.IntLiteral; - token.value = inputString[startIndex .. endIndex]; - return token; - } - switch (inputString[endIndex]) - { - case '0': .. case '9': - // The current language spec doesn't cover octal literals, so this - // is decimal. - lexDecimal(inputString, startIndex, endIndex, token); - return token; - case 'b': - case 'B': - lexBinary(inputString, startIndex, ++endIndex, token); - return token; - case 'x': - case 'X': - lexHex(inputString, startIndex, ++endIndex, token); - return token; - default: - token.type = TokenType.IntLiteral; - token.value = inputString[startIndex .. endIndex]; - return token; } } - else - { - lexDecimal(inputString, startIndex, endIndex, token); - return token; - } + return to!string(app.data); } -pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex, - ref size_t endIndex, ref Token token) if (isSomeString!S) -{ - bool lexingSuffix = false; - bool isLong = false; - bool isUnsigned = false; - token.type = TokenType.IntLiteral; - binaryLoop: while (!isEoF(inputString, endIndex)) - { - switch (inputString[endIndex]) - { - case '0': - case '1': - case '_': - if (lexingSuffix) - break binaryLoop; - ++endIndex; - break; - case 'u': - case 'U': - if (isUnsigned) - break; - ++endIndex; - lexingSuffix = true; - if (isLong) - { - token.type = TokenType.UnsignedLongLiteral; - break binaryLoop; - } - else - token.type = TokenType.UnsignedIntLiteral; - isUnsigned = true; - break; - case 'L': - if (isLong) - break binaryLoop; - ++endIndex; - lexingSuffix = true; - if (isUnsigned) - { - token.type = TokenType.UnsignedLongLiteral; - break binaryLoop; - } - else - token.type = TokenType.LongLiteral; - isLong = true; - break; - default: - break binaryLoop; - } - } - - token.value = inputString[startIndex .. endIndex]; -} - -pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex, - ref size_t endIndex, ref Token token) if (isSomeString!S) -{ - bool lexingSuffix = false; - bool isLong = false; - bool isUnsigned = false; - bool isFloat = false; - bool isReal = false; - bool isDouble = false; - bool foundDot = false; - bool foundE = false; - bool foundPlusMinus = false; - token.type = TokenType.IntLiteral; - decimalLoop: while (!isEoF(inputString, endIndex)) - { - switch (inputString[endIndex]) - { - case '0': .. case '9': - case '_': - if (lexingSuffix) - break decimalLoop; - ++endIndex; - break; - case 'e': - case 'E': - // For this to be a valid exponent, the next character must be a - // decimal character or a sign - if (foundE || isEoF(inputString, endIndex + 1)) - break decimalLoop; - switch (inputString[endIndex + 1]) - { - case '+': - case '-': - if (isEoF(inputString, endIndex + 2) - || inputString[endIndex + 2] < '0' - || inputString[endIndex + 2] > '9') - { - break decimalLoop; - } - break; - case '0': .. case '9': - break; - default: - break decimalLoop; - } - ++endIndex; - foundE = true; - isDouble = true; - token.type = TokenType.DoubleLiteral; - break; - case '+': - case '-': - if (foundPlusMinus || !foundE) - break decimalLoop; - foundPlusMinus = true; - ++endIndex; - break; - case '.': - if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') - break decimalLoop; // possibly slice expression - if (foundDot) - break decimalLoop; // two dots with other characters between them - ++endIndex; - foundDot = true; - token.type = TokenType.DoubleLiteral; - isDouble = true; - break; - case 'u': - case 'U': - if (isUnsigned) - break decimalLoop; - ++endIndex; - lexingSuffix = true; - if (isLong) - token.type = TokenType.UnsignedLongLiteral; - else - token.type = TokenType.UnsignedIntLiteral; - isUnsigned = true; - break; - case 'L': - if (isLong) - break decimalLoop; - if (isReal) - break decimalLoop; - ++endIndex; - lexingSuffix = true; - if (isDouble) - token.type = TokenType.RealLiteral; - else if (isUnsigned) - token.type = TokenType.UnsignedLongLiteral; - else - token.type = TokenType.LongLiteral; - isLong = true; - break; - case 'f': - case 'F': - lexingSuffix = true; - if (isUnsigned || isLong) - break decimalLoop; - ++endIndex; - token.type = TokenType.FloatLiteral; - break decimalLoop; - case 'i': - ++endIndex; - // Spec says that this is the last suffix, so all cases break the - // loop. - if (isDouble) - { - token.type = TokenType.Idouble; - break decimalLoop; - } - else if (isFloat) - { - token.type = TokenType.Ifloat; - break decimalLoop; - } - else if (isReal) - { - token.type = TokenType.Ireal; - break decimalLoop; - } - else - { - // There is no imaginary int - --endIndex; - break decimalLoop; - } - default: - break decimalLoop; - } - } - - token.value = inputString[startIndex .. endIndex]; -} - - -unittest { - Token t; - size_t start, end; - lexDecimal!string("55e-4", start, end, t); - assert(t.value == "55e-4"); - assert(t.type == TokenType.DoubleLiteral); - - start = end = 0; - lexDecimal!string("123.45f", start, end, t); - assert(t.value == "123.45f"); - assert(t.type == TokenType.FloatLiteral); - - start = end = 0; - lexDecimal!string("3e+f", start, end, t); - assert(t.value == "3"); - assert(t.type == TokenType.IntLiteral); - - start = end = 0; - lexDecimal!string("3e++f", start, end, t); - assert(t.value == "3"); - assert(t.type == TokenType.IntLiteral); - - start = end = 0; - lexDecimal!string("1234..1237", start, end, t); - assert(t.value == "1234"); - assert(t.type == TokenType.IntLiteral); -} - - -nothrow void lexHex(S)(ref S inputString, ref size_t startIndex, - ref size_t endIndex, ref Token token) if (isSomeString!S) -{ - bool lexingSuffix = false; - bool isLong = false; - bool isUnsigned = false; - bool isFloat = false; - bool isReal = false; - bool isDouble = false; - bool foundDot = false; - bool foundE = false; - bool foundPlusMinus = false; - token.type = TokenType.IntLiteral; - hexLoop: while (!isEoF(inputString, endIndex)) - { - switch (inputString[endIndex]) - { - case '0': .. case '9': - case 'a': .. case 'f': - case 'A': .. case 'F': - case '_': - if (lexingSuffix) - break hexLoop; - ++endIndex; - break; - case 'p': - case 'P': - if (foundE) - break hexLoop; - ++endIndex; - foundE = true; - break; - case '+': - case '-': - if (foundPlusMinus || !foundE) - break hexLoop; - foundPlusMinus = true; - ++endIndex; - break; - case '.': - if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') - break hexLoop; // possibly slice expression - if (foundDot) - break hexLoop; // two dots with other characters between them - ++endIndex; - foundDot = true; - token.type = TokenType.DoubleLiteral; - isDouble = true; - break; - default: - break hexLoop; - } - } - - token.value = inputString[startIndex .. endIndex]; -} - -unittest -{ - Token t; - size_t start, end; - start = 0; - end = 2; - lexHex!string("0x193abfq", start, end, t); - assert(t.value == "0x193abf", t.value); - assert(t.type == TokenType.IntLiteral); - - start = 0; - end = 2; - lexHex!string("0x2130xabc", start, end, t); - assert(t.value == "0x2130"); - assert(t.type == TokenType.IntLiteral); - -} - -/** - * Returns: true if ch marks the ending of one token and the beginning of - * another, false otherwise - */ -pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C) -{ - switch (ch) - { - case '!': .. case '/': - case ':': .. case '@': - case '[': .. case '^': - case '{': .. case '~': - case 0x20: // space - case 0x09: // tab - case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return - return true; - default: - return false; - } -} - -/** - * Configure the tokenize() function - */ -enum IterationStyle -{ - /// Only include code, not whitespace or comments - CODE_ONLY, - /// Include everything - EVERYTHING -} - -struct TokenRange(R) if (isInputRange(R)) -{ - bool empty() const @property - { - return _empty; - } - - -private: - R range; - bool _empty; -} - -Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY) - if (isSomeString!S) -{ - auto tokenAppender = appender!(Token[])(); - - // This is very likely a local maximum, but it does seem to take a few - // milliseconds off of the run time - tokenAppender.reserve(inputString.length / 4); - - size_t endIndex = 0; - uint lineNumber = 1; - - if (inputString.length > 1 && inputString[0..2] == "#!") - { - Token currentToken; - currentToken.lineNumber = lineNumber; // lineNumber is always 1 - currentToken.value = lexScriptLine(inputString, endIndex, lineNumber); - currentToken.type = TokenType.ScriptLine; - } - - while (!isEoF(inputString, endIndex)) - { - size_t prevIndex = endIndex; - Token currentToken; - auto startIndex = endIndex; - if (isWhite(inputString[endIndex])) - { - if (iterationStyle == IterationStyle.EVERYTHING) - { - currentToken.lineNumber = lineNumber; - currentToken.value = lexWhitespace(inputString, endIndex, - lineNumber); - currentToken.type = TokenType.Whitespace; - tokenAppender.put(currentToken); - } - else - lexWhitespace(inputString, endIndex, lineNumber); - continue; - } - currentToken.startIndex = endIndex; - - outerSwitch: switch(inputString[endIndex]) - { - mixin(generateCaseTrie( - "=", "TokenType.Assign", - "&", "TokenType.BitAnd", - "&=", "TokenType.BitAndEquals", - "|", "TokenType.BitOr", - "|=", "TokenType.BitOrEquals", - "~=", "TokenType.CatEquals", - ":", "TokenType.Colon", - ",", "TokenType.Comma", - "$", "TokenType.Dollar", - ".", "TokenType.Dot", - "==", "TokenType.Equals", - "=>", "TokenType.GoesTo", - ">", "TokenType.Greater", - ">=", "TokenType.GreaterEqual", - "#", "TokenType.Hash", - "&&", "TokenType.LogicAnd", - "{", "TokenType.LBrace", - "[", "TokenType.LBracket", - "<", "TokenType.Less", - "<=", "TokenType.LessEqual", - "<>=", "TokenType.LessEqualGreater", - "<>", "TokenType.LessOrGreater", - "||", "TokenType.LogicOr", - "(", "TokenType.LParen", - "-", "TokenType.Minus", - "-=", "TokenType.MinusEquals", - "%", "TokenType.Mod", - "%=", "TokenType.ModEquals", - "*=", "TokenType.MulEquals", - "!", "TokenType.Not", - "!=", "TokenType.NotEquals", - "!>", "TokenType.NotGreater", - "!>=", "TokenType.NotGreaterEqual", - "!<", "TokenType.NotLess", - "!<=", "TokenType.NotLessEqual", - "!<>", "TokenType.NotLessEqualGreater", - "+", "TokenType.Plus", - "+=", "TokenType.PlusEquals", - "^^", "TokenType.Pow", - "^^=", "TokenType.PowEquals", - "}", "TokenType.RBrace", - "]", "TokenType.RBracket", - ")", "TokenType.RParen", - ";", "TokenType.Semicolon", - "<<", "TokenType.ShiftLeft", - "<<=", "TokenType.ShiftLeftEqual", - ">>", "TokenType.ShiftRight", - ">>=", "TokenType.ShiftRightEqual", - "..", "TokenType.Slice", - "*", "TokenType.Star", - "?", "TokenType.Ternary", - "~", "TokenType.Tilde", - "--", "TokenType.Decrement", - "!<>=", "TokenType.Unordered", - ">>>", "TokenType.UnsignedShiftRight", - ">>>=", "TokenType.UnsignedShiftRightEqual", - "++", "TokenType.Increment", - "...", "TokenType.Vararg", - "^", "TokenType.Xor", - "^=", "TokenType.XorEquals", - )); - case '0': .. case '9': - currentToken = lexNumber(inputString, endIndex); - break; - case '/': - ++endIndex; - if (isEoF(inputString, endIndex)) - { - currentToken.value = "/"; - currentToken.type = TokenType.Div; - currentToken.lineNumber = lineNumber; - break; - } - currentToken.lineNumber = lineNumber; - switch (inputString[endIndex]) - { - case '/': - case '+': - case '*': - if (iterationStyle == IterationStyle.CODE_ONLY) - { - lexComment(inputString, endIndex, lineNumber); - continue; - } - else - { - currentToken.value = lexComment(inputString, endIndex, lineNumber); - currentToken.type = TokenType.Comment; - break; - } - case '=': - currentToken.value = "/="; - currentToken.type = TokenType.DivEquals; - ++endIndex; - break; - default: - currentToken.value = "/"; - currentToken.type = TokenType.Div; - break; - } - break; - case 'r': - ++endIndex; - if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') - goto default; - currentToken.lineNumber = lineNumber; - currentToken.value = lexString(inputString, endIndex, - lineNumber, inputString[endIndex], false); - currentToken.type = TokenType.StringLiteral; - break; - case '`': - currentToken.lineNumber = lineNumber; - currentToken.value = lexString(inputString, endIndex, lineNumber, - inputString[endIndex], false); - currentToken.type = TokenType.StringLiteral; - break; - case 'x': - ++endIndex; - if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') - goto default; - else - goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings - case '\'': - case '"': - currentToken.lineNumber = lineNumber; - currentToken.value = lexString(inputString, endIndex, lineNumber, - inputString[endIndex]); - currentToken.type = TokenType.StringLiteral; - break; - case 'q': - currentToken.value = "q"; - ++endIndex; - if (!isEoF(inputString, endIndex)) - { - switch (inputString[endIndex]) - { - case '"': - currentToken.lineNumber = lineNumber; - currentToken.value ~= lexDelimitedString(inputString, - endIndex, lineNumber); - currentToken.type = TokenType.StringLiteral; - break outerSwitch; - case '{': - currentToken.lineNumber = lineNumber; - currentToken.value ~= lexTokenString(inputString, - endIndex, lineNumber); - currentToken.type = TokenType.StringLiteral; - break outerSwitch; - default: - break; - } - } - goto default; - case '@': - ++endIndex; - goto default; - default: - while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex])) - ++endIndex; - currentToken.value = inputString[startIndex .. endIndex]; - currentToken.type = lookupTokenTypeOptimized(currentToken.value); - //currentToken.type = lookupTokenType(currentToken.value); - currentToken.lineNumber = lineNumber; - break; - } - //stderr.writeln(currentToken); - tokenAppender.put(currentToken); - - // This should never happen. - if (endIndex <= prevIndex) - { - stderr.writeln("FAIL"); - return []; - } - } - return tokenAppender.data; -} +///** +// * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF", +// * and q"<>". +// * Params: +// * inputString = the source code to examine +// * endIndex = an index into inputString at the opening quote +// * lineNumber = the line number that corresponds to endIndex +// * Returns: a string literal, including its opening and closing quote characters +// */ +//string lexDelimitedString(S)(ref S inputString, ref size_t endIndex, +// ref uint lineNumber) if (isSomeString!S) +//{ +// auto startIndex = endIndex; +// ++endIndex; +// assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF? +// string open = inputString[endIndex .. endIndex + 1]; +// string close; +// bool nesting = false; +// switch (open[0]) +// { +// case '[': close = "]"; ++endIndex; nesting = true; break; +// case '<': close = ">"; ++endIndex; nesting = true; break; +// case '{': close = "}"; ++endIndex; nesting = true; break; +// case '(': close = ")"; ++endIndex; nesting = true; break; +// default: +// while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex])) +// endIndex++; +// close = open = inputString[startIndex + 1 .. endIndex]; +// break; +// } +// int depth = 1; +// while (!isEoF(inputString, endIndex) && depth > 0) +// { +// if (inputString[endIndex] == '\n') +// { +// lineNumber++; +// endIndex++; +// } +// else if (inputString[endIndex..$].startsWith(open)) +// { +// endIndex += open.length; +// if (!nesting && !isEoF(inputString, endIndex)) +// { +// if (inputString[endIndex] == '"') +// ++endIndex; +// break; +// } +// depth++; +// } +// else if (inputString[endIndex..$].startsWith(close)) +// { +// endIndex += close.length; +// depth--; +// if (depth <= 0) +// break; +// } +// else +// ++endIndex; +// } +// if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"') +// ++endIndex; +// return inputString[startIndex .. endIndex]; +//} +// +// +///** +// * TODO: Fix this +// */ +//string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber) +//{ +// /+auto r = byDToken(range, IterationStyle.EVERYTHING); +// string s = getBraceContent(r); +// range.popFrontN(s.length); +// return s;+/ +// return ""; +//} +// +//pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex) +// if (isSomeString!S) +//{ +// Token token; +// token.startIndex = endIndex; +// size_t startIndex = endIndex; +// if (inputString[endIndex] == '0') +// { +// endIndex++; +// if (isEoF(inputString, endIndex)) +// { +// token.type = TokenType.IntLiteral; +// token.value = inputString[startIndex .. endIndex]; +// return token; +// } +// switch (inputString[endIndex]) +// { +// case '0': .. case '9': +// // The current language spec doesn't cover octal literals, so this +// // is decimal. +// lexDecimal(inputString, startIndex, endIndex, token); +// return token; +// case 'b': +// case 'B': +// lexBinary(inputString, startIndex, ++endIndex, token); +// return token; +// case 'x': +// case 'X': +// lexHex(inputString, startIndex, ++endIndex, token); +// return token; +// default: +// token.type = TokenType.IntLiteral; +// token.value = inputString[startIndex .. endIndex]; +// return token; +// } +// } +// else +// { +// lexDecimal(inputString, startIndex, endIndex, token); +// return token; +// } +//} +// +//pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex, +// ref size_t endIndex, ref Token token) if (isSomeString!S) +//{ +// bool lexingSuffix = false; +// bool isLong = false; +// bool isUnsigned = false; +// token.type = TokenType.IntLiteral; +// binaryLoop: while (!isEoF(inputString, endIndex)) +// { +// switch (inputString[endIndex]) +// { +// case '0': +// case '1': +// case '_': +// if (lexingSuffix) +// break binaryLoop; +// ++endIndex; +// break; +// case 'u': +// case 'U': +// if (isUnsigned) +// break; +// ++endIndex; +// lexingSuffix = true; +// if (isLong) +// { +// token.type = TokenType.UnsignedLongLiteral; +// break binaryLoop; +// } +// else +// token.type = TokenType.UnsignedIntLiteral; +// isUnsigned = true; +// break; +// case 'L': +// if (isLong) +// break binaryLoop; +// ++endIndex; +// lexingSuffix = true; +// if (isUnsigned) +// { +// token.type = TokenType.UnsignedLongLiteral; +// break binaryLoop; +// } +// else +// token.type = TokenType.LongLiteral; +// isLong = true; +// break; +// default: +// break binaryLoop; +// } +// } +// +// token.value = inputString[startIndex .. endIndex]; +//} +// +//pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex, +// ref size_t endIndex, ref Token token) if (isSomeString!S) +//{ +// bool lexingSuffix = false; +// bool isLong = false; +// bool isUnsigned = false; +// bool isFloat = false; +// bool isReal = false; +// bool isDouble = false; +// bool foundDot = false; +// bool foundE = false; +// bool foundPlusMinus = false; +// token.type = TokenType.IntLiteral; +// decimalLoop: while (!isEoF(inputString, endIndex)) +// { +// switch (inputString[endIndex]) +// { +// case '0': .. case '9': +// case '_': +// if (lexingSuffix) +// break decimalLoop; +// ++endIndex; +// break; +// case 'e': +// case 'E': +// // For this to be a valid exponent, the next character must be a +// // decimal character or a sign +// if (foundE || isEoF(inputString, endIndex + 1)) +// break decimalLoop; +// switch (inputString[endIndex + 1]) +// { +// case '+': +// case '-': +// if (isEoF(inputString, endIndex + 2) +// || inputString[endIndex + 2] < '0' +// || inputString[endIndex + 2] > '9') +// { +// break decimalLoop; +// } +// break; +// case '0': .. case '9': +// break; +// default: +// break decimalLoop; +// } +// ++endIndex; +// foundE = true; +// isDouble = true; +// token.type = TokenType.DoubleLiteral; +// break; +// case '+': +// case '-': +// if (foundPlusMinus || !foundE) +// break decimalLoop; +// foundPlusMinus = true; +// ++endIndex; +// break; +// case '.': +// if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') +// break decimalLoop; // possibly slice expression +// if (foundDot) +// break decimalLoop; // two dots with other characters between them +// ++endIndex; +// foundDot = true; +// token.type = TokenType.DoubleLiteral; +// isDouble = true; +// break; +// case 'u': +// case 'U': +// if (isUnsigned) +// break decimalLoop; +// ++endIndex; +// lexingSuffix = true; +// if (isLong) +// token.type = TokenType.UnsignedLongLiteral; +// else +// token.type = TokenType.UnsignedIntLiteral; +// isUnsigned = true; +// break; +// case 'L': +// if (isLong) +// break decimalLoop; +// if (isReal) +// break decimalLoop; +// ++endIndex; +// lexingSuffix = true; +// if (isDouble) +// token.type = TokenType.RealLiteral; +// else if (isUnsigned) +// token.type = TokenType.UnsignedLongLiteral; +// else +// token.type = TokenType.LongLiteral; +// isLong = true; +// break; +// case 'f': +// case 'F': +// lexingSuffix = true; +// if (isUnsigned || isLong) +// break decimalLoop; +// ++endIndex; +// token.type = TokenType.FloatLiteral; +// break decimalLoop; +// case 'i': +// ++endIndex; +// // Spec says that this is the last suffix, so all cases break the +// // loop. +// if (isDouble) +// { +// token.type = TokenType.Idouble; +// break decimalLoop; +// } +// else if (isFloat) +// { +// token.type = TokenType.Ifloat; +// break decimalLoop; +// } +// else if (isReal) +// { +// token.type = TokenType.Ireal; +// break decimalLoop; +// } +// else +// { +// // There is no imaginary int +// --endIndex; +// break decimalLoop; +// } +// default: +// break decimalLoop; +// } +// } +// +// token.value = inputString[startIndex .. endIndex]; +//} +// +// +//unittest { +// Token t; +// size_t start, end; +// lexDecimal!string("55e-4", start, end, t); +// assert(t.value == "55e-4"); +// assert(t.type == TokenType.DoubleLiteral); +// +// start = end = 0; +// lexDecimal!string("123.45f", start, end, t); +// assert(t.value == "123.45f"); +// assert(t.type == TokenType.FloatLiteral); +// +// start = end = 0; +// lexDecimal!string("3e+f", start, end, t); +// assert(t.value == "3"); +// assert(t.type == TokenType.IntLiteral); +// +// start = end = 0; +// lexDecimal!string("3e++f", start, end, t); +// assert(t.value == "3"); +// assert(t.type == TokenType.IntLiteral); +// +// start = end = 0; +// lexDecimal!string("1234..1237", start, end, t); +// assert(t.value == "1234"); +// assert(t.type == TokenType.IntLiteral); +//} +// +// +//nothrow void lexHex(S)(ref S inputString, ref size_t startIndex, +// ref size_t endIndex, ref Token token) if (isSomeString!S) +//{ +// bool lexingSuffix = false; +// bool isLong = false; +// bool isUnsigned = false; +// bool isFloat = false; +// bool isReal = false; +// bool isDouble = false; +// bool foundDot = false; +// bool foundE = false; +// bool foundPlusMinus = false; +// token.type = TokenType.IntLiteral; +// hexLoop: while (!isEoF(inputString, endIndex)) +// { +// switch (inputString[endIndex]) +// { +// case '0': .. case '9': +// case 'a': .. case 'f': +// case 'A': .. case 'F': +// case '_': +// if (lexingSuffix) +// break hexLoop; +// ++endIndex; +// break; +// case 'p': +// case 'P': +// if (foundE) +// break hexLoop; +// ++endIndex; +// foundE = true; +// break; +// case '+': +// case '-': +// if (foundPlusMinus || !foundE) +// break hexLoop; +// foundPlusMinus = true; +// ++endIndex; +// break; +// case '.': +// if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.') +// break hexLoop; // possibly slice expression +// if (foundDot) +// break hexLoop; // two dots with other characters between them +// ++endIndex; +// foundDot = true; +// token.type = TokenType.DoubleLiteral; +// isDouble = true; +// break; +// default: +// break hexLoop; +// } +// } +// +// token.value = inputString[startIndex .. endIndex]; +//} +// +//unittest +//{ +// Token t; +// size_t start, end; +// start = 0; +// end = 2; +// lexHex!string("0x193abfq", start, end, t); +// assert(t.value == "0x193abf", t.value); +// assert(t.type == TokenType.IntLiteral); +// +// start = 0; +// end = 2; +// lexHex!string("0x2130xabc", start, end, t); +// assert(t.value == "0x2130"); +// assert(t.type == TokenType.IntLiteral); +// +//} +// +///** +// * Returns: true if ch marks the ending of one token and the beginning of +// * another, false otherwise +// */ +//pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C) +//{ +// switch (ch) +// { +// case '!': .. case '/': +// case ':': .. case '@': +// case '[': .. case '^': +// case '{': .. case '~': +// case 0x20: // space +// case 0x09: // tab +// case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return +// return true; +// default: +// return false; +// } +//} +// +///** +// * Configure the tokenize() function +// */ +//enum IterationStyle +//{ +// /// Only include code, not whitespace or comments +// CODE_ONLY, +// /// Include everything +// EVERYTHING +//} +// +//struct TokenRange(R) if (isInputRange(R)) +//{ +// bool empty() const @property +// { +// return _empty; +// } +// +// +//private: +// R range; +// bool _empty; +//} +// +//Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY) +// if (isSomeString!S) +//{ +// auto tokenAppender = appender!(Token[])(); +// +// // This is very likely a local maximum, but it does seem to take a few +// // milliseconds off of the run time +// tokenAppender.reserve(inputString.length / 4); +// +// size_t endIndex = 0; +// uint lineNumber = 1; +// +// if (inputString.length > 1 && inputString[0..2] == "#!") +// { +// Token currentToken; +// currentToken.lineNumber = lineNumber; // lineNumber is always 1 +// currentToken.value = lexScriptLine(inputString, endIndex, lineNumber); +// currentToken.type = TokenType.ScriptLine; +// } +// +// while (!isEoF(inputString, endIndex)) +// { +// size_t prevIndex = endIndex; +// Token currentToken; +// auto startIndex = endIndex; +// if (isWhite(inputString[endIndex])) +// { +// if (iterationStyle == IterationStyle.EVERYTHING) +// { +// currentToken.lineNumber = lineNumber; +// currentToken.value = lexWhitespace(inputString, endIndex, +// lineNumber); +// currentToken.type = TokenType.Whitespace; +// tokenAppender.put(currentToken); +// } +// else +// lexWhitespace(inputString, endIndex, lineNumber); +// continue; +// } +// currentToken.startIndex = endIndex; +// +// outerSwitch: switch(inputString[endIndex]) +// { +// mixin(generateCaseTrie( +// "=", "TokenType.Assign", +// "&", "TokenType.BitAnd", +// "&=", "TokenType.BitAndEquals", +// "|", "TokenType.BitOr", +// "|=", "TokenType.BitOrEquals", +// "~=", "TokenType.CatEquals", +// ":", "TokenType.Colon", +// ",", "TokenType.Comma", +// "$", "TokenType.Dollar", +// ".", "TokenType.Dot", +// "==", "TokenType.Equals", +// "=>", "TokenType.GoesTo", +// ">", "TokenType.Greater", +// ">=", "TokenType.GreaterEqual", +// "#", "TokenType.Hash", +// "&&", "TokenType.LogicAnd", +// "{", "TokenType.LBrace", +// "[", "TokenType.LBracket", +// "<", "TokenType.Less", +// "<=", "TokenType.LessEqual", +// "<>=", "TokenType.LessEqualGreater", +// "<>", "TokenType.LessOrGreater", +// "||", "TokenType.LogicOr", +// "(", "TokenType.LParen", +// "-", "TokenType.Minus", +// "-=", "TokenType.MinusEquals", +// "%", "TokenType.Mod", +// "%=", "TokenType.ModEquals", +// "*=", "TokenType.MulEquals", +// "!", "TokenType.Not", +// "!=", "TokenType.NotEquals", +// "!>", "TokenType.NotGreater", +// "!>=", "TokenType.NotGreaterEqual", +// "!<", "TokenType.NotLess", +// "!<=", "TokenType.NotLessEqual", +// "!<>", "TokenType.NotLessEqualGreater", +// "+", "TokenType.Plus", +// "+=", "TokenType.PlusEquals", +// "^^", "TokenType.Pow", +// "^^=", "TokenType.PowEquals", +// "}", "TokenType.RBrace", +// "]", "TokenType.RBracket", +// ")", "TokenType.RParen", +// ";", "TokenType.Semicolon", +// "<<", "TokenType.ShiftLeft", +// "<<=", "TokenType.ShiftLeftEqual", +// ">>", "TokenType.ShiftRight", +// ">>=", "TokenType.ShiftRightEqual", +// "..", "TokenType.Slice", +// "*", "TokenType.Star", +// "?", "TokenType.Ternary", +// "~", "TokenType.Tilde", +// "--", "TokenType.Decrement", +// "!<>=", "TokenType.Unordered", +// ">>>", "TokenType.UnsignedShiftRight", +// ">>>=", "TokenType.UnsignedShiftRightEqual", +// "++", "TokenType.Increment", +// "...", "TokenType.Vararg", +// "^", "TokenType.Xor", +// "^=", "TokenType.XorEquals", +// )); +// case '0': .. case '9': +// currentToken = lexNumber(inputString, endIndex); +// break; +// case '/': +// ++endIndex; +// if (isEoF(inputString, endIndex)) +// { +// currentToken.value = "/"; +// currentToken.type = TokenType.Div; +// currentToken.lineNumber = lineNumber; +// break; +// } +// currentToken.lineNumber = lineNumber; +// switch (inputString[endIndex]) +// { +// case '/': +// case '+': +// case '*': +// if (iterationStyle == IterationStyle.CODE_ONLY) +// { +// lexComment(inputString, endIndex, lineNumber); +// continue; +// } +// else +// { +// currentToken.value = lexComment(inputString, endIndex, lineNumber); +// currentToken.type = TokenType.Comment; +// break; +// } +// case '=': +// currentToken.value = "/="; +// currentToken.type = TokenType.DivEquals; +// ++endIndex; +// break; +// default: +// currentToken.value = "/"; +// currentToken.type = TokenType.Div; +// break; +// } +// break; +// case 'r': +// ++endIndex; +// if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') +// goto default; +// currentToken.lineNumber = lineNumber; +// currentToken.value = lexString(inputString, endIndex, +// lineNumber, inputString[endIndex], false); +// currentToken.type = TokenType.StringLiteral; +// break; +// case '`': +// currentToken.lineNumber = lineNumber; +// currentToken.value = lexString(inputString, endIndex, lineNumber, +// inputString[endIndex], false); +// currentToken.type = TokenType.StringLiteral; +// break; +// case 'x': +// ++endIndex; +// if (isEoF(inputString, endIndex) || inputString[endIndex] != '"') +// goto default; +// else +// goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings +// case '\'': +// case '"': +// currentToken.lineNumber = lineNumber; +// currentToken.value = lexString(inputString, endIndex, lineNumber, +// inputString[endIndex]); +// currentToken.type = TokenType.StringLiteral; +// break; +// case 'q': +// currentToken.value = "q"; +// ++endIndex; +// if (!isEoF(inputString, endIndex)) +// { +// switch (inputString[endIndex]) +// { +// case '"': +// currentToken.lineNumber = lineNumber; +// currentToken.value ~= lexDelimitedString(inputString, +// endIndex, lineNumber); +// currentToken.type = TokenType.StringLiteral; +// break outerSwitch; +// case '{': +// currentToken.lineNumber = lineNumber; +// currentToken.value ~= lexTokenString(inputString, +// endIndex, lineNumber); +// currentToken.type = TokenType.StringLiteral; +// break outerSwitch; +// default: +// break; +// } +// } +// goto default; +// case '@': +// ++endIndex; +// goto default; +// default: +// while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex])) +// ++endIndex; +// currentToken.value = inputString[startIndex .. endIndex]; +// currentToken.type = lookupTokenTypeOptimized(currentToken.value); +// //currentToken.type = lookupTokenType(currentToken.value); +// currentToken.lineNumber = lineNumber; +// break; +// } +// //stderr.writeln(currentToken); +// tokenAppender.put(currentToken); +// +// // This should never happen. +// if (endIndex <= prevIndex) +// { +// stderr.writeln("FAIL"); +// return []; +// } +// } +// return tokenAppender.data; +//}