D-Scanner/tokenizer.d

773 lines
20 KiB
D

// Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module tokenizer;
import std.range;
import std.file;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.stdio;
import langutils;
import codegen;
/**
* Increments endIndex until it indexes a non-whitespace character in
* inputString.
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString
* lineNumber = the line number that corresponds to endIndex
* style = the code iteration style
* Returns: The whitespace, or null if style was CODE_ONLY
*/
pure nothrow string lexWhitespace(S)(S inputString, ref size_t endIndex,
ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY) // I suggest to remove the last param
if (isSomeString!S)
{
immutable startIndex = endIndex;
while (!isEoF(inputString, endIndex) && isWhite(inputString[endIndex]))
{
if (inputString[endIndex] == '\n')
lineNumber++;
++endIndex;
}
final switch (style)
{
case IterationStyle.EVERYTHING:
return inputString[startIndex .. endIndex];
case IterationStyle.CODE_ONLY:
return null;
}
}
/**
* If inputString starts from #!, increments endIndex until it indexes the next line.
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString
* lineNumber = the line number that corresponds to endIndex
* Returns: The script line, or null if this inputString doesn't start from script line
*/
pure nothrow string lexScriptLine(S)(ref S inputString, ref size_t endIndex,
ref uint lineNumber) if (isSomeString!S)
{
auto startIndex = endIndex; // in current implementation endIndex is 0, but that could change (e.g., if BOM is not stripped from inputString)
string result = null;
if(inputString.length > 1 && inputString[0..2] == "#!") // safety check
{
endIndex = 2; // skip #!
while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
++endIndex;
result = inputString[startIndex..endIndex];
++lineNumber;
}
return result;
}
/**
* Increments endIndex until it indexes a character directly after a comment
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString at the second character of a
* comment, i.e. points at the second slash in a // comment.
* lineNumber = the line number that corresponds to endIndex
* Returns: The comment
*/
pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
ref uint lineNumber) if (isSomeString!S)
{
if (isEoF(inputString, endIndex))
return "";
auto startIndex = endIndex - 1;
switch(inputString[endIndex])
{
case '/':
while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
{
if (inputString[endIndex] == '\n')
++lineNumber;
++endIndex;
}
break;
case '*':
while (!isEoF(inputString, endIndex)
&& !inputString[endIndex..$].startsWith("*/"))
{
if (inputString[endIndex] == '\n')
++lineNumber;
++endIndex;
}
endIndex += 2;
break;
case '+':
++endIndex;
int depth = 1;
while (depth > 0 && !isEoF(inputString, endIndex))
{
if (inputString[endIndex] == '\n')
lineNumber++;
else if (inputString[endIndex..$].startsWith("+/"))
depth--;
else if (inputString[endIndex..$].startsWith("/+"))
depth++;
++endIndex;
}
if (!isEoF(inputString, endIndex))
++endIndex;
break;
default:
break;
}
return inputString[startIndex..endIndex];
}
/**
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString at the opening quote
* lineNumber = the line number that corresponds to endIndex
* quote = the opening (and closing) quote character for the string to be
* lexed
* Returns: a string literal, including its opening and closing quote characters
* Bugs: Does not handle string suffixes
*/
pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber,
C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C)
in
{
assert (inputString[endIndex] == quote);
assert (quote == '\'' || quote == '"' || quote == '`');
}
body
{
if (inputString[endIndex] != quote)
return "";
auto startIndex = endIndex;
++endIndex;
bool escape = false;
while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape))
{
if (escape)
escape = false;
else
escape = (canEscape && inputString[endIndex] == '\\');
if (inputString[endIndex] == '\n')
lineNumber++;
++endIndex;
}
++endIndex;
if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
{
++endIndex;
}
return inputString[startIndex .. endIndex];
}
/**
* Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
* and q"<>".
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString at the opening quote
* lineNumber = the line number that corresponds to endIndex
* Returns: a string literal, including its opening and closing quote characters
*/
string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
ref uint lineNumber) if (isSomeString!S)
{
auto startIndex = endIndex;
++endIndex;
assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF?
string open = inputString[endIndex .. endIndex + 1];
string close;
bool nesting = false;
switch (open[0])
{
case '[': close = "]"; ++endIndex; nesting = true; break;
case '<': close = ">"; ++endIndex; nesting = true; break;
case '{': close = "}"; ++endIndex; nesting = true; break;
case '(': close = ")"; ++endIndex; nesting = true; break;
default:
while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
endIndex++;
close = open = inputString[startIndex + 1 .. endIndex];
break;
}
int depth = 1;
while (!isEoF(inputString, endIndex) && depth > 0)
{
if (inputString[endIndex] == '\n')
{
lineNumber++;
endIndex++;
}
else if (inputString[endIndex..$].startsWith(open))
{
endIndex += open.length;
if (!nesting && !isEoF(inputString, endIndex))
{
if (inputString[endIndex] == '"')
++endIndex;
break;
}
depth++;
}
else if (inputString[endIndex..$].startsWith(close))
{
endIndex += close.length;
depth--;
if (depth <= 0)
break;
}
else
++endIndex;
}
if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
++endIndex;
return inputString[startIndex .. endIndex];
}
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
/+auto r = byDToken(range, IterationStyle.EVERYTHING);
string s = getBraceContent(r);
range.popFrontN(s.length);
return s;+/
return "";
}
pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
if (isSomeString!S)
{
Token token;
size_t startIndex = endIndex;
if (inputString[endIndex] == '0')
{
endIndex++;
if (isEoF(inputString, endIndex))
{
token.type = TokenType.intLiteral;
token.value = inputString[startIndex .. endIndex];
return token;
}
switch (inputString[endIndex])
{
case '0': .. case '9':
// The current language spec doesn't cover octal literals, so this
// is decimal.
lexDecimal(inputString, startIndex, endIndex, token);
return token;
case 'b':
case 'B':
lexBinary(inputString, startIndex, ++endIndex, token);
return token;
case 'x':
case 'X':
lexHex(inputString, startIndex, ++endIndex, token);
return token;
default:
token.type = TokenType.intLiteral;
token.value = inputString[startIndex .. endIndex];
return token;
}
}
else
{
lexDecimal(inputString, startIndex, endIndex, token);
return token;
}
}
pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
ref size_t endIndex, ref Token token) if (isSomeString!S)
{
bool lexingSuffix = false;
bool isLong = false;
bool isUnsigned = false;
token.type = TokenType.intLiteral;
binaryLoop: while (!isEoF(inputString, endIndex))
{
switch (inputString[endIndex])
{
case '0':
case '1':
case '_':
if (lexingSuffix)
break binaryLoop;
++endIndex;
break;
case 'u':
case 'U':
if (isUnsigned)
break;
++endIndex;
lexingSuffix = true;
if (isLong)
{
token.type = TokenType.unsignedLongLiteral;
break binaryLoop;
}
else
token.type = TokenType.unsignedIntLiteral;
isUnsigned = true;
break;
case 'L':
if (isLong)
break binaryLoop;
++endIndex;
lexingSuffix = true;
if (isUnsigned)
{
token.type = TokenType.unsignedLongLiteral;
break binaryLoop;
}
else
token.type = TokenType.longLiteral;
isLong = true;
break;
default:
break binaryLoop;
}
}
token.value = inputString[startIndex .. endIndex];
}
pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
ref size_t endIndex, ref Token token) if (isSomeString!S)
{
bool lexingSuffix = false;
bool isLong = false;
bool isUnsigned = false;
bool isFloat = false;
bool isReal = false;
bool isDouble = false;
bool foundDot = false;
bool foundE = false;
bool foundPlusMinus = false;
token.type = TokenType.intLiteral;
decimalLoop: while (!isEoF(inputString, endIndex))
{
switch (inputString[endIndex])
{
case '0': .. case '9':
case '_':
if (lexingSuffix)
break decimalLoop;
++endIndex;
break;
case 'e':
case 'E':
if (foundE)
break decimalLoop;
++endIndex;
foundE = true;
break;
case '+':
case '-':
if (foundPlusMinus || !foundE)
break decimalLoop;
foundPlusMinus = true;
++endIndex;
break;
case '.':
if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
break decimalLoop; // possibly slice expression
if (foundDot)
break decimalLoop; // two dots with other characters between them
++endIndex;
foundDot = true;
token.type = TokenType.doubleLiteral;
isDouble = true;
break;
case 'u':
case 'U':
if (isUnsigned)
break decimalLoop;
++endIndex;
lexingSuffix = true;
if (isLong)
token.type = TokenType.unsignedLongLiteral;
else
token.type = TokenType.unsignedIntLiteral;
isUnsigned = true;
break;
case 'L':
if (isLong)
break decimalLoop;
if (isReal)
break decimalLoop;
++endIndex;
lexingSuffix = true;
if (isDouble)
token.type = TokenType.realLiteral;
else if (isUnsigned)
token.type = TokenType.unsignedLongLiteral;
else
token.type = TokenType.longLiteral;
isLong = true;
break;
case 'f':
case 'F':
lexingSuffix = true;
if (isUnsigned || isLong)
break decimalLoop;
++endIndex;
token.type = TokenType.floatLiteral;
break decimalLoop;
default:
break decimalLoop;
}
}
// suggest to extract lexing integers into a separate function
// please see unittest below
token.value = inputString[startIndex .. endIndex];
}
unittest {
dump!lexDecimal("55e-4"); // yeilds intLiteral, but should be float
dump!lexDecimal("3e+f"); // floatLiteral, but should be considered invalid
dump!lexDecimal("3e++f"); // intLiteral 3e+, but should be considered invalid
// actually, there are lots of bugs. The point is that without decomposition of integer lexing from floating-point lexing
// it is very hard to prove algorithm correctness
}
// Temporary function to illustrate some problems
// Executes T and dumps results to console
void dump(alias T)(string s) {
size_t start;
size_t end;
Token tok;
T!(string)(s, start, end, tok);
// dump results
writeln(tok.type);
writeln(tok.value);
writeln(start);
writeln(end);
}
nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
ref size_t endIndex, ref Token token) if (isSomeString!S)
{
bool lexingSuffix = false;
bool isLong = false;
bool isUnsigned = false;
bool isFloat = false;
bool isReal = false;
bool isDouble = false;
bool foundDot = false;
bool foundE = false;
bool foundPlusMinus = false;
token.type = TokenType.intLiteral;
hexLoop: while (!isEoF(inputString, endIndex))
{
switch (inputString[endIndex])
{
case '0': .. case '9':
case 'a': .. case 'f':
case 'A': .. case 'F':
case '_':
if (lexingSuffix)
break hexLoop;
++endIndex;
break;
case 'p':
case 'P':
if (foundE)
break hexLoop;
++endIndex;
foundE = true;
break;
case '+':
case '-':
if (foundPlusMinus || !foundE)
break hexLoop;
foundPlusMinus = true;
++endIndex;
break;
case '.':
if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
break hexLoop; // possibly slice expression
if (foundDot)
break hexLoop; // two dots with other characters between them
++endIndex;
foundDot = true;
token.type = TokenType.doubleLiteral;
isDouble = true;
break;
default:
break hexLoop;
}
}
token.value = inputString[startIndex .. endIndex];
}
/**
* Returns: true if ch marks the ending of one token and the beginning of
* another, false otherwise
*/
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
switch (ch)
{
case '!': .. case '/':
case ':': .. case '@':
case '[': .. case '^':
case '{': .. case '~':
case 0x20: // space
case 0x09: // tab
case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return
return true;
default:
return false;
}
}
/**
* Configure the tokenize() function
*/
enum IterationStyle
{
/// Only include code, not whitespace or comments
CODE_ONLY,
/// Include everything
EVERYTHING
}
Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
if (isSomeString!S)
{
auto tokenAppender = appender!(Token[])();
// This is very likely a local maximum, but it does seem to take a few
// milliseconds off of the run time
tokenAppender.reserve(inputString.length / 4);
size_t endIndex = 0;
uint lineNumber = 1;
if (inputString.length > 1 && inputString[0..2] == "#!")
{
Token currentToken;
currentToken.lineNumber = lineNumber; // lineNumber is always 1
currentToken.value = lexScriptLine(inputString, endIndex, lineNumber);
currentToken.type = TokenType.scriptLine;
}
while (!isEoF(inputString, endIndex))
{
size_t prevIndex = endIndex;
Token currentToken;
auto startIndex = endIndex;
if (isWhite(inputString[endIndex]))
{
if (iterationStyle == IterationStyle.EVERYTHING)
{
currentToken.lineNumber = lineNumber;
currentToken.value = lexWhitespace(inputString, endIndex,
lineNumber, IterationStyle.EVERYTHING); // note: I suggest to remove the last parameter to simplify lexWhitespace
currentToken.type = TokenType.whitespace;
tokenAppender.put(currentToken);
}
else
lexWhitespace(inputString, endIndex, lineNumber);
continue;
}
currentToken.startIndex = endIndex;
outerSwitch: switch(inputString[endIndex])
{
mixin(generateCaseTrie(
"=", "TokenType.assign",
"&", "TokenType.bitAnd",
"&=", "TokenType.bitAndEquals",
"|", "TokenType.bitOr",
"|=", "TokenType.bitOrEquals",
"~=", "TokenType.catEquals",
":", "TokenType.colon",
",", "TokenType.comma",
"$", "TokenType.dollar",
".", "TokenType.dot",
"==", "TokenType.equals",
"=>", "TokenType.goesTo",
">", "TokenType.greater",
">=", "TokenType.greaterEqual",
"#", "TokenType.hash",
"&&", "TokenType.lAnd",
"{", "TokenType.lBrace",
"[", "TokenType.lBracket",
"<", "TokenType.less",
"<=", "TokenType.lessEqual",
"<>=", "TokenType.lessEqualGreater",
"<>", "TokenType.lessOrGreater",
"||", "TokenType.lOr",
"(", "TokenType.lParen",
"-", "TokenType.minus",
"-=", "TokenType.minusEquals",
"%", "TokenType.mod",
"%=", "TokenType.modEquals",
"*=", "TokenType.mulEquals",
"!", "TokenType.not",
"!=", "TokenType.notEquals",
"!>", "TokenType.notGreater",
"!>=", "TokenType.notGreaterEqual",
"!<", "TokenType.notLess",
"!<=", "TokenType.notLessEqual",
"!<>", "TokenType.notLessEqualGreater",
"+", "TokenType.plus",
"+=", "TokenType.plusEquals",
"^^", "TokenType.pow",
"^^=", "TokenType.powEquals",
"}", "TokenType.rBrace",
"]", "TokenType.rBracket",
")", "TokenType.rParen",
";", "TokenType.semicolon",
"<<", "TokenType.shiftLeft",
"<<=", "TokenType.shiftLeftEqual",
">>", "TokenType.shiftRight",
">>=", "TokenType.shiftRightEqual",
"..", "TokenType.slice",
"*", "TokenType.star",
"?", "TokenType.ternary",
"~", "TokenType.tilde",
"--", "TokenType.uMinus",
"!<>=", "TokenType.unordered",
">>>", "TokenType.unsignedShiftRight",
">>>=", "TokenType.unsignedShiftRightEqual",
"++", "TokenType.uPlus",
"...", "TokenType.vararg",
"^", "TokenType.xor",
"^=", "TokenType.xorEquals",
));
case '0': .. case '9':
currentToken = lexNumber(inputString, endIndex);
break;
case '/':
++endIndex;
if (isEoF(inputString, endIndex))
{
currentToken.value = "/";
currentToken.type = TokenType.div;
currentToken.lineNumber = lineNumber;
break;
}
currentToken.lineNumber = lineNumber;
switch (inputString[endIndex])
{
case '/':
case '+':
case '*':
if (iterationStyle == IterationStyle.CODE_ONLY)
{
lexComment(inputString, endIndex, lineNumber);
continue;
}
else
{
currentToken.value = lexComment(inputString, endIndex, lineNumber);
currentToken.type = TokenType.comment;
break;
}
case '=':
currentToken.value = "/=";
currentToken.type = TokenType.divEquals;
++endIndex;
break;
default:
currentToken.value = "/";
currentToken.type = TokenType.div;
break;
}
break;
case 'r':
++endIndex;
if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
goto default;
currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex,
lineNumber, inputString[endIndex], false);
currentToken.type = TokenType.stringLiteral;
break;
case '`':
currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex], false);
currentToken.type = TokenType.stringLiteral;
break;
case 'x':
++endIndex;
if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
goto default;
else
goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
case '\'':
case '"':
currentToken.lineNumber = lineNumber;
currentToken.value = lexString(inputString, endIndex, lineNumber,
inputString[endIndex]);
currentToken.type = TokenType.stringLiteral;
break;
case 'q':
currentToken.value = "q";
++endIndex;
if (!isEoF(inputString, endIndex))
{
switch (inputString[endIndex])
{
case '"':
currentToken.lineNumber = lineNumber;
currentToken.value ~= lexDelimitedString(inputString,
endIndex, lineNumber);
currentToken.type = TokenType.stringLiteral;
break outerSwitch;
case '{':
currentToken.lineNumber = lineNumber;
currentToken.value ~= lexTokenString(inputString,
endIndex, lineNumber);
currentToken.type = TokenType.stringLiteral;
break outerSwitch;
default:
break;
}
}
goto default;
case '@':
++endIndex;
goto default;
default:
while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
++endIndex;
currentToken.value = inputString[startIndex .. endIndex];
currentToken.type = lookupTokenType(currentToken.value);
currentToken.lineNumber = lineNumber;
break;
}
//stderr.writeln(currentToken);
tokenAppender.put(currentToken);
// This should never happen.
if (endIndex <= prevIndex)
{
stderr.writeln("FAIL");
return [];
}
}
return tokenAppender.data;
}