772 lines
20 KiB
D
772 lines
20 KiB
D
|
|
// Copyright Brian Schott (Sir Alaran) 2012.
|
|
// Distributed under the Boost Software License, Version 1.0.
|
|
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
module tokenizer;
|
|
|
|
import std.range;
|
|
import std.file;
|
|
import std.traits;
|
|
import std.algorithm;
|
|
import std.conv;
|
|
import std.uni;
|
|
import std.stdio;
|
|
|
|
import langutils;
|
|
import codegen;
|
|
|
|
|
|
/**
 * Skips a run of whitespace: advances endIndex while it indexes a whitespace
 * character of inputString, counting every '\n' that is passed.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString; on return it indexes the first
 *         non-whitespace character, or the end of the input
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         once per newline skipped
 *     style = the code iteration style
 * Returns: The skipped whitespace if style is EVERYTHING, or null if style is
 *     CODE_ONLY
 */
pure nothrow string lexWhitespace(S)(S inputString, ref size_t endIndex,
	ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY)
	if (isSomeString!S)
{
	immutable firstIndex = endIndex;
	for (; !isEoF(inputString, endIndex) && isWhite(inputString[endIndex]); ++endIndex)
	{
		if (inputString[endIndex] == '\n')
			++lineNumber;
	}
	// Only callers that keep non-code tokens need the text itself.
	if (style == IterationStyle.EVERYTHING)
		return inputString[firstIndex .. endIndex];
	return null;
}
|
|
|
|
/**
 * Lexes a script ("#!") line. If the text at endIndex starts with #!,
 * advances endIndex to the end of that line, i.e. to the terminating newline
 * or to the end of the input when there is no newline.
 *
 * The newline itself is not consumed and lineNumber is not changed: the
 * newline is left for the whitespace lexer, which counts it. Counting it here
 * as well would make every following line number one too large.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString; left untouched when no script
 *         line starts there
 *     lineNumber = the line number that corresponds to endIndex (not modified)
 * Returns: The script line without its newline, or null if the text at
 *     endIndex does not start a script line
 */
pure nothrow string lexScriptLine(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	// endIndex is 0 for the current caller, but everything is kept relative to
	// it so the function keeps working if a prefix (e.g. a BOM) is skipped first.
	immutable startIndex = endIndex;
	if (inputString.length < 2 || startIndex > inputString.length - 2
		|| inputString[startIndex .. startIndex + 2] != "#!")
	{
		return null;
	}

	endIndex = startIndex + 2; // skip #!
	while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
		++endIndex;

	return inputString[startIndex .. endIndex];
}
|
|
|
|
/**
 * Increments endIndex until it indexes the character directly after a comment.
 *
 * Three comment forms are recognised from the character at endIndex:
 * '/' (line comment), '*' (block comment) and '+' (nesting block comment).
 * Any other character yields just the preceding slash.
 * An unterminated block comment ends at the end of the input.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the second character of a
 *         comment, i.e. points at the second slash in a // comment. Must be
 *         at least 1, because the comment starts at endIndex - 1
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         for every newline inside a block comment
 * Returns: The comment, or "" if endIndex is already at the end of the input
 */
pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	if (isEoF(inputString, endIndex))
		return "";
	auto startIndex = endIndex - 1;
	switch (inputString[endIndex])
	{
	case '/':
		// The terminating newline is not part of the comment; it is left in
		// place (and uncounted) for the whitespace lexer.
		while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
			++endIndex;
		break;
	case '*':
		// Step over the '*' of the opener first, otherwise "/*/" would be
		// mistaken for a complete comment.
		++endIndex;
		while (!isEoF(inputString, endIndex)
			&& !inputString[endIndex .. $].startsWith("*/"))
		{
			if (inputString[endIndex] == '\n')
				++lineNumber;
			++endIndex;
		}
		// Only skip the closer if one was actually found; an unterminated
		// comment must not push endIndex past the end of the input.
		if (!isEoF(inputString, endIndex))
			endIndex += 2;
		break;
	case '+':
		++endIndex;
		int depth = 1;
		while (depth > 0 && !isEoF(inputString, endIndex))
		{
			if (inputString[endIndex] == '\n')
			{
				++lineNumber;
				++endIndex;
			}
			else if (inputString[endIndex .. $].startsWith("+/"))
			{
				// Consume both characters so the '/' cannot be re-read as
				// the start of a nested "/+".
				--depth;
				endIndex += 2;
			}
			else if (inputString[endIndex .. $].startsWith("/+"))
			{
				// Likewise, the '+' must not be re-read as the start of "+/".
				++depth;
				endIndex += 2;
			}
			else
				++endIndex;
		}
		break;
	default:
		break;
	}
	return inputString[startIndex .. endIndex];
}
|
|
|
|
|
|
/**
 * Lexes a quoted literal that starts at endIndex and advances endIndex past
 * it, including an optional 'c', 'w' or 'd' suffix after the closing quote.
 * An unterminated literal ends at the end of the input.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         for every newline inside the literal
 *     quote = the opening (and closing) quote character for the string to be
 *         lexed
 *     canEscape = whether a backslash escapes the character that follows it
 *         (so an escaped quote does not end the literal)
 * Returns: a string literal, including its opening and closing quote
 *     characters, or "" if endIndex does not index the quote character
 */
pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber,
	C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C)
in
{
	assert (inputString[endIndex] == quote);
	assert (quote == '\'' || quote == '"' || quote == '`');
}
body
{
	// Repeated outside the contract so that builds without contracts stay safe.
	if (inputString[endIndex] != quote)
		return "";
	auto startIndex = endIndex;
	++endIndex;
	bool escape = false;
	while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape))
	{
		if (escape)
			escape = false;
		else
			escape = (canEscape && inputString[endIndex] == '\\');
		if (inputString[endIndex] == '\n')
			lineNumber++;
		++endIndex;
	}
	// The loop stops either on the closing quote or at the end of the input.
	// Only in the first case is there a quote (and possibly a suffix) to skip;
	// stepping forward unconditionally would move endIndex out of bounds for
	// an unterminated literal.
	if (!isEoF(inputString, endIndex))
	{
		++endIndex;
		if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
			|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
		{
			++endIndex;
		}
	}
	return inputString[startIndex .. endIndex];
}
|
|
|
|
/**
 * Lexes the various crazy D string literals such as q"WTF is this? WTF"
 * and q"<>".
 *
 * The character after the opening quote selects the delimiter: '[', '<', '{'
 * and '(' are nesting delimiters closed by their counterpart; anything else
 * starts an identifier delimiter that extends to the next whitespace
 * character and also serves as the closing delimiter.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex; incremented
 *         for every newline inside the literal
 * Returns: a string literal, including its opening and closing quote
 *     characters. If the input ends directly after the opening quote, only
 *     the text consumed so far is returned.
 */
string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	auto startIndex = endIndex;
	++endIndex;
	// Nothing follows the opening quote: there is no delimiter to read, so
	// hand back what was consumed instead of indexing past the input.
	if (isEoF(inputString, endIndex))
		return inputString[startIndex .. endIndex];
	string open = inputString[endIndex .. endIndex + 1];
	string close;
	bool nesting = false;
	switch (open[0])
	{
	case '[': close = "]"; ++endIndex; nesting = true; break;
	case '<': close = ">"; ++endIndex; nesting = true; break;
	case '{': close = "}"; ++endIndex; nesting = true; break;
	case '(': close = ")"; ++endIndex; nesting = true; break;
	default:
		// Identifier delimiter: everything up to the next whitespace.
		while (!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
			endIndex++;
		close = open = inputString[startIndex + 1 .. endIndex];
		break;
	}
	int depth = 1;
	while (!isEoF(inputString, endIndex) && depth > 0)
	{
		if (inputString[endIndex] == '\n')
		{
			lineNumber++;
			endIndex++;
		}
		else if (inputString[endIndex .. $].startsWith(open))
		{
			endIndex += open.length;
			// For an identifier delimiter, open and close are the same text,
			// so meeting it again ends the literal.
			if (!nesting && !isEoF(inputString, endIndex))
			{
				if (inputString[endIndex] == '"')
					++endIndex;
				break;
			}
			depth++;
		}
		else if (inputString[endIndex .. $].startsWith(close))
		{
			endIndex += close.length;
			depth--;
			if (depth <= 0)
				break;
		}
		else
			++endIndex;
	}
	// Consume the closing quote if it directly follows the closing delimiter.
	if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
		++endIndex;
	return inputString[startIndex .. endIndex];
}
|
|
|
|
|
|
/**
 * Placeholder for lexing token strings (q{ ... }).
 *
 * Not implemented: none of the arguments is read or modified and the result
 * is always empty.
 * Params:
 *     inputString = the source code to examine (unused)
 *     endIndex = an index into inputString (unused, left unchanged)
 *     lineNumber = the line number that corresponds to endIndex (unused)
 * Returns: always ""
 */
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
	// Disabled draft of a range-based implementation, kept for reference:
	//     auto r = byDToken(range, IterationStyle.EVERYTHING);
	//     string s = getBraceContent(r);
	//     range.popFrontN(s.length);
	//     return s;
	return "";
}
|
|
|
|
/**
 * Lexes a numeric literal starting at endIndex and advances endIndex past it.
 *
 * A "0b"/"0B" prefix selects binary lexing and "0x"/"0X" hexadecimal lexing.
 * Everything else, including literals that merely start with a zero such as
 * "0.5", "0e3", "0L" or "007", is lexed as a decimal literal. (The current
 * language spec doesn't cover octal literals, so a leading zero is decimal.)
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the first digit of the literal
 * Returns: a token whose type and value describe the literal; other fields
 *     of the token are left at their defaults
 */
pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
	if (isSomeString!S)
{
	Token token;
	size_t startIndex = endIndex;
	if (inputString[endIndex] == '0' && !isEoF(inputString, endIndex + 1))
	{
		switch (inputString[endIndex + 1])
		{
		case 'b':
		case 'B':
			endIndex += 2; // skip the "0b" prefix
			lexBinary(inputString, startIndex, endIndex, token);
			return token;
		case 'x':
		case 'X':
			endIndex += 2; // skip the "0x" prefix
			lexHex(inputString, startIndex, endIndex, token);
			return token;
		default:
			// Not a radix prefix. The zero is an ordinary leading digit and
			// must not be cut off from a following '.', exponent or suffix.
			break;
		}
	}
	lexDecimal(inputString, startIndex, endIndex, token);
	return token;
}
|
|
|
|
/**
 * Lexes the digits and integer suffix of a binary literal.
 *
 * Accepts the characters '0', '1' and '_' followed by an optional suffix made
 * of 'u'/'U' and 'L' in either order. Lexing stops at the first character
 * that cannot continue the literal.
 * Params:
 *     inputString = the source code to examine
 *     startIndex = index of the first character of the literal (the '0' of
 *         the prefix when called from lexNumber)
 *     endIndex = an index into inputString directly after the prefix; on
 *         return it indexes the character after the literal
 *     token = receives the literal's type and its text
 *         (inputString[startIndex .. endIndex])
 */
pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	bool lexingSuffix = false;
	bool isLong = false;
	bool isUnsigned = false;
	token.type = TokenType.intLiteral;
	binaryLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0':
		case '1':
		case '_':
			// Digits after a suffix belong to the next token.
			if (lexingSuffix)
				break binaryLoop;
			++endIndex;
			break;
		case 'u':
		case 'U':
			// A second 'u' ends the literal. This has to leave the loop: a
			// plain break would only leave the switch and spin forever on
			// the same character.
			if (isUnsigned)
				break binaryLoop;
			++endIndex;
			lexingSuffix = true;
			if (isLong)
			{
				token.type = TokenType.unsignedLongLiteral;
				break binaryLoop;
			}
			else
				token.type = TokenType.unsignedIntLiteral;
			isUnsigned = true;
			break;
		case 'L':
			if (isLong)
				break binaryLoop;
			++endIndex;
			lexingSuffix = true;
			if (isUnsigned)
			{
				token.type = TokenType.unsignedLongLiteral;
				break binaryLoop;
			}
			else
				token.type = TokenType.longLiteral;
			isLong = true;
			break;
		default:
			break binaryLoop;
		}
	}

	token.value = inputString[startIndex .. endIndex];
}
|
|
|
|
/**
 * Lexes a decimal integer or floating-point literal.
 *
 * Accepts digits and '_', at most one '.', at most one exponent ('e'/'E'
 * optionally followed directly by one sign) and a suffix made of 'u'/'U',
 * 'L', 'f' or 'F'. A '.' or an exponent makes the literal a doubleLiteral;
 * 'L' then turns it into a realLiteral and 'f'/'F' into a floatLiteral.
 * Lexing stops at the first character that cannot continue the literal; no
 * further validation is done (e.g. an exponent without digits is accepted).
 * Params:
 *     inputString = the source code to examine
 *     startIndex = index of the first character of the literal
 *     endIndex = an index into inputString from which lexing continues; on
 *         return it indexes the character after the literal
 *     token = receives the literal's type and its text
 *         (inputString[startIndex .. endIndex])
 */
pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	bool lexingSuffix = false;
	bool isLong = false;
	bool isUnsigned = false;
	bool isFloating = false;
	bool foundDot = false;
	bool foundE = false;
	bool foundPlusMinus = false;
	token.type = TokenType.intLiteral;
	decimalLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0': .. case '9':
		case '_':
			// Digits after a suffix belong to the next token.
			if (lexingSuffix)
				break decimalLoop;
			++endIndex;
			break;
		case 'e':
		case 'E':
			if (foundE || lexingSuffix)
				break decimalLoop;
			++endIndex;
			foundE = true;
			// An exponent makes the literal floating-point even without a dot.
			isFloating = true;
			token.type = TokenType.doubleLiteral;
			break;
		case '+':
		case '-':
			// A sign is part of the literal only directly after the exponent
			// marker; in "1e5+3" the '+' is a separate operator.
			if (foundPlusMinus || !foundE
				|| (inputString[endIndex - 1] != 'e' && inputString[endIndex - 1] != 'E'))
			{
				break decimalLoop;
			}
			foundPlusMinus = true;
			++endIndex;
			break;
		case '.':
			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
				break decimalLoop; // possibly slice expression
			// A second dot, or a dot after the exponent or a suffix, cannot
			// belong to this literal.
			if (foundDot || foundE || lexingSuffix)
				break decimalLoop;
			++endIndex;
			foundDot = true;
			isFloating = true;
			token.type = TokenType.doubleLiteral;
			break;
		case 'u':
		case 'U':
			// Only integer literals can be unsigned.
			if (isUnsigned || isFloating)
				break decimalLoop;
			++endIndex;
			lexingSuffix = true;
			if (isLong)
				token.type = TokenType.unsignedLongLiteral;
			else
				token.type = TokenType.unsignedIntLiteral;
			isUnsigned = true;
			break;
		case 'L':
			if (isLong)
				break decimalLoop;
			++endIndex;
			lexingSuffix = true;
			if (isFloating)
				token.type = TokenType.realLiteral;
			else if (isUnsigned)
				token.type = TokenType.unsignedLongLiteral;
			else
				token.type = TokenType.longLiteral;
			isLong = true;
			break;
		case 'f':
		case 'F':
			if (isUnsigned || isLong)
				break decimalLoop;
			++endIndex;
			token.type = TokenType.floatLiteral;
			break decimalLoop; // nothing may follow the float suffix
		default:
			break decimalLoop;
		}
	}

	token.value = inputString[startIndex .. endIndex];
}
|
|
|
|
unittest
{
	// Exploratory cases for lexDecimal; the results are only printed, not
	// asserted. Separating integer lexing from floating-point lexing would
	// make it much easier to reason about the algorithm's correctness.
	immutable string[] samples = [
		"55e-4", // should lex as a floating-point literal, not as intLiteral
		"3e+f",  // exponent without digits: should be considered invalid
		"3e++f", // doubled sign: should be considered invalid
	];
	foreach (sample; samples)
		dump!lexDecimal(sample);
}
|
|
|
|
// Temporary debugging aid used to illustrate lexer problems.
// Instantiates the lexing function template T for string, runs it on s from
// index 0 and prints the resulting token type, token value, start index and
// end index to stdout, one per line.
void dump(alias T)(string s)
{
	size_t firstIndex;
	size_t lastIndex;
	Token lexed;
	T!(string)(s, firstIndex, lastIndex, lexed);
	writeln(lexed.type, '\n', lexed.value, '\n', firstIndex, '\n', lastIndex);
}
|
|
|
|
/**
 * Lexes the digits, optional binary exponent and suffix of a hexadecimal
 * literal.
 *
 * Accepts hexadecimal digits and '_', at most one '.', at most one exponent
 * ('p'/'P' optionally followed directly by one sign, then decimal digits) and
 * a suffix: 'u'/'U' and 'L' for integers, 'L' or 'f'/'F' for literals with an
 * exponent. A '.' or an exponent makes the literal a doubleLiteral. Lexing
 * stops at the first character that cannot continue the literal; no further
 * validation is done.
 * Params:
 *     inputString = the source code to examine
 *     startIndex = index of the first character of the literal (the '0' of
 *         the prefix when called from lexNumber)
 *     endIndex = an index into inputString directly after the prefix; on
 *         return it indexes the character after the literal
 *     token = receives the literal's type and its text
 *         (inputString[startIndex .. endIndex])
 */
pure nothrow void lexHex(S)(ref S inputString, size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	bool lexingSuffix = false;
	bool isLong = false;
	bool isUnsigned = false;
	bool isFloating = false;
	bool foundDot = false;
	bool foundExponent = false;
	bool foundPlusMinus = false;
	token.type = TokenType.intLiteral;
	hexLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0': .. case '9':
		case '_':
			// Digits after a suffix belong to the next token.
			if (lexingSuffix)
				break hexLoop;
			++endIndex;
			break;
		case 'a': .. case 'e':
		case 'A': .. case 'E':
			// The exponent is written in decimal, so letters end it.
			if (lexingSuffix || foundExponent)
				break hexLoop;
			++endIndex;
			break;
		case 'f':
		case 'F':
			if (lexingSuffix)
				break hexLoop;
			++endIndex;
			// Before the exponent 'f' is a digit; after it, the float suffix.
			if (foundExponent)
			{
				token.type = TokenType.floatLiteral;
				break hexLoop;
			}
			break;
		case 'p':
		case 'P':
			if (foundExponent || lexingSuffix)
				break hexLoop;
			++endIndex;
			foundExponent = true;
			isFloating = true;
			token.type = TokenType.doubleLiteral;
			break;
		case '+':
		case '-':
			// A sign is part of the literal only directly after the exponent
			// marker; anywhere else it is a separate operator.
			if (foundPlusMinus || !foundExponent
				|| (inputString[endIndex - 1] != 'p' && inputString[endIndex - 1] != 'P'))
			{
				break hexLoop;
			}
			foundPlusMinus = true;
			++endIndex;
			break;
		case '.':
			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
				break hexLoop; // possibly slice expression
			// A second dot, or a dot after the exponent or a suffix, cannot
			// belong to this literal.
			if (foundDot || foundExponent || lexingSuffix)
				break hexLoop;
			++endIndex;
			foundDot = true;
			isFloating = true;
			token.type = TokenType.doubleLiteral;
			break;
		case 'u':
		case 'U':
			// Only integer literals can be unsigned.
			if (isUnsigned || isFloating)
				break hexLoop;
			++endIndex;
			lexingSuffix = true;
			if (isLong)
				token.type = TokenType.unsignedLongLiteral;
			else
				token.type = TokenType.unsignedIntLiteral;
			isUnsigned = true;
			break;
		case 'L':
			if (isLong)
				break hexLoop;
			++endIndex;
			lexingSuffix = true;
			if (isFloating)
				token.type = TokenType.realLiteral;
			else if (isUnsigned)
				token.type = TokenType.unsignedLongLiteral;
			else
				token.type = TokenType.longLiteral;
			isLong = true;
			break;
		default:
			break hexLoop;
		}
	}

	token.value = inputString[startIndex .. endIndex];
}
|
|
|
|
|
|
/**
 * Tells whether a character separates tokens.
 *
 * Separating characters are the ASCII punctuation ranges '!' .. '/',
 * ':' .. '@', '[' .. '^' and '{' .. '~' (so '_' and '`' are not separating),
 * plus space, tab, newline, vertical tab, form feed and carriage return.
 * Returns: true if ch marks the ending of one token and the beginning of
 *     another, false otherwise
 */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
	immutable bool punctuation = (ch >= '!' && ch <= '/')
		|| (ch >= ':' && ch <= '@')
		|| (ch >= '[' && ch <= '^')
		|| (ch >= '{' && ch <= '~');
	// 0x0a .. 0x0d: newline, vertical tab, form feed, carriage return
	immutable bool blank = ch == 0x20 || ch == 0x09 || (ch >= 0x0a && ch <= 0x0d);
	return punctuation || blank;
}
|
|
|
|
/**
 * Selects which kinds of tokens tokenize() and lexWhitespace() hand back.
 */
enum IterationStyle
{
	/// Only include code, not whitespace or comments (the default)
	CODE_ONLY = 0,
	/// Include everything
	EVERYTHING = 1
}
|
|
|
|
/**
 * Splits D source code into tokens.
 *
 * Whitespace, comments and a leading script ("#!") line are only included in
 * the result when iterationStyle is EVERYTHING. The value of a prefixed
 * string literal (r"...", x"...", q"...") includes its prefix character.
 * Params:
 *     inputString = the source code to tokenize
 *     iterationStyle = whether non-code tokens are included in the result
 * Returns: the tokens in source order, or an empty array if the lexer failed
 *     to make progress (in which case "FAIL" is written to stderr)
 */
Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
	if (isSomeString!S)
{
	auto tokenAppender = appender!(Token[])();

	// This is very likely a local maximum, but it does seem to take a few
	// milliseconds off of the run time
	tokenAppender.reserve(inputString.length / 4);

	size_t endIndex = 0;
	uint lineNumber = 1;

	if (inputString.length > 1 && inputString[0..2] == "#!")
	{
		Token scriptToken;
		scriptToken.lineNumber = lineNumber; // lineNumber is always 1
		scriptToken.startIndex = endIndex;
		scriptToken.value = lexScriptLine(inputString, endIndex, lineNumber);
		scriptToken.type = TokenType.scriptLine;
		// The script line is not code, so like whitespace and comments it is
		// only reported when everything is requested.
		if (iterationStyle == IterationStyle.EVERYTHING)
			tokenAppender.put(scriptToken);
	}

	while (!isEoF(inputString, endIndex))
	{
		Token currentToken;
		// Start of the current token; also used to verify progress below.
		immutable startIndex = endIndex;
		currentToken.startIndex = startIndex;
		if (isWhite(inputString[endIndex]))
		{
			if (iterationStyle == IterationStyle.EVERYTHING)
			{
				currentToken.lineNumber = lineNumber;
				currentToken.value = lexWhitespace(inputString, endIndex,
					lineNumber, IterationStyle.EVERYTHING);
				currentToken.type = TokenType.whitespace;
				tokenAppender.put(currentToken);
			}
			else
				lexWhitespace(inputString, endIndex, lineNumber);
			continue;
		}

		outerSwitch: switch(inputString[endIndex])
		{
		mixin(generateCaseTrie(
			"=", "TokenType.assign",
			"&", "TokenType.bitAnd",
			"&=", "TokenType.bitAndEquals",
			"|", "TokenType.bitOr",
			"|=", "TokenType.bitOrEquals",
			"~=", "TokenType.catEquals",
			":", "TokenType.colon",
			",", "TokenType.comma",
			"$", "TokenType.dollar",
			".", "TokenType.dot",
			"==", "TokenType.equals",
			"=>", "TokenType.goesTo",
			">", "TokenType.greater",
			">=", "TokenType.greaterEqual",
			"#", "TokenType.hash",
			"&&", "TokenType.logicAnd",
			"{", "TokenType.lBrace",
			"[", "TokenType.lBracket",
			"<", "TokenType.less",
			"<=", "TokenType.lessEqual",
			"<>=", "TokenType.lessEqualGreater",
			"<>", "TokenType.lessOrGreater",
			"||", "TokenType.logicOr",
			"(", "TokenType.lParen",
			"-", "TokenType.minus",
			"-=", "TokenType.minusEquals",
			"%", "TokenType.mod",
			"%=", "TokenType.modEquals",
			"*=", "TokenType.mulEquals",
			"!", "TokenType.not",
			"!=", "TokenType.notEquals",
			"!>", "TokenType.notGreater",
			"!>=", "TokenType.notGreaterEqual",
			"!<", "TokenType.notLess",
			"!<=", "TokenType.notLessEqual",
			"!<>", "TokenType.notLessEqualGreater",
			"+", "TokenType.plus",
			"+=", "TokenType.plusEquals",
			"^^", "TokenType.pow",
			"^^=", "TokenType.powEquals",
			"}", "TokenType.rBrace",
			"]", "TokenType.rBracket",
			")", "TokenType.rParen",
			";", "TokenType.semicolon",
			"<<", "TokenType.shiftLeft",
			"<<=", "TokenType.shiftLeftEqual",
			">>", "TokenType.shiftRight",
			">>=", "TokenType.shiftRightEqual",
			"..", "TokenType.slice",
			"*", "TokenType.star",
			"?", "TokenType.ternary",
			"~", "TokenType.tilde",
			"--", "TokenType.uMinus",
			"!<>=", "TokenType.unordered",
			">>>", "TokenType.unsignedShiftRight",
			">>>=", "TokenType.unsignedShiftRightEqual",
			"++", "TokenType.uPlus",
			"...", "TokenType.vararg",
			"^", "TokenType.xor",
			"^=", "TokenType.xorEquals",
		));
		case '0': .. case '9':
			currentToken = lexNumber(inputString, endIndex);
			// lexNumber returns a fresh token that carries only type and
			// value, so the position has to be filled in again.
			currentToken.lineNumber = lineNumber;
			currentToken.startIndex = startIndex;
			break;
		case '/':
			++endIndex;
			if (isEoF(inputString, endIndex))
			{
				currentToken.value = "/";
				currentToken.type = TokenType.div;
				currentToken.lineNumber = lineNumber;
				break;
			}
			currentToken.lineNumber = lineNumber;
			switch (inputString[endIndex])
			{
			case '/':
			case '+':
			case '*':
				if (iterationStyle == IterationStyle.CODE_ONLY)
				{
					lexComment(inputString, endIndex, lineNumber);
					continue;
				}
				else
				{
					currentToken.value = lexComment(inputString, endIndex, lineNumber);
					currentToken.type = TokenType.comment;
					break;
				}
			case '=':
				currentToken.value = "/=";
				currentToken.type = TokenType.divEquals;
				++endIndex;
				break;
			default:
				currentToken.value = "/";
				currentToken.type = TokenType.div;
				break;
			}
			break;
		case 'r':
			++endIndex;
			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
				goto default;
			currentToken.lineNumber = lineNumber;
			lexString(inputString, endIndex, lineNumber, inputString[endIndex], false);
			// Slice from the token start so the 'r' prefix is part of the value.
			currentToken.value = inputString[startIndex .. endIndex];
			currentToken.type = TokenType.stringLiteral;
			break;
		case '`':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex], false);
			currentToken.type = TokenType.stringLiteral;
			break;
		case 'x':
			++endIndex;
			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
				goto default;
			// BUG: this is incorrect! according to specification, hex data
			// should be lexed differently than "normal" strings
			currentToken.lineNumber = lineNumber;
			lexString(inputString, endIndex, lineNumber, inputString[endIndex]);
			// Slice from the token start so the 'x' prefix is part of the value.
			currentToken.value = inputString[startIndex .. endIndex];
			currentToken.type = TokenType.stringLiteral;
			break;
		case '\'':
		case '"':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex]);
			currentToken.type = TokenType.stringLiteral;
			break;
		case 'q':
			currentToken.value = "q";
			++endIndex;
			if (!isEoF(inputString, endIndex))
			{
				switch (inputString[endIndex])
				{
				case '"':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= lexDelimitedString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.stringLiteral;
					break outerSwitch;
				case '{':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= lexTokenString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.stringLiteral;
					break outerSwitch;
				default:
					break;
				}
			}
			// Not a q-string after all: lex it as an identifier or keyword.
			goto default;
		case '@':
			++endIndex;
			goto default;
		default:
			// Identifier, keyword or anything else up to the next separator.
			while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
				++endIndex;
			currentToken.value = inputString[startIndex .. endIndex];
			currentToken.type = lookupTokenType(currentToken.value);
			currentToken.lineNumber = lineNumber;
			break;
		}
		//stderr.writeln(currentToken);
		tokenAppender.put(currentToken);

		// This should never happen: every branch above must consume input,
		// otherwise the loop would never terminate.
		if (endIndex <= startIndex)
		{
			stderr.writeln("FAIL");
			return [];
		}
	}
	return tokenAppender.data;
}
|