// Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

module tokenizer;

import std.range;
import std.file;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.stdio;

import langutils;
import codegen;

pure bool isNewline(R)(R range)
{
    return range.front == '\n' || range.front == '\r';
}

pure bool isEoF(R)(R range)
{
    return range.empty || range.front == 0 || range.front == 0x1a;
}
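
// Several of the index-based lexing functions below call a two-argument
// isEoF(inputString, index) overload that does not appear in this listing.
// The overload below is a sketch reconstructed from those call sites (an
// assumption, not original code): end of input is the end of the string, a
// null byte, or a SUB (0x1a) character.
pure nothrow bool isEoF(S)(S inputString, size_t index) if (isSomeString!S)
{
    return index >= inputString.length || inputString[index] == 0
        || inputString[index] == 0x1a;
}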

char[] popNewline(R)(ref R range)
{
    char[] chars;
    if (range.front == '\r')
    {
        chars ~= range.front;
        range.popFront();
    }
    if (range.front == '\n')
    {
        chars ~= range.front;
        range.popFront();
    }
    return chars;
}

unittest
{
    auto s = "\r\ntest";
    assert (popNewline(s) == "\r\n");
    assert (s == "test");
}

/**
 * Lexes whitespace from the front of the range, incrementing lineNumber for
 * each newline consumed.
 * Returns: the whitespace that was consumed
 */
string lexWhitespace(R)(ref R range, ref uint lineNumber)
{
    auto app = appender!(char[])();
    while (!isEoF(range) && isWhite(range.front))
    {
        if (isNewline(range))
        {
            ++lineNumber;
            app.put(popNewline(range));
        }
        else
        {
            app.put(range.front);
            range.popFront();
        }
    }
    return to!string(app.data);
}

unittest
{
    import std.stdio;
    uint lineNum = 1;
    auto chars = " \n \r\n \tabcde";
    auto r = lexWhitespace(chars, lineNum);
    assert (r == " \n \r\n \t");
    assert (chars == "abcde");
    assert (lineNum == 3);
}

/**
 * Lexes a comment from the front of the range, which must be positioned at
 * the first slash of the comment. Line (//), block (/* */), and nesting
 * (/+ +/) comments are handled.
 * Params:
 *     input = the source code to examine, positioned at the leading '/'
 *     lineNumber = the current line number; incremented for each newline
 *         consumed within the comment
 * Returns: The comment
 */
string lexComment(R)(ref R input, ref uint lineNumber)
in
{
    assert (input.front == '/');
}
body
{
    auto app = appender!(char[])();
    app.put(input.front);
    input.popFront();
    switch(input.front)
    {
        case '/':
            while (!isEoF(input) && !isNewline(input))
            {
                app.put(input.front);
                input.popFront();
            }
            break;
        case '*':
            while (!isEoF(input))
            {
                if (isNewline(input))
                {
                    app.put(popNewline(input));
                    ++lineNumber;
                }
                else if (input.front == '*')
                {
                    app.put(input.front);
                    input.popFront();
                    if (input.front == '/')
                    {
                        app.put(input.front);
                        input.popFront();
                        break;
                    }
                }
                else
                {
                    app.put(input.front);
                    input.popFront();
                }
            }
            break;
        case '+':
            int depth = 1;
            while (depth > 0 && !isEoF(input))
            {
                if (isNewline(input))
                {
                    app.put(popNewline(input));
                    lineNumber++;
                }
                else if (input.front == '+')
                {
                    app.put(input.front);
                    input.popFront();
                    if (input.front == '/')
                    {
                        app.put(input.front);
                        input.popFront();
                        --depth;
                    }
                }
                else if (input.front == '/')
                {
                    app.put(input.front);
                    input.popFront();
                    if (input.front == '+')
                    {
                        app.put(input.front);
                        input.popFront();
                        ++depth;
                    }
                }
                else
                {
                    app.put(input.front);
                    input.popFront();
                }
            }
            break;
        default:
            break;
    }
    return to!string(app.data);
}

unittest
{
    uint lineNumber = 1;
    auto chars = "//this is a comment\r\nthis is not";
    auto comment = lexComment(chars, lineNumber);
    assert (chars == "\r\nthis is not");
    assert (comment == "//this is a comment");
}

unittest
{
    uint lineNumber = 1;
    auto chars = "/* this is a\n\tcomment\r\n */this is not";
    auto comment = lexComment(chars, lineNumber);
    assert (chars == "this is not");
    assert (comment == "/* this is a\n\tcomment\r\n */");
    assert (lineNumber == 3);
}

unittest
{
    uint lineNumber = 1;
    auto chars = "/+this is a /+c/+omm+/ent+/ \r\nthis+/ is not";
    auto comment = lexComment(chars, lineNumber);
    assert (chars == " is not");
    assert (comment == "/+this is a /+c/+omm+/ent+/ \r\nthis+/");
    assert (lineNumber == 2);
}

/**
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex
 *     quote = the opening (and closing) quote character for the string to be
 *         lexed
 * Returns: a string literal, including its opening and closing quote characters
 */
pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber,
    C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C)
in
{
    assert (inputString[endIndex] == quote);
    assert (quote == '\'' || quote == '"' || quote == '`');
}
body
{
    if (inputString[endIndex] != quote)
        return "";
    auto startIndex = endIndex;
    ++endIndex;
    bool escape = false;
    while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape))
    {
        if (escape)
            escape = false;
        else
            escape = (canEscape && inputString[endIndex] == '\\');
        if (inputString[endIndex] == '\n')
            lineNumber++;
        ++endIndex;
    }
    ++endIndex;
    if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
        || inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
    {
        ++endIndex;
    }
    auto e = endIndex > inputString.length ? inputString.length : endIndex;
    return inputString[startIndex .. e];
}
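
// Usage sketch for lexString (not part of the original module's tests): lex a
// double-quoted literal and check that endIndex is left just past the closing
// quote. Assumes the two-argument isEoF overload sketched above.
unittest
{
    uint lineNumber = 1;
    size_t endIndex = 0;
    string source = `"abc" def`;
    auto str = lexString(source, endIndex, lineNumber, source[0]);
    assert (str == `"abc"`);
    assert (endIndex == 5);
    assert (lineNumber == 1);
}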

/**
 * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
 * and q"<>".
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote
 *     lineNumber = the line number that corresponds to endIndex
 * Returns: a string literal, including its opening and closing quote characters
 */
string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
    ref uint lineNumber) if (isSomeString!S)
{
    auto startIndex = endIndex;
    ++endIndex;
    assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF?
    string open = inputString[endIndex .. endIndex + 1];
    string close;
    bool nesting = false;
    switch (open[0])
    {
        case '[': close = "]"; ++endIndex; nesting = true; break;
        case '<': close = ">"; ++endIndex; nesting = true; break;
        case '{': close = "}"; ++endIndex; nesting = true; break;
        case '(': close = ")"; ++endIndex; nesting = true; break;
        default:
            while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
                endIndex++;
            close = open = inputString[startIndex + 1 .. endIndex];
            break;
    }
    int depth = 1;
    while (!isEoF(inputString, endIndex) && depth > 0)
    {
        if (inputString[endIndex] == '\n')
        {
            lineNumber++;
            endIndex++;
        }
        else if (inputString[endIndex..$].startsWith(open))
        {
            endIndex += open.length;
            if (!nesting && !isEoF(inputString, endIndex))
            {
                if (inputString[endIndex] == '"')
                    ++endIndex;
                break;
            }
            depth++;
        }
        else if (inputString[endIndex..$].startsWith(close))
        {
            endIndex += close.length;
            depth--;
            if (depth <= 0)
                break;
        }
        else
            ++endIndex;
    }
    if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
        ++endIndex;
    return inputString[startIndex .. endIndex];
}
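
// Usage sketch for lexDelimitedString (not part of the original module's
// tests): lex a parenthesis-delimited literal with one level of nesting.
// endIndex starts at the opening quote, as in the q"(...)" case of tokenize().
unittest
{
    uint lineNumber = 1;
    size_t endIndex = 0;
    string source = `"(foo(bar))" abc`;
    auto str = lexDelimitedString(source, endIndex, lineNumber);
    assert (str == `"(foo(bar))"`);
    assert (endIndex == 12);
}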

/**
 * TODO: Fix this
 */
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
    /+auto r = byDToken(range, IterationStyle.EVERYTHING);
    string s = getBraceContent(r);
    range.popFrontN(s.length);
    return s;+/
    return "";
}

pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
    if (isSomeString!S)
{
    Token token;
    token.startIndex = endIndex;
    size_t startIndex = endIndex;
    if (inputString[endIndex] == '0')
    {
        endIndex++;
        if (isEoF(inputString, endIndex))
        {
            token.type = TokenType.IntLiteral;
            token.value = inputString[startIndex .. endIndex];
            return token;
        }
        switch (inputString[endIndex])
        {
            case '0': .. case '9':
                // The current language spec doesn't cover octal literals, so this
                // is decimal.
                lexDecimal(inputString, startIndex, endIndex, token);
                return token;
            case 'b':
            case 'B':
                lexBinary(inputString, startIndex, ++endIndex, token);
                return token;
            case 'x':
            case 'X':
                lexHex(inputString, startIndex, ++endIndex, token);
                return token;
            default:
                token.type = TokenType.IntLiteral;
                token.value = inputString[startIndex .. endIndex];
                return token;
        }
    }
    else
    {
        lexDecimal(inputString, startIndex, endIndex, token);
        return token;
    }
}
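
// Usage sketch for lexNumber (not part of the original module's tests):
// binary literals are dispatched to lexBinary, including suffix handling.
unittest
{
    size_t endIndex = 0;
    string source = "0b1101";
    auto token = lexNumber(source, endIndex);
    assert (token.value == "0b1101");
    assert (token.type == TokenType.IntLiteral);

    endIndex = 0;
    string suffixed = "0b1_0UL";
    token = lexNumber(suffixed, endIndex);
    assert (token.value == "0b1_0UL");
    assert (token.type == TokenType.UnsignedLongLiteral);
}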

pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
    ref size_t endIndex, ref Token token) if (isSomeString!S)
{
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;
    token.type = TokenType.IntLiteral;
    binaryLoop: while (!isEoF(inputString, endIndex))
    {
        switch (inputString[endIndex])
        {
            case '0':
            case '1':
            case '_':
                if (lexingSuffix)
                    break binaryLoop;
                ++endIndex;
                break;
            case 'u':
            case 'U':
                if (isUnsigned)
                    break;
                ++endIndex;
                lexingSuffix = true;
                if (isLong)
                {
                    token.type = TokenType.UnsignedLongLiteral;
                    break binaryLoop;
                }
                else
                    token.type = TokenType.UnsignedIntLiteral;
                isUnsigned = true;
                break;
            case 'L':
                if (isLong)
                    break binaryLoop;
                ++endIndex;
                lexingSuffix = true;
                if (isUnsigned)
                {
                    token.type = TokenType.UnsignedLongLiteral;
                    break binaryLoop;
                }
                else
                    token.type = TokenType.LongLiteral;
                isLong = true;
                break;
            default:
                break binaryLoop;
        }
    }

    token.value = inputString[startIndex .. endIndex];
}

pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
    ref size_t endIndex, ref Token token) if (isSomeString!S)
{
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;
    bool isFloat = false;
    bool isReal = false;
    bool isDouble = false;
    bool foundDot = false;
    bool foundE = false;
    bool foundPlusMinus = false;
    token.type = TokenType.IntLiteral;
    decimalLoop: while (!isEoF(inputString, endIndex))
    {
        switch (inputString[endIndex])
        {
            case '0': .. case '9':
            case '_':
                if (lexingSuffix)
                    break decimalLoop;
                ++endIndex;
                break;
            case 'e':
            case 'E':
                // For this to be a valid exponent, the next character must be a
                // decimal character or a sign
                if (foundE || isEoF(inputString, endIndex + 1))
                    break decimalLoop;
                switch (inputString[endIndex + 1])
                {
                    case '+':
                    case '-':
                        if (isEoF(inputString, endIndex + 2)
                            || inputString[endIndex + 2] < '0'
                            || inputString[endIndex + 2] > '9')
                        {
                            break decimalLoop;
                        }
                        break;
                    case '0': .. case '9':
                        break;
                    default:
                        break decimalLoop;
                }
                ++endIndex;
                foundE = true;
                isDouble = true;
                token.type = TokenType.DoubleLiteral;
                break;
            case '+':
            case '-':
                if (foundPlusMinus || !foundE)
                    break decimalLoop;
                foundPlusMinus = true;
                ++endIndex;
                break;
            case '.':
                if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
                    break decimalLoop; // possibly slice expression
                if (foundDot)
                    break decimalLoop; // two dots with other characters between them
                ++endIndex;
                foundDot = true;
                token.type = TokenType.DoubleLiteral;
                isDouble = true;
                break;
            case 'u':
            case 'U':
                if (isUnsigned)
                    break decimalLoop;
                ++endIndex;
                lexingSuffix = true;
                if (isLong)
                    token.type = TokenType.UnsignedLongLiteral;
                else
                    token.type = TokenType.UnsignedIntLiteral;
                isUnsigned = true;
                break;
            case 'L':
                if (isLong)
                    break decimalLoop;
                if (isReal)
                    break decimalLoop;
                ++endIndex;
                lexingSuffix = true;
                if (isDouble)
                    token.type = TokenType.RealLiteral;
                else if (isUnsigned)
                    token.type = TokenType.UnsignedLongLiteral;
                else
                    token.type = TokenType.LongLiteral;
                isLong = true;
                break;
            case 'f':
            case 'F':
                lexingSuffix = true;
                if (isUnsigned || isLong)
                    break decimalLoop;
                ++endIndex;
                token.type = TokenType.FloatLiteral;
                break decimalLoop;
            case 'i':
                ++endIndex;
                // Spec says that this is the last suffix, so all cases break the
                // loop.
                if (isDouble)
                {
                    token.type = TokenType.Idouble;
                    break decimalLoop;
                }
                else if (isFloat)
                {
                    token.type = TokenType.Ifloat;
                    break decimalLoop;
                }
                else if (isReal)
                {
                    token.type = TokenType.Ireal;
                    break decimalLoop;
                }
                else
                {
                    // There is no imaginary int
                    --endIndex;
                    break decimalLoop;
                }
            default:
                break decimalLoop;
        }
    }

    token.value = inputString[startIndex .. endIndex];
}

unittest {
    Token t;
    size_t start, end;
    lexDecimal!string("55e-4", start, end, t);
    assert(t.value == "55e-4");
    assert(t.type == TokenType.DoubleLiteral);

    start = end = 0;
    lexDecimal!string("123.45f", start, end, t);
    assert(t.value == "123.45f");
    assert(t.type == TokenType.FloatLiteral);

    start = end = 0;
    lexDecimal!string("3e+f", start, end, t);
    assert(t.value == "3");
    assert(t.type == TokenType.IntLiteral);

    start = end = 0;
    lexDecimal!string("3e++f", start, end, t);
    assert(t.value == "3");
    assert(t.type == TokenType.IntLiteral);

    start = end = 0;
    lexDecimal!string("1234..1237", start, end, t);
    assert(t.value == "1234");
    assert(t.type == TokenType.IntLiteral);
}

pure nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
    ref size_t endIndex, ref Token token) if (isSomeString!S)
{
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;
    bool isFloat = false;
    bool isReal = false;
    bool isDouble = false;
    bool foundDot = false;
    bool foundE = false;
    bool foundPlusMinus = false;
    token.type = TokenType.IntLiteral;
    hexLoop: while (!isEoF(inputString, endIndex))
    {
        switch (inputString[endIndex])
        {
            case '0': .. case '9':
            case 'a': .. case 'f':
            case 'A': .. case 'F':
            case '_':
                if (lexingSuffix)
                    break hexLoop;
                ++endIndex;
                break;
            case 'p':
            case 'P':
                if (foundE)
                    break hexLoop;
                ++endIndex;
                foundE = true;
                break;
            case '+':
            case '-':
                if (foundPlusMinus || !foundE)
                    break hexLoop;
                foundPlusMinus = true;
                ++endIndex;
                break;
            case '.':
                if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
                    break hexLoop; // possibly slice expression
                if (foundDot)
                    break hexLoop; // two dots with other characters between them
                ++endIndex;
                foundDot = true;
                token.type = TokenType.DoubleLiteral;
                isDouble = true;
                break;
            default:
                break hexLoop;
        }
    }

    token.value = inputString[startIndex .. endIndex];
}

unittest
{
    Token t;
    size_t start, end;
    start = 0;
    end = 2;
    lexHex!string("0x193abfq", start, end, t);
    assert(t.value == "0x193abf", t.value);
    assert(t.type == TokenType.IntLiteral);

    start = 0;
    end = 2;
    lexHex!string("0x2130xabc", start, end, t);
    assert(t.value == "0x2130");
    assert(t.type == TokenType.IntLiteral);
}

/**
 * Returns: true if ch marks the ending of one token and the beginning of
 *     another, false otherwise
 */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
    switch (ch)
    {
        case '!': .. case '/':
        case ':': .. case '@':
        case '[': .. case '^':
        case '{': .. case '~':
        case 0x20: // space
        case 0x09: // tab
        case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return
            return true;
        default:
            return false;
    }
}
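
// Usage sketch for isSeparating (not part of the original module's tests):
// punctuation and whitespace separate tokens; identifier characters do not.
unittest
{
    assert (isSeparating(';'));
    assert (isSeparating(' '));
    assert (!isSeparating('a'));
    assert (!isSeparating('_'));
}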

/**
 * Configure the tokenize() function
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    CODE_ONLY,
    /// Include everything
    EVERYTHING
}

struct TokenRange(R) if (isInputRange!R)
{
    bool empty() const @property
    {
        return _empty;
    }

private:
    R range;
    bool _empty;
}

Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
    if (isSomeString!S)
{
    auto tokenAppender = appender!(Token[])();

    // This is very likely a local maximum, but it does seem to take a few
    // milliseconds off of the run time
    tokenAppender.reserve(inputString.length / 4);

    size_t endIndex = 0;
    uint lineNumber = 1;

    if (inputString.length > 1 && inputString[0..2] == "#!")
    {
        Token currentToken;
        currentToken.lineNumber = lineNumber; // lineNumber is always 1
        currentToken.value = lexScriptLine(inputString, endIndex, lineNumber);
        currentToken.type = TokenType.ScriptLine;
    }

    while (!isEoF(inputString, endIndex))
    {
        size_t prevIndex = endIndex;
        Token currentToken;
        auto startIndex = endIndex;
        if (isWhite(inputString[endIndex]))
        {
            if (iterationStyle == IterationStyle.EVERYTHING)
            {
                currentToken.lineNumber = lineNumber;
                currentToken.value = lexWhitespace(inputString, endIndex,
                    lineNumber);
                currentToken.type = TokenType.Whitespace;
                tokenAppender.put(currentToken);
            }
            else
                lexWhitespace(inputString, endIndex, lineNumber);
            continue;
        }
        currentToken.startIndex = endIndex;

        outerSwitch: switch(inputString[endIndex])
        {
        mixin(generateCaseTrie(
            "=", "TokenType.Assign",
            "&", "TokenType.BitAnd",
            "&=", "TokenType.BitAndEquals",
            "|", "TokenType.BitOr",
            "|=", "TokenType.BitOrEquals",
            "~=", "TokenType.CatEquals",
            ":", "TokenType.Colon",
            ",", "TokenType.Comma",
            "$", "TokenType.Dollar",
            ".", "TokenType.Dot",
            "==", "TokenType.Equals",
            "=>", "TokenType.GoesTo",
            ">", "TokenType.Greater",
            ">=", "TokenType.GreaterEqual",
            "#", "TokenType.Hash",
            "&&", "TokenType.LogicAnd",
            "{", "TokenType.LBrace",
            "[", "TokenType.LBracket",
            "<", "TokenType.Less",
            "<=", "TokenType.LessEqual",
            "<>=", "TokenType.LessEqualGreater",
            "<>", "TokenType.LessOrGreater",
            "||", "TokenType.LogicOr",
            "(", "TokenType.LParen",
            "-", "TokenType.Minus",
            "-=", "TokenType.MinusEquals",
            "%", "TokenType.Mod",
            "%=", "TokenType.ModEquals",
            "*=", "TokenType.MulEquals",
            "!", "TokenType.Not",
            "!=", "TokenType.NotEquals",
            "!>", "TokenType.NotGreater",
            "!>=", "TokenType.NotGreaterEqual",
            "!<", "TokenType.NotLess",
            "!<=", "TokenType.NotLessEqual",
            "!<>", "TokenType.NotLessEqualGreater",
            "+", "TokenType.Plus",
            "+=", "TokenType.PlusEquals",
            "^^", "TokenType.Pow",
            "^^=", "TokenType.PowEquals",
            "}", "TokenType.RBrace",
            "]", "TokenType.RBracket",
            ")", "TokenType.RParen",
            ";", "TokenType.Semicolon",
            "<<", "TokenType.ShiftLeft",
            "<<=", "TokenType.ShiftLeftEqual",
            ">>", "TokenType.ShiftRight",
            ">>=", "TokenType.ShiftRightEqual",
            "..", "TokenType.Slice",
            "*", "TokenType.Star",
            "?", "TokenType.Ternary",
            "~", "TokenType.Tilde",
            "--", "TokenType.Decrement",
            "!<>=", "TokenType.Unordered",
            ">>>", "TokenType.UnsignedShiftRight",
            ">>>=", "TokenType.UnsignedShiftRightEqual",
            "++", "TokenType.Increment",
            "...", "TokenType.Vararg",
            "^", "TokenType.Xor",
            "^=", "TokenType.XorEquals",
        ));
        case '0': .. case '9':
            currentToken = lexNumber(inputString, endIndex);
            break;
        case '/':
            ++endIndex;
            if (isEoF(inputString, endIndex))
            {
                currentToken.value = "/";
                currentToken.type = TokenType.Div;
                currentToken.lineNumber = lineNumber;
                break;
            }
            currentToken.lineNumber = lineNumber;
            switch (inputString[endIndex])
            {
                case '/':
                case '+':
                case '*':
                    if (iterationStyle == IterationStyle.CODE_ONLY)
                    {
                        lexComment(inputString, endIndex, lineNumber);
                        continue;
                    }
                    else
                    {
                        currentToken.value = lexComment(inputString, endIndex, lineNumber);
                        currentToken.type = TokenType.Comment;
                        break;
                    }
                case '=':
                    currentToken.value = "/=";
                    currentToken.type = TokenType.DivEquals;
                    ++endIndex;
                    break;
                default:
                    currentToken.value = "/";
                    currentToken.type = TokenType.Div;
                    break;
            }
            break;
        case 'r':
            ++endIndex;
            if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
                goto default;
            currentToken.lineNumber = lineNumber;
            currentToken.value = lexString(inputString, endIndex,
                lineNumber, inputString[endIndex], false);
            currentToken.type = TokenType.StringLiteral;
            break;
        case '`':
            currentToken.lineNumber = lineNumber;
            currentToken.value = lexString(inputString, endIndex, lineNumber,
                inputString[endIndex], false);
            currentToken.type = TokenType.StringLiteral;
            break;
        case 'x':
            ++endIndex;
            if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
                goto default;
            else
                goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
        case '\'':
        case '"':
            currentToken.lineNumber = lineNumber;
            currentToken.value = lexString(inputString, endIndex, lineNumber,
                inputString[endIndex]);
            currentToken.type = TokenType.StringLiteral;
            break;
        case 'q':
            currentToken.value = "q";
            ++endIndex;
            if (!isEoF(inputString, endIndex))
            {
                switch (inputString[endIndex])
                {
                    case '"':
                        currentToken.lineNumber = lineNumber;
                        currentToken.value ~= lexDelimitedString(inputString,
                            endIndex, lineNumber);
                        currentToken.type = TokenType.StringLiteral;
                        break outerSwitch;
                    case '{':
                        currentToken.lineNumber = lineNumber;
                        currentToken.value ~= lexTokenString(inputString,
                            endIndex, lineNumber);
                        currentToken.type = TokenType.StringLiteral;
                        break outerSwitch;
                    default:
                        break;
                }
            }
            goto default;
        case '@':
            ++endIndex;
            goto default;
        default:
            while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
                ++endIndex;
            currentToken.value = inputString[startIndex .. endIndex];
            currentToken.type = lookupTokenTypeOptimized(currentToken.value);
            //currentToken.type = lookupTokenType(currentToken.value);
            currentToken.lineNumber = lineNumber;
            break;
        }
        //stderr.writeln(currentToken);
        tokenAppender.put(currentToken);

        // This should never happen.
        if (endIndex <= prevIndex)
        {
            stderr.writeln("FAIL");
            return [];
        }
    }
    return tokenAppender.data;
}