module std.d.lexer;
|
|
|
|
import std.typecons;
|
|
import std.typetuple;
|
|
import std.array;
|
|
import std.algorithm;
|
|
import std.range;
|
|
import std.lexer;
|
|
|
|
private enum operators = [
|
|
",", ".", "..", "...", "/", "/=", "!", "!<", "!<=", "!<>", "!<>=", "!=",
|
|
"!>", "!>=", "$", "%", "%=", "&", "&&", "&=", "(", ")", "*", "*=", "+", "++",
|
|
"+=", "-", "--", "-=", ":", ";", "<", "<<", "<<=", "<=", "<>", "<>=", "=",
|
|
"==", "=>", ">", ">=", ">>", ">>=", ">>>", ">>>=", "?", "@", "[", "]", "^",
|
|
"^=", "^^", "^^=", "{", "|", "|=", "||", "}", "~", "~="
|
|
];
|
|
|
|
private enum keywords = [
|
|
"abstract", "alias", "align", "asm", "assert", "auto", "body", "bool",
|
|
"break", "byte", "case", "cast", "catch", "cdouble", "cent", "cfloat",
|
|
"char", "class", "const", "continue", "creal", "dchar", "debug", "default",
|
|
"delegate", "delete", "deprecated", "do", "double", "else", "enum",
|
|
"export", "extern", "false", "final", "finally", "float", "for", "foreach",
|
|
"foreach_reverse", "function", "goto", "idouble", "if", "ifloat",
|
|
"immutable", "import", "in", "inout", "int", "interface", "invariant",
|
|
"ireal", "is", "lazy", "long", "macro", "mixin", "module", "new", "nothrow",
|
|
"null", "out", "override", "package", "pragma", "private", "protected",
|
|
"public", "pure", "real", "ref", "return", "scope", "shared", "short",
|
|
"static", "struct", "super", "switch", "synchronized", "template", "this",
|
|
"throw", "true", "try", "typedef", "typeid", "typeof", "ubyte", "ucent",
|
|
"uint", "ulong", "union", "unittest", "ushort", "version", "virtual", "void",
|
|
"volatile", "wchar", "while", "with", "__DATE__", "__EOF__", "__FILE__",
|
|
"__FUNCTION__", "__gshared", "__LINE__", "__MODULE__", "__parameters",
|
|
"__PRETTY_FUNCTION__", "__TIME__", "__TIMESTAMP__", "__traits", "__vector",
|
|
"__VENDOR__", "__VERSION__"
|
|
];
|
|
|
|
private enum dynamicTokens = [
|
|
"specialTokenSequence", "comment", "identifier", "scriptLine",
|
|
"whitespace", "doubleLiteral", "floatLiteral", "idoubleLiteral",
|
|
"ifloatLiteral", "intLiteral", "longLiteral", "realLiteral",
|
|
"irealLiteral", "uintLiteral", "ulongLiteral", "characterLiteral",
|
|
"dstringLiteral", "stringLiteral", "wstringLiteral"
|
|
];
|
|
|
|
private enum pseudoTokenHandlers = [
|
|
"\"", "lexStringLiteral",
|
|
"`", "lexWysiwygString",
|
|
"//", "lexSlashSlashComment",
|
|
"/*", "lexSlashStarComment",
|
|
"/+", "lexSlashPlusComment",
|
|
".", "lexDot",
|
|
"'", "lexCharacterLiteral",
|
|
"0", "lexNumber",
|
|
"1", "lexDecimal",
|
|
"2", "lexDecimal",
|
|
"3", "lexDecimal",
|
|
"4", "lexDecimal",
|
|
"5", "lexDecimal",
|
|
"6", "lexDecimal",
|
|
"7", "lexDecimal",
|
|
"8", "lexDecimal",
|
|
"9", "lexDecimal",
|
|
"q\"", "lexDelimitedString",
|
|
"q{", "lexTokenString",
|
|
"r\"", "lexWysiwygString",
|
|
"x\"", "lexHexString",
|
|
" ", "lexWhitespace",
|
|
"\t", "lexWhitespace",
|
|
"\r", "lexWhitespace",
|
|
"\n", "lexWhitespace",
|
|
"\u2028", "lexLongNewline",
|
|
"\u2029", "lexLongNewline",
|
|
"#!", "lexScriptLine",
|
|
"#line", "lexSpecialTokenSequence"
|
|
];
|
|
|
|
public alias IdType = TokenIdType!(operators, dynamicTokens, keywords);
|
|
public alias str = tokenStringRepresentation!(IdType, operators, dynamicTokens, keywords);
|
|
public template tok(string token)
|
|
{
|
|
alias tok = TokenId!(IdType, operators, dynamicTokens, keywords, token);
|
|
}
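// A minimal usage sketch (not part of the original module): `tok` maps a token
// string to its numeric id and `str` maps an id back to its string
// representation. The assertions assume the usual round-trip behavior of
// these helpers for fixed tokens such as keywords.
unittest
{
    IdType id = tok!"for";
    assert (str(id) == "for");
    assert (tok!"identifier" != tok!"for");
}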
|
|
private enum extraFields = q{
|
|
string comment;
|
|
|
|
int opCmp(size_t i) const pure nothrow @safe {
|
|
if (index < i) return -1;
|
|
if (index > i) return 1;
|
|
return 0;
|
|
}
|
|
};
|
|
public alias Token = std.lexer.TokenStructure!(IdType, extraFields);
|
|
|
|
/**
|
|
* Configure string lexing behavior
|
|
*/
|
|
public enum StringBehavior : ubyte
|
|
{
|
|
/// Do not include quote characters, process escape sequences
|
|
compiler = 0b0000_0000,
|
|
/// Opening quotes, closing quotes, and string suffixes are included in the
|
|
/// string token
|
|
includeQuoteChars = 0b0000_0001,
|
|
/// String escape sequences are not replaced
|
|
notEscaped = 0b0000_0010,
|
|
/// Not modified at all. Useful for formatters or highlighters
|
|
source = includeQuoteChars | notEscaped
|
|
}
|
|
|
|
/**
|
|
* Configure whitespace handling behavior
|
|
*/
|
|
public enum WhitespaceBehavior : ubyte
|
|
{
|
|
/// Whitespace is skipped
|
|
skip,
|
|
/// Whitespace is treated as a token
|
|
include
|
|
}
|
|
|
|
/**
|
|
* Configure special token handling behavior
|
|
*/
|
|
public enum SpecialTokenBehavior : ubyte
|
|
{
|
|
/// Special tokens are skipped
|
|
skip,
|
|
/// Special tokens are treated as tokens
|
|
include
|
|
}
|
|
|
|
/**
|
|
* Configure comment handling behavior
|
|
*/
|
|
public enum CommentBehavior : ubyte
|
|
{
|
|
/// Comments are attached to the non-whitespace token that follows them
|
|
attach,
|
|
/// Comments are tokens, and can be returned by calls to the token range's front()
|
|
include
|
|
}
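// Usage sketch (an assumption, not from the original module): with
// CommentBehavior.attach, a preceding doc comment is stored in the `comment`
// field of the next non-whitespace token instead of being returned as its own
// token by front().
unittest
{
    auto source = cast(ubyte[]) "/// docs\nint x;".dup;
    LexerConfig config;
    config.commentBehavior = CommentBehavior.attach;
    StringCache* cache = new StringCache(StringCache.defaultBucketCount);
    auto tokens = byToken(source, config, cache);
    assert (tokens.front.type == tok!"int");
    assert (tokens.front.comment == "/// docs");
}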
|
|
|
|
public struct LexerConfig
|
|
{
|
|
string fileName;
|
|
StringBehavior stringBehavior;
|
|
WhitespaceBehavior whitespaceBehavior;
|
|
CommentBehavior commentBehavior;
|
|
SpecialTokenBehavior specialTokenBehavior;
|
|
}
|
|
|
|
public bool isBasicType(IdType type) nothrow pure @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!"int":
|
|
case tok!"uint":
|
|
case tok!"double":
|
|
case tok!"idouble":
|
|
case tok!"float":
|
|
case tok!"ifloat":
|
|
case tok!"short":
|
|
case tok!"ushort":
|
|
case tok!"long":
|
|
case tok!"ulong":
|
|
case tok!"char":
|
|
case tok!"wchar":
|
|
case tok!"dchar":
|
|
case tok!"bool":
|
|
case tok!"void":
|
|
case tok!"cent":
|
|
case tok!"ucent":
|
|
case tok!"real":
|
|
case tok!"ireal":
|
|
case tok!"byte":
|
|
case tok!"ubyte":
|
|
case tok!"cdouble":
|
|
case tok!"cfloat":
|
|
case tok!"creal":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public bool isNumberLiteral(IdType type) nothrow pure @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!"doubleLiteral":
|
|
case tok!"floatLiteral":
|
|
case tok!"idoubleLiteral":
|
|
case tok!"ifloatLiteral":
|
|
case tok!"intLiteral":
|
|
case tok!"longLiteral":
|
|
case tok!"realLiteral":
|
|
case tok!"irealLiteral":
|
|
case tok!"uintLiteral":
|
|
case tok!"ulongLiteral":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public bool isOperator(IdType type) nothrow pure @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!",":
|
|
case tok!".":
|
|
case tok!"..":
|
|
case tok!"...":
|
|
case tok!"/":
|
|
case tok!"/=":
|
|
case tok!"!":
|
|
case tok!"!<":
|
|
case tok!"!<=":
|
|
case tok!"!<>":
|
|
case tok!"!<>=":
|
|
case tok!"!=":
|
|
case tok!"!>":
|
|
case tok!"!>=":
|
|
case tok!"$":
|
|
case tok!"%":
|
|
case tok!"%=":
|
|
case tok!"&":
|
|
case tok!"&&":
|
|
case tok!"&=":
|
|
case tok!"(":
|
|
case tok!")":
|
|
case tok!"*":
|
|
case tok!"*=":
|
|
case tok!"+":
|
|
case tok!"++":
|
|
case tok!"+=":
|
|
case tok!"-":
|
|
case tok!"--":
|
|
case tok!"-=":
|
|
case tok!":":
|
|
case tok!";":
|
|
case tok!"<":
|
|
case tok!"<<":
|
|
case tok!"<<=":
|
|
case tok!"<=":
|
|
case tok!"<>":
|
|
case tok!"<>=":
|
|
case tok!"=":
|
|
case tok!"==":
|
|
case tok!"=>":
|
|
case tok!">":
|
|
case tok!">=":
|
|
case tok!">>":
|
|
case tok!">>=":
|
|
case tok!">>>":
|
|
case tok!">>>=":
|
|
case tok!"?":
|
|
case tok!"@":
|
|
case tok!"[":
|
|
case tok!"]":
|
|
case tok!"^":
|
|
case tok!"^=":
|
|
case tok!"^^":
|
|
case tok!"^^=":
|
|
case tok!"{":
|
|
case tok!"|":
|
|
case tok!"|=":
|
|
case tok!"||":
|
|
case tok!"}":
|
|
case tok!"~":
|
|
case tok!"~=":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public bool isKeyword(IdType type) pure nothrow @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!"abstract":
|
|
case tok!"alias":
|
|
case tok!"align":
|
|
case tok!"asm":
|
|
case tok!"assert":
|
|
case tok!"auto":
|
|
case tok!"body":
|
|
case tok!"break":
|
|
case tok!"case":
|
|
case tok!"cast":
|
|
case tok!"catch":
|
|
case tok!"class":
|
|
case tok!"const":
|
|
case tok!"continue":
|
|
case tok!"debug":
|
|
case tok!"default":
|
|
case tok!"delegate":
|
|
case tok!"delete":
|
|
case tok!"deprecated":
|
|
case tok!"do":
|
|
case tok!"else":
|
|
case tok!"enum":
|
|
case tok!"export":
|
|
case tok!"extern":
|
|
case tok!"false":
|
|
case tok!"final":
|
|
case tok!"finally":
|
|
case tok!"for":
|
|
case tok!"foreach":
|
|
case tok!"foreach_reverse":
|
|
case tok!"function":
|
|
case tok!"goto":
|
|
case tok!"if":
|
|
case tok!"immutable":
|
|
case tok!"import":
|
|
case tok!"in":
|
|
case tok!"inout":
|
|
case tok!"interface":
|
|
case tok!"invariant":
|
|
case tok!"is":
|
|
case tok!"lazy":
|
|
case tok!"macro":
|
|
case tok!"mixin":
|
|
case tok!"module":
|
|
case tok!"new":
|
|
case tok!"nothrow":
|
|
case tok!"null":
|
|
case tok!"out":
|
|
case tok!"override":
|
|
case tok!"package":
|
|
case tok!"pragma":
|
|
case tok!"private":
|
|
case tok!"protected":
|
|
case tok!"public":
|
|
case tok!"pure":
|
|
case tok!"ref":
|
|
case tok!"return":
|
|
case tok!"scope":
|
|
case tok!"shared":
|
|
case tok!"static":
|
|
case tok!"struct":
|
|
case tok!"super":
|
|
case tok!"switch":
|
|
case tok!"synchronized":
|
|
case tok!"template":
|
|
case tok!"this":
|
|
case tok!"throw":
|
|
case tok!"true":
|
|
case tok!"try":
|
|
case tok!"typedef":
|
|
case tok!"typeid":
|
|
case tok!"typeof":
|
|
case tok!"union":
|
|
case tok!"unittest":
|
|
case tok!"version":
|
|
case tok!"volatile":
|
|
case tok!"while":
|
|
case tok!"with":
|
|
case tok!"__DATE__":
|
|
case tok!"__EOF__":
|
|
case tok!"__FILE__":
|
|
case tok!"__FUNCTION__":
|
|
case tok!"__gshared":
|
|
case tok!"__LINE__":
|
|
case tok!"__MODULE__":
|
|
case tok!"__parameters":
|
|
case tok!"__PRETTY_FUNCTION__":
|
|
case tok!"__TIME__":
|
|
case tok!"__TIMESTAMP__":
|
|
case tok!"__traits":
|
|
case tok!"__vector":
|
|
case tok!"__VENDOR__":
|
|
case tok!"__VERSION__":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public bool isStringLiteral(IdType type) pure nothrow @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!"dstringLiteral":
|
|
case tok!"stringLiteral":
|
|
case tok!"wstringLiteral":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
public bool isProtection(IdType type) pure nothrow @safe
|
|
{
|
|
switch (type)
|
|
{
|
|
case tok!"export":
|
|
case tok!"package":
|
|
case tok!"private":
|
|
case tok!"public":
|
|
case tok!"protected":
|
|
return true;
|
|
default:
|
|
return false;
|
|
}
|
|
}
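// Sketch (assumption): the classification helpers above compose naturally with
// std.algorithm over a token range, e.g. counting protection keywords in a
// snippet.
unittest
{
    auto tokens = byToken(cast(ubyte[]) "public void f() {}".dup);
    assert (tokens.filter!(t => isProtection(t.type)).walkLength == 1);
}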
|
|
|
|
public struct DLexer
|
|
{
|
|
import core.vararg;
|
|
|
|
mixin Lexer!(Token, lexIdentifier, isSeparating, operators, dynamicTokens,
|
|
keywords, pseudoTokenHandlers);
|
|
|
|
@disable this();
|
|
|
|
this(ubyte[] range, const LexerConfig config, StringCache* cache)
|
|
{
|
|
auto r = (range.length >= 3 && range[0] == 0xef && range[1] == 0xbb && range[2] == 0xbf)
|
|
? range[3 .. $] : range;
|
|
this.range = LexerRange(r);
|
|
this.config = config;
|
|
this.cache = cache;
|
|
popFront();
|
|
}
|
|
|
|
private static bool isDocComment(string comment) pure nothrow @safe
|
|
{
|
|
return comment.length >= 3 && (comment[0 .. 3] == "///"
|
|
|| comment[0 .. 3] == "/++" || comment[0 .. 3] == "/**");
|
|
}
|
|
|
|
public void popFront() pure
|
|
{
|
|
_popFront();
|
|
string comment;
|
|
switch (front.type)
|
|
{
|
|
case tok!"comment":
|
|
if (config.commentBehavior == CommentBehavior.attach)
|
|
{
|
|
import std.string;
|
|
if (isDocComment(front.text))
|
|
{
|
|
comment = comment is null
|
|
? front.text
|
|
: format("%s\n%s", comment, front.text);
|
|
}
|
|
do _popFront(); while (front == tok!"comment");
|
|
if (front == tok!"whitespace") goto case tok!"whitespace";
|
|
if (front == tok!"specialTokenSequence") goto case tok!"specialTokenSequence";
|
|
}
|
|
break;
|
|
case tok!"whitespace":
|
|
if (config.whitespaceBehavior == WhitespaceBehavior.skip)
|
|
{
|
|
do _popFront(); while (front == tok!"whitespace");
|
|
if (front == tok!"comment") goto case tok!"comment";
|
|
if (front == tok!"specialTokenSequence") goto case tok!"specialTokenSequence";
|
|
}
|
|
break;
|
|
case tok!"specialTokenSequence":
|
|
if (config.specialTokenBehavior == SpecialTokenBehavior.skip)
|
|
{
|
|
do _popFront(); while (front == tok!"specialTokenSequence");
|
|
if (front == tok!"comment") goto case tok!"comment";
|
|
if (front == tok!"whitespace") goto case tok!"whitespace";
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
_front.comment = comment;
|
|
}
|
|
|
|
|
|
bool isWhitespace() pure /*const*/ nothrow
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case ' ':
|
|
case '\r':
|
|
case '\n':
|
|
case '\t':
|
|
return true;
|
|
case 0xe2:
|
|
auto peek = range.peek(2);
|
|
return peek.length == 2
|
|
&& peek[0] == 0x80
|
|
&& (peek[1] == 0xa8 || peek[1] == 0xa9);
|
|
default:
|
|
return false;
|
|
}
|
|
}
|
|
|
|
void popFrontWhitespaceAware() pure nothrow
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case '\r':
|
|
range.popFront();
|
|
if (!range.empty && range.front == '\n')
|
|
{
|
|
range.popFront();
|
|
range.incrementLine();
|
|
}
|
|
else
|
|
range.incrementLine();
|
|
return;
|
|
case '\n':
|
|
range.popFront();
|
|
range.incrementLine();
|
|
return;
|
|
case 0xe2:
|
|
auto lookahead = range.peek(3);
|
|
if (lookahead.length == 3 && lookahead[1] == 0x80
|
|
&& (lookahead[2] == 0xa8 || lookahead[2] == 0xa9))
|
|
{
|
|
range.popFront();
|
|
range.popFront();
|
|
range.popFront();
|
|
range.incrementLine();
|
|
return;
|
|
}
|
|
else
|
|
{
|
|
range.popFront();
|
|
return;
|
|
}
|
|
default:
|
|
range.popFront();
|
|
return;
|
|
}
|
|
}
|
|
|
|
Token lexWhitespace() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
version (none) while (index + 16 <= range.bytes.length)
|
|
{
|
|
ulong startAddr = (cast(ulong) range.bytes.ptr) + index;
|
|
enum space = (cast(ulong) ' ') * 0x0101010101010101L;
|
|
enum tab = (cast(ulong) '\t') * 0x0101010101010101L;
|
|
enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
|
|
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
|
|
ulong charsSkipped;
|
|
ulong lineIncrement;
|
|
asm
|
|
{
|
|
mov R10, space;
|
|
mov R11, tab;
|
|
mov R12, cr;
|
|
mov R13, lf;
|
|
mov R8, startAddr;
|
|
movdqu XMM0, [R8];
|
|
|
|
mov R9, line;
|
|
|
|
// space pattern
|
|
movq XMM1, R10;
|
|
shufpd XMM1, XMM1, 0;
|
|
pcmpeqb XMM1, XMM0;
|
|
|
|
// tab pattern
|
|
movq XMM2, R11;
|
|
shufpd XMM2, XMM2, 0;
|
|
pcmpeqb XMM2, XMM0;
|
|
|
|
// CR pattern
|
|
movq XMM3, R12;
|
|
shufpd XMM3, XMM3, 0;
|
|
pcmpeqb XMM3, XMM0;
|
|
|
|
// LF pattern
|
|
movq XMM4, R13;
|
|
shufpd XMM4, XMM4, 0;
|
|
pcmpeqb XMM4, XMM0;
|
|
|
|
// Bit mask of newlines to R10
|
|
pmovmskb R10, XMM4;
|
|
|
|
// OR the masks together
|
|
por XMM1, XMM2;
|
|
por XMM1, XMM3;
|
|
por XMM1, XMM4;
|
|
pmovmskb RAX, XMM1;
|
|
not RAX;
|
|
bsf RCX, RAX;
|
|
mov charsSkipped, RCX;
|
|
|
|
mov RBX, 1;
|
|
inc CL;
|
|
shl RBX, CL;
|
|
sub RBX, 1;
|
|
and R10, RBX;
|
|
popcnt R10, R10;
|
|
mov lineIncrement, R10;
|
|
}
|
|
range.incrementLine(lineIncrement);
|
|
range.popFrontN(charsSkipped);
|
|
if (charsSkipped < 16)
|
|
goto end;
|
|
index += 16;
|
|
}
|
|
loop: do
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case '\r':
|
|
range.popFront();
|
|
if (!range.empty && range.front == '\n')
|
|
range.popFront();
|
|
range.incrementLine();
|
|
break;
|
|
case '\n':
|
|
range.popFront();
|
|
range.incrementLine();
|
|
break;
|
|
case ' ':
|
|
case '\t':
|
|
range.popFront();
|
|
break;
|
|
case 0xe2:
|
|
auto lookahead = range.peek(3);
|
|
if (lookahead.length != 3)
|
|
break loop;
|
|
if (lookahead[1] != 0x80)
|
|
break loop;
|
|
if (lookahead[2] == 0xa8 || lookahead[2] == 0xa9)
|
|
{
|
|
range.popFront();
|
|
range.popFront();
|
|
range.popFront();
|
|
range.incrementLine();
|
|
break;
|
|
}
|
|
break loop;
|
|
default:
|
|
break loop;
|
|
}
|
|
} while (!range.empty);
|
|
end:
|
|
string text = config.whitespaceBehavior == WhitespaceBehavior.skip
|
|
? null : cache.intern(range.slice(mark));
|
|
return Token(tok!"whitespace", text, line, column, index);
|
|
}
|
|
|
|
Token lexNumber() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
if (range.front == '0' && range.canPeek(1))
|
|
{
|
|
auto ahead = range.peek(1)[1];
|
|
switch (ahead)
|
|
{
|
|
case 'x':
|
|
case 'X':
|
|
range.popFront();
|
|
range.popFront();
|
|
return lexHex(mark, line, column, index);
|
|
case 'b':
|
|
case 'B':
|
|
range.popFront();
|
|
range.popFront();
|
|
return lexBinary(mark, line, column, index);
|
|
default:
|
|
return lexDecimal(mark, line, column, index);
|
|
}
|
|
}
|
|
else
|
|
return lexDecimal(mark, line, column, index);
|
|
}
|
|
|
|
Token lexHex() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
return lexHex(mark, line, column, index);
|
|
}
|
|
|
|
Token lexHex(size_t mark, size_t line, size_t column, size_t index) pure nothrow
|
|
{
|
|
IdType type = tok!"intLiteral";
|
|
bool foundDot;
|
|
hexLoop: while (!range.empty)
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case 'a': .. case 'f':
|
|
case 'A': .. case 'F':
|
|
case '0': .. case '9':
|
|
case '_':
|
|
range.popFront();
|
|
break;
|
|
case 'u':
|
|
case 'U':
|
|
lexIntSuffix(type);
|
|
break hexLoop;
|
|
case 'i':
|
|
if (foundDot)
|
|
lexFloatSuffix(type);
|
|
break hexLoop;
|
|
case 'L':
|
|
if (foundDot)
|
|
lexFloatSuffix(type);
|
|
else
|
|
lexIntSuffix(type);
|
|
break hexLoop;
|
|
case 'p':
|
|
case 'P':
|
|
lexExponent(type);
|
|
break hexLoop;
|
|
case '.':
|
|
if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
|
|
break hexLoop;
|
|
else
|
|
{
|
|
// The following bit of silliness tries to tell the
|
|
// difference between "int dot identifier" and
|
|
// "double identifier".
|
|
if (range.canPeek(1))
|
|
{
|
|
switch (range.peekAt(1))
|
|
{
|
|
case '0': .. case '9':
|
|
case 'A': .. case 'F':
|
|
case 'a': .. case 'f':
|
|
goto doubleLiteral;
|
|
default:
|
|
break hexLoop;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
doubleLiteral:
|
|
range.popFront();
|
|
foundDot = true;
|
|
type = tok!"doubleLiteral";
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
break hexLoop;
|
|
}
|
|
}
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexBinary() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
return lexBinary(mark, line, column, index);
|
|
}
|
|
|
|
Token lexBinary(size_t mark, size_t line, size_t column, size_t index) pure nothrow
|
|
{
|
|
IdType type = tok!"intLiteral";
|
|
binaryLoop: while (!range.empty)
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case '0':
|
|
case '1':
|
|
case '_':
|
|
range.popFront();
|
|
break;
|
|
case 'u':
|
|
case 'U':
|
|
case 'L':
|
|
lexIntSuffix(type);
|
|
break binaryLoop;
|
|
default:
|
|
break binaryLoop;
|
|
}
|
|
}
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexDecimal() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
return lexDecimal(mark, line, column, index);
|
|
}
|
|
|
|
Token lexDecimal(size_t mark, size_t line, size_t column, size_t index) pure nothrow
|
|
{
|
|
bool foundDot = range.front == '.';
|
|
IdType type = tok!"intLiteral";
|
|
if (foundDot)
|
|
{
|
|
range.popFront();
|
|
type = tok!"doubleLiteral";
|
|
}
|
|
|
|
decimalLoop: while (!range.empty)
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case '0': .. case '9':
|
|
case '_':
|
|
range.popFront();
|
|
break;
|
|
case 'u':
|
|
case 'U':
|
|
if (!foundDot)
|
|
lexIntSuffix(type);
|
|
break decimalLoop;
|
|
case 'i':
|
|
lexFloatSuffix(type);
|
|
break decimalLoop;
|
|
case 'L':
|
|
if (foundDot)
|
|
lexFloatSuffix(type);
|
|
else
|
|
lexIntSuffix(type);
|
|
break decimalLoop;
|
|
case 'f':
|
|
case 'F':
|
|
lexFloatSuffix(type);
|
|
break decimalLoop;
|
|
case 'e':
|
|
case 'E':
|
|
lexExponent(type);
|
|
break decimalLoop;
|
|
case '.':
|
|
if (foundDot || !range.canPeek(1) || range.peekAt(1) == '.')
|
|
break decimalLoop;
|
|
else
|
|
{
|
|
// The following bit of silliness tries to tell the
|
|
// difference between "int dot identifier" and
|
|
// "double identifier".
|
|
if (range.canPeek(1))
|
|
{
|
|
auto ch = range.peekAt(1);
|
|
if (ch <= 0x2f
|
|
|| (ch >= '0' && ch <= '9')
|
|
|| (ch >= ':' && ch <= '@')
|
|
|| (ch >= '[' && ch <= '^')
|
|
|| (ch >= '{' && ch <= '~')
|
|
|| ch == '`' || ch == '_')
|
|
{
|
|
goto doubleLiteral;
|
|
}
|
|
else
|
|
break decimalLoop;
|
|
}
|
|
else
|
|
{
|
|
doubleLiteral:
|
|
range.popFront();
|
|
foundDot = true;
|
|
type = tok!"doubleLiteral";
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
break decimalLoop;
|
|
}
|
|
}
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
void lexIntSuffix(ref IdType type) pure nothrow @safe
|
|
{
|
|
bool secondPass;
|
|
if (range.front == 'u' || range.front == 'U')
|
|
{
|
|
U:
|
|
if (type == tok!"intLiteral")
|
|
type = tok!"uintLiteral";
|
|
else
|
|
type = tok!"ulongLiteral";
|
|
range.popFront();
|
|
if (secondPass)
|
|
return;
|
|
if (range.front == 'L' || range.front == 'l')
|
|
goto L;
|
|
return;
|
|
}
|
|
if (range.front == 'L' || range.front == 'l')
|
|
{
|
|
L:
|
|
if (type == tok!"uintLiteral")
|
|
type = tok!"ulongLiteral";
|
|
else
|
|
type = tok!"longLiteral";
|
|
range.popFront();
|
|
if (range.front == 'U' || range.front == 'u')
|
|
{
|
|
secondPass = true;
|
|
goto U;
|
|
}
|
|
return;
|
|
}
|
|
}
|
|
|
|
void lexFloatSuffix(ref IdType type) pure nothrow @safe
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case 'L':
|
|
range.popFront();
|
|
type = tok!"doubleLiteral";
|
|
break;
|
|
case 'f':
|
|
case 'F':
|
|
range.popFront();
|
|
type = tok!"floatLiteral";
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
if (!range.empty && range.front == 'i')
|
|
{
|
|
warning("Complex number literals are deprecated");
|
|
range.popFront();
|
|
if (type == tok!"floatLiteral")
|
|
type = tok!"ifloatLiteral";
|
|
else
|
|
type = tok!"idoubleLiteral";
|
|
}
|
|
}
|
|
|
|
void lexExponent(ref IdType type) pure nothrow @safe
|
|
{
|
|
range.popFront();
|
|
bool foundSign = false;
|
|
bool foundDigit = false;
|
|
while (!range.empty)
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case '-':
|
|
case '+':
|
|
if (foundSign)
|
|
{
|
|
if (!foundDigit)
|
|
error("Expected an exponent");
|
|
return;
|
|
}
|
|
foundSign = true;
|
|
range.popFront();
|
|
break;
|
|
case '0': .. case '9':
|
|
case '_':
|
|
foundDigit = true;
|
|
range.popFront();
|
|
break;
|
|
case 'L':
|
|
case 'f':
|
|
case 'F':
|
|
case 'i':
|
|
lexFloatSuffix(type);
|
|
return;
|
|
default:
|
|
if (!foundDigit)
|
|
error("Expected an exponent");
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
|
|
Token lexScriptLine() pure
|
|
{
|
|
mixin (tokenStart);
|
|
while (!range.empty && !isNewline)
|
|
range.popFront();
|
|
return Token(tok!"scriptLine", cache.intern(range.slice(mark)),
|
|
line, column, index);
|
|
}
|
|
|
|
Token lexSpecialTokenSequence() pure
|
|
{
|
|
mixin (tokenStart);
|
|
while (!range.empty && !isNewline)
|
|
range.popFront();
|
|
return Token(tok!"specialTokenSequence", cache.intern(range.slice(mark)),
|
|
line, column, index);
|
|
}
|
|
|
|
Token lexSlashStarComment() pure
|
|
{
|
|
mixin (tokenStart);
|
|
IdType type = tok!"comment";
|
|
range.popFrontN(2);
|
|
version (none) while (range.index + 16 <= range.bytes.length)
|
|
{
|
|
ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
|
|
enum slash = (cast(ulong) '/') * 0x0101010101010101L;
|
|
enum star = (cast(ulong) '*') * 0x0101010101010101L;
|
|
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
|
|
ulong charsSkipped;
|
|
ulong newlineCount;
|
|
bool done;
|
|
asm
|
|
{
|
|
mov RAX, startAddress;
|
|
movdqu XMM0, [RAX];
|
|
|
|
mov R10, lf;
|
|
movq XMM2, R10;
|
|
shufpd XMM2, XMM2, 0;
|
|
pcmpeqb XMM2, XMM0;
|
|
pmovmskb R15, XMM2;
|
|
|
|
mov R10, star;
|
|
movq XMM3, R10;
|
|
shufpd XMM3, XMM3, 0;
|
|
pcmpeqb XMM3, XMM0;
|
|
pmovmskb R8, XMM3;
|
|
|
|
mov R10, slash;
|
|
movq XMM4, R10;
|
|
shufpd XMM4, XMM4, 0;
|
|
pcmpeqb XMM4, XMM0;
|
|
pmovmskb R9, XMM4;
|
|
loop:
|
|
cmp R8, 0;
|
|
je notFound;
|
|
cmp R9, 0;
|
|
je notFound;
|
|
bsf RAX, R8; // stIndex
|
|
bsf RBX, R9; // slIndex
|
|
mov RDX, RAX;
|
|
inc RDX;
|
|
cmp RDX, RBX;
|
|
je found;
|
|
cmp RAX, RBX;
|
|
jae maskSlash;
|
|
maskStar:
|
|
mov RCX, RAX;
|
|
mov R10, 1;
|
|
shl R10, CL;
|
|
xor R8, R10;
|
|
jmp loop;
|
|
maskSlash:
|
|
mov RCX, RBX;
|
|
mov R10, 1;
|
|
shl R10, CL;
|
|
xor R9, R10;
|
|
jmp loop;
|
|
notFound:
|
|
mov R14, 16;
|
|
mov charsSkipped, R14;
|
|
popcnt R14, R15;
|
|
mov newlineCount, R14;
|
|
jmp asmEnd;
|
|
found:
|
|
inc RBX;
|
|
mov charsSkipped, RBX;
|
|
mov RAX, 1;
|
|
mov done, AL;
|
|
mov RCX, RBX;
|
|
mov RBX, 1;
|
|
shl RBX, CL;
|
|
dec RBX;
|
|
and R15, RBX;
|
|
popcnt R14, R15;
|
|
mov newlineCount, R14;
|
|
asmEnd:
|
|
nop;
|
|
}
|
|
range.popFrontN(charsSkipped);
|
|
range.incrementLine(newlineCount);
|
|
if (done)
|
|
goto end;
|
|
}
|
|
while (!range.empty)
|
|
{
|
|
if (range.front == '*')
|
|
{
|
|
range.popFront();
|
|
if (!range.empty && range.front == '/')
|
|
{
|
|
range.popFront();
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
end:
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexSlashSlashComment() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
IdType type = tok!"comment";
|
|
range.popFrontN(2);
|
|
version (none) while (range.index + 16 <= range.bytes.length)
|
|
{
|
|
ulong startAddress = cast(ulong) range.bytes.ptr + range.index;
|
|
enum cr = (cast(ulong) '\r') * 0x0101010101010101L;
|
|
enum lf = (cast(ulong) '\n') * 0x0101010101010101L;
|
|
ulong charsSkipped;
|
|
asm
|
|
{
|
|
mov RAX, startAddress;
|
|
movdqu XMM0, [RAX];
|
|
|
|
mov R10, cr;
|
|
movq XMM1, R10;
|
|
shufpd XMM1, XMM1, 0;
|
|
pcmpeqb XMM1, XMM0;
|
|
|
|
mov R10, lf;
|
|
movq XMM2, R10;
|
|
shufpd XMM2, XMM2, 0;
|
|
pcmpeqb XMM2, XMM0;
|
|
|
|
por XMM1, XMM2;
|
|
pmovmskb RBX, XMM1;
|
|
bsf RCX, RBX;
|
|
mov RDX, 16;
|
|
cmp RBX, 0;
|
|
cmove RCX, RDX;
|
|
mov charsSkipped, RCX;
|
|
|
|
}
|
|
if (charsSkipped < 16)
|
|
{
|
|
index += charsSkipped;
|
|
column += charsSkipped;
|
|
range.popFrontN(charsSkipped);
|
|
goto end;
|
|
}
|
|
else
|
|
{
|
|
assert (charsSkipped == 16);
|
|
index += 16;
|
|
column += 16;
|
|
range.popFrontN(16);
|
|
}
|
|
}
|
|
while (!range.empty)
|
|
{
|
|
if (range.front == '\r' || range.front == '\n')
|
|
break;
|
|
range.popFront();
|
|
}
|
|
end:
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexSlashPlusComment() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
IdType type = tok!"comment";
|
|
range.popFront();
|
|
range.popFront();
|
|
int depth = 1;
|
|
while (depth > 0 && !range.empty)
|
|
{
|
|
if (range.front == '+')
|
|
{
|
|
range.popFront();
|
|
if (!range.empty && range.front == '/')
|
|
{
|
|
range.popFront();
|
|
depth--;
|
|
}
|
|
}
|
|
else if (range.front == '/')
|
|
{
|
|
range.popFront();
|
|
if (!range.empty && range.front == '+')
|
|
{
|
|
range.popFront();
|
|
depth++;
|
|
}
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexStringLiteral() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
range.popFront();
|
|
while (true)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: unterminated string literal");
|
|
return Token();
|
|
}
|
|
else if (range.front == '"')
|
|
{
|
|
range.popFront();
|
|
break;
|
|
}
|
|
else if (range.front == '\\')
|
|
{
|
|
lexEscapeSequence();
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
IdType type = tok!"stringLiteral";
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
Token lexWysiwygString() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
IdType type = tok!"stringLiteral";
|
|
bool backtick = range.front == '`';
|
|
if (backtick)
|
|
{
|
|
range.popFront();
|
|
while (true)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: unterminated string literal");
|
|
return Token(tok!"");
|
|
}
|
|
else if (range.front == '`')
|
|
{
|
|
range.popFront();
|
|
break;
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
}
|
|
else
|
|
{
|
|
range.popFront();
|
|
if (range.empty)
|
|
{
|
|
error("Error: unterminated string literal");
|
|
return Token(tok!"");
|
|
}
|
|
range.popFront();
|
|
while (true)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: unterminated string literal");
|
|
return Token(tok!"");
|
|
}
|
|
else if (range.front == '"')
|
|
{
|
|
range.popFront();
|
|
break;
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
}
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
void lexStringSuffix(ref IdType type) pure nothrow
|
|
{
|
|
if (range.empty)
|
|
type = tok!"stringLiteral";
|
|
else
|
|
{
|
|
switch (range.front)
|
|
{
|
|
case 'w': range.popFront(); type = tok!"wstringLiteral"; break;
|
|
case 'd': range.popFront(); type = tok!"dstringLiteral"; break;
|
|
case 'c': range.popFront(); type = tok!"stringLiteral"; break;
|
|
default: type = tok!"stringLiteral"; break;
|
|
}
|
|
}
|
|
}
|
|
|
|
Token lexDelimitedString() pure nothrow
|
|
{
|
|
import std.traits;
|
|
mixin (tokenStart);
|
|
range.popFront();
|
|
range.popFront();
|
|
ubyte open;
|
|
ubyte close;
|
|
switch (range.front)
|
|
{
|
|
case '<':
|
|
open = '<';
|
|
close = '>';
|
|
range.popFront();
|
|
return lexNormalDelimitedString(mark, line, column, index, open, close);
|
|
case '{':
|
|
open = '{';
|
|
close = '}';
|
|
range.popFront();
|
|
return lexNormalDelimitedString(mark, line, column, index, open, close);
|
|
case '[':
|
|
open = '[';
|
|
close = ']';
|
|
range.popFront();
|
|
return lexNormalDelimitedString(mark, line, column, index, open, close);
|
|
case '(':
|
|
open = '(';
|
|
close = ')';
|
|
range.popFront();
|
|
return lexNormalDelimitedString(mark, line, column, index, open, close);
|
|
default:
|
|
return lexHeredocString(mark, line, column, index);
|
|
}
|
|
}
|
|
|
|
Token lexNormalDelimitedString(size_t mark, size_t line, size_t column,
|
|
size_t index, ubyte open, ubyte close)
|
|
pure nothrow
|
|
{
|
|
int depth = 1;
|
|
while (!range.empty && depth > 0)
|
|
{
|
|
if (range.front == open)
|
|
{
|
|
depth++;
|
|
range.popFront();
|
|
}
|
|
else if (range.front == close)
|
|
{
|
|
depth--;
|
|
range.popFront();
|
|
if (depth <= 0)
|
|
{
|
|
if (range.front == '"')
|
|
range.popFront();
|
|
else
|
|
{
|
|
error("Error: \" expected to end delimited string literal");
|
|
return Token(tok!"");
|
|
}
|
|
}
|
|
}
|
|
else
|
|
popFrontWhitespaceAware();
|
|
}
|
|
IdType type = tok!"stringLiteral";
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(range.slice(mark)), line, column, index);
|
|
}
|
|
|
|
Token lexHeredocString(size_t mark, size_t line, size_t column, size_t index)
|
|
pure nothrow
|
|
{
|
|
import std.regex;
|
|
Token ident = lexIdentifier();
|
|
if (isNewline())
|
|
popFrontWhitespaceAware();
|
|
else
|
|
error("Newline expected");
|
|
while (!range.empty)
|
|
{
|
|
if (isNewline())
|
|
{
|
|
popFrontWhitespaceAware();
|
|
if (!range.canPeek(ident.text.length))
|
|
{
|
|
error(ident.text ~ " expected");
|
|
break;
|
|
}
|
|
if (range.peek(ident.text.length - 1) == ident.text)
|
|
{
|
|
range.popFrontN(ident.text.length);
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
range.popFront();
|
|
}
|
|
if (!range.empty() && range.front == '"')
|
|
range.popFront();
|
|
else
|
|
error(`" expected`);
|
|
IdType type = tok!"stringLiteral";
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(range.slice(mark)), line, column, index);
|
|
}
|
|
|
|
Token lexTokenString() pure
|
|
{
|
|
mixin (tokenStart);
|
|
assert (range.front == 'q');
|
|
range.popFront();
|
|
assert (range.front == '{');
|
|
range.popFront();
|
|
auto app = appender!string();
|
|
app.put("q{");
|
|
int depth = 1;
|
|
|
|
LexerConfig c = config;
|
|
scope(exit) config = c;
|
|
config.whitespaceBehavior = WhitespaceBehavior.include;
|
|
config.stringBehavior = StringBehavior.source;
|
|
config.commentBehavior = CommentBehavior.include;
|
|
|
|
_front = advance();
|
|
while (depth > 0 && !empty)
|
|
{
|
|
auto t = front();
|
|
if (t.text is null)
|
|
app.put(str(t.type));
|
|
else
|
|
app.put(t.text);
|
|
if (t.type == tok!"}")
|
|
{
|
|
depth--;
|
|
if (depth > 0)
|
|
popFront();
|
|
}
|
|
else if (t.type == tok!"{")
|
|
{
|
|
depth++;
|
|
popFront();
|
|
}
|
|
else
|
|
popFront();
|
|
}
|
|
IdType type = tok!"stringLiteral";
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(cast(const(ubyte)[]) app.data), line,
|
|
column, index);
|
|
}
|
|
|
|
Token lexHexString() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
range.popFront();
|
|
range.popFront();
|
|
|
|
loop: while (true)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: unterminated hex string literal");
|
|
return Token();
|
|
}
|
|
else if (isWhitespace())
|
|
popFrontWhitespaceAware();
|
|
else switch (range.front)
|
|
{
|
|
case '0': .. case '9':
|
|
case 'A': .. case 'F':
|
|
case 'a': .. case 'f':
|
|
range.popFront();
|
|
break;
|
|
case '"':
|
|
range.popFront();
|
|
break loop;
|
|
default:
|
|
error("Error: invalid character in hex string");
|
|
return Token();
|
|
}
|
|
}
|
|
|
|
IdType type = tok!"stringLiteral";
|
|
lexStringSuffix(type);
|
|
return Token(type, cache.intern(range.slice(mark)), line, column,
|
|
index);
|
|
}
|
|
|
|
bool lexEscapeSequence() pure nothrow
|
|
{
|
|
range.popFront();
|
|
if (range.empty)
|
|
{
|
|
error("Error: non-terminated character escape sequence.");
|
|
return false;
|
|
}
|
|
switch (range.front)
|
|
{
|
|
case '\'':
|
|
case '"':
|
|
case '?':
|
|
case '\\':
|
|
case '0':
|
|
case 'a':
|
|
case 'b':
|
|
case 'f':
|
|
case 'n':
|
|
case 'r':
|
|
case 't':
|
|
case 'v':
|
|
range.popFront();
|
|
break;
|
|
case 'x':
|
|
range.popFront();
|
|
foreach (i; 0 .. 2)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: 2 hex digits expected.");
|
|
return false;
|
|
}
|
|
switch (range.front)
|
|
{
|
|
case '0': .. case '9':
|
|
case 'a': .. case 'f':
|
|
case 'A': .. case 'F':
|
|
range.popFront();
|
|
break;
|
|
default:
|
|
error("Error: 2 hex digits expected.");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
case '1': .. case '7':
|
|
for (size_t i = 0; i < 3 && !range.empty && range.front >= '0' && range.front <= '7'; i++)
|
|
range.popFront();
|
|
break;
|
|
case 'u':
|
|
range.popFront();
|
|
foreach (i; 0 .. 4)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: at least 4 hex digits expected.");
|
|
return false;
|
|
}
|
|
switch (range.front)
|
|
{
|
|
case '0': .. case '9':
|
|
case 'a': .. case 'f':
|
|
case 'A': .. case 'F':
|
|
range.popFront();
|
|
break;
|
|
default:
|
|
error("Error: at least 4 hex digits expected.");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
case 'U':
|
|
range.popFront();
|
|
foreach (i; 0 .. 8)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: at least 8 hex digits expected.");
|
|
return false;
|
|
}
|
|
switch (range.front)
|
|
{
|
|
case '0': .. case '9':
|
|
case 'a': .. case 'f':
|
|
case 'A': .. case 'F':
|
|
range.popFront();
|
|
break;
|
|
default:
|
|
error("Error: at least 8 hex digits expected.");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
while (true)
|
|
{
|
|
if (range.empty)
|
|
{
|
|
error("Error: non-terminated character escape sequence.");
|
|
return false;
|
|
}
|
|
if (range.front == ';')
|
|
{
|
|
range.popFront();
|
|
break;
|
|
}
|
|
else
|
|
range.popFront();
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
Token lexCharacterLiteral() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
range.popFront();
|
|
if (range.front == '\\')
|
|
{
|
|
lexEscapeSequence();
|
|
goto close;
|
|
}
|
|
else if (range.front == '\'')
|
|
{
|
|
range.popFront();
|
|
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
|
|
line, column, index);
|
|
}
|
|
else if (range.front & 0x80)
|
|
{
|
|
while (range.front & 0x80)
|
|
range.popFront();
|
|
goto close;
|
|
}
|
|
else
|
|
{
|
|
popFrontWhitespaceAware();
|
|
goto close;
|
|
}
|
|
close:
|
|
if (range.front == '\'')
|
|
{
|
|
range.popFront();
|
|
return Token(tok!"characterLiteral", cache.intern(range.slice(mark)),
|
|
line, column, index);
|
|
}
|
|
else
|
|
{
|
|
error("Error: Expected ' to end character literal");
|
|
return Token();
|
|
}
|
|
}
|
|
|
|
Token lexIdentifier() pure nothrow
|
|
{
|
|
import std.stdio;
|
|
mixin (tokenStart);
|
|
uint hash = 0;
|
|
if (isSeparating(0) || range.empty)
|
|
{
|
|
error("Invalid identifier");
|
|
range.popFront();
|
|
}
|
|
while (!range.empty && !isSeparating(0))
|
|
{
|
|
hash = StringCache.hashStep(range.front, hash);
|
|
range.popFront();
|
|
}
|
|
return Token(tok!"identifier", cache.intern(range.slice(mark), hash), line,
|
|
column, index);
|
|
}
|
|
|
|
Token lexDot() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
if (!range.canPeek(1))
|
|
{
|
|
range.popFront();
|
|
return Token(tok!".", null, line, column, index);
|
|
}
|
|
switch (range.peekAt(1))
|
|
{
|
|
case '0': .. case '9':
|
|
return lexNumber();
|
|
case '.':
|
|
range.popFront();
|
|
range.popFront();
|
|
if (!range.empty && range.front == '.')
|
|
{
|
|
range.popFront();
|
|
return Token(tok!"...", null, line, column, index);
|
|
}
|
|
else
|
|
return Token(tok!"..", null, line, column, index);
|
|
default:
|
|
range.popFront();
|
|
return Token(tok!".", null, line, column, index);
|
|
}
|
|
}
|
|
|
|
Token lexLongNewline() pure nothrow
|
|
{
|
|
mixin (tokenStart);
|
|
range.popFront();
|
|
range.popFront();
|
|
range.popFront();
|
|
range.incrementLine();
|
|
return Token(tok!"whitespace", cache.intern(range.slice(mark)), line,
|
|
column, index);
|
|
}
|
|
|
|
bool isNewline() pure @safe nothrow
|
|
{
|
|
if (range.front == '\n') return true;
|
|
if (range.front == '\r') return true;
|
|
return (range.front & 0x80) && range.canPeek(2)
|
|
&& (range.peek(2) == "\u2028" || range.peek(2) == "\u2029");
|
|
}
|
|
|
|
bool isSeparating(size_t offset) pure nothrow @safe
|
|
{
|
|
if (!range.canPeek(offset)) return true;
|
|
auto c = range.peekAt(offset);
|
|
if (c >= 'A' && c <= 'Z') return false;
|
|
if (c >= 'a' && c <= 'z') return false;
|
|
if (c <= 0x2f) return true;
|
|
if (c >= ':' && c <= '@') return true;
|
|
if (c >= '[' && c <= '^') return true;
|
|
if (c >= '{' && c <= '~') return true;
|
|
if (c == '`') return true;
|
|
if (c & 0x80)
|
|
{
|
|
auto r = range;
|
|
range.popFrontN(offset);
|
|
return (r.canPeek(2) && (r.peek(2) == "\u2028"
|
|
|| r.peek(2) == "\u2029"));
|
|
}
|
|
return false;
|
|
}
|
|
|
|
enum tokenStart = q{
|
|
size_t index = range.index;
|
|
size_t column = range.column;
|
|
size_t line = range.line;
|
|
auto mark = range.mark();
|
|
};
|
|
|
|
void error(string message) pure nothrow @safe
|
|
{
|
|
messages ~= Message(range.line, range.column, message, true);
|
|
}
|
|
|
|
void warning(string message) pure nothrow @safe
|
|
{
|
|
messages ~= Message(range.line, range.column, message, false);
|
|
assert (messages.length > 0);
|
|
}
|
|
|
|
struct Message
|
|
{
|
|
size_t line;
|
|
size_t column;
|
|
string message;
|
|
bool isError;
|
|
}
|
|
|
|
Message[] messages;
|
|
StringCache* cache;
|
|
LexerConfig config;
|
|
}
|
|
|
|
public auto byToken(ubyte[] range)
|
|
{
|
|
LexerConfig config;
|
|
StringCache* cache = new StringCache(StringCache.defaultBucketCount);
|
|
return DLexer(range, config, cache);
|
|
}
|
|
|
|
public auto byToken(ubyte[] range, StringCache* cache)
|
|
{
|
|
LexerConfig config;
|
|
return DLexer(range, config, cache);
|
|
}
|
|
|
|
public auto byToken(ubyte[] range, const LexerConfig config, StringCache* cache)
|
|
{
|
|
return DLexer(range, config, cache);
|
|
}
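// Sketch (assumption, not from the original module): a single StringCache can
// be shared between byToken calls so identical strings are interned only once.
unittest
{
    StringCache cache = StringCache(StringCache.defaultBucketCount);
    auto a = byToken(cast(ubyte[]) "int x;".dup, &cache);
    auto b = byToken(cast(ubyte[]) "int y;".dup, &cache);
    assert (a.front.type == tok!"int" && b.front.type == tok!"int");
}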
|
|
|
|
/**
|
|
* Removes "decoration" such as leading whitespace, leading + and * characters,
|
|
* and places the result into the given output range
|
|
*/
|
|
public void unDecorateComment(T)(string comment, auto ref T outputRange)
|
|
if (isOutputRange!(T, string))
|
|
in
|
|
{
|
|
assert (comment.length >= 3);
|
|
}
|
|
body
|
|
{
|
|
switch (comment[0 .. 3])
|
|
{
|
|
case "///":
|
|
size_t i = 3;
|
|
while (comment[i] == ' ' || comment[i] == '\t')
|
|
i++;
|
|
outputRange.put(comment[i .. $]);
|
|
break;
|
|
case "/++":
|
|
case "/**":
|
|
size_t i = 3;
|
|
immutable char c = comment[1];
|
|
// Skip leading * and + characters
|
|
while (comment[i] == c) i++;
|
|
// Skip trailing * and + characters
|
|
size_t j = comment.length - 2;
|
|
while (j > i && comment[j] == c)
|
|
j--;
|
|
while (j > i && (comment[j] == ' ' || comment[j] == '\t'))
|
|
j--;
|
|
if (comment[i] == '\r') i++;
|
|
if (comment[i] == '\n') i++;
|
|
while (comment[i] == ' ' || comment[i] == '\t') i++;
|
|
immutable bool skipBeginningChar = comment[i] == c;
|
|
if (skipBeginningChar)
|
|
i++;
|
|
size_t whitespaceToSkip;
|
|
while (comment[i] == ' ' || comment[i] == '\t')
|
|
{
|
|
whitespaceToSkip++;
|
|
i++;
|
|
}
|
|
size_t l = i;
|
|
while (i < j)
|
|
{
|
|
if (comment[i++] == '\n')
|
|
break;
|
|
}
|
|
outputRange.put(comment[l .. i]);
|
|
while (true)
|
|
{
|
|
if (skipBeginningChar)
|
|
{
|
|
while (i < j && (comment[i] == ' ' || comment[i] == '\t')) i++;
|
|
if (i < j && comment[i] == c) i++;
|
|
}
|
|
for (size_t s = 0; (i < j) && (s <= whitespaceToSkip)
|
|
&& (comment[i] == ' ' || comment[i] == '\t');)
|
|
{
|
|
s++;
|
|
i++;
|
|
}
|
|
size_t k = i;
|
|
inner: while (k < j)
|
|
{
|
|
if (comment[k] == '\n')
|
|
{
|
|
k++;
|
|
break inner;
|
|
}
|
|
k++;
|
|
}
|
|
outputRange.put(comment[i .. k]);
|
|
i = k;
|
|
if (i >= j)
|
|
break;
|
|
}
|
|
break;
|
|
default:
|
|
assert (false, "Invalid doc comment");
|
|
}
|
|
}
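// Example sketch (assumption): strip the leading "/// " decoration from a
// single-line documentation comment into an appender.
unittest
{
    import std.array : appender;
    auto app = appender!string();
    unDecorateComment("/// One line of documentation", app);
    assert (app.data == "One line of documentation");
}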
|
|
|
|
|
|
struct StringCache
|
|
{
|
|
public:
|
|
|
|
@disable this();
|
|
|
|
/**
|
|
* Params: bucketCount = the initial number of buckets. Must be a
|
|
* power of two
|
|
*/
|
|
this(size_t bucketCount)
|
|
{
|
|
buckets = (cast(Node**) calloc((Node*).sizeof, bucketCount))[0 .. bucketCount];
|
|
}
|
|
|
|
~this()
|
|
{
|
|
Block* current = rootBlock;
|
|
while (current !is null)
|
|
{
|
|
Block* prev = current;
|
|
current = current.next;
|
|
free(cast(void*) prev.bytes.ptr);
|
|
free(cast(void*) prev);
|
|
}
|
|
foreach (nodePointer; buckets)
|
|
{
|
|
Node* currentNode = nodePointer;
|
|
while (currentNode !is null)
|
|
{
|
|
Node* prev = currentNode;
|
|
currentNode = currentNode.next;
|
|
free(prev);
|
|
}
|
|
}
|
|
rootBlock = null;
|
|
free(buckets.ptr);
|
|
buckets = null;
|
|
}
|
|
|
|
/**
|
|
* Caches a string.
|
|
*/
|
|
string intern(const(ubyte)[] str) pure nothrow @safe
|
|
{
|
|
if (str is null || str.length == 0)
|
|
return "";
|
|
immutable uint hash = hashBytes(str);
|
|
return intern(str, hash);
|
|
}
|
|
|
|
/**
|
|
* ditto
|
|
*/
|
|
string intern(string str) pure nothrow @trusted
|
|
{
|
|
return intern(cast(ubyte[]) str);
|
|
}
|
|
|
|
/**
|
|
* Caches a string as above, but uses the given hash code instead of
|
|
* calculating one itself. Using this alongside $(LREF hashStep)() can reduce the
|
|
* amount of work necessary when lexing dynamic tokens.
|
|
*/
|
|
string intern(const(ubyte)[] str, uint hash) pure nothrow @safe
|
|
in
|
|
{
|
|
assert (str.length > 0);
|
|
}
|
|
body
|
|
{
|
|
return _intern(str, hash);
|
|
// string s = _intern(str, hash);
|
|
// size_t* ptr = s in debugMap;
|
|
// if (ptr is null)
|
|
// debugMap[s] = cast(size_t) s.ptr;
|
|
// else
|
|
// assert (*ptr == cast(size_t) s.ptr);
|
|
// return s;
|
|
}
|
|
|
|
/**
|
|
* Incremental hashing.
|
|
* Params:
|
|
* b = the byte to add to the hash
|
|
* h = the hash that has been calculated so far
|
|
* Returns: the new hash code for the string.
|
|
*/
|
|
static uint hashStep(ubyte b, uint h) pure nothrow @safe
|
|
{
|
|
return (h ^ sbox[b]) * 3;
|
|
}
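// Sketch (assumption): hashStep() builds the same hash incrementally that
// hashBytes() computes in one pass, so a hash accumulated while scanning an
// identifier (as lexIdentifier() does above) can be passed straight to intern().
unittest
{
    StringCache cache = StringCache(StringCache.defaultBucketCount);
    const(ubyte)[] bytes = cast(const(ubyte)[]) "identifier";
    uint hash = 0;
    foreach (b; bytes)
        hash = StringCache.hashStep(b, hash);
    assert (cache.intern(bytes, hash) == "identifier");
}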
|
|
|
|
/**
|
|
* The default bucket count for the string cache.
|
|
*/
|
|
static enum defaultBucketCount = 4096;
|
|
|
|
size_t allocated() pure nothrow @safe @property
|
|
{
|
|
return _allocated;
|
|
}
|
|
|
|
private:
|
|
|
|
string _intern(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
|
|
{
|
|
if (bytes is null || bytes.length == 0)
|
|
return "";
|
|
immutable size_t index = hash & (buckets.length - 1);
|
|
Node* s = find(bytes, hash);
|
|
if (s !is null)
|
|
return cast(string) s.str;
|
|
_allocated += bytes.length;
|
|
ubyte[] mem = allocate(bytes.length);
|
|
mem[] = bytes[];
|
|
Node* node = cast(Node*) malloc(Node.sizeof);
|
|
node.str = mem;
|
|
node.hash = hash;
|
|
node.next = buckets[index];
|
|
buckets[index] = node;
|
|
return cast(string) mem;
|
|
}
|
|
|
|
Node* find(const(ubyte)[] bytes, uint hash) pure nothrow @trusted
|
|
{
|
|
import std.algorithm;
|
|
immutable size_t index = hash & (buckets.length - 1);
|
|
Node* node = buckets[index];
|
|
while (node !is null)
|
|
{
|
|
if (node.hash == hash && bytes.equal(cast(ubyte[]) node.str))
|
|
return node;
|
|
node = node.next;
|
|
}
|
|
return node;
|
|
}
|
|
|
|
static uint hashBytes(const(ubyte)[] data) pure nothrow @trusted
|
|
in
|
|
{
|
|
assert (data !is null);
|
|
assert (data.length > 0);
|
|
}
|
|
body
|
|
{
|
|
uint hash = 0;
|
|
foreach (ubyte b; data)
|
|
{
|
|
hash ^= sbox[b];
|
|
hash *= 3;
|
|
}
|
|
return hash;
|
|
}
|
|
|
|
ubyte[] allocate(size_t numBytes) pure nothrow @trusted
|
|
in
|
|
{
|
|
assert (numBytes != 0);
|
|
}
|
|
out (result)
|
|
{
|
|
assert (result.length == numBytes);
|
|
}
|
|
body
|
|
{
|
|
if (numBytes > (blockSize / 4))
|
|
return (cast(ubyte*) malloc(numBytes))[0 .. numBytes];
|
|
Block* r = rootBlock;
|
|
size_t i = 0;
|
|
while (i <= 3 && r !is null)
|
|
{
|
|
|
|
immutable size_t available = r.bytes.length;
|
|
immutable size_t oldUsed = r.used;
|
|
immutable size_t newUsed = oldUsed + numBytes;
|
|
if (newUsed <= available)
|
|
{
|
|
r.used = newUsed;
|
|
return r.bytes[oldUsed .. newUsed];
|
|
}
|
|
i++;
|
|
r = r.next;
|
|
}
|
|
Block* b = cast(Block*) malloc(Block.sizeof);
|
|
b.bytes = (cast(ubyte*) malloc(blockSize))[0 .. blockSize];
|
|
b.used = numBytes;
|
|
b.next = rootBlock;
|
|
rootBlock = b;
|
|
return b.bytes[0 .. numBytes];
|
|
}
|
|
|
|
static struct Node
|
|
{
|
|
ubyte[] str;
|
|
uint hash;
|
|
Node* next;
|
|
}
|
|
|
|
static struct Block
|
|
{
|
|
ubyte[] bytes;
|
|
size_t used;
|
|
Block* next;
|
|
}
|
|
|
|
static enum blockSize = 1024 * 16;
|
|
|
|
static immutable uint[] sbox = [
|
|
0xF53E1837, 0x5F14C86B, 0x9EE3964C, 0xFA796D53,
|
|
0x32223FC3, 0x4D82BC98, 0xA0C7FA62, 0x63E2C982,
|
|
0x24994A5B, 0x1ECE7BEE, 0x292B38EF, 0xD5CD4E56,
|
|
0x514F4303, 0x7BE12B83, 0x7192F195, 0x82DC7300,
|
|
0x084380B4, 0x480B55D3, 0x5F430471, 0x13F75991,
|
|
0x3F9CF22C, 0x2FE0907A, 0xFD8E1E69, 0x7B1D5DE8,
|
|
0xD575A85C, 0xAD01C50A, 0x7EE00737, 0x3CE981E8,
|
|
0x0E447EFA, 0x23089DD6, 0xB59F149F, 0x13600EC7,
|
|
0xE802C8E6, 0x670921E4, 0x7207EFF0, 0xE74761B0,
|
|
0x69035234, 0xBFA40F19, 0xF63651A0, 0x29E64C26,
|
|
0x1F98CCA7, 0xD957007E, 0xE71DDC75, 0x3E729595,
|
|
0x7580B7CC, 0xD7FAF60B, 0x92484323, 0xA44113EB,
|
|
0xE4CBDE08, 0x346827C9, 0x3CF32AFA, 0x0B29BCF1,
|
|
0x6E29F7DF, 0xB01E71CB, 0x3BFBC0D1, 0x62EDC5B8,
|
|
0xB7DE789A, 0xA4748EC9, 0xE17A4C4F, 0x67E5BD03,
|
|
0xF3B33D1A, 0x97D8D3E9, 0x09121BC0, 0x347B2D2C,
|
|
0x79A1913C, 0x504172DE, 0x7F1F8483, 0x13AC3CF6,
|
|
0x7A2094DB, 0xC778FA12, 0xADF7469F, 0x21786B7B,
|
|
0x71A445D0, 0xA8896C1B, 0x656F62FB, 0x83A059B3,
|
|
0x972DFE6E, 0x4122000C, 0x97D9DA19, 0x17D5947B,
|
|
0xB1AFFD0C, 0x6EF83B97, 0xAF7F780B, 0x4613138A,
|
|
0x7C3E73A6, 0xCF15E03D, 0x41576322, 0x672DF292,
|
|
0xB658588D, 0x33EBEFA9, 0x938CBF06, 0x06B67381,
|
|
0x07F192C6, 0x2BDA5855, 0x348EE0E8, 0x19DBB6E3,
|
|
0x3222184B, 0xB69D5DBA, 0x7E760B88, 0xAF4D8154,
|
|
0x007A51AD, 0x35112500, 0xC9CD2D7D, 0x4F4FB761,
|
|
0x694772E3, 0x694C8351, 0x4A7E3AF5, 0x67D65CE1,
|
|
0x9287DE92, 0x2518DB3C, 0x8CB4EC06, 0xD154D38F,
|
|
0xE19A26BB, 0x295EE439, 0xC50A1104, 0x2153C6A7,
|
|
0x82366656, 0x0713BC2F, 0x6462215A, 0x21D9BFCE,
|
|
0xBA8EACE6, 0xAE2DF4C1, 0x2A8D5E80, 0x3F7E52D1,
|
|
0x29359399, 0xFEA1D19C, 0x18879313, 0x455AFA81,
|
|
0xFADFE838, 0x62609838, 0xD1028839, 0x0736E92F,
|
|
0x3BCA22A3, 0x1485B08A, 0x2DA7900B, 0x852C156D,
|
|
0xE8F24803, 0x00078472, 0x13F0D332, 0x2ACFD0CF,
|
|
0x5F747F5C, 0x87BB1E2F, 0xA7EFCB63, 0x23F432F0,
|
|
0xE6CE7C5C, 0x1F954EF6, 0xB609C91B, 0x3B4571BF,
|
|
0xEED17DC0, 0xE556CDA0, 0xA7846A8D, 0xFF105F94,
|
|
0x52B7CCDE, 0x0E33E801, 0x664455EA, 0xF2C70414,
|
|
0x73E7B486, 0x8F830661, 0x8B59E826, 0xBB8AEDCA,
|
|
0xF3D70AB9, 0xD739F2B9, 0x4A04C34A, 0x88D0F089,
|
|
0xE02191A2, 0xD89D9C78, 0x192C2749, 0xFC43A78F,
|
|
0x0AAC88CB, 0x9438D42D, 0x9E280F7A, 0x36063802,
|
|
0x38E8D018, 0x1C42A9CB, 0x92AAFF6C, 0xA24820C5,
|
|
0x007F077F, 0xCE5BC543, 0x69668D58, 0x10D6FF74,
|
|
0xBE00F621, 0x21300BBE, 0x2E9E8F46, 0x5ACEA629,
|
|
0xFA1F86C7, 0x52F206B8, 0x3EDF1A75, 0x6DA8D843,
|
|
0xCF719928, 0x73E3891F, 0xB4B95DD6, 0xB2A42D27,
|
|
0xEDA20BBF, 0x1A58DBDF, 0xA449AD03, 0x6DDEF22B,
|
|
0x900531E6, 0x3D3BFF35, 0x5B24ABA2, 0x472B3E4C,
|
|
0x387F2D75, 0x4D8DBA36, 0x71CB5641, 0xE3473F3F,
|
|
0xF6CD4B7F, 0xBF7D1428, 0x344B64D0, 0xC5CDFCB6,
|
|
0xFE2E0182, 0x2C37A673, 0xDE4EB7A3, 0x63FDC933,
|
|
0x01DC4063, 0x611F3571, 0xD167BFAF, 0x4496596F,
|
|
0x3DEE0689, 0xD8704910, 0x7052A114, 0x068C9EC5,
|
|
0x75D0E766, 0x4D54CC20, 0xB44ECDE2, 0x4ABC653E,
|
|
0x2C550A21, 0x1A52C0DB, 0xCFED03D0, 0x119BAFE2,
|
|
0x876A6133, 0xBC232088, 0x435BA1B2, 0xAE99BBFA,
|
|
0xBB4F08E4, 0xA62B5F49, 0x1DA4B695, 0x336B84DE,
|
|
0xDC813D31, 0x00C134FB, 0x397A98E6, 0x151F0E64,
|
|
0xD9EB3E69, 0xD3C7DF60, 0xD2F2C336, 0x2DDD067B,
|
|
0xBD122835, 0xB0B3BD3A, 0xB0D54E46, 0x8641F1E4,
|
|
0xA0B38F96, 0x51D39199, 0x37A6AD75, 0xDF84EE41,
|
|
0x3C034CBA, 0xACDA62FC, 0x11923B8B, 0x45EF170A,
|
|
];
|
|
|
|
// deprecated size_t[string] debugMap;
|
|
size_t _allocated;
|
|
Node*[] buckets;
|
|
Block* rootBlock;
|
|
}
|
|
|
|
private extern(C) void* calloc(size_t, size_t) nothrow pure;
|
|
private extern(C) void* malloc(size_t) nothrow pure;
|
|
private extern(C) void free(void*) nothrow pure;
|
|
|
|
unittest
|
|
{
|
|
import std.stdio;
|
|
auto source = cast(ubyte[]) q{ import std.stdio;}c;
|
|
auto tokens = byToken(source);
|
|
assert (tokens.map!"a.type"().equal([tok!"import", tok!"identifier", tok!".",
|
|
tok!"identifier", tok!";"]));
|
|
}
|
|
|
|
/// Test \x char sequence
|
|
unittest
|
|
{
|
|
auto toks = (string s) => byToken(cast(ubyte[])s);
|
|
|
|
// valid
|
|
enum hex = ['0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f','A','B','C','D','E','F'];
|
|
auto source = "";
|
|
foreach (h1; hex)
|
|
foreach (h2; hex)
|
|
source ~= "'\\x" ~ h1 ~ h2 ~ "'";
|
|
assert (toks(source).filter!(t => t.type != tok!"characterLiteral").empty);
|
|
|
|
// invalid
|
|
assert (toks(`'\x'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
|
|
assert (toks(`'\x_'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
|
|
assert (toks(`'\xA'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
|
|
assert (toks(`'\xAY'`).messages[0] == DLexer.Message(1,5,"Error: 2 hex digits expected.",true));
|
|
assert (toks(`'\xXX'`).messages[0] == DLexer.Message(1,4,"Error: 2 hex digits expected.",true));
|
|
}
|
|
|