Tokenizer is somewhat functional

Hackerpilot 2013-01-18 00:34:59 -08:00
parent e3c737f6e1
commit a7f81c57cc
3 changed files with 229 additions and 282 deletions

View File

@ -53,15 +53,13 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
caseStatement ~= k;
caseStatement ~= "';\n";
caseStatement ~= indentString;
caseStatement ~= "\tcurrent.lineNumber = lineNumber;\n";
caseStatement ~= indentString;
caseStatement ~= "\t++index;\n";
caseStatement ~= indentString;
caseStatement ~= "\tinput.popFront();\n";
caseStatement ~= "\trange.popFront();\n";
if (v.children.length > 0)
{
caseStatement ~= indentString;
caseStatement ~= "\tif (isEoF(inputString, endIndex))\n";
caseStatement ~= "\tif (range.isEoF())\n";
caseStatement ~= indentString;
caseStatement ~= "\t{\n";
caseStatement ~= indentString;
@ -72,7 +70,7 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
caseStatement ~= indentString;
caseStatement ~= "\t}\n";
caseStatement ~= indentString;
caseStatement ~= "\tswitch (input.front)\n";
caseStatement ~= "\tswitch (range.front)\n";
caseStatement ~= indentString;
caseStatement ~= "\t{\n";
caseStatement ~= printCaseStatements(v, indentString ~ "\t");

View File

@ -110,7 +110,6 @@ pure nothrow TokenType lookupTokenTypeOptimized(const string input)
case 5:
switch (input)
{
case "@safe": return TokenType.AtSafe;
case "alias": return TokenType.Alias;
case "align": return TokenType.Align;
case "break": return TokenType.Break;
@ -169,7 +168,6 @@ pure nothrow TokenType lookupTokenTypeOptimized(const string input)
case 7:
switch (input)
{
case "@system": return TokenType.AtSystem;
case "cdouble": return TokenType.Cdouble;
case "default": return TokenType.Default;
case "dstring": return TokenType.DString;
@ -196,9 +194,7 @@ pure nothrow TokenType lookupTokenTypeOptimized(const string input)
case "__thread": return TokenType.Thread;
case "__traits": return TokenType.Traits;
case "volatile": return TokenType.Volatile;
case "@trusted": return TokenType.AtTrusted;
case "delegate": return TokenType.Delegate;
case "@disable": return TokenType.AtDisable;
case "function": return TokenType.Function;
case "unittest": return TokenType.Unittest;
case "__FILE__": return TokenType.File;
@ -209,7 +205,6 @@ pure nothrow TokenType lookupTokenTypeOptimized(const string input)
switch (input)
{
case "__gshared": return TokenType.Gshared;
case "@property": return TokenType.AtProperty;
case "immutable": return TokenType.Immutable;
case "interface": return TokenType.Interface;
case "invariant": return TokenType.Invariant;
@ -243,6 +238,7 @@ enum TokenType: uint
// Operators
OPERATORS_BEGIN,
Assign, /// =
At, /// @
BitAnd, /// &
BitAndEquals, /// &=
BitOr, /// |
@ -433,14 +429,6 @@ enum TokenType: uint
Traits, /// __traits,
CONSTANTS_END,
// Properties
PROPERTIES_BEGIN,
AtProperty, /// @property
AtSafe, /// @safe
AtSystem, /// @system
AtTrusted, /// @trusted
PROPERTIES_END,
// Misc
MISC_BEGIN,
Blank, /// unknown token type
@ -505,7 +493,6 @@ static this()
"delegate" : TokenType.Delegate,
"delete" : TokenType.Delete,
"deprecated" : TokenType.Deprecated,
"@disable" : TokenType.AtDisable,
"do" : TokenType.Do,
"double" : TokenType.Double,
"dstring" : TokenType.DString,
@ -550,14 +537,12 @@ static this()
"package" : TokenType.Package,
"pragma" : TokenType.Pragma,
"private" : TokenType.Private,
"@property" : TokenType.AtProperty,
"protected" : TokenType.Protected,
"public" : TokenType.Public,
"pure" : TokenType.Pure,
"real" : TokenType.Real,
"ref" : TokenType.Ref,
"return" : TokenType.Return,
"@safe" : TokenType.AtSafe,
"scope" : TokenType.Scope,
"shared" : TokenType.Shared,
"short" : TokenType.Short,
@ -567,14 +552,12 @@ static this()
"super" : TokenType.Super,
"switch" : TokenType.Switch,
"synchronized" : TokenType.Synchronized,
"@system" : TokenType.AtSystem,
"template" : TokenType.Template,
"this" : TokenType.This,
"__thread" : TokenType.Thread,
"throw" : TokenType.Throw,
"__traits" : TokenType.Traits,
"true" : TokenType.True,
"@trusted" : TokenType.AtTrusted,
"try" : TokenType.Try,
"typedef" : TokenType.Typedef,
"typeid" : TokenType.Typeid,

View File

@ -14,6 +14,7 @@ import std.uni;
import std.stdio;
import std.ascii;
import std.format;
import std.exception;
import langutils;
import codegen;
@ -29,9 +30,9 @@ pure bool isEoF(R)(R range)
return range.empty || range.front == 0 || range.front == 0x1a;
}
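// Not part of this commit -- a quick illustrative sketch of what isEoF treats
// as end of input: an exhausted range, a NUL character, or 0x1A (the legacy
// Ctrl+Z / SUB end-of-file marker).
unittest
{
    assert ("".isEoF());
    assert ("\0".isEoF());
    assert ("\x1a".isEoF());
    assert (!"module".isEoF());
}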
char[] popNewline(R)(ref R range, ref uint index)
C[] popNewline(R, C = ElementType!R)(ref R range, ref uint index) if (isSomeChar!C && isForwardRange!R)
{
char[] chars;
C[] chars;
if (range.front == '\r')
{
chars ~= range.front;
@ -58,13 +59,14 @@ unittest
/**
* Returns: a whitespace token covering the run of whitespace consumed from the range
*/
Token lexWhitespace(R)(ref R range, ref uint index, ref uint lineNumber)
Token lexWhitespace(R, C = ElementType!R)(ref R range, ref uint index, ref uint lineNumber)
if (isForwardRange!R && isSomeChar!C)
{
Token t;
t.type = TokenType.Whitespace;
t.lineNumber = lineNumber;
t.startIndex = index;
auto app = appender!(char[])();
auto app = appender!(C[])();
while (!isEoF(range) && std.uni.isWhite(range.front))
{
if (isNewline(range))
@ -104,7 +106,8 @@ unittest
* lineNumber = the line number that corresponds to endIndex
* Returns: The comment
*/
Token lexComment(R)(ref R input, ref uint index, ref uint lineNumber)
Token lexComment(R, C = ElementType!R)(ref R input, ref uint index, ref uint lineNumber)
if (isSomeChar!C && isForwardRange!R)
in
{
assert (input.front == '/');
@ -115,7 +118,7 @@ body
t.lineNumber = lineNumber;
t.type = TokenType.Comment;
t.startIndex = index;
auto app = appender!(char[])();
auto app = appender!(C[])();
app.put(input.front);
input.popFront();
switch(input.front)
@ -252,10 +255,10 @@ unittest
/**
* Pops up to upTo digit characters (as classified by isInterestingDigit) from the input range and returns them as a string
*/
string popDigitChars(R, alias isInterestingDigit)(ref R input, ref uint index,
uint upTo)
string popDigitChars(R, C = ElementType!R, alias isInterestingDigit)(ref R input, ref uint index,
uint upTo) if (isSomeChar!C && isForwardRange!R)
{
auto app = appender!(char[])();
auto app = appender!(C[])();
for (uint i = 0; i != upTo; ++i)
{
if (isInterestingDigit(input.front))
@ -271,12 +274,12 @@ string popDigitChars(R, alias isInterestingDigit)(ref R input, ref uint index,
string popHexChars(R)(ref R input, ref uint index, uint upTo)
{
return popDigitChars!(R, isHexDigit)(input, index, upTo);
return popDigitChars!(R, ElementType!R, isHexDigit)(input, index, upTo);
}
string popOctalChars(R)(ref R input, ref uint index, uint upTo)
{
return popDigitChars!(R, isOctalDigit)(input, index, upTo);
return popDigitChars!(R, ElementType!R, isOctalDigit)(input, index, upTo);
}
unittest
@ -297,7 +300,8 @@ unittest
assert (rc == "00123");
}
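// A hedged usage sketch, not part of the diff: the third argument caps how many
// digit characters are consumed from the front of the range; index is advanced
// by the same amount.
unittest
{
    uint i = 0;
    auto hex = "FF7F0012";
    assert (popHexChars(hex, i, 4) == "FF7F");
    auto oct = "0755777";
    assert (popOctalChars(oct, i, 4) == "0755");
}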
string interpretEscapeSequence(R)(ref R input, ref uint index)
string interpretEscapeSequence(R, C = ElementType!R)(ref R input, ref uint index)
if (isSomeChar!C && isForwardRange!R)
in
{
assert(input.front == '\\');
@ -391,17 +395,8 @@ unittest
assert (interpretEscapeSequence(k, i) == v);
}
/**
* Params:
* inputString = the source code to examine
* endIndex = an index into inputString at the opening quote
* lineNumber = the line number that corresponds to endIndex
* quote = the opening (and closing) quote character for the string to be
* lexed
* Returns: a string literal, including its opening and closing quote characters
*/
Token lexString(R)(ref R input, ref uint lineNumber, ref uint index,
bool canEscape = true)
Token lexString(R)(ref R input, ref uint index, ref uint lineNumber,
const StringStyle style = StringStyle.Escaped)
in
{
assert (input.front == '\'' || input.front == '"' || input.front == '`');
@ -411,10 +406,13 @@ body
Token t;
t.lineNumber = lineNumber;
t.startIndex = index;
t.type = TokenType.StringLiteral;
auto quote = input.front;
input.popFront();
++index;
auto app = appender!(char[])();
if (style & StringStyle.IncludeQuotes)
app.put(quote);
while (!isEoF(input))
{
if (isNewline(input))
@ -422,10 +420,12 @@ body
app.put(popNewline(input, index));
lineNumber++;
}
else if (input.front == '\\' && canEscape)
else if (input.front == '\\' && style & StringStyle.Escaped)
app.put(interpretEscapeSequence(input, index));
else if (input.front == quote)
{
if (style & StringStyle.IncludeQuotes)
app.put(quote);
input.popFront();
++index;
break;
@ -443,20 +443,17 @@ body
{
case 'w':
t.type = TokenType.WStringLiteral;
input.popFront();
++index;
break;
goto case 'c';
case 'd':
t.type = TokenType.DStringLiteral;
goto case 'c';
case 'c':
if (style & StringStyle.IncludeQuotes)
app.put(input.front);
input.popFront();
++index;
break;
case 'c':
input.popFront();
++index;
goto default;
default:
t.type = TokenType.StringLiteral;
break;
}
}
@ -473,7 +470,7 @@ unittest
auto b = "\"ab\\ncd\"";
assert (lexString(b, i, l) == "ab\ncd");
auto c = "`abc\\ndef`";
assert (lexString(c, i, l, false) == "abc\\ndef");
assert (lexString(c, i, l, StringStyle.NotEscaped) == "abc\\ndef");
auto d = `"12345"w`;
assert (lexString(d, i, l).type == TokenType.WStringLiteral);
auto e = `"abc"c`;
@ -1091,32 +1088,214 @@ pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
enum IterationStyle
{
/// Only include code, not whitespace or comments
CODE_ONLY,
CodeOnly = 0,
/// Includes comments
IncludeComments = 1,
/// Includes whitespace
IncludeWhitespace = 2 << 1,
/// Include everything
EVERYTHING
Everything = IncludeComments | IncludeWhitespace
}
struct TokenRange(R) if (isInputRange(R))
/**
* Configuration of the token lexing style
*/
enum StringStyle : uint
{
NotEscaped = 0,
/// String escape sequences will be processed and enclosing quote characters
/// will not be preserved.
Escaped = 1,
/// Strings will be read exactly as they appeared in the source, including
/// their opening and closing quote characters. Useful for syntax highlighting.
IncludeQuotes = 2,
}
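// A hedged sketch, not part of the diff, of how the flags above are meant to
// combine with lexString: IncludeQuotes keeps the delimiters (useful for
// highlighting) and leaving Escaped unset returns the source text verbatim.
unittest
{
    uint i;
    uint l;
    auto a = "\"abc\"";
    assert (lexString(a, i, l, StringStyle.IncludeQuotes) == "\"abc\"");
    auto b = "`x\\ny`";
    assert (lexString(b, i, l, StringStyle.NotEscaped) == "x\\ny");
}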
TokenRange!(R) byToken(R)(ref R range, const IterationStyle iterationStyle = IterationStyle.CodeOnly,
const StringStyle tokenStyle = StringStyle.Escaped) if (isForwardRange!(R) && isSomeChar!(ElementType!(R)))
{
auto r = TokenRange!(R)(range);
r.tokenStyle = tokenStyle;
r.iterStyle = iterationStyle;
r.lineNumber = 1;
r.popFront();
return r;
}
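// A hedged sketch of the intended entry point: byToken wraps a character range
// in a TokenRange, and the IterationStyle/StringStyle arguments choose whether
// whitespace and comments come back as tokens and how string literals are stored.
unittest
{
    auto source = "if (x >= 10) { /* note */ }";
    // Default configuration: code tokens only, escape sequences processed.
    foreach (t; byToken(source))
        writeln(t.type, " ", t.value);
    // Intended configuration for a syntax highlighter.
    foreach (t; byToken(source, IterationStyle.Everything, StringStyle.IncludeQuotes))
        writeln(t);
}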
struct TokenRange(R) if (isForwardRange!(R) && isSomeChar!(ElementType!(R)))
{
this(ref R range)
{
this.range = range;
}
bool empty() const @property
bool empty() @property
{
return _empty;
}
Token front() const @property
{
enforce(!_empty, "Cannot call front() on an empty token range");
return current;
}
Token popFront()
{
Token c = current;
if (range.isEoF())
{
_empty = true;
return current;
}
Token c = current;
current = Token.init;
current.lineNumber = lineNumber;
current.startIndex = index;
while (std.uni.isWhite(range.front))
{
if (iterStyle == IterationStyle.Everything)
{
current = lexWhitespace(range, index, lineNumber);
break;
}
else
lexWhitespace(range, index, lineNumber);
}
outer: switch (range.front)
{
mixin(generateCaseTrie(
"=", "TokenType.Assign",
"&", "TokenType.BitAnd",
"&=", "TokenType.BitAndEquals",
"|", "TokenType.BitOr",
"|=", "TokenType.BitOrEquals",
"~=", "TokenType.CatEquals",
":", "TokenType.Colon",
",", "TokenType.Comma",
"$", "TokenType.Dollar",
".", "TokenType.Dot",
"==", "TokenType.Equals",
"=>", "TokenType.GoesTo",
">", "TokenType.Greater",
">=", "TokenType.GreaterEqual",
"#", "TokenType.Hash",
"&&", "TokenType.LogicAnd",
"{", "TokenType.LBrace",
"[", "TokenType.LBracket",
"<", "TokenType.Less",
"<=", "TokenType.LessEqual",
"<>=", "TokenType.LessEqualGreater",
"<>", "TokenType.LessOrGreater",
"||", "TokenType.LogicOr",
"(", "TokenType.LParen",
"-", "TokenType.Minus",
"-=", "TokenType.MinusEquals",
"%", "TokenType.Mod",
"%=", "TokenType.ModEquals",
"*=", "TokenType.MulEquals",
"!", "TokenType.Not",
"!=", "TokenType.NotEquals",
"!>", "TokenType.NotGreater",
"!>=", "TokenType.NotGreaterEqual",
"!<", "TokenType.NotLess",
"!<=", "TokenType.NotLessEqual",
"!<>", "TokenType.NotLessEqualGreater",
"+", "TokenType.Plus",
"+=", "TokenType.PlusEquals",
"^^", "TokenType.Pow",
"^^=", "TokenType.PowEquals",
"}", "TokenType.RBrace",
"]", "TokenType.RBracket",
")", "TokenType.RParen",
";", "TokenType.Semicolon",
"<<", "TokenType.ShiftLeft",
"<<=", "TokenType.ShiftLeftEqual",
">>", "TokenType.ShiftRight",
">>=", "TokenType.ShiftRightEqual",
"..", "TokenType.Slice",
"*", "TokenType.Star",
"?", "TokenType.Ternary",
"~", "TokenType.Tilde",
"--", "TokenType.Decrement",
"!<>=", "TokenType.Unordered",
">>>", "TokenType.UnsignedShiftRight",
">>>=", "TokenType.UnsignedShiftRightEqual",
"++", "TokenType.Increment",
"...", "TokenType.Vararg",
"^", "TokenType.Xor",
"^=", "TokenType.XorEquals",
"@", "TokenType.At",
));
case '0': .. case '9':
current = lexNumber(range, index, lineNumber);
break;
case '\'':
case '"':
current = lexString(range, index, lineNumber);
break;
case '`':
current = lexString(range, index, lineNumber, StringStyle.NotEscaped);
break;
case 'q':
auto r = range.save;
r.popFront();
if (!r.isEoF() && r.front == '{')
writeln("ParseTokenString");
else
goto default;
case '/':
auto r = range.save();
r.popFront();
if (r.isEoF())
{
current.type = TokenType.Div;
current.value = "/";
break;
}
switch (r.front)
{
case '/':
case '*':
case '+':
current = lexComment(range, index, lineNumber);
break outer;
case '=':
current.type = TokenType.DivEquals;
current.value = "/=";
break outer;
default:
current.type = TokenType.Div;
current.value = "/";
break;
}
break;
case 'r':
auto r = range.save();
r.popFront();
if (!r.isEoF() && r.front == '"')
writeln("parse wysiwyg string");
else
goto default;
case 'x':
auto r = range.save();
r.popFront();
if (!r.isEoF() && r.front == '"')
writeln("parse hex string");
else
goto default;
default:
auto app = appender!(ElementType!(R)[])();
while(!range.isEoF() && !isSeparating(range.front))
{
app.put(range.front);
range.popFront();
}
current.value = to!string(app.data);
current.type = lookupTokenTypeOptimized(current.value);
break;
}
return c;
}
@ -1126,226 +1305,13 @@ private:
uint index;
R range;
bool _empty;
IterationStyle iterStyle;
StringStyle tokenStyle;
}
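// A hedged sketch of driving the range by hand, not part of the diff. Note that
// popFront() above returns the token it advances past rather than void, and
// front() throws if the range is already empty.
unittest
{
    auto src = "a = b;";
    auto tokens = byToken(src);
    while (!tokens.empty)
    {
        auto t = tokens.popFront();
        writeln(t.type, " ", t.value);
    }
}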
//Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
// if (isSomeString!S)
//{
// auto tokenAppender = appender!(Token[])();
//
// // This is very likely a local maximum, but it does seem to take a few
// // milliseconds off of the run time
// tokenAppender.reserve(inputString.length / 4);
//
// size_t endIndex = 0;
// uint lineNumber = 1;
//
// if (inputString.length > 1 && inputString[0..2] == "#!")
// {
// Token currentToken;
// currentToken.lineNumber = lineNumber; // lineNumber is always 1
// currentToken.value = lexScriptLine(inputString, endIndex, lineNumber);
// currentToken.type = TokenType.ScriptLine;
// }
//
// while (!isEoF(inputString, endIndex))
// {
// size_t prevIndex = endIndex;
// Token currentToken;
// auto startIndex = endIndex;
// if (isWhite(inputString[endIndex]))
// {
// if (iterationStyle == IterationStyle.EVERYTHING)
// {
// currentToken.lineNumber = lineNumber;
// currentToken.value = lexWhitespace(inputString, endIndex,
// lineNumber);
// currentToken.type = TokenType.Whitespace;
// tokenAppender.put(currentToken);
// }
// else
// lexWhitespace(inputString, endIndex, lineNumber);
// continue;
// }
// currentToken.startIndex = endIndex;
//
// outerSwitch: switch(inputString[endIndex])
// {
// mixin(generateCaseTrie(
// "=", "TokenType.Assign",
// "&", "TokenType.BitAnd",
// "&=", "TokenType.BitAndEquals",
// "|", "TokenType.BitOr",
// "|=", "TokenType.BitOrEquals",
// "~=", "TokenType.CatEquals",
// ":", "TokenType.Colon",
// ",", "TokenType.Comma",
// "$", "TokenType.Dollar",
// ".", "TokenType.Dot",
// "==", "TokenType.Equals",
// "=>", "TokenType.GoesTo",
// ">", "TokenType.Greater",
// ">=", "TokenType.GreaterEqual",
// "#", "TokenType.Hash",
// "&&", "TokenType.LogicAnd",
// "{", "TokenType.LBrace",
// "[", "TokenType.LBracket",
// "<", "TokenType.Less",
// "<=", "TokenType.LessEqual",
// "<>=", "TokenType.LessEqualGreater",
// "<>", "TokenType.LessOrGreater",
// "||", "TokenType.LogicOr",
// "(", "TokenType.LParen",
// "-", "TokenType.Minus",
// "-=", "TokenType.MinusEquals",
// "%", "TokenType.Mod",
// "%=", "TokenType.ModEquals",
// "*=", "TokenType.MulEquals",
// "!", "TokenType.Not",
// "!=", "TokenType.NotEquals",
// "!>", "TokenType.NotGreater",
// "!>=", "TokenType.NotGreaterEqual",
// "!<", "TokenType.NotLess",
// "!<=", "TokenType.NotLessEqual",
// "!<>", "TokenType.NotLessEqualGreater",
// "+", "TokenType.Plus",
// "+=", "TokenType.PlusEquals",
// "^^", "TokenType.Pow",
// "^^=", "TokenType.PowEquals",
// "}", "TokenType.RBrace",
// "]", "TokenType.RBracket",
// ")", "TokenType.RParen",
// ";", "TokenType.Semicolon",
// "<<", "TokenType.ShiftLeft",
// "<<=", "TokenType.ShiftLeftEqual",
// ">>", "TokenType.ShiftRight",
// ">>=", "TokenType.ShiftRightEqual",
// "..", "TokenType.Slice",
// "*", "TokenType.Star",
// "?", "TokenType.Ternary",
// "~", "TokenType.Tilde",
// "--", "TokenType.Decrement",
// "!<>=", "TokenType.Unordered",
// ">>>", "TokenType.UnsignedShiftRight",
// ">>>=", "TokenType.UnsignedShiftRightEqual",
// "++", "TokenType.Increment",
// "...", "TokenType.Vararg",
// "^", "TokenType.Xor",
// "^=", "TokenType.XorEquals",
// ));
// case '0': .. case '9':
// currentToken = lexNumber(inputString, endIndex);
// break;
// case '/':
// ++endIndex;
// if (isEoF(inputString, endIndex))
// {
// currentToken.value = "/";
// currentToken.type = TokenType.Div;
// currentToken.lineNumber = lineNumber;
// break;
// }
// currentToken.lineNumber = lineNumber;
// switch (inputString[endIndex])
// {
// case '/':
// case '+':
// case '*':
// if (iterationStyle == IterationStyle.CODE_ONLY)
// {
// lexComment(inputString, endIndex, lineNumber);
// continue;
// }
// else
// {
// currentToken.value = lexComment(inputString, endIndex, lineNumber);
// currentToken.type = TokenType.Comment;
// break;
// }
// case '=':
// currentToken.value = "/=";
// currentToken.type = TokenType.DivEquals;
// ++endIndex;
// break;
// default:
// currentToken.value = "/";
// currentToken.type = TokenType.Div;
// break;
// }
// break;
// case 'r':
// ++endIndex;
// if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
// goto default;
// currentToken.lineNumber = lineNumber;
// currentToken.value = lexString(inputString, endIndex,
// lineNumber, inputString[endIndex], false);
// currentToken.type = TokenType.StringLiteral;
// break;
// case '`':
// currentToken.lineNumber = lineNumber;
// currentToken.value = lexString(inputString, endIndex, lineNumber,
// inputString[endIndex], false);
// currentToken.type = TokenType.StringLiteral;
// break;
// case 'x':
// ++endIndex;
// if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
// goto default;
// else
// goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
// case '\'':
// case '"':
// currentToken.lineNumber = lineNumber;
// currentToken.value = lexString(inputString, endIndex, lineNumber,
// inputString[endIndex]);
// currentToken.type = TokenType.StringLiteral;
// break;
// case 'q':
// currentToken.value = "q";
// ++endIndex;
// if (!isEoF(inputString, endIndex))
// {
// switch (inputString[endIndex])
// {
// case '"':
// currentToken.lineNumber = lineNumber;
// currentToken.value ~= lexDelimitedString(inputString,
// endIndex, lineNumber);
// currentToken.type = TokenType.StringLiteral;
// break outerSwitch;
// case '{':
// currentToken.lineNumber = lineNumber;
// currentToken.value ~= lexTokenString(inputString,
// endIndex, lineNumber);
// currentToken.type = TokenType.StringLiteral;
// break outerSwitch;
// default:
// break;
// }
// }
// goto default;
// case '@':
// ++endIndex;
// goto default;
// default:
// while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
// ++endIndex;
// currentToken.value = inputString[startIndex .. endIndex];
// currentToken.type = lookupTokenTypeOptimized(currentToken.value);
// //currentToken.type = lookupTokenType(currentToken.value);
// currentToken.lineNumber = lineNumber;
// break;
// }
// //stderr.writeln(currentToken);
// tokenAppender.put(currentToken);
//
// // This should never happen.
// if (endIndex <= prevIndex)
// {
// stderr.writeln("FAIL");
// return [];
// }
// }
// return tokenAppender.data;
//}
unittest
{
auto c = ">><==>)(*)\"TestString\"if import ifire 0,10.4f `\n`@property void//comment\ntest/* comment *//+comment/+moar comment+/+/";
foreach (t; byToken(c))
writeln(t);
}