refactoring

This commit is contained in:
Hackerpilot 2013-02-16 21:48:42 -08:00
parent 102836503f
commit c61e5d2a88
1 changed files with 234 additions and 156 deletions

View File

@ -411,6 +411,12 @@ struct TokenRange(R) if (isForwardRange!(R))
if (config.iterStyle & IterationStyle.includeSpecialTokens) if (config.iterStyle & IterationStyle.includeSpecialTokens)
break loop; break loop;
break; break;
case TokenType.eof:
if (config.iterStyle & IterationStyle.ignoreEOF)
break loop;
else
_empty = true;
break;
default: default:
break loop; break loop;
} }
@ -515,7 +521,7 @@ private:
"^=", "TokenType.xorEquals", "^=", "TokenType.xorEquals",
)); ));
case '/': case '/':
keepNonNewlineChar(); keepChar();
if (isEoF()) if (isEoF())
{ {
current.type = TokenType.div; current.type = TokenType.div;
@ -543,7 +549,7 @@ private:
return; return;
} }
case '.': case '.':
keepNonNewlineChar(); keepChar();
if (isEoF()) if (isEoF())
{ {
current.type = TokenType.dot; current.type = TokenType.dot;
@ -557,11 +563,11 @@ private:
return; return;
case '.': case '.':
current.type = TokenType.slice; current.type = TokenType.slice;
keepNonNewlineChar(); keepChar();
if (currentElement() == '.') if (currentElement() == '.')
{ {
current.type = TokenType.vararg; current.type = TokenType.vararg;
keepNonNewlineChar(); keepChar();
} }
current.value = getTokenValue(current.type); current.value = getTokenValue(current.type);
return; return;
@ -571,7 +577,7 @@ private:
return; return;
} }
case '0': .. case '9': case '0': .. case '9':
keepNonNewlineChar(); keepChar();
lexNumber(); lexNumber();
return; return;
case '\'': case '\'':
@ -582,7 +588,7 @@ private:
lexString(); lexString();
return; return;
case 'q': case 'q':
keepNonNewlineChar(); keepChar();
if (isEoF()) if (isEoF())
goto default; goto default;
switch (currentElement()) switch (currentElement())
@ -598,7 +604,7 @@ private:
} }
goto default; goto default;
case 'r': case 'r':
keepNonNewlineChar(); keepChar();
if (isEoF()) if (isEoF())
goto default; goto default;
else if (currentElement() == '"') else if (currentElement() == '"')
@ -609,7 +615,7 @@ private:
else else
goto default; goto default;
case 'x': case 'x':
keepNonNewlineChar(); keepChar();
if (isEoF()) if (isEoF())
goto default; goto default;
else if (currentElement() == '"') else if (currentElement() == '"')
@ -625,7 +631,7 @@ private:
default: default:
while(!isEoF() && !isSeparating()) while(!isEoF() && !isSeparating())
{ {
keepNonNewlineChar(); keepChar();
} }
current.type = lookupTokenType(cast(char[]) buffer[0 .. bufferIndex]); current.type = lookupTokenType(cast(char[]) buffer[0 .. bufferIndex]);
@ -706,9 +712,9 @@ private:
switch(currentElement()) switch(currentElement())
{ {
case '/': case '/':
while (!isEoF() && !isNewline(currentElement())) while (!isEoF() && !isNewline())
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
} }
break; break;
@ -717,11 +723,11 @@ private:
{ {
if (currentElement() == '*') if (currentElement() == '*')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
if (currentElement() == '/') if (currentElement() == '/')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
break; break;
} }
@ -736,22 +742,22 @@ private:
{ {
if (currentElement() == '+') if (currentElement() == '+')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
if (currentElement() == '/') if (currentElement() == '/')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
--depth; --depth;
} }
} }
else if (currentElement() == '/') else if (currentElement() == '/')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
if (currentElement() == '+') if (currentElement() == '+')
{ {
static if (keep) keepNonNewlineChar(); static if (keep) keepChar();
else advanceRange(); else advanceRange();
++depth; ++depth;
} }
@ -784,7 +790,7 @@ private:
} }
else if (isHexDigit(currentElement())) else if (isHexDigit(currentElement()))
{ {
keepNonNewlineChar(); keepChar();
} }
else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped)) else if (isWhite() && (config.tokenStyle & TokenStyle.notEscaped))
{ {
@ -792,7 +798,7 @@ private:
} }
else if (currentElement() == '"') else if (currentElement() == '"')
{ {
keepNonNewlineChar(); keepChar();
break; break;
} }
else else
@ -838,12 +844,12 @@ private:
{ {
case 'x': case 'x':
case 'X': case 'X':
keepNonNewlineChar(); keepChar();
lexHex(); lexHex();
break; break;
case 'b': case 'b':
case 'B': case 'B':
keepNonNewlineChar(); keepChar();
lexBinary(); lexBinary();
break; break;
default: default:
@ -858,12 +864,12 @@ private:
switch (currentElement()) switch (currentElement())
{ {
case 'L': case 'L':
keepNonNewlineChar(); keepChar();
current.type = TokenType.doubleLiteral; current.type = TokenType.doubleLiteral;
break; break;
case 'f': case 'f':
case 'F': case 'F':
keepNonNewlineChar(); keepChar();
current.type = TokenType.floatLiteral; current.type = TokenType.floatLiteral;
break; break;
default: default:
@ -871,7 +877,7 @@ private:
} }
if (!isEoF() && currentElement() == 'i') if (!isEoF() && currentElement() == 'i')
{ {
keepNonNewlineChar(); keepChar();
if (current.type == TokenType.floatLiteral) if (current.type == TokenType.floatLiteral)
current.type = TokenType.ifloatLiteral; current.type = TokenType.ifloatLiteral;
else else
@ -895,11 +901,11 @@ private:
{ {
case TokenType.intLiteral: case TokenType.intLiteral:
current.type = TokenType.uintLiteral; current.type = TokenType.uintLiteral;
keepNonNewlineChar(); keepChar();
break; break;
case TokenType.longLiteral: case TokenType.longLiteral:
current.type = TokenType.ulongLiteral; current.type = TokenType.ulongLiteral;
keepNonNewlineChar(); keepChar();
break; break;
default: default:
return; return;
@ -913,11 +919,11 @@ private:
{ {
case TokenType.intLiteral: case TokenType.intLiteral:
current.type = TokenType.longLiteral; current.type = TokenType.longLiteral;
keepNonNewlineChar(); keepChar();
break; break;
case TokenType.uintLiteral: case TokenType.uintLiteral:
current.type = TokenType.ulongLiteral; current.type = TokenType.ulongLiteral;
keepNonNewlineChar(); keepChar();
break; break;
default: default:
return; return;
@ -938,7 +944,7 @@ private:
} }
body body
{ {
keepNonNewlineChar(); keepChar();
bool foundSign = false; bool foundSign = false;
bool foundDigit = false; bool foundDigit = false;
while (!isEoF()) while (!isEoF())
@ -953,12 +959,12 @@ private:
return; return;
} }
foundSign = true; foundSign = true;
keepNonNewlineChar(); keepChar();
break; break;
case '0': .. case '9': case '0': .. case '9':
case '_': case '_':
foundDigit = true; foundDigit = true;
keepNonNewlineChar(); keepChar();
break; break;
case 'L': case 'L':
case 'f': case 'f':
@ -990,7 +996,7 @@ private:
{ {
case '0': .. case '9': case '0': .. case '9':
case '_': case '_':
keepNonNewlineChar(); keepChar();
break; break;
case 'u': case 'u':
case 'U': case 'U':
@ -1035,7 +1041,7 @@ private:
break decimalLoop; // possibly slice expression break decimalLoop; // possibly slice expression
if (foundDot) if (foundDot)
break decimalLoop; // two dots with other characters between them break decimalLoop; // two dots with other characters between them
keepNonNewlineChar(); keepChar();
foundDot = true; foundDot = true;
current.type = TokenType.doubleLiteral; current.type = TokenType.doubleLiteral;
break; break;
@ -1057,7 +1063,7 @@ private:
case '0': case '0':
case '1': case '1':
case '_': case '_':
keepNonNewlineChar(); keepChar();
break; break;
case 'u': case 'u':
case 'U': case 'U':
@ -1083,7 +1089,7 @@ private:
case 'A': .. case 'F': case 'A': .. case 'F':
case '0': .. case '9': case '0': .. case '9':
case '_': case '_':
keepNonNewlineChar(); keepChar();
break; break;
case 'u': case 'u':
case 'U': case 'U':
@ -1118,7 +1124,7 @@ private:
break hexLoop; // slice expression break hexLoop; // slice expression
if (foundDot) if (foundDot)
break hexLoop; // two dots with other characters between them break hexLoop; // two dots with other characters between them
keepNonNewlineChar(); keepChar();
foundDot = true; foundDot = true;
current.type = TokenType.doubleLiteral; current.type = TokenType.doubleLiteral;
break; break;
@ -1142,7 +1148,7 @@ private:
current.type = TokenType.dstringLiteral; current.type = TokenType.dstringLiteral;
goto case 'c'; goto case 'c';
case 'c': case 'c':
keepNonNewlineChar(); keepChar();
break; break;
default: default:
break; break;
@ -1240,7 +1246,7 @@ private:
} }
else if (currentElement() == quote) else if (currentElement() == quote)
{ {
keepNonNewlineChar(); keepChar();
break; break;
} }
else else
@ -1534,27 +1540,37 @@ private:
body body
{ {
auto i = bufferIndex; auto i = bufferIndex;
while (true) ubyte[] ident = void;
if (isSeparating())
{ {
if (isEoF()) keepChar();
{ ident = buffer[i .. bufferIndex];
errorMessage("Unterminated string literal");
return;
}
else if (isNewline(currentElement()))
{
keepChar();
break;
}
else if (isSeparating())
{
errorMessage("Unterminated string literal - Separating");
return;
}
else
keepChar();
} }
auto ident = buffer[i .. bufferIndex - 1]; else
{
while (true)
{
if (isEoF())
{
errorMessage("Unterminated string literal. End of file");
return;
}
else if (isNewline())
{
keepChar();
break;
}
else if (isSeparating())
{
errorMessage("Unterminated string literal. Expected newline");
return;
}
else
keepChar();
}
ident = buffer[i .. bufferIndex - 1];
}
assert (ident.length > 0);
scope(exit) scope(exit)
{ {
@ -1562,16 +1578,20 @@ private:
setTokenValue(); setTokenValue();
else else
{ {
size_t b = 2 + ident.length; size_t begin = 2 + ident.length;
if (buffer[b] == '\r') ++b; if (buffer[begin] == '\r') ++begin;
if (buffer[b] == '\n') ++b; if (buffer[begin] == '\n') ++begin;
size_t e = bufferIndex; size_t end = bufferIndex;
if (buffer[e - 1] == 'c' || buffer[e - 1] == 'd' || buffer[e - 1] == 'w') // ignore string suffix
--e; if (buffer[end - 1] == 'c' || buffer[end - 1] == 'd' || buffer[end - 1] == 'w')
setTokenValue(b, e); --end;
// ignore delimeter and closing quote
setTokenValue(begin, end - ident.length - 1);
} }
} }
keepChar();
while (true) while (true)
{ {
if (isEoF()) if (isEoF())
@ -1579,7 +1599,8 @@ private:
errorMessage("Unterminated string literal"); errorMessage("Unterminated string literal");
return; return;
} }
else if (buffer[bufferIndex - ident.length .. bufferIndex] == ident) else if (bufferIndex > ident.length
&& buffer[bufferIndex - ident.length .. bufferIndex] == ident)
{ {
if (currentElement() == '"') if (currentElement() == '"')
{ {
@ -1589,7 +1610,8 @@ private:
} }
else else
{ {
errorMessage("Unterminated string literal"); errorMessage(cast(string) ("Unterminated string literal. Expected \" following "
~ cast(char[]) ident));
return; return;
} }
} }
@ -1608,13 +1630,35 @@ private:
current.type = TokenType.stringLiteral; current.type = TokenType.stringLiteral;
keepChar(); keepChar();
LexerConfig c = config; LexerConfig c = config;
config.iterStyle = IterationStyle.everything; config.iterStyle = IterationStyle.everything ^ IterationStyle.ignoreEOF;
assert (!(config.iterStyle & IterationStyle.ignoreEOF));
config.tokenStyle = TokenStyle.source; config.tokenStyle = TokenStyle.source;
size_t bi; size_t bi;
ubyte[] b = uninitializedArray!(ubyte[])(1024 * 4); ubyte[] b = uninitializedArray!(ubyte[])(1024 * 4);
int depth = 1; int depth = 1;
while (!isEoF())
scope(exit)
{ {
config = c;
buffer[0] = 'q';
buffer[1] = '{';
buffer[2 .. bi + 2] = b[0 .. bi];
bi++;
buffer[bi++] = '}';
bufferIndex = bi;
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(2, bufferIndex - 1);
}
while (true)
{
if (empty)
{
errorMessage("End of file in token string");
return;
}
advance(); advance();
while (bi + current.value.length >= b.length) while (bi + current.value.length >= b.length)
b.length += 1024 * 4; b.length += 1024 * 4;
@ -1629,17 +1673,6 @@ private:
break; break;
} }
} }
config = c;
buffer[0] = 'q';
buffer[1] = '{';
buffer[2 .. bi + 2] = b[0 .. bi];
bi++;
buffer[bi++] = '}';
bufferIndex = bi;
if (config.tokenStyle & TokenStyle.includeQuotes)
setTokenValue();
else
setTokenValue(2, bufferIndex - 1);
lexStringSuffix(); lexStringSuffix();
} }
@ -1664,7 +1697,7 @@ private:
errorMessage("Found EOF when interpreting special token sequence"); errorMessage("Found EOF when interpreting special token sequence");
return; return;
} }
else if (isNewline(r.front)) else if (r.front == '\r' || r.front == '\n')
break; break;
else else
{ {
@ -1706,18 +1739,64 @@ private:
current.column, s); current.column, s);
} }
void keepNonNewlineChar() void keepChar()
{ {
if (bufferIndex >= buffer.length) if (bufferIndex >= buffer.length)
buffer.length += 1024; buffer.length += 1024;
static if (isArray!R) static if (isArray!R)
buffer[bufferIndex++] = range[index++]; {
if (range[index] & ~0b1000_0000)
{
buffer[bufferIndex++] = range[index++];
++column;
}
else if (range[index] & 0b1100_0000)
{
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
column += 2;
}
else if (range[index] & 0b1110_0000)
{
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
column += 3;
}
else if (range[index] & 0b1111_0000)
{
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
buffer[bufferIndex++] = range[index++];
column += 4;
}
else
{
errorMessage("Invalid UTF-8 code unit");
buffer[bufferIndex++] = range[index++];
++column;
}
}
else else
{ {
buffer[bufferIndex++] = currentElement(); if (range[index] & 0x80)
advanceRange(); {
while (range[index] & 0x80)
{
buffer[bufferIndex++] = range[index++];
advanceRange();
++column;
}
}
else
{
buffer[bufferIndex++] = currentElement();
advanceRange();
++column;
}
} }
++column;
} }
void bufferChar(ubyte ch) void bufferChar(ubyte ch)
@ -1727,50 +1806,12 @@ private:
buffer[bufferIndex++] = ch; buffer[bufferIndex++] = ch;
} }
void keepChar() void keepNewline()
{ {
while (bufferIndex + 2 >= buffer.length) while (bufferIndex + 4 >= buffer.length)
buffer.length += 1024; buffer.length += 1024;
bool foundNewline; bool foundNewline = isNewline();
if (currentElement() == '\r') keepChar();
{
static if (isArray!R)
{
buffer[bufferIndex++] = range[index++];
}
else
{
buffer[bufferIndex++] = currentElement();
advanceRange();
}
foundNewline = true;
}
if (currentElement() == '\n')
{
static if (isArray!R)
{
buffer[bufferIndex++] = range[index++];
}
else
{
buffer[bufferIndex++] = currentElement();
advanceRange();
}
foundNewline = true;
}
else
{
static if (isArray!R)
{
buffer[bufferIndex++] = range[index++];
}
else
{
buffer[bufferIndex++] = currentElement();
advanceRange();
}
++column;
}
if (foundNewline) if (foundNewline)
{ {
++lineNumber; ++lineNumber;
@ -1798,6 +1839,7 @@ private:
{ {
if (endIndex == 0) if (endIndex == 0)
endIndex = bufferIndex; endIndex = bufferIndex;
assert (endIndex > startIndex);
current.value = cache.get(buffer[startIndex .. endIndex]); current.value = cache.get(buffer[startIndex .. endIndex]);
} }
@ -1823,36 +1865,44 @@ private:
return false; return false;
} }
bool isWhite() const nothrow bool isNewline() const nothrow
{ {
auto c = currentElement(); if (currentElement() == '\n') return true;
if (c & 0x80) // multi-byte utf-8 if (currentElement() == '\r') return true;
static if (isArray!R)
{ {
static if (isArray!R) if (index + 2 >= range.length) return false;
{ if (range[index] != 0xe2) return false;
if (index + 2 >= range.length) return false; if (range[index + 1] != 0x80) return false;
if (range[index] != 0xe2) return false; if (range[index + 2] != 0xa8 && range[index + 2] != 0xa9) return false;
if (range[index + 1] != 0x80) return false;
if (range[index + 2] != 0xa8 && range[index + 2] != 0xa9) return false;
}
else
{
auto r = range.save();
if (r.front != 0xe2)
return false;
else
r.popFront();
if (r.empty || r.front != 0x80)
return false;
else
r.popFront();
if (r.empty || (r.front != 0xa8 && range.front != 0xa9))
return false;
}
return true; return true;
} }
else else
{
auto r = range.save();
if (r.front != 0xe2)
return false;
else
r.popFront();
if (r.empty || r.front != 0x80)
return false;
else
r.popFront();
if (r.empty || (r.front != 0xa8 && range.front != 0xa9))
return false;
return true;
}
}
bool isWhite() const nothrow
{
if (isNewline())
return true;
else
{
auto c = currentElement();
return c == 0x20 || (c >= 0x09 && c <= 0x0d); return c == 0x20 || (c >= 0x09 && c <= 0x0d);
}
} }
immutable bufferSize = 1024 * 8; immutable bufferSize = 1024 * 8;
@ -2694,7 +2744,7 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
caseStatement ~= k; caseStatement ~= k;
caseStatement ~= "':\n"; caseStatement ~= "':\n";
caseStatement ~= indentString; caseStatement ~= indentString;
caseStatement ~= "\tkeepNonNewlineChar();\n"; caseStatement ~= "\tkeepChar();\n";
if (v.children.length > 0) if (v.children.length > 0)
{ {
caseStatement ~= indentString; caseStatement ~= indentString;
@ -2983,7 +3033,7 @@ unittest
unittest unittest
{ {
auto source = cast(ubyte[]) ("int #line 4\n double q{abcde}"); auto source = cast(ubyte[]) ("int #line 4\n double q{ab{cd}e}w");
LexerConfig config; LexerConfig config;
auto tokens = byToken(source, config); auto tokens = byToken(source, config);
assert (tokens.front.line == 1); assert (tokens.front.line == 1);
@ -2992,8 +3042,9 @@ unittest
assert (isType(tokens.front)); assert (isType(tokens.front));
assert (tokens.front.value == "double"); assert (tokens.front.value == "double");
tokens.popFront(); tokens.popFront();
assert (tokens.front.value == "abcde"); assert (tokens.front.value == "ab{cd}e");
assert (isStringLiteral(tokens.front)); assert (isStringLiteral(tokens.front));
assert (tokens.front.type == TokenType.wstringLiteral);
} }
unittest unittest
@ -3012,23 +3063,37 @@ unittest
unittest unittest
{ {
auto source = cast(ubyte[]) ("import foo"); auto source = cast(ubyte[]) ("import\u2028foo\u2029; "c);
LexerConfig config; LexerConfig config;
auto tokens = byToken(source, config); auto tokens = byToken(source, config);
Token a = tokens.moveFront(); Token a = tokens.moveFront();
assert (a.type == TokenType.import_);
Token b = tokens.moveFront(); Token b = tokens.moveFront();
writeln(b);
assert (a.type == TokenType.identifier);
assert (a != b); assert (a != b);
assert (a != "foo"); assert (a != "foo");
assert (a < b); assert (a < b);
assert (b > a); assert (b > a);
assert (!(a > a)); assert (!(a > a));
writeln(tokens.front);
assert (tokens.front.type == TokenType.semicolon);
tokens.popFront();
assert (tokens.empty); assert (tokens.empty);
//assert (tokens.empty);
} }
unittest unittest
{ {
auto source = cast(ubyte[]) ("import std.stdio; void main(){writeln(\"hello world\");}"); auto source = cast(ubyte[]) ("import std.stdio; void main(){"
~ "writeln(\"hello world\");} q{ __EOF__ }");
int errCount = 0;
void errorFunction(string file, size_t index, uint line, uint col, string msg)
{
++errCount;
}
LexerConfig config; LexerConfig config;
config.errorFunc = &errorFunction;
auto tokens = byToken(source, config); auto tokens = byToken(source, config);
int tokenCount = 0; int tokenCount = 0;
foreach (t; tokens) foreach (t; tokens)
@ -3036,6 +3101,19 @@ unittest
++tokenCount; ++tokenCount;
} }
assert (tokenCount == 16); assert (tokenCount == 16);
assert (errCount == 1);
}
unittest
{
auto source = cast(ubyte[]) ("q\"abcd\nstring\nabcd\" q\"/abc/\" __EOF__ int");
LexerConfig config;
auto tokens = byToken(source, config);
assert (tokens.front.value == "string\n");
tokens.popFront();
assert (tokens.front.value == "abc");
tokens.popFront();
assert (tokens.empty);
} }