Merge pull request #9 from roman-d-boiko/master

Implemented EoF handling and partially fixed lexing numbers
Commit 1dc70fb0ae by Hackerpilot, 2012-04-26 08:24:05 -07:00
2 changed files with 132 additions and 63 deletions


@@ -60,7 +60,7 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
         if (v.children.length > 0)
         {
             caseStatement ~= indentString;
-            caseStatement ~= "\tif (endIndex >= inputString.length)\n";
+            caseStatement ~= "\tif (isEoF(inputString, endIndex))\n";
             caseStatement ~= indentString;
             caseStatement ~= "\t{\n";
             caseStatement ~= indentString;
@@ -110,3 +110,28 @@ string generateCaseTrie(string[] args ...)
     }
     return printCaseStatements(t, "");
 }
+
+/**
+ * Returns: true if index points to end of inputString, false otherwise
+ */
+pure nothrow bool isEoF(S)(S inputString, size_t index)
+{
+    // note: EoF is determined according to D specification
+    return index >= inputString.length
+        || inputString[index] == Character.NUL
+        || inputString[index] == Character.SUB;
+}
+
+private:
+
+// Unicode character literals
+enum Character
+{
+    // End of file (EoF)
+    NUL = '\u0000', // NUL character
+    SUB = '\u001A', // Substitute character
+
+    // Line feed (EoL)
+    CR = '\u000D', // CR character
+    LF = '\u000A', // LF character
+}
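
For reference, and not part of the commit: the D specification treats U+0000 (NUL) and U+001A (SUB) as end-of-file markers in addition to the physical end of the source text, which is exactly the rule isEoF above encodes. A self-contained sketch of the same rule, using an illustrative name:

// Standalone illustration of the EoF rule; reachedEoF is a hypothetical
// name and does not ship with this commit.
pure nothrow bool reachedEoF(string source, size_t index)
{
    // Physical end of input, NUL, or SUB all terminate lexing.
    return index >= source.length
        || source[index] == '\u0000'
        || source[index] == '\u001A';
}

unittest
{
    assert(reachedEoF("abc", 3));        // past the last character
    assert(reachedEoF("ab\u001Acd", 2)); // SUB cuts the source short
    assert(!reachedEoF("abc", 1));       // ordinary character
}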


@@ -29,11 +29,11 @@ import codegen;
  * Returns: The whitespace, or null if style was CODE_ONLY
  */
 pure nothrow string lexWhitespace(S)(S inputString, ref size_t endIndex,
-    ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY)
+    ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY) // I suggest to remove the last param
     if (isSomeString!S)
 {
     immutable startIndex = endIndex;
-    while (endIndex < inputString.length && isWhite(inputString[endIndex]))
+    while (!isEoF(inputString, endIndex) && isWhite(inputString[endIndex]))
     {
         if (inputString[endIndex] == '\n')
             lineNumber++;
@@ -64,7 +64,7 @@ pure nothrow string lexScriptLine(S)(ref S inputString, ref size_t endIndex,
     if(inputString.length > 1 && inputString[0..2] == "#!") // safety check
     {
         endIndex = 2; // skip #!
-        while (endIndex < inputString.length && inputString[endIndex] != '\n')
+        while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
             ++endIndex;
         result = inputString[startIndex..endIndex];
@@ -85,13 +85,13 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
 pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
     ref uint lineNumber) if (isSomeString!S)
 {
-    if (inputString.length == 0)
+    if (isEoF(inputString, endIndex))
         return "";
     auto startIndex = endIndex - 1;
     switch(inputString[endIndex])
     {
     case '/':
-        while (endIndex < inputString.length && inputString[endIndex] != '\n')
+        while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
         {
             if (inputString[endIndex] == '\n')
                 ++lineNumber;
@@ -99,7 +99,7 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
         }
         break;
     case '*':
-        while (endIndex < inputString.length
+        while (!isEoF(inputString, endIndex)
             && !inputString[endIndex..$].startsWith("*/"))
         {
             if (inputString[endIndex] == '\n')
@@ -111,7 +111,7 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
     case '+':
         ++endIndex;
         int depth = 1;
-        while (depth > 0 && endIndex + 1 < inputString.length)
+        while (depth > 0 && !isEoF(inputString, endIndex))
         {
             if (inputString[endIndex] == '\n')
                 lineNumber++;
@@ -121,7 +121,8 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
                 depth++;
             ++endIndex;
         }
-        ++endIndex;
+        if (!isEoF(inputString, endIndex))
+            ++endIndex;
         break;
     default:
         break;
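
The depth counter in the '+' branch exists because D's /+ +/ comments nest, unlike /* */. A standalone sketch of the same depth logic (illustrative only; skipNestedComment is a hypothetical name and omits the commit's lineNumber and token bookkeeping):

// Returns the index just past the matching "+/", or the end of input if
// the comment is unterminated. The caller has already consumed "/+".
pure size_t skipNestedComment(string s, size_t i)
{
    int depth = 1;
    while (depth > 0 && i + 1 < s.length)
    {
        if (s[i .. i + 2] == "+/")      { --depth; i += 2; }
        else if (s[i .. i + 2] == "/+") { ++depth; i += 2; }
        else                            { ++i; }
    }
    return i;
}

unittest
{
    enum src = "/+ a /+ b +/ c +/ rest";
    assert(src[skipNestedComment(src, 2) .. $] == " rest");
}
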
@@ -145,7 +146,7 @@ pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint
 in
 {
     assert (inputString[endIndex] == quote);
-    assert (quote == '\'' || quote == '\"' || quote == '`');
+    assert (quote == '\'' || quote == '"' || quote == '`');
 }
 body
 {
@@ -154,7 +155,7 @@ body
     auto startIndex = endIndex;
     ++endIndex;
     bool escape = false;
-    while (endIndex < inputString.length && (inputString[endIndex] != quote || escape))
+    while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape))
     {
         if (escape)
             escape = false;
@@ -165,12 +166,11 @@ body
             ++endIndex;
     }
     ++endIndex;
-    if (endIndex < inputString.length && (inputString[endIndex] == 'w'
+    if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
         || inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
     {
         ++endIndex;
     }
-    endIndex = min(endIndex, inputString.length);
     return inputString[startIndex .. endIndex];
 }
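
The trailing check for 'w', 'd', or 'c' consumes D's optional string-literal postfix, which fixes the element type of the literal; the lexer only needs to pull that one extra character into the token. Illustrative inputs, not part of the diff:

// The three postfixes a D string literal may carry.
unittest
{
    auto utf8  = "hello"c; // immutable(char)[]  == string
    auto utf16 = "hello"w; // immutable(wchar)[] == wstring
    auto utf32 = "hello"d; // immutable(dchar)[] == dstring
    static assert(is(typeof(utf8)  == string));
    static assert(is(typeof(utf16) == wstring));
    static assert(is(typeof(utf32) == dstring));
}
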
@@ -188,6 +188,7 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 {
     auto startIndex = endIndex;
     ++endIndex;
+    assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF?
     string open = inputString[endIndex .. endIndex + 1];
     string close;
     bool nesting = false;
@@ -198,12 +199,13 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
     case '{': close = "}"; ++endIndex; nesting = true; break;
     case '(': close = ")"; ++endIndex; nesting = true; break;
     default:
-        while(!isWhite(inputString[endIndex])) endIndex++;
+        while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
+            endIndex++;
         close = open = inputString[startIndex + 1 .. endIndex];
         break;
     }
     int depth = 1;
-    while (endIndex < inputString.length && depth > 0)
+    while (!isEoF(inputString, endIndex) && depth > 0)
     {
         if (inputString[endIndex] == '\n')
         {
@@ -213,9 +215,9 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
         else if (inputString[endIndex..$].startsWith(open))
         {
             endIndex += open.length;
-            if (!nesting)
+            if (!nesting && !isEoF(inputString, endIndex))
             {
-                if (inputString[endIndex] == '\"')
+                if (inputString[endIndex] == '"')
                     ++endIndex;
                 break;
             }
@@ -231,7 +233,7 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
         else
             ++endIndex;
     }
-    if (endIndex < inputString.length && inputString[endIndex] == '\"')
+    if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
         ++endIndex;
     return inputString[startIndex .. endIndex];
 }
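
For context: lexDelimitedString handles D's q"..." delimited string literals, whose delimiter may be a nesting bracket pair, a single arbitrary character, or an identifier placed on its own line. A few of the shapes it has to recognize (illustrative only, not part of the commit):

// Delimited string forms from the D specification.
unittest
{
    auto bracketed = q"(a "quoted" word)"; // ( ) delimiters may nest
    auto charDelim = q"/no nesting here/"; // arbitrary single-character delimiter
    auto identDelim = q"EOS
heredoc-style body
EOS";
    assert(bracketed == `a "quoted" word`);
    assert(charDelim == "no nesting here");
    assert(identDelim == "heredoc-style body\n");
}
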
@@ -254,7 +256,7 @@ pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
     if (inputString[endIndex] == '0')
     {
         endIndex++;
-        if (endIndex >= inputString.length)
+        if (isEoF(inputString, endIndex))
         {
             token.type = TokenType.intLiteral;
             token.value = inputString[startIndex .. endIndex];
@@ -295,32 +297,42 @@ pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
     bool isLong = false;
     bool isUnsigned = false;
     token.type = TokenType.intLiteral;
-    binaryLoop: while (endIndex < inputString.length)
+    binaryLoop: while (!isEoF(inputString, endIndex))
     {
         switch (inputString[endIndex])
         {
         case '0':
         case '1':
         case '_':
-            ++endIndex;
             if (lexingSuffix)
                 break binaryLoop;
+            ++endIndex;
             break;
         case 'u':
         case 'U':
+            if (isUnsigned)
+                break;
             ++endIndex;
             lexingSuffix = true;
             if (isLong)
+            {
                 token.type = TokenType.unsignedLongLiteral;
+                break binaryLoop;
+            }
             else
                 token.type = TokenType.unsignedIntLiteral;
+            isUnsigned = true;
             break;
         case 'L':
-            ++endIndex;
             if (isLong)
                 break binaryLoop;
+            ++endIndex;
+            lexingSuffix = true;
             if (isUnsigned)
+            {
                 token.type = TokenType.unsignedLongLiteral;
+                break binaryLoop;
+            }
             else
                 token.type = TokenType.longLiteral;
             isLong = true;
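
As a reading aid (not part of the commit), the suffix handling above is meant to classify binary literals roughly like the compiler does; the token type names are the ones already used in this diff:

// 0b1010   -> intLiteral          0b1010u  -> unsignedIntLiteral
// 0b1010L  -> longLiteral         0b1010uL -> unsignedLongLiteral
unittest
{
    static assert(is(typeof(0b1010)   == int));
    static assert(is(typeof(0b1010u)  == uint));
    static assert(is(typeof(0b1010L)  == long));
    static assert(is(typeof(0b1010uL) == ulong));
}
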
@@ -346,15 +358,15 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
     bool foundE = false;
     bool foundPlusMinus = false;
     token.type = TokenType.intLiteral;
-    decimalLoop: while (endIndex < inputString.length)
+    decimalLoop: while (!isEoF(inputString, endIndex))
     {
         switch (inputString[endIndex])
         {
         case '0': .. case '9':
         case '_':
-            ++endIndex;
             if (lexingSuffix)
                 break decimalLoop;
+            ++endIndex;
             break;
         case 'e':
         case 'E':
@@ -371,8 +383,10 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
             ++endIndex;
             break;
         case '.':
+            if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
+                break decimalLoop; // possibly slice expression
             if (foundDot)
-                break decimalLoop;
+                break decimalLoop; // two dots with other characters between them
             ++endIndex;
             foundDot = true;
             token.type = TokenType.doubleLiteral;
@@ -380,6 +394,8 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
             break;
         case 'u':
         case 'U':
+            if (isUnsigned)
+                break decimalLoop;
             ++endIndex;
             lexingSuffix = true;
             if (isLong)
@@ -389,10 +405,12 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
             isUnsigned = true;
             break;
         case 'L':
+            if (isLong)
+                break decimalLoop;
+            if (isReal)
+                break decimalLoop;
             ++endIndex;
             lexingSuffix = true;
-            if (isLong || isReal)
-                break decimalLoop;
             if (isDouble)
                 token.type = TokenType.realLiteral;
             else if (isUnsigned)
@@ -414,9 +432,34 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
         }
     }
+    // suggest to extract lexing integers into a separate function
+    // please see unittest below
     token.value = inputString[startIndex .. endIndex];
 }
 
+unittest {
+    dump!lexDecimal("55e-4"); // yields intLiteral, but should be float
+    dump!lexDecimal("3e+f"); // floatLiteral, but should be considered invalid
+    dump!lexDecimal("3e++f"); // intLiteral 3e+, but should be considered invalid
+    // actually, there are lots of bugs. The point is that without decomposition of integer lexing from floating-point lexing
+    // it is very hard to prove algorithm correctness
+}
+
+// Temporary function to illustrate some problems
+// Executes T and dumps results to console
+void dump(alias T)(string s) {
+    size_t start;
+    size_t end;
+    Token tok;
+    T!(string)(s, start, end, tok);
+    // dump results
+    writeln(tok.type);
+    writeln(tok.value);
+    writeln(start);
+    writeln(end);
+}
+
 nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
     ref size_t endIndex, ref Token token) if (isSomeString!S)
 {
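
One possible shape for the decomposition the unittest above asks for, sketched with hypothetical names (LiteralKind, lexIntegerSuffix); the commit itself contains no such helper:

// Lex only the integer suffix, separately from floating-point handling.
// LiteralKind stands in for the real TokenType values.
enum LiteralKind { intLit, uintLit, longLit, ulongLit }

pure nothrow LiteralKind lexIntegerSuffix(string s, ref size_t i)
{
    bool isLong, isUnsigned;
    // Accept at most one 'L' and one 'u'/'U', in either order.
    foreach (_; 0 .. 2)
    {
        if (i < s.length && (s[i] == 'u' || s[i] == 'U') && !isUnsigned)
        {
            isUnsigned = true;
            ++i;
        }
        else if (i < s.length && s[i] == 'L' && !isLong)
        {
            isLong = true;
            ++i;
        }
    }
    if (isLong && isUnsigned) return LiteralKind.ulongLit;
    if (isLong)               return LiteralKind.longLit;
    if (isUnsigned)           return LiteralKind.uintLit;
    return LiteralKind.intLit;
}

unittest
{
    size_t i = 2;
    assert(lexIntegerSuffix("10uL", i) == LiteralKind.ulongLit && i == 4);
    i = 1;
    assert(lexIntegerSuffix("1", i) == LiteralKind.intLit && i == 1);
}
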
@@ -430,7 +473,7 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
     bool foundE = false;
     bool foundPlusMinus = false;
     token.type = TokenType.intLiteral;
-    hexLoop: while (endIndex < inputString.length)
+    hexLoop: while (!isEoF(inputString, endIndex))
     {
         switch (inputString[endIndex])
         {
@@ -438,9 +481,9 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
         case 'a': .. case 'f':
         case 'A': .. case 'F':
         case '_':
-            ++endIndex;
             if (lexingSuffix)
                 break hexLoop;
+            ++endIndex;
             break;
         case 'p':
         case 'P':
@@ -457,8 +500,10 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
             ++endIndex;
             break;
         case '.':
+            if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
+                break hexLoop; // possibly slice expression
             if (foundDot)
-                break hexLoop;
+                break hexLoop; // two dots with other characters between them
             ++endIndex;
             foundDot = true;
             token.type = TokenType.doubleLiteral;
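
The new two-character lookahead keeps slice syntax from being folded into a number token, while genuine fractional parts and hex floats with a binary exponent still are. Illustrative inputs only, not part of the commit:

// The dot after '0' below starts a range, not a fractional part.
unittest
{
    int[5] a = [0, 1, 2, 3, 4];
    auto slice = a[0..3];  // "0..3" must lex as two integer literals around '..'
    auto fp    = 1.5;      // here the dot belongs to the literal
    auto hexfp = 0x1.8p3;  // hex float: 1.5 * 2^3
    assert(slice.length == 3 && fp == 1.5 && hexfp == 12.0);
}
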
@@ -525,7 +570,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
         currentToken.type = TokenType.scriptLine;
     }
 
-    while (endIndex < inputString.length)
+    while (!isEoF(inputString, endIndex))
     {
         size_t prevIndex = endIndex;
         Token currentToken;
@@ -536,7 +581,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
         {
             currentToken.lineNumber = lineNumber;
             currentToken.value = lexWhitespace(inputString, endIndex,
-                lineNumber, IterationStyle.EVERYTHING);
+                lineNumber, IterationStyle.EVERYTHING); // note: I suggest to remove the last parameter to simplify lexWhitespace
             currentToken.type = TokenType.whitespace;
             tokenAppender.put(currentToken);
         }
@@ -615,7 +660,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
             break;
         case '/':
             ++endIndex;
-            if (endIndex >= inputString.length)
+            if (isEoF(inputString, endIndex))
             {
                 currentToken.value = "/";
                 currentToken.type = TokenType.div;
@@ -651,18 +696,14 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
             }
             break;
         case 'r':
-            currentToken.value = "r";
             ++endIndex;
-            if (inputString[endIndex] == '\"')
-            {
-                currentToken.lineNumber = lineNumber;
-                currentToken.value = lexString(inputString, endIndex,
-                    lineNumber, inputString[endIndex], false);
-                currentToken.type = TokenType.stringLiteral;
-                break;
-            }
-            else
+            if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
                 goto default;
+            currentToken.lineNumber = lineNumber;
+            currentToken.value = lexString(inputString, endIndex,
+                lineNumber, inputString[endIndex], false);
+            currentToken.type = TokenType.stringLiteral;
+            break;
         case '`':
             currentToken.lineNumber = lineNumber;
             currentToken.value = lexString(inputString, endIndex, lineNumber,
@@ -670,12 +711,11 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
             currentToken.type = TokenType.stringLiteral;
             break;
         case 'x':
-            currentToken.value = "x";
             ++endIndex;
-            if (inputString[endIndex] == '\"')
-                goto case '\"';
-            else
+            if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
                 goto default;
+            else
+                goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
         case '\'':
         case '"':
             currentToken.lineNumber = lineNumber;
@@ -684,30 +724,34 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
             currentToken.type = TokenType.stringLiteral;
             break;
         case 'q':
-            currentToken.value = "q";
             ++endIndex;
-            switch (inputString[endIndex])
-            {
-                case '\"':
-                    currentToken.lineNumber = lineNumber;
-                    currentToken.value ~= "q" ~ lexDelimitedString(inputString,
-                        endIndex, lineNumber);
-                    currentToken.type = TokenType.stringLiteral;
-                    break outerSwitch;
-                case '{':
-                    currentToken.lineNumber = lineNumber;
-                    currentToken.value ~= "q" ~ lexTokenString(inputString,
-                        endIndex, lineNumber);
-                    currentToken.type = TokenType.stringLiteral;
-                    break outerSwitch;
-                default:
-                    break;
-            }
+            if (!isEoF(inputString, endIndex))
+            {
+                switch (inputString[endIndex])
+                {
+                    case '"':
+                        currentToken.lineNumber = lineNumber;
+                        currentToken.value ~= lexDelimitedString(inputString,
+                            endIndex, lineNumber);
+                        currentToken.type = TokenType.stringLiteral;
+                        break outerSwitch;
+                    case '{':
+                        currentToken.lineNumber = lineNumber;
+                        currentToken.value ~= lexTokenString(inputString,
+                            endIndex, lineNumber);
+                        currentToken.type = TokenType.stringLiteral;
+                        break outerSwitch;
+                    default:
+                        break;
+                }
+            }
             goto default;
         case '@':
             ++endIndex;
             goto default;
         default:
-            while(endIndex < inputString.length && !isSeparating(inputString[endIndex]))
+            while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
                 ++endIndex;
             currentToken.value = inputString[startIndex .. endIndex];
             currentToken.type = lookupTokenType(currentToken.value);