Merge pull request #9 from roman-d-boiko/master

Implemented EoF handling and partially fixed lexing numbers
2012-04-26 08:24:05 -07:00 · 2012-04-26 08:24:05 -07:00 · 1dc70fb0ae
parent fdcf5fe4ca e528248c59
commit 1dc70fb0ae
2 changed files with 132 additions and 63 deletions
--- a/codegen.d
+++ b/codegen.d
@ -60,7 +60,7 @@ string printCaseStatements(K, V)(TrieNode!(K,V) node, string indentString)
 		if (v.children.length > 0)
 		{
 			caseStatement ~= indentString;
-			caseStatement ~= "\tif (endIndex >= inputString.length)\n";
+			caseStatement ~= "\tif (isEoF(inputString, endIndex))\n";
 			caseStatement ~= indentString;
 			caseStatement ~= "\t{\n";
 			caseStatement ~= indentString;
@ -110,3 +110,28 @@ string generateCaseTrie(string[] args ...)
 	}
 	return printCaseStatements(t, "");
 }
+
+/**
+ * Returns: true if index points to end of inputString, false otherwise
+ */
+pure nothrow bool isEoF(S)(S inputString, size_t index)
+{
+	// note: EoF is determined according to D specification
+	return index >= inputString.length
+		|| inputString[index] == Character.NUL
+		|| inputString[index] == Character.SUB;
+}
+
+private:
+
+	// Unicode character literals
+	enum Character
+	{
+		// End of file (EoF)
+		NUL = '\u0000',	// NUL character
+		SUB = '\u001A',	// Substitute character
+
+		// Line feed (EoL)
+		CR = '\u000D', // CR character
+		LF = '\u000A',	// LF character	
+	}
--- a/tokenizer.d
+++ b/tokenizer.d
@ -29,11 +29,11 @@ import codegen;
 * Returns: The whitespace, or null if style was CODE_ONLY
 */
 pure nothrow string lexWhitespace(S)(S inputString, ref size_t endIndex,
-	ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY)
+	ref uint lineNumber, IterationStyle style = IterationStyle.CODE_ONLY) // I suggest to remove the last param
 	if (isSomeString!S)
 {
 	immutable startIndex = endIndex;
-	while (endIndex < inputString.length && isWhite(inputString[endIndex]))
+	while (!isEoF(inputString, endIndex) && isWhite(inputString[endIndex]))
 	{
 		if (inputString[endIndex] == '\n')
 			lineNumber++;
@ -64,7 +64,7 @@ pure nothrow string lexScriptLine(S)(ref S inputString, ref size_t endIndex,
 	if(inputString.length > 1 && inputString[0..2] == "#!") // safety check
 	{
 		endIndex = 2; // skip #!
-		while (endIndex < inputString.length && inputString[endIndex] != '\n')
+		while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
 			++endIndex;

 		result = inputString[startIndex..endIndex];
@ -85,13 +85,13 @@ pure nothrow string lexScriptLine(S)(ref S inputString, ref size_t endIndex,
 pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
 	ref uint lineNumber) if (isSomeString!S)
 {
-	if (inputString.length == 0)
+	if (isEoF(inputString, endIndex))
 		return "";
 	auto startIndex = endIndex - 1;
 	switch(inputString[endIndex])
 	{
 	case '/':
-		while (endIndex < inputString.length && inputString[endIndex] != '\n')
+		while (!isEoF(inputString, endIndex) && inputString[endIndex] != '\n')
 		{
 			if (inputString[endIndex] == '\n')
 				++lineNumber;
@ -99,7 +99,7 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
 		}
 		break;
 	case '*':
-		while (endIndex < inputString.length
+		while (!isEoF(inputString, endIndex)
 			&& !inputString[endIndex..$].startsWith("*/"))
 		{
 			if (inputString[endIndex] == '\n')
@ -111,7 +111,7 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
 	case '+':
 		++endIndex;
 		int depth = 1;
-		while (depth > 0 && endIndex + 1 < inputString.length)
+		while (depth > 0 && !isEoF(inputString, endIndex))
 		{
 			if (inputString[endIndex] == '\n')
 				lineNumber++;
@ -121,7 +121,8 @@ pure nothrow string lexComment(S)(ref S inputString, ref size_t endIndex,
 				depth++;
 			++endIndex;
 		}
-		++endIndex;
+		if (!isEoF(inputString, endIndex))
+			++endIndex;
 		break;
 	default:
 		break;
@ -145,7 +146,7 @@ pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint
 in
 {
 	assert (inputString[endIndex] == quote);
-	assert (quote == '\'' || quote == '\"' || quote == '`');
+	assert (quote == '\'' || quote == '"' || quote == '`');
 }
 body
 {
@ -154,7 +155,7 @@ body
 	auto startIndex = endIndex;
 	++endIndex;
 	bool escape = false;
-	while (endIndex < inputString.length && (inputString[endIndex] != quote || escape))
+	while (!isEoF(inputString, endIndex) && (inputString[endIndex] != quote || escape))
 	{
 		if (escape)
 			escape = false;
@ -165,12 +166,11 @@ body
 		++endIndex;
 	}
 	++endIndex;
-	if (endIndex < inputString.length && (inputString[endIndex] == 'w'
+	if (!isEoF(inputString, endIndex) && (inputString[endIndex] == 'w'
 		|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
 	{
 		++endIndex;
 	}
-	endIndex = min(endIndex, inputString.length);
 	return inputString[startIndex .. endIndex];
 }

@ -188,6 +188,7 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 {
 	auto startIndex = endIndex;
 	++endIndex;
+	assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF?
 	string open = inputString[endIndex .. endIndex + 1];
 	string close;
 	bool nesting = false;
@ -198,12 +199,13 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 	case '{': close = "}"; ++endIndex; nesting = true; break;
 	case '(': close = ")"; ++endIndex; nesting = true; break;
 	default:
-		while(!isWhite(inputString[endIndex])) endIndex++;
+		while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
+			endIndex++;
 		close = open = inputString[startIndex + 1 .. endIndex];
 		break;
 	}
 	int depth = 1;
-	while (endIndex < inputString.length && depth > 0)
+	while (!isEoF(inputString, endIndex) && depth > 0)
 	{
 		if (inputString[endIndex] == '\n')
 		{
@ -213,9 +215,9 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 		else if (inputString[endIndex..$].startsWith(open))
 		{
 			endIndex += open.length;
-			if (!nesting)
+			if (!nesting && !isEoF(inputString, endIndex))
 			{
-				if (inputString[endIndex] == '\"')
+				if (inputString[endIndex] == '"')
 					++endIndex;
 				break;
 			}
@ -231,7 +233,7 @@ string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
 		else
 			++endIndex;
 	}
-	if (endIndex < inputString.length && inputString[endIndex] == '\"')
+	if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
 		++endIndex;
 	return inputString[startIndex .. endIndex];
 }
@ -254,7 +256,7 @@ pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
 	if (inputString[endIndex] == '0')
 	{
 		endIndex++;
-		if (endIndex >= inputString.length)
+		if (isEoF(inputString, endIndex))
 		{
 			token.type = TokenType.intLiteral;
 			token.value = inputString[startIndex .. endIndex];
@ -295,32 +297,42 @@ pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
 	bool isLong = false;
 	bool isUnsigned = false;
 	token.type = TokenType.intLiteral;
-	binaryLoop: while (endIndex < inputString.length)
+	binaryLoop: while (!isEoF(inputString, endIndex))
 	{
 		switch (inputString[endIndex])
 		{
 		case '0':
 		case '1':
 		case '_':
-			++endIndex;
 			if (lexingSuffix)
 				break binaryLoop;
+			++endIndex;
 			break;
 		case 'u':
 		case 'U':
+			if (isUnsigned)
+				break;
 			++endIndex;
 			lexingSuffix = true;
 			if (isLong)
+			{
 				token.type = TokenType.unsignedLongLiteral;
+				break binaryLoop;
+			}
 			else
 				token.type = TokenType.unsignedIntLiteral;
+			isUnsigned = true;
 			break;
 		case 'L':
-			++endIndex;
 			if (isLong)
 				break binaryLoop;
+			++endIndex;
+			lexingSuffix = true;
 			if (isUnsigned)
+			{
 				token.type = TokenType.unsignedLongLiteral;
+				break binaryLoop;
+			}
 			else
 				token.type = TokenType.longLiteral;
 			isLong = true;
@ -346,15 +358,15 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
 	bool foundE = false;
 	bool foundPlusMinus = false;
 	token.type = TokenType.intLiteral;
-	decimalLoop: while (endIndex < inputString.length)
+	decimalLoop: while (!isEoF(inputString, endIndex))
 	{
 		switch (inputString[endIndex])
 		{
 		case '0': .. case '9':
 		case '_':
-			++endIndex;
 			if (lexingSuffix)
 				break decimalLoop;
+			++endIndex;
 			break;
 		case 'e':
 		case 'E':
@ -371,8 +383,10 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
 			++endIndex;
 			break;
 		case '.':
+			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
+				break decimalLoop; // possibly slice expression
 			if (foundDot)
-				break decimalLoop;
+				break decimalLoop; // two dots with other characters between them
 			++endIndex;
 			foundDot = true;
 			token.type = TokenType.doubleLiteral;
@ -380,6 +394,8 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
 			break;
 		case 'u':
 		case 'U':
+			if (isUnsigned)
+				break decimalLoop;
 			++endIndex;
 			lexingSuffix = true;
 			if (isLong)
@ -389,10 +405,12 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
 			isUnsigned = true;
 			break;
 		case 'L':
+			if (isLong)
+				break decimalLoop;
+			if (isReal)
+				break decimalLoop;
 			++endIndex;
 			lexingSuffix = true;
-			if (isLong || isReal)
-				break decimalLoop;
 			if (isDouble)
 				token.type = TokenType.realLiteral;
 			else if (isUnsigned)
@ -414,9 +432,34 @@ pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
 		}
 	}

+	// suggest to extract lexing integers into a separate function
+	// please see unittest below
+
 	token.value = inputString[startIndex .. endIndex];
 }

+unittest {
+	dump!lexDecimal("55e-4"); // yeilds intLiteral, but should be float
+	dump!lexDecimal("3e+f"); // floatLiteral, but should be considered invalid
+	dump!lexDecimal("3e++f"); // intLiteral 3e+, but should be considered invalid
+	// actually, there are lots of bugs. The point is that without decomposition of integer lexing from floating-point lexing
+	// it is very hard to prove algorithm correctness
+}
+
+// Temporary function to illustrate some problems
+// Executes T and dumps results to console
+void dump(alias T)(string s) {
+	size_t start;
+	size_t end;
+	Token tok;
+	T!(string)(s, start, end, tok);
+	// dump results
+	writeln(tok.type);
+	writeln(tok.value);
+	writeln(start);
+	writeln(end);
+}
+
 nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
 	ref size_t endIndex, ref Token token) if (isSomeString!S)
 {
@ -430,7 +473,7 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
 	bool foundE = false;
 	bool foundPlusMinus = false;
 	token.type = TokenType.intLiteral;
-	hexLoop: while (endIndex < inputString.length)
+	hexLoop: while (!isEoF(inputString, endIndex))
 	{
 		switch (inputString[endIndex])
 		{
@ -438,9 +481,9 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
 		case 'a': .. case 'f':
 		case 'A': .. case 'F':
 		case '_':
-			++endIndex;
 			if (lexingSuffix)
 				break hexLoop;
+			++endIndex;
 			break;
 		case 'p':
 		case 'P':
@ -457,8 +500,10 @@ nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
 			++endIndex;
 			break;
 		case '.':
+			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
+				break hexLoop; // possibly slice expression
 			if (foundDot)
-				break hexLoop;
+				break hexLoop; // two dots with other characters between them
 			++endIndex;
 			foundDot = true;
 			token.type = TokenType.doubleLiteral;
@ -525,7 +570,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 		currentToken.type = TokenType.scriptLine;
 	}

-	while (endIndex < inputString.length)
+	while (!isEoF(inputString, endIndex))
 	{
 		size_t prevIndex = endIndex;
 		Token currentToken;
@ -536,7 +581,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			{
 				currentToken.lineNumber = lineNumber;
 				currentToken.value = lexWhitespace(inputString, endIndex,
-					lineNumber, IterationStyle.EVERYTHING);
+					lineNumber, IterationStyle.EVERYTHING); // note: I suggest to remove the last parameter to simplify lexWhitespace
 				currentToken.type = TokenType.whitespace;
 				tokenAppender.put(currentToken);
 			}
@ -615,7 +660,7 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			break;
 		case '/':
 			++endIndex;
-			if (endIndex >= inputString.length)
+			if (isEoF(inputString, endIndex))
 			{
 				currentToken.value = "/";
 				currentToken.type = TokenType.div;
@ -651,18 +696,14 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			}
 			break;
 		case 'r':
-			currentToken.value = "r";
 			++endIndex;
-			if (inputString[endIndex] == '\"')
-			{
-				currentToken.lineNumber = lineNumber;
-				currentToken.value = lexString(inputString, endIndex,
-					lineNumber, inputString[endIndex], false);
-				currentToken.type = TokenType.stringLiteral;
-				break;
-			}
-			else
+			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
 				goto default;
+			currentToken.lineNumber = lineNumber;
+			currentToken.value = lexString(inputString, endIndex,
+				lineNumber, inputString[endIndex], false);
+			currentToken.type = TokenType.stringLiteral;
+			break;
 		case '`':
 			currentToken.lineNumber = lineNumber;
 			currentToken.value = lexString(inputString, endIndex, lineNumber,
@ -670,12 +711,11 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			currentToken.type = TokenType.stringLiteral;
 			break;
 		case 'x':
-			currentToken.value = "x";
 			++endIndex;
-			if (inputString[endIndex] == '\"')
-				goto case '\"';
-			else
+			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
 				goto default;
+			else
+				goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
 		case '\'':
 		case '"':
 			currentToken.lineNumber = lineNumber;
@ -684,30 +724,34 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			currentToken.type = TokenType.stringLiteral;
 			break;
 		case 'q':
+			currentToken.value = "q";
 			++endIndex;
-			switch (inputString[endIndex])
+			if (!isEoF(inputString, endIndex))
 			{
-				case '\"':
-					currentToken.lineNumber = lineNumber;
-					currentToken.value ~= "q" ~ lexDelimitedString(inputString,
-						endIndex, lineNumber);
-					currentToken.type = TokenType.stringLiteral;
-					break outerSwitch;
-				case '{':
-					currentToken.lineNumber = lineNumber;
-					currentToken.value ~= "q" ~ lexTokenString(inputString,
-						endIndex, lineNumber);
-					currentToken.type = TokenType.stringLiteral;
-					break outerSwitch;
-				default:
-					break;
+				switch (inputString[endIndex])
+				{
+					case '"':
+						currentToken.lineNumber = lineNumber;
+						currentToken.value ~= lexDelimitedString(inputString,
+							endIndex, lineNumber);
+						currentToken.type = TokenType.stringLiteral;
+						break outerSwitch;
+					case '{':
+						currentToken.lineNumber = lineNumber;
+						currentToken.value ~= lexTokenString(inputString,
+							endIndex, lineNumber);
+						currentToken.type = TokenType.stringLiteral;
+						break outerSwitch;
+					default:
+						break;
+				}
 			}
 			goto default;
 		case '@':
 			++endIndex;
 			goto default;
 		default:
-			while(endIndex < inputString.length && !isSeparating(inputString[endIndex]))
+			while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
 				++endIndex;
 			currentToken.value = inputString[startIndex .. endIndex];
 			currentToken.type = lookupTokenType(currentToken.value);