Bug fixes for character literals and escape sequences

2013-02-08 06:10:18 -08:00 · 2013-02-08 06:10:18 -08:00 · 61704db501
parent c904bad110
commit 61704db501
3 changed files with 105 additions and 41 deletions
--- a/build.sh
+++ b/build.sh
@@ -1,3 +1,4 @@
 #dmd *.d std/d/*.d -release -inline -noboundscheck -O -w -wi -m64 -property -ofdscanner -L-lsqlite3 #-inline
-#dmd *.d std/d/*.d -g -m64 -w -wi -property -ofdscanner #-unittest
-ldc2 -O5 *.d std/d/*.d -of=dscanner -release -vectorize -m64
+#dmd *.d std/d/*.d -g -m64 -w -wi -property -ofdscanner -unittest
+#ldc2 -O3 *.d std/d/*.d -of=dscanner -release -vectorize -m64
+ldc2 *.d std/d/*.d -of=dscanner -unittest -m64 -g
--- a/highlighter.d
+++ b/highlighter.d
@@ -47,7 +47,7 @@ html  { background-color: #fdf6e3; color: #002b36; }
 			writeSpan("kwrd", t.value);
 		else if (t.type == TokenType.comment)
 			writeSpan("com", t.value);
-		else if (isStringLiteral(t.type))
+		else if (isStringLiteral(t.type) || t.type == TokenType.characterLiteral)
 			writeSpan("str", t.value);
 		else if (isNumberLiteral(t.type))
 			writeSpan("num", t.value);
--- a/std/d/lexer.d
+++ b/std/d/lexer.d
@@ -558,6 +558,8 @@ private:
            lexNumber();
            return;
        case '\'':
+			lexCharacterLiteral();
+			return;
        case '"':
        case '`':
            lexString();
@@ -959,6 +961,16 @@ private:
            case '_':
                keepNonNewlineChar();
                break;
+			case 'u':
+			case 'U':
+				if (foundDot)
+				{
+					errorMessage("Floating-point literal cannot have %s suffix".format(
+						cast(char) currentElement()));
+					return;
+				}
+				else
+					lexIntSuffix();
            case 'i':
            case 'L':
                if (foundDot)
@@ -1118,7 +1130,7 @@ private:
            errorMessage("Unterminated character literal");
            return;
        }
-        sw: switch (currentElement())
+        switch (currentElement())
        {
            case '\'':
                return;
@@ -1126,9 +1138,18 @@ private:
                lexEscapeSequence();
                break;
            default:
+				if (currentElement() & 0x80)
+				{
+					while (currentElement() & 0x80)
 						keepChar();
 					break;
 				}
+				else
+				{
+					keepChar();
+					break;
+				}
+        }
        if (currentElement() != '\'')
        {
            errorMessage("Expected \"'\" to end character literal");
@@ -1235,19 +1256,24 @@ private:
                return;
            case 'u':
            case 'U':
+				uint digits = currentElement == 'u' ? 4 : 8;
                keepChar();
-                foreach (i; 0 .. 2)
-                {
-                    foreach (j; 0 .. 4)
+                foreach (i; 0 .. digits)
                {
 					if (!isHexDigit(currentElement()))
 					{
-                            errorMessage("Expected hex digit");
+						errorMessage("Expected hex digit instead of %s".format(
+							cast(char) currentElement()));
 						return;
 					}
 					keepChar();
 				}
-                    if (!isHexDigit(currentElement()))
+                return;
+			case '&':
+				while (!isEoF())
+				{
+					keepChar();
+					if (currentElement() == ';')
 						break;
 				}
 				return;
@@ -1277,9 +1303,9 @@ private:
            case '0': .. case '7':
                ubyte[3] digits;
                size_t i;
-                for(; i < 3 && !isEoF(); ++i)
+                while(i < 3 && !isEoF())
                {
-                    digits[i] = currentElement();
+                    digits[i++] = currentElement();
                    advanceRange();
                    if (currentElement() < '0' || currentElement() > '7') break;
                }
@@ -1296,29 +1322,60 @@ private:
                        return;
                    }
                    digits[i] = currentElement();
+					advanceRange();
                }
                decodeAndStore(digits, 2, 16);
                return;
            case 'u':
            case 'U':
+				uint digitCount = currentElement == 'u' ? 4 : 8;
                advanceRange();
                ubyte[8] digits;
-                size_t i;
-                foreach (j; 0 .. 2)
-                {
-                    foreach (k; 0 .. 4)
+				foreach (i; 0 .. digitCount)
 				{
 					if (!isHexDigit(currentElement()))
 					{
 						errorMessage("Expected hex digit");
 						return;
 					}
-                        digits[i++] = currentElement();
+					digits[i] = currentElement();
+					advanceRange();
 				}
-                    if (!isHexDigit(currentElement()))
+                decodeAndStore(digits, digitCount, 16);
+                return;
+			case '&':
+				advanceRange();
+				ubyte[] b;
+				while (!isEoF())
+				{
+					if (isAlpha(currentElement()))
+					{
+						b ~= currentElement();
+						advanceRange();
+					}
+					else if (currentElement() == ';')
+					{
+						advanceRange();
 						break;
 					}
-                decodeAndStore(digits, i, 16);
+					else
+					{
+						errorMessage("Invalid character entity");
+						return;
+					}
+				}
+				auto entity = (cast(string) b) in characterEntities;
+				if (entity is null)
+				{
+					errorMessage("Invalid character entity \"&%s;\"".format(
+						cast(char[]) b));
+					return;
+				}
+				else
+				{
+					for (size_t i = 0; i < (*entity).length; i++)
+						bufferChar(cast(ubyte) (*entity)[i]);
+				}
 				return;
            default:
                errorMessage("Invalid escape sequence");
@@ -1329,18 +1386,24 @@ private:

    void decodeAndStore(ubyte[] digits, size_t maxIndex, uint base)
    {
+		scope(failure)
+		{
+			import std.stdio;
+			stderr.writeln("Failed on line ", lineNumber, " of file ",
+				config.fileName);
+		}
        char[4] codeUnits;
-        auto source = cast(char[]) digits[0 .. maxIndex + 1];
+        auto source = cast(char[]) digits[0 .. maxIndex];
        uint codePoint = parse!uint(source, base);
        ulong unitCount = encode(codeUnits, codePoint);
        foreach (i; 0 .. unitCount)
-            bufferChar(codeUnits[unitCount]);
+            bufferChar(codeUnits[i]);
    }

    void lexDelimitedString()
    in
    {
-        assert(currentElement() == 'q');
+        assert(currentElement() == '"');
    }
    body
    {
@@ -1369,7 +1432,7 @@ private:
    void lexNormalDelimitedString(ubyte open, ubyte close)
    in
    {
-        assert(currentElement() == '"');
+        assert(buffer[0 .. 2] == `q"`);
    }
    body
    {