From e7c198cbc7eb5c0feecb927bdb50d08f7149f7c2 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Sun, 22 Apr 2012 18:58:35 -0700 Subject: [PATCH] Lex various number literals differently. Fix a few bugs --- README.md | 2 + autocomplete.d | 173 ++++++++----------------------- build.sh | 2 +- highlighter.d | 4 +- langutils.d | 14 ++- parser.d | 1 - tokenizer.d | 271 +++++++++++++++++++++++++++++++++++++++---------- 7 files changed, 273 insertions(+), 194 deletions(-) diff --git a/README.md b/README.md index 2646c24..46e5093 100644 --- a/README.md +++ b/README.md @@ -20,8 +20,10 @@ well as any paths specified in /etc/dmd.conf. code file. # Dot Completion +This is currently under development. # Paren Completion +This is currently under development. # JSON output Generates a JSON summary of the input file. diff --git a/autocomplete.d b/autocomplete.d index 18e08ec..82b58af 100644 --- a/autocomplete.d +++ b/autocomplete.d @@ -1,4 +1,3 @@ - // Copyright Brian Schott (Sir Alaran) 2012. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at @@ -28,100 +27,19 @@ immutable string[] versions = ["AIX", "all", "Alpha", "ARM", "BigEndian", "BSD", "Win64", "Windows", "X86", "X86_64" ]; -/+/** +/** * Returns: indicies into the token array */ -Tuple!(size_t, size_t) findEndOfStatement(const Token[] tokens, size_t index, out size_t) +size_t findEndOfExpression(const Token[] tokens, size_t index) { - -}+/ - -string[] callChainBackwards(const Token[] tokens, size_t index) -{ - if (index == 0) - return [tokens[index].value]; - string[] callChain; - string current; - loop: while(true) - { - switch(tokens[index].type) - { - case TokenType.tThis: - case TokenType.identifier: - case TokenType.TYPES_BEGIN: .. 
case TokenType.TYPES_END: - current = tokens[index].value ~ current; - callChain = current ~ callChain; - current = ""; - if (index == 0) - break loop; - else - --index; - if (tokens[index] == TokenType.not) - callChain = callChain[1 .. $]; - break; - case TokenType.rBracket: - tokens.skipBrackets(index); - current ~= "[]"; - break; - case TokenType.rParen: - tokens.skipParens(index); - break; - case TokenType.not: - case TokenType.dot: - if (index == 0) - break loop; - else - --index; - break; - default: - break loop; - } - } - return callChain; + return index; } - -string[] callChainForwards(const Token[] tokens, size_t index) +size_t findBeginningOfExpression(const Token[] tokens, size_t index) { - string[] callChain; - while (index < tokens.length) - { - switch(tokens[index].type) - { - case TokenType.tNew: - ++index; - break; - case TokenType.tThis: - case TokenType.identifier: - case TokenType.TYPES_BEGIN: .. case TokenType.TYPES_END: - callChain ~= tokens[index++].value; - break; - case TokenType.lParen: - tokens.skipParens(index); - break; - case TokenType.lBracket: - tokens.skipBrackets(index); - callChain[$ - 1] ~= "[i]"; - break; - case TokenType.not: - ++index; - if (tokens.startsWith(TokenType.lParen)) - tokens.skipParens(index); - else - ++index; - break; - default: - break; - } - if (index >= tokens.length || tokens[index] != TokenType.dot) - break; - else - ++index; - } - return callChain; + return index; } - struct AutoComplete { this(const (Token)[] tokens, CompletionContext context) @@ -130,39 +48,42 @@ struct AutoComplete this.context = context; } - string getTypeOfExpression(string[] chain, const Token[] tokens, size_t cursor) + string getTypeOfExpression(const(Token)[] expression, const Token[] tokens, size_t cursor) { - if (chain.length == 0) - return "void"; - auto type = typeOfVariable(chain[0], cursor); - if (type == "void") - return type; - chain = chain[1 .. 
$]; - while (chain.length >= 1) - { - auto typeMap = context.getMembersOfType(type); - if (typeMap is null) - return "void"; - auto memberType = typeMap[chain[0]][0]; - if (memberType is null) - return "void"; - type = memberType; - chain = chain[1 .. $]; - } - return type; + return "void"; } /** * This is where the magic happens */ - string typeOfVariable(string symbol, size_t cursor) + string typeOfVariable(Token symbol, size_t cursor) { // int is of type int, double of type double, and so on - if (symbol in typeProperties) - return symbol; + if (symbol.value in typeProperties) + return symbol.value; - if (context.getMembersOfType(symbol)) - return symbol; + switch (symbol.type) + { + case TokenType.floatLiteral: + return "float"; + case TokenType.doubleLiteral: + return "double"; + case TokenType.realLiteral: + return "real"; + case TokenType.intLiteral: + return "int"; + case TokenType.unsignedIntLiteral: + return "uint"; + case TokenType.longLiteral: + return "long"; + case TokenType.unsignedLongLiteral: + return "ulong"; + default: + break; + } + + if (context.getMembersOfType(symbol.value)) + return symbol.value; // Arbitrarily define the depth of the cursor position as zero // iterate backwards through the code to try to find the variable @@ -183,14 +104,13 @@ struct AutoComplete || p == TokenType.tConst) && preceedingTokens[index + 1] == TokenType.assign) { - auto chain = callChainForwards(tokens, index + 2); - return getTypeOfExpression(chain, tokens, cursor); + return null; } - if (p == TokenType.identifier + else if (p == TokenType.identifier || (p.type > TokenType.TYPES_BEGIN && p.type < TokenType.TYPES_END)) { - return preceedingTokens[index - 1].value; + return p.value; } } if (index == 0) @@ -207,7 +127,7 @@ struct AutoComplete return minCount!("a.bodyStart > b.bodyStart")(structs)[0].name; foreach (s; structs) { - auto t = s.getMemberType(symbol); + auto t = s.getMemberType(symbol.value); if (t !is null) return t; } @@ -225,14 +145,16 @@ struct 
AutoComplete string parenComplete(size_t cursor) { + stderr.writeln("parenComplete"); auto index = assumeSorted(tokens).lowerBound(cursor).length - 2; Token t = tokens[index]; + stderr.writeln(t); if (t.startIndex + t.value.length + 1 != cursor) return ""; switch (tokens[index].type) { case TokenType.tVersion: - return to!string(array(join(map!`a ~ "?1"`(versions), " "))); + return to!string(join(map!`a ~ "?1"`(versions), " ").array()); case TokenType.tIf: case TokenType.tCast: case TokenType.tWhile: @@ -251,20 +173,7 @@ struct AutoComplete Token t = tokens[index]; if (t.startIndex + t.value.length + 1 != cursor) return ""; - stderr.writeln(t); - string[] chain = callChainBackwards(tokens, index); - auto type = getTypeOfExpression(chain, tokens, cursor); - - if (type && type in typeProperties) - { - string r; - foreach (i, prop; typeProperties[type]) - if (i == typeProperties.length) - r = r ~ prop; - else - r = r ~ prop ~ " "; - return r; - } + auto type = typeOfVariable(t, cursor); const Tuple!(string, string)[string] typeMap = context.getMembersOfType(type); if (typeMap is null) @@ -272,7 +181,7 @@ struct AutoComplete auto app = appender!(string[])(); foreach (k, t; typeMap) app.put(k ~ t[1]); - return to!string(array(join(sort(app.data), " "))); + return to!string(array(join(sort!"a.toLower() < b.toLower()"(app.data), " "))); } const(Token)[] tokens; diff --git a/build.sh b/build.sh index 41bddc5..ac9688e 100755 --- a/build.sh +++ b/build.sh @@ -1,2 +1,2 @@ dmd *.d -release -noboundscheck -O -w -wi -m64 -property -ofdscanner -#dmd *.d -g -unittest -m64 -w -wi -property -oftokenizer +#dmd *.d -g -unittest -m64 -w -wi -property -ofdscanner diff --git a/highlighter.d b/highlighter.d index cb25238..9a9e557 100644 --- a/highlighter.d +++ b/highlighter.d @@ -30,7 +30,7 @@ html { background-color: #111; color: #ccc; } .string { color: Tomato; font-style: italic; } .property { color: HotPink; font-weight: bold;} .operator { color: tan; font-weight: bold; } -.type { 
color: cyan; } +.type { color: cyan; font-weight: bold; }
]");
 
@@ -50,7 +50,7 @@ html { background-color: #111; color: #ccc; }
 		case TokenType.stringLiteral:
 			writeSpan("string", t.value);
 			break;
-		case TokenType.numberLiteral:
+		case TokenType.NUMBERS_BEGIN: .. case TokenType.NUMBERS_END:
 			writeSpan("number", t.value);
 			break;
 		case TokenType.OPERATORS_BEGIN: .. case TokenType.OPERATORS_END:
diff --git a/langutils.d b/langutils.d
index 138b0b2..c9c4818 100644
--- a/langutils.d
+++ b/langutils.d
@@ -254,11 +254,19 @@ enum TokenType: uint
 // Misc
 	MISC_BEGIN,
 	comment, /// /** comment */ or // comment or ///comment
+	NUMBERS_BEGIN,
+	floatLiteral, /// 123.456f or 0x123_45p-10f
+	doubleLiteral, /// 123.456
+	realLiteral, /// 123.456L
+	intLiteral, /// 123 or 0b1101010101
+	unsignedIntLiteral, /// 123u
+	longLiteral, /// 123L
+	unsignedLongLiteral, /// 123uL
+	NUMBERS_END,
 	stringLiteral, /// "a string"
-	numberLiteral, /// int, float, etc...
-	identifier,
+	identifier, /// anything else
 	whitespace, /// whitespace
-	blank,
+	blank, /// unknown token type
 	MISC_END,
 }
 
diff --git a/parser.d b/parser.d
index 5a99ffb..71d061b 100644
--- a/parser.d
+++ b/parser.d
@@ -218,7 +218,6 @@ string parseTypeDeclaration(const Token[] tokens, ref size_t index)
 			break buildingType;
 		}
 	}
-	stderr.writeln("type = ", type);
 	return type;
 }
 
diff --git a/tokenizer.d b/tokenizer.d
index a9c1fbd..47528a8 100644
--- a/tokenizer.d
+++ b/tokenizer.d
@@ -141,11 +141,15 @@ body
 		++endIndex;
 	}
 	++endIndex;
+	if (endIndex < inputString.length && (inputString[endIndex] == 'w'
+		|| inputString[endIndex] == 'd' || inputString[endIndex] == 'c'))
+	{
+		++endIndex;
+	}
 	endIndex = min(endIndex, inputString.length);
 	return inputString[startIndex .. endIndex];
 }
 
-
 /**
  * Lexes the various crazy D string literals such as q{}, q"WTF is this? WTF",
  * and q"<>".
@@ -218,79 +222,230 @@ string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNu
 	return "";
 }
 
-/**
- *
- */
-pure nothrow string lexNumber(S)(ref S inputString, ref size_t endIndex) if (isSomeString!S)
+pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
+	if (isSomeString!S)
 {
-	auto startIndex = endIndex;
-	bool foundDot = false;
-	bool foundX = false;
-	bool foundB = false;
-	bool foundE = false;
-	numberLoop: while (endIndex < inputString.length)
+	Token token;
+	size_t startIndex = endIndex;
+	if (inputString[endIndex] == '0')
+	{
+		endIndex++;
+		if (endIndex >= inputString.length)
+		{
+			token.type = TokenType.intLiteral;
+			token.value = inputString[startIndex .. endIndex];
+			return token;
+		}
+		switch (inputString[endIndex])
+		{
+		case '0': .. case '9':
+			// The current language spec doesn't cover octal literals, so this
+			// is decimal.
+			lexDecimal(inputString, startIndex, endIndex, token);
+			return token;
+		case 'b':
+		case 'B':
+			lexBinary(inputString, startIndex, ++endIndex, token);
+			return token;
+		case 'x':
+		case 'X':
+			lexHex(inputString, startIndex, ++endIndex, token);
+			return token;
+		default:
+			token.type = TokenType.intLiteral;
+			token.value = inputString[startIndex .. endIndex];
+			return token;
+		}
+	}
+	else
+	{
+		lexDecimal(inputString, startIndex, endIndex, token);
+		return token;
+	}
+}
+
+pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
+	ref size_t endIndex, ref Token token) if (isSomeString!S)
+{
+	bool lexingSuffix = false;
+	bool isLong = false;
+	bool isUnsigned = false;
+	token.type = TokenType.intLiteral;
+	binaryLoop: while (endIndex < inputString.length)
 	{
 		switch (inputString[endIndex])
 		{
 		case '0':
-			if (!foundX)
-			{
-				++endIndex;
-				if (endIndex < inputString.length
-					&& (inputString[endIndex] == 'x' || inputString[endIndex] == 'X'))
-				{
-					++endIndex;
-					foundX = true;
-				}
-			}
+		case '1':
+		case '_':
+			++endIndex;
+			if (lexingSuffix)
+				break binaryLoop;
+			break;
+		case 'u':
+		case 'U':
+			++endIndex;
+			lexingSuffix = true;
+			if (isLong)
+				token.type = TokenType.unsignedLongLiteral;
 			else
-				++endIndex;
+				token.type = TokenType.unsignedIntLiteral;
 			break;
-		case 'b':
-			if (foundB)
-				break numberLoop;
-			foundB = true;
+		case 'L':
 			++endIndex;
+			if (isLong)
+				break binaryLoop;
+			if (isUnsigned)
+				token.type = TokenType.unsignedLongLiteral;
+			else
+				token.type = TokenType.longLiteral;
+			isLong = true;
 			break;
-		case '.':
-			if (foundDot || foundX || foundE)
-				break numberLoop;
-			foundDot = true;
+		default:
+			break binaryLoop;
+		}
+	}
+
+	token.value = inputString[startIndex .. endIndex];
+}
+
+pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
+	ref size_t endIndex, ref Token token) if (isSomeString!S)
+{
+	bool lexingSuffix = false;
+	bool isLong = false;
+	bool isUnsigned = false;
+	bool isFloat = false;
+	bool isReal = false;
+	bool isDouble = false;
+	bool foundDot = false;
+	bool foundE = false;
+	bool foundPlusMinus = false;
+	token.type = TokenType.intLiteral;
+	decimalLoop: while (endIndex < inputString.length)
+	{
+		switch (inputString[endIndex])
+		{
+		case '0': .. case '9':
+		case '_':
 			++endIndex;
+			if (lexingSuffix)
+				break decimalLoop;
+			break;
+		case 'e':
+		case 'E':
+			if (foundE)
+				break decimalLoop;
+			++endIndex;
+			foundE = true;
 			break;
 		case '+':
 		case '-':
-			if (!foundE)
-				break numberLoop;
+			if (foundPlusMinus || !foundE)
+				break decimalLoop;
+			foundPlusMinus = true;
 			++endIndex;
 			break;
+		case '.':
+			if (foundDot)
+				break decimalLoop;
+			++endIndex;
+			foundDot = true;
+			token.type = TokenType.doubleLiteral;
+			isDouble = true;
+			break;
+		case 'u':
+		case 'U':
+			++endIndex;
+			lexingSuffix = true;
+			if (isLong)
+				token.type = TokenType.unsignedLongLiteral;
+			else
+				token.type = TokenType.unsignedIntLiteral;
+			isUnsigned = true;
+			break;
+		case 'L':
+			++endIndex;
+			lexingSuffix = true;
+			if (isLong || isReal)
+				break decimalLoop;
+			if (isDouble)
+				token.type = TokenType.realLiteral;
+			else if (isUnsigned)
+				token.type = TokenType.unsignedLongLiteral;
+			else
+				token.type = TokenType.longLiteral;
+			isLong = true;
+			break;
+		case 'f':
+		case 'F':
+			lexingSuffix = true;
+			if (isUnsigned || isLong)
+				break decimalLoop;
+			++endIndex;
+			token.type = TokenType.floatLiteral;
+			break decimalLoop;
+		default:
+			break decimalLoop;
+		}
+	}
+
+	token.value = inputString[startIndex .. endIndex];
+}
+
+nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
+	ref size_t endIndex, ref Token token) if (isSomeString!S)
+{
+	bool lexingSuffix = false;
+	bool isLong = false;
+	bool isUnsigned = false;
+	bool isFloat = false;
+	bool isReal = false;
+	bool isDouble = false;
+	bool foundDot = false;
+	bool foundE = false;
+	bool foundPlusMinus = false;
+	token.type = TokenType.intLiteral;
+	hexLoop: while (endIndex < inputString.length)
+	{
+		switch (inputString[endIndex])
+		{
+		case '0': .. case '9':
+		case 'a': .. case 'f':
+		case 'A': .. case 'F':
+		case '_':
+			++endIndex;
+			if (lexingSuffix)
+				break hexLoop;
+			break;
 		case 'p':
 		case 'P':
-			if (!foundX)
-				break numberLoop;
+			if (foundE)
+				break hexLoop;
+			++endIndex;
 			foundE = true;
-			goto case '_';
-		case 'e':
-		case 'E':
-			if (foundE || foundX)
-				break numberLoop;
-			foundE = true;
-			goto case '_';
-		case '1': .. case '9':
-		case '_':
+			break;
+		case '+':
+		case '-':
+			if (foundPlusMinus || !foundE)
+				break hexLoop;
+			foundPlusMinus = true;
 			++endIndex;
 			break;
-		case 'F':
-		case 'f':
-		case 'L':
-		case 'i':
+		case '.':
+			if (foundDot)
+				break hexLoop;
 			++endIndex;
-			break numberLoop;
+			foundDot = true;
+			token.type = TokenType.doubleLiteral;
+			isDouble = true;
+			break;
 		default:
-			break numberLoop;
+			break hexLoop;
 		}
 	}
-	return inputString[startIndex .. endIndex];
+
+	token.value = inputString[startIndex .. endIndex];
 }
 
 
@@ -337,8 +492,10 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 
 	size_t endIndex = 0;
 	uint lineNumber = 1;
+
 	while (endIndex < inputString.length)
 	{
+		size_t prevIndex = endIndex;
 		Token currentToken;
 		auto startIndex = endIndex;
 		if (isWhite(inputString[endIndex]))
@@ -421,11 +578,8 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			"^",    "TokenType.xor",
 			"^=",   "TokenType.xorEquals",
 		));
-
 		case '0': .. case '9':
-			currentToken.value = lexNumber(inputString, endIndex);
-			currentToken.type = TokenType.numberLiteral;
-			currentToken.lineNumber = lineNumber;
+			currentToken = lexNumber(inputString, endIndex);
 			break;
 		case '/':
 			++endIndex;
@@ -528,8 +682,15 @@ Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyl
 			currentToken.lineNumber = lineNumber;
 			break;
 		}
-//		writeln(currentToken);
+		//stderr.writeln(currentToken);
 		tokenAppender.put(currentToken);
+
+		// This should never happen.
+		if (endIndex <= prevIndex)
+		{
+			stderr.writeln("FAIL");
+			return [];
+		}
 	}
 	return tokenAppender.data;
 }