// D-Scanner/tokenizer.d — 938 lines, 21 KiB, D

// Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module tokenizer;
import std.range;
import std.file;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.stdio;
import langutils;
import codegen;
/// Returns: true if the range's front character begins a newline sequence.
pure bool isNewline(R)(R range)
{
	immutable c = range.front;
	return c == '\r' || c == '\n';
}
/// Returns: true if the range is exhausted or positioned at a NUL or
/// Ctrl+Z (0x1A) character, either of which terminates a D source file.
pure bool isEoF(R)(R range)
{
	if (range.empty)
		return true;
	immutable c = range.front;
	return c == '\0' || c == '\x1a';
}
/**
 * Pops a newline sequence ("\r", "\n", or "\r\n") off the front of the range.
 * Params:
 *     range = the input being lexed; advanced past the newline
 * Returns: the characters that were consumed
 */
char[] popNewline(R)(ref R range)
{
	char[] chars;
	// Consume an optional carriage return first so "\r\n" counts as a
	// single two-character newline.
	if (!range.empty && range.front == '\r')
	{
		chars ~= range.front;
		range.popFront();
	}
	// BUG FIX: guard against emptiness — a lone "\r" at the very end of
	// the input previously caused front() to be called on an empty range.
	if (!range.empty && range.front == '\n')
	{
		chars ~= range.front;
		range.popFront();
	}
	return chars;
}
// popNewline consumes a Windows-style "\r\n" as one unit.
unittest
{
	auto input = "\r\ntest";
	assert (popNewline(input) == "\r\n");
	assert (input == "test");
}
/**
 * Consumes whitespace (including newlines) from the front of the range.
 * Params:
 *     range = the input being lexed; advanced past the whitespace
 *     lineNumber = incremented once per newline consumed
 * Returns: the whitespace that was consumed
 */
string lexWhitespace(R)(ref R range, ref uint lineNumber)
{
	auto buffer = appender!(char[])();
	while (!isEoF(range))
	{
		if (!isWhite(range.front))
			break;
		if (isNewline(range))
		{
			lineNumber++;
			buffer.put(popNewline(range));
		}
		else
		{
			buffer.put(range.front);
			range.popFront();
		}
	}
	return buffer.data.idup;
}
// lexWhitespace consumes mixed spaces, tabs, and both newline styles,
// counting two line breaks.
unittest
{
	uint line = 1;
	auto text = " \n \r\n \tabcde";
	auto ws = lexWhitespace(text, line);
	assert (ws == " \n \r\n \t");
	assert (text == "abcde");
	assert (line == 3);
}
/**
 * Lexes a comment (line //, block /* * /, or nesting /+ +/) from the
 * front of the range.
 * Params:
 *     input = the source being lexed, positioned at the leading '/';
 *         advanced past the comment
 *     lineNumber = incremented for every newline inside the comment
 * Returns: the comment text, including its delimiters
 */
string lexComment(R)(ref R input, ref uint lineNumber)
in
{
	assert (input.front == '/');
}
body
{
	auto app = appender!(char[])();
	app.put(input.front);
	input.popFront();
	// BUG FIX: a lone trailing '/' previously dereferenced an empty range.
	if (isEoF(input))
		return to!string(app.data);
	switch(input.front)
	{
	case '/':
		// Line comment: runs to (but does not consume) the newline.
		while (!isEoF(input) && !isNewline(input))
		{
			app.put(input.front);
			input.popFront();
		}
		break;
	case '*':
		// Block comment: runs to the first "*/".
		while (!isEoF(input))
		{
			if (isNewline(input))
			{
				app.put(popNewline(input));
				++lineNumber;
			}
			else if (input.front == '*')
			{
				app.put(input.front);
				input.popFront();
				// BUG FIX: guard against an unterminated comment ending
				// in '*' at EoF before peeking for the closing '/'.
				if (!isEoF(input) && input.front == '/')
				{
					app.put(input.front);
					input.popFront();
					break;
				}
			}
			else
			{
				app.put(input.front);
				input.popFront();
			}
		}
		break;
	case '+':
		// Nesting comment: track depth across embedded /+ +/ pairs.
		int depth = 1;
		while (depth > 0 && !isEoF(input))
		{
			if (isNewline(input))
			{
				app.put(popNewline(input));
				lineNumber++;
			}
			else if (input.front == '+')
			{
				app.put(input.front);
				input.popFront();
				// BUG FIX: EoF guard before peeking for the closing '/'.
				if (!isEoF(input) && input.front == '/')
				{
					app.put(input.front);
					input.popFront();
					--depth;
				}
			}
			else if (input.front == '/')
			{
				app.put(input.front);
				input.popFront();
				// BUG FIX: EoF guard before peeking for a nested '+'.
				if (!isEoF(input) && input.front == '+')
				{
					app.put(input.front);
					input.popFront();
					++depth;
				}
			}
			else
			{
				app.put(input.front);
				input.popFront();
			}
		}
		break;
	default:
		break;
	}
	return to!string(app.data);
}
// A line comment ends at, but does not consume, the newline.
unittest
{
	uint line = 1;
	auto source = "//this is a comment\r\nthis is not";
	auto result = lexComment(source, line);
	assert (source == "\r\nthis is not");
	assert (result == "//this is a comment");
}
// A block comment consumes its closing */ and counts its newlines.
unittest
{
	uint line = 1;
	auto source = "/* this is a\n\tcomment\r\n */this is not";
	auto result = lexComment(source, line);
	assert (source == "this is not");
	assert (result == "/* this is a\n\tcomment\r\n */");
	assert (line == 3);
}
// Nesting comments must track their depth across embedded /+ +/ pairs.
unittest
{
	uint line = 1;
	auto source = "/+this is a /+c/+omm+/ent+/ \r\nthis+/ is not";
	auto result = lexComment(source, line);
	assert (source == " is not");
	assert (result == "/+this is a /+c/+omm+/ent+/ \r\nthis+/");
	assert (line == 2);
}
/**
 * Lexes a quoted string literal.
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the opening quote;
 *         advanced past the literal (and any c/w/d postfix)
 *     lineNumber = incremented for each '\n' inside the literal
 *     quote = the opening (and closing) quote character
 *     canEscape = whether a backslash escapes the closing quote
 * Returns: the string literal, including its opening and closing quote
 * characters and any postfix
 */
pure nothrow string lexString(S, C)(S inputString, ref size_t endIndex, ref uint lineNumber,
	C quote, bool canEscape = true) if (isSomeString!S && isSomeChar!C)
in
{
	assert (inputString[endIndex] == quote);
	assert (quote == '\'' || quote == '"' || quote == '`');
}
body
{
	if (inputString[endIndex] != quote)
		return "";
	immutable size_t literalStart = endIndex;
	++endIndex;
	bool escaped = false;
	// Scan until an unescaped closing quote or the end of the input.
	while (!isEoF(inputString, endIndex))
	{
		immutable c = inputString[endIndex];
		if (c == quote && !escaped)
			break;
		escaped = escaped ? false : (canEscape && c == '\\');
		if (c == '\n')
			lineNumber++;
		++endIndex;
	}
	++endIndex; // step over the closing quote
	// An optional string postfix (c, w, or d) belongs to the literal.
	if (!isEoF(inputString, endIndex))
	{
		immutable postfix = inputString[endIndex];
		if (postfix == 'w' || postfix == 'd' || postfix == 'c')
			++endIndex;
	}
	// Clamp in case the literal was unterminated at end of input.
	immutable size_t literalEnd = endIndex > inputString.length ? inputString.length : endIndex;
	return inputString[literalStart .. literalEnd];
}
/**
 * Lexes the various D delimited string literals such as q{}, q"WTF is
 * this? WTF", and q"<>".
 * Params:
 *     inputString = the source code to examine
 *     endIndex = an index into inputString at the double quote that
 *         opens the literal (the leading 'q' has already been consumed
 *         by the caller — see the 'q' branch of tokenize); advanced
 *         past the literal
 *     lineNumber = incremented for each '\n' inside the literal
 * Returns: the string literal, including its delimiters (but not the 'q')
 */
string lexDelimitedString(S)(ref S inputString, ref size_t endIndex,
	ref uint lineNumber) if (isSomeString!S)
{
	auto startIndex = endIndex;
	++endIndex;
	assert(!isEoF(inputString, endIndex)); // todo: what should happen if this is EoF?
	// The character following the quote selects the delimiter style.
	string open = inputString[endIndex .. endIndex + 1];
	string close;
	bool nesting = false;
	switch (open[0])
	{
	// Bracket delimiters may nest, e.g. q"[a [b] c]".
	case '[': close = "]"; ++endIndex; nesting = true; break;
	case '<': close = ">"; ++endIndex; nesting = true; break;
	case '{': close = "}"; ++endIndex; nesting = true; break;
	case '(': close = ")"; ++endIndex; nesting = true; break;
	default:
		// Identifier delimiter (heredoc style): everything up to the
		// next whitespace is the delimiter, and the same word closes
		// the literal.
		while(!isEoF(inputString, endIndex) && !isWhite(inputString[endIndex]))
			endIndex++;
		close = open = inputString[startIndex + 1 .. endIndex];
		break;
	}
	// Scan forward until the delimiters balance out or input runs out.
	int depth = 1;
	while (!isEoF(inputString, endIndex) && depth > 0)
	{
		if (inputString[endIndex] == '\n')
		{
			lineNumber++;
			endIndex++;
		}
		else if (inputString[endIndex..$].startsWith(open))
		{
			endIndex += open.length;
			if (!nesting && !isEoF(inputString, endIndex))
			{
				// Identifier-delimited literals end at the second
				// occurrence of the delimiter word, optionally followed
				// by the closing quote.
				if (inputString[endIndex] == '"')
					++endIndex;
				break;
			}
			depth++;
		}
		else if (inputString[endIndex..$].startsWith(close))
		{
			endIndex += close.length;
			depth--;
			if (depth <= 0)
				break;
		}
		else
			++endIndex;
	}
	// Consume the trailing closing quote when present.
	if (!isEoF(inputString, endIndex) && inputString[endIndex] == '"')
		++endIndex;
	return inputString[startIndex .. endIndex];
}
/**
 * Lexes a token string literal, q{...}.
 * TODO: Fix this — currently a stub that consumes nothing and returns
 * an empty string. The commented-out code below sketches the intended
 * implementation in terms of a token range.
 */
string lexTokenString(S)(ref S inputString, ref size_t endIndex, ref uint lineNumber)
{
	/+auto r = byDToken(range, IterationStyle.EVERYTHING);
	string s = getBraceContent(r);
	range.popFrontN(s.length);
	return s;+/
	return "";
}
/**
 * Lexes an integer or floating point literal beginning at endIndex.
 * Params:
 *     inputString = the source being lexed
 *     endIndex = index of the literal's first character; advanced past
 *         the literal
 * Returns: the lexed number token
 */
pure nothrow Token lexNumber(S)(ref S inputString, ref size_t endIndex)
	if (isSomeString!S)
{
	Token token;
	size_t startIndex = endIndex;
	token.startIndex = startIndex;
	// Only a leading zero can introduce a binary or hexadecimal literal.
	if (inputString[endIndex] != '0')
	{
		lexDecimal(inputString, startIndex, endIndex, token);
		return token;
	}
	endIndex++;
	if (isEoF(inputString, endIndex))
	{
		token.type = TokenType.IntLiteral;
		token.value = inputString[startIndex .. endIndex];
		return token;
	}
	switch (inputString[endIndex])
	{
	case 'b':
	case 'B':
		lexBinary(inputString, startIndex, ++endIndex, token);
		break;
	case 'x':
	case 'X':
		lexHex(inputString, startIndex, ++endIndex, token);
		break;
	case '0': .. case '9':
		// The current language spec doesn't cover octal literals, so a
		// zero followed by digits is lexed as decimal.
		lexDecimal(inputString, startIndex, endIndex, token);
		break;
	default:
		token.type = TokenType.IntLiteral;
		token.value = inputString[startIndex .. endIndex];
		break;
	}
	return token;
}
/**
 * Lexes a binary (0b...) integer literal, including any integer
 * suffixes (u/U, L, or both).
 * Params:
 *     inputString = the source being lexed
 *     startIndex = index of the literal's first character
 *     endIndex = index just past the "0b" prefix; advanced past the literal
 *     token = receives the literal's type and value
 */
pure nothrow void lexBinary(S)(ref S inputString, size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	bool lexingSuffix = false;
	bool isLong = false;
	bool isUnsigned = false;
	token.type = TokenType.IntLiteral;
	binaryLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0':
		case '1':
		case '_':
			if (lexingSuffix)
				break binaryLoop;
			++endIndex;
			break;
		case 'u':
		case 'U':
			// BUG FIX: a repeated 'u' suffix previously hit a plain
			// `break` (exiting only the switch) without advancing
			// endIndex, spinning this loop forever. Exit the loop like
			// the 'L' case (and like lexDecimal) does.
			if (isUnsigned)
				break binaryLoop;
			++endIndex;
			lexingSuffix = true;
			if (isLong)
			{
				token.type = TokenType.UnsignedLongLiteral;
				break binaryLoop;
			}
			else
				token.type = TokenType.UnsignedIntLiteral;
			isUnsigned = true;
			break;
		case 'L':
			if (isLong)
				break binaryLoop;
			++endIndex;
			lexingSuffix = true;
			if (isUnsigned)
			{
				token.type = TokenType.UnsignedLongLiteral;
				break binaryLoop;
			}
			else
				token.type = TokenType.LongLiteral;
			isLong = true;
			break;
		default:
			break binaryLoop;
		}
	}
	token.value = inputString[startIndex .. endIndex];
}
/**
 * Lexes a decimal integer or floating point literal, including its
 * suffixes (u/U, L, f/F, i) and exponent.
 * Params:
 *     inputString = the source being lexed
 *     startIndex = index of the literal's first character
 *     endIndex = index of the first digit; advanced past the literal
 *     token = receives the literal's type and value
 */
pure nothrow void lexDecimal(S)(ref S inputString, size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	bool lexingSuffix = false;
	bool isLong = false;
	bool isUnsigned = false;
	bool isFloat = false;
	bool isReal = false;
	bool isDouble = false;
	bool foundDot = false;
	bool foundE = false;
	bool foundPlusMinus = false;
	token.type = TokenType.IntLiteral;
	decimalLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0': .. case '9':
		case '_':
			if (lexingSuffix)
				break decimalLoop;
			++endIndex;
			break;
		case 'e':
		case 'E':
			// For this to be a valid exponent, the next character must be a
			// decimal character or a sign
			if (foundE || isEoF(inputString, endIndex + 1))
				break decimalLoop;
			switch (inputString[endIndex + 1])
			{
			case '+':
			case '-':
				if (isEoF(inputString, endIndex + 2)
					|| inputString[endIndex + 2] < '0'
					|| inputString[endIndex + 2] > '9')
				{
					break decimalLoop;
				}
				break;
			case '0': .. case '9':
				break;
			default:
				break decimalLoop;
			}
			++endIndex;
			foundE = true;
			isDouble = true;
			token.type = TokenType.DoubleLiteral;
			break;
		case '+':
		case '-':
			// a sign is only valid directly inside an exponent
			if (foundPlusMinus || !foundE)
				break decimalLoop;
			foundPlusMinus = true;
			++endIndex;
			break;
		case '.':
			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
				break decimalLoop; // possibly slice expression
			if (foundDot)
				break decimalLoop; // two dots with other characters between them
			++endIndex;
			foundDot = true;
			token.type = TokenType.DoubleLiteral;
			isDouble = true;
			break;
		case 'u':
		case 'U':
			if (isUnsigned)
				break decimalLoop;
			++endIndex;
			lexingSuffix = true;
			if (isLong)
				token.type = TokenType.UnsignedLongLiteral;
			else
				token.type = TokenType.UnsignedIntLiteral;
			isUnsigned = true;
			break;
		case 'L':
			if (isLong)
				break decimalLoop;
			if (isReal)
				break decimalLoop;
			++endIndex;
			lexingSuffix = true;
			if (isDouble)
			{
				token.type = TokenType.RealLiteral;
				isReal = true;
				// BUG FIX: "1.5Li" is an imaginary real; the 'i' was
				// previously mis-typed as Idouble by the 'i' case below.
				if (!isEoF(inputString, endIndex) && inputString[endIndex] == 'i')
				{
					++endIndex;
					token.type = TokenType.Ireal;
					break decimalLoop;
				}
			}
			else if (isUnsigned)
				token.type = TokenType.UnsignedLongLiteral;
			else
				token.type = TokenType.LongLiteral;
			isLong = true;
			break;
		case 'f':
		case 'F':
			lexingSuffix = true;
			if (isUnsigned || isLong)
				break decimalLoop;
			++endIndex;
			token.type = TokenType.FloatLiteral;
			isFloat = true;
			// BUG FIX: "1.5fi" is an imaginary float; this branch
			// previously left the loop at 'f', so the Ifloat token type
			// was unreachable.
			if (!isEoF(inputString, endIndex) && inputString[endIndex] == 'i')
			{
				++endIndex;
				token.type = TokenType.Ifloat;
			}
			break decimalLoop;
		case 'i':
			++endIndex;
			// Spec says that this is the last suffix, so all cases break the
			// loop.
			if (isDouble)
			{
				token.type = TokenType.Idouble;
				break decimalLoop;
			}
			else if (isFloat)
			{
				token.type = TokenType.Ifloat;
				break decimalLoop;
			}
			else if (isReal)
			{
				token.type = TokenType.Ireal;
				break decimalLoop;
			}
			else
			{
				// There is no imaginary int
				--endIndex;
				break decimalLoop;
			}
		default:
			break decimalLoop;
		}
	}
	token.value = inputString[startIndex .. endIndex];
}
// Decimal literal lexing: exponents, float suffixes, and the cases
// where a trailing 'e' or '..' must be left unconsumed.
unittest {
	Token tok;
	size_t begin, cursor;
	lexDecimal!string("55e-4", begin, cursor, tok);
	assert(tok.value == "55e-4");
	assert(tok.type == TokenType.DoubleLiteral);
	begin = cursor = 0;
	lexDecimal!string("123.45f", begin, cursor, tok);
	assert(tok.value == "123.45f");
	assert(tok.type == TokenType.FloatLiteral);
	begin = cursor = 0;
	lexDecimal!string("3e+f", begin, cursor, tok);
	assert(tok.value == "3");
	assert(tok.type == TokenType.IntLiteral);
	begin = cursor = 0;
	lexDecimal!string("3e++f", begin, cursor, tok);
	assert(tok.value == "3");
	assert(tok.type == TokenType.IntLiteral);
	begin = cursor = 0;
	lexDecimal!string("1234..1237", begin, cursor, tok);
	assert(tok.value == "1234");
	assert(tok.type == TokenType.IntLiteral);
}
/**
 * Lexes a hexadecimal (0x...) literal.
 * Params:
 *     inputString = the source being lexed
 *     startIndex = index of the literal's first character
 *     endIndex = index just past the "0x" prefix; advanced past the literal
 *     token = receives the literal's type and value
 * NOTE(review): integer suffixes (L/u) are not consumed here — they
 * terminate the literal instead. TODO: confirm whether suffix handling
 * should mirror lexDecimal/lexBinary.
 */
pure nothrow void lexHex(S)(ref S inputString, ref size_t startIndex,
	ref size_t endIndex, ref Token token) if (isSomeString!S)
{
	// BUG FIX: marked `pure` so its pure caller lexNumber is legal, and
	// five dead/write-only flags (lexingSuffix, isLong, isUnsigned,
	// isFloat, isReal, isDouble) were removed — none was ever read after
	// being set.
	bool foundDot = false;
	bool foundExponent = false;
	bool foundPlusMinus = false;
	token.type = TokenType.IntLiteral;
	hexLoop: while (!isEoF(inputString, endIndex))
	{
		switch (inputString[endIndex])
		{
		case '0': .. case '9':
		case 'a': .. case 'f':
		case 'A': .. case 'F':
		case '_':
			++endIndex;
			break;
		case 'p':
		case 'P':
			if (foundExponent)
				break hexLoop;
			++endIndex;
			foundExponent = true;
			// BUG FIX: a binary exponent makes the literal floating
			// point (e.g. 0x1p3); it was previously left as IntLiteral.
			token.type = TokenType.DoubleLiteral;
			break;
		case '+':
		case '-':
			// a sign is only valid directly inside an exponent
			if (foundPlusMinus || !foundExponent)
				break hexLoop;
			foundPlusMinus = true;
			++endIndex;
			break;
		case '.':
			if (!isEoF(inputString, endIndex + 1) && inputString[endIndex + 1] == '.')
				break hexLoop; // possibly slice expression
			if (foundDot)
				break hexLoop; // two dots with other characters between them
			++endIndex;
			foundDot = true;
			token.type = TokenType.DoubleLiteral;
			break;
		default:
			break hexLoop;
		}
	}
	token.value = inputString[startIndex .. endIndex];
}
// Hex literal lexing stops at the first non-hex character.
unittest
{
	Token tok;
	size_t begin, cursor;
	begin = 0;
	cursor = 2;
	lexHex!string("0x193abfq", begin, cursor, tok);
	assert(tok.value == "0x193abf", tok.value);
	assert(tok.type == TokenType.IntLiteral);
	begin = 0;
	cursor = 2;
	lexHex!string("0x2130xabc", begin, cursor, tok);
	assert(tok.value == "0x2130");
	assert(tok.type == TokenType.IntLiteral);
}
/**
 * Returns: true if ch marks the ending of one token and the beginning of
 * another, false otherwise
 */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
	// Punctuation/operator ranges of ASCII, plus whitespace. Note that
	// '_' (0x5F) and '0'-'9' deliberately fall outside these ranges.
	return (ch >= '!' && ch <= '/')
		|| (ch >= ':' && ch <= '@')
		|| (ch >= '[' && ch <= '^')
		|| (ch >= '{' && ch <= '~')
		|| ch == ' '
		|| ch == '\t'
		|| (ch >= '\x0a' && ch <= '\x0d'); // \n, vtab, form feed, \r
}
/**
 * Configure the tokenize() function
 */
enum IterationStyle
{
	/// Only include code, not whitespace or comments
	CODE_ONLY,
	/// Include everything: code, comments, and whitespace
	EVERYTHING
}
/**
 * A range of tokens over an underlying input range.
 * NOTE(review): appears to be the start of an incremental lexer — only
 * `empty` is implemented so far.
 */
struct TokenRange(R) if (isInputRange!R) // BUG FIX: was isInputRange(R), a
                                         // function call instead of a template
                                         // instantiation — did not compile
{
	/// Returns: true when no more tokens are available
	bool empty() const @property
	{
		return _empty;
	}

private:
	R range;
	bool _empty;
}
/**
 * Splits inputString into a sequence of tokens.
 * Params:
 *     inputString = the D source code to tokenize
 *     iterationStyle = CODE_ONLY to skip whitespace and comments,
 *         EVERYTHING to emit them as tokens too
 * Returns: the array of lexed tokens, or [] if the lexer failed to make
 *     progress (which would indicate a bug in the lexer itself)
 */
Token[] tokenize(S)(S inputString, IterationStyle iterationStyle = IterationStyle.CODE_ONLY)
	if (isSomeString!S)
{
	auto tokenAppender = appender!(Token[])();

	// This is very likely a local maximum, but it does seem to take a few
	// milliseconds off of the run time
	tokenAppender.reserve(inputString.length / 4);

	size_t endIndex = 0;
	uint lineNumber = 1;

	// A leading "#!" line is lexed as a single script-line token.
	if (inputString.length > 1 && inputString[0..2] == "#!")
	{
		Token currentToken;
		currentToken.lineNumber = lineNumber; // lineNumber is always 1
		currentToken.value = lexScriptLine(inputString, endIndex, lineNumber);
		currentToken.type = TokenType.ScriptLine;
		// BUG FIX: this token was previously constructed but never
		// appended, so the "#!" line was silently dropped from the output.
		tokenAppender.put(currentToken);
	}

	while (!isEoF(inputString, endIndex))
	{
		size_t prevIndex = endIndex;
		Token currentToken;
		auto startIndex = endIndex;
		if (isWhite(inputString[endIndex]))
		{
			if (iterationStyle == IterationStyle.EVERYTHING)
			{
				currentToken.lineNumber = lineNumber;
				currentToken.value = lexWhitespace(inputString, endIndex,
					lineNumber);
				currentToken.type = TokenType.Whitespace;
				tokenAppender.put(currentToken);
			}
			else
				lexWhitespace(inputString, endIndex, lineNumber);
			continue;
		}
		currentToken.startIndex = endIndex;

		outerSwitch: switch(inputString[endIndex])
		{
		// Operators and punctuation are matched by a generated case trie.
		mixin(generateCaseTrie(
			"=", "TokenType.Assign",
			"&", "TokenType.BitAnd",
			"&=", "TokenType.BitAndEquals",
			"|", "TokenType.BitOr",
			"|=", "TokenType.BitOrEquals",
			"~=", "TokenType.CatEquals",
			":", "TokenType.Colon",
			",", "TokenType.Comma",
			"$", "TokenType.Dollar",
			".", "TokenType.Dot",
			"==", "TokenType.Equals",
			"=>", "TokenType.GoesTo",
			">", "TokenType.Greater",
			">=", "TokenType.GreaterEqual",
			"#", "TokenType.Hash",
			"&&", "TokenType.LogicAnd",
			"{", "TokenType.LBrace",
			"[", "TokenType.LBracket",
			"<", "TokenType.Less",
			"<=", "TokenType.LessEqual",
			"<>=", "TokenType.LessEqualGreater",
			"<>", "TokenType.LessOrGreater",
			"||", "TokenType.LogicOr",
			"(", "TokenType.LParen",
			"-", "TokenType.Minus",
			"-=", "TokenType.MinusEquals",
			"%", "TokenType.Mod",
			"%=", "TokenType.ModEquals",
			"*=", "TokenType.MulEquals",
			"!", "TokenType.Not",
			"!=", "TokenType.NotEquals",
			"!>", "TokenType.NotGreater",
			"!>=", "TokenType.NotGreaterEqual",
			"!<", "TokenType.NotLess",
			"!<=", "TokenType.NotLessEqual",
			"!<>", "TokenType.NotLessEqualGreater",
			"+", "TokenType.Plus",
			"+=", "TokenType.PlusEquals",
			"^^", "TokenType.Pow",
			"^^=", "TokenType.PowEquals",
			"}", "TokenType.RBrace",
			"]", "TokenType.RBracket",
			")", "TokenType.RParen",
			";", "TokenType.Semicolon",
			"<<", "TokenType.ShiftLeft",
			"<<=", "TokenType.ShiftLeftEqual",
			">>", "TokenType.ShiftRight",
			">>=", "TokenType.ShiftRightEqual",
			"..", "TokenType.Slice",
			"*", "TokenType.Star",
			"?", "TokenType.Ternary",
			"~", "TokenType.Tilde",
			"--", "TokenType.Decrement",
			"!<>=", "TokenType.Unordered",
			">>>", "TokenType.UnsignedShiftRight",
			">>>=", "TokenType.UnsignedShiftRightEqual",
			"++", "TokenType.Increment",
			"...", "TokenType.Vararg",
			"^", "TokenType.Xor",
			"^=", "TokenType.XorEquals",
		));
		case '0': .. case '9':
			currentToken = lexNumber(inputString, endIndex);
			break;
		case '/':
			++endIndex;
			if (isEoF(inputString, endIndex))
			{
				currentToken.value = "/";
				currentToken.type = TokenType.Div;
				currentToken.lineNumber = lineNumber;
				break;
			}
			currentToken.lineNumber = lineNumber;
			switch (inputString[endIndex])
			{
			case '/':
			case '+':
			case '*':
				if (iterationStyle == IterationStyle.CODE_ONLY)
				{
					// comment is still lexed (to advance endIndex) but discarded
					lexComment(inputString, endIndex, lineNumber);
					continue;
				}
				else
				{
					currentToken.value = lexComment(inputString, endIndex, lineNumber);
					currentToken.type = TokenType.Comment;
					break;
				}
			case '=':
				currentToken.value = "/=";
				currentToken.type = TokenType.DivEquals;
				++endIndex;
				break;
			default:
				currentToken.value = "/";
				currentToken.type = TokenType.Div;
				break;
			}
			break;
		case 'r':
			++endIndex;
			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
				goto default;
			currentToken.lineNumber = lineNumber;
			// NOTE(review): the token value omits the leading 'r' of the
			// wysiwyg literal — confirm whether callers expect it (the
			// 'q' branch below does keep its prefix).
			currentToken.value = lexString(inputString, endIndex,
				lineNumber, inputString[endIndex], false);
			currentToken.type = TokenType.StringLiteral;
			break;
		case '`':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex], false);
			currentToken.type = TokenType.StringLiteral;
			break;
		case 'x':
			++endIndex;
			if (isEoF(inputString, endIndex) || inputString[endIndex] != '"')
				goto default;
			else
				goto case '"'; // BUG: this is incorrect! according to specification, hex data should be lexed differently than "normal" strings
		case '\'':
		case '"':
			currentToken.lineNumber = lineNumber;
			currentToken.value = lexString(inputString, endIndex, lineNumber,
				inputString[endIndex]);
			currentToken.type = TokenType.StringLiteral;
			break;
		case 'q':
			currentToken.value = "q";
			++endIndex;
			if (!isEoF(inputString, endIndex))
			{
				switch (inputString[endIndex])
				{
				case '"':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= lexDelimitedString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.StringLiteral;
					break outerSwitch;
				case '{':
					currentToken.lineNumber = lineNumber;
					currentToken.value ~= lexTokenString(inputString,
						endIndex, lineNumber);
					currentToken.type = TokenType.StringLiteral;
					break outerSwitch;
				default:
					break;
				}
			}
			goto default;
		case '@':
			++endIndex;
			goto default;
		default:
			// Identifier or keyword: consume until a separating character.
			while(!isEoF(inputString, endIndex) && !isSeparating(inputString[endIndex]))
				++endIndex;
			currentToken.value = inputString[startIndex .. endIndex];
			currentToken.type = lookupTokenTypeOptimized(currentToken.value);
			//currentToken.type = lookupTokenType(currentToken.value);
			currentToken.lineNumber = lineNumber;
			break;
		}
		//stderr.writeln(currentToken);
		tokenAppender.put(currentToken);

		// This should never happen: every branch above must advance
		// endIndex, or the lexer would loop forever.
		if (endIndex <= prevIndex)
		{
			stderr.writeln("FAIL");
			return [];
		}
	}
	return tokenAppender.data;
}