From fd763e525a40e18539c0171c396b6e1aff46ade7 Mon Sep 17 00:00:00 2001
From: Vadim Lopatin
Date: Mon, 19 Jan 2015 22:35:44 +0300
Subject: [PATCH] syntax highlight, continue

---
 dlangide.visualdproj                    |   13 +-
 src/ddc/lexer/Lexer.d                   |  288 +++
 src/ddc/lexer/LexerException.d          |   10 +
 src/ddc/lexer/LineStream.d              |  589 +++++
 src/ddc/lexer/SourceEncodingException.d |   10 +
 src/ddc/lexer/Tokenizer.d               | 2636 +++++++++++++++++++++++
 src/ddc/lexer/exceptions.d              |   32 +
 src/ddc/lexer/textsource.d              |  103 +
 src/dlangide/ui/frame.d                 |   95 +-
 9 files changed, 3774 insertions(+), 2 deletions(-)
 create mode 100644 src/ddc/lexer/Lexer.d
 create mode 100644 src/ddc/lexer/LexerException.d
 create mode 100644 src/ddc/lexer/LineStream.d
 create mode 100644 src/ddc/lexer/SourceEncodingException.d
 create mode 100644 src/ddc/lexer/Tokenizer.d
 create mode 100644 src/ddc/lexer/exceptions.d
 create mode 100644 src/ddc/lexer/textsource.d

diff --git a/dlangide.visualdproj b/dlangide.visualdproj
index 5933e03..96b9e75 100644
--- a/dlangide.visualdproj
+++ b/dlangide.visualdproj
@@ -66,7 +66,7 @@
 0
 0
- Unicode
+ Unicode USE_SDL USE_OPENGL
 0
 3
 0
@@ -189,6 +189,17 @@
 *.obj;*.cmd;*.build;*.json;*.dep
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/ddc/lexer/Lexer.d b/src/ddc/lexer/Lexer.d
new file mode 100644
index 0000000..808070c
--- /dev/null
+++ b/src/ddc/lexer/Lexer.d
@@ -0,0 +1,288 @@
+// D grammar - according to http://dlang.org/grammar
+
+module ddc.lexer.Lexer;
+import ddc.lexer.LineStream;
+import ddc.lexer.Tokenizer;
+
+/** Lexem type constants */
+enum LexemType : ushort {
+    UNKNOWN,
+    // types
+    TYPE,
+    TYPE_CTORS,
+    TYPE_CTOR,
+    BASIC_TYPE,
+    BASIC_TYPE_X,
+    BASIC_TYPE_2,
+    IDENTIFIER_LIST,
+    IDENTIFIER,
+    TYPEOF,
+    // templates
+    TEMPLATE_INSTANCE,
+    EXPRESSION,
+    ALT_DECLARATOR,
+}
+
+class Lexem { // base class of the grammar nodes below; type() tells which concrete node this is
+    public @property LexemType type() { return LexemType.UNKNOWN; }
+}
+
+/**
+    Returns true for one of keywords: bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+        char, wchar, dchar, float, double, real, ifloat, idouble, ireal, cfloat, cdouble, creal, void
+*/
+bool isBasicTypeXToken(Token token) {
+    if (token.type != TokenType.KEYWORD)
+        return false;
+    Keyword id = token.keyword;
+    return id == Keyword.BOOL
+        || id == Keyword.BYTE
+        || id == Keyword.UBYTE
+        || id == Keyword.SHORT
+        || id == Keyword.USHORT
+        || id == Keyword.INT
+        || id == Keyword.UINT
+        || id == Keyword.LONG
+        || id == Keyword.ULONG
+        || id == Keyword.CHAR
+        || id == Keyword.WCHAR
+        || id == Keyword.DCHAR
+        || id == Keyword.FLOAT
+        || id == Keyword.DOUBLE
+        || id == Keyword.REAL
+        || id == Keyword.IFLOAT
+        || id == Keyword.IDOUBLE
+        || id == Keyword.IREAL
+        || id == Keyword.CFLOAT
+        || id == Keyword.CDOUBLE
+        || id == Keyword.CREAL
+        || id == Keyword.VOID;
+}
+
+/**
+    Single token, one of keywords: bool, byte, ubyte, short, ushort, int, uint, long, ulong,
+        char, wchar, dchar, float, double, real, ifloat, idouble, ireal, cfloat, cdouble, creal, void
+*/
+class BasicTypeX : Lexem {
+    public Token _token;
+    public override @property LexemType type() { return LexemType.BASIC_TYPE_X; }
+    public this(Token token)
+    in {
+        assert(isBasicTypeXToken(token));
+    }
+    body {
+        _token = token;
+    }
+}
+
+/**
+    Returns true for one of keywords: const, immutable, inout, shared
+*/
+bool isTypeCtorToken(Token token) {
+    if (token.type != TokenType.KEYWORD)
+        return false;
+    Keyword id = token.keyword;
+    return id == Keyword.CONST
+        || id == Keyword.IMMUTABLE
+        || id == Keyword.INOUT
+        || id == Keyword.SHARED;
+}
+
+/**
+    Single token, one of keywords: const, immutable, inout, shared
+*/
+class TypeCtor : Lexem {
+    public Token _token;
+    public override @property LexemType type() { return LexemType.TYPE_CTOR; }
+    public this(Token token)
+    in {
+        assert(isTypeCtorToken(token));
+    }
+    body {
+        _token = token;
+    }
+}
+
+/**
+    Zero, one or several keywords: const, immutable, inout, shared
+*/
+class TypeCtors : Lexem {
+    public TypeCtor[] _list;
+    public override @property LexemType type() { return LexemType.TYPE_CTORS; }
+    public this(Token token)
+    in {
+        assert(isTypeCtorToken(token));
+    }
+    body {
+        _list ~= new TypeCtor(token);
+    }
+    public void append(Token token)
+    in {
+        assert(isTypeCtorToken(token));
+    }
+    body {
+        _list ~= new TypeCtor(token);
+    }
+}
+
+/**
+    Identifier.
+*/
+class Identifier : Lexem {
+    IdentToken _token;
+    public override @property LexemType type() { return LexemType.IDENTIFIER; }
+    public this(Token identifier)
+    in {
+        assert(identifier.type == TokenType.IDENTIFIER);
+    }
+    body {
+        _token = cast(IdentToken)identifier;
+    }
+}
+
+/**
+    Identifier list.
+
+    IdentifierList:
+        Identifier
+        Identifier . IdentifierList
+        TemplateInstance
+        TemplateInstance . IdentifierList
+ */
+class IdentifierList : Lexem {
+    public Identifier _identifier;
+    public IdentifierList _identifierList;
+    public TemplateInstance _templateInstance;
+    public override @property LexemType type() { return LexemType.IDENTIFIER_LIST; }
+    public this(Token ident, IdentifierList identifierList = null)
+    in {
+        assert(ident.type == TokenType.IDENTIFIER);
+    }
+    body {
+        _identifier = new Identifier(ident);
+        _identifierList = identifierList;
+    }
+    public this(TemplateInstance templateInstance, IdentifierList identifierList = null)
+    in {
+    }
+    body {
+        _templateInstance = templateInstance;
+        _identifierList = identifierList;
+    }
+}
+
+/**
+    Template instance.
+
+    TemplateInstance:
+        Identifier TemplateArguments
+*/
+class TemplateInstance : Lexem {
+    public override @property LexemType type() { return LexemType.TEMPLATE_INSTANCE; }
+    public this()
+    in {
+    }
+    body {
+    }
+}
+
+/**
+    Basic type.
+
+    BasicType:
+        BasicTypeX
+        . IdentifierList
+        IdentifierList
+        Typeof
+        Typeof . IdentifierList
+        TypeCtor ( Type )
+*/
+class BasicType : Lexem {
+    public BasicTypeX _basicTypeX;
+    public IdentifierList _identifierList;
+    public Typeof _typeof;
+    public TypeCtor _typeCtor;
+    public Type _typeCtorType;
+    public bool _dotBeforeIdentifierList;
+    public override @property LexemType type() { return LexemType.BASIC_TYPE; }
+    public this()
+    in {
+    }
+    body {
+    }
+}
+
+
+
+/**
+    Typeof.
+
+    Typeof:
+        typeof ( Expression )
+        typeof ( return )
+
+    For typeof(return), _expression is null
+*/
+class Typeof : Lexem {
+    public Expression _expression;
+    public override @property LexemType type() { return LexemType.TYPEOF; }
+    public this(Expression expression)
+    in {
+    }
+    body {
+        _expression = expression;
+    }
+}
+
+/**
+    Type.
+
+*/
+class Type : Lexem {
+    public TypeCtors _typeCtors;
+    public BasicType _basicType;
+    public AltDeclarator _altDeclarator;
+    public override @property LexemType type() { return LexemType.TYPE; }
+    public this()
+    in {
+    }
+    body {
+    }
+}
+
+/**
+    Expression.
+
+    Expression:
+*/
+class Expression : Lexem {
+    public override @property LexemType type() { return LexemType.EXPRESSION; }
+    public this()
+    in {
+    }
+    body {
+    }
+}
+
+/**
+    AltDeclarator.
+
+    AltDeclarator:
+*/
+class AltDeclarator : Lexem {
+    public override @property LexemType type() { return LexemType.ALT_DECLARATOR; }
+    public this()
+    in {
+    }
+    body {
+    }
+}
+
+class Lexer
+{
+    LineStream _lineStream; // source of decoded text lines; only stored by this class so far
+    this(LineStream lineStream)
+    {
+        _lineStream = lineStream;
+    }
+}
diff --git a/src/ddc/lexer/LexerException.d b/src/ddc/lexer/LexerException.d
new file mode 100644
index 0000000..0d0aae2
--- /dev/null
+++ b/src/ddc/lexer/LexerException.d
@@ -0,0 +1,10 @@
+module ddc.lexer.LexerException;
+
+class LexerException : Exception // derives from Exception so that it can actually be thrown and caught
+{
+    this(string msg = "Lexer error") // the default keeps `new LexerException()` working
+    {
+        super(msg);
+    }
+}
+
diff --git a/src/ddc/lexer/LineStream.d b/src/ddc/lexer/LineStream.d
new file mode 100644
index 0000000..7f1f063
--- /dev/null
+++ b/src/ddc/lexer/LineStream.d
@@ -0,0 +1,589 @@
+module ddc.lexer.LineStream;
+
+import std.stream;
+import ddc.lexer.exceptions;
+import std.stdio;
+import std.conv;
+import ddc.lexer.textsource;
+
+class LineStream : SourceLines { // reads bytes from a stream, decodes them to dchars and returns the text line by line
+    public enum EncodingType {
+        ASCII,
+        UTF8,
+        UTF16BE,
+        UTF16LE,
+        UTF32BE,
+        UTF32LE
+    };
+
+    static immutable uint LINE_POSITION_UNDEFINED = uint.max;
+    static immutable int TEXT_BUFFER_SIZE = 1024;
+    static immutable int BYTE_BUFFER_SIZE = 512;
+    static immutable int QUARTER_BYTE_BUFFER_SIZE = BYTE_BUFFER_SIZE / 4;
+
+    InputStream _stream;
+    string _filename;
+    SourceFile _file;
+    ubyte[] _buf; // stream reading buffer
+    uint _pos; // reading position of stream buffer
+    uint _len; // number of bytes in stream buffer
+    bool _streamEof; // true if input stream is in EOF state
+    uint _line; // current line number
+
+    uint _textPos; // start of text line in text buffer
+    uint _textLen; // position of last filled char in text buffer + 1
+    dchar[] _textBuf; // text buffer
+    bool _eof; // end of file, no more lines
+
+    override @property SourceFile file() { return _file; }
+    @property string filename() { return _file.filename; }
+    override @property uint line() { return _line; }
+    @property EncodingType encoding() { return _encoding; }
+    override @property int errorCode() { return _errorCode; }
+    override @property string errorMessage() { return _errorMessage; }
+    override @property int errorLine() { return _errorLine; }
+    override @property int errorPos() { return _errorPos; }
+
+    immutable EncodingType _encoding;
+
+    int _errorCode;
+    string _errorMessage;
+    uint _errorLine;
+    uint _errorPos;
+
+    protected this(InputStream stream, SourceFile file, EncodingType encoding, ubyte[] buf, uint offset, uint len) { // buf holds len bytes already read from stream; decoding starts at buf[offset] (subclasses pass the BOM size)
+        _file = file;
+        _stream = stream;
+        _encoding = encoding;
+        _buf = buf;
+        _len = len;
+        _pos = offset;
+        _streamEof = _stream.eof;
+    }
+
+    // refills the byte buffer from the stream when a quarter or less of it is left; returns the number of unread bytes (they start at _buf[_pos])
+    uint readBytes() {
+        uint bytesLeft = _len - _pos;
+        if (_streamEof || bytesLeft > QUARTER_BYTE_BUFFER_SIZE)
+            return bytesLeft;
+        if (_pos > 0) {
+            for (uint i = 0; i < bytesLeft; i++)
+                _buf[i] = _buf[i + _pos];
+            _len = bytesLeft;
+            _pos = 0;
+        }
+        uint bytesRead = cast(uint)_stream.read(_buf[_len .. BYTE_BUFFER_SIZE]);
+        _len += bytesRead;
+        _streamEof = _stream.eof;
+        return _len - _pos; //_buf[_pos .. _len];
+    }
+
+    // when bytes consumed from byte buffer, call this method to update position
+    void consumedBytes(uint count) {
+        _pos += count;
+    }
+
+    // reserve text buffer for specified number of characters, and return pointer to first free character in buffer
+    dchar * reserveTextBuf(uint len) {
+        // create new text buffer if necessary
+        if (_textBuf == null) {
+            if (len < TEXT_BUFFER_SIZE)
+                len = TEXT_BUFFER_SIZE;
+            _textBuf = new dchar[len];
+            return _textBuf.ptr;
+        }
+        uint spaceLeft = cast(uint)_textBuf.length - _textLen;
+        if (spaceLeft >= len)
+            return _textBuf.ptr + _textLen;
+        // move text to beginning of buffer, if necessary
+        if (_textPos > _textBuf.length / 2) {
+            uint charCount = _textLen - _textPos;
+            dchar * p = _textBuf.ptr;
+            for (uint i = 0; i < charCount; i++)
+                p[i] = p[i + _textPos];
+            _textLen = charCount;
+            _textPos = 0;
+        }
+        // resize buffer if necessary
+        if (_textLen + len > _textBuf.length) {
+            // resize buffer
+            uint newsize = cast(uint)_textBuf.length * 2;
+            if (newsize < _textLen + len)
+                newsize = _textLen + len;
+            _textBuf.length = newsize;
+        }
+        return _textBuf.ptr + _textLen;
+    }
+
+    void appendedText(uint len) { // commits len chars written through the pointer returned by reserveTextBuf()
+        //writeln("appended ", len, " chars of text"); //:", _textBuf[_textLen .. _textLen + len]);
+        _textLen += len;
+    }
+
+    void setError(int code, string message, uint errorLine, uint errorPos) { // records the error; readLine() returns null while _errorCode != 0
+        _errorCode = code;
+        _errorMessage = message;
+        _errorLine = errorLine;
+        _errorPos = errorPos;
+    }
+
+    // override to decode text: turn bytes from readBytes() into dchars appended to the text buffer; a return value of 0 tells readLine() that nothing more could be decoded
+    abstract uint decodeText();
+
+    override public dchar[] readLine() { // returns the next line without its line terminator, or null at end of file or after an error (see errorCode)
+        if (_errorCode != 0) {
+            //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
+            return null; // error detected
+        }
+        if (_eof) {
+            //writeln("EOF found");
+            return null;
+        }
+        _line++;
+        uint p = 0;
+        uint eol = LINE_POSITION_UNDEFINED;
+        uint eof = LINE_POSITION_UNDEFINED;
+        uint lastchar = LINE_POSITION_UNDEFINED;
+        do {
+            if (_errorCode != 0) {
+                //writeln("error ", _errorCode, ": ", _errorMessage, " in line ", _errorLine);
+                return null; // error detected
+            }
+            uint charsLeft = _textLen - _textPos;
+            if (p >= charsLeft) {
+                uint decodedChars = decodeText();
+                if (decodedChars == 0 && invalidCharFlag && _errorCode == 0)
+                    invalidCharError(); // nothing was decoded because of a bad byte: report it now instead of mistaking it for EOF
+                if (_errorCode != 0) return null; // error detected
+                charsLeft = _textLen - _textPos;
+                if (decodedChars == 0) {
+                    eol = charsLeft;
+                    eof = charsLeft;
+                    lastchar = charsLeft;
+                    break;
+                }
+            }
+            for (; p < charsLeft; p++) {
+                dchar ch = _textBuf[_textPos + p];
+                if (ch == 0x0D) {
+                    lastchar = p;
+                    if (p == charsLeft - 1) {
+                        // need one more char to check if it's 0D0A or just 0D eol
+                        //writeln("read one more char for 0D0A detection");
+                        decodeText();
+                        if (_errorCode != 0) {
+                            return null; // error detected
+                        }
+                        charsLeft = _textLen - _textPos;
+                    }
+                    dchar ch2 = (p < charsLeft - 1) ? _textBuf[_textPos + p + 1] : 0;
+                    if (ch2 == 0x0A)
+                        eol = p + 2;
+                    else
+                        eol = p + 1;
+                    break;
+                } else if (ch == 0x0A || ch == 0x2028 || ch == 0x2029) {
+                    // single char eoln
+                    lastchar = p;
+                    eol = p + 1;
+                    break;
+                } else if (ch == 0 || ch == 0x001A) {
+                    // eof
+                    //writeln("EOF char found");
+                    lastchar = p;
+                    eol = eof = p + 1;
+                    break;
+                }
+            }
+        } while (eol == LINE_POSITION_UNDEFINED);
+        uint lineStart = _textPos;
+        uint lineEnd = _textPos + lastchar;
+        _textPos += eol; // consume text
+        if (eof != LINE_POSITION_UNDEFINED) {
+            _eof = true;
+            //writeln("Setting eof flag. lastchar=", lastchar, ", p=", p, ", lineStart=", lineStart);
+            if (lineStart >= lineEnd) {
+                //writeln("lineStart >= lineEnd -- treat as eof");
+                return null; // eof
+            }
+        }
+        // return slice with decoded line (it aliases _textBuf, whose contents reserveTextBuf() may move during later reads)
+        return _textBuf[lineStart .. lineEnd];
+    }
+
+
+    // factory for string parser: wraps code in a memory stream behind a UTF-8 BOM, so that it is decoded as UTF-8
+    public static LineStream create(string code, string filename = "") {
+        uint len = cast(uint)code.length;
+        ubyte[] data = new ubyte[len + 3];
+        for (uint i = 0; i < len; i++)
+            data[i + 3] = code[i];
+        // BOM for UTF8
+        data[0] = 0xEF;
+        data[1] = 0xBB;
+        data[2] = 0xBF;
+        MemoryStream stream = new MemoryStream(data);
+        return create(stream, filename);
+    }
+
+    // factory: picks the decoder from the byte order mark at the start of the stream (no BOM -> ASCII); returns null if the stream is not open
+    public static LineStream create(InputStream stream, string filename) {
+        ubyte[] buf = new ubyte[BYTE_BUFFER_SIZE];
+        buf[0] = buf[1] = buf[2] = buf[3] = 0;
+        if (!stream.isOpen)
+            return null;
+        uint len = cast(uint)stream.read(buf);
+        if (buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF) {
+            return new Utf8LineStream(stream, filename, buf, len);
+        } else if (buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF) {
+            return new Utf32beLineStream(stream, filename, buf, len);
+        } else if (len >= 4 && buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00) { // len check: a 2- or 3-byte UTF-16LE input would otherwise match through the zero-filled rest of buf
+            return new Utf32leLineStream(stream, filename, buf, len);
+        } else if (buf[0] == 0xFE && buf[1] == 0xFF) {
+            return new Utf16beLineStream(stream, filename, buf, len);
+        } else if (buf[0] == 0xFF && buf[1] == 0xFE) {
+            return new Utf16leLineStream(stream, filename, buf, len);
+        } else {
+            return new AsciiLineStream(stream, filename, buf, len);
+        }
+    }
+
+    protected bool invalidCharFlag; // set by a decoder when it stops at a bad or incomplete character; the error itself is raised later
+    protected void invalidCharError() { // turns the pending invalid-character condition into error code 1 at the current line and text position
+        uint pos = _textLen - _textPos + 1;
+        setError(1, "Invalid character in line " ~ to!string(_line) ~ ":" ~ to!string(pos), _line, pos);
+    }
+}
+
+
+
+class AsciiLineStream : LineStream { // used when no BOM is found: accepts 7-bit bytes only, any byte >= 0x80 is an invalid character
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.ASCII, buf, 0, len);
+    }
+    override uint decodeText() {
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len);
+        uint i = 0;
+        for (; i < len; i++) {
+            ubyte ch = b[i];
+            if (ch & 0x80) {
+                // invalid character
+                invalidCharFlag = true;
+                break;
+            }
+            text[i] = ch;
+        }
+        consumedBytes(i);
+        appendedText(i);
+        return len; // non-zero even when an invalid byte stopped the loop early; the error is raised by the next call
+    }
+
+}
+
+class Utf8LineStream : LineStream { // decoder for input that starts with the 3-byte UTF-8 BOM
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.UTF8, buf, 3, len);
+    }
+    override uint decodeText() { // returns the number of dchars appended; stops early at an invalid or incomplete sequence
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        uint chars = 0;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len);
+        uint i = 0;
+        for (; i < len; i++) {
+            uint ch = 0;
+            uint ch0 = b[i];
+            uint bleft = len - i;
+            uint bread = 0; // length of the sequence starting at b[i]; stays 0 if ch0 is not a valid lead byte
+            if (!(ch0 & 0x80)) {
+                // 0x00..0x7F single byte
+                ch = ch0;
+                bread = 1;
+            } else if ((ch0 & 0xE0) == 0xC0) {
+                // two bytes 110xxxxx 10xxxxxx
+                if (bleft < 2)
+                    break;
+                uint ch1 = b[i + 1];
+                if ((ch1 & 0xC0) != 0x80) {
+                    invalidCharFlag = true;
+                    break;
+                }
+                ch = ((ch0 & 0x1F) << 6) | ((ch1 & 0x3F));
+                bread = 2;
+            } else if ((ch0 & 0xF0) == 0xE0) {
+                // three bytes 1110xxxx 10xxxxxx 10xxxxxx
+                if (bleft < 3)
+                    break;
+                uint ch1 = b[i + 1];
+                uint ch2 = b[i + 2];
+                if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80) {
+                    invalidCharFlag = true;
+                    break;
+                }
+                ch = ((ch0 & 0x0F) << 12) | ((ch1 & 0x3F) << 6) | ((ch2 & 0x3F)); // every continuation byte carries 6 payload bits
+                bread = 3;
+            } else if ((ch0 & 0xF8) == 0xF0) {
+                // four bytes 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+                if (bleft < 4)
+                    break;
+                uint ch1 = b[i + 1];
+                uint ch2 = b[i + 2];
+                uint ch3 = b[i + 3];
+                if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80) {
+                    invalidCharFlag = true;
+                    break;
+                }
+                ch = ((ch0 & 0x07) << 18) | ((ch1 & 0x3F) << 12) | ((ch2 & 0x3F) << 6) | ((ch3 & 0x3F));
+                bread = 4;
+            } else if ((ch0 & 0xFC) == 0xF8) {
+                // five bytes 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+                if (bleft < 5)
+                    break;
+                uint ch1 = b[i + 1];
+                uint ch2 = b[i + 2];
+                uint ch3 = b[i + 3];
+                uint ch4 = b[i + 4];
+                if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80) {
+                    invalidCharFlag = true;
+                    break;
+                }
+                ch = ((ch0 & 0x03) << 24) | ((ch1 & 0x3F) << 18) | ((ch2 & 0x3F) << 12) | ((ch3 & 0x3F) << 6) | ((ch4 & 0x3F));
+                bread = 5;
+            } else if ((ch0 & 0xFE) == 0xFC) {
+                // six bytes 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+                if (bleft < 6)
+                    break;
+                uint ch1 = b[i + 1];
+                uint ch2 = b[i + 2];
+                uint ch3 = b[i + 3];
+                uint ch4 = b[i + 4];
+                uint ch5 = b[i + 5];
+                if ((ch1 & 0xC0) != 0x80 || (ch2 & 0xC0) != 0x80 || (ch3 & 0xC0) != 0x80 || (ch4 & 0xC0) != 0x80 || (ch5 & 0xC0) != 0x80) {
+                    invalidCharFlag = true;
+                    break;
+                }
+                ch = ((ch0 & 0x01) << 30) | ((ch1 & 0x3F) << 24) | ((ch2 & 0x3F) << 18) | ((ch3 & 0x3F) << 12) | ((ch4 & 0x3F) << 6) | ((ch5 & 0x3F));
+                bread = 6;
+            }
+            if (bread == 0 || (ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) { // bread == 0: stray continuation byte or 0xFE/0xFF, which would otherwise stall the loop
+                invalidCharFlag = true;
+                break;
+            }
+            // The text buffer holds dchar (UTF-32) elements, so every code
+            // point is stored as exactly one element, including those above
+            // U+FFFF. Splitting such a code point into UTF-16 surrogate
+            // halves would put values into the line that are not valid
+            // code points, and would disagree with the UTF-32 decoders in
+            // this file, which store the code point directly. One element
+            // per sequence also stays within the len elements reserved above.
+            text[chars++] = ch;
+            i += bread - 1; // the loop's own i++ accounts for the lead byte
+        }
+        consumedBytes(i);
+        appendedText(chars);
+        uint bleft = len - i;
+        if (_streamEof && bleft > 0)
+            invalidCharFlag = true; // incomplete character at end of stream
+        return chars;
+    }
+}
+
+class Utf16beLineStream : LineStream { // decoder for input that starts with the 2-byte UTF-16 big-endian BOM (FE FF)
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.UTF16BE, buf, 2, len);
+    }
+    override uint decodeText() { // returns the number of dchars appended
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        uint chars = 0;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len / 2 + 1);
+        uint i = 0;
+        for (; i < len - 1; i += 2) {
+            uint ch0 = b[i];
+            uint ch1 = b[i + 1];
+            uint ch = (ch0 << 8) | ch1;
+            // TODO: check special cases (surrogate pairs are not combined: each 16-bit unit is stored as one dchar)
+            text[chars++] = ch;
+        }
+        consumedBytes(i);
+        appendedText(chars);
+        uint bleft = len - i;
+        if (_streamEof && bleft > 0)
+            invalidCharFlag = true; // incomplete character at end of stream
+        return chars;
+    }
+}
+
+class Utf16leLineStream : LineStream { // decoder for input that starts with the 2-byte UTF-16 little-endian BOM (FF FE)
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.UTF16LE, buf, 2, len);
+    }
+    override uint decodeText() { // returns the number of dchars appended
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        uint chars = 0;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len / 2 + 1);
+        uint i = 0;
+        for (; i < len - 1; i += 2) {
+            uint ch0 = b[i];
+            uint ch1 = b[i + 1];
+            uint ch = (ch1 << 8) | ch0;
+            // TODO: check special cases (surrogate pairs are not combined: each 16-bit unit is stored as one dchar)
+            text[chars++] = ch;
+        }
+        consumedBytes(i);
+        appendedText(chars);
+        uint bleft = len - i;
+        if (_streamEof && bleft > 0)
+            invalidCharFlag = true; // incomplete character at end of stream
+        return chars;
+    }
+}
+
+class Utf32beLineStream : LineStream { // decoder for input that starts with the 4-byte UTF-32 big-endian BOM (00 00 FE FF)
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.UTF32BE, buf, 4, len);
+    }
+    override uint decodeText() { // returns the number of dchars appended
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        uint chars = 0;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len / 2 + 1);
+        uint i = 0;
+        for (; i + 3 < len; i += 4) { // written as i + 3 < len because len is unsigned: len - 3 wraps around when fewer than 3 bytes are left
+            uint ch0 = b[i];
+            uint ch1 = b[i + 1];
+            uint ch2 = b[i + 2];
+            uint ch3 = b[i + 3];
+            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
+            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
+                invalidCharFlag = true;
+                break;
+            }
+            text[chars++] = ch;
+        }
+        consumedBytes(i);
+        appendedText(chars);
+        uint bleft = len - i;
+        if (_streamEof && bleft > 0)
+            invalidCharFlag = true; // incomplete character at end of stream
+        return chars;
+    }
+}
+
+class Utf32leLineStream : LineStream { // decoder for input that starts with the 4-byte UTF-32 little-endian BOM (FF FE 00 00)
+    this(InputStream stream, string filename, ubyte[] buf, uint len) {
+        super(stream, new SourceFile(filename), EncodingType.UTF32LE, buf, 4, len);
+    }
+    override uint decodeText() { // returns the number of dchars appended
+        if (invalidCharFlag) {
+            invalidCharError();
+            return 0;
+        }
+        uint bytesAvailable = readBytes();
+        ubyte * bytes = _buf.ptr + _pos;
+        if (bytesAvailable == 0)
+            return 0; // nothing to decode
+        uint len = bytesAvailable;
+        uint chars = 0;
+        ubyte* b = bytes;
+        dchar* text = reserveTextBuf(len / 2 + 1);
+        uint i = 0;
+        for (; i + 3 < len; i += 4) { // written as i + 3 < len because len is unsigned: len - 3 wraps around when fewer than 3 bytes are left
+            uint ch3 = b[i];
+            uint ch2 = b[i + 1];
+            uint ch1 = b[i + 2];
+            uint ch0 = b[i + 3];
+            uint ch = (ch0 << 24) | (ch1 << 16) | (ch2 << 8) | ch3;
+            if ((ch >= 0xd800 && ch < 0xe000) || (ch > 0x10FFFF)) {
+                invalidCharFlag = true;
+                break;
+            }
+            text[chars++] = ch;
+        }
+        consumedBytes(i);
+        appendedText(chars);
+        uint bleft = len - i;
+        if (_streamEof && bleft > 0)
+            invalidCharFlag = true; // incomplete character at end of stream
+        return chars;
+    }
+}
+
+
+unittest {
+    static if (false) {
+        import std.stdio;
+        import std.conv;
+        import std.utf;
+        //string fname = "C:\\projects\\d\\ddc\\ddclexer\\src\\ddc\\lexer\\LineStream.d";
+        //string fname = "/home/lve/src/d/ddc/ddclexer/" ~ __FILE__; //"/home/lve/src/d/ddc/ddclexer/src/ddc/lexer/Lexer.d";
+        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf8.d";
+        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16be.d";
+        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf16le.d";
+        //string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32be.d";
+        string fname = "/home/lve/src/d/ddc/ddclexer/tests/LineStream_utf32le.d";
+        writeln("opening file");
+        std.stream.File f = new std.stream.File(fname);
+        scope(exit) { f.close(); }
+        try {
+            LineStream lines = LineStream.create(f, fname);
+            for (;;) {
+                dchar[] s = lines.readLine();
+                if (s is null)
+                    break;
+                writeln("line " ~ to!string(lines.line()) ~ ":" ~ toUTF8(s));
+            }
+            if (lines.errorCode != 0) {
+                writeln("Error ", lines.errorCode, " ", lines.errorMessage, " -- at line ", lines.errorLine, " position ", lines.errorPos);
+            } else {
+                writeln("EOF reached");
+            }
+        } catch (Exception e) {
+            writeln("Exception " ~ e.toString);
+        }
+    }
+}
+// LAST LINE
diff --git a/src/ddc/lexer/SourceEncodingException.d b/src/ddc/lexer/SourceEncodingException.d
new file mode 100644
index 0000000..d84a1f6
--- /dev/null
+++ b/src/ddc/lexer/SourceEncodingException.d
@@ -0,0 +1,10 @@
+module ddc.lexer.SourceEncodingException;
+
+class SourceEncodingException : Exception
+{
+    this(string msg)
+    {
+        super(msg);
+    }
+}
+
diff --git a/src/ddc/lexer/Tokenizer.d b/src/ddc/lexer/Tokenizer.d
new file mode 100644
index 0000000..fda1430
--- /dev/null
+++ b/src/ddc/lexer/Tokenizer.d
@@ -0,0 +1,2636 @@
+module ddc.lexer.Tokenizer; + +import ddc.lexer.textsource; +import ddc.lexer.exceptions; + +import std.stdio; +import std.datetime; +import std.conv; +import std.utf; +import std.math; + +enum TokenType : ubyte { + EOF, + //EOL, + WHITESPACE, + COMMENT, + IDENTIFIER, + STRING, + CHARACTER, + INTEGER, + FLOAT, + KEYWORD, + OP +} + +// table for fast checking of UniversalAlpha (as per ISO/IEC 9899:1999 Annex E) OR a..z OR A..Z OR _ +// max code is 0xd7ff +//1728 +const uint[1728] UNIVERSAL_ALPHA_FLAGS = [ + 0x00000000,0x00000000,0x87fffffe,0x07fffffe,0x00000000,0x04a00400,0xff7fffff,0xff7fffff,// 0000-00ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xfc3fffff,// 0100-01ff + 0x00ffffff,0x00000000,0xffff0000,0xffffffff,0xffffffff,0xe9ff01ff,0x00030003,0x0000001f,// 0200-02ff + 0x00000000,0x00000000,0x00000000,0x04000000,0xffffd740,0xfffffffb,0x547f7fff,0x000ffffd,// 0300-03ff + 0xffffdffe,0xffffffff,0xdffeffff,0xffffffff,0xffff0003,0xffffffff,0xffff199f,0x033fcfff,// 0400-04ff + 0x00000000,0xfffe0000,0x027fffff,0xfffffffe,0x000000ff,0xbbff0000,0xffff0006,0x000707ff,// 0500-05ff + 0x00000000,0x07fffffe,0x0007ffff,0xffff03ff,0xffffffff,0x7cffffff,0x1fff7fff,0x03ff3de0,// 0600-06ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 0700-07ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 0800-08ff + 0xffffffee,0xe3ffffff,0xff073fff,0x0000ffcf,0xfff99fee,0xc3c5fdff,0xb000399f,0x0003ffcf,// 0900-09ff + 0xfff987e4,0xc36dfdff,0x5e003987,0x0010ffc0,0xfffbafee,0xe3edfdff,0x00013bbf,0x0000ffc1,// 0a00-0aff + 0xfff99fee,0xe3cdfdff,0xb000398f,0x0000ffc3,0xd63dc7ec,0xc3bfc718,0x00003dc7,0x0000ff80,// 0b00-0bff + 0xfffddfee,0xc3effdff,0x00003ddf,0x0000ffc3,0xfffddfec,0xc3effdff,0x40003ddf,0x0000ffc3,// 0c00-0cff + 0xfffddfec,0xc3fffdff,0x00003dcf,0x0000ffc3,0x00000000,0x00000000,0x00000000,0x00000000,// 0d00-0dff + 
0xfffffffe,0x07ffffff,0x0fffffff,0x00000000,0xfef02596,0x3bff6cae,0x33ff3f5f,0x00000000,// 0e00-0eff + 0x03000001,0xc2afffff,0xfffffeff,0xfffe03ff,0xfebf0fdf,0x02fe3fff,0x00000000,0x00000000,// 0f00-0fff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xffffffff,0xffff003f,0x007fffff,// 1000-10ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1100-11ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1200-12ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1300-13ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1400-14ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1500-15ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1600-16ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1700-17ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1800-18ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1900-19ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1a00-1aff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1b00-1bff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1c00-1cff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 1d00-1dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0x0fffffff,0xffffffff,0xffffffff,0x03ffffff,// 1e00-1eff + 0x3f3fffff,0xffffffff,0xaaff3f3f,0x3fffffff,0xffffffff,0x5fdfffff,0x0fcf1fdc,0x1fdc1fff,// 1f00-1fff + 0x00000000,0x80000000,0x00000001,0x80000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2000-20ff + 
0x3f2ffc84,0x01fbfd50,0x00000000,0xffffffff,0x00000007,0x00000000,0x00000000,0x00000000,// 2100-21ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2200-22ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2300-23ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2400-24ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2500-25ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2600-26ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2700-27ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2800-28ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2900-29ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2a00-2aff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2b00-2bff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2c00-2cff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2d00-2dff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2e00-2eff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 2f00-2fff + 0x000000e0,0x000003fe,0xfffffffe,0xffffffff,0x180fffff,0xfffffffe,0xffffffff,0x187fffff,// 3000-30ff + 0xffffffe0,0x00001fff,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3100-31ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3200-32ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3300-33ff + 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3400-34ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3500-35ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3600-36ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3700-37ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3800-38ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3900-39ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3a00-3aff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3b00-3bff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3c00-3cff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3d00-3dff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3e00-3eff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 3f00-3fff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4000-40ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4100-41ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4200-42ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4300-43ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4400-44ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4500-45ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4600-46ff + 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4700-47ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4800-48ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4900-49ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4a00-4aff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4b00-4bff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4c00-4cff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// 4d00-4dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 4e00-4eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 4f00-4fff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5000-50ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5100-51ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5200-52ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5300-53ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5400-54ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5500-55ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5600-56ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5700-57ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5800-58ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5900-59ff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5a00-5aff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5b00-5bff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5c00-5cff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5d00-5dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5e00-5eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 5f00-5fff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6000-60ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6100-61ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6200-62ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6300-63ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6400-64ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6500-65ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6600-66ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6700-67ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6800-68ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6900-69ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6a00-6aff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6b00-6bff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6c00-6cff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6d00-6dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6e00-6eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 6f00-6fff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7000-70ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7100-71ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7200-72ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7300-73ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7400-74ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7500-75ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7600-76ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7700-77ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7800-78ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7900-79ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7a00-7aff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7b00-7bff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7c00-7cff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7d00-7dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7e00-7eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 7f00-7fff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8000-80ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8100-81ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8200-82ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8300-83ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8400-84ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8500-85ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8600-86ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8700-87ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8800-88ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8900-89ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8a00-8aff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8b00-8bff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8c00-8cff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8d00-8dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8e00-8eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 8f00-8fff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9000-90ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9100-91ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9200-92ff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9300-93ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9400-94ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9500-95ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9600-96ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9700-97ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9800-98ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9900-99ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9a00-9aff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9b00-9bff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9c00-9cff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9d00-9dff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// 9e00-9eff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0x0000003f,0x00000000,0x00000000,// 9f00-9fff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a000-a0ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a100-a1ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a200-a2ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a300-a3ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a400-a4ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a500-a5ff + 
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a600-a6ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a700-a7ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a800-a8ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// a900-a9ff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// aa00-aaff + 0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,// ab00-abff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ac00-acff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ad00-adff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ae00-aeff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// af00-afff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b000-b0ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b100-b1ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b200-b2ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b300-b3ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b400-b4ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b500-b5ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b600-b6ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b700-b7ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b800-b8ff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// b900-b9ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ba00-baff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// bb00-bbff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// bc00-bcff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// bd00-bdff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// be00-beff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// bf00-bfff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c000-c0ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c100-c1ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c200-c2ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c300-c3ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c400-c4ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c500-c5ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c600-c6ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c700-c7ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c800-c8ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// c900-c9ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ca00-caff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// cb00-cbff + 
0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// cc00-ccff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// cd00-cdff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// ce00-ceff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// cf00-cfff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d000-d0ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d100-d1ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d200-d2ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d300-d3ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d400-d4ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d500-d5ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,// d600-d6ff + 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0x0000000f,0x00000000,0x00000000// d700-d7ff +]; + +// returns true if character is A..Z, a..z, _ or universal alpha +public bool isUniversalAlpha(dchar ch) pure nothrow { + return (ch <= 0xd7ff && (UNIVERSAL_ALPHA_FLAGS[ch >> 5] & (1 << (ch & 31)))); +} + +public bool isIdentStartChar(dchar ch) pure nothrow { + return isUniversalAlpha(ch); +} + +public bool isIdentMiddleChar(dchar ch) pure nothrow { + return (ch >= '0' && ch <='9') || isUniversalAlpha(ch); +} + +immutable bool ENABLE_DUMP_UNIVERSAL_ALPHA_TABLE = false; +static if (ENABLE_DUMP_UNIVERSAL_ALPHA_TABLE) { + bool r(dchar ch, wchar v) pure nothrow { + return ch == v; + } + + bool r(dchar ch, wchar v1, wchar v2) pure nothrow { + return ch >= v1 && ch <= v2; + } + + bool isUniversalAlphaSlow(dchar c) pure nothrow { + return + // Latin: 00AA, 
00BA, 00C0−00D6, 00D8−00F6, 00F8−01F5, 01FA−0217, + // 0250−02A8, 1E00−1E9B, 1EA0−1EF9, 207F + r(c, 0xAA) || r(c, 0x00BA) || r(c, 0x00C0,0x00D6) || r(c, 0x00D8,0x00F6) || r(c, 0x00F8,0x01F5) || r(c, 0x01FA,0x0217) + || r(c, 0x0250,0x02A8) || r(c, 0x1E00,0x1E9B) || r(c, 0x1EA0,0x1EF9) || r(c, 0x207F) + //Greek: 0386, 0388−038A, 038C, 038E−03A1, 03A3−03CE, 03D0−03D6, + //03DA, 03DC, 03DE, 03E0, 03E2−03F3, 1F00−1F15, 1F18−1F1D, + //1F20−1F45, 1F48−1F4D, 1F50−1F57, 1F59, 1F5B, 1F5D, + //1F5F−1F7D, 1F80−1FB4, 1FB6−1FBC, 1FC2−1FC4, 1FC6−1FCC, + //1FD0−1FD3, 1FD6−1FDB, 1FE0−1FEC, 1FF2−1FF4, 1FF6−1FFC + || r(c, 0x0386) || r(c, 0x0388,0x038A) || r(c, 0x038C) || r(c, 0x038E,0x03A1) || r(c, 0x03A3,0x03CE) || r(c, 0x03D0,0x03D6) + || r(c, 0x03DA) || r(c, 0x03DC) || r(c, 0x03DE) || r(c, 0x03E0) || r(c, 0x03E2,0x03F3) || r(c, 0x1F00,0x1F15) || r(c, 0x1F18,0x1F1D) + || r(c, 0x1F20,0x1F45) || r(c, 0x1F48,0x1F4D) || r(c, 0x1F50,0x1F57) || r(c, 0x1F59) || r(c, 0x1F5B) || r(c, 0x1F5D) + || r(c, 0x1F5F,0x1F7D) || r(c, 0x1F80,0x1FB4) || r(c, 0x1FB6,0x1FBC) || r(c, 0x1FC2,0x1FC4) || r(c, 0x1FC6,0x1FCC) + || r(c, 0x1FD0,0x1FD3) || r(c, 0x1FD6,0x1FDB) || r(c, 0x1FE0,0x1FEC) || r(c, 0x1FF2,0x1FF4) || r(c, 0x1FF6,0x1FFC) + //Cyrillic: 0401−040C, 040E−044F, 0451−045C, 045E−0481, 0490−04C4, + //04C7−04C8, 04CB−04CC, 04D0−04EB, 04EE−04F5, 04F8−04F9 + || r(c, 0x0401,0x040C) || r(c, 0x040E,0x044F) || r(c, 0x0451,0x045C) || r(c, 0x045E,0x0481) || r(c, 0x0490,0x04C4) + || r(c, 0x04C7,0x04C8) || r(c, 0x04CB,0x04CC) || r(c, 0x04D0,0x04EB) || r(c, 0x04EE,0x04F5) || r(c, 0x04F8,0x04F9) + //Armenian: 0531−0556, 0561−0587 + || r(c, 0x0531,0x0556) || r(c, 0x0561,0x0587) + //Hebrew: 05B0−05B9, 05BB−05BD, 05BF, 05C1−05C2, 05D0−05EA, + //05F0−05F2 + || r(c, 0x05B0,0x05B9) || r(c, 0x05BB,0x05BD) || r(c, 0x05BF) || r(c, 0x05C1,0x05C2) || r(c, 0x05D0,0x05EA) + || r(c, 0x05F0,0x05F2) + //Arabic: 0621−063A, 0640−0652, 0670−06B7, 06BA−06BE, 06C0−06CE, + //06D0−06DC, 06E5−06E8, 06EA−06ED + || r(c, 0x0621,0x063A) 
|| r(c, 0x0640,0x0652) || r(c, 0x0670,0x06B7) || r(c, 0x06BA,0x06BE) || r(c, 0x06C0,0x06CE) + || r(c, 0x06D0,0x06DC) || r(c, 0x06E5,0x06E8) || r(c, 0x06EA,0x06ED) + //Devanagari: 0901−0903, 0905−0939, 093E−094D, 0950−0952, 0958−0963 + || r(c, 0x0901,0x0903) || r(c, 0x0905,0x0939) || r(c, 0x093E,0x094D) || r(c, 0x0950,0x0952) || r(c, 0x0958,0x0963) + //Bengali: 0981−0983, 0985−098C, 098F−0990, 0993−09A8, 09AA−09B0, + //09B2, 09B6−09B9, 09BE−09C4, 09C7−09C8, 09CB−09CD, + //09DC−09DD, 09DF−09E3, 09F0−09F1 + || r(c, 0x0981,0x0983) || r(c, 0x0985,0x098C) || r(c, 0x098F,0x0990) || r(c, 0x0993,0x09A8) || r(c, 0x09AA,0x09B0) + || r(c, 0x09B2) || r(c, 0x09B6,0x09B9) || r(c, 0x09BE,0x09C4) || r(c, 0x09C7,0x09C8) || r(c, 0x09CB,0x09CD) + || r(c, 0x09DC,0x09DD) || r(c, 0x09DF,0x09E3) || r(c, 0x09F0,0x09F1) + //Gurmukhi: 0A02, 0A05−0A0A, 0A0F−0A10, 0A13−0A28, 0A2A−0A30, + //0A32−0A33, 0A35−0A36, 0A38−0A39, 0A3E−0A42, 0A47−0A48, + //0A4B−0A4D, 0A59−0A5C, 0A5E, 0A74 + || r(c, 0x0A02) || r(c, 0x0A05,0x0A0A) || r(c, 0x0A0F,0x0A10) || r(c, 0x0A13,0x0A28) || r(c, 0x0A2A,0x0A30) + || r(c, 0x0A32,0x0A33) || r(c, 0x0A35,0x0A36) || r(c, 0x0A38,0x0A39) || r(c, 0x0A3E,0x0A42) || r(c, 0x0A47,0x0A48) + || r(c, 0x0A4B,0x0A4D) || r(c, 0x0A59,0x0A5C) || r(c, 0x0A5E) || r(c, 0x0A74) + //Gujarati: 0A81−0A83, 0A85−0A8B, 0A8D, 0A8F−0A91, 0A93−0AA8, + //0AAA−0AB0, 0AB2−0AB3, 0AB5−0AB9, 0ABD−0AC5, + //0AC7−0AC9, 0ACB−0ACD, 0AD0, 0AE0 + || r(c, 0x0A81,0x0A83) || r(c, 0x0A85,0x0A8B) || r(c, 0x0A8D) || r(c, 0x0A8F,0x0A91) || r(c, 0x0A93,0x0AA8) + || r(c, 0x0AAA,0x0AB0) || r(c, 0x0AB2,0x0AB3) || r(c, 0x0AB5,0x0AB9) || r(c, 0x0ABD,0x0AC5) + || r(c, 0x0AC7,0x0AC9) || r(c, 0x0ACB,0x0ACD) || r(c, 0x0AD0) || r(c, 0x0AE0) + // Oriya: 0B01−0B03, 0B05−0B0C, 0B0F−0B10, 0B13−0B28, 0B2A−0B30, + //0B32−0B33, 0B36−0B39, 0B3E−0B43, 0B47−0B48, 0B4B−0B4D, + //0B5C−0B5D, 0B5F−0B61 + || r(c, 0x0B01,0x0B03) || r(c, 0x0B05,0x0B0C) || r(c, 0x0B0F,0x0B10) || r(c, 0x0B13,0x0B28) || r(c, 0x0B2A,0x0B30) + || r(c, 0x0B32,0x0B33) 
|| r(c, 0x0B36,0x0B39) || r(c, 0x0B3E,0x0B43) || r(c, 0x0B47,0x0B48) || r(c, 0x0B4B,0x0B4D) + || r(c, 0x0B5C,0x0B5D) || r(c, 0x0B5F,0x0B61) + //Tamil: 0B82−0B83, 0B85−0B8A, 0B8E−0B90, 0B92−0B95, 0B99−0B9A, + //0B9C, 0B9E−0B9F, 0BA3−0BA4, 0BA8−0BAA, 0BAE−0BB5, + //0BB7−0BB9, 0BBE−0BC2, 0BC6−0BC8, 0BCA−0BCD + || r(c, 0x0B82,0x0B83) || r(c, 0x0B85,0x0B8A) || r(c, 0x0B8E,0x0B90) || r(c, 0x0B92,0x0B95) || r(c, 0x0B99,0x0B9A) + || r(c, 0x0B9C) || r(c, 0x0B9E,0x0B9F) || r(c, 0x0BA3,0x0BA4) || r(c, 0x0BA8,0x0BAA) || r(c, 0x0BAE,0x0BB5) + || r(c, 0x0BB7,0x0BB9) || r(c, 0x0BBE,0x0BC2) || r(c, 0x0BC6,0x0BC8) || r(c, 0x0BCA,0x0BCD) + //Telugu: 0C01−0C03, 0C05−0C0C, 0C0E−0C10, 0C12−0C28, 0C2A−0C33, + //0C35−0C39, 0C3E−0C44, 0C46−0C48, 0C4A−0C4D, 0C60−0C61 + || r(c, 0x0C01,0x0C03) || r(c, 0x0C05,0x0C0C) || r(c, 0x0C0E,0x0C10) || r(c, 0x0C12,0x0C28) || r(c, 0x0C2A,0x0C33) + || r(c, 0x0C35,0x0C39) || r(c, 0x0C3E,0x0C44) || r(c, 0x0C46,0x0C48) || r(c, 0x0C4A,0x0C4D) || r(c, 0x0C60,0x0C61) + //Kannada: 0C82−0C83, 0C85−0C8C, 0C8E−0C90, 0C92−0CA8, 0CAA−0CB3, + //0CB5−0CB9, 0CBE−0CC4, 0CC6−0CC8, 0CCA−0CCD, 0CDE, + //0CE0−0CE1 + || r(c, 0x0C82,0x0C83) || r(c, 0x0C85,0x0C8C) || r(c, 0x0C8E,0x0C90) || r(c, 0x0C92,0x0CA8) || r(c, 0x0CAA,0x0CB3) + || r(c, 0x0CB5,0x0CB9) || r(c, 0x0CBE,0x0CC4) || r(c, 0x0CC6,0x0CC8) || r(c, 0x0CCA,0x0CCD) || r(c, 0x0CDE) + || r(c, 0x0CE0,0x0CE1) + //Malayalam: 0D02−0D03, 0D05−0D0C, 0D0E−0D10, 0D12−0D28, 0D2A−0D39, + //0D3E−0D43, 0D46−0D48, 0D4A−0D4D, 0D60−0D61 + || r(c, 0x0D02,0x0D03) || r(c, 0x0D05,0x0D0C) || r(c, 0x0D0E,0x0D10) || r(c, 0x0D12,0x0D28) || r(c, 0x0D2A,0x0D39) + || r(c, 0xD3E,0x0D43) || r(c, 0x0D46,0x0D48) || r(c, 0x0D4A,0x0D4D) || r(c, 0x0D60,0x0D61) + //Thai: 0E01−0E3A, 0E40−0E5B + || r(c, 0x0E01,0x0E3A) || r(c, 0x0E40,0x0E5B) + //Lao: 0E81−0E82, 0E84, 0E87−0E88, 0E8A, 0E8D, 0E94−0E97, + //0E99−0E9F, 0EA1−0EA3, 0EA5, 0EA7, 0EAA−0EAB, + //0EAD−0EAE, 0EB0−0EB9, 0EBB−0EBD, 0EC0−0EC4, 0EC6, + //0EC8−0ECD, 0EDC−0EDD + || r(c, 0x0E81,0x0E82) || 
r(c, 0x0E84) || r(c, 0x0E87,0x0E88) || r(c, 0x0E8A) || r(c, 0x0E8D) || r(c, 0x0E94,0x0E97) + || r(c, 0x0E99,0x0E9F) || r(c, 0x0EA1,0x0EA3) || r(c, 0x0EA5) || r(c, 0x0EA7) || r(c, 0x0EAA,0x0EAB) + || r(c, 0x0EAD,0x0EAE) || r(c, 0x0EB0,0x0EB9) || r(c, 0x0EBB,0x0EBD) || r(c, 0x0EC0,0x0EC4) || r(c, 0x0EC6) + || r(c, 0x0EC8,0x0ECD) || r(c, 0x0EDC,0x0EDD) + //Tibetan: 0F00, 0F18−0F19, 0F35, 0F37, 0F39, 0F3E−0F47, 0F49−0F69, + //0F71−0F84, 0F86−0F8B, 0F90−0F95, 0F97, 0F99−0FAD, + //0FB1−0FB7, 0FB9 + || r(c, 0x0F00) || r(c, 0x0F18,0x0F19) || r(c, 0x0F35) || r(c, 0x0F37) || r(c, 0x0F39) || r(c, 0x0F3E,0x0F47) || r(c, 0x0F49,0x0F69) + || r(c, 0x0F71,0x0F84) || r(c, 0x0F86,0x0F8B) || r(c, 0x0F90,0x0F95) || r(c, 0x0F97) || r(c, 0x0F99,0x0FAD) + || r(c, 0x0FB1,0x0FB7) || r(c, 0x0FB9) + //Georgian: 10A0−10C5, 10D0−10F6 + || r(c, 0x10A0,0x10C5) || r(c, 0x10D0,0x10F6) + //Hiragana: 3041−3093, 309B−309C + || r(c, 0x3041,0x3093) || r(c, 0x309B,0x309C) + //Katakana: 30A1−30F6, 30FB−30FC + || r(c, 0x30A1,0x30F6) || r(c, 0x30FB,0x30FC) + //Bopomofo: 3105−312C + || r(c, 0x3105,0x312C) + //CJK Unified Ideographs: 4E00−9FA5 + || r(c, 0x4E00,0x9FA5) + //Hangul: AC00−D7A3 + || r(c, 0xAC00,0xD7A3) + //Digits: 0660−0669, 06F0−06F9, 0966−096F, 09E6−09EF, 0A66−0A6F, + //0AE6−0AEF, 0B66−0B6F, 0BE7−0BEF, 0C66−0C6F, 0CE6−0CEF, + //0D66−0D6F, 0E50−0E59, 0ED0−0ED9, 0F20−0F33 + || r(c, 0x0660,0x0669) || r(c, 0x06F0,0x06F9) || r(c, 0x0966,0x096F) || r(c, 0x09E6,0x09EF) || r(c, 0x0A66,0x0A6F) + || r(c, 0x0AE6,0x0AEF) || r(c, 0x0B66,0x0B6F) || r(c, 0x0BE7,0x0BEF) || r(c, 0x0C66,0x0C6F) || r(c, 0x0CE6,0x0CEF) + || r(c, 0x0D66,0x0D6F) || r(c, 0x0E50,0x0E59) || r(c, 0x0ED0,0x0ED9) || r(c, 0x0F20,0x0F33) + //Special characters: 00B5, 00B7, 02B0−02B8, 02BB, 02BD−02C1, 02D0−02D1, + //02E0−02E4, 037A, 0559, 093D, 0B3D, 1FBE, 203F−2040, 2102, + //2107, 210A−2113, 2115, 2118−211D, 2124, 2126, 2128, 212A−2131, + //2133−2138, 2160−2182, 3005−3007, 3021−3029 + || r(c, 0x00B5) || r(c, 0x00B7) || r(c, 0x02B0,0x02B8) 
|| r(c, 0x02BB) || r(c, 0x02BD,0x02C1) || r(c, 0x02D0,0x02D1) + || r(c, 0x2E0,0x02E4) || r(c, 0x037A) || r(c, 0x0559) || r(c, 0x093D) || r(c, 0x0B3D) || r(c, 0x1FBE) || r(c, 0x203F,0x2040) || r(c, 0x2102) + || r(c, 0x2107) || r(c, 0x210A,0x2113) || r(c, 0x2115) || r(c, 0x2118,0x211D) || r(c, 0x2124) || r(c, 0x2126) || r(c, 0x2128) || r(c, 0x212A,0x2131) + || r(c, 0x2133,0x2138) || r(c, 0x2160,0x2182) || r(c, 0x3005,0x3007) || r(c, 0x3021,0x3029) + ; + } + +} + +unittest { + + + static if (ENABLE_DUMP_UNIVERSAL_ALPHA_TABLE) { + immutable uint itemsInRow = 8; + + uint maxAlpha = 0; + for (uint i = 0; i < 0x10000; i++) { + uint ch = i; + if (isUniversalAlphaSlow(ch) || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) + maxAlpha = i; + } + maxAlpha = (maxAlpha + itemsInRow * 32 - 1) / (itemsInRow * 32) * (itemsInRow * 32) - 1; + writeln("// table for fast checking of UniversalAlpha (as per ISO/IEC 9899:1999 Annex E) OR a..z OR A..Z OR _"); + writefln("// max code is 0x%04x", maxAlpha); + writeln("immutable uint[", (maxAlpha + 1) / 32,"] UNIVERSAL_ALPHA_FLAGS = ["); + for (uint i = 0; i <= maxAlpha; i += 32) { + if ((i / 32) % itemsInRow == 0) + write(" "); + uint flags = 0; + for (uint j = 0; j < 32; j++) { + uint ch = i + j; + bool flag = isUniversalAlphaSlow(ch) || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); + if (flag) + flags |= (1 << j); + } + writef("0x%08x", flags); + if (i != maxAlpha / 32 * 32) + write(","); + if ((i / 32) % itemsInRow == itemsInRow - 1) + writefln("// %04x-%04x", i - itemsInRow * 32 + 1 + 31, i + 31); + } + writeln("];"); + + for (uint ch = 0; ch < 0x100000; ch++) { + bool flag = isUniversalAlphaSlow(ch) || ch == '_' || (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); + bool flag2 = isUniversalAlpha(ch); + if (flag2 != flag) { + isUniversalAlpha(ch); + writefln("universalAlpha test failed for char %06x expeced %d actual %d", ch, flag ? 1 : 0, flag2 ? 
1 : 0);
+            }
+            assert(flag2 == flag);
+        }
+    }
+}
+
+/**
+ * Operator and punctuation codes.
+ *
+ * The numeric value of each member is used by getOpNameD() as an index into
+ * OP_CODE_STRINGS, so both lists must be kept in the same order.
+ */
+enum OpCode : ubyte {
+    NONE,          // no op
+    DIV,           // /
+    DIV_EQ,        // /=
+    DOT,           // .
+    DOT_DOT,       // ..
+    DOT_DOT_DOT,   // ...
+    AND,           // &
+    AND_EQ,        // &=
+    LOG_AND,       // &&
+    OR,            // |
+    OR_EQ,         // |=
+    LOG_OR,        // ||
+    MINUS,         // -
+    MINUS_EQ,      // -=
+    MINUS_MINUS,   // --
+    PLUS,          // +
+    PLUS_EQ,       // +=
+    PLUS_PLUS,     // ++
+    LT,            // <
+    LT_EQ,         // <=
+    SHL,           // <<
+    SHL_EQ,        // <<=
+    LT_GT,         // <>
+    NE_EQ,         // <>=
+    GT,            // >
+    GT_EQ,         // >=
+    SHR_EQ,        // >>=
+    ASR_EQ,        // >>>=
+    SHR,           // >>
+    ASR,           // >>>
+    NOT,           // !
+    NOT_EQ,        // !=
+    NOT_LT_GT,     // !<>
+    NOT_LT_GT_EQ,  // !<>=
+    NOT_LT,        // !<
+    NOT_LT_EQ,     // !<=
+    NOT_GT,        // !>
+    NOT_GT_EQ,     // !>=
+    PAR_OPEN,      // (
+    PAR_CLOSE,     // )
+    SQ_OPEN,       // [
+    SQ_CLOSE,      // ]
+    CURL_OPEN,     // {
+    CURL_CLOSE,    // }
+    QUEST,         // ?
+    COMMA,         // ,
+    SEMICOLON,     // ;
+    COLON,         // :
+    DOLLAR,        // $
+    EQ,            // =
+    QE_EQ,         // ==
+    MUL,           // *
+    MUL_EQ,        // *=
+    MOD,           // %
+    MOD_EQ,        // %=
+    XOR,           // ^
+    XOR_EQ,        // ^=
+    LOG_XOR,       // ^^
+    LOG_XOR_EQ,    // ^^=
+    INV,           // ~
+    INV_EQ,        // ~=
+    AT,            // @
+    EQ_GT,         // =>
+    SHARP          // #
+};
+
+/// Source spelling of every OpCode member, indexed by the OpCode value.
+immutable dstring[] OP_CODE_STRINGS = [
+    "",
+    "/",
+    "/=",
+    ".",
+    "..",
+    "...",
+    "&",
+    "&=",
+    "&&",
+    "|",
+    "|=",
+    "||",
+    "-",
+    "-=",
+    "--",
+    "+",
+    "+=",
+    "++",
+    "<",
+    "<=",
+    "<<",
+    "<<=",
+    "<>",
+    "<>=",
+    ">",
+    ">=",
+    ">>=",
+    ">>>=",
+    ">>",
+    ">>>",
+    "!",
+    "!=",
+    "!<>",
+    "!<>=",
+    "!<",
+    "!<=",
+    "!>",
+    "!>=",
+    "(",
+    ")",
+    "[",
+    "]",
+    "{",
+    "}",
+    "?",
+    ",",
+    ";",
+    ":",
+    "$",
+    "=",
+    "==",
+    "*",
+    "*=",
+    "%",
+    "%=",
+    "^",
+    "^=",
+    "^^",
+    "^^=",
+    "~",
+    "~=",
+    "@",
+    "=>",
+    "#"
+];
+
+/// Returns the source spelling of op (the empty string for OpCode.NONE).
+dstring getOpNameD(OpCode op) pure nothrow {
+    return OP_CODE_STRINGS[op];
+};
+
+/**
+ * Keyword codes.
+ *
+ * The numeric value of each member is used by getKeywordNameD() and
+ * findKeyword() as an index into KEYWORD_STRINGS, so both lists must be kept
+ * in the same order.
+ */
+enum Keyword : ubyte {
+    NONE,
+    ABSTRACT,
+    ALIAS,
+    ALIGN,
+    ASM,
+    ASSERT,
+    AUTO,
+
+    BODY,
+    BOOL,
+    BREAK,
+    BYTE,
+
+    CASE,
+    CAST,
+    CATCH,
+    CDOUBLE,
+    CENT,
+    CFLOAT,
+    CHAR,
+    CLASS,
+    CONST,
+    CONTINUE,
+    CREAL,
+
+    DCHAR,
+    DEBUG,
+    DEFAULT,
+    DELEGATE,
+    DELETE,
+    DEPRECATED,
+    DO,
+    DOUBLE,
+
+    ELSE,
+    ENUM,
+    EXPORT,
+    EXTERN,
+
+    FALSE,
+    FINAL,
+    FINALLY,
+    FLOAT,
+    FOR,
+    FOREACH,
+    FOREACH_REVERSE,
+    FUNCTION,
+
+    GOTO,
+
+    IDOUBLE,
+    IF,
+    IFLOAT,
+    IMMUTABLE,
+    IMPORT,
+    IN,
+    INOUT,
+    INT,
+    INTERFACE,
+    INVARIANT,
+    IREAL,
+    IS,
+
+    LAZY,
+    LONG,
+
+    MACRO,
+    MIXIN,
+    MODULE,
+
+    NEW,
+    NOTHROW,
+    NULL,
+
+    OUT,
+    OVERRIDE,
+
+    PACKAGE,
+    PRAGMA,
+    PRIVATE,
+    PROTECTED,
+    PUBLIC,
+    PURE,
+
+    REAL,
+    REF,
+    RETURN,
+
+    SCOPE,
+    SHARED,
+    SHORT,
+    STATIC,
+    STRUCT,
+    SUPER,
+    SWITCH,
+    SYNCHRONIZED,
+
+    TEMPLATE,
+    THIS,
+    THROW,
+    TRUE,
+    TRY,
+    TYPEDEF,
+    TYPEID,
+    TYPEOF,
+
+    UBYTE,
+    UCENT,
+    UINT,
+    ULONG,
+    UNION,
+    UNITTEST,
+    USHORT,
+
+    VERSION,
+    VOID,
+    VOLATILE,
+
+    WCHAR,
+    WHILE,
+    WITH,
+
+    FILE,
+    MODULE__,
+    LINE,
+    FUNCTION__,
+    PRETTY_FUNCTION,
+
+    //Special Token Replaced with
+    DATE, // string literal of the date of compilation "mmm dd yyyy"
+    EOF, // sets the scanner to the end of the file
+    TIME, // string literal of the time of compilation "hh:mm:ss"
+    TIMESTAMP, // string literal of the date and time of compilation "www mmm dd hh:mm:ss yyyy"
+    VENDOR, // Compiler vendor string, such as "Digital Mars D"
+    VERSION_, // Compiler version as an integer, such as 2001
+
+    GSHARED,
+    TRAITS,
+    VECTOR,
+    PARAMETERS,
+
+}
+
+/// Source spelling of every Keyword member, indexed by the Keyword value.
+immutable dstring[] KEYWORD_STRINGS = [
+    "",
+    "abstract",
+    "alias",
+    "align",
+    "asm",
+    "assert",
+    "auto",
+
+    "body",
+    "bool",
+    "break",
+    "byte",
+
+    "case",
+    "cast",
+    "catch",
+    "cdouble",
+    "cent",
+    "cfloat",
+    "char",
+    "class",
+    "const",
+    "continue",
+    "creal",
+
+    "dchar",
+    "debug",
+    "default",
+    "delegate",
+    "delete",
+    "deprecated",
+    "do",
+    "double",
+
+    "else",
+    "enum",
+    "export",
+    "extern",
+
+    "false",
+    "final",
+    "finally",
+    "float",
+    "for",
+    "foreach",
+    "foreach_reverse",
+    "function",
+
+    "goto",
+
+    "idouble",
+    "if",
+    "ifloat",
+    "immutable",
+    "import",
+    "in",
+    "inout",
+    "int",
+    "interface",
+    "invariant",
+    "ireal",
+    "is",
+
+    "lazy",
+    "long",
+
+    "macro",
+    "mixin",
+    "module",
+
+    "new",
+    "nothrow",
+    "null",
+
+    "out",
+    "override",
+
+    "package",
+    "pragma",
+    "private",
+    "protected",
+    "public",
+    "pure",
+
+    "real",
+    "ref",
+    "return",
+
+    "scope",
+    "shared",
+    "short",
+    "static",
+    "struct",
+    "super",
+    "switch",
+    "synchronized",
+
+    "template",
+    "this",
+    "throw",
+    "true",
+    "try",
+    "typedef",
+    "typeid",
+    "typeof",
+
+    "ubyte",
+    "ucent",
+    "uint",
+    "ulong",
+    "union",
+    "unittest",
+    "ushort",
+
+    "version",
+    "void",
+    "volatile",
+
+    "wchar",
+    "while",
+    "with",
+
+    "__FILE__",
+    "__MODULE__",
+    "__LINE__",
+    "__FUNCTION__",
+    "__PRETTY_FUNCTION__",
+
+    //Special Token Replaced with
+    "__DATE__", // string literal of the date of compilation "mmm dd yyyy"
+    "__EOF__", // sets the scanner to the end of the file
+    "__TIME__", // string literal of the time of compilation "hh:mm:ss"
+    "__TIMESTAMP__", // string literal of the date and time of compilation "www mmm dd hh:mm:ss yyyy"
+    "__VENDOR__", // Compiler vendor string, such as "Digital Mars D"
+    "__VERSION__", // Compiler version as an integer, such as 2001
+
+
+    "__gshared",
+    "__traits",
+    "__vector",
+    "__parameters"
+];
+
+/// Returns the source spelling of keyword (the empty string for Keyword.NONE).
+public dstring getKeywordNameD(Keyword keyword) pure nothrow {
+    return KEYWORD_STRINGS[keyword];
+};
+
+/**
+ * Looks for a keyword among KEYWORD_STRINGS[start .. end] (inclusive).
+ *
+ * Only the characters after the first one of each candidate are compared:
+ * s[1 .. $] is matched against name[0 .. s.length - 1], and s[0] is never
+ * looked at. Presumably the caller has already consumed and matched the first
+ * character and picked start/end accordingly - confirm against the caller.
+ *
+ * Params:
+ *   start = first keyword candidate
+ *   end   = last keyword candidate (inclusive)
+ *   name  = input text following the already consumed first character
+ *   len   = number of readable characters at name
+ *   pos   = advanced by the number of matched characters (s.length - 1) on success
+ *
+ * Returns: the matched keyword, or Keyword.NONE when no candidate matches as a
+ *          whole word; pos is left untouched in that case.
+ */
+public Keyword findKeyword(Keyword start, Keyword end, dchar * name, uint len, ref uint pos) pure nothrow {
+    for (Keyword i = start; i <= end; i++) {
+        dstring s = KEYWORD_STRINGS[i];
+        if (s.length == 0)
+            continue; // Keyword.NONE has no spelling; s.length - 1 would wrap around
+        if (s.length > len + 1)
+            continue; // too long
+        bool found = true;
+        for (uint j = 1; j < s.length; j++) {
+            if (s[j] != name[j - 1]) {
+                found = false;
+                break;
+            }
+        }
+        if (found) {
+            // The keyword tail occupies name[0 .. s.length - 1]. It is a whole word when
+            // it uses up all len characters (s.length == len + 1), or when the character
+            // right after it cannot continue an identifier. Testing the length first keeps
+            // name[s.length - 1] from being read past the end of the input.
+            if (s.length == len + 1 || !isIdentMiddleChar(name[s.length - 1])) {
+                pos += s.length - 1;
+                return i;
+            }
+        }
+    }
+    return Keyword.NONE;
+}
+
+/**
+ * Token.
+ */
+class Token {
+    // Where the token was found; _file stays null when the token is built from a type only.
+    protected SourceFile _file;
+    protected uint _line;
+    protected uint _pos;
+    protected TokenType _type;
+
+    // Location and kind accessors.
+    public @property {
+        TokenType type() { return _type; }
+        /// Dereferences _file, so it must have been set before this is called.
+        string filename() { return _file.filename; }
+        uint line() { return _line; }
+        uint pos() { return _pos; }
+    }
+
+    // Neutral defaults for the payload accessors; this class gives every one of
+    // them a "no value" result.
+    public @property {
+        dchar[] text() { return null; }
+        dchar literalType() { return 0; }
+        ulong intValue() { return 0; }
+        bool isUnsigned() { return false; }
+        // NOTE(review): declared as ulong although it reads like a flag - kept as is,
+        // overriders outside this class may depend on the declared type.
+        ulong isLong() { return false; }
+        real realValue() { return 0; }
+        double doubleValue() { return 0; }
+        float floatValue() { return 0; }
+        byte precision() { return 0; }
+        bool isImaginary() { return false; }
+        OpCode opCode() { return OpCode.NONE; }
+        Keyword keyword() { return Keyword.NONE; }
+    }
+
+    /// Creates a token of the given type without a source location.
+    this(TokenType type) {
+        _type = type;
+    }
+
+    /// Creates a token of the given type; line and pos are stored exactly as passed.
+    this(TokenType type, SourceFile file, uint line, uint pos) {
+        this(type);
+        _file = file;
+        _line = line;
+        _pos = pos;
+    }
+
+    /// Sets file, line and position; the stored position is pos + 1.
+    void setPos(SourceFile file, uint line, uint pos) {
+        _file = file;
+        _line = line;
+        _pos = 1 + pos;
+    }
+
+    /// Replaces the source file only.
+    void setFile(SourceFile file) {
+        _file = file;
+    }
+
+    /// Sets line and position; the stored position is pos + 1.
+    void setPos(uint line, uint pos) {
+        _line = line;
+        _pos = 1 + pos;
+    }
+
+    /// Returns a new token object of the same concrete class.
+    public abstract Token clone();
+
+    /// Debug dump: line:pos type opCode keyword "text".
+    public override @property string toString() {
+        string location = to!string(_line) ~ ":" ~ to!string(_pos);
+        string kinds = to!string(type) ~ " " ~ to!string(opCode) ~ " " ~ to!string(keyword);
+        string quoted = " \"" ~ toUTF8(text()) ~ "\"";
+        return location ~ " " ~ kinds ~ quoted;
+    }
+}
+
+/// Token of type TokenType.EOF.
+class EofToken : Token {
+    this() {
+        super(TokenType.EOF);
+    }
+
+    this(SourceFile file, uint line, uint pos) {
+        super(TokenType.EOF, file, line, pos);
+    }
+
+    /// Returns a new EofToken with the same file, line and position.
+    public override Token clone() {
+        return new EofToken(_file, _line, _pos);
+    }
+
+    public override @property string toString() {
+        return "EOF";
+    }
+}
+
+// treat as white space
//class EolToken : Token {
//    this(string file, uint line, uint pos) {
//        super(TokenType.EOL, file, line, pos);
//    }
//}

/** Token for a run of white space; it carries no payload beyond its location. */
class WhiteSpaceToken : Token {
    this() {
        super(TokenType.WHITESPACE);
    }
    this(SourceFile file, uint line, uint pos) {
        super(TokenType.WHITESPACE, file, line, pos);
    }
    override public Token clone() {
        return new WhiteSpaceToken(_file, _line, _pos);
    }
    public override @property string toString() {
        return "WhiteSpace";
    }
}

/** Operator token; the operator is identified by an OpCode. */
class OpToken : Token {
    OpCode _op;
    public @property override OpCode opCode() { return _op; }
    public @property void opCode(OpCode op) { _op = op; }
    /// Operator spelling as returned by getOpNameD for the current op code.
    public @property override dchar[] text() { return cast(dchar[])getOpNameD(_op); }
    this() {
        super(TokenType.OP);
    }
    this(SourceFile file, uint line, uint pos) {
        super(TokenType.OP, file, line, pos);
    }
    /// Returns a copy with the same location AND the same op code.
    override public Token clone() {
        OpToken copy = new OpToken(_file, _line, _pos);
        // the constructor has no op code parameter, so the payload must be copied explicitly;
        // without this the clone would report the default op code
        copy._op = _op;
        return copy;
    }
    public override @property string toString() {
        return "Op:" ~ to!string(_op);
    }
}

/** Keyword token; the keyword is identified by a Keyword code. */
class KeywordToken : Token {
    Keyword _keyword;
    public @property override Keyword keyword() { return _keyword; }
    public @property void keyword(Keyword keyword) { _keyword = keyword; }
    /// Keyword spelling as returned by getKeywordNameD for the current keyword code.
    public @property override dchar[] text() { return cast(dchar[])getKeywordNameD(_keyword); }
    this() {
        super(TokenType.KEYWORD);
    }
    this(SourceFile file, uint line, uint pos) {
        super(TokenType.KEYWORD, file, line, pos);
    }
    /// Returns a copy with the same location AND the same keyword code.
    override public Token clone() {
        KeywordToken copy = new KeywordToken(_file, _line, _pos);
        // same reasoning as OpToken.clone(): the payload is not a constructor argument
        copy._keyword = _keyword;
        return copy;
    }
    public override @property string toString() {
        return "Keyword:" ~ to!string(_keyword);
    }
}

// do we need comment text?
/** Comment token; optionally carries the comment text. */
class CommentToken : Token {
    dchar[] _text;
    public @property override dchar[] text() { return _text; }
    public @property void text(dchar[] text) { _text = text; }
    this() {
        super(TokenType.COMMENT);
    }
    this(SourceFile file, uint line, uint pos, dchar[] text) {
        super(TokenType.COMMENT, file, line, pos);
        _text = text;
    }
    /// Returns a copy that owns its own text.
    override public Token clone() {
        // The tokenizer assigns slices of its reusable comment buffer / current line to the
        // shared comment token, so a clone that is meant to outlive the next nextToken() call
        // needs a private copy (StringLiteralToken.clone() does the same).
        return new CommentToken(_file, _line, _pos, _text.dup);
    }
    public override @property string toString() {
        return "Comment:" ~ to!string(_text);
    }
}

alias tokenizer_ident_t = uint;
alias tokenizer_ident_name_t = dchar[];

enum : tokenizer_ident_t {
    NO_IDENT = 0
}

/**
 * Global storage for identifier strings.
 *
 * Maps every distinct identifier name to a numeric id (starting at NO_IDENT + 1) and back.
 */
class IdentHolder {
    protected tokenizer_ident_t _nextId;
    protected tokenizer_ident_name_t[tokenizer_ident_t] _idToName;
    protected tokenizer_ident_t[tokenizer_ident_name_t] _nameToId;

    public this() {
        _nextId = NO_IDENT + 1;
    }

    /**
     * Search for id by name, return NO_IDENT if not found.
     */
    tokenizer_ident_t findByName(tokenizer_ident_name_t name) {
        tokenizer_ident_t * found = (name in _nameToId);
        if (found)
            return *found;
        return NO_IDENT;
    }

    /**
     * Search for name by id, return null if not found.
     */
    tokenizer_ident_name_t nameById(tokenizer_ident_t id) {
        auto found = (id in _idToName);
        if (found)
            return *found;
        return null;
    }

    /**
     * Search for ident id by name, create new entry if not found.
     */
    tokenizer_ident_t idByName(tokenizer_ident_name_t name) {
        tokenizer_ident_t * found = (name in _nameToId);
        if (found)
            return *found;
        tokenizer_ident_t newid = _nextId++;
        // Store private copies: callers pass mutable slices (the tokenizer passes a slice of
        // its current line), and a hash key must never change after insertion. Casting the
        // caller's slice to dstring would only pretend it is immutable.
        dstring key = name.idup;
        _nameToId[key] = newid;
        _idToName[newid] = key.dup;
        return newid;
    }
}

/**
* Thread local storage for IDs.
*/
IdentHolder identMap;

static this() {
    // init ID storage
    identMap = new IdentHolder();
}

/** String literal token: literal text plus optional type suffix character (0 when absent). */
class StringLiteralToken : Token {
    dchar[] _text;
    dchar _literalType;
    public @property override dchar literalType() { return _literalType; }
    public @property override dchar[] text() { return _text; }
    public void setText(dchar[] text, dchar type) { _text = text; _literalType = type; }
    this() {
        super(TokenType.STRING);
    }
    this(SourceFile file, uint line, uint pos, dchar[] text, dchar type) {
        super(TokenType.STRING, file, line, pos);
        _text = text;
        _literalType = type;
    }
    /// Returns a copy owning its own text buffer.
    override public Token clone() {
        return new StringLiteralToken(_file, _line, _pos, _text.dup, _literalType);
    }
    public override @property string toString() {
        return "String:" ~ to!string(_text);
    }
}

/** Integer literal token: value plus unsigned / long suffix flags. */
class IntegerLiteralToken : Token {
    ulong _value;
    bool _unsigned;
    bool _long;
    public @property override ulong intValue() { return _value; }
    public @property override bool isUnsigned() { return _unsigned; }
    // return type must match Token.isLong(), which is declared as ulong
    public @property override ulong isLong() { return _long; }
    public @property override dchar[] text() { return cast(dchar[])to!dstring(_value); }
    /// Sets the value; both flags are reset unless passed explicitly.
    public void setValue(ulong value, bool unsignedFlag = false, bool longFlag = false) {
        _value = value;
        _unsigned = unsignedFlag;
        _long = longFlag;
    }
    /// Sets the suffix flags, leaving the value untouched.
    public void setFlags(bool unsignedFlag = false, bool longFlag = false) {
        _unsigned = unsignedFlag;
        _long = longFlag;
    }
    this() {
        super(TokenType.INTEGER);
    }
    this(SourceFile file, uint line, uint pos, ulong value, bool unsignedFlag, bool longFlag) {
        super(TokenType.INTEGER, file, line, pos);
        _value = value;
        _unsigned = unsignedFlag;
        _long = longFlag;
    }
    override public Token clone() {
        return new IntegerLiteralToken(_file, _line, _pos, _value, _unsigned, _long);
    }
    public override @property string toString() {
        return "Integer:" ~ to!string(_value) ~ (_long ? "L" : "") ~ (_unsigned ? "U" : "");
    }
}

/**
 * Floating point literal token.
 * toString() prints precision 0 as suffix "f" and precision 2 as suffix "L"; the default is 1.
 */
class RealLiteralToken : Token {
    real _value;
    byte _precision;
    bool _imaginary;
    public @property override ulong intValue() { return to!long(_value); }
    public @property override real realValue() { return _value; }
    public @property override double doubleValue() { return cast(double)_value; }
    public @property override float floatValue() { return cast(float)_value; }
    public @property override byte precision() { return _precision; }
    public @property override bool isImaginary() { return _imaginary; }
    public @property override dchar[] text() { return cast(dchar[])to!dstring(_value); }
    /// Sets the value; precision and imaginary flag are reset unless passed explicitly.
    public void setValue(real value, byte precision = 1, bool imaginary = false) {
        _value = value;
        _precision = precision;
        _imaginary = imaginary;
    }
    /// Sets precision and imaginary flag, leaving the value untouched.
    public void setFlags(byte precision = 1, bool imaginary = false) {
        _precision = precision;
        _imaginary = imaginary;
    }
    this() {
        super(TokenType.FLOAT);
    }
    this(SourceFile file, uint line, uint pos, real value, byte precision, bool imaginary) {
        super(TokenType.FLOAT, file, line, pos);
        _value = value;
        _precision = precision;
        _imaginary = imaginary;
    }
    override public Token clone() {
        return new RealLiteralToken(_file, _line, _pos, _value, _precision, _imaginary);
    }
    public override @property string toString() {
        // was labelled "Integer:", which made real and integer tokens indistinguishable in dumps
        return "Real:" ~ to!string(_value) ~ (_precision == 0 ? "f" : (_precision == 2 ? "L" : "")) ~ (_imaginary ? "i" : "");
    }
}

/** Identifier token; stores only the id of the name registered in identMap. */
class IdentToken : Token {
    tokenizer_ident_t _id;
    public @property override dchar[] text() { return identMap.nameById(_id); }
    public void setText(dchar[] text) { _id = identMap.idByName(text); }
    this() {
        super(TokenType.IDENTIFIER);
    }
    this(SourceFile file, uint line, uint pos, dchar[] text) {
        super(TokenType.IDENTIFIER, file, line, pos);
        _id = identMap.idByName(text);
    }
    this(SourceFile file, uint line, uint pos, tokenizer_ident_t id) {
        super(TokenType.IDENTIFIER, file, line, pos);
        _id = id;
    }
    override public Token clone() {
        return new IdentToken(_file, _line, _pos, _id);
    }
    public override @property string toString() {
        return "Ident:" ~ to!string(text);
    }
}

// shared appender buffer, to avoid extra heap allocations
struct StringAppender {
    dchar[] buf;
    uint len;
    /// Returns the accumulated text as a slice of the internal buffer (valid until reset/append).
    dchar[] get() {
        return buf[0 .. len];
    }
    // grows the buffer (at least doubling, minimum 128 chars) so that `extra` more chars fit
    private void ensureCapacity(size_t extra) {
        if (len + extra > buf.length) {
            uint newsize = cast(uint)((len + extra + buf.length) * 2);
            if (newsize < 128)
                newsize = 128;
            buf.length = newsize;
        }
    }
    /// Appends a single '\n'.
    void appendEol() {
        ensureCapacity(1);
        buf[len] = '\n';
        len++;
    }
    /// Appends all characters of s.
    void append(dchar[] s) {
        if (s.length == 0)
            return;
        ensureCapacity(s.length);
        buf[len .. len + s.length] = s;
        len += s.length;
    }
    /// Forgets the accumulated text but keeps the allocated buffer.
    void reset() {
        len = 0;
    }
}

/**
 * Splits source text, read line by line from a SourceLines stream, into tokens.
 *
 * nextToken() returns shared token instances that are overwritten by later calls;
 * clone() a token to keep it.
 */
class Tokenizer
{
    SourceLines _lineStream;
    dchar[] _lineText;
    uint _line; // current line number
    uint _len;  // current line length
    uint _pos;  // current line read position
    uint _state; // tokenizer state

    enum : int {
        EOF_CHAR = 0x001A,
        EOL_CHAR = 0x000A
    };

    WhiteSpaceToken _sharedWhiteSpaceToken = new WhiteSpaceToken();
    CommentToken _sharedCommentToken = new CommentToken();
    StringLiteralToken _sharedStringLiteralToken = new StringLiteralToken();
    IdentToken _sharedIdentToken = new IdentToken();
    OpToken _sharedOpToken = new OpToken();
    KeywordToken _sharedKeywordToken = new KeywordToken();
    IntegerLiteralToken _sharedIntegerToken = new IntegerLiteralToken();
    RealLiteralToken _sharedRealToken = new RealLiteralToken();
    StringAppender _stringLiteralAppender;
    StringAppender _commentAppender;
    StringAppender _identAppender;

    bool _enableCommentText = true;
    /// When disabled, comment tokens are produced without collecting their text.
    public void enableCommentText(bool enabled) {
        _enableCommentText = enabled;
    }

    this(SourceLines lineStream) {
        init(lineStream);
    }

    /// (Re)starts tokenizing from the given line stream.
    void init(SourceLines lineStream) {
        _lineStream = lineStream;
        _sharedWhiteSpaceToken.setFile(_lineStream.file);
        _sharedCommentToken.setFile(_lineStream.file);
        _sharedStringLiteralToken.setFile(_lineStream.file);
        _sharedIdentToken.setFile(_lineStream.file);
        _sharedOpToken.setFile(_lineStream.file);
        _sharedKeywordToken.setFile(_lineStream.file);
        _sharedIntegerToken.setFile(_lineStream.file);
        _sharedRealToken.setFile(_lineStream.file);
        buildTime = Clock.currTime();
        _line = lineStream.line;
        _pos = 0;
        _lineText = null;
    }

    this(string code, string filename = "") {
        this(new ArraySourceLines(code, filename));
    }

    // fetch next line from source stream
    // returns false when no more lines are available; throws SourceEncodingException
    // if the stream reports an error
    bool nextLine() {
        _lineText = _lineStream.readLine();
        if (_lineText is null) {
            if (_lineStream.errorCode != 0)
                throw new SourceEncodingException(_lineStream.errorMessage, _lineStream.file.filename, _lineStream.errorLine, _lineStream.errorPos);
            _pos = 0;
            _len = 0;
            return false;
        }
        _line = _lineStream.line;
        _pos = 0;
        _len = cast(uint)_lineText.length; // do not support lines longer than 4Gb
        return true;
    }

    /// Consumes and returns the next character; EOL_CHAR at a line end, EOF_CHAR at end of input.
    dchar nextChar() {
        if (_lineText is null) {
            if (!nextLine()) {
                return EOF_CHAR;
            }
        }
        // also taken right after fetching an empty first line, which previously
        // fell through to _lineText[0] on a zero-length line
        if (_pos >= _len) {
            if (!nextLine()) {
                return EOF_CHAR;
            }
            return EOL_CHAR;
        }
        return _lineText[_pos++];
    }

    /// Returns the next character WITHOUT consuming it.
    dchar peekChar() {
        if (_lineText is null) {
            if (!nextLine()) {
                return EOF_CHAR;
            }
        }
        if (_pos >= _len)
            return EOL_CHAR;
        return _lineText[_pos]; // was _pos++, which made peeking consume the character
    }

    Token emitEof() {
        // TODO: check for current state
        return new EofToken(_lineStream.file, _line, _pos);
    }

    /// Skips blanks (space, tab, VT, FF) across line ends and returns the shared whitespace token.
    Token processWhiteSpace(dchar firstChar) {
        uint line = _line;
        uint pos = _pos - 1;
        for (;;) {
            uint i = _pos;
            for (; i < _len; i++) {
                dchar ch = _lineText[i];
                if (!(ch == 0x0020 || ch == 0x0009 || ch == 0x000B || ch == 0x000C))
                    break;
            }
            _pos = i;
            if (_pos < _len)
                break;
            // go to next line
            if (!nextLine())
                break;
        }
        // reuse the same token instance, to avoid extra heap spamming
        _sharedWhiteSpaceToken.setPos(line, pos);
        return _sharedWhiteSpaceToken;
    }

    // Comment //
    Token processOneLineComment() {
        _sharedCommentToken.setPos(_line, _pos - 1);
        if (_enableCommentText) {
            _sharedCommentToken.text = _lineText[_pos + 1 .. $];
        }
        _pos = _len;
        return _sharedCommentToken;
    }

    // Comment /* */
    // On entry _pos is at the '*' of the opener. Lines of the comment text are joined with '\n'.
    Token processMultilineComment() {
        _sharedCommentToken.setPos(_line, _pos - 1);
        _commentAppender.reset();
        uint textStart = _pos + 1;
        for (;;) {
            uint textEnd = uint.max;
            // "i + 1 < _len", not "i < _len - 1": _len is unsigned and is 0 on an empty line
            for (uint i = textStart; i + 1 < _len; i++) {
                if (_lineText[i] == '*' && _lineText[i + 1] == '/') {
                    textEnd = i;
                    break;
                }
            }
            if (textEnd != uint.max) {
                if (_enableCommentText)
                    _commentAppender.append(_lineText[textStart .. textEnd]);
                _pos = textEnd + 2;
                break;
            }
            // terminator is not on this line: keep the rest of the line before it is replaced
            if (_enableCommentText)
                _commentAppender.append(_lineText[textStart .. $]);
            if (!nextLine()) {
                // TODO: do we need throw exception if comment not closed by end of file?
                _pos = _len;
                break;
            }
            if (_enableCommentText)
                _commentAppender.appendEol();
            textStart = 0;
        }
        if (_enableCommentText) {
            _sharedCommentToken.text = _commentAppender.get();
        }
        return _sharedCommentToken;
    }

    // Comment /+ +/ (may be nested)
    // On entry _pos is at the '+' of the opener. Lines of the comment text are joined with '\n'.
    Token processNestedComment() {
        _sharedCommentToken.setPos(_line, _pos - 1);
        _commentAppender.reset();
        uint textStart = _pos + 1;
        int level = 1;
        for (;;) {
            uint textEnd = uint.max;
            // see processMultilineComment() for the form of the loop condition
            for (uint i = textStart; i + 1 < _len; i++) {
                if (_lineText[i] == '/' && _lineText[i + 1] == '+') {
                    level++;
                    i++;
                } else if (_lineText[i] == '+' && _lineText[i + 1] == '/') {
                    if (--level == 0) {
                        textEnd = i;
                        break;
                    }
                    i++; // skip the '/' too, so "+/+" is not re-read as a new opener
                }
            }
            if (textEnd != uint.max) {
                if (_enableCommentText)
                    _commentAppender.append(_lineText[textStart .. textEnd]);
                _pos = textEnd + 2;
                break;
            }
            // terminator is not on this line: keep the rest of the line before it is replaced
            if (_enableCommentText)
                _commentAppender.append(_lineText[textStart .. $]);
            if (!nextLine()) {
                // TODO: do we need throw exception if comment not closed by end of file?
                _pos = _len;
                break;
            }
            if (_enableCommentText)
                _commentAppender.appendEol();
            textStart = 0;
        }
        if (_enableCommentText) {
            _sharedCommentToken.text = _commentAppender.get();
        }
        return _sharedCommentToken;
    }

    // x"hex string" - not implemented yet, returns null
    Token processHexString() {
        _pos++;
        // TODO:
        return null;
    }

    // q"delimited string" - not implemented yet, returns null
    Token processDelimitedString() {
        _pos++;
        // TODO:
        return null;
    }

    // r"string" or `string` - not implemented here, returns null
    Token processWysiwygString(dchar ch) {
        _pos++;
        // TODO:
        return null;
    }

    /// Reads an identifier whose first character has already been consumed.
    Token processIdent() {
        _sharedIdentToken.setPos(_line, _pos - 1);
        _identAppender.reset();
        uint startPos = _pos - 1;
        uint endPos = _len;
        for (uint i = _pos; i < _len; i++) {
            dchar ch = _lineText[i];
            if (!isIdentMiddleChar(ch)) {
                endPos = i;
                break;
            }
        }
        _pos = endPos;
        _sharedIdentToken.setText(_lineText[startPos .. endPos]);
        return _sharedIdentToken;
    }

    /// Consumes an optional L / U / LU / UL suffix (any case) and stores the flags in the shared integer token.
    Token processIntegerSuffix() {
        if (_pos >= _len)
            return _sharedIntegerToken;
        bool longFlag = false;
        bool unsignedFlag = false;
        dchar ch = _lineText[_pos];
        dchar ch2 = _pos + 1 < _len ? _lineText[_pos + 1] : 0;
        if (ch == 'l' || ch == 'L') {
            longFlag = true;
            _pos++;
            if (ch2 == 'u' || ch2 == 'U') {
                unsignedFlag = true;
                _pos++;
            }
        } else if (ch == 'u' || ch == 'U') {
            unsignedFlag = true;
            _pos++;
            if (ch2 == 'l' || ch2 == 'L') {
                longFlag = true;
                _pos++;
            }
        }
        _sharedIntegerToken.setFlags(unsignedFlag, longFlag);
        ch = _pos < _len ? _lineText[_pos] : 0;
        if (isIdentMiddleChar(ch))
            parserError("Unexpected character after number");
        return _sharedIntegerToken;
    }

    // 0b... ; on entry '0' is consumed and _pos is at 'b'
    Token processBinaryNumber() {
        _sharedIntegerToken.setPos(_line, _pos - 1);
        _pos++;
        if (_pos >= _len)
            parserError("Unexpected end of line in binary number");
        int digits = 0;
        ulong number = 0;
        uint i = _pos;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            if (ch == '_')
                continue; // digit separator, accepted like in the hex/octal/decimal readers
            if (ch != '0' && ch != '1')
                break;
            number = (number << 1) | (ch == '1' ? 1 : 0);
            digits++;
        }
        _pos = i;
        if (digits > 64)
            parserError("number is too big");
        _sharedIntegerToken.setValue(number);
        return processIntegerSuffix();
    }

    // 0x... ; on entry '0' is consumed and _pos is at 'x'
    Token processHexNumber() {
        _sharedIntegerToken.setPos(_line, _pos - 1);
        _sharedRealToken.setPos(_line, _pos - 1);
        _pos++;
        if (_pos >= _len)
            parserError("Unexpected end of line in hex number");
        int digits = 0;
        ulong number = 0;
        uint i = _pos;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            uint digit = 0;
            if (ch >= '0' && ch <= '9')
                digit = ch - '0';
            else if (ch >= 'a' && ch <= 'f')
                digit = ch - 'a' + 10;
            else if (ch >= 'A' && ch <= 'F')
                digit = ch - 'A' + 10;
            else if (ch == '_')
                continue;
            else
                break;
            number = (number << 4) | digit;
            digits++;
        }
        _pos = i;
        if (digits > 16)
            parserError("number is too big to fit 64 bits");
        _sharedIntegerToken.setValue(number);
        return processIntegerSuffix();
    }

    // 0NNN ; on entry '0' is consumed and _pos is at the first following digit
    Token processOctNumber() {
        _sharedIntegerToken.setPos(_line, _pos - 1);
        if (_pos >= _len)
            parserError("Unexpected end of line in octal number");
        ulong number = 0;
        uint i = _pos;
        bool overflow = false;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            uint digit = 0;
            if (ch >= '0' && ch <= '7')
                digit = ch - '0';
            else if (ch == '_')
                continue;
            else
                break;
            // test BEFORE shifting: once the top three bits are shifted out they cannot be detected
            if ((number >> 61) != 0) {
                overflow = true;
                break;
            }
            number = (number << 3) | digit;
        }
        _pos = i;
        if (overflow)
            parserError("number is too big to fit 64 bits");
        _sharedIntegerToken.setValue(number);
        return processIntegerSuffix();
    }

    // stores the value in the shared real token; suffix parsing is not implemented yet
    Token processDecFloatSuffix(real value) {
        _sharedRealToken.setValue(value);
        // TODO
        return _sharedRealToken;
    }

    // after E char
    Token processDecFloatExponent(real value) {
        dchar next = _pos < _len ? _lineText[_pos] : 0;
        int sign = 1;
        if (next == '+') {
            _pos++;
        } else if (next == '-') {
            _pos++;
            sign = -1;
        }
        if (_pos >= _len)
            parserError("Invalid exponent");
        enum ulong LIMIT = long.max; // the exponent is cast to long below
        ulong digits = 0;
        ulong number = 0;
        uint i = _pos;
        bool overflow = false;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            uint digit = 0;
            if (ch >= '0' && ch <= '9')
                digit = ch - '0';
            else if (ch == '_')
                continue;
            else
                break;
            if (number > (LIMIT - digit) / 10) {
                overflow = true;
                break;
            }
            number = number * 10 + digit;
            digits++;
        }
        if (digits == 0)
            parserError("Invalid exponent");
        if (overflow)
            parserError("Exponent is too big");
        _pos = i;
        value *= pow(10., cast(long)number * sign);
        return processDecFloatSuffix(value);
    }

    // fractional part; on entry _pos is just after the decimal point
    Token processDecFloatSecondPart(ulong firstPart) {
        if (_pos >= _len) {
            _sharedRealToken.setValue(cast(real)firstPart);
            return _sharedRealToken;
        }
        ulong divider = 1;
        ulong number = 0;
        uint i = _pos;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            uint digit = 0;
            if (ch >= '0' && ch <= '9')
                digit = ch - '0';
            else if (ch == '_')
                continue;
            else
                break;
            if (divider * 10 < divider)
                continue; // ignore extra digits
            number *= 10;
            number += digit;
            divider *= 10;
        }
        _pos = i;
        real value = cast(real)firstPart + (cast(real)number / divider);
        dchar next = _pos < _len ? _lineText[_pos] : 0;
        if (next == 0) {
            // neither exponent nor suffix
            _sharedRealToken.setValue(value);
            return _sharedRealToken;
        }
        if (next == 'e' || next == 'E') {
            _pos++;
            return processDecFloatExponent(value);
        }
        return processDecFloatSuffix(value);
    }

    // decimal integer or float; on entry the first digit is already consumed
    Token processDecNumber(dchar c) {
        _pos--;
        _sharedIntegerToken.setPos(_line, _pos);
        _sharedRealToken.setPos(_line, _pos);
        if (_pos >= _len)
            parserError("Unexpected end of line in number");
        ulong number = 0;
        uint i = _pos;
        bool overflow = false;
        for (;i < _len; i++) {
            dchar ch = _lineText[i];
            uint digit = 0;
            if (ch >= '0' && ch <= '9')
                digit = ch - '0';
            else if (ch == '_')
                continue;
            else
                break;
            // exact test: number * 10 + digit must not exceed ulong.max
            if (number > (ulong.max - digit) / 10) {
                overflow = true;
                break;
            }
            number = number * 10 + digit;
        }
        _pos = i;
        if (overflow)
            parserError("number is too big to fit 64 bits");
        _sharedIntegerToken.setValue(number);
        dchar next = _pos < _len ? _lineText[_pos] : 0;
        if (next == 0)
            return _sharedIntegerToken;
        if (next == '.') {
            dchar afterDot = _pos + 1 < _len ? _lineText[_pos + 1] : 0;
            if (afterDot == '.')
                return _sharedIntegerToken; // "1..2": the dots are a range operator, not a decimal point
            _pos++;
            return processDecFloatSecondPart(number);
        }
        return processIntegerSuffix();
    }

    /// Throws ParserException for the current file / line / position.
    void parserError(string msg) {
        throw new ParserException(msg, _lineStream.file.filename, _line, _pos);
    }

    // Looks up a keyword starting with ch (already consumed) followed by the text at _pos.
    // Each case restricts the search to the Keyword range with that first letter.
    // NOTE(review): _pos is passed as the last argument and nextToken() does not advance it
    // afterwards, so findKeyword presumably takes it by ref and skips the keyword - confirm.
    Keyword detectKeyword(dchar ch) {
        if (ch > 'z')
            return Keyword.NONE;
        uint len = _len - _pos;
        switch (cast(ubyte)ch) {
            // ABSTRACT, ALIAS, ALIGN, ASM, ASSERT, AUTO,
            case 'a': return findKeyword(Keyword.ABSTRACT, Keyword.AUTO, _lineText.ptr + _pos, len, _pos);

            // BODY, BOOL, BREAK, BYTE,
            case 'b': return findKeyword(Keyword.BODY, Keyword.BYTE, _lineText.ptr + _pos, len, _pos);

            // CASE, CAST, CATCH, CDOUBLE, CENT, CFLOAT, CHAR, CLASS, CONST, CONTINUE, CREAL,
            case 'c': return findKeyword(Keyword.CASE, Keyword.CREAL, _lineText.ptr + _pos, len, _pos);

            // DCHAR, DEBUG, DEFAULT, DELEGATE, DELETE, DEPRECATED, DO, DOUBLE,
            case 'd': return findKeyword(Keyword.DCHAR, Keyword.DOUBLE, _lineText.ptr + _pos, len, _pos);

            // ELSE, ENUM, EXPORT, EXTERN,
            case 'e': return findKeyword(Keyword.ELSE, Keyword.EXTERN, _lineText.ptr + _pos, len, _pos);

            // FALSE, FINAL, FINALLY, FLOAT, FOR, FOREACH, FOREACH_REVERSE, FUNCTION,
            case 'f': return findKeyword(Keyword.FALSE, Keyword.FUNCTION, _lineText.ptr + _pos, len, _pos);

            // GOTO,
            case 'g': return findKeyword(Keyword.GOTO, Keyword.GOTO, _lineText.ptr + _pos, len, _pos);

            // IDOUBLE, IF, IFLOAT, IMMUTABLE, IMPORT, IN, INOUT, INT, INTERFACE, INVARIANT, IREAL, IS,
            case 'i': return findKeyword(Keyword.IDOUBLE, Keyword.IS, _lineText.ptr + _pos, len, _pos);

            // LAZY, LONG,
            case 'l': return findKeyword(Keyword.LAZY, Keyword.LONG, _lineText.ptr + _pos, len, _pos);

            // MACRO, MIXIN, MODULE,
            case 'm': return findKeyword(Keyword.MACRO, Keyword.MODULE, _lineText.ptr + _pos, len, _pos);

            // NEW, NOTHROW, NULL,
            case 'n': return findKeyword(Keyword.NEW, Keyword.NULL, _lineText.ptr + _pos, len, _pos);

            // OUT, OVERRIDE,
            case 'o': return findKeyword(Keyword.OUT, Keyword.OVERRIDE, _lineText.ptr + _pos, len, _pos);

            // PACKAGE, PRAGMA, PRIVATE, PROTECTED, PUBLIC, PURE,
            case 'p': return findKeyword(Keyword.PACKAGE, Keyword.PURE, _lineText.ptr + _pos, len, _pos);

            // REAL, REF, RETURN,
            case 'r': return findKeyword(Keyword.REAL, Keyword.RETURN, _lineText.ptr + _pos, len, _pos);

            // SCOPE, SHARED, SHORT, STATIC, STRUCT, SUPER, SWITCH, SYNCHRONIZED,
            case 's': return findKeyword(Keyword.SCOPE, Keyword.SYNCHRONIZED, _lineText.ptr + _pos, len, _pos);

            // TEMPLATE, THIS, THROW, TRUE, TRY, TYPEDEF, TYPEID, TYPEOF,
            case 't': return findKeyword(Keyword.TEMPLATE, Keyword.TYPEOF, _lineText.ptr + _pos, len, _pos);

            // UBYTE, UCENT, UINT, ULONG, UNION, UNITTEST, USHORT,
            case 'u': return findKeyword(Keyword.UBYTE, Keyword.USHORT, _lineText.ptr + _pos, len, _pos);

            // VERSION, VOID, VOLATILE,
            case 'v': return findKeyword(Keyword.VERSION, Keyword.VOLATILE, _lineText.ptr + _pos, len, _pos);

            // WCHAR, WHILE, WITH,
            case 'w': return findKeyword(Keyword.WCHAR, Keyword.WITH, _lineText.ptr + _pos, len, _pos);

            // FILE, MODULE, LINE, FUNCTION, PRETTY_FUNCTION,
            //
            // GSHARED, TRAITS, VECTOR, PARAMETERS,
            case '_': return findKeyword(Keyword.FILE, Keyword.PARAMETERS, _lineText.ptr + _pos, len, _pos);
            default: return Keyword.NONE;
        }
    }

    // Recognizes the longest operator starting with ch (already consumed) and advances _pos
    // past its remaining characters. Returns OpCode.NONE if ch does not start an operator.
    // NOTE(review): "!>" and "!>=" are listed below but not recognized - they come out as
    // NOT followed by GT / GT_EQ. Left as is because the OpCode members are not visible here.
    OpCode detectOp(dchar ch) nothrow {
        if (ch >= 128)
            return OpCode.NONE;
        dchar ch2 = _pos < _len ? _lineText[_pos] : 0;
        dchar ch3 = _pos + 1 < _len ? _lineText[_pos + 1] : 0;
        switch(cast(ubyte)ch) {
            // DIV,        // /
            // DIV_EQ,     // /=
            case '/':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.DIV_EQ;
                }
                return OpCode.DIV;
            // DOT,        // .
            // DOT_DOT,    // ..
            // DOT_DOT_DOT,// ...
            case '.':
                if (ch2 == '.') {
                    if (ch3 == '.') {
                        _pos += 2;
                        return OpCode.DOT_DOT_DOT;
                    }
                    _pos++;
                    return OpCode.DOT_DOT;
                }
                return OpCode.DOT;
            // AND,        // &
            // AND_EQ,     // &=
            // LOG_AND,    // &&
            case '&':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.AND_EQ;
                }
                if (ch2 == '&') {
                    _pos++;
                    return OpCode.LOG_AND;
                }
                return OpCode.AND;
            // OR,         // |
            // OR_EQ,      // |=
            // LOG_OR,     // ||
            case '|':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.OR_EQ;
                }
                if (ch2 == '|') {
                    _pos++;
                    return OpCode.LOG_OR;
                }
                return OpCode.OR;
            // MINUS,      // -
            // MINUS_EQ,   // -=
            // MINUS_MINUS,// --
            case '-':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.MINUS_EQ;
                }
                if (ch2 == '-') {
                    _pos++;
                    return OpCode.MINUS_MINUS;
                }
                return OpCode.MINUS;
            // PLUS,       // +
            // PLUS_EQ,    // +=
            // PLUS_PLUS,  // ++
            case '+':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.PLUS_EQ;
                }
                if (ch2 == '+') {
                    _pos++;
                    return OpCode.PLUS_PLUS;
                }
                return OpCode.PLUS;
            // LT,         // <
            // LT_EQ,      // <=
            // SHL,        // <<
            // SHL_EQ,     // <<=
            // LT_GT,      // <>
            // NE_EQ,      // <>=
            case '<':
                if (ch2 == '<') {
                    if (ch3 == '=') {
                        _pos += 2;
                        return OpCode.SHL_EQ;
                    }
                    _pos++;
                    return OpCode.SHL;
                }
                if (ch2 == '>') {
                    if (ch3 == '=') {
                        _pos += 2;
                        return OpCode.NE_EQ;
                    }
                    _pos++;
                    return OpCode.LT_GT;
                }
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.LT_EQ;
                }
                return OpCode.LT;
            // GT,         // >
            // GT_EQ,      // >=
            // SHR_EQ      // >>=
            // ASR_EQ,     // >>>=
            // SHR,        // >>
            // ASR,        // >>>
            case '>':
                if (ch2 == '>') {
                    if (ch3 == '>') {
                        dchar ch4 = _pos + 2 < _len ? _lineText[_pos + 2] : 0;
                        if (ch4 == '=') { // >>>=
                            _pos += 3;
                            return OpCode.ASR_EQ;
                        }
                        _pos += 2;
                        return OpCode.ASR; // >>>
                    }
                    if (ch3 == '=') { // >>=
                        _pos += 2;
                        return OpCode.SHR_EQ;
                    }
                    _pos++;
                    return OpCode.SHR;
                }
                if (ch2 == '=') { // >=
                    _pos++;
                    return OpCode.GT_EQ;
                }
                // >
                return OpCode.GT;
            // NOT,            // !
            // NOT_EQ          // !=
            // NOT_LT_GT,      // !<>
            // NOT_LT_GT_EQ,   // !<>=
            // NOT_LT,         // !<
            // NOT_LT_EQ,      // !<=
            // NOT_GT,         // !>
            // NOT_GT_EQ,      // !>=
            case '!':
                if (ch2 == '<') { // !<
                    if (ch3 == '>') { // !<>
                        dchar ch4 = _pos + 2 < _len ? _lineText[_pos + 2] : 0;
                        if (ch4 == '=') { // !<>=
                            _pos += 3;
                            return OpCode.NOT_LT_GT_EQ;
                        }
                        _pos += 2;
                        return OpCode.NOT_LT_GT; // !<>
                    }
                    if (ch3 == '=') { // !<=
                        _pos += 2;
                        return OpCode.NOT_LT_EQ;
                    }
                    _pos++;
                    return OpCode.NOT_LT; // !<
                }
                if (ch2 == '=') { // !=
                    _pos++;
                    return OpCode.NOT_EQ;
                }
                return OpCode.NOT;
            // PAR_OPEN,   // (
            case '(':
                return OpCode.PAR_OPEN;
            // PAR_CLOSE,  // )
            case ')':
                return OpCode.PAR_CLOSE;
            // SQ_OPEN,    // [
            case '[':
                return OpCode.SQ_OPEN;
            // SQ_CLOSE,   // ]
            case ']':
                return OpCode.SQ_CLOSE;
            // CURL_OPEN,  // {
            case '{':
                return OpCode.CURL_OPEN;
            // CURL_CLOSE, // }
            case '}':
                return OpCode.CURL_CLOSE;
            // QUEST,      // ?
            case '?':
                return OpCode.QUEST;
            // COMMA,      // ,
            case ',':
                return OpCode.COMMA;
            // SEMICOLON,  // ;
            case ';':
                return OpCode.SEMICOLON;
            // COLON,      // :
            case ':':
                return OpCode.COLON;
            // DOLLAR,     // $
            case '$':
                return OpCode.DOLLAR;
            // EQ,         // =
            // QE_EQ,      // ==
            // EQ_GT,      // =>
            case '=':
                if (ch2 == '=') { // ==
                    _pos++;
                    return OpCode.QE_EQ;
                }
                if (ch2 == '>') { // =>
                    _pos++;
                    return OpCode.EQ_GT;
                }
                return OpCode.EQ;
            // MUL,        // *
            // MUL_EQ,     // *=
            case '*':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.MUL_EQ;
                }
                return OpCode.MUL;
            // MOD,        // %
            // MOD_EQ,     // %=
            case '%':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.MOD_EQ;
                }
                return OpCode.MOD;
            // XOR,        // ^
            // XOR_EQ,     // ^=
            // LOG_XOR,    // ^^
            // LOG_XOR_EQ, // ^^=
            case '^':
                if (ch2 == '^') {
                    if (ch3 == '=') {
                        _pos += 2;
                        return OpCode.LOG_XOR_EQ;
                    }
                    _pos++;
                    return OpCode.LOG_XOR;
                }
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.XOR_EQ;
                }
                return OpCode.XOR;
            // INV,        // ~
            // INV_EQ,     // ~=
            case '~':
                if (ch2 == '=') {
                    _pos++;
                    return OpCode.INV_EQ;
                }
                return OpCode.INV;
            // AT,         // @
            case '@':
                return OpCode.AT;
            // SHARP       // #
            case '#':
                return OpCode.SHARP;
            default:
                return OpCode.NONE;
        }
    }

    // "string", r"string" or `string`
    // delimiter is the character that started the literal: '"', 'r' (then _pos is at the
    // opening quote) or '`'. The literal may span several lines; line ends are stored as '\n'.
    Token processDoubleQuotedOrWysiwygString(dchar delimiter) {
        bool wysiwyg = (delimiter == 'r' || delimiter == '`');
        //writeln("processDoubleQuotedString()");
        _sharedStringLiteralToken.setPos(_line, _pos - 1);
        _stringLiteralAppender.reset();
        if (delimiter == 'r') {
            _pos++;
            delimiter = '\"';
        }
        for (;;) {
            uint endPos = uint.max;
            // Forward scan with an "escaped" flag: a backslash hides the next character, so
            // \" does not close the literal while \\" does. Wysiwyg strings have no escapes.
            bool escaped = false;
            for (uint i = _pos; i < _len; i++) {
                dchar c = _lineText[i];
                if (escaped) {
                    escaped = false;
                    continue;
                }
                if (!wysiwyg && c == '\\') {
                    escaped = true;
                    continue;
                }
                if (c == delimiter) {
                    endPos = i;
                    break;
                }
            }
            if (endPos != uint.max) {
                // found end quote
                _stringLiteralAppender.append(_lineText[_pos .. endPos]);
                _pos = endPos + 1;
                break;
            }
            // no quote by end of line
            _stringLiteralAppender.append(_lineText[_pos .. $]);
            _stringLiteralAppender.appendEol();
            if (!nextLine()) {
                // do we need to throw exception if eof comes before end of string?
                break;
            }
        }
        dchar type = 0;
        if (_pos < _len) {
            dchar ch = _lineText[_pos];
            if (ch == 'c' || ch == 'w' || ch == 'd') {
                type = ch;
                _pos++; // the suffix belongs to the literal; otherwise it is lexed again as an identifier
            } else if (isIdentMiddleChar(ch))
                parserError("Unexpected character after string literal");
        }
        if (!wysiwyg) {
            // TODO: process escape sequences (the text is stored raw for now)
        }
        _sharedStringLiteralToken.setText(_stringLiteralAppender.get(), type);
        return _sharedStringLiteralToken;
    }

    SysTime buildTime;

    // string literal of the date of compilation "mmm dd yyyy"
    dstring formatBuildDate() {
        // TODO: provide proper format
        return to!dstring(buildTime);
    }

    // string literal of the time of compilation "hh:mm:ss"
    dstring formatBuildTime() {
        // TODO: provide proper format
        return to!dstring(buildTime);
    }

    // string literal of the date and time of compilation "www mmm dd hh:mm:ss yyyy"
    dstring formatBuildTimestamp() {
        // TODO: provide proper format
        return to!dstring(buildTime);
    }

    static immutable dstring VERSION = "0.1";
    static immutable dstring VENDOR = "coolreader.org";

    /// Fills the shared string literal token with str (no type suffix) at the given position.
    Token makeSpecialTokenString(dstring str, uint pos) {
        _sharedStringLiteralToken.setPos(_line, pos);
        _sharedStringLiteralToken.setText(cast(dchar[])str, 0);
        return _sharedStringLiteralToken;
    }

    /// Replaces a special keyword (date, time, timestamp, vendor, version) with a string literal token.
    Token processSpecialToken(Keyword keyword, uint pos) {
        switch (keyword) {
            //Special Token    Replaced with
            case Keyword.DATE: // string literal of the date of compilation "mmm dd yyyy"
                return makeSpecialTokenString(formatBuildDate(), pos);
            case Keyword.TIME: // string literal of the time of compilation "hh:mm:ss"
                return makeSpecialTokenString(formatBuildTime(), pos);
            case Keyword.TIMESTAMP: // string literal of the date and time of compilation "www mmm dd hh:mm:ss yyyy"
                return makeSpecialTokenString(formatBuildTimestamp(), pos);
            case Keyword.VENDOR: // Compiler vendor string, such as "Digital Mars D"
                return makeSpecialTokenString(VENDOR, pos);
            case Keyword.VERSION_: // Compiler version as an integer, such as 2001
                return makeSpecialTokenString(VERSION, pos);
            default:
                parserError("Unexpected token");
        }
        return null;
    }

    // returns next token (clone it if you want to store for future usage, otherwise it may be overwritten by further nextToken() calls).
    // Returns null for hex/delimited strings (not implemented yet) and for characters that start no known token.
    public Token nextToken() {
        dchar ch = nextChar();
        if (ch == EOF_CHAR) {
            return emitEof();
        }
        if (ch == EOL_CHAR || ch == 0x0020 || ch == 0x0009 || ch == 0x000B || ch == 0x000C) {
            // white space (treat EOL as whitespace, too)
            return processWhiteSpace(ch);
        }
        dchar next = _pos < _len ? _lineText[_pos] : 0;
        if (ch == '/') {
            if (next == '/')
                return processOneLineComment();
            else if (next == '*')
                return processMultilineComment();
            else if (next == '+')
                return processNestedComment();
        }
        if (ch == '\"')
            return processDoubleQuotedOrWysiwygString(ch);
        if (ch == 'x' && next == '\"')
            return processHexString();
        if (ch == 'q' && next == '\"')
            return processDelimitedString();
        if ((ch == 'r' && next == '\"') || (ch == '`'))
            return processDoubleQuotedOrWysiwygString(ch);
        uint oldPos = _pos - 1;

        if (ch == '0') {
            if (next == 'b' || next == 'B')
                return processBinaryNumber();
            if (next == 'x' || next == 'X')
                return processHexNumber();
            if (next >= '0' && next <= '9')
                return processOctNumber();
        }
        if (ch >= '0' && ch <= '9')
            return processDecNumber(ch);
        if (ch == '.' && next >= '0' && next <= '9') { // .123
            // no integer part was read, so the real token position has to be set here
            _sharedRealToken.setPos(_line, oldPos);
            return processDecFloatSecondPart(0);
        }

        if (ch == '_' || isUniversalAlpha(ch)) {
            // start of identifier or keyword?
            Keyword keyword = detectKeyword(ch);
            if (keyword != Keyword.NONE) {
                switch (keyword) {
                    //Special Token    Replaced with
                    case Keyword.EOF: return emitEof(); //    sets the scanner to the end of the file
                    case Keyword.DATE: // string literal of the date of compilation "mmm dd yyyy"
                    case Keyword.TIME: // string literal of the time of compilation "hh:mm:ss"
                    case Keyword.TIMESTAMP: // string literal of the date and time of compilation "www mmm dd hh:mm:ss yyyy"
                    case Keyword.VENDOR: // Compiler vendor string, such as "Digital Mars D"
                    case Keyword.VERSION_: // Compiler version as an integer, such as 2001
                        return processSpecialToken(keyword, oldPos);
                    default:
                        _sharedKeywordToken.setPos(_line, oldPos);
                        _sharedKeywordToken.keyword = keyword;
                        return _sharedKeywordToken;
                }
            }
            return processIdent();
        }
        OpCode op = detectOp(ch);
        if (op != OpCode.NONE) {
            _sharedOpToken.setPos(_line, oldPos);
            _sharedOpToken.opCode = op;
            return _sharedOpToken;
        }
        return null;
    }


}

unittest {
    import std.algorithm;
    class TokenTest {
        uint _line;
        string _file;
        this(string file, uint line) {
            _file = file;
            _line = line;
        }
        bool doTest(Token token) {
            return true;
        }
        void execute(Tokenizer tokenizer) {
            Token token = tokenizer.nextToken();
            if (!doTest(token)) {
                assert(false, " token doesn not match at " ~ _file ~ ":" ~ to!string(_line) ~ " foundToken: " ~ token.toString ~ " expected: " ~ toString);
            }
        }
        public override @property string toString() {
            return "TokenTest";
        }
    }
    void testTokenizer(string code, TokenTest[] tokens, string file = __FILE__, uint line = __LINE__) {
        Tokenizer tokenizer = new Tokenizer(code, "tokenizerTest:" ~ file ~ ":" ~ to!string(line));
        for (uint i = 0; i < tokens.length; i++) {
            tokens[i].execute(tokenizer);
        }
    }
    class 
KeywordTest : TokenTest { + Keyword _code; + this(Keyword code, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _code = code; + } + override bool doTest(Token token) { + if (token.type != TokenType.KEYWORD) + return false; + if (token.keyword != _code) + return false; + return true; + } + public override @property string toString() { + return "Keyword:" ~ to!string(_code); + } + } + class OpTest : TokenTest { + OpCode _code; + this(OpCode code, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _code = code; + } + override bool doTest(Token token) { + if (token.type != TokenType.OP) + return false; + if (token.opCode != _code) + return false; + return true; + } + public override @property string toString() { + return "Op:" ~ to!string(_code); + } + } + class StringTest : TokenTest { // expects a STRING token whose text equals _value + string _value; + this(string value, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _value = value; + } + override bool doTest(Token token) { + if (token.type != TokenType.STRING) + return false; + if (!to!string(token.text).equal(_value)) // mismatch fails the test (the negation was missing, so matching strings were rejected) + return false; + return true; + } + public override @property string toString() { + return "String:" ~ _value; + } + } + class IntegerTest : TokenTest { + ulong _value; + bool _unsigned; + bool _long; + this(ulong value, bool unsignedFlag = false, bool longFlag = false, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _value = value; + _unsigned = unsignedFlag; + _long = longFlag; + } + override bool doTest(Token token) { + if (token.type != TokenType.INTEGER) + return false; + if (token.intValue != _value) + return false; + if (token.isUnsigned != _unsigned) + return false; + if (token.isLong != _long) + return false; + return true; + } + public override @property string toString() { + return "Integer:" ~ to!string(_value); + } + } + class RealTest : TokenTest { + real _value; + ubyte _precision; + bool _imaginary; + this(real value, ubyte precision = 1, bool
imaginary = false, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _value = value; + _precision = precision; + _imaginary = imaginary; + } + override bool doTest(Token token) { + if (token.type != TokenType.FLOAT) + return false; + if (token.realValue != _value) + return false; + if (token.precision != _precision) + return false; + if (token.isImaginary != _imaginary) + return false; + return true; + } + public override @property string toString() { + return "Real:" ~ to!string(_value); + } + } + class IdentTest : TokenTest { + string _value; + this(string value, string file = __FILE__, uint line = __LINE__) { + super(file, line); + _value = value; + } + override bool doTest(Token token) { + if (token.type != TokenType.IDENTIFIER) + return false; + if (! to!string(token.text).equal(_value)) + return false; + return true; + } + public override @property string toString() { + return "Ident:" ~ _value; + } + } + class CommentTest : TokenTest { + this(string file = __FILE__, uint line = __LINE__) { + super(file, line); + } + override bool doTest(Token token) { + if (token.type != TokenType.COMMENT) + return false; + return true; + } + public override @property string toString() { + return "Comment"; + } + } + class EOFTest : TokenTest { + this(string file = __FILE__, uint line = __LINE__) { + super(file, line); + } + override bool doTest(Token token) { + if (token.type != TokenType.EOF) + return false; + return true; + } + public override @property string toString() { + return "EOF"; + } + } + class WhiteSpaceTest : TokenTest { + this(string file = __FILE__, uint line = __LINE__) { + super(file, line); + } + override bool doTest(Token token) { + if (token.type != TokenType.WHITESPACE) + return false; + return true; + } + public override @property string toString() { + return "whiteSpace"; + } + } + TokenTest checkString(string value, string file = __FILE__, uint line = __LINE__) { + return new StringTest(value, file, line); + } + TokenTest 
checkInteger(ulong value, bool unsignedFlag = false, bool longFlag = false, string file = __FILE__, uint line = __LINE__) { + return new IntegerTest(value, unsignedFlag, longFlag, file, line); + } + TokenTest checkReal(real value, byte precision = 0, bool imaginary = false, string file = __FILE__, uint line = __LINE__) { + return new RealTest(value, precision, imaginary, file, line); + } + TokenTest checkIdent(string value, string file = __FILE__, uint line = __LINE__) { + return new IdentTest(value, file, line); + } + TokenTest checkKeyword(Keyword value, string file = __FILE__, uint line = __LINE__) { + return new KeywordTest(value, file, line); + } + TokenTest checkOp(OpCode value, string file = __FILE__, uint line = __LINE__) { + return new OpTest(value, file, line); + } + TokenTest checkSpace(string file = __FILE__, uint line = __LINE__) { + return new WhiteSpaceTest(file, line); + } + TokenTest checkComment(string file = __FILE__, uint line = __LINE__) { + return new CommentTest(file, line); + } + TokenTest checkEOF(string file = __FILE__, uint line = __LINE__) { + return new EOFTest(file, line); + } + + testTokenizer(q"TEST +int i; +TEST" + , [ + checkKeyword(Keyword.INT), + checkSpace(), + checkIdent("i"), + checkOp(OpCode.SEMICOLON), + checkEOF() + ]); + testTokenizer("0b1101 0x123abcdU 0xABCL 0743 192837465 0 192_837_465 5.25" + , [ + checkInteger(13), + checkSpace(), + checkInteger(0x123abcd, true, false), + checkSpace(), + checkInteger(0xabc, false, true), + checkSpace(), + checkInteger(std.conv.octal!743), + checkSpace(), + checkInteger(192_837_465), + checkSpace(), + checkInteger(0), + checkSpace(), + checkInteger(192837465), + checkSpace(), + checkReal(5.25), + checkEOF() + ]); +} + +unittest { // dumps every token of a sample file to stdout; read errors are caught and printed + import std.stdio; + import std.conv; + import std.utf; + import ddc.lexer.LineStream; // the lexer modules live in the ddc package (was misspelled "ddx") + string fname = "/home/lve/src/d/ddc/ddclexer/tests/tokenizer_test.d"; + writeln("opening file"); + try { + std.stream.File f = new std.stream.File(fname); + scope(exit) {
f.close(); } + try { + LineStream lines = LineStream.create(f, fname); + Tokenizer tokenizer = new Tokenizer(lines); + for (;;) { + Token token = tokenizer.nextToken(); + if (token is null) { + writeln("Null token returned"); + break; + } + if (token.type == TokenType.EOF) { + writeln("EOF token"); + break; + } + writeln("", token.line, ":", token.pos, "\t", token.toString); + } + } catch (Exception e) { + writeln("Exception " ~ e.toString); + } + } catch (Exception e) { + writeln("Exception " ~ e.toString); + } +} diff --git a/src/ddc/lexer/exceptions.d b/src/ddc/lexer/exceptions.d new file mode 100644 index 0000000..1934d3c --- /dev/null +++ b/src/ddc/lexer/exceptions.d @@ -0,0 +1,32 @@ +module ddc.lexer.exceptions; + +import std.conv; + +class ParserException : Exception { + string _msg; + string _filename; + size_t _line; + size_t _pos; + + public @property size_t line() { return _line; } + + this(string msg, string filename, size_t line, size_t pos) { + super(msg ~ " at " ~ filename ~ " line " ~ to!string(line) ~ " column " ~ to!string(pos)); + _msg = msg; + _filename = filename; + _line = line; + _pos = pos; + } +} + +class LexerException : ParserException { + this(string msg, string filename, size_t line, size_t pos) { + super(msg, filename, line, pos); + } +} + +class SourceEncodingException : LexerException { + this(string msg, string filename, size_t line, size_t pos) { + super(msg, filename, line, pos); + } +} diff --git a/src/ddc/lexer/textsource.d b/src/ddc/lexer/textsource.d new file mode 100644 index 0000000..060ea40 --- /dev/null +++ b/src/ddc/lexer/textsource.d @@ -0,0 +1,103 @@ +module ddc.lexer.textsource; + +private import std.utf; +private import std.array; + +/** +* Source file information. +* Even if contains only file name, it's better to use it instead of string - object reference size is twice less than array ref. 
+*/ +class SourceFile { + protected string _file; + public @property string filename() { return _file; } + public this(string filename) { + _file = filename; + } +} + +/// source lines for tokenizer +interface SourceLines { + /// source file + @property SourceFile file(); + /// last read line + @property uint line(); + /// source encoding + //@property EncodingType encoding() { return _encoding; } + /// error code + @property int errorCode(); + /// error message + @property string errorMessage(); + /// error line + @property int errorLine(); + /// error position + @property int errorPos(); + + /// read line, return null if EOF reached or error occured + dchar[] readLine(); +} + +/// Simple text source based on array +class ArraySourceLines : SourceLines { + protected SourceFile _file; + protected uint _line; + protected uint _firstLine; + protected dstring[] _lines; + static protected dchar[] _emptyLine = ""d.dup; + + this() { + } + + this(dstring[] lines, SourceFile file, uint firstLine = 0) { + init(lines, file, firstLine); + } + + this(string code, string filename) { + _lines = (toUTF32(code)).split("\n"); + _file = new SourceFile(filename); + } + + void close() { + _lines = null; + _line = 0; + _firstLine = 0; + _file = null; + } + + void init(dstring[] lines, SourceFile file, uint firstLine = 0) { + _lines = lines; + _firstLine = firstLine; + _line = 0; + _file = file; + } + + bool reset(int line) { + _line = line; + return true; + } + + /// source file + override @property SourceFile file() { return _file; } + /// last read line + override @property uint line() { return _line; } + /// source encoding + //@property EncodingType encoding() { return _encoding; } + /// error code + override @property int errorCode() { return 0; } + /// error message + override @property string errorMessage() { return ""; } + /// error line + override @property int errorLine() { return 0; } + /// error position + override @property int errorPos() { return 0; } + + /// read line, 
return null if EOF reached or error occured + override dchar[] readLine() { + if (_line < _lines.length) { + if (_lines[_line]) + return cast(dchar[])_lines[_line++]; + _line++; + return _emptyLine; + } + return null; // EOF + } +} diff --git a/src/dlangide/ui/frame.d b/src/dlangide/ui/frame.d index 59f4ef9..b19d936 100644 --- a/src/dlangide/ui/frame.d +++ b/src/dlangide/ui/frame.d @@ -17,8 +17,84 @@ import dlangide.ui.wspanel; import dlangide.workspace.workspace; import dlangide.workspace.project; +import ddc.lexer.textsource; +import ddc.lexer.exceptions; +import ddc.lexer.Tokenizer; + import std.conv; import std.utf; +import std.algorithm; + +class SimpleDSyntaxHighlighter : SyntaxHighlighter { + + SourceFile _file; + ArraySourceLines _lines; + Tokenizer _tokenizer; + this (string filename) { + _file = new SourceFile(filename); + _lines = new ArraySourceLines(); + _tokenizer = new Tokenizer(_lines); + } + + TokenPropString[] _props; + + /// categorize characters in content by token types: each token's category is written into props from the token start up to the start of the next token + void updateHighlight(dstring[] lines, TokenPropString[] props, int changeStartLine, int changeEndLine) { + _props = props; + changeStartLine = 0; // the passed range is ignored for now: always re-tokenize from the first line + changeEndLine = cast(int)lines.length; // explicit narrowing, size_t does not convert to int implicitly on 64-bit targets + _lines.init(lines[changeStartLine..$], _file, changeStartLine); + _tokenizer.init(_lines); + uint tokenPos = 0; + uint tokenLine = 0; + ubyte category = 0; + for (;;) { + Token token = _tokenizer.nextToken(); + if (token is null) { + //writeln("Null token returned"); + break; + } + if (token.type == TokenType.EOF) { + //writeln("EOF token"); + break; + } + uint newPos = token.pos; + uint newLine = token.line; + + if (category) { + // fill with category + for (uint i = tokenLine - 1; i <= newLine - 1; i++) { + uint start = i > tokenLine - 1 ? 0 : tokenPos; + uint end = i < newLine - 1 ?
cast(uint)lines[i].length : newPos; // on the last line stop where the next token starts (was tokenPos, which made single-line ranges empty) + for (uint j = start; j < end; j++) { + assert(i < _props.length); + if (j - 1 < _props[i].length) + _props[i][j - 1] = category; + } + } + } + + TokenType t = token.type; + // handle token + if (t == TokenType.COMMENT) { + category = TokenCategory.Comment; + } else if (t == TokenType.KEYWORD) { + category = TokenCategory.Keyword; + } else if (t == TokenType.IDENTIFIER) { + category = TokenCategory.Identifier; + } else if (t == TokenType.STRING) { + category = TokenCategory.String; + } else { + category = 0; + } + tokenPos = newPos; + tokenLine= newLine; + + } + _lines.close(); + _props = null; + } +} /// DIDE source file editor class DSourceEdit : SourceEdit { @@ -26,6 +102,10 @@ class DSourceEdit : SourceEdit { super(ID); styleId = null; backgroundColor = 0xFFFFFF; + setTokenHightlightColor(TokenCategory.Comment, 0x808080); // gray + setTokenHightlightColor(TokenCategory.Keyword, 0x0020C0); // blue + setTokenHightlightColor(TokenCategory.String, 0xC02000); // red + setTokenHightlightColor(TokenCategory.Identifier, 0x206000); // green } this() { this("SRCEDIT"); @@ -34,8 +114,20 @@ class DSourceEdit : SourceEdit { @property ProjectSourceFile projectSourceFile() { return _projectSourceFile; } /// load by filename override bool load(string fn) { - return super.load(fn); + _projectSourceFile = null; + bool res = super.load(fn); + setHighlighter(); + return res; } + + void setHighlighter() { + if (filename.endsWith(".d") || filename.endsWith(".dd") || filename.endsWith(".dh") || filename.endsWith(".ddoc")) { + content.syntaxHighlighter = new SimpleDSyntaxHighlighter(filename); + } else { + content.syntaxHighlighter = null; + } + } + /// load by project item bool load(ProjectSourceFile f) { if (!load(f.filename)) { @@ -43,6 +135,7 @@ class DSourceEdit : SourceEdit { return false; } _projectSourceFile = f; + setHighlighter(); return true; } }