From e81f52f9490d44b78c23626b3aaf7422d3771645 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Thu, 12 Sep 2013 22:25:00 +0000 Subject: [PATCH] Merge changes from the phobos branch --- stdx/d/lexer.d | 249 +++++++++++++++++++++++++++++++++++------------- stdx/d/parser.d | 41 ++++---- 2 files changed, 203 insertions(+), 87 deletions(-) diff --git a/stdx/d/lexer.d b/stdx/d/lexer.d index 7440601..7b8b2fd 100644 --- a/stdx/d/lexer.d +++ b/stdx/d/lexer.d @@ -4,11 +4,13 @@ * This module contains a range-based _lexer for the D programming language. * * For performance reasons the _lexer contained in this module operates only on - * ASCII or UTF-8 encoded source code. If the use of other encodings is + * UTF-8 encoded source code. If the use of other encodings is * desired, the source code must be converted to UTF-8 before passing it to this * _lexer. * - * To use the _lexer, create a LexerConfig struct + * To use the _lexer, create a $(LREF LexerConfig) struct. The + * $(LREF LexerConfig) contains fields for configuring the behavior of the + * lexer. * --- * LexerConfig config; * config.iterStyle = IterationStyle.everything; @@ -16,15 +18,17 @@ * config.versionNumber = 2064; * config.vendorString = "Lexer Example"; * --- - * Once you have configured the _lexer, call byToken$(LPAREN)$(RPAREN) on your - * source code, passing in the configuration. + * Once you have configured the _lexer, call $(LREF byToken)$(LPAREN)$(RPAREN) + * on your source code, passing in the configuration. 
* --- + * // UTF-8 encoded source code * auto source = "import std.stdio;"c; * auto tokens = byToken(source, config); + * // or auto tokens = source.byToken(config); * --- - * The result of byToken$(LPAREN)$(RPAREN) is a forward range of tokens that can - * be easily used with the algorithms from std.algorithm or iterated over with - * $(D_KEYWORD foreach) + * The result of $(LREF byToken)$(LPAREN)$(RPAREN) is a forward range of tokens + * that can be easily used with the algorithms from std.algorithm or iterated + * over with $(D_KEYWORD foreach). * --- * assert (tokens.front.type == TokenType.import_); * assert (tokens.front.value == "import"); @@ -90,11 +94,19 @@ * * void main(string[] args) * { + * // Create the configuration * LexerConfig config; + * // Specify that we want tokens to appear exactly as they did in the source * config.tokenStyle = TokenStyle.source; + * // Include whitespace, comments, etc. * config.iterStyle = IterationStyle.everything; + * // Tell the lexer to use the name of the file being read when generating + * // error messages. * config.fileName = args[1]; + * // Open the file (error checking omitted for brevity) * auto f = File(args[1]); + * // Read the lines of the file, and combine them. Then create the token + * // range, which is then passed on to highlight. * (cast(ubyte[]) f.byLine(KeepTerminator.yes).join()).byToken(config).highlight(); * } * --- @@ -129,13 +141,13 @@ public: struct Token { /** - * The representation of the token in the original source code. + * The characters that comprise the token. */ string value; /** * The index of the start of the token in the original source. 
- * $(LPAREN)measured in ASCII characters or UTF-8 code units$(RPAREN) + * $(LPAREN)measured in UTF-8 code units$(RPAREN) */ size_t startIndex; @@ -164,6 +176,20 @@ struct Token return other.type == type && other.value == value; } + /// + unittest + { + Token a; + a.type = TokenType.intLiteral; + a.value = "1"; + Token b; + b.type = TokenType.intLiteral; + b.value = "1"; + assert (a == b); + b.value = "2"; + assert (a != b); + } + /** * Checks to see if the token's string representation is equal to the given * string. @@ -173,6 +199,14 @@ struct Token return this.value == value; } + /// + unittest + { + Token t; + t.value = "abcde"; + assert (t == "abcde"); + } + /** * Checks to see if the token is of the given type. */ @@ -181,6 +215,14 @@ struct Token return this.type == type; } + /// + unittest + { + Token t; + t.type = TokenType.class_; + assert (t == TokenType.class_); + } + /** * Comparison operator orders tokens by start index. */ @@ -191,17 +233,43 @@ struct Token return 0; } + /// + unittest + { + Token a; + a.startIndex = 10; + Token b; + b.startIndex = 20; + assert (a < b); + } + + /** + * Comparison operator overload for checking if the token's start index is + * before, after, or the same as the given index. + */ int opCmp(size_t index) const nothrow pure { if (startIndex < index) return -1; if (startIndex > index) return 1; return 0; } + + /// + unittest + { + import std.array; + import std.range; + auto source = cast(ubyte[]) "a b c"c; + LexerConfig c; + auto tokens = source.byToken(c).array(); + assert (tokens.length == 3); + //assert (tokens.assumeSorted().lowerBound(1)[$ - 1] == "b"); + } } /** - * Configure the behavior of the byToken() function. These flags may be - * combined using a bitwise or. + * Configure the behavior of the $(LREF byToken)$(LPAREN)$(RPAREN) function. + * These flags may be combined using a bitwise or. 
*/ enum IterationStyle : ushort { @@ -268,12 +336,14 @@ enum TokenStyle : ushort struct LexerConfig { /** - * Iteration style + * Configure the lexer's iteration style. + * See_Also: $(LREF IterationStyle) */ IterationStyle iterStyle = IterationStyle.codeOnly; /** - * Token style + * Configure the style of the tokens produced by the lexer. + * See_Also: $(LREF TokenStyle) */ TokenStyle tokenStyle = tokenStyle.default_; @@ -289,28 +359,64 @@ struct LexerConfig /** * Name used when creating error messages that are sent to errorFunc. This - * is needed because the lexer operates on any forwarad range of ASCII + * is needed because the lexer operates on any forward range of ASCII * characters or UTF-8 code units and does not know what to call its input * source. Defaults to the empty string. */ string fileName = ""; /** - * This function is called when an error is encountered during lexing. - * Parameters are file name, code uint index, line number, column, - * and error messsage. + * The starting line and column numbers for the lexer. These can be set when + * partially lexing D code to provide correct token locations and better + * error messages. These should be left to their default values of 1 when + * lexing entire files. Line and column numbers are 1-indexed in this lexer + * because this produces more useful error messages. The start index is + * zero-indexed, as it is more useful to machines than users. */ - void delegate(string, size_t, uint, uint, string) errorFunc; + uint startLine = 1; + + /** + * ditto + */ + ushort startColumn = 1; + + /** + * ditto + */ + size_t startIndex = 0; + + /** + * This function is called when an error is encountered during lexing. + * If this field is not set, the lexer will throw an exception including the + * line, column, and error message. 
+ * + * $(BOOKTABLE Error Function Parameters:, + * $(TR $(TD string) $(TD File name)) + * $(TR $(TD size_t) $(TD Code unit index)) + * $(TR $(TD uint) $(TD Line number)) + * $(TR $(TD ushort) $(TD Column number)) + * $(TR $(TD string) $(TD Error message)) + * ) + */ + void delegate(string, size_t, uint, ushort, string) errorFunc; } /** * Iterate over the given range of characters by D tokens. + * + * The lexing process is able to handle a forward range of code units by using + * an internal circular buffer to provide efficient extracting of the token + * values from the input. It is more efficient, however, to provide a range + * that supports random accessing and slicing. If the input range supports + * slicing, the caching layer aliases itself away and the lexing process + * is much more efficient. + * * Params: - * range = the range of characters + * range = the range of characters to lex * config = the lexer configuration * bufferSize = initial size of internal circular buffer * Returns: - * an input range of tokens + * a $(LREF TokenRange) that iterates over the given range */ auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024) if (isForwardRange!(R) && !isRandomAccessRange!(R) @@ -319,8 +425,6 @@ auto byToken(R)(R range, LexerConfig config, size_t bufferSize = 4*1024) // 4K of circular buffer by default auto r = TokenRange!(typeof(lexerSource(range))) (lexerSource(range, bufferSize), config); - r.config = config; - r.lineNumber = 1; r.popFront(); return r; } @@ -331,14 +435,12 @@ auto byToken(R)(R range, LexerConfig config) { auto r = TokenRange!(typeof(lexerSource(range))) (lexerSource(range), config); - r.config = config; - r.lineNumber = 1; r.popFront(); return r; } /** - * Range of tokens. Use byToken$(LPAREN)$(RPAREN) to instantiate. + * Range of tokens. Use $(LREF byToken)$(LPAREN)$(RPAREN) to instantiate. 
*/ struct TokenRange(LexSrc) //if ( is(LexSrc : LexSource!(U...), U...)) //check for LexSource @@ -361,7 +463,7 @@ struct TokenRange(LexSrc) } /** - * Returns the current token and then removes it from the range + * Returns: the current token and then removes it from the range */ Token moveFront() { @@ -398,7 +500,7 @@ L_advance: current.value = null; switch (src.front) { - // handle sentenels for end of input + // handle sentinels for end of input case 0: case 0x1a: // TODO: check config flags, it's cheap @@ -408,11 +510,11 @@ L_advance: mixin(generateCaseTrie( "=", "TokenType.assign", "@", "TokenType.at", - "&", "TokenType.bitAnd", - "&=", "TokenType.bitAndEqual", + "&", "TokenType.amp", + "&=", "TokenType.bitAndAssign", "|", "TokenType.bitOr", - "|=", "TokenType.bitOrEqual", - "~=", "TokenType.catEqual", + "|=", "TokenType.bitOrAssign", + "~=", "TokenType.catAssign", ":", "TokenType.colon", ",", "TokenType.comma", "--", "TokenType.decrement", @@ -432,10 +534,10 @@ L_advance: "||", "TokenType.logicOr", "(", "TokenType.lParen", "-", "TokenType.minus", - "-=", "TokenType.minusEqual", + "-=", "TokenType.minusAssign", "%", "TokenType.mod", - "%=", "TokenType.modEqual", - "*=", "TokenType.mulEqual", + "%=", "TokenType.modAssign", + "*=", "TokenType.mulAssign", "!", "TokenType.not", "!=", "TokenType.notEqual", "!>", "TokenType.notGreater", @@ -444,25 +546,25 @@ L_advance: "!<=", "TokenType.notLessEqual", "!<>", "TokenType.notLessEqualGreater", "+", "TokenType.plus", - "+=", "TokenType.plusEqual", + "+=", "TokenType.plusAssign", "^^", "TokenType.pow", - "^^=", "TokenType.powEqual", + "^^=", "TokenType.powAssign", "}", "TokenType.rBrace", "]", "TokenType.rBracket", ")", "TokenType.rParen", ";", "TokenType.semicolon", "<<", "TokenType.shiftLeft", - "<<=", "TokenType.shiftLeftEqual", + "<<=", "TokenType.shiftLeftAssign", ">>", "TokenType.shiftRight", - ">>=", "TokenType.shiftRightEqual", + ">>=", "TokenType.shiftRightAssign", "*", "TokenType.star", "?", 
"TokenType.ternary", "~", "TokenType.tilde", "!<>=", "TokenType.unordered", ">>>", "TokenType.unsignedShiftRight", - ">>>=", "TokenType.unsignedShiftRightEqual", + ">>>=", "TokenType.unsignedShiftRightAssign", "^", "TokenType.xor", - "^=", "TokenType.xorEqual" + "^=", "TokenType.xorAssign" )); case '/': nextCharNonLF(); @@ -483,7 +585,7 @@ L_advance: goto L_advance; // tail-recursion case '=': - current.type = TokenType.divEqual; + current.type = TokenType.divAssign; current.value = "/="; src.popFront(); return; @@ -508,7 +610,7 @@ L_advance: case '.': nextCharNonLF(); nextCharNonLF(); - current.type = TokenType.slice; + current.type = TokenType.dotdot; if (src.front == '.') { current.type = TokenType.vararg; @@ -516,7 +618,7 @@ L_advance: current.value = tokenValue!(TokenType.vararg); } else - current.value = tokenValue!(TokenType.slice); + current.value = tokenValue!(TokenType.dotdot); return; default: nextCharNonLF(); @@ -1722,7 +1824,7 @@ L_advance: if (foundNewline) { ++lineNumber; - column = 0; + column = 1; } else ++column; @@ -1865,12 +1967,14 @@ L_advance: this(LexSrc lex, LexerConfig cfg) { src = move(lex); // lex is r-value - lineNumber = 1; - column = 0; + lineNumber = cfg.startLine; + column = cfg.startColumn; + //src._index = cfg.startIndex; _empty = false; config = move(cfg); // ditto with cfg cache = StringCache(initialTableSize); } + enum initialTableSize = 2048; Token current; uint lineNumber; @@ -1886,7 +1990,7 @@ L_advance: */ pure nothrow bool isOperator(const TokenType t) { - return t >= TokenType.assign && t <= TokenType.xorEqual; + return t >= TokenType.assign && t <= TokenType.xorAssign; } /** @@ -2026,7 +2130,7 @@ pure nothrow bool isStringLiteral(ref const Token t) } /** - * Returns: true if the token is whitespace, a commemnt, a special token + * Returns: true if the token is whitespace, a comment, a special token * sequence, or an identifier */ pure nothrow bool isMisc(const TokenType t) @@ -2050,16 +2154,16 @@ enum TokenType: ushort 
invalid, /// Not a valid token assign, /// = at, /// @ - bitAnd, /// & - bitAndEqual, /// &= + amp, /// & + bitAndAssign, /// &= bitOr, /// | - bitOrEqual, /// |= - catEqual, /// ~= + bitOrAssign, /// |= + catAssign, /// ~= colon, /// : comma, /// , decrement, /// -- div, /// / - divEqual, /// /= + divAssign, /// /= dollar, /// $ dot, /// . equal, /// == @@ -2078,10 +2182,10 @@ enum TokenType: ushort logicOr, /// || lParen, /// $(LPAREN) minus, /// - - minusEqual, /// -= + minusAssign, /// -= mod, /// % - modEqual, /// %= - mulEqual, /// *= + modAssign, /// %= + mulAssign, /// *= not, /// ! notEqual, /// != notGreater, /// !> @@ -2090,27 +2194,27 @@ enum TokenType: ushort notLessEqual, /// !<= notLessEqualGreater, /// !<> plus, /// + - plusEqual, /// += + plusAssign, /// += pow, /// ^^ - powEqual, /// ^^= + powAssign, /// ^^= rBrace, /// } rBracket, /// ] rParen, /// $(RPAREN) semicolon, /// ; shiftLeft, /// << - shiftLeftEqual, /// <<= + shiftLeftAssign, /// <<= shiftRight, /// >> - shiftRightEqual, /// >>= - slice, /// .. + shiftRightAssign, /// >>= + dotdot, /// .. star, /// * ternary, /// ? tilde, /// ~ unordered, /// !<>= unsignedShiftRight, /// >>> - unsignedShiftRightEqual, /// >>>= + unsignedShiftRightAssign, /// >>>= vararg, /// ... 
xor, /// ^ - xorEqual, /// ^= + xorAssign, /// ^= bool_, /// $(D_KEYWORD bool) byte_, /// $(D_KEYWORD byte) @@ -2266,12 +2370,21 @@ pure string getTokenValue(const TokenType type) return tokenValues[type]; } +/// +unittest +{ + // The class token always has one value + assert (getTokenValue(TokenType.class_) == "class"); + // Identifiers do not + assert (getTokenValue(TokenType.identifier) is null); +} + // Implementation details follow private: // For now a private helper that is tailored to the way lexer works // hides away forwardness of range by buffering -// random-access version is a strightforward thin wrapping +// random-access version is a straightforward thin wrapping // ATM it is byte-oriented private struct LexSource(R) if(isForwardRange!R && !isRandomAccessRange!R) @@ -2449,7 +2562,7 @@ private struct LexSource(R) saved = cur; } - // use the underliying range slicing capability + // use the underlying range slicing capability auto slice() @property { return range[saved..cur]; @@ -3218,7 +3331,7 @@ private: size_t uniqueSlots; enum loadQuot = 2, loadDenom = 3; - // leave some slack for alloctors/GC meta-data + // leave some slack for allocators/GC meta-data enum chunkSize = 16*1024 - size_t.sizeof*8; ubyte*[] chunkS; size_t next = chunkSize; @@ -3415,7 +3528,7 @@ unittest "1.2", "u", "4i", "1337L", "4.2L", "1", "..", "2", "4.3", ".5", ".8", "0xabc", "0xabcp4", "0x1P-10", "0x40u", "0x29L", "0x4Lu", "0xdeadbeef"]; int errCount = 0; - void errorFunction(string file, size_t index, uint line, uint col, string msg) + void errorFunction(string file, size_t index, uint line, ushort col, string msg) { ++errCount; } @@ -3464,7 +3577,7 @@ unittest { auto source = cast(ubyte[]) (`"string`); int errCount = 0; - void errorFunction(string file, size_t index, uint line, uint col, string msg) + void errorFunction(string file, size_t index, uint line, ushort col, string msg) { ++errCount; } diff --git a/stdx/d/parser.d b/stdx/d/parser.d index 9c6a0f3..493cf3e 100644 --- 
a/stdx/d/parser.d +++ b/stdx/d/parser.d @@ -246,7 +246,7 @@ alias core.sys.posix.stdio.fileno fileno; { mixin(traceEnterAndExit!(__FUNCTION__)); return parseLeftAssocBinaryExpression!(AndExpression, CmpExpression, - TokenType.bitAnd)(); + TokenType.amp)(); } /** @@ -676,12 +676,12 @@ alias core.sys.posix.stdio.fileno fileno; mixin(traceEnterAndExit!(__FUNCTION__)); auto node = new AssignExpression; node.ternaryExpression = parseTernaryExpression(); - if (currentIsOneOf(TokenType.assign, TokenType.unsignedShiftRightEqual, - TokenType.shiftRightEqual, TokenType.shiftLeftEqual, - TokenType.plusEqual, TokenType.minusEqual, TokenType.mulEqual, - TokenType.modEqual, TokenType.bitAndEqual, TokenType.divEqual, - TokenType.bitOrEqual, TokenType.powEqual, TokenType.xorEqual, - TokenType.catEqual)) + if (currentIsOneOf(TokenType.assign, TokenType.unsignedShiftRightAssign, + TokenType.shiftRightAssign, TokenType.shiftLeftAssign, + TokenType.plusAssign, TokenType.minusAssign, TokenType.mulAssign, + TokenType.modAssign, TokenType.bitAndAssign, TokenType.divAssign, + TokenType.bitOrAssign, TokenType.powAssign, TokenType.xorAssign, + TokenType.catAssign)) { node.operator = advance().type; node.assignExpression = parseAssignExpression(); @@ -981,7 +981,7 @@ alias core.sys.posix.stdio.fileno fileno; node.low = parseAssignExpression(); } if (expect(TokenType.colon) is null) return null; - if (expect(TokenType.slice) is null) return null; + if (expect(TokenType.dotdot) is null) return null; expect(TokenType.case_); node.high = parseAssignExpression(); if (expect(TokenType.colon) is null) return null; @@ -2196,7 +2196,7 @@ class ClassFour(A, B) if (someTest()) : Super {}}c; if (expect(TokenType.semicolon) is null) return null; node.low = parseExpression(); if (node.low is null) return null; - if (currentIs(TokenType.slice)) + if (currentIs(TokenType.dotdot)) { if (!canBeRange) { @@ -4280,12 +4280,13 @@ q{(int a, ...) 
{ mixin(traceEnterAndExit!(__FUNCTION__)); auto node = new SharedStaticConstructor; - expect(TokenType.shared_); - expect(TokenType.static_); - expect(TokenType.this_); - expect(TokenType.lParen); - expect(TokenType.rParen); + if (expect(TokenType.shared_) is null) return null; + if (expect(TokenType.static_) is null) return null; + if (expect(TokenType.this_) is null) return null; + if (expect(TokenType.lParen) is null) return null; + if (expect(TokenType.rParen) is null) return null; node.functionBody = parseFunctionBody(); + if (node.functionBody is null) return null; return node; } @@ -4343,6 +4344,8 @@ q{(int a, ...) advance(); // = } node.identifierChain = parseIdentifierChain(); + if (node.identifierChain is null) + return null; return node; } @@ -4363,7 +4366,7 @@ q{(int a, ...) if (!currentIs(TokenType.rBracket)) { node.lower = parseAssignExpression(); - expect(TokenType.slice); + expect(TokenType.dotdot); node.upper = parseAssignExpression(); } if (expect(TokenType.rBracket) is null) return null; @@ -4389,7 +4392,7 @@ q{(int a, ...) case TokenType.case_: advance(); auto argumentList = parseArgumentList(); - if (argumentList.items.length == 1 && startsWith(TokenType.colon, TokenType.slice)) + if (argumentList.items.length == 1 && startsWith(TokenType.colon, TokenType.dotdot)) node.caseRangeStatement = parseCaseRangeStatement(argumentList.items[0]); else node.caseStatement = parseCaseStatement(argumentList); @@ -5488,7 +5491,7 @@ q{(int a, ...) goToBookmark(bookmark); node.low = parseAssignExpression(); if (node.low is null) return null; - if (currentIs(slice)) + if (currentIs(dotdot)) { advance(); node.high = parseAssignExpression(); @@ -5595,7 +5598,7 @@ q{(int a, ...) 
auto node = new UnaryExpression; with(TokenType) switch (current.type) { - case bitAnd: + case amp: case not: case star: case plus: @@ -5955,7 +5958,7 @@ q{doStuff(5)}c; mixin(traceEnterAndExit!(__FUNCTION__)); if (startsWith(TokenType.lBracket, TokenType.rBracket)) return true; - return hasMagicDelimiter!(TokenType.slice)(); + return hasMagicDelimiter!(TokenType.dotdot)(); } void setTokens(const(Token)[] tokens)