//          Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module tokenizer;

import std.range;
import std.file;
import std.traits;
import std.algorithm;
import std.conv;
import std.uni;
import std.stdio;
import std.ascii;
import std.format;
import std.exception;

import langutils;
import codegen;
import entities;

/**
 * Returns: true if the first character of range is a line feed or a carriage
 * return. The range must not be empty.
 */
pure bool isNewline(R)(R range)
{
    return range.front == '\n' || range.front == '\r';
}

/**
 * Returns: true if range is empty or if its first character is one of the
 * end-of-file markers (NUL or 0x1a).
 */
pure bool isEoF(R)(R range)
{
    return range.empty || range.front == 0 || range.front == 0x1a;
}

/**
 * Pops a single line ending ("\r", "\n" or "\r\n") from the front of range.
 * Params:
 *     range = the source code being lexed
 *     index = character index into the source; advanced once per popped
 *         character
 * Returns: the characters that were popped
 */
C[] popNewline(R, C = ElementType!R)(ref R range, ref uint index)
    if (isSomeChar!C && isForwardRange!R)
{
    C[] chars;
    if (!range.empty && range.front == '\r')
    {
        chars ~= range.front;
        range.popFront();
        ++index;
    }
    // The range may have ended right after a lone carriage return.
    if (!range.empty && range.front == '\n')
    {
        chars ~= range.front;
        range.popFront();
        ++index;
    }
    return chars;
}

unittest
{
    uint i;
    auto s = "\r\ntest";
    assert (popNewline(s, i) == "\r\n");
    assert (s == "test");
    auto lone = "\r";
    assert (popNewline(lone, i) == "\r");
    assert (lone == "");
    assert (i == 3);
}

/**
 * Lexes a run of whitespace.
 * Params:
 *     range = the source code being lexed; advanced past the whitespace
 *     index = character index into the source; advanced past the whitespace
 *     lineNumber = current line number; incremented for every line ending
 * Returns: a Whitespace token whose value is the whitespace that was consumed
 */
Token lexWhitespace(R, C = ElementType!R)(ref R range, ref uint index,
    ref uint lineNumber)
    if (isForwardRange!R && isSomeChar!C)
{
    Token t;
    t.type = TokenType.Whitespace;
    t.lineNumber = lineNumber;
    t.startIndex = index;
    auto app = appender!(C[])();
    while (!isEoF(range) && std.uni.isWhite(range.front))
    {
        if (isNewline(range))
        {
            ++lineNumber;
            app.put(popNewline(range, index));
        }
        else
        {
            app.put(range.front);
            range.popFront();
            ++index;
        }
    }
    t.value = to!string(app.data);
    return t;
}

unittest
{
    uint lineNum = 1;
    uint index;
    auto chars = " \n \r\n \tabcde";
    auto r = lexWhitespace(chars, index, lineNum);
    assert (r.value == " \n \r\n \t");
    assert (chars == "abcde");
    assert (lineNum == 3);
}

/**
 * Lexes a line comment, a block comment or a nesting block comment.
 * Params:
 *     input = the source code being lexed. Its first character must be the
 *         '/' that opens the comment. Advanced past the comment.
 *     index = character index into the source; advanced once for every
 *         character consumed, including the opening '/'
 *     lineNumber = current line number; incremented for every line ending
 *         inside the comment
 * Returns: a Comment token holding the comment text, or a default-initialized
 *     token if the '/' is not followed by '/', '*' or '+'. The terminating
 *     line ending of a line comment is not part of the token.
 */
Token lexComment(R, C = ElementType!R)(ref R input, ref uint index,
    ref uint lineNumber)
    if (isSomeChar!C && isForwardRange!R)
in
{
    assert (input.front == '/');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.type = TokenType.Comment;
    t.startIndex = index;
    auto app = appender!(C[])();

    // Moves one character from the input into the comment text.
    void take()
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }

    take(); // the leading '/'
    if (isEoF(input))
    {
        Token errorToken;
        return errorToken;
    }

    switch (input.front)
    {
    case '/':
        while (!isEoF(input) && !isNewline(input))
            take();
        break;
    case '*':
        // Consume the opening '*' up front so that it cannot double as the
        // first character of the terminator ("/*/" is not a whole comment).
        take();
        blockLoop: while (!isEoF(input))
        {
            if (isNewline(input))
            {
                app.put(popNewline(input, index));
                ++lineNumber;
            }
            else if (input.front == '*')
            {
                take();
                if (!isEoF(input) && input.front == '/')
                {
                    take();
                    break blockLoop;
                }
            }
            else
                take();
        }
        break;
    case '+':
        // Same reasoning as above: "/+/" does not close itself.
        take();
        int depth = 1;
        while (depth > 0 && !isEoF(input))
        {
            if (isNewline(input))
            {
                app.put(popNewline(input, index));
                ++lineNumber;
            }
            else if (input.front == '+')
            {
                take();
                if (!isEoF(input) && input.front == '/')
                {
                    take();
                    --depth;
                }
            }
            else if (input.front == '/')
            {
                take();
                if (!isEoF(input) && input.front == '+')
                {
                    take();
                    ++depth;
                }
            }
            else
                take();
        }
        break;
    default:
        Token errorToken;
        return errorToken;
    }
    t.value = to!string(app.data);
    return t;
}

unittest
{
    uint index;
    uint lineNumber = 1;
    auto chars = "//this is a comment\r\nthis is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == "\r\nthis is not");
    assert (comment.value == "//this is a comment");
    assert (index == 19);
}

unittest
{
    uint index = 0;
    uint lineNumber = 1;
    auto chars = "/* this is a\n\tcomment\r\n */this is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == "this is not");
    assert (comment.value == "/* this is a\n\tcomment\r\n */");
    assert (lineNumber == 3);
}

unittest
{
    uint index;
    uint lineNumber = 1;
    auto chars = "/+this is a /+c/+omm+/ent+/ \r\nthis+/ is not";
    auto comment = lexComment(chars, index, lineNumber);
    assert (chars == " is not");
    assert (comment.value == "/+this is a /+c/+omm+/ent+/ \r\nthis+/");
    assert (lineNumber == 2);
}

unittest
{
    uint i;
    uint l;
    auto chars = "/(";
    auto comment = lexComment(chars, i, l);
    assert (comment == "");
}

unittest
{
    // The opening "/*" must not be mistaken for a closing "*/".
    uint index;
    uint lineNumber = 1;
    auto chars = "/*/ still a comment */x";
    auto comment = lexComment(chars, index, lineNumber);
    assert (comment.value == "/*/ still a comment */");
    assert (chars == "x");
    assert (index == 22);
}

/**
 * Pops up to upTo characters accepted by isInterestingDigit from the front of
 * the input range.
 * Params:
 *     input = the source code being lexed
 *     index = character index into the source; advanced once per popped
 *         character
 *     upTo = the maximum number of characters to pop
 * Returns: the popped characters. May be empty.
 */
string popDigitChars(R, C = ElementType!R, alias isInterestingDigit)(
    ref R input, ref uint index, uint upTo)
    if (isSomeChar!C && isForwardRange!R)
{
    auto app = appender!(C[])();
    for (uint i = 0; i != upTo && !input.empty
        && isInterestingDigit(input.front); ++i)
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }
    return to!string(app.data);
}

/// Pops up to upTo hexadecimal digits from input. See popDigitChars.
string popHexChars(R)(ref R input, ref uint index, uint upTo)
{
    return popDigitChars!(R, ElementType!R, isHexDigit)(input, index, upTo);
}

/// Pops up to upTo octal digits from input. See popDigitChars.
string popOctalChars(R)(ref R input, ref uint index, uint upTo)
{
    return popDigitChars!(R, ElementType!R, isOctalDigit)(input, index, upTo);
}

unittest
{
    uint i;
    auto a = "124ac82d3fqwerty";
    auto ra = popHexChars(a, i, uint.max);
    assert (a == "qwerty");
    assert (ra == "124ac82d3f");
    assert (i == 10);
    auto b = "08a7c2e3";
    auto rb = popHexChars(b, i, 4);
    assert (rb.length == 4);
    assert (rb == "08a7");
    assert (b == "c2e3");
    auto c = "00123832";
    auto rc = popOctalChars(c, i, uint.max);
    assert (c == "832");
    assert (rc == "00123");
    auto d = "ff";
    assert (popHexChars(d, i, 4) == "ff");
    assert (d == "");
}

/**
 * Interprets the escape sequence at the front of input.
 * Params:
 *     input = the source code being lexed. Its first character must be the
 *         backslash that starts the escape sequence. Advanced past the
 *         sequence.
 *     index = character index into the source; advanced once for every
 *         character consumed, including the backslash
 * Returns: the text that the escape sequence stands for. A malformed sequence
 *     (unknown escape character, missing digits, input ending after the
 *     backslash) yields a single backslash. An unknown or unterminated named
 *     character entity yields the empty string.
 */
string interpretEscapeSequence(R, C = ElementType!R)(ref R input,
    ref uint index)
    if (isSomeChar!C && isForwardRange!R)
in
{
    assert(input.front == '\\');
}
body
{
    // Converts the digits of a numeric escape into the character they name.
    // An empty digit string is a malformed escape, reported like every other
    // malformed escape in this function.
    static string fromDigits(string digits, uint radix)
    {
        if (digits.length == 0)
            return "\\";
        return to!string(cast(dchar) parse!uint(digits, radix));
    }

    input.popFront();
    ++index;
    if (input.empty)
        return "\\";

    switch (input.front)
    {
    case '\'':
    case '\"':
    case '?':
    case '\\':
    case 0:
    case 0x1a:
        auto f = input.front;
        input.popFront();
        ++index;
        return to!string(f);
    case 'a': input.popFront(); ++index; return "\a";
    case 'b': input.popFront(); ++index; return "\b";
    case 'f': input.popFront(); ++index; return "\f";
    case 'n': input.popFront(); ++index; return "\n";
    case 'r': input.popFront(); ++index; return "\r";
    case 't': input.popFront(); ++index; return "\t";
    case 'v': input.popFront(); ++index; return "\v";
    case 'x':
        input.popFront();
        ++index;
        return fromDigits(popHexChars(input, index, 2), 16);
    case '0': .. case '7':
        // The digit that selected this case is the first octal digit, so it
        // is popped together with the rest.
        return fromDigits(popOctalChars(input, index, 3), 8);
    case 'u':
        input.popFront();
        ++index;
        return fromDigits(popHexChars(input, index, 4), 16);
    case 'U':
        input.popFront();
        ++index;
        return fromDigits(popHexChars(input, index, 8), 16);
    case '&':
        input.popFront();
        ++index;
        auto entity = appender!(char[])();
        while (!input.isEoF() && input.front != ';')
        {
            entity.put(input.front);
            input.popFront();
            ++index;
        }
        if (!isEoF(input))
        {
            auto decoded = to!string(entity.data) in characterEntities;
            input.popFront(); // the terminating ';'
            ++index;
            if (decoded !is null)
                return to!string(*decoded);
        }
        return "";
    default:
        input.popFront();
        ++index;
        // This is an error
        return "\\";
    }
}

unittest
{
    uint i;
    auto vals = [
        // NOTE(review): assumes characterEntities maps "amp" to "&" -- confirm
        // against the entities module. The previous key, "\\&", is an
        // unterminated entity, for which this function returns "".
        "\\&amp;": "&",
        "\\n": "\n",
        "\\?": "?",
        "\\u0033": "\u0033",
        "\\U00000076": "v",
        "\\075": "=",
        "\\'": "'",
        "\\a": "\a",
        "\\b": "\b",
        "\\f": "\f",
        "\\r": "\r",
        "\\t": "\t",
        "\\v": "\v",
        "\\y": "\\",
        "\\x20": " ",
        "\\&eeeeeeror;": "",
    ];
    foreach (k, v; vals)
        assert (interpretEscapeSequence(k, i) == v);

    // Truncated escapes must neither throw nor read past the end of input.
    auto noDigits = "\\x";
    assert (interpretEscapeSequence(noDigits, i) == "\\");
    auto loneBackslash = "\\";
    assert (interpretEscapeSequence(loneBackslash, i) == "\\");
    auto unterminated = "\\&";
    assert (interpretEscapeSequence(unterminated, i) == "");
}

/**
 * Lexes a hex string literal (x"...").
 * Params:
 *     input = the source code being lexed. Its first two characters must be
 *         `x"`. Advanced past the literal and its optional suffix.
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = current line number; incremented for every line ending
 *         inside the literal
 *     style = NotEscaped keeps the text between the quotes as written,
 *         otherwise every pair of hex digits is converted to the character
 *         with that code. IncludeQuotes adds the `x"` prefix, the closing
 *         quote and the suffix character to the value.
 * Returns: a StringLiteral, WStringLiteral or DStringLiteral token
 */
Token lexHexString(R, C = ElementType!R)(ref R input, ref uint index,
    ref uint lineNumber, const StringStyle style = StringStyle.Default)
in
{
    assert (input.front == 'x');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.startIndex = index;
    t.type = TokenType.StringLiteral;

    immutable bool raw = (style & StringStyle.NotEscaped) != 0;
    immutable bool quoted = (style & StringStyle.IncludeQuotes) != 0;

    // Only the text between the quotes is collected here. The prefix, the
    // closing quote and the suffix are added at the end so that they never
    // reach the hex decoder.
    auto content = appender!(C[])();
    bool closed = false;
    string suffix;

    // Skip the leading x"
    input.popFront();
    input.popFront();
    index += 2;

    while (!input.isEoF())
    {
        if (isNewline(input))
        {
            auto newline = popNewline(input, index);
            ++lineNumber;
            if (raw)
                content.put(newline);
        }
        else if (isHexDigit(input.front))
        {
            content.put(input.front);
            input.popFront();
            ++index;
        }
        else if (input.front == '"')
        {
            input.popFront();
            ++index;
            closed = true;
            break;
        }
        else
        {
            // Whitespace, or a character that is not allowed in a hex
            // string. It is kept only when the caller asked for the text as
            // written. It is always consumed, otherwise this loop would never
            // terminate.
            if (raw)
                content.put(input.front);
            input.popFront();
            ++index;
        }
    }

    if (!input.isEoF())
    {
        switch (input.front)
        {
        case 'w':
            t.type = TokenType.WStringLiteral;
            goto case 'c';
        case 'd':
            t.type = TokenType.DStringLiteral;
            goto case 'c';
        case 'c':
            suffix = to!string(input.front);
            input.popFront();
            ++index;
            break;
        default:
            break;
        }
    }

    string text;
    if (raw)
        text = to!string(content.data);
    else
    {
        auto decoded = appender!(char[])();
        foreach (pair; std.range.chunks(content.data, 2))
            decoded.put(to!string(cast(dchar) parse!uint(pair, 16)));
        text = to!string(decoded.data);
    }
    if (quoted)
        text = "x\"" ~ text ~ (closed ? "\"" : "") ~ suffix;
    t.value = text;
    return t;
}

unittest
{
    uint i;
    uint l;

    auto a = `x"204041"`;
    auto ar = lexHexString(a, i, l);
    assert (ar == " @A");
    assert (ar == TokenType.StringLiteral);

    auto b = `x"20"w`;
    auto br = lexHexString(b, i, l);
    assert (br == " ");
    assert (br == TokenType.WStringLiteral);

    auto c = `x"6d"`;
    auto cr = lexHexString(c, i, l, StringStyle.NotEscaped);
    assert (cr == "6d");

    auto d = `x"5e5f"d`;
    auto dr = lexHexString(d, i, l, StringStyle.NotEscaped | StringStyle.IncludeQuotes);
    assert (dr == `x"5e5f"d`);
    assert (dr == TokenType.DStringLiteral);

    // Whitespace between digits used to hang the lexer.
    auto e = `x"20 41"`;
    auto er = lexHexString(e, i, l);
    assert (er.value == " A");
    assert (e == "");

    auto f = `x"41"`;
    auto fr = lexHexString(f, i, l, StringStyle.IncludeQuotes);
    assert (fr.value == `x"A"`);
}

/**
 * Lexes a character literal, a double quoted string, a WYSIWYG string
 * (r"..." or `...`) and the optional 'c', 'w' or 'd' suffix.
 * Params:
 *     input = the source code being lexed. Its first character must be the
 *         opening quote or the 'r' prefix. Advanced past the literal.
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = current line number; incremented for every line ending
 *         inside the literal
 *     style = NotEscaped leaves escape sequences as written, otherwise they
 *         are replaced by the text they stand for. IncludeQuotes adds the
 *         prefix, the quotes and the suffix to the value. WYSIWYG strings
 *         have no escape sequences, so their content is copied verbatim
 *         whatever the style.
 * Returns: a StringLiteral, WStringLiteral or DStringLiteral token
 */
Token lexString(R)(ref R input, ref uint index, ref uint lineNumber,
    const StringStyle style = StringStyle.Default)
in
{
    assert (input.front == '\'' || input.front == '"' || input.front == '`'
        || input.front == 'r');
}
body
{
    Token t;
    t.lineNumber = lineNumber;
    t.startIndex = index;
    t.type = TokenType.StringLiteral;
    auto app = appender!(char[])();

    bool wysiwyg = false;
    if (input.front == 'r')
    {
        wysiwyg = true;
        if (style & StringStyle.IncludeQuotes)
            app.put('r');
        input.popFront();
        ++index;
    }
    auto quote = input.front;
    if (quote == '`')
        wysiwyg = true;
    input.popFront();
    ++index;

    if (style & StringStyle.IncludeQuotes)
        app.put(quote);

    while (!isEoF(input))
    {
        if (isNewline(input))
        {
            app.put(popNewline(input, index));
            lineNumber++;
        }
        else if (input.front == quote)
        {
            if (style & StringStyle.IncludeQuotes)
                app.put(quote);
            input.popFront();
            ++index;
            break;
        }
        else if (input.front == '\\' && !wysiwyg)
        {
            if (style & StringStyle.NotEscaped)
            {
                // Copy the escape as written. An escaped quote or backslash
                // is copied as a pair so that it cannot be mistaken for the
                // end of the literal or the start of another escape.
                auto r = input.save();
                r.popFront();
                if (!r.empty && (r.front == quote || r.front == '\\'))
                {
                    app.put('\\');
                    app.put(r.front);
                    input.popFront();
                    input.popFront();
                    index += 2;
                }
                else
                {
                    app.put('\\');
                    input.popFront();
                    ++index;
                }
            }
            else
                app.put(interpretEscapeSequence(input, index));
        }
        else
        {
            app.put(input.front);
            input.popFront();
            ++index;
        }
    }

    if (!input.isEoF())
    {
        switch (input.front)
        {
        case 'w':
            t.type = TokenType.WStringLiteral;
            goto case 'c';
        case 'd':
            t.type = TokenType.DStringLiteral;
            goto case 'c';
        case 'c':
            if (style & StringStyle.IncludeQuotes)
                app.put(input.front);
            input.popFront();
            ++index;
            break;
        default:
            break;
        }
    }
    t.value = to!string(app.data);
    return t;
}

unittest
{
    uint l = 1;
    uint i;
    auto a = `"abcde"`;
    assert (lexString(a, i, l) == "abcde");
    auto b = "\"ab\\ncd\"";
    assert (lexString(b, i, l) == "ab\ncd");
    auto c = "`abc\\ndef`";
    assert (lexString(c, i, l, StringStyle.NotEscaped) == "abc\\ndef");
    auto d = `"12345"w`;
    assert (lexString(d, i, l).type == TokenType.WStringLiteral);
    auto e = `"abc"c`;
    assert (lexString(e, i, l).type == TokenType.StringLiteral);
    auto f = `"abc"d`;
    assert (lexString(f, i, l).type == TokenType.DStringLiteral);
    auto g = "\"a\nb\"";
    assert (lexString(g, i, l) == "a\nb");
    // A backslash has no special meaning in a WYSIWYG string.
    auto h = "r\"a\\\"b";
    assert (lexString(h, i, l).value == "a\\");
    assert (h == "b");
    auto j = "`a\\nb`";
    assert (lexString(j, i, l).value == "a\\nb");
}

/**
 * Lexes a binary, hexadecimal or decimal number literal.
 * Params:
 *     input = the source code being lexed. Its first character must be a
 *         decimal digit. Advanced past the literal.
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = the line number recorded in the token
 * Returns: a token whose value is the literal as written, including any
 *     prefix and suffix, and whose startIndex is the index of its first
 *     character
 */
Token lexNumber(R)(ref R input, ref uint index, const uint lineNumber)
in
{
    assert(isDigit(input.front));
}
body
{
    auto app = appender!(char[])();
    // The helpers record the index at which they were entered, which is past
    // a "0x" or "0b" prefix. Remember where the literal really starts.
    immutable uint start = index;
    Token token;

    // hex and binary can start with zero, anything else is decimal
    if (input.front != '0')
        token = lexDecimal(input, index, lineNumber, app);
    else
    {
        app.put(input.front);
        input.popFront();
        ++index;
        if (!input.empty && (input.front == 'x' || input.front == 'X'))
        {
            app.put(input.front);
            input.popFront();
            ++index;
            token = lexHex(input, index, lineNumber, app);
        }
        else if (!input.empty && (input.front == 'b' || input.front == 'B'))
        {
            app.put(input.front);
            input.popFront();
            ++index;
            token = lexBinary(input, index, lineNumber, app);
        }
        else
            token = lexDecimal(input, index, lineNumber, app);
    }
    token.startIndex = start;
    return token;
}

unittest
{
    uint i;
    uint l;
    auto a = "0q1239";
    assert (lexNumber(a, i, l) == "0");
    // A lone zero at the very end of the input.
    auto b = "0";
    assert (lexNumber(b, i, l).value == "0");
    auto c = "0x1F";
    uint start = 7;
    auto cr = lexNumber(c, start, l);
    assert (cr.startIndex == 7);
    assert (start == 11);
}

/**
 * Lexes the digits and the suffix of a binary literal.
 * Params:
 *     input = the source code, positioned directly after the "0b" prefix
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = the line number recorded in the token
 *     app = holds the already consumed prefix; the rest of the literal is
 *         appended to it
 * Returns: an integer literal token whose value is the content of app
 */
Token lexBinary(R)(ref R input, ref uint index, const uint lineNumber,
    ref typeof(appender!(char[])()) app)
{
    Token token;
    token.lineNumber = lineNumber;
    token.startIndex = index;
    token.type = TokenType.IntLiteral;
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;

    // Moves one character from the input into the literal.
    void take()
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }

    binaryLoop: while (!input.isEoF())
    {
        switch (input.front)
        {
        case '0':
        case '1':
        case '_':
            if (lexingSuffix)
                break binaryLoop;
            take();
            break;
        case 'u':
        case 'U':
            if (isUnsigned)
                break binaryLoop;
            take();
            // Digits may not follow a suffix.
            lexingSuffix = true;
            if (isLong)
            {
                token.type = TokenType.UnsignedLongLiteral;
                break binaryLoop;
            }
            else
                token.type = TokenType.UnsignedIntLiteral;
            isUnsigned = true;
            break;
        case 'L':
            if (isLong)
                break binaryLoop;
            take();
            lexingSuffix = true;
            if (isUnsigned)
            {
                token.type = TokenType.UnsignedLongLiteral;
                break binaryLoop;
            }
            else
                token.type = TokenType.LongLiteral;
            isLong = true;
            break;
        default:
            break binaryLoop;
        }
    }
    token.value = to!string(app.data);
    return token;
}

unittest
{
    uint i;
    uint l;

    auto a = "0b000101";
    auto ar = lexNumber(a, i, l);
    assert (ar.value == "0b000101");
    assert (a == "");

    auto b = "0b001L_";
    auto br = lexNumber(b, i, l);
    assert (br.value == "0b001L");
    assert (br.type == TokenType.LongLiteral);

    auto c = "0b1101uLL";
    auto cr = lexNumber(c, i, l);
    assert (cr.value == "0b1101uL");
    assert (cr.type == TokenType.UnsignedLongLiteral);

    auto d = "0b1q";
    auto dr = lexNumber(d, i, l);
    assert (dr.value == "0b1");
    assert (dr.type == TokenType.IntLiteral);

    auto e = "0b1_0_1LU";
    auto er = lexNumber(e, i, l);
    assert (er.value == "0b1_0_1LU");
    assert (er.type == TokenType.UnsignedLongLiteral);

    auto f = "0b1_0_1uU";
    auto fr = lexNumber(f, i, l);
    assert (fr.value == "0b1_0_1u");
    assert (fr.type == TokenType.UnsignedIntLiteral);

    auto g = "0b1_0_1LL";
    auto gr = lexNumber(g, i, l);
    assert (gr.value == "0b1_0_1L");
    assert (gr.type == TokenType.LongLiteral);

    auto h = "0b1u01";
    auto hr = lexNumber(h, i, l);
    assert (hr.value == "0b1u");
    assert (hr.type == TokenType.UnsignedIntLiteral);
}

/**
 * Lexes a decimal integer or floating point literal and its suffix.
 * Params:
 *     input = the source code, positioned at the first character that has
 *         not been consumed yet
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = the line number recorded in the token
 *     app = holds any already consumed characters of the literal; the rest
 *         of the literal is appended to it
 * Returns: a numeric literal token whose value is the content of app
 */
Token lexDecimal(R)(ref R input, ref uint index, const uint lineNumber,
    ref typeof(appender!(char[])()) app)
{
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;
    bool isFloat = false;
    bool isReal = false;
    bool isDouble = false;
    bool foundDot = false;
    bool foundE = false;
    Token token;
    token.type = TokenType.IntLiteral;
    token.startIndex = index;
    token.lineNumber = lineNumber;

    // Moves one character from the input into the literal.
    void take()
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }

    decimalLoop: while (!input.isEoF())
    {
        switch (input.front)
        {
        case '0': .. case '9':
        case '_':
            if (lexingSuffix)
                break decimalLoop;
            take();
            break;
        case 'e':
        case 'E':
            if (foundE || lexingSuffix)
                break decimalLoop;
            // For this to be a valid exponent, the next character must be a
            // decimal digit, or a sign followed by a decimal digit.
            auto r = input.save();
            r.popFront();
            if (r.isEoF())
                break decimalLoop;
            immutable bool signed = r.front == '+' || r.front == '-';
            if (signed)
                r.popFront();
            if (r.isEoF() || !isDigit(r.front))
                break decimalLoop;
            take();
            // The sign belongs to the exponent only directly after the 'e'.
            // Anywhere else it is an operator: "1e5-3" is three tokens.
            if (signed)
                take();
            foundE = true;
            isDouble = true;
            token.type = TokenType.DoubleLiteral;
            break;
        case '.':
            // two dots with other characters between them, or a dot after
            // the exponent or the suffix
            if (foundDot || foundE || lexingSuffix)
                break decimalLoop;
            auto r = input.save();
            r.popFront();
            if (!r.isEoF() && r.front == '.')
                break decimalLoop; // possibly slice expression
            take();
            foundDot = true;
            token.type = TokenType.DoubleLiteral;
            isDouble = true;
            break;
        case 'u':
        case 'U':
            if (isUnsigned)
                break decimalLoop;
            take();
            lexingSuffix = true;
            if (isLong)
                token.type = TokenType.UnsignedLongLiteral;
            else
                token.type = TokenType.UnsignedIntLiteral;
            isUnsigned = true;
            break;
        case 'L':
            if (isLong || isReal)
                break decimalLoop;
            take();
            lexingSuffix = true;
            if (isDouble)
            {
                token.type = TokenType.RealLiteral;
                isReal = true;
            }
            else if (isUnsigned)
            {
                token.type = TokenType.UnsignedLongLiteral;
                isLong = true;
            }
            else
            {
                token.type = TokenType.LongLiteral;
                isLong = true;
            }
            break;
        case 'f':
        case 'F':
            lexingSuffix = true;
            if (isUnsigned || isLong || isFloat || isReal)
                break decimalLoop;
            take();
            token.type = TokenType.FloatLiteral;
            isFloat = true;
            break;
        case 'i':
            // Spec says that this is the last suffix, so all cases break the
            // loop.
            if (isReal)
            {
                take();
                token.type = TokenType.IRealLiteral;
            }
            else if (isFloat)
            {
                take();
                token.type = TokenType.IFloatLiteral;
            }
            else if (isDouble)
            {
                take();
                token.type = TokenType.IDoubleLiteral;
            }
            break decimalLoop;
        default:
            break decimalLoop;
        }
    }
    token.value = to!string(app.data());
    return token;
}

unittest
{
    uint i;
    uint l;
    auto a = "55e-4";
    auto ar = lexNumber(a, i, l);
    assert (ar.value == "55e-4");
    assert (ar.type == TokenType.DoubleLiteral);

    auto b = "123.45f";
    auto br = lexNumber(b, i, l);
    assert (br.value == "123.45f");
    assert (br.type == TokenType.FloatLiteral);

    auto c = "3e+f";
    auto cr = lexNumber(c, i, l);
    assert (cr.value == "3");
    assert (cr.type == TokenType.IntLiteral);

    auto d = "3e++f";
    auto dr = lexNumber(d, i, l);
    assert (dr.value == "3");
    assert (dr.type == TokenType.IntLiteral);

    auto e = "1234..1237";
    auto er = lexNumber(e, i, l);
    assert (er.value == "1234");
    assert (er.type == TokenType.IntLiteral);

    auto f = "12L_";
    auto fr = lexNumber(f, i, l);
    assert (fr == "12L");

    auto g = "12e-12e";
    auto gr = lexNumber(g, i, l);
    assert (gr == "12e-12");

    auto h = "12e10";
    auto hr = lexNumber(h, i, l);
    assert (hr == "12e10");

    auto j = "12er";
    auto jr = lexNumber(j, i, l);
    assert (jr == "12");

    auto k = "12e+12-";
    auto kr = lexNumber(k, i, l);
    assert (kr == "12e+12");

    auto m = "1.1.";
    auto mr = lexNumber(m, i, l);
    assert (mr == "1.1");

    auto n = "12uu";
    auto nr = lexNumber(n, i, l);
    assert (nr == "12u");
    assert (nr.type == TokenType.UnsignedIntLiteral);

    auto o = "12LU";
    auto or = lexNumber(o, i, l);
    assert (or == "12LU");

    auto p = "3LL";
    auto pr = lexNumber(p, i, l);
    assert (pr == "3L");

    auto q = "3.0LL";
    auto qr = lexNumber(q, i, l);
    assert (qr == "3.0L");

    auto r = "5uL";
    auto rr = lexNumber(r, i, l);
    assert (rr == "5uL");

    auto s = "5Lf";
    auto sr = lexNumber(s, i, l);
    assert (sr == "5L");
    assert (sr == TokenType.LongLiteral);

    auto t = "5i";
    auto tr = lexNumber(t, i, l);
    assert (tr == "5");
    assert (tr == TokenType.IntLiteral);

    auto u = "894.3i";
    auto ur = lexNumber(u, i, l);
    assert (ur == "894.3i");
    assert (ur == TokenType.IDoubleLiteral);

    auto v = "894.3Li";
    auto vr = lexNumber(v, i, l);
    assert (vr == "894.3Li");
    assert (vr == TokenType.IRealLiteral);

    auto w = "894.3fi";
    auto wr = lexNumber(w, i, l);
    assert (wr == "894.3fi");
    assert (wr == TokenType.IFloatLiteral);

    auto x = "4892.4ee";
    auto xr = lexNumber(x, i, l);
    assert (xr == "4892.4");
    assert (xr == TokenType.DoubleLiteral);

    // A sign that does not directly follow the 'e' is an operator.
    auto y = "12e5-3";
    auto yr = lexNumber(y, i, l);
    assert (yr.value == "12e5");
    assert (y == "-3");

    auto z = "1.5ff";
    auto zr = lexNumber(z, i, l);
    assert (zr.value == "1.5f");
    assert (zr.type == TokenType.FloatLiteral);
}

/**
 * Lexes the digits, the optional binary exponent and the integer or real
 * suffix of a hexadecimal literal.
 * Params:
 *     input = the source code, positioned directly after the "0x" prefix
 *     index = character index into the source; advanced once per consumed
 *         character
 *     lineNumber = the line number recorded in the token
 *     app = holds the already consumed prefix; the rest of the literal is
 *         appended to it
 * Returns: a numeric literal token whose value is the content of app
 */
Token lexHex(R)(ref R input, ref uint index, const uint lineNumber,
    ref typeof(appender!(char[])()) app)
{
    bool lexingSuffix = false;
    bool isLong = false;
    bool isUnsigned = false;
    bool isReal = false;
    bool isDouble = false;
    bool foundDot = false;
    bool foundExp = false;
    Token token;
    token.lineNumber = lineNumber;
    token.startIndex = index;
    token.type = TokenType.IntLiteral;

    // Moves one character from the input into the literal.
    void take()
    {
        app.put(input.front);
        input.popFront();
        ++index;
    }

    hexLoop: while (!input.isEoF())
    {
        switch (input.front)
        {
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            // The exponent is written in decimal.
            if (foundExp)
                break hexLoop;
            else
                goto case;
        case '0': .. case '9':
        case '_':
            if (lexingSuffix)
                break hexLoop;
            take();
            break;
        case 'p':
        case 'P':
            if (foundExp || lexingSuffix)
                break hexLoop;
            // For this to be a valid exponent, the next character must be a
            // decimal digit, or a sign followed by a decimal digit.
            auto r = input.save();
            r.popFront();
            if (r.isEoF())
                break hexLoop;
            immutable bool signed = r.front == '+' || r.front == '-';
            if (signed)
                r.popFront();
            if (r.isEoF() || !isDigit(r.front))
                break hexLoop;
            take();
            // As in lexDecimal, a sign is part of the literal only directly
            // after the exponent marker.
            if (signed)
                take();
            foundExp = true;
            isDouble = true;
            token.type = TokenType.DoubleLiteral;
            break;
        case '.':
            // two dots with other characters between them, or a dot after
            // the exponent or the suffix
            if (foundDot || foundExp || lexingSuffix)
                break hexLoop;
            auto r = input.save();
            r.popFront();
            if (!r.isEoF() && r.front == '.')
                break hexLoop; // slice expression
            take();
            foundDot = true;
            isDouble = true;
            token.type = TokenType.DoubleLiteral;
            break;
        case 'u':
        case 'U':
            if (isUnsigned || isDouble)
                break hexLoop;
            take();
            lexingSuffix = true;
            if (isLong)
                token.type = TokenType.UnsignedLongLiteral;
            else
                token.type = TokenType.UnsignedIntLiteral;
            isUnsigned = true;
            break;
        case 'L':
            if (isLong || isReal)
                break hexLoop;
            take();
            lexingSuffix = true;
            if (isDouble)
            {
                token.type = TokenType.RealLiteral;
                isReal = true;
            }
            else if (isUnsigned)
            {
                token.type = TokenType.UnsignedLongLiteral;
                isLong = true;
            }
            else
            {
                token.type = TokenType.LongLiteral;
                isLong = true;
            }
            break;
        default:
            break hexLoop;
        }
    }
    token.value = to!string(app.data);
    return token;
}

unittest
{
    uint i;
    uint l;

    auto a = "0x193abfq";
    auto ar = lexNumber(a, i, l);
    assert(ar.value == "0x193abf");
    assert(ar.type == TokenType.IntLiteral);

    auto b = "0x2130xabc";
    auto br = lexNumber(b, i, l);
    assert(br.value == "0x2130");
    assert(br.type == TokenType.IntLiteral);

    auto c = "0x123..0321";
    auto cr = lexNumber(c, i, l);
    assert (cr.value == "0x123");
    assert (cr.type == TokenType.IntLiteral);

    auto d = "0xabp5";
    auto dr = lexNumber(d, i, l);
    assert (dr == "0xabp5");
    assert (dr == TokenType.DoubleLiteral);

    auto e = "0x93p+5";
    auto er = lexNumber(e, i, l);
    assert (er == "0x93p+5");
    assert (er == TokenType.DoubleLiteral);

    auto f = "0x93pp";
    auto fr = lexNumber(f, i, l);
    assert (fr == "0x93");
    assert (fr == TokenType.IntLiteral);

    auto g = "0XF..7";
    auto gr = lexNumber(g, i, l);
    assert (gr == "0XF");
    assert (gr == TokenType.IntLiteral);

    auto h = "0x8.4p100";
    auto hr = lexNumber(h, i, l);
    assert (hr == "0x8.4p100");
    assert (hr == TokenType.DoubleLiteral);

    auto j = "0x8.4.100";
    auto jr = lexNumber(j, i, l);
    assert (jr == "0x8.4");
    assert (jr == TokenType.DoubleLiteral);

    auto k = "0x1p-t";
    auto kr = lexNumber(k, i, l);
    assert (kr == "0x1");
    assert (kr == TokenType.IntLiteral);

    auto m = "0x1p-5p";
    auto mr = lexNumber(m, i, l);
    assert (mr == "0x1p-5");
    assert (mr == TokenType.DoubleLiteral);

    auto n = "0x1p-c_";
    auto nr = lexNumber(n, i, l);
    assert (nr == "0x1");
    assert (nr == TokenType.IntLiteral);

    auto o = "0x1p-1a";
    auto or = lexNumber(o, i, l);
    assert (or == "0x1p-1");
    assert (or == TokenType.DoubleLiteral);

    auto p = "0x1p-1+";
    auto pr = lexNumber(p, i, l);
    assert (pr == "0x1p-1");
    assert (pr == TokenType.DoubleLiteral);

    auto q = "0xFFuL;";
    auto qr = lexNumber(q, i, l);
    assert (qr.value == "0xFFuL");
    assert (qr.type == TokenType.UnsignedLongLiteral);

    // The exponent marker at the very end of the input.
    auto r = "0x1p";
    auto rr = lexNumber(r, i, l);
    assert (rr.value == "0x1");
    assert (r == "p");
}

/**
 * Returns: true if ch marks the ending of one token and the beginning of
 * another, false otherwise
 */
pure nothrow bool isSeparating(C)(C ch) if (isSomeChar!C)
{
    switch (ch)
    {
        case '!': .. case '/':
        case ':': .. case '@':
        case '[': .. case '^':
        case '{': .. case '~':
        case '`':
        case 0x20: // space
        case 0x09: // tab
        case 0x0a: .. case 0x0d: // newline, vertical tab, form feed, carriage return
            return true;
        default:
            return false;
    }
}

/**
 * Configure the tokenize() function
 */
enum IterationStyle
{
    /// Only include code, not whitespace or comments
    CodeOnly = 0,
    /// Includes comments
    IncludeComments = 0b01,
    /// Includes whitespace
    IncludeWhitespace = 0b10,
    /// Include everything
    Everything = IncludeComments | IncludeWhitespace
}

/**
 * Configuration of the token lexing style
 */
enum StringStyle : uint
{
    /// Escape sequences will be replaced with their equivalent characters.
    /// Quote characters will not be included
    Default = 0b0000,

    /// Escape sequences will not be processed
    NotEscaped = 0b0001,

    /// Strings will include their opening and closing quote characters as well
    /// as any prefixes or suffixes (e.g.: "abcde"w will include the 'w'
    /// character)
    // Note that this is a hexadecimal literal (16) although its siblings are
    // written in binary. It is still a distinct flag bit, and the value is
    // kept as it is because callers may depend on it.
    IncludeQuotes = 0x0010,

    /// Strings will be read exactly as they appeared in the source, including
    /// their opening and closing quote characters. Useful for syntax highlighting.
    Source = NotEscaped | IncludeQuotes,
}

/**
 * Creates a range of tokens over source code.
 * Params:
 *     range = the source code to lex
 *     iterationStyle = selects whether whitespace and comment tokens are
 *         part of the result
 *     stringStyle = selects how the values of string literal tokens are
 *         formed
 * Returns: a TokenRange positioned at the first token
 */
TokenRange!(R) byToken(R)(R range, const IterationStyle iterationStyle = IterationStyle.CodeOnly,
    const StringStyle stringStyle = StringStyle.Default)
    if (isForwardRange!(R) && isSomeChar!(ElementType!(R)))
{
    auto r = TokenRange!(R)(range);
    r.stringStyle = stringStyle;
    r.iterStyle = iterationStyle;
    r.lineNumber = 1;
    r.popFront();
    return r;
}

/**
 * An input range of tokens that lexes its source lazily. Use byToken to
 * create one.
 */
struct TokenRange(R) if (isForwardRange!(R) && isSomeChar!(ElementType!(R)))
{
    this(ref R range)
    {
        this.range = range;
    }

    /// Returns: true once every token has been consumed
    bool empty() @property
    {
        return _empty;
    }

    /// Returns: the current token. Throws if the range is empty.
    Token front() const @property
    {
        enforce(!_empty, "Cannot call front() on empty token range");
        return current;
    }

    /**
     * Lexes the next token that matches the iteration style and makes it the
     * front of the range. The range becomes empty when the source is
     * exhausted.
     * Returns: the token that was at the front before the call
     */
    Token popFront()
    {
        Token c = current;
        while (true)
        {
            // Checked on every round: the source may end directly after
            // skipped whitespace or a skipped comment.
            if (range.isEoF())
            {
                _empty = true;
                return c;
            }

            current = Token.init;
            current.lineNumber = lineNumber;
            current.startIndex = index;

            if (std.uni.isWhite(range.front))
            {
                current = lexWhitespace(range, index, lineNumber);
                if (iterStyle & IterationStyle.IncludeWhitespace)
                    return c;
                continue;
            }

            outer: switch (range.front)
            {
            mixin(generateCaseTrie(
                "=",    "TokenType.Assign",
                "&",    "TokenType.BitAnd",
                "&=",   "TokenType.BitAndEquals",
                "|",    "TokenType.BitOr",
                "|=",   "TokenType.BitOrEquals",
                "~=",   "TokenType.CatEquals",
                ":",    "TokenType.Colon",
                ",",    "TokenType.Comma",
                "$",    "TokenType.Dollar",
                ".",    "TokenType.Dot",
                "==",   "TokenType.Equals",
                "=>",   "TokenType.GoesTo",
                ">",    "TokenType.Greater",
                ">=",   "TokenType.GreaterEqual",
                "#",    "TokenType.Hash",
                "&&",   "TokenType.LogicAnd",
                "{",    "TokenType.LBrace",
                "[",    "TokenType.LBracket",
                "<",    "TokenType.Less",
                "<=",   "TokenType.LessEqual",
                "<>=",  "TokenType.LessEqualGreater",
                "<>",   "TokenType.LessOrGreater",
                "||",   "TokenType.LogicOr",
                "(",    "TokenType.LParen",
                "-",    "TokenType.Minus",
                "-=",   "TokenType.MinusEquals",
                "%",    "TokenType.Mod",
                "%=",   "TokenType.ModEquals",
                "*=",   "TokenType.MulEquals",
                "!",    "TokenType.Not",
                "!=",   "TokenType.NotEquals",
                "!>",   "TokenType.NotGreater",
                "!>=",  "TokenType.NotGreaterEqual",
                "!<",   "TokenType.NotLess",
                "!<=",  "TokenType.NotLessEqual",
                "!<>",  "TokenType.NotLessEqualGreater",
                "+",    "TokenType.Plus",
                "+=",   "TokenType.PlusEquals",
                "^^",   "TokenType.Pow",
                "^^=",  "TokenType.PowEquals",
                "}",    "TokenType.RBrace",
                "]",    "TokenType.RBracket",
                ")",    "TokenType.RParen",
                ";",    "TokenType.Semicolon",
                "<<",   "TokenType.ShiftLeft",
                "<<=",  "TokenType.ShiftLeftEqual",
                ">>",   "TokenType.ShiftRight",
                ">>=",  "TokenType.ShiftRightEqual",
                "..",   "TokenType.Slice",
                "*",    "TokenType.Star",
                "?",    "TokenType.Ternary",
                "~",    "TokenType.Tilde",
                "--",   "TokenType.Decrement",
                "!<>=", "TokenType.Unordered",
                ">>>",  "TokenType.UnsignedShiftRight",
                ">>>=", "TokenType.UnsignedShiftRightEqual",
                "++",   "TokenType.Increment",
                "...",  "TokenType.Vararg",
                "^",    "TokenType.Xor",
                "^=",   "TokenType.XorEquals",
                "@",    "TokenType.At",
            ));
            case '0': .. case '9':
                current = lexNumber(range, index, lineNumber);
                break;
            case '\'':
            case '"':
            case '`':
                current = lexString(range, index, lineNumber, stringStyle);
                break;
            case 'q':
                // TODO: token strings (q{...}) and delimited strings are not
                // lexed as literals yet. The 'q' is lexed as an identifier
                // and the rest as ordinary tokens, which at least guarantees
                // that the lexer makes progress.
                goto default;
            case '/':
                auto r = range.save();
                r.popFront();
                if (!r.isEoF() && (r.front == '/' || r.front == '*'
                    || r.front == '+'))
                {
                    current = lexComment(range, index, lineNumber);
                    break;
                }
                // The operator has to be consumed here. Leaving it in the
                // source would produce the same token forever.
                current.type = TokenType.Div;
                current.value = "/";
                range.popFront();
                ++index;
                if (!range.isEoF() && range.front == '=')
                {
                    current.type = TokenType.DivEquals;
                    current.value = "/=";
                    range.popFront();
                    ++index;
                }
                break;
            case 'r':
                auto r = range.save();
                r.popFront();
                if (!r.isEoF() && r.front == '"')
                {
                    // lexString recognizes the 'r' prefix and copies the
                    // content verbatim, so the configured style can be
                    // passed on unchanged.
                    current = lexString(range, index, lineNumber, stringStyle);
                    break;
                }
                else
                    goto default;
            case 'x':
                auto r = range.save();
                r.popFront();
                if (!r.isEoF() && r.front == '"')
                {
                    current = lexHexString(range, index, lineNumber,
                        stringStyle);
                    break;
                }
                else
                    goto default;
            default:
                auto app = appender!(ElementType!(R)[])();
                while(!range.isEoF() && !isSeparating(range.front))
                {
                    app.put(range.front);
                    range.popFront();
                    ++index;
                }
                if (app.data.length == 0)
                {
                    // A separating character that no other case handles,
                    // such as a stray backslash. Consume it so that the
                    // lexer makes progress.
                    app.put(range.front);
                    range.popFront();
                    ++index;
                }
                current.value = to!string(app.data);
                current.type = lookupTokenTypeOptimized(current.value);
                break;
            }

            if (current.type == TokenType.Comment
                && !(iterStyle & IterationStyle.IncludeComments))
            {
                continue;
            }
            return c;
        }
    }

private:
    Token current;
    uint lineNumber;
    uint index;
    R range;
    bool _empty;
    IterationStyle iterStyle;
    StringStyle stringStyle;
}

unittest
{
    auto c = "rust r\"\\ntest\" r`eh?`";
    string[] values;
    foreach (t; byToken(c))
        values ~= t.value;
    assert (values == ["rust", "\\ntest", "r", "eh?"]);
}

unittest
{
    // Trailing whitespace must end the range instead of reading past the end
    // of the source.
    auto source = "abc  ";
    string[] values;
    foreach (t; byToken(source))
        values ~= t.value;
    assert (values == ["abc"]);
}

unittest
{
    // Division operators are consumed, and comments are only reported when
    // they were asked for.
    auto source = "a / b /= c // done";
    string[] code;
    foreach (t; byToken(source))
        code ~= t.value;
    assert (code == ["a", "/", "b", "/=", "c"]);

    string[] withComments;
    foreach (t; byToken(source, IterationStyle.IncludeComments))
        withComments ~= t.value;
    assert (withComments == ["a", "/", "b", "/=", "c", "// done"]);
}