From ca33a71074c39f3fe74c83fbfad9aa550d39cce7 Mon Sep 17 00:00:00 2001 From: Hackerpilot Date: Tue, 15 Jan 2013 01:55:32 +0000 Subject: [PATCH] more tokenizer work --- circularbuffer.d | 1 - entities.d | 5 +- tokenizer.d | 138 ++++++++++++++++++++++++++++++++++------------- 3 files changed, 105 insertions(+), 39 deletions(-) diff --git a/circularbuffer.d b/circularbuffer.d index 9b7226f..714720a 100644 --- a/circularbuffer.d +++ b/circularbuffer.d @@ -1,4 +1,3 @@ - // Copyright Brian Schott (Sir Alaran) 2012. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at diff --git a/entities.d b/entities.d index 80c55db..d59b2be 100644 --- a/entities.d +++ b/entities.d @@ -1,4 +1,7 @@ -module entities; +// Copyright Brian Schott (Sir Alaran) 2012. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt)module entities; /** * Generated from $(LINK http://www.w3.org/TR/html5/entities.json) diff --git a/tokenizer.d b/tokenizer.d index fdb9b5c..0ef0394 100644 --- a/tokenizer.d +++ b/tokenizer.d @@ -1,4 +1,3 @@ - // Copyright Brian Schott (Sir Alaran) 2012. // Distributed under the Boost Software License, Version 1.0. // (See accompanying file LICENSE_1_0.txt or copy at @@ -13,9 +12,12 @@ import std.algorithm; import std.conv; import std.uni; import std.stdio; +import std.ascii; +import std.format; import langutils; import codegen; +import entities; pure bool isNewline(R)(R range) { @@ -28,7 +30,7 @@ pure bool isEoF(R)(R range) } char[] popNewline(R)(ref R range) - { +{ char[] chars; if (range.front == '\r') { @@ -56,7 +58,7 @@ unittest string lexWhitespace(R)(ref R range, ref uint lineNumber) { auto app = appender!(char[])(); - while (!isEoF(range) && isWhite(range.front)) + while (!isEoF(range) && std.uni.isWhite(range.front)) { if (isNewline(range)) { @@ -210,6 +212,37 @@ unittest assert (lineNumber == 2); } +/** + * Pops up to upTo hex chars from the input range and returns them as a string + */ +string popHexChars(R)(ref R input, uint upTo) +{ + auto app = appender!(char[])(); + for (uint i = 0; i != upTo; ++i) + { + if (isHexDigit(input.front)) + { + app.put(input.front); + input.popFront; + } + else + break; + } + return to!string(app.data); +} + +unittest +{ + auto a = "124ac82d3fqwerty"; + auto ra = popHexChars(a, uint.max); + assert (a == "qwerty"); + assert (ra == "124ac82d3f"); + auto b = "08a7c2e3"; + auto rb = popHexChars(b, 4); + assert (rb.length == 4); + assert (rb == "08a7"); + assert (b == "c2e3"); +} string interpretEscapeSequence(R)(ref R input) in @@ -219,43 +252,75 @@ in body { input.popFront(); - auto app = appender!(char[])(); - loop: while (!isEoF(input)) + switch (input.front) { - switch (input.front) + case '\'': + case '\"': + case '?': + case '\\': + case 0: + case 0x1a: + auto f = input.front; + input.popFront(); + return to!string(f); + case 'a': input.popFront(); return "\a"; + case 'b': input.popFront(); return "\b"; + case 'f': input.popFront(); return "\f"; + case 'n': input.popFront(); return "\n"; + case 'r': input.popFront(); return "\r"; + case 't': input.popFront(); return "\t"; + case 'v': input.popFront(); return "\v"; + case 'x': + input.popFront(); + auto hexChars = popHexChars(input, 2); + return to!string(cast(dchar) parse!uint(hexChars, 16)); + case '0': .. case '7': + return ""; + case 'u': + input.popFront(); + auto hexChars = popHexChars(input, 4); + return to!string(cast(dchar) parse!uint(hexChars, 16)); + case 'U': + input.popFront(); + auto hexChars = popHexChars(input, 8); + return to!string(cast(dchar) parse!uint(hexChars, 16)); + case '&': + input.popFront(); + auto entity = appender!(char[])(); + while (!input.isEoF() && input.front != ';') { - case '\'': - case '\"': - case '?': - case '\\': - case 0: - case 0x1a: - app.put(input.front); + entity.put(input.front); input.popFront(); - break loop; - case 'a': input.popFront(); app.put('\a'); break loop; - case 'b': input.popFront(); app.put('\b'); break loop; - case 'f': input.popFront(); app.put('\f'); break loop; - case 'n': input.popFront(); app.put('\n'); break loop; - case 'r': input.popFront(); app.put('\r'); break loop; - case 't': input.popFront(); app.put('\t'); break loop; - case 'v': input.popFront(); app.put('\v'); break loop; - case 'x': - break; - case '0' .. case '7': - break; - case 'u': - break; - case 'U': - break; - case '&': - // http://www.w3.org/TR/html5/entities.json - default: - // This is an error - break; } + if (!isEoF(input)) + { + auto decoded = characterEntities[to!string(entity.data)]; + input.popFront(); + if (decoded !is null) + return decoded; + } + return ""; + default: + // This is an error + return ""; + } - return app.data; +} + +unittest +{ + auto a = "\\&"; + assert (interpretEscapeSequence(a) == x"0026"); + auto b = "\\𝔞"; + assert (interpretEscapeSequence(b) == x"D835DD1E"); + auto c = "\\n"; + assert (interpretEscapeSequence(c) == "\n"); + auto d = "\\?"; + assert (interpretEscapeSequence(d) == "?"); + auto e = "\\u0033"; + assert (interpretEscapeSequence(e) == "\u0033"); + auto f = "\\U00000094"; + assert (interpretEscapeSequence(f) == "\U00000094"); } /** @@ -279,8 +344,7 @@ body auto app = appender!(char[])(); while (!isEoF(input) && (input.front != quote || escape)) { - if (canEscape && ) - else if (isNewline(input)) + if (isNewline(input)) { app.put(popNewline(input)); lineNumber++;