more tokenizer work

This commit is contained in:
Hackerpilot 2013-01-15 01:55:32 +00:00
parent 400d9ec137
commit ca33a71074
3 changed files with 105 additions and 39 deletions

View File

@ -1,4 +1,3 @@
// Copyright Brian Schott (Sir Alaran) 2012. // Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0. // Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at // (See accompanying file LICENSE_1_0.txt or copy at

View File

@ -1,4 +1,7 @@
module entities; // Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module entities;
/** /**
* Generated from $(LINK http://www.w3.org/TR/html5/entities.json) * Generated from $(LINK http://www.w3.org/TR/html5/entities.json)

View File

@ -1,4 +1,3 @@
// Copyright Brian Schott (Sir Alaran) 2012. // Copyright Brian Schott (Sir Alaran) 2012.
// Distributed under the Boost Software License, Version 1.0. // Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at // (See accompanying file LICENSE_1_0.txt or copy at
@ -13,9 +12,12 @@ import std.algorithm;
import std.conv; import std.conv;
import std.uni; import std.uni;
import std.stdio; import std.stdio;
import std.ascii;
import std.format;
import langutils; import langutils;
import codegen; import codegen;
import entities;
pure bool isNewline(R)(R range) pure bool isNewline(R)(R range)
{ {
@ -28,7 +30,7 @@ pure bool isEoF(R)(R range)
} }
char[] popNewline(R)(ref R range) char[] popNewline(R)(ref R range)
{ {
char[] chars; char[] chars;
if (range.front == '\r') if (range.front == '\r')
{ {
@ -56,7 +58,7 @@ unittest
string lexWhitespace(R)(ref R range, ref uint lineNumber) string lexWhitespace(R)(ref R range, ref uint lineNumber)
{ {
auto app = appender!(char[])(); auto app = appender!(char[])();
while (!isEoF(range) && isWhite(range.front)) while (!isEoF(range) && std.uni.isWhite(range.front))
{ {
if (isNewline(range)) if (isNewline(range))
{ {
@ -210,6 +212,37 @@ unittest
assert (lineNumber == 2); assert (lineNumber == 2);
} }
/**
 * Pops up to upTo hex chars from the input range and returns them as a string.
 *
 * Params:
 *     input = range to consume from; it is advanced past every character
 *             that is returned
 *     upTo = maximum number of characters to pop
 *
 * Returns: the consumed hex digits, possibly an empty string. Consumption
 *     stops early at the first character that is not a hex digit, or when
 *     the input is exhausted.
 */
string popHexChars(R)(ref R input, uint upTo)
{
    auto app = appender!(char[])();
    for (uint i = 0; i != upTo; ++i)
    {
        // Test for exhaustion before touching front: reading front of an
        // empty range is an error, and the input may be shorter than upTo.
        if (input.empty || !isHexDigit(input.front))
            break;
        app.put(input.front);
        input.popFront();
    }
    return to!string(app.data);
}
// Exercises popHexChars in two situations: when a non-hex character ends the
// run, and when the upTo limit ends it.
unittest
{
    // Effectively unlimited count: consumption stops at 'q', the first
    // character that is not a hex digit, and the remainder stays in the input.
    string unbounded = "124ac82d3fqwerty";
    immutable digits = popHexChars(unbounded, uint.max);
    assert (digits == "124ac82d3f");
    assert (unbounded == "qwerty");

    // Limit of four: only the first four digits are taken even though the
    // characters that follow are hex digits as well.
    string bounded = "08a7c2e3";
    immutable firstFour = popHexChars(bounded, 4);
    assert (firstFour.length == 4);
    assert (firstFour == "08a7");
    assert (bounded == "c2e3");
}
string interpretEscapeSequence(R)(ref R input) string interpretEscapeSequence(R)(ref R input)
in in
@ -219,43 +252,75 @@ in
body body
{ {
input.popFront(); input.popFront();
auto app = appender!(char[])(); switch (input.front)
loop: while (!isEoF(input))
{ {
switch (input.front) case '\'':
case '\"':
case '?':
case '\\':
case 0:
case 0x1a:
auto f = input.front;
input.popFront();
return to!string(f);
case 'a': input.popFront(); return "\a";
case 'b': input.popFront(); return "\b";
case 'f': input.popFront(); return "\f";
case 'n': input.popFront(); return "\n";
case 'r': input.popFront(); return "\r";
case 't': input.popFront(); return "\t";
case 'v': input.popFront(); return "\v";
case 'x':
input.popFront();
auto hexChars = popHexChars(input, 2);
return to!string(cast(dchar) parse!uint(hexChars, 16));
case '0': .. case '7':
return "";
case 'u':
input.popFront();
auto hexChars = popHexChars(input, 4);
return to!string(cast(dchar) parse!uint(hexChars, 16));
case 'U':
input.popFront();
auto hexChars = popHexChars(input, 8);
return to!string(cast(dchar) parse!uint(hexChars, 16));
case '&':
input.popFront();
auto entity = appender!(char[])();
while (!input.isEoF() && input.front != ';')
{ {
case '\'': entity.put(input.front);
case '\"':
case '?':
case '\\':
case 0:
case 0x1a:
app.put(input.front);
input.popFront(); input.popFront();
break loop;
case 'a': input.popFront(); app.put('\a'); break loop;
case 'b': input.popFront(); app.put('\b'); break loop;
case 'f': input.popFront(); app.put('\f'); break loop;
case 'n': input.popFront(); app.put('\n'); break loop;
case 'r': input.popFront(); app.put('\r'); break loop;
case 't': input.popFront(); app.put('\t'); break loop;
case 'v': input.popFront(); app.put('\v'); break loop;
case 'x':
break;
case '0' .. case '7':
break;
case 'u':
break;
case 'U':
break;
case '&':
// http://www.w3.org/TR/html5/entities.json
default:
// This is an error
break;
} }
if (!isEoF(input))
{
auto decoded = characterEntities[to!string(entity.data)];
input.popFront();
if (decoded !is null)
return decoded;
}
return "";
default:
// This is an error
return "";
} }
return app.data; }
unittest
{
auto a = "\\&amp;";
assert (interpretEscapeSequence(a) == x"0026");
auto b = "\\&afr;";
assert (interpretEscapeSequence(b) == x"D835DD1E");
auto c = "\\n";
assert (interpretEscapeSequence(c) == "\n");
auto d = "\\?";
assert (interpretEscapeSequence(d) == "?");
auto e = "\\u0033";
assert (interpretEscapeSequence(e) == "\u0033");
auto f = "\\U00000094";
assert (interpretEscapeSequence(f) == "\U00000094");
} }
/** /**
@ -279,8 +344,7 @@ body
auto app = appender!(char[])(); auto app = appender!(char[])();
while (!isEoF(input) && (input.front != quote || escape)) while (!isEoF(input) && (input.front != quote || escape))
{ {
if (canEscape && ) if (isNewline(input))
else if (isNewline(input))
{ {
app.put(popNewline(input)); app.put(popNewline(input));
lineNumber++; lineNumber++;