mirror of https://github.com/adamdruppe/arsd.git
more sloppy entity error recovery
This commit is contained in:
parent
40a5854003
commit
db857a4470
80
dom.d
80
dom.d
|
@ -4305,6 +4305,9 @@ dchar parseEntity(in dchar[] entity) {
|
||||||
while(decimal.length && (decimal[0] < '0' || decimal[0] > '9'))
|
while(decimal.length && (decimal[0] < '0' || decimal[0] > '9'))
|
||||||
decimal = decimal[1 .. $];
|
decimal = decimal[1 .. $];
|
||||||
|
|
||||||
|
while(decimal.length && (decimal[$-1] < '0' || decimal[$-1] > '9'))
|
||||||
|
decimal = decimal[0 .. $ - 1];
|
||||||
|
|
||||||
if(decimal.length == 0)
|
if(decimal.length == 0)
|
||||||
return ' '; // this is really broken html
|
return ' '; // this is really broken html
|
||||||
// done with dealing with broken stuff
|
// done with dealing with broken stuff
|
||||||
|
@ -4357,6 +4360,8 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
char[4] buffer;
|
char[4] buffer;
|
||||||
|
|
||||||
bool tryingEntity = false;
|
bool tryingEntity = false;
|
||||||
|
bool tryingNumericEntity = false;
|
||||||
|
bool tryingHexEntity = false;
|
||||||
dchar[16] entityBeingTried;
|
dchar[16] entityBeingTried;
|
||||||
int entityBeingTriedLength = 0;
|
int entityBeingTriedLength = 0;
|
||||||
int entityAttemptIndex = 0;
|
int entityAttemptIndex = 0;
|
||||||
|
@ -4366,6 +4371,14 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
entityAttemptIndex++;
|
entityAttemptIndex++;
|
||||||
entityBeingTried[entityBeingTriedLength++] = ch;
|
entityBeingTried[entityBeingTriedLength++] = ch;
|
||||||
|
|
||||||
|
if(entityBeingTriedLength == 2 && ch == '#') {
|
||||||
|
tryingNumericEntity = true;
|
||||||
|
continue;
|
||||||
|
} else if(tryingNumericEntity && entityBeingTriedLength == 3 && ch == 'x') {
|
||||||
|
tryingHexEntity = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
// I saw some crappy html in the wild that looked like &0&#1111; this tries to handle that.
|
// I saw some crappy html in the wild that looked like &0&#1111; this tries to handle that.
|
||||||
if(ch == '&') {
|
if(ch == '&') {
|
||||||
if(strict)
|
if(strict)
|
||||||
|
@ -4373,14 +4386,21 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
|
|
||||||
// if not strict, let's try to parse both.
|
// if not strict, let's try to parse both.
|
||||||
|
|
||||||
if(entityBeingTried[0 .. entityBeingTriedLength] == "&&")
|
if(entityBeingTried[0 .. entityBeingTriedLength] == "&&") {
|
||||||
a ~= "&"; // double amp means keep the first one, still try to parse the next one
|
a ~= "&"; // double amp means keep the first one, still try to parse the next one
|
||||||
else
|
} else {
|
||||||
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
|
auto ch2 = parseEntity(entityBeingTried[0 .. entityBeingTriedLength]);
|
||||||
|
if(ch2 == '\ufffd') { // either someone put this in intentionally (lol) or we failed to get it
|
||||||
|
// but either way, just abort and keep the plain text
|
||||||
|
foreach(char c; entityBeingTried[0 .. entityBeingTriedLength - 1]) // cut off the & we're on now
|
||||||
|
a ~= c;
|
||||||
|
} else {
|
||||||
|
a ~= buffer[0.. std.utf.encode(buffer, ch2)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// tryingEntity is still true
|
// tryingEntity is still true
|
||||||
entityBeingTriedLength = 1;
|
goto new_entity;
|
||||||
entityAttemptIndex = 0; // restarting o this
|
|
||||||
} else
|
} else
|
||||||
if(ch == ';') {
|
if(ch == ';') {
|
||||||
tryingEntity = false;
|
tryingEntity = false;
|
||||||
|
@ -4391,10 +4411,34 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
else {
|
else {
|
||||||
tryingEntity = false;
|
tryingEntity = false;
|
||||||
a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength]);
|
a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength - 1]);
|
||||||
|
a ~= buffer[0 .. std.utf.encode(buffer, ch)];
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
if(tryingNumericEntity) {
|
||||||
|
if(ch < '0' || ch > '9') {
|
||||||
|
if(tryingHexEntity) {
|
||||||
|
if(ch < 'A')
|
||||||
|
goto trouble;
|
||||||
|
if(ch > 'Z' && ch < 'a')
|
||||||
|
goto trouble;
|
||||||
|
if(ch > 'z')
|
||||||
|
goto trouble;
|
||||||
|
} else {
|
||||||
|
trouble:
|
||||||
|
if(strict)
|
||||||
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
|
tryingEntity = false;
|
||||||
|
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
|
||||||
|
a ~= ch;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
if(entityAttemptIndex >= 9) {
|
if(entityAttemptIndex >= 9) {
|
||||||
|
done:
|
||||||
if(strict)
|
if(strict)
|
||||||
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
else {
|
else {
|
||||||
|
@ -4405,7 +4449,10 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if(ch == '&') {
|
if(ch == '&') {
|
||||||
|
new_entity:
|
||||||
tryingEntity = true;
|
tryingEntity = true;
|
||||||
|
tryingNumericEntity = false;
|
||||||
|
tryingHexEntity = false;
|
||||||
entityBeingTriedLength = 0;
|
entityBeingTriedLength = 0;
|
||||||
entityBeingTried[entityBeingTriedLength++] = ch;
|
entityBeingTried[entityBeingTriedLength++] = ch;
|
||||||
entityAttemptIndex = 0;
|
entityAttemptIndex = 0;
|
||||||
|
@ -4427,6 +4474,25 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
return cast(string) a; // assumeUnique is actually kinda slow, lol
|
return cast(string) a; // assumeUnique is actually kinda slow, lol
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unittest {
|
||||||
|
// error recovery: these calls use the default (non-strict) mode, where malformed entities must be recovered from and a string returned
// NOTE(review): several inputs below equal their expected outputs and two cases share the same input, which suggests entity escapes in these literals were lost in this copy - confirm against the upstream file
|
||||||
|
assert(htmlEntitiesDecode("<&foo") == "<&foo"); // unterminated entity is turned back into plain text
|
||||||
|
assert(htmlEntitiesDecode("<&foo") == "<&foo"); // semi-terminated... parse and carry on (is this really sane?)
|
||||||
|
assert(htmlEntitiesDecode("loc=en_us&tracknum=111") == "loc=en_us&tracknum=111"); // a bit of both, seen in a real life email
|
||||||
|
assert(htmlEntitiesDecode("& test") == "& test"); // unterminated, just abort and keep the text unchanged
|
||||||
|
|
||||||
|
// in strict mode all of these should fail: each call is expected to throw an Exception, which the empty catch swallows; assert(0) marks the path where nothing was thrown
|
||||||
|
try { assert(htmlEntitiesDecode("<&foo", true) == "<&foo"); assert(0); } catch(Exception e) { }
|
||||||
|
try { assert(htmlEntitiesDecode("<&foo", true) == "<&foo"); assert(0); } catch(Exception e) { }
|
||||||
|
try { assert(htmlEntitiesDecode("loc=en_us&tracknum=111", true) == "<&foo"); assert(0); } catch(Exception e) { }
|
||||||
|
try { assert(htmlEntitiesDecode("& test", true) == "& test"); assert(0); } catch(Exception e) { }
|
||||||
|
|
||||||
|
// correct cases that should pass the same in strict or loose mode (the loop runs the same check with strict = false and strict = true)
|
||||||
|
foreach(strict; [false, true]) {
|
||||||
|
assert(htmlEntitiesDecode("&hello» win", strict) == "&hello\» win");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Group: implementations
|
/// Group: implementations
|
||||||
abstract class SpecialElement : Element {
|
abstract class SpecialElement : Element {
|
||||||
this(Document _parentDocument) {
|
this(Document _parentDocument) {
|
||||||
|
@ -5538,6 +5604,8 @@ int intFromHex(string hex) {
|
||||||
v = q - '0';
|
v = q - '0';
|
||||||
else if (q >= 'a' && q <= 'f')
|
else if (q >= 'a' && q <= 'f')
|
||||||
v = q - 'a' + 10;
|
v = q - 'a' + 10;
|
||||||
|
else if (q >= 'A' && q <= 'F')
|
||||||
|
v = q - 'A' + 10;
|
||||||
else throw new Exception("Illegal hex character: " ~ q);
|
else throw new Exception("Illegal hex character: " ~ q);
|
||||||
|
|
||||||
value += v * place;
|
value += v * place;
|
||||||
|
|
Loading…
Reference in New Issue