New htmlEntitiesDecode: saves HUGE time on my test; hopefully it is not a regression

This commit is contained in:
Adam D. Ruppe 2017-03-03 10:59:26 -05:00
parent 5e6d74e4a9
commit 022c57b720
1 changed file with 16 additions and 15 deletions

31
dom.d
View File

@ -3873,56 +3873,57 @@ string htmlEntitiesDecode(string data, bool strict = false) {
char[4] buffer; char[4] buffer;
bool tryingEntity = false; bool tryingEntity = false;
dchar[] entityBeingTried; dchar[16] entityBeingTried;
int entityBeingTriedLength = 0;
int entityAttemptIndex = 0; int entityAttemptIndex = 0;
foreach(dchar ch; data) { foreach(dchar ch; data) {
if(tryingEntity) { if(tryingEntity) {
entityAttemptIndex++; entityAttemptIndex++;
entityBeingTried ~= ch; entityBeingTried[entityBeingTriedLength++] = ch;
// I saw some crappy html in the wild that looked like &0ї this tries to handle that. // I saw some crappy html in the wild that looked like &0ї this tries to handle that.
if(ch == '&') { if(ch == '&') {
if(strict) if(strict)
throw new Exception("unterminated entity; & inside another at " ~ to!string(entityBeingTried)); throw new Exception("unterminated entity; & inside another at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
// if not strict, let's try to parse both. // if not strict, let's try to parse both.
if(entityBeingTried == "&&") if(entityBeingTried[0 .. entityBeingTriedLength] == "&&")
a ~= "&"; // double amp means keep the first one, still try to parse the next one a ~= "&"; // double amp means keep the first one, still try to parse the next one
else else
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried))]; a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
// tryingEntity is still true // tryingEntity is still true
entityBeingTried = entityBeingTried[0 .. 1]; // keep the & entityBeingTriedLength = 1;
entityAttemptIndex = 0; // restarting o this entityAttemptIndex = 0; // restarting o this
} else } else
if(ch == ';') { if(ch == ';') {
tryingEntity = false; tryingEntity = false;
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried))]; a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
} else if(ch == ' ') { } else if(ch == ' ') {
// e.g. you &amp i // e.g. you &amp i
if(strict) if(strict)
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried)); throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
else { else {
tryingEntity = false; tryingEntity = false;
a ~= to!(char[])(entityBeingTried); a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength]);
} }
} else { } else {
if(entityAttemptIndex >= 9) { if(entityAttemptIndex >= 9) {
if(strict) if(strict)
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried)); throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
else { else {
tryingEntity = false; tryingEntity = false;
a ~= to!(char[])(entityBeingTried); a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength]);
} }
} }
} }
} else { } else {
if(ch == '&') { if(ch == '&') {
tryingEntity = true; tryingEntity = true;
entityBeingTried = null; entityBeingTriedLength = 0;
entityBeingTried ~= ch; entityBeingTried[entityBeingTriedLength++] = ch;
entityAttemptIndex = 0; entityAttemptIndex = 0;
} else { } else {
a ~= buffer[0 .. std.utf.encode(buffer, ch)]; a ~= buffer[0 .. std.utf.encode(buffer, ch)];
@ -3932,10 +3933,10 @@ string htmlEntitiesDecode(string data, bool strict = false) {
if(tryingEntity) { if(tryingEntity) {
if(strict) if(strict)
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried)); throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
// otherwise, let's try to recover, at least so we don't drop any data // otherwise, let's try to recover, at least so we don't drop any data
a ~= to!string(entityBeingTried); a ~= to!string(entityBeingTried[0 .. entityBeingTriedLength]);
// FIXME: what if we have "cool &amp"? should we try to parse it? // FIXME: what if we have "cool &amp"? should we try to parse it?
} }