mirror of https://github.com/adamdruppe/arsd.git
New htmlEntitiesDecode implementation: saves a HUGE amount of time on my test; hopefully it is not a regression
This commit is contained in:
parent
5e6d74e4a9
commit
022c57b720
31
dom.d
31
dom.d
|
@ -3873,56 +3873,57 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
char[4] buffer;
|
char[4] buffer;
|
||||||
|
|
||||||
bool tryingEntity = false;
|
bool tryingEntity = false;
|
||||||
dchar[] entityBeingTried;
|
dchar[16] entityBeingTried;
|
||||||
|
int entityBeingTriedLength = 0;
|
||||||
int entityAttemptIndex = 0;
|
int entityAttemptIndex = 0;
|
||||||
|
|
||||||
foreach(dchar ch; data) {
|
foreach(dchar ch; data) {
|
||||||
if(tryingEntity) {
|
if(tryingEntity) {
|
||||||
entityAttemptIndex++;
|
entityAttemptIndex++;
|
||||||
entityBeingTried ~= ch;
|
entityBeingTried[entityBeingTriedLength++] = ch;
|
||||||
|
|
||||||
// I saw some crappy html in the wild that looked like &0&#1111; this tries to handle that.
|
// I saw some crappy html in the wild that looked like &0&#1111; this tries to handle that.
|
||||||
if(ch == '&') {
|
if(ch == '&') {
|
||||||
if(strict)
|
if(strict)
|
||||||
throw new Exception("unterminated entity; & inside another at " ~ to!string(entityBeingTried));
|
throw new Exception("unterminated entity; & inside another at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
|
|
||||||
// if not strict, let's try to parse both.
|
// if not strict, let's try to parse both.
|
||||||
|
|
||||||
if(entityBeingTried == "&&")
|
if(entityBeingTried[0 .. entityBeingTriedLength] == "&&")
|
||||||
a ~= "&"; // double amp means keep the first one, still try to parse the next one
|
a ~= "&"; // double amp means keep the first one, still try to parse the next one
|
||||||
else
|
else
|
||||||
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried))];
|
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
|
||||||
|
|
||||||
// tryingEntity is still true
|
// tryingEntity is still true
|
||||||
entityBeingTried = entityBeingTried[0 .. 1]; // keep the &
|
entityBeingTriedLength = 1;
|
||||||
entityAttemptIndex = 0; // restarting on this
|
entityAttemptIndex = 0; // restarting on this
|
||||||
} else
|
} else
|
||||||
if(ch == ';') {
|
if(ch == ';') {
|
||||||
tryingEntity = false;
|
tryingEntity = false;
|
||||||
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried))];
|
a ~= buffer[0.. std.utf.encode(buffer, parseEntity(entityBeingTried[0 .. entityBeingTriedLength]))];
|
||||||
} else if(ch == ' ') {
|
} else if(ch == ' ') {
|
||||||
// e.g. you & i
|
// e.g. you & i
|
||||||
if(strict)
|
if(strict)
|
||||||
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried));
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
else {
|
else {
|
||||||
tryingEntity = false;
|
tryingEntity = false;
|
||||||
a ~= to!(char[])(entityBeingTried);
|
a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength]);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if(entityAttemptIndex >= 9) {
|
if(entityAttemptIndex >= 9) {
|
||||||
if(strict)
|
if(strict)
|
||||||
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried));
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
else {
|
else {
|
||||||
tryingEntity = false;
|
tryingEntity = false;
|
||||||
a ~= to!(char[])(entityBeingTried);
|
a ~= to!(char[])(entityBeingTried[0 .. entityBeingTriedLength]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if(ch == '&') {
|
if(ch == '&') {
|
||||||
tryingEntity = true;
|
tryingEntity = true;
|
||||||
entityBeingTried = null;
|
entityBeingTriedLength = 0;
|
||||||
entityBeingTried ~= ch;
|
entityBeingTried[entityBeingTriedLength++] = ch;
|
||||||
entityAttemptIndex = 0;
|
entityAttemptIndex = 0;
|
||||||
} else {
|
} else {
|
||||||
a ~= buffer[0 .. std.utf.encode(buffer, ch)];
|
a ~= buffer[0 .. std.utf.encode(buffer, ch)];
|
||||||
|
@ -3932,10 +3933,10 @@ string htmlEntitiesDecode(string data, bool strict = false) {
|
||||||
|
|
||||||
if(tryingEntity) {
|
if(tryingEntity) {
|
||||||
if(strict)
|
if(strict)
|
||||||
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried));
|
throw new Exception("unterminated entity at " ~ to!string(entityBeingTried[0 .. entityBeingTriedLength]));
|
||||||
|
|
||||||
// otherwise, let's try to recover, at least so we don't drop any data
|
// otherwise, let's try to recover, at least so we don't drop any data
|
||||||
a ~= to!string(entityBeingTried);
|
a ~= to!string(entityBeingTried[0 .. entityBeingTriedLength]);
|
||||||
// FIXME: what if we have "cool &"? should we try to parse it?
|
// FIXME: what if we have "cool &"? should we try to parse it?
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue