mirror of https://github.com/adamdruppe/arsd.git
<p><p> bug fixed in tag soup parser. I think that was the last major bug.
This commit is contained in:
parent
73821aad29
commit
f89f3d3e41
97
dom.d
97
dom.d
|
@ -11,45 +11,13 @@ import std.stdio;
|
||||||
|
|
||||||
import arsd.characterencodings;
|
import arsd.characterencodings;
|
||||||
|
|
||||||
/+
|
// tag soup works for most the crap I know now! If you have two bad closing tags back to back, it might erase one, but meh
|
||||||
/* *
|
// that's rarer than the flipped closing tags that hack fixes so I'm ok with it. (Odds are it should be erased anyway; it's
|
||||||
Does a printf into an html format string.
|
// most likely a typo so I say kill kill kill.
|
||||||
|
|
||||||
eprintf(div, "<span>%s</span> is awesome. %d.",
|
|
||||||
"Adam", 10);
|
|
||||||
*/
|
|
||||||
|
|
||||||
// is this even a useful idea now that I have add children and such?
|
|
||||||
void eprintf(T...)(Element parent, string format, T data) {
|
|
||||||
|
|
||||||
}
|
|
||||||
+/
|
|
||||||
|
|
||||||
// Biggest (known) fixme left for "tag soup": <p> .... <p> in loose mode should close it on the second opening.
|
|
||||||
|
|
||||||
// Should I support Element.dataset? it does dash to camelcase for attribute "data-xxx-xxx"
|
// Should I support Element.dataset? it does dash to camelcase for attribute "data-xxx-xxx"
|
||||||
|
|
||||||
/*
|
|
||||||
To pwn haml, it might be interesting to add a
|
|
||||||
|
|
||||||
getElementBySelectorAndMakeIfNotThere
|
|
||||||
|
|
||||||
It first does querySelector. If null, find the path that was closest to matching using
|
|
||||||
the weight rules or the left to right reading, whatever gets close.
|
|
||||||
|
|
||||||
Then make the elements so it works and return the first matching element.
|
|
||||||
|
|
||||||
|
|
||||||
virtual Element setMainPart() {} // usually does innertext but can be overridden by certain elements
|
|
||||||
|
|
||||||
|
|
||||||
The haml converter produces a mixin string that does getElementBySelectorAndMakeIfNotThere and calls
|
|
||||||
setMainPart on it. boom.
|
|
||||||
|
|
||||||
|
|
||||||
but meh
|
|
||||||
*/
|
|
||||||
|
|
||||||
void sanitizeHtml(Document document) {
|
void sanitizeHtml(Document document) {
|
||||||
foreach(e; document.root.tree) {
|
foreach(e; document.root.tree) {
|
||||||
|
|
||||||
|
@ -2636,6 +2604,7 @@ class Document : FileResource {
|
||||||
loose = !caseSensitive;
|
loose = !caseSensitive;
|
||||||
|
|
||||||
bool sawImproperNesting = false;
|
bool sawImproperNesting = false;
|
||||||
|
bool paragraphHackfixRequired = false;
|
||||||
|
|
||||||
int getLineNumber(sizediff_t p) {
|
int getLineNumber(sizediff_t p) {
|
||||||
int line = 1;
|
int line = 1;
|
||||||
|
@ -2762,6 +2731,8 @@ class Document : FileResource {
|
||||||
// I don't care about these, so I just want to skip them
|
// I don't care about these, so I just want to skip them
|
||||||
case '!': // might be a comment, a doctype, or a special instruction
|
case '!': // might be a comment, a doctype, or a special instruction
|
||||||
pos++;
|
pos++;
|
||||||
|
// FIXME: we should store these in the tree too
|
||||||
|
// though I like having it stripped out tbh.
|
||||||
if(data[pos] == '-' && data[pos+1] == '-') {
|
if(data[pos] == '-' && data[pos+1] == '-') {
|
||||||
// comment
|
// comment
|
||||||
pos += 2;
|
pos += 2;
|
||||||
|
@ -2793,6 +2764,9 @@ class Document : FileResource {
|
||||||
while(data[pos] != end)
|
while(data[pos] != end)
|
||||||
pos++;
|
pos++;
|
||||||
pos++; // skip the end
|
pos++; // skip the end
|
||||||
|
|
||||||
|
// FIXME: we should actually store this somewhere
|
||||||
|
// though I like having it stripped out as well tbh.
|
||||||
if(data[pos] == '>')
|
if(data[pos] == '>')
|
||||||
pos++;
|
pos++;
|
||||||
else
|
else
|
||||||
|
@ -2853,7 +2827,7 @@ class Document : FileResource {
|
||||||
e.parseAttributes();
|
e.parseAttributes();
|
||||||
|
|
||||||
|
|
||||||
// HACK to handle script as a CDATA section
|
// HACK to handle script and style as a raw data section as it is in HTML browsers
|
||||||
if(tagName == "script" || tagName == "style") {
|
if(tagName == "script" || tagName == "style") {
|
||||||
string closer = "</" ~ tagName ~ ">";
|
string closer = "</" ~ tagName ~ ">";
|
||||||
auto ending = indexOf(data[pos..$], closer);
|
auto ending = indexOf(data[pos..$], closer);
|
||||||
|
@ -2869,6 +2843,15 @@ class Document : FileResource {
|
||||||
|
|
||||||
bool closed = selfClosed;
|
bool closed = selfClosed;
|
||||||
|
|
||||||
|
void considerHtmlParagraphHack(Element n) {
|
||||||
|
assert(!strict);
|
||||||
|
if(e.tagName == "p" && e.tagName == n.tagName) {
|
||||||
|
// html lets you write <p> para 1 <p> para 1
|
||||||
|
// but in the dom tree, they should be siblings, not children.
|
||||||
|
paragraphHackfixRequired = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//writef("<%s>", tagName);
|
//writef("<%s>", tagName);
|
||||||
while(!closed) {
|
while(!closed) {
|
||||||
Ele n;
|
Ele n;
|
||||||
|
@ -2881,6 +2864,8 @@ class Document : FileResource {
|
||||||
|
|
||||||
|
|
||||||
if(n.type == 0) {
|
if(n.type == 0) {
|
||||||
|
if(!strict)
|
||||||
|
considerHtmlParagraphHack(n.element);
|
||||||
e.appendChild(n.element);
|
e.appendChild(n.element);
|
||||||
} else if(n.type == 1) {
|
} else if(n.type == 1) {
|
||||||
bool found = false;
|
bool found = false;
|
||||||
|
@ -2891,6 +2876,8 @@ class Document : FileResource {
|
||||||
sawImproperNesting = true;
|
sawImproperNesting = true;
|
||||||
// this is so we don't drop several levels of awful markup
|
// this is so we don't drop several levels of awful markup
|
||||||
if(n.element) {
|
if(n.element) {
|
||||||
|
if(!strict)
|
||||||
|
considerHtmlParagraphHack(n.element);
|
||||||
e.appendChild(n.element);
|
e.appendChild(n.element);
|
||||||
n.element = null;
|
n.element = null;
|
||||||
}
|
}
|
||||||
|
@ -2917,9 +2904,12 @@ class Document : FileResource {
|
||||||
e.appendChild(TextNode.fromUndecodedString(this, "</"~n.payload~">"));
|
e.appendChild(TextNode.fromUndecodedString(this, "</"~n.payload~">"));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if(n.element)
|
if(n.element) {
|
||||||
|
if(!strict)
|
||||||
|
considerHtmlParagraphHack(n.element);
|
||||||
e.appendChild(n.element);
|
e.appendChild(n.element);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if(n.payload == tagName) // in strict mode, this is always true
|
if(n.payload == tagName) // in strict mode, this is always true
|
||||||
closed = true;
|
closed = true;
|
||||||
|
@ -2984,25 +2974,24 @@ class Document : FileResource {
|
||||||
parse(`<html><head></head><body></body></html>`); // fill in a dummy document in loose mode since that's what browsers do
|
parse(`<html><head></head><body></body></html>`); // fill in a dummy document in loose mode since that's what browsers do
|
||||||
}
|
}
|
||||||
|
|
||||||
if(0&&sawImproperNesting) {
|
if(paragraphHackfixRequired) {
|
||||||
// in loose mode, we can see some bad nesting. It's hard to fix above though
|
assert(!strict); // this should never happen in strict mode; it ought to never set the hack flag...
|
||||||
// because my code sucks. So, we'll fix it here.
|
|
||||||
|
|
||||||
fixing_p_again:
|
// in loose mode, we can see some "bad" nesting (it's valid html, but poorly formed xml).
|
||||||
foreach(ele; root.getElementsBySelector("p > p")) {
|
// It's hard to handle above though because my code sucks. So, we'll fix it here.
|
||||||
auto holder = ele.parentNode.parentNode;
|
|
||||||
auto h2 = ele.parentNode;
|
|
||||||
|
|
||||||
if(holder is null || h2 is null)
|
auto iterator = root.tree;
|
||||||
|
foreach(ele; iterator) {
|
||||||
|
if(ele.parentNode is null)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
ele = ele.parentNode.removeChild(ele);
|
if(ele.tagName == "p" && ele.parentNode.tagName == ele.tagName) {
|
||||||
|
auto shouldBePreviousSibling = ele.parentNode;
|
||||||
holder.insertAfter(h2, ele);
|
auto holder = shouldBePreviousSibling.parentNode; // this is the two element's mutual holder...
|
||||||
|
holder.insertAfter(shouldBePreviousSibling, ele.removeFromTree);
|
||||||
goto fixing_p_again;
|
iterator.currentKilled(); // the current branch can be skipped; we'll hit it soon anyway since it's now next up.
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3209,7 +3198,11 @@ class Document : FileResource {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static string[] selfClosedElements = [ "img", "hr", "input", "br", "col", "link", "meta" ];
|
private static string[] selfClosedElements = [
|
||||||
|
// html 4
|
||||||
|
"img", "hr", "input", "br", "col", "link", "meta",
|
||||||
|
// html 5
|
||||||
|
"source" ];
|
||||||
|
|
||||||
static import std.conv;
|
static import std.conv;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue