diff --git a/dom.d b/dom.d
index d1f88e7..c511d8a 100644
--- a/dom.d
+++ b/dom.d
@@ -1,29 +1,32 @@
-/**
+/++
This is an html DOM implementation, started with cloning
what the browser offers in Javascript, but going well beyond
it in convenience.
If you can do it in Javascript, you can probably do it with
- this module.
+ this module, and much more.
- And much more.
+ ---
+ import arsd.dom;
+ void main() {
+ auto document = new Document("<html><p>paragraph</p></html>");
+ writeln(document.querySelector("p"));
+ document.root.innerHTML = "<p>hey</p>";
+ writeln(document);
+ }
+ ---
- Note: some of the documentation here writes html with added
- spaces. That's because ddoc doesn't bother encoding html output,
- and adding spaces is easier than using LT macros everywhere.
-
-
- BTW: this file optionally depends on arsd.characterencodings, to
+ BTW: this file optionally depends on `arsd.characterencodings`, to
help it correctly read files from the internet. You should be able to
get characterencodings.d from the same place you got this file.
- If you want it to stand alone, just always use the `parseUtf8` function.
-*/
+ If you want it to stand alone, just always use the `Document.parseUtf8`
+ function or the constructor that takes a string.
++/
module arsd.dom;
-// FIXME: do parent selector picking in get selector
-// FIXME: do :has too... or instead, :has is quite nice.
+// FIXME: support the css standard namespace thing in the selectors too
version(with_arsd_jsvar)
import arsd.jsvar;
@@ -47,9 +50,6 @@ bool isConvenientAttribute(string name) {
return false;
}
-// FIXME: might be worth doing Element.attrs and taking opDispatch off that
-// so more UFCS works.
-
// FIXME: something like <ol>spam <ol> with no closing </ol> should read the second tag as the closer in garbage mode
// FIXME: failing to close a paragraph sometimes messes things up too
@@ -66,16 +66,1252 @@ bool isConvenientAttribute(string name) {
xpath.p[0].a["href"]
*/
-// public import arsd.domconvenience; // merged for now
-/* domconvenience follows { */
+/// The main document interface, including a html parser.
+class Document : FileResource {
+ /// Creates a document by parsing `data` right away.
+ ///
+ /// All three arguments are forwarded unchanged to `parseUtf8`, so `data`
+ /// is handed to the parser as-is.
+ ///
+ /// Params:
+ ///     data = the markup source to parse
+ ///     caseSensitive = forwarded to `parseUtf8` (defaults to false)
+ ///     strict = forwarded to `parseUtf8` (defaults to false)
+ this(string data, bool caseSensitive = false, bool strict = false) {
+ parseUtf8(data, caseSensitive, strict);
+ }
+
+ /**
+ Creates an empty document. It has *nothing* in it at all: the
+ constructor body is empty, so no markup is parsed here. Call one of
+ the parse functions afterwards to give it content.
+ */
+ this() {
+
+ }
+
+ /// This is just something I'm toying with. Right now, you use opIndex to put in css selectors.
+ /// It returns a struct that forwards calls to all elements it holds, and returns itself so you
+ /// can chain it.
+ ///
+ /// Example: document["p"].innerText("hello").addClass("modified");
+ ///
+ /// Equivalent to: foreach(e; document.getElementsBySelector("p")) { e.innerText("hello"); e.addClass("modified"); }
+ ///
+ /// Note: always use function calls (not property syntax) and don't use toString in there for best results.
+ ///
+ /// You can also do things like: document["p"]["b"] though tbh I'm not sure why since the selector string can do all that anyway. Maybe
+ /// you could put in some kind of custom filter function tho.
+ ///
+ /// Implementation: wraps `this.root` in an `ElementCollection` and returns the
+ /// result of indexing that collection with `selector`.
+ ElementCollection opIndex(string selector) {
+ auto e = ElementCollection(this.root);
+ return e[selector];
+ }
+
+ string _contentType = "text/html; charset=utf-8";
+
+ /// Stores the MIME type to report for this document and returns the
+ /// value now held.
+ ///
+ /// Set this if the document is some other kind of XML rather than html.
+ ///
+ /// Note: this has no impact on the function of this class.
+ /// It is only used if the document is sent via a protocol like HTTP.
+ ///
+ /// This may be called by parse() if it recognizes the data. Otherwise,
+ /// if you don't set it, it assumes text/html; charset=utf-8.
+ @property string contentType(string mimeType) {
+     return _contentType = mimeType;
+ }
+
+ /// Returns the content type currently stored in `_contentType`.
+ ///
+ /// Implements the FileResource interface, useful for sending via
+ /// http automatically.
+ override @property string contentType() const {
+ return _contentType;
+ }
+
+ /// Implements the FileResource interface; it calls toString and returns
+ /// that string cast to `immutable(ubyte)[]`.
+ override immutable(ubyte)[] getData() const {
+ return cast(immutable(ubyte)[]) this.toString();
+ }
-import std.string;
+ /// Concatenates any consecutive text nodes
+ /*
+ void normalize() {
-// the reason this is separated is so I can plug it into D->JS as well, which uses a different base Element class
+ }
+ */
-mixin template DomConvenienceFunctions() {
+ /// Points every parseSaw* callback (comment, asp code, php code, question
+ /// instruction, bang instruction) at a handler that always returns true, so
+ /// those things are added to the dom tree when the parser sees them.
+ ///
+ /// Note: this overwrites anything else you set, and setting any of the
+ /// callbacks afterwards will overwrite what this did.
+ /// Call this before calling parse().
+ ///
+ /// Note this will also preserve the prolog and doctype from the original file, if there was one.
+ void enableAddingSpecialTagsToDom() {
+     // one always-accept handler, shared by all five hooks
+     bool delegate(string) alwaysKeep = (string) => true;
+
+     parseSawBangInstruction = alwaysKeep;
+     parseSawQuestionInstruction = alwaysKeep;
+     parseSawPhpCode = alwaysKeep;
+     parseSawAspCode = alwaysKeep;
+     parseSawComment = alwaysKeep;
+ }
+
+ /// If the parser sees a html comment, it will call this callback
+ /// `<!-- comment -->` will call parseSawComment(" comment ")
+ /// Return true if you want the node appended to the document.
+ bool delegate(string) parseSawComment;
+
+ /// If the parser sees <% asp code... %>, it will call this callback.
+ /// It will be passed "% asp code... %" or "%= asp code .. %"
+ /// Return true if you want the node appended to the document.
+ bool delegate(string) parseSawAspCode;
+
+ /// If the parser sees `<?php php code... ?>`, it will call this callback.
+ /// It will be passed "?php php code... ?" or "?= asp code .. ?"
+ /// Note: dom.d cannot identify the other php `<? code ?>` short format.
+ /// Return true if you want the node appended to the document.
+ bool delegate(string) parseSawPhpCode;
+
+ /// if it sees a `<?xxx>` that is not php or asp
+ /// it calls this function with the contents.
+ /// `<?SOMETHING foo>` calls parseSawQuestionInstruction("?SOMETHING foo")
+ /// Unlike the php/asp ones, this ends on the first > it sees, without requiring ?>.
+ /// Return true if you want the node appended to the document.
+ bool delegate(string) parseSawQuestionInstruction;
+
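+ /// if it sees a `<!` (that is not CDATA or a comment), it calls this function with the contents.
+ /// `<!SOMETHING foo>` calls parseSawBangInstruction("SOMETHING foo")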
+ /// Return true if you want the node appended to the document.
+ bool delegate(string) parseSawBangInstruction;
+
+ /// Given the kind of garbage you find on the Internet, try to make sense of it.
+ /// Equivalent to document.parse(data, false, false, null);
+ /// (Case-insensitive, non-strict, determine character encoding from the data.)
+ ///
+ /// NOTE: this makes no attempt at added security.
+ ///
+ /// It is a template so it lazily imports characterencodings.
+ ///
+ /// Params:
+ ///     data = the raw document text; passed straight to `parse` with a null
+ ///            `dataEncoding`, which asks `parse` to work the encoding out itself
+ void parseGarbage()(string data) {
+ parse(data, false, false, null);
+ }
+
+ /// Parses well-formed UTF-8, case-sensitive, XML or XHTML
+ /// Will throw exceptions on things like unclosed tags.
+ ///
+ /// Implemented as `parseStream` over `toUtf8Stream(data)` with both
+ /// `caseSensitive` and `strict` set to true.
+ void parseStrict(string data) {
+ parseStream(toUtf8Stream(data), true, true);
+ }
+
+ /// Parses well-formed UTF-8 in loose mode (by default). Tries to correct
+ /// tag soup, but does NOT try to correct bad character encodings.
+ ///
+ /// They will still throw an exception.
+ ///
+ /// Implemented as `parseStream` over `toUtf8Stream(data)`; `caseSensitive`
+ /// and `strict` are forwarded unchanged and both default to false.
+ void parseUtf8(string data, bool caseSensitive = false, bool strict = false) {
+ parseStream(toUtf8Stream(data), caseSensitive, strict);
+ }
+
+ // Turns `rawdata` into the UTF-8 stream the parser consumes.
+ //
+ // rawdata      - the document bytes, in whatever encoding they arrived in
+ // dataEncoding - name of that encoding, or null to have it worked out here
+ //                (byte-level detection first, then an `encoding="` / `charset=`
+ //                declaration inside the document itself)
+ // strict       - when true, an undeterminable encoding throws MarkupException
+ //                and conversion failures propagate; when false, the fallback
+ //                is "Windows 1252"
+ //
+ // this is a template so we get lazy import behavior
+ Utf8Stream handleDataEncoding()(in string rawdata, string dataEncoding, bool strict) {
+     import arsd.characterencodings;
+
+     // True for any byte that cannot be part of an encoding name in a
+     // declaration: either kind of quote, the `;` that separates Content-Type
+     // parameters, the end of the tag, or whitespace.
+     static bool endsEncodingName(ubyte b) {
+         return b == '"' || b == '\'' || b == ';' || b == '>' || b == '/'
+             || b == ' ' || b == '\t' || b == '\r' || b == '\n';
+     }
+
+     // gotta determine the data encoding. If you know it, pass it in above to skip all this.
+     if(dataEncoding is null) {
+         dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
+         // it can't tell... probably a random 8 bit encoding. Let's check the document itself.
+         // Now, XML and HTML can both list encoding in the document, but we can't really parse
+         // it here without changing a lot of code until we know the encoding. So I'm going to
+         // do some hackish string checking.
+         if(dataEncoding is null) {
+             auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;
+             // first, look for an XML prolog
+             auto idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
+             if(idx != -1) {
+                 idx += "encoding=\"".length;
+                 // we're probably past the prolog if it's this far in; we might be looking at
+                 // content. Forget about it.
+                 if(idx > 100)
+                     idx = -1;
+             }
+             // if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
+             if(idx == -1) {
+                 idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "charset=");
+                 if(idx != -1) {
+                     idx += "charset=".length;
+                     // The value may be wrapped in either kind of quote or in none.
+                     // The bounds check covers input that ends right after "charset=".
+                     if(idx < dataAsBytes.length && (dataAsBytes[idx] == '"' || dataAsBytes[idx] == '\''))
+                         idx++;
+                 }
+             }
+
+             // found something in either branch...
+             if(idx != -1) {
+                 // read till the name ends or about 12 chars, whichever comes first...
+                 auto end = idx;
+                 while(end < dataAsBytes.length && end - idx < 12 && !endsEncodingName(dataAsBytes[end]))
+                     end++;
+
+                 // an empty name tells us nothing; leave dataEncoding null so the
+                 // "don't know" handling below takes over
+                 if(end > idx)
+                     dataEncoding = cast(string) dataAsBytes[idx .. end];
+             }
+             // otherwise, we just don't know.
+         }
+     }
+
+     if(dataEncoding is null) {
+         if(strict)
+             throw new MarkupException("I couldn't figure out the encoding of this document.");
+         else
+             // if we really don't know by here, it means we already tried UTF-8,
+             // looked for utf 16 and 32 byte order marks, and looked for xml or meta
+             // tags... let's assume it's Windows-1252, since that's probably the most
+             // common aside from utf that wouldn't be labeled.
+
+             dataEncoding = "Windows 1252";
+     }
+
+     // and now, go ahead and convert it.
+
+     string data;
+
+     if(!strict) {
+         // if we're in non-strict mode, we need to check
+         // the document for mislabeling too; sometimes
+         // web documents will say they are utf-8, but aren't
+         // actually properly encoded. If it fails to validate,
+         // we'll assume it's actually Windows encoding - the most
+         // likely candidate for mislabeled garbage.
+         dataEncoding = dataEncoding.toLower();
+         dataEncoding = dataEncoding.replace(" ", "");
+         dataEncoding = dataEncoding.replace("-", "");
+         dataEncoding = dataEncoding.replace("_", "");
+         if(dataEncoding == "utf8") {
+             try {
+                 validate(rawdata);
+             } catch(UTFException e) {
+                 dataEncoding = "Windows 1252";
+             }
+         }
+     }
+
+     // NOTE(review): in non-strict mode the name was normalized just above (e.g. to
+     // "utf8"), so this comparison only short-circuits in strict mode with the exact
+     // spelling "UTF-8"; presumably convertToUtf8 accepts the normalized names - confirm.
+     if(dataEncoding != "UTF-8") {
+         if(strict)
+             data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
+         else {
+             try {
+                 data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
+             } catch(Exception e) {
+                 // unknown or unusable encoding name: fall back to the most likely candidate
+                 data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, "Windows 1252");
+             }
+         }
+     } else
+         data = rawdata;
+
+     return toUtf8Stream(data);
+ }
+
+ // Hands back `rawdata` in the form the parser expects: the string itself
+ // when Utf8Stream is just `string`, otherwise a new Utf8Stream built from it.
+ private
+ Utf8Stream toUtf8Stream(in string rawdata) {
+     string text = rawdata;
+     static if(!is(Utf8Stream == string))
+         return new Utf8Stream(text);
+     else
+         return text;
+ }
+
+ /**
+ Take XMLish data and try to make the DOM tree out of it.
+
+ The goal isn't to be perfect, but to just be good enough to
+ approximate Javascript's behavior.
+
+ If strict, it throws on something that doesn't make sense.
+ (Examples: mismatched tags. It doesn't validate!)
+ If not strict, it tries to recover anyway, and only throws
+ when something is REALLY unworkable.
+
+ If strict is false, it uses a magic list of tags that needn't
+ be closed. If you are writing a document specifically for this,
+ try to avoid such - use self closed tags at least. Easier to parse.
+
+ The dataEncoding argument can be used to pass a specific
+ charset encoding for automatic conversion. If null (which is NOT
+ the default!), it tries to determine from the data itself,
+ using the xml prolog or meta tags. If that fails too, strict mode
+ throws a MarkupException and non-strict mode falls back to
+ Windows 1252.
+
+ If the encoding that ends up being used is wrong, it can throw on
+ non-ascii characters!
+
+
+ Note that it previously assumed the data was encoded as UTF-8, which
+ is why the dataEncoding argument defaults to that.
+
+ So it shouldn't break backward compatibility.
+
+ But, if you want the best behavior on wild data - figuring it out from the document
+ instead of assuming - you'll probably want to change that argument to null.
+
+ This is a template so it lazily imports arsd.characterencodings, which is required
+ to fix up data encodings.
+
+ If you are sure the encoding is good, try parseUtf8 or parseStrict to avoid the
+ dependency. If it is data from the Internet though, a random website, the encoding
+ is often a lie. This function, if dataEncoding == null, can correct for that, or
+ you can try parseGarbage. In those cases, arsd.characterencodings is required to
+ compile.
+
+ Implementation: `handleDataEncoding` converts rawdata to a UTF-8 stream, which
+ is then given to `parseStream` together with caseSensitive and strict.
+ */
+ void parse()(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
+ auto data = handleDataEncoding(rawdata, dataEncoding, strict);
+ parseStream(data, caseSensitive, strict);
+ }
+
+ // note: this works best in strict mode, unless data is just a simple string wrapper
+ void parseStream(Utf8Stream data, bool caseSensitive = false, bool strict = false) {
+ // FIXME: this parser could be faster; it's in the top ten biggest tree times according to the profiler
+ // of my big app.
+
+ assert(data !is null);
+
+ // go through character by character.
+ // if you see a <, consider it a tag.
+ // name goes until the first non tagname character
+ // then see if it self closes or has an attribute
+
+ // if not in a tag, anything not a tag is a big text
+ // node child. It ends as soon as it sees a <
+
+ // Whitespace in text or attributes is preserved, but not between attributes
+
+ // & and friends are converted when I know them, left the same otherwise
+
+
+ // this should already be done correctly by the caller, so I'm leaving it off to net a ~10% speed boost on my typical test file (really)
+ //validate(data); // it *must* be UTF-8 for this to work correctly
+
+ sizediff_t pos = 0;
+
+ clear();
+
+ loose = !caseSensitive;
+
+ bool sawImproperNesting = false;
+ bool paragraphHackfixRequired = false;
+
+ // Line number (starting at 1) of offset `p`: one more than the number of
+ // '\n' characters found in data[0 .. p].
+ int getLineNumber(sizediff_t p) {
+     int lineNo = 1;
+     foreach(ch; data[0..p]) {
+         if(ch != '\n')
+             continue;
+         ++lineNo;
+     }
+     return lineNo;
+ }
+
+ // Always throws: raises a MarkupException whose text is `message` prefixed
+ // with the current character offset (`pos`) and the line number of that offset.
+ void parseError(string message) {
+ throw new MarkupException(format("char %d (line %d): %s", pos, getLineNumber(pos), message));
+ }
+
+ // Moves `pos` forward past any run of spaces, newlines and tabs, stopping
+ // at the first other character or at the end of the data.
+ void eatWhitespace() {
+     for(; pos < data.length; pos++) {
+         auto ch = data[pos];
+         if(ch != ' ' && ch != '\n' && ch != '\t')
+             break;
+     }
+ }
+
+ // Reads a tag name starting at `pos` and leaves `pos` on the character that
+ // ended it. Returns the name, lower-cased unless caseSensitive is set.
+ //
+ // data[pos] is read before any bounds check, so `pos` must be inside the
+ // data when this is called.
+ string readTagName() {
+ // remember to include : for namespaces
+ // basically just keep going until >, /, or whitespace
+ auto start = pos;
+ while( data[pos] != '>' && data[pos] != '/' &&
+ data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
+ {
+ pos++;
+ // ran off the end of the data mid-name: an error in strict mode,
+ // otherwise whatever was read so far is the name
+ if(pos == data.length) {
+ if(strict)
+ throw new Exception("tag name incomplete when file ended");
+ else
+ break;
+ }
+ }
+
+ if(!caseSensitive)
+ return toLower(data[start..pos]);
+ else
+ return data[start..pos];
+ }
+
+ // Reads an attribute name starting at `pos` and leaves `pos` on the character
+ // that ended it. Returns the name, lower-cased unless caseSensitive is set.
+ //
+ // data[pos] is read before any bounds check, so `pos` must be inside the
+ // data when this is called.
+ string readAttributeName() {
+ // remember to include : for namespaces
+ // basically just keep going until >, /, =, or whitespace
+ auto start = pos;
+ while( data[pos] != '>' && data[pos] != '/' && data[pos] != '=' &&
+ data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
+ {
+ if(data[pos] == '<') {
+ if(strict)
+ throw new MarkupException("The character < can never appear in an attribute name. Line " ~ to!string(getLineNumber(pos)));
+ else
+ break; // e.g. a tag that is missing its closing > before the next tag opens. The > should have been after the href, but some files don't do that right and the browser handles it, so we will too, by pretending the > was indeed there
+ }
+ pos++;
+ // ran off the end of the data mid-name: an error in strict mode,
+ // otherwise whatever was read so far is the name
+ if(pos == data.length) {
+ if(strict)
+ throw new Exception("unterminated attribute name");
+ else
+ break;
+ }
+ }
+
+ if(!caseSensitive)
+ return toLower(data[start..pos]);
+ else
+ return data[start..pos];
+ }
+
+ // Reads an attribute value starting at `pos` and returns it with html
+ // entities decoded.
+ //
+ // Quoted values (either quote character) run to the matching quote, and `pos`
+ // ends up just past that quote. Unquoted values are rejected via parseError in
+ // strict mode; otherwise they run to whitespace, `>` or a `/>` pair, and `pos`
+ // is left on that terminator. At end of data this throws in strict mode and
+ // returns null otherwise.
+ string readAttributeValue() {
+ if(pos >= data.length) {
+ if(strict)
+ throw new Exception("no attribute value before end of file");
+ else
+ return null;
+ }
+ switch(data[pos]) {
+ case '\'':
+ case '"':
+ auto started = pos;
+ // whichever quote opened the value is the one that must close it
+ char end = data[pos];
+ pos++;
+ auto start = pos;
+ while(pos < data.length && data[pos] != end)
+ pos++;
+ if(strict && pos == data.length)
+ throw new MarkupException("Unclosed attribute value, started on char " ~ to!string(started));
+ string v = htmlEntitiesDecode(data[start..pos], strict);
+ pos++; // skip over the end
+ return v;
+ default:
+ if(strict)
+ parseError("Attributes must be quoted");
+ // read until whitespace or terminator (/ or >)
+ auto start = pos;
+ while(
+ pos < data.length &&
+ data[pos] != '>' &&
+ // unquoted attributes might be urls, so gotta be careful with them and self-closed elements
+ !(data[pos] == '/' && pos + 1 < data.length && data[pos+1] == '>') &&
+ data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
+ pos++;
+
+ string v = htmlEntitiesDecode(data[start..pos], strict);
+ // don't skip the end - we'll need it later
+ return v;
+ }
+ }
+
+ // Consumes everything from `pos` up to (not including) the next '<', or to
+ // the end of the data, and builds a TextNode from that undecoded text.
+ TextNode readTextNode() {
+     auto begin = pos;
+     for(; pos < data.length; pos++)
+         if(data[pos] == '<')
+             break;
+
+     return TextNode.fromUndecodedString(this, data[begin..pos]);
+ }
+
+ // this is obsolete!
+ // Consumes everything from `pos` up to (not including) the next '<', or to
+ // the end of the data, and wraps that text in a RawSource node.
+ RawSource readCDataNode() {
+     auto begin = pos;
+     for(; pos < data.length; pos++)
+         if(data[pos] == '<')
+             break;
+
+     return new RawSource(this, data[begin..pos]);
+ }
+
+
+ // What one readElement call produced: a type tag plus the element and/or
+ // tag name that goes with it.
+ struct Ele {
+ int type; // element or closing tag or nothing
+ /*
+ type == 0 means regular node, self-closed (element is valid)
+ type == 1 means closing tag (payload is the tag name, element may be valid)
+ type == 2 means you should ignore it completely
+ type == 3 means it is a special element that should be appended, if possible, e.g. a <!foo> that was chosen to be kept, php code, or comment. It will be appended at the current element if inside the root, and to a special document area if not
+ type == 4 means the document was totally empty
+ */
+ Element element; // for type == 0 or type == 3
+ string payload; // for type == 1
+ }
+ // recursively read a tag
+ Ele readElement(string[] parentChain = null) {
+ // FIXME: this is the slowest function in this module, by far, even in strict mode.
+ // Loose mode should perform decently, but strict mode is the important one.
+ if(!strict && parentChain is null)
+ parentChain = [];
+
+ static string[] recentAutoClosedTags;
+
+ if(pos >= data.length)
+ {
+ if(strict) {
+ throw new MarkupException("Gone over the input (is there no root element or did it never close?), chain: " ~ to!string(parentChain));
+ } else {
+ if(parentChain.length)
+ return Ele(1, null, parentChain[0]); // in loose mode, we just assume the document has ended
+ else
+ return Ele(4); // signal emptiness upstream
+ }
+ }
+
+ if(data[pos] != '<') {
+ return Ele(0, readTextNode(), null);
+ }
+
+ enforce(data[pos] == '<');
+ pos++;
+ if(pos == data.length) {
+ if(strict)
+ throw new MarkupException("Found trailing < at end of file");
+ // if not strict, we'll just skip the switch
+ } else
+ switch(data[pos]) {
+ // I don't care about these, so I just want to skip them
+ case '!': // might be a comment, a doctype, or a special instruction
+ pos++;
+
+ // FIXME: we should store these in the tree too
+ // though I like having it stripped out tbh.
+
+ if(pos == data.length) {
+ if(strict)
+ throw new MarkupException(" block.
+ // so in since that's the common way
+
+ auto commentStart = pos;
+ while(pos+3 < data.length && data[pos..pos+3] != "-->")
+ pos++;
+
+ auto end = commentStart;
+
+ if(pos + 3 >= data.length) {
+ if(strict)
+ throw new MarkupException("unclosed comment");
+ end = data.length;
+ pos = data.length;
+ } else {
+ end = pos;
+ assert(data[pos] == '-');
+ pos++;
+ assert(data[pos] == '-');
+ pos++;
+ assert(data[pos] == '>');
+ pos++;
+ }
+
+ if(parseSawComment !is null)
+ if(parseSawComment(data[commentStart .. end])) {
+ return Ele(3, new HtmlComment(this, data[commentStart .. end]), null);
+ }
+ } else if(pos + 7 <= data.length && data[pos..pos + 7] == "[CDATA[") {
+ pos += 7;
+
+ auto cdataStart = pos;
+
+ ptrdiff_t end = -1;
+ typeof(end) cdataEnd;
+
+ if(pos < data.length) {
+ // cdata isn't allowed to nest, so this should be generally ok, as long as it is found
+ end = data[pos .. $].indexOf("]]>");
+ }
+
+ if(end == -1) {
+ if(strict)
+ throw new MarkupException("Unclosed CDATA section");
+ end = pos;
+ cdataEnd = pos;
+ } else {
+ cdataEnd = pos + end;
+ pos = cdataEnd + 3;
+ }
+
+ return Ele(0, new TextNode(this, data[cdataStart .. cdataEnd]), null);
+ } else {
+ auto start = pos;
+ while(pos < data.length && data[pos] != '>')
+ pos++;
+
+ auto bangEnds = pos;
+ if(pos == data.length) {
+ if(strict)
+ throw new MarkupException("unclosed processing instruction ()");
+ } else pos++; // skipping the >
+
+ if(parseSawBangInstruction !is null)
+ if(parseSawBangInstruction(data[start .. bangEnds])) {
+ // FIXME: these should be able to modify the parser state,
+ // doing things like adding entities, somehow.
+
+ return Ele(3, new BangInstruction(this, data[start .. bangEnds]), null);
+ }
+ }
+
+ /*
+ if(pos < data.length && data[pos] == '>')
+ pos++; // skip the >
+ else
+ assert(!strict);
+ */
+ break;
+ case '%':
+ case '?':
+ /*
+ Here's what we want to support:
+
+ <% asp code %>
+ <%= asp code %>
+
+ = php code ?>
+
+ The contents don't really matter, just if it opens with
+ one of the above for, it ends on the two char terminator.
+
+
+ this is NOT php code
+ because I've seen this in the wild:
+
+ This could be php with shorttags which would be cut off
+ prematurely because if(a >) - that > counts as the close
+ of the tag, but since dom.d can't tell the difference
+ between that and the real world example, it will
+ not try to look for the ?> ending.
+
+ The difference between this and the asp/php stuff is that it
+ ends on >, not ?>. ONLY . The rest end
+ on >.
+ */
+
+ char end = data[pos];
+ auto started = pos;
+ bool isAsp = end == '%';
+ int currentIndex = 0;
+ bool isPhp = false;
+ bool isEqualTag = false;
+ int phpCount = 0;
+
+ more:
+ pos++; // skip the start
+ if(pos == data.length) {
+ if(strict)
+ throw new MarkupException("Unclosed <"~end~" by end of file");
+ } else {
+ currentIndex++;
+ if(currentIndex == 1 && data[pos] == '=') {
+ if(!isAsp)
+ isPhp = true;
+ isEqualTag = true;
+ goto more;
+ }
+ if(currentIndex == 1 && data[pos] == 'p')
+ phpCount++;
+ if(currentIndex == 2 && data[pos] == 'h')
+ phpCount++;
+ if(currentIndex == 3 && data[pos] == 'p' && phpCount == 2)
+ isPhp = true;
+
+ if(data[pos] == '>') {
+ if((isAsp || isPhp) && data[pos - 1] != end)
+ goto more;
+ // otherwise we're done
+ } else
+ goto more;
+ }
+
+ //writefln("%s: %s", isAsp ? "ASP" : isPhp ? "PHP" : " ", data[started .. pos]);
+ auto code = data[started .. pos];
+
+
+ assert((pos < data.length && data[pos] == '>') || (!strict && pos == data.length));
+ if(pos < data.length)
+ pos++; // get past the >
+
+ if(isAsp && parseSawAspCode !is null) {
+ if(parseSawAspCode(code)) {
+ return Ele(3, new AspCode(this, code), null);
+ }
+ } else if(isPhp && parseSawPhpCode !is null) {
+ if(parseSawPhpCode(code)) {
+ return Ele(3, new PhpCode(this, code), null);
+ }
+ } else if(!isAsp && !isPhp && parseSawQuestionInstruction !is null) {
+ if(parseSawQuestionInstruction(code)) {
+ return Ele(3, new QuestionInstruction(this, code), null);
+ }
+ }
+ break;
+ case '/': // closing an element
+ pos++; // skip the start
+ auto p = pos;
+ while(pos < data.length && data[pos] != '>')
+ pos++;
+ //writefln("%s>", data[p..pos]);
+ if(pos == data.length && data[pos-1] != '>') {
+ if(strict)
+ throw new MarkupException("File ended before closing tag had a required >");
+ else
+ data ~= ">"; // just hack it in
+ }
+ pos++; // skip the '>'
+
+ string tname = data[p..pos-1];
+ if(!caseSensitive)
+ tname = tname.toLower();
+
+ return Ele(1, null, tname); // closing tag reports itself here
+ case ' ': // assume it isn't a real element...
+ if(strict)
+ parseError("bad markup - improperly placed <");
+ else
+ return Ele(0, TextNode.fromUndecodedString(this, "<"), null);
+ break;
+ default:
+
+ if(!strict) {
+ // what about something that kinda looks like a tag, but isn't?
+ auto nextTag = data[pos .. $].indexOf("<");
+ auto closeTag = data[pos .. $].indexOf(">");
+ if(closeTag != -1 && nextTag != -1)
+ if(nextTag < closeTag) {
+ // since attribute names cannot possibly have a < in them, we'll look for an equal since it might be an attribute value... and even in garbage mode, it'd have to be a quoted one realistically
+
+ auto equal = data[pos .. $].indexOf("=\"");
+ if(equal != -1 && equal < closeTag) {
+ // this MIGHT be ok, soldier on
+ } else {
+ // definitely no good, this must be a (horribly distorted) text node
+ pos++; // skip the < we're on - don't want text node to end prematurely
+ auto node = readTextNode();
+ node.contents = "<" ~ node.contents; // put this back
+ return Ele(0, node, null);
+ }
+ }
+ }
+
+ string tagName = readTagName();
+ string[string] attributes;
+
+ Ele addTag(bool selfClosed) {
+ if(selfClosed)
+ pos++;
+ else {
+ if(!strict)
+ if(tagName.isInArray(selfClosedElements))
+ // these are de-facto self closed
+ selfClosed = true;
+ }
+
+ if(strict)
+ enforce(data[pos] == '>');//, format("got %s when expecting >\nContext:\n%s", data[pos], data[pos - 100 .. pos + 100]));
+ else {
+ // if we got here, it's probably because a slash was in an
+ // unquoted attribute - don't trust the selfClosed value
+ if(!selfClosed)
+ selfClosed = tagName.isInArray(selfClosedElements);
+
+ while(pos < data.length && data[pos] != '>')
+ pos++;
+ }
+
+ auto whereThisTagStarted = pos; // for better error messages
+
+ pos++;
+
+ auto e = createElement(tagName);
+ e.attributes = attributes;
+ version(dom_node_indexes) {
+ if(e.dataset.nodeIndex.length == 0)
+ e.dataset.nodeIndex = to!string(&(e.attributes));
+ }
+ e.selfClosed = selfClosed;
+ e.parseAttributes();
+
+
+ // HACK to handle script and style as a raw data section as it is in HTML browsers
+ if(tagName == "script" || tagName == "style") {
+ if(!selfClosed) {
+ string closer = "" ~ tagName ~ ">";
+ ptrdiff_t ending;
+ if(pos >= data.length)
+ ending = -1;
+ else
+ ending = indexOf(data[pos..$], closer);
+
+ ending = indexOf(data[pos..$], closer, 0, (loose ? CaseSensitive.no : CaseSensitive.yes));
+ /*
+ if(loose && ending == -1 && pos < data.length)
+ ending = indexOf(data[pos..$], closer.toUpper());
+ */
+ if(ending == -1) {
+ if(strict)
+ throw new Exception("tag " ~ tagName ~ " never closed");
+ else {
+ // let's call it totally empty and do the rest of the file as text. doing it as html could still result in some weird stuff like if(a<4) being read as <4 being a tag so it comes out if(a<4>4> and other weirdness) It is either a closed script tag or the rest of the file is forfeit.
+ if(pos < data.length) {
+ e = new TextNode(this, data[pos .. $]);
+ pos = data.length;
+ }
+ }
+ } else {
+ ending += pos;
+ e.innerRawSource = data[pos..ending];
+ pos = ending + closer.length;
+ }
+ }
+ return Ele(0, e, null);
+ }
+
+ bool closed = selfClosed;
+
+ void considerHtmlParagraphHack(Element n) {
+ assert(!strict);
+ if(e.tagName == "p" && e.tagName == n.tagName) {
+ // html lets you write para 1
para 1
+ // but in the dom tree, they should be siblings, not children.
+ paragraphHackfixRequired = true;
+ }
+ }
+
+ //writef("<%s>", tagName);
+ while(!closed) {
+ Ele n;
+ if(strict)
+ n = readElement();
+ else
+ n = readElement(parentChain ~ tagName);
+
+ if(n.type == 4) return n; // the document is empty
+
+ if(n.type == 3 && n.element !is null) {
+ // special node, append if possible
+ if(e !is null)
+ e.appendChild(n.element);
+ else
+ piecesBeforeRoot ~= n.element;
+ } else if(n.type == 0) {
+ if(!strict)
+ considerHtmlParagraphHack(n.element);
+ e.appendChild(n.element);
+ } else if(n.type == 1) {
+ bool found = false;
+ if(n.payload != tagName) {
+ if(strict)
+ parseError(format("mismatched tag: %s> != <%s> (opened on line %d)", n.payload, tagName, getLineNumber(whereThisTagStarted)));
+ else {
+ sawImproperNesting = true;
+ // this is so we don't drop several levels of awful markup
+ if(n.element) {
+ if(!strict)
+ considerHtmlParagraphHack(n.element);
+ e.appendChild(n.element);
+ n.element = null;
+ }
+
+ // is the element open somewhere up the chain?
+ foreach(i, parent; parentChain)
+ if(parent == n.payload) {
+ recentAutoClosedTags ~= tagName;
+ // just rotating it so we don't inadvertently break stuff with vile crap
+ if(recentAutoClosedTags.length > 4)
+ recentAutoClosedTags = recentAutoClosedTags[1 .. $];
+
+ n.element = e;
+ return n;
+ }
+
+ // if not, this is a text node; we can't fix it up...
+
+ // If it's already in the tree somewhere, assume it is closed by algorithm
+ // and we shouldn't output it - odds are the user just flipped a couple tags
+ foreach(ele; e.tree) {
+ if(ele.tagName == n.payload) {
+ found = true;
+ break;
+ }
+ }
+
+ foreach(ele; recentAutoClosedTags) {
+ if(ele == n.payload) {
+ found = true;
+ break;
+ }
+ }
+
+ if(!found) // if not found in the tree though, it's probably just text
+ e.appendChild(TextNode.fromUndecodedString(this, ""~n.payload~">"));
+ }
+ } else {
+ if(n.element) {
+ if(!strict)
+ considerHtmlParagraphHack(n.element);
+ e.appendChild(n.element);
+ }
+ }
+
+ if(n.payload == tagName) // in strict mode, this is always true
+ closed = true;
+ } else { /*throw new Exception("wtf " ~ tagName);*/ }
+ }
+ //writef("%s>\n", tagName);
+ return Ele(0, e, null);
+ }
+
+ // if a tag was opened but not closed by end of file, we can arrive here
+ if(!strict && pos >= data.length)
+ return addTag(false);
+ //else if(strict) assert(0); // should be caught before
+
+ switch(data[pos]) {
+ default: assert(0);
+ case '/': // self closing tag
+ return addTag(true);
+ case '>':
+ return addTag(false);
+ case ' ':
+ case '\t':
+ case '\n':
+ // there might be attributes...
+ moreAttributes:
+ eatWhitespace();
+
+ // same deal as above the switch....
+ if(!strict && pos >= data.length)
+ return addTag(false);
+
+ if(strict && pos >= data.length)
+ throw new MarkupException("tag open, didn't find > before end of file");
+
+ switch(data[pos]) {
+ case '/': // self closing tag
+ return addTag(true);
+ case '>': // closed tag; open -- we now read the contents
+ return addTag(false);
+ default: // it is an attribute
+ string attrName = readAttributeName();
+ string attrValue = attrName;
+ if(pos >= data.length) {
+ if(strict)
+ assert(0, "this should have thrown in readAttributeName");
+ else {
+ data ~= ">";
+ goto blankValue;
+ }
+ }
+ if(data[pos] == '=') {
+ pos++;
+ attrValue = readAttributeValue();
+ }
+
+ blankValue:
+
+ if(strict && attrName in attributes)
+ throw new MarkupException("Repeated attribute: " ~ attrName);
+
+ if(attrName.strip().length)
+ attributes[attrName] = attrValue;
+ else if(strict) throw new MarkupException("wtf, zero length attribute name");
+
+ if(!strict && pos < data.length && data[pos] == '<') {
+ // this is the broken tag that doesn't have a > at the end
+ data = data[0 .. pos] ~ ">" ~ data[pos.. $];
+ // let's insert one as a hack
+ goto case '>';
+ }
+
+ goto moreAttributes;
+ }
+ }
+ }
+
+ return Ele(2, null, null); // this is a