From 78821d25bb5107759443e8b2a13ae3ce7b2c8aba Mon Sep 17 00:00:00 2001 From: "Adam D. Ruppe" Date: Mon, 3 Jun 2013 12:07:08 -0400 Subject: [PATCH] new handling of php,asp, and processing instructions --- dom.d | 437 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 389 insertions(+), 48 deletions(-) diff --git a/dom.d b/dom.d index 863d335..8381e0d 100644 --- a/dom.d +++ b/dom.d @@ -2386,9 +2386,24 @@ string htmlEntitiesDecode(string data, bool strict = false) { return cast(string) a; // assumeUnique is actually kinda slow, lol } -///. -class RawSource : Element { +abstract class SpecialElement : Element { + this(Document _parentDocument) { + super(_parentDocument); + } + ///. + override Element appendChild(Element e) { + assert(0, "Cannot append to a special node"); + } + + ///. + override int nodeType() const { + return 100; + } +} + +///. +class RawSource : SpecialElement { ///. this(Document _parentDocument, string s) { super(_parentDocument); @@ -2401,11 +2416,6 @@ class RawSource : Element { return this.toString(); } - ///. - override int nodeType() const { - return 100; - } - ///. override string writeToAppender(Appender!string where = appender!string()) const { where.put(source); @@ -2413,15 +2423,147 @@ class RawSource : Element { } ///. - override Element appendChild(Element e) { - assert(0, "Cannot append to a text node"); + string source; +} + +///. +class PhpCode : SpecialElement { + ///. + this(Document _parentDocument, string s) { + super(_parentDocument); + source = s; + tagName = "#php"; } + ///. + override string nodeValue() const { + return this.source; + } + + ///. + override string writeToAppender(Appender!string where = appender!string()) const { + auto start = where.data.length; + where.put("<"); + where.put(source); + where.put(">"); + return where.data[start .. $]; + } ///. string source; } +///. +class AspCode : SpecialElement { + ///. + this(Document _parentDocument, string s) { + super(_parentDocument); + source = s; + tagName = "#asp"; + } + + ///. + override string nodeValue() const { + return this.source; + } + + ///. + override string writeToAppender(Appender!string where = appender!string()) const { + auto start = where.data.length; + where.put("<"); + where.put(source); + where.put(">"); + return where.data[start .. $]; + } + + ///. + string source; +} + +///. +class BangInstruction : SpecialElement { + ///. + this(Document _parentDocument, string s) { + super(_parentDocument); + source = s; + tagName = "#bpi"; + } + + ///. + override string nodeValue() const { + return this.source; + } + + ///. + override string writeToAppender(Appender!string where = appender!string()) const { + auto start = where.data.length; + where.put(""); + return where.data[start .. $]; + } + + ///. + string source; +} + +///. +class QuestionInstruction : SpecialElement { + ///. + this(Document _parentDocument, string s) { + super(_parentDocument); + source = s; + tagName = "#qpi"; + } + + ///. + override string nodeValue() const { + return this.source; + } + + ///. + override string writeToAppender(Appender!string where = appender!string()) const { + auto start = where.data.length; + where.put("<"); + where.put(source); + where.put(">"); + return where.data[start .. $]; + } + + ///. + string source; +} + +///. +class HtmlComment : SpecialElement { + ///. + this(Document _parentDocument, string s) { + super(_parentDocument); + source = s; + tagName = "#comment"; + } + + ///. + override string nodeValue() const { + return this.source; + } + + ///. + override string writeToAppender(Appender!string where = appender!string()) const { + auto start = where.data.length; + where.put(""); + return where.data[start .. $]; + } + + ///. + string source; +} + + + + ///. class TextNode : Element { public: @@ -3240,6 +3382,47 @@ class Document : FileResource { } */ + /// This will set delegates for parseSaw* (note: this overwrites anything else you set, and you setting subsequently will overwrite this) that add those things to the dom tree when it sees them. + /// Call this before calling parse(). + + /// Note this will also preserve the prolog and doctype from the original file, if there was one. + void enableAddingSpecialTagsToDom() { + parseSawComment = (string) => true; + parseSawAspCode = (string) => true; + parseSawPhpCode = (string) => true; + parseSawQuestionInstruction = (string) => true; + parseSawBangInstruction = (string) => true; + } + + /// If the parser sees a html comment, it will call this callback + /// will call parseSawComment(" comment ") + /// Return true if you want the node appended to the document. + bool delegate(string) parseSawComment; + + /// If the parser sees <% asp code... %>, it will call this callback. + /// It will be passed "% asp code... %" or "%= asp code .. %" + /// Return true if you want the node appended to the document. + bool delegate(string) parseSawAspCode; + + /// If the parser sees , it will call this callback. + /// It will be passed "?php php code... ?" or "?= asp code .. ?" + /// Note: dom.d cannot identify the other php short format. + /// Return true if you want the node appended to the document. + bool delegate(string) parseSawPhpCode; + + /// if it sees a that is not php or asp + /// it calls this function with the contents. + /// calls parseSawQuestionInstruction("?SOMETHING foo") + /// Unlike the php/asp ones, this ends on the first > it sees, without requiring ?>. + /// Return true if you want the node appended to the document. + bool delegate(string) parseSawQuestionInstruction; + + /// if it sees a calls parseSawBangInstruction("SOMETHING foo") + /// Return true if you want the node appended to the document. + bool delegate(string) parseSawBangInstruction; + /// Given the kind of garbage you find on the Internet, try to make sense of it. /// Equivalent to document.parse(data, false, false, null); /// (Case-insensitive, non-strict, determine character encoding from the data.) @@ -3527,6 +3710,7 @@ class Document : FileResource { return TextNode.fromUndecodedString(this, data[start..pos]); } + // this is obsolete! RawSource readCDataNode() { auto start = pos; while(pos < data.length && data[pos] != '<') { @@ -3539,7 +3723,14 @@ class Document : FileResource { struct Ele { int type; // element or closing tag or nothing - Element element; // for type == 0 + /* + type == 0 means regular node, self-closed (element is valid) + type == 1 means closing tag (payload is the tag name, element may be valid) + type == 2 means you should ignore it completely + type == 3 means it is a special element that should be appended, if possible, e.g. a that was chosen to be kept, php code, or comment. It will be appended at the current element if inside the root, and to a special document area if not + type == 4 means the document was totally empty + */ + Element element; // for type == 0 or type == 3 string payload; // for type == 1 } // recursively read a tag @@ -3554,7 +3745,7 @@ class Document : FileResource { if(pos >= data.length) { if(strict) { - throw new MarkupException("Gone over the input (is there no root element?), chain: " ~ to!string(parentChain)); + throw new MarkupException("Gone over the input (is there no root element or did it never close?), chain: " ~ to!string(parentChain)); } else { if(parentChain.length) return Ele(1, null, parentChain[0]); // in loose mode, we just assume the document has ended @@ -3578,57 +3769,180 @@ class Document : FileResource { // I don't care about these, so I just want to skip them case '!': // might be a comment, a doctype, or a special instruction pos++; + // FIXME: we should store these in the tree too // though I like having it stripped out tbh. - if(data[pos] == '-' && data[pos+1] == '-') { + + if(pos == data.length) { + if(strict) + throw new MarkupException("") + + // FIXME: technically, a comment is anything + // between -- and -- inside a block. + // so in since that's the common way + + auto commentStart = pos; + while(pos+3 < data.length && data[pos..pos+3] != "-->") pos++; - assert(data[pos] == '-'); - pos++; - assert(data[pos] == '-'); - pos++; - assert(data[pos] == '>'); - } else if(data[pos..pos + 7] == "[CDATA[") { + + auto end = commentStart; + + if(pos + 3 >= data.length) { + if(strict) + throw new MarkupException("unclosed comment"); + end = data.length; + pos = data.length; + } else { + end = pos; + assert(data[pos] == '-'); + pos++; + assert(data[pos] == '-'); + pos++; + assert(data[pos] == '>'); + pos++; + } + + if(parseSawComment !is null) + if(parseSawComment(data[commentStart .. end])) { + return Ele(3, new HtmlComment(this, data[commentStart .. end]), null); + } + } else if(pos + 7 <= data.length && data[pos..pos + 7] == "[CDATA[") { pos += 7; - // FIXME: major malfunction possible here + auto cdataStart = pos; - // cdata isn't allowed to nest, so this should be generally ok, as long as it is found - auto cdataEnd = pos + data[pos .. $].indexOf("]]>"); + typeof(indexOf(data, "")) end = -1; + typeof(end) cdataEnd; + + if(pos < data.length) { + // cdata isn't allowed to nest, so this should be generally ok, as long as it is found + end = data[pos .. $].indexOf("]]>"); + } + + if(end == -1) { + if(strict) + throw new MarkupException("Unclosed CDATA section"); + end = pos; + cdataEnd = pos; + } else { + cdataEnd = pos + end; + pos = cdataEnd + 3; + } - pos = cdataEnd + 3; return Ele(0, new TextNode(this, data[cdataStart .. cdataEnd]), null); - } else - while(data[pos] != '>') + } else { + auto start = pos; + while(pos < data.length && data[pos] != '>') pos++; - pos++; // skip the > + if(pos == data.length) { + if(strict) + throw new MarkupException("unclosed processing instruction ()"); + } + + if(parseSawBangInstruction !is null) + if(parseSawBangInstruction(data[start .. pos])) { + // FIXME: these should be able to modify the parser state, + // doing things like adding entities, somehow. + + return Ele(3, new BangInstruction(this, data[start .. pos]), null); + } + } + + if(pos < data.length && data[pos] == '>') + pos++; // skip the > + else + assert(!strict); break; - // case '%': + case '%': case '?': + /* + Here's what we want to support: + + <% asp code %> + <%= asp code %> + + + + The contents don't really matter, just if it opens with + one of the above for, it ends on the two char terminator. + + + this is NOT php code + because I've seen this in the wild: + + This could be php with shorttags which would be cut off + prematurely because if(a >) - that > counts as the close + of the tag, but since dom.d can't tell the difference + between that and the real world example, it will + not try to look for the ?> ending. + + The difference between this and the asp/php stuff is that it + ends on >, not ?>. ONLY . The rest end + on >. + */ + char end = data[pos]; - // FIXME this is all kinda broken + auto started = pos; + bool isAsp = end == '%'; + int currentIndex = 0; + bool isPhp = false; + bool isEqualTag = false; + int phpCount = 0; more: pos++; // skip the start + if(pos == data.length) { + if(strict) + throw new MarkupException("Unclosed <"~end~" by end of file"); + } else { + currentIndex++; + if(currentIndex == 1 && data[pos] == '=') { + if(!isAsp) + isPhp = true; + isEqualTag = true; + goto more; + } + if(currentIndex == 1 && data[pos] == 'p') + phpCount++; + if(currentIndex == 2 && data[pos] == 'h') + phpCount++; + if(currentIndex == 3 && data[pos] == 'p' && phpCount == 2) + isPhp = true; - while(data[pos] != end) { - // FIXME: what if it is PHP? - if(data[pos] == '>') - break; // I've seen this in the wild: - pos++; + if(data[pos] == '>') { + if((isAsp || isPhp) && data[pos - 1] != end) + goto more; + // otherwise we're done + } else + goto more; } - if(data[pos] == end) - pos++; // skip the end + //writefln("%s: %s", isAsp ? "ASP" : isPhp ? "PHP" : "') - pos++; - else - goto more; + + assert((pos < data.length && data[pos] == '>') || (!strict && pos == data.length)); + if(pos < data.length) + pos++; // get past the > + + if(isAsp && parseSawAspCode !is null) { + if(parseSawAspCode(code)) { + return Ele(3, new AspCode(this, code), null); + } + } else if(isPhp && parseSawPhpCode !is null) { + if(parseSawPhpCode(code)) { + return Ele(3, new PhpCode(this, code), null); + } + } else if(!isAsp && !isPhp && parseSawQuestionInstruction !is null) { + if(parseSawQuestionInstruction(code)) { + return Ele(3, new QuestionInstruction(this, code), null); + } + } break; case '/': // closing an element pos++; // skip the start @@ -3764,8 +4078,13 @@ class Document : FileResource { if(n.type == 4) return n; // the document is empty - - if(n.type == 0) { + if(n.type == 3 && n.element !is null) { + // special node, append if possible + if(e !is null) + e.appendChild(n.element); + else + piecesBeforeRoot ~= n.element; + } else if(n.type == 0) { if(!strict) considerHtmlParagraphHack(n.element); e.appendChild(n.element); @@ -3896,7 +4215,7 @@ class Document : FileResource { } } - return Ele(2, null, null); // this is a \n"; + private string _prolog = "\n"; + private bool prologWasSet = false; // set to true if the user changed it + + @property string prolog() const { + // if the user explicitly changed it, do what they want + // or if we didn't keep/find stuff from the document itself, + // we'll use the builtin one as a default. + if(prologWasSet || piecesBeforeRoot.length == 0) + return _prolog; + + string p; + foreach(e; piecesBeforeRoot) + p ~= e.toString() ~ "\n"; + return p; + } ///. override string toString() const { @@ -4135,6 +4473,9 @@ class Document : FileResource { ///. Element root; + /// if these were kept, this is stuff that appeared before the root element, such as decls and s + Element[] piecesBeforeRoot; + ///. bool loose; @@ -4155,7 +4496,7 @@ class Document : FileResource { class XmlDocument : Document { this(string data) { contentType = "text/xml; charset=utf-8"; - prolog = `` ~ "\n"; + _prolog = `` ~ "\n"; parse(data, true, true); }