From ba7b65a80408cc29d793d886404144200748462f Mon Sep 17 00:00:00 2001 From: "Adam D. Ruppe" Date: Wed, 14 Jan 2015 11:01:49 -0500 Subject: [PATCH] characterencodings.d is now optional again --- dom.d | 68 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/dom.d b/dom.d index 3492d19..f5d3b1a 100644 --- a/dom.d +++ b/dom.d @@ -487,6 +487,27 @@ struct ElementCollection { return ec; } + /// if you slice it, give the underlying array for easy forwarding of the + /// collection to range expecting algorithms or looping over. + Element[] opSlice() { + return elements; + } + + /// And input range primitives so we can foreach over this + void popFront() { + elements = elements[1..$]; + } + + /// ditto + Element front() { + return elements[0]; + } + + /// ditto + bool empty() { + return !elements.length; + } + /// Forward method calls to each individual element of the collection /// returns this so it can be chained. ElementCollection opDispatch(string name, T...)(T t) { @@ -707,8 +728,6 @@ string camelCase(string a) { // I need to maintain compatibility with the way it is now too. -import arsd.characterencodings; - import std.string; import std.exception; import std.uri; @@ -1660,7 +1679,7 @@ class Element { } auto doc = new Document(); - doc.parse("" ~ html ~ "", strict, strict); // FIXME: this should preserve the strictness of the parent document + doc.parseUtf8("" ~ html ~ "", strict, strict); // FIXME: this should preserve the strictness of the parent document children = doc.root.children; foreach(c; children) { @@ -1695,7 +1714,7 @@ class Element { */ @property Element[] outerHTML(string html) { auto doc = new Document(); - doc.parse("" ~ html ~ ""); // FIXME: needs to preserve the strictness + doc.parseUtf8("" ~ html ~ ""); // FIXME: needs to preserve the strictness children = doc.root.children; foreach(c; children) { @@ -3344,7 +3363,7 @@ struct Html { class Document : FileResource { ///. this(string data, bool caseSensitive = false, bool strict = false) { - parse(data, caseSensitive, strict); + parseUtf8(data, caseSensitive, strict); } /** @@ -3451,17 +3470,30 @@ class Document : FileResource { /// (Case-insensitive, non-strict, determine character encoding from the data.) /// NOTE: this makes no attempt at added security. - void parseGarbage(string data) { + /// + /// It is a template so it lazily imports characterencodings. + void parseGarbage()(string data) { parse(data, false, false, null); } /// Parses well-formed UTF-8, case-sensitive, XML or XHTML /// Will throw exceptions on things like unclosed tags. void parseStrict(string data) { - parse(data, true, true); + parseStream(toUtf8Stream(data), true, true); } - Utf8Stream handleDataEncoding(in string rawdata, string dataEncoding, bool strict) { + /// Parses well-formed UTF-8 in loose mode (by default). Tries to correct + /// tag soup, but does NOT try to correct bad character encodings. + /// + /// They will still throw an exception. + void parseUtf8(string data, bool caseSensitive = false, bool strict = false) { + parseStream(toUtf8Stream(data), caseSensitive, strict); + } + + // this is a template so we get lazy import behavior + Utf8Stream handleDataEncoding()(in string rawdata, string dataEncoding, bool strict) { + static assert(0); + import arsd.characterencodings; // gotta determine the data encoding. If you know it, pass it in above to skip all this. if(dataEncoding is null) { dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata); @@ -3552,6 +3584,12 @@ class Document : FileResource { } else data = rawdata; + return toUtf8Stream(data); + } + + private + Utf8Stream toUtf8Stream(in string rawdata) { + string data = rawdata; static if(is(Utf8Stream == string)) return data; else @@ -3590,8 +3628,16 @@ class Document : FileResource { But, if you want the best behavior on wild data - figuring it out from the document instead of assuming - you'll probably want to change that argument to null. + This is a template so it lazily imports arsd.characterencodings, which is required + to fix up data encodings. + + If you are sure the encoding is good, try parseUtf8 or parseStrict to avoid the + dependency. If it is data from the Internet though, a random website, the encoding + is often a lie. This function, if dataEncoding == null, can correct for that, or + you can try parseGarbage. In those cases, arsd.characterencodings is required to + compile. */ - void parse(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") { + void parse()(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") { auto data = handleDataEncoding(rawdata, dataEncoding, strict); parseStream(data, caseSensitive, strict); } @@ -4294,7 +4340,7 @@ class Document : FileResource { if(strict) assert(0, "empty document should be impossible in strict mode"); else - parse(``); // fill in a dummy document in loose mode since that's what browsers do + parseUtf8(``); // fill in a dummy document in loose mode since that's what browsers do } if(paragraphHackfixRequired) { @@ -4557,7 +4603,7 @@ class XmlDocument : Document { contentType = "text/xml; charset=utf-8"; _prolog = `` ~ "\n"; - parse(data, true, true); + parseStrict(data); } }