From a7f86d26e04db890e637b95412e4dd036a8a9723 Mon Sep 17 00:00:00 2001 From: "Adam D. Ruppe" Date: Sun, 20 Nov 2011 11:50:23 -0500 Subject: [PATCH] charset. note this module is no longer standalone - requires arsd.characterencodings now --- dom.d | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 3 deletions(-) diff --git a/dom.d b/dom.d index 4793623..fca876f 100644 --- a/dom.d +++ b/dom.d @@ -9,6 +9,8 @@ import std.array; import std.stdio; +import arsd.characterencodings; + // Biggest (known) fixme left for "tag soup":

....

in loose mode should close it on the second opening. // Biggest FIXME for real documents: character set encoding detection @@ -2301,8 +2303,17 @@ class Document { } */ + /// Given the kind of garbage you find on the Internet, try to make sense of it. + /// Equivalent to document.parse(data, false, false, null); + /// (Case-insensitive, non-strict, determine character encoding from the data.) + + /// NOTE: this makes no attempt at added security. + void parseGarbage(string data) { + parse(data, false, false, null); + } + /** - Take XMLish* data and try to make the DOM tree out of it. + Take XMLish data and try to make the DOM tree out of it. The goal isn't to be perfect, but to just be good enough to approximate Javascript's behavior. @@ -2316,9 +2327,88 @@ class Document { be closed. If you are writing a document specifically for this, try to avoid such - use self closed tags at least. Easier to parse. - * The xml version at the top really shouldn't be there... + The dataEncoding argument can be used to pass a specific + charset encoding for automatic conversion. If null (which is NOT + the default!), it tries to determine from the data itself, + using the xml prolog or meta tags, and assumes UTF-8 if unsure. + + If this assumption is wrong, it can throw on non-ascii + characters! + + + Note that it previously assumed the data was encoded as UTF-8, which + is why the dataEncoding argument defaults to that. + + So it shouldn't break backward compatibility. + + But, if you want the best behavior on wild data - figuring it out from the document + instead of assuming - you'll probably want to change that argument to null. + */ - void parse(/*in */string data, bool caseSensitive = false, bool strict = false) { + void parse(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") { + // gotta determine the data encoding. If you know it, pass it in above to skip all this. + if(dataEncoding is null) { + dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata); + // it can't tell... probably a random 8 bit encoding. Let's check the document itself. + // Now, XML and HTML can both list encoding in the document, but we can't really parse + // it here without changing a lot of code until we know the encoding. So I'm going to + // do some hackish string checking. + if(dataEncoding is null) { + auto dataAsBytes = cast(immutable(ubyte)[]) rawdata; + // first, look for an XML prolog + auto idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "encoding=\""); + if(idx != -1) { + idx += "encoding=\"".length; + // we're probably past the prolog if it's this far in; we might be looking at + // content. Forget about it. + if(idx > 100) + idx = -1; + } + // if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5).. + if(idx == -1) { + idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "charset="); + if(idx != -1) { + idx += "charset=".length; + if(dataAsBytes[idx] == '"') + idx++; + } + } + + // found something in either branch... + if(idx != -1) { + // read till a quote or about 12 chars, whichever comes first... + int end = idx; + while(end < dataAsBytes.length && dataAsBytes[end] != '"' && end - idx < 12) + end++; + + dataEncoding = cast(string) dataAsBytes[idx .. end]; + } + // otherwise, we just don't know. + } + } + + if(strict) + enforce(dataEncoding !is null, "I couldn't figure out the encoding of this document."); + else { + // if we really don't know by here, it means we already tried UTF-8, + // looked for utf 16 and 32 byte order marks, and looked for xml or meta + // tags... let's assume it's Windows-1252, since that's probably the most + // common aside from utf that wouldn't be labeled. + + dataEncoding = "Windows 1252"; + } + + // and now, go ahead and convert it. + + string data; + + if(dataEncoding != "UTF-8") + data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding); + else + data = rawdata; + assert(data !is null); + + // go through character by character. // if you see a <, consider it a tag. // name goes until the first non tagname character