charset. note this module is no longer standalone - requires arsd.characterencodings now

This commit is contained in:
Adam D. Ruppe 2011-11-20 11:50:23 -05:00
parent ddc47b90a2
commit a7f86d26e0
1 changed files with 93 additions and 3 deletions

96
dom.d
View File

@ -9,6 +9,8 @@ import std.array;
import std.stdio;
import arsd.characterencodings;
// Biggest (known) fixme left for "tag soup": <p> .... <p> in loose mode should close it on the second opening.
// Biggest FIXME for real documents: character set encoding detection
@ -2301,8 +2303,17 @@ class Document {
}
*/
/// Given the kind of garbage you find on the Internet, try to make sense of it.
/// Equivalent to document.parse(data, false, false, null);
/// (Case-insensitive, non-strict, determine character encoding from the data.)
/// NOTE: this makes no attempt at added security.
void parseGarbage(string data) {
parse(data, false, false, null);
}
/**
Take XMLish* data and try to make the DOM tree out of it.
Take XMLish data and try to make the DOM tree out of it.
The goal isn't to be perfect, but to just be good enough to
approximate Javascript's behavior.
@ -2316,9 +2327,88 @@ class Document {
be closed. If you are writing a document specifically for this,
try to avoid such - use self closed tags at least. Easier to parse.
* The xml version at the top really shouldn't be there...
The dataEncoding argument can be used to pass a specific
charset encoding for automatic conversion. If null (which is NOT
the default!), it tries to determine from the data itself,
using the xml prolog or meta tags, and assumes UTF-8 if unsure.
If this assumption is wrong, it can throw on non-ascii
characters!
Note that it previously assumed the data was encoded as UTF-8, which
is why the dataEncoding argument defaults to that.
So it shouldn't break backward compatibility.
But, if you want the best behavior on wild data - figuring it out from the document
instead of assuming - you'll probably want to change that argument to null.
*/
void parse(/*in */string data, bool caseSensitive = false, bool strict = false) {
void parse(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
// gotta determine the data encoding. If you know it, pass it in above to skip all this.
if(dataEncoding is null) {
dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
// it can't tell... probably a random 8 bit encoding. Let's check the document itself.
// Now, XML and HTML can both list encoding in the document, but we can't really parse
// it here without changing a lot of code until we know the encoding. So I'm going to
// do some hackish string checking.
if(dataEncoding is null) {
auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;
// first, look for an XML prolog
auto idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
if(idx != -1) {
idx += "encoding=\"".length;
// we're probably past the prolog if it's this far in; we might be looking at
// content. Forget about it.
if(idx > 100)
idx = -1;
}
// if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
if(idx == -1) {
idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "charset=");
if(idx != -1) {
idx += "charset=".length;
if(dataAsBytes[idx] == '"')
idx++;
}
}
// found something in either branch...
if(idx != -1) {
// read till a quote or about 12 chars, whichever comes first...
int end = idx;
while(end < dataAsBytes.length && dataAsBytes[end] != '"' && end - idx < 12)
end++;
dataEncoding = cast(string) dataAsBytes[idx .. end];
}
// otherwise, we just don't know.
}
}
if(strict)
enforce(dataEncoding !is null, "I couldn't figure out the encoding of this document.");
else {
// if we really don't know by here, it means we already tried UTF-8,
// looked for utf 16 and 32 byte order marks, and looked for xml or meta
// tags... let's assume it's Windows-1252, since that's probably the most
// common aside from utf that wouldn't be labeled.
dataEncoding = "Windows 1252";
}
// and now, go ahead and convert it.
string data;
if(dataEncoding != "UTF-8")
data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
else
data = rawdata;
assert(data !is null);
// go through character by character.
// if you see a <, consider it a tag.
// name goes until the first non tagname character