mirror of https://github.com/adamdruppe/arsd.git
charset. note this module is no longer standalone - requires arsd.characterencodings now
parent ddc47b90a2
commit a7f86d26e0
dom.d | 96
@@ -9,6 +9,8 @@ import std.array;
 import std.stdio;
 
+import arsd.characterencodings;
+
 // Biggest (known) fixme left for "tag soup": <p> .... <p> in loose mode should close it on the second opening.
 // Biggest FIXME for real documents: character set encoding detection
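Since dom.d now imports arsd.characterencodings, the module no longer builds standalone: any project that compiles dom.d must compile characterencodings.d alongside it. A minimal sketch of an affected build, with file names assumed from the usual arsd layout rather than taken from this commit:

	// app.d -- hypothetical user program; nothing here changes, only the build does
	import arsd.dom;

	void main() {
		// before: dmd app.d dom.d
		// now:    dmd app.d dom.d characterencodings.d
		auto document = new Document(); // assumes the module's no-argument constructor
		document.parse("<html><body>Hello</body></html>");
	}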
@@ -2301,8 +2303,17 @@ class Document {
 	}
 	*/
 
+	/// Given the kind of garbage you find on the Internet, try to make sense of it.
+	/// Equivalent to document.parse(data, false, false, null);
+	/// (Case-insensitive, non-strict, determine character encoding from the data.)
+
+	/// NOTE: this makes no attempt at added security.
+	void parseGarbage(string data) {
+		parse(data, false, false, null);
+	}
+
 	/**
-		Take XMLish* data and try to make the DOM tree out of it.
+		Take XMLish data and try to make the DOM tree out of it.
 
 		The goal isn't to be perfect, but to just be good enough to
 		approximate Javascript's behavior.
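A usage sketch for the new parseGarbage convenience method; the Document construction is assumed from the rest of the module, not shown in this hunk:

	import arsd.dom;

	void main() {
		auto document = new Document(); // assumed no-argument constructor
		// parseGarbage forwards to parse(data, false, false, null):
		// case-insensitive, non-strict, charset sniffed from the data itself.
		document.parseGarbage("<HTML><Body><P>unclosed, MIXED case tag soup</HTML>");
		// per the doc comment, no added security is attempted, so don't
		// trust the result of parsing hostile input.
	}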
@@ -2316,9 +2327,88 @@ class Document {
 		be closed. If you are writing a document specifically for this,
 		try to avoid such - use self closed tags at least. Easier to parse.
 
-		* The xml version at the top really shouldn't be there...
+		The dataEncoding argument can be used to pass a specific
+		charset encoding for automatic conversion. If null (which is NOT
+		the default!), it tries to determine from the data itself,
+		using the xml prolog or meta tags, and assumes UTF-8 if unsure.
+
+		If this assumption is wrong, it can throw on non-ascii
+		characters!
+
+		Note that it previously assumed the data was encoded as UTF-8, which
+		is why the dataEncoding argument defaults to that.
+
+		So it shouldn't break backward compatibility.
+
+		But, if you want the best behavior on wild data - figuring it out from
+		the document instead of assuming - you'll probably want to change that
+		argument to null.
 	*/
-	void parse(/*in */string data, bool caseSensitive = false, bool strict = false) {
+	void parse(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
+		// gotta determine the data encoding. If you know it, pass it in above to skip all this.
+		if(dataEncoding is null) {
+			dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
+			// it can't tell... probably a random 8 bit encoding. Let's check the document itself.
+			// Now, XML and HTML can both list encoding in the document, but we can't really parse
+			// it here without changing a lot of code until we know the encoding. So I'm going to
+			// do some hackish string checking.
+			if(dataEncoding is null) {
+				auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;
+				// first, look for an XML prolog
+				auto idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
+				if(idx != -1) {
+					idx += "encoding=\"".length;
+					// we're probably past the prolog if it's this far in; we might be looking at
+					// content. Forget about it.
+					if(idx > 100)
+						idx = -1;
+				}
+				// if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
+				if(idx == -1) {
+					idx = std.array.indexOf(dataAsBytes, cast(immutable ubyte[]) "charset=");
+					if(idx != -1) {
+						idx += "charset=".length;
+						if(dataAsBytes[idx] == '"')
+							idx++;
+					}
+				}
+
+				// found something in either branch...
+				if(idx != -1) {
+					// read till a quote or about 12 chars, whichever comes first...
+					int end = idx;
+					while(end < dataAsBytes.length && dataAsBytes[end] != '"' && end - idx < 12)
+						end++;
+
+					dataEncoding = cast(string) dataAsBytes[idx .. end];
+				}
+				// otherwise, we just don't know.
+			}
+		}
+
+		if(strict)
+			enforce(dataEncoding !is null, "I couldn't figure out the encoding of this document.");
+		else if(dataEncoding is null) {
+			// if we really don't know by here, it means we already tried UTF-8,
+			// looked for utf 16 and 32 byte order marks, and looked for xml or meta
+			// tags... let's assume it's Windows-1252, since that's probably the most
+			// common aside from utf that wouldn't be labeled.
+			dataEncoding = "Windows 1252";
+		}
+
+		// and now, go ahead and convert it.
+		string data;
+		if(dataEncoding != "UTF-8")
+			data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
+		else
+			data = rawdata;
+		assert(data !is null);
+
 		// go through character by character.
 		// if you see a <, consider it a tag.
 		// name goes until the first non tagname character
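Pulling the new parameter together, a sketch of the three calling styles the documentation above describes; the markup and the helper at the bottom are invented for illustration:

	import arsd.dom;

	void main() {
		auto document = new Document(); // assumed no-argument constructor

		// Default: the data is taken to be UTF-8, matching the old behavior.
		document.parse("<html><body>Hello</body></html>");

		// Known charset: name it and the bytes go through convertToUtf8 first.
		document.parse(legacyBytesFromSomewhere(), false, false, "Windows 1252");

		// Unknown charset: pass null so the xml prolog / meta charset sniffing
		// runs (Windows 1252 is the last-resort assumption if nothing turns up).
		document.parse("<?xml version=\"1.0\" encoding=\"UTF-8\"?><root/>",
			false, false, null);
	}

	// hypothetical source of legacy-encoded bytes, stood up only for the example
	string legacyBytesFromSomewhere() {
		return "<html><body>plain ascii reads the same in any 8 bit charset</body></html>";
	}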
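The two helpers the new code leans on come from arsd.characterencodings; here is a sketch of using them directly, with argument types inferred from this diff rather than from that module's source:

	import arsd.characterencodings;

	/// Mimics parse()'s fallback chain on raw bytes of unknown provenance.
	string toUtf8Guessing(immutable(ubyte)[] raw) {
		// checks byte order marks and the like; per the comments in this
		// commit, it returns null when it can't tell
		string enc = tryToDetermineEncoding(cast(const(ubyte[])) raw);
		if(enc is null)
			enc = "Windows 1252"; // parse()'s own last-resort assumption
		if(enc != "UTF-8")
			return convertToUtf8(raw, enc);
		return cast(string) raw;
	}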