html.
class TableCell : Element {
	/// Creates a cell element; both arguments are forwarded unchanged to the
	/// Element constructor.
	this(Document _parentDocument, string _tagName) {
		super(_parentDocument, _tagName);
	}

	/// Reads the "rowspan" attribute as an integer.
	/// Returns: the attribute converted with to!int, or 1 when getAttribute
	/// yields a zero-length string.
	@property int rowspan() const {
		auto raw = getAttribute("rowspan");
		return raw.length ? to!int(raw) : 1;
	}

	/// Reads the "colspan" attribute as an integer.
	/// Returns: the attribute converted with to!int, or 1 when getAttribute
	/// yields a zero-length string.
	@property int colspan() const {
		auto raw = getAttribute("colspan");
		return raw.length ? to!int(raw) : 1;
	}

	/// Stores i (as text) in the "rowspan" attribute and hands i back.
	@property int rowspan(int i) {
		auto asText = to!string(i);
		setAttribute("rowspan", asText);
		return i;
	}

	/// Stores i (as text) in the "colspan" attribute and hands i back.
	@property int colspan(int i) {
		auto asText = to!string(i);
		setAttribute("colspan", asText);
		return i;
	}
}
/// Exception raised by the parser when the markup (or its encoding) cannot be processed.
class MarkupError : Exception {
	/**
		Params:
			message = human readable description of the problem
			file = source file recorded on the exception; defaults to the
				location of the `new MarkupError(...)` expression
			line = source line recorded on the exception; same default rule

		The file and line parameters are defaulted so the exception reports
		where it was created instead of always pointing at this constructor.
	*/
	this(string message, string file = __FILE__, size_t line = __LINE__) {
		super(message, file, line);
	}
}
/// Exception describing a failed element lookup: which kind of element was
/// wanted and what search expression failed to match.
class ElementNotFoundException : Exception {
	/**
		Params:
			type = name of the element type that was expected
			search = the search expression that matched nothing
			file = source file recorded on the exception (defaults to the caller's)
			line = source line recorded on the exception (defaults to the caller's)
	*/
	this(string type, string search, string file = __FILE__, int line = __LINE__) {
		string description = "Element of type '" ~ type ~ "' matching {" ~ search ~ "} not found.";
		super(description, file, line);
	}
}
/// The html struct is used to differentiate between regular text nodes and html in certain functions
struct Html {
/// The markup text being wrapped. Presumably consumers treat this as HTML rather
/// than as plain text to be escaped - confirm against the functions that accept Html.
string source;
}
/// This might belong in another module, but it represents a file with a mime type and some data.
/// Document implements this interface with type = text/html (see Document.contentType for more info)
/// and data = document.toString, so you can return Documents anywhere web.d expects FileResources.
interface FileResource {
/// The MIME type describing the data. Document's implementation returns its
/// stored content type string (initially "text/html; charset=utf-8").
string contentType() const;
/// The resource's content as immutable bytes. Document's implementation returns
/// the bytes of its toString() output.
immutable(ubyte)[] getData() const;
}
///.
class Document : FileResource {
/// Builds a document by immediately parsing `data`.
/// The caseSensitive and strict flags are handed to parse() unchanged; parse()'s
/// dataEncoding argument is left at its default.
this(string data, bool caseSensitive = false, bool strict = false) {
	this.parse(data, caseSensitive, strict);
}
/**
Creates an empty document. It has *nothing* in it at all:
this constructor performs no parsing and runs no statements of its own.
*/
this() {
}
/// This is just something I'm toying with. Right now, you use opIndex to put in css selectors.
/// It returns a struct that forwards calls to all elements it holds, and returns itself so you
/// can chain it.
///
/// Example: document["p"].innerText("hello").addClass("modified");
///
/// Equivalent to: foreach(e; document.getElementsBySelector("p")) { e.innerText("hello"); e.addClass("modified"); }
///
/// Note: always use function calls (not property syntax) and don't use toString in there for best results.
///
/// You can also do things like: document["p"]["b"] though tbh I'm not sure why since the selector string can do all that anyway. Maybe
/// you could put in some kind of custom filter function tho.
ElementCollection opIndex(string selector) {
	// Wrap the root element in a collection, then let the collection's own
	// indexing apply the selector.
	return ElementCollection(this.root)[selector];
}
string _contentType = "text/html; charset=utf-8";
/// If you're using this for some other kind of XML, you can
/// set the content type here.
///
/// Note: this has no impact on the function of this class.
/// It is only used if the document is sent via a protocol like HTTP.
///
/// This may be called by parse() if it recognizes the data. Otherwise,
/// if you don't set it, it assumes text/html; charset=utf-8.
string contentType(string mimeType) {
	// Store the new value and return what was stored in a single step.
	return _contentType = mimeType;
}
/// implementing the FileResource interface, useful for sending via
/// http automatically.
override string contentType() const {
	// Plain read of the stored value; nothing is computed here.
	return this._contentType;
}
/// implementing the FileResource interface; it calls toString.
override immutable(ubyte)[] getData() const {
	// Serialize first, then reinterpret the resulting text as raw bytes.
	auto rendered = this.toString();
	return cast(immutable(ubyte)[]) rendered;
}
/// Concatenates any consecutive text nodes
/*
void normalize() {
}
*/
/// Given the kind of garbage you find on the Internet, try to make sense of it.
/// Equivalent to document.parse(data, false, false, null);
/// (Case-insensitive, non-strict, determine character encoding from the data.)
/// NOTE: this makes no attempt at added security.
void parseGarbage(string data) {
	// Name the lenient settings instead of passing bare literals.
	enum bool caseSensitive = false;
	enum bool strict = false;
	string detectFromData = null; // null asks parse() to work out the encoding itself
	parse(data, caseSensitive, strict, detectFromData);
}
/**
Take XMLish data and try to make the DOM tree out of it.
The goal isn't to be perfect, but to just be good enough to
approximate Javascript's behavior.
If strict, it throws on something that doesn't make sense.
(Examples: mismatched tags. It doesn't validate!)
If not strict, it tries to recover anyway, and only throws
when something is REALLY unworkable.
If strict is false, it uses a magic list of tags that needn't
be closed. If you are writing a document specifically for this,
try to avoid such - use self closed tags at least. Easier to parse.
The dataEncoding argument can be used to pass a specific
charset encoding for automatic conversion. If null (which is NOT
the default!), it tries to determine from the data itself,
using the xml prolog or meta tags, and assumes UTF-8 if unsure.
If this assumption is wrong, it can throw on non-ascii
characters!
Note that it previously assumed the data was encoded as UTF-8, which
is why the dataEncoding argument defaults to that.
So it shouldn't break backward compatibility.
But, if you want the best behavior on wild data - figuring it out from the document
instead of assuming - you'll probably want to change that argument to null.
*/
void parse(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
// FIXME: this parser could be faster; it's in the top ten biggest tree times according to the profiler
// of my big app.
// gotta determine the data encoding. If you know it, pass it in above to skip all this.
if(dataEncoding is null) {
dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
// it can't tell... probably a random 8 bit encoding. Let's check the document itself.
// Now, XML and HTML can both list encoding in the document, but we can't really parse
// it here without changing a lot of code until we know the encoding. So I'm going to
// do some hackish string checking.
if(dataEncoding is null) {
auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;
// first, look for an XML prolog
auto idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
if(idx != -1) {
idx += "encoding=\"".length;
// we're probably past the prolog if it's this far in; we might be looking at
// content. Forget about it.
if(idx > 100)
idx = -1;
}
// if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
if(idx == -1) {
idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "charset=");
if(idx != -1) {
idx += "charset=".length;
if(dataAsBytes[idx] == '"')
idx++;
}
}
// found something in either branch...
if(idx != -1) {
// read till a quote or about 12 chars, whichever comes first...
auto end = idx;
while(end < dataAsBytes.length && dataAsBytes[end] != '"' && end - idx < 12)
end++;
dataEncoding = cast(string) dataAsBytes[idx .. end];
}
// otherwise, we just don't know.
}
}
if(dataEncoding is null) {
if(strict)
throw new MarkupError("I couldn't figure out the encoding of this document.");
else
// if we really don't know by here, it means we already tried UTF-8,
// looked for utf 16 and 32 byte order marks, and looked for xml or meta
// tags... let's assume it's Windows-1252, since that's probably the most
// common aside from utf that wouldn't be labeled.
dataEncoding = "Windows 1252";
}
// and now, go ahead and convert it.
string data;
if(!strict) {
// if we're in non-strict mode, we need to check
// the document for mislabeling too; sometimes
// web documents will say they are utf-8, but aren't
// actually properly encoded. If it fails to validate,
// we'll assume it's actually Windows encoding - the most
// likely candidate for mislabeled garbage.
dataEncoding = dataEncoding.toLower();
dataEncoding = dataEncoding.replace(" ", "");
dataEncoding = dataEncoding.replace("-", "");
dataEncoding = dataEncoding.replace("_", "");
if(dataEncoding == "utf8") {
try {
validate(rawdata);
} catch(UtfException e) {
dataEncoding = "Windows 1252";
}
}
}
if(dataEncoding != "UTF-8")
data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
else
data = rawdata;
assert(data !is null);
// go through character by character.
// if you see a <, consider it a tag.
// name goes until the first non tagname character
// then see if it self closes or has an attribute
// if not in a tag, anything not a tag is a big text
// node child. It ends as soon as it sees a <
// Whitespace in text or attributes is preserved, but not between attributes
// & and friends are converted when I know them, left the same otherwise
// this should already be done correctly, so I'm leaving it off to net a ~10% speed boost on my typical test file (really)
//validate(data); // it *must* be UTF-8 for this to work correctly
sizediff_t pos = 0;
clear();
loose = !caseSensitive;
bool sawImproperNesting = false;
bool paragraphHackfixRequired = false;
// Returns the 1-based line number of offset p: one more than the number of
// '\n' characters found in data[0 .. p].
int getLineNumber(sizediff_t p) {
	auto preceding = data[0..p];
	int newlines = 0;
	foreach(ch; preceding) {
		if(ch == '\n')
			newlines++;
	}
	return newlines + 1;
}
// Always throws a MarkupError whose text is prefixed with the current
// character offset and line number.
void parseError(string message) {
	auto lineNumber = getLineNumber(pos);
	auto description = format("char %d (line %d): %s", pos, lineNumber, message);
	throw new MarkupError(description);
}
// Advances pos past any run of whitespace, stopping at the first other
// character or at the end of the data.
// Carriage returns are skipped too: without that, a CRLF line break between
// attributes left pos on the '\r', which the attribute loop then read as an
// attribute named "\r".
void eatWhitespace() {
	while(pos < data.length && (data[pos] == ' ' || data[pos] == '\n' || data[pos] == '\t' || data[pos] == '\r'))
		pos++;
}
// Reads a tag name starting at pos and leaves pos on the character that ended it.
// The name runs until '>', '/', space, newline or tab, so ':' (namespaces) is
// part of the name. It is lower-cased unless the parse is case sensitive.
// Throws MarkupError (via parseError) if the data ends before a terminator is
// seen; previously the scan indexed past the end of the data.
string readTagName() {
	auto start = pos;
	while(pos < data.length &&
		data[pos] != '>' && data[pos] != '/' &&
		data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
		pos++;
	if(pos >= data.length)
		parseError("unexpected end of input while reading a tag name");
	if(!caseSensitive)
		return toLower(data[start..pos]);
	else
		return data[start..pos];
}
// Reads an attribute name starting at pos and leaves pos on the character that
// ended it. The name runs until '>', '/', '=', space, newline or tab, so ':'
// (namespaces) is part of the name. It is lower-cased unless the parse is case
// sensitive.
// Throws MarkupError (via parseError) if the data ends before a terminator is
// seen; previously the scan indexed past the end of the data.
string readAttributeName() {
	auto start = pos;
	while(pos < data.length &&
		data[pos] != '>' && data[pos] != '/' && data[pos] != '=' &&
		data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
		pos++;
	if(pos >= data.length)
		parseError("unexpected end of input while reading an attribute name");
	if(!caseSensitive)
		return toLower(data[start..pos]);
	else
		return data[start..pos];
}
// Reads the attribute value starting at pos and returns it entity-decoded.
// Quoted values ('...' or "...") run to the matching quote, and pos ends up
// just past that quote. Unquoted values are rejected in strict mode; otherwise
// they run to whitespace, '/' or '>', and pos is left on that terminator.
// Throws MarkupError (via parseError) when the data ends before the value is
// complete; previously these scans indexed past the end of the data.
string readAttributeValue() {
	if(pos >= data.length)
		parseError("unexpected end of input where an attribute value was expected");
	switch(data[pos]) {
		case '\'':
		case '"':
			char end = data[pos];
			pos++;
			auto start = pos;
			while(pos < data.length && data[pos] != end)
				pos++;
			if(pos >= data.length)
				parseError("unterminated quoted attribute value");
			string v = htmlEntitiesDecode(data[start..pos], strict);
			pos++; // skip over the end
			return v;
		default:
			if(strict)
				parseError("Attributes must be quoted");
			// read until whitespace or terminator (/ or >)
			auto start = pos;
			while(pos < data.length &&
				data[pos] != '>' && data[pos] != '/' &&
				data[pos] != ' ' && data[pos] != '\n' && data[pos] != '\t')
				pos++;
			if(pos >= data.length)
				parseError("unexpected end of input while reading an unquoted attribute value");
			string v = htmlEntitiesDecode(data[start..pos], strict);
			// don't skip the end - we'll need it later
			return v;
	}
}
// Consumes everything up to (not including) the next '<' or the end of the
// data and wraps that slice in a TextNode built from undecoded text.
TextNode readTextNode() {
	auto begin = pos;
	for(; pos < data.length; pos++) {
		if(data[pos] == '<')
			break;
	}
	return TextNode.fromUndecodedString(this, data[begin..pos]);
}
// Consumes everything up to (not including) the next '<' or the end of the
// data and wraps that slice, untouched, in a RawSource node.
RawSource readCDataNode() {
	auto begin = pos;
	while(true) {
		if(pos >= data.length || data[pos] == '<')
			break;
		pos++;
	}
	auto slice = data[begin..pos];
	return new RawSource(this, slice);
}
// Result of one readElement step.
// Type values used by the visible code:
//   0 - `element` holds a parsed node (an element or a text node)
//   1 - a closing tag was read (or, in loose mode, assumed at end of input);
//       `payload` holds the tag name
//   2 - returned after a skipped construct such as a comment or a <? ... ?> section
//   4 - loose mode ran out of input with no open parent ("signal emptiness upstream")
struct Ele {
int type; // element or closing tag or nothing
// for type == 0; in loose mode a type 1 result may also carry the element
// being handed back up the parent chain
Element element; // for type == 0
string payload; // for type == 1
}
// recursively read a tag
Ele readElement(string[] parentChain = null) {
// FIXME: this is the slowest function in this module, by far, even in strict mode.
// Loose mode should perform decently, but strict mode is the important one.
if(!strict && parentChain is null)
parentChain = [];
if(pos >= data.length)
{
if(strict) {
throw new MarkupError("Gone over the input (is there no root element?), chain: " ~ to!string(parentChain));
} else {
if(parentChain.length)
return Ele(1, null, parentChain[0]); // in loose mode, we just assume the document has ended
else
return Ele(4); // signal emptiness upstream
}
}
if(data[pos] != '<') {
return Ele(0, readTextNode(), null);
}
enforce(data[pos] == '<');
pos++;
switch(data[pos]) {
// I don't care about these, so I just want to skip them
case '!': // might be a comment, a doctype, or a special instruction
pos++;
// FIXME: we should store these in the tree too
// though I like having it stripped out tbh.
if(data[pos] == '-' && data[pos+1] == '-') {
// comment
pos += 2;
while(data[pos..pos+3] != "-->")
pos++;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '>');
} else if(data[pos..pos + 7] == "[CDATA[") {
pos += 7;
// FIXME: major malfunction possible here
auto cdataStart = pos;
auto cdataEnd = pos + data[pos .. $].indexOf("]]>");
pos = cdataEnd + 3;
return Ele(0, new TextNode(this, data[cdataStart .. cdataEnd]), null);
} else
while(data[pos] != '>')
pos++;
pos++; // skip the >
break;
case '?':
char end = data[pos];
more:
pos++; // skip the start
while(data[pos] != end)
pos++;
pos++; // skip the end
// FIXME: we should actually store this somewhere
// though I like having it stripped out as well tbh.
if(data[pos] == '>')
pos++;
else
goto more;
break;
case '/': // closing an element
pos++; // skip the start
auto p = pos;
while(data[pos] != '>')
pos++;
//writefln("%s>", data[p..pos]);
pos++; // skip the '>'
string tname = data[p..pos-1];
if(!caseSensitive)
tname = tname.toLower;
return Ele(1, null, tname); // closing tag reports itself here
case ' ': // assume it isn't a real element...
if(strict)
parseError("bad markup - improperly placed <");
else
return Ele(0, TextNode.fromUndecodedString(this, "<"), null);
break;
default:
string tagName = readTagName();
string[string] attributes;
// Finishes the opening tag currently being read (tagName / attributes come
// from the enclosing scope), creates the element, and then reads children
// until the element is closed.
//
// selfClosed is true when the caller stopped on a '/' and false when it
// stopped on a '>'. In loose mode tags listed in selfClosedElements are
// treated as self closed regardless.
//
// Returns Ele(0, element) normally. In loose mode it may instead pass along a
// type 1 result (a closing tag that belongs to an ancestor, carrying this
// element) or a type 4 result (input exhausted) received from readElement.
//
// Fixes relative to the previous version: the script/style terminator, the
// text re-emitted for an unmatched closing tag, and the mismatch error message
// now all include the leading "</" of a closing tag.
Ele addTag(bool selfClosed) {
	if(selfClosed)
		pos++;
	else {
		if(!strict)
			if(tagName.isInArray(selfClosedElements))
				// these are de-facto self closed
				selfClosed = true;
	}

	if(strict)
		enforce(data[pos] == '>');
	else {
		// if we got here, it's probably because a slash was in an
		// unquoted attribute - don't trust the selfClosed value
		if(!selfClosed)
			selfClosed = tagName.isInArray(selfClosedElements);

		while(data[pos] != '>')
			pos++;
	}

	auto whereThisTagStarted = pos; // for better error messages

	pos++;

	auto e = createElement(tagName);
	e.attributes = attributes;
	e.selfClosed = selfClosed;
	e.parseAttributes();

	// HACK to handle script and style as a raw data section as it is in HTML browsers
	if(tagName == "script" || tagName == "style") {
		if(!selfClosed) {
			// The raw section ends at the matching closing tag. Searching for
			// the full "</name>" keeps the "</" out of innerRawSource.
			string closer = "</" ~ tagName ~ ">";
			auto ending = indexOf(data[pos..$], closer);
			if(loose && ending == -1)
				ending = indexOf(data[pos..$], closer.toUpper);
			if(ending == -1)
				throw new Exception("tag " ~ tagName ~ " never closed");
			ending += pos;
			e.innerRawSource = data[pos..ending];
			pos = ending + closer.length;
		}
		return Ele(0, e, null);
	}

	bool closed = selfClosed;

	// Loose mode only: flags the document for a later fix-up when a <p> is
	// about to receive another <p> as a child.
	void considerHtmlParagraphHack(Element n) {
		assert(!strict);
		if(e.tagName == "p" && e.tagName == n.tagName) {
			// html lets you write <p>para 1<p>para 1
			// but in the dom tree, they should be siblings, not children.
			paragraphHackfixRequired = true;
		}
	}

	while(!closed) {
		Ele n;
		if(strict)
			n = readElement();
		else
			n = readElement(parentChain ~ tagName);

		if(n.type == 4) return n; // the document is empty

		if(n.type == 0) {
			if(!strict)
				considerHtmlParagraphHack(n.element);
			e.appendChild(n.element);
		} else if(n.type == 1) {
			bool found = false;
			if(n.payload != tagName) {
				if(strict)
					parseError(format("mismatched tag: </%s> != <%s> (opened on line %d)", n.payload, tagName, getLineNumber(whereThisTagStarted)));
				else {
					sawImproperNesting = true;
					// this is so we don't drop several levels of awful markup
					if(n.element) {
						if(!strict)
							considerHtmlParagraphHack(n.element);
						e.appendChild(n.element);
						n.element = null;
					}

					// is the element open somewhere up the chain?
					foreach(parent; parentChain)
						if(parent == n.payload) {
							// hand this element up along with the closing tag
							n.element = e;
							return n;
						}

					// if not, this is a text node; we can't fix it up...

					// If it's already in the tree somewhere, assume it is closed by algorithm
					// and we shouldn't output it - odds are the user just flipped a couple tags
					foreach(ele; e.tree) {
						if(ele.tagName == n.payload) {
							found = true;
							break;
						}
					}

					if(!found) // if not found in the tree though, it's probably just text
						e.appendChild(TextNode.fromUndecodedString(this, "</"~n.payload~">"));
				}
			} else {
				if(n.element) {
					if(!strict)
						considerHtmlParagraphHack(n.element);
					e.appendChild(n.element);
				}
			}

			if(n.payload == tagName) // in strict mode, this is always true
				closed = true;
		} else { /*throw new Exception("wtf " ~ tagName);*/ }
	}

	return Ele(0, e, null);
}
switch(data[pos]) {
default: assert(0);
case '/': // self closing tag
return addTag(true);
case '>':
return addTag(false);
case ' ':
case '\t':
case '\n':
// there might be attributes...
moreAttributes:
eatWhitespace();
switch(data[pos]) {
case '/': // self closing tag
return addTag(true);
case '>': // closed tag; open -- we now read the contents
return addTag(false);
default: // it is an attribute
string attrName = readAttributeName();
string attrValue = attrName;
if(data[pos] == '=') {
pos++;
attrValue = readAttributeValue();
}
attributes[attrName] = attrValue;
goto moreAttributes;
}
}
}
return Ele(2, null, null); // this is a |