// FIXME: xml namespace support???
// FIXME: https://developer.mozilla.org/en-US/docs/Web/API/Element/insertAdjacentHTML
// FIXME: parentElement is parentNode that skips DocumentFragment etc but will be hard to work in with my compatibility...
// FIXME: the scriptable list is quite arbitrary
// xml entity references?!
/++
This is an html DOM implementation, started with cloning
what the browser offers in Javascript, but going well beyond
it in convenience.
If you can do it in Javascript, you can probably do it with
this module, and much more.
---
import arsd.dom;
void main() {
auto document = new Document("<html><p>paragraph</p></html>");
writeln(document.querySelector("p"));
document.root.innerHTML = "<p>hey</p>";
writeln(document);
}
---
BTW: this file optionally depends on `arsd.characterencodings`, to
help it correctly read files from the internet. You should be able to
get characterencodings.d from the same place you got this file.
If you want it to stand alone, just always use the `Document.parseUtf8`
function or the constructor that takes a string.
Symbol_groups:
core_functionality =
These members provide core functionality. The members on these classes
will provide most your direct interaction.
bonus_functionality =
These provide additional functionality for special use cases.
implementations =
These provide implementations of other functionality.
+/
module arsd.dom;
// FIXME: support the css standard namespace thing in the selectors too
// Optional scripting integration: when compiled with version `with_arsd_jsvar`,
// arsd.jsvar is imported.
version(with_arsd_jsvar)
import arsd.jsvar;
else {
// Without arsd.jsvar, `scriptable` is defined here as a plain string constant.
// NOTE(review): presumably this keeps `@scriptable` annotations elsewhere in this
// file compiling when arsd.jsvar is absent — confirm against the rest of the module.
enum scriptable = "arsd_jsvar_compatible";
}
// this is only meant to be used at compile time, as a filter for opDispatch
// lists the attributes we want to allow without the use of .attr
bool isConvenientAttribute(string name) {
	// Exact, case-sensitive match against the fixed set of attribute names
	// that may be accessed without going through .attr.
	// Expressed as a string switch, which is usable at compile time.
	switch(name) {
		case "name", "id", "href", "value",
			"checked", "selected", "type",
			"src", "content", "pattern",
			"placeholder", "required", "alt",
			"rel",
			"method", "action", "enctype":
			return true;
		default:
			return false;
	}
}
// FIXME: something like <ol>spam <ol> with no closing </ol>s should read the second tag as the closer in garbage mode
// FIXME: failing to close a paragraph sometimes messes things up too
// FIXME: it would be kinda cool to have some support for internal DTDs
// and maybe XPath as well, to some extent
/*
we could do
meh this sux
auto xpath = XPath(element);
// get the first p
xpath.p[0].a["href"]
*/
/++
The main document interface, including a html or xml parser.
There are three main ways to create a Document:
If you want to parse something and inspect the tags, you can use the [this|constructor]:
---
// create and parse some HTML in one call
auto document = new Document("");
// or some XML
auto document = new Document("", true, true); // strict mode enabled
// or better yet:
auto document = new XmlDocument(""); // specialized subclass
---
If you want to download something and parse it in one call, the [fromUrl] static function can help:
---
auto document = Document.fromUrl("http://dlang.org/");
---
(note that this requires my [arsd.characterencodings] and [arsd.http2] libraries)
And, if you need to inspect things like `<%= foo %>` tags and comments, you can add them to the dom like this, with the [enableAddingSpecialTagsToDom]
and [parseUtf8] or [parseGarbage] functions:
---
auto document = new Document();
document.enableAddingSpecialTagsToDom();
document.parseUtf8("", true, true); // change the trues to false to switch from xml to html mode
---
However you parse it, it will put a few things into special variables.
[root] contains the root document.
[prolog] contains the instructions before the root (like `<!DOCTYPE html>`). To keep the original things, you will need to [enableAddingSpecialTagsToDom] first, otherwise the library will return generic strings in there. [piecesBeforeRoot] will have other parsed instructions, if [enableAddingSpecialTagsToDom] is called.
[piecesAfterRoot] will contain any xml-looking data after the root tag is closed.
Most often though, you will not need to look at any of that data, since `Document` itself has methods like [querySelector], [appendChild], and more which will forward to the root [Element] for you.
+/
/// Group: core_functionality
class Document : FileResource, DomParent {
inout(Document) asDocument() inout {
	// A Document viewed as a Document is simply itself.
	return this;
}
inout(Element) asElement() inout {
	// A Document is never an Element, so there is nothing to return.
	return null;
}
/++
Convenience method for web scraping. Requires [arsd.http2] to be
included in the build as well as [arsd.characterencodings].
This will download the file from the given url and create a document
off it, using a strict constructor or a [parseGarbage], depending on
the value of `strictMode`.
+/
static Document fromUrl()(string url, bool strictMode = false) {
	import arsd.http2;

	// Issue a GET for the url and block until the response has arrived.
	auto http = new HttpClient();
	auto request = http.navigateTo(Uri(url), HttpVerb.GET);
	auto response = request.waitForCompletion();

	auto parsed = new Document();
	if(!strictMode) {
		// loose mode: parseGarbage works the encoding out from the data itself
		parsed.parseGarbage(cast(string) response.content);
		return parsed;
	}

	// strict mode: case-sensitive, strict parse using the charset the response reported
	parsed.parse(cast(string) response.content, true, true, response.contentTypeCharset);
	return parsed;
}
/++
Creates a document with the given source data. If you want HTML behavior, use `caseSensitive` and `strict` set to `false`. For XML mode, set them to `true`.
Please note that anything after the root element will be found in [piecesAfterRoot]. Comments, processing instructions, and other special tags will be stripped out by default. You can customize this by using the zero-argument constructor and setting callbacks on the [parseSawComment], [parseSawBangInstruction], [parseSawAspCode], [parseSawPhpCode], and [parseSawQuestionInstruction] members, then calling one of the [parseUtf8], [parseGarbage], or [parse] functions. Calling the convenience method, [enableAddingSpecialTagsToDom], will enable all those things at once.
See_Also:
[parseGarbage]
[parseUtf8]
[parseUrl]
+/
this(string data, bool caseSensitive = false, bool strict = false) {
	// Construction is just an immediate parseUtf8 of the supplied source,
	// with the same case-sensitivity and strictness flags.
	this.parseUtf8(data, caseSensitive, strict);
}
/**
Creates an empty document. It has *nothing* in it at all, ready.
*/
this() { /* intentionally does nothing: no parsing happens here */ }
/++
This is just something I'm toying with. Right now, you use opIndex to put in css selectors.
It returns a struct that forwards calls to all elements it holds, and returns itself so you
can chain it.
Example: document["p"].innerText("hello").addClass("modified");
Equivalent to: foreach(e; document.getElementsBySelector("p")) { e.innerText("hello"); e.addClass("modified"); }
Note: always use function calls (not property syntax) and don't use toString in there for best results.
You can also do things like: document["p"]["b"] though tbh I'm not sure why since the selector string can do all that anyway. Maybe
you could put in some kind of custom filter function tho.
+/
ElementCollection opIndex(string selector) {
	// Start from a collection wrapping the root, then index it with the selector.
	auto rootCollection = ElementCollection(this.root);
	return rootCollection[selector];
}
string _contentType = "text/html; charset=utf-8";
/// If you're using this for some other kind of XML, you can
/// set the content type here.
///
/// Note: this has no impact on the function of this class.
/// It is only used if the document is sent via a protocol like HTTP.
///
/// This may be called by parse() if it recognizes the data. Otherwise,
/// if you don't set it, it assumes text/html; charset=utf-8.
@property string contentType(string mimeType) {
	// Store the new value and hand the stored value back to the caller.
	return _contentType = mimeType;
}
/// implementing the FileResource interface, useful for sending via
/// http automatically.
@property string filename() const {
	// This always reports null as the filename.
	return null;
}
/// implementing the FileResource interface, useful for sending via
/// http automatically.
override @property string contentType() const {
	// Whatever was last stored via the setter, or the field's initial value.
	return this._contentType;
}
/// implementing the FileResource interface; it calls toString.
override immutable(ubyte)[] getData() const {
	// Serialize through toString, then reinterpret the characters as raw bytes.
	auto serialized = this.toString();
	return cast(immutable(ubyte)[]) serialized;
}
/*
/// Concatenates any consecutive text nodes
void normalize() {
}
*/
/// This will set delegates for parseSaw* (note: this overwrites anything else you set, and you setting subsequently will overwrite this) that add those things to the dom tree when it sees them.
/// Call this before calling parse().
/++
Adds objects to the dom representing things normally stripped out during the default parse, like comments, `<!instructions>`, `<% code%>`, and `<? code?>` all at once.
Note this will also preserve the prolog and doctype from the original file, if there was one.
See_Also:
[parseSawComment]
[parseSawAspCode]
[parseSawPhpCode]
[parseSawQuestionInstruction]
[parseSawBangInstruction]
+/
void enableAddingSpecialTagsToDom() {
	// One accept-everything callback, installed on every parseSaw* hook.
	// This replaces whatever callbacks were set before the call.
	bool delegate(string) keepEverything = (string) => true;

	parseSawComment = keepEverything;
	parseSawAspCode = keepEverything;
	parseSawPhpCode = keepEverything;
	parseSawQuestionInstruction = keepEverything;
	parseSawBangInstruction = keepEverything;
}
/// If the parser sees an html comment, it will call this callback
/// <!-- comment --> will call parseSawComment(" comment ")
/// Return true if you want the node appended to the document. It will be in a [HtmlComment] object.
bool delegate(string) parseSawComment;
/// If the parser sees <% asp code... %>, it will call this callback.
/// It will be passed "% asp code... %" or "%= asp code .. %"
/// Return true if you want the node appended to the document. It will be in an [AspCode] object.
bool delegate(string) parseSawAspCode;
/// If the parser sees <?php php code... ?>, it will call this callback.
/// It will be passed "?php php code... ?" or "?= php code .. ?"
/// Note: dom.d cannot identify the other php <? code ?> short format.
/// Return true if you want the node appended to the document. It will be in a [PhpCode] object.
bool delegate(string) parseSawPhpCode;
/// if it sees a <?xxx> that is not php or asp
/// it calls this function with the contents.
/// <?SOMETHING foo> calls parseSawQuestionInstruction("?SOMETHING foo")
/// Unlike the php/asp ones, this ends on the first > it sees, without requiring ?>.
/// Return true if you want the node appended to the document. It will be in a [QuestionInstruction] object.
bool delegate(string) parseSawQuestionInstruction;
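/// if it sees a <! that is not CDATA or a comment (CDATA is handled automatically and comments call parseSawComment),
/// it calls this function with the contents.
/// <!SOMETHING foo> calls parseSawBangInstruction("SOMETHING foo")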
/// Return true if you want the node appended to the document. It will be in a [BangInstruction] object.
bool delegate(string) parseSawBangInstruction;
/// Given the kind of garbage you find on the Internet, try to make sense of it.
/// Equivalent to document.parse(data, false, false, null);
/// (Case-insensitive, non-strict, determine character encoding from the data.)
/// NOTE: this makes no attempt at added security, but it will try to recover from anything instead of throwing.
///
/// It is a template so it lazily imports characterencodings.
void parseGarbage()(string data) {
	// Spell out the three fixed arguments so the call reads on its own.
	const bool caseSensitive = false;
	const bool strict = false;
	string detectFromData = null; // null makes parse() work out the encoding itself
	parse(data, caseSensitive, strict, detectFromData);
}
/// Parses well-formed UTF-8, case-sensitive, XML or XHTML
/// Will throw exceptions on things like unclosed tags.
void parseStrict(string data) {
	// Wrap the text as a stream, then parse case-sensitively in strict mode.
	auto stream = toUtf8Stream(data);
	parseStream(stream, true, true);
}
/// Parses well-formed UTF-8 in loose mode (by default). Tries to correct
/// tag soup, but does NOT try to correct bad character encodings.
///
/// They will still throw an exception.
void parseUtf8(string data, bool caseSensitive = false, bool strict = false) {
	// No encoding detection or conversion: the text is wrapped and parsed as-is.
	auto stream = toUtf8Stream(data);
	parseStream(stream, caseSensitive, strict);
}
// this is a template so we get lazy import behavior
// Works out the character encoding of `rawdata` (unless the caller supplies one),
// converts the data to UTF-8 and returns it wrapped by toUtf8Stream.
//
// Params:
//   rawdata      = the document bytes, in whatever encoding they arrived in
//   dataEncoding = encoding name, or null to sniff it from the data
//   strict       = when true, an undeterminable encoding throws MarkupException and
//                  conversion errors propagate; when false, Windows 1252 is the fallback
Utf8Stream handleDataEncoding()(in string rawdata, string dataEncoding, bool strict) {
	import arsd.characterencodings;

	// gotta determine the data encoding. If you know it, pass it in above to skip all this.
	if(dataEncoding is null) {
		dataEncoding = tryToDetermineEncoding(cast(const(ubyte[])) rawdata);
		// it can't tell... probably a random 8 bit encoding. Let's check the document itself.
		// Now, XML and HTML can both list encoding in the document, but we can't really parse
		// it here without changing a lot of code until we know the encoding. So I'm going to
		// do some hackish string checking.
		if(dataEncoding is null) {
			auto dataAsBytes = cast(immutable(ubyte)[]) rawdata;

			// first, look for an XML prolog
			auto idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "encoding=\"");
			if(idx != -1) {
				idx += "encoding=\"".length;
				// we're probably past the prolog if it's this far in; we might be looking at
				// content. Forget about it.
				if(idx > 100)
					idx = -1;
			}

			// if that fails, we're looking for Content-Type http-equiv or a meta charset (see html5)..
			if(idx == -1) {
				idx = indexOfBytes(dataAsBytes, cast(immutable ubyte[]) "charset=");
				if(idx != -1) {
					idx += "charset=".length;
					// "charset=" may be the very last thing in the data, so check the
					// bound before peeking at the next byte. The value may be wrapped in
					// either kind of quote (or none at all).
					if(idx < dataAsBytes.length && (dataAsBytes[idx] == '"' || dataAsBytes[idx] == '\''))
						idx++;
				}
			}

			// found something in either branch...
			if(idx != -1) {
				// read till a quote or about 12 chars, whichever comes first...
				auto end = idx;
				while(end < dataAsBytes.length
					&& dataAsBytes[end] != '"'
					&& dataAsBytes[end] != '\''
					&& end - idx < 12)
				{
					end++;
				}

				dataEncoding = cast(string) dataAsBytes[idx .. end];
			}
			// otherwise, we just don't know.
		}
	}

	if(dataEncoding is null) {
		if(strict)
			throw new MarkupException("I couldn't figure out the encoding of this document.");
		else
			// if we really don't know by here, it means we already tried UTF-8,
			// looked for utf 16 and 32 byte order marks, and looked for xml or meta
			// tags... let's assume it's Windows-1252, since that's probably the most
			// common aside from utf that wouldn't be labeled.
			dataEncoding = "Windows 1252";
	}

	// and now, go ahead and convert it.
	string data;

	if(!strict) {
		// if we're in non-strict mode, we need to check
		// the document for mislabeling too; sometimes
		// web documents will say they are utf-8, but aren't
		// actually properly encoded. If it fails to validate,
		// we'll assume it's actually Windows encoding - the most
		// likely candidate for mislabeled garbage.
		dataEncoding = dataEncoding.toLower();
		dataEncoding = dataEncoding.replace(" ", "");
		dataEncoding = dataEncoding.replace("-", "");
		dataEncoding = dataEncoding.replace("_", "");
		if(dataEncoding == "utf8") {
			try {
				validate(rawdata);
			} catch(UTFException e) {
				dataEncoding = "Windows 1252";
			}
		}
	}

	// Note: in non-strict mode the name was lowercased above, so it can never equal
	// "UTF-8" here and the data always goes through convertToUtf8 in that mode.
	if(dataEncoding != "UTF-8") {
		if(strict)
			data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
		else {
			try {
				data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, dataEncoding);
			} catch(Exception e) {
				// unknown or failing encoding name: retry as Windows 1252
				data = convertToUtf8(cast(immutable(ubyte)[]) rawdata, "Windows 1252");
			}
		}
	} else
		data = rawdata;

	return toUtf8Stream(data);
}
private
Utf8Stream toUtf8Stream(in string rawdata) {
	// Utf8Stream is either plain `string` or a wrapper class; the choice is made
	// at compile time, so only one of the two returns below is ever compiled in.
	string text = rawdata;
	static if(!is(Utf8Stream == string))
		return new Utf8Stream(text);
	else
		return text;
}
/++
List of elements that can be assumed to be self-closed
in this document. The default for a Document are a hard-coded
list of ones appropriate for HTML. For [XmlDocument], it defaults
to empty. You can modify this after construction but before parsing.
History:
Added February 8, 2021 (included in dub release 9.2)
+/
string[] selfClosedElements = htmlSelfClosedElements;
/++
List of elements that are considered inline for pretty printing.
The default for a Document are hard-coded to something appropriate
for HTML. For [XmlDocument], it defaults to empty. You can modify
this after construction but before parsing.
History:
Added June 21, 2021 (included in dub release 10.1)
+/
string[] inlineElements = htmlInlineElements;
/**
Take XMLish data and try to make the DOM tree out of it.
The goal isn't to be perfect, but to just be good enough to
approximate Javascript's behavior.
If strict, it throws on something that doesn't make sense.
(Examples: mismatched tags. It doesn't validate!)
If not strict, it tries to recover anyway, and only throws
when something is REALLY unworkable.
If strict is false, it uses a magic list of tags that needn't
be closed. If you are writing a document specifically for this,
try to avoid such - use self closed tags at least. Easier to parse.
The dataEncoding argument can be used to pass a specific
charset encoding for automatic conversion. If null (which is NOT
the default!), it tries to determine the encoding from the data itself,
then from the xml prolog or meta tags. If it still cannot tell, strict
mode throws and non-strict mode assumes Windows-1252.
If the encoding it ends up with is wrong, it can throw on non-ascii
characters! (Non-strict mode retries the conversion as Windows-1252 first.)
Note that it previously assumed the data was encoded as UTF-8, which
is why the dataEncoding argument defaults to that.
So it shouldn't break backward compatibility.
But, if you want the best behavior on wild data - figuring it out from the document
instead of assuming - you'll probably want to change that argument to null.
This is a template so it lazily imports arsd.characterencodings, which is required
to fix up data encodings.
If you are sure the encoding is good, try parseUtf8 or parseStrict to avoid the
dependency. If it is data from the Internet though, a random website, the encoding
is often a lie. This function, if dataEncoding == null, can correct for that, or
you can try parseGarbage. In those cases, arsd.characterencodings is required to
compile.
*/
void parse()(in string rawdata, bool caseSensitive = false, bool strict = false, string dataEncoding = "UTF-8") {
	// Two steps: settle the encoding and obtain a UTF-8 stream, then run the parser on it.
	parseStream(handleDataEncoding(rawdata, dataEncoding, strict), caseSensitive, strict);
}
// note: this work best in strict mode, unless data is just a simple string wrapper
void parseStream(Utf8Stream data, bool caseSensitive = false, bool strict = false) {
// FIXME: this parser could be faster; it's in the top ten biggest tree times according to the profiler
// of my big app.
assert(data !is null);
// go through character by character.
// if you see a <, consider it a tag.
// name goes until the first non tagname character
// then see if it self closes or has an attribute
// if not in a tag, anything not a tag is a big text
// node child. It ends as soon as it sees a <
// Whitespace in text or attributes is preserved, but not between attributes
// & and friends are converted when I know them, left the same otherwise
// this it should already be done correctly.. so I'm leaving it off to net a ~10% speed boost on my typical test file (really)
//validate(data); // it *must* be UTF-8 for this to work correctly
sizediff_t pos = 0;
clear();
loose = !caseSensitive;
bool sawImproperNesting = false;
bool paragraphHackfixRequired = false;
int getLineNumber(sizediff_t p) {
int line = 1;
foreach(c; data[0..p])
if(c == '\n')
line++;
return line;
}
void parseError(string message) {
throw new MarkupException(format("char %d (line %d): %s", pos, getLineNumber(pos), message));
}
bool eatWhitespace() {
bool ateAny = false;
while(pos < data.length && data[pos].isSimpleWhite) {
pos++;
ateAny = true;
}
return ateAny;
}
string readTagName() {
// remember to include : for namespaces
// basically just keep going until >, /, or whitespace
auto start = pos;
while(data[pos] != '>' && data[pos] != '/' && !data[pos].isSimpleWhite)
{
pos++;
if(pos == data.length) {
if(strict)
throw new Exception("tag name incomplete when file ended");
else
break;
}
}
if(!caseSensitive)
return toLower(data[start..pos]);
else
return data[start..pos];
}
string readAttributeName() {
// remember to include : for namespaces
// basically just keep going until >, /, or whitespace
auto start = pos;
while(data[pos] != '>' && data[pos] != '/' && data[pos] != '=' && !data[pos].isSimpleWhite)
{
if(data[pos] == '<') {
if(strict)
throw new MarkupException("The character < can never appear in an attribute name. Line " ~ to!string(getLineNumber(pos)));
else
break; // e.g. . The > should have been after the href, but some shitty files don't do that right and the browser handles it, so we will too, by pretending the > was indeed there
}
pos++;
if(pos == data.length) {
if(strict)
throw new Exception("unterminated attribute name");
else
break;
}
}
if(!caseSensitive)
return toLower(data[start..pos]);
else
return data[start..pos];
}
string readAttributeValue() {
if(pos >= data.length) {
if(strict)
throw new Exception("no attribute value before end of file");
else
return null;
}
switch(data[pos]) {
case '\'':
case '"':
auto started = pos;
char end = data[pos];
pos++;
auto start = pos;
while(pos < data.length && data[pos] != end)
pos++;
if(strict && pos == data.length)
throw new MarkupException("Unclosed attribute value, started on char " ~ to!string(started));
string v = htmlEntitiesDecode(data[start..pos], strict);
pos++; // skip over the end
return v;
default:
if(strict)
parseError("Attributes must be quoted");
// read until whitespace or terminator (/> or >)
auto start = pos;
while(
pos < data.length &&
data[pos] != '>' &&
// unquoted attributes might be urls, so gotta be careful with them and self-closed elements
!(data[pos] == '/' && pos + 1 < data.length && data[pos+1] == '>') &&
!data[pos].isSimpleWhite)
pos++;
string v = htmlEntitiesDecode(data[start..pos], strict);
// don't skip the end - we'll need it later
return v;
}
}
TextNode readTextNode() {
auto start = pos;
while(pos < data.length && data[pos] != '<') {
pos++;
}
return TextNode.fromUndecodedString(this, data[start..pos]);
}
// this is obsolete!
RawSource readCDataNode() {
auto start = pos;
while(pos < data.length && data[pos] != '<') {
pos++;
}
return new RawSource(this, data[start..pos]);
}
struct Ele {
int type; // element or closing tag or nothing
/*
type == 0 means regular node, self-closed (element is valid)
type == 1 means closing tag (payload is the tag name, element may be valid)
type == 2 means you should ignore it completely
type == 3 means it is a special element that should be appended, if possible, e.g. a that was chosen to be kept, php code, or comment. It will be appended at the current element if inside the root, and to a special document area if not
type == 4 means the document was totally empty
*/
Element element; // for type == 0 or type == 3
string payload; // for type == 1
}
// recursively read a tag
Ele readElement(string[] parentChain = null) {
// FIXME: this is the slowest function in this module, by far, even in strict mode.
// Loose mode should perform decently, but strict mode is the important one.
if(!strict && parentChain is null)
parentChain = [];
static string[] recentAutoClosedTags;
if(pos >= data.length)
{
if(strict) {
throw new MarkupException("Gone over the input (is there no root element or did it never close?), chain: " ~ to!string(parentChain));
} else {
if(parentChain.length)
return Ele(1, null, parentChain[0]); // in loose mode, we just assume the document has ended
else
return Ele(4); // signal emptiness upstream
}
}
if(data[pos] != '<') {
return Ele(0, readTextNode(), null);
}
enforce(data[pos] == '<');
pos++;
if(pos == data.length) {
if(strict)
throw new MarkupException("Found trailing < at end of file");
// if not strict, we'll just skip the switch
} else
switch(data[pos]) {
// I don't care about these, so I just want to skip them
case '!': // might be a comment, a doctype, or a special instruction
pos++;
// FIXME: we should store these in the tree too
// though I like having it stripped out tbh.
if(pos == data.length) {
if(strict)
throw new MarkupException(" block.
// so in since that's the common way
auto commentStart = pos;
while(pos+3 < data.length && data[pos..pos+3] != "-->")
pos++;
auto end = commentStart;
if(pos + 3 >= data.length) {
if(strict)
throw new MarkupException("unclosed comment");
end = data.length;
pos = data.length;
} else {
end = pos;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '-');
pos++;
assert(data[pos] == '>');
pos++;
}
if(parseSawComment !is null)
if(parseSawComment(data[commentStart .. end])) {
return Ele(3, new HtmlComment(this, data[commentStart .. end]), null);
}
} else if(pos + 7 <= data.length && data[pos..pos + 7] == "[CDATA[") {
pos += 7;
auto cdataStart = pos;
ptrdiff_t end = -1;
typeof(end) cdataEnd;
if(pos < data.length) {
// cdata isn't allowed to nest, so this should be generally ok, as long as it is found
end = data[pos .. $].indexOf("]]>");
}
if(end == -1) {
if(strict)
throw new MarkupException("Unclosed CDATA section");
end = pos;
cdataEnd = pos;
} else {
cdataEnd = pos + end;
pos = cdataEnd + 3;
}
return Ele(0, new TextNode(this, data[cdataStart .. cdataEnd]), null);
} else {
auto start = pos;
while(pos < data.length && data[pos] != '>')
pos++;
auto bangEnds = pos;
if(pos == data.length) {
if(strict)
throw new MarkupException("unclosed processing instruction ()");
} else pos++; // skipping the >
if(parseSawBangInstruction !is null)
if(parseSawBangInstruction(data[start .. bangEnds])) {
// FIXME: these should be able to modify the parser state,
// doing things like adding entities, somehow.
return Ele(3, new BangInstruction(this, data[start .. bangEnds]), null);
}
}
/*
if(pos < data.length && data[pos] == '>')
pos++; // skip the >
else
assert(!strict);
*/
break;
case '%':
case '?':
/*
Here's what we want to support:
<% asp code %>
<%= asp code %>
= php code ?>
The contents don't really matter, just if it opens with
one of the above for, it ends on the two char terminator.
this is NOT php code
because I've seen this in the wild:
This could be php with shorttags which would be cut off
prematurely because if(a >) - that > counts as the close
of the tag, but since dom.d can't tell the difference
between that and the real world example, it will
not try to look for the ?> ending.
The difference between this and the asp/php stuff is that it
ends on >, not ?>. ONLY . The rest end
on >.
*/
char end = data[pos];
auto started = pos;
bool isAsp = end == '%';
int currentIndex = 0;
bool isPhp = false;
bool isEqualTag = false;
int phpCount = 0;
more:
pos++; // skip the start
if(pos == data.length) {
if(strict)
throw new MarkupException("Unclosed <"~end~" by end of file");
} else {
currentIndex++;
if(currentIndex == 1 && data[pos] == '=') {
if(!isAsp)
isPhp = true;
isEqualTag = true;
goto more;
}
if(currentIndex == 1 && data[pos] == 'p')
phpCount++;
if(currentIndex == 2 && data[pos] == 'h')
phpCount++;
if(currentIndex == 3 && data[pos] == 'p' && phpCount == 2)
isPhp = true;
if(data[pos] == '>') {
if((isAsp || isPhp) && data[pos - 1] != end)
goto more;
// otherwise we're done
} else
goto more;
}
//writefln("%s: %s", isAsp ? "ASP" : isPhp ? "PHP" : " ", data[started .. pos]);
auto code = data[started .. pos];
assert((pos < data.length && data[pos] == '>') || (!strict && pos == data.length));
if(pos < data.length)
pos++; // get past the >
if(isAsp && parseSawAspCode !is null) {
if(parseSawAspCode(code)) {
return Ele(3, new AspCode(this, code), null);
}
} else if(isPhp && parseSawPhpCode !is null) {
if(parseSawPhpCode(code)) {
return Ele(3, new PhpCode(this, code), null);
}
} else if(!isAsp && !isPhp && parseSawQuestionInstruction !is null) {
if(parseSawQuestionInstruction(code)) {
return Ele(3, new QuestionInstruction(this, code), null);
}
}
break;
case '/': // closing an element
pos++; // skip the start
auto p = pos;
while(pos < data.length && data[pos] != '>')
pos++;
//writefln("%s>", data[p..pos]);
if(pos == data.length && data[pos-1] != '>') {
if(strict)
throw new MarkupException("File ended before closing tag had a required >");
else
data ~= ">"; // just hack it in
}
pos++; // skip the '>'
string tname = data[p..pos-1];
if(!strict)
tname = tname.strip;
if(!caseSensitive)
tname = tname.toLower();
return Ele(1, null, tname); // closing tag reports itself here
case ' ': // assume it isn't a real element...
if(strict) {
parseError("bad markup - improperly placed <");
assert(0); // parseError always throws
} else
return Ele(0, TextNode.fromUndecodedString(this, "<"), null);
default:
if(!strict) {
// what about something that kinda looks like a tag, but isn't?
auto nextTag = data[pos .. $].indexOf("<");
auto closeTag = data[pos .. $].indexOf(">");
if(closeTag != -1 && nextTag != -1)
if(nextTag < closeTag) {
// since attribute names cannot possibly have a < in them, we'll look for an equal since it might be an attribute value... and even in garbage mode, it'd have to be a quoted one realistically
auto equal = data[pos .. $].indexOf("=\"");
if(equal != -1 && equal < closeTag) {
// this MIGHT be ok, soldier on
} else {
// definitely no good, this must be a (horribly distorted) text node
pos++; // skip the < we're on - don't want text node to end prematurely
auto node = readTextNode();
node.contents = "<" ~ node.contents; // put this back
return Ele(0, node, null);
}
}
}
string tagName = readTagName();
string[string] attributes;
Ele addTag(bool selfClosed) {
if(selfClosed)
pos++;
else {
if(!strict)
if(tagName.isInArray(selfClosedElements))
// these are de-facto self closed
selfClosed = true;
}
import std.algorithm.comparison;
if(strict) {
enforce(data[pos] == '>', format("got %s when expecting > (possible missing attribute name)\nContext:\n%s", data[pos], data[max(0, pos - 100) .. min(data.length, pos + 100)]));
} else {
// if we got here, it's probably because a slash was in an
// unquoted attribute - don't trust the selfClosed value
if(!selfClosed)
selfClosed = tagName.isInArray(selfClosedElements);
while(pos < data.length && data[pos] != '>')
pos++;
if(pos >= data.length) {
// the tag never closed
assert(data.length != 0);
pos = data.length - 1; // rewinding so it hits the end at the bottom..
}
}
auto whereThisTagStarted = pos; // for better error messages
pos++;
auto e = createElement(tagName);
e.attributes = attributes;
version(dom_node_indexes) {
if(e.dataset.nodeIndex.length == 0)
e.dataset.nodeIndex = to!string(&(e.attributes));
}
e.selfClosed = selfClosed;
e.parseAttributes();
// HACK to handle script and style as a raw data section as it is in HTML browsers
if(tagName == "script" || tagName == "style") {
if(!selfClosed) {
string closer = "" ~ tagName ~ ">";
ptrdiff_t ending;
if(pos >= data.length)
ending = -1;
else
ending = indexOf(data[pos..$], closer);
ending = indexOf(data[pos..$], closer, 0, (loose ? CaseSensitive.no : CaseSensitive.yes));
/*
if(loose && ending == -1 && pos < data.length)
ending = indexOf(data[pos..$], closer.toUpper());
*/
if(ending == -1) {
if(strict)
throw new Exception("tag " ~ tagName ~ " never closed");
else {
// let's call it totally empty and do the rest of the file as text. doing it as html could still result in some weird stuff like if(a<4) being read as <4 being a tag so it comes out if(a<4>4> and other weirdness) It is either a closed script tag or the rest of the file is forfeit.
if(pos < data.length) {
e = new TextNode(this, data[pos .. $]);
pos = data.length;
}
}
} else {
ending += pos;
e.innerRawSource = data[pos..ending];
pos = ending + closer.length;
}
}
return Ele(0, e, null);
}
bool closed = selfClosed;
void considerHtmlParagraphHack(Element n) {
assert(!strict);
if(e.tagName == "p" && e.tagName == n.tagName) {
// html lets you write para 1
para 1
// but in the dom tree, they should be siblings, not children.
paragraphHackfixRequired = true;
}
}
//writef("<%s>", tagName);
while(!closed) {
Ele n;
if(strict)
n = readElement();
else
n = readElement(parentChain ~ tagName);
if(n.type == 4) return n; // the document is empty
if(n.type == 3 && n.element !is null) {
// special node, append if possible
if(e !is null)
e.appendChild(n.element);
else
piecesBeforeRoot ~= n.element;
} else if(n.type == 0) {
if(!strict)
considerHtmlParagraphHack(n.element);
e.appendChild(n.element);
} else if(n.type == 1) {
bool found = false;
if(n.payload != tagName) {
if(strict)
parseError(format("mismatched tag: %s> != <%s> (opened on line %d)", n.payload, tagName, getLineNumber(whereThisTagStarted)));
else {
sawImproperNesting = true;
// this is so we don't drop several levels of awful markup
if(n.element) {
if(!strict)
considerHtmlParagraphHack(n.element);
e.appendChild(n.element);
n.element = null;
}
// is the element open somewhere up the chain?
foreach(i, parent; parentChain)
if(parent == n.payload) {
recentAutoClosedTags ~= tagName;
// just rotating it so we don't inadvertently break stuff with vile crap
if(recentAutoClosedTags.length > 4)
recentAutoClosedTags = recentAutoClosedTags[1 .. $];
n.element = e;
return n;
}
// if not, this is a text node; we can't fix it up...
// If it's already in the tree somewhere, assume it is closed by algorithm
// and we shouldn't output it - odds are the user just flipped a couple tags
foreach(ele; e.tree) {
if(ele.tagName == n.payload) {
found = true;
break;
}
}
foreach(ele; recentAutoClosedTags) {
if(ele == n.payload) {
found = true;
break;
}
}
if(!found) // if not found in the tree though, it's probably just text
e.appendChild(TextNode.fromUndecodedString(this, ""~n.payload~">"));
}
} else {
if(n.element) {
if(!strict)
considerHtmlParagraphHack(n.element);
e.appendChild(n.element);
}
}
if(n.payload == tagName) // in strict mode, this is always true
closed = true;
} else { /*throw new Exception("wtf " ~ tagName);*/ }
}
//writef("%s>\n", tagName);
return Ele(0, e, null);
}
// if a tag was opened but not closed by end of file, we can arrive here
if(!strict && pos >= data.length)
return addTag(false);
//else if(strict) assert(0); // should be caught before
switch(data[pos]) {
default: assert(0);
case '/': // self closing tag
return addTag(true);
case '>':
return addTag(false);
case ' ':
case '\t':
case '\n':
case '\r':
// there might be attributes...
moreAttributes:
eatWhitespace();
// same deal as above the switch....
if(!strict && pos >= data.length)
return addTag(false);
if(strict && pos >= data.length)
throw new MarkupException("tag open, didn't find > before end of file");
switch(data[pos]) {
case '/': // self closing tag
return addTag(true);
case '>': // closed tag; open -- we now read the contents
return addTag(false);
default: // it is an attribute
string attrName = readAttributeName();
string attrValue = attrName;
bool ateAny = eatWhitespace();
if(strict && ateAny)
throw new MarkupException("inappropriate whitespace after attribute name");
if(pos >= data.length) {
if(strict)
assert(0, "this should have thrown in readAttributeName");
else {
data ~= ">";
goto blankValue;
}
}
if(data[pos] == '=') {
pos++;
ateAny = eatWhitespace();
// the spec actually allows this!
//if(strict && ateAny)
//throw new MarkupException("inappropriate whitespace after attribute equals");
attrValue = readAttributeValue();
eatWhitespace();
}
blankValue:
if(strict && attrName in attributes)
throw new MarkupException("Repeated attribute: " ~ attrName);
if(attrName.strip().length)
attributes[attrName] = attrValue;
else if(strict) throw new MarkupException("wtf, zero length attribute name");
if(!strict && pos < data.length && data[pos] == '<') {
// this is the broken tag that doesn't have a > at the end
data = data[0 .. pos] ~ ">" ~ data[pos.. $];
// let's insert one as a hack
goto case '>';
}
goto moreAttributes;
}
}
}
return Ele(2, null, null); // this is a