// Written in the D programming language.
/**
Classes and functions for creating and parsing XML
The basic architecture of this module is that there are standalone functions, classes for constructing an XML
document from scratch (Tag, Element and Document), and also classes for parsing a pre-existing
XML file (ElementParser and DocumentParser). The parsing classes may be used to build
a Document, but that is not their primary purpose. The handling capabilities of DocumentParser
and ElementParser are sufficiently customizable that you can make them do pretty much
whatever you want.
Authors: Janice Caron
Date: 2006.02.12
License: Public Domain
Examples:
--------------------------------------------------------------------------------------------------
import std.xml;
import std.stdio;
import std.string;
// books.xml is used in various samples throughout the Microsoft XML Core Services (MSXML) SDK.
// See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
struct Book
{
string id;
string author;
string title;
string genre;
string price;
string pubDate;
string description;
}
void main()
{
string s = import("books.xml");
// Check for well-formedness
check(s);
// Take it apart
Book[] books;
auto xml = new DocumentParser(s);
xml.onStartTag["book"] = delegate void(ElementParser xml)
{
Book book;
book.id = xml.tag.attr["id"];
xml.onEndTag["author"] = delegate void(in Element e) { book.author = e.text; };
xml.onEndTag["title"] = delegate void(in Element e) { book.title = e.text; };
xml.onEndTag["genre"] = delegate void(in Element e) { book.genre = e.text; };
xml.onEndTag["price"] = delegate void(in Element e) { book.price = e.text; };
xml.onEndTag["publish-date"] = delegate void(in Element e) { book.pubDate = e.text; };
xml.onEndTag["description"] = delegate void(in Element e) { book.description = e.text; };
xml.parse();
books ~= book;
};
xml.parse();
// Put it back together again;
auto doc = new Document("catalog");
foreach(book;books)
{
auto element = new Element("book");
element.tag.attr["id"] = book.id;
element ~= new Element("author", book.author);
element ~= new Element("title", book.title);
element ~= new Element("genre", book.genre);
element ~= new Element("price", book.price);
element ~= new Element("publish-date",book.pubDate);
element ~= new Element("description", book.description);
doc ~= element;
}
// Now let's pretty-print it to see what it looks like
writefln(join(doc.pretty(3),"\n"));
}
--------------------------------------------------------------------------------------------------
*/
module std.xml;
import std.string;
import std.utf;
/**
 * Tests whether a code point is a character permitted by the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Why provide character-testing functions at all, when std.utf already has
 * isValidDchar() and std.uni has isUniAlpha()? Because those functions follow
 * different standards and disagree in edge cases. For example,
 * std.utf.isValidDchar('\u0008') returns true, whereas std.xml.isChar('\u0008')
 * returns false: control codes are valid Unicode characters but are not permitted
 * in XML documents. Similarly, std.uni.isUniAlpha('\U00020000') returns true,
 * whereas std.xml.isLetter('\U00020000') returns false, because Unified Han
 * Ideographs are not permitted within tag names or attribute names in XML.
 * These functions let you check what is actually allowed in XML, according to
 * the W3C.
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in CharTable
 */
bool isChar(dchar c) // rule 2
{
    if (lookup(CharTable, c)) return true;
    return false;
}
unittest
{
    // const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,0x10000,0x10FFFF];

    // Small wrappers so each boundary probe reads as a single word.
    void accept(uint code) { assert( isChar(cast(dchar)code)); }
    void reject(uint code) { assert(!isChar(cast(dchar)code)); }

    // Tab, linefeed and carriage return are the only permitted control codes.
    reject(0x8);
    accept(0x9);
    accept(0xA);
    reject(0xB);
    reject(0xC);
    accept(0xD);
    reject(0xE);
    reject(0x1F);

    // Start of the main range.
    accept(0x20);
    accept('J');

    // The surrogate block is excluded.
    accept(0xD7FF);
    reject(0xD800);
    reject(0xDFFF);
    accept(0xE000);

    // U+FFFE and U+FFFF are excluded.
    accept(0xFFFD);
    reject(0xFFFE);
    reject(0xFFFF);

    // Supplementary planes, up to the last code point.
    accept(0x10000);
    accept(0x10FFFF);
    reject(0x110000);
}
/**
 * Tests whether a character is whitespace according to the XML standard.
 *
 * XML recognises exactly four whitespace characters: space (U+0020),
 * tab (U+0009), linefeed (U+000A) and carriage return (U+000D).
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: true if c is one of the four characters above
 */
bool isSpace(dchar c)
{
    switch (c)
    {
        case '\u0020', '\u0009', '\u000A', '\u000D':
            return true;
        default:
            return false;
    }
}
/**
 * Tests whether a character is a digit according to the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in DigitTable
 */
bool isDigit(dchar c)
{
    return lookup(DigitTable, c) ? true : false;
}
/**
 * Tests whether a character is a letter according to the XML standard.
 *
 * A letter is either an ideographic character or a base character.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: true if isIdeographic(c) or isBaseChar(c) holds
 */
bool isLetter(dchar c) // rule 84
{
    // Same evaluation order as a short-circuit "||": ideographic first.
    if (isIdeographic(c)) return true;
    return isBaseChar(c);
}
/**
 * Tests whether a character is an ideographic character according to the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in IdeographicTable
 */
bool isIdeographic(dchar c)
{
    if (lookup(IdeographicTable, c)) return true;
    return false;
}
/**
 * Tests whether a character is a base character according to the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in BaseCharTable
 */
bool isBaseChar(dchar c)
{
    return lookup(BaseCharTable, c) ? true : false;
}
/**
 * Tests whether a character is a combining character according to the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in CombiningCharTable
 */
bool isCombiningChar(dchar c)
{
    if (lookup(CombiningCharTable, c)) return true;
    return false;
}
/**
 * Tests whether a character is an extender according to the XML standard.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * c = the character to be tested
 *
 * Returns: the result of looking c up in ExtenderTable
 */
bool isExtender(dchar c)
{
    return lookup(ExtenderTable, c) ? true : false;
}
/**
 * Encodes a string by replacing all characters which need to be escaped with
 * the appropriate predefined XML entities.
 *
 * encode() escapes five characters: ampersand (&amp;), quote (&quot;),
 * apostrophe (&apos;), less-than (&lt;) and greater-than (&gt;).
 * decode() reverses the transformation. These functions are provided for
 * convenience only. You do not need to use them when using the std.xml classes,
 * because then all the encoding and decoding will be done for you automatically.
 *
 * If the string is not modified, the original will be returned.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * s = The string to be encoded
 *
 * Returns: The encoded string
 *
 * Examples:
 * --------------
 * writefln(encode("a > b")); // writes "a &gt; b"
 * --------------
 */
string encode(string s)
{
    // The guards are (temporarily, we hope) necessary, because std.string.replace
    // does not do copy-on-write, but instead always copies. They are what lets
    // an unmodified input be returned as the very same string.
    //
    // The ampersand must be handled first: every other replacement introduces
    // an ampersand, which would otherwise be escaped a second time.
    if (s.find('&') != -1) s = replace(s,"&","&amp;");
    if (s.find('"') != -1) s = replace(s,"\"","&quot;");
    if (s.find("'") != -1) s = replace(s,"'","&apos;");
    if (s.find('<') != -1) s = replace(s,"<","&lt;");
    if (s.find('>') != -1) s = replace(s,">","&gt;");
    return s;
}
unittest
{
    // A string with nothing to escape must come back as the same string (identity, not just equality).
    assert(encode("hello") is "hello");

    // One case per predefined entity.
    assert(encode("a > b") == "a &gt; b");
    assert(encode("a < b") == "a &lt; b");
    assert(encode("don't") == "don&apos;t");
    assert(encode("\"hi\"") == "&quot;hi&quot;");
    assert(encode("cat & dog") == "cat &amp; dog");
}
/**
 * Decodes a string by unescaping all predefined XML entities.
 *
 * encode() escapes certain characters (ampersand, quote, apostrophe, less-than and greater-than),
 * and similarly, decode() unescapes them. These functions are provided for convenience only. You do not need to use
 * them when using the std.xml classes, because then all the encoding and decoding will be done for you automatically.
 *
 * This function decodes the entities &amp;, &quot;, &apos;, &lt; and &gt;,
 * as well as decimal and hexadecimal character references such as &#8364; or &#x20AC;
 *
 * If the string does not contain an ampersand, the original will be returned.
 *
 * Note that if the "strict" parameter is false, then illegal ampersands will be ignored
 * (that is, "cat & dog" will decode to "cat & dog"), whereas, if the strict parameter
 * is true, then illegal sequences will cause decoding to fail.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Params:
 * s = The string to be decoded
 * strict = (optional) if true, strictly enforce that ampersands must be escaped.
 * (Defaults to false).
 *
 * Throws: DecodeException if strict is true and decode fails
 *
 * Returns: The decoded string
 *
 * Examples:
 * --------------
 * writefln(decode("a &gt; b")); // writes "a > b"
 * --------------
 */
string decode(string s, bool strict=false)
{
    // NOTE(review): the middle of this function was missing from the file and has been
    // rebuilt from the surviving tail, the documentation above and the unit tests that
    // follow - confirm against the upstream implementation.

    // Returns true if t begins with prefix.
    static bool hasPrefix(string t, string prefix)
    {
        return t.length >= prefix.length && t[0 .. prefix.length] == prefix;
    }

    // Parses a numeric character reference ("&#NNN;" or "&#xHHH;") at the start of t,
    // which the caller guarantees begins with "&#". On success stores the character
    // in d and the number of bytes consumed in len.
    static bool parseCharRef(string t, out dchar d, out size_t len)
    {
        size_t i = 2;
        uint radix = 10;
        if (i < t.length && t[i] == 'x') { radix = 16; ++i; }

        size_t firstDigit = i;
        uint value = 0;
        for (; i < t.length; ++i)
        {
            char c = t[i];
            uint digit;
            if (c >= '0' && c <= '9') digit = c - '0';
            else if (radix == 16 && c >= 'a' && c <= 'f') digit = c - 'a' + 10;
            else if (radix == 16 && c >= 'A' && c <= 'F') digit = c - 'A' + 10;
            else break;
            value = value * radix + digit;
            // Beyond the last code point; bailing out here also rules out overflow.
            if (value > 0x10FFFF) return false;
        }
        if (i == firstDigit) return false;                // no digits at all, e.g. "&#;"
        if (i >= t.length || t[i] != ';') return false;   // unterminated, e.g. "&#2G;"
        if (!isChar(cast(dchar)value)) return false;      // not a legal XML character
        d = cast(dchar)value;
        len = i + 1;
        return true;
    }

    // The output buffer is created lazily at the first ampersand, so that input
    // without any ampersand is returned as-is, without copying.
    char[] buffer;
    for (size_t i = 0; i < s.length; ++i)
    {
        char c = s[i];
        if (c != '&')
        {
            if (buffer.length != 0) buffer ~= c;
        }
        else
        {
            if (buffer.length == 0) buffer = s[0 .. i].dup;

            string rest = s[i .. $];
            if (hasPrefix(rest, "&#"))
            {
                dchar d;
                size_t len;
                if (parseCharRef(rest, d, len))
                {
                    // Fully qualified: the unqualified name is this module's encode(string).
                    std.utf.encode(buffer, d);
                    i += len - 1;
                }
                else
                {
                    if (strict) throw new DecodeException("Unescaped &");
                    buffer ~= '&';
                }
            }
            // Each increment skips the rest of the entity; the loop's ++i consumes the '&'.
            else if (hasPrefix(rest, "&amp;"))  { buffer ~= '&';  i += 4; }
            else if (hasPrefix(rest, "&quot;")) { buffer ~= '"';  i += 5; }
            else if (hasPrefix(rest, "&apos;")) { buffer ~= '\''; i += 5; }
            else if (hasPrefix(rest, "&lt;"))   { buffer ~= '<';  i += 3; }
            else if (hasPrefix(rest, "&gt;"))   { buffer ~= '>';  i += 3; }
            else
            {
                if (strict) throw new DecodeException("Unescaped &");
                buffer ~= '&';
            }
        }
    }
    return (buffer.length == 0) ? s : cast(string)buffer;
}
unittest
{
    // Asserts that decoding s in strict mode throws a DecodeException.
    void assertNot(string s)
    {
        bool b = false;
        try { decode(s,true); }
        catch (DecodeException e) { b = true; }
        assert(b,s);
    }

    // Assert that things that should work, do
    assert(decode("hello") is "hello");
    assert(decode("a &gt; b") == "a > b");
    assert(decode("a &lt; b") == "a < b");
    assert(decode("don&apos;t") == "don't");
    assert(decode("&quot;hi&quot;") == "\"hi\"");
    assert(decode("cat &amp; dog") == "cat & dog");
    assert(decode("&#42;") == "*");
    assert(decode("&#x2A;") == "*");

    // In the default (non-strict) mode, malformed sequences pass through unchanged
    assert(decode("cat & dog") == "cat & dog");
    assert(decode("a &gt b") == "a &gt b");
    assert(decode("&#;") == "&#;");
    assert(decode("&#x;") == "&#x;");
    assert(decode("&#2G;") == "&#2G;");
    assert(decode("&#x2G;") == "&#x2G;");

    // Assert that things that shouldn't work, don't
    assertNot("cat & dog");
    assertNot("a &gt b");
    assertNot("&#;");
    assertNot("&#x;");
    assertNot("&#2G;");
    assertNot("&#x2G;");
}
/**
 * Class representing an XML document.
 *
 * A Document is its root Element plus the text that surrounds that element
 * (the prolog before it and the epilog after it).
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 */
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Constructs a Document given the root name.
     *
     * Params:
     * name = the name of the root element of the document.
     */
    this(string name)
    {
        super(name);
    }

    /**
     * Constructs a Document from a Tag.
     *
     * Params:
     * tag = the start tag of the document.
     */
    this(Tag tag)
    {
        super(tag);
    }

    const
    {
        /**
         * Compares two Documents for equality
         *
         * Two documents are equal when their prologs, their root elements and
         * their epilogs are all equal.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 == d2) { }
         * --------------
         */
        override int opEquals(Object o)
        {
            const doc = toType!(const Document)(o);
            return
                (prolog != doc.prolog ) ? false : (
                (super != cast(const Element)doc) ? false : (
                (epilog != doc.epilog ) ? false : (
                true )));
        }

        /**
         * Compares two Documents
         *
         * The prologs are compared first, then the root elements, then the epilogs.
         *
         * You should rarely need to call this function. It exists so that Documents
         * can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 < d2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const doc = toType!(const Document)(o);
            return
                ((prolog != doc.prolog ) ? ( prolog < doc.prolog ? -1 : 1 ) :
                ((super != cast(const Element)doc) ? ( super < cast(const Element)doc ? -1 : 1 ) :
                ((epilog != doc.epilog ) ? ( epilog < doc.epilog ? -1 : 1 ) :
                0 )));
        }

        /**
         * Returns the hash of a Document
         *
         * You should rarely need to call this function. It exists so that Documents
         * can be used as associative array keys.
         */
        override hash_t toHash() { return hash(prolog,hash(epilog,super.toHash)); }

        /**
         * Returns the string representation of a Document. (That is, the complete XML of a document).
         */
        override string toString()
        {
            return prolog ~ super.toString ~ epilog;
        }
    }
}
/**
 * Class representing an XML element.
 *
 * An Element is a start tag together with an ordered list of child items.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 */
class Element : Item
{
    Tag tag; /// The start tag of the element
    Item[] items; /// The element's items

    /**
     * Constructs an Element given a name and a string to be used as a Text interior.
     *
     * Params:
     * name = the name of the element.
     * interior = (optional) the string interior.
     *
     * Examples:
     * -------------------------------------------------------
     * auto element = new Element("title","Serenity")
     * // constructs the element <title>Serenity</title>
     * -------------------------------------------------------
     */
    this(string name, string interior=null)
    {
        this(new Tag(name));
        if (interior.length != 0) opCatAssign(new Text(interior));
    }

    /**
     * Constructs an Element from a Tag.
     *
     * The tag's type is reset to TagType.EMPTY; appending a non-empty item
     * later changes it to TagType.START.
     *
     * Params:
     * tag = the start or empty tag of the element.
     */
    this(Tag tag)
    {
        this.tag = tag;
        tag.type = TagType.EMPTY;
    }

    /**
     * Append a complete item to the interior of this element
     *
     * Params:
     * item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * Element other = new Element("br");
     * element ~= other;
     * // appends element representing <br />
     * --------------
     */
    void opCatAssign(Item item)
    {
        items ~= item;
        if (tag.type == TagType.EMPTY && !item.isEmptyXML) tag.type = TagType.START;
    }

    /**
     * Compares two Elements for equality
     *
     * Two elements are equal when their tags are equal and their items are
     * pairwise equal.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 == e2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        // NOTE(review): everything from the loop below down to the start of text()
        // was missing from the file and has been rebuilt - confirm against upstream.
        const element = toType!(const Element)(o);
        // The tag takes part in equality so that opEquals agrees with toHash,
        // which mixes in the tag's hash.
        if (!tag.opEquals(cast(Object)element.tag)) return false;
        uint len = items.length;
        if (len != element.items.length) return false;
        for (uint i=0; i<len; ++i)
        {
            if (!items[i].opEquals(cast(Object)element.items[i])) return false;
        }
        return true;
    }

    /**
     * Compares two Elements
     *
     * The tags are compared first; if they are equal the items are compared
     * in order, and a shorter item list sorts before a longer one with the
     * same leading items.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 < e2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const element = toType!(const Element)(o);
        int tagOrder = tag.opCmp(cast(Object)element.tag);
        if (tagOrder != 0) return tagOrder;
        for (uint i=0; ; ++i)
        {
            if (i == items.length && i == element.items.length) return 0;
            if (i == items.length) return -1;
            if (i == element.items.length) return 1;
            int order = items[i].opCmp(cast(Object)element.items[i]);
            if (order != 0) return order;
        }
    }

    /**
     * Returns the hash of an Element
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t h = tag.toHash();
        foreach(item;items) h += item.toHash();
        return h;
    }

    const
    {
        /**
         * Returns the decoded interior of an element.
         *
         * The element is assumed to contain text only. So, for example, given
         * XML such as "&lt;title&gt;Good &amp;amp; Bad&lt;/title&gt;", will return "Good &amp; Bad".
         *
         * Params:
         * strict = (optional) if true, strictly enforce that ampersands must be escaped.
         * (Defaults to false).
         *
         * Throws: DecodeException if decode fails, or if any item is not a Text
         */
        string text(bool strict=false)
        {
            string buffer;
            foreach(item;items)
            {
                Text t = cast(Text)item;
                if (t is null) throw new DecodeException(item.toString);
                buffer ~= decode(t.toString,strict);
            }
            return buffer;
        }

        /**
         * Returns an indented string representation of this item
         *
         * An empty element yields a single empty tag, and an element holding
         * exactly one Text item yields a single line. Otherwise the start tag
         * and end tag get a line each, with every child line indented between them.
         *
         * Params:
         * indent = (optional) number of spaces by which to indent this element. Defaults to 2.
         */
        override string[] pretty(uint indent=2)
        {
            if (isEmptyXML) return [ tag.toEmptyString ];
            if (items.length == 1)
            {
                Text t = cast(Text)(items[0]);
                if (t !is null)
                {
                    return [ tag.toStartString ~ t.toString ~ tag.toEndString ];
                }
            }
            string[] a = [ tag.toStartString ];
            foreach(item;items)
            {
                string[] b = item.pretty(indent);
                foreach(s;b)
                {
                    // Right-justifying in a wider field prepends the indent.
                    a ~= rjustify(s,s.length + indent);
                }
            }
            a ~= tag.toEndString;
            return a;
        }

        /**
         * Returns the string representation of an Element
         *
         * Examples:
         * --------------
         * auto element = new Element("br");
         * writefln(element.toString); // writes "<br />"
         * --------------
         */
        override string toString()
        {
            if (isEmptyXML) return tag.toEmptyString;
            string buffer = tag.toStartString;
            foreach(item;items) { buffer ~= item.toString; }
            buffer ~= tag.toEndString;
            return buffer;
        }

        /// Returns true if the element has no items, in which case it is written as an empty tag
        override bool isEmptyXML() { return items.length == 0; }
    }
}
/**
 * Tag types.
 *
 * Distinguishes the three kinds of tag a Tag object can represent.
 * Tag.type holds one of these values and defaults to START.
 *
 * $(DDOC_ENUM_MEMBERS START) Used for start tags
 * $(DDOC_ENUM_MEMBERS END) Used for end tags
 * $(DDOC_ENUM_MEMBERS EMPTY) Used for empty tags
 *
 */
enum TagType { START, END, EMPTY };
/**
 * Class representing an XML tag.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * The class invariant guarantees
 *
 * - that $(B type) is a valid enum TagType value
 * - that $(B name) consists of valid characters
 * - that each attribute name consists of valid characters
 *
 * Attribute values are held in $(B attr) in plain (unescaped) form. They are
 * decoded when a tag is parsed and encoded again when it is written out.
 */
class Tag
{
    TagType type = TagType.START; /// Type of tag
    string name; /// Tag name
    string[string] attr; /// Associative array of attributes
    private string tagString;

    invariant()
    {
        string s;
        string t;
        assert(type == TagType.START || type == TagType.END || type == TagType.EMPTY);
        s = name;
        try { checkName(s,t); }
        catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }
        foreach(k,v;attr)
        {
            s = k;
            try { checkName(s,t); }
            catch(Err e) { assert(false,"Invalid atrribute name:" ~ e.toString); }
        }
    }

    /**
     * Constructs an instance of Tag with a specified name and type
     *
     * The constructor does not initialize the attributes. To initialize the attributes,
     * you access the $(B attr) member variable.
     *
     * Params:
     * name = the Tag's name
     * type = (optional) the Tag's type. If omitted, defaults to TagType.START.
     *
     * Examples:
     * --------------
     * auto tag = new Tag("img",TagType.EMPTY);
     * tag.attr["src"] = "http://example.com/example.jpg";
     * --------------
     */
    this(string name, TagType type=TagType.START)
    {
        this.name = name;
        this.type = type;
    }

    /* Private constructor (so don't ddoc this!)
     *
     * Constructs a Tag by parsing the string representation, e.g. "<html>".
     *
     * The string is passed by reference, and is advanced over all characters consumed.
     *
     * The second parameter is a dummy parameter only, required solely to distinguish
     * this constructor from the public one.
     *
     * Throws: TagException carrying the text consumed so far if the tag is malformed.
     */
    private this(ref string s, bool dummy)
    {
        tagString = s;
        try
        {
            reqc(s,'<');
            if (optc(s,'/')) type = TagType.END;
            name = munch(s,"^>"~whitespace);
            munch(s,whitespace);
            while(s.length > 0 && s[0] != '>' && s[0] != '/')
            {
                string key = munch(s,"^="~whitespace);
                munch(s,whitespace);
                reqc(s,'=');
                munch(s,whitespace);
                reqc(s,'"');
                // The source text is escaped XML; store the plain value.
                string val = decode(munch(s,"^\""));
                reqc(s,'"');
                munch(s,whitespace);
                attr[key] = val;
            }
            if (optc(s,'/'))
            {
                // A tag cannot be both an end tag and an empty tag ("</x/>").
                if (type == TagType.END) throw new TagException("");
                type = TagType.EMPTY;
            }
            reqc(s,'>');
            // Trim tagString to exactly the text this tag occupied.
            tagString.length = (s.ptr - tagString.ptr);
        }
        catch(Exception e)
        {
            tagString.length = (s.ptr - tagString.ptr);
            throw new TagException(tagString);
        }
    }

    const
    {
        /**
         * Compares two Tags for equality
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 == tag2) { }
         * --------------
         */
        override int opEquals(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                (name != tag.name) ? false : (
                (attr != tag.attr) ? false : (
                (type != tag.type) ? false : (
                true )));
        }

        /**
         * Compares two Tags
         *
         * Names are compared first, then attributes, then types.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 < tag2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                ((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
                ((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
                ((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
                0 )));
        }

        /**
         * Returns the hash of a Tag
         *
         * Only the name contributes to the hash.
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         */
        override hash_t toHash()
        {
            hash_t hash = 0;
            foreach(dchar c;name) hash = hash * 11 + c;
            return hash;
        }

        /**
         * Returns the string representation of a Tag
         *
         * Examples:
         * --------------
         * auto tag = new Tag("book",TagType.START);
         * writefln(tag.toString); // writes "<book>"
         * --------------
         */
        override string toString()
        {
            if (isEmpty) return toEmptyString();
            return (isEnd) ? toEndString() : toStartString();
        }

        private
        {
            // Builds "<name key="value" ..." without the closing bracket.
            string toNonEndString()
            {
                string s = "<" ~ name;
                // Escape on the way out, so quotes, ampersands and angle brackets
                // in a value cannot break the markup.
                foreach(key,val;attr) s ~= format(" %s=\"%s\"",key,encode(val));
                return s;
            }

            string toStartString() { return toNonEndString() ~ ">"; }

            string toEndString() { return "</" ~ name ~ ">"; }

            string toEmptyString() { return toNonEndString() ~ " />"; }
        }

        /**
         * Returns true if the Tag is a start tag
         *
         * Examples:
         * --------------
         * if (tag.isStart) { }
         * --------------
         */
        bool isStart() { return type == TagType.START; }

        /**
         * Returns true if the Tag is an end tag
         *
         * Examples:
         * --------------
         * if (tag.isEnd) { }
         * --------------
         */
        bool isEnd() { return type == TagType.END; }

        /**
         * Returns true if the Tag is an empty tag
         *
         * Examples:
         * --------------
         * if (tag.isEmpty) { }
         * --------------
         */
        bool isEmpty() { return type == TagType.EMPTY; }
    }
}
/**
 * Class representing a comment
 */
class Comment : Item
{
    private string content;

    /**
     * Construct a comment
     *
     * Params:
     * content = the body of the comment
     *
     * Throws: CommentException if the comment body is illegal (contains "--" or exactly equals "-")
     *
     * Examples:
     * --------------
     * auto item = new Comment("This is a comment");
     * // constructs <!--This is a comment-->
     * --------------
     */
    this(string content)
    {
        // "--" would end the comment early; a lone "-" would produce "<!--->".
        if (content == "-" || content.find("--") != -1) throw new CommentException(content);
        this.content = content;
    }

    /**
     * Compares two comments for equality
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two comments
     *
     * Returns a negative value, zero or a positive value according to the
     * ordering of the two bodies. An item that is not a Comment compares as 0.
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        // Written as statements: folding the result through "&&" would collapse
        // -1 and 1 into the same value.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a Comment
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this comment
     */
    override const string toString() { return "<!--" ~ content ~ "-->"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
 * Class representing a Character Data section
 */
class CData : Item
{
    private string content;

    /**
     * Construct a character data section
     *
     * Params:
     * content = the body of the character data segment
     *
     * Throws: CDataException if the segment body is illegal (contains "]]>")
     *
     * Examples:
     * --------------
     * auto item = new CData("<b>hello</b>");
     * // constructs <![CDATA[<b>hello</b>]]>
     * --------------
     */
    this(string content)
    {
        // "]]>" is the section terminator, so it cannot appear in the body.
        if (content.find("]]>") != -1) throw new CDataException(content);
        this.content = content;
    }

    /**
     * Compares two CDatas for equality
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two CDatas
     *
     * Returns a negative value, zero or a positive value according to the
     * ordering of the two bodies. An item that is not a CData compares as 0.
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        // Written as statements: folding the result through "&&" would collapse
        // -1 and 1 into the same value.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a CData
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this CData section
     */
    override const string toString() { return "<![CDATA[" ~ content ~ "]]>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
 * Class representing a text (aka Parsed Character Data) section
 */
class Text : Item
{
    private string content;

    /**
     * Construct a text (aka PCData) section
     *
     * Params:
     * content = the text. This function encodes the text before insertion,
     * so it is safe to insert any text
     *
     * Examples:
     * --------------
     * auto text = new Text("a < b");
     * // constructs a &lt; b
     * --------------
     */
    this(string content)
    {
        this.content = encode(content);
    }

    /**
     * Compares two text sections for equality
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two text sections
     *
     * Returns a negative value, zero or a positive value according to the
     * ordering of the two (encoded) contents. An item that is not a Text
     * compares as 0.
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        // Written as statements: folding the result through "&&" would collapse
        // -1 and 1 into the same value.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a text section
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this Text section
     * (the content in its encoded form)
     */
    override const string toString() { return content; }

    override const bool isEmptyXML() { return content.length == 0; } /// Returns true if the content is the empty string
}
/**
 * Class representing an XML Instruction section
 */
class XMLInstruction : Item
{
    private string content;

    /**
     * Construct an XML Instruction section
     *
     * Params:
     * content = the body of the instruction segment
     *
     * Throws: XIException if the segment body is illegal (contains ">")
     *
     * Examples:
     * --------------
     * auto item = new XMLInstruction("ATTLIST");
     * // constructs <!ATTLIST>
     * --------------
     */
    this(string content)
    {
        // ">" terminates the instruction, so it cannot appear in the body.
        if (content.find(">") != -1) throw new XIException(content);
        this.content = content;
    }

    /**
     * Compares two XML instructions for equality
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two XML instructions
     *
     * Returns a negative value, zero or a positive value according to the
     * ordering of the two bodies. An item that is not an XMLInstruction
     * compares as 0.
     *
     * You should rarely need to call this function. It exists so that XMLInstructions
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        // Written as statements: folding the result through "&&" would collapse
        // -1 and 1 into the same value.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of an XMLInstruction
     *
     * You should rarely need to call this function. It exists so that XMLInstructions
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this XMLInstruction
     */
    override const string toString() { return "<!" ~ content ~ ">"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
 * Class representing a Processing Instruction section
 */
class ProcessingInstruction : Item
{
    private string content;

    /**
     * Construct a Processing Instruction section
     *
     * Params:
     * content = the body of the instruction segment
     *
     * Throws: PIException if the segment body is illegal (contains "?>")
     *
     * Examples:
     * --------------
     * auto item = new ProcessingInstruction("php");
     * // constructs <?php?>
     * --------------
     */
    this(string content)
    {
        // "?>" terminates the instruction, so it cannot appear in the body.
        if (content.find("?>") != -1) throw new PIException(content);
        this.content = content;
    }

    /**
     * Compares two processing instructions for equality
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two processing instructions
     *
     * Returns a negative value, zero or a positive value according to the
     * ordering of the two bodies. An item that is not a ProcessingInstruction
     * compares as 0.
     *
     * You should rarely need to call this function. It exists so that ProcessingInstructions
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        // Written as statements: folding the result through "&&" would collapse
        // -1 and 1 into the same value.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a ProcessingInstruction
     *
     * You should rarely need to call this function. It exists so that ProcessingInstructions
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this ProcessingInstruction
     */
    override const string toString() { return "<?" ~ content ~ "?>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
 * Abstract base class for XML items
 *
 * Every node that can appear inside an element derives from this class and
 * supplies comparison, hashing and string conversion.
 */
abstract class Item
{
    /// Compares with another Item of same type for equality
    abstract override int opEquals(Object o);

    /// Compares with another Item of same type
    abstract override int opCmp(Object o);

    /// Returns the hash of this item
    abstract override hash_t toHash();

    /// Returns a string representation of this item
    abstract override const string toString();

    /**
     * Returns an indented string representation of this item
     *
     * The default implementation ignores indent and yields a single line,
     * namely the item's toString().
     *
     * Params:
     * indent = number of spaces by which to indent child elements
     */
    const string[] pretty(uint indent)
    {
        string[] lines = [ toString() ];
        return lines;
    }

    /// Returns true if the item represents empty XML text
    abstract const bool isEmptyXML();
}
/**
 * Class for parsing an XML Document.
 *
 * This is a subclass of ElementParser. Most of the useful functions are documented there.
 *
 * Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
 *
 * Bugs:
 * Currently only supports UTF documents.
 *
 * If there is an encoding attribute in the prolog, it is ignored.
 *
 */
class DocumentParser : ElementParser
{
    string xmlText;

    /**
     * Constructs a DocumentParser
     *
     * Params:
     * xmlText_ = the entire XML document as text
     *
     */
    this(string xmlText_)
    {
        // Keep our own copy of the text and point the parser's cursor at it.
        // This must happen before super(), whose constructor dereferences s.
        this.xmlText = xmlText_;
        this.s = &this.xmlText;

        // Initialize everything
        super();

        // Parse through the root tag (but not beyond)
        parse();
    }
}
/**
* Class for parsing an XML element.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Note that you cannot construct instances of this class directly. You can construct a DocumentParser
* (which is a subclass of ElementParser), but otherwise, Instances of ElementParser will be created
* for you by the library, and passed your way via onStartTag handlers.
*
*/
class ElementParser
{
alias void delegate(string) Handler;
alias void delegate(in Element element) ElementHandler;
alias void delegate(ElementParser parser) ParserHandler;
private
{
Tag tag_;
string elementStart;
string* s;
Handler commentHandler;
Handler cdataHandler;
Handler xiHandler;
Handler piHandler;
Handler textHandler;
this(ElementParser parent)
{
s = parent.s;
this();
tag_ = parent.tag_;
}
}
/**
* The Tag at the start of the element being parsed. You can read this to determine
* the tag's name and attributes.
*/
const const(Tag) tag() { return tag_; }
/**
* Register a handler which will be called whenever a start tag is encountered which matches
* the specified name. You can also pass null as the name, in which case the handler will be
* called for any unmatched start tag.
*
* Examples:
* --------------
* // Call this function whenever a start tag is encountered
* onStartTag["podcast"] = delegate void(ElementParser xml)
* {
* // Your code here
* //
* // This is a a closure, so code here may reference
* // variables which are outside of this scope
* };
*
* // call myEpisodeStartHandler (defined elsewhere) whenever an start tag is encountered
* onStartTag["episode"] = &myEpisodeStartHandler;
*
* // call delegate dg for all other start tags
* onStartTag[null] = dg;
* --------------
*
* This library will supply your function with a new instance of ElementHandler, which may
* be used to parse inside the element whose start tag was just found, or to identify the
* tag attributes of the element, etc.
*
* Note that your function will be called for both start tags and empty tags.
* That is, we make no distinction between <br></br> and <br/>.
*/
ParserHandler[string] onStartTag;
/**
* Register a handler which will be called whenever an end tag is encountered which matches
* the specified name. You can also pass null as the name, in which case the handler will be
* called for any unmatched end tag.
*
* Examples:
* --------------
* // Call this function whenever a end tag is encountered
* onEndTag["podcast"] = delegate void(in Element e)
* {
* // Your code here
* //
* // This is a a closure, so code here may reference
* // variables which are outside of this scope
* };
*
* // call myEpisodeEndHandler (defined elsewhere) whenever an end tag is encountered
* onEndTag["episode"] = &myEpisodeEndHandler;
*
* // call delegate dg for all other end tags
* onEndTag[null] = dg;
* --------------
*
* Note that your function will be called for both start tags and empty tags.
* That is, we make no distinction between <br></br> and <br/>.
*/
ElementHandler[string] onEndTag;
protected this()
{
    // Remember the text remaining at construction time; toString() later
    // slices this to report how much has been consumed.
    elementStart = *s;

    // Tags with no name-specific handler fall through to the null key,
    // so install do-nothing handlers there.
    onStartTag[null] = &defaultParserHandler;
    onEndTag[null] = &defaultElementHandler;

    // Every string-based callback starts out as the same no-op.
    textHandler = &defaultHandler;
    cdataHandler = &defaultHandler;
    commentHandler = &defaultHandler;
    piHandler = &defaultHandler;
    xiHandler = &defaultHandler;
}
// No-op callback installed by the constructor for text, CDATA, comment, PI and XI items.
void defaultHandler(string)
{
}

// No-op callback installed by the constructor under onEndTag[null].
void defaultElementHandler(in Element)
{
}

// No-op callback installed by the constructor under onStartTag[null].
void defaultParserHandler(ElementParser)
{
}
/**
* Register a handler which will be called whenever text is encountered.
*
* Examples:
* --------------
* // Call this function whenever text is encountered
* onText = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s will have been decoded by the time you see it,
* // and so may contain any character.
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onText(Handler handler)
{
    // Replaces whichever text callback was installed before.
    textHandler = handler;
}
/**
* Register a handler which will be called whenever a character data (CDATA) section is encountered.
*
* Examples:
* --------------
* // Call this function whenever a CData section is encountered
* onCData = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <![CDATA[
* // nor closing ]]>
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onCData(Handler handler)
{
    // Replaces whichever CDATA callback was installed before.
    cdataHandler = handler;
}
/**
* Register a handler which will be called whenever a comment is encountered.
*
* Examples:
* --------------
* // Call this function whenever a comment is encountered
* onComment = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <!-- nor closing -->
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onComment(Handler handler)
{
    // Replaces whichever comment callback was installed before.
    commentHandler = handler;
}
/**
* Register a handler which will be called whenever a processing instruction is encountered.
*
* Examples:
* --------------
* // Call this function whenever a processing instruction is encountered
* onPI = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <? nor closing ?>
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onPI(Handler handler)
{
    // Replaces whichever processing-instruction callback was installed before.
    piHandler = handler;
}
/**
* Register a handler which will be called whenever an XML instruction is encountered.
*
* Examples:
* --------------
* // Call this function whenever an XML instruction is encountered
* // (Note: XML instructions may only occur preceding the root tag of a document).
* onXI = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <! nor closing >
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onXI(Handler handler)
{
    // Replaces whichever XML-instruction callback was installed before.
    xiHandler = handler;
}
/**
* Parse an XML element.
*
* Parsing will continue until the end of the current element. Any items encountered
* for which a handler has been registered will invoke that handler.
*
* Throws: various kinds of Exception
*/
void parse()
{
    // The tag this parser was created for; null means we were called from a
    // constructor that only wants the first tag located (see the early return below).
    Tag root = tag_;

    // Most recent start tag seen for each name, so an end tag can be paired
    // with the start tag that opened it.
    Tag[string] startTags;
    if (tag_ !is null) startTags[tag_.name] = tag_;

    while(s.length != 0)
    {
        // Order matters: the longer prefixes "<!--" and "<![CDATA[" must be
        // tested before the generic "<!" branch, and all of them before "<".
        if (startsWith(*s,"<!--"))
        {
            // Comment: hand over the text between "<!--" and "-->".
            chop(*s,4);
            commentHandler(chop(*s,find(*s,"-->")));
            chop(*s,3);
        }
        else if (startsWith(*s,"<![CDATA["))
        {
            // CDATA section: hand over the text between "<![CDATA[" and "]]>".
            chop(*s,9);
            cdataHandler(chop(*s,find(*s,"]]>")));
            chop(*s,3);
        }
        else if (startsWith(*s,"<!"))
        {
            // XML instruction: hand over the text between "<!" and ">".
            chop(*s,2);
            xiHandler(chop(*s,find(*s,">")));
            chop(*s,1);
        }
        else if (startsWith(*s,"<?"))
        {
            // Processing instruction: hand over the text between "<?" and "?>".
            chop(*s,2);
            piHandler(chop(*s,find(*s,"?>")));
            chop(*s,2);
        }
        else if (startsWith(*s,"<"))
        {
            tag_ = new Tag(*s,true);
            if (root is null) return; // Return to constructor of derived class

            if (tag_.isStart || tag_.isEmpty)
            {
                startTags[tag_.name] = tag_;

                // Give the handler a child parser positioned inside this element.
                auto parser = new ElementParser(this);
                if (tag_.isEmpty) parser.elementStart = null;

                // Name-specific handler if registered, otherwise the null-key fallback.
                ParserHandler* handler = tag_.name in onStartTag;
                if (handler is null) onStartTag[null](parser);
                else (*handler)(parser);
            }

            // An empty tag is treated as a start tag immediately followed by
            // its end tag, so it passes through both branches.
            if (tag_.isEnd || tag_.isEmpty)
            {
                auto startTag = startTags[tag_.name];
                string text;
                if (!tag_.isEmpty)
                {
                    // The element's raw content is whatever lies between the end of
                    // the start tag's text and the beginning of the end tag's text.
                    invariant(char)* p = startTag.tagString.ptr + startTag.tagString.length;
                    invariant(char)* q = tag_.tagString.ptr;
                    text = p[0..(q-p)];
                }

                auto element = new Element(startTag);
                if (text.length != 0) element ~= new Text(text);

                ElementHandler* handler = tag_.name in onEndTag;
                if (handler is null) onEndTag[null](element);
                else (*handler)(element);

                // Closing the element we were created for ends this parse.
                if (tag_.name == root.name) return;
            }
        }
        else
        {
            // Plain text up to the next '<', decoded before delivery.
            textHandler(decode(chop(*s,find(*s,"<"))));
        }
    }
}
/**
* Returns that part of the element which has already been parsed
*/
const override string toString()
{
    // elementStart is reset to null for empty tags, in which case nothing of
    // the element has been consumed; return an empty string rather than
    // slicing with a wrapped-around (negative) length.
    if (elementStart.length < s.length) return null;

    // Lengths are size_t; storing the difference in an int would narrow
    // (and does not compile where size_t is 64 bits wide).
    size_t n = elementStart.length - s.length;
    return elementStart[0..n];
}
}
private
{
// Mixed into each checkXxx(ref string s) function. Captures the input as it
// was on entry and supplies fail() overloads that rewind s to that point
// before throwing an Err labelled with msg.
template Check(string msg)
{
    // Snapshot of the caller's s at the point of the mixin.
    string old = s;

    // Rewind and throw an Err wrapping a lower-level Err.
    void fail(Err e)
    {
        s = old;
        throw new Err(s,msg,e);
    }

    // Rewind and throw an Err carrying only msg.
    void fail()
    {
        s = old;
        throw new Err(s,msg);
    }

    // Build an inner Err at the current (not yet rewound) position with a
    // custom message, then wrap it via fail(Err).
    void fail(string msg2)
    {
        Err inner = new Err(s,msg2);
        fail(inner);
    }
}
// Checker for rule 27 (Misc); mixes in Check!("Misc") for its fail() helpers.
// NOTE(review): the body below is not well-formed as it stands -- the line
// starting with `if (s.startsWith("` runs straight into `",s); } catch...`,
// which looks like text was lost between the two string literals (possibly
// including the remainder of this function and the start of another).
// TODO: restore this region from a pristine copy of the file.
void checkMisc(ref string s) // rule 27
{
mixin Check!("Misc");
try
{
if (s.startsWith("",s); } catch(Err e) { fail(e); }
}
// Checks a processing instruction (rule 16): "<?" ... "?>".
// On failure s is rewound and an Err labelled "PI" is thrown (via Check).
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");
    try
    {
        // The opening delimiter must be "<?"; an empty literal would let any
        // input pass this step.
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch(Err e) { fail(e); }
}
// Checks a CDATA section (rule 18): "<![CDATA[" ... "]]>".
// On failure s is rewound and an Err labelled "CDSect" is thrown (via Check).
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");
    try
    {
        // CDStart is "<![CDATA[" -- note the '!' after '<'.
        checkLiteral("<![CDATA[",s);
        checkEnd("]]>",s);
    }
    catch(Err e) { fail(e); }
}
// Checks the document prolog (rule 22): an XML declaration, any number of
// Misc items, then optionally a doctype declaration followed by more Misc items.
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");

    try
    {
        checkXMLDecl(s);
        star!(checkMisc)(s);

        // The doctype declaration and its trailing Misc items are optional as a unit.
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Checks the XML declaration (rule 23):
//   "<?xml" VersionInfo EncodingDecl? SDDecl? S? "?>"
// On failure s is rewound and an Err labelled "XMLDecl" is thrown (via Check).
// NOTE(review): checkEncodingDecl (rule 80) is assumed to be defined elsewhere
// in this file -- confirm before applying.
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");
    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);
        opt!(checkEncodingDecl)(s);
        opt!(checkSDDecl)(s);
        opt!(checkSpace)(s);
        checkLiteral("?>",s);
    }
    catch(Err e) { fail(e); }
}
// Checks the version attribute of the XML declaration (rule 24):
// whitespace, the word "version", an equals sign, then a quoted version number.
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");

    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Checks an equals sign with optional whitespace on either side (rule 25).
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");

    try
    {
        opt!(checkSpace)(s);    // optional leading whitespace
        checkLiteral("=",s);
        opt!(checkSpace)(s);    // optional trailing whitespace
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Checks a version number (rule 26): at least one character must be
// consumed from the pattern passed to munch below.
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");

    munch(s,"a-zA-Z0-9_.:-");

    // If s is still the very same slice captured on entry, nothing was consumed.
    const bool nothingConsumed = (s is old);
    if (nothingConsumed)
    {
        fail();
    }
}
// Checks a document type declaration (rule 28): "<!DOCTYPE" ... ">".
// Only the delimiters are verified; the declaration's internal structure is
// not validated here.
// On failure s is rewound and an Err labelled "DocTypeDecl" is thrown (via Check).
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");
    try
    {
        checkLiteral("<!DOCTYPE",s);
        checkEnd(">",s);
    }
    catch(Err e) { fail(e); }
}
// Checks the standalone declaration (rule 32): whitespace, "standalone",
// an equals sign, then a quoted yes or no.
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");

    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    // The value must be yes or no, in single or double quotes.
    const bool isYes = s.startsWith("'yes'") || s.startsWith("\"yes\"");
    const bool isNo = !isYes && (s.startsWith("'no'" ) || s.startsWith("\"no\"" ));
    if (!isYes && !isNo)
    {
        fail("standalone attribute value must be 'yes', \"yes\", 'no' or \"no\"");
    }

    // Skip the value together with its quotes: 5 characters for yes, 4 for no.
    s = s[(isYes ? 5 : 4)..$];
}
// Checks a whole element (rule 39): either a single tag that checkTag does
// not report as "STag", or a start tag, content, and an end tag whose name
// matches the start tag's.
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");

    string startName, endName, kind;
    try
    {
        checkTag(s,kind,startName);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    // Anything other than a start tag needs no content or end tag.
    if (kind != "STag") return;

    // Position of the end tag, kept so a name mismatch is reported there.
    string endTagPos;
    try
    {
        checkContent(s);
        endTagPos = s;
        checkETag(s,endName);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    if (startName == endName) return;

    s = endTagPos;
    fail("end tag name \""~endName~"\" differs from start tag name \""~startName~"\"");
}
// Checks a start tag or an empty-element tag (rules 40 and 44).
// Outputs: name receives the tag name; type is "STag" unless a '/' precedes
// the closing '>', in which case it is "ETag".
void checkTag(ref string s, out string type, out string name) // rules 40 and 44
{
    mixin Check!("Tag");

    // Assume a start tag until a trailing '/' says otherwise.
    type = "STag";

    try
    {
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);

        const bool slashBeforeClose = (s.length != 0) && (s[0] == '/');
        if (slashBeforeClose)
        {
            type = "ETag";
            s = s[1..$];
        }

        checkLiteral(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Checks a single attribute (rule 41): a name, an equals sign, and a value.
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");

    try
    {
        // The attribute name is required by checkName's signature but not used here.
        string attrName;
        checkName(s,attrName);
        checkEq(s);
        checkAttValue(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Checks an end tag (rule 42): "</" Name S? ">".
// Outputs: name receives the tag name.
// On failure s is rewound and an Err labelled "ETag" is thrown (via Check).
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");
    try
    {
        // The opening delimiter must be "</"; an empty literal would let a
        // bare name followed by '>' pass as an end tag.
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch(Err e) { fail(e); }
}
void checkContent(ref string s) // rule 43
{
mixin Check!("Content");
try
{
for(;;)
{
old = s;
if (s.startsWith("&")) { checkReference(s); }
else if (s.startsWith("