phobos/std/xml.d
2008-03-07 07:11:35 +00:00

2413 lines
61 KiB
D

// Written in the D programming language.
/**
Classes and functions for creating and parsing XML
The basic architecture of this module is that there are standalone functions, classes for constructing an XML
document from scratch (Tag, Element and Document), and also classes for parsing a pre-existing
XML file (ElementParser and DocumentParser). The parsing classes <i>may</i> be used to build
a Document, but that is not their primary purpose. The handling capabilities of DocumentParser
and ElementParser are sufficiently customizable that you can make them do pretty much
whatever you want.
Authors: Janice Caron
Date: 2006.02.12
License: Public Domain
Examples:
--------------------------------------------------------------------------------------------------
import std.xml;
import std.stdio;
import std.string;
// books.xml is used in various samples throughout the Microsoft XML Core Services (MSXML) SDK.
// See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
struct Book
{
string id;
string author;
string title;
string genre;
string price;
string pubDate;
string description;
}
void main()
{
string s = import("books.xml");
// Check for well-formedness
check(s);
// Take it apart
Book[] books;
auto xml = new DocumentParser(s);
xml.onStartTag["book"] = delegate void(ElementParser xml)
{
Book book;
book.id = xml.tag.attr["id"];
xml.onEndTag["author"] = delegate void(in Element e) { book.author = e.text; };
xml.onEndTag["title"] = delegate void(in Element e) { book.title = e.text; };
xml.onEndTag["genre"] = delegate void(in Element e) { book.genre = e.text; };
xml.onEndTag["price"] = delegate void(in Element e) { book.price = e.text; };
xml.onEndTag["publish-date"] = delegate void(in Element e) { book.pubDate = e.text; };
xml.onEndTag["description"] = delegate void(in Element e) { book.description = e.text; };
xml.parse();
books ~= book;
};
xml.parse();
// Put it back together again;
auto doc = new Document("catalog");
foreach(book;books)
{
auto element = new Element("book");
element.tag.attr["id"] = book.id;
element ~= new Element("author", book.author);
element ~= new Element("title", book.title);
element ~= new Element("genre", book.genre);
element ~= new Element("price", book.price);
element ~= new Element("publish-date",book.pubDate);
element ~= new Element("description", book.description);
doc ~= element;
}
// Now let's pretty-print it to see what it looks like
writefln(join(doc.pretty(3),"\n"));
}
--------------------------------------------------------------------------------------------------
*/
module std.xml;
import std.string;
import std.utf;
/**
* Returns true if the character is a character according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* You might be wondering why the character-testing functions are provided. After all, std.utf already provides
* isValidDchar(), and std.uni provides isUniAlpha(), etc.. The answer is that these functions conform to different
* standards, and so give different results in certain edge-cases. For example, std.utf.isValidDchar('\u0008') returns true,
* whereas std.xml.isChar('\u0008') returns false, because control codes are valid Unicode characters, but are not
* permitted in XML documents. Along similar lines, std.uni.isUniAlpha('\U00020000') returns true, whereas
* std.xml.isLetter('\U00020000') returns false, because Unified Han Ideographs are not permitted within tag names
* or attribute names in XML. These functions exist so you can check what is actually allowed in XML,
* according to the W3C consortium.
*
* Params:
* c = the character to be tested
*/
bool isChar(dchar c) // rule 2
{
    // Membership test against CharTable (defined elsewhere in this module).
    auto permitted = lookup(CharTable,c);
    return permitted;
}
unittest
{
    // const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,0x10000,0x10FFFF];

    // Small helpers so each boundary below reads as a single statement.
    void accepted(uint code) { assert( isChar(cast(dchar)code)); }
    void rejected(uint code) { assert(!isChar(cast(dchar)code)); }

    // Control codes: only tab, linefeed and carriage return are allowed
    rejected(0x8);
    accepted(0x9);
    accepted(0xA);
    rejected(0xB);
    rejected(0xC);
    accepted(0xD);
    rejected(0xE);
    rejected(0x1F);

    // Ordinary characters up to the start of the surrogate range
    accepted(0x20);
    assert( isChar('J'));
    accepted(0xD7FF);

    // Surrogates are excluded
    rejected(0xD800);
    rejected(0xDFFF);

    // Remainder of the BMP, minus the two non-characters at its end
    accepted(0xE000);
    accepted(0xFFFD);
    rejected(0xFFFE);
    rejected(0xFFFF);

    // Supplementary planes, and the first value beyond them
    accepted(0x10000);
    accepted(0x10FFFF);
    rejected(0x110000);
}
/**
* Returns true if the character is whitespace according to the XML standard
*
* Only the following characters are considered whitespace in XML - space, tab, carriage return and linefeed
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isSpace(dchar c)
{
    // Exactly four code points count as whitespace here.
    switch (c)
    {
        case '\u0020': // space
        case '\u0009': // tab
        case '\u000A': // linefeed
        case '\u000D': // carriage return
            return true;
        default:
            return false;
    }
}
/**
* Returns true if the character is a digit according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isDigit(dchar c)
{
    // Membership test against DigitTable (defined elsewhere in this module).
    auto found = lookup(DigitTable,c);
    return found;
}
/**
* Returns true if the character is a letter according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isLetter(dchar c) // rule 84
{
    // A letter is either an ideographic character or a base character;
    // the ideographic test is tried first.
    if (isIdeographic(c)) return true;
    return isBaseChar(c);
}
/**
* Returns true if the character is an ideographic character according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isIdeographic(dchar c)
{
    // Membership test against IdeographicTable (defined elsewhere in this module).
    auto found = lookup(IdeographicTable,c);
    return found;
}
/**
* Returns true if the character is a base character according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isBaseChar(dchar c)
{
    // Membership test against BaseCharTable (defined elsewhere in this module).
    auto found = lookup(BaseCharTable,c);
    return found;
}
/**
* Returns true if the character is a combining character according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isCombiningChar(dchar c)
{
    // Membership test against CombiningCharTable (defined elsewhere in this module).
    auto found = lookup(CombiningCharTable,c);
    return found;
}
/**
* Returns true if the character is an extender according to the XML standard
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* c = the character to be tested
*/
bool isExtender(dchar c)
{
    // Membership test against ExtenderTable (defined elsewhere in this module).
    auto found = lookup(ExtenderTable,c);
    return found;
}
/**
* Encodes a string by replacing all characters which need to be escaped with
* appropriate predefined XML entities.
*
* encode() escapes certain characters (ampersand, quote, apostrophe, less-than and greater-than),
* and similarly, decode() unescapes them. These functions are provided for convenience only. You do not need to use
* them when using the std.xml classes, because then all the encoding and decoding will be done for you automatically.
*
* If the string is not modified, the original will be returned.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* s = The string to be encoded
*
* Returns: The encoded string
*
* Examples:
* --------------
* writefln(encode("a > b")); // writes "a &gt; b"
* --------------
*/
string encode(string s)
{
    // Replaces every occurrence of `from` with `to`, but hands back the very
    // same string when `from` does not occur. The guard is (temporarily, we
    // hope) necessary, because std.string.replace does not do copy-on-write,
    // but instead copies always.
    string escape(string text, string from, string to)
    {
        if (text.find(from) == -1) return text;
        return replace(text,from,to);
    }

    // The ampersand must be handled first, otherwise the ampersands
    // introduced by the later replacements would themselves be escaped.
    s = escape(s,"&","&amp;");
    s = escape(s,"\"","&quot;");
    s = escape(s,"'","&apos;");
    s = escape(s,"<","&lt;");
    s = escape(s,">","&gt;");
    return s;
}
unittest
{
    // Checks that escaping `raw` yields exactly `escaped`.
    void expect(string raw, string escaped)
    {
        assert(encode(raw) == escaped);
    }

    // Nothing to escape: the original string itself must be returned
    assert(encode("hello") is "hello");

    // One case per predefined entity
    expect("a > b",      "a &gt; b");
    expect("a < b",      "a &lt; b");
    expect("don't",      "don&apos;t");
    expect("\"hi\"",     "&quot;hi&quot;");
    expect("cat & dog",  "cat &amp; dog");
}
/**
* Decodes a string by unescaping all predefined XML entities.
*
* encode() escapes certain characters (ampersand, quote, apostrophe, less-than and greater-than),
* and similarly, decode() unescapes them. These functions are provided for convenience only. You do not need to use
* them when using the std.xml classes, because then all the encoding and decoding will be done for you automatically.
*
* This function decodes the entities &amp;amp;, &amp;quot;, &amp;apos;, &amp;lt; and &amp;gt;,
* as well as decimal and hexadecimal entities such as &amp;#x20AC;
*
* If the string does not contain an ampersand, the original will be returned.
*
* Note that if the "strict" parameter is false, then illegal ampersands will be ignored
* (that is, "cat &amp; dog" will decode to "cat &amp; dog"), whereas, if the strict parameter
* is true, then illegal sequences will cause decoding to fail.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Params:
* s = The string to be decoded
* strict = (optional) if true, strictly enforce that ampersands must be escaped.
* (Defaults to false).
*
* Throws: DecodeException if strict is true and decode fails
*
* Returns: The decoded string
*
* Examples:
* --------------
* writefln(decode("a &gt; b")); // writes "a > b"
* --------------
*/
string decode(string s, bool strict=false)
{
    // Output buffer. It stays empty until the first ampersand is met, so that
    // a string without any ampersand can be returned as-is, without copying.
    char[] result;

    for (int pos=0; pos<s.length; ++pos)
    {
        char ch = s[pos];

        // Ordinary character: copy it, but only once a buffer is in use.
        if (ch != '&')
        {
            if (result.length != 0) result ~= ch;
            continue;
        }

        // First ampersand: start the buffer with everything seen so far.
        if (result.length == 0)
        {
            result = s.dup;
            result.length = pos;
        }

        string rest = s[pos..$];

        if (startsWith(rest,"&#"))
        {
            // Numeric character reference (decimal or hexadecimal)
            try
            {
                dchar d;
                string t = rest;
                checkCharRef(t,d);
                result ~= d;
                // t has been advanced past the reference; resume on its last
                // character (the loop increment then steps over it).
                pos = s.length - t.length - 1;
            }
            catch(Err e)
            {
                if (strict) throw new DecodeException("Unescaped &");
                result ~= '&';
            }
        }
        // Predefined entities: each skip is the entity length minus one,
        // because the loop increment accounts for the final character.
        else if (startsWith(rest,"&amp;" )) { result ~= '&';  pos += 4; }
        else if (startsWith(rest,"&quot;")) { result ~= '"';  pos += 5; }
        else if (startsWith(rest,"&apos;")) { result ~= '\''; pos += 5; }
        else if (startsWith(rest,"&lt;"  )) { result ~= '<';  pos += 3; }
        else if (startsWith(rest,"&gt;"  )) { result ~= '>';  pos += 3; }
        else
        {
            // A bare ampersand: an error in strict mode, kept literally otherwise
            if (strict) throw new DecodeException("Unescaped &");
            result ~= '&';
        }
    }

    if (result.length == 0) return s;
    return cast(string)result;
}
unittest
{
    // Passes only if strict decoding of `s` throws a DecodeException.
    void assertNot(string s)
    {
        bool threw = false;
        try { decode(s,true); }
        catch (DecodeException e) { threw = true; }
        assert(threw,s);
    }

    // Passes only if non-strict decoding of `encoded` yields `plain`.
    void expect(string encoded, string plain)
    {
        assert(decode(encoded) == plain);
    }

    // Assert that things that should work, do
    assert(decode("hello") is "hello");
    expect("a &gt; b",       "a > b");
    expect("a &lt; b",       "a < b");
    expect("don&apos;t",     "don't");
    expect("&quot;hi&quot;", "\"hi\"");
    expect("cat &amp; dog",  "cat & dog");
    expect("&#42;",          "*");
    expect("&#x2A;",         "*");

    // Malformed sequences pass through unchanged when not strict
    expect("cat & dog", "cat & dog");
    expect("a &gt b",   "a &gt b");
    expect("&#;",       "&#;");
    expect("&#x;",      "&#x;");
    expect("&#2G;",     "&#2G;");
    expect("&#x2G;",    "&#x2G;");

    // Assert that things that shouldn't work, don't
    assertNot("cat & dog");
    assertNot("a &gt b");
    assertNot("&#;");
    assertNot("&#x;");
    assertNot("&#2G;");
    assertNot("&#x2G;");
}
/**
* Class representing an XML document.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
*/
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Constructs a Document given the root name.
     *
     * Params:
     *      name = the name of the root element of the document.
     */
    this(string name)
    {
        super(name);
    }

    /**
     * Constructs a Document from a Tag.
     *
     * Params:
     *      tag = the start tag of the document.
     */
    this(Tag tag)
    {
        super(tag);
    }

    const
    {
        /**
         * Compares two Documents for equality. Prolog, root element and
         * epilog must all be equal.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 == d2) { }
         * --------------
         */
        override int opEquals(Object o)
        {
            const doc = toType!(const Document)(o);
            if (prolog != doc.prolog) return false;
            if (super != cast(const Element)doc) return false;
            if (epilog != doc.epilog) return false;
            return true;
        }

        /**
         * Compares two Documents. The first of prolog, root element and
         * epilog (in that order) that differs decides the result.
         *
         * You should rarely need to call this function. It exists so that Documents
         * can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 < d2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const doc = toType!(const Document)(o);
            if (prolog != doc.prolog)
                return prolog < doc.prolog ? -1 : 1;
            if (super != cast(const Element)doc)
                return super < cast(const Element)doc ? -1 : 1;
            if (epilog != doc.epilog)
                return epilog < doc.epilog ? -1 : 1;
            return 0;
        }

        /**
         * Returns the hash of a Document, combining prolog, epilog and the
         * hash of the root element.
         *
         * You should rarely need to call this function. It exists so that Documents
         * can be used as associative array keys.
         */
        override hash_t toHash()
        {
            hash_t rootHash = super.toHash;
            return hash(prolog,hash(epilog,rootHash));
        }

        /**
         * Returns the string representation of a Document. (That is, the complete XML of a document).
         */
        override string toString()
        {
            string head = prolog ~ super.toString;
            return head ~ epilog;
        }
    }
}
/**
* Class representing an XML element.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*/
class Element : Item
{
    Tag tag; /// The start tag of the element
    Item[] items; /// The element's items

    /**
     * Constructs an Element given a name and a string to be used as a Text interior.
     *
     * Params:
     *      name = the name of the element.
     *      interior = (optional) the string interior. A null or empty
     *                 string adds no Text item.
     *
     * Examples:
     * -------------------------------------------------------
     * auto element = new Element("title","Serenity")
     *     // constructs the element <title>Serenity</title>
     * -------------------------------------------------------
     */
    this(string name, string interior=null)
    {
        this(new Tag(name));
        if (interior.length != 0) opCatAssign(new Text(interior));
    }

    /**
     * Constructs an Element from a Tag.
     *
     * The tag is stored by reference and its type is reset to
     * TagType.EMPTY; it becomes TagType.START once a non-empty item
     * is appended.
     *
     * Params:
     *      tag = the start or empty tag of the element.
     */
    this(Tag tag)
    {
        this.tag = tag;
        tag.type = TagType.EMPTY;
    }

    /**
     * Append a complete item to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * Element other = new Element("br");
     * element ~= other;
     *    // appends element representing <br />
     * --------------
     */
    void opCatAssign(Item item)
    {
        items ~= item;
        // An element which gains non-empty content can no longer be an empty tag
        if (tag.type == TagType.EMPTY && !item.isEmptyXML) tag.type = TagType.START;
    }

    /**
     * Compares two Elements for equality
     *
     * NOTE(review): only the items are compared; the tag (name and
     * attributes) takes no part, although toHash does mix in tag.toHash.
     * Confirm whether that is intended.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 == e2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const element = toType!(const Element)(o);
        uint len = items.length;
        if (len != element.items.length) return false;
        for (uint i=0; i<len; ++i)
        {
            if (!items[i].opEquals(element.items[i])) return false;
        }
        return true;
    }

    /**
     * Compares two Elements
     *
     * Items are compared pairwise; the first unequal pair decides the
     * result. If one item list is a prefix of the other, the shorter
     * element orders first.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 < e2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const element = toType!(const Element)(o);
        for (uint i=0; ; ++i)
        {
            if (i == items.length && i == element.items.length) return 0;
            if (i == items.length) return -1;
            if (i == element.items.length) return 1;
            if (items[i] != element.items[i]) return items[i].opCmp(element.items[i]);
        }
        assert(false);
    }

    /**
     * Returns the hash of an Element: the tag's hash plus the hash of
     * every item.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t hash = tag.toHash;
        foreach(item;items) hash += item.toHash();
        return hash;
    }

    const
    {
        /**
         * Returns the decoded interior of an element.
         *
         * The element is assumed to contain text <i>only</i>. So, for example, given
         * XML such as "&lt;title&gt;Good &amp;amp; Bad&lt;/title&gt;", will return "Good &amp; Bad".
         *
         * Params:
         *      strict = (optional) if true, strictly enforce that ampersands must be escaped.
         *               (Defaults to false).
         *
         * Throws: DecodeException if decode fails, or if the element
         *         contains an item which is not a Text
         */
        string text(bool strict=false)
        {
            string buffer;
            foreach(item;items)
            {
                Text t = cast(Text)item;
                if (t is null) throw new DecodeException(item.toString);
                buffer ~= decode(t.toString,strict);
            }
            return buffer;
        }

        /**
         * Returns an indented string representation of this item
         *
         * An element without items yields its empty tag on one line; an
         * element holding a single Text yields start tag, text and end
         * tag on one line; otherwise every line of every child is
         * indented between the start and end tags.
         *
         * Params:
         *      indent = (optional) number of spaces by which to indent this element. Defaults to 2.
         */
        override string[] pretty(uint indent=2)
        {
            if (isEmptyXML) return [ tag.toEmptyString ];

            if (items.length == 1)
            {
                Text t = cast(Text)(items[0]);
                if (t !is null)
                {
                    return [ tag.toStartString ~ t.toString ~ tag.toEndString ];
                }
            }

            string[] a = [ tag.toStartString ];
            foreach(item;items)
            {
                string[] b = item.pretty(indent);
                foreach(s;b)
                {
                    // Right-justifying to length+indent prepends `indent` spaces
                    a ~= rjustify(s,s.length + indent);
                }
            }
            a ~= tag.toEndString;
            return a;
        }

        /**
         * Returns the string representation of an Element
         *
         * Examples:
         * --------------
         * auto element = new Element("br");
         * writefln(element.toString); // writes "<br />"
         * --------------
         */
        override string toString()
        {
            if (isEmptyXML) return tag.toEmptyString;

            string buffer = tag.toStartString;
            foreach(item;items) { buffer ~= item.toString; }
            buffer ~= tag.toEndString;
            return buffer;
        }

        /**
         * Returns true if the element has no items, in which case it is
         * written as an empty tag (e.g. "<br />").
         *
         * This used to return false always, which made the empty-tag
         * branches of toString and pretty unreachable.
         */
        override bool isEmptyXML() { return items.length == 0; }
    }
}
/**
* Tag types.
*
* $(DDOC_ENUM_MEMBERS START) Used for start tags
* $(DDOC_ENUM_MEMBERS END) Used for end tags
* $(DDOC_ENUM_MEMBERS EMPTY) Used for empty tags
*
*/
enum TagType
{
    START,  // Used for start tags
    END,    // Used for end tags
    EMPTY   // Used for empty tags
};
/**
* Class representing an XML tag.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* The class invariant guarantees
* <ul>
* <li> that $(B type) is a valid enum TagType value</li>
* <li> that $(B name) consists of valid characters</li>
* <li> that each attribute name consists of valid characters</li>
* </ul>
*/
class Tag
{
    TagType type = TagType.START; /// Type of tag
    string name; /// Tag name
    string[string] attr; /// Associative array of attributes (plain, unescaped values)
    private string tagString;

    // Checks the guarantees listed in the class documentation: a valid
    // type, a valid tag name, and valid attribute names.
    invariant()
    {
        string s;
        string t;

        assert(type == TagType.START || type == TagType.END || type == TagType.EMPTY);

        s = name;
        try { checkName(s,t); }
        catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }

        foreach(k,v;attr)
        {
            s = k;
            try { checkName(s,t); }
            catch(Err e) { assert(false,"Invalid atrribute name:" ~ e.toString); }
        }
    }

    /**
     * Constructs an instance of Tag with a specified name and type
     *
     * The constructor does not initialize the attributes. To initialize the attributes,
     * you access the $(B attr) member variable.
     *
     * Params:
     *      name = the Tag's name
     *      type = (optional) the Tag's type. If omitted, defaults to TagType.START.
     *
     * Examples:
     * --------------
     * auto tag = new Tag("img",TagType.EMPTY);
     * tag.attr["src"] = "http://example.com/example.jpg";
     * --------------
     */
    this(string name, TagType type=TagType.START)
    {
        this.name = name;
        this.type = type;
    }

    /* Private constructor (so don't ddoc this!)
     *
     * Constructs a Tag by parsing the string representation, e.g. "<html>".
     *
     * The string is passed by reference, and is advanced over all characters consumed.
     *
     * Attribute values are entity-decoded (non-strictly) as they are read, so
     * that attr holds plain text; toNonEndString escapes them again on output.
     *
     * The second parameter is a dummy parameter only, required solely to distinguish
     * this constructor from the public one.
     *
     * Throws: TagException, carrying the text consumed so far, if the tag is malformed.
     */
    private this(ref string s, bool dummy)
    {
        tagString = s;
        try
        {
            reqc(s,'<');
            if (optc(s,'/')) type = TagType.END;
            name = munch(s,"^>"~whitespace);
            munch(s,whitespace);
            while(s.length > 0 && s[0] != '>' && s[0] != '/')
            {
                string key = munch(s,"^="~whitespace);
                munch(s,whitespace);
                reqc(s,'=');
                munch(s,whitespace);
                reqc(s,'"');
                // The text between the quotes is escaped XML: decode it so
                // that callers reading attr see the real value.
                string val = decode(munch(s,"^\""));
                reqc(s,'"');
                munch(s,whitespace);
                attr[key] = val;
            }
            if (optc(s,'/'))
            {
                // "</name/>" is not a valid tag
                if (type == TagType.END) throw new TagException("");
                type = TagType.EMPTY;
            }
            reqc(s,'>');
            // Trim tagString to exactly the characters this tag consumed
            tagString.length = (s.ptr - tagString.ptr);
        }
        catch(Exception e)
        {
            tagString.length = (s.ptr - tagString.ptr);
            throw new TagException(tagString);
        }
    }

    const
    {
        /**
         * Compares two Tags for equality. Name, attributes and type must
         * all be equal.
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 == tag2) { }
         * --------------
         */
        override int opEquals(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                (name != tag.name) ? false : (
                (attr != tag.attr) ? false : (
                (type != tag.type) ? false : (
            true )));
        }

        /**
         * Compares two Tags. The first of name, attributes and type (in
         * that order) that differs decides the result.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 < tag2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                ((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
                ((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
                ((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
            0 )));
        }

        /**
         * Returns the hash of a Tag. Only the name contributes to the hash.
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         */
        override hash_t toHash()
        {
            hash_t hash = 0;
            foreach(dchar c;name) hash = hash * 11 + c;
            return hash;
        }

        /**
         * Returns the string representation of a Tag
         *
         * Examples:
         * --------------
         * auto tag = new Tag("book",TagType.START);
         * writefln(tag.toString); // writes "<book>"
         * --------------
         */
        override string toString()
        {
            if (isEmpty) return toEmptyString();
            return (isEnd) ? toEndString() : toStartString();
        }

        private
        {
            // Builds "<name attr="value" ..." without the closing bracket.
            string toNonEndString()
            {
                string s = "<" ~ name;
                // attr holds plain text, so every value is escaped on the way
                // out; otherwise a quote, ampersand or '<' inside a value
                // would produce ill-formed XML.
                foreach(key,val;attr) s ~= format(" %s=\"%s\"",key,encode(val));
                return s;
            }

            string toStartString() { return toNonEndString() ~ ">"; }

            string toEndString() { return "</" ~ name ~ ">"; }

            string toEmptyString() { return toNonEndString() ~ " />"; }
        }

        /**
         * Returns true if the Tag is a start tag
         *
         * Examples:
         * --------------
         * if (tag.isStart) { }
         * --------------
         */
        bool isStart() { return type == TagType.START; }

        /**
         * Returns true if the Tag is an end tag
         *
         * Examples:
         * --------------
         * if (tag.isEnd) { }
         * --------------
         */
        bool isEnd() { return type == TagType.END; }

        /**
         * Returns true if the Tag is an empty tag
         *
         * Examples:
         * --------------
         * if (tag.isEmpty) { }
         * --------------
         */
        bool isEmpty() { return type == TagType.EMPTY; }
    }
}
/**
* Class representing a comment
*/
class Comment : Item
{
    private string content;

    /**
     * Construct a comment
     *
     * Params:
     *      content = the body of the comment
     *
     * Throws: CommentException if the comment body is illegal (contains "--" or exactly equals "-")
     *
     * Examples:
     * --------------
     * auto item = new Comment("This is a comment");
     *    // constructs <!--This is a comment-->
     * --------------
     */
    this(string content)
    {
        // "--" may not appear inside a comment. (This used to test for "==",
        // which let illegal comments through and rejected legal ones.)
        if (content == "-" || content.find("--") != -1) throw new CommentException(content);
        this.content = content;
    }

    /**
     * Compares two comments for equality
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two comments
     *
     * Returns a negative value, zero or a positive value according to
     * whether this comment's body orders before, equal to or after the
     * other's.
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        // An Item of another kind has no defined order relative to a
        // Comment; as before, it compares as 0.
        if (t is null) return 0;
        // Previously the result was and-ed with (t !is null), which turned
        // -1 into 1, so "less than" could never be reported.
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a Comment
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this comment
     */
    override const string toString() { return "<!--" ~ content ~ "-->"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
* Class representing a Character Data section
*/
class CData : Item
{
    private string content;

    /**
     * Construct a character data section
     *
     * Params:
     *      content = the body of the character data segment
     *
     * Throws: CDataException if the segment body is illegal (contains "]]>")
     *
     * Examples:
     * --------------
     * auto item = new CData("<b>hello</b>");
     *    // constructs <![CDATA[<b>hello</b>]]>
     * --------------
     */
    this(string content)
    {
        if (content.find("]]>") != -1) throw new CDataException(content);
        this.content = content;
    }

    /**
     * Compares two CDatas for equality
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two CDatas
     *
     * Returns a negative value, zero or a positive value according to
     * whether this section's body orders before, equal to or after the
     * other's.
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        // An Item of another kind has no defined order relative to a
        // CData; as before, it compares as 0.
        if (t is null) return 0;
        // Previously the result was and-ed with (t !is null), which turned
        // -1 into 1, so "less than" could never be reported.
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a CData
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this CData section
     */
    // The opening delimiter is "<![CDATA[" (the '!' used to be missing)
    override const string toString() { return "<![CDATA[" ~ content ~ "]]>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
* Class representing a text (aka Parsed Character Data) section
*/
class Text : Item
{
    private string content; // always held in encoded (escaped) form

    /**
     * Construct a text (aka PCData) section
     *
     * Params:
     *      content = the text. This function encodes the text before insertion,
     *                so it is safe to insert any text
     *
     * Examples:
     * --------------
     * auto text = new Text("a < b");
     *    // constructs a &lt; b
     * --------------
     */
    this(string content)
    {
        this.content = encode(content);
    }

    /**
     * Compares two text sections for equality
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two text sections
     *
     * Returns a negative value, zero or a positive value according to
     * whether this section's (encoded) text orders before, equal to or
     * after the other's.
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        // An Item of another kind has no defined order relative to a
        // Text; as before, it compares as 0.
        if (t is null) return 0;
        // Previously the result was and-ed with (t !is null), which turned
        // -1 into 1, so "less than" could never be reported.
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a text section
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this Text section
     * (the encoded text, exactly as it appears in the XML)
     */
    override const string toString() { return content; }

    override const bool isEmptyXML() { return content.length == 0; } /// Returns true if the content is the empty string
}
/**
* Class representing an XML Instruction section
*/
class XMLInstruction : Item
{
    private string content;

    /**
     * Construct an XML Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: XIException if the segment body is illegal (contains ">")
     *
     * Examples:
     * --------------
     * auto item = new XMLInstruction("ATTLIST");
     *    // constructs <!ATTLIST>
     * --------------
     */
    this(string content)
    {
        if (content.find(">") != -1) throw new XIException(content);
        this.content = content;
    }

    /**
     * Compares two XML instructions for equality
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two XML instructions
     *
     * Returns a negative value, zero or a positive value according to
     * whether this instruction's body orders before, equal to or after
     * the other's.
     *
     * You should rarely need to call this function. It exists so that XmlInstructions
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        // An Item of another kind has no defined order relative to an
        // XMLInstruction; as before, it compares as 0.
        if (t is null) return 0;
        // Previously the result was and-ed with (t !is null), which turned
        // -1 into 1, so "less than" could never be reported.
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of an XMLInstruction
     *
     * You should rarely need to call this function. It exists so that XmlInstructions
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this XmlInstruction
     */
    override const string toString() { return "<!" ~ content ~ ">"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
* Class representing a Processing Instruction section
*/
class ProcessingInstruction : Item
{
    private string content;

    /**
     * Construct a Processing Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: PIException if the segment body is illegal (contains "?>")
     *
     * Examples:
     * --------------
     * auto item = new ProcessingInstruction("php");
     *    // constructs <?php?>
     * --------------
     */
    this(string content)
    {
        if (content.find("?>") != -1) throw new PIException(content);
        this.content = content;
    }

    /**
     * Compares two processing instructions for equality
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two processing instructions
     *
     * Returns a negative value, zero or a positive value according to
     * whether this instruction's body orders before, equal to or after
     * the other's.
     *
     * You should rarely need to call this function. It exists so that ProcessingInstructions
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        // An Item of another kind has no defined order relative to a
        // ProcessingInstruction; as before, it compares as 0.
        if (t is null) return 0;
        // Previously the result was and-ed with (t !is null), which turned
        // -1 into 1, so "less than" could never be reported.
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a ProcessingInstruction
     *
     * You should rarely need to call this function. It exists so that ProcessingInstructions
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this ProcessingInstruction
     */
    override const string toString() { return "<?" ~ content ~ "?>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
/**
* Abstract base class for XML items
*/
abstract class Item
{
    /// Compares with another Item of same type for equality
    abstract override int opEquals(Object o);

    /// Compares with another Item of same type
    abstract override int opCmp(Object o);

    /// Returns the hash of this item
    abstract override hash_t toHash();

    /// Returns a string representation of this item
    abstract override const string toString();

    /**
     * Returns an indented string representation of this item
     *
     * This default implementation ignores the indent and yields a single
     * line holding the item's string representation.
     *
     * Params:
     *      indent = number of spaces by which to indent child elements
     */
    const string[] pretty(uint indent)
    {
        string[] lines;
        lines ~= toString();
        return lines;
    }

    /// Returns true if the item represents empty XML text
    abstract const bool isEmptyXML();
}
/**
* Class for parsing an XML Document.
*
* This is a subclass of ElementParser. Most of the useful functions are documented there.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Bugs:
* Currently only supports UTF documents.
*
* If there is an encoding attribute in the prolog, it is ignored.
*
*/
class DocumentParser : ElementParser
{
// The document text handed to the constructor. The inherited pointer `s`
// is aimed at this member.
string xmlText;
/**
* Constructs a DocumentParser
*
* Construction also runs parse() once (see the note in the body).
*
* Params:
* xmlText_ = the entire XML document as text
*
*/
this(string xmlText_)
{
xmlText = xmlText_;
// Aim the inherited cursor at our own copy of the text. This is done
// before super() is called.
// NOTE(review): presumably the base constructor and parse() read through
// `s` - neither is visible here, confirm before reordering.
s = &xmlText;
super(); // Initialize everything
parse(); // Parse through the root tag (but not beyond)
}
}
/**
* Class for parsing an XML element.
*
* Standards: XML 1.0 (http://www.w3.org/TR/1998/REC-xml-19980210)
*
* Note that you cannot construct instances of this class directly. You can construct a DocumentParser
* (which is a subclass of ElementParser), but otherwise, Instances of ElementParser will be created
* for you by the library, and passed your way via onStartTag handlers.
*
*/
class ElementParser
{
alias void delegate(string) Handler; /// Callback type for text, CDATA, comment, PI and XML-instruction content
alias void delegate(in Element element) ElementHandler; /// Callback type for onEndTag handlers
alias void delegate(ElementParser parser) ParserHandler; /// Callback type for onStartTag handlers
private
{
Tag tag_; // most recently read tag; null until parse() has read one
string elementStart; // copy of *s taken at construction; toString() measures progress against it
string* s; // cursor over the unparsed input; a child parser copies its parent's pointer
Handler commentHandler;
Handler cdataHandler;
Handler xiHandler;
Handler piHandler;
Handler textHandler;
// Creates a parser for a child element. It shares the parent's input
// cursor and starts with the parent's current tag.
this(ElementParser parent)
{
s = parent.s; // must be set before this(), which reads *s
this();
tag_ = parent.tag_;
}
}
/**
* The Tag at the start of the element being parsed. You can read this to determine
* the tag's name and attributes.
*/
const const(Tag) tag() { return tag_; }
/**
* Register a handler which will be called whenever a start tag is encountered which matches
* the specified name. You can also pass null as the name, in which case the handler will be
* called for any unmatched start tag.
*
* Examples:
* --------------
* // Call this function whenever a <podcast> start tag is encountered
* onStartTag["podcast"] = delegate void(ElementParser xml)
* {
* // Your code here
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
*
* // call myEpisodeStartHandler (defined elsewhere) whenever an <episode> start tag is encountered
* onStartTag["episode"] = &myEpisodeStartHandler;
*
* // call delegate dg for all other start tags
* onStartTag[null] = dg;
* --------------
*
* This library will supply your function with a new instance of ElementParser, which may
* be used to parse inside the element whose start tag was just found, or to identify the
* tag attributes of the element, etc.
*
* Note that your function will be called for both start tags and empty tags.
* That is, we make no distinction between &lt;br&gt;&lt;/br&gt; and &lt;br/&gt;.
*/
ParserHandler[string] onStartTag;
/**
* Register a handler which will be called whenever an end tag is encountered which matches
* the specified name. You can also pass null as the name, in which case the handler will be
* called for any unmatched end tag.
*
* Examples:
* --------------
* // Call this function whenever a </podcast> end tag is encountered
* onEndTag["podcast"] = delegate void(in Element e)
* {
* // Your code here
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
*
* // call myEpisodeEndHandler (defined elsewhere) whenever an </episode> end tag is encountered
* onEndTag["episode"] = &myEpisodeEndHandler;
*
* // call delegate dg for all other end tags
* onEndTag[null] = dg;
* --------------
*
* Note that your function will be called for both end tags and empty tags.
* That is, we make no distinction between &lt;br&gt;&lt;/br&gt; and &lt;br/&gt;.
*/
ElementHandler[string] onEndTag;
// Installs the do-nothing default handlers and records the current input
// position. The cursor s must already be set by the caller, because *s is
// read here.
protected this()
{
commentHandler = &defaultHandler;
cdataHandler = &defaultHandler;
xiHandler = &defaultHandler;
piHandler = &defaultHandler;
textHandler = &defaultHandler;
onStartTag[null] = &defaultParserHandler;
onEndTag[null] = &defaultElementHandler;
elementStart = *s;
}
// Default handlers: each one ignores its argument and does nothing.
void defaultHandler(string) {}
void defaultElementHandler(in Element) {}
void defaultParserHandler(ElementParser) {}
/**
* Register a handler which will be called whenever text is encountered.
*
* Examples:
* --------------
* // Call this function whenever text is encountered
* onText = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s will have been decoded by the time you see it,
* // and so may contain any character.
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onText(Handler handler) { textHandler = handler; }
/**
* Register a handler which will be called whenever a character data segment is encountered.
*
* Examples:
* --------------
* // Call this function whenever a CData section is encountered
* onCData = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <![CDATA[ nor closing ]]>
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onCData(Handler handler) { cdataHandler = handler; }
/**
* Register a handler which will be called whenever a comment is encountered.
*
* Examples:
* --------------
* // Call this function whenever a comment is encountered
* onComment = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <!-- nor closing -->
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onComment(Handler handler) { commentHandler = handler; }
/**
* Register a handler which will be called whenever a processing instruction is encountered.
*
* Examples:
* --------------
* // Call this function whenever a processing instruction is encountered
* onPI = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <? nor closing ?>
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onPI(Handler handler) { piHandler = handler; }
/**
* Register a handler which will be called whenever an XML instruction is encountered.
*
* Examples:
* --------------
* // Call this function whenever an XML instruction is encountered
* // (Note: XML instructions may only occur preceding the root tag of a document).
* onXI = delegate void(string s)
* {
* // Your code here
*
* // The passed parameter s does not include the opening <! nor closing >
* //
* // This is a closure, so code here may reference
* // variables which are outside of this scope
* };
* --------------
*/
void onXI(Handler handler) { xiHandler = handler; }
/**
* Parse an XML element.
*
* Parsing will continue until the end of the current element. Any items encountered
* for which a handler has been registered will invoke that handler.
*
* When no tag has been read yet (tag_ is null on entry), parsing instead stops
* immediately after the first element tag has been read.
*
* Throws: various kinds of Exception
*/
void parse()
{
Tag root = tag_; // the element whose end tag terminates this call
// Most recent start tag seen for each name, used to pair end tags with
// their start tags.
// NOTE(review): keyed by name only, so a nested element with the same
// name as an ancestor replaces the ancestor's entry -- confirm intended.
Tag[string] startTags;
if (tag_ !is null) startTags[tag_.name] = tag_;
while(s.length != 0)
{
// The more specific "<!--" and "<![CDATA[" prefixes are tested before
// the generic "<!" prefix.
// NOTE(review): in the four branches below, when the terminator is
// missing find() yields -1 and chop() then consumes the rest of the
// input; the following fixed-length chop() would slice past the end.
// Confirm how unterminated items should be reported.
if (startsWith(*s,"<!--"))
{
chop(*s,4);
commentHandler(chop(*s,find(*s,"-->")));
chop(*s,3);
}
else if (startsWith(*s,"<![CDATA["))
{
chop(*s,9);
cdataHandler(chop(*s,find(*s,"]]>")));
chop(*s,3);
}
else if (startsWith(*s,"<!"))
{
chop(*s,2);
xiHandler(chop(*s,find(*s,">")));
chop(*s,1);
}
else if (startsWith(*s,"<?"))
{
chop(*s,2);
piHandler(chop(*s,find(*s,"?>")));
chop(*s,2);
}
else if (startsWith(*s,"<"))
{
// presumably the Tag constructor consumes the tag text from *s -- its
// definition is outside this section, TODO confirm
tag_ = new Tag(*s,true);
if (root is null) return; // Return to constructor of derived class
if (tag_.isStart || tag_.isEmpty)
{
startTags[tag_.name] = tag_;
auto parser = new ElementParser(this);
if (tag_.isEmpty) parser.elementStart = null;
// Use the handler registered for this name, else the catch-all under null.
ParserHandler* handler = tag_.name in onStartTag;
if (handler is null) onStartTag[null](parser);
else (*handler)(parser);
}
if (tag_.isEnd || tag_.isEmpty)
{
// NOTE(review): an end tag whose name was never recorded in startTags
// makes this lookup fail -- confirm the resulting error is acceptable.
auto startTag = startTags[tag_.name];
string text;
if (!tag_.isEmpty)
{
// The element's raw text is the span from the end of the start tag
// to the beginning of the end tag.
// assumes both tagString slices refer to the same underlying
// document buffer -- TODO confirm against Tag
invariant(char)* p = startTag.tagString.ptr + startTag.tagString.length;
invariant(char)* q = tag_.tagString.ptr;
text = p[0..(q-p)];
}
auto element = new Element(startTag);
if (text.length != 0) element ~= new Text(text);
ElementHandler* handler = tag_.name in onEndTag;
if (handler is null) onEndTag[null](element);
else (*handler)(element);
// NOTE(review): compares names only, so the end tag of a nested
// element named like the root also ends parsing here.
if (tag_.name == root.name) return;
}
}
else
{
// Plain text up to the next '<' (or to the end of input), entity-decoded.
textHandler(decode(chop(*s,find(*s,"<"))));
}
}
}
/**
* Returns that part of the element which has already been parsed
*
* This is the leading portion of elementStart whose length equals the
* amount of input consumed since this parser was constructed.
*/
const override string toString()
{
// NOTE(review): parse() sets elementStart to null for empty tags, which
// makes n negative here -- confirm toString() is never called in that case.
int n = elementStart.length - s.length;
return elementStart[0..n];
}
}
private
{
// Mixin used by the check functions. It must be mixed into a function that
// has a variable named s (the input being checked). It records the position
// on entry and supplies fail() overloads which restore that position before
// throwing an Err labelled with msg.
template Check(string msg)
{
string old = s; // input position on entry, restored before every throw
// Fails with msg alone.
void fail()
{
s = old;
throw new Err(s,msg);
}
// Fails with msg, chaining e as the underlying error.
void fail(Err e)
{
s = old;
throw new Err(s,msg,e);
}
// Fails with msg, chaining a new error built from msg2. That inner error is
// created before s is restored, so it records the current position.
void fail(string msg2)
{
fail(new Err(s,msg2));
}
}
/*
 * Rule 27 (Misc): consumes one comment, one processing instruction, or one
 * run of whitespace, chosen by the prefix of s.
 *
 * Throws: Err labelled "Misc", wrapping the underlying failure; s is restored.
 */
void checkMisc(ref string s) // rule 27
{
    mixin Check!("Misc");
    try
    {
        if (startsWith(s,"<!--"))
            checkComment(s);
        else if (startsWith(s,"<?"))
            checkPI(s);
        else
            checkSpace(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 1 (document): a prolog, then the root element, then any number of
 * Misc items.
 *
 * Throws: Err labelled "Document", wrapping the underlying failure; s is restored.
 */
void checkDocument(ref string s) // rule 1
{
    mixin Check!("Document");
    try
    {
        checkProlog(s);
        checkElement(s);
        star!(checkMisc)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 2 (Char): verifies that every code point in s satisfies isChar().
 * On success s is left unchanged.
 *
 * Throws: Err labelled "Chars" wrapping an "invalid character" error that is
 * positioned at the first offending code point.
 */
void checkChars(ref string s) // rule 2
{
    // TO DO - Fix std.utf stride and decode functions, then use those instead
    mixin Check!("Chars");
    int badIndex = -1;
    dchar badChar;
    foreach(int i,dchar d; s)
    {
        if (isChar(d)) continue;
        badChar = d;
        badIndex = i;
        break;
    }
    if (badIndex == -1) return;
    // Advance to the bad character so the inner error records its position.
    s = s[badIndex..$];
    fail(format("invalid character: U+%04X",badChar));
}
/*
 * Rule 3 (S): consumes one or more space, tab, line-feed or carriage-return
 * characters.
 *
 * Throws: Err labelled "Whitespace" if s does not start with whitespace.
 */
void checkSpace(ref string s) // rule 3
{
    mixin Check!("Whitespace");
    munch(s,"\u0020\u0009\u000A\u000D");
    // Nothing was consumed if s is still the very same slice as on entry.
    if (old is s) fail();
}
/*
 * Rule 5 (Name): consumes a name from the front of s.
 *
 * The first character must be a letter, '_' or ':'. Later characters may also
 * be '-', '.', digits, combining characters or extenders.
 *
 * Params:
 *  s    = input; advanced past the name on success
 *  name = receives the name that was consumed
 *
 * Throws: Err labelled "Name" if s is empty or does not start with a valid
 * first character.
 */
void checkName(ref string s, out string name) // rule 5
{
    mixin Check!("Name");
    if (s.length == 0) fail();
    // Default to the whole input: when every character is a name character
    // the loop below never breaks, and the name extends to the end of s.
    // (Starting from 0 would yield an empty name and consume nothing.)
    size_t n = s.length;
    foreach(int i,dchar c;s)
    {
        if (c == '_' || c == ':' || isLetter(c)) continue;
        if (i == 0) fail();
        if (c == '-' || c == '.' || isDigit(c) || isCombiningChar(c) || isExtender(c)) continue;
        n = i; // first character that cannot be part of the name
        break;
    }
    name = s[0..n];
    s = s[n..$];
}
/*
 * Rule 10 (AttValue): consumes a quoted attribute value, including both
 * quotes. The value may contain references but no '<'.
 *
 * Throws: Err labelled "AttValue" if the value is missing, unquoted,
 * unterminated, contains '<', or contains a malformed reference.
 */
void checkAttValue(ref string s) // rule 10
{
    mixin Check!("AttValue");
    if (s.length == 0) fail();
    char quote = s[0];
    if (!(quote == '\u0022' || quote == '\u0027')) fail("attribute value requires quotes");
    s = s[1..$];
    while (true)
    {
        // Skip everything except '<', '&' and the closing quote.
        munch(s,"^<&"~quote);
        if (s.length == 0) fail("unterminated attribute value");
        if (s[0] == '<') fail("< found in attribute value");
        if (s[0] == quote) break;
        // The only remaining possibility is '&': a reference.
        try
        {
            checkReference(s);
        }
        catch(Err cause)
        {
            fail(cause);
        }
    }
    s = s[1..$]; // closing quote
}
/*
 * Rule 14 (CharData): consumes character data up to, but not including, the
 * next '&' or '<', or up to the end of the input.
 *
 * Running out of input is not reported here; a missing terminator is for the
 * enclosing production to report.
 *
 * Throws: Err labelled "CharData" if the sequence "]]>" occurs in the data.
 */
void checkCharData(ref string s) // rule 14
{
    mixin Check!("CharData");
    // Stop at end of input. The previous unconditional loop sliced s[1..$]
    // on an empty string once the data ran to the end of the input.
    while (s.length != 0)
    {
        if (s.startsWith("&")) break;
        if (s.startsWith("<")) break;
        if (s.startsWith("]]>")) fail("]]> found within char data");
        s = s[1..$];
    }
}
/*
 * Rule 15 (Comment): consumes a comment, from "<!--" through "-->".
 *
 * The first "--" after the opening must be the start of the closing "-->",
 * because "--" is not allowed inside a comment.
 *
 * Throws: Err labelled "Comment" if the opening literal is missing, no "--"
 * follows it, or that "--" is not followed by '>'.
 */
void checkComment(ref string s) // rule 15
{
    mixin Check!("Comment");
    try { checkLiteral("<!--",s); } catch(Err e) { fail(e); }
    int n = s.find("--");
    if (n == -1) fail("unterminated comment");
    // Skip the comment body: advance to the "--" that was found.
    // (Slicing s[0..n] instead would keep the body and drop the terminator.)
    s = s[n..$];
    try { checkLiteral("-->",s); } catch(Err e) { fail(e); }
}
/*
 * Rule 16 (PI): consumes a processing instruction, from "<?" through the
 * next "?>".
 *
 * Throws: Err labelled "PI", wrapping the underlying failure; s is restored.
 */
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");
    try
    {
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 18 (CDSect): consumes a CDATA section, from "<![CDATA[" through the
 * next "]]>".
 *
 * Throws: Err labelled "CDSect", wrapping the underlying failure; s is restored.
 */
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");
    try
    {
        // A CDATA section opens with "<![CDATA[". The literal used before was
        // missing the '!', so no real CDATA section could ever match.
        checkLiteral("<![CDATA[",s);
        checkEnd("]]>",s);
    }
    catch(Err e) { fail(e); }
}
/*
 * Rule 22 (prolog): XMLDecl? Misc* (doctypedecl Misc*)?
 *
 * Every part is optional, so this succeeds without consuming anything when
 * the document starts directly with its root element.
 *
 * Throws: Err labelled "Prolog", wrapping the underlying failure; s is restored.
 */
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");
    try
    {
        // The XML declaration is optional in production 22. Requiring it
        // rejected well-formed documents that omit it.
        opt!(checkXMLDecl)(s);
        star!(checkMisc)(s);
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch(Err e) { fail(e); }
}
/*
 * Rule 23 (XMLDecl): "<?xml", version info, then an optional encoding
 * declaration, optional standalone declaration, optional whitespace, "?>".
 *
 * Throws: Err labelled "XMLDecl", wrapping the underlying failure; s is restored.
 */
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");
    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);
        opt!(checkEncodingDecl)(s);
        opt!(checkSDDecl)(s);
        opt!(checkSpace)(s);
        checkLiteral("?>",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 24 (VersionInfo): whitespace, "version", '=', then a quoted version
 * number.
 *
 * Throws: Err labelled "VersionInfo", wrapping the underlying failure; s is restored.
 */
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");
    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 25 (Eq): an '=' sign with optional whitespace on either side.
 *
 * Throws: Err labelled "Eq", wrapping the underlying failure; s is restored.
 */
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");
    try
    {
        opt!(checkSpace)(s);
        checkLiteral("=",s);
        opt!(checkSpace)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 26 (VersionNum): consumes one or more characters drawn from letters,
 * digits, '_', '.', ':' and '-'.
 *
 * Throws: Err labelled "VersionNum" if no such character is present.
 */
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");
    munch(s,"a-zA-Z0-9_.:-");
    // Still the same slice as on entry means nothing was consumed.
    if (old is s) fail();
}
/*
 * Rule 28 (doctypedecl): consumes "<!DOCTYPE" and everything up to and
 * including the next '>'. The declaration's contents are not examined.
 *
 * Throws: Err labelled "DocTypeDecl", wrapping the underlying failure; s is restored.
 */
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");
    try
    {
        checkLiteral("<!DOCTYPE",s);
        //
        // TO DO -- ensure DOCTYPE is well formed (But not yet. That's one of our "future directions")
        //
        checkEnd(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 32 (SDDecl): whitespace, "standalone", '=', then a quoted "yes" or "no".
 *
 * Throws: Err labelled "SDDecl" if any part is missing or the value is
 * neither yes nor no; s is restored.
 */
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");
    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
    bool isYes = startsWith(s,"'yes'") || startsWith(s,"\"yes\"");
    bool isNo = !isYes && (startsWith(s,"'no'") || startsWith(s,"\"no\""));
    if (!isYes && !isNo) fail("standalone attribute value must be 'yes', \"yes\", 'no' or \"no\"");
    // Length of the value including its two quotes.
    int consumed = isYes ? 5 : 4;
    s = s[consumed..$];
}
/*
 * Rule 39 (element): either an empty-element tag, or a start tag, content,
 * and an end tag whose name matches the start tag.
 *
 * Throws: Err labelled "Element" on any nested failure or on a start/end
 * name mismatch; s is restored.
 */
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");
    string sname,ename,kind;
    try
    {
        checkTag(s,kind,sname);
    }
    catch(Err cause)
    {
        fail(cause);
    }
    // Anything other than a start tag has no content and no end tag.
    if (kind != "STag") return;
    string atEndTag;
    try
    {
        checkContent(s);
        atEndTag = s; // remember where the end tag begins
        checkETag(s,ename);
    }
    catch(Err cause)
    {
        fail(cause);
    }
    if (sname == ename) return;
    // Report the mismatch at the position of the end tag.
    s = atEndTag;
    fail("end tag name \""~ename~"\" differs from start tag name \""~sname~"\"");
}
/*
 * Rules 40 and 44: consumes a start tag or an empty-element tag.
 *
 * Params:
 *  s    = input; advanced past the tag on success
 *  type = "STag" for a start tag; "ETag" when the tag closes itself with "/>"
 *  name = receives the tag name
 *
 * Throws: Err labelled "Tag", wrapping the underlying failure; s is restored.
 */
void checkTag(ref string s, out string type, out string name) // rules 40 and 44
{
    mixin Check!("Tag");
    try
    {
        type = "STag";
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);
        // A '/' just before the closing '>' marks an empty-element tag.
        if (optc(s,'/')) type = "ETag";
        checkLiteral(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 41 (Attribute): a name, '=', and a quoted attribute value.
 *
 * Throws: Err labelled "Attribute", wrapping the underlying failure; s is restored.
 */
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");
    try
    {
        string attrName; // required by checkName, not used afterwards
        checkName(s,attrName);
        checkEq(s);
        checkAttValue(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 42 (ETag): "</", a name, optional whitespace, then '>'.
 *
 * Params:
 *  s    = input; advanced past the end tag on success
 *  name = receives the end tag's name
 *
 * Throws: Err labelled "ETag", wrapping the underlying failure; s is restored.
 */
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");
    try
    {
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 43 (content): consumes any mixture of references, comments,
 * processing instructions, CDATA sections, child elements and character
 * data. Stops in front of the next "</", or when the input is exhausted.
 *
 * Throws: Err labelled "Content", wrapping the failure of the item that
 * could not be parsed. s is restored to the start of that item.
 */
void checkContent(ref string s) // rule 43
{
    mixin Check!("Content");
    try
    {
        // Stop when the input runs out; the caller then reports the missing
        // end tag. The previous for(;;) had no exit on empty input.
        while (s.length != 0)
        {
            // Move the restore point forward so that a failure is reported
            // at the item which failed, not at the start of the content.
            old = s;
            if (s.startsWith("&")) { checkReference(s); }
            else if (s.startsWith("<!--")) { checkComment(s); }
            else if (s.startsWith("<?")) { checkPI(s); }
            // A CDATA section opens with "<![CDATA[". The prefix tested
            // before was missing the '!', so CDATA sections fell through to
            // the element branch and were rejected.
            else if (s.startsWith("<![CDATA[")) { checkCDSect(s); }
            else if (s.startsWith("</")) { break; }
            else if (s.startsWith("<")) { checkElement(s); }
            else { checkCharData(s); }
        }
    }
    catch(Err e) { fail(e); }
}
/*
 * Rule 66 (CharRef): consumes a numeric character reference, either decimal
 * ("&#NNN;") or hexadecimal ("&#xHHH;").
 *
 * Params:
 *  s = input; advanced past the reference on success
 *  c = receives the referenced code point
 *
 * Throws: Err labelled "CharRef" if the reference is unterminated, has no
 * digits, names a code point rejected by isChar(), or lacks the final ';'.
 */
void checkCharRef(ref string s, out dchar c) // rule 66
{
    mixin Check!("CharRef");
    c = 0;
    try
    {
        checkLiteral("&#",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
    int radix = 10;
    if (s.length != 0 && s[0] == 'x')
    {
        s = s[1..$];
        radix = 16;
    }
    if (s.length == 0) fail("unterminated character reference");
    if (s[0] == ';') fail("character reference must have at least one digit");
    while (s.length != 0)
    {
        char d = s[0];
        // Value of d as a digit; 100 marks "not a digit in any radix".
        int digit;
        if (d >= '0' && d <= '9') digit = d - '0';
        else if (d >= 'a' && d <= 'f') digit = d - 'a' + 10;
        else if (d >= 'A' && d <= 'F') digit = d - 'A' + 10;
        else digit = 100;
        // Stops at the first character that is not a digit of this radix.
        if (digit >= radix) break;
        c *= radix;
        c += digit;
        s = s[1..$];
    }
    if (!isChar(c)) fail(format("U+%04X is not a legal character",c));
    if (s.length == 0 || s[0] != ';') fail("expected ;");
    else s = s[1..$];
}
/*
 * Rule 67 (Reference): a character reference when s starts with "&#",
 * otherwise an entity reference.
 *
 * Throws: Err labelled "Reference", wrapping the underlying failure; s is restored.
 */
void checkReference(ref string s) // rule 67
{
    mixin Check!("Reference");
    try
    {
        if (startsWith(s,"&#"))
        {
            dchar ignored; // the decoded character is not needed here
            checkCharRef(s,ignored);
        }
        else
        {
            checkEntityRef(s);
        }
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 68 (EntityRef): '&', a name, then ';'.
 *
 * Throws: Err labelled "EntityRef", wrapping the underlying failure; s is restored.
 */
void checkEntityRef(ref string s) // rule 68
{
    mixin Check!("EntityRef");
    try
    {
        string entityName; // required by checkName, not used afterwards
        checkLiteral("&",s);
        checkName(s,entityName);
        checkLiteral(";",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
/*
 * Rule 81 (EncName): one or more ASCII letters, followed by any number of
 * letters, digits, '_', '.' or '-'.
 *
 * Throws: Err labelled "EncName" if s does not start with an ASCII letter.
 */
void checkEncName(ref string s) // rule 81
{
    mixin Check!("EncName");
    munch(s,"a-zA-Z");
    // Still the same slice as on entry means no leading letter was found.
    if (old is s) fail();
    munch(s,"a-zA-Z0-9_.-");
}
/*
 * Rule 80 (EncodingDecl): whitespace, "encoding", '=', then a quoted
 * encoding name.
 *
 * Throws: Err labelled "EncodingDecl", wrapping the underlying failure; s is restored.
 */
void checkEncodingDecl(ref string s) // rule 80
{
    mixin Check!("EncodingDecl");
    try
    {
        checkSpace(s);
        checkLiteral("encoding",s);
        checkEq(s);
        quoted!(checkEncName)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
// Helper functions
/*
 * Consumes the exact text literal from the front of s.
 *
 * Throws: Err labelled "Literal", wrapping an "Expected literal" error, if
 * s does not start with literal.
 */
void checkLiteral(string literal,ref string s)
{
    mixin Check!("Literal");
    if (startsWith(s,literal))
    {
        s = s[literal.length..$];
        return;
    }
    fail("Expected literal \""~literal~"\"");
}
/*
 * Skips forward to the first occurrence of end and consumes it.
 *
 * Throws: Err if end does not occur in s. In that case s is left untouched.
 */
void checkEnd(string end,ref string s)
{
    // Deliberately no mixin Check here.
    int pos = find(s,end);
    if (pos == -1) throw new Err(s,"Unable to find terminating \""~end~"\"");
    s = s[pos..$];
    checkLiteral(end,s);
}
// Metafunctions -- none of these use mixin Check
// Optional item: tries f on s and deliberately ignores an Err thrown by it.
// The position of s after a failure is whatever f leaves behind.
void opt(alias f)(ref string s)
{
try { f(s); } catch(Err e) {}
}
// One or more: f must succeed once (its Err propagates otherwise), after
// which it is repeated until it fails.
void plus(alias f)(ref string s)
{
    seq!(f,star!(f))(s);
}
// Zero or more: applies f to s repeatedly and stops, without error, the
// first time f throws an Err.
void star(alias f)(ref string s)
{
    while (true)
    {
        try
        {
            f(s);
        }
        catch(Err stop)
        {
            return;
        }
    }
}
// Quoted item: applies f between a matching pair of quotes. Single quotes
// are used when s starts with one; otherwise double quotes are required.
void quoted(alias f)(ref string s)
{
    string quote = startsWith(s,"'") ? "'" : "\"";
    checkLiteral(quote,s);
    f(s);
    checkLiteral(quote,s);
}
// Sequence: applies f and then g to s. An exception from either propagates
// to the caller. Nothing here rewinds s if g fails after f has succeeded.
void seq(alias f,alias g)(ref string s)
{
f(s);
g(s);
}
}
/**
* Check an entire XML document for well-formedness
*
* Params:
* s = the document to be checked, passed as a string
*
* Throws: CheckException if the document is not well formed
*
* CheckException's toString() method will yield the complete hierarchy of parse failure
* (the XML equivalent of a stack trace), giving the line and column number of every
* failure at every level.
*/
void check(string s)
{
    // The check functions take s by reference and advance it, so keep the
    // whole document for computing line and column numbers. At the "Junk
    // found" throw below, s holds only the unparsed tail; passing that tail
    // to complete() would report every junk error at line 1, column 1.
    string entire = s;
    try
    {
        checkChars(s);
        checkDocument(s);
        if (s.length != 0) throw new Err(s,"Junk found after document");
    }
    catch(Err e)
    {
        // Fill in line/column for the whole error chain, then rethrow.
        e.complete(entire);
        throw e;
    }
}
// Checks that check() rejects a document containing a mismatched end tag
// (<genre> closed by </genres> in the second book) and that the error text
// names both tags.
unittest
{
    try
    {
        check(q"[<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genres>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
</catalog>
]");
        // Reaching this line means the malformed document was accepted.
        assert(false);
    }
    catch(CheckException failure)
    {
        string report = failure.toString();
        assert(report.find("end tag name \"genres\" differs from start tag name \"genre\"") != -1);
    }
}
/** The base class for exceptions thrown by this module */
class Exception : object.Exception
{
    this(string msg) { super(msg); }
}

// Other exceptions. Their constructors are private, so they can only be
// created from inside this module.

/// Thrown during Comment constructor
class CommentException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown during CData constructor
class CDataException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown during XMLInstruction constructor
class XIException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown during ProcessingInstruction constructor
class PIException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown during Text constructor
class TextException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown during decode()
class DecodeException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown if comparing with wrong type
class InvalidTypeException : Exception
{
    private this(string msg) { super(msg); }
}

/// Thrown when parsing for Tags
class TagException : Exception
{
    private this(string msg) { super(msg); }
}
/**
* Thrown during check()
*/
class CheckException : Exception
{
    CheckException err; /// Parent in hierarchy
    private string tail; // input that remained unparsed when the failure occurred
    string msg; /// Name of production rule which failed to parse, or specific error message
    uint line = 0; /// Line number at which parse failure occurred
    uint column = 0; /// Column number at which parse failure occurred

    // Params:
    //  tail = the unparsed remainder of the input at the point of failure
    //  msg  = production rule name or specific error message
    //  err  = the error that caused this one, if any
    private this(string tail,string msg,Err err=null)
    {
        super(null);
        this.tail = tail;
        this.msg = msg;
        this.err = err;
    }

    // Computes line and column for this error and for every chained error.
    // entire must be the complete document, with tail as its suffix.
    private void complete(string entire)
    {
        // Everything that precedes the failure position.
        string consumed = entire[0..$-tail.length];
        // Index just past the last newline (0 when there is none).
        int lineStart = consumed.rfind('\n') + 1;
        line = consumed.count("\n") + 1;
        // Columns count code points, not UTF-8 code units.
        dstring lastLine = toUTF32(consumed[lineStart..$]);
        column = lastLine.length + 1;
        if (err !is null) err.complete(entire);
    }

    // Returns one line per error in the chain. The text of the chained error
    // comes before the text of this error.
    override const string toString()
    {
        string text;
        if (line != 0) text = format("Line %d, column %d: ",line,column);
        text ~= msg;
        text ~= '\n';
        if (err is null) return text;
        return err.toString ~ text;
    }
}
// Short module-private name for CheckException, used by the check functions.
private alias CheckException Err;
// Private helper functions
private
{
// Casts o to T.
// Throws: InvalidTypeException when the cast yields null.
T toType(T)(Object o)
{
    T converted = cast(T)(o);
    if (converted !is null) return converted;
    throw new InvalidTypeException("Attempt to compare a " ~ T.stringof ~ " with an instance of another type");
}
// Removes the first n characters from s and returns them.
// An n of -1 (the "not found" result of find) means "all of s".
string chop(ref string s, int n)
{
    if (n == -1) n = s.length;
    string head = s[0..n];
    string rest = s[n..$];
    s = rest;
    return head;
}
// Optional character: if s starts with c, consumes it and returns true.
// Otherwise leaves s untouched and returns false.
bool optc(ref string s, char c)
{
    if (s.length == 0 || s[0] != c) return false;
    s = s[1..$];
    return true;
}
// Required character: consumes c from the front of s.
// Throws: TagException (with an empty message) if s does not start with c.
void reqc(ref string s, char c)
{
    bool present = s.length != 0 && s[0] == c;
    if (!present) throw new TagException("");
    s = s[1..$];
}
// Folds the code points of s into h: multiply by 11, then add each code
// point. Pass a previous result as h to chain several strings together.
hash_t hash(string s,hash_t h=0)
{
    foreach(dchar c;s)
    {
        h *= 11;
        h += c;
    }
    return h;
}
// Definitions from the XML specification
// Code point ranges stored as consecutive (low, high) pairs, both inclusive,
// in ascending order -- the layout lookup() binary-searches.
// Presumably the Char production (rule 2) of the XML specification.
const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,0x10000,0x10FFFF];
const BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,0x0F69,0x10A0,0x10C5,0x10D0,0x10F6,0x110
0,0x1100,0x1102,0x1103,0x1105,0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3];
// Ideographic code point ranges, stored as consecutive (low, high) pairs,
// both inclusive. The pairs must be in ascending order because lookup()
// binary-searches the table. With 0x4E00-0x9FA5 listed first, that range was
// never found.
const IdeographicTable=[0x3007,0x3007,0x3021,0x3029,0x4E00,0x9FA5];
const CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,0x3099,0x3099,0x309A,0x309A];
// Digit code point ranges stored as consecutive (low, high) pairs, both
// inclusive, in ascending order -- the layout lookup() binary-searches.
const DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,0x0ED9,0x0F20,0x0F29];
// Extender code point ranges stored as consecutive (low, high) pairs, both
// inclusive, in ascending order -- the layout lookup() binary-searches.
const ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,0x3035,0x309D,0x309E,0x30FC,0x30FE];
// Binary search over a table of inclusive (low, high) range pairs.
// Returns true if c lies inside one of the ranges.
// The table must have an even number of entries, in ascending order.
bool lookup(const(int)[] table, int c)
{
    for (; table.length != 0; )
    {
        // Index of the low bound of the middle pair (always even).
        int m = (table.length >> 1) & ~1;
        if (c >= table[m] && c <= table[m+1]) return true;
        // Not in the middle pair: continue in the half that could hold c.
        table = (c < table[m]) ? table[0..m] : table[m+2..$];
    }
    return false;
}
// Returns a short preview of s: every byte below 0x20 or above 0x7F is
// replaced by '.', and "..." is appended once 20 characters have been
// collected.
string startOf(string s)
{
    string preview;
    foreach(char c;s)
    {
        bool keep = c >= 0x20 && c <= 0x7F;
        preview ~= keep ? c : '.';
        if (preview.length >= 20)
        {
            preview ~= "...";
            break;
        }
    }
    return preview;
}
}