arsd/docx.d

67 lines
1.2 KiB
D

/++
Bare minimum support for reading Microsoft Word files.
History:
Added February 19, 2025
+/
module arsd.docx;
import arsd.core;
import arsd.zip;
import arsd.dom;
import arsd.color;
/++
+/
class DocxFile {
private ZipFile zipFile;
private XmlDocument document;
/++
+/
this(FilePath file) {
this.zipFile = new ZipFile(file);
load();
}
/// ditto
this(immutable(ubyte)[] rawData) {
this.zipFile = new ZipFile(rawData);
load();
}
/++
Converts the document to a plain text string that gives you
the jist of the document that you can view in a plain editor.
Most formatting is stripped out.
+/
string toPlainText() {
string ret;
foreach(paragraph; document.querySelectorAll("w\\:p")) {
if(ret.length)
ret ~= "\n\n";
ret ~= paragraph.innerText;
}
return ret;
}
// FIXME: to RTF, markdown, html, and terminal sequences might also be useful.
private void load() {
loadXml("word/document.xml", (document) {
this.document = document;
});
}
private void loadXml(string filename, scope void delegate(XmlDocument document) handler) {
auto document = new XmlDocument(cast(string) zipFile.getContent(filename));
handler(document);
}
}