arsd/htmltotext.d

474 lines
9.9 KiB
D

/++
Converts HTML to plain text. Can also output VT escape sequences for terminal output.
The exact output of this is subject to change - it is just what appears nice for me. (I actually use this on my personal email setup.)
+/
module arsd.htmltotext;
import arsd.dom;
import arsd.color;
import std.string;
import std.uni : isWhite;
///
class HtmlConverter {
int width; // wrap width; assigned by htmlToText on every call and read by sink when no per-character override is given
/++
Will enable color output using VT codes. Determines color through dom.d's css support, which means you need to apply a stylesheet first.
---
import arsd.dom;
auto document = new Document(source_code_for_html);
auto stylesheet = new StyleSheet(source_code_for_css);
stylesheet.apply(document);
---
+/
bool enableVtOutput;
// Colors in effect for the element currently being rendered. htmlToText overwrites
// them when enableVtOutput is set and the element's computed style supplies a value,
// and puts the previous values back when it returns.
string color;
string backgroundColor;
/++
	Recursively renders `element` and everything below it into the output buffer `s`.

	Text nodes are passed to `sink` one character at a time. Any other element is
	dispatched on its tag name: block-level tags are wrapped in `startBlock` /
	`endBlock`, inline tags add markers (or VT escape sequences when `enableVtOutput`
	is set), `head`, `script` and `style` are skipped, and unknown tags simply render
	their children.

	Params:
		element = the node to render
		preformatted = forwarded to `sink` for every text character; `pre` elements render their children with this set to true
		width = line width; stored in `this.width` for `sink` to wrap against
+/
void htmlToText(Element element, bool preformatted, int width) {
	string color, backgroundColor;
	if(enableVtOutput) {
		color = element.computedStyle.getValue("color");
		backgroundColor = element.computedStyle.getValue("background-color");
	}
	string originalColor = this.color, originalBackgroundColor = this.backgroundColor;
	this.color = color.length ? color : this.color;
	this.backgroundColor = backgroundColor.length ? backgroundColor : this.backgroundColor;
	scope(exit) {
		// the idea is as we pop working back up the tree, it restores what it was here
		this.color = originalColor;
		this.backgroundColor = originalBackgroundColor;
	}
	this.width = width;
	if(auto tn = cast(TextNode) element) {
		foreach(dchar ch; tn.nodeValue) {
			sink(ch, preformatted);
		}
	} else {
		void sinkChildren() {
			foreach(child; element.childNodes)
				htmlToText(child, preformatted, width);
		}
		switch(element.tagName) {
			case "head", "script", "style":
				// intentionally blank
			break;
			// The table stuff is removed right now because while it looks
			// ok for test tables, it isn't working well for the emails I have
			// - it handles data ok but not really nested layouts.
			case "trlol":
				auto children = element.childElements;
				int columns = cast(int) children.length;
				// a row without cells must not divide by zero; a width of 0 sends it
				// down the plain block path below
				int tdWidth = columns ? (width - columns * 3) / columns : 0;
				if(tdWidth < 12) {
					// too narrow to be reasonable
					startBlock();
					sinkChildren();
					endBlock();
				} else {
					string[] tdBlocks;
					int longestBlock;
					// render every cell on its own, then stitch the results together row by row
					foreach(child; children) {
						auto fmt = new HtmlConverter();
						fmt.htmlToText(child, false, tdWidth);
						tdBlocks ~= fmt.s;
						int lineCount = 1;
						foreach(ch; fmt.s)
							if(ch == '\n')
								lineCount++;
						if(lineCount > longestBlock)
							longestBlock = lineCount;
					}
					if(s.length && s[$-1] != '\n')
						s ~= '\n';
					foreach(lineNumber; 0 .. longestBlock) {
						foreach(bidx, ref block; tdBlocks) {
							if(bidx)
								s ~= " | ";
							// how much of this cell was written on the current row
							// (counted in bytes, like the slicing below)
							size_t written = 0;
							if(block.length) {
								auto idx = block.indexOf("\n");
								if(idx == -1)
									idx = block.length;
								s ~= block[0 .. idx];
								written = cast(size_t) idx;
								if(idx == block.length)
									block = block[$..$];
								else
									block = block[idx + 1 .. $];
							}
							// pad the cell out to the column width so the separators line up;
							// the range is empty when the line already fills the column
							foreach(a; written .. cast(size_t) tdWidth)
								s ~= " ";
						}
						s ~= "\n";
					}
					foreach(a; 0 .. children.length) {
						foreach(w; 0 .. tdWidth) {
							s ~= "-";
						}
						if(a + 1 != children.length)
							s ~= "-+-";
					}
					s ~= "\n";
				}
			break;
			case "tr":
				startBlock(2);
				sinkChildren();
				endBlock();
			break;
			case "td":
				startBlock(0);
				sinkChildren();
				endBlock();
			break;
			case "a":
				sinkChildren();
				// anchors without a target (e.g. named anchors) have no link to show;
				// without the length check they would print a stray " <>"
				string href = element.href;
				if(href.length && href != element.innerText) {
					sink(' ', false);
					sink('<', false);
					// I want the link itself to NOT word wrap
					// to make for easier double-clicking of it in
					// the terminal
					foreach(dchar ch; href)
						sink(ch, false, int.max);
					sink('>', false);
				}
			break;
			case "span":
				if(enableVtOutput) {
					auto csc = color; // element.computedStyle.getValue("color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					bool bold = element.computedStyle.getValue("font-weight") == "bold";
					if(bold)
						s ~= "\033[1m";
					sinkChildren();
					// SGR 22 turns off bold only; SGR 0 would reset every attribute
					if(bold)
						s ~= "\033[22m";
					if(csc.length)
						s ~= "\033[39m";
				} else {
					sinkChildren();
				}
			break;
			case "p":
				startBlock();
				sinkChildren();
				endBlock();
			break;
			case "b", "strong":
			case "em", "i":
				if(element.innerText.length == 0)
					break;
				if(enableVtOutput) {
					s ~= "\033[1m";
					sinkChildren();
					// SGR 22 instead of SGR 0 so a color set by an enclosing span survives
					s ~= "\033[22m";
				} else {
					sink('*', false);
					sinkChildren();
					sink('*', false);
				}
			break;
			case "u":
				if(element.innerText.length == 0)
					break;
				sink('_', false);
				sinkChildren();
				sink('_', false);
			break;
			case "ul":
				ulDepth++;
				startBlock(2);
				sinkChildren();
				endBlock();
				ulDepth--;
			break;
			case "ol":
				olDepth++;
				startBlock(2);
				sinkChildren();
				endBlock();
				olDepth--;
			break;
			case "li":
				startBlock();
				//sink('\t', true);
				/*
				foreach(cnt; 0 .. olDepth + ulDepth) {
					sink(' ', true);
					sink(' ', true);
				}
				*/
				// one bullet for each kind of list we are currently inside of
				if(olDepth)
					sink('*', false);
				if(ulDepth)
					sink('*', false);
				sink(' ', true);
				sinkChildren();
				endBlock();
			break;
			case "dl":
			case "dt":
			case "dd":
				startBlock(element.tagName == "dd" ? 2 : 0);
				sinkChildren();
				endBlock();
			break;
			case "h1":
				startBlock();
				sink('#', true);
				sink('#', true);
				sink(' ', true);
				sinkChildren();
				sink(' ', true);
				sink('#', true);
				sink('#', true);
				endBlock();
			break;
			case "h2", "h3":
				startBlock();
				sinkChildren();
				sink('\n', true);
				// underline with one marker per character of the heading text
				foreach(dchar ch; element.innerText)
					sink(element.tagName == "h2" ? '=' : '-', false);
				endBlock();
			break;
			case "hr":
				startBlock();
				// a rule half the line wide, offset by a quarter of the width
				foreach(i; 0 .. width / 4)
					sink(' ', true);
				foreach(i; 0 .. width / 2)
					sink('-', false);
				endBlock();
			break;
			case "br":
				sink('\n', true);
			break;
			case "div":
				startBlock();
				/*
				auto csc = element.computedStyle.getValue("background-color");
				if(csc.length) {
					auto c = Color.fromString(csc);
					s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
				}
				*/
				sinkChildren();
				/*
				if(csc.length)
					s ~= "\033[49m";
				*/
				endBlock();
			break;
			case "pre":
				startBlock(4);
				foreach(child; element.childNodes)
					htmlToText(child, true, width);
				endBlock();
			break;
			default:
				sinkChildren();
		}
	}
}
// Nesting counters: htmlToText increments these while rendering the children of an
// <ol> / <ul> and decrements them afterwards; the <li> case checks them to decide
// which bullets to print.
int olDepth;
int ulDepth;
/++
	Parses `html` and converts it to text.

	The markup is wrapped in a synthetic `<roottag>` element before parsing.
	Rendering starts at the first `<body>` element if the document has one and
	at the document root otherwise. All arguments besides `html` are forwarded
	unchanged to the `Element` overload of `convert`.

	Returns: the converter's output buffer `s`
+/
string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto doc = new Document;
	doc.parse("<roottag>" ~ html ~ "</roottag>");

	// render only the body when there is one
	auto bodies = doc.getElementsByTagName("body");
	Element start = bodies.length ? bodies[0] : doc.root;

	// a stylesheet could be applied to the document at this point, e.g.:
	//import std.file;
	//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
	//stylesheet.apply(document);

	return convert(start, wantWordWrap, wrapAmount);
}
/++
	Renders the tree rooted at `start` and returns the output buffer.

	Output is appended to whatever `s` already holds; this method does not
	clear it first.

	Params:
		start = element to begin rendering at
		wantWordWrap = accepted for API compatibility; this method does not read it
		wrapAmount = line width handed to `htmlToText`

	Returns: the output buffer `s`
+/
string convert(Element start, bool wantWordWrap = true, int wrapAmount = 74) {
	enum bool preformatted = false; // top-level content is never preformatted
	this.htmlToText(start, preformatted, wrapAmount);
	return this.s;
}
/++
	Clears the output buffer and all layout bookkeeping so the converter can be
	reused for another, unrelated conversion.

	The user-facing settings (`enableVtOutput`, `color`, `backgroundColor`) are
	left alone.
+/
void reset() {
	s = null;
	justOutputWhitespace = true;
	justOutputBlock = true;
	justOutputMargin = true;
	// Without clearing these, a conversion that ended in the middle of a line
	// (or was aborted by an exception part way through the tree) would leak its
	// column count, pending indentation and list nesting into the next run.
	lineLength = 0;
	needsIndent = false;
	indentStack = null;
	olDepth = 0;
	ulDepth = 0;
}
///
string s; // the accumulated output; appended to by sink, doIndent, startBlock and endBlock, returned by convert
bool justOutputWhitespace = true; // set by sink when the character it just handled was whitespace; used to collapse runs of whitespace
bool justOutputBlock = true; // set by startBlock after it emits its line break; cleared by sink on non-whitespace output
bool justOutputMargin = true; // set by startBlock and endBlock after they emit a line break; cleared by sink on non-whitespace output
int lineLength; // sink's running count for the current line; set back to 0 whenever a line break is emitted
/*
	Appends a single character to the output buffer `s`, taking care of pending
	indentation, whitespace collapsing and line wrapping.

	Params:
		item = the character to output
		preformatted = when false, whitespace is collapsed: the first whitespace character of a run becomes a single space and the rest are dropped
		lineWidthOverride = wrap width to use for this character instead of this.width; int.min (the default) means "use this.width"
*/
void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
// indentation is deferred until the first non-newline character of a line arrives
if(needsIndent && item != '\n') {
lineLength += doIndent();
needsIndent = false;
}
int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
if(!preformatted && isWhite(item)) {
if(!justOutputWhitespace) {
// first whitespace of a run: normalize it to a plain space
item = ' ';
justOutputWhitespace = true;
} else {
// already emitted whitespace; drop the rest of the run
return;
}
} else {
// if it is preformatted, we still need to keep track of if it is whitespace
// so stuff like <br> is somewhat sane
justOutputWhitespace = preformatted && isWhite(item);
}
s ~= item;
// lineLength has not been incremented for `item` yet at this point
if(lineLength >= width) {
// rewind to the nearest space, if there is one, to break on a word boundary
int c = lineLength;
bool broken;
// walks the buffer backwards one byte (char) at a time
foreach_reverse(idx, char ch; s) {
// reached the start of the current line without finding a space
if(ch == '\n')
break;
if(ch == ' ') {
// replace the space with a newline, re-indent, then put the tail back
auto os = s;
s = os[0 .. idx];
s ~= '\n';
lineLength = cast(int)(os[idx+1..$].length);
lineLength += doIndent();
s ~= os[idx + 1 .. $];
broken = true;
break;
}
c--;
// stop searching once c has dropped below 5
if(c < 5)
break;
}
if(!broken) {
// no usable space found: hard break right after the character just appended
s ~= '\n';
lineLength = 0;
needsIndent = true;
justOutputWhitespace = true;
}
}
if(item == '\n') {
lineLength = 0;
needsIndent = true;
} else
lineLength ++;
// visible output means the next startBlock / endBlock has to emit its line break again
if(!justOutputWhitespace) {
justOutputBlock = false;
justOutputMargin = false;
}
}
/*
	Writes the indentation for the current block nesting to `s`: one space for
	every unit of every entry on indentStack.

	Returns: the number of spaces written
*/
int doIndent() {
	int written = 0;
	foreach(level; indentStack) {
		for(int n = 0; n < level; n++) {
			s ~= ' ';
			written++;
		}
	}
	return written;
}
int[] indentStack; // one entry per open block (pushed by startBlock, popped by endBlock); doIndent emits the sum of the entries as spaces
bool needsIndent = false; // set whenever a line break is emitted; tells sink to call doIndent before the next non-newline character
/*
	Opens a block: pushes `indent` onto indentStack and emits a line break for
	each of the "block" and "margin" flags that is currently cleared, so that
	consecutive block boundaries do not pile up blank lines.

	Params:
		indent = number of extra spaces lines inside this block are indented by
*/
void startBlock(int indent = 0) {
	indentStack ~= indent;

	// ends the current line and schedules indentation for the next one
	void breakLine() {
		s ~= "\n";
		lineLength = 0;
		needsIndent = true;
	}

	if(!justOutputBlock) {
		breakLine();
		justOutputBlock = true;
	}
	if(!justOutputMargin) {
		breakLine();
		justOutputMargin = true;
	}
}
/*
	Closes the innermost block: pops its entry off indentStack (if there is one)
	and, unless a margin line break was already the last thing emitted, ends the
	current line.
*/
void endBlock() {
	if(indentStack.length > 0)
		indentStack = indentStack[0 .. indentStack.length - 1];

	// a margin was just written; nothing more to do
	if(justOutputMargin)
		return;

	s ~= "\n";
	lineLength = 0;
	needsIndent = true;
	justOutputMargin = true;
}
}
/++
	Convenience wrapper: converts `html` to text with a fresh, default-configured
	[HtmlConverter]. The arguments are forwarded unchanged to
	[HtmlConverter.convert].
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	return (new HtmlConverter()).convert(html, wantWordWrap, wrapAmount);
}