mirror of https://github.com/adamdruppe/arsd.git
475 lines
10 KiB
D
475 lines
10 KiB
D
///
|
|
module arsd.htmltotext;
|
|
|
|
import arsd.dom;
|
|
import arsd.color;
|
|
import std.string;
|
|
|
|
import std.uni : isWhite;
|
|
|
|
/++
	Renders a DOM tree as word-wrapped plain text.

	Output accumulates in the member `s`. The converter is stateful: `convert`
	does not clear previous output, so call `reset` before reusing an instance
	for an unrelated document.
+/
class HtmlConverter {
	/// Wrap column used by `sink`; overwritten by every call to the member `htmlToText`.
	int width;

	/++
		Recursively renders `element` and its descendants into `s`.

		Text nodes are fed to `sink` one code point at a time. Any other node is
		dispatched on its tag name; unknown tags simply render their children.

		Params:
			element = node to render
			preformatted = when true, whitespace is passed through instead of collapsed
			width = wrap column; stored in `this.width` before anything is emitted
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// renders every child with the same formatting settings as this element
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}

			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
					break;

				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trfixme":
					auto children = element.childElements;

					// A row without cells would otherwise divide by zero; a width of 0
					// sends it down the "too narrow" path below.
					auto tdWidth = children.length
						? (width - cast(int)(children.length)*3) / cast(int)(children.length)
						: 0;
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							// each cell is rendered by its own converter at the cell width
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								auto ob = block;
								if(bidx)
									s ~= " | ";
								if(block.length) {
									// emit this cell's next line and drop it from the pending text
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// ob.length < tdWidth guarantees the unsigned subtraction cannot wrap
								if(ob.length < tdWidth)
									foreach(a; 0 .. tdWidth - block.length)
										s ~= " ";
							}
							s ~= "\n";
						}

						// horizontal rule under the row
						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
					break;

				case "a":
					sinkChildren();
					string href = element.href;
					// Skip the bracketed URL when there is none (e.g. named anchors),
					// and when the link text already is the URL.
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
					break;

				case "span":
					/*
					auto csc = element.computedStyle.getValue("color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					sinkChildren();

					if(csc.length)
						s ~= "\033[39m";
					*/

					// The colour handling above is disabled, but the text inside
					// the span must still be rendered.
					sinkChildren();
					break;

				case "p":
					startBlock();
					sinkChildren();
					endBlock();
					break;

				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					sink('*', false);
					sinkChildren();
					sink('*', false);
					break;

				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
					break;

				case "ul":
					ulDepth++;
					sinkChildren();
					ulDepth--;
					break;

				case "ol":
					olDepth++;
					sinkChildren();
					olDepth--;
					break;

				case "li":
					startBlock();

					//sink('\t', true);
					sink(' ', true);
					sink(' ', true);
					// one bullet for each kind of list currently open
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
					break;

				case "h1", "h2":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline: one marker per code point of the heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h1" ? '=' : '-', false);
					endBlock();
					break;

				case "hr":
					startBlock();
					// a rule half the line wide, indented by a quarter of the line
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
					break;

				case "br":
					sink('\n', true);
					break;

				case "tr":
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
					break;

				case "pre":
					startBlock();
					// children of <pre> keep their whitespace
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
					break;

				default:
					sinkChildren();
			}
		}
	}

	/// Number of `<ol>` elements currently being rendered.
	int olDepth;
	/// Number of `<ul>` elements currently being rendered.
	int ulDepth;

	/++
		Parses `html` and renders it as text.

		The markup is wrapped in a `<roottag>` element before parsing. Rendering
		starts at the first `body` element if there is one, otherwise at the
		document root.

		Params:
			html = markup to convert
			wantWordWrap = accepted for interface compatibility; this method does not consult it and always wraps at `wrapAmount`
			wrapAmount = wrap column

		Returns: the accumulated output buffer `s`, including anything left over
		from earlier conversions that were not followed by `reset`.
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		htmlToText(start, false, wrapAmount);
		return s;
	}

	/// Discards the output and returns all formatting state to its initial values.
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;
		// These were previously left stale, so the first line after a reset could
		// wrap early and list bullets could leak from an interrupted conversion.
		lineLength = 0;
		olDepth = 0;
		ulDepth = 0;
	}

	/// Output buffer.
	string s;
	/// True while the last emitted character was whitespace; used to collapse runs.
	bool justOutputWhitespace = true;
	/// True until non-whitespace follows the last `startBlock` line break.
	bool justOutputBlock = true;
	/// True until non-whitespace follows the last margin line break.
	bool justOutputMargin = true;
	/// Length counter for the current output line, compared against the wrap column.
	int lineLength;

	/++
		Appends one character to the output, collapsing whitespace and wrapping lines.

		Params:
			item = character to emit
			preformatted = when false, any whitespace becomes a single space and repeated whitespace is dropped
			lineWidthOverride = wrap column for this call only; the default `int.min` means "use `this.width`"
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c = lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// turn that space into the line break
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					s ~= os[idx + 1 .. $];
					// counted in UTF-8 code units of the text carried to the new line
					lineLength = cast(int)(os[idx+1..$].length);
					broken = true;
					break;
				}
				c--;
				// give up instead of leaving a line shorter than a few characters
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space: hard break right after the character just written
				s ~= '\n';
				lineLength = 0;
				justOutputWhitespace = true;
			}
		}

		if(item == '\n')
			lineLength = 0;
		else
			lineLength ++;

		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/// Begins a block: emits a line break for each of the block and margin flags that is not already set.
	void startBlock() {
		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}

	/// Ends a block: emits a single line break unless a margin was just output.
	void endBlock() {
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}
}
|
|
|
|
/++
	Converts an HTML string to plain text using a fresh `HtmlConverter`.

	Params:
		html = markup to convert
		wantWordWrap = forwarded unchanged to `HtmlConverter.convert`
		wrapAmount = wrap column, forwarded to `HtmlConverter.convert`

	Returns: whatever `HtmlConverter.convert` returns for these arguments.
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	// Pass the caller's flag through; it used to be replaced by a hard-coded `true`.
	return converter.convert(html, wantWordWrap, wrapAmount);
}
|
|
|
|
/++
	Concatenates `num` copies of `s`.

	Params:
		s = text to repeat
		num = number of copies

	Returns: the repeated text; an empty (null) string when `num` is 0.
+/
string repeat(string s, ulong num) {
	string repeated;
	for(ulong remaining = num; remaining > 0; remaining--) {
		repeated ~= s;
	}
	return repeated;
}
|
|
|
|
import std.stdio;
|
|
// Disabled legacy converter, never compiled because of version(none).
//
// It rewrites the DOM in place: each handled element has its text decorated with
// control characters and is then stripped out of the tree. '\r' marks a paragraph
// break and '\v' a continuation indent; both are turned into real whitespace at
// the end.
//
// NOTE(review): the body uses `start`, `wantWordWrap` and `wrapAmount`, none of
// which are declared here, and it returns a string from a `void` function. It
// looks like the detached body of an older converter; it would need a proper
// signature before it could be re-enabled.
version(none)
void penis() {

	again:
	string result = "";
	foreach(ele; start.tree) {
		if(ele is start) continue;
		if(ele.nodeType != 1) continue;

		// Most cases modify the tree and therefore restart the walk from the top.
		switch(ele.tagName) {
			case "h1":
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("=", ele.innerText.length) ~ "\r";
				ele.stripOut();
				goto again;
			case "h2":
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("-", ele.innerText.length) ~ "\r";
				ele.stripOut();
				goto again;
			case "h3":
				ele.innerText = "\r" ~ ele.innerText.toUpper ~ "\r";
				ele.stripOut();
				goto again;
			case "td":
			case "p":
				/*
				if(ele.innerHTML.length > 1)
					ele.innerHTML = "\r" ~ wrap(ele.innerHTML) ~ "\r";
				ele.stripOut();
				goto again;
				*/
				break;
			case "a":
				string href = ele.getAttribute("href");
				if(href && !ele.hasClass("no-brackets")) {
					if(ele.hasClass("href-text"))
						ele.innerText = href;
					else {
						if(ele.innerText != href)
							ele.innerText = ele.innerText ~ " <" ~ href ~ "> ";
					}
				}
				ele.stripOut();
				goto again;
			case "ol":
			case "ul":
				ele.innerHTML = "\r" ~ ele.innerHTML ~ "\r";
				break;
			case "li":
				if(!ele.innerHTML.startsWith("* "))
					ele.innerHTML = "* " ~ ele.innerHTML ~ "\r";
				// ele.stripOut();
				break;
			case "sup":
				ele.innerText = "^" ~ ele.innerText;
				ele.stripOut();
				break;
			/*
			case "img":
				string alt = ele.getAttribute("alt");
				if(alt)
					result ~= ele.alt;
				break;
			*/
			default:
				ele.stripOut();
				goto again;
		}
	}

	again2:
	//start.innerHTML = start.innerHTML().replace("\u0001", "\n");

	// Second pass: table cells, paragraphs and list items, again restarting the
	// walk after every modification.
	foreach(ele; start.tree) {
		if(ele.tagName == "td") {
			if(ele.directText().strip().length) {
				ele.prependText("\r");
				ele.appendText("\r");
			}
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "p") {
			if(strip(ele.innerText()).length > 1) {
				string res = "";
				string all = ele.innerText().replace("\n \n", "\n\n");
				foreach(part; all.split("\n\n"))
					res ~= "\r" ~ strip( wantWordWrap ? wrap(part, /*74*/ wrapAmount) : part ) ~ "\r";
				ele.innerText = res;
			} else
				ele.innerText = strip(ele.innerText);
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "li") {
			auto part = ele.innerText;
			part = strip( wantWordWrap ? wrap(part, wrapAmount - 2) : part );
			part = " " ~ part.replace("\n", "\n\v") ~ "\r";
			ele.innerText = part;
			ele.stripOut();
			goto again2;
		}
	}

	result = start.innerText();
	result = squeeze(result, " ");

	result = result.replace("\r ", "\r");
	result = result.replace(" \r", "\r");

	//result = result.replace("\u00a0", " ");

	// collapse runs of paragraph markers, then turn each into a blank line
	result = squeeze(result, "\r");
	result = result.replace("\r", "\n\n");

	result = result.replace("\v", " ");

	// NOTE(review): this literal looks like a mis-decoded character entity for an
	// apostrophe; confirm the intended text before re-enabling.
	result = result.replace("舗", "'"); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	// The search text was an unterminated string literal here; restored as the
	// quote entity, which is what the replacement value implies.
	result = result.replace("&quot;", "\""); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	//result = htmlEntitiesDecode(result); // for special chars mainly

	result = result.replace("\u0001 ", "\n");
	result = result.replace("\u0001", "\n");

	//a = std.regex.replace(a, std.regex.regex("(\n\t)+", "g"), "\n"); //\t");
	return result.strip;
}
|