From f3f5c6e12fcec6785d69a04a52928b7647635ef3 Mon Sep 17 00:00:00 2001 From: "Adam D. Ruppe" Date: Tue, 29 Mar 2016 08:39:23 -0400 Subject: [PATCH] rewrite --- htmltotext.d | 364 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 327 insertions(+), 37 deletions(-) diff --git a/htmltotext.d b/htmltotext.d index 6f9e7e8..2c7bb38 100644 --- a/htmltotext.d +++ b/htmltotext.d @@ -2,8 +2,332 @@ module arsd.htmltotext; import arsd.dom; +import arsd.color; import std.string; -import std.array : replace; + +import std.uni : isWhite; + +class HtmlConverter { + int width; + + void htmlToText(Element element, bool preformatted, int width) { + this.width = width; + if(auto tn = cast(TextNode) element) { + foreach(dchar ch; tn.nodeValue) { + sink(ch, preformatted); + } + } else { + void sinkChildren() { + foreach(child; element.childNodes) + htmlToText(child, preformatted, width); + } + switch(element.tagName) { + case "head", "script", "style": + // intentionally blank + break; + case "trfixme": + auto children = element.childElements; + + auto tdWidth = (width - cast(int)(children.length)*3) / cast(int)(children.length); + if(tdWidth < 12) { + // too narrow to be reasonable + startBlock(); + sinkChildren(); + endBlock(); + } else { + string[] tdBlocks; + int longestBlock; + foreach(child; children) { + auto fmt = new HtmlConverter(); + + fmt.htmlToText(child, false, tdWidth); + tdBlocks ~= fmt.s; + int lineCount = 1; + foreach(ch; fmt.s) + if(ch == '\n') + lineCount++; + if(lineCount > longestBlock) + longestBlock = lineCount; + } + + if(s.length && s[$-1] != '\n') + s ~= '\n'; + foreach(lineNumber; 0 .. longestBlock) { + foreach(bidx, ref block; tdBlocks) { + auto ob = block; + if(bidx) + s ~= " | "; + if(block.length) { + auto idx = block.indexOf("\n"); + if(idx == -1) + idx = block.length; + + s ~= block[0 .. idx]; + + if(idx == block.length) + block = block[$..$]; + else + block = block[idx + 1 .. $]; + } + + if(ob.length < tdWidth) + foreach(a; 0 .. tdWidth - block.length) + s ~= " "; + + } + s ~= "\n"; + } + + foreach(a; 0 .. children.length) { + foreach(w; 0 .. tdWidth) { + s ~= "-"; + } + if(a +1 != children.length) + s ~= "-+-"; + } + s ~= "\n"; + } + break; + case "a": + sinkChildren(); + if(element.href != element.innerText) { + sink(' ', false); + sink('<', false); + foreach(dchar ch; element.href) + sink(ch, false); + sink('>', false); + } + break; + case "span": + auto csc = element.computedStyle.getValue("color"); + if(csc.length) { + auto c = Color.fromString(csc); + s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b); + } + sinkChildren(); + + if(csc.length) + s ~= "\033[39m"; + break; + case "p": + startBlock(); + sinkChildren(); + endBlock(); + break; + case "b", "strong": + case "em", "i": + if(element.innerText.length == 0) + break; + sink('*', false); + sinkChildren(); + sink('*', false); + break; + case "u": + if(element.innerText.length == 0) + break; + sink('_', false); + sinkChildren(); + sink('_', false); + break; + case "ul": + ulDepth++; + sinkChildren(); + ulDepth--; + break; + case "ol": + olDepth++; + sinkChildren(); + olDepth--; + break; + case "li": + startBlock(); + + //sink('\t', true); + sink(' ', true); + sink(' ', true); + if(olDepth) + sink('*', false); + if(ulDepth) + sink('*', false); + sink(' ', true); + + sinkChildren(); + + endBlock(); + break; + + case "h1", "h2": + startBlock(); + sinkChildren(); + sink('\n', true); + foreach(dchar ch; element.innerText) + sink(element.tagName == "h1" ? '=' : '-', false); + endBlock(); + break; + + case "hr": + startBlock(); + foreach(i; 0 .. width / 4) + sink(' ', true); + foreach(i; 0 .. width / 2) + sink('-', false); + endBlock(); + break; + + case "br": + sink('\n', true); + break; + case "div": + startBlock(); + + /* + auto csc = element.computedStyle.getValue("background-color"); + if(csc.length) { + auto c = Color.fromString(csc); + s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b); + } + */ + + sinkChildren(); + + /* + if(csc.length) + s ~= "\033[49m"; + */ + + endBlock(); + break; + case "pre": + startBlock(); + foreach(child; element.childNodes) + htmlToText(child, true, width); + endBlock(); + break; + default: + sinkChildren(); + } + } + } + + int olDepth; + int ulDepth; + + string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) { + Document document = new Document; + + document.parse("" ~ html ~ ""); + + Element start; + auto bod = document.getElementsByTagName("body"); + if(bod.length) + start = bod[0]; + else + start = document.root; + + import std.file; + auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css")); + + stylesheet.apply(document); + + htmlToText(start, false, wrapAmount); + return s; + } + + void reset() { + s = null; + justOutputWhitespace = true; + justOutputBlock = true; + justOutputMargin = true; + } + + string s; + bool justOutputWhitespace = true; + bool justOutputBlock = true; + bool justOutputMargin = true; + int lineLength; + + void sink(dchar item, bool preformatted) { + if(!preformatted && isWhite(item)) { + if(!justOutputWhitespace) { + item = ' '; + justOutputWhitespace = true; + } else { + return; + } + } else { + // if it is preformatted, we still need to keep track of if it is whitespace + // so stuff like
is somewhat sane + justOutputWhitespace = preformatted && isWhite(item); + } + + s ~= item; + + if(lineLength >= width) { + // rewind to the nearest space, if there is one, to break on a word boundary + int c = lineLength; + bool broken; + foreach_reverse(idx, char ch; s) { + if(ch == '\n') + break; + if(ch == ' ') { + auto os = s; + s = os[0 .. idx]; + s ~= '\n'; + s ~= os[idx + 1 .. $]; + lineLength = os[idx+1..$].length; + broken = true; + break; + } + c--; + if(c < 5) + break; + } + + if(!broken) { + s ~= '\n'; + lineLength = 0; + justOutputWhitespace = true; + } + + } + + + if(item == '\n') + lineLength = 0; + else + lineLength ++; + + + if(!justOutputWhitespace) { + justOutputBlock = false; + justOutputMargin = false; + } + } + void startBlock() { + if(!justOutputBlock) { + s ~= "\n"; + lineLength = 0; + justOutputBlock = true; + } + if(!justOutputMargin) { + s ~= "\n"; + lineLength = 0; + justOutputMargin = true; + } + } + void endBlock() { + if(!justOutputMargin) { + s ~= "\n"; + lineLength = 0; + justOutputMargin = true; + } + } +} + +string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) { + auto converter = new HtmlConverter(); + return converter.convert(html, true, wrapAmount); +} string repeat(string s, ulong num) { string ret; @@ -13,29 +337,8 @@ string repeat(string s, ulong num) { } import std.stdio; - -static import std.regex; -string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) { - Document document = new Document; - - - html = html.replace(" ", " "); - html = html.replace(" ", " "); - html = html.replace(" ", " "); - html = html.replace("\n", " "); - html = html.replace("\r", " "); - html = std.regex.replace(html, std.regex.regex("[\n\r\t \u00a0]+", "gm"), " "); - - document.parse("" ~ html ~ ""); - - Element start; - auto bod = document.getElementsByTagName("body"); - if(bod.length) - start = bod[0]; - else - start = document.root; - - start.innerHTML = start.innerHTML().replace("
", "\u0001"); +version(none) +void penis() { again: string result = ""; @@ -44,19 +347,6 @@ string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) { if(ele.nodeType != 1) continue; switch(ele.tagName) { - case "b": - case "strong": - ele.innerText = "*" ~ ele.innerText ~ "*"; - ele.stripOut(); - goto again; - case "i": - case "em": - ele.innerText = "/" ~ ele.innerText ~ "/"; - ele.stripOut(); - goto again; - case "u": - ele.innerText = "_" ~ ele.innerText ~ "_"; - ele.stripOut(); goto again; case "h1": ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("=", ele.innerText.length) ~ "\r";