/++
	Converts HTML into plain text, word-wrapped to a configurable width.

	The main entry points are the free function [htmlToText] and the
	[HtmlConverter] class it is built on.
+/
module arsd.htmltotext;

import arsd.dom;
import arsd.color;
import std.string;

import std.uni : isWhite;

/++
	Walks a DOM tree and accumulates a plain-text rendering of it in [s].

	All output state (the text, the current line length and the
	"just output ..." flags) lives on the instance. [convert] does not clear
	that state, so call [reset] before reusing a converter for an unrelated
	document.
+/
class HtmlConverter {
	/// Column at which [sink] wraps lines; set on every [htmlToText] call.
	int width;

	/++
		Appends the text rendering of `element` and its descendants to [s].

		Params:
			element = node to render; text nodes are emitted character by
				character, other nodes are dispatched on their tag name
			preformatted = when true, whitespace is passed through instead of
				being collapsed
			width = wrap column, stored into `this.width`
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// renders every child with the same settings as this element
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}

			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trfixme":
					auto children = element.childElements;
					// A row without child elements used to divide by zero here;
					// treat it as "too narrow" so it renders as a plain block.
					int tdWidth = 0;
					if(children.length)
						tdWidth = (width - cast(int)(children.length)*3) / cast(int)(children.length);
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						// tdWidth is at least 12 here, so the cast is safe
						immutable size_t cellWidth = cast(size_t) tdWidth;

						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							// each cell is rendered on its own into a column of tdWidth
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								// length of the text written into this cell on this row
								size_t written = 0;
								if(bidx)
									s ~= " | ";
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];
									written = idx;

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// Pad the cell out to the column width. This used to
								// pad by `tdWidth - block.length`, i.e. by the length
								// of the text still *remaining* in the cell, which
								// misaligned the columns.
								if(written < cellWidth)
									foreach(a; 0 .. cellWidth - written)
										s ~= " ";
							}
							s ~= "\n";
						}

						// separator line under the row
						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "a":
					sinkChildren();
					string href = element.href;
					// Skip the URL when there is none (previously this emitted
					// an empty " <>") or when the link text already is the URL.
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					// NOTE: with the colour handling commented out, nothing
					// inside a <span> is emitted at all.
					/*
					auto csc = element.computedStyle.getValue("color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					sinkChildren();

					if(csc.length)
						s ~= "\033[39m";
					*/
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					if(element.innerText.length == 0)
						break;
					sink('*', false);
					sinkChildren();
					sink('*', false);
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					sinkChildren();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					sinkChildren();
					olDepth--;
				break;
				case "li":
					startBlock();

					//sink('\t', true);
					sink(' ', true);
					sink(' ', true);
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "h1", "h2":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// underline: one marker per character of the heading text
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h1" ? '=' : '-', false);
					endBlock();
				break;

				case "hr":
					startBlock();
					// a rule half the width of the text, indented by a quarter
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "tr":
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock();
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Current nesting depth of `<ol>` elements during traversal.
	int olDepth;
	/// Current nesting depth of `<ul>` elements during traversal.
	int ulDepth;

	/++
		Parses `html` and returns its text rendering.

		Rendering starts at the first `<body>` element if there is one and at
		the document root otherwise. Output is appended to whatever [s]
		already holds.

		Params:
			html = markup to convert
			wantWordWrap = accepted for API compatibility; this method does
				not currently read it, output is always wrapped
			wrapAmount = wrap column passed to [htmlToText]

		Returns: the accumulated text, [s]
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		// NOTE(review): in the copy under review the wrapper literals were
		// empty (`"" ~ html ~ ""`), which looks like tag-stripping damage in
		// the same way as the entity literals further down. The synthetic
		// root element is reconstructed here - confirm the tag name against
		// the upstream file.
		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Clears all output state so the converter can be reused.

		Besides the text and the "just output ..." flags this also zeroes
		[lineLength] and the list depths; leaving [lineLength] stale made the
		first line after a reset wrap at the wrong column.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;
		lineLength = 0;
		olDepth = 0;
		ulDepth = 0;
	}

	/// The text produced so far.
	string s;
	/// True while the most recent output was (collapsed) whitespace.
	bool justOutputWhitespace = true;
	/// True while nothing but whitespace was output since the last block start.
	bool justOutputBlock = true;
	/// True while nothing but whitespace was output since the last block margin.
	bool justOutputMargin = true;
	/// Length counter for the line currently being written.
	int lineLength;

	/++
		Appends one character to [s], collapsing whitespace and wrapping lines.

		Params:
			item = character to output
			preformatted = when false, any whitespace character becomes a
				single space and runs of whitespace are dropped
			lineWidthOverride = wrap column for this character only; the
				default `int.min` means "use `this.width`"
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c = lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// replace that space with a newline; what follows it
					// becomes the start of the new line
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					s ~= os[idx + 1 .. $];
					lineLength = cast(int)(os[idx+1..$].length);
					broken = true;
					break;
				}
				c--;
				// give up after scanning back to within 5 of the line start
				if(c < 5)
					break;
			}

			if(!broken) {
				// no usable space found: hard break after the character
				s ~= '\n';
				lineLength = 0;
				justOutputWhitespace = true;
			}
		}

		if(item == '\n')
			lineLength = 0;
		else
			lineLength ++;

		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/// Begins a block: emits up to two newlines unless they were just emitted.
	void startBlock() {
		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}

	/// Ends a block: emits one newline unless a margin was just emitted.
	void endBlock() {
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}
}

/++
	Converts `html` to plain text using a fresh [HtmlConverter].

	Params:
		html = markup to convert
		wantWordWrap = forwarded to [HtmlConverter.convert] (it used to be
			replaced by a hard-coded `true`); that method does not currently
			act on it, so the output is wrapped either way
		wrapAmount = column at which lines are wrapped

	Returns: the text rendering of `html`
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	return converter.convert(html, wantWordWrap, wrapAmount);
}

/// Returns `s` concatenated with itself `num` times (empty for `num == 0`).
string repeat(string s, ulong num) {
	string ret;
	foreach(i; 0 .. num)
		ret ~= s;
	return ret;
}

import std.stdio;

// Old string-rewriting implementation, kept for reference only. It sits under
// version(none), so it is never compiled; it refers to `start`, `wantWordWrap`
// and `wrapAmount`, none of which are declared in this function.
version(none)
void penis() {

    again:
	string result = "";
	foreach(ele; start.tree) {
		if(ele is start) continue;
		if(ele.nodeType != 1) continue;

		switch(ele.tagName) {
				goto again; // sits before the first case label, so it is never executed
			case "h1":
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("=", ele.innerText.length) ~ "\r";
				ele.stripOut();
				goto again;
			case "h2":
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("-", ele.innerText.length) ~ "\r";
				ele.stripOut();
				goto again;
			case "h3":
				ele.innerText = "\r" ~ ele.innerText.toUpper ~ "\r";
				ele.stripOut();
				goto again;
			case "td":
			case "p":
			/*
				if(ele.innerHTML.length > 1)
					ele.innerHTML = "\r" ~ wrap(ele.innerHTML) ~ "\r";
				ele.stripOut();
				goto again;
			*/
			break;
			case "a":
				string href = ele.getAttribute("href");
				if(href && !ele.hasClass("no-brackets")) {
					if(ele.hasClass("href-text"))
						ele.innerText = href;
					else {
						if(ele.innerText != href)
							ele.innerText = ele.innerText ~ " <" ~ href ~ "> ";
					}
				}
				ele.stripOut();
				goto again;
			case "ol":
			case "ul":
				ele.innerHTML = "\r" ~ ele.innerHTML ~ "\r";
			break;
			case "li":
				if(!ele.innerHTML.startsWith("* "))
					ele.innerHTML = "* " ~ ele.innerHTML ~ "\r";
				// ele.stripOut();
			break;
			case "sup":
				ele.innerText = "^" ~ ele.innerText;
				ele.stripOut();
			break;
			/*
			case "img":
				string alt = ele.getAttribute("alt");
				if(alt)
					result ~= ele.alt;
			break;
			*/
			default:
				ele.stripOut();
				goto again;
		}
	}

    again2:
	//start.innerHTML = start.innerHTML().replace("\u0001", "\n");

	foreach(ele; start.tree) {
		if(ele.tagName == "td") {
			if(ele.directText().strip().length) {
				ele.prependText("\r");
				ele.appendText("\r");
			}
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "p") {
			if(strip(ele.innerText()).length > 1) {
				string res = "";
				string all = ele.innerText().replace("\n \n", "\n\n");
				foreach(part; all.split("\n\n"))
					res ~= "\r" ~ strip( wantWordWrap ? wrap(part, /*74*/ wrapAmount) : part ) ~ "\r";
				ele.innerText = res;
			} else
				ele.innerText = strip(ele.innerText);
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "li") {
			auto part = ele.innerText;
			part = strip( wantWordWrap ? wrap(part, wrapAmount - 2) : part );
			part = " " ~ part.replace("\n", "\n\v") ~ "\r";
			ele.innerText = part;
			ele.stripOut();
			goto again2;
		}
	}

	result = start.innerText();
	result = squeeze(result, " ");

	result = result.replace("\r ", "\r");
	result = result.replace(" \r", "\r");

	//result = result.replace("\u00a0", " ");

	result = squeeze(result, "\r");
	result = result.replace("\r", "\n\n");

	result = result.replace("\v", " ");

	// NOTE(review): the two search strings below had been entity-decoded in the
	// copy under review (the second one no longer lexed as a string literal).
	// They are restored to the entity text; U+8217 is code point 33303.
	result = result.replace("&#33303;", "'"); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	result = result.replace("&quot;", "\""); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	//result = htmlEntitiesDecode(result);  // for special chars mainly

	result = result.replace("\u0001 ", "\n");
	result = result.replace("\u0001", "\n");

	//a = std.regex.replace(a, std.regex.regex("(\n\t)+", "g"), "\n"); //\t");
	return result.strip;
}