///
module arsd.htmltotext;

import arsd.dom;
import arsd.color;
import std.string;

import std.uni : isWhite;

/++
	Renders an HTML element tree as plain text.

	Output accumulates in [s]. Text is whitespace-collapsed and wrapped at
	[width] columns by [sink]; block-level tags are separated by blank lines
	via [startBlock] and [endBlock]. Call [reset] before reusing an instance
	for an unrelated document, since [convert] appends to whatever is
	already in [s].
+/
class HtmlConverter {
	/// Column at which [sink] wraps; overwritten by every [htmlToText] call.
	int width;

	/++
		Recursively appends the text rendering of `element` to [s].

		Params:
			element = node to render. Text nodes are fed to [sink] one
				character at a time; other elements are dispatched on
				their tag name.
			preformatted = when true, whitespace is passed through
				instead of being collapsed.
			width = wrap column; stored in `this.width` for [sink].
	+/
	void htmlToText(Element element, bool preformatted, int width) {
		this.width = width;
		if(auto tn = cast(TextNode) element) {
			foreach(dchar ch; tn.nodeValue) {
				sink(ch, preformatted);
			}
		} else {
			// Renders every child with the current formatting mode.
			void sinkChildren() {
				foreach(child; element.childNodes)
					htmlToText(child, preformatted, width);
			}
			switch(element.tagName) {
				case "head", "script", "style":
					// intentionally blank
				break;
				// The table stuff is removed right now because while it looks
				// ok for test tables, it isn't working well for the emails I have
				// - it handles data ok but not really nested layouts.
				case "trfixme":
					auto children = element.childElements;

					auto tdWidth = (width - cast(int)(children.length)*3) / cast(int)(children.length);
					if(tdWidth < 12) {
						// too narrow to be reasonable
						startBlock();
						sinkChildren();
						endBlock();
					} else {
						// Render each cell on its own at the column width,
						// then stitch the results together line by line.
						string[] tdBlocks;
						int longestBlock;
						foreach(child; children) {
							auto fmt = new HtmlConverter();

							fmt.htmlToText(child, false, tdWidth);
							tdBlocks ~= fmt.s;
							int lineCount = 1;
							foreach(ch; fmt.s)
								if(ch == '\n')
									lineCount++;
							if(lineCount > longestBlock)
								longestBlock = lineCount;
						}

						if(s.length && s[$-1] != '\n')
							s ~= '\n';
						foreach(lineNumber; 0 .. longestBlock) {
							foreach(bidx, ref block; tdBlocks) {
								auto ob = block;
								if(bidx)
									s ~= " | ";
								if(block.length) {
									auto idx = block.indexOf("\n");
									if(idx == -1)
										idx = block.length;

									s ~= block[0 .. idx];

									if(idx == block.length)
										block = block[$..$];
									else
										block = block[idx + 1 .. $];
								}

								// NOTE(review): the padding is computed from what is
								// left of the block, not from the line just written,
								// so columns will not line up; left as-is because
								// this case is disabled.
								if(ob.length < tdWidth)
								foreach(a; 0 .. tdWidth - block.length)
									s ~= " ";

							}
							s ~= "\n";
						}

						foreach(a; 0 .. children.length) {
							foreach(w; 0 .. tdWidth) {
								s ~= "-";
							}
							if(a +1 != children.length)
								s ~= "-+-";
						}
						s ~= "\n";
					}
				break;
				case "a":
					sinkChildren();
					auto href = element.href;
					// Only annotate real links: an anchor without a target
					// would otherwise leave a dangling " <>" in the output.
					if(href.length && href != element.innerText) {
						sink(' ', false);
						sink('<', false);
						// I want the link itself to NOT word wrap
						// to make for easier double-clicking of it in
						// the terminal
						foreach(dchar ch; href)
							sink(ch, false, int.max);
						sink('>', false);
					}
				break;
				case "span":
					// Color output is disabled, but the span's content must
					// still be written; only the escape sequences are skipped.
					/*
					auto csc = element.computedStyle.getValue("color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[38;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[39m";
					*/
				break;
				case "p":
					startBlock();
					sinkChildren();
					endBlock();
				break;
				case "b", "strong":
				case "em", "i":
					// Skip empty emphasis so no bare "**" is produced.
					if(element.innerText.length == 0)
						break;
					sink('*', false);
					sinkChildren();
					sink('*', false);
				break;
				case "u":
					if(element.innerText.length == 0)
						break;
					sink('_', false);
					sinkChildren();
					sink('_', false);
				break;
				case "ul":
					ulDepth++;
					sinkChildren();
					ulDepth--;
				break;
				case "ol":
					olDepth++;
					sinkChildren();
					olDepth--;
				break;
				case "li":
					startBlock();

					// The indent is sunk as preformatted so the leading
					// spaces are not collapsed away.
					//sink('\t', true);
					sink(' ', true);
					sink(' ', true);
					if(olDepth)
						sink('*', false);
					if(ulDepth)
						sink('*', false);
					sink(' ', true);

					sinkChildren();

					endBlock();
				break;

				case "h1", "h2":
					startBlock();
					sinkChildren();
					sink('\n', true);
					// Underline with one marker per character of heading text.
					foreach(dchar ch; element.innerText)
						sink(element.tagName == "h1" ? '=' : '-', false);
					endBlock();
				break;

				case "hr":
					// A rule half the wrap width, indented by a quarter of it.
					startBlock();
					foreach(i; 0 .. width / 4)
						sink(' ', true);
					foreach(i; 0 .. width / 2)
						sink('-', false);
					endBlock();
				break;

				case "br":
					sink('\n', true);
				break;
				case "tr":
				case "div":
					startBlock();

					/*
					auto csc = element.computedStyle.getValue("background-color");
					if(csc.length) {
						auto c = Color.fromString(csc);
						s ~= format("\033[48;2;%d;%d;%dm", c.r, c.g, c.b);
					}
					*/

					sinkChildren();

					/*
					if(csc.length)
						s ~= "\033[49m";
					*/

					endBlock();
				break;
				case "pre":
					startBlock();
					// Children of <pre> are always rendered preformatted.
					foreach(child; element.childNodes)
						htmlToText(child, true, width);
					endBlock();
				break;
				default:
					sinkChildren();
			}
		}
	}

	/// Current nesting depth of `<ol>` elements being rendered.
	int olDepth;
	/// Current nesting depth of `<ul>` elements being rendered.
	int ulDepth;

	/++
		Parses `html` and appends its text rendering to [s].

		The markup is wrapped in a synthetic `<roottag>` before parsing;
		rendering starts at the first `<body>` element if one exists,
		otherwise at the document root.

		Params:
			html = markup to convert.
			wantWordWrap = accepted for interface compatibility; this
				method does not consult it, wrapping always happens.
			wrapAmount = wrap column handed to [htmlToText].

		Returns: the accumulated contents of [s].
	+/
	string convert(string html, bool wantWordWrap = true, int wrapAmount = 74) {
		Document document = new Document;

		document.parse("<roottag>" ~ html ~ "</roottag>");

		Element start;
		auto bod = document.getElementsByTagName("body");
		if(bod.length)
			start = bod[0];
		else
			start = document.root;

		//import std.file;
		//auto stylesheet = new StyleSheet(readText("/var/www/dpldocs.info/experimental-docs/style.css"));
		//stylesheet.apply(document);

		htmlToText(start, false, wrapAmount);
		return s;
	}

	/++
		Discards accumulated output and returns all rendering state to
		its initial values, so the next conversion starts on a fresh line.
	+/
	void reset() {
		s = null;
		justOutputWhitespace = true;
		justOutputBlock = true;
		justOutputMargin = true;
		// Without these the first line after a reset would wrap early
		// and list markers could reflect a previous, aborted conversion.
		lineLength = 0;
		olDepth = 0;
		ulDepth = 0;
	}

	/// Accumulated output text.
	string s;
	/// True while the last thing written counts as whitespace; used to collapse runs.
	bool justOutputWhitespace = true;
	/// True when no visible text has been written since the last block start.
	bool justOutputBlock = true;
	/// True when no visible text has been written since the last block separator.
	bool justOutputMargin = true;
	/// Number of characters written on the current output line.
	int lineLength;

	/++
		Appends one character to [s], collapsing whitespace and wrapping.

		Params:
			item = character to write.
			preformatted = when false, any whitespace character becomes a
				single space and consecutive whitespace is dropped.
			lineWidthOverride = wrap column for this character only;
				`int.min` (the default) means use `this.width`.
	+/
	void sink(dchar item, bool preformatted, int lineWidthOverride = int.min) {
		int width = lineWidthOverride == int.min ? this.width : lineWidthOverride;
		if(!preformatted && isWhite(item)) {
			if(!justOutputWhitespace) {
				item = ' ';
				justOutputWhitespace = true;
			} else {
				return;
			}
		} else {
			// if it is preformatted, we still need to keep track of if it is whitespace
			// so stuff like <br> is somewhat sane
			justOutputWhitespace = preformatted && isWhite(item);
		}

		s ~= item;

		if(lineLength >= width) {
			// rewind to the nearest space, if there is one, to break on a word boundary
			int c =  lineLength;
			bool broken;
			foreach_reverse(idx, char ch; s) {
				if(ch == '\n')
					break;
				if(ch == ' ') {
					// Replace that space with the line break and carry the
					// trailing partial word over to the new line.
					auto os = s;
					s = os[0 .. idx];
					s ~= '\n';
					s ~= os[idx + 1 .. $];
					lineLength = cast(int)(os[idx+1..$].length);
					broken = true;
					break;
				}
				c--;
				// Stop searching once fewer than 5 columns would remain.
				if(c < 5)
					break;
			}

			if(!broken) {
				// No usable space found: hard-break after this character.
				s ~= '\n';
				lineLength = 0;
				justOutputWhitespace = true;
			}

		}


		if(item == '\n')
			lineLength = 0;
		else
			lineLength ++;


		if(!justOutputWhitespace) {
			justOutputBlock = false;
			justOutputMargin = false;
		}
	}

	/++
		Begins a block: ends the current line and adds a separating line
		break, each only if visible text was written since the flag was
		last set.
	+/
	void startBlock() {
		if(!justOutputBlock) {
			s ~= "\n";
			lineLength = 0;
			justOutputBlock = true;
		}
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}

	/// Ends a block by terminating the current line if visible text was written.
	void endBlock() {
		if(!justOutputMargin) {
			s ~= "\n";
			lineLength = 0;
			justOutputMargin = true;
		}
	}
}

/++
	Converts an HTML string to plain text using a fresh [HtmlConverter].

	Params:
		html = markup to convert.
		wantWordWrap = forwarded to `HtmlConverter.convert`.
		wrapAmount = wrap column, forwarded to `HtmlConverter.convert`.

	Returns: the text produced by the converter.
+/
string htmlToText(string html, bool wantWordWrap = true, int wrapAmount = 74) {
	auto converter = new HtmlConverter();
	// Pass the caller's flag through rather than a hard-coded `true`.
	return converter.convert(html, wantWordWrap, wrapAmount);
}

/// Returns `s` concatenated with itself `num` times; empty when `num` is zero.
string repeat(string s, ulong num) {
	string joined;
	for(ulong remaining = num; remaining > 0; remaining--)
		joined ~= s;
	return joined;
}

import std.stdio;
// Legacy implementation that converts by rewriting the DOM in place and then
// reading back the text. It is compiled out by `version(none)`.
// NOTE(review): the body uses `start`, `wantWordWrap` and `wrapAmount`, none
// of which are declared in this function, and ends with `return result.strip`
// despite the `void` return type — presumably it was cut from a function that
// took those as parameters and returned a string; confirm before reviving.
//
// Throughout, "\r" is used as a placeholder that is later turned into a blank
// line ("\n\n"), and "\v" as a placeholder later turned into two spaces.
version(none)
void penis() {

    // First pass: rewrite inline/heading/list markup into text. Every
    // `goto again` restarts the traversal from the top after a change.
    again:
    	string result = "";
	foreach(ele; start.tree) {
		if(ele is start) continue;
		// Only nodes whose nodeType is 1 are processed.
		if(ele.nodeType != 1) continue;

		switch(ele.tagName) {
				// NOTE(review): this statement sits before the first case
				// label, so no case reaches it.
				goto again;
			case "h1":
				// Heading text followed by a line of '=' of the same length.
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("=", ele.innerText.length) ~ "\r";
				// stripOut presumably replaces the element with its children — confirm against arsd.dom.
				ele.stripOut();
				goto again;
			case "h2":
				// Same as h1 but underlined with '-'.
				ele.innerText = "\r" ~ ele.innerText ~ "\n" ~ repeat("-", ele.innerText.length) ~ "\r";
				ele.stripOut();
				goto again;
			case "h3":
				// Third-level headings are upper-cased instead of underlined.
				ele.innerText = "\r" ~ ele.innerText.toUpper ~ "\r";
				ele.stripOut();
				goto again;
			case "td":
			case "p":
			// Left for the second pass below.
			/*
				if(ele.innerHTML.length > 1)
					ele.innerHTML = "\r" ~ wrap(ele.innerHTML) ~ "\r";
				ele.stripOut();
				goto again;
			*/
			break;
			case "a":
				// Links become "text <href> ", unless the class list asks for
				// no brackets ("no-brackets") or for the href alone ("href-text").
				string href = ele.getAttribute("href");
				if(href && !ele.hasClass("no-brackets")) {
					if(ele.hasClass("href-text"))
						ele.innerText = href;
					else {
						if(ele.innerText != href)
							ele.innerText = ele.innerText ~ " <" ~ href ~ "> ";
					}
				}
				ele.stripOut();
				goto again;
			case "ol":
			case "ul":
				// Surround the list with paragraph-break placeholders.
				ele.innerHTML = "\r" ~ ele.innerHTML ~ "\r";
			break;
			case "li":
				// Prefix items with "* " once; the check avoids doing it again.
				if(!ele.innerHTML.startsWith("* "))
					ele.innerHTML = "* " ~ ele.innerHTML ~ "\r";
				// ele.stripOut();
			break;
			case "sup":
				ele.innerText = "^" ~ ele.innerText;
				ele.stripOut();
			break;
			/*
			case "img":
				string alt = ele.getAttribute("alt");
				if(alt)
					result ~= ele.alt;
			break;
			*/
			default:
				// Any other element is stripped and the traversal restarted.
				ele.stripOut();
				goto again;
		}
	}

    // Second pass: handle td, p and li, again restarting after each change.
    again2:
	//start.innerHTML = start.innerHTML().replace("\u0001", "\n");

	foreach(ele; start.tree) {
		if(ele.tagName == "td") {
			// Cells with non-blank direct text get break placeholders around them.
			if(ele.directText().strip().length) {
				ele.prependText("\r");
				ele.appendText("\r");
			}
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "p") {
			if(strip(ele.innerText()).length > 1) {
				// Split on blank lines and wrap each part separately.
				string res = "";
				string all = ele.innerText().replace("\n \n", "\n\n");
				foreach(part; all.split("\n\n"))
					res ~= "\r" ~ strip( wantWordWrap ? wrap(part, /*74*/ wrapAmount) : part ) ~ "\r";
				ele.innerText = res;
			} else
				ele.innerText = strip(ele.innerText);
			ele.stripOut();
			goto again2;
		} else if(ele.tagName == "li") {
			// Wrap two columns narrower, then mark continuation lines with
			// "\v" so they receive a two-space indent further down.
			auto part = ele.innerText;
			part = strip( wantWordWrap ? wrap(part, wrapAmount - 2) : part );
			part = "  " ~ part.replace("\n", "\n\v") ~ "\r";
			ele.innerText = part;
			ele.stripOut();
			goto again2;
		}
	}

	// Final clean-up on the flattened text.
	result = start.innerText();
	result = squeeze(result, " ");

	// Drop spaces adjacent to break placeholders.
	result = result.replace("\r ", "\r");
	result = result.replace(" \r", "\r");

	//result = result.replace("\u00a0", " ");


	// Collapse runs of placeholders, then expand each into a blank line.
	result = squeeze(result, "\r");
	result = result.replace("\r", "\n\n");

	result = result.replace("\v", "  ");

	result = result.replace("&#33303;", "'"); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	result = result.replace("&quot;", "\""); // HACK: this shouldn't be needed, but apparently is in practice surely due to a bug elsewhere
	//result = htmlEntitiesDecode(result);  // for special chars mainly

	result = result.replace("\u0001 ", "\n");
	result = result.replace("\u0001", "\n");

	//a = std.regex.replace(a, std.regex.regex("(\n\t)+", "g"), "\n"); //\t");
	return result.strip;
}