add Vladimir Panteleev's DustMite

2025-04-28 06:00:37 +03:00 · 2013-02-18 19:43:13 -08:00 · 2013-02-18 19:43:13 -08:00 · fc3ac480c8
commit fc3ac480c8
parent 296c490c7c
3 changed files with 1650 additions and 0 deletions
--- a/DustMite/README
+++ b/DustMite/README
@ -0,0 +1,6 @@
 This is DustMite, a D source code minimization tool.
 For documentation, see the GitHub wiki:
 https://github.com/CyberShadow/DustMite/wiki
 DustMite was created by Vladimir Panteleev
 and is released into the Public Domain.
--- a/DustMite/dsplit.d
+++ b/DustMite/dsplit.d
@ -0,0 +1,528 @@
 /// Very simplistic D source code "parser"
 /// Written by Vladimir Panteleev <vladimir@thecybershadow.net>
 /// Released into the Public Domain
 module dsplit;
 import std.file;
 import std.path;
 import std.string;
 import std.ascii;
 import std.array;
 debug import std.stdio;
 /// One node of the tree produced by the splitters in this module:
 /// some leading text, a list of nested entities, and some trailing text.
 class Entity
 {
 	string head;           /// text that precedes the children
 	Entity[] children;     /// nested entities
 	string tail;           /// text that follows the children
 	string filename;       /// non-empty for entities created from a file
 	string contents;       /// file contents as read by loadFile
 	bool isPair;           /// internal hint
 	bool noRemove;         /// don't try removing this entity (children OK)
 	bool removed;          /// For dangling dependencies
 	Entity[] dependencies;
 	int id;                /// For diagnostics
 	size_t descendants;    /// For progress display
 	/// Creates an entity from its parts. Every argument is optional;
 	/// the remaining fields keep their default values.
 	this(string head = null, Entity[] children = null, string tail = null, string filename = null, bool isPair = false)
 	{
 		this.isPair   = isPair;
 		this.filename = filename;
 		this.tail     = tail;
 		this.children = children;
 		this.head     = head;
 	}
 	/// True when this entity carries a (non-empty) file name.
 	@property bool isFile()
 	{
 		return filename.length != 0;
 	}
 }
 /// Options that control how loadFile splits a file into entities.
 struct ParseOptions
 {
 	/// Splitting strategy selected in loadFile.
 	enum Mode
 	{
 		Source, /// .d/.di files go through parseD, other files become a single entity
 		Words   /// every file goes through parseToWords
 	}
 	bool stripComments; /// run stripDComments on .d/.di files before splitting
 	Mode mode;          /// splitting strategy; the default is Mode.Source
 }
 /// Loads the file or directory at path and returns the root entity.
 ///
 /// If path names a file, that single file is loaded (named by its base
 /// name) and path is rewritten, through the ref parameter, to the same
 /// path without its extension.
 /// Otherwise path is walked recursively and the returned entity has one
 /// child per file found, each named by its path relative to path.
 Entity loadFiles(ref string path, ParseOptions options)
 {
 	if (isFile(path))
 	{
 		auto filePath = path;
 		path = stripExtension(path);
 		return loadFile(baseName(filePath).replace(`\`, `/`), filePath, options);
 	}
 	else
 	{
 		auto set = new Entity();
 		foreach (string entry; dirEntries(path, SpanMode.breadth))
 			if (isFile(entry))
 			{
 				assert(entry.startsWith(path));
 				// Drop the directory prefix, then whatever separators follow it.
 				// Skipping exactly one character (path.length+1) would eat the
 				// first character of the name when path already ends with a
 				// separator, e.g. "dir/".
 				auto name = entry[path.length..$];
 				while (name.length && isDirSeparator(name[0]))
 					name = name[1..$];
 				set.children ~= loadFile(name, entry, options);
 			}
 		return set;
 	}
 }
 /// Bin size handed to the clustering step of optimize; clustering of a
 /// child list repeats for as long as the list is longer than this.
 enum BIN_SIZE = 2;
 /// Restructures the tree below set, bottom-up, by repeatedly wrapping runs
 /// of sibling entities into new intermediate entities.
 void optimize(Entity set)
 {
 	// Replaces nodes[from..to] with one new entity owning a copy of that range.
 	static void wrapRange(ref Entity[] nodes, size_t from, size_t to)
 	{
 		auto wrapper = new Entity(null, nodes[from..to].dup, null);
 		nodes.replaceInPlace(from, to, [wrapper]);
 	}
 	// Groups nodes into chunks until no more than binSize of them are left.
 	static void clusterBy(ref Entity[] nodes, size_t binSize)
 	{
 		while (nodes.length > binSize)
 		{
 			size_t chunk;
 			if (nodes.length >= binSize*2)
 				chunk = binSize;
 			else
 				chunk = (nodes.length+1) / 2;
 			immutable fullChunks = nodes.length / chunk;
 			// A leftover of two or more nodes becomes its own group; it is
 			// wrapped first, and the full chunks are then wrapped back to
 			// front, so the indices still to be processed stay valid.
 			if (nodes.length % chunk > 1)
 				wrapRange(nodes, fullChunks*chunk, nodes.length);
 			for (size_t k = fullChunks; k-- > 0; )
 				wrapRange(nodes, k*chunk, (k+1)*chunk);
 		}
 	}
 	// Children are clustered before their parent's own child list.
 	static void visit(Entity e)
 	{
 		foreach (child; e.children)
 			visit(child);
 		clusterBy(e.children, BIN_SIZE);
 	}
 	visit(set);
 }
 private:
 /// Reads the file at path and returns an entity for it.
 ///
 /// The entity's filename is name with backslashes turned into forward
 /// slashes, its contents are the file's bytes (after comment stripping, if
 /// requested and the extension is .d or .di), and its children are the
 /// result of the splitter selected by options.mode.
 Entity loadFile(string name, string path, ParseOptions options)
 {
 	debug writeln("Loading ", path);
 	auto ext = extension(path);
 	immutable isDSource = ext == ".d" || ext == ".di";
 	auto file = new Entity();
 	file.filename = name.replace(`\`, `/`);
 	file.contents = cast(string)read(path);
 	if (options.stripComments && isDSource)
 		file.contents = stripDComments(file.contents);
 	final switch (options.mode)
 	{
 	case ParseOptions.Mode.Source:
 		if (isDSource)
 		{
 			file.children = parseD(file.contents);
 		}
 		else
 		{
 			// One could add custom splitters for other languages here - for example, a simple line/word/character splitter for most text-based formats
 			file.children = [new Entity(file.contents, null, null)];
 		}
 		break;
 	case ParseOptions.Mode.Words:
 		file.children = parseToWords(file.contents);
 		break;
 	}
 	return file;
 }
 /// Advances i past the lexical symbol that starts at s[i] and returns the
 /// skipped text. i must be a valid index into s on entry.
 ///
 /// Recognized symbols:
 /// $(UL
 ///   $(LI character literals ('x', '\n'))
 ///   $(LI a backslash together with the character after it)
 ///   $(LI double-quoted strings; when the opening quote is directly preceded
 ///        by 'r' the string is treated as raw (no escapes))
 ///   $(LI backtick strings)
 ///   $(LI // line comments (up to, not including, the line break),
 ///        /* */ block comments and nesting /+ +/ comments)
 ///   $(LI any other single character, including a lone '/')
 /// )
 /// An unterminated literal or comment, or a backslash at the very end,
 /// extends to the end of s instead of indexing out of bounds.
 string skipSymbol(string s, ref size_t i)
 {
 	immutable start = i;
 	immutable len = s.length;
 	switch (s[i])
 	{
 	case '\'':
 		i++;
 		if (i < len && s[i] == '\\')
 			i+=2;
 		while (i < len && s[i] != '\'')
 			i++;
 		i++;
 		break;
 	case '\\':
 		i+=2;
 		break;
 	case '"':
 		if (i && s[i-1] == 'r')
 		{
 			// raw string: backslashes are ordinary characters
 			i++;
 			while (i < len && s[i] != '"')
 				i++;
 			i++;
 		}
 		else
 		{
 			i++;
 			while (i < len && s[i] != '"')
 			{
 				if (s[i] == '\\')
 					i+=2;
 				else
 					i++;
 			}
 			i++;
 		}
 		break;
 	case '`':
 		i++;
 		while (i < len && s[i] != '`')
 			i++;
 		i++;
 		break;
 	case '/':
 		i++;
 		if (i==len)
 			break;
 		else
 		if (s[i] == '/')
 		{
 			while (i < len && s[i] != '\r' && s[i] != '\n')
 				i++;
 		}
 		else
 		if (s[i] == '*')
 		{
 			// The shortest block comment is "/**/", so the first position
 			// at which it can end is 3 characters after the '*'.
 			i+=3;
 			while (i <= len && (s[i-2] != '*' || s[i-1] != '/'))
 				i++;
 		}
 		else
 		if (s[i] == '+')
 		{
 			i++;
 			int commentLevel = 1;
 			while (commentLevel && i < len)
 			{
 				immutable hasNext = i+1 < len;
 				if (hasNext && s[i] == '/' && s[i+1]=='+')
 				{
 					commentLevel++;
 					i+=2;
 				}
 				else
 				if (hasNext && s[i] == '+' && s[i+1]=='/')
 				{
 					commentLevel--;
 					i+=2;
 				}
 				else
 					i++;
 			}
 		}
 		// Otherwise this is a lone '/': it is a symbol of its own. The next
 		// character must stay visible to the caller - it may be a bracket
 		// or a quote, as in "a/(b)".
 		break;
 	default:
 		i++;
 		break;
 	}
 	// Unterminated constructs may have stepped past the end; clamp.
 	if (i > len)
 		i = len;
 	return s[start..i];
 }
 /// Advances i over whitespace and // comments. Stops right after the first
 /// run of line-break characters it meets, or in front of the first
 /// character that is neither whitespace nor the start of a // comment,
 /// or at the end of s.
 void skipToEOL(string s, ref size_t i)
 {
 	static bool isLineBreak(char c) { return c == '\r' || c == '\n'; }
 	for (;;)
 	{
 		if (i >= s.length)
 			return;
 		immutable c = s[i];
 		if (isLineBreak(c))
 		{
 			// consume the whole run of CR/LF characters, then stop
 			do
 				i++;
 			while (i < s.length && isLineBreak(s[i]));
 			return;
 		}
 		if (isWhite(c))
 		{
 			i++;
 			continue;
 		}
 		if (!s[i..$].startsWith("//"))
 			return;
 		skipSymbol(s, i);
 	}
 }
 /// Moves i backwards over the whitespace directly in front of it. Stops
 /// after a '\n' (which is left in place), at the first non-whitespace
 /// character, or at start - whichever comes first.
 void backToEOL(string s, ref size_t i, size_t start)
 {
 	for (; i > start; i--)
 	{
 		immutable prev = s[i-1];
 		if (prev == '\n' || !isWhite(prev))
 			break;
 	}
 }
 /// Splits the D source text s into a tree of entities and returns the
 /// top-level list, after running postProcessD over it.
 Entity[] parseD(string s)
 {
 	// State shared by all (recursive) parseScope calls:
 	size_t i = 0;      // current read position in s
 	size_t start;      // beginning of the text not yet turned into an entity
 	string innerTail;  // set by parseScope on return: the closing sequence it consumed, or null at end of input
 	/// Parses from the current position up to and including the character
 	/// `end` (or to the end of s when `end` is 0 or never found) and returns
 	/// the entities found in between.
 	Entity[] parseScope(char end)
 	{
 		// Here be dragons.
 		enum MAX_SPLITTER_LEVELS = 5;
 		struct DSplitter { char open, close, sep; }
 		// Splitter levels, index 0 first: {} with ';' as separator, (), [],
 		// then ',' and ' ' as pure separators.
 		static const DSplitter[MAX_SPLITTER_LEVELS] splitters = [{'{','}',';'}, {'(',')'}, {'[',']'}, {sep:','}, {sep:' '}];
 		// Entities collected so far, per splitter level.
 		Entity[][MAX_SPLITTER_LEVELS] splitterQueue;
 		/// Flushes the queues of `level` and all higher-numbered levels and
 		/// returns the resulting entity list. Past the last level, the
 		/// pending text s[start..i] is converted with splitText.
 		Entity[] terminateLevel(int level)
 		{
 			if (level == MAX_SPLITTER_LEVELS)
 			{
 				auto text = s[start..i];
 				start = i;
 				return splitText(text);
 			}
 			else
 			{
 				auto next = terminateLevel(level+1);
 				// Zero or one entity is appended as-is; several are wrapped
 				// into one entity first.
 				if (next.length <= 1)
 					splitterQueue[level] ~= next;
 				else
 					splitterQueue[level] ~= new Entity(null, next, null);
 				auto r = splitterQueue[level];
 				splitterQueue[level] = null;
 				return r;
 			}
 		}
 		/// Returns the pending text s[start..i] and marks it as consumed.
 		string terminateText()
 		{
 			auto r = s[start..i];
 			start = i;
 			return r;
 		}
 		characterLoop:
 		while (i < s.length)
 		{
 			char c = s[i];
 			foreach (int level, info; splitters)
 				if (info.sep && c == info.sep)
 				{
 					// Separator: everything pending at finer levels becomes the
 					// children; the separator plus the whitespace/EOL after it
 					// becomes the tail.
 					auto children = terminateLevel(level+1);
 					assert(i == start);
 					i++; skipToEOL(s, i);
 					splitterQueue[level] ~= new Entity(null, children, terminateText());
 					continue characterLoop;
 				}
 				else
 				if (info.open && c == info.open)
 				{
 					// Opening bracket: what precedes it (minus trailing
 					// whitespace on the same line) is the pair's head; the
 					// bracket contents are parsed recursively.
 					auto openPos = i;
 					backToEOL(s, i, start);
 					auto pairHead = terminateLevel(level+1);
 					i = openPos+1; skipToEOL(s, i);
 					auto startSequence = terminateText();
 					auto bodyContents = parseScope(info.close);
 					// innerTail was just set by the recursive call above.
 					auto pairBody = new Entity(startSequence, bodyContents, innerTail);
 					if (pairHead.length == 0)
 						splitterQueue[level] ~= pairBody;
 					else
 					if (pairHead.length == 1)
 						splitterQueue[level] ~= new Entity(null, pairHead ~ pairBody, null, null, true);
 					else
 						splitterQueue[level] ~= new Entity(null, [new Entity(null, pairHead, null), pairBody], null, null, true);
 					continue characterLoop;
 				}
 			if (end && c == end)
 			{
 				// Closing character of this scope: flush everything, then hand
 				// the closing sequence to the caller through innerTail.
 				auto closePos = i;
 				backToEOL(s, i, start);
 				auto result = terminateLevel(0);
 				i = closePos+1; skipToEOL(s, i);
 				innerTail = terminateText();
 				return result;
 			}
 			else
 				skipSymbol(s, i);
 		}
 		// End of input reached without seeing `end`.
 		innerTail = null;
 		return terminateLevel(0);
 	}
 	auto result = parseScope(0);
 	postProcessD(result);
 	return result;
 }
 /// Returns a copy of s in which every symbol that starts with a comment
 /// opener (//, /* or /+) has been dropped; all other symbols are copied
 /// unchanged, in order.
 string stripDComments(string s)
 {
 	auto kept = appender!string();
 	for (size_t pos = 0; pos < s.length; )
 	{
 		auto symbol = skipSymbol(s, pos);
 		if (symbol.startsWithComment())
 			continue;
 		kept.put(symbol);
 	}
 	return kept.data;
 }
 /// Post-processes a list of sibling entities in place, then recurses into
 /// the children of every entity that remains:
 /// records dependencies for comma-separated lists and merges neighbouring
 /// entities that look like parts of one language construct.
 void postProcessD(ref Entity[] entities)
 {
 	// i is only advanced when no merge happened at the current position.
 	for (int i=0; i<entities.length;)
 	{
 		// Add dependencies for comma-separated lists.
 		// Applies when there is a following sibling, entity i has children,
 		// and its tail (ignoring whitespace and comments) is exactly ",".
 		if (i+2 <= entities.length && entities[i].children.length >= 1 && entities[i].tail.stripD() == ",")
 		{
 			// The tail moves into a new last child, which depends on the
 			// child before it and on the head entity of the next sibling.
 			auto comma = new Entity(entities[i].tail);
 			entities[i].children ~= comma;
 			entities[i].tail = null;
 			// NOTE(review): getHeadEntity returns null for an entity without any
 			// text, so a null dependency can be stored here - confirm that the
 			// code consuming dependencies tolerates that.
 			comma.dependencies ~= [entities[i].children[$-2], getHeadEntity(entities[i+1])];
 		}
 		// Group together consecutive entities which might represent a single language construct
 		// There is no penalty for false positives, so accuracy is not very important
 		if (i+2 <= entities.length && entities.length > 2 && (
 		    (getHeadText(entities[i]).startsWithWord("do") && getHeadText(entities[i+1]).isWord("while"))
 		 || (getHeadText(entities[i]).startsWithWord("try") && getHeadText(entities[i+1]).startsWithWord("catch"))
 		 || (getHeadText(entities[i]).startsWithWord("try") && getHeadText(entities[i+1]).startsWithWord("finally"))
 		 || (getHeadText(entities[i+1]).isWord("in"))
 		 || (getHeadText(entities[i+1]).isWord("out"))
 		 || (getHeadText(entities[i+1]).isWord("body"))
 		))
 		{
 			// Replace entities i and i+1 with one wrapper entity and look at the
 			// same index again, so the wrapper can be merged with what follows.
 			entities.replaceInPlace(i, i+2, [new Entity(null, entities[i..i+2].dup, null)]);
 			continue;
 		}	
 		postProcessD(entities[i].children);
 		i++;
 	}
 }
 /// Words after which splitText starts a new entity.
 const bool[string] wordsToSplit;
 static this() { wordsToSplit = ["else":true]; }
 /// Turns a run of plain text into leaf entities. As long as the text begins
 /// with a word from wordsToSplit, that word (with the whitespace/line break
 /// following it) is cut off into its own entity; whatever remains becomes
 /// the last entity. Empty text gives an empty list.
 Entity[] splitText(string s)
 {
 	Entity[] pieces;
 	while (s.length)
 	{
 		auto leading = firstWord(s);
 		if (leading !in wordsToSplit)
 		{
 			pieces ~= new Entity(s, null, null);
 			break;
 		}
 		// firstWord returns a slice of s, so the pointer difference is the
 		// offset just past the word.
 		size_t cut = leading.ptr + leading.length - s.ptr;
 		skipToEOL(s, cut);
 		pieces ~= new Entity(s[0..cut], null, null);
 		s = s[cut..$];
 	}
 	return pieces;
 }
 /// Returns the slice of s that runs from its first to its last symbol that
 /// is neither whitespace nor a comment. Whitespace and comments between
 /// those two symbols are kept. If there is no such symbol, the result is
 /// the empty slice at the end of s.
 string stripD(string s)
 {
 	immutable len = s.length;
 	size_t first = len, last = len;
 	size_t pos = 0;
 	while (pos < len)
 	{
 		if (s[pos..$].startsWithComment())
 		{
 			skipSymbol(s, pos);
 			continue;
 		}
 		if (isWhite(s[pos]))
 		{
 			pos++;
 			continue;
 		}
 		// a real symbol: remember where the first one began and where the latest one ended
 		if (pos < first)
 			first = pos;
 		skipSymbol(s, pos);
 		last = pos;
 	}
 	return s[first..last];
 }
 /// Returns the part of stripD(s) that precedes its first whitespace
 /// character (all of it when it contains none). The result is a slice of s.
 string firstWord(string s)
 {
 	auto text = stripD(s);
 	foreach (pos, char c; text)
 		if (isWhite(c))
 			return text[0..pos];
 	return text;
 }
 /// Tells whether s, once stripped with stripD, begins with word and the
 /// character right after it (if there is one) is not alphanumeric.
 bool startsWithWord(string s, string word)
 {
 	auto text = stripD(s);
 	if (!text.startsWith(word))
 		return false;
 	if (text.length == word.length)
 		return true;
 	return !isAlphaNum(text[word.length]);
 }
 /// Tells whether s, once stripped with stripD, ends with word and the
 /// character right before it (if there is one) is not alphanumeric.
 bool endsWithWord(string s, string word)
 {
 	auto text = stripD(s);
 	if (!text.endsWith(word))
 		return false;
 	if (text.length == word.length)
 		return true;
 	return !isAlphaNum(text[$-word.length-1]);
 }
 /// Tells whether s consists of exactly word once stripped with stripD.
 bool isWord(string s, string word)
 {
 	auto text = stripD(s);
 	return text == word;
 }
 /// Tells whether s begins with one of the comment openers //, /* or /+.
 bool startsWithComment(string s)
 {
 	if (s.length < 2 || s[0] != '/')
 		return false;
 	switch (s[1])
 	{
 	case '/':
 	case '*':
 	case '+':
 		return true;
 	default:
 		return false;
 	}
 }
 /// Finds the entity that supplies the first text of e: e itself when its
 /// head is non-empty, otherwise the first child (searched depth-first, in
 /// order) that yields one, otherwise e again when its tail is non-empty.
 /// Returns null when e carries no text at all.
 Entity getHeadEntity(Entity e)
 {
 	if (e.head.length != 0)
 		return e;
 	foreach (child; e.children)
 		if (auto found = getHeadEntity(child))
 			return found;
 	return e.tail.length != 0 ? e : null;
 }
 /// Returns the first text found in e: the head of the entity chosen by
 /// getHeadEntity, or that entity's tail when its head is empty.
 /// Returns null when getHeadEntity finds no entity with text.
 string getHeadText(Entity e)
 {
 	e = getHeadEntity(e);
 	if (!e)
 		return null;
 	// Test the length, exactly as getHeadEntity does. A bare `if (e.head)`
 	// only checks for a null pointer, so an empty but non-null head would be
 	// returned even though getHeadEntity picked e because of its tail.
 	if (e.head.length)
 		return e.head;
 	return e.tail;
 }
 // ParseOptions.Mode.Words
 /// Tells whether c counts as part of a word for parseToWords:
 /// an ASCII letter or digit, '_' or '@'.
 bool isDWordChar(char c)
 {
 	switch (c)
 	{
 	case '_':
 	case '@':
 		return true;
 	default:
 		return isAlphaNum(c);
 	}
 }
 /// Splits text into one leaf entity per word. The head of each entity is
 /// the word (a run of isDWordChar characters) and the tail is everything
 /// between that word and the next one. Text in front of the first word
 /// gets an entity with an empty head. Concatenating head and tail of all
 /// returned entities, in order, reproduces text. Empty text gives an empty
 /// list.
 public Entity[] parseToWords(string text)
 {
 	Entity[] result;
 	size_t i, wordStart, wordEnd;
 	// i is the position *after* the character being looked at, so that the
 	// end of the text can be handled inside the loop as well.
 	for (i = 1; i <= text.length; i++)
 		if (i==text.length || (!isDWordChar(text[i-1]) && isDWordChar(text[i])))
 		{
 			// The text ends in the middle of a word: close the word here, so
 			// that it is stored as the head like every other word instead of
 			// ending up in the tail of an entity with an empty head.
 			if (i==text.length && isDWordChar(text[i-1]))
 				wordEnd = i;
 			if (wordStart != i)
 				result ~= new Entity(text[wordStart..wordEnd], null, text[wordEnd..i]);
 			wordStart = wordEnd = i;
 		}
 		else
 		if ((isDWordChar(text[i-1]) && !isDWordChar(text[i])))
 			wordEnd = i;
 	return result;
 }
--- a/DustMite/dustmite.d
+++ b/DustMite/dustmite.d