mirror of
https://github.com/dlang/phobos.git
synced 2025-04-27 05:30:33 +03:00
1319 lines
40 KiB
D
1319 lines
40 KiB
D
//Written in the D programming language
|
||
/**
|
||
gen_uni is a tool to automatically generate source code for unicode data structures.
|
||
|
||
To generate the tables use:
|
||
```
|
||
$ rdmd -m32 unicode_table_generator.d
|
||
$ rdmd -m64 unicode_table_generator.d --min
|
||
```
|
||
|
||
See the global ``UnicodeDatabaseDirectory`` for the latest version of the Unicode database that was used to generate the tables.
|
||
|
||
TODO: Support emitting of Turkic casefolding mappings
|
||
|
||
Authors: Dmitry Olshansky
|
||
|
||
License: Boost
|
||
*/
|
||
module std.unicode_table_generator;
|
||
// this shouldn't be in std package, but stuff is package'd as that in std.uni.
|
||
|
||
import std.uni, std.stdio, std.traits, std.typetuple,
|
||
std.exception, std.format, std.algorithm, std.typecons,
|
||
std.regex, std.range, std.conv, std.getopt;
|
||
|
||
import std.file : exists;
|
||
static import std.ascii, std.string;
|
||
|
||
//common binary property sets and their aliases
/// A named collection of codepoint sets (e.g. all general categories, all
/// blocks, or all scripts) together with the short-name -> long-name aliases
/// the UCD defines for them.
struct PropertyTable
{
    // canonical property name -> set of codepoints having that property
    CodepointSet[string] table;
    // canonical property name -> alias (alternate spelling) for that property
    string[string] aliases;
}
|
||
|
||
// Module-level accumulators: the load* functions fill these from the UCD
// text files, and the write* functions serialize them into generated D source.

PropertyTable general;        // general categories + binary props (PropList, DerivedCoreProperties, UnicodeData)
PropertyTable blocks;         // Blocks.txt
PropertyTable scripts;        // Scripts.txt
PropertyTable hangul;         // HangulSyllableType.txt (L/V/T/LV/LVT)
PropertyTable graphemeBreaks; // auxiliary/GraphemeBreakProperty.txt
PropertyTable emojiData;      // emoji/emoji-data.txt

//quick NO/MAYBE character sets (from DerivedNormalizationProps.txt),
//keyed by e.g. "NFC_QCN" / "NFC_QCM" (quick-check No / Maybe)
CodepointSet[string] normalization;

//auxiliary sets for case mapping
CodepointSet lowerCaseSet, upperCaseSet;
//CodepointSet titleCaseSet; //no sensible version found for isTitlecase

// sets for toLower/toUpper/toTitle; each *Tab holds mapped codepoints
// (simple 1:1 entries first, then SpecialCasing sequences), and each
// *Index maps a source dchar to its slot in the corresponding *Tab.
uint[] toLowerTab;
ushort toLowerTabSimpleLen; //start of long mappings
ushort[dchar] toLowerSimpleIndex, toLowerIndex;
uint[] toUpperTab;
ushort toUpperTabSimpleLen; //ditto for Upper
ushort[dchar] toUpperSimpleIndex, toUpperIndex;
uint[] toTitleTab;
ushort toTitleTabSimpleLen; //ditto for Title
ushort[dchar] toTitleSimpleIndex, toTitleIndex;

// Declares SimpleCaseEntry/FullCaseEntry/etc. both here (for the generator)
// and, via writeCaseFolding, verbatim in the generated output.
mixin(mixedCCEntry);

//case folding mapping
SimpleCaseEntry[] simpleTable;
FullCaseEntry[] fullTable;

///canonical combining class: combiningClass[c] is the set of codepoints with ccc == c
CodepointSet[256] combiningClass;
//same but packaged per dchar (0 is implied for anything absent)
ubyte[dchar] combiningMapping;

//unrolled decompositions (single level, as given in UnicodeData.txt)
dstring[dchar] canonDecomp;
dstring[dchar] compatDecomp;

//canonical composition tables
dchar[] canonicalyComposableLeft;
dchar[] canonicalyComposableRight;


//canonical composition exclusions (CompositionExclusions.txt)
CodepointSet compExclusions;

//property names to discard
string[] blacklist = [];
|
||
|
||
// D source shared between this generator and the generated unicode_tables.d:
// it is mixin()'d above so the generator can build these structs, and
// writeCaseFolding emits the exact same text into the output file.
// NOTE: the string content is runtime output — do not edit casually.
enum mixedCCEntry = `
struct SimpleCaseEntry
{
    uint ch;
    ubyte n, bucket;// n - number in bucket

pure nothrow @nogc:

    @property ubyte size() const
    {
        return bucket & 0x3F;
    }
    @property auto isLower() const
    {
        return bucket & 0x40;
    }
    @property auto isUpper() const
    {
        return bucket & 0x80;
    }
}

struct FullCaseEntry
{
    dchar[3] seq;
    ubyte n, size;// n number in batch, size - size of batch
    ubyte entry_len;

    @property auto value() const @trusted pure nothrow @nogc return
    {
        return seq[0 .. entry_len];
    }
}

struct CompEntry
{
    dchar rhs, composed;
}

struct UnicodeProperty
{
    string name;
    ubyte[] compressed;
}

struct TrieEntry(T...)
{
    size_t[] offsets;
    size_t[] sizes;
    size_t[] data;
}

`;
|
||
|
||
/// Build a FullCaseEntry from a 1..3 codepoint mapping, padding the
/// fixed-size buffer and recording the actual mapping length.
auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size)
{
    dchar[3] buf;
    foreach (i, dchar c; value)
        buf[i] = c;
    return FullCaseEntry(buf, num, batch_size, cast(ubyte) value.length);
}
|
||
|
||
// Locations of the Unicode Character Database source files.
// Bump UnicodeDatabaseDirectory when regenerating against a newer UCD.
enum {
    UnicodeDatabaseDirectory = "ucd-15/",
    caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt",
    blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt",
    propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt",
    graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt",
    emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt",
    propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt",
    corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt",
    normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt",
    scriptsSrc = UnicodeDatabaseDirectory ~ "Scripts.txt",
    hangulSyllableSrc = UnicodeDatabaseDirectory ~ "HangulSyllableType.txt",
    unicodeDataSrc = UnicodeDatabaseDirectory ~ "UnicodeData.txt",
    compositionExclusionsSrc = UnicodeDatabaseDirectory ~ "CompositionExclusions.txt",
    specialCasingSrc = UnicodeDatabaseDirectory ~ "SpecialCasing.txt"
}
|
||
|
||
// Banner written at the top of every generated module; warns maintainers
// not to hand-edit the output. The string content is runtime output.
enum HeaderComment = `//Written in the D programming language
/**
 * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
 *
 * Authors: Dmitry Olshansky
 *
 */
// !!! DO NOT EDIT !!!
// !!! Did you even read the comment? !!!
// This module is automatically generated from Unicode Character Database files
// https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d
//dfmt off`;
|
||
|
||
/// Flatten an associative array into an array of (value, key) tuples —
/// the shape the codepointTrie constructors expect.
auto toPairs(K, V)(V[K] aa)
{
    return zip(aa.values, aa.keys).array;
}
|
||
|
||
/**
Entry point. Loads every UCD source file into the module-level tables, then
emits the generated modules under ../std/internal/.

The tool is meant to be run twice: once normally (writes files, emits the
headers and the portable tables) and once with --min (appends only the
size_t-width-specific trie data, for the other pointer width).

Params:
    argv = program arguments; recognizes --min (append minimal/width-specific output).
*/
void main(string[] argv)
{
    string mode = "w";

    bool minimal = false;
    getopt(argv, "min", &minimal);
    if (minimal)
        mode = "a"; // second pass appends to the files produced by the first


    if (!exists(UnicodeDatabaseDirectory))
    {
        writeln("Did you forget to download the Unicode database tables?");
        writeln("Looking for them in: ", UnicodeDatabaseDirectory);
        return;
    }

    enum UnicodeTableDirectory = "../std/internal/";

    auto baseSink = File(UnicodeTableDirectory ~ "unicode_tables.d", mode);
    auto compSink = File(UnicodeTableDirectory ~ "unicode_comp.d", mode);
    auto decompSink = File(UnicodeTableDirectory ~ "unicode_decomp.d", mode);
    auto normSink = File(UnicodeTableDirectory ~ "unicode_norm.d", mode);
    auto graphSink = File(UnicodeTableDirectory ~ "unicode_grapheme.d", mode);
    if (!minimal)
    {
        baseSink.writeln(HeaderComment);
        baseSink.writeln("module std.internal.unicode_tables;");
        baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        // FIX: previously compSink was the only generated file missing the
        // HeaderComment banner (license + DO-NOT-EDIT warning).
        compSink.writeln(HeaderComment);
        compSink.writeln("module std.internal.unicode_comp;");
        compSink.writeln("import std.internal.unicode_tables;");
        compSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        decompSink.writeln(HeaderComment);
        decompSink.writeln("module std.internal.unicode_decomp;");
        decompSink.writeln("import std.internal.unicode_tables;");
        decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        normSink.writeln(HeaderComment);
        normSink.writeln("module std.internal.unicode_norm;");
        normSink.writeln("import std.internal.unicode_tables;");
        normSink.writeln("\npackage(std):\n");

        graphSink.writeln(HeaderComment);
        graphSink.writeln("module std.internal.unicode_grapheme;");
        graphSink.writeln("import std.internal.unicode_tables;");
        graphSink.writeln("\npackage(std):\n");
    }

    // Order matters: loadCaseFolding reads sets filled by loadProperties,
    // and loadSpecialCasing extends tables begun by loadUnicodeData.
    loadBlocks(blocksSrc, blocks);
    loadProperties(propListSrc, general);
    loadProperties(corePropSrc, general);
    loadProperties(scriptsSrc, scripts);
    loadProperties(hangulSyllableSrc, hangul);
    loadProperties(graphemeSrc, graphemeBreaks);
    loadProperties(emojiDataSrc, emojiData);
    loadPropertyAliases(propertyValueAliases);

    loadUnicodeData(unicodeDataSrc);
    loadSpecialCasing(specialCasingSrc);
    loadExclusions(compositionExclusionsSrc);
    loadCaseFolding(caseFoldingSrc);
    loadNormalization(normalizationPropSrc);

    // Emit one named struct-of-tables per property family.
    static void writeTableOfSets(File sink, string prefix, PropertyTable tab)
    {
        sink.writeln();
        writeAliasTable(sink, prefix, tab);
    }

    if (!minimal)
    {
        writeCaseFolding(baseSink);
        writeTableOfSets(baseSink, "uniProps", general);
        writeTableOfSets(baseSink, "blocks", blocks);
        writeTableOfSets(baseSink, "scripts", scripts);
        writeTableOfSets(baseSink, "hangul", hangul);
        writeFunctions(baseSink);
    }

    // Trie data depends on size_t width; guard it so both -m32 and -m64
    // passes can coexist in one generated file.
    static void trieProlog(File file)
    {
        file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof);
    }

    static void trieEpilog(File file)
    {
        file.writeln("\n}\n");
    }

    trieProlog(baseSink);
    trieProlog(compSink);
    trieProlog(decompSink);
    trieProlog(graphSink);
    trieProlog(normSink);

    writeTries(baseSink);
    writeNormalizationTries(normSink);
    writeGraphemeTries(graphSink);
    writeCaseCoversion(baseSink);
    writeCombining(compSink);
    writeDecomposition(decompSink);
    writeCompositionTable(compSink);

    trieEpilog(decompSink);
    trieEpilog(compSink);
    trieEpilog(baseSink);
    trieEpilog(graphSink);
    trieEpilog(normSink);
}
|
||
|
||
/// Run Fn on the regex captures of every line of file `name` that matches `r`.
/// Non-matching lines (comments, blanks) are skipped silently.
void scanUniData(alias Fn)(string name, Regex!char r)
{
    auto input = File(name);
    foreach (line; input.byLine)
    {
        auto hit = match(line, r);
        if (hit.empty)
            continue;
        Fn(hit);
    }
}
|
||
|
||
/**
Parses CaseFolding.txt and populates simpleTable/fullTable with the
simple (1:1) and full (1:N) case-folding equivalence buckets, plus the
handy lowerCaseSet/upperCaseSet copies used when flagging entries.

Params:
    f = path to CaseFolding.txt
*/
void loadCaseFolding(string f)
{
    dchar[dchar] simple;   // simple folding: source -> single target
    dstring[dchar] full;   // full folding: source -> target sequence

    // format: <code>; <status C/F/S/T>; <mapping>;
    auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);");
    scanUniData!((m){
        auto s1 = m.captures[1];
        auto code = m.captures[2].front;
        auto s2 = m.captures[3];
        auto left = parse!int(s1, 16);
        if (code == 'C') // common: valid for both simple and full folding
        {
            auto right = parse!int(s2, 16);
            simple[left] = right;
            full[left] = [right];
        }
        else if (code == 'S') // simple-only mapping
        {
            auto right = parse!int(s2, 16);
            simple[left] = right;
        }
        else if (code == 'F') // full mapping: a sequence of hex codepoints
        {
            dstring right;
            foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g")))
            {
                right ~= to!int(x[0], 16);
            }
            full[left] = right.idup;
        }
        else if (code == 'T')
        {
            // TODO: ignore Turkic languages for now.
        }
    })(f, r);

    //make some useful sets by hand
    lowerCaseSet = general.table["Lowercase"];
    upperCaseSet = general.table["Uppercase"];
    //titleCaseSet = general.table["Lt"];

    // Group each simple mapping into a bucket of all codepoints that fold
    // to the same target; each member becomes one SimpleCaseEntry.
    foreach (ch; simple.keys())
    {
        dchar[8] entry;
        int size=0;
        entry[size++] = ch;
        dchar x = simple[ch];
        entry[size++] = x;
        //simple is many:1 mapping
        foreach (key, v; simple)
        {
            if (v == x && !canFind(entry[], key))
            {
                entry[size++] = key;
            }
        }
        sort(entry[0 .. size]);
        foreach (i, value; entry[0 .. size])
        {
            // low 6 bits: bucket size; 0x40: is lowercase; 0x80: is uppercase
            auto withFlags = cast(ubyte) size | (value in lowerCaseSet ? 0x40 : 0)
                | (value in upperCaseSet ? 0x80 : 0);
            simpleTable ~= SimpleCaseEntry(value, cast(ubyte) i,
                cast(ubyte) withFlags);
        }
    }

    // Same bucketing for full (sequence) foldings.
    foreach (ch; full.keys())
    {
        dstring[8] entry;
        int size=0;
        entry[size++] = [ch];
        auto x = full[ch];
        entry[size++] = x;

        //full is many:many mapping
        //sort single-char versions, and let them come first
        foreach (key, v; full)
        {
            if (v == x && !canFind(entry[], [key]))
            {
                entry[size++] = [key];
            }

        }
        auto right = partition!(a => a.length == 1)(entry[0 .. size]);
        sort(entry[0 .. size - right.length]);
        foreach (i, value; entry[0 .. size])
        {
            fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size);
        }
    }
}
|
||
|
||
/**
Parses Blocks.txt (lines of the form `XXXX..YYYY; Block Name`) and records
each block's codepoint interval in `target` under the block's name.

Params:
    f = path to Blocks.txt
    target = table to fill with one set per block
*/
void loadBlocks(string f, ref PropertyTable target)
{
    auto pat = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`);
    scanUniData!((m){
        auto loStr = m.captures[1];
        auto hiStr = m.captures[2];
        immutable lo = parse!uint(loStr, 16);
        immutable hi = parse!uint(hiStr, 16);
        // UCD ranges are inclusive; CodepointSet wants half-open [lo, hi).
        // Build the set in a local first — workaround for @@@BUG 6178
        // (memory corruption when constructing directly into the AA).
        auto interval = CodepointSet(lo, hi + 1);
        target.table[to!string(m.captures[3])] = interval;
    })(f, pat);
}
|
||
|
||
/**
Parses a PropList-style UCD file (range-or-single codepoint; property name)
into `target`, accumulating codepoints per property and recording the
long-name alias when the file's header comment declares one.

Params:
    inp = path to the property file
    target = table receiving the per-property codepoint sets
*/
void loadProperties(string inp, ref PropertyTable target)
{
    // skip blacklisted names and provisional "Changes_When_*"-style props
    auto acceptProp = (string name) => countUntil(blacklist, name) < 0 && !name.startsWith("Changes");
    // captures: 1..2 = range ends, 3 = single codepoint, 4 = property name,
    // 5 = alias from a "# Xxx=Yyy" header comment line
    auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`);
    string aliasStr;
    auto set = CodepointSet.init; //workaround @@@BUG 6178
    scanUniData!((m){
        auto name = to!string(m.captures[4]);
        if (!acceptProp(name))
            return;
        if (!m.captures[5].empty)
            aliasStr = to!string(m.captures[5]); // remember alias for the next named entry
        else if (!m.captures[1].empty)
        {
            // a codepoint range: add [a, b] (stored half-open as [a, b+1))
            auto sa = m.captures[1];
            auto sb = m.captures[2];
            uint a = parse!uint(sa, 16);
            uint b = parse!uint(sb, 16);
            if (name !in target.table)
            {
                target.table[name] = set;
            }
            auto p = name in target.table;
            p.add(a,b+1); // unicode lists [a, b] we need [a,b)
            if (!aliasStr.empty)
            {
                target.aliases[name] = aliasStr;
                aliasStr = "";
            }
        }
        else if (!m.captures[3].empty)
        {
            // a single codepoint
            auto sx = m.captures[3];
            uint x = parse!uint(sx, 16);
            if (name !in target.table)
            {
                target.table[name] = set;
            }
            auto p = name in target.table;
            *p |= x;
            if (!aliasStr.empty)
            {
                target.aliases[name] = aliasStr;
                aliasStr = "";
            }
        }
    })(inp, r);
}
|
||
|
||
/**
Parses PropertyValueAliases.txt and records short-name -> long-name
aliases for blocks (`blk`), general categories (`gc`) and scripts (`sc`).

Params:
    inp = path to PropertyValueAliases.txt
*/
void loadPropertyAliases(string inp)
{
    // one record per line: <property type>; <short name>; <long name>
    auto pat = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`);
    scanUniData!((m){
        auto kind = m.captures[1];
        auto shortName = m.captures[2].idup;
        auto longName = m.captures[3].idup;
        if (shortName == longName)
            return; // identical spellings — nothing to alias
        if (kind == "blk")
            blocks.aliases[shortName] = longName;
        else if (kind == "gc")
            general.aliases[shortName] = longName;
        else if (kind == "sc")
            scripts.aliases[shortName] = longName;
    })(inp, pat);
}
|
||
|
||
/**
Parses DerivedNormalizationProps.txt, collecting the quick-check
No/Maybe sets per normalization form into `normalization`, keyed as
e.g. "NFC_QCN" (captures 4 and 5 concatenate the form and the N/M flag).

Params:
    inp = path to DerivedNormalizationProps.txt
*/
void loadNormalization(string inp)
{
    auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`);
    CodepointSet set; //workaround @@@BUG 6178
    scanUniData!((m){
        auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]);
        if (!m.captures[1].empty)
        {
            // codepoint range [a, b] -> stored half-open [a, b+1)
            auto sa = m.captures[1];
            auto sb = m.captures[2];
            uint a = parse!uint(sa, 16);
            uint b = parse!uint(sb, 16);
            if (name !in normalization)
            {
                normalization[name] = set;
            }
            auto p = name in normalization;
            p.add(a,b+1);
        }
        else if (!m.captures[3].empty)
        {
            // single codepoint
            auto sx = m.captures[3];
            uint x = parse!uint(sx, 16);
            if (name !in normalization)
            {
                normalization[name] = set;
            }
            auto p = name in normalization;
            *p |= x;
        }
    })(inp, r);
}
|
||
|
||
/**
Parses UnicodeData.txt — the master per-codepoint file — filling:
general categories, combining classes (+ the per-dchar mapping),
the simple toLower/toUpper/toTitle tables, and the single-level
canonical/compatibility decompositions. Ranges are encoded in the file
as paired "<..., First>" / "<..., Last>" records.

Params:
    inp = path to UnicodeData.txt
*/
void loadUnicodeData(string inp)
{
    auto f = File(inp);

    dchar characterStart;               // first codepoint of a pending range
    bool haveRangeStart, haveRangeEnd;

    CodepointSet all; // every codepoint the file assigns; inverse becomes Cn
    foreach (line; f.byLine)
    {
        auto fields = split(line, ";");
        //codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class,
        //Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping
        auto codepoint = fields[0];
        auto decomp = fields[5];
        auto name = fields[1];

        dchar src = parse!uint(codepoint, 16);

        if (name.endsWith("First>"))
        {
            // opening half of a range record: remember start, properties
            // are applied when the matching "Last>" line arrives
            assert(!haveRangeStart);
            characterStart = src;
            haveRangeStart = true;
            continue;
        }
        else if (name.endsWith("Last>"))
        {
            haveRangeStart = false;
            haveRangeEnd = true;
        }
        else
        {
            // ordinary single-codepoint record
            assert(!haveRangeStart);
            haveRangeEnd = false;
            characterStart = src;
        }

        auto generalCategory = fields[2];
        auto ccc = parse!uint(fields[3]);
        auto upperCasePart = fields[12];
        auto lowerCasePart = fields[13];
        auto titleCasePart = fields[14];

        // Record a simple 1:1 case mapping: append the target codepoint to
        // `chars` and index it from `src`.
        void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart)
        {
            if (!casePart.empty)
            {
                // if you have a range, you shouldn't have any casing provided
                assert(!haveRangeEnd);

                uint ch = parse!uint(casePart, 16);
                chars ~= ch;
                assert(chars.length < ushort.max);
                index[src] = cast(ushort)(chars.length-1);
            }
        }

        if (generalCategory !in general.table)
            general.table[generalCategory.idup] = CodepointSet.init;

        // characterStart == src for single records, so this adds exactly
        // [src, src+1) then; for ranges it adds the whole [First, Last].
        all.add(characterStart, src+1);
        combiningClass[ccc].add(characterStart, src+1);
        general.table[generalCategory].add(characterStart, src+1);

        appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart);
        appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart);
        appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart);

        if (!decomp.empty)
        {
            // none of the ranges in UnicodeData.txt have decompositions provided
            assert(!haveRangeEnd);

            //stderr.writeln(codepoint, " ---> ", decomp);

            dstring dest;
            bool compat = false;
            if (decomp.startsWith(" "))
                decomp = decomp[1..$];
            // a leading "<tag>" marks a compatibility decomposition
            if (decomp.front == '<')
            {
                decomp = findSplitAfter(decomp, ">")[1];
                compat = true;
            }
            auto vals = split(decomp, " ");
            foreach (v; vals)
            {
                if (!v.empty)
                    dest ~= cast(dchar) parse!uint(v, 16);
            }
            if (!compat)
            {
                assert(dest.length <= 2, "cannonical decomposition has more then 2 codepoints?!");
                canonDecomp[src] = dest;
            }
            // canonical decompositions are also valid compatibility ones
            compatDecomp[src] = dest;
        }
    }
    // compute Cn as all dchar we have not found in UnicodeData.txt
    general.table["Cn"] = all.inverted;
    general.aliases["Cn"] = "Unassigned";
    // flatten the per-class sets into the per-dchar combiningMapping;
    // class 0 stays implicit (absent from the AA)
    auto arr = combiningClass[1 .. 255];
    foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints
    {
        auto y = clazz.byCodepoint;
        foreach (ch; y)
            combiningMapping[ch] = cast(ubyte)(i+1);
    }
}
|
||
|
||
/**
Parses SpecialCasing.txt, appending the unconditional 1:N case mappings
to the toLower/toTitle/toUpper tables begun by loadUnicodeData. Must run
after loadUnicodeData: it first snapshots the simple-table lengths and
duplicates the simple indexes. Conditional (language-dependent) mappings
are skipped entirely.

Params:
    f = path to SpecialCasing.txt
*/
void loadSpecialCasing(string f)
{
    {
        // remember where the simple (1:1) entries end in each table
        toLowerTabSimpleLen = cast(ushort) toLowerTab.length;
        toUpperTabSimpleLen = cast(ushort) toUpperTab.length;
        toTitleTabSimpleLen = cast(ushort) toTitleTab.length;

        // duplicate the simple indexes prior to adding our unconditional rules also
        toLowerIndex = toLowerSimpleIndex.dup;
        toTitleIndex = toTitleSimpleIndex.dup;
        toUpperIndex = toUpperSimpleIndex.dup;
    }

    auto file = File(f);
    // each field is one or more hex codepoints terminated by ';'
    auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g");
    foreach (line; file.byLine)
    {
        if (!line.empty && line[0] == '#')
        {
            if (line.canFind("Conditional Mappings"))
            {
                // TODO: we kinda need the conditional mappings for languages like Turkish
                break;
            }
            else
                continue;
        }

        auto entries = line.match(r);
        if (entries.empty)
            continue;
        auto pieces = array(entries.map!"a[1]");
        dchar ch = parse!uint(pieces[0], 16);
        // Append one mapping (single char or sequence) to `table` and
        // index it from `ch`; sequences pack their length into the top
        // 8 bits of the first element.
        void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece)
        {
            uint[] mapped = piece.split
                .map!(x=>parse!uint(x, 16)).array;
            if (mapped.length == 1)
            {
                table ~= mapped[0];
                index[ch] = cast(ushort)(table.length - 1);
            }
            else
            {
                ushort idx = cast(ushort) table.length;
                table ~= mapped;
                table[idx] |= (mapped.length << 24); //upper 8bits - length of sequence
                index[ch] = idx;
            }
        }

        // lower, title, upper
        processPiece(toLowerIndex, toLowerTab, pieces[1]);
        processPiece(toTitleIndex, toTitleTab, pieces[2]);
        processPiece(toUpperIndex, toUpperTab, pieces[3]);
    }
}
|
||
|
||
/**
Expands each single-level decomposition to a full one by re-decomposing
every codepoint of the mapping until a fixed point is reached.

Params:
    decompTable = single-level decompositions (codepoint -> sequence)
Returns: a new table with every mapping fully expanded
*/
auto recursivelyDecompose(dstring[dchar] decompTable)
{
    dstring[dchar] expanded;
    foreach (cp, mapping; decompTable)
    {
        dstring current = mapping;
        for (;;)
        {
            // re-decompose every codepoint of the current expansion once
            dstring next;
            foreach (dchar c; current)
            {
                auto p = c in decompTable;
                if (p)
                    next ~= *p;
                else
                    next ~= c;
            }
            if (next == current) // fixed point: nothing decomposed further
                break;
            current = next;
        }
        expanded[cp] = current;
    }
    return expanded;
}
|
||
|
||
/**
Parses CompositionExclusions.txt: every non-comment line starts with a
single hex codepoint that must never be produced by canonical composition.

Params:
    inp = path to CompositionExclusions.txt
*/
void loadExclusions(string inp)
{
    auto pat = regex(`^([0-9A-F]+)`);
    scanUniData!((m){
        auto hex = m.captures[1];
        immutable cp = parse!uint(hex, 16);
        compExclusions |= cast(dchar) cp;
    })(inp, pat);
}
|
||
|
||
/**
Serializes a codepoint set as D source: a compressed-interval ubyte array
literal (terminated with "];"), round-trip-checked via decompressIntervals.

Params:
    set = the set to serialize
    sep = NOTE(review): appears unused in this body — presumably kept for
          interface compatibility with older callers; confirm before removing
Returns: D source text of the compressed array literal
*/
string charsetString(CodepointSet set, string sep=";\n")
{
    auto app = appender!(char[])();
    ubyte[] data = compressIntervals(set.byInterval);
    // sanity check: decompressing must reproduce the original set exactly
    assert(CodepointSet(decompressIntervals(data)) == set);
    formattedWrite(app, "[%(0x%x, %)];", data);
    return cast(string) app.data;
}
|
||
|
||
/**
Turns a Unicode property name into a valid D identifier by replacing
'-' and ' ' with '_'; all other characters pass through unchanged.

Params:
    s = property name
Returns: identifier-safe version of the name
*/
string identName(string s)
{
    auto buf = appender!(char[])();
    foreach (ch; s)
        buf.put((ch == '-' || ch == ' ') ? '_' : ch);
    return cast(string) buf.data;
}
|
||
|
||
/**
Normalizes a property name for loose matching (cf. std.uni's name
comparison): drops '-', ' ' and '_' and lowercases the rest.

Params:
    s = property name
Returns: the normalized, lowercase, separator-free name
*/
string uniformName(string s)
{
    auto buf = appender!(char[])();
    foreach (ch; s)
    {
        switch (ch)
        {
            case '-', ' ', '_':
                break; // separators are ignored entirely
            default:
                buf.put(std.ascii.toLower(ch));
                break;
        }
    }
    return cast(string) buf.data;
}
|
||
|
||
/**
Emits one `_T <identifier> = [compressed bytes];` declaration per set in
`src` (iteration order is the AA's; names are sorted later by the alias
table, not here).

Params:
    sink = generated-source output file
    src = the property table to serialize
*/
void writeSets(File sink, PropertyTable src)
{
    with(sink)
    {
        writeln("private alias _T = ubyte[];");
        foreach (k, v; src.table)
        {
            writef("_T %s = ", identName(k));
            writeln(charsetString(v)); // charsetString already ends with "];"
        }
    }
}
|
||
|
||
/**
Emits a `struct <prefix>` containing every set of `src` (via writeSets)
plus a `_tab` array of UnicodeProperty records — one per name and one per
alias — sorted with propertyNameLess so std.uni can binary-search it.

Params:
    sink = generated-source output file
    prefix = name of the emitted struct (e.g. "uniProps", "blocks")
    src = the property table to serialize
*/
void writeAliasTable(File sink, string prefix, PropertyTable src)
{
    with(sink)
    {
        writeln("struct ", prefix);
        writeln("{");
        writeln("private alias _U = immutable(UnicodeProperty);");
        writeln("@property static _U[] tab() pure { return _tab; }");
        writeln("static immutable:");
        writeSets(sink, src);
        writeln("_U[] _tab = [");
    }
    string[] lines;     // one rendered "_U(...)" record per name/alias
    string[] namesOnly; // normalized names, used only as sort keys
    auto app = appender!(char[])();
    auto keys = src.table.keys;
    foreach (k; keys)
    {
        formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k));
        lines ~= app.data.idup;
        namesOnly ~= uniformName(k);
        app.shrinkTo(0);
        if (k in src.aliases)
        {
            // alias entry points at the same underlying set identifier
            formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k));
            lines ~= app.data.idup;
            namesOnly ~= uniformName(src.aliases[k]);
            app.shrinkTo(0);
        }
    }
    // sort records by normalized name; zip keeps lines[] in lockstep
    static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); }
    sort!ucmp(zip(namesOnly, lines));

    with(sink)
    {
        foreach (i, v; lines)
        {
            write(lines[i]);
        }
        writeln("];");
        writeln("}");
    }
}
|
||
|
||
/**
Emits the case-folding data: the shared struct definitions (mixedCCEntry,
verbatim) plus the simpleCaseTable and fullCaseTable literal arrays built
by loadCaseFolding.

Params:
    sink = generated-source output file
*/
void writeCaseFolding(File sink)
{
    with(sink)
    {
        // the same struct definitions this generator compiled with
        write(mixedCCEntry);

        writeln("@property immutable(SimpleCaseEntry[]) simpleCaseTable()");
        writeln("{");
        writeln("alias SCE = SimpleCaseEntry;");
        writeln("static immutable SCE[] t = [");
        foreach (i, v; simpleTable)
        {
            writef("SCE(0x%04x, %s, 0x%0x),", v.ch, v.n, v.bucket);
            if (i % 4 == 0) writeln(); // wrap output every few records
        }
        writeln("];");
        writeln("return t;");
        writeln("}");
        static uint maxLen = 0; // track longest full-folding sequence for the log
        writeln("@property immutable(FullCaseEntry[]) fullCaseTable() nothrow @nogc @safe pure");
        writeln("{");
        writeln("alias FCE = FullCaseEntry;");
        writeln("static immutable FCE[] t = [");
        foreach (i, v; fullTable)
        {
            maxLen = max(maxLen, v.entry_len);
            if (v.entry_len > 1)
            {
                assert(v.n >= 1); // meaning that start of bucket is always single char
            }
            writef("FCE(\"%s\", %s, %s, %s),", v.value, v.n, v.size, v.entry_len);
            if (i % 4 == 0) writeln();
        }
        writeln("];");
        writeln("return t;");
        writeln("}");
        stderr.writefln("MAX FCF len = %d", maxLen);
    }
}
|
||
|
||
/**
Builds the codepoint -> table-index maps for the simple and full case
tables, self-tests the Trie machinery with them, then emits the tries
for case sets/indexes and the common isXXX classification properties.

Params:
    sink = generated-source output file
*/
void writeTries(File sink)
{
    // index of the first simpleTable entry for each codepoint
    ushort[dchar] simpleIndices;
    foreach (i, v; array(map!(x => x.ch)(simpleTable)))
        simpleIndices[v] = cast(ushort) i;

    // index into fullTable, only for single-codepoint keys
    ushort[dchar] fullIndices;
    foreach (i, v; fullTable)
    {
        if (v.entry_len == 1)
            fullIndices[v.seq[0]] = cast(ushort) i;
    }

    //these 2 only for verification of Trie code itself
    auto st = codepointTrie!(ushort, 12, 9)(
        zip(simpleIndices.values, simpleIndices.keys).array, ushort.max);
    auto ft = codepointTrie!(ushort, 12, 9)(
        zip(fullIndices.values, fullIndices.keys).array, ushort.max);

    foreach (k, v; simpleIndices)
    {
        assert(st[k] == simpleIndices[k]);
    }

    foreach (k, v; fullIndices)
    {
        assert(ft[k] == fullIndices[k]);
    }

    writeBest3Level(sink, "lowerCase", lowerCaseSet);
    writeBest3Level(sink, "upperCase", upperCaseSet);
    //writeBest3Level("titleCase", titleCaseSet);
    writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max);
    writeBest3Level(sink, "fullCase", fullIndices, ushort.max);

    //common isXXX properties
    auto props = general.table;
    CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks
    CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"];
    CodepointSet number = props["Nd"] | props["Nl"] | props["No"];
    CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"]
        | props["Pc"] | props["Po"] | props["Pi"] | props["Pf"];
    CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"];
    CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"];
    CodepointSet nonCharacter = props["Cn"];


    writeBest3Level(sink, "alpha", alpha);
    writeBest3Level(sink, "mark", mark);
    writeBest3Level(sink, "number", number);
    writeBest3Level(sink, "punctuation", punctuation);
    writeBest3Level(sink, "symbol", symbol);
    writeBest3Level(sink, "graphical", graphical);
    // Cn is huge/sparse — a 4-level trie compresses it better
    writeBest4Level(sink, "nonCharacter", nonCharacter);

}
|
||
|
||
/**
Emits the NFC/NFD/NFKC/NFKD quick-check tries. For the composed forms
the emitted set is the union of the "No" and "Maybe" answers; the
decomposed forms only have "No" answers.

Params:
    sink = generated-source output file
*/
void writeNormalizationTries(File sink)
{
    auto nfcQC = normalization["NFC_QCN"] | normalization["NFC_QCM"];
    auto nfdQC = normalization["NFD_QCN"];
    auto nfkcQC = normalization["NFKC_QCN"] | normalization["NFKC_QCM"];
    auto nfkdQC = normalization["NFKD_QCN"];
    // emission order matters for the generated file's layout
    static immutable names = ["nfcQC", "nfdQC", "nfkcQC", "nfkdQC"];
    auto sets = [nfcQC, nfdQC, nfkcQC, nfkdQC];
    foreach (i, name; names)
        writeBest3Level(sink, name, sets[i]);
}
|
||
|
||
/**
Emits one trie per grapheme-break property, then the
Extended_Pictographic trie from the emoji data.

Params:
    sink = generated-source output file
*/
void writeGraphemeTries(File sink)
{
    foreach (name, set; graphemeBreaks.table)
        writeBest3Level(sink, name, set);

    sink.writeln();

    writeBest3Level(sink, "Extended_Pictographic",
        emojiData.table["Extended_Pictographic"]);
}
|
||
|
||
/**
Emits the case-conversion data: the simple-table length constants, the
index tries (both the full indexes including SpecialCasing and the
simple UnicodeData-only ones), and the flat mapping tables themselves.
(Name keeps the historical "Coversion" spelling — callers depend on it.)

Params:
    sink = generated-source output file
*/
void writeCaseCoversion(File sink)
{
    {
        // boundary between simple (1:1) entries and SpecialCasing sequences
        sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen);
        sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen);
        sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen);
    }

    {
        // these are case mappings that also utilize the unconditional SpecialCasing.txt rules
        writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max);
        writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max);
        writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max);
    }

    {
        // these are all case mapping tables that are 1:1 acquired from UnicodeData.txt
        writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max);
        writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max);
        writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max);
    }

    with(sink)
    {
        writeln("@property");
        writeln("{");
        writeln("private alias _IUA = immutable(uint[]);");
        writefln("_IUA toUpperTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toUpperTab);
        writefln("_IUA toLowerTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toLowerTab);
        writefln("_IUA toTitleTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toTitleTab);
        writeln("}");
    }
}
|
||
|
||
/**
Emits the decomposition data: recursively-expanded canonical and
compatibility mappings are flattened into two NUL-separated dstrings,
each codepoint maps (via a trie) to the offset of its decomposition in
the flat string. Offset 0 means "does not decompose". Everything is
cross-checked through the Trie before being written.

Params:
    sink = generated-source output file
*/
void writeDecomposition(File sink)
{
    auto fullCanon = recursivelyDecompose(canonDecomp);
    auto fullCompat = recursivelyDecompose(compatDecomp);
    // dedupe mappings and join them with NUL separators; the leading "\0"
    // keeps offset 0 free as the "no decomposition" sentinel
    dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0";
    dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0";
    stderr.writeln("Canon flattened: ", decompCanonFlat.length);
    stderr.writeln("Compat flattened: ", decompCompatFlat.length);

    ushort[dchar] mappingCanon;
    ushort[dchar] mappingCompat;
    //0 serves as doesn't decompose value
    foreach (k, v; fullCanon)
    {
        size_t idx = decompCanonFlat.countUntil(v~"\0");
        enforce(idx != 0);
        enforce(decompCanonFlat[idx .. idx+v.length] == v);
        mappingCanon[k] = cast(ushort) idx;
    }
    foreach (k, v; fullCompat)
    {
        size_t idx = decompCompatFlat.countUntil(v~"\0");
        enforce(idx != 0);
        enforce(decompCompatFlat[idx .. idx+v.length] == v);
        mappingCompat[k] = cast(ushort) idx;
    }
    // offsets must fit the ushort trie payload
    enforce(decompCanonFlat.length < 2^^16);
    enforce(decompCompatFlat.length < 2^^16);

    //these 2 are just self-test for Trie template code
    auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array;
    auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array;
    auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0);
    auto canonTrie = codepointTrie!(ushort, 12, 9)(canonRange, 0);
    import std.string;
    foreach (k, v; fullCompat)
    {
        auto idx = compatTrie[k];
        enforce(idx == mappingCompat[k], "failed on compat");
        size_t len = decompCompatFlat[idx..$].countUntil(0);
        enforce(decompCompatFlat[idx .. idx+len] == v,
            format("failed on compat: '%( 0x0%5x %)' not found", v));
    }
    foreach (k, v; fullCanon)
    {
        auto idx = canonTrie[k];
        enforce(idx == mappingCanon[k], "failed on canon");
        size_t len = decompCanonFlat[idx..$].countUntil(0);
        enforce(decompCanonFlat[idx .. idx+len] == v,
            format("failed on canon: '%( 0x%5x %)' not found", v));
    }

    writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0);
    writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0);
    with(sink)
    {
        writeln("@property");
        writeln("{");
        writeln("private alias _IDCA = immutable(dchar[]);");
        writefln("_IDCA decompCanonTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCanonFlat);
        writefln("_IDCA decompCompatTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCompatFlat);
        writeln("}");
    }
}
|
||
|
||
/**
Emits generated classification functions (isFormatGen, isControlGen,
isSpaceGen, isWhiteGen and the hangul L/V/T testers) by compiling each
codepoint set to D source via toSourceCode.

Params:
    sink = generated-source output file
*/
void writeFunctions(File sink)
{
    //hangul L, V, T come from HangulSyllableType.txt
    auto gens = [
        tuple(general.table["Cf"], "isFormatGen"),
        tuple(general.table["Cc"], "isControlGen"),
        tuple(general.table["Zs"], "isSpaceGen"),
        tuple(general.table["White_Space"], "isWhiteGen"),
        tuple(hangul.table["L"], "isHangL"),
        tuple(hangul.table["V"], "isHangV"),
        tuple(hangul.table["T"], "isHangT"),
    ];
    // emission order matches the historical file layout
    foreach (g; gens)
        sink.writeln(g[0].toSourceCode(g[1]));
}
|
||
|
||
|
||
/// Builds the canonical composition table from the decomposition data and
/// emits it (a jump trie plus a flat CompEntry array) as D source into `sink`.
void writeCompositionTable(File sink)
{
    dchar[dstring] composeTab;
    //construct compositions table
    foreach (dchar k, dstring v; canonDecomp)
    {
        if (v.length != 2)//singleton
            continue;
        if (v[0] in combiningMapping) //non-starter
            continue;
        if (k in combiningMapping) //combines to non-starter
            continue;
        if (compExclusions[k]) // non-derivable exclusions
            continue;
        composeTab[v] = k;
    }

    // Flatten to (starter, second, composite) triples; sorting by starter then
    // by second char makes each starter's continuations one contiguous run.
    Tuple!(dchar, dchar, dchar)[] triples;
    foreach (dstring key, dchar val; composeTab)
        triples ~= Tuple!(dchar, dchar, dchar)(key[0], key[1], val);
    multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples);
    //map to the triplets array
    ushort[dchar] trimap;
    dchar old = triples[0][0];
    auto r = triples[];
    for (size_t idx = 0;;)
    {
        // cnt = length of the run of triples sharing the current starter
        ptrdiff_t cnt = countUntil!(x => x[0] != old)(r);
        if (cnt == -1)//end of input
            cnt = r.length;
        // pack (index, count) into one ushort: low 11 bits index, high 5 bits
        // count — hence the range asserts below
        assert(idx < 2048);
        assert(cnt < 32);
        trimap[old] = to!ushort(idx | (cnt << 11));
        idx += cnt;
        if (idx == triples.length)
            break;
        old = r[cnt][0];
        r = r[cnt..$];
    }

    // trie: starter codepoint -> packed (index | count << 11); ushort.max = absent
    auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max);
    auto dupletes = triples.map!(x => tuple(x[1], x[2])).array;
    // self-check: every composable pair must be reachable through the packed trie
    foreach (dstring key, dchar val; composeTab)
    {
        size_t pack = triT[key[0]];
        assert(pack != ushort.max);
        size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11;
        auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]);
        assert(!f.empty);
        // & starts with the right value
        assert(f.front[1] == val);
    }
    with(sink)
    {
        writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;");
        write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)(");
        triT.store(sink.lockingTextWriter());
        writeln(");");
        writeln("@property immutable(CompEntry[]) compositionTable() nothrow pure @nogc @safe");
        writeln("{");
        writeln("alias CE = CompEntry;");
        write("static immutable CE[] t = [");
        foreach (pair; dupletes)
            writef("CE(0x%05x, 0x%05x),", pair[0], pair[1]);
        writeln("];");
        writeln("return t;");
        writeln("}");
    }
}
|
||
|
||
/// Validates the canonical combining-class mapping against a throwaway trie,
/// then emits the best-packed 3-level trie for it into `sink`.
void writeCombining(File sink)
{
    auto verifyTrie = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs);
    // class 0 is the implicit default for the vast majority of codepoints,
    // so only classes 1 .. 254 are spelled out and checked here
    foreach (i, clazz; combiningClass[1 .. 255])
    {
        foreach (cp; clazz.byCodepoint)
            assert(verifyTrie[cp] == i + 1);
    }
    writeBest3Level(sink, "combiningClass", combiningMapping);
}
|
||
|
||
//fuzzy three-way compare for unicode property names as per UTS-18:
//whitespace, '-' and '_' are ignored and the comparison is case-insensitive
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
    // characters that UTS-18 loose matching skips entirely
    static bool ignorable(dchar ch)
    {
        return isWhite(ch) || ch == '-' || ch == '_';
    }

    while (true)
    {
        while (!a.empty && ignorable(a.front))
            a.popFront();
        while (!b.empty && ignorable(b.front))
            b.popFront();

        if (a.empty)
            return b.empty ? 0 : -1;
        if (b.empty)
            return 1;

        // names are all in ASCII either way though whitespace might be unicode
        immutable la = std.ascii.toLower(a.front);
        immutable lb = std.ascii.toLower(b.front);
        if (la != lb)
            return la < lb ? -1 : 1;

        a.popFront();
        b.popFront();
    }
}
|
||
|
||
//strict "less than" ordering derived from the fuzzy UTS-18 name comparison
bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
    immutable cmp = comparePropertyName(a, b);
    return cmp < 0;
}
|
||
|
||
//meta helpers to generate and pick the best trie by size & levels
|
||
|
||
/// Searches all 2-level splits of the 21 codepoint bits and emits the trie
/// for `set` with the smallest byte footprint under `name`.
void writeBest2Level(Set)(File sink, string name, Set set)
if (isCodepointSet!Set)
{
    alias Stage1Sizes = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    size_t bestBytes = size_t.max;
    void delegate(File) emit;
    foreach (bits1; Stage1Sizes)
    {
        enum bits2 = 21 - bits1;
        auto candidate = codepointSetTrie!(bits1, bits2)(set);
        if (candidate.bytes < bestBytes)
        {
            bestBytes = candidate.bytes;
            emit = createPrinter!(bits1, bits2)(name, candidate);
        }
    }
    emit(sink);
}
|
||
|
||
/// Searches all 2-level splits for a codepoint->value mapping and emits the
/// smallest resulting trie under `name`; `defValue` fills unmapped codepoints.
void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    alias Stage1Sizes = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    size_t bestBytes = size_t.max;
    void delegate(File) emit;
    auto pairs = zip(map.values, map.keys).array;
    foreach (bits1; Stage1Sizes)
    {
        enum bits2 = 21 - bits1;
        alias CurTrie = codepointTrie!(V, bits1, bits2);
        auto candidate = CurTrie(pairs, defValue);
        if (candidate.bytes < bestBytes)
        {
            bestBytes = candidate.bytes;
            emit = createPrinter!(bits1, bits2)(name, candidate);
        }
    }
    emit(sink);
}
|
||
|
||
//candidate stage-1 bit sizes shared by the writeBestNLevel helpers;
//kept <= 8 so the first-stage index fits in a byte
alias List_1 = TypeTuple!(4, 5, 6, 7, 8);
|
||
|
||
/// Searches 3-level splits of the 21 codepoint bits and emits the smallest
/// trie for `set` under `name`.
///
/// Access speed trumps size: power-of-2 stage widths index faster. E.g. a
/// 9-5-7 split yields 21-7 = 14-bit and 14-5 = 9-bit indexes (doesn't fit a
/// byte), while 8-5-8 yields 21-8 = 13-bit and 13-5 = 8-bit indexes — far
/// faster, and hand-picked 8-5-8 is very close to the best packing anyway.
auto writeBest3Level(Set)(File sink, string name, Set set)
if (isCodepointSet!Set)
{
    void delegate(File) emit;

    alias Stage2Sizes = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    size_t bestBytes = size_t.max;
    foreach (bits1; List_1) // first-stage index must fit in a byte
    {
        foreach (bits2; Stage2Sizes)
        {
            static if (bits1 + bits2 <= 16) // second-stage index fits a ushort
            {
                enum bits3 = 21 - bits2 - bits1;
                auto candidate = codepointSetTrie!(bits1, bits2, bits3)(set);
                if (candidate.bytes < bestBytes)
                {
                    bestBytes = candidate.bytes;
                    emit = createPrinter!(bits1, bits2, bits3)(name, candidate);
                }
            }
        }
    }
    emit(sink);
}
|
||
|
||
/// Searches 3-level splits for a codepoint->value mapping and emits the
/// smallest resulting trie under `name`; `defValue` fills unmapped codepoints.
void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    void delegate(File) emit;
    alias Stage2Sizes = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    size_t bestBytes = size_t.max;
    auto pairs = zip(map.values, map.keys).array;
    foreach (bits1; List_1) // first-stage index must fit in a byte
    foreach (bits2; Stage2Sizes)
    {
        static if (bits1 + bits2 <= 16) // second-stage index fits a ushort
        {
            enum bits3 = 21 - bits2 - bits1;
            auto candidate = codepointTrie!(V, bits1, bits2, bits3)(pairs, defValue);
            if (candidate.bytes < bestBytes)
            {
                bestBytes = candidate.bytes;
                emit = createPrinter!(bits1, bits2, bits3)(name, candidate);
            }
        }
    }
    emit(sink);
}
|
||
|
||
/// Searches 4-level splits of the 21 codepoint bits and emits the smallest
/// trie for `set` under `name`.
void writeBest4Level(Set)(File sink, string name, Set set)
{
    alias InnerSizes = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    size_t bestBytes = size_t.max;
    void delegate(File) emit;
    foreach (bits1; List_1) // first-stage index must fit in a byte
    foreach (bits2; InnerSizes)
    foreach (bits3; InnerSizes)
    {
        static if (bits1 + bits2 + bits3 <= 16)
        {
            enum bits4 = 21 - bits3 - bits2 - bits1;
            auto candidate = codepointSetTrie!(bits1, bits2, bits3, bits4)(set);
            if (candidate.bytes < bestBytes)
            {
                bestBytes = candidate.bytes;
                emit = createPrinter!(bits1, bits2, bits3, bits4)(name, candidate);
            }
        }
    }
    emit(sink);
}
|
||
|
||
/// Returns a delegate that prints `trie` as an `enum <name>TrieEntries =
/// TrieEntry!(T, Params...)` definition, preceded by a byte-size comment.
/// `Params` are the per-level bit widths chosen by the writeBestNLevel helpers.
template createPrinter(Params...)
{
    void delegate(File) createPrinter(T)(string name, T trie)
    {
        return (File sink){
            // typeof(T.init[0]) is the trie's lookup result type, which
            // becomes the TrieEntry element type in the emitted source
            sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s",
                trie.bytes, name, Unqual!(typeof(T.init[0])).stringof);
            foreach (lvl; Params[0..$])
                sink.writef(", %d", lvl);
            sink.write(")(");
            trie.store(sink.lockingTextWriter());
            sink.writeln(");");
        };
    }
}
|