//Written in the D programming language
/**
This tool automatically generates the source code for the Unicode data
structures used by std.uni.
To generate the tables, run it twice; the second, `--min` pass appends the
64-bit variants of the tables to the files written by the first pass:
```
$ rdmd -m32 unicode_table_generator.d
$ rdmd -m64 unicode_table_generator.d --min
```
See the global ``UnicodeDatabaseDirectory`` for the version of the Unicode
database the tables were generated from.
TODO: Support emitting of Turkic casefolding mappings
Authors: Dmitry Olshansky
License: Boost
*/
module std.unicode_table_generator;
// This shouldn't be in the std package, but std.uni declares what we need as package(std).
import std.uni, std.stdio, std.traits, std.typetuple,
std.exception, std.format, std.algorithm, std.typecons,
std.regex, std.range, std.conv, std.getopt;
import std.file : exists;
static import std.ascii, std.string;
//common binary property sets and their aliases
struct PropertyTable
{
CodepointSet[string] table;
string[string] aliases;
}
PropertyTable general;
PropertyTable blocks;
PropertyTable scripts;
PropertyTable hangul;
PropertyTable graphemeBreaks;
PropertyTable emojiData;
//quick NO/MAYBE character sets
CodepointSet[string] normalization;
//auxiliary sets for case mapping
CodepointSet lowerCaseSet, upperCaseSet;
//CodepointSet titleCaseSet; //no sensible version found for isTitlecase
// sets for toLower/toUpper/toTitle
uint[] toLowerTab;
ushort toLowerTabSimpleLen; //start of long mappings
ushort[dchar] toLowerSimpleIndex, toLowerIndex;
uint[] toUpperTab;
ushort toUpperTabSimpleLen; //ditto for Upper
ushort[dchar] toUpperSimpleIndex, toUpperIndex;
uint[] toTitleTab;
ushort toTitleTabSimpleLen; //ditto for Title
ushort[dchar] toTitleSimpleIndex, toTitleIndex;
mixin(mixedCCEntry);
//case folding mapping
SimpleCaseEntry[] simpleTable;
FullCaseEntry[] fullTable;
///canonical combining class
CodepointSet[256] combiningClass;
//same but packaged per dchar
ubyte[dchar] combiningMapping;
//unrolled decompositions
dstring[dchar] canonDecomp;
dstring[dchar] compatDecomp;
//canonical composition tables
dchar[] canonicalyComposableLeft;
dchar[] canonicalyComposableRight;
//canonical composition exclusions
CodepointSet compExclusions;
//property names to discard
string[] blacklist = [];
enum mixedCCEntry = `
struct SimpleCaseEntry
{
uint ch;
ubyte n, bucket;// n - index within bucket, bucket - packed size and case flags
pure nothrow @nogc:
@property ubyte size() const
{
return bucket & 0x3F;
}
@property auto isLower() const
{
return bucket & 0x40;
}
@property auto isUpper() const
{
return bucket & 0x80;
}
}
struct FullCaseEntry
{
dchar[3] seq;
ubyte n, size;// n - index within batch, size - size of batch
ubyte entry_len;
@property auto value() const @trusted pure nothrow @nogc return
{
return seq[0 .. entry_len];
}
}
struct CompEntry
{
dchar rhs, composed;
}
struct UnicodeProperty
{
string name;
ubyte[] compressed;
}
struct TrieEntry(T...)
{
size_t[] offsets;
size_t[] sizes;
size_t[] data;
}
`;
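// A minimal sketch (illustrative only, made-up values) of how the packed
// `bucket` field above is decoded: the low 6 bits hold the bucket size,
// bit 6 flags lowercase membership and bit 7 uppercase membership.
unittest
{
    auto e = SimpleCaseEntry(0x41, 0, cast(ubyte)(2 | 0x80)); // 'A': bucket of 2, uppercase
    assert(e.size == 2);
    assert(e.isUpper && !e.isLower);
}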
auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size)
{
dchar[3] val;
val[0 .. value.length] = value[];
return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length);
}
enum {
UnicodeDatabaseDirectory = "ucd-15/",
caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt",
blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt",
propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt",
graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt",
emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt",
propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt",
corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt",
normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt",
scriptsSrc = UnicodeDatabaseDirectory ~ "Scripts.txt",
hangulSyllableSrc = UnicodeDatabaseDirectory ~ "HangulSyllableType.txt",
unicodeDataSrc = UnicodeDatabaseDirectory ~ "UnicodeData.txt",
compositionExclusionsSrc = UnicodeDatabaseDirectory ~ "CompositionExclusions.txt",
specialCasingSrc = UnicodeDatabaseDirectory ~ "SpecialCasing.txt"
}
enum HeaderComment = `//Written in the D programming language
/**
* License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
*
* Authors: Dmitry Olshansky
*
*/
// !!! DO NOT EDIT !!!
// !!! Did you even read the comment? !!!
// This module is automatically generated from Unicode Character Database files
// https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d
//dfmt off`;
auto toPairs(K, V)(V[K] aa)
{
return aa.values.zip(aa.keys).array;
}
void main(string[] argv)
{
string mode = "w";
bool minimal = false;
getopt(argv, "min", &minimal);
if (minimal)
mode = "a";
if (!exists(UnicodeDatabaseDirectory))
{
writeln("Did you forget to download the Unicode database tables?");
writeln("Looking for them in: ", UnicodeDatabaseDirectory);
return;
}
enum UnicodeTableDirectory = "../std/internal/";
auto baseSink = File(UnicodeTableDirectory ~ "unicode_tables.d", mode);
auto compSink = File(UnicodeTableDirectory ~ "unicode_comp.d", mode);
auto decompSink = File(UnicodeTableDirectory ~ "unicode_decomp.d", mode);
auto normSink = File(UnicodeTableDirectory ~ "unicode_norm.d", mode);
auto graphSink = File(UnicodeTableDirectory ~ "unicode_grapheme.d", mode);
if (!minimal)
{
baseSink.writeln(HeaderComment);
baseSink.writeln("module std.internal.unicode_tables;");
baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
compSink.writeln("module std.internal.unicode_comp;");
compSink.writeln("import std.internal.unicode_tables;");
compSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
decompSink.writeln(HeaderComment);
decompSink.writeln("module std.internal.unicode_decomp;");
decompSink.writeln("import std.internal.unicode_tables;");
decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
normSink.writeln(HeaderComment);
normSink.writeln("module std.internal.unicode_norm;");
normSink.writeln("import std.internal.unicode_tables;");
normSink.writeln("\npackage(std):\n");
graphSink.writeln(HeaderComment);
graphSink.writeln("module std.internal.unicode_grapheme;");
graphSink.writeln("import std.internal.unicode_tables;");
graphSink.writeln("\npackage(std):\n");
}
loadBlocks(blocksSrc, blocks);
loadProperties(propListSrc, general);
loadProperties(corePropSrc, general);
loadProperties(scriptsSrc, scripts);
loadProperties(hangulSyllableSrc, hangul);
loadProperties(graphemeSrc, graphemeBreaks);
loadProperties(emojiDataSrc, emojiData);
loadPropertyAliases(propertyValueAliases);
loadUnicodeData(unicodeDataSrc);
loadSpecialCasing(specialCasingSrc);
loadExclusions(compositionExclusionsSrc);
loadCaseFolding(caseFoldingSrc);
loadNormalization(normalizationPropSrc);
static void writeTableOfSets(File sink, string prefix, PropertyTable tab)
{
sink.writeln();
writeAliasTable(sink, prefix, tab);
}
if (!minimal)
{
writeCaseFolding(baseSink);
writeTableOfSets(baseSink, "uniProps", general);
writeTableOfSets(baseSink, "blocks", blocks);
writeTableOfSets(baseSink, "scripts", scripts);
writeTableOfSets(baseSink, "hangul", hangul);
writeFunctions(baseSink);
}
static void trieProlog(File file)
{
file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof);
}
static void trieEpilog(File file)
{
file.writeln("
}\n");
}
trieProlog(baseSink);
trieProlog(compSink);
trieProlog(decompSink);
trieProlog(graphSink);
trieProlog(normSink);
writeTries(baseSink);
writeNormalizationTries(normSink);
writeGraphemeTries(graphSink);
writeCaseConversion(baseSink);
writeCombining(compSink);
writeDecomposition(decompSink);
writeCompositionTable(compSink);
trieEpilog(decompSink);
trieEpilog(compSink);
trieEpilog(baseSink);
trieEpilog(graphSink);
trieEpilog(normSink);
}
void scanUniData(alias Fn)(string name, Regex!char r)
{
File f = File(name);
foreach (line; f.byLine)
{
auto m = match(line, r);
if (!m.empty)
Fn(m);
}
}
void loadCaseFolding(string f)
{
dchar[dchar] simple;
dstring[dchar] full;
auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);");
scanUniData!((m){
auto s1 = m.captures[1];
auto code = m.captures[2].front;
auto s2 = m.captures[3];
auto left = parse!int(s1, 16);
if (code == 'C')
{
auto right = parse!int(s2, 16);
simple[left] = right;
full[left] = [right];
}
else if (code == 'S')
{
auto right = parse!int(s2, 16);
simple[left] = right;
}
else if (code == 'F')
{
dstring right;
foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g")))
{
right ~= to!int(x[0], 16);
}
full[left] = right.idup;
}
else if (code == 'T')
{
// TODO: Turkic mappings are deliberately ignored for now.
}
})(f, r);
//make some useful sets by hand
lowerCaseSet = general.table["Lowercase"];
upperCaseSet = general.table["Uppercase"];
//titleCaseSet = general.table["Lt"];
foreach (ch; simple.keys())
{
dchar[8] entry;
int size=0;
entry[size++] = ch;
dchar x = simple[ch];
entry[size++] = x;
//simple is many:1 mapping
foreach (key, v; simple)
{
if (v == x && !canFind(entry[], key))
{
entry[size++] = key;
}
}
sort(entry[0 .. size]);
foreach (i, value; entry[0 .. size])
{
auto withFlags = cast(ubyte) size | (value in lowerCaseSet ? 0x40 : 0)
| (value in upperCaseSet ? 0x80 : 0);
simpleTable ~= SimpleCaseEntry(value, cast(ubyte) i,
cast(ubyte) withFlags);
}
}
foreach (ch; full.keys())
{
dstring[8] entry;
int size=0;
entry[size++] = [ch];
auto x = full[ch];
entry[size++] = x;
//full is many:many mapping
//sort single-char versions, and let them come first
foreach (key, v; full)
{
if (v == x && !canFind(entry[], [key]))
{
entry[size++] = [key];
}
}
auto right = partition!(a => a.length == 1)(entry[0 .. size]);
sort(entry[0 .. size - right.length]);
foreach (i, value; entry[0 .. size])
{
fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size);
}
}
}
void loadBlocks(string f, ref PropertyTable target)
{
auto r = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`);
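// Sample Blocks.txt line matched by the regex above, for reference:
//   0000..007F; Basic Latin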
scanUniData!((m){
auto s1 = m.captures[1];
auto s2 = m.captures[2];
auto a1 = parse!uint(s1, 16);
auto a2 = parse!uint(s2, 16);
//@@@BUG 6178 memory corruption with
//target[to!string(m.captures[3])] = CodepointSet(a1, a2+1);
auto set = CodepointSet(a1, a2+1);
target.table[to!string(m.captures[3])] = set;
})(f, r);
}
void loadProperties(string inp, ref PropertyTable target)
{
auto acceptProp = (string name) => countUntil(blacklist, name) < 0 && !name.startsWith("Changes");
auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`);
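// Sample PropList.txt / DerivedCoreProperties.txt lines matched by the regex above:
//   0009..000D    ; White_Space # Cc   [5] <control-0009>..<control-000D>
//   0020          ; White_Space # Zs       SPACE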
string aliasStr;
auto set = CodepointSet.init; //workaround @@@BUG 6178
scanUniData!((m){
auto name = to!string(m.captures[4]);
if (!acceptProp(name))
return;
if (!m.captures[5].empty)
aliasStr = to!string(m.captures[5]);
else if (!m.captures[1].empty)
{
auto sa = m.captures[1];
auto sb = m.captures[2];
uint a = parse!uint(sa, 16);
uint b = parse!uint(sb, 16);
if (name !in target.table)
{
target.table[name] = set;
}
auto p = name in target.table;
p.add(a,b+1); // Unicode lists inclusive ranges [a, b]; we need half-open [a, b)
if (!aliasStr.empty)
{
target.aliases[name] = aliasStr;
aliasStr = "";
}
}
else if (!m.captures[3].empty)
{
auto sx = m.captures[3];
uint x = parse!uint(sx, 16);
if (name !in target.table)
{
target.table[name] = set;
}
auto p = name in target.table;
*p |= x;
if (!aliasStr.empty)
{
target.aliases[name] = aliasStr;
aliasStr = "";
}
}
})(inp, r);
}
void loadPropertyAliases(string inp)
{
auto r = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`);
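// Sample PropertyValueAliases.txt lines matched by the regex above
// (format: property; short name; long name):
//   gc ; Lu      ; Uppercase_Letter
//   sc ; Latn    ; Latin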
scanUniData!((m){
auto type = m.captures[1];
auto target = m.captures[2].idup;
auto label = m.captures[3].idup;
if (target != label)
{
if (type == "blk")
blocks.aliases[target] = label;
else if (type == "gc")
general.aliases[target] = label;
else if (type == "sc")
scripts.aliases[target] = label;
}
})(inp, r);
}
void loadNormalization(string inp)
{
auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`);
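// Sample DerivedNormalizationProps.txt lines matched by the regex above
// (trailing comments abridged):
//   0340..0341    ; NFC_QC; N # ...
//   0300..0304    ; NFC_QC; M # ...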
CodepointSet set; //workaround @@@BUG 6178
scanUniData!((m){
auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]);
if (!m.captures[1].empty)
{
auto sa = m.captures[1];
auto sb = m.captures[2];
uint a = parse!uint(sa, 16);
uint b = parse!uint(sb, 16);
if (name !in normalization)
{
normalization[name] = set;
}
auto p = name in normalization;
p.add(a,b+1);
}
else if (!m.captures[3].empty)
{
auto sx = m.captures[3];
uint x = parse!uint(sx, 16);
if (name !in normalization)
{
normalization[name] = set;
}
auto p = name in normalization;
*p |= x;
}
})(inp, r);
}
void loadUnicodeData(string inp)
{
auto f = File(inp);
dchar characterStart;
bool haveRangeStart, haveRangeEnd;
CodepointSet all;
foreach (line; f.byLine)
{
auto fields = split(line, ";");
//codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class,
//Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping
auto codepoint = fields[0];
auto decomp = fields[5];
auto name = fields[1];
dchar src = parse!uint(codepoint, 16);
if (name.endsWith("First>"))
{
assert(!haveRangeStart);
characterStart = src;
haveRangeStart = true;
continue;
}
else if (name.endsWith("Last>"))
{
haveRangeStart = false;
haveRangeEnd = true;
}
else
{
assert(!haveRangeStart);
haveRangeEnd = false;
characterStart = src;
}
auto generalCategory = fields[2];
auto ccc = parse!uint(fields[3]);
auto upperCasePart = fields[12];
auto lowerCasePart = fields[13];
auto titleCasePart = fields[14];
void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart)
{
if (!casePart.empty)
{
// if you have a range, you shouldn't have any casing provided
assert(!haveRangeEnd);
uint ch = parse!uint(casePart, 16);
chars ~= ch;
assert(chars.length < ushort.max);
index[src] = cast(ushort)(chars.length-1);
}
}
if (generalCategory !in general.table)
general.table[generalCategory.idup] = CodepointSet.init;
all.add(characterStart, src+1);
combiningClass[ccc].add(characterStart, src+1);
general.table[generalCategory].add(characterStart, src+1);
appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart);
appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart);
appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart);
if (!decomp.empty)
{
// none of the ranges in UnicodeData.txt have decompositions provided
assert(!haveRangeEnd);
//stderr.writeln(codepoint, " ---> ", decomp);
dstring dest;
bool compat = false;
if (decomp.startsWith(" "))
decomp = decomp[1..$];
if (decomp.front == '<')
{
decomp = findSplitAfter(decomp, ">")[1];
compat = true;
}
auto vals = split(decomp, " ");
foreach (v; vals)
{
if (!v.empty)
dest ~= cast(dchar) parse!uint(v, 16);
}
if (!compat)
{
assert(dest.length <= 2, "canonical decomposition has more than 2 codepoints?!");
canonDecomp[src] = dest;
}
compatDecomp[src] = dest;
}
}
// compute Cn as all dchar we have not found in UnicodeData.txt
general.table["Cn"] = all.inverted;
general.aliases["Cn"] = "Unassigned";
auto arr = combiningClass[1 .. 255];
foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints
{
auto y = clazz.byCodepoint;
foreach (ch; y)
combiningMapping[ch] = cast(ubyte)(i+1);
}
}
void loadSpecialCasing(string f)
{
{
toLowerTabSimpleLen = cast(ushort) toLowerTab.length;
toUpperTabSimpleLen = cast(ushort) toUpperTab.length;
toTitleTabSimpleLen = cast(ushort) toTitleTab.length;
// duplicate the simple indexes before adding the unconditional SpecialCasing rules
toLowerIndex = toLowerSimpleIndex.dup;
toTitleIndex = toTitleSimpleIndex.dup;
toUpperIndex = toUpperSimpleIndex.dup;
}
auto file = File(f);
auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g");
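// Sample unconditional SpecialCasing.txt line matched by the regex above
// (format: code; lower; title; upper; # comment):
//   00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S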
foreach (line; file.byLine)
{
if (!line.empty && line[0] == '#')
{
if (line.canFind("Conditional Mappings"))
{
// TODO: conditional mappings are needed for languages like Turkish
break;
}
else
continue;
}
auto entries = line.match(r);
if (entries.empty)
continue;
auto pieces = array(entries.map!"a[1]");
dchar ch = parse!uint(pieces[0], 16);
void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece)
{
uint[] mapped = piece.split
.map!(x=>parse!uint(x, 16)).array;
if (mapped.length == 1)
{
table ~= mapped[0];
index[ch] = cast(ushort)(table.length - 1);
}
else
{
ushort idx = cast(ushort) table.length;
table ~= mapped;
table[idx] |= (mapped.length << 24); // upper 8 bits hold the length of the sequence
index[ch] = idx;
}
}
// lower, title, upper
processPiece(toLowerIndex, toLowerTab, pieces[1]);
processPiece(toTitleIndex, toTitleTab, pieces[2]);
processPiece(toUpperIndex, toUpperTab, pieces[3]);
}
}
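// A minimal sketch (made-up data, illustrative only) of decoding the
// length-packed layout produced by processPiece above: for multi-char
// mappings the first slot carries the sequence length in its top byte.
unittest
{
    uint[] tab = [0x0200_0053, 0x73]; // "Ss" packed: length 2 in the upper 8 bits
    auto len = tab[0] >> 24;
    assert(len == 2);
    assert((tab[0] & 0xFF_FFFF) == 'S' && tab[1] == 's');
}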
auto recursivelyDecompose(dstring[dchar] decompTable)
{
//apply recursively:
dstring[dchar] full;
foreach (k, v; decompTable)
{
dstring old, decomp=v;
do
{
old = decomp;
decomp = "";
foreach (dchar ch; old)
{
if (ch in decompTable)
decomp ~= decompTable[ch];
else
decomp ~= ch;
}
}
while (old != decomp);
full[k] = decomp;
}
return full;
}
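// Sketch of the fixed-point expansion above: chained decompositions are
// flattened, e.g. a table with 'a' -> "bc" and 'b' -> "de" yields 'a' -> "dec".
unittest
{
    dstring[dchar] t = ['a': "bc"d, 'b': "de"d];
    auto full = recursivelyDecompose(t);
    assert(full['a'] == "dec"d);
    assert(full['b'] == "de"d);
}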
void loadExclusions(string inp)
{
auto r = regex(`^([0-9A-F]+)`);
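// Sample CompositionExclusions.txt line matched by the regex above:
//   0958    #  DEVANAGARI LETTER QA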
scanUniData!((m){
auto piece = m.captures[1];
uint a = parse!uint(piece, 16);
compExclusions |= cast(dchar) a;
})(inp, r);
}
string charsetString(CodepointSet set, string sep=";\n")
{
auto app = appender!(char[])();
ubyte[] data = compressIntervals(set.byInterval);
assert(CodepointSet(decompressIntervals(data)) == set);
formattedWrite(app, "[%(0x%x, %)];", data);
return cast(string) app.data;
}
string identName(string s)
{
auto app = appender!(char[])();
foreach (c; s)
{
if (c == '-' || c == ' ')
app.put('_');
else
app.put(c);
}
return cast(string) app.data;
}
string uniformName(string s)
{
auto app = appender!(char[])();
foreach (c; s)
{
if (c != '-' && c != ' ' && c != '_')
app.put(std.ascii.toLower(c));
}
return cast(string) app.data;
}
void writeSets(File sink, PropertyTable src)
{
with(sink)
{
writeln("private alias _T = ubyte[];");
foreach (k, v; src.table)
{
writef("_T %s = ", identName(k));
writeln(charsetString(v));
}
}
}
void writeAliasTable(File sink, string prefix, PropertyTable src)
{
with(sink)
{
writeln("struct ", prefix);
writeln("{");
writeln("private alias _U = immutable(UnicodeProperty);");
writeln("@property static _U[] tab() pure { return _tab; }");
writeln("static immutable:");
writeSets(sink, src);
writeln("_U[] _tab = [");
}
string[] lines;
string[] namesOnly;
auto app = appender!(char[])();
auto keys = src.table.keys;
foreach (k; keys)
{
formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k));
lines ~= app.data.idup;
namesOnly ~= uniformName(k);
app.shrinkTo(0);
if (k in src.aliases)
{
formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k));
lines ~= app.data.idup;
namesOnly ~= uniformName(src.aliases[k]);
app.shrinkTo(0);
}
}
static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); }
sort!ucmp(zip(namesOnly, lines));
with(sink)
{
foreach (i, v; lines)
{
write(lines[i]);
}
writeln("];");
writeln("}");
}
}
void writeCaseFolding(File sink)
{
with(sink)
{
write(mixedCCEntry);
writeln("@property immutable(SimpleCaseEntry[]) simpleCaseTable()");
writeln("{");
writeln("alias SCE = SimpleCaseEntry;");
writeln("static immutable SCE[] t = [");
foreach (i, v; simpleTable)
{
writef("SCE(0x%04x, %s, 0x%0x),", v.ch, v.n, v.bucket);
if (i % 4 == 0) writeln();
}
writeln("];");
writeln("return t;");
writeln("}");
static uint maxLen = 0;
writeln("@property immutable(FullCaseEntry[]) fullCaseTable() nothrow @nogc @safe pure");
writeln("{");
writeln("alias FCE = FullCaseEntry;");
writeln("static immutable FCE[] t = [");
foreach (i, v; fullTable)
{
maxLen = max(maxLen, v.entry_len);
if (v.entry_len > 1)
{
assert(v.n >= 1); // meaning that start of bucket is always single char
}
writef("FCE(\"%s\", %s, %s, %s),", v.value, v.n, v.size, v.entry_len);
if (i % 4 == 0) writeln();
}
writeln("];");
writeln("return t;");
writeln("}");
stderr.writefln("MAX FCF len = %d", maxLen);
}
}
void writeTries(File sink)
{
ushort[dchar] simpleIndices;
foreach (i, v; array(map!(x => x.ch)(simpleTable)))
simpleIndices[v] = cast(ushort) i;
ushort[dchar] fullIndices;
foreach (i, v; fullTable)
{
if (v.entry_len == 1)
fullIndices[v.seq[0]] = cast(ushort) i;
}
//these 2 only for verification of Trie code itself
auto st = codepointTrie!(ushort, 12, 9)(
zip(simpleIndices.values, simpleIndices.keys).array, ushort.max);
auto ft = codepointTrie!(ushort, 12, 9)(
zip(fullIndices.values, fullIndices.keys).array, ushort.max);
foreach (k, v; simpleIndices)
{
assert(st[k] == simpleIndices[k]);
}
foreach (k, v; fullIndices)
{
assert(ft[k] == fullIndices[k]);
}
writeBest3Level(sink, "lowerCase", lowerCaseSet);
writeBest3Level(sink, "upperCase", upperCaseSet);
//writeBest3Level("titleCase", titleCaseSet);
writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max);
writeBest3Level(sink, "fullCase", fullIndices, ushort.max);
//common isXXX properties
auto props = general.table;
CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks
CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"];
CodepointSet number = props["Nd"] | props["Nl"] | props["No"];
CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"]
| props["Pc"] | props["Po"] | props["Pi"] | props["Pf"];
CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"];
CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"];
CodepointSet nonCharacter = props["Cn"];
writeBest3Level(sink, "alpha", alpha);
writeBest3Level(sink, "mark", mark);
writeBest3Level(sink, "number", number);
writeBest3Level(sink, "punctuation", punctuation);
writeBest3Level(sink, "symbol", symbol);
writeBest3Level(sink, "graphical", graphical);
writeBest4Level(sink, "nonCharacter", nonCharacter);
}
void writeNormalizationTries(File sink)
{
CodepointSet nfcQC = normalization["NFC_QCN"] | normalization["NFC_QCM"];
CodepointSet nfdQC = normalization["NFD_QCN"];
CodepointSet nfkcQC = normalization["NFKC_QCN"] | normalization["NFKC_QCM"];
CodepointSet nfkdQC = normalization["NFKD_QCN"];
writeBest3Level(sink, "nfcQC", nfcQC);
writeBest3Level(sink, "nfdQC", nfdQC);
writeBest3Level(sink, "nfkcQC", nfkcQC);
writeBest3Level(sink, "nfkdQC", nfkdQC);
}
void writeGraphemeTries(File sink)
{
auto table = graphemeBreaks.table;
foreach(key; table.byKey)
{
writeBest3Level(sink, key, table[key]);
}
sink.writeln();
writeBest3Level
(
sink,
"Extended_Pictographic",
emojiData.table["Extended_Pictographic"]
);
}
void writeCaseConversion(File sink)
{
{
sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen);
sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen);
sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen);
}
{
// these are case mappings that also utilize the unconditional SpecialCasing.txt rules
writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max);
writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max);
writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max);
}
{
// these are all case mapping tables that are 1:1 acquired from UnicodeData.txt
writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max);
writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max);
writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max);
}
with(sink)
{
writeln("@property");
writeln("{");
writeln("private alias _IUA = immutable(uint[]);");
writefln("_IUA toUpperTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toUpperTab);
writefln("_IUA toLowerTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toLowerTab);
writefln("_IUA toTitleTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toTitleTab);
writeln("}");
}
}
void writeDecomposition(File sink)
{
auto fullCanon = recursivelyDecompose(canonDecomp);
auto fullCompat = recursivelyDecompose(compatDecomp);
dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0";
dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0";
stderr.writeln("Canon flattened: ", decompCanonFlat.length);
stderr.writeln("Compat flattened: ", decompCompatFlat.length);
ushort[dchar] mappingCanon;
ushort[dchar] mappingCompat;
//0 serves as the "doesn't decompose" value
foreach (k, v; fullCanon)
{
size_t idx = decompCanonFlat.countUntil(v~"\0");
enforce(idx != 0);
enforce(decompCanonFlat[idx .. idx+v.length] == v);
mappingCanon[k] = cast(ushort) idx;
}
foreach (k, v; fullCompat)
{
size_t idx = decompCompatFlat.countUntil(v~"\0");
enforce(idx != 0);
enforce(decompCompatFlat[idx .. idx+v.length] == v);
mappingCompat[k] = cast(ushort) idx;
}
enforce(decompCanonFlat.length < 2^^16);
enforce(decompCompatFlat.length < 2^^16);
//these two are just a self-test of the Trie template code
auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array;
auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array;
auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0);
auto canonTrie = codepointTrie!(ushort, 12, 9)(canonRange, 0);
import std.string;
foreach (k, v; fullCompat)
{
auto idx = compatTrie[k];
enforce(idx == mappingCompat[k], "failed on compat");
size_t len = decompCompatFlat[idx..$].countUntil(0);
enforce(decompCompatFlat[idx .. idx+len] == v,
format("failed on compat: '%( 0x0%5x %)' not found", v));
}
foreach (k, v; fullCanon)
{
auto idx = canonTrie[k];
enforce(idx == mappingCanon[k], "failed on canon");
size_t len = decompCanonFlat[idx..$].countUntil(0);
enforce(decompCanonFlat[idx .. idx+len] == v,
format("failed on canon: '%( 0x%5x %)' not found", v));
}
writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0);
writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0);
with(sink)
{
writeln("@property");
writeln("{");
writeln("private alias _IDCA = immutable(dchar[]);");
writefln("_IDCA decompCanonTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCanonFlat);
writefln("_IDCA decompCompatTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCompatFlat);
writeln("}");
}
}
void writeFunctions(File sink)
{
auto format = general.table["Cf"];
auto space = general.table["Zs"];
auto control = general.table["Cc"];
auto whitespace = general.table["White_Space"];
//hangul L, V, T
auto hangL = hangul.table["L"];
auto hangV = hangul.table["V"];
auto hangT = hangul.table["T"];
with(sink)
{
writeln(format.toSourceCode("isFormatGen"));
writeln(control.toSourceCode("isControlGen"));
writeln(space.toSourceCode("isSpaceGen"));
writeln(whitespace.toSourceCode("isWhiteGen"));
writeln(hangL.toSourceCode("isHangL"));
writeln(hangV.toSourceCode("isHangV"));
writeln(hangT.toSourceCode("isHangT"));
}
}
void writeCompositionTable(File sink)
{
dchar[dstring] composeTab;
//construct compositions table
foreach (dchar k, dstring v; canonDecomp)
{
if (v.length != 2)//singleton
continue;
if (v[0] in combiningMapping) //non-starter
continue;
if (k in combiningMapping) //combines to non-starter
continue;
if (compExclusions[k]) // non-derivable exclusions
continue;
composeTab[v] = k;
}
Tuple!(dchar, dchar, dchar)[] triples;
foreach (dstring key, dchar val; composeTab)
triples ~= Tuple!(dchar, dchar, dchar)(key[0], key[1], val);
multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples);
//map to the triplets array
ushort[dchar] trimap;
dchar old = triples[0][0];
auto r = triples[];
for (size_t idx = 0;;)
{
ptrdiff_t cnt = countUntil!(x => x[0] != old)(r);
if (cnt == -1)//end of input
cnt = r.length;
assert(idx < 2048);
assert(cnt < 32);
trimap[old] = to!ushort(idx | (cnt << 11));
idx += cnt;
if (idx == triples.length)
break;
old = r[cnt][0];
r = r[cnt..$];
}
auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max);
auto dupletes = triples.map!(x => tuple(x[1], x[2])).array;
foreach (dstring key, dchar val; composeTab)
{
size_t pack = triT[key[0]];
assert(pack != ushort.max);
size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11;
auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]);
assert(!f.empty);
// and carries the right composed value
assert(f.front[1] == val);
}
with(sink)
{
writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;");
write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)(");
triT.store(sink.lockingTextWriter());
writeln(");");
writeln("@property immutable(CompEntry[]) compositionTable() nothrow pure @nogc @safe");
writeln("{");
writeln("alias CE = CompEntry;");
write("static immutable CE[] t = [");
foreach (pair; dupletes)
writef("CE(0x%05x, 0x%05x),", pair[0], pair[1]);
writeln("];");
writeln("return t;");
writeln("}");
}
}
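// Consumer-side sketch (assumed names; this module only emits the data):
// to compose a starter S with a following mark M, look up
//   pack = trie[S]; idx = pack & composeIdxMask; cnt = pack >> composeCntShift;
// then scan compositionTable()[idx .. idx + cnt] for a CompEntry whose
// rhs == M, exactly as the verification loop above does.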
void writeCombining(File sink)
{
auto ct = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs);
foreach (i, clazz; combiningClass[1 .. 255])//0 is a default for all of 1M+ codepoints
{
foreach (ch; clazz.byCodepoint)
assert(ct[ch] == i+1);
}
writeBest3Level(sink, "combiningClass", combiningMapping);
}
//loose (case-, dash- and whitespace-insensitive) compare for Unicode property names as per UTS-18
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
for (;;)
{
while (!a.empty && (isWhite(a.front) || a.front == '-' || a.front =='_'))
{
a.popFront();
}
while (!b.empty && (isWhite(b.front) || b.front == '-' || b.front =='_'))
{
b.popFront();
}
if (a.empty)
return b.empty ? 0 : -1;
if (b.empty)
return 1;
// names are all ASCII either way, though the whitespace might be Unicode
auto ca = std.ascii.toLower(a.front), cb = std.ascii.toLower(b.front);
if (ca > cb)
return 1;
else if ( ca < cb)
return -1;
a.popFront();
b.popFront();
}
}
bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
return comparePropertyName(a, b) < 0;
}
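// Illustrative check of the loose matching rules implemented above:
unittest
{
    assert(comparePropertyName("White_Space", "whitespace") == 0);
    assert(propertyNameLess("Alpha", "White_Space"));
}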
//meta helpers to generate and pick the best trie by size & levels
void writeBest2Level(Set)(File sink, string name, Set set)
if (isCodepointSet!Set)
{
alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
size_t min = size_t.max;
void delegate(File) write;
foreach (lvl_1; List)
{
enum lvl_2 = 21-lvl_1;
auto t = codepointSetTrie!(lvl_1, lvl_2)(set);
if (t.bytes < min)
{
min = t.bytes;
write = createPrinter!(lvl_1, lvl_2)(name, t);
}
}
write(sink);
}
void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
size_t min = size_t.max;
void delegate(File) write;
auto range = zip(map.values, map.keys).array;
foreach (lvl_1; List)
{
enum lvl_2 = 21-lvl_1;
alias codepointTrie!(V, lvl_1, lvl_2) CurTrie;
CurTrie t = CurTrie(range, defValue);
if (t.bytes < min)
{
min = t.bytes;
write = createPrinter!(lvl_1, lvl_2)(name, t);
}
}
write(sink);
}
alias List_1 = TypeTuple!(4, 5, 6, 7, 8);
auto writeBest3Level(Set)(File sink, string name, Set set)
if (isCodepointSet!Set)
{
// access speed trumps size; a power-of-2 breakdown is faster to access
// e.g. 9, 5, 7 is far slower than 8, 5, 8 because of how the bits break down:
// 8-5-8: indexes are 21-8 = 13 bits, then 13-5 = 8 bits, which fits into a byte
// 9-5-7: indexes are 21-7 = 14 bits, then 14-5 = 9 bits, which doesn't fit into a byte (!)
// 8-5-8 is one of the hand-picked layouts that comes very close
// to the best packing
void delegate(File) write;
alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
size_t min = size_t.max;
foreach (lvl_1; List_1)//to have the first stage index fit in a byte
{
foreach (lvl_2; List)
{
static if (lvl_1 + lvl_2 <= 16)//so that 2nd stage fits in ushort
{
enum lvl_3 = 21-lvl_2-lvl_1;
auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3)(set);
if (t.bytes < min)
{
min = t.bytes;
write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
}
}
}
}
write(sink);
}
void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
void delegate(File) write;
alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
size_t min = size_t.max;
auto range = zip(map.values, map.keys).array;
foreach (lvl_1; List_1)//to have the first stage index fit in a byte
{
foreach (lvl_2; List)
{
static if (lvl_1 + lvl_2 <= 16)//so that 2nd stage fits in ushort
{
enum lvl_3 = 21-lvl_2-lvl_1;
auto t = codepointTrie!(V, lvl_1, lvl_2, lvl_3) (range, defValue);
if (t.bytes < min)
{
min = t.bytes;
write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
}
}
}
}
write(sink);
}
void writeBest4Level(Set)(File sink, string name, Set set)
{
alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
size_t min = size_t.max;
void delegate(File) write;
foreach (lvl_1; List_1)//to have the first stage index fit in a byte
{
foreach (lvl_2; List)
{
foreach (lvl_3; List)
{
static if (lvl_1 + lvl_2 + lvl_3 <= 16)
{
enum lvl_4 = 21-lvl_3-lvl_2-lvl_1;
auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3, lvl_4)(set);
if (t.bytes < min)
{
min = t.bytes;
write = createPrinter!(lvl_1, lvl_2, lvl_3, lvl_4)(name, t);
}
}
}
}
}
write(sink);
}
template createPrinter(Params...)
{
void delegate(File) createPrinter(T)(string name, T trie)
{
return (File sink){
sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s",
trie.bytes, name, Unqual!(typeof(T.init[0])).stringof);
foreach (lvl; Params[0..$])
sink.writef(", %d", lvl);
sink.write(")(");
trie.store(sink.lockingTextWriter());
sink.writeln(");");
};
}
}