phobos/std/internal/uni.d

798 lines
23 KiB
D

//Written in the D programming language
/**
Codepoint set and trie for efficient character class manipulation,
currently for internal use only.
*/
module std.internal.uni;
import std.algorithm, std.range, std.uni, std.format;
import std.internal.uni_tab;
import core.bitop;
@safe:
public:
//Wrapper for CTFE: inserts $(D items) into $(D arr) so that the first
//inserted element ends up at index $(D idx).
@trusted void insertInPlaceAlt(T)(ref T[] arr, size_t idx, T[] items...)
{
    if(!__ctfe)
    {
        //at run time simply forward to the library routine
        insertInPlace(arr, idx, items);
        return;
    }
    //during CTFE rebuild the array by concatenation instead
    arr = arr[0..idx] ~ items ~ arr[idx..$];
}
//Wrapper for CTFE: replaces the slice $(D arr[from..to]) with $(D items),
//shrinking or growing $(D arr) by the difference in lengths.
@trusted void replaceInPlaceAlt(T)(ref T[] arr, size_t from, size_t to, T[] items...)
in
{
    assert(to >= from);
}
body
{
    if(__ctfe)
    {
        arr = arr[0..from]~items~arr[to..$];
        return;
    }
    //@@@BUG@@@ in replaceInPlace? symptoms being sudden ZEROs in array
    //replaceInPlace(arr, from, to, items);
    immutable removed = to - from, added = items.length;
    if(added <= removed)
    {
        //the replacement fits into the hole: write it, then close the gap
        immutable shrink = removed - added;
        arr[from .. from+added] = items[];
        if(shrink == 0)
            return;
        //arrayops won't do - aliasing
        immutable newLength = arr.length - shrink;
        foreach(i; from+added .. newLength)
            arr[i] = arr[i+shrink];
        arr.length = newLength;
    }
    else
    {
        //the replacement is longer: make room by moving the tail up, back to front
        immutable grow = added - removed, oldLength = arr.length;
        arr.length = oldLength + grow;
        //arrayops won't do - aliasing
        for(size_t i = oldLength; i > to; i--)
            arr[i-1+grow] = arr[i-1];
        arr[from .. from+added] = items[];
    }
}
//$(D Interval) represents a half-open interval of codepoints: [begin, end).
struct Interval
{
    //begin is the first codepoint covered, end is one past the last one
    uint begin, end;
    ///Create interval containing a single character $(D ch).
    this(dchar ch)
    {
        end = ch+1;
        begin = ch;
    }
    /++
        Create Interval from inclusive range [$(D a),$(D b)].
        Contrary to the internal half-open structure, an inclusive range is chosen
        for the interface. The reason is usability: otherwise the user would be
        forced to type the unwieldy Interval('a','z'+1) all over the place.
    +/
    this(dchar a, dchar b)
    {
        assert(a <= b);
        end = b+1;
        begin = a;
    }
    ///Format as "begin..end", printing the stored (half-open) bounds as numbers.
    @trusted string toString()const
    {
        auto sink = appender!string();
        sink.formattedWrite("%s..%s", begin, end);
        return sink.data;
    }
}
/+
    $(D CodepointSet) is a data structure for manipulating sets
    of Unicode codepoints in an efficient manner.

    The set is stored as a flat, strictly increasing array of interval
    boundaries: $(D ivals[2*k]) is the first codepoint of the k-th interval and
    $(D ivals[2*k+1]) is one past its last codepoint.

    Instances of CodepointSet have half-reference semantics akin to dynamic arrays,
    to obtain a unique copy use $(D dup).
+/
struct CodepointSet
{
    //One past the largest codepoint a set may contain.
    enum uint endOfRange = 0x110000;
    //Interval boundaries: even indexes are starts, odd indexes are exclusive ends.
    uint[] ivals;

    //Add an $(D interval) of codepoints to this set.
    //Intervals that overlap or touch the new one are merged with it.
    @trusted ref CodepointSet add(Interval inter)
    {
        debug(fred_charset) writeln("Inserting ",inter);
        if(ivals.empty)
        {
            insertInPlaceAlt(ivals, 0, inter.begin, inter.end);
            return this;
        }//assumeSorted is @system
        auto svals = assumeSorted(ivals);
        //s / e: number of boundaries strictly below inter.begin / inter.end
        auto s = svals.lowerBound(inter.begin).length;
        auto e = s+svals[s..svals.length].lowerBound(inter.end).length;
        debug(fred_charset) writeln("Indexes: ", s," ", e);
        if(s & 1)
        {
            //begin falls inside (or right at the end of) an existing interval: extend left
            inter.begin = ivals[s-1];
            s ^= 1;
        }
        if(e & 1)
        {
            //end falls inside an existing interval: extend right to its end
            inter.end = ivals[e];
            e += 1;
        }
        else //e % 2 == 0
        {
            //end touches the start of the next interval: swallow it whole
            if(e < ivals.length && inter.end == ivals[e])
            {
                inter.end = ivals[e+1];
                e+=2;
            }
        }
        debug(fred_charset)
            for(size_t i=1;i<ivals.length; i++)
                assert(ivals[i-1] < ivals[i]);
        //boundaries s..e are covered by the merged interval, replace them with it
        replaceInPlaceAlt(ivals, s, e, inter.begin ,inter.end);
        return this;
    }

    //Add a codepoint $(D ch) to this set.
    ref CodepointSet add(dchar ch){ add(Interval(ch)); return this; }

    //Add $(D set) in this set.
    //Algebra: this = this | set.
    ref CodepointSet add(in CodepointSet set)
    {
        debug(fred_charset) writef ("%s || %s --> ", ivals, set.ivals);
        //Interval's two-argument constructor is inclusive, hence the -1
        for(size_t i=0; i<set.ivals.length; i+=2)
            add(Interval(set.ivals[i], set.ivals[i+1]-1));
        debug(fred_charset) writeln(ivals);
        return this;
    }

    //Exclude $(D set) from this set.
    //Algebra: this = this - set.
    //The result is built in a fresh array; the previous storage is only read,
    //so other sets sharing it are left untouched.
    @trusted ref CodepointSet sub(in CodepointSet set)
    {
        if(empty)
        {
            ivals = [];
            return this;
        }
        if(set.empty)
            return this;
        const(uint)[] a = ivals, b = set.ivals;
        uint[] result;
        size_t j = 0; //current interval of b, only ever moves forward
        for(size_t i = 0; i < a.length; i += 2)
        {
            //[lo, hi) is the part of the current interval not yet carved up
            uint lo = a[i];
            immutable hi = a[i+1];
            while(j < b.length && b[j] < hi)
            {
                if(b[j+1] <= lo)
                {
                    j += 2; //lies wholly before the remaining piece
                    continue;
                }
                if(b[j] > lo)
                {
                    //keep the gap in front of the excluded interval
                    result ~= lo;
                    result ~= b[j];
                }
                lo = b[j+1];
                if(lo >= hi)
                    break; //b[j] may reach into the next interval of this set, keep it
                j += 2;
            }
            if(lo < hi)
            {
                result ~= lo;
                result ~= hi;
            }
        }
        ivals = result;
        return this;
    }

    //Make this set a symmetric difference with $(D set).
    //Algebra: this = this ~ set (i.e. (this || set) -- (this && set)).
    @trusted ref CodepointSet symmetricSub(in CodepointSet set)
    {
        auto a = CodepointSet(ivals.dup);
        a.intersect(set);
        this.add(set);
        this.sub(a);
        return this;
    }

    //Intersect this set with $(D set).
    //Algebra: this = this & set
    //Works directly on the half-open boundaries, so intervals that merely
    //touch contribute nothing and no result is extended past either input.
    @trusted ref CodepointSet intersect(in CodepointSet set)
    {
        if(empty || set.empty)
        {
            ivals = [];
            return this;
        }
        const(uint)[] a = ivals, b = set.ivals;
        uint[] result;
        size_t i = 0, j = 0;
        while(i < a.length && j < b.length)
        {
            immutable aEnd = a[i+1], bEnd = b[j+1];
            immutable lo = max(a[i], b[j]);
            immutable hi = min(aEnd, bEnd);
            if(lo < hi)
            {
                result ~= lo;
                result ~= hi;
            }
            //drop whichever interval ends first (both when they end together)
            if(aEnd <= bEnd)
                i += 2;
            if(bEnd <= aEnd)
                j += 2;
        }
        ivals = result;
        return this;
    }

    //this = !this (i.e. [^...] in regex syntax)
    //Toggles the presence of the boundaries 0 and endOfRange, which turns
    //every gap into an interval and vice versa.
    @trusted ref CodepointSet negate()
    {
        if(empty)
        {
            insertInPlaceAlt(ivals, 0, 0u, endOfRange);
            return this;
        }
        if(ivals[0] != 0)
            insertInPlaceAlt(ivals, 0, 0u);
        else
        {
            //drop the leading 0 by shifting everything one slot down
            for(size_t i=1; i<ivals.length; i++)
                ivals[i-1] = ivals[i];//moveAll(ivals[1..$], ivals[0..$-1]);
            ivals = ivals[0..$-1];
            if(!__ctfe)
                assumeSafeAppend(ivals);
        }
        if(ivals[$-1] != endOfRange)
            insertInPlaceAlt(ivals, ivals.length, endOfRange);
        else
        {
            ivals = ivals[0..$-1];
            if(!__ctfe)
                assumeSafeAppend(ivals);
        }
        assert(!(ivals.length & 1));
        return this;
    }

    /+
        Test if ch is present in this set, linear search done in $(BIGOH N) operations
        on number of $(U intervals) in this set.
        In practice linear search outperforms binary search until a certain threshold.
        Unless number of elements is known to be small in advance it's recommended
        to use overloaded indexing operator.
    +/
    bool scanFor(dchar ch) const
    {
        //linear search is in fact faster (given that length is fixed under threshold)
        //the first interval end above ch decides: ch is in iff it is not below that start
        for(size_t i=1; i<ivals.length; i+=2)
            if(ch < ivals[i])
                return ch >= ivals[i-1];
        return false;
    }

    /+
        Test if ch is present in this set, in $(BIGOH LogN) operations on number
        of $(U intervals) in this set.
    +/
    @trusted bool opIndex(dchar ch)const
    {
        //an odd number of boundaries at or below ch means ch is inside an interval
        auto svals = assumeSorted!"a <= b"(ivals);
        auto s = svals.lowerBound(cast(uint)ch).length;
        return s & 1;
    }

    //Test if this set is empty.
    @property bool empty() const pure nothrow { return ivals.empty; }

    //Write out in regular expression style [\uxxxx-\uyyyy...].
    @trusted void printUnicodeSet(R)(R sink) const
        if(isOutputRange!(R, const(char)[]))
    {
        sink("[");
        for(uint i=0;i<ivals.length; i+=2)
            if(ivals[i] + 1 == ivals[i+1])
                formattedWrite(sink, "\\U%08x", ivals[i]);
            else
                formattedWrite(sink, "\\U%08x-\\U%08x", ivals[i], ivals[i+1]-1);
        sink("]");
    }

    //Deep copy this set.
    @property CodepointSet dup() const
    {
        return CodepointSet(ivals.dup);
    }

    //Full covered length from first codepoint to the last one.
    @property uint extent() const
    {
        return ivals.empty ? 0 : ivals[$-1] - ivals[0];
    }

    //Number of codepoints stored in this set.
    @property uint chars() const
    {
        //CTFE workaround
        uint ret;
        for(uint i=0; i<ivals.length; i+=2)
            ret += ivals[i+1] - ivals[i];
        return ret;
    }

    //Troika for built-in hash maps.
    bool opEquals(ref const CodepointSet set) const
    {
        return ivals == set.ivals;
    }

    //ditto
    int opCmp(ref const CodepointSet set) const
    {
        return cmp(cast(const(uint)[])ivals, cast(const(uint)[])set.ivals);
    }

    //ditto
    //Mixes the number of boundaries with the first and the last boundary;
    //equal sets share all three, so the hash stays consistent with opEquals.
    size_t toHash() const pure nothrow @safe
    {
        size_t hash = 5381+7*ivals.length;
        if(!empty)
            hash += 31*ivals[0] + 17*ivals[$-1];
        return hash;
    }

    //Forward range over every single codepoint of a set, in increasing order.
    struct ByCodepoint
    {
        //boundaries of the intervals not yet exhausted
        const(uint)[] ivals;
        //current codepoint, always inside the first remaining interval
        uint j;

        this(in CodepointSet set)
        {
            ivals = set.ivals;
            if(!empty)
                j = ivals[0];
        }

        @property bool empty() const { return ivals.empty; }

        @property uint front() const
        {
            assert(!empty);
            return j;
        }

        void popFront()
        {
            assert(!empty);
            //stepping onto the exclusive end moves on to the next interval
            if(++j >= ivals[1])
            {
                ivals = ivals[2..$];
                if(!empty)
                    j = ivals[0];
            }
        }

        @property auto ref save() const { return this; }
    }
    static assert(isForwardRange!ByCodepoint);

    //Forward range of all codepoints in this set.
    auto opSlice() const
    {
        return ByCodepoint(this);
    }

    //Random access range of intervals in this set.
    @trusted @property auto byInterval() const
    {
        //reinterpret consecutive boundary pairs as Interval values
        const(uint)[] hack = ivals;
        return cast(const(Interval)[])hack;
    }

    //eaten alive by @@@BUG@@@s
    /+invariant()
    {
        assert(ivals.length % 2 == 0);
        for(size_t i=1; i<ivals.length; i++)
            assert(ivals[i-1] < ivals[i]);
    }+/
}
/*
$(D CodepointTrie) is 1-level $(LUCKY Trie) of codepoints.
Primary use case is to convert a previously obtained CodepointSet
in order to speed up subsequent element lookup.
---
auto input = ...;
CodepointSet set;
set.add(unicodeAlphabetic).add('$').add('#');
auto lookup = CodepointTrie!8(set);
int count;
foreach(dchar ch; input)
if(lookup[ch])
count++;
---
$(D prefixBits) parameter controls the number of bits used to index the last level
and is provided for tuning to a specific application.
A default parameter of 8 works best in common cases though.
*/
struct CodepointTrie(uint prefixBits)
if(prefixBits > 4)
{
//unitBits is log2 of size_t.sizeof: 2 for 4-byte words, 3 for 8-byte words
static if(size_t.sizeof == 4)
enum unitBits = 2;
else static if(size_t.sizeof == 8)
enum unitBits = 3;
else
static assert(0);
//prefixSize     - number of codepoints covered by one page
//prefixWordSize - number of size_t words stored per page (prefixSize bytes in total)
//bitTestShift   - log2 of the number of bits one stored page occupies
//                 (prefixSize bytes * 8), used to turn a page number into a bit offset
//prefixMask     - extracts the position of a codepoint within its page
enum prefixWordBits = prefixBits-unitBits, prefixSize=1<<prefixBits,
prefixWordSize = 1<<(prefixWordBits),
bitTestShift = prefixBits+3, prefixMask = (1<<prefixBits)-1;
//distinct pages stored back to back, prefixWordSize words each
size_t[] data;
//for every run of prefixSize codepoints: number of its page inside data
ushort[] indexes;
//when set, lookups return the inverse of what the stored bits say
bool negative;
//debugging tool: prints the first prefixSize bits of block as 0/1 digits
@trusted debug(fred_trie) static void printBlock(in size_t[] block)
{//@@@BUG@@@ write is @system
for(uint k=0; k<prefixSize; k++)
{
if((k & 15) == 0)
write(" ");
if((k & 63) == 0)
writeln();
writef("%d", bt(block.ptr, k) != 0);
}
writeln();
}
//debugging tool: prints the index table followed by every stored page
@trusted debug(fred_trie) void desc() const
{//@@@BUG@@@ writeln is @system
writeln(indexes);
writeln("***Blocks***");
for(uint i=0; i<data.length; i+=prefixWordSize)
{
printBlock(data[i .. i+prefixWordSize]);
writeln("---");
}
}
public:
//Create a trie from CodepointSet $(D s).
//An empty set yields an empty trie (no pages, no indexes).
@trusted this(in CodepointSet s)
{
if(s.empty)
return;
//sets with more than 500_000 codepoints are stored negated, with the negative flag raised
const(CodepointSet) set = s.chars > 500_000 ? (negative=true, s.dup.negate()) : s;
//index into ivals of the start of the interval currently being processed;
//uint.max once all intervals are used up
uint bound = 0;//set up on first iteration
//page number of an all-zero page, ushort.max while none has been stored
ushort emptyBlock = ushort.max;
auto ivals = set.ivals;
//scratch page, filled for one run of prefixSize codepoints at a time
size_t[prefixWordSize] page;
for(uint i=0; i<CodepointSet.endOfRange; i+= prefixSize)
{
//build the page bit by bit if this run extends past the start of the current
//interval, or if no all-zero page is known yet
if(i+prefixSize > ivals[bound] || emptyBlock == ushort.max)//avoid empty blocks if we have one already
{
//true while no bit has been set in the current page
bool flag = true;
L_Prefix_Loop:
for(uint j=0; j<prefixSize; j++)
{
//step over intervals that end at or before the codepoint i+j
while(i+j >= ivals[bound+1])
{
bound += 2;
if(bound == ivals.length)
{
bound = uint.max;
if(flag)//not a single one set so far
return;
//no more bits in the whole set, but need to add the last bucket
break L_Prefix_Loop;
}
}
//codepoint i+j lies inside the current interval: set its bit
if(i+j >= ivals[bound])
{
//mask selects the bit position inside one size_t word
enum mask = (1<<(3+unitBits))-1;
page[j>>(3+unitBits)]
|= cast(size_t)1<<(j & mask);
flag = false;
}
}
debug(fred_trie)
{
printBlock(page);
}
//linear scan for an identical page that is already stored
uint npos;
for(npos=0;npos<data.length;npos+=prefixWordSize)
if(equal(page[], data[npos .. npos+prefixWordSize]))
{
indexes ~= cast(ushort)(npos>>prefixWordBits);
break;
}
//no match: append the page and remember it if it is all-zero
if(npos == data.length)
{
indexes ~= cast(ushort)(data.length>>prefixWordBits);
data ~= page;
if(flag)
emptyBlock = indexes[$-1];
}
//intervals exhausted: codepoints past this run are not indexed at all
if(bound == uint.max)
break;
page[] = 0;
}
else//fast reroute whole blocks to an empty one
{
indexes ~= emptyBlock;
}
}
}
//Test if contains $(D ch).
//Codepoints beyond the indexed part have no stored bit and yield $(D negative).
@trusted bool opIndex(dchar ch) const
{
assert(ch < 0x110000);
uint ind = ch>>prefixBits;
if(ind >= indexes.length)
return negative;
//bit offset = page number * bits per stored page + position inside the page
return cast(bool)bt(data.ptr, (indexes[ind]<<bitTestShift)+(ch&prefixMask)) ^ negative;
//disabled byte-wise alternative to the bt call above
version(none)//is in fact slower (on AMD Phenom)
{
auto ptr = cast(const(ubyte)*)data.ptr;
return ((ptr[(cast(size_t)indexes[ind]<<prefixBits) + ((ch&prefixMask)>>3)]>>(ch&7))&1) ^ negative;
}
}
//invert trie (trick internal for regular expressions, has aliasing problem)
//the copy shares the data and indexes arrays with this trie, only the flag differs
@trusted private auto negated() const
{
CodepointTrie t = cast(CodepointTrie)this;//shallow copy, need to subvert type system?
t.negative = !negative;
return t;
}
}
//Smoke test: a trie built from a union of several property sets
//contains 'a' and does not contain the space character.
unittest
{
    CodepointSet wordSet;
    wordSet.add(unicodeAlphabetic).add(unicodeMn).add(unicodeMc);
    wordSet.add(unicodeMe).add(unicodeNd).add(unicodePc);
    auto trie = CodepointTrie!8(wordSet);
    assert(trie['a']);
    assert(!trie[' ']);
}
//Adding a whole set must leave the boundary array strictly increasing.
unittest
{
    CodepointSet set;
    set.add(unicodeAlphabetic);
    foreach(i; 1 .. set.ivals.length)
        assert(set.ivals[i-1] < set.ivals[i]);
}
//Randomized checks of trie lookup, negation and codepoint iteration;
//the seed is included in every failure message so a run can be reproduced.
@system unittest
{
    import std.conv, std.random, std.range;
    immutable seed = unpredictableSeed;
    auto rnd = Random(seed);
    auto testCases = randomSample(unicodeProperties, 10, rnd);
    // test trie using ~2000 codepoints
    foreach(up; testCases.save)
    {
        //calls dg with up to 10 random codepoints from each of 10 randomly picked intervals
        void probe(in CodepointSet set, scope void delegate(uint ch) dg)
        {
            foreach (round; 0 .. 10)
            {
                immutable pick = 2 * uniform(0, set.ivals.length / 2, rnd);
                immutable lo = set.ivals[pick], hi = set.ivals[pick+1];
                immutable draws = min(10, hi - lo);
                foreach (n; 0 .. draws)
                    dg(uniform(lo, hi, rnd));
            }
        }
        auto trie = CodepointTrie!8(up.set);
        auto neg = up.set.dup.negate();
        //members of the set must be found ...
        probe(up.set, (uint ch) {
            assert(trie[ch], text("on ch == ", ch, " seed was ", seed));
        });
        //... and members of the complement must not
        probe(neg, (uint ch) {
            assert(!trie[ch], text("negative on ch == ", ch, " seed was ", seed));
        });
    }
    // test that negate is reversible
    foreach(up; testCases.save)
    {
        auto twice = up.set.dup.negate().negate();
        assert(equal(up.set.ivals, twice.ivals));
    }
    // test codepoint forward iterator
    auto set = testCases.front.set;
    auto rng = set[];
    for (size_t k = 0; k + 1 < set.ivals.length; k += 2)
    {
        immutable lo = set.ivals[k], hi = set.ivals[k+1];
        foreach (val; lo .. hi)
        {
            assert(rng.front == val, text("on val == ", val, " seed was ", seed));
            rng.popFront();
        }
    }
}
//Fuzzy compare for unicode property names as per UTS-18:
//white space, '-' and '_' are skipped and letter case is ignored.
//Returns -1, 0 or 1 when $(D a) sorts before, equal to or after $(D b).
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
    while(true)
    {
        //drop ignorable characters at the head of each name
        for(; !a.empty; a.popFront())
        {
            immutable c = a.front;
            if(!isWhite(c) && c != '-' && c != '_')
                break;
        }
        for(; !b.empty; b.popFront())
        {
            immutable c = b.front;
            if(!isWhite(c) && c != '-' && c != '_')
                break;
        }
        //a name that runs out first sorts before the longer one
        if(a.empty || b.empty)
            return a.empty ? (b.empty ? 0 : -1) : 1;
        immutable la = toLower(a.front), lb = toLower(b.front);
        if(la != lb)
            return la > lb ? 1 : -1;
        a.popFront();
        b.popFront();
    }
}
//Strict "less than" predicate on top of $(D comparePropertyName)
//(workaround for internal tools)
public bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
//Table-driven checks of comparePropertyName.
unittest
{
    static struct Case
    {
        string a, b;
        int expected;
    }
    foreach(c; [
        Case("test", "test", 0),
        Case("Al chemical Symbols", "Alphabetic Presentation Forms", -1),
        Case("Basic Latin", "basic-LaTin", 0),
    ])
        assert(comparePropertyName(c.a, c.b) == c.expected);
}
//Gets array of all common case equivalents of the given codepoint
//(fills the provided array & returns a slice of it).
//$(D range) acts as a work list: every codepoint stored in it, starting with
//$(D ch) itself, is looked up in turn and may contribute further equivalents.
@trusted dchar[] getCommonCasing(dchar ch, dchar[] range)
{
    CommonCaseEntry probe;
    size_t filled = 1;
    range[0] = ch;
    for(size_t next = 0; next < filled; )
    {
        ch = range[next++];
        probe.start = ch;
        probe.end = ch;
        //skip table entries whose end is not above ch ...
        immutable skip = assumeSorted!"a.end <= b.end"(commonCaseTable)
            .lowerBound(probe).length;
        immutable(CommonCaseEntry)[] candidates = commonCaseTable[skip..$];
        //... and of the rest keep only those starting at or below ch
        immutable keep = assumeSorted!"a.start <= b.start"(candidates)
            .lowerBound(probe).length;
        foreach(v; candidates[0..keep])
        {
            if(ch >= v.end)
                continue;
            if(v.xor)
            {
                auto t = ch ^ v.delta;
                if(!canFind(range[0..filled], t))
                    range[filled++] = t;
            }
            else
            {
                auto t = v.neg ? ch - v.delta : ch + v.delta;
                if(!canFind(range[0..filled], t))
                    range[filled++] = t;
            }
        }
    }
    return range[0..filled];
}
//Checks getCommonCasing on three codepoints.
unittest
{
    dchar[6] buf;
    //these values give 100% code coverage for getCommonCasing
    auto pair = getCommonCasing(0x01BC, buf[]);
    assert(equal(pair, [0x01bc, 0x01bd]));
    auto quad = getCommonCasing(0x03B9, buf[]);
    assert(equal(quad, [0x03b9, 0x0399, 0x1fbe, 0x0345]));
    auto astral = getCommonCasing(0x10402, buf[]);
    assert(equal(astral, [0x10402, 0x1042a]));
}
//Builds a new set from $(D set): an interval that no entry of commonCaseTable
//can touch is copied as is, for any other interval every codepoint is added
//together with everything $(D getCommonCasing) returns for it.
@trusted CodepointSet caseEnclose(in CodepointSet set)
{
    CodepointSet enclosed;
    for(size_t pos = 0; pos < set.ivals.length; pos += 2)
    {
        //probe spans the interval: end holds its first codepoint, start its last
        CommonCaseEntry probe;
        probe.start = set.ivals[pos+1]-1;
        probe.end = set.ivals[pos];
        immutable skip = assumeSorted!"a.end <= b.end"(commonCaseTable)
            .lowerBound(probe).length;
        immutable(CommonCaseEntry)[] candidates = commonCaseTable[skip..$];
        immutable keep = assumeSorted!"a.start <= b.start"(candidates)
            .lowerBound(probe).length;
        if(candidates[0..keep].empty)
        {
            //no casing entry in reach: take the interval over unchanged
            enclosed.add(Interval(probe.end, probe.start));
            continue;
        }
        dchar[6] buf;
        for(uint ch = set.ivals[pos]; ch < set.ivals[pos+1]; ch++)
            foreach(v; getCommonCasing(ch, buf[]))
                enclosed.add(v);
    }
    return enclosed;
}