//Written in the D programming language
/**
    Codepoint set and trie for efficient character class manipulation,
    currently for internal use only.
*/
module std.internal.uni;

import std.algorithm, std.range, std.uni, std.format;
import std.internal.uni_tab;
import core.bitop;

@safe:
public:

//wrappers for CTFE
//Inserts $(D items) into $(D arr) in front of position $(D idx).
//During CTFE the result is assembled by concatenation, at run time
//the call is forwarded to insertInPlace.
@trusted void insertInPlaceAlt(T)(ref T[] arr, size_t idx, T[] items...)
{
    if(!__ctfe)
    {
        insertInPlace(arr, idx, items);
        return;
    }
    arr = arr[0..idx] ~ items ~ arr[idx..$];
}

//ditto
//Replaces the slice arr[from..to] with $(D items), growing or shrinking
//$(D arr) as needed. Requires to >= from (checked by the in-contract).
@trusted void replaceInPlaceAlt(T)(ref T[] arr, size_t from, size_t to, T[] items...)
in
{
    assert(to >= from);
}
body
{
    if(__ctfe)
    {
        arr = arr[0..from]~items~arr[to..$];
        return;
    }
    //@@@BUG@@@ in replaceInPlace? symptoms being sudden ZEROs in array
    //replaceInPlace(arr, from, to, items);
    immutable size_t gap = to - from;
    immutable size_t count = items.length;
    if(gap < count)
    {
        //more items than room: enlarge the array and move the tail right
        immutable size_t extra = count - gap;
        immutable size_t oldLen = arr.length;
        arr.length += extra;
        //arrayops won't do - aliasing
        //walk the old tail back to front; when to == 0 the index wraps
        //past zero, which is exactly the value of to-1 in that case
        for(size_t pos = oldLen - 1; pos != to-1; pos--)
            arr[pos+extra] = arr[pos];
        arr[from .. from+count] = items[0..$];
    }
    else
    {
        //items fit into the window: copy them, then close the leftover gap
        immutable size_t surplus = gap - count;
        arr[from .. from+count] = items[0..$];
        if(surplus)
        {
            //arrayops won't do - aliasing
            immutable size_t newLen = arr.length - surplus;
            for(size_t pos = from+count; pos < newLen; pos++)
                arr[pos] = arr[pos+surplus];
            arr.length = newLen;
        }
    }
}

//$(D Interval) represents an interval of codepoints: [a,b).
struct Interval
{
    uint begin, end;

    ///Create interval containing a single character $(D ch).
    this(dchar ch)
    {
        begin = ch;
        end = ch+1;
    }

    /++
        Create Interval from inclusive range [$(D a),$(D b)]. Contrary to
        internal structure, inclusive is chosen for interface.
        The reason for this is usability, e.g. the alternative would force the user
        to type the unwieldy Interval('a','z'+1) all over the place.
+/
    this(dchar a, dchar b)
    {
        assert(a <= b);//an inverted range is a caller error
        begin = a;
        end = b+1;//the stored end is exclusive, hence the +1
    }

    ///Formats the two stored bounds as "begin..end".
    @trusted string toString()const
    {
        auto s = appender!string();
        formattedWrite(s,"%s..%s", begin, end);
        return s.data;
    }
}

/+
    $(D CodepointSet) is a data structure for manipulating sets
    of Unicode codepoints in an efficient manner.
    Instances of CodepointSet have half-reference semantics akin to dynamic arrays,
    to obtain a unique copy use $(D dup).
+/
struct CodepointSet
{
    enum uint endOfRange = 0x110000;
    //flat list of interval bounds; add() below stores them as (begin, end) pairs
    uint[] ivals;

    //Add an $(D interval) of codepoints to this set.
    //Returns a reference to this set, so calls can be chained.
    @trusted ref CodepointSet add(Interval inter)
    {
        debug(fred_charset) writeln("Inserting ",inter);
        if(ivals.empty)
        {
            //first interval: just store its two bounds
            insertInPlaceAlt(ivals, 0, inter.begin, inter.end);
            return this;
        }//assumeSorted is @system
        auto svals = assumeSorted(ivals);
        //s: number of stored bounds below inter.begin
        //e: s plus the number of the remaining bounds below inter.end
        auto s = svals.lowerBound(inter.begin).length;
        auto e = s+svals[s..svals.length].lowerBound(inter.end).length;
        debug(fred_charset) writeln("Indexes: ", s," ", e);
        if(s & 1)
        {
            //odd position: widen the new interval back to the bound stored just before it
            inter.begin = ivals[s-1];
            s ^= 1;
        }
        if(e & 1)
        {
            //odd position: widen the new interval forward to the bound stored at e
            inter.end = ivals[e];
            e += 1;
        }
        else //e % 2 == 0
        {
            if(e < ivals.length && inter.end == ivals[e])
            {
                //the new interval ends exactly on bound e: take over the bound after it as the end
                inter.end = ivals[e+1];
                e+=2;
            }
        }
        //NOTE(review): source text appears to be missing inside the next line (everything
        //between a '<' and a following '>' seems to have been dropped). The remainder of
        //add(Interval) and the beginning of the routine that follows are not visible, so the
        //line is kept exactly as found. Judging by the call this.sub(a) elsewhere in the file,
        //the surviving tail presumably belongs to sub - confirm against the pristine source.
        debug(fred_charset) for(size_t i=1;i ", ivals, set.ivals); for(size_t i=0; i b.front.end)
            {
                b.popFront();
            }
            else //there is an intersection
            {
                if(a.front.begin < b.front.begin)
                {
                    //keep the part of a that lies in front of b; the -1 matches the
                    //inclusive two-argument Interval constructor
                    result ~= Interval(a.front.begin, b.front.begin-1);
                    if(a.front.end < b.front.end)
                    {
                        a.popFront();
                    }
                    else if(a.front.end > b.front.end)
                    {
                        //adjust a in place
                        a.front.begin = b.front.end;
                        if(a.front.begin >= a.front.end)
                            a.popFront();
                        b.popFront();
                    }
                    else //==
                    {
                        a.popFront();
                        b.popFront();
                    }
                }
                else //a.front.begin > b.front.begin
                {//adjust in place
                    if(a.front.end < b.front.end)
                    {
                        a.popFront();
                    }
                    else
                    {
                        a.front.begin = b.front.end;
                        if(a.front.begin >= a.front.end)
                            a.popFront();
                        b.popFront();
                    }
                }
            }
        }
        result ~= a;//+ leftover of original
        //reinterpret the collected intervals as the flat bounds array
        ivals = cast(uint[])result;
        return this;
    }

    //Make this set a symmetric difference with $(D set).
    //Algebra: this = this ~ set (i.e. 
//(this || set) -- (this && set)).
    @trusted ref CodepointSet symmetricSub(in CodepointSet set)
    {
        //a = copy of this set narrowed down to what it shares with set
        auto a = CodepointSet(ivals.dup);
        a.intersect(set);
        //merge set in first, then take the shared part back out
        this.add(set);
        this.sub(a);
        return this;
    }

    //Intersect this set with $(D set).
    //Algebra: this = this & set
    //Returns a reference to this set.
    @trusted ref CodepointSet intersect(in CodepointSet set)
    {
        if(empty || set.empty)
        {
            ivals = [];
            return this;
        }
        Interval[] intersection;
        //view both flat bounds arrays as arrays of Interval and walk them in lockstep
        auto a = cast(const(Interval)[])ivals;
        auto b = cast(const(Interval)[])set.ivals;
        //NOTE(review): the two-argument Interval constructor visible in this file stores
        //end = b+1 (inclusive input), and the surviving part of sub passes
        //b.front.begin-1 to compensate, while the calls below pass an already
        //exclusive end unchanged - confirm whether the intervals built here come out
        //one codepoint too long.
        for(;;)
        {
            if(a.front.end < b.front.begin)
            {
                a.popFront();
                if(a.empty)
                    break;
            }
            else if(a.front.begin > b.front.end)
            {
                b.popFront();
                if(b.empty)
                    break;
            }
            else //there is an intersection
            {
                //in every case the overlap starts at the larger begin and stops at the
                //smaller end; whichever interval ends first is the one that gets popped
                if(a.front.end < b.front.end)
                {
                    intersection ~= Interval(max(a.front.begin, b.front.begin), a.front.end);
                    a.popFront();
                    if(a.empty)
                        break;
                }
                else if(a.front.end > b.front.end)
                {
                    intersection ~= Interval(max(a.front.begin, b.front.begin), b.front.end);
                    b.popFront();
                    if(b.empty)
                        break;
                }
                else //==
                {
                    intersection ~= Interval(max(a.front.begin, b.front.begin), a.front.end);
                    a.popFront();
                    b.popFront();
                    if(a.empty || b.empty)
                        break;
                }
            }
        }
        ivals = cast(uint[])intersection;
        return this;
    }

    //this = !this (i.e. [^...] in regex syntax)
    @trusted ref CodepointSet negate()
    {
        if(empty)
        {
            //complement of the empty set: a single pair of bounds, 0 and endOfRange
            insertInPlaceAlt(ivals, 0, 0u, endOfRange);
            return this;
        }
        if(ivals[0] != 0)
            insertInPlaceAlt(ivals, 0, 0u);
        else
        {
            //NOTE(review): source text appears to be missing inside the next line (everything
            //between a '<' and a following '>' seems to have been dropped). The rest of
            //negate() and the start of whatever routine ends with "return false;" are not
            //visible, so the line is kept exactly as found.
            for(size_t i=1; i= ivals[i-1]; return false; }

    /+
        Test if ch is present in this set, in $(BIGOH LogN) operations on number
        of $(U intervals) in this set.
    +/
    @trusted bool opIndex(dchar ch)const
    {
        //with the "a <= b" predicate lowerBound yields the stored bounds that are <= ch;
        //the parity of their count is the answer
        auto svals = assumeSorted!"a <= b"(ivals);
        auto s = svals.lowerBound(cast(uint)ch).length;
        return s & 1;
    }

    //Test if this set is empty.
    @property bool empty() const pure nothrow
    {
        return ivals.empty;
    }

    //Write out in regular expression style [\uxxxx-\uyyyy...].
@trusted void printUnicodeSet(R)(R sink) const
        if(isOutputRange!(R, const(char)[]))
    {
        sink("[");
        //NOTE(review): source text appears to be missing inside the next line (everything
        //between a '<' and a following '>' seems to have been dropped). The rest of
        //printUnicodeSet and the head of the nested range type are not visible; the
        //braces that follow presumably close a popFront-like member and then the range
        //struct itself (named ByCodepoint further down) - confirm against the pristine source.
        for(uint i=0;i= ivals[1])
            {
                //drop the first two bounds; if any remain, j is reloaded from the new first one
                ivals = ivals[2..$];
                if(!empty)
                    j = ivals[0];
            }
        }
        //returns the range itself
        @property auto ref save() const { return this; }
    }
    static assert(isForwardRange!ByCodepoint);

    //Forward range of all codepoints in this set.
    auto opSlice() const
    {
        return ByCodepoint(this);
    }

    //Random access range of intervals in this set.
    @trusted @property auto byInterval() const
    {
        //go through a const(uint)[] first, then reinterpret the bounds as Intervals
        const(uint)[] hack = ivals;
        return cast(const(Interval)[])hack;
    }
    //eaten alive by @@@BUG@@@s
    //NOTE(review): the block comment opened on the next line has no visible terminator, and
    //source text appears to be missing inside the line after it. The end of the commented-out
    //invariant, the end of CodepointSet and the head of the trie type are not visible.
    /+invariant()
    {
        assert(ivals.length % 2 == 0);
        for(size_t i=1; i 4)
{
    //unitBits is 2 when size_t is 4 bytes and 3 when it is 8 bytes; anything else is rejected
    static if(size_t.sizeof == 4)
        enum unitBits = 2;
    else static if(size_t.sizeof == 8)
        enum unitBits = 3;
    else
        static assert(0);
    //NOTE(review): source text appears to be missing inside the next line; the remaining
    //constants, the fields and the head of the routine that follows are not visible. The
    //surviving tail is a conditional that, when taken, sets negative and continues with a
    //negated copy of s, otherwise with s itself.
    enum prefixWordBits = prefixBits-unitBits, prefixSize=1< 500_000 ? (negative=true, s.dup.negate()) : s;
        uint bound = 0;//set up on first iteration
        //ushort.max marks "no all-zero page recorded yet"
        ushort emptyBlock = ushort.max;
        auto ivals = set.ivals;
        size_t[prefixWordSize] page;
        //NOTE(review): source text appears to be missing inside the next line (loop header
        //and the start of the condition).
        for(uint i=0; i ivals[bound] || emptyBlock == ushort.max)//avoid empty blocks if we have one already
            {
                //flag stays true for as long as no bit has been set in the current page
                bool flag = true;
            L_Prefix_Loop:
                //NOTE(review): source text appears to be missing inside the next line.
                for(uint j=0; j= ivals[bound+1])
                    {
                        bound += 2;
                        if(bound == ivals.length)
                        {
                            bound = uint.max;
                            if(flag)//not a single one set so far
                                return;
                            //no more bits in the whole set, but need to add the last bucket
                            break L_Prefix_Loop;
                        }
                    }
                    if(i+j >= ivals[bound])
                    {
                        //set bit j of the page: word index is j>>(3+unitBits),
                        //bit position inside the word is j & mask
                        enum mask = (1<<(3+unitBits))-1;
                        page[j>>(3+unitBits)] |= cast(size_t)1<<(j & mask);
                        flag = false;
                    }
                }
                debug(fred_trie)
                {
                    printBlock(page);
                }
                uint npos;
                //NOTE(review): source text appears to be missing inside the next line; what is
                //left records npos>>prefixWordBits as an index and leaves the loop.
                for(npos=0;npos>prefixWordBits);
                        break;
                    }
                if(npos == data.length)
                {
                    //npos ran to the end of data: append the page and record its index
                    indexes ~= cast(ushort)(data.length>>prefixWordBits);
                    data ~= page;
                    //a page without any bit set becomes the shared empty block
                    if(flag)
                        emptyBlock = indexes[$-1];
                }
                if(bound == uint.max)
                    break;
                page[] = 0;
            }
            else//fast reroute whole blocks to an empty one
            {
                indexes ~= emptyBlock;
            }
        }
    }

    //Test if contains $(D ch).
@trusted bool opIndex(dchar ch) const
    {
        assert(ch < 0x110000);
        //the high bits of ch select an entry in the index table
        uint ind = ch>>prefixBits;
        //past the end of the index table the answer is just the negation flag
        if(ind >= indexes.length)
            return negative;
        //NOTE(review): source text appears to be missing inside the next line (everything
        //between a '<' and a following '>' seems to have been dropped), so the bit lookup
        //itself is not fully visible; the line is kept exactly as found. The extra closing
        //brace after it presumably ends a block that was opened in the lost text.
        return cast(bool)bt(data.ptr, (indexes[ind]<>3)]>>(ch&7))&1) ^ negative;
        }
    }

    //invert trie (trick internal for regular expressions, has aliasing problem)
    //Returns a copy of this trie with the negative flag flipped.
    @trusted private auto negated() const
    {
        CodepointTrie t = cast(CodepointTrie)this;//shallow copy, need to subvert type system?
        t.negative = !negative;
        return t;
    }
}

//smoke test: a trie built from a "word character" set contains 'a' but not ' '
unittest
{
    auto wordSet = CodepointSet.init.add(unicodeAlphabetic).add(unicodeMn).add(unicodeMc)
        .add(unicodeMe).add(unicodeNd).add(unicodePc);
    auto t = CodepointTrie!8(wordSet);
    assert(t['a']);
    assert(!t[' ']);
}

unittest
{
    CodepointSet set;
    set.add(unicodeAlphabetic);
    //NOTE(review): source text appears to be missing inside the next line (everything
    //between a '<' and a following '>' seems to have been dropped). The end of this test,
    //and the set-up of testCases, seed, trie, pos, neg and test used below, are not visible;
    //the line is kept exactly as found.
    for(size_t i=1;i assert(trie[ch], text("on ch == ", ch, " seed was ", seed))); test(neg, ch => assert(!trie[ch], text("negative on ch == ", ch, " seed was ", seed)));
    }

    // test that negate is reversible
    foreach(up; testCases.save)
    {
        auto neg = up.set.dup.negate().negate();
        assert(equal(up.set.ivals, neg.ivals));
    }

    // test codepoint forward iterator
    auto set = testCases.front.set;
    auto rng = set[];
    foreach (idx; 0 .. set.ivals.length / 2)
    {
        //each pair of bounds is walked as lo .. hi, i.e. hi itself is not expected in the range
        immutable lo = set.ivals[2*idx], hi = set.ivals[2*idx+1];
        foreach (val; lo .. hi)
        {
            assert(rng.front == val, text("on val == ", val, " seed was ", seed));
            rng.popFront();
        }
    }
}

//fussy compare for unicode property names as per UTS-18
//Whitespace, '-' and '_' are skipped in both names and the remaining
//characters are compared after toLower. The result is 0 for equivalent names,
//-1 if a sorts first and 1 if b sorts first.
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
    for(;;)
    {
        //skip the characters that do not take part in the comparison
        while(!a.empty && (isWhite(a.front) || a.front == '-' || a.front =='_'))
        {
            a.popFront();
        }
        while(!b.empty && (isWhite(b.front) || b.front == '-' || b.front =='_'))
        {
            b.popFront();
        }
        //a ran out: equal if b ran out as well, otherwise a is the shorter name
        if(a.empty)
            return b.empty ? 
0 : -1;
        //b ran out while a still has characters left
        if(b.empty)
            return 1;
        auto ca = toLower(a.front), cb = toLower(b.front);
        if(ca > cb)
            return 1;
        else if( ca < cb)
            return -1;
        a.popFront();
        b.popFront();
    }
}

//ditto (workaround for internal tools)
//True when a orders strictly before b under comparePropertyName.
public bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}

unittest
{
    assert(comparePropertyName("test","test") == 0);
    assert(comparePropertyName("Al chemical Symbols", "Alphabetic Presentation Forms") == -1);
    assert(comparePropertyName("Basic Latin","basic-LaTin") == 0);
}

//Gets array of all of common case equivalents of given codepoint
//(fills provided array & returns a slice of it)
//The provided array doubles as a work list: every codepoint stored in it is
//looked up in commonCaseTable in turn, and each counterpart not seen before
//is appended behind it.
@trusted dchar[] getCommonCasing(dchar ch, dchar[] range)
{
    CommonCaseEntry probe;
    size_t filled = 1;
    range[0] = ch;
    for(size_t next = 0; next < filled; next++)
    {
        ch = range[next];
        probe.start = ch;
        probe.end = ch;
        //narrow the table down in two steps: first cut off the leading entries
        //selected by the end-based predicate, then keep the leading entries
        //selected by the start-based predicate
        auto skipped = assumeSorted!"a.end <= b.end"(commonCaseTable)
            .lowerBound(probe).length;
        immutable(CommonCaseEntry)[] candidates = commonCaseTable[skipped..$];
        auto kept = assumeSorted!"a.start <= b.start"(candidates).lowerBound(probe).length;
        candidates = candidates[0..kept];
        foreach(entry; candidates)
        {
            if(ch >= entry.end)
                continue;
            if(entry.xor)
            {
                auto t = ch ^ entry.delta;
                if(countUntil(range[0..filled], t) < 0)
                    range[filled++] = t;
            }
            else
            {
                auto t = entry.neg ? ch - entry.delta : ch + entry.delta;
                if(countUntil(range[0..filled], t) < 0)
                    range[filled++] = t;
            }
        }
    }
    return range[0..filled];
}

unittest
{
    dchar[6] data;
    //these values give 100% code coverage for getCommonCasing
    assert(getCommonCasing(0x01BC, data) == [0x01bc, 0x01bd]);
    assert(getCommonCasing(0x03B9, data) == [0x03b9, 0x0399, 0x1fbe, 0x0345]);
    assert(getCommonCasing(0x10402, data) == [0x10402, 0x1042a]);
}

//
//NOTE(review): the definition below is cut off at the end of the visible text and is
//kept exactly as found.
@trusted CodepointSet caseEnclose(in CodepointSet set) { CodepointSet n; for(size_t i=0;i