= idxA && idxA >= curIndex, errMsg);
putRangeAt(idxA, idxB, v);
}
/**
    Store $(D v) in the slot that $(D key) maps to.
    Every slot between the current position and the slot of $(D key)
    is filled with the default filler.
*/
void putValue(Key key, Value v)
{
    auto slot = getIndex(key);
    // keys have to arrive in non-decreasing index order
    enforce(curIndex <= slot, text(errMsg, " ", slot));
    putAt(slot, v);
}
/// Finishes construction of Trie, yielding an immutable Trie instance.
/// Pads every index not yet written with $(D defValue) and wraps the
/// accumulated $(D table) into a $(D Trie) with the same parameters.
auto build()
{
// a non-zero maxIndex is an explicit bound on the index domain
static if(maxIndex != 0) // doesn't cover full range of size_t
{
assert(curIndex <= maxIndex);
// fill the rest of the bounded domain with the default value
addValue!lastLevel(defValue, maxIndex - curIndex);
}
else
{
// padding is needed unless curIndex is 0 because it already wrapped around;
// the second condition singles out the case where curIndex is 0
// because nothing has been added yet
if(curIndex != 0 // couldn't wrap around
|| (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
{
// the remaining count is size_t.max - curIndex + 1, which need not
// fit into size_t, so it is added in two steps
addValue!lastLevel(defValue, size_t.max - curIndex);
addValue!lastLevel(defValue, 1);
}
// else curIndex already completed the full range of size_t by wrapping around
}
return Trie!(V, Key, maxIndex, Prefix)(table);
}
}
/*
$(P A generic Trie data-structure for a fixed number of stages.
The design goal is optimal speed with smallest footprint size.
)
$(P It's intentionally read-only and doesn't provide constructors.
To construct one use a special builder,
see $(LREF TrieBuilder) and $(LREF buildTrie).
)
*/
@trusted public struct Trie(Value, Key, Args...)
if(isValidPrefixForTrie!(Key, Args)
|| (isValidPrefixForTrie!(Key, Args[1..$])
&& is(typeof(Args[0]) : size_t)))
{
// Decode the template arguments: when the first one is a value convertible
// to size_t it is taken as the index bound (maxIndex) and the rest are the
// predicates; otherwise every argument is a predicate and no bound is kept.
static if(is(typeof(Args[0]) : size_t))
{
enum maxIndex = Args[0];
enum hasBoundsCheck = true;
alias Prefix = Args[1..$];
}
else
{
enum hasBoundsCheck = false;
alias Prefix = Args;
}
// Stores the given table as the backing storage of this Trie.
private this()(typeof(_table) table)
{
_table = table;
}
// only for constant Tries constructed from precompiled tables
// Assembles the backing table from its three raw arrays:
// offsets, sizes and data.
private this()(const(size_t)[] offsets, const(size_t)[] sizes,
const(size_t)[] data) const
{
_table = typeof(_table)(offsets, sizes, data);
}
/*
$(P Lookup the $(D key) in this $(D Trie). )
$(P The lookup always succeeds if key fits the domain
provided during construction. The whole domain defined
is covered so instead of not found condition
the sentinel (filler) value could be used. )
$(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
define a domain of $(D Trie) keys and the sentinel value. )
Note:
Domain range-checking is only enabled in debug builds
and results in assertion failure.
*/
// templated to auto-detect pure, @safe and nothrow
TypeOfBitPacked!Value opIndex()(Key key) const
{
static if(hasBoundsCheck)
assert(mapTrieIndex!Prefix(key) < maxIndex);
size_t idx;
alias p = Prefix;
idx = cast(size_t)p[0](key);
foreach(i, v; p[0..$-1])
idx = cast(size_t)((_table.ptr!i[idx]< 0)
alias GetBitSlicing =
TypeTuple!(sliceBits!(top - sizes[0], top),
GetBitSlicing!(top - sizes[0], sizes[1..$]));
else
alias GetBitSlicing = TypeTuple!();
}
// Curried predicate: callableWith!T yields a template that tells whether
// an alias can be called with a value of type T and whether the type
// of the result is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        // binds Result to the call's type when the call compiles at all
        static if(is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false;
    }
}
/*
    Tell whether $(D Prefix) is a usable set of predicates for
    a $(D Trie) template keyed by $(D Key).
    Every predicate has to be callable with a single argument
    of type $(D Key) and return an unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    // per-predicate check specialized for this key type
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
/*
    Check if $(D Args) is a set of maximum key value followed by valid predicates
    for $(D Trie) template having $(D Key) as the type of keys.

    Two layouts are accepted:
    $(UL
        $(LI all of $(D Args) are predicates callable with $(D Key);)
        $(LI $(D Args[0]) is a value convertible to $(D Key) (the upper bound)
            and $(D Args[1..$]) are such predicates.)
    )
    A single argument can only be a predicate. An empty $(D Args) is rejected.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if(Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if(Args.length == 1)
    {
        // Key has to be passed explicitly: without it the lone argument would
        // be bound as the key type and no predicate would be checked at all
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false; // no predicates, nothing to index by
}
// Adds up all values of the compile-time tuple ints; yields 0 for an empty tuple.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    foreach(term; ints)
    {
        total += term;
    }
    return total;
}
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a $(D CodepointSet). $(D sizes) lists the number of bits
    used per level, starting with the most significant bits.

    Note: The sum of $(D sizes) must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach(line; stdin.byLine)
        {
            int count=0;
            foreach(dchar ch; line)
                if(trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
        if(isCodepointSet!Set)
    {
        // one bit-slicing predicate per requested level
        alias Levels = GetBitSlicing!(21, sizes);
        auto tb = TrieBuilder!(bool, dchar, lastDchar+1, Levels)(false);
        // every interval of the set becomes a run of true values
        foreach(interval; set.byInterval)
        {
            tb.putRange(interval[0], interval[1], true);
        }
        return tb.build();
    }
}
/// The type of Trie that the codepointSetTrie function produces for the same $(D sizes).
public template CodepointSetTrie(sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
    alias CodepointSetTrie = typeof(Builder(false).build());
}
/**
    A slightly more general tool for building fixed $(D Trie)
    for the Unicode data.

    Specifically unlike $(D codepointSetTrie) it allows creating mappings
    of $(D dchar) to an arbitrary type $(D T).

    $(D sizes) are numbers of bits per level and must add up to 21.

    Note: Overload taking $(D CodepointSet)s will naturally convert
    only to bool mapping $(D Trie)s.

    Example:
    ---
    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach(i; 0..6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach(n; nibbles)
            luck = cast(uint)max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach(ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach(ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);
    ---
*/
public template codepointTrie(T, sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if(is(TypeOfBitPacked!T == bool))
    {
        // Set overload, only available for bool-like values.
        auto codepointTrie(Set)(in Set set)
            if(isCodepointSet!Set)
        {
            // sizes must be forwarded explicitly: codepointSetTrie is constrained
            // on them and they cannot be deduced from the function argument
            return codepointSetTrie!sizes(set);
        }
    }

    // Builds the Trie from an associative array; keys absent from map get defValue.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    auto codepointTrie(R)(R range, T defValue=T.init)
        if(isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
// Executable copy of the example from the codepointTrie documentation:
// maps every Greek code point to a small "luck factor" and checks the lookups.
unittest // codepointTrie example
{
// pick characters from the Greek script
auto set = unicode.Greek;
// a user-defined property (or an expensive function)
// that we want to look up
static uint luckFactor(dchar ch)
{
// here we consider a character lucky
// if its code point has a lot of identical hex-digits
// e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
ubyte[6] nibbles; // 6 4-bit chunks of code point
uint value = ch;
foreach(i; 0..6)
{
nibbles[i] = value & 0xF;
value >>= 4;
}
// the largest number of occurrences of any single nibble
uint luck;
foreach(n; nibbles)
luck = cast(uint)max(luck, count(nibbles[], n));
return luck;
}
// only unsigned built-ins are supported at the moment
alias LuckFactor = BitPacked!(uint, 3);
// create a temporary associative array (AA)
LuckFactor[dchar] map;
foreach(ch; set.byCodepoint)
map[ch] = LuckFactor(luckFactor(ch));
// bits per stage are chosen randomly, feel free to optimize
auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
// from now on the AA is not needed
foreach(ch; set.byCodepoint)
assert(trie[ch] == luckFactor(ch)); // verify
// CJK is not Greek, thus it has the default value
assert(trie['\u4444'] == 0);
// and here is a couple of quite lucky Greek characters:
// Greek small letter epsilon with dasia
assert(trie['\u1F11'] == 3);
// Ancient Greek metretes sign
assert(trie['\U00010181'] == 3);
}
/// The type of Trie that the codepointTrie function produces for the same $(D T) and $(D sizes).
public template CodepointTrie(T, sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
    alias CodepointTrie = typeof(Builder(T.init).build());
}
// @@@BUG multiSort can't access private symbols from uni
// Orders two (value, key) tuples by the result of Pred applied to their keys.
public template cmpK0(alias Pred)
{
    import std.typecons;

    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
/*
    The most general helper for constructing $(D Trie)s,
    one step short of using $(D TrieBuilder) directly.
    It comes with a number of convenience overloads.

    $(D Args) is a tuple of the maximum key value followed by
    the predicates that construct an index from a key.

    Alternatively, if the first argument is not a value convertible to $(D Key),
    the whole tuple of $(D Args) is treated as predicates
    and the maximum Key is deduced from the predicates.
*/
public template buildTrie(Value, Key, Args...)
    if(isValidArgsForTrie!(Key, Args))
{
    static if(is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
        alias Prefix = Args[1..$];
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // comparators for multiSort: one cmpK0 per predicate, in prefix order
    template GetComparators(size_t n)
    {
        static if(n == 0)
            alias GetComparators = TypeTuple!();
        else
            alias GetComparators =
                TypeTuple!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build $(D Trie) from a range of Value-Key pairs (value first),
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        An exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps $(D Key) to an integer.

        See also: $(XREF _algorithm, sort),
        $(XREF _range, SortedRange),
        $(XREF _algorithm, setUnion).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(pair; range)
        {
            tb.putValue(pair[1], pair[0]);
        }
        return tb.build();
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) it's possible
        to build $(D Trie) from a range of open-right intervals of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for the Key-Value range overload.

        Intervals denote ranges of !$(D filler) i.e. the opposite of filler.
        If no filler is provided, keys inside of the intervals map to true,
        and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(span; range)
        {
            tb.putRange(span[0], span[1], !filler);
        }
        return tb.build();
    }

    // Same as the Value-Key pair overload, but sorts the range
    // with multiSort first when unsorted is true.
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if(isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        alias Comps = GetComparators!(Prefix.length);
        if(unsorted)
        {
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) it's possible
        to build $(D Trie) simply from an input range of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for the Key-Value range overload.

        Keys found in range denote !$(D filler) i.e. the opposite of filler.
        If no filler is provided, keys map to true, and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(key; range)
        {
            tb.putValue(key, !filler);
        }
        return tb.build();
    }

    /*
        If $(D Key) is an unsigned integer, $(D Trie) can be constructed from
        an array of values where the array index serves as the key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if(isUnsigned!Key)
    {
        auto tb = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(pos, item; array)
        {
            tb.putValue(pos, item);
        }
        return tb.build();
    }

    /*
        Builds $(D Trie) from an associative array.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        // materialize (value, key) pairs and let the unsorted overload sort them
        auto pairs = array(zip(map.values, map.keys));
        return buildTrie(pairs, filler, true); // sort it
    }
}
// Stand-in for assumeSize that shortens mangled names
// and helps DMD inline Trie functors.
// Passes its argument through as size_t and advertises
// the given bit width as bitSize.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
// Like clamp, but picks element number idx of its (indexable) argument;
// bitSize advertises the given bit width.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
/**
Conceptual type that outlines the common properties of all UTF Matchers.
Note: For illustration purposes only, every method
call results in assertion failure.
Use $(LREF utfMatcher) to obtain a concrete matcher
for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
/**
$(P Perform the semantic equivalent of 2 operations:
decoding a $(CODEPOINT) at front of $(D inp) and testing if
it belongs to the set of $(CODEPOINTS) of this matcher. )
$(P The effect on $(D inp) depends on the kind of function called:)
$(P Match. If the codepoint is found in the set then range $(D inp)
is advanced by its size in $(S_LINK Code unit, code units),
otherwise the range is not modified.)
$(P Skip. The range is always advanced by the size
of the tested $(CODEPOINT) regardless of the result of test.)
$(P Test. The range is left unaffected regardless
of the result of test.)
*/
public bool match(Range)(ref Range inp)
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///ditto
public bool skip(Range)(ref Range inp)
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///ditto
public bool test(Range)(ref Range inp)
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
assert(false);
}
///
@safe unittest
{
string truth = "2² = 4";
auto m = utfMatcher!char(unicode.Number);
assert(m.match(truth)); // '2' is a number all right
assert(truth == "² = 4"); // skips on match
assert(m.match(truth)); // so is the superscript '2'
assert(!m.match(truth)); // space is not a number
assert(truth == " = 4"); // unaffected on no match
assert(!m.skip(truth)); // same test ...
assert(truth == "= 4"); // but skips a codepoint regardless
assert(!m.test(truth)); // '=' is not a number
assert(truth == "= 4"); // test never affects argument
}
/*
Advanced feature - provide direct access to a subset of matcher based on a
set of known encoding lengths. Lengths are provided in
$(S_LINK Code unit, code units). The sub-matcher then may do fewer
operations per any $(D test)/$(D match).
Use with care as the sub-matcher won't match
any $(CODEPOINTS) that have encoded length that doesn't belong
to the selected set of lengths. Also the sub-matcher object references
the parent matcher and must not be used past the lifetime
of the latter.
Another caveat of using sub-matcher is that skip is not available
precisely because sub-matcher doesn't detect all lengths.
*/
@property auto subMatcher(Lengths...)()
{
assert(0);
// never reached; gives the auto return type something to deduce
return this;
}
///
@safe unittest
{
auto m = utfMatcher!char(unicode.Number);
string square = "2²";
// about sub-matchers
assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
assert(m.subMatcher!1.match(square)); // ASCII-only, works
assert(!m.subMatcher!1.test(square)); // unicode '²'
assert(m.subMatcher!(2,3,4).match(square)); //
assert(square == "");
wstring wsquare = "2²";
auto m16 = utfMatcher!wchar(unicode.Number);
// may keep ref, but the original (m16) must be kept alive
auto bmp = m16.subMatcher!1;
assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
assert(bmp.match(wsquare)); // And '²' too
}
}
/**
    Test if $(D M) is a UTF Matcher for ranges of $(D C).

    $(D M) qualifies when $(D match) and $(D test) can be called both on
    a slice $(D C[]) and on the code unit range obtained via $(D decoder),
    and each call yields $(D bool). $(D skip) is optional (sub-matchers
    don't have it), but if it is callable on the code unit range it must
    also yield $(D bool) for both kinds of argument.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // static assert, not assert: a run-time assert over an is-expression
    // always compiles, so it would not make the trait fail on a mismatch
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if(is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});
// isUtfMatcher accepts the concrete UTF-8 and UTF-16 matcher types
// for both mutable and immutable code units.
unittest
{
alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
static assert(isUtfMatcher!(CharMatcher, char));
static assert(isUtfMatcher!(CharMatcher, immutable(char)));
static assert(isUtfMatcher!(WcharMatcher, wchar));
static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}
// How a matcher operation treats the input range after testing its front code point.
enum Mode
{
    alwaysSkip,  // used by skip: advance regardless of the result
    neverSkip,   // used by test: leave the range alone
    skipOnMatch  // used by match: advance only on success
}
// Mixed into matchers so that their slice overloads of match/skip/test
// can reuse the range-based implementations.
mixin template ForwardStrings()
{
// Calls the member named by fn on str, reinterpreted in place (through a
// pointer cast) as the type returned by units(str); because the callee gets
// a reference to the caller's slice, any advancing is visible to the caller.
// NOTE(review): the cast presumably relies on that type having exactly the
// layout of C[] - confirm against units().
private bool fwdStr(string fn, C)(ref C[] str) const pure
{
alias type = typeof(units(str));
return mixin(fn~"(*cast(type*)&str)");
}
}
template Utf8Matcher()
{
// a UTF-8 sequence is 1 to 4 code units long
enum validSize(int sz) = 1 <= sz && sz <= 4;
// Reports malformed UTF-8 input by throwing UTFException.
void badEncoding() pure @safe
{
    import std.utf : UTFException;
    throw new UTFException("Invalid UTF-8 sequence");
}
// Trie specifications (value type, key type, per-stage predicates), one per
// encoded length. The key of a multi-unit sequence is a static array of code
// units; stage i indexes by code unit i.
//for 1-stage ASCII
alias AsciiSpec = TypeTuple!(bool, char, clamp!7);
//for 2-stage lookup of 2 byte UTF-8 sequences
// (5 + 6 = 11 bits in total)
alias Utf8Spec2 = TypeTuple!(bool, char[2],
clampIdx!(0, 5), clampIdx!(1, 6));
//ditto for 3 byte
// (4 + 6 + 6 = 16 bits in total)
alias Utf8Spec3 = TypeTuple!(bool, char[3],
clampIdx!(0, 4),
clampIdx!(1, 6),
clampIdx!(2, 6)
);
//ditto for 4 byte
// (3 + 6 + 6 + 6 = 21 bits in total)
alias Utf8Spec4 = TypeTuple!(bool, char[4],
clampIdx!(0, 3), clampIdx!(1, 6),
clampIdx!(2, 6), clampIdx!(3, 6)
);
// Trie types produced by the specs above, ordered by encoded length
alias Tables = TypeTuple!(
typeof(TrieBuilder!(AsciiSpec)(false).build()),
typeof(TrieBuilder!(Utf8Spec2)(false).build()),
typeof(TrieBuilder!(Utf8Spec3)(false).build()),
typeof(TrieBuilder!(Utf8Spec4)(false).build())
);
// Trie type for sequences of the given length (1-based)
alias Table(int size) = Tables[size-1];
// mask of the low (7 - size) bits of a code unit:
// 0x1F for size 2, 0x0F for size 3, 0x07 for size 4
enum leadMask(size_t size) = (cast(size_t)1<<(7 - size))-1;
enum encMask(size_t size) = ((1< 1)
{
import std.utf : encode;
char[4] buf;
std.utf.encode(buf, ch);
char[sz] ret;
buf[0] &= leadMask!sz;
foreach(n; 1..sz)
buf[n] = buf[n] & 0x3f; //keep 6 lower bits
ret[] = buf[0..sz];
return ret;
}
// Splits set by UTF-8 encoded length (1 to 4 code units), builds one
// Trie per length and bundles them into a full matcher.
auto build(Set)(Set set)
{
    // code points grouped by the number of code units they encode to
    auto oneUnit = set & unicode.ASCII;
    auto twoUnits = set & CodepointSet(0x80, 0x800);
    auto threeUnits = set & CodepointSet(0x800, 0x1_0000);
    auto fourUnits = set & CodepointSet(0x1_0000, lastDchar+1);

    // keys are the (masked) code units of each code point
    auto trie1 = oneUnit.byCodepoint.map!(x=>cast(char)x).buildTrie!(AsciiSpec);
    auto trie2 = twoUnits.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
    auto trie3 = threeUnits.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
    auto trie4 = fourUnits.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);

    return Impl!(1,2,3,4)(trie1, trie2, trie3, trie4);
}
// Bootstrap UTF-8 static matcher interface
// from 3 primitives: tab!(size), lookup and Sizes
mixin template DefMatcher()
{
import std.string : format;
// true when single code unit (ASCII) sequences are among the handled sizes
enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
// the handled sizes other than 1
alias UniSizes = Erase!(1, Sizes);
//generate dispatch code sequence for unicode parts
// The result is an if/else chain over the first code unit `ch`: one branch per
// size in UniSizes that calls lookup for that size, and a final else that either
// throws (all 4 sizes are handled) or returns false (sub-matcher).
static auto genDispatch()
{
string code;
foreach(size; UniSizes)
code ~= format(q{
if ((ch & ~leadMask!%d) == encMask!(%d))
return lookup!(%d, mode)(inp);
else
}, size, size, size);
static if (Sizes.length == 4) //covers all code unit cases
code ~= "{ badEncoding(); return false; }";
else
code ~= "return false;"; //may be just fine but not covered
return code;
}
// dispatch code, mixed into match/skip/test below; it expects
// `ch`, `inp` and `mode` to be in scope
enum dispatch = genDispatch();
// Tests the code point at the front of inp, advancing inp past it only on success.
public bool match(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
enum mode = Mode.skipOnMatch;
assert(!inp.empty);
auto ch = inp[0];
static if(hasASCII)
{
// single code unit: answer directly from the first table
if (ch < 0x80)
{
bool r = tab!1[ch];
if(r)
inp.popFront();
return r;
}
else
mixin(dispatch);
}
else
mixin(dispatch);
}
static if(Sizes.length == 4) // can skip iff can detect all encodings
{
// Tests the code point at the front of inp and advances inp past it
// regardless of the result.
public bool skip(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
enum mode = Mode.alwaysSkip;
assert(!inp.empty);
auto ch = inp[0];
static if(hasASCII)
{
if (ch < 0x80)
{
inp.popFront();
return tab!1[ch];
}
else
mixin(dispatch);
}
else
mixin(dispatch);
}
}
// Tests the code point at the front of inp without advancing inp.
public bool test(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : char))
{
enum mode = Mode.neverSkip;
assert(!inp.empty);
auto ch = inp[0];
static if(hasASCII)
{
if (ch < 0x80)
return tab!1[ch];
else
mixin(dispatch);
}
else
mixin(dispatch);
}
// Slice overloads: forward to the range versions above through fwdStr
bool match(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"match"(str);
}
bool skip(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"skip"(str);
}
bool test(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"test"(str);
}
mixin ForwardStrings;
}
// UTF-8 matcher for the encoded lengths listed in Sizes: holds one Trie per
// length and implements the lookup primitive that DefMatcher builds upon.
struct Impl(Sizes...)
{
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
//pick tables for chosen sizes
alias OurTabs = staticMap!(Table, Sizes);
OurTabs tables;
mixin DefMatcher;
//static dispatch helper UTF size ==> table
alias tab(int i) = tables[i - 1];
// Returns a view of this matcher restricted to the lengths in SizesToPick.
// The view stores a pointer to this object.
package @property auto subMatcher(SizesToPick...)() @trusted
{
return CherryPick!(Impl, SizesToPick)(&this);
}
// Looks up the size-unit sequence at the front of inp in the table for that
// size. Throws (via badEncoding) when inp is shorter than size or when the
// sequence is an overlong encoding. Whether inp is advanced depends on mode.
bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted
{
import std.typecons;
if(inp.length < size)
{
badEncoding();
return false;
}
// the Trie key: the lead unit stripped by leadMask, followed by
// the remaining units passed through truncate
char[size] needle = void;
needle[0] = leadMask!size & inp[0];
foreach(i; staticIota!(1, size))
{
needle[i] = truncate(inp[i]);
}
//overlong encoding checks
static if(size == 2)
{
//valid range is 0x80-0x7FF
//needle[1] supplies 6 bits and the code point needs at least 8 bits,
//so needle[0] must be at least 2
if(needle[0] < 2) badEncoding();
}
else static if(size == 3)
{
//0x800-0xFFFF
//got 6 bits in needle[2], must use at least 12bits
//must use 6 bits in needle[1] or anything in needle[0]
if(needle[0] == 0 && needle[1] < 0x20) badEncoding();
}
else static if(size == 4)
{
//valid range starts at 0x1_0000
//got 2x6=12 bits in needle[2..3] must use at least 17bits
//must use 5 bits (or above) in needle[1] or anything in needle[0]
if(needle[0] == 0 && needle[1] < 0x10) badEncoding();
}
static if(mode == Mode.alwaysSkip)
{
inp.popFrontN(size);
return tab!size[needle];
}
else static if(mode == Mode.neverSkip)
{
return tab!size[needle];
}
else
{
static assert(mode == Mode.skipOnMatch);
if (tab!size[needle])
{
inp.popFrontN(size);
return true;
}
else
return false;
}
}
}
// Sub-matcher: a view of the matcher I limited to the encoded lengths in Sizes.
// It only stores a pointer to the parent matcher and forwards table access
// and lookups to it.
struct CherryPick(I, Sizes...)
{
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
private:
// the parent matcher
I* m;
// the parent's table for sequences of i code units
@property ref tab(int i)() const pure { return m.tables[i - 1]; }
bool lookup(int size, Mode mode, Range)(ref Range inp) const pure
{
return m.lookup!(size, mode)(inp);
}
mixin DefMatcher;
}
}
template Utf16Matcher()
{
// a UTF-16 sequence is 1 or 2 code units long
enum validSize(int sz) = 1 <= sz && sz <= 2;
// Reports malformed UTF-16 input by throwing UTFException.
void badEncoding() pure
{
    import std.utf : UTFException;
    throw new UTFException("Invalid UTF-16 sequence");
}
alias Seq = TypeTuple;
// Trie specifications: value type, key type, per-stage predicates
// 1-stage ASCII
alias AsciiSpec = Seq!(bool, wchar, clamp!7);
//2-stage BMP
alias BmpSpec = Seq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
//4-stage - full Unicode
//assume that 0xD800 & 0xDC00 bits are cleared
//thus leaving 10 bit per wchar to worry about
// each 10-bit half is split in two stages: 6 + 4 bits for x[0], 4 + 6 bits for x[1]
alias UniSpec = Seq!(bool, wchar[2],
assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
);
// Trie types produced by the specs above
alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
// Splits a supplementary code point (0x1_0000 and above) into the payloads
// of its UTF-16 surrogate pair: ret[0] holds the upper 10 bits and ret[1]
// the lower 10 bits of (ch - 0x1_0000). The surrogate marker bits are not
// added, which matches the keys produced at lookup time (both code units
// masked with 0x3ff).
auto encode2(dchar ch)
{
    ch -= 0x1_0000;
    assert(ch <= 0xF_FFFF);
    wchar[2] ret;
    //do not put surrogate bits, they are sliced off
    ret[0] = cast(wchar)(ch >> 10);
    // exactly 10 bits: a wider mask would leak bits that belong to ret[0]
    ret[1] = cast(wchar)(ch & 0x3FF);
    return ret;
}
// Splits set into ASCII, the rest of the BMP (without the surrogate range)
// and the supplementary code points, builds one Trie for each part and
// bundles them into a full UTF-16 matcher.
auto build(Set)(Set set)
{
    auto ascii = set & unicode.ASCII;
    auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
        - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
    // Only code points from 0x1_0000 up are encoded as a surrogate pair.
    // Taking "everything else" instead would also pick up the surrogate
    // code points 0xD800-0xDFFF when set contains them, and encode2
    // cannot handle those.
    auto other = set & CodepointSet.fromIntervals(0x1_0000, lastDchar+1);
    auto asciiT = ascii.byCodepoint.map!(x=>cast(char)x).buildTrie!(AsciiSpec);
    auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar)x).buildTrie!(BmpSpec);
    auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
    alias Ret = Impl!(1,2);
    return Ret(asciiT, bmpT, otherT);
}
//bootstrap full UTF-16 matcher interface from
//sizeFlags, lookupUni and ascii
mixin template DefMatcher()
{
// Tests the code point at the front of inp, advancing inp past it only on success.
public bool match(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
{
enum mode = Mode.skipOnMatch;
assert(!inp.empty);
auto ch = inp[0];
static if(sizeFlags & 1)
{
// ASCII is answered directly from the ascii table
if (ch < 0x80)
{
if (ascii[ch])
{
inp.popFront();
return true;
}
else
return false;
}
return lookupUni!mode(inp);
}
else
return lookupUni!mode(inp);
}
// skip is only provided when both sizes are handled
static if(Sizes.length == 2)
{
// Tests the code point at the front of inp and advances inp past it
// regardless of the result.
public bool skip(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
{
enum mode = Mode.alwaysSkip;
assert(!inp.empty);
auto ch = inp[0];
static if(sizeFlags & 1)
{
if (ch < 0x80)
{
inp.popFront();
return ascii[ch];
}
else
return lookupUni!mode(inp);
}
else
return lookupUni!mode(inp);
}
}
// Tests the code point at the front of inp without advancing inp.
public bool test(Range)(ref Range inp) const pure @trusted
if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
{
enum mode = Mode.neverSkip;
assert(!inp.empty);
auto ch = inp[0];
static if(sizeFlags & 1)
return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
else
return lookupUni!mode(inp);
}
// Slice overloads: forward to the range versions above through fwdStr
bool match(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"match"(str);
}
bool skip(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"skip"(str);
}
bool test(C)(ref C[] str) const pure @trusted
if(isSomeChar!C)
{
return fwdStr!"test"(str);
}
mixin ForwardStrings; //dispatch strings to range versions
}
// UTF-16 matcher for the encoded lengths listed in Sizes (1 and/or 2 code units).
struct Impl(Sizes...)
if(Sizes.length >= 1 && Sizes.length <= 2)
{
private:
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1 and 2 code units are possible in UTF-16");
// bit 0 set: single code unit sequences are handled;
// bit 1 set: surrogate pairs are handled
static if(Sizes.length > 1)
enum sizeFlags = Sizes[0] | Sizes[1];
else
enum sizeFlags = Sizes[0];
static if(sizeFlags & 1)
{
Ascii ascii;
Bmp bmp;
}
static if(sizeFlags & 2)
{
Uni uni;
}
mixin DefMatcher;
// Returns a view of this matcher for the lengths in SizesToPick.
// The view stores a pointer to this object.
package @property auto subMatcher(SizesToPick...)() @trusted
{
return CherryPick!(Impl, SizesToPick)(&this);
}
// Tests the non-ASCII code point at the front of inp: a single code unit is
// looked up in bmp, a surrogate pair in uni. Throws (via badEncoding) on a lone
// low surrogate, on a high surrogate that is not followed by a low one and on
// a truncated pair. Whether inp is advanced depends on mode.
bool lookupUni(Mode mode, Range)(ref Range inp) const pure
{
// after the subtraction high surrogates fall into 0..0x3FF
// and low surrogates into 0x400..0x7FF
wchar x = cast(wchar)(inp[0] - 0xD800);
//not a high surrogate
if(x > 0x3FF)
{
//low surrogate
if(x <= 0x7FF) badEncoding();
static if(sizeFlags & 1)
{
auto ch = inp[0];
static if(mode == Mode.alwaysSkip)
inp.popFront();
static if(mode == Mode.skipOnMatch)
{
if (bmp[ch])
{
inp.popFront();
return true;
}
else
return false;
}
else
return bmp[ch];
}
else //skip is not available for sub-matchers, so just false
return false;
}
else
{
static if(sizeFlags & 2)
{
if(inp.length < 2)
badEncoding();
wchar y = cast(wchar)(inp[1] - 0xDC00);
//not a low surrogate
if(y > 0x3FF)
badEncoding();
// the key is the 10-bit payload of each surrogate
wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
static if(mode == Mode.alwaysSkip)
inp.popFrontN(2);
static if(mode == Mode.skipOnMatch)
{
if (uni[needle])
{
inp.popFrontN(2);
return true;
}
else
return false;
}
else
return uni[needle];
}
else //ditto
return false;
}
}
}
// Sub-matcher: a view of the matcher I that only stores a pointer to it
// and forwards lookups to it.
// NOTE(review): sizeFlags is copied from I and lookupUni is forwarded to the
// parent, so Sizes seems to influence only the static assert and the
// availability of skip (through DefMatcher) - confirm that this is intended.
struct CherryPick(I, Sizes...)
if(Sizes.length >= 1 && Sizes.length <= 2)
{
private:
// the parent matcher
I* m;
enum sizeFlags = I.sizeFlags;
static if(sizeFlags & 1)
{
@property ref ascii()() const pure{ return m.ascii; }
}
bool lookupUni(Mode mode, Range)(ref Range inp) const pure
{
return m.lookupUni!mode(inp);
}
mixin DefMatcher;
static assert(allSatisfy!(validSize, Sizes),
"Only lengths of 1 and 2 code units are possible in UTF-16");
}
}
// Builds a UTF-8 matcher for set by delegating to Utf8Matcher's build.
private auto utf8Matcher(Set)(Set set) @trusted
{
return Utf8Matcher!().build(set);
}
// Builds a UTF-16 matcher for set by delegating to Utf16Matcher's build.
private auto utf16Matcher(Set)(Set set) @trusted
{
return Utf16Matcher!().build(set);
}
/**
Constructs a matcher object
to classify $(CODEPOINTS) from the $(D set) for encoding
that has $(D Char) as code unit.
$(D Char) must be convertible to $(D char) (UTF-8 matcher) or to
$(D wchar) (UTF-16 matcher); any other type is rejected at compile time.
See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set) @trusted
if(isCodepointSet!Set)
{
// the order of the checks matters: char converts to wchar and dchar,
// and wchar converts to dchar
static if(is(Char : char))
return utf8Matcher(set);
else static if(is(Char : wchar))
return utf16Matcher(set);
else static if(is(Char : dchar))
static assert(false, "UTF-32 needs no decoding,
and thus not supported by utfMatcher");
else
static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
// Wraps s into a random-access range of code units that keeps the whole
// slice plus a read index; advancing only bumps the index, which speeds
// up forward iteration. offset is the initial index.
package auto decoder(C)(C[] s, size_t offset=0) @trusted
    if(is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;
        size_t idx;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            ++idx;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        @property bool empty()
        {
            return str.length == idx;
        }

        @property auto save()
        {
            return this;
        }

        // i is relative to the current read position
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        // number of code units left to read
        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        // a and b are relative to the current read position
        auto opSlice(size_t a, size_t b)
        {
            auto upTo = str[0..idx+b];
            return Decoder(upTo, idx+a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
/*
    Expose UTF string $(D s) as a random-access
    range of $(S_LINK Code unit, code units).
    The returned range wraps just the slice and
    shrinks it as elements are popped.
*/
package auto units(C)(C[] s)
    if(is(C : wchar) || is(C : char))
{
    static struct Units
    {
    pure nothrow:
        C[] str;

        @property C front()
        {
            return str[0];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            str = str[1..$];
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        void popFrontN(size_t n)
        {
            str = str[n..$];
        }

        @property bool empty()
        {
            return str.length == 0;
        }

        @property auto save()
        {
            return this;
        }

        auto opIndex(size_t i)
        {
            return str[i];
        }

        @property size_t length()
        {
            return str.length;
        }

        alias opDollar = length;

        auto opSlice(size_t a, size_t b)
        {
            auto part = str[a..b];
            return Units(part);
        }
    }
    static assert(isRandomAccessRange!Units);
    static assert(is(ElementType!Units : C));
    return Units(s);
}
// Walks a mixed-script string with the full UTF-8 matcher and two of its
// sub-matchers (ASCII-only and 2-4 code units), checking results and the
// position of the range after each step. The checks depend on each other
// through the state of codec.
@safe unittest
{
import std.range;
string rs = "hi! ネемног砀 текста";
auto codec = rs.decoder;
auto utf8 = utf8Matcher(unicode.Letter);
auto asc = utf8.subMatcher!(1);
auto uni = utf8.subMatcher!(2,3,4);
assert(asc.test(codec));
assert(!uni.match(codec));
assert(utf8.skip(codec));
assert(codec.idx == 1);
assert(!uni.match(codec));
assert(asc.test(codec));
assert(utf8.skip(codec));
assert(codec.idx == 2);
assert(!asc.match(codec));
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
assert(!asc.test(codec));
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
assert(utf8.test(codec));
// a run of 7 non-ASCII letters
foreach(i; 0..7)
{
assert(!asc.test(codec));
assert(uni.test(codec));
assert(utf8.skip(codec));
}
assert(!utf8.test(codec));
assert(!utf8.skip(codec));
//the same with match where applicable
codec = rs.decoder;
assert(utf8.match(codec));
assert(codec.idx == 1);
assert(utf8.match(codec));
assert(codec.idx == 2);
assert(!utf8.match(codec));
assert(codec.idx == 2);
assert(!utf8.skip(codec));
assert(!utf8.skip(codec));
foreach(i; 0..7)
{
assert(!asc.test(codec));
assert(utf8.test(codec));
assert(utf8.match(codec));
}
// a failed match must leave the position untouched
auto i = codec.idx;
assert(!utf8.match(codec));
assert(codec.idx == i);
}
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers and their
// sub-matchers against every third code point of unicode.L.
@safe unittest
{
// Runs test, match and (if available) skip on r, restoring the position
// in between; checks that all of them agree and returns the common result.
static bool testAll(Matcher, Range)(ref Matcher m, ref Range r)
{
bool t = m.test(r);
auto save = r.idx;
assert(t == m.match(r));
assert(r.idx == save || t); //either no change or was match
r.idx = save;
static if(is(typeof(m.skip(r))))
{
assert(t == m.skip(r));
assert(r.idx != save); //always changed
r.idx = save;
}
return t;
}
auto utf16 = utfMatcher!wchar(unicode.L);
auto bmp = utf16.subMatcher!1;
// NOTE(review): nonBmp is created with subMatcher!1, the same as bmp -
// confirm whether subMatcher!2 was intended
auto nonBmp = utf16.subMatcher!1;
auto utf8 = utfMatcher!char(unicode.L);
auto ascii = utf8.subMatcher!1;
auto uni2 = utf8.subMatcher!2;
auto uni3 = utf8.subMatcher!3;
auto uni24 = utf8.subMatcher!(2,4);
foreach(ch; unicode.L.byCodepoint.stride(3))
{
import std.utf : encode;
char[4] buf;
wchar[2] buf16;
auto len = std.utf.encode(buf, ch);
auto len16 = std.utf.encode(buf16, ch);
auto c8 = buf[0..len].decoder;
auto c16 = buf16[0..len16].decoder;
assert(testAll(utf16, c16));
assert(testAll(bmp, c16) || len16 != 1);
assert(testAll(nonBmp, c16) || len16 != 2);
assert(testAll(utf8, c8));
//submatchers return false on out of their domain
assert(testAll(ascii, c8) || len != 1);
assert(testAll(uni2, c8) || len != 2);
assert(testAll(uni3, c8) || len != 3);
assert(testAll(uni24, c8) || (len != 2 && len !=4));
}
}
// cover decode fail cases of Matcher
// Every input below is expected to make `test` throw; collectException
// returns the caught exception, so each assert fails if nothing was thrown.
unittest
{
import std.string : format;
auto utf16 = utfMatcher!wchar(unicode.L);
auto utf8 = utfMatcher!char(unicode.L);
//decode failure cases UTF-8
// NOTE(review): in several literals `\0x00` / `\0x80` is the escape `\0`
// followed by the literal characters "x00" / "x80", not a single byte;
// possibly `\x00` / `\x80` was intended - confirm before changing the data.
alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
"\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
"\xCF\x00\0x00\0x00\x00");
foreach(msg; fails8){
// the assert message shows the offending bytes in hex
assert(collectException((){
auto s = msg;
import std.utf;
size_t idx = 0;
//decode(s, idx);
utf8.test(s);
}()), format("%( %2x %)", cast(ubyte[])msg));
}
//decode failure cases UTF-16
// single code units taken from the surrogate range
alias fails16 = TypeTuple!([0xD811], [0xDC02]);
foreach(msg; fails16){
assert(collectException((){
auto s = msg.map!(x => cast(wchar)x);
utf16.test(s);
}()));
}
}
/++
    Convenience function that builds a packed Trie out of any $(D set)
    of $(CODEPOINTS) using one of four predefined configurations.

    The compile-time parameter $(D level) selects the number of trie levels;
    it has to be 1, 2, 3 or 4. Each level is a different point
    on the speed versus size scale.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the $(D set) itself.
+/
public auto toTrie(size_t level, Set)(Set set)
    if(isCodepointSet!Set)
{
    // bit widths per level are listed from the top level down
    static if(level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else static if(level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if(level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if(level == 1)
        return codepointSetTrie!(21)(set);
    else
    {
        static assert(false,
            "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
    }
}
/**
    $(P Builds a $(D Trie) with typically optimal speed-size trade-off
    for $(D set) and returns a delegate of the following type:
    $(D bool delegate(dchar ch)) that looks code points up in it. )

    $(P The result is a 'tester' lambda that can be handed to
    algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
    if(isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto lookup = toTrie!3(set);
    // the delegate captures the trie and simply indexes it
    return delegate(dchar ch) { return lookup[ch]; };
}
/**
$(P Opaque wrapper around unsigned built-in integers and
code unit (char/wchar/dchar) types.
Parameter $(D sz) indicates that the value is confined
to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
packed more tightly when stored in certain
data-structures like trie. )
Note:
$(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T)
but not vice versa. Users have to ensure the value fits in
the range required and use the $(D cast)
operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if(isIntegral!T || is(T:dchar))
{
// number of bits the value is declared to fit in; read by bitSizeOf
enum bitSize = sz;
// the wrapped value; `alias this` provides the implicit conversion to T
T _value;
alias _value this;
}
/*
Depending on the form of the passed argument $(D bitSizeOf) returns
the amount of bits required to represent a given type
or a return type of a given functor.
Resolution order: an argument exposing a $(D bitSize) member usable as
$(D size_t) wins; otherwise a callable whose return type is a
$(D BitPacked) reports that type's size; anything else falls back to
$(D T.sizeof*8).
*/
template bitSizeOf(Args...)
if(Args.length == 1)
{
alias T = Args[0];
// anything that carries its own bitSize (BitPacked, assumeSize, sliceBits)
static if(__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
{
enum bitSizeOf = T.bitSize;
}
// a callable returning BitPacked: recurse on the return type
else static if(is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
{
enum bitSizeOf = bitSizeOf!(ReturnType!T);
}
else
{
// plain type: full storage width in bits
enum bitSizeOf = T.sizeof*8;
}
}
/**
Tests if $(D T) is some instantiation of $(LREF BitPacked)!(U, x)
and thus suitable for packing.
Evaluates to $(D true) or $(D false) at compile time.
*/
template isBitPacked(T)
{
// pattern-match T against BitPacked!(U, bits) for any U and bits
static if(is(T dummy == BitPacked!(U, bits), U, size_t bits))
enum isBitPacked = true;
else
enum isBitPacked = false;
}
/**
Gives the type $(D U) from $(LREF BitPacked)!(U, x)
or $(D T) itself for every other type.
*/
template TypeOfBitPacked(T)
{
// same pattern-match as in isBitPacked; U is the wrapped type when it succeeds
static if(is(T dummy == BitPacked!(U, bits), U, size_t bits))
alias TypeOfBitPacked = U;
else
alias TypeOfBitPacked = T;
}
/*
Wrapper, used in definition of custom data structures from $(D Trie) template.
Applying it to a unary lambda function indicates that the returned value always
fits within $(D bits) of bits.
The wrapper does not check that promise; it only records it in $(D bitSize)
and forwards calls to $(D Fn).
*/
struct assumeSize(alias Fn, size_t bits)
{
// declared width of the result; read by bitSizeOf
enum bitSize = bits;
// static opCall: the type itself is callable, e.g. assumeSize!(f, 8)(x)
static auto ref opCall(T)(auto ref T arg)
{
return Fn(arg);
}
}
/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with $(D Trie) template.

    $(D sliceBits!(from, to)(x)) yields bits [from, to$(RPAREN) of $(D x),
    shifted down so that bit $(D from) becomes bit 0; $(D bitSize) is
    $(D to - from).
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // the extracted slice always fits in (to - from) bits
        assert(result < (1<<to-from));
    }
    body
    {
        // an empty or reversed bit range makes no sense (and would underflow bitSize)
        static assert(from < to);
        return (x >> from) & ((1<<(to-from))-1);
    }
}
// Lowest 8 bits of x. Carries the same attributes as midlow_8 so that both
// helpers can be called from @safe pure nothrow code.
@safe pure nothrow uint low_8(uint x) { return x&0xFF; }
// Bits 8..15 of x, shifted down to the range 0..255.
@safe pure nothrow uint midlow_8(uint x){ return (x&0xFF00)>>8; }
// 8-bit wide key extractors built from the helpers above; used as trie
// prefixes in the trie tests below
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
// compile-time checks of bitSizeOf for each kind of argument it accepts
static assert(bitSizeOf!lo8 == 8);
static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
// Compile-time tuple of the consecutive values start, start+1, ..., end-1;
// an empty tuple when start is not below end.
template Sequence(size_t start, size_t end)
{
    static if(start >= end)
        alias Sequence = TypeTuple!();
    else
        // head element followed by the rest of the run
        alias Sequence = TypeTuple!(start, Sequence!(start+1, end));
}
//---- TRIE TESTS ----
// Builds tries with 1 to 4 levels over small code point sets (plus one keyed
// by strings and one by ubyte) and compares every lookup with the source set.
unittest
{
// prints per-level footprint numbers; compiled in only with version std_uni_stats
static trieStats(TRIE)(TRIE t)
{
version(std_uni_stats)
{
import std.stdio;
writeln("---TRIE FOOTPRINT STATS---");
foreach(i; staticIota!(0, t.table.dim) )
{
writefln("lvl%s = %s bytes; %s pages"
, i, t.bytes!i, t.pages!i);
}
writefln("TOTAL: %s bytes", t.bytes);
version(none)
{
writeln("INDEX (excluding value level):");
foreach(i; staticIota!(0, t.table.dim-1) )
writeln(t.table.slice!(i)[0..t.table.length!i]);
}
writeln("---------------------------");
}
}
//@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
// alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
// alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
alias CodepointSet Set;
auto set = Set('A','Z','a','z');
// one level, keyed by the low 8 bits
auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
for(int a='a'; a<'z';a++)
assert(trie[a]);
for(int a='A'; a<'Z';a++)
assert(trie[a]);
for(int a=0; a<'A'; a++)
assert(!trie[a]);
for(int a ='Z'; a<'a'; a++)
assert(!trie[a]);
trieStats(trie);
// two levels: the same two interval shapes repeated at offsets 0, 256, 512, 768
auto redundant2 = Set(
1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
trieStats(trie2);
foreach(e; redundant2.byCodepoint)
assert(trie2[e], text(cast(uint)e, " - ", trie2[e]));
foreach(i; 0..1024)
{
assert(trie2[i] == (i in redundant2));
}
// three levels using sliceBits prefixes of 2 + 2 + 4 bits
auto redundant3 = Set(
2, 4, 6, 8, 16,
2+16, 4+16, 16+6, 16+8, 16+16,
2+32, 4+32, 32+6, 32+8,
);
enum max3 = 256;
// sliceBits
auto trie3 = buildTrie!(bool, uint, max3,
sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
)(redundant3.byInterval);
trieStats(trie3);
foreach(i; 0..max3)
assert(trie3[i] == (i in redundant3), text(cast(uint)i));
// four levels over a 16-bit domain (3 + 4 + 3 + 6 bits)
auto redundant4 = Set(
10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
1000, 2000, 3000, 4000, 5000, 6000
);
enum max4 = 2^^16;
auto trie4 = buildTrie!(bool, size_t, max4,
sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
)(redundant4.byInterval);
// only membership is checked here, not absence
foreach(i; 0..max4){
if(i in redundant4)
assert(trie4[i], text(cast(uint)i));
}
trieStats(trie4);
// string keys mapped by their first char only
alias mapToS = mapTrieIndex!(useItemAt!(0, char));
string[] redundantS = ["tea", "start", "orange"];
redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
// using first char only
assert(redundantS == ["orange", "start", "tea"]);
assert(strie["test"], text(strie["test"]));
assert(!strie["aea"]);
assert(strie["s"]);
// a bit size test
auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
trieStats(bt);
foreach(i; 0..256)
assert(bt[cast(ubyte)i]);
}
// Key extractor for tries keyed by arrays of T: yields element number idx
// of the array, declared (via assumeSize) to fit in 8*T.sizeof bits.
template useItemAt(size_t idx, T)
if(isIntegral!T || is(T: dchar))
{
// no length check of its own: relies on array bounds checking for short arrays
size_t impl(in T[] arr){ return arr[idx]; }
alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
// Key extractor for tries keyed by arrays of T: yields the last element
// of the array, declared (via assumeSize) to fit in 8*T.sizeof bits.
// NOTE(review): unlike useItemAt this template has no constraint on T - confirm
// whether that is intentional.
template useLastItem(T)
{
size_t impl(in T[] arr){ return arr[$-1]; }
alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
// Sum of bitSizeOf over all elements of Prefix; 0 for an empty list.
template fullBitSize(Prefix...)
{
    static if(Prefix.length == 0)
        enum fullBitSize = 0;
    else
        // width of the first prefix plus the total of the remaining ones
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
}
// Computes the tuple of BitPacked index types for all levels of a trie
// except the last (value) level, given the key type, the total number of
// key bits and the per-level prefix functions.
template idxTypes(Key, size_t fullBits, Prefix...)
{
static if(Prefix.length == 1)
{// the last level is value level, so no index once reduced to 1-level
alias idxTypes = TypeTuple!();
}
else
{
// Important note on bit packing
// Each level has to hold enough bits to address the next one
// The bottom level is known to hold full bit width
// thus its size in pages is full_bit_width - size_of_last_prefix
// Recurse on this notion
alias idxTypes =
TypeTuple!(
idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
);
}
}
//============================================================================
// Three-way comparison of two property names using loose matching:
// both names are lower-cased with std.ascii.toLower and stripped of
// white space, '-' and '_' before being compared with cmp.
// Returns a negative value, zero or a positive value, as cmp does.
@trusted int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
{
    alias asciiLower = std.ascii.toLower;
    // normalized, lazily evaluated views of both names
    auto lhs = filter!(c => !(isWhite(c) || c == '-' || c == '_'))
        (map!(c => asciiLower(c))(a));
    auto rhs = filter!(c => !(isWhite(c) || c == '-' || c == '_'))
        (map!(c => asciiLower(c))(b));
    return cmp(lhs, rhs);
}
// Strict "less than" predicate on property names, built on the loose
// comparison done by comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
//============================================================================
// Utilities for compression of Unicode code point sets
//============================================================================
// Appends a variable-length encoding of val to arr:
//   val < 128    - 1 byte holding the value itself;
//   val < 2^^13  - 2 bytes: tag 0b100 in the top 3 bits plus 5 high bits,
//                  then the low 8 bits;
//   otherwise    - 3 bytes: tag 0b101 in the top 3 bits plus 5 high bits,
//                  then the middle and the low 8 bits (val must be < 2^^21).
@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
{
    enum ubyte twoByteTag = 0b1_00<<5;
    enum ubyte threeByteTag = 0b1_01<<5;
    // not optimized as usually done 1 time (and not public interface)
    if(val < 128)
    {
        arr ~= cast(ubyte)val;
        return;
    }
    if(val < (1<<13))
    {
        arr ~= twoByteTag | cast(ubyte)(val>>8);
        arr ~= val & 0xFF;
        return;
    }
    assert(val < (1<<21));
    arr ~= threeByteTag | cast(ubyte)(val>>16);
    arr ~= (val >> 8) & 0xFF;
    arr ~= val & 0xFF;
}
// Decodes one value written by compressTo, starting at arr[idx], and
// advances idx past the bytes consumed.
// Throws (via enforce) if the lead byte announces more bytes than arr holds.
@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
{
    immutable uint lead = arr[idx++];
    if((lead & 0x80) == 0) // no top bit -> [0..127]
        return lead;
    immutable uint tail = ((lead>>5) & 1) + 1; // [1, 2]
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    // 5 payload bits from the lead byte, then 8 more per trailing byte
    uint acc = lead & 0x1F;
    foreach(b; arr[idx .. idx + tail])
        acc = (acc<<8) | b;
    idx += tail;
    return acc;
}
// Encodes a range of [start, end) pairs into a byte array: every boundary
// is written with compressTo as the difference to the previously written
// boundary. An end equal to lastDchar+1 is not stored at all.
package ubyte[] compressIntervals(Range)(Range intervals)
    if(isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint prev = 0;
    // RLE encode
    foreach(iv; intervals)
    {
        compressTo(iv[0]-prev, packed);
        prev = iv[0];
        if(iv[1] == lastDchar+1) // till the end of the domain so don't store it
            continue;
        compressTo(iv[1]-prev, packed);
        prev = iv[1];
    }
    return packed;
}
// Checks the exact bytes produced by compressIntervals, decoding of single
// values with decompressFrom, and the round trip through decompressIntervals.
unittest
{
auto run = [tuple(80, 127), tuple(128, (1<<10)+128)];
// deltas: 80, 47, 1 as single bytes, then 1<<10 in the two-byte form
ubyte[] enc = [cast(ubyte)80, 47, 1, (0b1_00<<5) | (1<<2), 0];
assert(compressIntervals(run) == enc);
// the last interval ends at lastDchar+1, so its end is not stored
auto run2 = [tuple(0, (1<<20)+512+1), tuple((1<<20)+512+4, lastDchar+1)];
ubyte[] enc2 = [cast(ubyte)0, (0b1_01<<5) | (1<<4), 2, 1, 3]; // odd length-ed
assert(compressIntervals(run2) == enc2);
size_t idx = 0;
assert(decompressFrom(enc, idx) == 80);
assert(decompressFrom(enc, idx) == 47);
assert(decompressFrom(enc, idx) == 1);
assert(decompressFrom(enc, idx) == (1<<10));
idx = 0;
assert(decompressFrom(enc2, idx) == 0);
assert(decompressFrom(enc2, idx) == (1<<20)+512+1);
assert(equalS(decompressIntervals(compressIntervals(run)), run));
assert(equalS(decompressIntervals(compressIntervals(run2)), run2));
}
// Creates a range of $(D CodepointInterval) that lazily decodes compressed data.
// `data` is expected to be in the format written by compressIntervals.
@safe package auto decompressIntervals(const(ubyte)[] data) pure
{
return DecompressedIntervals(data);
}
// Forward range of CodepointInterval decoded on the fly from a byte stream.
// Every boundary is stored as a difference to the previous boundary;
// a stream that ends right after an interval start means the interval
// extends to lastDchar+1.
@trusted struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;
    size_t _idx;
    CodepointInterval _front;

    // Decodes the first interval right away (or becomes empty).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // stream exhausted: switch to the "empty" marker value
        if(_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // start is relative to the end of the previous interval
        uint prevEnd = _front[1];
        _front[0] = prevEnd + decompressFrom(_stream, _idx);
        if(_idx != _stream.length)
        {
            // end is relative to the start just decoded
            uint start = _front[0];
            _front[1] = start + decompressFrom(_stream, _idx);
        }
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() { return this; }
}
static assert(isInputRange!DecompressedIntervals);
static assert(isForwardRange!DecompressedIntervals);
//============================================================================
version(std_uni_bootstrap){}
else
{
// helper for looking up code point sets
// helper for looking up code point sets
// Binary search over the names in `table` (treated as sorted by
// propertyNameLess). Returns the index of the entry whose name matches
// `name` under comparePropertyName, or -1 if there is none.
@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure
{
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of names strictly below `name` == index of the candidate
    size_t pos = names.lowerBound(name).length;
    if(pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
// another one that loads it
// Looks `name` up in `table` and, when found, stores the set built from the
// entry's compressed data in `dest`. Returns whether the name was found;
// `dest` is left untouched otherwise.
@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure
{
    auto pos = findUnicodeSet!table(name);
    if(pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
// Loads into `target` the set named by `name`; names are compared with
// comparePropertyName. Cumulative general categories and a few convenience
// names ("graphical", "any", "ascii") are assembled here from members of
// uniProps; any other name is looked up in uniProps.tab via loadUnicodeSet.
// Returns: true if one of the names handled here matched, otherwise the
// result of the table lookup.
@trusted bool loadProperty(Set=CodepointSet, C)
(in C[] name, ref Set target) pure
{
alias ucmp = comparePropertyName;
// conjure cumulative properties by hand
if(ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
{
target = asSet(uniProps.Lu);
target |= asSet(uniProps.Ll);
target |= asSet(uniProps.Lt);
target |= asSet(uniProps.Lo);
target |= asSet(uniProps.Lm);
}
else if(ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
{
target = asSet(uniProps.Ll);
target |= asSet(uniProps.Lu);
target |= asSet(uniProps.Lt);// Title case
}
else if(ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
{
target = asSet(uniProps.Mn);
target |= asSet(uniProps.Mc);
target |= asSet(uniProps.Me);
}
else if(ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
{
target = asSet(uniProps.Nd);
target |= asSet(uniProps.Nl);
target |= asSet(uniProps.No);
}
else if(ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
{
target = asSet(uniProps.Pc);
target |= asSet(uniProps.Pd);
target |= asSet(uniProps.Ps);
target |= asSet(uniProps.Pe);
target |= asSet(uniProps.Pi);
target |= asSet(uniProps.Pf);
target |= asSet(uniProps.Po);
}
else if(ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
{
target = asSet(uniProps.Sm);
target |= asSet(uniProps.Sc);
target |= asSet(uniProps.Sk);
target |= asSet(uniProps.So);
}
else if(ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
{
target = asSet(uniProps.Zs);
target |= asSet(uniProps.Zl);
target |= asSet(uniProps.Zp);
}
// NOTE(review): "C"/"Other" is built from Co plus the Lo, No, So and Po
// sets used in the branches above - confirm this is the intended meaning.
// Also note that isPrettyPropertyName does not list "C"/"Other".
else if(ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
{
target = asSet(uniProps.Co);
target |= asSet(uniProps.Lo);
target |= asSet(uniProps.No);
target |= asSet(uniProps.So);
target |= asSet(uniProps.Po);
}
// Alphabetic plus the members of M, N, P and S listed above, plus Zs only
else if(ucmp(name, "graphical") == 0){
target = asSet(uniProps.Alphabetic);
target |= asSet(uniProps.Mn);
target |= asSet(uniProps.Mc);
target |= asSet(uniProps.Me);
target |= asSet(uniProps.Nd);
target |= asSet(uniProps.Nl);
target |= asSet(uniProps.No);
target |= asSet(uniProps.Pc);
target |= asSet(uniProps.Pd);
target |= asSet(uniProps.Ps);
target |= asSet(uniProps.Pe);
target |= asSet(uniProps.Pi);
target |= asSet(uniProps.Pf);
target |= asSet(uniProps.Po);
target |= asSet(uniProps.Zs);
target |= asSet(uniProps.Sm);
target |= asSet(uniProps.Sc);
target |= asSet(uniProps.Sk);
target |= asSet(uniProps.So);
}
// the whole code point range [0, 0x110000)
else if(ucmp(name, "any") == 0)
target = Set.fromIntervals(0, 0x110000);
// [0, 0x80)
else if(ucmp(name, "ascii") == 0)
target = Set.fromIntervals(0, 0x80);
else
return loadUnicodeSet!(uniProps.tab)(name, target);
return true;
}
// CTFE-only helper for checking property names at compile-time.
// Returns true if `name` (compared with comparePropertyName) is one of the
// names that loadProperty assembles by hand. The list has to mirror the
// branches of loadProperty; "C"/"Other" used to be missing, so those names
// were accepted by the run-time lookup but rejected at compile time.
@safe bool isPrettyPropertyName(C)(in C[] name)
{
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}
// ditto, CTFE-only, not optimized
// True if `table` has an entry whose name matches `name`
// (findUnicodeSet returns a non-negative index in that case).
@safe private static bool findSetName(alias table, C)(in C[] name)
{
return findUnicodeSet!table(name) >= 0;
}
// Mixed into the lookup structs inside `unicode` (block, script,
// hangulSyllableType). Provides the two access forms for one table:
// Searcher("name") checked at run time and Searcher.name checked at
// compile time. `kind` is only used in the error messages.
template SetSearcher(alias table, string kind)
{
/// Run-time checked search.
static auto opCall(C)(in C[] name)
if(is(C : dchar))
{
CodepointSet set;
if(loadUnicodeSet!table(name, set))
return set;
throw new Exception("No unicode set for "~kind~" by name "
~name.to!string()~" was found.");
}
/// Compile-time checked search.
static @property auto opDispatch(string name)()
{
// the existence check runs during compilation; loading the set itself
// is an ordinary call
static if(findSetName!table(name))
{
CodepointSet set;
loadUnicodeSet!table(name, set);
return set;
}
else
static assert(false, "No unicode set for "~kind~" by name "
~name~" was found.");
}
}
/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
    This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        $(D unicode.InBlockName), to search a script
        use $(D unicode.ScriptName).

        See also $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).

        Example:
        ---
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        import std.exception;
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
        ---
    */
    static @property auto opDispatch(string name)() pure
    {
        static if(findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where $(D name)
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(in C[] name)
        if(is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        See also $(S_LINK Unicode properties, table of properties).

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply $(D unicode.block.BlockName) notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.

        Example:
        ---
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
        ---
    */
    struct block
    {
        mixin SetSearcher!(blocks.tab, "block");
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Example:
        ---
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['\u0628']);
        assert(arabicScript['\u0628']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
        ---
    */
    struct script
    {
        mixin SetSearcher!(scripts.tab, "script");
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - $(D unicode.propertyName.propertyValue) for compile-time
        checked access and $(D unicode.propertyName(propertyValue))
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Example:
        ---
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach(vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
        ---
    */
    struct hangulSyllableType
    {
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time existence check behind opDispatch: true if `name` is one
    // of the hand-assembled properties, an entry of the property or script
    // tables, or "In" followed by a block name.
    static bool findAny(string name)
    {
        // the length guard mirrors loadAny: without it a name shorter than
        // 2 characters that matches nothing else would hit an out-of-range
        // slice instead of yielding false
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0..2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Run-time counterpart of findAny that also loads the set.
    // Throws an Exception when nothing matches `name`.
    static auto loadAny(Set=CodepointSet, C)(in C[] name) pure
    {
        Set set;
        bool loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0..2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if(loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}
// Exercises compile-time and run-time lookup through `unicode`, including
// the block, script and hangulSyllableType sub-searchers.
unittest
{
    auto ascii = unicode.ASCII;
    assert(ascii['A']);
    assert(ascii['~']);
    assert(!ascii['\u00e0']);
    // matching is case-insensitive
    assert(ascii == unicode.ascII);
    assert(!ascii['à']);
    // underscores, '-' and whitespace in names are ignored too
    auto latin = unicode.Inlatin1_Supplement;
    assert(latin['à']);
    assert(!latin['$']);
    // BTW Latin 1 Supplement is a block, hence "In" prefix
    assert(latin == unicode("In Latin 1 Supplement"));
    import std.exception;
    // R-T look up throws if no such set is found
    assert(collectException(unicode("InCyrilliac")));
    assert(collectException(unicode("X")));

    assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);

    // L here is explicitly syllable type not "Letter" as in unicode.L
    auto leadingVowel = unicode.hangulSyllableType("L");
    // check that some leading vowels are present
    foreach(vowel; '\u1110'..'\u115F'+1)
        assert(leadingVowel[vowel]);
    assert(leadingVowel == unicode.hangulSyllableType.L);

    auto arabicScript = unicode.script.arabic;
    auto arabicBlock = unicode.block.arabic;
    // there is an intersection between script and block
    // (the character literals were empty, which does not compile; U+0628
    // ARABIC LETTER BEH is written as an escape so it cannot get lost again)
    assert(arabicBlock['\u0628']);
    assert(arabicScript['\u0628']);
    // but they are different
    assert(arabicBlock != arabicScript);
    assert(arabicBlock == unicode.inArabic);
    assert(arabicScript == unicode.arabic);
}
// Run-time lookups compared against sets built directly from the tables.
unittest
{
assert(unicode("InHebrew") == asSet(blocks.Hebrew));
// cumulative property assembled by loadProperty
assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
// '-' in the name is ignored by the name comparison
assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
// control - '\r'
// Source text of case labels, mixed into the switch in genericDecodeGrapheme.
// Covers U+0000..U+001F except U+000D ('\r'), and U+007F..U+009F.
enum controlSwitch = `
case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// kill unrolled switches
// True for code points in the inclusive range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe
{
    enum dchar first = '\U0001F1E6';
    enum dchar last = '\U0001F1FF';
    return !(ch < first || ch > last);
}
// Shared implementation of grapheme cluster decoding.
// With getValue == true the consumed code points are collected into a
// Grapheme that is returned; with getValue == false the function returns
// void and only advances `range` past the cluster.
template genericDecodeGrapheme(bool getValue)
{
alias graphemeExtend = graphemeExtendTrie;
alias spacingMark = mcTrie;
static if(getValue)
alias Value = Grapheme;
else
alias Value = void;
// `range` is taken by ref and is advanced past everything consumed.
// It must not be empty on entry (checked by the assert below).
Value genericDecodeGrapheme(Input)(ref Input range)
{
// states of the scanner; L, V and LVT track Hangul syllable sequences
enum GraphemeState {
Start,
CR,
RI,
L,
V,
LVT
}
static if(getValue)
Grapheme grapheme;
auto state = GraphemeState.Start;
// code mixed in wherever the current code point is accepted:
// append it to the result (if one is built) and advance the input
enum eat = q{
static if(getValue)
grapheme ~= ch;
range.popFront();
};
dchar ch;
assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
while(!range.empty)
{
ch = range.front;
final switch(state) with(GraphemeState)
{
case Start:
// the first code point is always consumed; it only selects the next state
mixin(eat);
if(ch == '\r')
state = CR;
else if(isRegionalIndicator(ch))
state = RI;
else if(isHangL(ch))
state = L;
else if(hangLV[ch] || isHangV(ch))
state = V;
else if(hangLVT[ch])
state = LVT;
else if(isHangT(ch))
state = LVT;
else
{
switch(ch)
{
// code points listed in controlSwitch end the cluster at once,
// skipping the trailing extend/spacing-mark loop
mixin(controlSwitch);
goto L_End;
default:
goto L_End_Extend;
}
}
break;
case CR:
// '\r' takes a directly following '\n' along; either way the main loop ends
if(ch == '\n')
mixin(eat);
goto L_End_Extend;
case RI:
// a run of regional indicators stays in one cluster
if(isRegionalIndicator(ch))
mixin(eat);
else
goto L_End_Extend;
break;
case L:
if(isHangL(ch))
mixin(eat);
else if(isHangV(ch) || hangLV[ch])
{
state = V;
mixin(eat);
}
else if(hangLVT[ch])
{
state = LVT;
mixin(eat);
}
else
goto L_End_Extend;
break;
case V:
if(isHangV(ch))
mixin(eat);
else if(isHangT(ch))
{
state = LVT;
mixin(eat);
}
else
goto L_End_Extend;
break;
case LVT:
if(isHangT(ch))
{
mixin(eat);
}
else
goto L_End_Extend;
break;
}
}
// also reached when the main loop runs out of input
L_End_Extend:
while(!range.empty)
{
ch = range.front;
// extend & spacing marks
if(!graphemeExtend[ch] && !spacingMark[ch])
break;
mixin(eat);
}
L_End:
static if(getValue)
return grapheme;
}
}
@trusted:
public: // Public API continues
/++
    Computes the length of the grapheme cluster that starts at $(D index)
    in $(D input).

    Both the resulting length and the $(D index) are measured
    in $(S_LINK Code unit, code units).

    Example:
    ---
    // ASCII as usual is 1 code unit, 1 code point etc.
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0..first] == "A\u030A");
    assert(city[first..$] == "rhus");
    ---
+/
size_t graphemeStride(C)(in C[] input, size_t index)
    if(is(C : dchar))
{
    auto rest = input[index..$];
    immutable before = rest.length;
    // decoding without building a value only advances `rest`
    genericDecodeGrapheme!(false)(rest);
    // whatever got consumed is the stride
    return before - rest.length;
}
// for now tested separately see test_grapheme.d
unittest
{
    // The input needs two characters: with a one-character string the slice
    // starting at index 1 is empty, which genericDecodeGrapheme rejects.
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0..first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
/++
Reads one full grapheme cluster from an input range of dchar $(D inp).
For examples see the $(LREF Grapheme) below.
Note:
This function modifies $(D inp) and thus $(D inp)
must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if(isInputRange!Input && is(Unqual!(ElementType!Input) == dchar))
{
// value-building flavour of the shared decoder
return genericDecodeGrapheme!true(inp);
}
// Decodes clusters one after another from the same string; each call
// consumes the cluster it returns.
unittest
{
Grapheme gr;
string s = " \u0020\u0308 ";
gr = decodeGrapheme(s);
assert(gr.length == 1 && gr[0] == ' ');
// a space followed by U+0308 comes back as one cluster of two code points
gr = decodeGrapheme(s);
assert(gr.length == 2 && equalS(gr[0..2], " \u0308"));
// a string that starts with U+0300 U+0308: both come back together
s = "\u0300\u0308\u1100";
assert(equalS(decodeGrapheme(s)[], "\u0300\u0308"));
assert(equalS(decodeGrapheme(s)[], "\u1100"));
s = "\u11A8\u0308\uAC01";
assert(equalS(decodeGrapheme(s)[], "\u11A8\u0308"));
assert(equalS(decodeGrapheme(s)[], "\uAC01"));
}
/++
    $(P Iterate a string by grapheme.)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
    if(isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // TODO: Bidirectional access
    static struct Result
    {
        private Range _range;
        private Grapheme _front;

        // a zero-length front doubles as the end marker
        bool empty() @property
        {
            return _front.length == 0;
        }

        Grapheme front() @property
        {
            return _front;
        }

        // decodes the next cluster, or resets the front once the source is exhausted
        void popFront()
        {
            if(_range.empty)
                _front = Grapheme.init;
            else
                _front = decodeGrapheme(_range);
        }

        static if(isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    // prime the range so that front is valid right away
    auto graphemes = Result(range);
    graphemes.popFront();
    return graphemes;
}
///
unittest
{
auto text = "noe\u0308l"; // noël using e + combining diaeresis
assert(text.walkLength == 5); // 5 code points
auto gText = text.byGrapheme;
assert(gText.walkLength == 4); // 4 graphemes
// ranges of graphemes can be compared element-wise
assert(gText.take(3).equal("noe\u0308".byGrapheme));
assert(gText.drop(3).equal("l".byGrapheme));
}
// For testing non-forward-range input ranges
// Wraps a string and exposes only empty/front/popFront (no save).
version(unittest)
private static struct InputRangeString
{
private string s;
bool empty() @property { return s.empty; }
dchar front() @property { return s.front; }
void popFront() { s.popFront(); }
}
// byGrapheme over an empty string, over all three string widths, and over
// a range that is not a forward range.
unittest
{
assert("".byGrapheme.walkLength == 0);
auto reverse = "le\u0308on";
assert(reverse.walkLength == 5);
auto gReverse = reverse.byGrapheme;
assert(gReverse.walkLength == 4);
// char, wchar and dchar strings
foreach(text; TypeTuple!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
{
assert(text.walkLength == 5);
static assert(isForwardRange!(typeof(text)));
auto gText = text.byGrapheme;
// save is available because the source is a forward range
static assert(isForwardRange!(typeof(gText)));
assert(gText.walkLength == 4);
assert(gText.array.retro.equal(gReverse));
}
auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
static assert(!isForwardRange!(typeof(nonForwardRange)));
assert(nonForwardRange.walkLength == 4);
}
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P Acts as the identity function when given a range of code points.)
+/
auto byCodePoint(Range)(Range range)
    if(isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // position inside the grapheme currently at the front of _range
        private size_t i = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            // still inside the current grapheme?
            if(++i < _range.front.length)
                return;
            // move on to the next grapheme and restart at its first code point
            _range.popFront();
            i = 0;
        }

        static if(isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
/// Ditto
Range byCodePoint(Range)(Range range)
if(isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
// already a range of code points: hand it back unchanged
return range;
}
///
unittest
{
import std.conv : text;
string s = "noe\u0308l"; // noël
// reverse it and convert the result to a string
// (reversing grapheme by grapheme keeps the diaeresis on its base letter)
string reverse = s.byGrapheme
.array
.retro
.byCodePoint
.text;
assert(reverse == "le\u0308on"); // lëon
}
// byCodePoint on an empty input, on a plain string (identity overload) and
// on graphemes coming from a non-forward range.
unittest
{
assert("".byGrapheme.byCodePoint.equal(""));
string text = "noe\u0308l";
// the dchar overload returns its argument type unchanged
static assert(is(typeof(text.byCodePoint) == string));
auto gText = InputRangeString(text).byGrapheme;
static assert(!isForwardRange!(typeof(gText)));
auto cpText = gText.byCodePoint;
static assert(!isForwardRange!(typeof(cpText)));
assert(cpText.walkLength == text.walkLength);
}
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme)
    always refer to distinct objects. In most actual scenarios a $(D Grapheme)
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    Example:
    ---
    import std.algorithm;
    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    // slicing a grapheme yields a range of dchar
    assert(decodeGrapheme(bold)[].equal("k"));

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    assert(wideOne.length == 2);
    assert(wideOne[].equal("u\u0308"));

    // the usual range manipulation is possible
    assert(wideOne[].filter!isMark.equal("\u0308"));
    ---
    $(P See also $(LREF decodeGrapheme), $(LREF graphemeStride). )
+/
@trusted struct Grapheme
{
public:
    /// Constructs a Grapheme from a list (or array) of code points.
    this(C)(in C[] chars...)
        if(is(C : dchar))
    {
        this ~= chars;
    }

    /// Constructs a Grapheme from any non-array input range of code points.
    this(Input)(Input seq)
        if(!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const pure nothrow
    {
        assert(index < length);
        // code points are stored as packed 24-bit values in either buffer
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) $(D ch) at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).

        Example:
        ---
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilda is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
        ---
    +/
    void opIndexAssign(dchar ch, size_t index) pure nothrow
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    @system auto opSlice(size_t a, size_t b) pure nothrow
    {
        // the slice keeps a pointer to this object, hence @system
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    @system auto opSlice() pure nothrow
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const pure nothrow
    {
        // in small mode the length shares a byte with the "big" flag
        return isBig ? len_ : slen_ & small_mask;
    }

    /++
        Append $(CHARACTER) $(D ch) to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(D valid).

        Example:
        ---
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
        ---
        See also $(LREF Grapheme.valid) below.
    +/
    ref opOpAssign(string op)(dchar ch)
    {
        static if(op == "~")
        {
            if(!isBig)
            {
                if(slen_ + 1 > small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if(len_ + 1 > cap_)
            {
                // commit the new capacity only after realloc succeeded, so that
                // a failed allocation (enforce throws) leaves cap_ matching ptr_
                immutable newCap = cap_ + grow;
                ptr_ = cast(ubyte*)enforce(realloc(ptr_, 3*(newCap+1)));
                cap_ = newCap;
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme.
    ref opOpAssign(string op, Input)(Input inp)
        if(isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if(op == "~")
        {
            foreach(dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid $(D Grapheme).

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may chose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid iff decoding one grapheme consumes the whole content
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy in "big" mode gets its own heap buffer (value semantics).
    this(this)
    {
        if(isBig)
        {// dup it
            auto raw_cap = 3*(cap_+1);
            auto p = cast(ubyte*)enforce(malloc(raw_cap));
            p[0..raw_cap] = ptr_[0..raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if one was ever allocated.
    ~this()
    {
        if(isBig)
        {
            free(ptr_);
        }
    }

private:
    // bytes available for in-place storage; the last byte of the struct holds slen_
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // each code point takes 3 bytes
    enum small_cap = small_bytes/3;
    // top bit of slen_ marks "big" mode, the low 7 bits are the small length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the in-place content to a freshly allocated heap buffer
    // and switches this Grapheme to "big" mode.
    // NOTE(review): the copy loop was reconstructed from the surrounding code;
    // confirm against the upstream source.
    void convertToBig()
    {
        size_t k = smallLength;
        ubyte* p = cast(ubyte*)enforce(malloc(3*(grow+1)));
        // copy before touching ptr_/len_/cap_: they alias small_ through the union
        foreach(i; 0 .. k)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = k;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig(){ slen_ |= small_flag; }

    @property size_t smallLength() pure nothrow
    {
        return slen_ & small_mask;
    }

    // non-zero when the heap-allocated ("big") representation is active
    @property ubyte isBig() const pure nothrow
    {
        return slen_ & small_flag;
    }
}
static assert(Grapheme.sizeof == size_t.sizeof*4);
// verify the example
// (mirrors the documentation examples of Grapheme and of its append operator)
unittest
{
import std.algorithm;
string bold = "ku\u0308hn";
// note that decodeGrapheme takes parameter by ref
auto first = decodeGrapheme(bold);
assert(first.length == 1);
assert(first[0] == 'k');
// the next grapheme is 2 characters long
auto wideOne = decodeGrapheme(bold);
// slicing a grapheme yields a random-access range of dchar
assert(wideOne[].equalS("u\u0308"));
assert(wideOne.length == 2);
static assert(isRandomAccessRange!(typeof(wideOne[])));
// all of the usual range manipulation is possible
assert(wideOne[].filter!isMark().equalS("\u0308"));
auto g = Grapheme("A");
assert(g.valid);
g ~= '\u0301';
assert(g[].equalS("A\u0301"));
assert(g.valid);
g ~= "B";
// not a valid grapheme cluster anymore
assert(!g.valid);
// still could be useful though
assert(g[].equalS("A\u0301B"));
}
unittest
{
// Mirrors the opIndexAssign documentation example:
// overwriting the combining mark with a non-mark invalidates the cluster.
auto g = Grapheme("A\u0302");
assert(g[0] == 'A');
assert(g.valid);
g[1] = '~'; // ASCII tilda is not a combining mark
assert(g[1] == '~');
assert(!g.valid);
}
unittest
{
// Exercises Grapheme as a plain container: indexing, in-place writes,
// appending, copying and re-assignment.
// not valid clusters (but it just a test)
auto g = Grapheme('a', 'b', 'c', 'd', 'e');
assert(g[0] == 'a');
assert(g[1] == 'b');
assert(g[2] == 'c');
assert(g[3] == 'd');
assert(g[4] == 'e');
// writing one slot must not disturb its neighbours
g[3] = 'Й';
assert(g[2] == 'c');
assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
assert(g[4] == 'e');
assert(!g.valid);
g ~= 'ц';
g ~= '~';
assert(g[0] == 'a');
assert(g[1] == 'b');
assert(g[2] == 'c');
assert(g[3] == 'Й');
assert(g[4] == 'e');
assert(g[5] == 'ц');
assert(g[6] == '~');
assert(!g.valid);
// a copy is independent of the original (value semantics)
Grapheme copy = g;
copy[0] = 'X';
copy[1] = '-';
assert(g[0] == 'a' && copy[0] == 'X');
assert(g[1] == 'b' && copy[1] == '-');
assert(equalS(g[2..g.length], copy[2..copy.length]));
copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
assert(equalS(copy[0..8], "АБВГДЕЁЖ"), text(copy[0..8]));
copy ~= "xyz";
assert(equalS(copy[13..15], "xy"), text(copy[13..15]));
assert(!copy.valid);
// append 26 code points one at a time, which exceeds the `grow` step of 20
Grapheme h;
foreach(dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
h ~= v;
assert(equalS(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
/++
$(P Does basic case-insensitive comparison of strings $(D str1) and $(D str2).
This function uses simpler comparison rule thus achieving better performance
than $(LREF icmp). However keep in mind the warning below.)
Warning:
This function only handles 1:1 $(CODEPOINT) mapping
and thus is not sufficient for certain alphabets
like German, Greek and few others.
Example:
---
assert(sicmp("Август", "авгусТ") == 0);
// Greek also works as long as there is no 1:M mapping in sight
assert(sicmp("ΌΎ", "όύ") == 0);
// things like the following won't get matched as equal
// Greek small letter iota with dialytika and tonos
assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
// while icmp has no problem with that
assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
assert(icmp("ΌΎ", "όύ") == 0);
---
+/
int sicmp(S1, S2)(S1 str1, S2 str2)
if(isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar)
&& isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
{
// NOTE(review): the constraint admits any forward range of dchar, but the body
// needs str2.length and std.utf.decode(str2, ridx) - confirm the intended inputs.
import std.utf : decode;
alias sTable = simpleCaseTable;
// ridx is the decoding position (in code units) inside str2
size_t ridx=0;
foreach(dchar lhs; str1)
{
// str2 ran out first: str1 is the longer (greater) one
if(ridx == str2.length)
return 1;
dchar rhs = std.utf.decode(str2, ridx);
int diff = lhs - rhs;
// identical code points need no table lookup
if(!diff)
continue;
size_t idx = simpleCaseTrie[lhs];
size_t idx2 = simpleCaseTrie[rhs];
// simpleCaseTrie is packed index table
if(idx != EMPTY_CASE_TRIE)
{
if(idx2 != EMPTY_CASE_TRIE)
{// both cased chars
// adjust idx --> start of bucket
idx = idx - sTable[idx].n;
idx2 = idx2 - sTable[idx2].n;
if(idx == idx2)// one bucket, equivalent chars
continue;
else// not the same bucket
diff = sTable[idx].ch - sTable[idx2].ch;
}
else
// only lhs is cased: compare the first entry of its bucket with rhs
diff = sTable[idx - sTable[idx].n].ch - rhs;
}
else if(idx2 != EMPTY_CASE_TRIE)
{
// only rhs is cased: compare lhs with the first entry of its bucket
diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
}
// one of chars is not cased at all
return diff;
}
// str1 is exhausted: equal if str2 is exhausted too, otherwise str1 is a prefix
return ridx == str2.length ? 0 : -1;
}
// overloads for the most common cases to reduce compile time
// Each one forwards to the sicmp template explicitly instantiated for
// same-width character slices.
@safe pure /*TODO nothrow*/
{
int sicmp(const(char)[] str1, const(char)[] str2)
{ return sicmp!(const(char)[], const(char)[])(str1, str2); }
int sicmp(const(wchar)[] str1, const(wchar)[] str2)
{ return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }
int sicmp(const(dchar)[] str1, const(dchar)[] str2)
{ return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
// Tries to match $(D lhs) against $(D rhs) using the full case-folding table.
// Returns 0 when some folding of lhs equals rhs (a multi-code-point folding
// may additionally consume matching code points from the front of rtail).
// Otherwise returns a non-zero code point to be used for ordering.
// NOTE(review): the loop body and the final return were reconstructed after
// text loss in this region; confirm against the upstream source.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
    @trusted pure /*TODO nothrow*/
{
    alias fTable = fullCaseTable;
    size_t idx = fullCaseTrie[lhs];
    // fullCaseTrie is packed index table
    if(idx == EMPTY_CASE_TRIE)
        return lhs;
    // step back to the first entry of the bucket lhs belongs to
    size_t start = idx - fTable[idx].n;
    size_t end = fTable[idx].size + start;
    assert(fTable[start].entry_len == 1);
    for(idx=start; idx<end; idx++)
    {
        auto entryLen = fTable[idx].entry_len;
        if(entryLen == 1)
        {
            if(fTable[idx].seq[0] == rhs)
            {
                return 0;
            }
        }
        else
        {// OK it's a long chunk, like 'ss' for German
            dstring seq = fTable[idx].seq[0..entryLen];
            if(rhs == seq[0]
                && rtail.skipOver(seq[1..$]))
            {
                // note that this path modifies rtail
                // iff we managed to get there
                return 0;
            }
        }
    }
    return fTable[start].seq[0]; // new remapped character for accurate diffs
}
/++
    $(P Does case insensitive comparison of $(D str1) and $(D str2).
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of $(D icmp) being pedantically correct is
    slightly worse performance.
    )
    Example:
    ---
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    ---
+/
int icmp(S1, S2)(S1 str1, S2 str2)
    if(isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar)
        && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
{
    while(true)
    {
        // an exhausted left side sorts first unless both sides ended together
        if(str1.empty)
            return str2.empty ? 0 : -1;
        dchar left = str1.front;
        if(str2.empty)
            return 1;
        dchar right = str2.front;
        str1.popFront();
        str2.popFront();

        // identical code points: nothing to fold
        if(left == right)
            continue;

        // try to fold the left code point onto the right one; a 1:M folding
        // consumes the matching tail of str2
        int foldedLeft = fullCasedCmp(left, right, str2);
        if(foldedLeft == 0)
            continue;

        // and the other way round, consuming from str1
        int foldedRight = fullCasedCmp(right, left, str1);
        if(foldedRight == 0)
            continue;

        // both results are remapped code points, so their difference
        // gives icmp an ordering that does not depend on case
        return foldedLeft - foldedRight;
    }
}
// overloads for the most common cases to reduce compile time
// Each one forwards to the icmp template explicitly instantiated for
// same-width character slices.
@safe pure /*TODO nothrow*/
{
int icmp(const(char)[] str1, const(char)[] str2)
{ return icmp!(const(char)[], const(char)[])(str1, str2); }
int icmp(const(wchar)[] str1, const(wchar)[] str2)
{ return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
int icmp(const(dchar)[] str1, const(dchar)[] str2)
{ return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
unittest
{
// Runs the same checks at compile time and at run time (assertCTFEable)
// for both comparison functions and every pairing of string widths.
assertCTFEable!(
{
foreach(cfunc; TypeTuple!(icmp, sicmp))
{
foreach(S1; TypeTuple!(string, wstring, dstring))
foreach(S2; TypeTuple!(string, wstring, dstring))
{
assert(cfunc("".to!S1(), "".to!S2()) == 0);
assert(cfunc("A".to!S1(), "".to!S2()) > 0);
assert(cfunc("".to!S1(), "0".to!S2()) < 0);
assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
// Check example:
assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
}
// check that the order is properly agnostic to the case
auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
sort!((a,b) => cfunc(a,b) < 0)(strs);
assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
}
// 1:M foldings are handled by icmp only
assert(icmp("ßb", "ssa") > 0);
// Check example:
assert(icmp("Russland", "Rußland") == 0);
assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
//bugzilla 11057
assert( icmp("K", "L") < 0 );
});
}
// This is package for the moment to be used as a support tool for std.regex
// It needs a better API
/*
Return a range of all $(CODEPOINTS) that casefold to
and from this $(D ch).
*/
package auto simpleCaseFoldings(dchar ch)
{
alias sTable = simpleCaseTable;
// Result range with two modes: "small" holds the single code point itself,
// otherwise it walks `len` consecutive entries of sTable starting at `idx`.
static struct Range
{
pure nothrow:
uint idx; //if == uint.max, then read c.
union
{
dchar c; // == 0 - empty range
uint len;
}
@property bool isSmall() const { return idx == uint.max; }
// small mode: a range over just `ch`
this(dchar ch)
{
idx = uint.max;
c = ch;
}
// table mode: `size` entries starting at table index `start`
this(uint start, uint size)
{
idx = start;
len = size;
}
@property dchar front() const
{
assert(!empty);
if(isSmall)
{
return c;
}
auto ch = sTable[idx].ch;
return ch;
}
@property bool empty() const
{
if(isSmall)
{
return c == 0;
}
return len == 0;
}
@property uint length() const
{
if(isSmall)
{
return c == 0 ? 0 : 1;
}
return len;
}
void popFront()
{
if(isSmall)
// zero doubles as the "consumed" marker in small mode
c = 0;
else
{
idx++;
len--;
}
}
}
immutable idx = simpleCaseTrie[ch];
// no table entry: the result is just ch itself
if (idx == EMPTY_CASE_TRIE)
return Range(ch);
auto entry = sTable[idx];
// entry.n is the offset of this entry inside its bucket
immutable start = idx - entry.n;
return Range(start, entry.size);
}
unittest
{
// Covers a two-entry bucket, a code point without case foldings,
// and a three-entry bucket; checked both at compile time and at run time.
assertCTFEable!((){
auto r = simpleCaseFoldings('Э').array;
assert(r.length == 2);
assert(r.canFind('э') && r.canFind('Э'));
auto sr = simpleCaseFoldings('~');
assert(sr.equalS("~"));
//A with ring above - casefolds to the same bucket as Angstrom sign
sr = simpleCaseFoldings('Å');
assert(sr.length == 3);
assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
});
}
/++
    $(P Looks up the $(S_LINK Combining class, combining class) of $(D ch).)

    Example:
    ---
    // shorten the code
    alias CC = combiningClass;

    // combining tilda
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that  "tilda" should be
    // placed after a "ring below" in a sequence
    ---
+/
ubyte combiningClass(dchar ch)
{
    // a single lookup in the precomputed trie
    immutable ubyte cls = combiningClassTrie[ch];
    return cls;
}
unittest
{
// every ASCII code point has combining class 0
foreach(ch; 0..0x80)
assert(combiningClass(ch) == 0);
// spot checks of non-zero combining classes
assert(combiningClass('\u05BD') == 22);
assert(combiningClass('\u0300') == 230);
assert(combiningClass('\u0317') == 220);
assert(combiningClass('\u1939') == 222);
}
/// Unicode character decomposition type.
enum UnicodeDecomposition {
/// Canonical decomposition. The result is a canonically equivalent sequence.
Canonical,
/**
Compatibility decomposition. The result is a compatibility equivalent sequence.
Note: Compatibility decomposition is a $(B lossy) conversion,
typically suitable only for fuzzy matching and internal processing.
*/
Compatibility
};
/**
Shorthand aliases for the character decomposition type, passed as a
template parameter to $(LREF decompose).
*/
enum {
Canonical = UnicodeDecomposition.Canonical,
Compatibility = UnicodeDecomposition.Compatibility
};
/++
    Attempts canonical composition of a pair of $(CHARACTERS).
    Yields the composed $(CHARACTER) when the pair composes
    and dchar.init when it does not.

    $(D first) is expected to precede $(D second) in the original text,
    which usually means that $(D first) is a starter.

    Note: Hangul syllables are not covered by this function.
    See $(D composeJamo) below.

    Example:
    ---
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
    ---
+/
public dchar compose(dchar first, dchar second)
{
    import std.internal.unicode_comp;
    size_t packed = compositionJumpTrie[first];
    // ushort.max marks a code point that starts no composition at all
    if(packed == ushort.max)
        return dchar.init;
    // the trie value packs the table offset together with the candidate count
    size_t base = packed & composeIdxMask;
    size_t count = packed >> composeCntShift;
    // TODO: optimize this micro binary search (no more then 4-5 steps)
    auto candidates = compositionTable[base..base+count].map!"a.rhs"().assumeSorted();
    auto pos = candidates.lowerBound(second).length;
    if(pos != count)
    {
        // lowerBound gives the first entry that is not less than second,
        // so an exact match still has to be confirmed
        auto hit = compositionTable[base+pos];
        if(hit.rhs == second)
            return hit.composed;
    }
    return dchar.init;
}
/++
Returns a full $(S_LINK Canonical decomposition, Canonical)
(by default) or $(S_LINK Compatibility decomposition, Compatibility)
decomposition of $(CHARACTER) $(D ch).
If no decomposition is available returns a $(LREF Grapheme)
with the $(D ch) itself.
Note:
This function also decomposes hangul syllables
as prescribed by the standard.
See also $(LREF decomposeHangul) for a restricted version
that takes into account only hangul syllables but
no other decompositions.
Example:
---
import std.algorithm;
assert(decompose('Ĉ')[].equal("C\u0302"));
assert(decompose('D')[].equal("D"));
assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
assert(decompose!Compatibility('¹')[].equal("1"));
---
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch)
{
import std.internal.unicode_decomp;
// pick the table/trie pair that matches the requested decomposition type
static if(decompType == Canonical)
{
alias table = decompCanonTable;
alias mapping = canonMappingTrie;
}
else static if(decompType == Compatibility)
{
alias table = decompCompatTable;
alias mapping = compatMappingTrie;
}
ushort idx = mapping[ch];
if(!idx) // not found, check hangul arithmetic decomposition
return decomposeHangul(ch);
// the decomposition is stored in the table as a 0-terminated run starting at idx
auto decomp = table[idx..$].until(0);
return Grapheme(decomp);
}
unittest
{
// verify examples
// (from the documentation of compose and decompose)
assert(compose('A','\u0308') == '\u00C4');
assert(compose('A', 'B') == dchar.init);
assert(compose('C', '\u0301') == '\u0106');
// note that the starter is the first one
// thus the following doesn't compose
assert(compose('\u0308', 'A') == dchar.init);
import std.algorithm;
assert(decompose('Ĉ')[].equalS("C\u0302"));
assert(decompose('D')[].equalS("D"));
assert(decompose('\uD4DC')[].equalS("\u1111\u1171\u11B7"));
assert(decompose!Compatibility('¹')[].equalS("1"));
}
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// First code point of the precomposed Hangul syllables.
enum jamoSBase = 0xAC00;
// First leading consonant (L) jamo.
enum jamoLBase = 0x1100;
// First vowel (V) jamo.
enum jamoVBase = 0x1161;
// One below the first trailing consonant (T) jamo: index 0 means "no trailing consonant".
enum jamoTBase = 0x11A7;
// Number of L, V and T indices.
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
// Number of syllables per leading consonant.
enum jamoNCount = jamoVCount * jamoTCount;
// Total number of precomposed syllables.
enum jamoSCount = jamoLCount * jamoNCount;
// True when $(D ch) lies in the Hangul leading consonant (L) jamo range.
bool isJamoL(dchar ch)
{
    // the upper bound goes first: it alone rejects the ~1M code points
    // that lie above the leading jamo range
    immutable belowEnd = ch < jamoLBase+jamoLCount;
    return belowEnd && ch >= jamoLBase;
}
// True when $(D ch) lies in the Hangul trailing consonant (T) jamo range.
bool isJamoT(dchar ch)
{
    // the upper bound goes first: it alone rejects the ~1M code points
    // that lie above the trailing jamo range
    immutable belowEnd = ch < jamoTBase+jamoTCount;
    // Note: jamoTBase itself is excluded, a trailing jamo must have TIndex > 0
    return belowEnd && ch > jamoTBase;
}
// True when $(D ch) lies in the Hangul vowel (V) jamo range.
bool isJamoV(dchar ch)
{
    // the upper bound goes first: it alone rejects the ~1M code points
    // that lie above the vowel range
    immutable belowEnd = ch < jamoVBase+jamoVCount;
    return belowEnd && ch >= jamoVBase;
}
// Zero-based index of $(D ch) among the precomposed Hangul syllables,
// or -1 when $(D ch) is not one of them.
int hangulSyllableIndex(dchar ch)
{
    immutable int offset = cast(int)ch - jamoSBase;
    if(offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
// internal helper: composes Hangul syllables in place; the slots of the
// consumed jamos are overwritten with dchar.init (holes)
void hangulRecompose(dchar[] seq)
{
    size_t pos = 0;
    while(pos + 1 < seq.length)
    {
        // a syllable can only start with an L jamo followed by a V jamo
        if(!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }
        int indexL = seq[pos] - jamoLBase;
        int indexV = seq[pos+1] - jamoVBase;
        int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if(hasTrailing)
        {
            // LVT syllable: three slots collapse into one
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // LV syllable: two slots collapse into one
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
//----------------------------------------------------------------------------
public:
/**
    Splits a Hangul syllable into its jamos. When $(D ch) is not a composed
    syllable the result is a $(LREF Grapheme) that holds just $(D ch).

    Example:
    ---
    import std.algorithm;
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    ---
*/
Grapheme decomposeHangul(dchar ch)
{
    int sIndex = cast(int)ch - jamoSBase;
    // outside of the precomposed syllable block: nothing to decompose
    if(sIndex < 0 || sIndex >= jamoSCount)
        return Grapheme(ch);
    int lIndex = sIndex / jamoNCount;
    int vIndex = (sIndex % jamoNCount) / jamoTCount;
    int tIndex = sIndex % jamoTCount;
    int leading = jamoLBase + lIndex;
    int vowel = jamoVBase + vIndex;
    if(tIndex <= 0) // no trailing consonant: <L,V> decomposition
        return Grapheme(leading, vowel);
    // there is a trailing consonant (T): <L,V,T> decomposition
    return Grapheme(leading, vowel, jamoTBase + tIndex);
}
/++
    Attempts to build a Hangul syllable from a leading consonant ($(D lead)),
    a $(D vowel) and an optional $(D trailing) consonant jamo.

    Returns the composed LV or LVT syllable on success.
    Returns dchar.init when $(D lead) or $(D vowel) is not a valid hangul jamo
    of the respective $(CHARACTER) class.

    Example:
    ---
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out T-vowel, or passing any codepoint
    // that is not trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
    ---
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init)
{
    // both mandatory parts must belong to their jamo classes
    if(!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;
    int indexL = lead - jamoLBase;
    int indexV = vowel - jamoVBase;
    int indexLV = indexL * jamoNCount + indexV * jamoTCount;
    dchar syllable = jamoSBase + indexLV;
    // anything that is not a trailing consonant jamo is ignored (LV syllable)
    return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
}
unittest
{
// helper: decompose ch with the given decomposition type and compare against r
static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
{
Grapheme g = decompose!T(ch);
assert(equalS(g[], r), text(g[], " vs ", r));
}
testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
testDecomp!Canonical('\uF907', "\u9F9C");
testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
testDecomp!Compatibility('\uA7F9', "\u0153");
// check examples
assert(decomposeHangul('\uD4DB')[].equalS("\u1111\u1171\u11B6"));
assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel
assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
assert(composeJamo('\u1111', 'A') == dchar.init);
assert(composeJamo('A', '\u1171') == dchar.init);
}
/**
Enumeration type for normalization forms,
passed as a template parameter to functions like $(LREF normalize).
*/
enum NormalizationForm {
NFC,
NFD,
NFKC,
NFKD
}
enum {
/**
Shorthand aliases for values indicating normalization forms.
*/
NFC = NormalizationForm.NFC,
///ditto
NFD = NormalizationForm.NFD,
///ditto
NFKC = NormalizationForm.NFKC,
///ditto
NFKD = NormalizationForm.NFKD
};
/++
    Returns $(D input) string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.

    Example:
    ---
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
    ---
+/
inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
{
    // anchors[0] .. anchors[1] delimits the first piece that needs work
    auto anchors = splitNormalized!norm(input);
    // fast path: nothing to normalize, hand back the very same slice
    if(anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every region: the code points and their combining classes
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // the part before the region is already normalized, copy it through
        app.put(input[0..anchors[0]]);
        foreach(dchar ch; input[anchors[0]..anchors[1]])
            static if(norm == NFD || norm == NFC)
            {
                foreach(dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach(dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        // canonical ordering: stable-sort every run of non-starters by combining class
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        foreach(idx, dchar ch; decomposed)
        {
            auto clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if(clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable..idx], decomposed[firstNonStable..idx]));
                firstNonStable = decomposed.length;
            }
            else if(clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // a trailing run of non-starters (if any) is sorted here
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if(norm == NFC || norm == NFKC)
        {
            auto first = countUntil(ccc, 0);
            if(first >= 0) // no starters?? no recomposition
            {
                // recompose starter by starter until the end of the buffer
                for(;;)
                {
                    auto second = recompose(first, decomposed, ccc);
                    if(second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if(norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // drop the dchar.init holes left behind by recomposition
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        decomposed.assumeSafeAppend();
        ccc.length = 0;
        ccc.assumeSafeAppend();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    }while(anchors[0] != input.length);
    // whatever is left is already normalized
    app.put(input[0..anchors[0]]);
    return cast(inout(C)[])app.data;
}
unittest
{
// decomposition spliced into the middle of otherwise untouched text
assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
assert(normalize!NFD("Äffin") == "A\u0308ffin");
// check example
// any encoding works
wstring greet = "Hello world";
assert(normalize(greet) is greet); // the same exact slice
// An example of a character with all 4 forms being different:
// Greek upsilon with acute and hook symbol (code point 0x03D3)
assert(normalize!NFC("ϓ") == "\u03D3");
assert(normalize!NFD("ϓ") == "\u03D2\u0301");
assert(normalize!NFKC("ϓ") == "\u038E");
assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
// canonically recompose given slice of code points, works in-place and mutates data
// Params:
//   start = index of the starter that the following code points are composed into
//   input = code points; a composed-away code point is replaced by dchar.init
//   ccc   = combining classes, parallel to input
// Returns: the index where scanning stopped - either input.length or the index
// of the next starter that could not be composed into input[start].
private size_t recompose(size_t start, dchar[] input, ubyte[] ccc)
{
    assert(input.length == ccc.length);
    int accumCC = -1;// so that it's out of 0..255 range
    // writefln("recomposing %( %04x %)", input);
    // input[start] is always a starter thus we start right after it
    size_t i = start+1;
    for(; i != input.length; i++)
    {
        int curCC = ccc[i];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        //------------------------
        // Applying to our case:
        // S is input[start]
        // accumCC is the maximum CCC of characters between C and S,
        //     as ccc are sorted
        // C is input[i]
        if(curCC > accumCC)
        {
            dchar comp = compose(input[start], input[i]);
            if(comp != dchar.init)
            {
                input[start] = comp;
                input[i] = dchar.init;// put a sentinel
                // current was merged so its CCC shouldn't affect
                // composing with the next one
                continue;
            }
        }
        // blocked or not composable: this code point now stands between
        // the starter and whatever follows
        accumCC = curCC;
        // if it was a starter then accumCC is now 0, end of loop
        if(accumCC == 0)
            break;
    }
    return i;
}
// returns tuple of 2 indexes that delimit:
// normalized text, piece that needs normalization and
// the rest of input starting with stable code point
// Both indexes equal input.length when nothing needs to be normalized.
private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
{
    ubyte lastCC = 0;
    foreach(idx, dchar ch; input)
    {
        // shortcut for NFC only: code points below U+0300 are accepted without a lookup
        static if(norm == NFC)
            if(ch < 0x0300)
            {
                lastCC = 0;
                continue;
            }
        ubyte CC = combiningClass(ch);
        // combining marks out of canonical order
        if(lastCC > CC && CC != 0)
        {
            return seekStable!norm(idx, input);
        }
        // a code point that is not (unconditionally) allowed in this form
        if(notAllowedIn!norm(ch))
        {
            return seekStable!norm(idx, input);
        }
        lastCC = CC;
    }
    return tuple(input.length, input.length);
}
// Given the index idx of an offending code point, finds the enclosing region
// bounded by stable code points (combining class 0 and allowed in form norm).
// Returns tuple(region_start, region_end): the region starts at the closest
// stable code point before idx (or at 0) and ends at the first stable code
// point at or after idx (or at input.length).
private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input)
{
    import std.utf : codeLength;
    auto br = input[0..idx];
    size_t region_start = 0;// default
    // walk backwards from idx looking for the closest stable code point
    for(;;)
    {
        if(br.empty)// start is 0
            break;
        dchar ch = br.back;
        if(combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br still starts at input[0], so its length is an index into input
            region_start = br.length - std.utf.codeLength!C(ch);
            break;
        }
        // drop the code point just examined; popping from the front would
        // leave br.back unchanged and the scan would never move backwards
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach(i, dchar ch; input[idx..$])
    {
        if(combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start..region_end]);
    return tuple(region_start, region_end);
}
/**
    Checks whether dchar $(D ch) is always allowed (Quick_Check=YES)
    in normalization form $(D norm).
    ---
    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
    ---
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // simply the negation of the private lookup
    immutable bool rejected = notAllowedIn!norm(ch);
    return !rejected;
}
// not user friendly name but more direct
// Looks $(D ch) up in the quick-check trie of the requested normalization form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if(norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if(norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if(norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if(norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // the condition must be an explicit `false`: a string literal in the
        // condition slot is truthy and would make the assert pass silently
        static assert(false, "Unknown normalization form");
    return qcTrie[ch];
}
unittest
{
// verifies the allowedIn documentation example
assert(allowedIn!NFC('я'));
assert(allowedIn!NFD('я'));
assert(allowedIn!NFKC('я'));
assert(allowedIn!NFKD('я'));
assert(allowedIn!NFC('Z'));
}
}
version(std_uni_bootstrap)
{
// Hand-written whitespace test, compiled only for bootstrapping gen_uni.d
// (the tool that generates the up to date optimal versions of all isXXX functions).
@safe pure nothrow @nogc public bool isWhite(dchar c)
{
    if(std.ascii.isWhite(c))
        return true;
    // line and paragraph separators
    if(c == lineSep || c == paraSep)
        return true;
    // the contiguous U+2000 .. U+200A block
    if(c >= '\u2000' && c <= '\u200A')
        return true;
    // the remaining isolated code points
    return c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
        c == '\u202F' || c == '\u205F' || c == '\u3000';
}
}
else
{
// trusted -> avoid bounds check
@trusted pure nothrow
{
// hide template instances behind functions (Bugzilla 13232)
// One triple per mapping (lower, title, upper): a full index trie,
// a simple index trie and the table the indices refer to.
ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }
ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }
ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
public:
/++
    Tests whether $(D c) is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // delegate to the pregenerated binary search
    immutable bool white = isWhiteGen(c);
    return white;
}
/++
    Tests whether $(D c) is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow
bool isLower(dchar c)
{
    // ASCII is answered by std.ascii, everything else by the trie
    return std.ascii.isASCII(c) ? std.ascii.isLower(c) : lowerCaseTrie[c];
}
@safe unittest
{
// for ASCII the result must agree with std.ascii
foreach(v; 0..0x80)
assert(std.ascii.isLower(v) == isLower(v));
assert(isLower('я'));
assert(isLower('й'));
assert(!isLower('Ж'));
// Greek HETA
assert(!isLower('\u0370'));
assert(isLower('\u0371'));
assert(!isLower('\u039C')); // capital MU
assert(isLower('\u03B2')); // beta
// from extended Greek
assert(!isLower('\u1F18'));
assert(isLower('\u1F00'));
// every member of the lowercase set is lower and is not upper
foreach(v; unicode.lowerCase.byCodepoint)
assert(isLower(v) && !isUpper(v));
}
/++
    Tests whether $(D c) is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow
bool isUpper(dchar c)
{
    // ASCII is answered by std.ascii, everything else by the trie
    return std.ascii.isASCII(c) ? std.ascii.isUpper(c) : upperCaseTrie[c];
}
@safe unittest
{
    // for ASCII the result must agree with std.ascii
    // (this loop has to exercise isUpper, not isLower)
    foreach(v; 0..0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the uppercase set is upper and is not lower
    foreach(v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !isLower(v));
}
/++
    Maps a Unicode uppercase $(CHARACTER) $(D c) to its lowercase equivalent.
    Any other $(D c) is returned unchanged.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow
dchar toLower(dchar c)
{
    if(c >= 0xAA)
    {
        // general case: consult the simple lowercase index
        size_t idx = toLowerSimpleIndex(c);
        if(idx == ushort.max)
            return c;
        return toLowerTab(idx);
    }
    // optimize ASCII case: only 'A' .. 'Z' change
    if(c >= 'A' && c <= 'Z')
        return c + 32;
    return c;
}
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps $(D c) to its simple titlecase equivalent; returns $(D c) itself
// when there is no such mapping.
@safe pure nothrow
private dchar toTitlecase(dchar c)
{
    if(c >= 0xAA)
    {
        // general case: consult the simple titlecase index
        size_t idx = toTitleSimpleIndex(c);
        if(idx == ushort.max)
            return c;
        return toTitleTab(idx);
    }
    // optimize ASCII case: only 'a' .. 'z' change
    if(c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
private alias UpperTriple = TypeTuple!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = TypeTuple!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
// generic toUpper/toLower on whole string, creates new or returns as is
// indexFn maps a code point to a table index (ushort.max means "no mapping"),
// tableFn reads the table; indices below maxIdx are 1:1 mappings,
// the others start a multi-code-point replacement.
private S toCase(alias indexFn, uint maxIdx, alias tableFn, S)(S s) @trusted pure
if(isSomeString!S)
{
foreach(i, dchar cOuter; s)
{
ushort idx = indexFn(cOuter);
// nothing to change yet: keep scanning without allocating
if(idx == ushort.max)
continue;
// first code point that needs mapping: start the result with the untouched prefix
auto result = appender!S(s[0..i]);
result.reserve(s.length);
foreach(dchar c; s[i .. $])
{
idx = indexFn(c);
if(idx == ushort.max)
result.put(c);
else if(idx < maxIdx)
{
c = tableFn(idx);
result.put(c);
}
else
{
auto val = tableFn(idx);
// unpack length + codepoint
uint len = val>>24;
result.put(cast(dchar)(val & 0xFF_FFFF));
// the remaining len-1 code points follow in the next table slots
foreach(j; idx+1..idx+len)
result.put(tableFn(j));
}
}
return result.data;
}
// no code point needed mapping: return the input as is
return s;
}
unittest //12428
{
    // take a short slice out of a much larger allocation and make sure
    // toUpper leaves the slice it was given untouched
    auto big = "abcdefghij".replicate(300);
    auto head = big[0..10];
    toUpper(head);
    assert(head == "abcdefghij");
}
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of c into buf starting at idx and returns the
// index just past the last code unit written.
private size_t encodeTo(char[] buf, size_t idx, dchar c) @trusted pure
{
    // 1 code unit: 0xxxxxxx
    if (c <= 0x7F)
    {
        buf[idx] = cast(char)c;
        return idx + 1;
    }
    // 2 code units: 110xxxxx 10xxxxxx
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // 3 code units: 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // 4 code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // beyond the Unicode range
    assert(0);
}
unittest
{
    // encode an ASCII char, then a two-unit code point right after it
    char[] buf = "abcd".dup;
    size_t pos = 0;
    pos = encodeTo(buf, pos, 'X');
    assert(buf == "Xbcd");
    pos = encodeTo(buf, pos, cast(dchar)'\u00A9');
    assert(buf == "X\xC2\xA9d");
}
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of c into buf starting at idx and returns the
// index just past the last code unit written.
// Throws UTFException when c is an isolated surrogate code point.
private size_t encodeTo(wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf;
    if (c > 0xFFFF)
    {
        // beyond the Unicode range
        if (c > 0x10FFFF)
            assert(0);
        // supplementary plane: emit a surrogate pair
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }
    // BMP: a single code unit, but never a lone surrogate
    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar)c;
    return idx + 1;
}
// UTF-32 flavour: stores c at buf[idx] and returns the next index.
private size_t encodeTo(dchar[] buf, size_t idx, dchar c) @trusted pure
{
    buf[idx] = c;
    return idx + 1;
}
// Generic in-place case conversion of s.
//   indexFn - maps a code point to a case-table index; ushort.max means
//             the code point is left as is
//   maxIdx  - indices below it are 1:1 code point mappings
//   tableFn - fetches a case-table entry by index
// The buffer of s is rewritten for as long as every converted code point fits
// into the space that has already been read. As soon as one does not fit, or
// a 1:m mapping is met, the rest of the work is handed to toCaseInPlaceAlloc.
// On the non-allocating path s is re-sliced to the converted length.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
import std.utf;
size_t curIdx = 0; // read position, in code units
size_t destIdx = 0; // write position, in code units
alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
size_t lastUnchanged = 0; // start of the unchanged run not yet moved to destIdx
// in-buffer move of code units str[from..to] to a new start index dest;
// returns the index just past the moved data
// the trick is that it may not need to copy at all
static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
{
// Interestingly we may just bump pointer for a while
// then have to copy if a re-cased char was smaller than the original
// later we may regain pace with char that got bigger
// In the end it sometimes flip-flops between the 2 cases below
if(dest == from)
return to;
// got to copy
foreach(C c; str[from..to])
str[dest++] = c;
return dest;
}
while(curIdx != s.length)
{
size_t startIdx = curIdx; // start of the code point being decoded
dchar ch = decode(s, curIdx); // advances curIdx past the code point
// TODO: special case for ASCII
auto caseIndex = indexFn(ch);
if(caseIndex == ushort.max) // unchanged, skip over
{
continue;
}
else if(caseIndex < maxIdx) // 1:1 codepoint mapping
{
// previous cased chars had the same length as uncased ones
// thus can just adjust pointer
destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
lastUnchanged = curIdx;
dchar cased = tableFn(caseIndex);
auto casedLen = codeLength!C(cased);
if(casedLen + destIdx > curIdx) // no place to fit cased char
{
// switch to slow codepath, where we allocate
// (restarts at startIdx, keeping the destIdx code units written so far)
return slowToCase(s, startIdx, destIdx);
}
else
{
destIdx = encodeTo(s, destIdx, cased);
}
}
else // 1:m codepoint mapping, slow codepath
{
destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
lastUnchanged = curIdx;
return slowToCase(s, startIdx, destIdx);
}
// the write position must never overtake the read position
assert(destIdx <= curIdx);
}
// flush the trailing unchanged run, if any
if(lastUnchanged != s.length)
{
destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
}
s = s[0..destIdx];
}
// Helper that precalculates the size, in code units of C, that str will have
// once it is case-converted with the given (indexFn, maxIdx, tableFn) triple.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(in C[] str)
    {
        import std.utf;
        size_t total = 0;      // running length of the converted text
        size_t runStart = 0;   // start of the unchanged run not yet counted
        size_t pos = 0;        // read position
        while(pos != str.length)
        {
            size_t cpStart = pos;
            dchar ch = decode(str, pos);
            ushort caseIndex = indexFn(ch);
            // unaffected code points are counted later as part of a run
            if(caseIndex == ushort.max)
                continue;
            // count the unchanged run that precedes this code point
            total += cpStart - runStart;
            runStart = pos;
            if(caseIndex < maxIdx)
            {
                // 1:1 mapping
                dchar cased = tableFn(caseIndex);
                total += codeLength!C(cased);
            }
            else
            {
                // 1:m mapping: top byte holds the sequence length,
                // low 24 bits the first code point
                auto val = tableFn(caseIndex);
                auto len = val>>24;
                dchar cased = val & 0xFF_FFFF;
                total += codeLength!C(cased);
                foreach(j; caseIndex+1..caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // trailing unchanged run (zero when there is none)
        total += str.length - runStart;
        return total;
    }
}
unittest
{
    import std.conv;
    alias lowerLen = toCaseLength!(LowerTriple);
    // pure ASCII: one code unit per character
    assert(lowerLen("abcd") == 4);
    // five two-unit Cyrillic letters plus three ASCII digits
    assert(lowerLen("аБВгд456") == 10+3);
}
// Slower code path that preallocates and then copies
// case-converted stuff to the new string.
//   s       - buffer being converted; replaced by the new array on return
//   curIdx  - index in s of the first code point still to be converted
//   destIdx - number of leading code units of s that are already converted
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        // exact final size: what is already done plus the converted remainder
        auto finalLength = destIdx + caseLength(s[curIdx..$]);
        C[] fresh = new C[finalLength];
        fresh[0..destIdx] = s[0..destIdx];
        size_t runStart = curIdx; // start of the unchanged run not yet copied
        while(curIdx != s.length)
        {
            size_t cpStart = curIdx; // start of current codepoint
            dchar ch = decode(s, curIdx);
            auto caseIndex = indexFn(ch);
            if(caseIndex == ushort.max) // skip over
                continue;
            // copy the unchanged run that precedes this code point
            auto runLen = cpStart - runStart;
            fresh[destIdx .. destIdx+runLen] = s[runStart .. cpStart];
            runStart = curIdx;
            destIdx += runLen;
            if(caseIndex < maxIdx) // 1:1 codepoint mapping
            {
                dchar cased = tableFn(caseIndex);
                destIdx = encodeTo(fresh, destIdx, cased);
            }
            else // 1:m codepoint mapping
            {
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                uint len = val>>24;
                destIdx = encodeTo(fresh, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach(j; caseIndex+1..caseIndex+len)
                    destIdx = encodeTo(fresh, destIdx, tableFn(j));
            }
        }
        // copy whatever unchanged tail is left
        if(runStart != s.length)
        {
            auto runLen = s.length - runStart;
            fresh[destIdx..destIdx+runLen] = s[runStart..$];
            destIdx += runLen;
        }
        assert(fresh.length == destIdx);
        s = fresh;
    }
}
/++
    Converts $(D s) to lowercase in place, applying the Unicode lowercase mapping.

    For a few characters the string grows during the transformation;
    in such a case the function reallocates exactly once.
    If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(LowerTriple)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }
    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }
    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
/++
    Converts $(D s) to uppercase in place, applying the Unicode uppercase mapping.

    For a few characters the string grows during the transformation;
    in such a case the function reallocates exactly once.
    If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(UpperTriple)(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }
    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }
    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
/++
    Returns a string which is identical to $(D s) except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
S toLower(S)(S s) @trusted pure
    if(isSomeString!S)
{
    return toCase!(LowerTriple)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(string s)
    {
        return toLower!string(s);
    }
    wstring toLower(wstring s)
    {
        return toLower!wstring(s);
    }
    dstring toLower(dstring s)
    {
        return toLower!dstring(s);
    }
}
@trusted unittest //@@@BUG std.format is not @safe
{
    import std.string : format;
    // ASCII range must agree with std.ascii
    foreach(code; 0..0x80)
        assert(std.ascii.toLower(code) == toLower(code));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    // lowering an uppercase code point either leaves it alone
    // or yields something that is lowercase
    foreach(upperCp; unicode.upperCase.byCodepoint)
    {
        dchar lowered = upperCp.toLower();
        assert(lowered == upperCp || isLower(lowered), format("%s -> %s", upperCp, lowered));
    }
    // whole-string conversions
    assert(toLower("АЯ") == "ая");
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
//bugzilla 9629
unittest
{
    // converting a one-character slice must modify the parent buffer in place
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6..7];
    toUpperInPlace(thorn);
    assert(whole == "hello Þ world");
}
unittest
{
    // plain ASCII
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);
    char[] inPlace = src.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Latin Extended-A mixed with ASCII
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Cyrillic Extended mixed with ASCII
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // 1:m mapping: the result is longer than the source
    src = "\u0130";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // bugzilla 12455
    dchar cp = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(cp));
    assert(toLower(cp) == 'i');
    // extend on 12455 report - check simple-case toUpper too
    cp = '\u1f87';
    assert(isLower(cp));
    assert(toUpper(cp) == '\u1F8F');
}
/++
    Maps a single $(CHARACTER) to uppercase: if $(D c) is a Unicode lowercase
    $(CHARACTER) its uppercase equivalent is returned, otherwise $(D c)
    is returned unchanged.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.
+/
@safe pure nothrow
dchar toUpper(dchar c)
{
    // fast path: below U+00AA only 'a'..'z' are touched
    if(c < 0xAA)
    {
        if(c >= 'a' && c <= 'z')
            return c - 32;
        return c;
    }
    // table-driven path; ushort.max marks "no simple mapping"
    size_t idx = toUpperSimpleIndex(c);
    if(idx == ushort.max)
        return c;
    return toUpperTab(idx);
}
@trusted unittest
{
    import std.string : format;
    // ASCII range must agree with std.ascii
    foreach(code; 0..0x80)
        assert(std.ascii.toUpper(code) == toUpper(code));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    // uppercasing a lowercase code point either leaves it alone
    // or yields an uppercase or titlecase letter
    auto titleSet = unicode.Titlecase_Letter;
    foreach(lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar raised = lowerCp.toUpper();
        assert(raised == lowerCp || isUpper(raised) || titleSet[raised],
            format("%x -> %x", lowerCp, raised));
    }
}
/++
    Returns a string which is identical to $(D s) except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of $(D s) characters were affected, then $(D s) itself is returned.
+/
S toUpper(S)(S s) @trusted pure
    if(isSomeString!S)
{
    return toCase!(UpperTriple)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(string s)
    {
        return toUpper!string(s);
    }
    wstring toUpper(wstring s)
    {
        return toUpper!wstring(s);
    }
    dstring toUpper(dstring s)
    {
        return toUpper!dstring(s);
    }
}
unittest
{
    string src = "FoL";
    string raised;
    char[] inPlace;

    // plain ASCII
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised, inPlace);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is src);

    // Latin Extended-A mixed with ASCII
    src = "a\u0100B\u0101d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is src);

    // Cyrillic Extended mixed with ASCII
    src = "a\u0460B\u0461d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is src);
}
unittest
{
    // Runs both the allocating and the in-place converters on s and checks
    // them against the expected uppercase and lowercase forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.string : format;
        // three range specs: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // diff has three specs, so the source has to be passed as well;
        // with only two arguments format() itself would fail while building
        // the message of a failing assertion
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp, format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(ubyte[])s, cast(ubyte[])lowInp, cast(ubyte[])trueLow));
        assert(upInp == trueUp,
            format(diff, cast(ubyte[])s, cast(ubyte[])upInp, cast(ubyte[])trueUp));
    }
    foreach(S; TypeTuple!(dstring, wstring, string))
    {
        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];
        // for these inputs the in-place converters must keep the same slice
        foreach(val; TypeTuple!(easy, good))
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach(i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }
        // a few combinatorial runs
        foreach(i; 0..options.length)
        foreach(j; i..options.length)
        foreach(k; j..options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }
}
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is answered by the trie
    if(c >= 0xAA)
        return alphaTrie[c];
    // optimization for the low range: only 'A'..'Z' and 'a'..'z' qualify.
    // The subtraction wraps around for c below the range start, which makes
    // a single unsigned comparison sufficient.
    size_t offset = c - 'A';
    if(offset <= 'Z' - 'A')
        return true;
    offset = c - 'a';
    return offset <= 'z'-'a';
}
@safe unittest
{
    auto alphaSet = unicode("Alphabetic");
    // every member of the set is reported as alphabetic
    foreach(cp; alphaSet.byCodepoint)
        assert(isAlpha(cp));
    // membership and isAlpha agree on the low range
    foreach(cp; 0..0x4000)
        assert((cp in alphaSet) == isAlpha(cp));
}
/++
    Returns whether $(D c) is a Unicode mark
    (general Unicode category: Mn, Me, Mc).

    The answer is a lookup in the mark trie.
+/
@safe pure nothrow
bool isMark(dchar c)
{
    return markTrie[c];
}
@safe unittest
{
    auto markSet = unicode("Mark");
    // every member of the set is reported as a mark
    foreach(cp; markSet.byCodepoint)
        assert(isMark(cp));
    // membership and isMark agree on the low range
    foreach(cp; 0..0x4000)
        assert((cp in markSet) == isMark(cp));
}
/++
    Returns whether $(D c) is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).

    The answer is a lookup in the number trie.
+/
@safe pure nothrow
bool isNumber(dchar c)
{
    return numberTrie[c];
}
@safe unittest
{
    auto numberSet = unicode("N");
    // every member of the set is reported as a number
    foreach(cp; numberSet.byCodepoint)
        assert(isNumber(cp));
    // membership and isNumber agree on the low range
    foreach(cp; 0..0x4000)
        assert((cp in numberSet) == isNumber(cp));
}
/++
    Returns whether $(D c) is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).

    The answer is a lookup in the punctuation trie.
+/
@safe pure nothrow
bool isPunctuation(dchar c)
{
    return punctuationTrie[c];
}
unittest
{
    // a handful of well-known punctuation code points
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    // every member of the P set is reported as punctuation
    foreach(cp; unicode("P").byCodepoint)
        assert(isPunctuation(cp));
}
/++
    Returns whether $(D c) is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).

    The answer is a lookup in the symbol trie.
+/
@safe pure nothrow
bool isSymbol(dchar c)
{
    return symbolTrie[c];
}
unittest
{
    import std.string;
    // a handful of well-known symbol code points
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    // every member of the S set is reported as a symbol
    foreach(cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}
/++
    Returns whether $(D c) is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)

    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow
bool isSpace(dchar c)
{
    // forwards to the isSpaceGen predicate
    return isSpaceGen(c);
}
unittest
{
    assert(isSpace('\u0020'));
    auto zsSet = unicode.Zs;
    // every member of Zs is reported as a space
    foreach(cp; zsSet.byCodepoint)
        assert(isSpace(cp));
    // membership and isSpace agree on the low range
    foreach(cp; 0..0x1000)
        assert(isSpace(cp) == zsSet[cp]);
}
/++
    Returns whether $(D c) is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

    The answer is a lookup in the graphical trie.
+/
@safe pure nothrow
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}
unittest
{
    auto graphicalSet = unicode("Graphical");
    import std.string;
    // every member of the set is reported as graphical
    foreach(cp; graphicalSet.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));
    // membership and isGraphical agree on the low range
    foreach(cp; 0..0x4000)
        assert((cp in graphicalSet) == isGraphical(cp));
}
/++
    Returns whether $(D c) is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow
bool isControl(dchar c)
{
    // forwards to the isControlGen predicate
    return isControlGen(c);
}
unittest
{
    // spot checks
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto ccSet = unicode.Cc;
    // every member of Cc is reported as a control character
    foreach(cp; ccSet.byCodepoint)
        assert(isControl(cp));
    // membership and isControl agree on the low range
    foreach(cp; 0..0x1000)
        assert(isControl(cp) == ccSet[cp]);
}
/++
    Returns whether $(D c) is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow
bool isFormat(dchar c)
{
    // forwards to the isFormatGen predicate
    return isFormatGen(c);
}
unittest
{
    // U+00AD is SOFT HYPHEN
    assert(isFormat('\u00AD'));
    // every member of the Format set is reported as a format character
    foreach(cp; unicode("Format").byCodepoint)
        assert(isFormat(cp));
}
// code points for private use, surrogates are not likely to change in near future
// if need be they can be generated from unicode data as well
/++
    Returns whether $(D c) is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if(c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    // U+F0000 .. U+FFFFD
    if(c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
/++
    Returns whether $(D c) is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow
bool isSurrogate(dchar c)
{
    // U+D800 .. U+DFFF, both ends inclusive
    return c >= 0xD800 && c <= 0xDFFF;
}
/++
    Returns whether $(D c) is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow
bool isSurrogateHi(dchar c)
{
    // U+D800 .. U+DBFF, both ends inclusive
    return c >= 0xD800 && c <= 0xDBFF;
}
/++
    Returns whether $(D c) is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow
bool isSurrogateLo(dchar c)
{
    // U+DC00 .. U+DFFF, both ends inclusive
    return c >= 0xDC00 && c <= 0xDFFF;
}
/++
    Returns whether $(D c) is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)

    The answer is a lookup in the non-character trie.
+/
@safe pure nothrow
bool isNonCharacter(dchar c)
{
    return nonCharacterTrie[c];
}
unittest
{
    auto cnSet = unicode("Cn");
    // every member of Cn is reported as a non-character
    foreach(cp; cnSet.byCodepoint)
        assert(isNonCharacter(cp));
}
private:
// load static data from pre-generated tables into usable datastructures

// Builds a CodepointSet from a compressed interval table.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    return CodepointSet.fromIntervals(
        decompressIntervals(compressed));
}
// Wraps the offsets, sizes and data arrays of a TrieEntry in a constant
// CodepointTrie.
@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e)
{
    return const(CodepointTrie!T)(
        e.offsets, e.sizes, e.data);
}
// Accessors for the precomputed tries. Each one wraps a table of trie
// entries with asTrie, keeps the result in a function-local static immutable
// and returns it.
@safe pure nothrow @property
{
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.

    // character classification tables
    auto lowerCaseTrie()
    {
        static immutable res = asTrie(lowerCaseTrieEntries);
        return res;
    }
    auto upperCaseTrie()
    {
        static immutable res = asTrie(upperCaseTrieEntries);
        return res;
    }
    auto simpleCaseTrie()
    {
        static immutable res = asTrie(simpleCaseTrieEntries);
        return res;
    }
    auto fullCaseTrie()
    {
        static immutable res = asTrie(fullCaseTrieEntries);
        return res;
    }
    auto alphaTrie()
    {
        static immutable res = asTrie(alphaTrieEntries);
        return res;
    }
    auto markTrie()
    {
        static immutable res = asTrie(markTrieEntries);
        return res;
    }
    auto numberTrie()
    {
        static immutable res = asTrie(numberTrieEntries);
        return res;
    }
    auto punctuationTrie()
    {
        static immutable res = asTrie(punctuationTrieEntries);
        return res;
    }
    auto symbolTrie()
    {
        static immutable res = asTrie(symbolTrieEntries);
        return res;
    }
    auto graphicalTrie()
    {
        static immutable res = asTrie(graphicalTrieEntries);
        return res;
    }
    auto nonCharacterTrie()
    {
        static immutable res = asTrie(nonCharacterTrieEntries);
        return res;
    }

    //normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable res = asTrie(nfcQCTrieEntries);
        return res;
    }
    auto nfdQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable res = asTrie(nfdQCTrieEntries);
        return res;
    }
    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable res = asTrie(nfkcQCTrieEntries);
        return res;
    }
    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable res = asTrie(nfkdQCTrieEntries);
        return res;
    }

    //grapheme breaking algorithm tables
    auto mcTrie()
    {
        import std.internal.unicode_grapheme;
        static immutable res = asTrie(mcTrieEntries);
        return res;
    }
    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme;
        static immutable res = asTrie(graphemeExtendTrieEntries);
        return res;
    }
    auto hangLV()
    {
        import std.internal.unicode_grapheme;
        static immutable res = asTrie(hangulLVTrieEntries);
        return res;
    }
    auto hangLVT()
    {
        import std.internal.unicode_grapheme;
        static immutable res = asTrie(hangulLVTTrieEntries);
        return res;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp;
        static immutable res = asTrie(combiningClassTrieEntries);
        return res;
    }
    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp;
        static immutable res = asTrie(compatMappingTrieEntries);
        return res;
    }
    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp;
        static immutable res = asTrie(canonMappingTrieEntries);
        return res;
    }
    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp;
        static immutable res = asTrie(compositionJumpTrieEntries);
        return res;
    }

    //case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable res = asTrie(toUpperIndexTrieEntries);
        return res;
    }
    auto toLowerIndexTrie()
    {
        static immutable res = asTrie(toLowerIndexTrieEntries);
        return res;
    }
    auto toTitleIndexTrie()
    {
        static immutable res = asTrie(toTitleIndexTrieEntries);
        return res;
    }

    //simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable res = asTrie(toUpperSimpleIndexTrieEntries);
        return res;
    }
    auto toLowerSimpleIndexTrie()
    {
        static immutable res = asTrie(toLowerSimpleIndexTrieEntries);
        return res;
    }
    auto toTitleSimpleIndexTrie()
    {
        static immutable res = asTrie(toTitleSimpleIndexTrieEntries);
        return res;
    }
}
}// version(!std_uni_bootstrap)