mirror of
https://github.com/dlang/phobos.git
synced 2025-04-26 21:22:20 +03:00
9038 lines
272 KiB
D
9038 lines
272 KiB
D
// Written in the D programming language.
|
||
|
||
/++
|
||
$(SECTION Overview)
|
||
|
||
$(P The $(D std.uni) module provides an implementation
|
||
of fundamental Unicode algorithms and data structures.
|
||
This doesn't include UTF encoding and decoding primitives,
|
||
see $(XREF _utf, decode) and $(XREF _utf, encode) in std.utf
|
||
for this functionality. )
|
||
|
||
$(P All primitives listed operate on Unicode characters and
|
||
sets of characters. For functions which operate on ASCII characters
|
||
and ignore Unicode $(CHARACTERS), see $(LINK2 std_ascii.html, std.ascii).
|
||
For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
|
||
used throughout this module see the $(S_LINK Terminology, terminology) section
|
||
below.
|
||
)
|
||
|
||
$(P The focus of this module is the core needs of developing Unicode-aware
|
||
applications. To that effect it provides the following optimized primitives:
|
||
)
|
||
$(UL
|
||
$(LI Character classification by category and common properties:
|
||
$(LREF isAlpha), $(LREF isWhite) and others.
|
||
)
|
||
$(LI
|
||
Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
|
||
)
|
||
$(LI
|
||
Converting text to any of the four normalization forms via $(LREF normalize).
|
||
)
|
||
$(LI
|
||
Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
|
||
by user-perceived characters, that is by $(LREF Grapheme) clusters.
|
||
)
|
||
$(LI
|
||
Decomposing and composing of individual character(s) according to canonical
|
||
or compatibility rules, see $(LREF compose) and $(LREF decompose),
|
||
including the specific version for Hangul syllables $(LREF composeJamo)
|
||
and $(LREF decomposeHangul).
|
||
)
|
||
)
|
||
$(P It's recognized that an application may need further enhancements
|
||
and extensions, such as less commonly known algorithms,
|
||
or tailoring existing ones for region specific needs. To help users
|
||
with building any extra functionality beyond the core primitives,
|
||
the module provides:
|
||
)
|
||
$(UL
|
||
$(LI
|
||
$(LREF CodepointSet), a type for easy manipulation of sets of characters.
|
||
Besides the typical set algebra it provides an unusual feature:
|
||
a D source code generator for detection of $(CODEPOINTS) in this set.
|
||
This is a boon for meta-programming parser frameworks,
|
||
and is used internally to power classification in small
|
||
sets like $(LREF isWhite).
|
||
)
|
||
$(LI
|
||
A way to construct optimal packed multi-stage tables also known as a
|
||
special case of $(LUCKY Trie).
|
||
The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
|
||
construct custom tries that map dchar to value.
|
||
The end result is a fast and predictable $(BIGOH 1) lookup that powers
|
||
functions like $(LREF isAlpha) and $(LREF combiningClass),
|
||
but for user-defined data sets.
|
||
)
|
||
$(LI
|
||
A useful technique for Unicode-aware parsers that perform
|
||
character classification of encoded $(CODEPOINTS)
|
||
is to avoid unnecessary decoding at all costs.
|
||
$(LREF utfMatcher) provides an improvement over the usual workflow
|
||
of decode-classify-process, combining the decoding and classification
|
||
steps. By extracting necessary bits directly from encoded
|
||
$(S_LINK Code unit, code units) matchers achieve
|
||
significant performance improvements. See $(LREF MatcherConcept) for
|
||
the common interface of UTF matchers.
|
||
)
|
||
$(LI
|
||
Generally useful building blocks for customized normalization:
|
||
$(LREF combiningClass) for querying combining class
|
||
and $(LREF allowedIn) for testing the Quick_Check
|
||
property of a given normalization form.
|
||
)
|
||
$(LI
|
||
Access to a large selection of commonly used sets of $(CODEPOINTS).
|
||
$(S_LINK Unicode properties, Supported sets) include Script,
|
||
Block and General Category. The exact contents of a set can be
|
||
observed in the CLDR utility, on the
|
||
$(WEB www.unicode.org/cldr/utility/properties.jsp, property index) page
|
||
of the Unicode website.
|
||
See $(LREF unicode) for easy and (optionally) compile-time checked set
|
||
queries.
|
||
)
|
||
)
|
||
$(SECTION Synopsis)
|
||
---
|
||
import std.uni;
|
||
void main()
|
||
{
|
||
// initialize code point sets using script/block or property name
|
||
// now 'set' contains code points from both scripts.
|
||
auto set = unicode("Cyrillic") | unicode("Armenian");
|
||
// same thing but simpler and checked at compile-time
|
||
auto ascii = unicode.ASCII;
|
||
auto currency = unicode.Currency_Symbol;
|
||
|
||
// easy set ops
|
||
auto a = set & ascii;
|
||
assert(a.empty); // as it has no intersection with ascii
|
||
a = set | ascii;
|
||
auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
|
||
|
||
// some properties of code point sets
|
||
assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
|
||
// testing presence of a code point in a set
|
||
// is just fine, it is O(logN)
|
||
assert(!b['$']);
|
||
assert(!b['\u058F']); // Armenian dram sign
|
||
assert(b['¥']);
|
||
|
||
// building fast lookup tables, these guarantee O(1) complexity
|
||
// 1-level Trie lookup table essentially a huge bit-set ~262Kb
|
||
auto oneTrie = toTrie!1(b);
|
||
// 2-level far more compact but typically slightly slower
|
||
auto twoTrie = toTrie!2(b);
|
||
// 3-level even smaller, and a bit slower yet
|
||
auto threeTrie = toTrie!3(b);
|
||
assert(oneTrie['£']);
|
||
assert(twoTrie['£']);
|
||
assert(threeTrie['£']);
|
||
|
||
// build the trie with the most sensible trie level
|
||
// and bind it as a functor
|
||
auto cyrillicOrArmenian = toDelegate(set);
|
||
auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
|
||
assert(balance == "ընկեր!");
|
||
// compatible with bool delegate(dchar)
|
||
bool delegate(dchar) bindIt = cyrillicOrArmenian;
|
||
|
||
// Normalization
|
||
string s = "Plain ascii (and not only), is always normalized!";
|
||
assert(s is normalize(s));// is the same string
|
||
|
||
string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis
|
||
auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
|
||
assert(nS == "Äffin");
|
||
assert(nS != nonS);
|
||
string composed = "Äffin";
|
||
|
||
assert(normalize!NFD(composed) == "A\u0308ffin");
|
||
// to NFKD, compatibility decomposition useful for fuzzy matching/searching
|
||
assert(normalize!NFKD("2¹⁰") == "210");
|
||
}
|
||
---
|
||
$(SECTION Terminology)
|
||
$(P The following is a list of important Unicode notions
|
||
and definitions. Any conventions used specifically in this
|
||
module alone are marked as such. The descriptions are based on the formal
|
||
definition as found in $(WEB www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
|
||
chapter three of The Unicode Standard Core Specification.)
|
||
)
|
||
|
||
$(P $(DEF Abstract character) A unit of information used for the organization,
|
||
control, or representation of textual data.
|
||
Note that:
|
||
$(UL
|
||
$(LI When representing data, the nature of that data
|
||
is generally symbolic as opposed to some other
|
||
kind of data (for example, visual).)
|
||
|
||
$(LI An abstract character has no concrete form
|
||
and should not be confused with a $(S_LINK Glyph, glyph).)
|
||
|
||
$(LI An abstract character does not necessarily
|
||
correspond to what a user thinks of as a “character”
|
||
and should not be confused with a $(LREF Grapheme).)
|
||
|
||
$(LI The abstract characters encoded (see Encoded character)
|
||
are known as Unicode abstract characters.)
|
||
|
||
$(LI Abstract characters not directly
|
||
encoded by the Unicode Standard can often be
|
||
represented by the use of combining character sequences.)
|
||
)
|
||
)
|
||
|
||
$(P $(DEF Canonical decomposition)
|
||
The decomposition of a character or character sequence
|
||
that results from recursively applying the canonical
|
||
mappings found in the Unicode Character Database
|
||
and those described in Conjoining Jamo Behavior
|
||
(section 12 of
|
||
$(WEB www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
|
||
)
|
||
|
||
$(P $(DEF Canonical composition)
|
||
The precise definition of the Canonical composition
|
||
is the algorithm as specified in $(WEB www.unicode.org/uni2book/ch03.pdf,
|
||
Unicode Conformance) section 11.
|
||
Informally it's the process that does the reverse of the canonical
|
||
decomposition with the addition of certain rules
|
||
that e.g. prevent legacy characters from appearing in the composed result.
|
||
)
|
||
|
||
$(P $(DEF Canonical equivalent)
|
||
Two character sequences are said to be canonical equivalents if
|
||
their full canonical decompositions are identical.
|
||
)
|
||
|
||
$(P $(DEF Character) Typically differs by context.
|
||
For the purpose of this documentation the term $(I character)
|
||
implies $(I encoded character), that is, a code point having
|
||
an assigned abstract character (a symbolic meaning).
|
||
)
|
||
|
||
$(P $(DEF Code point) Any value in the Unicode codespace;
|
||
that is, the range of integers from 0 to 10FFFF (hex).
|
||
Not all code points are assigned to encoded characters.
|
||
)
|
||
|
||
$(P $(DEF Code unit) The minimal bit combination that can represent
|
||
a unit of encoded text for processing or interchange.
|
||
Depending on the encoding this could be:
|
||
8-bit code units in the UTF-8 ($(D char)),
|
||
16-bit code units in the UTF-16 ($(D wchar)),
|
||
and 32-bit code units in the UTF-32 ($(D dchar)).
|
||
$(I Note that in UTF-32, a code unit is a code point
|
||
and is represented by the D $(D dchar) type.)
|
||
)
|
||
|
||
$(P $(DEF Combining character) A character with the General Category
|
||
of Combining Mark(M).
|
||
$(UL
|
||
$(LI All characters with non-zero canonical combining class
|
||
are combining characters, but the reverse is not the case:
|
||
there are combining characters with a zero combining class.
|
||
)
|
||
$(LI These characters are not normally used in isolation
|
||
unless they are being described. They include such characters
|
||
as accents, diacritics, Hebrew points, Arabic vowel signs,
|
||
and Indic matras.
|
||
)
|
||
)
|
||
)
|
||
|
||
$(P $(DEF Combining class)
|
||
A numerical value used by the Unicode Canonical Ordering Algorithm
|
||
to determine which sequences of combining marks are to be
|
||
considered canonically equivalent and which are not.
|
||
)
|
||
|
||
$(P $(DEF Compatibility decomposition)
|
||
The decomposition of a character or character sequence that results
|
||
from recursively applying both the compatibility mappings and
|
||
the canonical mappings found in the Unicode Character Database, and those
|
||
described in Conjoining Jamo Behavior, until no characters
|
||
can be further decomposed.
|
||
)
|
||
|
||
$(P $(DEF Compatibility equivalent)
|
||
Two character sequences are said to be compatibility
|
||
equivalents if their full compatibility decompositions are identical.
|
||
)
|
||
|
||
$(P $(DEF Encoded character) An association (or mapping)
|
||
between an abstract character and a code point.
|
||
)
|
||
|
||
$(P $(DEF Glyph) The actual, concrete image of a glyph representation
|
||
having been rasterized or otherwise imaged onto some display surface.
|
||
)
|
||
|
||
$(P $(DEF Grapheme base) A character with the property
|
||
Grapheme_Base, or any standard Korean syllable block.
|
||
)
|
||
|
||
$(P $(DEF Grapheme cluster) Defined as the text between
|
||
grapheme boundaries as specified by Unicode Standard Annex #29,
|
||
$(WEB www.unicode.org/reports/tr29/, Unicode text segmentation).
|
||
Important general properties of a grapheme:
|
||
$(UL
|
||
$(LI The grapheme cluster represents a horizontally segmentable
|
||
unit of text, consisting of some grapheme base (which may
|
||
consist of a Korean syllable) together with any number of
|
||
nonspacing marks applied to it.
|
||
)
|
||
$(LI A grapheme cluster typically starts with a grapheme base
|
||
and then extends across any subsequent sequence of nonspacing marks.
|
||
A grapheme cluster is most directly relevant to text rendering and
|
||
processes such as cursor placement and text selection in editing,
|
||
but may also be relevant to comparison and searching.
|
||
)
|
||
$(LI For many processes, a grapheme cluster behaves as if it was a
|
||
single character with the same properties as its grapheme base.
|
||
Effectively, nonspacing marks apply $(I graphically) to the base,
|
||
but do not change its properties.
|
||
)
|
||
)
|
||
$(P This module defines a number of primitives that work with graphemes:
|
||
$(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
|
||
All of them are using $(I extended grapheme) boundaries
|
||
as defined in the aforementioned standard annex.
|
||
)
|
||
)
|
||
|
||
|
||
$(P $(DEF Nonspacing mark) A combining character with the
|
||
General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
|
||
)
|
||
|
||
$(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.)
|
||
|
||
|
||
$(SECTION Normalization)
|
||
|
||
$(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
|
||
or $(S_LINK Compatibility equivalent, compatibility equivalent)
|
||
characters in the Unicode Standard make it necessary to have a full, formal
|
||
definition of equivalence for Unicode strings.
|
||
String equivalence is determined by a process called normalization,
|
||
whereby strings are converted into forms which are compared
|
||
directly for identity. This is the primary goal of the normalization process,
|
||
see the function $(LREF normalize) to convert into any of
|
||
the four defined forms.
|
||
)
|
||
|
||
$(P A very important attribute of the Unicode Normalization Forms
|
||
is that they must remain stable between versions of the Unicode Standard.
|
||
A Unicode string normalized to a particular Unicode Normalization Form
|
||
in one version of the standard is guaranteed to remain in that Normalization
|
||
Form for implementations of future versions of the standard.
|
||
)
|
||
|
||
$(P The Unicode Standard specifies four normalization forms.
|
||
Informally, two of these forms are defined by maximal decomposition
|
||
of equivalent sequences, and two of these forms are defined
|
||
by maximal $(I composition) of equivalent sequences.
|
||
$(UL
|
||
$(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
|
||
canonical decomposition) of a character sequence.)
|
||
$(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
|
||
compatibility decomposition) of a character sequence.)
|
||
$(LI Normalization Form C (NFC): The canonical composition of the
|
||
$(S_LINK Canonical decomposition, canonical decomposition)
|
||
of a coded character sequence.)
|
||
$(LI Normalization Form KC (NFKC): The canonical composition
|
||
of the $(S_LINK Compatibility decomposition,
|
||
compatibility decomposition) of a character sequence.)
|
||
)
|
||
)
|
||
|
||
$(P The choice of the normalization form depends on the particular use case.
|
||
NFC is the best form for general text, since it's more compatible with
|
||
strings converted from legacy encodings. NFKC is the preferred form for
|
||
identifiers, especially where there are security concerns. NFD and NFKD
|
||
are the most useful for internal processing.
|
||
)
|
||
|
||
$(SECTION Construction of lookup tables)
|
||
|
||
$(P The Unicode standard describes a set of algorithms that
|
||
depend on having the ability to quickly look up various properties
|
||
of a code point. Given the codespace of about 1 million $(CODEPOINTS),
|
||
it is not a trivial task to provide a space-efficient solution for
|
||
the multitude of properties.)
|
||
|
||
$(P Common approaches such as hash-tables or binary search over
|
||
sorted code point intervals (as in $(LREF InversionList)) are insufficient.
|
||
Hash-tables have enormous memory footprint and binary search
|
||
over intervals is not fast enough for some heavy-duty algorithms.
|
||
)
|
||
|
||
$(P The recommended solution (see Unicode Implementation Guidelines)
|
||
is using multi-stage tables that are an implementation of the
|
||
$(WEB en.wikipedia.org/wiki/Trie, Trie) data structure with integer
|
||
keys and a fixed number of stages. For the remainder of the section
|
||
this will be called a fixed trie. The following describes a particular
|
||
implementation that is aimed for the speed of access at the expense
|
||
of ideal size savings.
|
||
)
|
||
|
||
$(P Taking a 2-level Trie as an example the principle of operation is as follows.
|
||
Split the number of bits in a key (code point, 21 bits) into 2 components
|
||
(e.g. 13 and 8). The first is the number of bits in the index of the trie
|
||
and the other is the number of bits in each page of the trie.
|
||
The layout of the trie is then an array of size 2^^bits-of-index followed by
|
||
an array of memory chunks of size 2^^bits-of-page/bits-per-element.
|
||
)
|
||
|
||
$(P The number of pages is variable (but not less than 1)
|
||
unlike the number of entries in the index. The slots of the index
|
||
all have to contain a number of a page that is present. The lookup is then
|
||
just a couple of operations - slice the upper bits,
|
||
lookup an index for these, take a page at this index and use
|
||
the lower bits as an offset within this page.
|
||
|
||
Assuming that pages are laid out consecutively
|
||
in one array at $(D pages), the pseudo-code is:
|
||
)
|
||
---
|
||
auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
|
||
pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
|
||
---
|
||
$(P Where if $(D elemsPerPage) is a power of 2 the whole process is
|
||
a handful of simple instructions and 2 array reads. Subsequent levels
|
||
of the trie are introduced by recursing on this notion - the index array
|
||
is treated as values. The number of bits in index is then again
|
||
split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
|
||
)
|
||
|
||
$(P For completeness a level 1 trie is simply an array.
|
||
The current implementation takes advantage of bit-packing values
|
||
when the range is known to be limited in advance (such as $(D bool)).
|
||
See also $(LREF BitPacked) for enforcing it manually.
|
||
The major size advantage however comes from the fact
|
||
that multiple $(B identical pages on every level are merged) by construction.
|
||
)
|
||
|
||
$(P The process of constructing a trie is more involved and is hidden from
|
||
the user in a form of the convenience functions $(LREF codepointTrie),
|
||
$(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
|
||
In general a set or built-in AA with $(D dchar) type
|
||
can be turned into a trie. The trie object in this module
|
||
is read-only (immutable); it's effectively frozen after construction.
|
||
)
|
||
|
||
$(SECTION Unicode properties)
|
||
|
||
$(P This is a full list of Unicode properties accessible through $(LREF unicode)
|
||
with specific helpers per category nested within. Consult the
|
||
$(WEB www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
|
||
when in doubt about the contents of a particular set.)
|
||
|
||
$(P General category sets listed below are only accessible with the
|
||
$(LREF unicode) shorthand accessor.)
|
||
$(BOOKTABLE $(B General category ),
|
||
$(TR $(TH Abb.) $(TH Long form)
|
||
$(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
|
||
$(TR $(TD L) $(TD Letter)
|
||
$(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation))
|
||
$(TR $(TD Ll) $(TD Lowercase_Letter)
|
||
$(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
|
||
$(TR $(TD Lm) $(TD Modifier_Letter)
|
||
$(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol))
|
||
$(TR $(TD Lo) $(TD Other_Letter)
|
||
$(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol))
|
||
$(TR $(TD Lt) $(TD Titlecase_Letter)
|
||
$(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol))
|
||
$(TR $(TD Lu) $(TD Uppercase_Letter)
|
||
$(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol))
|
||
$(TR $(TD M) $(TD Mark)
|
||
$(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol))
|
||
$(TR $(TD Mc) $(TD Spacing_Mark)
|
||
$(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
|
||
$(TR $(TD Me) $(TD Enclosing_Mark)
|
||
$(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator))
|
||
$(TR $(TD Mn) $(TD Nonspacing_Mark)
|
||
$(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator))
|
||
$(TR $(TD C) $(TD Other)
|
||
$(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
|
||
$(TR $(TD Cc) $(TD Control) $(TD Pf)
|
||
$(TD Final_Punctuation) $(TD -) $(TD Any))
|
||
$(TR $(TD Cf) $(TD Format)
|
||
$(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
|
||
)
|
||
$(P Sets for other commonly useful properties that are
|
||
accessible with $(LREF unicode):)
|
||
$(BOOKTABLE $(B Common binary properties),
|
||
$(TR $(TH Name) $(TH Name) $(TH Name))
|
||
$(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase))
|
||
$(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
|
||
$(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space))
|
||
$(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark))
|
||
$(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical))
|
||
$(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted))
|
||
$(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm))
|
||
$(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation))
|
||
$(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
|
||
$(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase))
|
||
$(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector))
|
||
$(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space))
|
||
$(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue))
|
||
$(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start))
|
||
$(TR $(TD Hyphen) $(TD Other_Lowercase) )
|
||
$(TR $(TD ID_Continue) $(TD Other_Math) )
|
||
)
|
||
$(P Below is the table with block names accepted by $(LREF unicode.block).
|
||
Note that the shorthand version $(LREF unicode) requires "In"
|
||
to be prepended to the names of blocks so as to disambiguate
|
||
scripts and blocks.)
|
||
|
||
$(BOOKTABLE $(B Blocks),
|
||
$(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian))
|
||
$(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols))
|
||
$(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar))
|
||
$(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A))
|
||
$(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue))
|
||
$(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo))
|
||
$(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms))
|
||
$(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham))
|
||
$(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki))
|
||
$(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic))
|
||
$(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian))
|
||
$(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian))
|
||
$(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic))
|
||
$(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
|
||
$(TR $(TD Avestan) $(TD Hangul Compatibility Jamo) $(TD Oriya))
|
||
$(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya))
|
||
$(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa))
|
||
$(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc))
|
||
$(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician))
|
||
$(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions))
|
||
$(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement))
|
||
$(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards))
|
||
$(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area))
|
||
$(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang))
|
||
$(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols))
|
||
$(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic))
|
||
$(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan))
|
||
$(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra))
|
||
$(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada))
|
||
$(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian))
|
||
$(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala))
|
||
$(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants))
|
||
$(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng))
|
||
$(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters))
|
||
$(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials))
|
||
$(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese))
|
||
$(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement))
|
||
$(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts))
|
||
$(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A))
|
||
$(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B))
|
||
$(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
|
||
$(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD Supplemental Punctuation))
|
||
$(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A))
|
||
$(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B))
|
||
$(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri))
|
||
$(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac))
|
||
$(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog))
|
||
$(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa))
|
||
$(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags))
|
||
$(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le))
|
||
$(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
|
||
$(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet))
|
||
$(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols))
|
||
$(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri))
|
||
$(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil))
|
||
$(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu))
|
||
$(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana))
|
||
$(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
|
||
$(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan))
|
||
$(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh))
|
||
$(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
|
||
$(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic))
|
||
$(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics))
|
||
$(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended))
|
||
$(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai))
|
||
$(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors))
|
||
$(TR $(TD Domino Tiles) $(TD Miao) $(TD Variation Selectors Supplement))
|
||
$(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions))
|
||
$(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms))
|
||
$(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
|
||
$(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals))
|
||
$(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
|
||
$(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) )
|
||
$(TR $(TD Ethiopic) $(TD Modifier Tone Letters) )
|
||
)
|
||
|
||
$(P Below is the table with script names accepted by $(LREF unicode.script)
|
||
and by the shorthand version $(LREF unicode):)
|
||
$(BOOKTABLE $(B Scripts),
|
||
$(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic))
|
||
$(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian))
|
||
$(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian))
|
||
$(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic))
|
||
$(TR $(TD Bamum) $(TD Inherited) $(TD Oriya))
|
||
$(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya))
|
||
$(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa))
|
||
$(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician))
|
||
$(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang))
|
||
$(TR $(TD Braille) $(TD Kannada) $(TD Runic))
|
||
$(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan))
|
||
$(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra))
|
||
$(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada))
|
||
$(TR $(TD Carian) $(TD Khmer) $(TD Shavian))
|
||
$(TR $(TD Chakma) $(TD Lao) $(TD Sinhala))
|
||
$(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng))
|
||
$(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese))
|
||
$(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri))
|
||
$(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac))
|
||
$(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog))
|
||
$(TR $(TD Cypriot) $(TD Lycian) $(TD Tagbanwa))
|
||
$(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le))
|
||
$(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham))
|
||
$(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet))
|
||
$(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri))
|
||
$(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil))
|
||
$(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu))
|
||
$(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana))
|
||
$(TR $(TD Gothic) $(TD Mongolian) $(TD Thai))
|
||
$(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan))
|
||
$(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh))
|
||
$(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic))
|
||
$(TR $(TD Han) $(TD Ogham) $(TD Vai))
|
||
$(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi))
|
||
)
|
||
|
||
$(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
|
||
$(BOOKTABLE $(B Hangul syllable type),
|
||
$(TR $(TH Abb.) $(TH Long form))
|
||
$(TR $(TD L) $(TD Leading_Jamo))
|
||
$(TR $(TD LV) $(TD LV_Syllable))
|
||
$(TR $(TD LVT) $(TD LVT_Syllable) )
|
||
$(TR $(TD T) $(TD Trailing_Jamo))
|
||
$(TR $(TD V) $(TD Vowel_Jamo))
|
||
)
|
||
References:
|
||
$(WEB www.digitalmars.com/d/ascii-table.html, ASCII Table),
|
||
$(WEB en.wikipedia.org/wiki/Unicode, Wikipedia),
|
||
$(WEB www.unicode.org, The Unicode Consortium),
|
||
$(WEB www.unicode.org/reports/tr15/, Unicode normalization forms),
|
||
$(WEB www.unicode.org/reports/tr29/, Unicode text segmentation),
|
||
$(WEB www.unicode.org/uni2book/ch05.pdf,
|
||
Unicode Implementation Guidelines),
|
||
$(WEB www.unicode.org/uni2book/ch03.pdf,
|
||
Unicode Conformance)
|
||
Trademarks:
|
||
Unicode(tm) is a trademark of Unicode, Inc.
|
||
|
||
Macros:
|
||
WIKI=Phobos/StdUni
|
||
|
||
Copyright: Copyright 2013 -
|
||
License: $(WEB www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
|
||
Authors: Dmitry Olshansky
|
||
Source: $(PHOBOSSRC std/_uni.d)
|
||
Standards: $(WEB www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
|
||
|
||
Macros:
|
||
|
||
SECTION = <h3><a id="$1">$0</a></h3>
|
||
DEF = <div><a id="$1"><i>$0</i></a></div>
|
||
S_LINK = <a href="#$1">$+</a>
|
||
CODEPOINT = $(S_LINK Code point, code point)
|
||
CODEPOINTS = $(S_LINK Code point, code points)
|
||
CHARACTER = $(S_LINK Character, character)
|
||
CHARACTERS = $(S_LINK Character, characters)
|
||
CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
|
||
+/
|
||
module std.uni;
|
||
|
||
static import std.ascii;
|
||
import std.traits, std.range, std.algorithm, std.conv,
|
||
std.typetuple, std.exception, core.stdc.stdlib;
|
||
import std.array; //@@BUG UFCS doesn't work with 'local' imports
|
||
import core.bitop;
|
||
|
||
import std.typecons;
|
||
|
||
// debug = std_uni;
|
||
|
||
debug(std_uni) import std.stdio;
|
||
|
||
private:
|
||
|
||
version(std_uni_bootstrap){}
|
||
else
|
||
{
|
||
import std.internal.unicode_tables; // generated file
|
||
}
|
||
|
||
// Copies src into dest element by element, walking from the last index down
// to the first. Both slices must have the same length. Going backwards keeps
// the copy correct when dest overlaps src at a higher position.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse(pos; 0 .. src.length)
        dest[pos] = src[pos];
}
|
||
|
||
// Copies src into dest element by element, walking from the first index up
// to the last. Both slices must have the same length. Going forwards keeps
// the copy correct when dest overlaps src at a lower position.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach(pos; 0 .. src.length)
        dest[pos] = src[pos];
}
|
||
|
||
// Compile-time flag consulted by the packed pointer code below to decide
// whether sub-word elements may be accessed through narrower pointer types.
// TODO: update to reflect all major CPUs supporting unaligned reads
version(X86)
    enum hasUnalignedReads = true;
else version(X86_64)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
|
||
|
||
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
|
||
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
|
||
|
||
// test the intro example
unittest
{
    // Code point sets can be looked up by script/block or property name;
    // `set` holds the code points of both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // the same lookup in its simpler, statically-checked form
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by combining diaeresis U+0308
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
|
||
|
||
enum lastDchar = 0x10FFFF;
|
||
|
||
// Checked conversion to an integral type T: asserts that the value fits
// into T's range and then casts.
auto force(T, F)(F from)
    if(isIntegral!T && !is(T == F))
{
    assert(from <= T.max && from >= T.min);
    auto converted = cast(T)from;
    return converted;
}

// Checked conversion to a BitPacked type T: asserts that the value fits
// into the packed bit width and wraps it into T.
auto force(T, F)(F from)
    if(isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    alias Raw = TypeOfBitPacked!T;
    return T(cast(Raw)from);
}

// Identity case: source and target types coincide, nothing to check.
auto force(T, F)(F from)
    if(is(T == F))
{
    return from;
}
|
||
|
||
// cheap algorithm grease ;)
// Presents an array of F as a sliceable range with length whose elements
// are converted to T via force!T when read through front.
auto adaptIntRange(T, F)(F[] src)
{
    //@@@BUG when in the 9 hells will map be copyable again?!
    static struct ConvertIntegers
    {
        private F[] data;

        @property T front()
        {
            return force!T(data.front);
        }

        void popFront()
        {
            data.popFront();
        }

        @property bool empty()const
        {
            return data.empty;
        }

        @property size_t length()const
        {
            return data.length;
        }

        auto opSlice(size_t s, size_t e)
        {
            return ConvertIntegers(data[s..e]);
        }

        @property size_t opDollar()
        {
            return data.length;
        }
    }

    return ConvertIntegers(src);
}
|
||
|
||
// Repeats the bit pattern held in val `times` times, assuming the pattern is
// `bits` bits wide. The copies are laid out back to back starting from the
// least significant bit of the result.
//
// times - number of copies (compile-time)
// bits  - width of one copy in bits (compile-time)
// val   - the pattern; expected to fit into `bits` bits
size_t replicateBits(size_t times, size_t bits)(size_t val)
{
    static if(times == 1)
        return val;
    else static if(bits == 1)
    {
        // a single bit repeated is just a run of ones (or nothing at all)
        static if(times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
        {
            // shift in size_t: an int-typed `1<<times` cannot represent
            // runs of 32 or more bits on 64-bit targets
            return val ? (cast(size_t)1 << times) - 1 : 0;
        }
    }
    else static if(times % 2)
        // odd count: peel one copy off and recurse on the even remainder
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width, halve the repetitions
        return replicateBits!(times/2, bits*2)((val<<bits) | val);
}
|
||
|
||
unittest // for replicate
{
    size_t allOnes = 0b111; // 3-bit pattern with every bit set
    size_t lowBit = 0b01;   // 2-bit pattern with only the low bit set
    foreach(i; TypeTuple!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form a run of 3*i ones
        assert(replicateBits!(i, 3)(allOnes)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(lowBit) == iota(0, i).map!"2^^(2*a)"().reduce!"a+b"());
    }
}
|
||
|
||
// Multiple arrays squashed into one memory block.
//
// Each entry of Types describes the element type of one "level". All levels
// share a single size_t[] storage; offsets[i] is the index of the first
// storage word of level i and sz[i] is its length in elements. Elements are
// bit-packed, the number of words a level needs is given by spaceFor.
struct MultiArray(Types...)
{
    // Allocates zero-initialized storage for levels of the given sizes
    // (in elements), one size per entry of Types.
    this(size_t[] sizes...)
    {
        size_t full_size;
        foreach(i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if(i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps pre-computed offsets, sizes and raw storage words (the three
    // pieces that `store` writes out) without copying the data.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes, const(size_t)[] data)const
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-checked packed view over level n.
    @property auto slice(size_t n)()inout pure nothrow
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of level n.
    @property auto ptr(size_t n)()inout pure nothrow
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of elements in level n.
        @property size_t length()const{ return sz[n]; }

        // Resizes level n to new_size elements, shifting all following
        // levels inside the shared storage and updating their offsets.
        @property void length(size_t new_size)
        {
            alias wordsFor = spaceFor!(bitSizeOf!(Types[n]));
            if(new_size > sz[n])
            {// extend
                // Word delta is the difference of word counts, not the word
                // count of the element delta: the latter rounds up on every
                // call and drifts away from what the level really occupies.
                size_t delta = wordsFor(new_size) - wordsFor(sz[n]);
                sz[n] = new_size;
                storage.length += delta;// extend space at end
                // storage may have been moved by the resize, so everything
                // below works with indices into the (new) storage
                static if(n != dim-1)
                {
                    // move all data past this array, last-one-goes-first
                    size_t from = offsets[n+1];
                    copyBackwards(storage[from .. $-delta],
                        storage[from+delta .. $]);
                    // the gap opened for this level must read as zeros
                    storage[from .. from+delta] = 0;
                    // offsets are used for raw_ptr, slice, ptr etc.
                    foreach(i; n+1..dim)
                        offsets[i] += delta;
                }
            }
            else if(new_size < sz[n])
            {// shrink
                size_t delta = wordsFor(sz[n]) - wordsFor(new_size);
                sz[n] = new_size;
                static if(n != dim-1)
                {
                    // close the gap: following levels slide towards the
                    // front by delta words, first-one-goes-first
                    size_t from = offsets[n+1];
                    copyForward(storage[from .. $],
                        storage[from-delta .. $-delta]);

                    foreach(i; n+1..dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of level n, or of the whole storage when n is omitted.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        static if(n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if(n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and storage to sink as three comma-separated
    // bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if(isOutputRange!(OutRange, char))
    {
        import std.format;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    // Pointer to the first storage word of level n.
    @property auto raw_ptr(size_t n)()inout
    {
        static if(n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
|
||
|
||
unittest
{
    // the whole scenario lives in a lambda so that it can be executed
    // both at compile time and at run time (see the bottom of the test)
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // level k must hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach(i; 0..n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0..n]));
        }

        // level k must hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach(i; 0..n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0..n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach(i; 0..n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach(i; 0..n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // compile-time run
    auto rt = dg(); // run-time run
}
|
||
|
||
unittest
{// more bitpacking tests
    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);

    // every element of level lvl must equal its own index
    static void check(size_t lvl, MA)(ref MA arr)
    {
        for(size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores its own index into every element of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr)
    {
        for(size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }

    Bitty m1;

    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for(size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);

    for(size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }

    for(size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
|
||
|
||
// Number of size_t words needed to hold new_len elements of _bits bits each.
// Widths other than 1 are first rounded up to a power of two, the same way
// PackedArrayView does it.
size_t spaceFor(size_t _bits)(size_t new_len) pure nothrow
{
    enum wordBits = 8*size_t.sizeof;
    enum bits = _bits == 1 ? 1 : ceilPowerOf2(_bits);// see PackedArrayView
    static if(bits > wordBits)
    {
        // an element spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
    else
    {
        // several elements share one word
        enum factor = wordBits/bits;
        return (new_len+factor-1)/factor; // rounded up
    }
}
|
||
|
||
// True for types that can be stored in packed form: integral types,
// character types, bool and BitPacked wrappers.
template isBitPackableType(T)
{
    enum isBitPackableType = isIntegral!T || isSomeChar!T
        || is(T == bool) || isBitPacked!T;
}
|
||
|
||
//============================================================================
|
||
// Selects the PackedArrayViewImpl instance for T: the element width is the
// bit size of T, rounded up to a power of two unless it is a single bit.
template PackedArrayView(T)
    if((is(T dummy == BitPacked!(U, sz), U, size_t sz)
        && isBitPackableType!U) || isBitPackableType!T)
{
    private enum bits = bitSizeOf!T;
    static if(bits > 1)
        private enum packedBits = ceilPowerOf2(bits);
    else
        private enum packedBits = 1;
    alias PackedArrayView = PackedArrayViewImpl!(T, packedBits);
}
|
||
|
||
// Unsafe and fast access to a chunk of RAM as if it contains packed values.
// Selects the PackedPtrImpl instance for T: the element width is the bit
// size of T, rounded up to a power of two unless it is a single bit.
template PackedPtr(T)
    if((is(T dummy == BitPacked!(U, sz), U, size_t sz)
        && isBitPackableType!U) || isBitPackableType!T)
{
    private enum bits = bitSizeOf!T;
    static if(bits > 1)
        private enum packedBits = ceilPowerOf2(bits);
    else
        private enum packedBits = 1;
    alias PackedPtr = PackedPtrImpl!(T, packedBits);
}
|
||
|
||
// Unchecked indexed access to values of type T packed `bits` bits apiece
// into an array of machine words. `bits` must be a power of two. Element n
// lives in word n / factor, at bit offset bits * (n % factor) counted from
// the least significant bit.
@trusted struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPowerOf2(bits));

    this(inout(size_t)* ptr)inout
    {
        origin = ptr;
    }

    // Portable read: extracts element n with shift and mask.
    private T simpleIndex(size_t n) inout
    {
        auto q = n / factor;
        auto r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Portable write: replaces the bits of element n inside its word.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if(isIntegral!T)
            assert(val <= mask);
    }
    body
    {
        auto q = n / factor;
        auto r = n % factor;
        size_t tgt_shift = bits*r;
        size_t word = origin[q];
        origin[q] = (word & ~(mask<<tgt_shift))
            | (cast(size_t)val << tgt_shift);
    }

    static if(factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        // Element width matches a built-in integer type U, so the storage
        // can be addressed directly as an array of U.
        static if(factor == bytesPerWord)
            alias U = ubyte;
        else static if(factor == bytesPerWord/2)
            alias U = ushort;
        else static if(factor == bytesPerWord/4)
            alias U = uint;
        else static if(size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        T opIndex(size_t idx) inout
        {
            // Viewing the words as U[] puts element idx at the same bits as
            // simpleIndex only on little-endian targets. Elsewhere stick to
            // the shift/mask form so that data written at compile time (which
            // always goes through simpleWrite) reads back the same at run time.
            version(LittleEndian)
                return __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*)origin)[idx];
            else
                return simpleIndex(idx);
        }

        static if(isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T)val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            // see opIndex for why the direct store is little-endian only
            version(LittleEndian)
            {
                if(__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*)origin)[idx] = cast(U)val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if(isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T)val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord = size_t.sizeof;
    size_t* origin;
}
|
||
|
||
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` elements that start `ofs` elements past `origin`;
// all indices taken by the public operations are relative to the view.
@trusted struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    this(inout(size_t)* origin, size_t offset, size_t items) inout
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true if every element in [s, e) converts to false.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    body
    {
        s += ofs;
        e += ofs;
        size_t pad_s = roundUp(s);
        // Range ends before the next word boundary: check it element by
        // element. The test has to use the rounded-up start (as
        // opSliceAssign does), otherwise the head loop below would run up to
        // the boundary and inspect elements past e.
        if(pad_s >= e)
        {
            foreach (i; s..e)
                if(ptr[i])
                    return false;
            return true;
        }
        size_t pad_e = roundDown(e);
        size_t i;
        // head: elements before the first word boundary
        for(i=s; i<pad_s; i++)
            if(ptr[i])
                return false;
        // all in between is x*factor elements, test a whole word at a time
        for(size_t j=i/factor; i<pad_e; i+=factor, j++)
            if(ptr.origin[j])
                return false;
        // tail: elements after the last word boundary
        for(; i<e; i++)
            if(ptr[i])
                return false;
        return true;
    }

    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    body
    {
        return ptr[ofs + idx];
    }

    static if(isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T)val, idx);
        }
    }

    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    body
    {
        ptr[ofs + idx] = val;
    }

    static if(isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T)val, start, end);
        }
    }

    // Assigns val to every element in [start, end).
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    body
    {
        // account for the view's own offset
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        size_t pad_start = roundUp(start);// rounded up
        if(pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach(i; start..end)
                ptr[i] = val;
            return;
        }
        size_t pad_end = roundDown(end); // rounded down
        size_t i;
        for(i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if(pad_start != pad_end)
        {
            size_t repval = replicateBits!(factor, bits)(val);
            for(size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for(; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over elements [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` and `limit` are both relative to the view; adding ofs here
        // would wrongly reject valid slices of views with non-zero offset
        assert(to <= limit);
    }
    body
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    bool opEquals(T)(auto ref T arr) const
    {
        if(limit != arr.limit)
            return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        // both views cover whole words: compare the raw storage
        if(s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for(size_t i=0;i<limit; i++)
            if(this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // nearest multiples of factor at or above / at or below val
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
|
||
|
||
|
||
// Random-access range over the elements [from, to) of anything indexable,
// referenced through a pointer. Writes through the slice go to the
// underlying object.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0..0] = Item.init; }));

    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    body
    {
        return (*arr)[from+idx];
    }

    static if(assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < to - from);
    }
    body
    {
        (*arr)[from+idx] = val;
    }

    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if(assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const
    {
        return to-from;
    }

    auto opDollar()const
    {
        return length;
    }

    @property bool empty()const
    {
        return from == to;
    }

    @property auto front()const
    {
        return (*arr)[from];
    }

    static if(assignableIndex)
    @property void front(Item val)
    {
        (*arr)[from] = val;
    }

    @property auto back()const
    {
        return (*arr)[to-1];
    }

    static if(assignableIndex)
    @property void back(Item val)
    {
        (*arr)[to-1] = val;
    }

    @property auto save() inout
    {
        return this;
    }

    void popFront()
    {
        ++from;
    }

    void popBack()
    {
        --to;
    }

    // Element-wise comparison against anything with length and indexing.
    bool opEquals(T)(auto ref T arr) const
    {
        if(arr.length != length)
            return false;
        foreach(i; 0 .. length)
        {
            if(this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
|
||
|
||
static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
|
||
|
||
// Builds a read-only SliceOverIndexed over elements [a, b) of *x.
// BUG? forward reference to return type of sliceOverIndexed!Grapheme
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
    if(is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}

// Builds a mutable SliceOverIndexed over elements [a, b) of *x.
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
    if(is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}
|
||
|
||
unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // front/back access and assignment, popping from both ends
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing and re-slicing
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // comparison, slice assignment and the empty slice
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0..2] = -1;
    assert(idxArray[0..2] == [-1, -1]);
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}
|
||
|
||
// Creates a packed view of `items` elements of type T that starts at the
// very first element stored at ptr.
private auto packedArrayView(T)(inout(size_t)* ptr, size_t items) @trusted pure nothrow
{
    alias View = PackedArrayView!T;
    return inout(View)(ptr, 0, items);
}
|
||
|
||
|
||
//============================================================================
|
||
// Partially unrolled binary search using Shar's method
|
||
//============================================================================
|
||
|
||
private import std.math : pow;
|
||
|
||
// Generates the source of a fully unrolled uniform binary search meant to be
// mixed into a function that has `range`, `needle`, `pred`, `idx` and `m` in
// scope (m being the first probe distance, a power of two below `size`, or 0).
// The code is one switch over the number of steps to take; every case probes
// at a fixed distance and falls through to the next smaller one.
//
// size - power of two bounding the probe distances covered by the switch
string genUnrolledSwitchSearch(size_t size)
{
    assert(isPowerOf2(size));
    // m == 0 happens for a one-element range; bsr(0) is undefined, so that
    // case is routed straight to the final single-element probe (case 0)
    string code = `auto power = m ? bsr(m)+1 : 0;
    switch(power){`;
    size_t i = bsr(size);
    foreach_reverse(val; 0..bsr(size))
    {
        auto v = 2^^val;
        // the template below is specialized by plain text substitution:
        // every 'm' becomes the probe distance, "pow" the case label
        code ~= `
        case pow:
            if(pred(range[idx+m], needle))
                idx += m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if(pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
|
||
|
||
// True if sz has at most one bit set, that is for powers of two
// and, as a side effect of the formula, for 0 as well.
bool isPowerOf2(size_t sz) @safe pure nothrow
{
    // sz-1 flips the lowest set bit and everything below it,
    // so the AND clears exactly that lowest set bit
    immutable withoutLowestBit = sz & (sz-1);
    return withoutLowestBit == 0;
}
|
||
|
||
// Binary search with a fixed probe sequence over a range whose length is a
// power of two. Returns the number of leading elements for which
// pred(element, needle) holds, assuming the range is partitioned by pred.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
    if(is(T : ElementType!Range))
{
    assert(isPowerOf2(range.length));
    size_t pos = 0;
    // probe at ever halving distances from the current position
    for(size_t step = range.length/2; step != 0; step /= 2)
    {
        if(pred(range[pos+step], needle))
            pos += step;
    }
    // one last probe decides about the element at pos itself
    if(pred(range[pos], needle))
        pos += 1;
    return pos;
}
|
||
|
||
// Same search as uniformLowerBound, except that the probes at distances
// below `max` are executed by the unrolled switch produced by
// genUnrolledSwitchSearch instead of a loop.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
    if(is(T : ElementType!Range))
{
    assert(isPowerOf2(range.length));
    // NB: the mixed-in code refers to idx, m, pred, range and needle by name
    size_t idx = 0, m = range.length/2;
    enum max = 1<<10;
    // large distances are handled by an ordinary loop
    for(; m >= max; m /= 2)
    {
        if(pred(range[idx+m], needle))
            idx += m;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
|
||
|
||
//
|
||
// Returns the largest power of two that is strictly less than arg
// (so an exact power of two yields half of itself). arg must be > 1.
size_t floorPowerOf2(size_t arg) @safe pure nothrow
{
    assert(arg > 1); // else bsr is undefined
    // shift in size_t: an int-typed `1<<n` breaks for results of 2^^31 and up
    return cast(size_t)1 << bsr(arg-1);
}
|
||
|
||
// Returns the smallest power of two that is greater than or equal to arg.
// arg must be > 1.
size_t ceilPowerOf2(size_t arg) @safe pure nothrow
{
    assert(arg > 1); // else bsr is undefined
    // shift in size_t: an int-typed `1<<n` breaks for results of 2^^31 and up
    return cast(size_t)1 << (bsr(arg-1)+1);
}
|
||
|
||
// Lower-bound search for ranges of arbitrary length built on top of
// uniLowerBound, which itself only handles power-of-two lengths: the range
// is covered by two power-of-two sized windows, one anchored at its start
// and one at its end, and only one of them is searched.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if(is(T : ElementType!Range))
    {
        import std.functional;
        alias pred = binaryFun!_pred;
        immutable total = range.length;
        if(total == 0)
            return 0;
        if(isPowerOf2(total))
            return uniLowerBound!pred(range, needle);
        // size of the window anchored at the start of range
        size_t head = floorPowerOf2(total);
        if(!pred(range[head-1], needle))
            return uniLowerBound!pred(range[0..head], needle);
        // search in another 2^^k area that fully covers the tail of range
        size_t tail = ceilPowerOf2(total - head + 1);
        return total - tail + uniLowerBound!pred(range[$-tail..$], needle);
    }
}
|
||
|
||
alias sharLowerBound = sharMethod!uniformLowerBound;
|
||
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
|
||
|
||
unittest
{
    // reference result obtained from std.range's sorted-range machinery
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    // needles below, inside, between and above the stored values
    foreach(i; 0..MAX+5)
    {
        auto expected = stdLowerBound(arr, i);
        assert(expected == sharLowerBound(arr, i));
        assert(expected == sharSwitchLowerBound(arr, i));
    }
    // an empty array must be handled as well
    arr = [];
    auto expected = stdLowerBound(arr, 33);
    assert(expected == sharLowerBound(arr, 33));
    assert(expected == sharSwitchLowerBound(arr, 33));
}
|
||
//============================================================================
|
||
|
||
@safe:
|
||
// hope to see simillar stuff in public interface... once Allocators are out
|
||
//@@@BUG moveFront and friends? dunno, for now it's POD-only
|
||
|
||
// Replaces dest[from..to] with the contents of stuff, resizing dest when the
// two lengths differ. With Policy == void dest is resized through its length
// property, otherwise through Policy.realloc. Returns the index just past
// the inserted elements.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if(stuff.length == delta)
    {
        // same size: overwrite in place
        copy(stuff, dest[from..to]);
    }
    else if(stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old by delta
        static if(is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // make room by moving the tail up, then drop the new data in
        copyBackwards(dest[to..dest.length-delta],
            dest[to+delta..dest.length]);
        copyForward(stuff, dest[from..stuff_end]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        // write the new data, then pull the tail down over the leftover gap
        copy(stuff, dest[from..stuff_end]);
        copyForward(dest[to..dest.length],
            dest[stuff_end..dest.length-delta]);
        static if(is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}
|
||
|
||
|
||
// Simple storage manipulation policy: arrays are ordinary built-in
// dynamic arrays, all operations map onto built-in array operations and
// std.array helpers.
@trusted public struct GcPolicy
{
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    static T[] alloc(T)(size_t size)
    {
        return new T[size];
    }

    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // appends a single value, converted to T with force
    static void append(T, V)(ref T[] arr, V value)
        if(!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // appends all elements of a range
    static void append(T, V)(ref T[] arr, V value)
        if(isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // mutable arrays: debug builds overwrite the old contents with a
    // marker value before the reference is dropped
    static void destroy(T)(ref T arr)
        if(isDynamicArray!T && is(Unqual!T == T))
    {
        debug
        {
            arr[] = cast(typeof(T.init[0]))(0xdead_beef);
        }
        arr = null;
    }

    // qualified arrays: just drop the reference
    static void destroy(T)(ref T arr)
        if(isDynamicArray!T && !is(Unqual!T == T))
    {
        arr = null;
    }
}
|
||
|
||
// Simple storage manipulation policy backed by the C heap
// (malloc/realloc/free). Arrays obtained from this policy must be released
// with ReallocPolicy.destroy. An empty array is represented by null.
@trusted struct ReallocPolicy
{
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized array of `size` elements.
    // Throws if the memory cannot be obtained.
    static T[] alloc(T)(size_t size)
    {
        // malloc(0) is allowed to return null, which must not be reported
        // as an out-of-memory condition
        if(!size)
            return null;
        // guard the multiplication below against wrap-around
        enforce(size <= size_t.max / T.sizeof, "out of memory on C heap");
        auto ptr = cast(T*)enforce(malloc(T.sizeof*size), "out of memory on C heap");
        return ptr[0..size];
    }

    // Resizes arr to `size` elements, keeping the common prefix.
    // A size of 0 frees the array. Throws if the memory cannot be obtained,
    // in which case arr is left untouched.
    static T[] realloc(T)(T[] arr, size_t size)
    {
        if(!size)
        {
            destroy(arr);
            return null;
        }
        // guard the multiplication below against wrap-around
        enforce(size <= size_t.max / T.sizeof, "out of memory on C heap");
        auto ptr = cast(T*)enforce(core.stdc.stdlib.realloc(
                             arr.ptr, T.sizeof*size), "out of memory on C heap");
        return ptr[0..size];
    }

    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // appends a single value, converted to T with force
    static void append(T, V)(ref T[] arr, V value)
        if(!isInputRange!V)
    {
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    // appends all elements of a range with known length
    static void append(T, V)(ref T[] arr, V value)
        if(isInputRange!V && hasLength!V)
    {
        arr = realloc(arr, arr.length+value.length);
        copy(value, arr[$-value.length..$]);
    }

    // Releases the memory of arr (if any) and resets it to null.
    static void destroy(T)(ref T[] arr)
    {
        if(arr.ptr)
            free(arr.ptr);
        arr = null;
    }
}
|
||
|
||
//build hack
|
||
alias _RealArray = CowArray!ReallocPolicy;
|
||
|
||
unittest
{
    with(ReallocPolicy)
    {
        // replaces orig[from..to] with toReplace, compares the outcome with
        // result and releases orig on the way out
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if(!equalS(orig, result))
                    return false;
            }
            return true;
        }

        // copies the arguments into an array owned by the policy
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // pure insertion at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // pure removal
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // replacement that shrinks the array
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        // replacements that grow the array
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
|
||
|
||
/**
    Tests if T is some kind of a set of code points,
    that is an instance of $(LREF InversionList).
    Intended for template constraints.
*/
public template isCodepointSet(T)
{
    enum isCodepointSet = is(T dummy == InversionList!(Args), Args...);
}
|
||
|
||
/**
    Tests if $(D T) is a pair of integers that implicitly convert to $(D V).
    For any pair $(D T) the following code must compile:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    whereas the following must not:
    ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    private enum hasBothEnds = is(typeof((T x){ V a = x[0]; V b = x[1];}));
    private enum hasThirdItem = is(typeof((T x){ V c = x[2]; }));
    enum isIntegralPair = hasBothEnds && !hasThirdItem;
}
|
||
|
||
|
||
/**
|
||
The recommended default type for a set of $(CODEPOINTS).
|
||
For details, see the current implementation: $(LREF InversionList).
|
||
*/
|
||
public alias CodepointSet = InversionList!GcPolicy;
|
||
|
||
|
||
//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
|
||
// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
|
||
// hence below doesn't seem to work
|
||
// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
|
||
|
||
/**
|
||
The recommended type of $(XREF _typecons, Tuple)
|
||
to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
|
||
Any interval type should pass $(LREF isIntegralPair) trait.
|
||
*/
|
||
public struct CodepointInterval
{
pure:
    // storage for the pair: index 0 is exposed as `a`, index 1 as `b`
    uint[2] _tuple;
    alias _tuple this;

    // builds the pair (low, high)
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }

    // element-wise comparison against anything indexable with 0 and 1
    bool opEquals(T)(T val) const
    {
        if(this[0] != val[0])
            return false;
        return this[1] == val[1];
    }

    // first element of the pair
    @property ref inout(uint) a() inout
    {
        return _tuple[0];
    }

    // second element of the pair
    @property ref inout(uint) b() inout
    {
        return _tuple[1];
    }
}
|
||
|
||
//@@@BUG another forward reference workaround
|
||
// Element-wise comparison of two input ranges: true only when both ranges
// run out at the same time and every pair of fronts compared equal.
@trusted bool equalS(R1, R2)(R1 lhs, R2 rhs)
{
    while(!lhs.empty)
    {
        // rhs is shorter, or the current elements differ
        if(rhs.empty || lhs.front != rhs.front)
            return false;
        lhs.popFront();
        rhs.popFront();
    }
    // lhs is exhausted; equal only if rhs is exhausted too
    return rhs.empty;
}
|
||
|
||
|
||
|
||
/**
|
||
$(P
|
||
$(D InversionList) is a set of $(CODEPOINTS)
|
||
represented as an array of open-right [a, b$(RPAREN)
|
||
intervals (see $(LREF CodepointInterval) above).
|
||
The name comes from the way the representation reads left to right.
|
||
For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
|
||
plus a singular value 60 looks like this:
|
||
)
|
||
---
|
||
10, 50, 60, 61, 80, 90
|
||
---
|
||
$(P
|
||
The way to read this is: start with negative meaning that all numbers
|
||
smaller than the next one are not present in this set (and positive
|
||
- the contrary). Then switch positive/negative after each
|
||
number passed from left to right.
|
||
)
|
||
$(P This way negative spans until 10, then positive until 50,
|
||
then negative until 60, then positive until 61, and so on.
|
||
As seen this provides a space-efficient storage of highly redundant data
|
||
that comes in long runs. A description which Unicode $(CHARACTER)
|
||
properties fit nicely. The technique itself could be seen as a variation
|
||
on $(LUCKY RLE encoding).
|
||
)
|
||
|
||
$(P Sets are value types (just like $(D int) is) thus they
|
||
are never aliased.
|
||
)
|
||
Example:
|
||
---
|
||
auto a = CodepointSet('a', 'z'+1);
|
||
auto b = CodepointSet('A', 'Z'+1);
|
||
auto c = a;
|
||
a = a | b;
|
||
assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
|
||
assert(a != c);
|
||
---
|
||
$(P See also $(LREF unicode) for simpler construction of sets
|
||
from predefined ones.
|
||
)
|
||
|
||
$(P Memory usage is 8 bytes per each contiguous interval in a set.
|
||
The value semantics are achieved by using the
|
||
$(WEB en.wikipedia.org/wiki/Copy-on-write, COW) technique
|
||
and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
|
||
)
|
||
|
||
Note:
|
||
$(P It's not recommended to rely on the template parameters
|
||
or the exact type of a current $(CODEPOINT) set in $(D std.uni).
|
||
The type and parameters may change when the standard
|
||
allocators design is finalized.
|
||
Use $(LREF isCodepointSet) with templates or just stick with the default
|
||
alias $(LREF CodepointSet) throughout the whole code base.
|
||
)
|
||
*/
|
||
@trusted public struct InversionList(SP=GcPolicy)
|
||
{
|
||
public:
|
||
|
||
/**
|
||
Construct from another code point set of any type.
|
||
*/
|
||
this(Set)(Set set) pure
    if(isCodepointSet!Set)
{
    // Flatten the intervals of `set` into a plain array of bounds.
    // The buffer is grown through the storage policy (as the range-based
    // constructor does) rather than with the GC-only `~=`, because
    // CowArray.reuse below keeps growing the very same buffer via SP.append.
    uint[] arr;
    foreach(v; set.byInterval)
    {
        SP.append(arr, v.a);
        SP.append(arr, v.b);
    }
    // `set` is already a code point set, so its intervals are taken as-is
    // (no sanitize pass)
    data = CowArray!(SP).reuse(arr);
}
|
||
|
||
/**
|
||
Construct a set from a forward range of code point intervals.
|
||
*/
|
||
this(Range)(Range intervals) pure
|
||
if(isForwardRange!Range && isIntegralPair!(ElementType!Range))
|
||
{
|
||
uint[] arr;
|
||
foreach(v; intervals)
|
||
{
|
||
SP.append(arr, v.a);
|
||
SP.append(arr, v.b);
|
||
}
|
||
data = CowArray!(SP).reuse(arr);
|
||
sanitize(); //enforce invariant: sort intervals etc.
|
||
}
|
||
|
||
//helper function that avoids sanity check to be CTFE-friendly
|
||
private static fromIntervals(Range)(Range intervals) pure
|
||
{
|
||
auto flattened = roundRobin(intervals.save.map!"a[0]"(),
|
||
intervals.save.map!"a[1]"());
|
||
InversionList set;
|
||
set.data = CowArray!(SP)(flattened);
|
||
return set;
|
||
}
|
||
//ditto until sort is CTFE-able
|
||
// Builds a set straight from a flat list of bounds a0, b0, a1, b1, ...
// without running sanitize; the caller is responsible for passing bounds
// that already form a valid inversion list.
private static fromIntervals()(uint[] intervals...) pure
in
{
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    for (size_t i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // the check rejects empty intervals too, hence ">=" in the message
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
body
{
    InversionList set;
    set.data = CowArray!(SP)(intervals);
    return set;
}
|
||
|
||
/**
|
||
Construct a set from plain values of code point intervals.
|
||
Example:
|
||
---
|
||
import std.algorithm;
|
||
auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
|
||
foreach(v; 'a'..'z'+1)
|
||
assert(set[v]);
|
||
// Cyrillic lowercase interval
|
||
foreach(v; 'а'..'я'+1)
|
||
assert(set[v]);
|
||
//specific order is not required, intervals may intersect
|
||
auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
|
||
//the same end result
|
||
assert(set2.byInterval.equal(set.byInterval));
|
||
---
|
||
*/
|
||
this()(uint[] intervals...)
in
{
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    for (size_t i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // the check rejects empty intervals too, hence ">=" in the message
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
body
{
    // take the bounds as given ...
    data = CowArray!(SP)(intervals);
    // ... then sort and merge them
    sanitize(); //enforce invariant: sort intervals etc.
}
|
||
|
||
/**
|
||
Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
|
||
|
||
Example:
|
||
---
|
||
import std.algorithm, std.typecons;
|
||
auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
|
||
assert(set.byInterval.equal([tuple('A', 'E'), tuple('a', 'e')]));
|
||
---
|
||
*/
|
||
@property auto byInterval()
|
||
{
|
||
return Intervals!(typeof(data))(data);
|
||
}
|
||
|
||
/**
|
||
Tests the presence of code point $(D val) in this set.
|
||
|
||
Example:
|
||
---
|
||
auto gothic = unicode.Gothic;
|
||
// Gothic letter ahsa
|
||
assert(gothic['\U00010330']);
|
||
// no ascii in Gothic obviously
|
||
assert(!gothic['$']);
|
||
---
|
||
*/
|
||
bool opIndex(uint val) const
|
||
{
|
||
// the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
|
||
// return assumeSorted!((a,b) => a<=b)(data[]).lowerBound(val).length & 1;
|
||
return sharSwitchLowerBound!"a<=b"(data[], val) & 1;
|
||
}
|
||
|
||
// Linear scan for $(D ch). Useful only for small sets.
|
||
// TODO:
|
||
// used internally in std.regex
|
||
// should be properly exposed in a public API ?
|
||
package auto scanFor()(dchar ch) const
{
    // Walk the bounds left to right; the parity of the first bound that is
    // greater than ch tells whether ch lies inside an interval (odd index)
    // or in a gap (even index).
    immutable total = data.length;
    size_t pos = 0;
    while(pos < total)
    {
        if(data[pos] > ch)
            return pos & 1;
        ++pos;
    }
    // ch is not below any bound
    return 0;
}
|
||
|
||
/// Number of $(CODEPOINTS) in this set
|
||
@property size_t length()
{
    // add up the widths (b - a) of all intervals
    size_t total = 0;
    for(auto ivals = byInterval; !ivals.empty; ivals.popFront())
    {
        auto iv = ivals.front;
        total += iv.b - iv.a;
    }
    return total;
}
|
||
|
||
// bootstrap full set operations from 4 primitives (suitable as a template mixin):
|
||
// addInterval, skipUpTo, dropUpTo & byInterval iteration
|
||
//============================================================================
|
||
public:
|
||
/**
|
||
$(P Sets support natural syntax for set algebra, namely: )
|
||
$(BOOKTABLE ,
|
||
$(TR $(TH Operator) $(TH Math notation) $(TH Description) )
|
||
$(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
|
||
$(TR $(TD |) $(TD a ∪ b) $(TD union) )
|
||
$(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
|
||
$(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
|
||
)
|
||
|
||
Example:
|
||
---
|
||
auto lower = unicode.LowerCase;
|
||
auto upper = unicode.UpperCase;
|
||
auto ascii = unicode.ASCII;
|
||
|
||
assert((lower & upper).empty); // no intersection
|
||
auto lowerASCII = lower & ascii;
|
||
assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
|
||
// throw away all of the lowercase ASCII
|
||
assert((ascii - lower).length == 128 - 26);
|
||
|
||
auto onlyOneOf = lower ~ ascii;
|
||
assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
|
||
assert(onlyOneOf['$']); // ASCII and not lowercase
|
||
assert(!onlyOneOf['a']); // ASCII and lowercase
|
||
assert(onlyOneOf['я']); // not ASCII but lowercase
|
||
|
||
// throw away all cased letters from ASCII
|
||
auto noLetters = ascii - (lower | upper);
|
||
assert(noLetters.length == 128 - 26*2);
|
||
---
|
||
*/
|
||
This opBinary(string op, U)(U rhs)
|
||
if(isCodepointSet!U || is(U:dchar))
|
||
{
|
||
static if(op == "&" || op == "|" || op == "~")
|
||
{// symmetric ops thus can swap arguments to reuse r-value
|
||
static if(is(U:dchar))
|
||
{
|
||
auto tmp = this;
|
||
mixin("tmp "~op~"= rhs; ");
|
||
return tmp;
|
||
}
|
||
else
|
||
{
|
||
static if(is(Unqual!U == U))
|
||
{
|
||
// try hard to reuse r-value
|
||
mixin("rhs "~op~"= this;");
|
||
return rhs;
|
||
}
|
||
else
|
||
{
|
||
auto tmp = this;
|
||
mixin("tmp "~op~"= rhs;");
|
||
return tmp;
|
||
}
|
||
}
|
||
}
|
||
else static if(op == "-") // anti-symmetric
|
||
{
|
||
auto tmp = this;
|
||
tmp -= rhs;
|
||
return tmp;
|
||
}
|
||
else
|
||
static assert(0, "no operator "~op~" defined for Set");
|
||
}
|
||
|
||
/// The 'op=' versions of the above overloaded operators.
|
||
ref This opOpAssign(string op, U)(U rhs)
|
||
if(isCodepointSet!U || is(U:dchar))
|
||
{
|
||
static if(op == "|") // union
|
||
{
|
||
static if(is(U:dchar))
|
||
{
|
||
this.addInterval(rhs, rhs+1);
|
||
return this;
|
||
}
|
||
else
|
||
return this.add(rhs);
|
||
}
|
||
else static if(op == "&") // intersection
|
||
return this.intersect(rhs);// overloaded
|
||
else static if(op == "-") // set difference
|
||
return this.sub(rhs);// overloaded
|
||
else static if(op == "~") // symmetric set difference
|
||
{
|
||
auto copy = this & rhs;
|
||
this |= rhs;
|
||
this -= copy;
|
||
return this;
|
||
}
|
||
else
|
||
static assert(0, "no operator "~op~" defined for Set");
|
||
}
|
||
|
||
/**
|
||
Tests the presence of codepoint $(D ch) in this set,
|
||
the same as $(LREF opIndex).
|
||
*/
|
||
bool opBinaryRight(string op: "in", U)(U ch) const
|
||
if(is(U : dchar))
|
||
{
|
||
return this[ch];
|
||
}
|
||
|
||
///
|
||
unittest
|
||
{
|
||
assert('я' in unicode.Cyrillic);
|
||
assert(!('z' in unicode.Cyrillic));
|
||
}
|
||
|
||
|
||
|
||
/// Obtains a set that is the inversion of this set. See also $(LREF inverted).
|
||
auto opUnary(string op: "!")()
|
||
{
|
||
return this.inverted;
|
||
}
|
||
|
||
/**
|
||
A range that spans each $(CODEPOINT) in this set.
|
||
|
||
Example:
|
||
---
|
||
import std.algorithm;
|
||
auto set = unicode.ASCII;
|
||
assert(set.byCodepoint.equal(iota(0, 0x80)));
|
||
---
|
||
*/
|
||
@property auto byCodepoint()
|
||
{
|
||
@trusted static struct CodepointRange
|
||
{
|
||
this(This set)
|
||
{
|
||
r = set.byInterval;
|
||
if(!r.empty)
|
||
cur = r.front.a;
|
||
}
|
||
|
||
@property dchar front() const
|
||
{
|
||
return cast(dchar)cur;
|
||
}
|
||
|
||
@property bool empty() const
|
||
{
|
||
return r.empty;
|
||
}
|
||
|
||
void popFront()
|
||
{
|
||
cur++;
|
||
while(cur >= r.front.b)
|
||
{
|
||
r.popFront();
|
||
if(r.empty)
|
||
break;
|
||
cur = r.front.a;
|
||
}
|
||
}
|
||
private:
|
||
uint cur;
|
||
typeof(This.init.byInterval) r;
|
||
}
|
||
|
||
return CodepointRange(this);
|
||
}
|
||
|
||
/**
|
||
$(P Obtain textual representation of this set in form of
|
||
open-right intervals and feed it to $(D sink).
|
||
)
|
||
$(P Used by various standard formatting facilities such as
|
||
$(XREF _format, formattedWrite), $(XREF _stdio, write),
|
||
$(XREF _stdio, writef), $(XREF _conv, to) and others.
|
||
)
|
||
Example:
|
||
---
|
||
import std.conv;
|
||
assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
|
||
---
|
||
*/
|
||
|
||
private import std.format : FormatException, FormatSpec;
|
||
|
||
/***************************************
|
||
* Obtain a textual representation of this InversionList
|
||
* in form of open-right intervals.
|
||
*
|
||
* The formatting flag is applied individually to each value, for example:
|
||
* $(LI $(B %s) and $(B %d) format the intervals as a [low..high$(RPAREN) range of integrals)
|
||
* $(LI $(B %x) formats the intervals as a [low..high$(RPAREN) range of lowercase hex characters)
|
||
* $(LI $(B %X) formats the intervals as a [low..high$(RPAREN) range of uppercase hex characters)
|
||
*/
|
||
void toString(scope void delegate(const(char)[]) sink,
              FormatSpec!char fmt) /* const */
{
    import std.format;
    // Emits "[a..b)" for every interval, separated by a single space;
    // an empty set produces no output at all.
    bool needSeparator = false;
    foreach(ival; byInterval)
    {
        if(needSeparator)
            put(sink, " ");
        needSeparator = true;

        put(sink, "[");
        formatValue(sink, ival.a, fmt);
        put(sink, "..");
        formatValue(sink, ival.b, fmt);
        put(sink, ")");
    }
}
|
||
|
||
///
|
||
unittest
|
||
{
|
||
import std.conv : to;
|
||
import std.string : format;
|
||
import std.uni : unicode;
|
||
|
||
assert(unicode.Cyrillic.to!string ==
|
||
"[1024..1157) [1159..1320) [7467..7468) [7544..7545) [11744..11776) [42560..42648) [42655..42656)");
|
||
|
||
// The specs '%s' and '%d' are equivalent to the to!string call above.
|
||
assert(format("%d", unicode.Cyrillic) == unicode.Cyrillic.to!string);
|
||
|
||
assert(format("%#x", unicode.Cyrillic) ==
|
||
"[0x400..0x485) [0x487..0x528) [0x1d2b..0x1d2c) [0x1d78..0x1d79) [0x2de0..0x2e00) [0xa640..0xa698) [0xa69f..0xa6a0)");
|
||
|
||
assert(format("%#X", unicode.Cyrillic) ==
|
||
"[0X400..0X485) [0X487..0X528) [0X1D2B..0X1D2C) [0X1D78..0X1D79) [0X2DE0..0X2E00) [0XA640..0XA698) [0XA69F..0XA6A0)");
|
||
}
|
||
|
||
unittest
|
||
{
|
||
import std.string : format;
|
||
assertThrown!FormatException(format("%a", unicode.ASCII));
|
||
}
|
||
|
||
|
||
/**
|
||
Add an interval [a, b$(RPAREN) to this set.
|
||
|
||
Example:
|
||
---
|
||
CodepointSet someSet;
|
||
someSet.add('0', '5').add('A','Z'+1);
|
||
someSet.add('5', '9'+1);
|
||
assert(someSet['0']);
|
||
assert(someSet['5']);
|
||
assert(someSet['9']);
|
||
assert(someSet['Z']);
|
||
---
|
||
*/
|
||
ref add()(uint a, uint b)
|
||
{
|
||
addInterval(a, b);
|
||
return this;
|
||
}
|
||
|
||
private:
|
||
|
||
ref intersect(U)(U rhs)
    if(isCodepointSet!U)
{
    Marker cursor;
    foreach(ival; rhs.byInterval)
    {
        // drop our content below the start of this rhs interval ...
        cursor = this.dropUpTo(ival.a, cursor);
        // ... and step over the part that lies within it
        cursor = this.skipUpTo(ival.b, cursor);
    }
    // whatever is left past the last rhs interval is dropped as well
    this.dropUpTo(uint.max, cursor);
    return this;
}
|
||
|
||
ref intersect()(dchar ch)
{
    // look for an interval [a, b) that contains ch
    bool present = false;
    foreach(ival; byInterval)
    {
        if(ch >= ival.a && ch < ival.b)
        {
            present = true;
            break;
        }
    }
    // the result is either the single code point ch or the empty set
    if(present)
        this = This.init.add(ch, ch+1);
    else
        this = This.init;
    return this;
}
|
||
|
||
///
|
||
unittest
|
||
{
|
||
assert(unicode.Cyrillic.intersect('-').byInterval.empty);
|
||
}
|
||
|
||
ref sub()(dchar ch)
|
||
{
|
||
return subChar(ch);
|
||
}
|
||
|
||
// same as the above except that skip & drop parts are swapped
|
||
// Set difference: for every interval [a, b) of rhs, step over our content
// below a and drop our content below b (the skip/drop order is the reverse
// of the one used for intersection).
ref sub(U)(U rhs)
    if(isCodepointSet!U)
{
    Marker mark;
    foreach(i; rhs.byInterval)
    {
        mark = this.skipUpTo(i.a, mark);
        mark = this.dropUpTo(i.b, mark);
    }
    return this;
}
|
||
|
||
ref add(U)(U rhs)
    if(isCodepointSet!U)
{
    // Union: insert the intervals of rhs one after another, feeding the
    // marker returned by each insertion as the search hint for the next.
    Marker hint;
    for(auto ivals = rhs.byInterval; !ivals.empty; ivals.popFront())
    {
        auto ival = ivals.front;
        hint = addInterval(ival.a, ival.b, hint);
    }
    return this;
}
|
||
|
||
// end of mixin-able part
|
||
//============================================================================
|
||
public:
|
||
/**
|
||
Obtains a set that is the inversion of this set.
|
||
|
||
See the '!' $(LREF opUnary) for the same but using operators.
|
||
|
||
Example:
|
||
---
|
||
auto set = unicode.ASCII;
|
||
// union with the inverse gets all of the code points in the Unicode
|
||
assert((set | set.inverted).length == 0x110000);
|
||
// no intersection with the inverse
|
||
assert((set & set.inverted).empty);
|
||
---
|
||
*/
|
||
@property auto inverted()
|
||
{
|
||
InversionList inversion = this;
|
||
if(inversion.data.length == 0)
|
||
{
|
||
inversion.addInterval(0, lastDchar+1);
|
||
return inversion;
|
||
}
|
||
if(inversion.data[0] != 0)
|
||
genericReplace(inversion.data, 0, 0, [0]);
|
||
else
|
||
genericReplace(inversion.data, 0, 1, cast(uint[])null);
|
||
if(data[data.length-1] != lastDchar+1)
|
||
genericReplace(inversion.data,
|
||
inversion.data.length, inversion.data.length, [lastDchar+1]);
|
||
else
|
||
genericReplace(inversion.data,
|
||
inversion.data.length-1, inversion.data.length, cast(uint[])null);
|
||
|
||
return inversion;
|
||
}
|
||
|
||
/**
|
||
Generates string with D source code of unary function with name of
|
||
$(D funcName) taking a single $(D dchar) argument. If $(D funcName) is empty
|
||
the code is adjusted to be a lambda function.
|
||
|
||
The function generated tests if the $(CODEPOINT) passed
|
||
belongs to this set or not. The result is to be used with string mixin.
|
||
The intended usage area is aggressive optimization via meta programming
|
||
in parser generators and the like.
|
||
|
||
Note: Use with care for relatively small or regular sets. It
|
||
could end up being slower than just using multi-staged tables.
|
||
|
||
Example:
|
||
---
|
||
import std.stdio;
|
||
|
||
// construct set directly from [a, b$(RPAREN) intervals
|
||
auto set = CodepointSet(10, 12, 45, 65, 100, 200);
|
||
writeln(set);
|
||
writeln(set.toSourceCode("func"));
|
||
---
|
||
|
||
The above outputs something along the lines of:
|
||
---
|
||
bool func(dchar ch) @safe pure nothrow @nogc
|
||
{
|
||
if(ch < 45)
|
||
{
|
||
if(ch == 10 || ch == 11) return true;
|
||
return false;
|
||
}
|
||
else if (ch < 65) return true;
|
||
else
|
||
{
|
||
if(ch < 100) return false;
|
||
if(ch < 200) return true;
|
||
return false;
|
||
}
|
||
}
|
||
---
|
||
*/
|
||
string toSourceCode(string funcName="")
|
||
{
|
||
import std.string;
|
||
enum maxBinary = 3;
|
||
static string linearScope(R)(R ivals, string indent)
|
||
{
|
||
string result = indent~"{\n";
|
||
string deeper = indent~" ";
|
||
foreach(ival; ivals)
|
||
{
|
||
auto span = ival[1] - ival[0];
|
||
assert(span != 0);
|
||
if(span == 1)
|
||
{
|
||
result ~= format("%sif(ch == %s) return true;\n", deeper, ival[0]);
|
||
}
|
||
else if(span == 2)
|
||
{
|
||
result ~= format("%sif(ch == %s || ch == %s) return true;\n",
|
||
deeper, ival[0], ival[0]+1);
|
||
}
|
||
else
|
||
{
|
||
if(ival[0] != 0) // dchar is unsigned and < 0 is useless
|
||
result ~= format("%sif(ch < %s) return false;\n", deeper, ival[0]);
|
||
result ~= format("%sif(ch < %s) return true;\n", deeper, ival[1]);
|
||
}
|
||
}
|
||
result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
|
||
return result;
|
||
}
|
||
|
||
static string binaryScope(R)(R ivals, string indent)
|
||
{
|
||
// time to do unrolled comparisons?
|
||
if(ivals.length < maxBinary)
|
||
return linearScope(ivals, indent);
|
||
else
|
||
return bisect(ivals, ivals.length/2, indent);
|
||
}
|
||
|
||
// not used yet: if/else binary search is far better with DMD as of 2.061
|
||
// and GDC is doing fine job either way
|
||
static string switchScope(R)(R ivals, string indent)
|
||
{
|
||
string result = indent~"switch(ch){\n";
|
||
string deeper = indent~" ";
|
||
foreach(ival; ivals)
|
||
{
|
||
if(ival[0]+1 == ival[1])
|
||
{
|
||
result ~= format("%scase %s: return true;\n",
|
||
deeper, ival[0]);
|
||
}
|
||
else
|
||
{
|
||
result ~= format("%scase %s: .. case %s: return true;\n",
|
||
deeper, ival[0], ival[1]-1);
|
||
}
|
||
}
|
||
result ~= deeper~"default: return false;\n"~indent~"}\n";
|
||
return result;
|
||
}
|
||
|
||
static string bisect(R)(R range, size_t idx, string indent)
|
||
{
|
||
string deeper = indent ~ " ";
|
||
// bisect on one [a, b) interval at idx
|
||
string result = indent~"{\n";
|
||
// less branch, < a
|
||
result ~= format("%sif(ch < %s)\n%s",
|
||
deeper, range[idx][0], binaryScope(range[0..idx], deeper));
|
||
// middle point, >= a && < b
|
||
result ~= format("%selse if (ch < %s) return true;\n",
|
||
deeper, range[idx][1]);
|
||
// greater or equal branch, >= b
|
||
result ~= format("%selse\n%s",
|
||
deeper, binaryScope(range[idx+1..$], deeper));
|
||
return result~indent~"}\n";
|
||
}
|
||
|
||
string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
|
||
funcName.empty ? "function" : funcName);
|
||
auto range = byInterval.array();
|
||
// special case first bisection to be on ASCII vs beyond
|
||
auto tillAscii = countUntil!"a[0] > 0x80"(range);
|
||
if(tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
|
||
code ~= binaryScope(range, "");
|
||
else
|
||
code ~= bisect(range, tillAscii, "");
|
||
return code;
|
||
}
|
||
|
||
/**
|
||
True if this set doesn't contain any $(CODEPOINTS).
|
||
Example:
|
||
---
|
||
CodepointSet emptySet;
|
||
assert(emptySet.length == 0);
|
||
assert(emptySet.empty);
|
||
---
|
||
*/
|
||
@property bool empty() const
|
||
{
|
||
return data.length == 0;
|
||
}
|
||
|
||
private:
|
||
alias This = typeof(this);
|
||
alias Marker = size_t;
|
||
|
||
// a random-access range of integral pairs
|
||
// A random-access range of CodepointInterval pairs laid over a flat sequence
// of bounds: `slice` holds a0, b0, a1, b1, ... and [start, end) is the window
// of `slice` (in bounds, always moved in steps of 2) that this range covers.
static struct Intervals(Range)
{
    // cover the whole of sp
    this(Range sp)
    {
        slice = sp;
        start = 0;
        end = sp.length;
    }

    // cover only the bounds sp[s .. e]
    this(Range sp, size_t s, size_t e)
    {
        slice = sp;
        start = s;
        end = e;
    }

    @property auto front()const
    {
        uint a = slice[start];
        uint b = slice[start+1];
        return CodepointInterval(a, b);
    }

    //may break sorted property - but we need std.sort to access it
    //hence package protection attribute
    package @property auto front(CodepointInterval val)
    {
        slice[start] = val.a;
        slice[start+1] = val.b;
    }

    @property auto back()const
    {
        uint a = slice[end-2];
        uint b = slice[end-1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    package @property auto back(CodepointInterval val)
    {
        slice[end-2] = val.a;
        slice[end-1] = val.b;
    }

    void popFront()
    {
        start += 2;
    }

    void popBack()
    {
        end -= 2;
    }

    // idx counts intervals relative to the current front
    auto opIndex(size_t idx) const
    {
        uint a = slice[start+idx*2];
        uint b = slice[start+idx*2+1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    package auto opIndexAssign(CodepointInterval val, size_t idx)
    {
        slice[start+idx*2] = val.a;
        slice[start+idx*2+1] = val.b;
    }

    // s and e count intervals relative to the current front
    auto opSlice(size_t s, size_t e)
    {
        return Intervals(slice, s*2+start, e*2+start);
    }

    // Number of intervals in the current window. This must follow start/end
    // rather than the full slice.length, otherwise a range produced by
    // opSlice or shortened by popFront/popBack reports too many elements.
    @property size_t length()const { return (end - start)/2; }

    @property bool empty()const { return start == end; }

    @property auto save(){ return this; }
private:
    size_t start, end;
    Range slice;
}
|
||
|
||
// called after construction from intervals
|
||
// to make sure invariants hold
|
||
void sanitize()
|
||
{
|
||
if (data.length == 0)
|
||
return;
|
||
alias Ival = CodepointInterval;
|
||
//intervals wrapper for a _range_ over packed array
|
||
auto ivals = Intervals!(typeof(data[]))(data[]);
|
||
//@@@BUG@@@ can't use "a.a < b.a" see issue 12265
|
||
sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
|
||
// what follows is a variation on stable remove
|
||
// differences:
|
||
// - predicate is binary, and is tested against
|
||
// the last kept element (at 'i').
|
||
// - predicate mutates lhs (merges rhs into lhs)
|
||
size_t len = ivals.length;
|
||
size_t i = 0;
|
||
size_t j = 1;
|
||
while (j < len)
|
||
{
|
||
if (ivals[i].b >= ivals[j].a)
|
||
{
|
||
ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
|
||
j++;
|
||
}
|
||
else //unmergable
|
||
{
|
||
// check if there is a hole after merges
|
||
// (in the best case we do 0 writes to ivals)
|
||
if (j != i+1)
|
||
ivals[i+1] = ivals[j]; //copy over
|
||
i++;
|
||
j++;
|
||
}
|
||
}
|
||
len = i + 1;
|
||
for (size_t k=0; k + 1 < len; k++)
|
||
{
|
||
assert(ivals[k].a < ivals[k].b);
|
||
assert(ivals[k].b < ivals[k+1].a);
|
||
}
|
||
data.length = len * 2;
|
||
}
|
||
|
||
// special case for normal InversionList
|
||
// Removes the single code point ch from this set (no-op if it is absent).
//
// skipUpTo(ch) returns an even index `mark`. data[mark] == ch holds when ch
// was strictly inside an interval (which skipUpTo then split at ch), and
// also when ch is the first code point of an interval (no split needed).
// In both cases the interval starting at `mark` begins with ch, so ch is
// removed by moving that start one step to the right.
ref subChar(dchar ch)
{
    auto mark = skipUpTo(ch);
    // data.length is even and mark is even, so mark+1 is a valid index here;
    // the `ch < data[mark+1]` test ignores a zero-width interval at ch
    if(mark != data.length
        && data[mark] == ch && ch < data[mark+1])
    {
        if(data[mark+1] == ch+1)
        {
            // ch was the only code point left in this interval:
            // remove both bounds instead of leaving an empty interval behind
            genericReplace(data, mark, mark+2, cast(uint[])[]);
        }
        else
        {
            data[mark] = data[mark]+1;
        }
    }
    return this;
}
|
||
|
||
//
|
||
// Merges the interval [a, b) into this set.
//
// `hint` is an index into data from which the search for `a` starts.
// The returned Marker is an index into data that add(U) feeds back as the
// hint of its next call.
//
// Terminology used below: an odd index means the searched value falls
// inside an existing interval ("positive"), an even index means it falls
// into a gap ("negative").
//
// NOTE(review): the contract admits a == b, in which case a zero-width
// interval can end up stored in data - confirm whether callers rely on it.
Marker addInterval(int a, int b, Marker hint=Marker.init)
in
{
    assert(a <= b);
}
body
{
    auto range = assumeSorted(data[]);
    size_t pos;
    // number of bounds strictly below a
    size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
    if(a_idx == range.length)
    {
        // a lies beyond every stored bound: simply append [a, b)
        data.append(a, b);
        return data.length-1;
    }
    // number of bounds strictly below b
    size_t b_idx = range[a_idx..range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
    uint[3] buf = void;
    uint to_insert;
    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
    }
    if(b_idx == range.length)
    {
        // b lies beyond every stored bound
        if(a_idx & 1)// a in positive
        {
            buf[0] = b;
            to_insert = 1;
        }
        else// a in negative
        {
            buf[0] = a;
            buf[1] = b;
            to_insert = 2;
        }
        pos = genericReplace(data, a_idx, b_idx, buf[0..to_insert]);
        return pos - 1;
    }

    // first stored bound that is not below b
    uint top = data[b_idx];

    debug(std_uni)
    {
        writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
        writefln("a=%s; b=%s; top=%s;", a, b, top);
    }
    if(a_idx & 1)
    {// a in positive
        if(b_idx & 1)// b in positive
        {
            // both ends fall inside existing intervals
            buf[0] = top;
            to_insert = 1;
        }
        else // b in negative
        {
            if(top == b)
            {
                // b touches the start of the next interval: fuse with it
                assert(b_idx+1 < data.length);
                buf[0] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0..1]);
                return pos - 1;
            }
            buf[0] = b;
            buf[1] = top;
            to_insert = 2;
        }
    }
    else
    { // a in negative
        if(b_idx & 1) // b in positive
        {
            buf[0] = a;
            buf[1] = top;
            to_insert = 2;
        }
        else// b in negative
        {
            if(top == b)
            {
                // b touches the start of the next interval: fuse with it
                assert(b_idx+1 < data.length);
                buf[0] = a;
                buf[1] = data[b_idx+1];
                pos = genericReplace(data, a_idx, b_idx+2, buf[0..2]);
                return pos - 1;
            }
            buf[0] = a;
            buf[1] = b;
            buf[2] = top;
            to_insert = 3;
        }
    }
    pos = genericReplace(data, a_idx, b_idx+1, buf[0..to_insert]);
    debug(std_uni)
    {
        // one argument per format specifier; data[pos] is deliberately not
        // printed since pos is not checked against data.length here
        writefln("marker idx: %d; length=%d", pos, data.length);
        writeln("inserting ", buf[0..to_insert]);
    }
    return pos - 1;
}
|
||
|
||
//
|
||
Marker dropUpTo(uint a, Marker pos=Marker.init)
|
||
in
|
||
{
|
||
assert(pos % 2 == 0); // at start of interval
|
||
}
|
||
body
|
||
{
|
||
auto range = assumeSorted!"a<=b"(data[pos..data.length]);
|
||
if(range.empty)
|
||
return pos;
|
||
size_t idx = pos;
|
||
idx += range.lowerBound(a).length;
|
||
|
||
debug(std_uni)
|
||
{
|
||
writeln("dropUpTo full length=", data.length);
|
||
writeln(pos,"~~~", idx);
|
||
}
|
||
if(idx == data.length)
|
||
return genericReplace(data, pos, idx, cast(uint[])[]);
|
||
if(idx & 1)
|
||
{ // a in positive
|
||
//[--+++----++++++----+++++++------...]
|
||
// |<---si s a t
|
||
genericReplace(data, pos, idx, [a]);
|
||
}
|
||
else
|
||
{ // a in negative
|
||
//[--+++----++++++----+++++++-------+++...]
|
||
// |<---si s a t
|
||
genericReplace(data, pos, idx, cast(uint[])[]);
|
||
}
|
||
return pos;
|
||
}
|
||
|
||
//
|
||
// Starting at index pos, finds where the value a falls among the stored
// bounds and returns an even index into data:
//  - data.length when no stored bound from pos on is greater than a;
//  - the index of the interval start when a equals the start of an interval;
//  - when a lies strictly inside an interval, that interval is split in two
//    at a (the bounds a, a are inserted) and the index of the second half's
//    start is returned;
//  - otherwise (a falls into a gap) the index of the next interval start.
Marker skipUpTo(uint a, Marker pos=Marker.init)
|
||
out(result)
|
||
{
|
||
assert(result % 2 == 0);// always start of interval
|
||
//(may be 0-width after-split)
|
||
}
|
||
body
|
||
{
|
||
assert(data.length % 2 == 0);
|
||
// with the "a<=b" predicate lowerBound yields the bounds that are <= a,
|
||
// so idx is the index of the first bound greater than a
|
||
auto range = assumeSorted!"a<=b"(data[pos..data.length]);
|
||
size_t idx = pos+range.lowerBound(a).length;
|
||
|
||
||
if(idx >= data.length) // could have Marker point to recently removed stuff
|
||
return data.length;
|
||
|
||
||
if(idx & 1)// inside of interval, check for split
|
||
{
|
||
|
||
||
uint top = data[idx];
|
||
// NOTE(review): data[idx] is the first bound greater than a, so for sorted
|
||
// data this comparison looks like it can never be true - confirm before relying on it
|
||
if(top == a)// no need to split, it's end
|
||
return idx+1;
|
||
uint start = data[idx-1];
|
||
if(a == start)
|
||
return idx-1;
|
||
// split it up
|
||
genericReplace(data, idx, idx+1, [a, a, top]);
|
||
return idx+1; // avoid odd index
|
||
}
|
||
return idx;
|
||
}
|
||
|
||
CowArray!SP data;
|
||
};
|
||
|
||
// Exercises the documentation examples of InversionList / CodepointSet.
@system unittest
{
    // test examples
    import std.algorithm, std.typecons;
    auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
    // the comparison has to be asserted, otherwise its result is discarded
    assert(set.byInterval.equalS([tuple('A', 'E'), tuple('a', 'e')]));
    set = unicode.ASCII;
    assert(set.byCodepoint.equalS(iota(0, 0x80)));
    set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach(v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach(v; 'а'..'я'+1)
        assert(set[v]);
    //specific order is not required, intervals may intersect
    auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    assert(set2.byInterval.equal(set.byInterval));

    auto gothic = unicode.Gothic;
    // Gothic letter ahsa
    assert(gothic['\U00010330']);
    // no ascii in Gothic obviously
    assert(!gothic['$']);

    CodepointSet emptySet;
    assert(emptySet.length == 0);
    assert(emptySet.empty);

    set = unicode.ASCII;
    // union with the inverse gets all of code points in the Unicode
    assert((set | set.inverted).length == 0x110000);
    // no intersection with inverse
    assert((set & set.inverted).empty);

    CodepointSet someSet;
    someSet.add('0', '5').add('A','Z'+1);
    someSet.add('5', '9'+1);
    assert(someSet['0']);
    assert(someSet['5']);
    assert(someSet['9']);
    assert(someSet['Z']);

    auto lower = unicode.LowerCase;
    auto upper = unicode.UpperCase;
    auto ascii = unicode.ASCII;
    assert((lower & upper).empty); // no intersection
    auto lowerASCII = lower & ascii;
    assert(lowerASCII.byCodepoint.equalS(iota('a', 'z'+1)));
    // throw away all of the lowercase ASCII
    assert((ascii - lower).length == 128 - 26);
    auto onlyOneOf = lower ~ ascii;
    assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
    assert(onlyOneOf['$']); // ASCII and not lowercase
    assert(!onlyOneOf['a']); // ASCII and lowercase
    assert(onlyOneOf['я']); // not ASCII but lowercase

    auto noLetters = ascii - (lower | upper);
    assert(noLetters.length == 128 - 26*2);
    import std.conv;
    assert(unicode.ASCII.to!string() == "[0..128)");
}
|
||
|
||
// Pedantic byte-by-byte reader for CTFE and aligned-access-only architectures.
// Assembles the idx-th packed 24-bit value stored at ptr from three single-byte
// loads, honouring the byte order of the target.
@trusted uint safeRead24(const ubyte* ptr, size_t idx) pure nothrow
{
    immutable base = idx * 3; // each value occupies 3 consecutive bytes
    immutable uint b0 = ptr[base];
    immutable uint b1 = ptr[base+1];
    immutable uint b2 = ptr[base+2];
    version(LittleEndian)
        return b0 | (b1 << 8) | (b2 << 16);
    else
        return (b0 << 16) | (b1 << 8) | b2;
}
|
||
|
||
// Pedantic byte-by-byte writer for CTFE and aligned-access-only architectures.
// Stores the low 24 bits of val into the idx-th packed slot at ptr;
// bits 24..31 of val are discarded.
@trusted void safeWrite24(ubyte* ptr, uint val, size_t idx) pure nothrow
{
    immutable base = idx * 3; // each value occupies 3 consecutive bytes
    immutable ubyte low  = val & 0xFF;
    immutable ubyte mid  = (val>>8) & 0xFF;
    immutable ubyte high = (val>>16) & 0xFF;
    version(LittleEndian)
    {
        ptr[base]   = low;
        ptr[base+1] = mid;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base]   = high;
        ptr[base+1] = mid;
        ptr[base+2] = low;
    }
}
|
||
|
||
// Unaligned x86-like reader: fetches the idx-th packed 24-bit value with a
// single 32-bit load from ptr + 3*idx and strips the byte that does not belong
// to the slot. The load covers 4 bytes, i.e. one byte beyond the 3-byte slot.
@trusted uint unalignedRead24(const ubyte* ptr, size_t idx) pure nothrow
{
    immutable word = *cast(uint*)(ptr + 3*idx);
    version(LittleEndian)
        return word & 0xFF_FFFF; // the foreign byte sits in the top 8 bits
    else
        return word >> 8;        // the foreign byte sits in the low 8 bits
}
|
||
|
||
// Unaligned x86-like writer: stores the low 24 bits of val into the idx-th
// packed slot at ptr using one 32-bit read-modify-write. The byte that follows
// the slot is read back and rewritten unchanged.
//
// val is truncated to 24 bits on every target, matching safeWrite24. Without
// the mask on little-endian, bits 24..31 of val would be OR-ed into the first
// byte of the neighbouring slot.
@trusted void unalignedWrite24(ubyte* ptr, uint val, size_t idx) pure nothrow
{
    uint* dest = cast(uint*)(ptr + 3*idx);
    version(LittleEndian)
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    else
        *dest = (val<<8) | (*dest & 0xFF); // the shift already drops bits 24..31
}
|
||
|
||
// Reads the idx-th packed 24-bit value at ptr. The fast unaligned load is used
// only at run time on targets that declare hasUnalignedReads; CTFE and all
// other targets go through the byte-wise safeRead24.
uint read24(const ubyte* ptr, size_t idx) pure nothrow
{
    static if(!hasUnalignedReads)
        return safeRead24(ptr, idx);
    else
    {
        if(__ctfe)
            return safeRead24(ptr, idx);
        return unalignedRead24(ptr, idx);
    }
}
|
||
|
||
// Writes val into the idx-th packed 24-bit slot at ptr. The fast unaligned
// store is used only at run time on targets that declare hasUnalignedReads;
// CTFE and all other targets go through the byte-wise safeWrite24.
void write24(ubyte* ptr, uint val, size_t idx) pure nothrow
{
    static if(!hasUnalignedReads)
        safeWrite24(ptr, val, idx);
    else
    {
        if(__ctfe)
            safeWrite24(ptr, val, idx);
        else
            unalignedWrite24(ptr, val, idx);
    }
}
|
||
/*
    Reference-counted array of uint with copy-on-write semantics.

    Layout: the payload occupies data[0..$-1] and the last slot of data holds
    the reference count shared by all copies that point to the same memory.
    An empty array holds no memory and therefore has no counter.

    Allocation is delegated to the policy SP, which must provide
    alloc, realloc, destroy and append.
*/
@trusted struct CowArray(SP=GcPolicy)
{
    // Adopts arr as the payload and appends the ref-count slot
    // (initialized to 1) by means of SP.append.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds an array holding the elements of a range with known length.
    this(Range)(Range range)
        if(isInputRange!Range && hasLength!Range)
    {
        length = range.length;
        // An empty range leaves data unallocated; slicing data[0..$-1]
        // would then wrap '$-1' around and violate the array bounds.
        if(!empty)
            copy(range, data[0..$-1]);
    }

    // Builds an array holding the elements of a forward range without length;
    // the range is walked once (on a saved copy) to find out its size.
    this(Range)(Range range)
        if(isForwardRange!Range && !hasLength!Range)
    {
        auto len = walkLength(range.save);
        length = len;
        // see the other constructor: nothing to copy into an empty array
        if(!empty)
            copy(range, data[0..$-1]);
    }

    // A copy shares the memory, so it only bumps the shared counter.
    this(this)
    {
        if(!empty)
        {
            refCount = refCount + 1;
        }
    }

    // Drops this reference; the last owner releases the memory via SP.destroy.
    ~this()
    {
        if(!empty)
        {
            auto cnt = refCount;
            if(cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size (the last slot is the ref-count)
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    // Resizes the payload to len elements (+ an extra slot for ref-count).
    // Shared memory is never resized in place: this instance detaches
    // onto a fresh allocation instead.
    @property void length(size_t len)
    {
        if(len == 0)
        {
            if(!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if(empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        auto cur_cnt = refCount;
        if(cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0..to_copy], new_data[0..to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access; indexes the underlying storage directly.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access; detaches from shared memory first so that
    // other copies do not observe the change.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if(cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice; since the caller may write through it,
    // shared memory is duplicated before the slice is handed out.
    auto opSlice(size_t from, size_t to)
    {
        if(!empty)
        {
            auto cnt = refCount;
            if(cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice; no duplication is needed.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // Read-only slice of the whole payload, i.e. everything before the ref count.
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of a range with known length.
    void append(Range)(Range range)
        if(isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length..nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // Nothing to add. Returning early also keeps '$-1' below from
        // wrapping around when the array itself is still empty.
        if(val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Two arrays are equal when both are empty or their payloads match;
    // the ref-count slot takes no part in the comparison.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if(empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this instance's share of the memory and leaves it empty.
    void freeThisReference()
    {
        auto count = refCount;
        if(count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches from shared memory: moves the payload into a private
    // allocation whose own counter is 1. count is the current shared count.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    body
    {
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}
|
||
|
||
// Exercises CowArray with both allocation policies: indexing, copy-on-write
// on assignment, resizing, appending, slicing and equality.
@trusted unittest// Uint24 tests //@@@BUG@@ iota is system ?!
|
||
{
|
||
// Mutates an array received by reference while a copy (u24_c) is alive;
// the copy must keep the old contents throughout.
void funcRef(T)(ref T u24)
|
||
{
|
||
u24.length = 2;
|
||
u24[1] = 1024;
|
||
T u24_c = u24;
|
||
assert(u24[1] == 1024);
|
||
u24.length = 0;
|
||
assert(u24.empty);
|
||
u24.append([1, 2]);
|
||
assert(equalS(u24[], [1, 2]));
|
||
u24.append(111);
|
||
assert(equalS(u24[], [1, 2, 111]));
|
||
assert(!u24_c.empty && u24_c[1] == 1024);
|
||
u24.length = 3;
|
||
copy(iota(0, 3), u24[]);
|
||
assert(equalS(u24[], iota(0, 3)));
|
||
assert(u24_c[1] == 1024);
|
||
}
|
||
|
||
// Receives the array by value and checks that copies stay independent
// once one of them is modified.
void func2(T)(T u24)
|
||
{
|
||
T u24_2 = u24;
|
||
T u24_3;
|
||
u24_3 = u24_2;
|
||
assert(u24_2 == u24_3);
|
||
assert(equalS(u24[], u24_2[]));
|
||
assert(equalS(u24_2[], u24_3[]));
|
||
funcRef(u24_3);
|
||
|
||
assert(equalS(u24_3[], iota(0, 3)));
|
||
assert(!equalS(u24_2[], u24_3[]));
|
||
assert(equalS(u24_2[], u24[]));
|
||
u24_2 = u24_3;
|
||
assert(equalS(u24_2[], iota(0, 3)));
|
||
// to test that passed arg is intact outside
|
||
// plus try out opEquals
|
||
u24 = u24_3;
|
||
u24 = T.init;
|
||
u24_3 = T.init;
|
||
assert(u24.empty);
|
||
assert(u24 == u24_3);
|
||
assert(u24 != u24_2);
|
||
}
|
||
|
||
foreach(Policy; TypeTuple!(GcPolicy, ReallocPolicy))
|
||
{
|
||
alias Range = typeof(CowArray!Policy.init[]);
|
||
alias U24A = CowArray!Policy;
|
||
static assert(isForwardRange!Range);
|
||
static assert(isBidirectionalRange!Range);
|
||
static assert(isOutputRange!(Range, uint));
|
||
static assert(isRandomAccessRange!(Range));
|
||
|
||
auto arr = U24A([42u, 36, 100]);
|
||
assert(arr[0] == 42);
|
||
assert(arr[1] == 36);
|
||
arr[0] = 72;
|
||
arr[1] = 0xFE_FEFE;
|
||
assert(arr[0] == 72);
|
||
assert(arr[1] == 0xFE_FEFE);
|
||
assert(arr[2] == 100);
|
||
U24A arr2 = arr;
|
||
assert(arr2[0] == 72);
|
||
arr2[0] = 11;
|
||
// test COW-ness
|
||
assert(arr[0] == 72);
|
||
assert(arr2[0] == 11);
|
||
// set this to about 100M to stress-test COW memory management
|
||
foreach(v; 0..10_000)
|
||
func2(arr);
|
||
assert(equalS(arr[], [72, 0xFE_FEFE, 100]));
|
||
|
||
auto r2 = U24A(iota(0, 100));
|
||
assert(equalS(r2[], iota(0, 100)), text(r2[]));
|
||
copy(iota(10, 170, 2), r2[10..90]);
|
||
assert(equalS(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
|
||
, text(r2[]));
|
||
}
|
||
}
|
||
|
||
// Test-only list of the InversionList instantiations (one per allocation
// policy) that the set unittests iterate over.
version(unittest)
|
||
{
|
||
private alias AllSets = TypeTuple!(InversionList!GcPolicy, InversionList!ReallocPolicy);
|
||
}
|
||
|
||
// Checks interval insertion (add) for overlapping, adjacent and disjoint
// intervals, then the skipUpTo/dropUpTo helpers, for every set type in AllSets.
@trusted unittest// core set primitives test
|
||
{
|
||
foreach(CodeList; AllSets)
|
||
{
|
||
CodeList a;
|
||
//"plug a hole" test
|
||
a.add(10, 20).add(25, 30).add(15, 27);
|
||
assert(a == CodeList(10, 30), text(a));
|
||
|
||
auto x = CodeList.init;
|
||
x.add(10, 20).add(30, 40).add(50, 60);
|
||
|
||
a = x;
|
||
a.add(20, 49);//[10, 49) [50, 60)
|
||
assert(a == CodeList(10, 49, 50 ,60));
|
||
|
||
a = x;
|
||
a.add(20, 50);
|
||
assert(a == CodeList(10, 60), text(a));
|
||
|
||
// simple unions, mostly edge effects
|
||
x = CodeList.init;
|
||
x.add(10, 20).add(40, 60);
|
||
|
||
a = x;
|
||
a.add(10, 25); //[10, 25) [40, 60)
|
||
assert(a == CodeList(10, 25, 40, 60));
|
||
|
||
a = x;
|
||
a.add(5, 15); //[5, 20) [40, 60)
|
||
assert(a == CodeList(5, 20, 40, 60));
|
||
|
||
a = x;
|
||
a.add(0, 10); // [0, 20) [40, 60)
|
||
assert(a == CodeList(0, 20, 40, 60));
|
||
|
||
a = x;
|
||
a.add(0, 5); // prepend
|
||
assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
|
||
|
||
a = x;
|
||
a.add(5, 20);
|
||
assert(a == CodeList(5, 20, 40, 60));
|
||
|
||
a = x;
|
||
a.add(3, 37);
|
||
assert(a == CodeList(3, 37, 40, 60));
|
||
|
||
a = x;
|
||
a.add(37, 65);
|
||
assert(a == CodeList(10, 20, 37, 65));
|
||
|
||
// some tests on helpers for set intersection
|
||
x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
|
||
a = x;
|
||
|
||
auto m = a.skipUpTo(60);
|
||
a.dropUpTo(110, m);
|
||
assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
|
||
|
||
a = x;
|
||
a.dropUpTo(100);
|
||
assert(a == CodeList(100, 120), text(a.data[]));
|
||
|
||
a = x;
|
||
m = a.skipUpTo(50);
|
||
a.dropUpTo(140, m);
|
||
assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
|
||
a = x;
|
||
a.dropUpTo(60);
|
||
assert(a == CodeList(100, 120), text(a.data[]));
|
||
}
|
||
}
|
||
|
||
|
||
//test constructor to work with any order of intervals
|
||
// Checks that the CodepointSet constructor accepts interval bounds in any
// order, including overlapping and adjacent intervals, and normalizes them.
@system unittest //@@@BUG@@@ iota is @system
|
||
{
|
||
import std.conv, std.range, std.algorithm;
|
||
//ensure constructor handles bad ordering and overlap
|
||
auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
|
||
foreach(ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
|
||
assert(ch in c1, to!string(ch));
|
||
|
||
//contiguous
|
||
assert(CodepointSet(1000, 1006, 1006, 1009)
|
||
.byInterval.equal([tuple(1000, 1009)]));
|
||
//contains
|
||
assert(CodepointSet(900, 1200, 1000, 1100)
|
||
.byInterval.equal([tuple(900, 1200)]));
|
||
//intersect left
|
||
assert(CodepointSet(900, 1100, 1000, 1200)
|
||
.byInterval.equal([tuple(900, 1200)]));
|
||
//intersect right
|
||
assert(CodepointSet(1000, 1200, 900, 1100)
|
||
.byInterval.equal([tuple(900, 1200)]));
|
||
|
||
//ditto with extra items at end
|
||
assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
|
||
.byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
|
||
assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
|
||
.byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
|
||
|
||
//"plug a hole" test
|
||
auto c2 = CodepointSet(20, 40,
|
||
60, 80, 100, 140, 150, 200,
|
||
40, 60, 80, 100, 140, 150
|
||
);
|
||
assert(c2.byInterval.equal([tuple(20, 200)]));
|
||
|
||
auto c3 = CodepointSet(
|
||
20, 40, 60, 80, 100, 140, 150, 200,
|
||
0, 10, 15, 100, 10, 20, 200, 220);
|
||
assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
|
||
}
|
||
|
||
|
||
// Checks union (|), intersection (&), difference (-) and symmetric
// difference (~) of whole sets, in both operand orders, for every set type
// in AllSets.
@trusted unittest
|
||
{ // full set operations
|
||
foreach(CodeList; AllSets)
|
||
{
|
||
CodeList a, b, c, d;
|
||
|
||
//"plug a hole"
|
||
a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
|
||
b.add(40, 60).add(80, 100).add(140, 150);
|
||
c = a | b;
|
||
d = b | a;
|
||
assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
|
||
c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
|
||
d = b | a;
|
||
assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
b = CodeList.init.add(10, 20).add(30,100).add(145,200);
|
||
c = a | b;//[10, 140) [145, 200)
|
||
d = b | a;
|
||
assert(c == CodeList(10, 140, 145, 200));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
|
||
c = a | b;//[0, 140) [150, 220)
|
||
d = b | a;
|
||
assert(c == CodeList(0, 140, 150, 220));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
|
||
a = CodeList.init.add(20, 40).add(60, 80);
|
||
b = CodeList.init.add(25, 35).add(65, 75);
|
||
c = a & b;
|
||
d = b & a;
|
||
assert(c == CodeList(25, 35, 65, 75), text(c));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
|
||
b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
|
||
c = a & b;
|
||
d = b & a;
|
||
assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
|
||
assert(c == d, text(c," vs ", d));
|
||
|
||
a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
|
||
b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
|
||
c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
|
||
d = b & a;
|
||
|
||
assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
|
||
assert(c == d, text(c, " vs ",d));
|
||
assert((c & a) == c);
|
||
assert((d & b) == d);
|
||
assert((c & d) == d);
|
||
|
||
b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
|
||
c = a & b;
|
||
d = b & a;
|
||
assert(c == CodeList(150, 200), text(c));
|
||
assert(c == d, text(c, " vs ",d));
|
||
assert((c & a) == c);
|
||
assert((d & b) == d);
|
||
assert((c & d) == d);
|
||
|
||
assert((a & a) == a);
|
||
assert((b & b) == b);
|
||
|
||
a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
|
||
b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
|
||
c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
|
||
d = b - a;// [40, 60) [80, 100) [200, 300)
|
||
assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
|
||
assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
|
||
assert(c - d == c, text(c-d, " vs ", c));
|
||
assert(d - c == d, text(d-c, " vs ", d));
|
||
assert(c - c == CodeList.init);
|
||
assert(d - d == CodeList.init);
|
||
|
||
a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
|
||
b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
|
||
c = a - b;// [160, 190)
|
||
d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
|
||
assert(c == CodeList(160, 190), text(c));
|
||
assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
|
||
assert(c - d == c, text(c-d, " vs ", c));
|
||
assert(d - c == d, text(d-c, " vs ", d));
|
||
assert(c - c == CodeList.init);
|
||
assert(d - d == CodeList.init);
|
||
|
||
a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
|
||
b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
|
||
c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
|
||
d = b ~ a;
|
||
assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
|
||
text(c));
|
||
assert(c == d, text(c, " vs ", d));
|
||
}
|
||
}
|
||
|
||
|
||
@system:
|
||
// Set operators with a single code point as the right-hand operand:
// '-' removes it from the set, '&' keeps only that code point.
unittest// vs single dchar
|
||
{
|
||
CodepointSet a = CodepointSet(10, 100, 120, 200);
|
||
assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
|
||
assert((a & 'B') == CodepointSet(66, 67));
|
||
}
|
||
|
||
// Checks byInterval / byCodepoint iteration and membership via opIndex,
// including iteration over an empty set.
unittest// iteration & opIndex
|
||
{
|
||
import std.typecons;
|
||
foreach(CodeList; TypeTuple!(InversionList!(ReallocPolicy)))
|
||
{
|
||
auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
|
||
auto a = CodeList('A','N','a', 'n');
|
||
assert(equalS(a.byInterval,
|
||
[tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
|
||
), text(a.byInterval));
|
||
|
||
// same @@@BUG as in issue 8949 ?
|
||
version(bug8949)
|
||
{
|
||
assert(equalS(retro(a.byInterval),
|
||
[tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
|
||
), text(retro(a.byInterval)));
|
||
}
|
||
auto achr = a.byCodepoint;
|
||
assert(equalS(achr, arr), text(a.byCodepoint));
|
||
foreach(ch; a.byCodepoint)
|
||
assert(a[ch]);
|
||
auto x = CodeList(100, 500, 600, 900, 1200, 1500);
|
||
assert(equalS(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
|
||
foreach(ch; x.byCodepoint)
|
||
assert(x[ch]);
|
||
// the round trip through byInterval is only compiled when CodeList is CodepointSet
static if(is(CodeList == CodepointSet))
|
||
{
|
||
auto y = CodeList(x.byInterval);
|
||
assert(equalS(x.byInterval, y.byInterval));
|
||
}
|
||
assert(equalS(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
|
||
assert(equalS(CodepointSet.init.byCodepoint, cast(dchar[])[]));
|
||
}
|
||
}
|
||
|
||
//============================================================================
|
||
// Generic Trie template and various ways to build it
|
||
//============================================================================
|
||
|
||
// Debug helper producing a shortened array dump: x is rendered with text();
// anything longer than 32 elements is reduced to its first 16 and last 16
// elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    immutable len = x.length;
    if(len <= 32)
        return text(x);
    auto head = x[0..16];
    auto tail = x[len-16..len];
    return text(head,"~...~", tail);
}
|
||
|
||
/**
|
||
Maps $(D Key) to a suitable integer index within the range of $(D size_t).
|
||
The mapping is constructed by applying predicates from $(D Prefix) left to right
|
||
and concatenating the resulting bits.
|
||
|
||
The first (leftmost) predicate defines the most significant bits of
|
||
the resulting index.
|
||
*/
|
||
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if(isValidPrefixForTrie!(Key, Prefix))
    {
        // at least one predicate is required to form an index
        static assert(Prefix.length != 0);
        size_t packed;
        foreach(level, _; Prefix)
        {
            // make room for the bits of this level below those gathered so far
            static if(level != 0)
                packed <<= Prefix[level].bitSize;
            packed |= Prefix[level](key);
        }
        return packed;
    }
}
|
||
|
||
/*
|
||
$(D TrieBuilder) is a type used for incremental construction
|
||
of $(LREF Trie)s.
|
||
|
||
See $(LREF buildTrie) for generic helpers built on top of it.
|
||
*/
|
||
// Incremental builder of a multi-level lookup table (Trie).
// Values must arrive in non-decreasing index order; gaps are filled with the
// filler value given to the constructor. Each level of the table is split into
// pages of 2^^bitSize entries; a completed page that duplicates an earlier page
// of the same level is reused rather than stored again.
@trusted struct TrieBuilder(Value, Key, Args...)
|
||
if(isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
|
||
{
|
||
private:
|
||
// last index is not stored in table, it is used as an offset to values in a block.
|
||
static if(is(Value == bool))// always pack bool
|
||
alias V = BitPacked!(Value, 1);
|
||
else
|
||
alias V = Value;
|
||
// product of the page sizes of all predicates, i.e. 2^^(sum of bitSize)
static auto deduceMaxIndex(Preds...)()
|
||
{
|
||
size_t idx = 1;
|
||
foreach(v; Preds)
|
||
idx *= 2^^v.bitSize;
|
||
return idx;
|
||
}
|
||
|
||
static if(is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
|
||
{
|
||
alias Prefix = Args[1..$];
|
||
enum lastPageSize = 2^^Prefix[$-1].bitSize;
|
||
enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
|
||
// upper bound rounded up to a whole number of last-level pages
enum roughedMaxIndex =
|
||
(translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
|
||
// check wrap around - if wrapped, use the default deduction rule
|
||
enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
|
||
deduceMaxIndex!(Prefix)() : roughedMaxIndex;
|
||
}
|
||
else
|
||
{
|
||
alias Prefix = Args;
|
||
enum maxIndex = deduceMaxIndex!(Prefix)();
|
||
}
|
||
|
||
alias getIndex = mapTrieIndex!(Prefix);
|
||
|
||
enum lastLevel = Prefix.length-1;
|
||
// NOTE(review): idx_ones is initialized in the constructor but not read anywhere in this struct
struct ConstructState
|
||
{
|
||
size_t idx_zeros, idx_ones;
|
||
}
|
||
// iteration over levels of Trie, each indexes its own level and thus a shortened domain
|
||
size_t[Prefix.length] indices;
|
||
// default filler value to use
|
||
Value defValue;
|
||
// this is a full-width index of next item
|
||
size_t curIndex;
|
||
// all-zeros page index, all-ones page index (+ indicator if there is such a page)
|
||
ConstructState[Prefix.length] state;
|
||
// the table being constructed
|
||
MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
|
||
|
||
@disable this();
|
||
|
||
//shortcut for index variable at level 'level'
|
||
@property ref idx(size_t level)(){ return indices[level]; }
|
||
|
||
// this function assumes no holes in the input so
|
||
// indices are going one by one
|
||
// Appends numVals copies of val at the current position of the given level,
// spilling completed pages to the level above.
void addValue(size_t level, T)(T val, size_t numVals)
|
||
{
|
||
alias j = idx!level;
|
||
enum pageSize = 1<<Prefix[level].bitSize;
|
||
if(numVals == 0)
|
||
return;
|
||
auto ptr = table.slice!(level);
|
||
if(numVals == 1)
|
||
{
|
||
static if(level == Prefix.length-1)
|
||
ptr[j] = val;
|
||
else
|
||
{// can incur narrowing conversion
|
||
assert(j < ptr.length);
|
||
ptr[j] = force!(typeof(ptr[j]))(val);
|
||
}
|
||
j++;
|
||
if(j % pageSize == 0)
|
||
spillToNextPage!level(ptr);
|
||
return;
|
||
}
|
||
// longer row of values
|
||
// get to the next page boundary
|
||
size_t nextPB = (j + pageSize) & ~(pageSize-1);
|
||
size_t n = nextPB - j;// can fill right in this page
|
||
if(numVals < n) //fits in current page
|
||
{
|
||
ptr[j..j+numVals] = val;
|
||
j += numVals;
|
||
return;
|
||
}
|
||
// NOTE(review): for level 0 a run with numVals >= n (e.g. one that ends exactly
// on the page boundary) reaches this point and is not stored, because the block
// below is compiled out - confirm callers never produce such a run.
static if(level != 0)//on the first level it always fits
|
||
{
|
||
numVals -= n;
|
||
//write till the end of current page
|
||
ptr[j..j+n] = val;
|
||
j += n;
|
||
//spill to the next page
|
||
spillToNextPage!level(ptr);
|
||
// page at once loop
|
||
// shortcut: when an all-zeros page is already known and val is T.init,
// whole pages are represented by repeating that page's index one level up
if(state[level].idx_zeros != size_t.max && val == T.init)
|
||
{
|
||
alias NextIdx = typeof(table.slice!(level-1)[0]);
|
||
addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
|
||
numVals/pageSize);
|
||
ptr = table.slice!level; //table structure might have changed
|
||
numVals %= pageSize;
|
||
}
|
||
else
|
||
{
|
||
while(numVals >= pageSize)
|
||
{
|
||
numVals -= pageSize;
|
||
ptr[j..j+pageSize] = val;
|
||
j += pageSize;
|
||
spillToNextPage!level(ptr);
|
||
}
|
||
}
|
||
if(numVals)
|
||
{
|
||
// the leftovers, an incomplete page
|
||
ptr[j..j+numVals] = val;
|
||
j += numVals;
|
||
}
|
||
}
|
||
}
|
||
|
||
void spillToNextPage(size_t level, Slice)(ref Slice ptr)
|
||
{
|
||
// last level (i.e. topmost) has 1 "page"
|
||
// thus it need not to add a new page on upper level
|
||
static if(level != 0)
|
||
spillToNextPageImpl!(level)(ptr);
|
||
}
|
||
|
||
// this can re-use the current page if duplicate or allocate a new one
|
||
// it also makes sure that previous levels point to the correct page in this level
|
||
void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
|
||
{
|
||
alias NextIdx = typeof(table.slice!(level-1)[0]);
|
||
NextIdx next_lvl_index;
|
||
enum pageSize = 1<<Prefix[level].bitSize;
|
||
assert(idx!level % pageSize == 0);
|
||
// start of the page that has just been completed
auto last = idx!level-pageSize;
|
||
auto slice = ptr[idx!level - pageSize..idx!level];
|
||
size_t j;
|
||
// linear search for an earlier page with the same contents
for(j=0; j<last; j+=pageSize)
|
||
{
|
||
if(ptr[j..j+pageSize] == slice)
|
||
{
|
||
// get index to it, reuse ptr space for the next block
|
||
next_lvl_index = force!NextIdx(j/pageSize);
|
||
version(none)
|
||
{
|
||
writefln("LEVEL(%s) page maped idx: %s: 0..%s ---> [%s..%s]"
|
||
,level
|
||
,indices[level-1], pageSize, j, j+pageSize);
|
||
writeln("LEVEL(", level
|
||
, ") mapped page is: ", slice, ": ", arrayRepr(ptr[j..j+pageSize]));
|
||
writeln("LEVEL(", level
|
||
, ") src page is :", ptr, ": ", arrayRepr(slice[0..pageSize]));
|
||
}
|
||
idx!level -= pageSize; // reuse this page, it is duplicate
|
||
break;
|
||
}
|
||
}
|
||
// no duplicate found: the completed page stays and a fresh one is added after it
if(j == last)
|
||
{
|
||
// NOTE(review): no goto in this struct targets L_allocate_page or L_know_index
L_allocate_page:
|
||
next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
|
||
// remember the first all-zeros page of this level
if(state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
|
||
{
|
||
state[level].idx_zeros = next_lvl_index;
|
||
}
|
||
// allocate next page
|
||
version(none)
|
||
{
|
||
writefln("LEVEL(%s) page allocated: %s"
|
||
, level, arrayRepr(slice[0..pageSize]));
|
||
writefln("LEVEL(%s) index: %s ; page at this index %s"
|
||
, level
|
||
, next_lvl_index
|
||
, arrayRepr(
|
||
table.slice!(level)
|
||
[pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
|
||
));
|
||
}
|
||
table.length!level = table.length!level + pageSize;
|
||
}
|
||
L_know_index:
|
||
// for the previous level, values are indices to the pages in the current level
|
||
addValue!(level-1)(next_lvl_index, 1);
|
||
ptr = table.slice!level; //re-load the slice after moves
|
||
}
|
||
|
||
// idx - full-width index to fill with v (full-width index != key)
|
||
// fills everything in the range of [curIndex, idx) with filler
|
||
void putAt(size_t idx, Value v)
|
||
{
|
||
assert(idx >= curIndex);
|
||
size_t numFillers = idx - curIndex;
|
||
addValue!lastLevel(defValue, numFillers);
|
||
addValue!lastLevel(v, 1);
|
||
curIndex = idx + 1;
|
||
}
|
||
|
||
// ditto, but sets the range of [idxA, idxB) to v
|
||
void putRangeAt(size_t idxA, size_t idxB, Value v)
|
||
{
|
||
assert(idxA >= curIndex);
|
||
assert(idxB >= idxA);
|
||
size_t numFillers = idxA - curIndex;
|
||
addValue!lastLevel(defValue, numFillers);
|
||
addValue!lastLevel(v, idxB - idxA);
|
||
curIndex = idxB; // open-right
|
||
}
|
||
|
||
enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
|
||
"duplicate key->value mapping";
|
||
|
||
public:
|
||
/**
|
||
Construct a builder, where $(D filler) is a value
|
||
to indicate empty slots (or "not found" condition).
|
||
*/
|
||
this(Value filler)
|
||
{
|
||
curIndex = 0;
|
||
defValue = filler;
|
||
// zeros-page index, ones-page index
|
||
foreach(ref v; state)
|
||
v = ConstructState(size_t.max, size_t.max);
|
||
table = typeof(table)(indices);
|
||
// one page per level is a bootstrap minimum
|
||
foreach(i, Pred; Prefix)
|
||
table.length!i = (1<<Pred.bitSize);
|
||
}
|
||
|
||
/**
|
||
Put a value $(D v) into interval as
|
||
mapped by keys from $(D a) to $(D b).
|
||
All slots prior to $(D a) are filled with
|
||
the default filler.
|
||
*/
|
||
void putRange(Key a, Key b, Value v)
|
||
{
|
||
auto idxA = getIndex(a), idxB = getIndex(b);
|
||
// indexes of key should always grow
|
||
enforce(idxB >= idxA && idxA >= curIndex, errMsg);
|
||
putRangeAt(idxA, idxB, v);
|
||
}
|
||
|
||
/**
|
||
Put a value $(D v) into slot mapped by $(D key).
|
||
All slots prior to $(D key) are filled with the
|
||
default filler.
|
||
*/
|
||
void putValue(Key key, Value v)
|
||
{
|
||
auto idx = getIndex(key);
|
||
enforce(idx >= curIndex, text(errMsg, " ", idx));
|
||
putAt(idx, v);
|
||
}
|
||
|
||
/// Finishes construction of Trie, yielding an immutable Trie instance.
|
||
auto build()
|
||
{
|
||
static if(maxIndex != 0) // doesn't cover full range of size_t
|
||
{
|
||
assert(curIndex <= maxIndex);
|
||
// pad the rest of the domain with the filler
addValue!lastLevel(defValue, maxIndex - curIndex);
|
||
}
|
||
else
|
||
{
|
||
if(curIndex != 0 // couldn't wrap around
|
||
|| (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
|
||
{
|
||
addValue!lastLevel(defValue, size_t.max - curIndex);
|
||
addValue!lastLevel(defValue, 1);
|
||
}
|
||
// else curIndex already completed the full range of size_t by wrapping around
|
||
}
|
||
return Trie!(V, Key, maxIndex, Prefix)(table);
|
||
}
|
||
}
|
||
|
||
/*
|
||
$(P A generic Trie data-structure for a fixed number of stages.
|
||
The design goal is optimal speed with smallest footprint size.
|
||
)
|
||
$(P It's intentionally read-only and doesn't provide constructors.
|
||
To construct one use a special builder,
|
||
see $(LREF TrieBuilder) and $(LREF buildTrie).
|
||
)
|
||
|
||
*/
|
||
@trusted public struct Trie(Value, Key, Args...)
    if(isValidPrefixForTrie!(Key, Args)
        || (isValidPrefixForTrie!(Key, Args[1..$])
            && is(typeof(Args[0]) : size_t)))
{
    // Args optionally start with an upper bound on the index; when it is
    // present, lookups are range-checked with an assertion.
    static if(is(typeof(Args[0]) : size_t))
    {
        enum maxIndex = Args[0];
        enum hasBoundsCheck = true;
        alias Prefix = Args[1..$];
    }
    else
    {
        enum hasBoundsCheck = false;
        alias Prefix = Args;
    }

    // wraps a table that is already built
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /*
        $(P Lookup the $(D key) in this $(D Trie). )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of $(D Trie) keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    // templated to auto-detect pure, @safe and nothrow
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if(hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        enum leaf = Prefix.length-1;
        // position within the single page of the topmost level
        size_t pos = cast(size_t)Prefix[0](key);
        foreach(lvl, _; Prefix[0..$-1])
        {
            // the entry found is a page number on the next level;
            // the next predicate supplies the offset inside that page
            pos = cast(size_t)((_table.ptr!lvl[pos]<<Prefix[lvl+1].bitSize)
                + Prefix[lvl+1](key));
        }
        return _table.ptr!leaf[pos];
    }

    // forwards to the bytes property of the underlying table
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    // bytes of level n divided by 2^^bitSize of that level, rounded to nearest
    @property size_t pages(size_t n)() const
    {
        enum bits = Prefix[n].bitSize;
        return (bytes!n+2^^(bits-1))
            /2^^bits;
    }

    // forwards to the store method of the underlying table
    void store(OutRange)(scope OutRange sink) const
        if(isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
|
||
|
||
// Builds a tuple of 'sliceBits' instances that cut the bits below 'top'
// into consecutive pieces of the widths given in 'sizes', from left to right,
// i.e. the first piece covers the most significant bits.
template GetBitSlicing(size_t top, sizes...)
{
    static if(sizes.length == 0)
        alias GetBitSlicing = TypeTuple!();
    else
    {
        // lower bound of the first piece; the remaining pieces continue below it
        enum low = top - sizes[0];
        alias GetBitSlicing =
            TypeTuple!(sliceBits!(low, top), GetBitSlicing!(low, sizes[1..$]));
    }
}
|
||
|
||
// callableWith!T yields a predicate template: callableWith!T!(Pred) is true
// when Pred can be called with a value of type T and the (unpacked) type of
// the result is bit-packable.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if(is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false; // not callable with T at all
    }
}
|
||
|
||
/*
    Check if $(D Prefix) is a valid set of predicates
    for $(D Trie) template having $(D Key) as the type of keys.
    Every predicate has to satisfy $(D callableWith!Key), that is
    be callable with a single argument of type $(D Key) and
    return a value of a bit-packable type.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
|
||
|
||
/*
    Check if $(D Args) is a set of maximum key value followed by valid predicates
    for $(D Trie) template having $(D Key) as the type of keys.
    The maximum key value is optional: a plain list of valid predicates
    is accepted as well. At least one predicate is required.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if(Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // A single argument must be a predicate. Key has to be passed along
        // explicitly: without it the lone argument is bound as the Key
        // parameter and the check is performed over an empty predicate list.
        enum isValidArgsForTrie = Args.length == 1
            && isValidPrefixForTrie!(Key, Args);
    }
}
|
||
|
||
// Adds up the compile-time integers in 'ints'; yields 0 for an empty tuple.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total;
    foreach(i, _; ints)
        total += ints[i];
    return total;
}
|
||
|
||
/**
|
||
A shorthand for creating a custom multi-level fixed Trie
|
||
from a $(D CodepointSet). $(D sizes) are numbers of bits per level,
|
||
with the most significant bits used first.
|
||
|
||
Note: The sum of $(D sizes) must be equal to 21.
|
||
|
||
See_Also: $(LREF toTrie), which is even simpler.
|
||
|
||
Example:
|
||
---
|
||
{
|
||
import std.stdio;
|
||
auto set = unicode("Number");
|
||
auto trie = codepointSetTrie!(8, 5, 8)(set);
|
||
writeln("Input code points to test:");
|
||
foreach(line; stdin.byLine)
|
||
{
|
||
int count=0;
|
||
foreach(dchar ch; line)
|
||
if(trie[ch])// is number
|
||
count++;
|
||
writefln("Contains %d number code points.", count);
|
||
}
|
||
}
|
||
---
|
||
*/
|
||
public template codepointSetTrie(sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
        if(isCodepointSet!Set)
    {
        // bool-valued builder over dchar keys, sliced per 'sizes', defaulting to false
        alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes));
        auto builder = Builder(false);
        // mark every interval of the set as true
        foreach(interval; set.byInterval)
        {
            builder.putRange(interval[0], interval[1], true);
        }
        return builder.build();
    }
}
|
||
|
||
/// Type of Trie generated by codepointSetTrie function.
|
||
public template CodepointSetTrie(sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    // same bit slicing and builder configuration as the codepointSetTrie function uses
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(bool, dchar, lastDchar+1, Prefix);
    alias CodepointSetTrie = typeof(Builder(false).build());
}
|
||
|
||
/**
|
||
A slightly more general tool for building fixed $(D Trie)
|
||
for the Unicode data.
|
||
|
||
Specifically unlike $(D codepointSetTrie) it allows creating mappings
|
||
of $(D dchar) to an arbitrary type $(D T).
|
||
|
||
Note: Overload taking $(D CodepointSet)s will naturally convert
|
||
only to bool mapping $(D Trie)s.
|
||
|
||
Example:
|
||
---
|
||
// pick characters from the Greek script
|
||
auto set = unicode.Greek;
|
||
|
||
// a user-defined property (or an expensive function)
|
||
// that we want to look up
|
||
static uint luckFactor(dchar ch)
|
||
{
|
||
// here we consider a character lucky
|
||
// if its code point has a lot of identical hex-digits
|
||
// e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
|
||
ubyte[6] nibbles; // 6 4-bit chunks of code point
|
||
uint value = ch;
|
||
foreach(i; 0..6)
|
||
{
|
||
nibbles[i] = value & 0xF;
|
||
value >>= 4;
|
||
}
|
||
uint luck;
|
||
foreach(n; nibbles)
|
||
luck = cast(uint)max(luck, count(nibbles[], n));
|
||
return luck;
|
||
}
|
||
|
||
// only unsigned built-ins are supported at the moment
|
||
alias LuckFactor = BitPacked!(uint, 3);
|
||
|
||
// create a temporary associative array (AA)
|
||
LuckFactor[dchar] map;
|
||
foreach(ch; set.byCodepoint)
|
||
map[ch] = luckFactor(ch);
|
||
|
||
// bits per stage are chosen randomly, feel free to optimize
|
||
auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
|
||
|
||
// from now on the AA is not needed
|
||
foreach(ch; set.byCodepoint)
|
||
assert(trie[ch] == luckFactor(ch)); // verify
|
||
// CJK is not Greek, thus it has the default value
|
||
assert(trie['\u4444'] == 0);
|
||
// and here is a couple of quite lucky Greek characters:
|
||
// Greek small letter epsilon with dasia
|
||
assert(trie['\u1F11'] == 3);
|
||
// Ancient Greek metretes sign
|
||
assert(trie['\U00010181'] == 3);
|
||
---
|
||
*/
|
||
public template codepointTrie(T, sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    // index predicates: 21 bits of a code point cut into pieces of 'sizes' bits
    alias Prefix = GetBitSlicing!(21, sizes);

    static if(is(TypeOfBitPacked!T == bool))
    {
        // Overload for code point sets, available only for bool-valued tries.
        auto codepointTrie(Set)(in Set set)
            if(isCodepointSet!Set)
        {
            // 'sizes' must be forwarded explicitly: codepointSetTrie is
            // constrained on sumOfIntegerTuple!sizes == 21, so instantiating
            // it with no sizes can never match.
            return codepointSetTrie!sizes(set);
        }
    }

    // Overload for an associative array mapping code points to values;
    // code points absent from 'map' get 'defValue'.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // Overload for an unsorted range of (value, code point) pairs.
    auto codepointTrie(R)(R range, T defValue=T.init)
        if(isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
|
||
|
||
unittest // codepointTrie example
{
    // take the code points of the Greek script
    auto greek = unicode.Greek;

    // a user-defined property (or an expensive function)
    // whose values are to be cached in a trie
    static uint luckFactor(dchar ch)
    {
        // a character is considered lucky if its code point
        // has a lot of identical hex-digits,
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint rest = ch;
        foreach(ref nib; nibbles)
        {
            nib = rest & 0xF;
            rest >>= 4;
        }
        // the highest number of occurrences of any single nibble
        uint best;
        foreach(n; nibbles)
            best = cast(uint)max(best, count(nibbles[], n));
        return best;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // fill a temporary associative array (AA)
    LuckFactor[dchar] luckByChar;
    foreach(ch; greek.byCodepoint)
    {
        luckByChar[ch] = LuckFactor(luckFactor(ch));
    }

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(luckByChar);

    // from here on the AA is no longer needed
    foreach(ch; greek.byCodepoint)
    {
        assert(trie[ch] == luckFactor(ch)); // verify
    }
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);
}
|
||
|
||
/// Type of Trie as generated by codepointTrie function.
|
||
public template CodepointTrie(T, sizes...)
    if(sumOfIntegerTuple!sizes == 21)
{
    // type of the trie produced by a T-valued builder over dchar keys
    // that uses the bit slicing described by 'sizes'
    alias Prefix = GetBitSlicing!(21, sizes);
    alias Builder = TrieBuilder!(T, dchar, lastDchar+1, Prefix);
    alias CodepointTrie = typeof(Builder(T.init).build());
}
|
||
|
||
// @@@BUG multiSort can't access private symbols from uni
// Less-than comparator for (value, key) tuples: orders two tuples by the
// result of applying 'Pred' to their key, i.e. to the element at index 1.
public template cmpK0(alias Pred)
{
    import std.typecons;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto left = Pred(a[1]);
        auto right = Pred(b[1]);
        return left < right;
    }
}
|
||
|
||
/*
|
||
The most general utility for construction of $(D Trie)s
|
||
short of using $(D TrieBuilder) directly.
|
||
|
||
Provides a number of convenience overloads.
|
||
$(D Args) is tuple of maximum key value followed by
|
||
predicates to construct index from key.
|
||
|
||
Alternatively if the first argument is not a value convertible to $(D Key)
|
||
then the whole tuple of $(D Args) is treated as predicates
|
||
and the maximum Key is deduced from predicates.
|
||
*/
|
||
public template buildTrie(Value, Key, Args...)
    if(isValidArgsForTrie!(Key, Args))
{
    // A leading argument whose type converts to Key is the upper bound
    // on keys rather than a predicate; the predicates are what follows it.
    static if(is(typeof(Args[0]) : Key))
        alias Prefix = Args[1..$];
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // Comparators for multiSort: one cmpK0 per predicate, in predicate order.
    template GetComparators(size_t n)
    {
        static if(n == 0)
            alias GetComparators = TypeTuple!();
        else
            alias GetComparators =
                TypeTuple!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build $(D Trie) from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps $(D Key) to an integer.

        Each element is indexed as (value, key): element[0] is the value
        and element[1] is the key.

        See also: $(XREF _algorithm, sort),
        $(XREF _range, SortedRange),
        $(XREF _algorithm, setUnion).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(pair; range)
        {
            trieBuilder.putValue(pair[1], pair[0]);
        }
        return trieBuilder.build();
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) from a range of open-right intervals of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !$(D filler) i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(interval; range)
        {
            trieBuilder.putRange(interval[0], interval[1], !filler);
        }
        return trieBuilder.build();
    }

    // Key-Value pair overload that can sort the input first: when
    // 'unsorted' is true the range is multi-sorted in place with the
    // comparators derived from Prefix before the trie is built.
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if(isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        alias Comps = GetComparators!(Prefix.length);
        if(unsorted)
        {
            multiSort!(Comps)(range);
        }
        return buildTrie(range, filler);
    }

    /*
        If $(D Value) is bool (or BitPacked!(bool, x)) then it's possible
        to build $(D Trie) simply from an input range of $(D Key)s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !$(D filler) i.e. the opposite of filler.
        If no filler provided keys map to true, and $(D filler) is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if(is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(key; range)
        {
            trieBuilder.putValue(key, !filler);
        }
        return trieBuilder.build();
    }

    /*
        If $(D Key) is unsigned integer $(D Trie) could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if(isUnsigned!Key)
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach(position, item; array)
        {
            trieBuilder.putValue(position, item);
        }
        return trieBuilder.build();
    }

    /*
        Builds $(D Trie) from associative array.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        // materialize (value, key) pairs so that they can be sorted in place
        auto zipped = zip(map.values, map.keys);
        auto pairs = array(zipped);
        return buildTrie(pairs, filler, true); // sort it
    }
}
|
||
|
||
// Helper used in place of assumeSize to reduce mangled name length
// and to help DMD inline Trie functors.
// Calling it returns the argument converted to size_t;
// 'bitSize' carries the declared number of bits.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
|
||
|
||
// Same idea as clamp, but for indexable arguments: calling it returns
// element number 'idx' of the argument converted to size_t;
// 'bitSize' carries the declared number of bits.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
|
||
|
||
/**
|
||
Conceptual type that outlines the common properties of all UTF Matchers.
|
||
|
||
Note: For illustration purposes only, every method
|
||
call results in assertion failure.
|
||
Use $(LREF utfMatcher) to obtain a concrete matcher
|
||
for UTF-8 or UTF-16 encodings.
|
||
*/
|
||
public struct MatcherConcept
{
    /**
        $(P Perform a semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of $(D inp) and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on $(D inp) depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range $(D inp)
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
        if(isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
        if(isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
        if(isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }
    ///
    @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /*
        Advanced feature - provide direct access to a subset of matcher based on
        a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any $(D test)/$(D match).

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    ///
    @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
|
||
|
||
/**
|
||
Test if $(D M) is a UTF Matcher for ranges of code units of type $(D C).
|
||
*/
|
||
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    // The checks have to be static asserts: a run-time assert(is(...))
    // compiles whatever the is-expression evaluates to, which would make
    // this trait true for any M that can be declared here.
    C[] s;
    auto d = s.decoder;
    M m;
    // match and test must accept the decoder range and yield bool
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    // skip is optional, but if present it must behave the same way
    static if(is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    // match and test must accept plain arrays of code units as well
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});
|
||
|
||
unittest
{
    // matchers produced by utfMatcher must satisfy isUtfMatcher
    // for both mutable and immutable code units
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    foreach(Unit; TypeTuple!(char, immutable(char)))
        static assert(isUtfMatcher!(CharMatcher, Unit));
    foreach(Unit; TypeTuple!(wchar, immutable(wchar)))
        static assert(isUtfMatcher!(WcharMatcher, Unit));
}
|
||
|
||
// How a matcher lookup treats its input range after testing a code point.
enum Mode
{
    alwaysSkip,  // advance past the code point whatever the outcome
    neverSkip,   // leave the range untouched
    skipOnMatch  // advance only if the code point matched
}
|
||
|
||
// Mixed into matchers: forwards a call on an array of code units to the
// member named by 'fn', passing the very same array reinterpreted in place
// as the range type returned by units(), so that any advance made by the
// callee is visible through 'str'.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const pure
    {
        alias type = typeof(units(str));
        enum call = fn~"(*cast(type*)&str)";
        return mixin(call);
    }
}
|
||
|
||
// Namespace-like template with everything needed to build and run
// a matcher over UTF-8 code units.
template Utf8Matcher()
{
    // a UTF-8 sequence takes from 1 to 4 code units
    enum validSize(int sz) = sz >= 1 && sz <=4;

    // single point of failure reporting for malformed input
    void badEncoding() pure @safe
    {
        import std.utf;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = TypeTuple!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = TypeTuple!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = TypeTuple!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = TypeTuple!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // trie types, ordered by sequence length
    alias Tables = TypeTuple!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    alias Table(int size) = Tables[size-1];

    // mask of the payload bits in the lead unit of a 'size'-unit sequence
    enum leadMask(size_t size) = (cast(size_t)1<<(7 - size))-1;
    // value of the non-payload bits in the lead unit of a 'size'-unit sequence
    enum encMask(size_t size) = ((1<<size)-1)<<(8-size);

    // Strips the 0x80 marker off a trailing code unit, leaving its 6 payload
    // bits; anything that is not in 0x80 .. 0xBF is reported as bad encoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char)0;
        }
    }

    // Encodes 'ch' and returns the first 'sz' code units with all marker bits
    // cleared, i.e. in the same form as the needle built by lookup.
    static auto encode(size_t sz)(dchar ch)
        if(sz > 1)
    {
        // Renamed import: this function is itself called 'encode', and a
        // selective import does not make the fully qualified name
        // std.utf.encode available.
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach(n; 1..sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0..sz];
        return ret;
    }

    // Splits 'set' by encoded length and builds one trie per length.
    auto build(Set)(Set set)
    {
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char)x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.string : format;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        static auto genDispatch()
        {
            string code;
            foreach(size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // advance the range only when the code point at its front matches
        public bool match(Range)(ref Range inp) const pure @trusted
            if(isRandomAccessRange!Range && is(ElementType!Range : char))
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            auto ch = inp[0];
            static if(hasASCII)
            {
                if (ch < 0x80)
                {
                    bool r = tab!1[ch];
                    if(r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if(Sizes.length == 4) // can skip iff can detect all encodings
        {
            // always advance the range past the tested code point
            public bool skip(Range)(ref Range inp) const pure @trusted
                if(isRandomAccessRange!Range && is(ElementType!Range : char))
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if(hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // never advance the range
        public bool test(Range)(ref Range inp) const pure @trusted
            if(isRandomAccessRange!Range && is(ElementType!Range : char))
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if(hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // overloads for arrays of code units, forwarded to the range versions
        bool match(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // Matcher that owns the tries for the chosen sequence lengths.
    struct Impl(Sizes...)
    {
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // The result holds a pointer to this object.
        package @property auto subMatcher(SizesToPick...)() @trusted
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Validates a 'size'-unit sequence at the front of 'inp', looks it
        // up in the corresponding table and advances 'inp' according to 'mode'.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const pure @trusted
        {
            import std.typecons;
            if(inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            foreach(i; staticIota!(1, size))
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if(size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[0]
                if(needle[0] < 2) badEncoding();
            }
            else static if(size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if(needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if(size == 4)
            {
                //0x10000-0x10FFFF
                //got 2x6=12 bits in needle[2..3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if(needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if(mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if(mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);
                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // Sub-matcher for a subset of sequence lengths; it only points at the
    // parent matcher and uses the parent's tables and lookup.
    struct CherryPick(I, Sizes...)
    {
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        @property ref tab(int i)() const pure { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const pure
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
|
||
|
||
// Namespace-like template with everything needed to build and run
// a matcher over UTF-16 code units.
template Utf16Matcher()
{
    // a UTF-16 sequence takes 1 or 2 code units
    enum validSize(int sz) = sz >= 1 && sz <=2;

    // single point of failure reporting for malformed input
    void badEncoding() pure
    {
        import std.utf;
        throw new UTFException("Invalid UTF-16 sequence");
    }

    alias Seq = TypeTuple;
    // 1-stage ASCII
    alias AsciiSpec = Seq!(bool, wchar, clamp!7);
    //2-stage BMP
    alias BmpSpec = Seq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
    //4-stage - full Unicode
    //assume that 0xD800 & 0xDC00 bits are cleared
    //thus leaving 10 bit per wchar to worry about
    alias UniSpec = Seq!(bool, wchar[2],
        assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
        assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
    );
    alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
    alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
    alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());

    // Splits a code point above 0xFFFF into the two 10-bit halves of its
    // surrogate pair, without the surrogate marker bits, i.e. in the same
    // form as the needle built by lookupUni.
    auto encode2(dchar ch)
    {
        ch -= 0x1_0000;
        assert(ch <= 0xF_FFFF);
        wchar[2] ret;
        //do not put surrogate bits, they are sliced off
        ret[0] = (ch>>10);
        // Only the low 10 bits belong to the second half: UniSpec declares
        // x[1]>>6 as 4 bits wide and lookupUni masks with 0x3ff, so a wider
        // mask here would duplicate bits that already went into ret[0].
        ret[1] = (ch & 0x3FF);
        return ret;
    }

    // Splits 'set' into ASCII, the rest of the BMP without surrogates
    // and everything else, then builds one trie per part.
    auto build(Set)(Set set)
    {
        auto ascii = set & unicode.ASCII;
        auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
            - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
        auto other = set - (bmp | ascii);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char)x).buildTrie!(AsciiSpec);
        auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar)x).buildTrie!(BmpSpec);
        auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
        alias Ret = Impl!(1,2);
        return Ret(asciiT, bmpT, otherT);
    }

    //bootstrap full UTF-16 matcher interface from
    //sizeFlags, lookupUni and ascii
    mixin template DefMatcher()
    {
        // advance the range only when the code point at its front matches
        public bool match(Range)(ref Range inp) const pure @trusted
            if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            auto ch = inp[0];
            static if(sizeFlags & 1)
            {
                if (ch < 0x80)
                {
                    if (ascii[ch])
                    {
                        inp.popFront();
                        return true;
                    }
                    else
                        return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        static if(Sizes.length == 2)
        {
            // always advance the range past the tested code point
            public bool skip(Range)(ref Range inp) const pure @trusted
                if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if(sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        // never advance the range
        public bool test(Range)(ref Range inp) const pure @trusted
            if(isRandomAccessRange!Range && is(ElementType!Range : wchar))
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if(sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        // overloads for arrays of code units, forwarded to the range versions
        bool match(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const pure @trusted
            if(isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }

    // Matcher that owns the tries for the chosen sequence lengths.
    struct Impl(Sizes...)
        if(Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit 0 set: single-unit sequences handled, bit 1 set: pairs handled
        static if(Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        static if(sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        static if(sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // The result holds a pointer to this object.
        package @property auto subMatcher(SizesToPick...)() @trusted
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Handles everything that is not ASCII: a single non-surrogate unit
        // is looked up in 'bmp', a surrogate pair in 'uni'; 'inp' is advanced
        // according to 'mode'.
        bool lookupUni(Mode mode, Range)(ref Range inp) const pure
        {
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if(x > 0x3FF)
            {
                //low surrogate
                if(x <= 0x7FF) badEncoding();
                static if(sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if(mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if(mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                static if(sizeFlags & 2)
                {
                    if(inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if(y > 0x3FF)
                        badEncoding();
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if(mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if(mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }

    // Sub-matcher for a subset of sequence lengths; it only points at the
    // parent matcher and uses the parent's tables and lookupUni.
    struct CherryPick(I, Sizes...)
        if(Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        I* m;
        // NOTE(review): the flags are taken from the parent type I rather
        // than derived from Sizes - confirm that this is intended.
        enum sizeFlags = I.sizeFlags;

        static if(sizeFlags & 1)
        {
            @property ref ascii()() const pure{ return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const pure
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
}
|
||
|
||
// Builds a matcher over UTF-8 code units for the code points of 'set'.
private auto utf8Matcher(Set)(Set set) @trusted
{
    alias Factory = Utf8Matcher!();
    return Factory.build(set);
}
|
||
|
||
// Builds a matcher over UTF-16 code units for the code points of 'set'.
private auto utf16Matcher(Set)(Set set) @trusted
{
    alias Factory = Utf16Matcher!();
    return Factory.build(set);
}
|
||
|
||
/**
|
||
Constructs a matcher object
|
||
to classify $(CODEPOINTS) from the $(D set) for encoding
|
||
that has $(D Char) as code unit.
|
||
|
||
See $(LREF MatcherConcept) for API outline.
|
||
*/
|
||
public auto utfMatcher(Char, Set)(Set set) @trusted
    if(isCodepointSet!Set)
{
    // pick the implementation by the code unit type
    static if(is(Char : char))
        return utf8Matcher(set);
    else static if(is(Char : wchar))
        return utf16Matcher(set);
    else static if(is(Char : dchar))
        // kept on a single line so that the diagnostic carries neither
        // a raw line break nor source indentation
        static assert(false,
            "UTF-32 needs no decoding, and thus is not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
|
||
|
||
|
||
// A random-access range of code units over 's' that keeps the whole array
// plus the index of the current front, to speed up forward iteration.
// 'offset' is the index the range starts at.
package auto decoder(C)(C[] s, size_t offset=0) @trusted
    if(is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units, shrunk only from the back
        size_t idx; // position of the front element within str

        @property bool empty()
        {
            return str.length == idx;
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            ++idx;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0..idx+b], idx+a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
|
||
|
||
/*
|
||
Expose UTF string $(D s) as a random-access
|
||
range of $(S_LINK Code unit, code units).
|
||
*/
|
||
package auto units(C)(C[] s)
    if(is(C : wchar) || is(C : char))
{
    // thin random-access range wrapper over the array of code units
    static struct Units
    {
    pure nothrow:
        C[] str; // the code units that are still in the range

        @property bool empty()
        {
            return str.length == 0;
        }

        @property size_t length()
        {
            return str.length;
        }

        alias opDollar = length;

        @property C front()
        {
            return str[0];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            str = str[1..$];
        }

        void popFrontN(size_t n)
        {
            str = str[n..$];
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        @property auto save()
        {
            return this;
        }

        auto opIndex(size_t i)
        {
            return str[i];
        }

        auto opSlice(size_t a, size_t b)
        {
            return Units(str[a..b]);
        }
    }
    static assert(isRandomAccessRange!Units);
    static assert(is(ElementType!Units : C));
    return Units(s);
}
|
||
|
||
// Walks a mixed ASCII / multi-byte sample with the UTF-8 letter matcher and
// its sub-matchers, checking both the verdicts and the cursor position.
@safe unittest
{
    import std.range;
    string sample = "hi! ネемног砀 текста";
    auto cursor = sample.decoder;
    auto letters = utf8Matcher(unicode.Letter);
    auto oneUnit = letters.subMatcher!1;
    auto multiUnit = letters.subMatcher!(2,3,4);

    // 'h'
    assert(oneUnit.test(cursor));
    assert(!multiUnit.match(cursor));
    assert(letters.skip(cursor));
    assert(cursor.idx == 1);

    // 'i'
    assert(!multiUnit.match(cursor));
    assert(oneUnit.test(cursor));
    assert(letters.skip(cursor));
    assert(cursor.idx == 2);
    assert(!oneUnit.match(cursor));

    // '!' - skip() advances even though it reports no match
    assert(!letters.test(cursor));
    assert(!letters.skip(cursor));

    // ' '
    assert(!oneUnit.test(cursor));
    assert(!letters.test(cursor));
    assert(!letters.skip(cursor));

    // seven multi-unit letters follow
    assert(letters.test(cursor));
    foreach(_; 0..7)
    {
        assert(!oneUnit.test(cursor));
        assert(multiUnit.test(cursor));
        assert(letters.skip(cursor));
    }
    assert(!letters.test(cursor));
    assert(!letters.skip(cursor));

    //the same with match where applicable
    cursor = sample.decoder;
    assert(letters.match(cursor));
    assert(cursor.idx == 1);
    assert(letters.match(cursor));
    assert(cursor.idx == 2);
    // a failed match leaves the cursor in place
    assert(!letters.match(cursor));
    assert(cursor.idx == 2);
    assert(!letters.skip(cursor));
    assert(!letters.skip(cursor));

    foreach(_; 0..7)
    {
        assert(!oneUnit.test(cursor));
        assert(letters.test(cursor));
        assert(letters.match(cursor));
    }
    auto mark = cursor.idx;
    assert(!letters.match(cursor));
    assert(cursor.idx == mark);
}
|
||
|
||
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers (and their
// sub-matchers) on every third letter code point.
@safe unittest
{
    // Runs all available primitives of m on r, restoring the cursor after
    // each one, and returns the verdict of test().
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r)
    {
        bool verdict = m.test(r);
        auto start = r.idx;
        assert(verdict == m.match(r));
        assert(r.idx == start || verdict); //ether no change or was match
        r.idx = start;
        static if(is(typeof(m.skip(r))))
        {
            assert(verdict == m.skip(r));
            assert(r.idx != start); //always changed
            r.idx = start;
        }
        return verdict;
    }

    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // NOTE(review): this is the same sub-matcher as bmp (size 1), although
    // the name suggests the 2-unit one - confirm whether subMatcher!2 was meant.
    auto nonBmp = utf16.subMatcher!1;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);

    foreach(ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] units8;
        wchar[2] units16;
        auto len = std.utf.encode(units8, ch);
        auto len16 = std.utf.encode(units16, ch);
        auto c8 = units8[0..len].decoder;
        auto c16 = units16[0..len16].decoder;

        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len !=4));
    }
}
|
||
|
||
// cover decode fail cases of Matcher
unittest
{
    import std.string : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): "\0x00" / "\0x80" below are an octal NUL escape followed
    // by literal text, probably typos for "\x00" / "\x80". They are left
    // untouched because every sequence is still malformed as written.
    alias fails8 = TypeTuple!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach(msg; fails8){
        // on failure the message shows the offending bytes in hex
        assert(collectException((){
            auto s = msg; // a mutable copy to hand to the matcher
            utf8.test(s);
        }()), format("%( %2x %)", cast(ubyte[])msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = TypeTuple!([0xD811], [0xDC02]);
    foreach(msg; fails16){
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar)x);
            utf16.test(s);
        }()));
    }
}
|
||
|
||
/++
    Convenience function to construct optimal configurations for
    packed Trie from any $(D set) of $(CODEPOINTS).

    The parameter $(D level) indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the $(D set) itself.

    Any other value of $(D level) (including 0) is rejected at compile-time.
+/
public auto toTrie(size_t level, Set)(Set set)
    if(isCodepointSet!Set)
{
    // the bit widths of each configuration add up to the 21 bits of a code point
    static if(level == 1)
        return codepointSetTrie!(21)(set);
    else static if(level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if(level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if(level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
|
||
|
||
/**
    $(P Builds a $(D Trie) with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
    if(isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto lookup = toTrie!3(set);
    // the trie is captured by the returned closure
    return (dchar ch) {
        return lookup[ch];
    };
}
|
||
|
||
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter $(D sz) indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to $(D T)
    but not vice versa. Users have to ensure the value fits in
    the range required and use the $(D cast)
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
    if(is(T:dchar) || isIntegral!T)
{
    // number of bits the wrapped value is promised to fit in
    enum size_t bitSize = sz;
    T _value;
    alias _value this;
}
|
||
|
||
/*
    Yields the number of bits needed to represent the single argument:
    its own $(D bitSize) member if it has one, the bit size of its
    return type when that is a $(D BitPacked), and otherwise the full
    size of the type in bits.
*/
template bitSizeOf(Args...)
    if(Args.length == 1)
{
    alias Subject = Args[0];
    // explicit bitSize wins (BitPacked, assumeSize, sliceBits)
    static if(__traits(compiles, { size_t probe = Subject.bitSize; })) //(is(typeof(T.bitSize) : size_t))
        enum bitSizeOf = Subject.bitSize;
    // a callable returning BitPacked: defer to the returned type
    else static if(is(ReturnType!Subject dummy == BitPacked!(U, bits), U, size_t bits))
        enum bitSizeOf = bitSizeOf!(ReturnType!Subject);
    // plain type: all of its bits
    else
        enum bitSizeOf = Subject.sizeof*8;
}
|
||
|
||
/**
    Tests if $(D T) is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    // pattern-match T against BitPacked with any payload type and width
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
|
||
|
||
/**
    Gives the type $(D U) from $(LREF BitPacked)!(U, x)
    or $(D T) itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if(is(T dummy == BitPacked!(Payload, width), Payload, size_t width))
    {
        // unwrap the packed payload type
        alias TypeOfBitPacked = Payload;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
|
||
|
||
/*
    Tags the unary callable $(D Fn) with a promise that its result always
    fits in $(D bits) bits. Used when describing the levels of custom data
    structures built from the $(D Trie) template.
*/
struct assumeSize(alias Fn, size_t bits)
{
    // the promised width, picked up by bitSizeOf
    enum size_t bitSize = bits;

    // simply forwards to Fn
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
|
||
|
||
/*
    Callable that extracts bits [from, to$(RPAREN) of an unsigned integral
    value and shifts them down to bit 0. It exposes $(D bitSize) the same
    way assumeSize does, so it can be used directly with $(D Trie) template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;

    static auto opCall(T)(T x)
    out(result)
    {
        // the result must fit in the advertised number of bits
        assert(result < (1<<bitSize));
    }
    body
    {
        static assert(from < to);
        enum mask = (1<<bitSize)-1;
        static if(from == 0)
            return x & mask; // nothing to shift away
        else
            return (x >> from) & mask;
    }
}
|
||
|
||
// Lowest 8 bits of x (attributes made consistent with midlow_8).
@safe pure nothrow uint low_8(uint x) { return x&0xFF; }
// Bits 8..15 of x, shifted down to bit 0.
@safe pure nothrow uint midlow_8(uint x){ return (x&0xFF00)>>8; }
// Both helpers tagged with the width of their result.
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);

// bitSizeOf must see through each kind of size-carrying wrapper
static assert(bitSizeOf!lo8 == 8);
static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
|
||
|
||
// Compile-time tuple of the consecutive values start, start+1, .., end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if(start >= end)
    {
        alias Sequence = TypeTuple!();
    }
    else
    {
        // head followed by the rest of the sequence
        alias Sequence = TypeTuple!(start, Sequence!(start+1, end));
    }
}
|
||
|
||
//---- TRIE TESTS ----
unittest
{
    // Prints the footprint of a trie; compiled in only with version=std_uni_stats.
    static trieStats(TRIE)(TRIE t)
    {
        version(std_uni_stats)
        {
            import std.stdio;
            writeln("---TRIE FOOTPRINT STATS---");
            foreach(lvl; staticIota!(0, t.table.dim) )
            {
                writefln("lvl%s = %s bytes; %s pages"
                    , lvl, t.bytes!lvl, t.pages!lvl);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version(none)
            {
                writeln("INDEX (excluding value level):");
                foreach(lvl; staticIota!(0, t.table.dim-1) )
                    writeln(t.table.slice!(lvl)[0..t.table.length!lvl]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;

    // one level: simple bool array
    auto letters = Set('A','Z','a','z');
    auto flat = buildTrie!(bool, uint, 256, lo8)(letters.byInterval);
    foreach(int a; 'a'..'z')
        assert(flat[a]);
    foreach(int a; 'A'..'Z')
        assert(flat[a]);
    foreach(int a; 0..'A')
        assert(!flat[a]);
    foreach(int a; 'Z'..'a')
        assert(!flat[a]);
    trieStats(flat);

    // two levels over a set with repeating pages
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach(cp; redundant2.byCodepoint)
        assert(trie2[cp], text(cast(uint)cp, " - ", trie2[cp]));
    foreach(i; 0..1024)
        assert(trie2[i] == (i in redundant2));

    // three levels, indexed with sliceBits
    auto redundant3 = Set(
        2, 4, 6, 8, 16,
        2+16, 4+16, 16+6, 16+8, 16+16,
        2+32, 4+32, 32+6, 32+8,
    );
    enum max3 = 256;
    auto trie3 = buildTrie!(bool, uint, max3,
        sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
    )(redundant3.byInterval);
    trieStats(trie3);
    foreach(i; 0..max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint)i));

    // four levels; only membership (not absence) is verified here
    auto redundant4 = Set(
        10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
        1000, 2000, 3000, 4000, 5000, 6000
    );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
        sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
    )(redundant4.byInterval);
    foreach(i; 0..max4)
        assert(!(i in redundant4) || trie4[i], text(cast(uint)i));
    trieStats(trie4);

    // string keys, using first char only
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] words = ["tea", "start", "orange"];
    words.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(words);
    assert(words == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto allBytes = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(allBytes);
    trieStats(bt);
    foreach(i; 0..256)
        assert(bt[cast(ubyte)i]);
}
|
||
|
||
// Trie index functor: the element at position idx of an array key,
// tagged as occupying all bits of T.
template useItemAt(size_t idx, T)
    if(is(T: dchar) || isIntegral!T)
{
    size_t impl(in T[] arr)
    {
        return arr[idx];
    }
    alias useItemAt = assumeSize!(impl, T.sizeof*8);
}
|
||
|
||
// Trie index functor: the last element of an array key,
// tagged as occupying all bits of T.
template useLastItem(T)
{
    size_t impl(in T[] arr)
    {
        return arr[arr.length - 1];
    }
    alias useLastItem = assumeSize!(impl, T.sizeof*8);
}
|
||
|
||
// Sum of bitSizeOf over all of the given index functors (0 for none).
template fullBitSize(Prefix...)
{
    static if(Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1..$]);
    }
}
|
||
|
||
// Computes the tuple of BitPacked index types, one per non-value trie level.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if(Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = TypeTuple!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus it's size in pages is full_bit_width - size_of_last_prefix
        // Recourse on this notion
        enum remainingBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias idxTypes =
            TypeTuple!(
                idxTypes!(Key, remainingBits, Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), remainingBits)
            );
    }
}
|
||
|
||
//============================================================================
|
||
|
||
// Three-way comparison of property names that ignores ASCII case as well as
// white space, '-' and '_' on both sides.
@trusted int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
{
    alias low = std.ascii.toLower;
    // lower-case first, then drop the insignificant characters
    auto lhs = a.map!(c => low(c))()
        .filter!(c => !(isWhite(c) || c == '-' || c == '_'))();
    auto rhs = b.map!(c => low(c))()
        .filter!(c => !(isWhite(c) || c == '-' || c == '_'))();
    return cmp(lhs, rhs);
}
|
||
|
||
// Strict "less than" under the loose ordering of comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
{
    immutable ordering = comparePropertyName(a, b);
    return ordering < 0;
}
|
||
|
||
//============================================================================
|
||
// Utilities for compression of Unicode code point sets
|
||
//============================================================================
|
||
|
||
// Appends val to arr in a 1-, 2- or 3-byte form:
//   0xxxxxxx                     values below 2^^7
//   100xxxxx + 1 byte            values below 2^^13
//   101xxxxx + 2 bytes           values below 2^^21
@safe void compressTo(uint val, ref ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if(val < 128)
    {
        arr ~= cast(ubyte)val;
        return;
    }
    if(val < (1<<13))
    {
        arr ~= (0b1_00<<5) | cast(ubyte)(val>>8);
        arr ~= val & 0xFF;
        return;
    }
    assert(val < (1<<21));
    arr ~= (0b1_01<<5) | cast(ubyte)(val>>16);
    arr ~= (val >> 8) & 0xFF;
    arr ~= val & 0xFF;
}
|
||
|
||
// Reads one value written by compressTo from arr starting at idx and
// advances idx past it. Throws if the trailing bytes are missing.
@safe uint decompressFrom(const(ubyte)[] arr, ref size_t idx) pure
{
    immutable uint lead = arr[idx++];
    if((lead & 0x80) == 0) // no top bit -> [0..127]
        return lead;
    immutable uint tail = ((lead>>5) & 1) + 1; // [1, 2] bytes follow
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint result = lead & 0x1F;
    foreach(octet; arr[idx .. idx + tail])
        result = (result<<8) | octet;
    idx += tail;
    return result;
}
|
||
|
||
|
||
// Serializes a range of [start, end) pairs as a stream of deltas, each
// stored via compressTo relative to the previously written boundary.
package ubyte[] compressIntervals(Range)(Range intervals)
    if(isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint previous = 0;
    // RLE encode
    foreach(pair; intervals)
    {
        compressTo(pair[0]-previous, packed);
        previous = pair[0];
        // till the end of the domain so don't store it
        if(pair[1] == lastDchar+1)
            continue;
        compressTo(pair[1]-previous, packed);
        previous = pair[1];
    }
    return packed;
}
|
||
|
||
// Checks the exact byte encoding of interval lists and the round trip
// through decompressIntervals.
unittest
{
    auto shortRun = [tuple(80, 127), tuple(128, (1<<10)+128)];
    ubyte[] shortEnc = [cast(ubyte)80, 47, 1, (0b1_00<<5) | (1<<2), 0];
    assert(compressIntervals(shortRun) == shortEnc);

    // the last interval reaches the end of the domain, so its end is omitted
    auto openRun = [tuple(0, (1<<20)+512+1), tuple((1<<20)+512+4, lastDchar+1)];
    ubyte[] openEnc = [cast(ubyte)0, (0b1_01<<5) | (1<<4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(openRun) == openEnc);

    size_t pos = 0;
    foreach(expected; [80, 47, 1, 1<<10])
        assert(decompressFrom(shortEnc, pos) == expected);
    pos = 0;
    foreach(expected; [0, (1<<20)+512+1])
        assert(decompressFrom(openEnc, pos) == expected);

    assert(equalS(decompressIntervals(compressIntervals(shortRun)), shortRun));
    assert(equalS(decompressIntervals(compressIntervals(openRun)), openRun));
}
|
||
|
||
// Wraps compressed data in a forward range that decodes one
// $(D CodepointInterval) at a time, on demand.
@safe package auto decompressIntervals(const(ubyte)[] data) pure
{
    auto lazyIntervals = DecompressedIntervals(data);
    return lazyIntervals;
}
|
||
|
||
// Forward range over the intervals stored in a stream produced by
// compressIntervals. Exhaustion is signalled by _idx == size_t.max.
@trusted struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed deltas
    size_t _idx;              // read position, or size_t.max once exhausted
    CodepointInterval _front; // most recently decoded interval

    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront(); // decode the first interval (or become empty right away)
    }

    @property bool empty() const
    {
        return size_t.max == _idx;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        if(_stream.length == _idx)
        {
            _idx = size_t.max;
            return;
        }
        // the start is stored relative to the end of the previous interval
        _front[0] = _front[1] + decompressFrom(_stream, _idx);
        if(_idx != _stream.length)
        {
            // the end is stored relative to the start
            _front[1] = _front[0] + decompressFrom(_stream, _idx);
        }
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property DecompressedIntervals save() { return this; }
}

static assert(isInputRange!DecompressedIntervals);
static assert(isForwardRange!DecompressedIntervals);
|
||
//============================================================================
|
||
|
||
version(std_uni_bootstrap){}
|
||
else
|
||
{
|
||
|
||
// Looks name up in table (sorted under the loose property-name ordering).
// Returns the index of the matching entry, or -1 if there is none.
@trusted ptrdiff_t findUnicodeSet(alias table, C)(in C[] name) pure
{
    auto sortedNames = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered strictly before name
    size_t pos = sortedNames.lowerBound(name).length;
    if(pos >= sortedNames.length || comparePropertyName(sortedNames[pos], name) != 0)
        return -1;
    return pos;
}
|
||
|
||
// Finds name in table and, on success, stores the decoded set in dest.
// Returns false (leaving dest untouched) if no entry has that name.
@trusted bool loadUnicodeSet(alias table, Set, C)(in C[] name, ref Set dest) pure
{
    immutable pos = findUnicodeSet!table(name);
    if(pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
|
||
|
||
// Loads the set named by name into target. Cumulative categories and a few
// convenience names are assembled by hand from their parts; everything else
// is looked up in the generated property table. Returns false if unknown.
@trusted bool loadProperty(Set=CodepointSet, C)
    (in C[] name, ref Set target) pure
{
    alias same = comparePropertyName;
    // conjure cumulative properties by hand
    if(same(name, "L") == 0 || same(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
        return true;
    }
    if(same(name,"LC") == 0 || same(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
        return true;
    }
    if(same(name, "M") == 0 || same(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
        return true;
    }
    if(same(name, "N") == 0 || same(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
        return true;
    }
    if(same(name, "P") == 0 || same(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
        return true;
    }
    if(same(name, "S") == 0 || same(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    if(same(name, "Z") == 0 || same(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
        return true;
    }
    if(same(name, "C") == 0 || same(name, "Other") == 0)
    {
        // NOTE(review): this unions the "other" member of several categories
        // (Co, Lo, No, So, Po) rather than the C* subcategories - confirm
        // that this is the intended meaning of "C"/"Other".
        target = asSet(uniProps.Co);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.No);
        target |= asSet(uniProps.So);
        target |= asSet(uniProps.Po);
        return true;
    }
    if(same(name, "graphical") == 0)
    {
        target = asSet(uniProps.Alphabetic);

        // marks
        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        // numbers
        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        // punctuation
        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        // space separators
        target |= asSet(uniProps.Zs);

        // symbols
        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
        return true;
    }
    if(same(name, "any") == 0)
    {
        target = Set.fromIntervals(0, 0x110000);
        return true;
    }
    if(same(name, "ascii") == 0)
    {
        target = Set.fromIntervals(0, 0x80);
        return true;
    }
    // not a hand-made name: consult the generated table
    return loadUnicodeSet!(uniProps.tab)(name, target);
}
|
||
|
||
// CTFE-only helper for checking property names at compile-time:
// true if name loosely matches one of the hand-assembled names listed below.
@safe bool isPrettyPropertyName(C)(in C[] name)
{
    auto prettyNames = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "Graphical",
        "any",
        "ascii"
    ];
    auto hit = find!(known => comparePropertyName(known, name) == 0)(prettyNames);
    return !hit.empty;
}
|
||
|
||
// ditto, CTFE-only, not optimized: true if table has an entry named name
@safe private static bool findSetName(alias table, C)(in C[] name)
{
    immutable pos = findUnicodeSet!table(name);
    return pos >= 0;
}
|
||
|
||
// Mixed into the nested lookup structs of $(D unicode): provides run-time
// (opCall) and compile-time (opDispatch) lookup restricted to one table.
// kind is only used to word the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(in C[] name)
        if(is(C : dchar))
    {
        CodepointSet found;
        if(!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }

    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if(!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
|
||
|
||
/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
    This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        $(D unicode.InBlockName), to search a script
        use $(D unicode.ScriptName).

        See also $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).

        Example:
        ---
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        import std.exception;
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
        ---
    */

    static @property auto opDispatch(string name)() pure
    {
        static if(findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where $(D name)
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(in C[] name)
        if(is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        See also $(S_LINK Unicode properties, table of properties).

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply $(D unicode.block.BlockName) notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.

        Example:
        ---
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
        ---
    */
    struct block
    {
        mixin SetSearcher!(blocks.tab, "block");
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Example:
        ---
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['\u0601']);
        assert(arabicScript['\u0601']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
        ---
    */
    struct script
    {
        mixin SetSearcher!(scripts.tab, "script");
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - $(D unicode.propertyName.propertyValue) for compile-time
        checked access and $(D unicode.propertyName(propertyValue))
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.

        Example:
        ---
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach(vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
        ---
    */
    struct hangulSyllableType
    {
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time counterpart of loadAny: true if name denotes a hand-made
    // property, a table property, a script, or ("In" prefix) a block.
    static bool findAny(string name)
    {
        // the length guard mirrors loadAny: without it slicing name[0..2]
        // of a shorter unknown name fails before the intended diagnostic
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0..2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Tries properties, then scripts, then "In"-prefixed blocks;
    // throws if name matches none of them.
    static auto loadAny(Set=CodepointSet, C)(in C[] name) pure
    {
        Set set;
        bool loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0..2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if(loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}
|
||
|
||
// Exercises compile-time and run-time lookup through $(D unicode).
unittest
{
    auto ascii = unicode.ASCII;
    assert(ascii['A']);
    assert(ascii['~']);
    assert(!ascii['\u00e0']);
    // matching is case-insensitive
    assert(ascii == unicode.ascII);
    assert(!ascii['à']);
    // underscores, '-' and whitespace in names are ignored too
    auto latin = unicode.Inlatin1_Supplement;
    assert(latin['à']);
    assert(!latin['$']);
    // BTW Latin 1 Supplement is a block, hence "In" prefix
    assert(latin == unicode("In Latin 1 Supplement"));
    import std.exception;
    // R-T look up throws if no such set is found
    assert(collectException(unicode("InCyrilliac")));
    assert(collectException(unicode("X")));

    assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);

    // L here is explicitly syllable type not "Letter" as in unicode.L
    auto leadingVowel = unicode.hangulSyllableType("L");
    // check that some leading vowels are present
    foreach(vowel; '\u1110'..'\u115F'+1)
        assert(leadingVowel[vowel]);
    assert(leadingVowel == unicode.hangulSyllableType.L);

    auto arabicScript = unicode.script.arabic;
    auto arabicBlock = unicode.block.arabic;
    // there is an intersection between script and block
    // (U+0601 is spelled as an escape: it is an invisible format character)
    assert(arabicBlock['\u0601']);
    assert(arabicScript['\u0601']);
    // but they are different
    assert(arabicBlock != arabicScript);
    assert(arabicBlock == unicode.inArabic);
    assert(arabicScript == unicode.arabic);
}
|
||
|
||
// Run-time lookups must agree with sets decoded directly from the tables.
unittest
{
    auto hebrew = asSet(blocks.Hebrew);
    assert(unicode("InHebrew") == hebrew);

    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);

    // '-' in the name is ignored
    auto kharoshthi = asSet(blocks.Kharoshthi);
    assert(unicode("In-Kharoshthi") == kharoshthi);
}
|
||
|
||
// from what gen_uni uses internally
enum ushort EMPTY_CASE_TRIE = ushort.max;

// control - '\r'
// Case labels to be mixed into a switch over a code point.
enum string controlSwitch = `
case '\u0000':..case '\u0008':case '\u000E':..case '\u001F':case '\u007F':..case '\u0084':case '\u0086':..case '\u009F': case '\u0009':..case '\u000C': case '\u0085':
`;
|
||
// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
|
||
// kill unrolled switches
|
||
|
||
// True for code points in the inclusive range U+1F1E6 .. U+1F1FF.
private static bool isRegionalIndicator(dchar ch) @safe
{
    return '\U0001F1E6' <= ch && ch <= '\U0001F1FF';
}
|
||
|
||
// Consumes one grapheme cluster from the front of a range of code points.
// With getValue == true the cluster is collected and returned as a Grapheme;
// otherwise the range is only advanced and nothing is returned.
template genericDecodeGrapheme(bool getValue)
{
    alias graphemeExtend = graphemeExtendTrie;
    alias spacingMark = mcTrie;
    static if(getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        // what has been consumed so far
        enum GraphemeState {
            Start, // nothing yet
            CR,    // a carriage return
            RI,    // regional indicator(s)
            L,     // Hangul leading jamo
            V,     // Hangul vowel jamo or LV syllable
            LVT    // Hangul trailing jamo or LVT syllable
        }
        static if(getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        // accepts the current code point: records it (if requested) and advances
        enum eat = q{
            static if(getValue)
                grapheme ~= ch;
            range.popFront();
        };

        dchar ch;
        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
        while(!range.empty)
        {
            ch = range.front;
            final switch(state) with(GraphemeState)
            {
            case Start:
                // the first code point always belongs to the cluster
                mixin(eat);
                if(ch == '\r')
                    state = CR;
                else if(isRegionalIndicator(ch))
                    state = RI;
                else if(isHangL(ch))
                    state = L;
                else if(hangLV[ch] || isHangV(ch))
                    state = V;
                else if(hangLVT[ch])
                    state = LVT;
                else if(isHangT(ch))
                    state = LVT;
                else
                {
                    switch(ch)
                    {
                    mixin(controlSwitch);
                        // controls never take extenders
                        goto L_End;
                    default:
                        goto L_End_Extend;
                    }
                }
                break;
            case CR:
                // CR LF stays together; like any other control, neither CR
                // nor CR LF takes trailing extend/spacing marks (UAX #29 GB3-GB5)
                if(ch == '\n')
                    mixin(eat);
                goto L_End;
            case RI:
                if(isRegionalIndicator(ch))
                    mixin(eat);
                else
                    goto L_End_Extend;
                break;
            case L:
                if(isHangL(ch))
                    mixin(eat);
                else if(isHangV(ch) || hangLV[ch])
                {
                    state = V;
                    mixin(eat);
                }
                else if(hangLVT[ch])
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
                break;
            case V:
                if(isHangV(ch))
                    mixin(eat);
                else if(isHangT(ch))
                {
                    state = LVT;
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
                break;
            case LVT:
                if(isHangT(ch))
                {
                    mixin(eat);
                }
                else
                    goto L_End_Extend;
                break;
            }
        }
    L_End_Extend:
        // absorb any trailing extend & spacing marks
        while(!range.empty)
        {
            ch = range.front;
            if(!graphemeExtend[ch] && !spacingMark[ch])
                break;
            mixin(eat);
        }
    L_End:
        static if(getValue)
            return grapheme;
    }

}
|
||
|
||
@trusted:
|
||
public: // Public API continues
|
||
|
||
/++
|
||
Returns the length of grapheme cluster starting at $(D index).
|
||
Both the resulting length and the $(D index) are measured
|
||
in $(S_LINK Code unit, code units).
|
||
|
||
Example:
|
||
---
|
||
// ASCII as usual is 1 code unit, 1 code point etc.
|
||
assert(graphemeStride("  ", 1) == 1);
|
||
// A + combining ring above
|
||
string city = "A\u030Arhus";
|
||
size_t first = graphemeStride(city, 0);
|
||
assert(first == 3); //\u030A has 2 UTF-8 code units
|
||
assert(city[0..first] == "A\u030A");
|
||
assert(city[first..$] == "rhus");
|
||
---
|
||
+/
|
||
size_t graphemeStride(C)(in C[] input, size_t index)
    if(is(C : dchar))
{
    // decode one cluster off a slice starting at index and report
    // how many code units the decoder consumed
    auto tail = input[index..$];
    immutable unitsBefore = tail.length;
    genericDecodeGrapheme!false(tail);
    return unitsBefore - tail.length;
}
|
||
|
||
// for now tested separately see test_grapheme.d
|
||
unittest
{
    // ASCII as usual is 1 code unit, 1 code point etc.
    // The literal holds two characters so that index 1 addresses a real one;
    // a starting index equal to the length would hand the decoder an empty slice.
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0..first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
|
||
|
||
/++
|
||
Reads one full grapheme cluster from an input range of dchar $(D inp).
|
||
|
||
For examples see the $(LREF Grapheme) below.
|
||
|
||
Note:
|
||
This function modifies $(D inp) and thus $(D inp)
|
||
must be an L-value.
|
||
+/
|
||
Grapheme decodeGrapheme(Input)(ref Input inp)
    if(isInputRange!Input && is(Unqual!(ElementType!Input) == dchar))
{
    // the value-producing flavour of the shared decoder
    alias decodeOne = genericDecodeGrapheme!true;
    return decodeOne(inp);
}
|
||
|
||
unittest
{
    Grapheme gr;
    string s = " \u0020\u0308 ";

    // a lone space first
    gr = decodeGrapheme(s);
    assert(gr.length == 1);
    assert(gr[0] == ' ');

    // then a space together with its combining diaeresis
    gr = decodeGrapheme(s);
    assert(gr.length == 2);
    assert(equalS(gr[0..2], " \u0308"));

    // combining marks with no base still group together
    s = "\u0300\u0308\u1100";
    auto marks = decodeGrapheme(s);
    assert(equalS(marks[], "\u0300\u0308"));
    auto leadJamo = decodeGrapheme(s);
    assert(equalS(leadJamo[], "\u1100"));

    // a trailing jamo keeps its mark, the following syllable stands alone
    s = "\u11A8\u0308\uAC01";
    auto trailJamo = decodeGrapheme(s);
    assert(equalS(trailJamo[], "\u11A8\u0308"));
    auto syllable = decodeGrapheme(s);
    assert(equalS(syllable[], "\uAC01"));
}
|
||
|
||
/++
|
||
$(P Iterate a string by grapheme.)
|
||
|
||
$(P Useful for doing string manipulation that needs to be aware
|
||
of graphemes.)
|
||
|
||
See_Also:
|
||
$(LREF byCodePoint)
|
||
+/
|
||
auto byGrapheme(Range)(Range range)
    if(isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // TODO: Bidirectional access
    static struct Result
    {
        private Range _range;
        // the cluster decoded last; a zero-length Grapheme marks exhaustion
        private Grapheme _front;

        bool empty() @property
        {
            return _front.length == 0;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            if(_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if(isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    // prime the range so that front holds the first cluster
    auto primed = Result(range);
    primed.popFront();
    return primed;
}
|
||
|
||
///
|
||
unittest
{
    auto word = "noe\u0308l"; // noël using e + combining diaeresis
    assert(word.walkLength == 5); // 5 code points

    auto clusters = word.byGrapheme;
    assert(clusters.walkLength == 4); // 4 graphemes

    // the first three graphemes cover "noë", the rest is the final "l"
    auto head = clusters.take(3);
    auto rest = clusters.drop(3);
    assert(head.equal("noe\u0308".byGrapheme));
    assert(rest.equal("l".byGrapheme));
}
|
||
|
||
// For testing non-forward-range input ranges
|
||
version(unittest)
private static struct InputRangeString
{
    // wrapped string; no save is exposed, so this is an input range only
    private string s;

    bool empty() @property
    {
        return s.empty;
    }

    dchar front() @property
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
|
||
|
||
unittest
{
    // nothing to iterate in an empty string
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto reversedClusters = reverse.byGrapheme;
    assert(reversedClusters.walkLength == 4);

    // same checks across all three string widths
    foreach(sample; TypeTuple!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {
        static assert(isForwardRange!(typeof(sample)));
        assert(sample.walkLength == 5);

        auto clusters = sample.byGrapheme;
        static assert(isForwardRange!(typeof(clusters)));
        assert(clusters.walkLength == 4);
        assert(clusters.array.retro.equal(reversedClusters));
    }

    // an input-only source must yield an input-only result
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}
|
||
|
||
/++
|
||
$(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
|
||
|
||
$(P Useful for converting the result to a string after doing operations
|
||
on graphemes.)
|
||
|
||
$(P Acts as the identity function when given a range of code points.)
|
||
+/
|
||
auto byCodePoint(Range)(Range range)
    if(isInputRange!Range && is(Unqual!(ElementType!Range) == Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // position of the current code point inside _range.front
        private size_t i = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        void popFront()
        {
            // still inside the current grapheme?
            if(++i < _range.front.length)
                return;
            // otherwise move on to the first code point of the next one
            _range.popFront();
            i = 0;
        }

        static if(isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
Range byCodePoint(Range)(Range range)
    if(isInputRange!Range && is(Unqual!(ElementType!Range) == dchar))
{
    // already a range of code points: hand it back untouched
    return range;
}
|
||
|
||
///
|
||
unittest
{
    import std.conv : text;

    string s = "noe\u0308l"; // noël

    // reverse it grapheme by grapheme ...
    auto backwards = s.byGrapheme.array.retro;
    // ... and convert the result back to a string
    string reverse = backwards.byCodePoint.text;

    assert(reverse == "le\u0308on"); // lëon
}
|
||
|
||
unittest
{
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // a range of code points passes through with its type unchanged
    static assert(is(typeof(text.byCodePoint) == string));

    // input-only source => input-only graphemes => input-only code points
    auto clusters = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(clusters)));

    auto codePoints = clusters.byCodePoint;
    static assert(!isForwardRange!(typeof(codePoints)));

    assert(codePoints.walkLength == text.walkLength);
}
|
||
|
||
/++
|
||
$(P A structure designed to effectively pack $(CHARACTERS)
|
||
of a $(CLUSTER).
|
||
)
|
||
|
||
$(P $(D Grapheme) has value semantics so 2 copies of a $(D Grapheme)
|
||
always refer to distinct objects. In most actual scenarios a $(D Grapheme)
|
||
fits on the stack and avoids memory allocation overhead for all but quite
|
||
long clusters.
|
||
)
|
||
|
||
Example:
|
||
---
|
||
import std.algorithm;
|
||
string bold = "ku\u0308hn";
|
||
|
||
// note that decodeGrapheme takes parameter by ref
|
||
// slicing a grapheme yields a range of dchar
|
||
assert(decodeGrapheme(bold)[].equal("k"));
|
||
|
||
// the next grapheme is 2 characters long
|
||
auto wideOne = decodeGrapheme(bold);
|
||
assert(wideOne.length == 2);
|
||
assert(wideOne[].equal("u\u0308"));
|
||
|
||
// the usual range manipulation is possible
|
||
assert(wideOne[].filter!isMark.equal("\u0308"));
|
||
---
|
||
$(P See also $(LREF decodeGrapheme), $(LREF graphemeStride). )
|
||
+/
|
||
@trusted struct Grapheme
{
public:
    // Builds a cluster from the given code points, appended in order.
    this(C)(in C[] chars...)
        if(is(C : dchar))
    {
        this ~= chars;
    }

    // Builds a cluster from any non-array input range of code points.
    this(Input)(Input seq)
        if(!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const pure nothrow
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) $(D ch) at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).

        Example:
        ---
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
        ---
    +/
    void opIndexAssign(dchar ch, size_t index) pure nothrow
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    @system auto opSlice(size_t a, size_t b) pure nothrow
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    @system auto opSlice() pure nothrow
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const pure nothrow
    {
        return isBig ? len_ : smallLength;
    }

    /++
        Append $(CHARACTER) $(D ch) to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(D valid).

        Example:
        ---
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
        ---
        See also $(LREF Grapheme.valid) below.
    +/
    ref opOpAssign(string op)(dchar ch)
    {
        static if(op == "~")
        {
            if(!isBig)
            {
                if(slen_ + 1 > small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if(len_ + 1 > cap_)
            {
                // Commit the new capacity only after the reallocation has
                // succeeded: enforce throws when realloc returns null, and in
                // that case cap_ must keep describing the buffer ptr_ owns.
                immutable newCap = cap_ + grow;
                ptr_ = cast(ubyte*)enforce(realloc(ptr_, 3*(newCap+1)));
                cap_ = newCap;
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /// Append all $(CHARACTERS) from the input range $(D inp) to this Grapheme.
    ref opOpAssign(string op, Input)(Input inp)
        if(isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if(op == "~")
        {
            foreach(dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid $(D Grapheme).

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // valid iff decoding a single cluster consumes every code point
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy of a heap-backed grapheme gets its own buffer.
    this(this)
    {
        if(isBig)
        {// dup it
            auto raw_cap = 3*(cap_+1);
            auto p = cast(ubyte*)enforce(malloc(raw_cap));
            p[0..raw_cap] = ptr_[0..raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if there is one.
    ~this()
    {
        if(isBig)
        {
            free(ptr_);
        }
    }


private:
    // bytes available for in-place storage; the last byte of the struct is slen_
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // each code point takes 3 bytes
    enum small_cap = small_bytes/3;
    // the top bit of slen_ flags heap storage, the low 7 bits hold the small length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // in-place ("small") representation
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the in-place code points to a freshly allocated heap buffer.
    void convertToBig()
    {
        size_t k = smallLength;
        ubyte* p = cast(ubyte*)enforce(malloc(3*(grow+1)));
        for(size_t i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = k;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig(){ slen_ |= small_flag; }

    @property size_t smallLength() const pure nothrow
    {
        return slen_ & small_mask;
    }
    // non-zero when the heap representation is active
    @property ubyte isBig() const pure nothrow
    {
        return slen_ & small_flag;
    }
}
|
||
|
||
static assert(Grapheme.sizeof == size_t.sizeof*4);
|
||
|
||
// verify the example
|
||
unittest
{
    import std.algorithm;
    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    auto first = decodeGrapheme(bold);
    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    assert(wideOne.length == 2);
    // slicing a grapheme yields a random-access range of dchar
    static assert(isRandomAccessRange!(typeof(wideOne[])));
    assert(wideOne[].equalS("u\u0308"));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equalS("\u0308"));

    // appending a mark keeps the cluster valid ...
    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equalS("A\u0301"));
    assert(g.valid);

    // ... appending another base character does not
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equalS("A\u0301B"));
}
|
||
|
||
unittest
{
    // starts out as a base letter plus a combining mark
    auto cluster = Grapheme("A\u0302");
    assert(cluster[0] == 'A');
    assert(cluster.valid);

    // overwrite the mark in place
    cluster[1] = '~'; // ASCII tilde is not a combining mark
    assert(cluster[1] == '~');
    assert(!cluster.valid);
}
|
||
|
||
unittest
{
    // not valid clusters (but it just a test)
    auto g = Grapheme('a', 'b', 'c', 'd', 'e');
    foreach(i, c; "abcde"d)
        assert(g[i] == c);

    // overwriting one slot leaves its neighbours alone
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // keep appending and re-check every code point
    g ~= 'ц';
    g ~= '~';
    immutable dchar[] expected = ['a', 'b', 'c', 'Й', 'e', 'ц', '~'];
    foreach(i, c; expected)
        assert(g[i] == c);
    assert(!g.valid);

    // a copy is independent of the original
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equalS(g[2..g.length], copy[2..copy.length]));

    // assignment of a longer cluster followed by more appends
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equalS(copy[0..8], "АБВГДЕЁЖ"), text(copy[0..8]));
    copy ~= "xyz";
    assert(equalS(copy[13..15], "xy"), text(copy[13..15]));
    assert(!copy.valid);

    // build 'A'..'Z' one code point at a time
    Grapheme h;
    foreach(dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equalS(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
|
||
|
||
/++
|
||
$(P Does basic case-insensitive comparison of strings $(D str1) and $(D str2).
|
||
This function uses simpler comparison rule thus achieving better performance
|
||
than $(LREF icmp). However keep in mind the warning below.)
|
||
|
||
Warning:
|
||
This function only handles 1:1 $(CODEPOINT) mapping
|
||
and thus is not sufficient for certain alphabets
|
||
like German, Greek and few others.
|
||
|
||
Example:
|
||
---
|
||
assert(sicmp("Август", "авгусТ") == 0);
|
||
// Greek also works as long as there is no 1:M mapping in sight
|
||
assert(sicmp("ΌΎ", "όύ") == 0);
|
||
// things like the following won't get matched as equal
|
||
// Greek small letter iota with dialytika and tonos
|
||
assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
|
||
|
||
// while icmp has no problem with that
|
||
assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
|
||
assert(icmp("ΌΎ", "όύ") == 0);
|
||
---
|
||
+/
|
||
int sicmp(S1, S2)(S1 str1, S2 str2)
    if(isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar)
    && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
{
    // Both inputs are walked with the range primitives only (no .length,
    // no index-based decoding), so that every range type admitted by the
    // constraint above actually works, not just arrays.
    alias sTable = simpleCaseTable;
    for(;;)
    {
        // str1 exhausted: equal if str2 is exhausted too, otherwise str1 is the shorter one
        if(str1.empty)
            return str2.empty ? 0 : -1;
        dchar lhs = str1.front;
        // str2 ran out first
        if(str2.empty)
            return 1;
        dchar rhs = str2.front;
        str1.popFront();
        str2.popFront();
        int diff = lhs - rhs;
        if(!diff)
            continue;
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if(idx != EMPTY_CASE_TRIE)
        {
            if(idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable[idx].n;
                idx2 = idx2 - sTable[idx2].n;
                if(idx == idx2)// one bucket, equivalent chars
                    continue;
                else// not the same bucket
                    diff = sTable[idx].ch - sTable[idx2].ch;
            }
            else
                diff = sTable[idx - sTable[idx].n].ch - rhs;
        }
        else if(idx2 != EMPTY_CASE_TRIE)
        {
            diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
        }
        // otherwise neither char is cased and the raw difference stands
        return diff;
    }
}
|
||
// overloads for the most common cases to reduce compile time
|
||
@safe pure /*TODO nothrow*/
{
    // non-template front ends that forward to the generic implementation
    int sicmp(const(char)[] str1, const(char)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }

    int sicmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }

    int sicmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        return sicmp!(typeof(str1), typeof(str2))(str1, str2);
    }
}
|
||
|
||
/*
    Helper of icmp: tries to match lhs against rhs followed by the start
    of rtail using the full case folding table.

    Returns 0 on a match; a multi code point match also pops the matched
    part off rtail. Otherwise returns lhs itself when it has no table
    entry, or the first code point of its bucket.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
    @trusted pure /*TODO nothrow*/
{
    alias table = fullCaseTable;
    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if(hit == EMPTY_CASE_TRIE)
        return lhs;
    immutable size_t first = hit - table[hit].n;
    immutable size_t pastLast = table[hit].size + first;
    assert(table[first].entry_len == 1);
    foreach(i; first..pastLast)
    {
        auto entryLen = table[i].entry_len;
        if(entryLen == 1)
        {
            // plain 1:1 folding
            if(table[i].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring folded = table[i].seq[0..entryLen];
        // note that skipOver modifies rtail, and only if the whole rest matches
        if(rhs == folded[0] && rtail.skipOver(folded[1..$]))
            return 0;
    }
    return table[first].seq[0]; // new remapped character for accurate diffs
}
|
||
|
||
/++
|
||
$(P Does case insensitive comparison of $(D str1) and $(D str2).
|
||
Follows the rules of full case-folding mapping.
|
||
This includes matching as equal german ß with "ss" and
|
||
other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
|
||
The cost of $(D icmp) being pedantically correct is
|
||
slightly worse performance.
|
||
)
|
||
|
||
Example:
|
||
---
|
||
assert(icmp("Rußland", "Russland") == 0);
|
||
assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
|
||
---
|
||
+/
|
||
int icmp(S1, S2)(S1 str1, S2 str2)
    if(isForwardRange!S1 && is(Unqual!(ElementType!S1) == dchar)
    && isForwardRange!S2 && is(Unqual!(ElementType!S2) == dchar))
{
    while(!str1.empty)
    {
        dchar left = str1.front;
        // str2 ran out first
        if(str2.empty)
            return 1;
        dchar right = str2.front;
        str1.popFront();
        str2.popFront();
        if(left == right)
            continue;
        // first try to match left to <right,right-tail> sequence
        immutable int viaRight = fullCasedCmp(left, right, str2);
        if(viaRight == 0)
            continue;
        // then right to <left,left-tail> sequence
        immutable int viaLeft = fullCasedCmp(right, left, str1);
        if(viaLeft == 0)
            continue;
        // both values are remapped codepoints,
        // their difference gives icmp a stable ordering
        return viaRight - viaLeft;
    }
    // str1 exhausted: equal only if str2 is exhausted as well
    return str2.empty ? 0 : -1;
}
|
||
// overloads for the most common cases to reduce compile time
|
||
@safe pure /*TODO nothrow*/
{
    // non-template front ends that forward to the generic implementation
    int icmp(const(char)[] str1, const(char)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }

    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }

    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        return icmp!(typeof(str1), typeof(str2))(str1, str2);
    }
}
|
||
|
||
unittest
{
    assertCTFEable!(
    {
    // checks shared by both comparison functions
    foreach(cfunc; TypeTuple!(icmp, sicmp))
    {
        foreach(S1; TypeTuple!(string, wstring, dstring))
        foreach(S2; TypeTuple!(string, wstring, dstring))
        {
            // empty operands and prefixes
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            // mixed case, ASCII and beyond
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto words = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(words);
        assert(words == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
    }
    // 1:M mappings are icmp territory only
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    //bugzilla 11057
    assert( icmp("K", "L") < 0 );
    });
}
|
||
|
||
// This is package for the moment to be used as a support tool for std.regex
|
||
// It needs a better API
|
||
/*
|
||
Return a range of all $(CODEPOINTS) that casefold to
|
||
and from this $(D ch).
|
||
*/
|
||
package auto simpleCaseFoldings(dchar ch)
{
    alias sTable = simpleCaseTable;
    // Either a run of sTable entries ("large") or a single stored code point ("small").
    static struct Range
    {
    pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }
        @property bool isSmall() const { return idx == uint.max; }

        // small form: just the character itself
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // large form: size consecutive table entries beginning at start
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if(!isSmall)
                return sTable[idx].ch;
            return c;
        }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property uint length() const
        {
            if(!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if(isSmall)
            {
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }

    immutable slot = simpleCaseTrie[ch];
    // no entry at all: the range holds just ch
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[slot];
    // step back to the first entry of the bucket ch belongs to
    immutable start = slot - entry.n;
    return Range(start, entry.size);
}
|
||
|
||
unittest
{
    assertCTFEable!((){
        // a cased letter folds to and from its counterpart
        auto cyrillic = simpleCaseFoldings('Э').array;
        assert(cyrillic.length == 2);
        assert(cyrillic.canFind('э'));
        assert(cyrillic.canFind('Э'));

        // a character with no case yields only itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equalS("~"));

        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
|
||
|
||
/++
|
||
$(P Returns the $(S_LINK Combining class, combining class) of $(D ch).)
|
||
|
||
Example:
|
||
---
|
||
// shorten the code
|
||
alias CC = combiningClass;
|
||
|
||
// combining tilde
|
||
assert(CC('\u0303') == 230);
|
||
// combining ring below
|
||
assert(CC('\u0325') == 220);
|
||
// the simple consequence is that "tilde" should be
|
||
// placed after a "ring below" in a sequence
|
||
---
|
||
+/
|
||
ubyte combiningClass(dchar ch)
{
    // a plain lookup in the precomputed trie
    immutable cls = combiningClassTrie[ch];
    return cls;
}
|
||
|
||
unittest
{
    // everything in the ASCII range has class 0
    foreach(ch; 0..0x80)
        assert(combiningClass(ch) == 0);

    // a few spot checks
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u1939') == 222);
}
|
||
|
||
/// Unicode character decomposition type.
|
||
enum UnicodeDecomposition
{
    /// Canonical decomposition. The result is canonically equivalent sequence.
    Canonical,
    /**
        Compatibility decomposition. The result is compatibility equivalent sequence.
        Note: Compatibility decomposition is a $(B lossy) conversion,
        typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}

/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum
{
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
|
||
|
||
/++
|
||
Try to canonically compose 2 $(CHARACTERS).
|
||
Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
|
||
|
||
The assumption is that $(D first) comes before $(D second) in the original text,
|
||
usually meaning that the first is a starter.
|
||
|
||
Note: Hangul syllables are not covered by this function.
|
||
See $(D composeJamo) below.
|
||
|
||
Example:
|
||
---
|
||
assert(compose('A','\u0308') == '\u00C4');
|
||
assert(compose('A', 'B') == dchar.init);
|
||
assert(compose('C', '\u0301') == '\u0106');
|
||
// note that the starter is the first one
|
||
// thus the following doesn't compose
|
||
assert(compose('\u0308', 'A') == dchar.init);
|
||
---
|
||
+/
|
||
public dchar compose(dchar first, dchar second)
{
    import std.internal.unicode_comp;
    immutable size_t packed = compositionJumpTrie[first];
    // ushort.max: first has no run in the composition table
    if(packed == ushort.max)
        return dchar.init;
    // unpack offset and length
    immutable size_t offset = packed & composeIdxMask;
    immutable size_t count = packed >> composeCntShift;
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto candidates = compositionTable[offset..offset+count].map!"a.rhs"().assumeSorted();
    // number of candidates ordered strictly before second
    auto target = candidates.lowerBound(second).length;
    if(target != count)
    {
        auto entry = compositionTable[offset+target];
        if(entry.rhs == second)
            return entry.composed;
    }
    return dchar.init;
}
|
||
|
||
/++
|
||
Returns a full $(S_LINK Canonical decomposition, Canonical)
|
||
(by default) or $(S_LINK Compatibility decomposition, Compatibility)
|
||
decomposition of $(CHARACTER) $(D ch).
|
||
If no decomposition is available returns a $(LREF Grapheme)
|
||
with the $(D ch) itself.
|
||
|
||
Note:
|
||
This function also decomposes hangul syllables
|
||
as prescribed by the standard.
|
||
See also $(LREF decomposeHangul) for a restricted version
|
||
that takes into account only hangul syllables but
|
||
no other decompositions.
|
||
|
||
Example:
|
||
---
|
||
import std.algorithm;
|
||
assert(decompose('Ĉ')[].equal("C\u0302"));
|
||
assert(decompose('D')[].equal("D"));
|
||
assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
|
||
assert(decompose!Compatibility('¹')[].equal("1"));
|
||
---
|
||
+/
|
||
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch)
{
    import std.internal.unicode_decomp;
    // pick the table and index trie matching the requested decomposition
    static if(decompType == Canonical)
    {
        alias table = decompCanonTable;
        alias mapping = canonMappingTrie;
    }
    else static if(decompType == Compatibility)
    {
        alias table = decompCompatTable;
        alias mapping = compatMappingTrie;
    }
    immutable ushort slot = mapping[ch];
    // not found, check hangul arithmetic decomposition
    if(slot == 0)
        return decomposeHangul(ch);
    // the stored sequence runs up to the next 0 in the table
    return Grapheme(table[slot..$].until(0));
}
|
||
|
||
unittest
{
    import std.algorithm;

    // verify examples: composition
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('C', '\u0301') == '\u0106');
    assert(compose('A', 'B') == dchar.init);
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    // verify examples: decomposition
    auto accented = decompose('Ĉ');
    assert(accented[].equalS("C\u0302"));
    auto plain = decompose('D');
    assert(plain[].equalS("D"));
    auto hangul = decompose('\uD4DC');
    assert(hangul[].equalS("\u1111\u1171\u11B7"));
    auto superscript = decompose!Compatibility('¹');
    assert(superscript[].equalS("1"));
}
|
||
|
||
//----------------------------------------------------------------------------
|
||
// Hangul specific composition/decomposition
|
||
// first code point of each range: precomposed syllables (S),
// leading consonants (L), vowels (V) and trailing consonants (T)
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;
// sizes of the L, V and T index spaces
enum jamoLCount = 19;
enum jamoVCount = 21;
enum jamoTCount = 28;
// syllables per leading consonant, and syllables in total
enum jamoNCount = jamoVCount * jamoTCount;
enum jamoSCount = jamoLCount * jamoNCount;
|
||
|
||
// Tests if $(D ch) is a Hangul leading consonant jamo.
bool isJamoL(dchar ch)
{
    enum pastLast = jamoLBase + jamoLCount;
    // the upper bound goes first: it rejects ~ 1M code points above leading jamo range
    return ch < pastLast && ch >= jamoLBase;
}
|
||
|
||
// Tests if $(D ch) is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch)
{
    enum pastLast = jamoTBase + jamoTCount;
    // the upper bound goes first: it rejects ~ 1M code points above trailing jamo range
    // Note: the lower bound is strict, ch == jamoTBase doesn't indicate
    // trailing jamo (TIndex must be > 0)
    return ch < pastLast && ch > jamoTBase;
}
|
||
|
||
// Tests if $(D ch) is a Hangul vowel jamo.
bool isJamoV(dchar ch)
{
    enum pastLast = jamoVBase + jamoVCount;
    // the upper bound goes first: it rejects ~ 1M code points above vowel range
    return ch < pastLast && ch >= jamoVBase;
}
|
||
|
||
// Offset of ch from jamoSBase, or -1 when that offset falls outside [0, jamoSCount).
int hangulSyllableIndex(dchar ch)
{
    immutable int offset = cast(int)ch - jamoSBase;
    if(offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
|
||
|
||
// internal helper: composes <L, V> and <L, V, T> jamo runs of seq in place.
// The syllable is stored in the slot of the leading jamo, the slots of the
// consumed jamos are overwritten with dchar.init ("holes").
void hangulRecompose(dchar[] seq)
{
    size_t pos = 0;
    while(pos + 1 < seq.length)
    {
        // a syllable can only start with a leading consonant followed by a vowel
        if(!(isJamoL(seq[pos]) && isJamoV(seq[pos+1])))
        {
            pos++;
            continue;
        }
        int indexL = seq[pos] - jamoLBase;
        int indexV = seq[pos+1] - jamoVBase;
        int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if(hasTrailing)
        {
            // LVT syllable: three slots collapse into one
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // LV syllable: two slots collapse into one
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
|
||
|
||
//----------------------------------------------------------------------------
|
||
public:
|
||
|
||
/**
|
||
Decomposes a Hangul syllable. If $(D ch) is not a composed syllable
|
||
then this function returns $(LREF Grapheme) containing only $(D ch) as is.
|
||
|
||
Example:
|
||
---
|
||
import std.algorithm;
|
||
assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
|
||
---
|
||
*/
|
||
Grapheme decomposeHangul(dchar ch)
{
    int syllable = cast(int)ch - jamoSBase;
    // not a precomposed syllable: hand back ch untouched
    if(syllable < 0 || syllable >= jamoSCount)
        return Grapheme(ch);

    // split the syllable offset into its L / V / T components
    int leadIdx = syllable / jamoNCount;
    int vowelIdx = (syllable % jamoNCount) / jamoTCount;
    int trailIdx = syllable % jamoTCount;

    int lead = jamoLBase + leadIdx;
    int vowel = jamoVBase + vowelIdx;
    // index 0 means "no trailing consonant": <L, V> decomposition
    if(trailIdx <= 0)
        return Grapheme(lead, vowel);
    // otherwise there is a trailing consonant (T): <L, V, T> decomposition
    return Grapheme(lead, vowel, jamoTBase + trailIdx);
}
|
||
|
||
/++
|
||
Try to compose hangul syllable out of a leading consonant ($(D lead)),
|
||
a $(D vowel) and optional $(D trailing) consonant jamos.
|
||
|
||
On success returns the composed LV or LVT hangul syllable.
|
||
|
||
If any of $(D lead) and $(D vowel) are not a valid hangul jamo
|
||
of the respective $(CHARACTER) class returns dchar.init.
|
||
|
||
Example:
|
||
---
|
||
assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
|
||
// leaving out the trailing consonant (T), or passing any codepoint
|
||
// that is not trailing consonant composes an LV-syllable
|
||
assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
|
||
assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
|
||
assert(composeJamo('\u1111', 'A') == dchar.init);
|
||
assert(composeJamo('A', '\u1171') == dchar.init);
|
||
---
|
||
+/
|
||
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init)
{
    // both the leading consonant and the vowel are mandatory
    if(!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;
    int indexL = lead - jamoLBase;
    int indexV = vowel - jamoVBase;
    int indexLV = indexL * jamoNCount + indexV * jamoTCount;
    dchar syllable = jamoSBase + indexLV;
    // anything that is not a trailing consonant is ignored, giving the LV syllable
    return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
}
|
||
|
||
unittest
{
    // decomposes ch using rule set T and compares the code points against r
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme parts = decompose!T(ch);
        assert(equalS(parts[], r), text(parts[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check the documented Hangul examples
    assert(equalS(decomposeHangul('\uD4DB')[], "\u1111\u1171\u11B6"));
    immutable dchar lvt = composeJamo('\u1111', '\u1171', '\u11B6');
    assert(lvt == '\uD4DB');
    // no trailing consonant, or something that is not one: LV syllable
    immutable dchar lv = composeJamo('\u1111', '\u1171');
    assert(lv == '\uD4CC');
    immutable dchar lvSpace = composeJamo('\u1111', '\u1171', ' ');
    assert(lvSpace == '\uD4CC');
    // invalid vowel / invalid leading consonant
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
|
||
|
||
/**
|
||
Enumeration type for normalization forms,
|
||
passed as template parameter for functions like $(LREF normalize).
|
||
*/
|
||
enum NormalizationForm
{
    NFC,  // canonical decomposition followed by recomposition
    NFD,  // canonical decomposition only
    NFKC, // compatibility decomposition followed by recomposition
    NFKD  // compatibility decomposition only
}
|
||
|
||
|
||
enum
{
    /**
        Shorthand aliases for values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
|
||
|
||
/++
|
||
Returns $(D input) string normalized to the chosen form.
|
||
Form C is used by default.
|
||
|
||
For more information on normalization forms see
|
||
the $(S_LINK Normalization, normalization section).
|
||
|
||
Note:
|
||
In cases where the string in question is already normalized,
|
||
it is returned unmodified and no memory allocation happens.
|
||
|
||
Example:
|
||
---
|
||
// any encoding works
|
||
wstring greet = "Hello world";
|
||
assert(normalize(greet) is greet); // the same exact slice
|
||
|
||
// An example of a character with all 4 forms being different:
|
||
// Greek upsilon with acute and hook symbol (code point 0x03D3)
|
||
assert(normalize!NFC("ϓ") == "\u03D3");
|
||
assert(normalize!NFD("ϓ") == "\u03D2\u0301");
|
||
assert(normalize!NFKC("ϓ") == "\u038E");
|
||
assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
|
||
---
|
||
+/
|
||
inout(C)[] normalize(NormalizationForm norm=NFC, C)(inout(C)[] input)
{
    // NFD & NFC decompose canonically, NFKD & NFKC by compatibility rules;
    // only the two "C" forms recompose afterwards
    static if(norm == NFD || norm == NFC)
        enum decompForm = Canonical;
    else
        enum decompForm = Compatibility;
    enum bool recomposing = (norm == NFC || norm == NFKC);

    auto anchors = splitNormalized!norm(input);
    // already normalized: return the very same slice
    if(anchors[0] == input.length && anchors[1] == input.length)
        return input;
    dchar[] points;   // decomposed code points of the current region
    points.reserve(31);
    ubyte[] classes;  // combining class of each entry of points
    classes.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // text in front of the region is fine as it is
        app.put(input[0..anchors[0]]);
        foreach(dchar ch; input[anchors[0]..anchors[1]])
            foreach(dchar c; decompose!decompForm(ch)[])
                points ~= c;
        classes.length = points.length;
        size_t runStart = 0;
        ubyte prevClass = 0;

        // put every run of non-starters into combining class order
        foreach(idx, dchar ch; points)
        {
            auto cls = combiningClass(ch);
            classes[idx] = cls;
            if(cls == 0 && prevClass != 0)
            {
                // a starter ends the run of non-starters: sort that run
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(classes[runStart..idx], points[runStart..idx]));
                runStart = points.length;
            }
            else if(cls != 0 && prevClass == 0)
            {
                // first non-starter after starters: a new run begins here
                runStart = idx;
            }
            prevClass = cls;
        }
        // sort the run that reaches the end of the region (if any)
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(classes[runStart..$], points[runStart..$]));
        static if(recomposing)
        {
            auto firstStarter = countUntil(classes, 0);
            if(firstStarter >= 0) // no starters?? no recomposition
            {
                size_t from = firstStarter;
                do
                {
                    from = recompose(from, points, classes);
                }
                while(from != points.length);
                // 2nd pass for hangul syllables
                hangulRecompose(points);
            }
            // recomposition leaves dchar.init in merged slots, squeeze them out
            auto kept = remove!("a == dchar.init", SwapStrategy.stable)(points);
            app.put(points[0 .. kept.length]);
        }
        else
            app.put(points);
        // reset the scratch buffers, keeping their capacity
        points.length = 0;
        points.assumeSafeAppend();
        classes.length = 0;
        classes.assumeSafeAppend();
        // and move on to whatever follows the region
        input = input[anchors[1]..$];
        anchors = splitNormalized!norm(input);
    }while(anchors[0] != input.length);
    app.put(input[0..anchors[0]]);
    return cast(inout(C)[])app.data;
}
|
||
|
||
unittest
{
    auto cjk = normalize!NFD("abc\uF904def");
    assert(cjk == "abc\u6ED1def", text(cjk));
    auto digits = normalize!NFKD("2¹⁰");
    assert(digits == "210", digits);
    auto umlaut = normalize!NFD("Äffin");
    assert(umlaut == "A\u0308ffin");

    // check example

    // any encoding works
    wstring greet = "Hello world";
    auto sameGreet = normalize(greet);
    assert(sameGreet is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    string upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
|
||
|
||
// Canonically recomposes onto the starter at input[start]; works in-place and
// mutates data. Every code point merged into the starter is replaced with
// dchar.init. Returns the index of the next starter that did not compose
// (or input.length when the end was reached).
private size_t recompose(size_t start, dchar[] input, ubyte[] ccc)
{
    assert(input.length == ccc.length);
    // In any character sequence beginning with a starter S
    // a character C is blocked from S if and only if there
    // is some character B between S and C, and either B
    // is a starter or it has the same or higher combining class as C.
    //------------------------
    // Applying to our case:
    // S is input[start]
    // blockingCC is the CCC of the last character between S and C that was
    // not merged; -1 keeps it out of the 0..255 range while there is none
    // C is input[pos]
    int blockingCC = -1;
    // input[start] is the starter itself, so scanning begins right after it
    size_t pos = start + 1;
    for(; pos != input.length; ++pos)
    {
        immutable int curCC = ccc[pos];
        if(curCC > blockingCC)
        {
            immutable dchar merged = compose(input[start], input[pos]);
            if(merged != dchar.init)
            {
                input[start] = merged;
                input[pos] = dchar.init;// put a sentinel
                // current was merged so its CCC shouldn't affect
                // composing with the next one
                continue;
            }
        }
        // not merged: it now stands between S and whatever follows;
        // if it is a starter itself this is the end of the loop
        blockingCC = curCC;
        if(blockingCC == 0)
            break;
    }
    return pos;
}
|
||
|
||
// returns tuple of 2 indexes that delimit:
// normalized text, piece that needs normalization and
// the rest of input starting with stable code point.
// Both indexes equal input.length when nothing needs normalization.
private auto splitNormalized(NormalizationForm norm, C)(const(C)[] input)
{
    ubyte prevCC = 0;

    foreach(idx, dchar ch; input)
    {
        // NFC only: everything below U+0300 is accepted without a lookup
        static if(norm == NFC)
            if(ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        ubyte curCC = combiningClass(ch);
        // trouble is either a non-starter with a lower combining class than
        // its predecessor, or a code point the quick-check table rejects
        if((prevCC > curCC && curCC != 0) || notAllowedIn!norm(ch))
            return seekStable!norm(idx, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
|
||
|
||
// Given the index idx of a code point that needs normalization, returns
// tuple(region_start, region_end) delimiting the piece of input to normalize:
// region_start is the index of the closest stable code point (combining class 0
// and allowed in norm) before idx, or 0 if there is none;
// region_end is the index of the first stable code point at or after idx,
// or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, in C[] input)
{
    import std.utf : codeLength;

    auto br = input[0..idx];
    size_t region_start = 0;// default
    // walk backwards from idx, one code point at a time
    while(!br.empty)
    {
        dchar ch = br.back;
        if(combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br still ends with ch, so this is the index where ch starts
            region_start = br.length - std.utf.codeLength!C(ch);
            break;
        }
        // drop the code point just inspected; popping the front instead would
        // leave br.back unchanged and always end up with region_start == 0
        br.popBack();
    }
    // @@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach(i, dchar ch; input[idx..$])
    {
        if(combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start..region_end]);
    return tuple(region_start, region_end);
}
|
||
|
||
/**
|
||
Tests if dchar $(D ch) is always allowed (Quick_Check=YES) in normalization
|
||
form $(D norm).
|
||
---
|
||
// e.g. Cyrillic is always allowed, so is ASCII
|
||
assert(allowedIn!NFC('я'));
|
||
assert(allowedIn!NFD('я'));
|
||
assert(allowedIn!NFKC('я'));
|
||
assert(allowedIn!NFKD('я'));
|
||
assert(allowedIn!NFC('Z'));
|
||
---
|
||
*/
|
||
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // simply the complement of the "not allowed" lookup
    immutable bool rejected = notAllowedIn!norm(ch);
    return !rejected;
}
|
||
|
||
// not user friendly name but more direct:
// looks ch up in the quick-check trie that belongs to norm
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if(norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if(norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if(norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if(norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // the condition must be false for the assert to fire: a bare string
        // would be taken as the (always true) condition
        static assert(0, "Unknown normalization form "~norm.stringof);
    return qcTrie[ch];
}
|
||
|
||
unittest
{
    // Cyrillic is always allowed, whatever the form
    immutable dchar ya = 'я';
    assert(allowedIn!NFC(ya));
    assert(allowedIn!NFD(ya));
    assert(allowedIn!NFKC(ya));
    assert(allowedIn!NFKD(ya));
    // and so is ASCII
    immutable dchar zed = 'Z';
    assert(allowedIn!NFC(zed));
}
|
||
|
||
}
|
||
|
||
version(std_uni_bootstrap)
|
||
{
|
||
// old version used for bootstrapping of gen_uni.d that generates
|
||
// up to date optimal versions of all of isXXX functions
|
||
@safe pure nothrow @nogc public bool isWhite(dchar c)
{
    // ASCII whitespace first
    if(std.ascii.isWhite(c))
        return true;
    // line and paragraph separators
    if(c == lineSep || c == paraSep)
        return true;
    if(c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E')
        return true;
    // contiguous range U+2000 .. U+200A
    if(c >= '\u2000' && c <= '\u200A')
        return true;
    return c == '\u202F' || c == '\u205F' || c == '\u3000';
}
|
||
}
|
||
else
|
||
{
|
||
|
||
// trusted -> avoid bounds check
@trusted pure nothrow
{
    // hide template instances behind functions (Bugzilla 13232)

    // lowercase: index lookups and table access
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }
    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }
    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // titlecase: index lookups and table access
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }
    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }
    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // uppercase: index lookups and table access
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }
    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }
    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
|
||
|
||
public:
|
||
|
||
/++
|
||
Whether or not $(D c) is a Unicode whitespace $(CHARACTER).
|
||
(general Unicode category: Part of C0(tab, vertical tab, form feed,
|
||
carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
|
||
+/
|
||
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // delegate to the pregenerated binary search
    immutable bool white = isWhiteGen(c);
    return white;
}
|
||
|
||
/++
|
||
Return whether $(D c) is a Unicode lowercase $(CHARACTER).
|
||
+/
|
||
@safe pure nothrow
bool isLower(dchar c)
{
    // non-ASCII code points are answered by the trie,
    // ASCII ones by the cheaper std.ascii check
    if(!std.ascii.isASCII(c))
        return lowerCaseTrie[c];
    return std.ascii.isLower(c);
}
|
||
|
||
@safe unittest
{
    // the ASCII range must agree with std.ascii
    foreach(v; 0..0x80)
        assert(std.ascii.isLower(v) == isLower(v));
    // Cyrillic
    assert(isLower('я'));
    assert(isLower('й'));
    assert(!isLower('Ж'));
    // Greek HETA, capital and small
    assert(!isLower('\u0370'));
    assert(isLower('\u0371'));
    // capital MU and small beta
    assert(!isLower('\u039C'));
    assert(isLower('\u03B2'));
    // from extended Greek
    assert(!isLower('\u1F18'));
    assert(isLower('\u1F00'));
    // every member of the lowercase set is lower and never upper
    foreach(v; unicode.lowerCase.byCodepoint)
        assert(isLower(v) && !isUpper(v));
}
|
||
|
||
|
||
/++
|
||
Return whether $(D c) is a Unicode uppercase $(CHARACTER).
|
||
+/
|
||
@safe pure nothrow
bool isUpper(dchar c)
{
    // non-ASCII code points are answered by the trie,
    // ASCII ones by the cheaper std.ascii check
    if(!std.ascii.isASCII(c))
        return upperCaseTrie[c];
    return std.ascii.isUpper(c);
}
|
||
|
||
@safe unittest
{
    // the ASCII range must agree with std.ascii
    // (this loop used to re-check isLower, leaving ASCII isUpper untested)
    foreach(v; 0..0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the uppercase set is upper and never lower
    foreach(v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !isLower(v));
}
|
||
|
||
|
||
/++
|
||
If $(D c) is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
|
||
is returned. Otherwise $(D c) is returned.
|
||
|
||
Warning: certain alphabets like German and Greek have no 1:1
|
||
upper-lower mapping. Use overload of toLower which takes full string instead.
|
||
+/
|
||
@safe pure nothrow
dchar toLower(dchar c)
{
    // fast path: below 0xAA only 'A'..'Z' change
    if(c < 0xAA)
    {
        immutable bool asciiUpper = c >= 'A' && c <= 'Z';
        if(asciiUpper)
            return c + 32;
        return c;
    }
    // ushort.max marks "no simple lowercase mapping"
    immutable size_t idx = toLowerSimpleIndex(c);
    return idx == ushort.max ? c : toLowerTab(idx);
}
|
||
|
||
//TODO: Hidden for now, needs better API.
|
||
//Other transforms could use better API as well, but this one is a new primitive.
|
||
@safe pure nothrow
private dchar toTitlecase(dchar c)
{
    // fast path: below 0xAA only 'a'..'z' change
    if(c < 0xAA)
    {
        immutable bool asciiLower = c >= 'a' && c <= 'z';
        if(asciiLower)
            return c - 32;
        return c;
    }
    // ushort.max marks "no simple titlecase mapping"
    immutable size_t idx = toTitleSimpleIndex(c);
    return idx == ushort.max ? c : toTitleTab(idx);
}
|
||
|
||
// (index function, simple-mapping limit, table accessor) bundles that are
// passed as a unit to the generic case-conversion templates
private alias UpperTriple =
    TypeTuple!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple =
    TypeTuple!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
|
||
|
||
// generic toUpper/toLower on whole string, creates new or returns as is.
// indexFn maps a code point to a table index (ushort.max = unaffected),
// indexes below maxIdx are 1:1 mappings, the rest are 1:m mappings.
private S toCase(alias indexFn, uint maxIdx, alias tableFn, S)(S s) @trusted pure
    if(isSomeString!S)
{
    foreach(i, dchar probe; s)
    {
        // skip ahead until the first code point that actually changes
        if(indexFn(probe) == ushort.max)
            continue;
        // the unaffected prefix is taken over as is
        auto result = appender!S(s[0..i]);
        result.reserve(s.length);
        foreach(dchar c; s[i .. $])
        {
            immutable ushort caseIdx = indexFn(c);
            if(caseIdx == ushort.max)
            {
                // unaffected
                result.put(c);
                continue;
            }
            if(caseIdx < maxIdx)
            {
                // 1:1 mapping
                c = tableFn(caseIdx);
                result.put(c);
                continue;
            }
            // 1:m mapping
            auto val = tableFn(caseIdx);
            // unpack length + codepoint
            uint len = val>>24;
            result.put(cast(dchar)(val & 0xFF_FFFF));
            foreach(j; caseIdx+1..caseIdx+len)
                result.put(tableFn(j));
        }
        return result.data;
    }
    // nothing was affected
    return s;
}
|
||
|
||
unittest //12428
{
    // a short slice in front of a much longer buffer
    auto whole = "abcdefghij".replicate(300);
    auto head = whole[0..10];

    toUpper(head);

    // the source slice must stay untouched
    assert(head == "abcdefghij");
}
|
||
|
||
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes c as UTF-8 into buf starting at idx and
// returns the index just past the last code unit written.
private size_t encodeTo(char[] buf, size_t idx, dchar c) @trusted pure
{
    // one code unit
    if (c <= 0x7F)
    {
        buf[idx] = cast(char)c;
        return idx + 1;
    }
    // two code units
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // three code units
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // four code units
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // beyond the Unicode code space
    assert(0);
}
|
||
|
||
unittest
{
    char[] buf = "abcd".dup;
    // one code unit
    size_t pos = encodeTo(buf, 0, 'X');
    assert(buf == "Xbcd");

    // two code units, written right after the first one
    pos = encodeTo(buf, pos, cast(dchar)'\u00A9');
    assert(buf == "X\xC2\xA9d");
}
|
||
|
||
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes c as UTF-16 into buf starting at idx and
// returns the index just past the last code unit written.
// Throws UTFException for an isolated surrogate code point.
private size_t encodeTo(wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf;
    // single code unit
    if (c <= 0xFFFF)
    {
        immutable bool surrogate = 0xD800 <= c && c <= 0xDFFF;
        if (surrogate)
            throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
        buf[idx] = cast(wchar)c;
        return idx + 1;
    }
    // surrogate pair
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }
    // beyond the Unicode code space
    assert(0);
}
|
||
|
||
// UTF-32 counterpart: stores c at buf[idx] and returns the following index.
private size_t encodeTo(dchar[] buf, size_t idx, dchar c) @trusted pure
{
    buf[idx] = c;
    return idx + 1;
}
|
||
|
||
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    import std.utf;
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    // in-buffer move of code units [from, to) so that they start at dest;
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller than the original;
        // later we may regain pace with a char that got bigger.
        // In the end it sometimes flip-flops between the 2 cases below
        if(dest != from)
        {
            // got to copy
            foreach(C unit; str[from..to])
                str[dest++] = unit;
            return dest;
        }
        return to;
    }

    size_t curIdx = 0;        // read position
    size_t destIdx = 0;       // write position
    size_t lastUnchanged = 0; // start of the pending run of untouched units
    while(curIdx != s.length)
    {
        size_t startIdx = curIdx;
        dchar ch = decode(s, curIdx);
        // TODO: special case for ASCII
        auto caseIndex = indexFn(ch);
        if(caseIndex == ushort.max) // unchanged, skip over
            continue;
        // flush the untouched run in front of the current code point
        destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
        lastUnchanged = curIdx;
        if(caseIndex >= maxIdx) // 1:m codepoint mapping, slow codepath
            return slowToCase(s, startIdx, destIdx);
        // 1:1 codepoint mapping
        dchar cased = tableFn(caseIndex);
        auto casedLen = codeLength!C(cased);
        if(casedLen + destIdx > curIdx) // no place to fit cased char
        {
            // switch to slow codepath, where we allocate
            return slowToCase(s, startIdx, destIdx);
        }
        destIdx = encodeTo(s, destIdx, cased);
        assert(destIdx <= curIdx);
    }
    // flush whatever untouched tail is left
    if(lastUnchanged != s.length)
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    s = s[0..destIdx];
}
|
||
|
||
// helper to precalculate size (in code units of C) of case-converted string
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(in C[] str)
    {
        import std.utf;
        size_t total = 0;          // length accumulated so far
        size_t lastNonTrivial = 0; // end of the last re-cased code point
        size_t curIdx = 0;
        while(curIdx != str.length)
        {
            size_t startIdx = curIdx;
            dchar ch = decode(str, curIdx);
            ushort caseIndex = indexFn(ch);
            if(caseIndex == ushort.max)
                continue;
            // account for the untouched run in front of this code point
            total += startIdx - lastNonTrivial;
            lastNonTrivial = curIdx;
            if(caseIndex < maxIdx)
            {
                // 1:1 mapping
                dchar cased = tableFn(caseIndex);
                total += codeLength!C(cased);
                continue;
            }
            // 1:m mapping: first entry packs length + codepoint
            auto val = tableFn(caseIndex);
            auto len = val>>24;
            dchar cased = val & 0xFF_FFFF;
            total += codeLength!C(cased);
            foreach(j; caseIndex+1..caseIndex+len)
                total += codeLength!C(tableFn(j));
        }
        // untouched tail
        if(lastNonTrivial != str.length)
            total += str.length - lastNonTrivial;
        return total;
    }
}
|
||
|
||
unittest
{
    import std.conv;
    alias lowerLen = toCaseLength!(LowerTriple);
    // pure ASCII: one code unit per character
    assert(lowerLen("abcd") == 4);
    // five two-unit Cyrillic letters plus three ASCII digits
    assert(lowerLen("аБВгд456") == 10+3);
}
|
||
|
||
// slower code path that preallocates and then copies
// case-converted stuff to the new string
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    // s[0..destIdx] is the part that was already converted in place,
    // conversion continues with the code point at curIdx
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        // exact size of the result, so there is a single allocation
        auto trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] ns = new C[trueLength];
        ns[0..destIdx] = s[0..destIdx];
        size_t lastUnchanged = curIdx;
        while(curIdx != s.length)
        {
            size_t startIdx = curIdx; // start of current codepoint
            dchar ch = decode(s, curIdx);
            auto caseIndex = indexFn(ch);
            if(caseIndex == ushort.max) // skip over
                continue;
            // copy the untouched run in front of the current code point
            auto pending = startIdx - lastUnchanged;
            ns[destIdx .. destIdx+pending] = s[lastUnchanged .. startIdx];
            lastUnchanged = curIdx;
            destIdx += pending;
            if(caseIndex < maxIdx) // 1:1 codepoint mapping
            {
                dchar cased = tableFn(caseIndex);
                destIdx = encodeTo(ns, destIdx, cased);
                continue;
            }
            // 1:m codepoint mapping, slow codepath
            auto val = tableFn(caseIndex);
            // unpack length + codepoint
            uint len = val>>24;
            destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
            foreach(j; caseIndex+1..caseIndex+len)
                destIdx = encodeTo(ns, destIdx, tableFn(j));
        }
        // copy the untouched tail
        if(lastUnchanged != s.length)
        {
            auto tail = s.length - lastUnchanged;
            ns[destIdx..destIdx+tail] = s[lastUnchanged..$];
            destIdx += tail;
        }
        assert(ns.length == destIdx);
        s = ns;
    }
}
|
||
|
||
/++
|
||
Converts $(D s) to lowercase (by performing Unicode lowercase mapping) in place.
|
||
For a few characters string length may increase after the transformation,
|
||
in such a case the function reallocates exactly once.
|
||
If $(D s) does not have any uppercase characters, then $(D s) is unaltered.
|
||
+/
|
||
void toLowerInPlace(C)(ref C[] s) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // generic in-place conversion driven by the lowercase tables
    toCaseInPlace!(LowerTriple)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }
    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }
    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}
|
||
|
||
/++
|
||
Converts $(D s) to uppercase (by performing Unicode uppercase mapping) in place.
|
||
For a few characters string length may increase after the transformation,
|
||
in such a case the function reallocates exactly once.
|
||
If $(D s) does not have any lowercase characters, then $(D s) is unaltered.
|
||
+/
|
||
void toUpperInPlace(C)(ref C[] s) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // generic in-place conversion driven by the uppercase tables
    toCaseInPlace!(UpperTriple)(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }
    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }
    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}
|
||
|
||
/++
|
||
Returns a string which is identical to $(D s) except that all of its
|
||
characters are converted to lowercase (by performing Unicode lowercase mapping).
|
||
If none of $(D s) characters were affected, then $(D s) itself is returned.
|
||
+/
|
||
S toLower(S)(S s) @trusted pure
    if(isSomeString!S)
{
    // generic conversion driven by the lowercase tables
    auto lowered = toCase!(LowerTriple)(s);
    return lowered;
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(string s)
    {
        return toLower!string(s);
    }
    wstring toLower(wstring s)
    {
        return toLower!wstring(s);
    }
    dstring toLower(dstring s)
    {
        return toLower!dstring(s);
    }
}
|
||
|
||
|
||
@trusted unittest //@@@BUG std.format is not @safe
{
    import std.string : format;
    // the ASCII range must agree with std.ascii
    foreach(ch; 0..0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    // an uppercase code point either stays as is or maps to a lowercase one
    foreach(ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        immutable bool fine = low == ch || isLower(low);
        assert(fine, format("%s -> %s", ch, low));
    }
    // whole-string overload
    assert(toLower("АЯ") == "ая");

    // sharp s: capital form to small, small form to "SS"
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
|
||
|
||
//bugzilla 9629
unittest
{
    wchar[] buffer = "hello þ world"w.dup;
    // convert a one-unit slice from the middle of the buffer
    auto thorn = buffer[6..7];
    toUpperInPlace(thorn);
    // the change has to be visible through the enclosing buffer
    assert(buffer == "hello Þ world");
}
|
||
|
||
|
||
unittest
{
    string s1 = "FoL";
    string s2 = toLower(s1);
    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);
    assert(s3 == s2);

    // two-unit code points whose lowercase is two units as well
    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    toLowerInPlace(s3);
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    toLowerInPlace(s3);
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    assert(s3 == s2);

    // one code point that lowercases to two
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // bugzilla 12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extend on 12455 report - check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
|
||
|
||
|
||
/++
|
||
If $(D c) is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
|
||
is returned. Otherwise $(D c) is returned.
|
||
|
||
Warning:
|
||
Certain alphabets like German and Greek have no 1:1
|
||
upper-lower mapping. Use overload of toUpper which takes full string instead.
|
||
+/
|
||
@safe pure nothrow
dchar toUpper(dchar c)
{
    // fast path: below 0xAA only 'a'..'z' change
    if(c < 0xAA)
    {
        immutable bool asciiLower = c >= 'a' && c <= 'z';
        if(asciiLower)
            return c - 32;
        return c;
    }
    // ushort.max marks "no simple uppercase mapping"
    immutable size_t idx = toUpperSimpleIndex(c);
    return idx == ushort.max ? c : toUpperTab(idx);
}
|
||
|
||
@trusted unittest
{
    import std.string : format;
    // the ASCII range must agree with std.ascii
    foreach(ch; 0..0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    auto title = unicode.Titlecase_Letter;
    // a lowercase code point stays as is or maps to an upper/titlecase one
    foreach(ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        immutable bool fine = up == ch || isUpper(up) || title[up];
        assert(fine, format("%x -> %x", ch, up));
    }
}
|
||
|
||
/++
|
||
Returns a string which is identical to $(D s) except that all of its
|
||
characters are converted to uppercase (by performing Unicode uppercase mapping).
|
||
If none of $(D s) characters were affected, then $(D s) itself is returned.
|
||
+/
|
||
S toUpper(S)(S s) @trusted pure
    if(isSomeString!S)
{
    // generic conversion driven by the uppercase tables
    auto raised = toCase!(UpperTriple)(s);
    return raised;
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(string s)
    {
        return toUpper!string(s);
    }
    wstring toUpper(wstring s)
    {
        return toUpper!wstring(s);
    }
    dstring toUpper(dstring s)
    {
        return toUpper!dstring(s);
    }
}
|
||
|
||
unittest
{
    string src;     // input
    string upper;   // result of the allocating conversion
    char[] buf;     // result of the in-place conversion

    // ASCII-only input
    src = "FoL";
    upper = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == upper, buf);
    assert(cmp(upper, "FOL") == 0);
    assert(upper !is src);

    // ASCII mixed with \u0100 / \u0101
    src = "a\u0100B\u0101d";
    upper = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == upper);
    assert(cmp(upper, "A\u0100B\u0100D") == 0);
    assert(upper !is src);

    // ASCII mixed with \u0460 / \u0461
    src = "a\u0460B\u0461d";
    upper = toUpper(src);
    buf = src.dup;
    toUpperInPlace(buf);
    assert(buf == upper);
    assert(cmp(upper, "A\u0460B\u0460D") == 0);
    assert(upper !is src);
}
|
||
|
||
unittest
{
    // Converts `s` with toLower/toUpper and with the in-place variants,
    // then compares all four results against the expected `trueLow` / `trueUp`.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.string : format;
        // three slots: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` has three specifiers, so the source has to be passed as well;
        // with only two arguments format would fail instead of reporting the mismatch
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp, format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(ubyte[])s, cast(ubyte[])lowInp, cast(ubyte[])trueLow));
        assert(upInp == trueUp,
            format(diff, cast(ubyte[])s, cast(ubyte[])upInp, cast(ubyte[])trueUp));
    }

    foreach(S; TypeTuple!(dstring, wstring, string))
    {
        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must keep the very same slice
        foreach(val; TypeTuple!(easy, good))
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }

        foreach(i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach(i; 0..options.length)
        foreach(j; i..options.length)
        foreach(k; j..options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode alphabetic $(CHARACTER)
    (Unicode property: Alphabetic).
+/
@safe pure nothrow
bool isAlpha(dchar c)
{
    // everything from U+00AA upwards is answered by the trie
    if(c >= 0xAA)
        return alphaTrie[c];

    // Fast path: code points below U+00AA are classified by the two ASCII
    // letter ranges only. The unsigned subtraction wraps around for values
    // below the range start, so one comparison covers both bounds.
    size_t offset = c - 'A';
    if(offset <= 'Z' - 'A')
        return true;
    offset = c - 'a';
    return offset <= 'z'-'a';
}
|
||
|
||
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the set must be reported as alphabetic
    foreach(cp; alphabetic.byCodepoint)
    {
        assert(isAlpha(cp));
    }

    // set membership and isAlpha must agree on the first 0x4000 code points
    foreach(cp; 0..0x4000)
    {
        assert((cp in alphabetic) == isAlpha(cp));
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow
bool isMark(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = markTrie;
    return lookup[c];
}
|
||
|
||
@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the set must be reported as a mark
    foreach(cp; marks.byCodepoint)
    {
        assert(isMark(cp));
    }

    // set membership and isMark must agree on the first 0x4000 code points
    foreach(cp; 0..0x4000)
    {
        assert((cp in marks) == isMark(cp));
    }
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow
bool isNumber(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = numberTrie;
    return lookup[c];
}
|
||
|
||
@safe unittest
{
    auto numbers = unicode("N");

    // every member of the set must be reported as a number
    foreach(cp; numbers.byCodepoint)
    {
        assert(isNumber(cp));
    }

    // set membership and isNumber must agree on the first 0x4000 code points
    foreach(cp; 0..0x4000)
    {
        assert((cp in numbers) == isNumber(cp));
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow
bool isPunctuation(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = punctuationTrie;
    return lookup[c];
}
|
||
|
||
unittest
{
    // hand-picked samples
    foreach(dchar sample; ['\u0021', '\u0028', '\u0029', '\u002D',
                           '\u005F', '\u00AB', '\u00BB'])
    {
        assert(isPunctuation(sample));
    }

    // every member of the "P" set must be reported as punctuation
    foreach(cp; unicode("P").byCodepoint)
    {
        assert(isPunctuation(cp));
    }
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow
bool isSymbol(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = symbolTrie;
    return lookup[c];
}
|
||
|
||
unittest
{
    import std.string;

    // hand-picked samples
    foreach(dchar sample; ['\u0024', '\u002B', '\u005E', '\u00A6'])
    {
        assert(isSymbol(sample));
    }

    // every member of the "S" set must be reported as a symbol
    foreach(cp; unicode("S").byCodepoint)
    {
        assert(isSymbol(cp), format("%04x", cp));
    }
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow
bool isSpace(dchar c)
{
    // the actual check lives in the generated predicate
    auto verdict = isSpaceGen(c);
    return verdict;
}
|
||
|
||
unittest
{
    assert(isSpace('\u0020'));

    auto zs = unicode.Zs;

    // every member of the Zs set must be reported as a space
    foreach(cp; zs.byCodepoint)
    {
        assert(isSpace(cp));
    }

    // isSpace and the set must agree on the first 0x1000 code points
    foreach(cp; 0..0x1000)
    {
        assert(isSpace(cp) == zs[cp]);
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).
+/
@safe pure nothrow
bool isGraphical(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = graphicalTrie;
    return lookup[c];
}
|
||
|
||
|
||
unittest
{
    import std.string;

    auto graphical = unicode("Graphical");

    // every member of the set must be reported as graphical
    foreach(cp; graphical.byCodepoint)
    {
        assert(isGraphical(cp), format("%4x", cp));
    }

    // set membership and isGraphical must agree on the first 0x4000 code points
    foreach(cp; 0..0x4000)
    {
        assert((cp in graphical) == isGraphical(cp));
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow
bool isControl(dchar c)
{
    // the actual check lives in the generated predicate
    auto verdict = isControlGen(c);
    return verdict;
}
|
||
|
||
unittest
{
    // hand-picked samples
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;

    // every member of the Cc set must be reported as a control character
    foreach(cp; controls.byCodepoint)
    {
        assert(isControl(cp));
    }

    // isControl and the set must agree on the first 0x1000 code points
    foreach(cp; 0..0x1000)
    {
        assert(isControl(cp) == controls[cp]);
    }
}
|
||
|
||
|
||
/++
    Returns whether $(D c) is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow
bool isFormat(dchar c)
{
    // the actual check lives in the generated predicate
    auto verdict = isFormatGen(c);
    return verdict;
}
|
||
|
||
|
||
unittest
{
    assert(isFormat('\u00AD'));

    // every member of the "Format" set must be reported as a format character
    auto formats = unicode("Format");
    foreach(cp; formats.byCodepoint)
    {
        assert(isFormat(cp));
    }
}
|
||
|
||
// code points for private use, surrogates are not likely to change in near future
|
||
// if need be they can be generated from unicode data as well
|
||
|
||
/++
    Returns whether $(D c) is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if(c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;
    // U+F0000 .. U+FFFFD
    if(c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;
    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow
bool isSurrogate(dchar c)
{
    // outside of U+D800 .. U+DFFF means "not a surrogate"
    return !(c < 0xD800 || c > 0xDFFF);
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow
bool isSurrogateHi(dchar c)
{
    // outside of U+D800 .. U+DBFF means "not a high surrogate"
    return !(c < 0xD800 || c > 0xDBFF);
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow
bool isSurrogateLo(dchar c)
{
    // outside of U+DC00 .. U+DFFF means "not a low surrogate"
    return !(c < 0xDC00 || c > 0xDFFF);
}
|
||
|
||
/++
    Returns whether $(D c) is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow
bool isNonCharacter(dchar c)
{
    // plain lookup in the pre-generated trie
    auto lookup = nonCharacterTrie;
    return lookup[c];
}
|
||
|
||
unittest
{
    // every member of the Cn set must be reported as a non-character
    auto unassigned = unicode("Cn");
    foreach(cp; unassigned.byCodepoint)
    {
        assert(isNonCharacter(cp));
    }
}
|
||
|
||
private:
|
||
// load static data from pre-generated tables into usable datastructures
|
||
|
||
|
||
// Builds a CodepointSet from intervals stored in compressed form.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    return CodepointSet.fromIntervals(compressed.decompressIntervals());
}
|
||
|
||
// Wraps the offsets, sizes and data of a pre-generated table entry
// into a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(in TrieEntry!T e)
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}
|
||
|
||
@safe pure nothrow @property
{
    // Each accessor keeps its trie in a function-local static immutable and
    // returns it.
    //
    // The return type is deliberately `auto`: the compiler then runs semantic
    // analysis on the return type only if the accessor is actually used.
    // They are plain functions rather than templates so that the object size
    // of the caller does not grow.

    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables

    auto nfcQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables

    auto mcTrie()
    {
        import std.internal.unicode_grapheme;
        static immutable trie = asTrie(mcTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    // tables used for composition/decomposition

    auto combiningClassTrie()
    {
        import std.internal.unicode_comp;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables

    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables

    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }
}
|
||
|
||
}// version(!std_uni_bootstrap)
|