Implement UAX31 character ranges (#15307)

This commit is contained in:
Richard (Rikki) Andrew Cattermole 2024-03-19 07:19:16 +13:00 committed by GitHub
parent e74da19bcd
commit dffd899508
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
32 changed files with 5902 additions and 458 deletions

View file

@ -0,0 +1,12 @@
Identifier tables have been expanded to allow new characters, matching C23, and are now configurable from the command line
You can currently choose between ``c99``, ``c11``, ``UAX31`` (C23's) and ``all`` (the least restrictive set) for both D and ImportC.
This can be done with ``-identifiers=<table>`` and for ImportC ``-identifiers-importc=<table>``.
The default table for D is currently set to ``all``, while ImportC is set to ``c11``.
Previously both D and ImportC used the ``c99`` tables.
D's table will be swapped over to [UAX31](https://unicode.org/reports/tr31/) at a later date; this should be done in 2.117.
If, at that time, you find yourself using ``c99``-specific characters and are not willing to change them, you may switch back to ``all``,
although it is unlikely that you will need to.

View file

@ -0,0 +1,6 @@
ImportC has improved Unicode support
Universal Character Names are now supported, allowing you to use the ``\uXXXX`` and ``\UXXXXXXXX`` syntax where ``X`` is a hex digit as part of an identifier.
DigitalMars sppn does not support anything newer than C99.
It is known to be limited, and using any Unicode character outside the C99 ranges will result in an error.

View file

@ -1584,7 +1584,7 @@ auto sourceFiles()
stringtable.d utf.d
"),
common: fileArray(env["COMMON"], "
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d charactertables.d identifiertables.d
"),
commonHeaders: fileArray(env["COMMON"], "
outbuffer.h

View file

@ -466,6 +466,26 @@ dmd -cov -unittest myprog.d
$(P Note that multiple `-i=...` options are allowed, each one adds a pattern.)}"
),
// -identifiers=<table>: selects the non-ASCII identifier table for D source.
// (The help-text lines are kept flush-left because they are inside a raw string literal.)
Option("identifiers=<table>",
"Specify the non-ASCII tables for D identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11)
$(LI $(I all): All, the least restrictive set, which combines all others (default))
)`
),
// -identifiers-importc=<table>: same choice of tables, applied to ImportC source.
Option("identifiers-importc=<table>",
"Specify the non-ASCII tables for ImportC identifiers",
`Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11 (default))
$(LI $(I all): All, the least restrictive set, which combines all others)
)`
),
Option("ignore",
"deprecated flag, unsupported pragmas are always ignored now"
),

View file

@ -0,0 +1,267 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
* Documentation: https://dlang.org/phobos/dmd_common_charactertables.html
* Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/common/charactertables.d
*/
module dmd.common.charactertables;
@safe nothrow @nogc pure:
extern(C++):
/// Names the identifier character tables that `IdentifierCharLookup.forTable` can select.
enum IdentifierTable {
UAX31, /// Tables from Unicode Standard Annex 31
C99, /// Tables from the C99 standard
C11, /// Tables from the C11 standard
LR, /// Least Restrictive aka All
}
/**
A pair of predicates deciding whether a code point may start or continue an identifier.

A default-initialized instance holds null function pointers;
use `forTable` to obtain a usable pair for a given `IdentifierTable`.
*/
struct IdentifierCharLookup
{
@safe nothrow @nogc pure:
/// Returns true if the character is in the selected table's Start set
extern(C++) bool function(dchar) isStart;
/// Returns true if the character is in the selected table's Continue set
extern(C++) bool function(dchar) isContinue;
/**
Lookup the table given the table name

Params:
    table = which set of identifier tables to use
Returns: a lookup whose `isStart`/`isContinue` search the matching
         Start/Continue range tables from `dmd.common.identifiertables`
*/
static IdentifierCharLookup forTable(IdentifierTable table)
{
import dmd.common.identifiertables;
// Awful solution to require these lambdas.
// However without them the extern(C++) ABI issues crop up for isInRange,
// and then it can't access the tables.
// `final switch` makes the compiler reject this function if a new
// IdentifierTable member is added without a matching case.
final switch(table) {
case IdentifierTable.UAX31:
return IdentifierCharLookup(
(c) => isInRange!UAX31_Start(c),
(c) => isInRange!UAX31_Continue(c));
case IdentifierTable.C99:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C99_Start(c),
(c) => isInRange!FixedTable_C99_Continue(c));
case IdentifierTable.C11:
return IdentifierCharLookup(
(c) => isInRange!FixedTable_C11_Start(c),
(c) => isInRange!FixedTable_C11_Continue(c));
case IdentifierTable.LR:
return IdentifierCharLookup(
(c) => isInRange!LeastRestrictive_Start(c),
(c) => isInRange!LeastRestrictive_Continue(c));
}
}
}
/**
Convenience function for use in places where we just don't care,
what the identifier ranges are, or if it is start/continue.

Params:
    c = the code point to classify
Returns: is character a member of least restrictive of all.
*/
bool isAnyIdentifierCharacter(dchar c)
{
    import dmd.common.identifiertables : LeastRestrictive_OfAll;
    return isInRange!LeastRestrictive_OfAll(c);
}
///
unittest
{
    // Exercise this function itself (the example previously asserted isAnyContinue).
    assert(isAnyIdentifierCharacter('ğ'));
}
/**
Convenience check for callers that do not care which identifier table is configured.

Params:
    c = the code point to classify
Returns: is character a member of least restrictive Start
*/
bool isAnyStart(dchar c)
{
    import dmd.common.identifiertables : LeastRestrictive_Start;
    const bool isMember = isInRange!LeastRestrictive_Start(c);
    return isMember;
}
///
unittest
{
    assert(isAnyStart('ğ'));
}
/**
Convenience check for callers that do not care which identifier table is configured.

Params:
    c = the code point to classify
Returns: is character a member of least restrictive Continue
*/
bool isAnyContinue(dchar c)
{
    import dmd.common.identifiertables : LeastRestrictive_Continue;
    const bool isMember = isInRange!LeastRestrictive_Continue(c);
    return isMember;
}
///
unittest
{
    assert(isAnyContinue('ğ'));
}
/// UTF line separator (code point U+2028)
enum LS = 0x2028;
/// UTF paragraph separator (code point U+2029)
enum PS = 0x2029;
// Bit flags stored per byte value in `cmtable`.
// The character sets noted here are the ones the `cmtable` initializer assigns.
private
{
// '0' .. '7'
enum CMoctal = 0x1;
// characters accepted by c_isxdigit
enum CMhex = 0x2;
// characters accepted by c_isalnum, plus '_'
enum CMidchar = 0x4;
// x X b B, plus every character that has CMdigitsecond
enum CMzerosecond = 0x8;
// '0' .. '9', e E f F l L p P u U i . _
enum CMdigitsecond = 0x10;
// any value below 0x80 except '\\', '\n', '\r', 0, 0x1A and '\''
enum CMsinglechar = 0x20;
}
/// Returns: true if `c` carries the octal-digit flag in the character map ('0' .. '7')
bool isoctal(const char c)
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
/// Returns: true if `c` carries the hex-digit flag in the character map
bool ishex(const char c)
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
/// Returns: true if `c` carries the identifier-character flag (ASCII letter, digit or '_')
bool isidchar(const char c)
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
/// Returns: true if `c` carries the zero-second flag in the character map
bool isZeroSecond(const char c)
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
/// Returns: true if `c` carries the digit-second flag in the character map
bool isDigitSecond(const char c)
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
/// Returns: true if `c` carries the single-character flag in the character map
bool issinglechar(const char c)
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
/// Returns: true if `c` is an ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F')
bool c_isxdigit(const int c)
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;
        default:
            return false;
    }
}
/// Returns: true if `c` is an ASCII digit or an ASCII letter of either case
bool c_isalnum(const int c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'z')
        return true;
    return c >= 'A' && c <= 'Z';
}
extern(D) private:
// originally from dmd.root.utf
/*
 * Binary search of `Ranges` for an entry `[first, last]` that contains `c`.
 *
 * Each entry is indexed as `[0]` (inclusive lower bound) and `[1]` (inclusive
 * upper bound). The search relies on the entries being ordered by code point
 * and not overlapping.
 *
 * Returns: true if some entry of `Ranges` contains `c`
 */
bool isInRange(alias Ranges)(dchar c)
{
    // Reject early when c lies outside the span covered by the whole table.
    if (c < Ranges[0][0] || Ranges[Ranges.length - 1][1] < c)
        return false;

    // Search the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = Ranges.length;
    while (lo < hi)
    {
        const size_t mid = lo + ((hi - lo) >> 1);
        if (Ranges[mid][1] < c)
            lo = mid + 1;   // c is above this entry
        else if (c < Ranges[mid][0])
            hi = mid;       // c is below this entry
        else
        {
            assert(Ranges[mid][0] <= c && c <= Ranges[mid][1]);
            return true;
        }
    }
    return false;
}
/********************************************
* Do our own char maps
*
* `cmtable` has one entry per byte value (256 entries). Each entry is a bit set
* of the `CM*` flags describing that byte; the `is*` predicates above test one
* flag each. The table is produced by the immediately-invoked function literal
* below, which serves as the initializer of this immutable variable.
*/
// originally from dmd.lexer (was private)
static immutable cmtable = ()
{
ubyte[256] table;
foreach (const c; 0 .. table.length)
{
// Digit and identifier classes.
if ('0' <= c && c <= '7')
table[c] |= CMoctal;
if (c_isxdigit(c))
table[c] |= CMhex;
if (c_isalnum(c) || c == '_')
table[c] |= CMidchar;
// "Second character" classes: the first group receives only CMzerosecond,
// the second group receives both CMzerosecond and CMdigitsecond.
switch (c)
{
case 'x': case 'X':
case 'b': case 'B':
table[c] |= CMzerosecond;
break;
case '0': .. case '9':
case 'e': case 'E':
case 'f': case 'F':
case 'l': case 'L':
case 'p': case 'P':
case 'u': case 'U':
case 'i':
case '.':
case '_':
table[c] |= CMzerosecond | CMdigitsecond;
break;
default:
break;
}
// Single-character class: every value below 0x80 except the ones listed here.
switch (c)
{
case '\\':
case '\n':
case '\r':
case 0:
case 0x1A:
case '\'':
break;
default:
if (!(c & 0x80))
table[c] |= CMsinglechar;
break;
}
}
return table;
}();

View file

@ -0,0 +1,20 @@
/**
* Character tables related to identifiers.
*
* Supports UAX31, C99, C11 and least restrictive (All).
*
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.h, common/charactertables.h)
*/
#pragma once
// C++ view of the D struct dmd.common.charactertables.IdentifierCharLookup:
// two function pointers that classify a code point as an identifier
// start or continue character.
struct IdentifierCharLookup final
{
// true if the code point may start an identifier
bool(*isStart)(char32_t);
// true if the code point may continue an identifier
bool(*isContinue)(char32_t);
// constructor not provided here.
};

File diff suppressed because it is too large Load diff

View file

@ -72,12 +72,14 @@ void mangleToBuffer(TemplateInstance ti, ref OutBuffer buf)
/// Returns: `true` if the given character is a valid mangled character
package bool isValidMangling(dchar c) nothrow
{
import dmd.common.charactertables;
return
c >= 'A' && c <= 'Z' ||
c >= 'a' && c <= 'z' ||
c >= '0' && c <= '9' ||
c != 0 && strchr("$%().:?@[]_", c) ||
isUniAlpha(c);
isAnyIdentifierCharacter(c);
}
// valid mangled characters

View file

@ -2106,43 +2106,13 @@ int getMarkdownIndent(ref OutBuffer buf, size_t from, size_t to) @safe
return indent;
}
/************************************************
* Scan forward to one of:
* start of identifier
* beginning of next line
* end of buf
*/
size_t skiptoident(ref OutBuffer buf, size_t i) @safe
{
const slice = buf[];
while (i < slice.length)
{
dchar c;
size_t oi = i;
if (utf_decodeChar(slice, i, c))
{
/* Ignore UTF errors, but still consume input
*/
break;
}
if (c >= 0x80)
{
if (!isUniAlpha(c))
continue;
}
else if (!(isalpha(c) || c == '_' || c == '\n'))
continue;
i = oi;
break;
}
return i;
}
/************************************************
* Scan forward past end of identifier.
*/
size_t skippastident(ref OutBuffer buf, size_t i) @safe
{
import dmd.common.charactertables;
const slice = buf[];
while (i < slice.length)
{
@ -2156,7 +2126,8 @@ size_t skippastident(ref OutBuffer buf, size_t i) @safe
}
if (c >= 0x80)
{
if (isUniAlpha(c))
// we don't care if it is start/continue here
if (isAnyIdentifierCharacter(c))
continue;
}
else if (isalnum(c) || c == '_')
@ -2173,6 +2144,8 @@ size_t skippastident(ref OutBuffer buf, size_t i) @safe
*/
size_t skipPastIdentWithDots(ref OutBuffer buf, size_t i) @safe
{
import dmd.common.charactertables;
const slice = buf[];
bool lastCharWasDot;
while (i < slice.length)
@ -2203,7 +2176,8 @@ size_t skipPastIdentWithDots(ref OutBuffer buf, size_t i) @safe
{
if (c >= 0x80)
{
if (isUniAlpha(c))
// we don't care if it is start/continue here
if (isAnyIdentifierCharacter(c))
{
lastCharWasDot = false;
continue;
@ -5249,6 +5223,8 @@ bool isCVariadicArg(const(char)[] p) @nogc nothrow pure @safe
@trusted
bool isIdStart(const(char)* p) @nogc nothrow pure
{
import dmd.common.charactertables;
dchar c = *p;
if (isalpha(c) || c == '_')
return true;
@ -5257,7 +5233,7 @@ bool isIdStart(const(char)* p) @nogc nothrow pure
size_t i = 0;
if (utf_decodeChar(p[0 .. 4], i, c))
return false; // ignore errors
if (isUniAlpha(c))
if (isAnyStart(c))
return true;
}
return false;
@ -5269,6 +5245,8 @@ bool isIdStart(const(char)* p) @nogc nothrow pure
@trusted
bool isIdTail(const(char)* p) @nogc nothrow pure
{
import dmd.common.charactertables;
dchar c = *p;
if (isalnum(c) || c == '_')
return true;
@ -5277,7 +5255,7 @@ bool isIdTail(const(char)* p) @nogc nothrow pure
size_t i = 0;
if (utf_decodeChar(p[0 .. 4], i, c))
return false; // ignore errors
if (isUniAlpha(c))
if (isAnyContinue(c))
return true;
}
return false;

View file

@ -6118,6 +6118,15 @@ enum class CHECKACTION : uint8_t
context = 3u,
};
enum class CLIIdentifierTable : uint8_t
{
default_ = 0u,
C99 = 1u,
C11 = 2u,
UAX31 = 3u,
All = 4u,
};
enum class JsonFieldFlags : uint32_t
{
none = 0u,
@ -6137,6 +6146,8 @@ struct CompileEnv final
bool previewIn;
bool ddocOutput;
bool masm;
IdentifierCharLookup cCharLookupTable;
IdentifierCharLookup dCharLookupTable;
CompileEnv() :
versionNumber(),
date(),
@ -6145,10 +6156,12 @@ struct CompileEnv final
timestamp(),
previewIn(),
ddocOutput(),
masm()
masm(),
cCharLookupTable(),
dCharLookupTable()
{
}
CompileEnv(uint32_t versionNumber, _d_dynamicArray< const char > date = {}, _d_dynamicArray< const char > time = {}, _d_dynamicArray< const char > vendor = {}, _d_dynamicArray< const char > timestamp = {}, bool previewIn = false, bool ddocOutput = false, bool masm = false) :
CompileEnv(uint32_t versionNumber, _d_dynamicArray< const char > date = {}, _d_dynamicArray< const char > time = {}, _d_dynamicArray< const char > vendor = {}, _d_dynamicArray< const char > timestamp = {}, bool previewIn = false, bool ddocOutput = false, bool masm = false, IdentifierCharLookup cCharLookupTable = IdentifierCharLookup(), IdentifierCharLookup dCharLookupTable = IdentifierCharLookup()) :
versionNumber(versionNumber),
date(date),
time(time),
@ -6156,7 +6169,9 @@ struct CompileEnv final
timestamp(timestamp),
previewIn(previewIn),
ddocOutput(ddocOutput),
masm(masm)
masm(masm),
cCharLookupTable(cCharLookupTable),
dCharLookupTable(dCharLookupTable)
{}
};
@ -7804,6 +7819,56 @@ extern _d_real cimagl(complex_t x);
extern void browse(const char* url);
enum class IdentifierTable
{
UAX31 = 0,
C99 = 1,
C11 = 2,
LR = 3,
};
struct IdentifierCharLookup final
{
bool(*isStart)(char32_t );
bool(*isContinue)(char32_t );
static IdentifierCharLookup forTable(IdentifierTable table);
IdentifierCharLookup() :
isStart(),
isContinue()
{
}
IdentifierCharLookup(bool(*isStart)(char32_t ), bool(*isContinue)(char32_t ) = nullptr) :
isStart(isStart),
isContinue(isContinue)
{}
};
extern bool isAnyIdentifierCharacter(char32_t c);
extern bool isAnyStart(char32_t c);
extern bool isAnyContinue(char32_t c);
enum : int32_t { LS = 8232 };
enum : int32_t { PS = 8233 };
extern bool isoctal(const char c);
extern bool ishex(const char c);
extern bool isidchar(const char c);
extern bool isZeroSecond(const char c);
extern bool isDigitSecond(const char c);
extern bool issinglechar(const char c);
extern bool c_isxdigit(const int32_t c);
extern bool c_isalnum(const int32_t c);
extern void error(const Loc& loc, const char* format, ...);
extern void error(const char* filename, uint32_t linnum, uint32_t charnum, const char* format, ...);
@ -8013,6 +8078,8 @@ struct Param final
CHECKENABLE useSwitchError;
CHECKENABLE boundscheck;
CHECKACTION checkAction;
CLIIdentifierTable dIdentifierTable;
CLIIdentifierTable cIdentifierTable;
_d_dynamicArray< const char > argv0;
Array<const char* > modFileAliasStrings;
Array<const char* > imppath;
@ -8088,6 +8155,8 @@ struct Param final
useSwitchError((CHECKENABLE)0u),
boundscheck((CHECKENABLE)0u),
checkAction((CHECKACTION)0u),
dIdentifierTable((CLIIdentifierTable)0u),
cIdentifierTable((CLIIdentifierTable)0u),
argv0(),
modFileAliasStrings(),
imppath(),
@ -8119,7 +8188,7 @@ struct Param final
mapfile()
{
}
Param(bool obj, bool multiobj = false, bool trace = false, bool tracegc = false, bool vcg_ast = false, DiagnosticReporting useDeprecated = (DiagnosticReporting)1u, bool useUnitTests = false, bool useInline = false, bool release = false, bool preservePaths = false, DiagnosticReporting warnings = (DiagnosticReporting)2u, bool cov = false, uint8_t covPercent = 0u, bool ctfe_cov = false, bool ignoreUnsupportedPragmas = true, bool useModuleInfo = true, bool useTypeInfo = true, bool useExceptions = true, bool useGC = true, bool betterC = false, bool addMain = false, bool allInst = false, bool bitfields = false, CppStdRevision cplusplus = (CppStdRevision)201103u, Help help = Help(), Verbose v = Verbose(), FeatureState useDIP25 = (FeatureState)2u, FeatureState useDIP1000 = (FeatureState)0u, bool ehnogc = false, bool useDIP1021 = false, FeatureState fieldwise = (FeatureState)0u, bool fixAliasThis = false, FeatureState rvalueRefParam = (FeatureState)0u, FeatureState noSharedAccess = (FeatureState)0u, bool previewIn = false, bool inclusiveInContracts = false, bool shortenedMethods = true, bool fixImmutableConv = false, bool fix16997 = true, FeatureState dtorFields = (FeatureState)0u, FeatureState systemVariables = (FeatureState)0u, CHECKENABLE useInvariants = (CHECKENABLE)0u, CHECKENABLE useIn = (CHECKENABLE)0u, CHECKENABLE useOut = (CHECKENABLE)0u, CHECKENABLE useArrayBounds = (CHECKENABLE)0u, CHECKENABLE useAssert = (CHECKENABLE)0u, CHECKENABLE useSwitchError = (CHECKENABLE)0u, CHECKENABLE boundscheck = (CHECKENABLE)0u, CHECKACTION checkAction = (CHECKACTION)0u, _d_dynamicArray< const char > argv0 = {}, Array<const char* > modFileAliasStrings = Array<const char* >(), Array<const char* > imppath = Array<const char* >(), Array<const char* > fileImppath = Array<const char* >(), _d_dynamicArray< const char > objdir = {}, _d_dynamicArray< const char > objname = {}, _d_dynamicArray< const char > libname = {}, Output ddoc = Output(), Output dihdr = Output(), Output cxxhdr = 
Output(), Output json = Output(), JsonFieldFlags jsonFieldFlags = (JsonFieldFlags)0u, Output makeDeps = Output(), Output mixinOut = Output(), Output moduleDeps = Output(), uint32_t debuglevel = 0u, uint32_t versionlevel = 0u, bool run = false, Array<const char* > runargs = Array<const char* >(), Array<const char* > cppswitches = Array<const char* >(), const char* cpp = nullptr, Array<const char* > objfiles = Array<const char* >(), Array<const char* > linkswitches = Array<const char* >(), Array<bool > linkswitchIsForCC = Array<bool >(), Array<const char* > libfiles = Array<const char* >(), Array<const char* > dllfiles = Array<const char* >(), _d_dynamicArray< const char > deffile = {}, _d_dynamicArray< const char > resfile = {}, _d_dynamicArray< const char > exefile = {}, _d_dynamicArray< const char > mapfile = {}) :
Param(bool obj, bool multiobj = false, bool trace = false, bool tracegc = false, bool vcg_ast = false, DiagnosticReporting useDeprecated = (DiagnosticReporting)1u, bool useUnitTests = false, bool useInline = false, bool release = false, bool preservePaths = false, DiagnosticReporting warnings = (DiagnosticReporting)2u, bool cov = false, uint8_t covPercent = 0u, bool ctfe_cov = false, bool ignoreUnsupportedPragmas = true, bool useModuleInfo = true, bool useTypeInfo = true, bool useExceptions = true, bool useGC = true, bool betterC = false, bool addMain = false, bool allInst = false, bool bitfields = false, CppStdRevision cplusplus = (CppStdRevision)201103u, Help help = Help(), Verbose v = Verbose(), FeatureState useDIP25 = (FeatureState)2u, FeatureState useDIP1000 = (FeatureState)0u, bool ehnogc = false, bool useDIP1021 = false, FeatureState fieldwise = (FeatureState)0u, bool fixAliasThis = false, FeatureState rvalueRefParam = (FeatureState)0u, FeatureState noSharedAccess = (FeatureState)0u, bool previewIn = false, bool inclusiveInContracts = false, bool shortenedMethods = true, bool fixImmutableConv = false, bool fix16997 = true, FeatureState dtorFields = (FeatureState)0u, FeatureState systemVariables = (FeatureState)0u, CHECKENABLE useInvariants = (CHECKENABLE)0u, CHECKENABLE useIn = (CHECKENABLE)0u, CHECKENABLE useOut = (CHECKENABLE)0u, CHECKENABLE useArrayBounds = (CHECKENABLE)0u, CHECKENABLE useAssert = (CHECKENABLE)0u, CHECKENABLE useSwitchError = (CHECKENABLE)0u, CHECKENABLE boundscheck = (CHECKENABLE)0u, CHECKACTION checkAction = (CHECKACTION)0u, CLIIdentifierTable dIdentifierTable = (CLIIdentifierTable)0u, CLIIdentifierTable cIdentifierTable = (CLIIdentifierTable)0u, _d_dynamicArray< const char > argv0 = {}, Array<const char* > modFileAliasStrings = Array<const char* >(), Array<const char* > imppath = Array<const char* >(), Array<const char* > fileImppath = Array<const char* >(), _d_dynamicArray< const char > objdir = {}, _d_dynamicArray< const char > 
objname = {}, _d_dynamicArray< const char > libname = {}, Output ddoc = Output(), Output dihdr = Output(), Output cxxhdr = Output(), Output json = Output(), JsonFieldFlags jsonFieldFlags = (JsonFieldFlags)0u, Output makeDeps = Output(), Output mixinOut = Output(), Output moduleDeps = Output(), uint32_t debuglevel = 0u, uint32_t versionlevel = 0u, bool run = false, Array<const char* > runargs = Array<const char* >(), Array<const char* > cppswitches = Array<const char* >(), const char* cpp = nullptr, Array<const char* > objfiles = Array<const char* >(), Array<const char* > linkswitches = Array<const char* >(), Array<bool > linkswitchIsForCC = Array<bool >(), Array<const char* > libfiles = Array<const char* >(), Array<const char* > dllfiles = Array<const char* >(), _d_dynamicArray< const char > deffile = {}, _d_dynamicArray< const char > resfile = {}, _d_dynamicArray< const char > exefile = {}, _d_dynamicArray< const char > mapfile = {}) :
obj(obj),
multiobj(multiobj),
trace(trace),
@ -8169,6 +8238,8 @@ struct Param final
useSwitchError(useSwitchError),
boundscheck(boundscheck),
checkAction(checkAction),
dIdentifierTable(dIdentifierTable),
cIdentifierTable(cIdentifierTable),
argv0(argv0),
modFileAliasStrings(modFileAliasStrings),
imppath(imppath),

View file

@ -72,6 +72,16 @@ enum FeatureState : ubyte
enabled = 2, /// Specified as `-preview=`
}
/// Different identifier tables specifiable by CLI
enum CLIIdentifierTable : ubyte
{
default_ = 0, /// Not specified by user
C99 = 1, /// Tables from C99 standard
C11 = 2, /// Tables from C11 standard
UAX31 = 3, /// Tables from the Unicode Standard Annex 31: UNICODE IDENTIFIERS AND SYNTAX
All = 4, /// The least restrictive set of all other tables
}
extern(C++) struct Output
{
bool doOutput; // Output is enabled
@ -199,6 +209,9 @@ extern (C++) struct Param
CHECKACTION checkAction = CHECKACTION.D; // action to take when bounds, asserts or switch defaults are violated
CLIIdentifierTable dIdentifierTable = CLIIdentifierTable.default_;
CLIIdentifierTable cIdentifierTable = CLIIdentifierTable.default_;
const(char)[] argv0; // program name
Array!(const(char)*) modFileAliasStrings; // array of char*'s of -I module filename alias strings
Array!(const(char)*) imppath; // array of char*'s of where to look for import modules

View file

@ -13,6 +13,7 @@
#include "root/dcompat.h"
#include "root/ctfloat.h"
#include "common/outbuffer.h"
#include "common/charactertables.h"
#include "root/filename.h"
#include "compiler.h"
@ -82,6 +83,16 @@ enum class FeatureState : unsigned char
enabled = 2, /// Specified as `-preview=`
};
/// Different identifier tables specifiable by CLI
enum class CLIIdentifierTable : unsigned char
{
default_ = 0, /// Not specified by user
C99 = 1, /// Tables from C99 standard
C11 = 2, /// Tables from C11 standard
UAX31 = 3, /// Tables from the Unicode Standard Annex 31: UNICODE IDENTIFIERS AND SYNTAX
All = 4, /// The least restrictive set of all other tables
};
struct Output
{
/// Configuration for the compiler generator
@ -200,6 +211,9 @@ struct Param
CHECKACTION checkAction; // action to take when bounds, asserts or switch defaults are violated
CLIIdentifierTable dIdentifierTable;
CLIIdentifierTable cIdentifierTable;
DString argv0; // program name
Array<const char *> modFileAliasStrings; // array of char*'s of -I module filename alias strings
Array<const char *> imppath; // array of char*'s of where to look for import modules
@ -274,6 +288,9 @@ struct CompileEnv
DString timestamp;
d_bool previewIn;
d_bool ddocOutput;
d_bool masm;
IdentifierCharLookup cCharLookupTable;
IdentifierCharLookup dCharLookupTable;
};
struct Global

View file

@ -315,28 +315,83 @@ nothrow:
/**********************************
* ditto
*/
extern (D) static bool isValidIdentifier(const(char)[] str) @trusted
{
    import dmd.common.charactertables;

    // Returns true if `str` is non-empty, does not begin with an ASCII digit,
    // and consists solely of identifier characters: ASCII letters, digits and
    // '_', or non-ASCII characters accepted by isAnyStart (first character)
    // and isAnyContinue (remaining characters). Invalid UTF-8 is rejected.

    if (str.length == 0 ||
        (str[0] >= '0' && str[0] <= '9')) // beware of isdigit() on signed chars
    {
        return false;
    }

    // In a previous implementation this was implemented quite naively,
    // by utilizing the libc.
    // However we can do better, by copying the lexer approach to identifier validation.
    const(char)* p = &str[0], pEnd = str.ptr + str.length;

    // handle start characters
    {
        const c = *p;
        if (isidchar(c))
            p++;
        else if (c & 0x80)
        {
            size_t countDecoded;
            dchar decoded;
            // utf_decodeChar yields null on success; the character is accepted
            // only if decoding succeeded AND it is a valid start character.
            if (utf_decodeChar(p[0 .. pEnd - p], countDecoded, decoded) is null &&
                isAnyStart(decoded))
                p += countDecoded;
            else
                return false;
        }
        else
            return false;
    }

    // handle continue characters
    while (p !is pEnd)
    {
        const c = *p;
        if (isidchar(c)) // handles ASCII subset
        {
            p++;
            continue;
        }
        else if (c & 0x80)
        {
            size_t countDecoded;
            dchar decoded;
            // Same rule as above: decoding must succeed AND the character
            // must be a valid continue character.
            if (utf_decodeChar(p[0 .. pEnd - p], countDecoded, decoded) is null &&
                isAnyContinue(decoded))
            {
                p += countDecoded;
                continue;
            }
            else
                return false;
        }
        else
            return false;
    }
    return true;
}
///
unittest
{
assert(Identifier.isValidIdentifier("tes123_t".ptr));
assert(!Identifier.isValidIdentifier("tes123_^t".ptr));
assert(Identifier.isValidIdentifier("te123s_ğt".ptr));
assert(!Identifier.isValidIdentifier("t^e123s_ğt".ptr));
}
extern (D) static Identifier lookup(const(char)* s, size_t len)
{
return lookup(s[0 .. len]);

View file

@ -22,9 +22,11 @@ import dmd.errorsink;
import dmd.id;
import dmd.identifier;
import dmd.location;
import dmd.common.smallbuffer;
import dmd.common.outbuffer;
import dmd.common.charactertables;
import dmd.root.array;
import dmd.root.ctfloat;
import dmd.common.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.utf;
@ -42,6 +44,8 @@ version (DMDLIB)
*/
struct CompileEnv
{
import dmd.common.charactertables;
uint versionNumber; /// __VERSION__
const(char)[] date; /// __DATE__
const(char)[] time; /// __TIME__
@ -51,6 +55,10 @@ struct CompileEnv
bool previewIn; /// `in` means `[ref] scope const`, accepts rvalues
bool ddocOutput; /// collect embedded documentation comments
bool masm; /// use MASM inline asm syntax
// these need a default otherwise tests won't work.
IdentifierCharLookup cCharLookupTable;
IdentifierCharLookup dCharLookupTable;
}
/***********************************************************
@ -66,6 +74,8 @@ class Lexer
Token token;
IdentifierCharLookup charLookup;
// For ImportC
bool Ccompile; /// true if compiling ImportC
@ -142,6 +152,8 @@ class Lexer
{
this.compileEnv.versionNumber = 1;
this.compileEnv.vendor = "DLF";
this.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
this.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
}
//initKeywords();
/* If first line starts with '#!', ignore the line
@ -175,6 +187,16 @@ class Lexer
}
endOfLine();
}
// setup the identifier table lookup functions
if (this.Ccompile)
{
charLookup = this.compileEnv.cCharLookupTable;
}
else
{
charLookup = this.compileEnv.dCharLookupTable;
}
}
/***********************
@ -306,6 +328,8 @@ class Lexer
t.blockComment = null;
t.lineComment = null;
size_t universalCharacterName4, universalCharacterName8;
while (1)
{
t.ptr = p;
@ -395,10 +419,35 @@ class Lexer
continue; // skip white space
case '\\':
if (Ccompile && (p[1] == '\r' || p[1] == '\n'))
if (Ccompile)
{
++p; // ignore \ followed by new line, like VC does
continue;
if (p[1] == '\r' || p[1] == '\n')
{
++p; // ignore \ followed by new line, like VC does
continue;
}
else if (p[1] == 'u')
{
// Universal Character Name (C) 2 byte
// \uXXXX
// let the main case handling for identifiers process this
// case_indent will always increment, so subtract to prevent branching on the fast path
p--;
goto case_ident;
}
else if (p[1] == 'U')
{
// Universal Character Name (C) 4 byte
// \UXXXXXXXX
// let the main case handling for identifiers process this
// case_indent will always increment, so subtract to prevent branching on the fast path
p--;
goto case_ident;
}
}
goto default;
@ -586,23 +635,161 @@ class Lexer
case '_':
case_ident:
{
while (1)
IdentLoop: while (1)
{
// If this is changed, change the decrement in C's universal character name code above
// For syntax \uXXXX and \UXXXXXXXX
const c = *++p;
// Is this the first character of the identifier
// For the universal character name this will line up,
// for the main switch it won't since it wasn't the first,
// for the default it won't either because a decode increments.
const isStartCharacter = t.ptr is p;
if (isidchar(c))
continue;
else if (c & 0x80)
{
const s = p;
const u = decodeUTF();
if (isUniAlpha(u))
continue;
error(t.loc, "char 0x%04x not allowed in identifier", u);
if (isStartCharacter)
{
if (charLookup.isStart(u))
continue;
error(t.loc, "character 0x%04x is not allowed as a start character in an identifier", u);
}
else
{
if (charLookup.isContinue(u))
continue;
error(t.loc, "character 0x%04x is not allowed as a continue character in an identifier", u);
}
p = s;
}
else if (Ccompile && c == '\\')
{
uint times;
const s = p;
p++;
if (*p == 'u')
{
// Universal Character Name (C) 2 byte
// \uXXXX
p++;
times = 4;
}
else if (*p == 'U')
{
// Universal Character Name (C) 4 byte
// \UXXXXXXXX
p++;
times = 8;
}
else
{
error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid u/U", *p);
p = s;
break;
}
foreach(_; 0 .. times)
{
const hc = *p;
p++;
if ((hc >= '0' && hc <= '9') || (hc >= 'a' && hc <= 'f') || (hc >= 'A' && hc <= 'F'))
continue;
error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid hex digit", hc);
p = s;
break IdentLoop;
}
continue;
}
break;
}
Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
Identifier id;
if (universalCharacterName4 > 0 || universalCharacterName8 > 0)
{
auto priorValidation = t.ptr[0 .. p - t.ptr];
const(char)* priorVPtr = priorValidation.ptr;
const possibleLength = (
priorValidation.length - (
(universalCharacterName4 * 6) +
(universalCharacterName8 * 10)
)) + (
(universalCharacterName4 * 3) +
(universalCharacterName8 * 4)
);
char[64] buffer = void;
SmallBuffer!char sb = SmallBuffer!char(possibleLength, buffer[]);
char[] storage = sb.extent;
size_t offset;
while(priorVPtr < &priorValidation[$-1] + 1)
{
if (*priorVPtr == '\\')
{
dchar tempDchar = 0;
uint times;
// universal character name (C)
if (priorVPtr[1] == 'u')
times = 4;
else if (priorVPtr[1] == 'U')
times = 8;
else
assert(0, "ICE: Universal character name is 2 or 4 bytes only");
priorVPtr += 2;
foreach(_; 0 .. times)
{
char c = *++priorVPtr;
if (c >= '0' && c <= '9')
c -= '0';
else if (c >= 'a' && c <= 'f')
c -= 'a' - 10;
else if (c >= 'A' && c <= 'F')
c -= 'A' - 10;
tempDchar <<= 4;
tempDchar |= c;
}
utf_encodeChar(&storage[offset], tempDchar);
offset += utf_codeLengthChar(tempDchar);
// Could be an error instead of a warning,
// but hey it was written specifically so why worry?
if (priorVPtr is priorValidation.ptr)
{
if (!charLookup.isStart(tempDchar))
warning(t.loc, "char 0x%x is not allowed start character for an identifier", tempDchar);
}
else
{
if (!charLookup.isContinue(tempDchar))
warning(t.loc, "char 0x%x is not allowed continue character for an identifier", tempDchar);
}
}
else
storage[offset++] = *++priorVPtr;
}
id = Identifier.idPool(storage[0 .. offset], false);
}
else
id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
t.ident = id;
t.value = cast(TOK)id.getValue();
@ -1174,9 +1361,11 @@ class Lexer
if (c & 0x80)
{
c = decodeUTF();
// Check for start of unicode identifier
if (isUniAlpha(c))
// Check for start of an identifier
if (charLookup.isStart(c))
goto case_ident;
if (c == PS || c == LS)
{
endOfLine();
@ -1688,7 +1877,7 @@ class Lexer
delimright = ']';
else if (c == '<')
delimright = '>';
else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
else if (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c)))
{
// Start of identifier; must be a heredoc
Token tok;
@ -1736,7 +1925,9 @@ class Lexer
}
else if (c == delimright)
goto Ldone;
if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
// we're looking for a new identifier token
if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c))) && hereid)
{
Token tok;
auto psave = p;
@ -2988,6 +3179,11 @@ class Lexer
eSink.deprecation(loc, format, args);
}
/// Reports a warning at `loc` by forwarding `format` and `args`
/// unchanged to the lexer's error sink (`eSink.warning`).
void warning(T...)(const ref Loc loc, const(char)* format, T args)
{
eSink.warning(loc, format, args);
}
void deprecation(T...)(const(char)* format, T args)
{
eSink.deprecation(token.loc, format, args);
@ -3416,124 +3612,6 @@ class Lexer
}
}
/******************************* Private *****************************************/
private:
private enum LS = 0x2028; // UTF line separator
private enum PS = 0x2029; // UTF paragraph separator
/********************************************
 * Per-character classification flags for all 256 single-byte values.
 * The table is computed once by an immediately-invoked function literal and
 * is queried through the small `is*` predicates defined in this module.
 */
private static immutable cmtable = ()
{
    ubyte[256] map;
    foreach (const ch; 0 .. map.length)
    {
        // Digit and identifier classes.
        if (ch >= '0' && ch <= '7')
            map[ch] |= CMoctal;
        if (c_isxdigit(ch))
            map[ch] |= CMhex;
        if (ch == '_' || c_isalnum(ch))
            map[ch] |= CMidchar;

        // Classes used while scanning the second character of a number.
        switch (ch)
        {
            case 'b': case 'B':
            case 'x': case 'X':
                map[ch] |= CMzerosecond;
                break;

            case '.':
            case '_':
            case 'i':
            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
                map[ch] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // Everything 7-bit except escapes, line ends, end-of-file markers
        // and the closing quote may appear as a lone character.
        const bool neverSingle = ch == '\\' || ch == '\n' || ch == '\r'
            || ch == 0 || ch == 0x1A || ch == '\'';
        if (!neverSingle && !(ch & 0x80))
            map[ch] |= CMsinglechar;
    }
    return map;
}();
// Bit flags OR-ed into each `cmtable` entry (see the table builder).
private
{
enum CMoctal = 0x1; // set for '0' .. '7'
enum CMhex = 0x2; // set when c_isxdigit() holds
enum CMidchar = 0x4; // set for ASCII letters, digits and '_'
enum CMzerosecond = 0x8; // set for x X b B plus everything flagged CMdigitsecond
enum CMdigitsecond = 0x10; // set for digits, e E f F l L p P u U i . _
enum CMsinglechar = 0x20; // set for 7-bit values other than \ \n \r NUL 0x1A '
}
/// Returns: `true` when `cmtable` flags `c` as an octal digit (`CMoctal`).
private bool isoctal(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
/// Returns: `true` when `cmtable` flags `c` as a hexadecimal digit (`CMhex`).
private bool ishex(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
/// Returns: `true` when `cmtable` flags `c` as an ASCII identifier
/// character (`CMidchar`: letter, digit or `_`).
private bool isidchar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
/// Returns: `true` when `cmtable` carries the `CMzerosecond` flag for `c`.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
/// Returns: `true` when `cmtable` carries the `CMdigitsecond` flag for `c`.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
/// Returns: `true` when `cmtable` carries the `CMsinglechar` flag for `c`.
private bool issinglechar(const char c) pure @nogc @safe
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
/// Returns: `true` if `c` is an ASCII hexadecimal digit
/// (`0`-`9`, `a`-`f` or `A`-`F`), `false` for every other value.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
/// Returns: `true` if `c` is an ASCII letter or decimal digit
/// (`0`-`9`, `a`-`z` or `A`-`Z`), `false` for every other value.
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
/******************************* Unittest *****************************************/
unittest

View file

@ -157,6 +157,8 @@ private:
*/
private int tryMain(size_t argc, const(char)** argv, ref Param params)
{
import dmd.common.charactertables;
Strings files;
Strings libmodules;
global._init();
@ -168,6 +170,52 @@ private int tryMain(size_t argc, const(char)** argv, ref Param params)
global.compileEnv.previewIn = global.params.previewIn;
global.compileEnv.ddocOutput = global.params.ddoc.doOutput;
final switch(global.params.cIdentifierTable)
{
case CLIIdentifierTable.C99:
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C99);
break;
case CLIIdentifierTable.C11:
case CLIIdentifierTable.default_:
// ImportC is defined against C11, not C23.
// If it was C23 this needs to be changed to UAX31 instead.
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C11);
break;
case CLIIdentifierTable.UAX31:
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.UAX31);
break;
case CLIIdentifierTable.All:
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
break;
}
final switch(global.params.dIdentifierTable)
{
case CLIIdentifierTable.C99:
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C99);
break;
case CLIIdentifierTable.C11:
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C11);
break;
case CLIIdentifierTable.UAX31:
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.UAX31);
break;
case CLIIdentifierTable.All:
case CLIIdentifierTable.default_:
// @@@DEPRECATED_2.119@@@
// Change the default to UAX31,
// this is a breaking change as C99 (what D used for ~23 years),
// has characters that are not in UAX31.
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
break;
}
if (params.help.usage)
{
usage();

View file

@ -1383,6 +1383,58 @@ bool parseCommandLine(const ref Strings arguments, const size_t argc, ref Param
params.useInline = true;
params.dihdr.fullOutput = true;
}
else if (startsWith(p + 1, "identifiers-importc"))
{
enum len = "-identifiers-importc=".length;
// Parse:
// -identifiers-importc=table
immutable string msg = "Only `UAX31`, `c99`, `c11`, `all`, allowed for `-identifiers-importc`";
if (Identifier.isValidIdentifier(p + len))
{
const ident = p + len;
switch (ident.toDString())
{
case "c99": params.cIdentifierTable = CLIIdentifierTable.C99; break;
case "c11": params.cIdentifierTable = CLIIdentifierTable.C11; break;
case "UAX31": params.cIdentifierTable = CLIIdentifierTable.UAX31; break;
case "all": params.cIdentifierTable = CLIIdentifierTable.All; break;
default:
errorInvalidSwitch(p, msg);
return false;
}
}
else
{
errorInvalidSwitch(p, msg);
return false;
}
}
else if (startsWith(p + 1, "identifiers"))
{
enum len = "-identifiers=".length;
// Parse:
// -identifiers=table
immutable string msg = "Only `UAX31`, `c99`, `c11`, `all`, allowed for `-identifiers`";
if (Identifier.isValidIdentifier(p + len))
{
const ident = p + len;
switch (ident.toDString())
{
case "c99": params.dIdentifierTable = CLIIdentifierTable.C99; break;
case "c11": params.dIdentifierTable = CLIIdentifierTable.C11; break;
case "UAX31": params.dIdentifierTable = CLIIdentifierTable.UAX31; break;
case "all": params.dIdentifierTable = CLIIdentifierTable.All; break;
default:
errorInvalidSwitch(p, msg);
return false;
}
}
else
{
errorInvalidSwitch(p, msg);
return false;
}
}
else if (arg == "-i")
includeImports = true;
else if (startsWith(p + 1, "i="))

View file

@ -67,6 +67,8 @@ void pragmaDeclSemantic(PragmaDeclaration pd, Scope* sc)
}
version (all)
{
import dmd.common.charactertables;
/* Note: D language specification should not have any assumption about backend
* implementation. Ideally pragma(mangle) can accept a string of any content.
*
@ -94,7 +96,7 @@ void pragmaDeclSemantic(PragmaDeclaration pd, Scope* sc)
.error(pd.loc, "%s `%s` %.*s", pd.kind, pd.toPrettyChars, cast(int)msg.length, msg.ptr);
break;
}
if (!isUniAlpha(c))
if (!isAnyIdentifierCharacter(c))
{
.error(pd.loc, "%s `%s` char `0x%04x` not allowed in mangled name", pd.kind, pd.toPrettyChars, c);
break;

View file

@ -27,281 +27,6 @@ bool utf_isValidDchar(dchar c)
return false;
}
/*******************************
 * Return !=0 if unicode alpha.
 * Use table from C99 Appendix D.
 *
 * Params:
 *      c = code point to classify
 * Returns:
 *      `true` if `c` lies inside one of the inclusive `[first, last]`
 *      ranges of `ALPHA_TABLE`, `false` otherwise
 */
bool isUniAlpha(dchar c)
{
// Inclusive [first, last] ranges, listed in ascending order without overlap;
// the binary search below depends on that ordering.
static immutable wchar[2][] ALPHA_TABLE =
[
[0x00AA, 0x00AA],
[0x00B5, 0x00B5],
[0x00B7, 0x00B7],
[0x00BA, 0x00BA],
[0x00C0, 0x00D6],
[0x00D8, 0x00F6],
[0x00F8, 0x01F5],
[0x01FA, 0x0217],
[0x0250, 0x02A8],
[0x02B0, 0x02B8],
[0x02BB, 0x02BB],
[0x02BD, 0x02C1],
[0x02D0, 0x02D1],
[0x02E0, 0x02E4],
[0x037A, 0x037A],
[0x0386, 0x0386],
[0x0388, 0x038A],
[0x038C, 0x038C],
[0x038E, 0x03A1],
[0x03A3, 0x03CE],
[0x03D0, 0x03D6],
[0x03DA, 0x03DA],
[0x03DC, 0x03DC],
[0x03DE, 0x03DE],
[0x03E0, 0x03E0],
[0x03E2, 0x03F3],
[0x0401, 0x040C],
[0x040E, 0x044F],
[0x0451, 0x045C],
[0x045E, 0x0481],
[0x0490, 0x04C4],
[0x04C7, 0x04C8],
[0x04CB, 0x04CC],
[0x04D0, 0x04EB],
[0x04EE, 0x04F5],
[0x04F8, 0x04F9],
[0x0531, 0x0556],
[0x0559, 0x0559],
[0x0561, 0x0587],
[0x05B0, 0x05B9],
[0x05BB, 0x05BD],
[0x05BF, 0x05BF],
[0x05C1, 0x05C2],
[0x05D0, 0x05EA],
[0x05F0, 0x05F2],
[0x0621, 0x063A],
[0x0640, 0x0652],
[0x0660, 0x0669],
[0x0670, 0x06B7],
[0x06BA, 0x06BE],
[0x06C0, 0x06CE],
[0x06D0, 0x06DC],
[0x06E5, 0x06E8],
[0x06EA, 0x06ED],
[0x06F0, 0x06F9],
[0x0901, 0x0903],
[0x0905, 0x0939],
[0x093D, 0x094D],
[0x0950, 0x0952],
[0x0958, 0x0963],
[0x0966, 0x096F],
[0x0981, 0x0983],
[0x0985, 0x098C],
[0x098F, 0x0990],
[0x0993, 0x09A8],
[0x09AA, 0x09B0],
[0x09B2, 0x09B2],
[0x09B6, 0x09B9],
[0x09BE, 0x09C4],
[0x09C7, 0x09C8],
[0x09CB, 0x09CD],
[0x09DC, 0x09DD],
[0x09DF, 0x09E3],
[0x09E6, 0x09F1],
[0x0A02, 0x0A02],
[0x0A05, 0x0A0A],
[0x0A0F, 0x0A10],
[0x0A13, 0x0A28],
[0x0A2A, 0x0A30],
[0x0A32, 0x0A33],
[0x0A35, 0x0A36],
[0x0A38, 0x0A39],
[0x0A3E, 0x0A42],
[0x0A47, 0x0A48],
[0x0A4B, 0x0A4D],
[0x0A59, 0x0A5C],
[0x0A5E, 0x0A5E],
[0x0A66, 0x0A6F],
[0x0A74, 0x0A74],
[0x0A81, 0x0A83],
[0x0A85, 0x0A8B],
[0x0A8D, 0x0A8D],
[0x0A8F, 0x0A91],
[0x0A93, 0x0AA8],
[0x0AAA, 0x0AB0],
[0x0AB2, 0x0AB3],
[0x0AB5, 0x0AB9],
[0x0ABD, 0x0AC5],
[0x0AC7, 0x0AC9],
[0x0ACB, 0x0ACD],
[0x0AD0, 0x0AD0],
[0x0AE0, 0x0AE0],
[0x0AE6, 0x0AEF],
[0x0B01, 0x0B03],
[0x0B05, 0x0B0C],
[0x0B0F, 0x0B10],
[0x0B13, 0x0B28],
[0x0B2A, 0x0B30],
[0x0B32, 0x0B33],
[0x0B36, 0x0B39],
[0x0B3D, 0x0B43],
[0x0B47, 0x0B48],
[0x0B4B, 0x0B4D],
[0x0B5C, 0x0B5D],
[0x0B5F, 0x0B61],
[0x0B66, 0x0B6F],
[0x0B82, 0x0B83],
[0x0B85, 0x0B8A],
[0x0B8E, 0x0B90],
[0x0B92, 0x0B95],
[0x0B99, 0x0B9A],
[0x0B9C, 0x0B9C],
[0x0B9E, 0x0B9F],
[0x0BA3, 0x0BA4],
[0x0BA8, 0x0BAA],
[0x0BAE, 0x0BB5],
[0x0BB7, 0x0BB9],
[0x0BBE, 0x0BC2],
[0x0BC6, 0x0BC8],
[0x0BCA, 0x0BCD],
[0x0BE7, 0x0BEF],
[0x0C01, 0x0C03],
[0x0C05, 0x0C0C],
[0x0C0E, 0x0C10],
[0x0C12, 0x0C28],
[0x0C2A, 0x0C33],
[0x0C35, 0x0C39],
[0x0C3E, 0x0C44],
[0x0C46, 0x0C48],
[0x0C4A, 0x0C4D],
[0x0C60, 0x0C61],
[0x0C66, 0x0C6F],
[0x0C82, 0x0C83],
[0x0C85, 0x0C8C],
[0x0C8E, 0x0C90],
[0x0C92, 0x0CA8],
[0x0CAA, 0x0CB3],
[0x0CB5, 0x0CB9],
[0x0CBE, 0x0CC4],
[0x0CC6, 0x0CC8],
[0x0CCA, 0x0CCD],
[0x0CDE, 0x0CDE],
[0x0CE0, 0x0CE1],
[0x0CE6, 0x0CEF],
[0x0D02, 0x0D03],
[0x0D05, 0x0D0C],
[0x0D0E, 0x0D10],
[0x0D12, 0x0D28],
[0x0D2A, 0x0D39],
[0x0D3E, 0x0D43],
[0x0D46, 0x0D48],
[0x0D4A, 0x0D4D],
[0x0D60, 0x0D61],
[0x0D66, 0x0D6F],
[0x0E01, 0x0E3A],
[0x0E40, 0x0E5B],
[0x0E81, 0x0E82],
[0x0E84, 0x0E84],
[0x0E87, 0x0E88],
[0x0E8A, 0x0E8A],
[0x0E8D, 0x0E8D],
[0x0E94, 0x0E97],
[0x0E99, 0x0E9F],
[0x0EA1, 0x0EA3],
[0x0EA5, 0x0EA5],
[0x0EA7, 0x0EA7],
[0x0EAA, 0x0EAB],
[0x0EAD, 0x0EAE],
[0x0EB0, 0x0EB9],
[0x0EBB, 0x0EBD],
[0x0EC0, 0x0EC4],
[0x0EC6, 0x0EC6],
[0x0EC8, 0x0ECD],
[0x0ED0, 0x0ED9],
[0x0EDC, 0x0EDD],
[0x0F00, 0x0F00],
[0x0F18, 0x0F19],
[0x0F20, 0x0F33],
[0x0F35, 0x0F35],
[0x0F37, 0x0F37],
[0x0F39, 0x0F39],
[0x0F3E, 0x0F47],
[0x0F49, 0x0F69],
[0x0F71, 0x0F84],
[0x0F86, 0x0F8B],
[0x0F90, 0x0F95],
[0x0F97, 0x0F97],
[0x0F99, 0x0FAD],
[0x0FB1, 0x0FB7],
[0x0FB9, 0x0FB9],
[0x10A0, 0x10C5],
[0x10D0, 0x10F6],
[0x1E00, 0x1E9B],
[0x1EA0, 0x1EF9],
[0x1F00, 0x1F15],
[0x1F18, 0x1F1D],
[0x1F20, 0x1F45],
[0x1F48, 0x1F4D],
[0x1F50, 0x1F57],
[0x1F59, 0x1F59],
[0x1F5B, 0x1F5B],
[0x1F5D, 0x1F5D],
[0x1F5F, 0x1F7D],
[0x1F80, 0x1FB4],
[0x1FB6, 0x1FBC],
[0x1FBE, 0x1FBE],
[0x1FC2, 0x1FC4],
[0x1FC6, 0x1FCC],
[0x1FD0, 0x1FD3],
[0x1FD6, 0x1FDB],
[0x1FE0, 0x1FEC],
[0x1FF2, 0x1FF4],
[0x1FF6, 0x1FFC],
[0x203F, 0x2040],
[0x207F, 0x207F],
[0x2102, 0x2102],
[0x2107, 0x2107],
[0x210A, 0x2113],
[0x2115, 0x2115],
[0x2118, 0x211D],
[0x2124, 0x2124],
[0x2126, 0x2126],
[0x2128, 0x2128],
[0x212A, 0x2131],
[0x2133, 0x2138],
[0x2160, 0x2182],
[0x3005, 0x3007],
[0x3021, 0x3029],
[0x3041, 0x3093],
[0x309B, 0x309C],
[0x30A1, 0x30F6],
[0x30FB, 0x30FC],
[0x3105, 0x312C],
[0x4E00, 0x9FA5],
[0xAC00, 0xD7A3]
];
size_t high = ALPHA_TABLE.length - 1;
// Shortcut search if c is out of range
// (low starts above high in that case, so the loop body never runs)
size_t low = (c < ALPHA_TABLE[0][0] || ALPHA_TABLE[high][1] < c) ? high + 1 : 0;
// Binary search
while (low <= high)
{
// midpoint computed as low + half the distance, which cannot overflow
const size_t mid = low + ((high - low) >> 1);
if (c < ALPHA_TABLE[mid][0])
high = mid - 1;
else if (ALPHA_TABLE[mid][1] < c)
low = mid + 1;
else
{
assert(ALPHA_TABLE[mid][0] <= c && c <= ALPHA_TABLE[mid][1]);
return true;
}
}
return false;
}
/**
* Returns the code length of c in code units.
*/

View file

@ -0,0 +1,9 @@
// REQUIRED_ARGS: -identifiers-importc=UAX31
// sppn doesn't support anything newer than c99
// DISABLED: win32omf
// verify that the UAX31 identifier set is applied.
int \u00F8ide\u00F9nt;
int øideùnt2;

View file

@ -0,0 +1,5 @@
// REQUIRED_ARGS: -identifiers=UAX31
// verify that the UAX31 identifier set is applied.
int øideùnt;

View file

@ -0,0 +1,11 @@
// REQUIRED_ARGS: -identifiers-importc=all
// sppn doesn't support anything newer than c99
// DISABLED: win32omf
// verify that the All identifier set is applied.
int \u00F8ide\u00F9nt;
int \u00AAide\u00B5nt;
int \u00A8ide\u00AFnt;
int \u00F8ide\u00F9nt;

View file

@ -0,0 +1,10 @@
// REQUIRED_ARGS: -identifiers=all
// verify that the All identifier set is applied.
int øideùnt;
int ªideµnt;
int ¨ide¯nt;
// just to play it safe, do we support one unicode then another at start?
int øùident;

View file

@ -0,0 +1,9 @@
// REQUIRED_ARGS: -identifiers-importc=c11
// sppn doesn't support anything newer than c99
// DISABLED: win32omf
// verify that the C11 identifier set is applied.
int \u00A8ide\u00AFnt;
int ¨ide¯nt;

View file

@ -0,0 +1,5 @@
// REQUIRED_ARGS: -identifiers=c11
// verify that the C11 identifier set is applied.
int ¨ide¯nt;

View file

@ -0,0 +1,6 @@
// REQUIRED_ARGS: -identifiers-importc=c99
// verify that the C99 identifier set is applied.
int \u00AAide\u00B5nt;
int ªideµnt2;

View file

@ -0,0 +1,5 @@
// REQUIRED_ARGS: -identifiers=c99
// verify that the C99 identifier set is applied.
int ªideµnt;

View file

@ -1,7 +1,7 @@
/*
TEST_OUTPUT:
---
fail_compilation/lexer23465.d(19): Error: char 0x1f37a not allowed in identifier
fail_compilation/lexer23465.d(19): Error: character 0x1f37a is not allowed as a continue character in an identifier
fail_compilation/lexer23465.d(19): Error: character 0x1f37a is not a valid token
fail_compilation/lexer23465.d(20): Error: character '\' is not a valid token
fail_compilation/lexer23465.d(21): Error: unterminated /+ +/ comment

View file

@ -0,0 +1,77 @@
/**
This module parses the UCD DerivedCoreProperties.txt file.
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module unicode_tables.derivedCoreProperties;
import unicode_tables.util;
ValueRanges propertyXID_StartRanges, propertyXID_ContinueRanges;
/**
Reads a UCD property file and records its `XID_Start` and `XID_Continue`
entries in `propertyXID_StartRanges` and `propertyXID_ContinueRanges`.

Each useful line has the form `XXXX ; Property` or `XXXX..YYYY ; Property`
with hexadecimal code points; text after `#` is a comment. Lines that do not
split into exactly two `;`-separated fields are ignored, as are properties
other than the two named above.

Params:
    dataFile = path of the file to read
*/
void parseProperties(string dataFile)
{
    import std.algorithm : countUntil, startsWith;
    import std.file : readText;
    import std.string : lineSplitter, strip, split;
    import std.conv : parse;

    foreach (rawLine; readText(dataFile).lineSplitter)
    {
        // Cut off a trailing comment, then trim the remainder.
        const ptrdiff_t hashAt = rawLine.countUntil('#');
        string text = hashAt >= 0 ? rawLine[0 .. hashAt] : rawLine;
        text = text.strip;

        string[] columns = text.split(";");
        if (columns.length != 2)
            continue;

        foreach (ref column; columns)
            column = column.strip;

        // parse() consumes the digits it reads, so after the first call
        // `codePoints` is either empty or begins with the ".." separator.
        string codePoints = columns[0];
        ValueRange range;
        range.start = parse!uint(codePoints, 16);

        if (codePoints.startsWith(".."))
        {
            codePoints = codePoints[2 .. $];
            range.end = parse!uint(codePoints, 16);
        }
        else
            range.end = range.start;

        const property = columns[1];
        if (property == "XID_Start")
            propertyXID_StartRanges.add(range);
        else if (property == "XID_Continue")
            propertyXID_ContinueRanges.add(range);
    }
}

View file

@ -0,0 +1,165 @@
/**
Known fixed tables.
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module unicode_tables.fixedtables;
import unicode_tables.util;
/// The single range 0 .. 127 (inclusive), i.e. every 7-bit code point.
immutable ValueRanges ASCII_Table = ValueRanges([
ValueRange(0, 127)
]);
/// Fixed identifier ranges for the C99 table; each entry is an inclusive
/// (start, end) pair and the entries are listed in ascending order.
/// NOTE(review): presumably transcribed from C99 Annex D - confirm against the standard.
immutable ValueRanges c99_Table = ValueRanges([
ValueRange(0x00AA, 0x00AA), ValueRange(0x00B5, 0x00B5),
ValueRange(0x00B7, 0x00B7), ValueRange(0x00BA, 0x00BA),
ValueRange(0x00C0, 0x00D6), ValueRange(0x00D8, 0x00F6),
ValueRange(0x00F8, 0x01F5), ValueRange(0x01FA, 0x0217),
ValueRange(0x0250, 0x02A8), ValueRange(0x02B0, 0x02B8),
ValueRange(0x02BB, 0x02BB), ValueRange(0x02BD, 0x02C1),
ValueRange(0x02D0, 0x02D1), ValueRange(0x02E0, 0x02E4),
ValueRange(0x037A, 0x037A), ValueRange(0x0386, 0x0386),
ValueRange(0x0388, 0x038A), ValueRange(0x038C, 0x038C),
ValueRange(0x038E, 0x03A1), ValueRange(0x03A3, 0x03CE),
ValueRange(0x03D0, 0x03D6), ValueRange(0x03DA, 0x03DA),
ValueRange(0x03DC, 0x03DC), ValueRange(0x03DE, 0x03DE),
ValueRange(0x03E0, 0x03E0), ValueRange(0x03E2, 0x03F3),
ValueRange(0x0401, 0x040C), ValueRange(0x040E, 0x044F),
ValueRange(0x0451, 0x045C), ValueRange(0x045E, 0x0481),
ValueRange(0x0490, 0x04C4), ValueRange(0x04C7, 0x04C8),
ValueRange(0x04CB, 0x04CC), ValueRange(0x04D0, 0x04EB),
ValueRange(0x04EE, 0x04F5), ValueRange(0x04F8, 0x04F9),
ValueRange(0x0531, 0x0556), ValueRange(0x0559, 0x0559),
ValueRange(0x0561, 0x0587), ValueRange(0x05B0, 0x05B9),
ValueRange(0x05BB, 0x05BD), ValueRange(0x05BF, 0x05BF),
ValueRange(0x05C1, 0x05C2), ValueRange(0x05D0, 0x05EA),
ValueRange(0x05F0, 0x05F2), ValueRange(0x0621, 0x063A),
ValueRange(0x0640, 0x0652), ValueRange(0x0660, 0x0669),
ValueRange(0x0670, 0x06B7), ValueRange(0x06BA, 0x06BE),
ValueRange(0x06C0, 0x06CE), ValueRange(0x06D0, 0x06DC),
ValueRange(0x06E5, 0x06E8), ValueRange(0x06EA, 0x06ED),
ValueRange(0x06F0, 0x06F9), ValueRange(0x0901, 0x0903),
ValueRange(0x0905, 0x0939), ValueRange(0x093D, 0x094D),
ValueRange(0x0950, 0x0952), ValueRange(0x0958, 0x0963),
ValueRange(0x0966, 0x096F), ValueRange(0x0981, 0x0983),
ValueRange(0x0985, 0x098C), ValueRange(0x098F, 0x0990),
ValueRange(0x0993, 0x09A8), ValueRange(0x09AA, 0x09B0),
ValueRange(0x09B2, 0x09B2), ValueRange(0x09B6, 0x09B9),
ValueRange(0x09BE, 0x09C4), ValueRange(0x09C7, 0x09C8),
ValueRange(0x09CB, 0x09CD), ValueRange(0x09DC, 0x09DD),
ValueRange(0x09DF, 0x09E3), ValueRange(0x09E6, 0x09F1),
ValueRange(0x0A02, 0x0A02), ValueRange(0x0A05, 0x0A0A),
ValueRange(0x0A0F, 0x0A10), ValueRange(0x0A13, 0x0A28),
ValueRange(0x0A2A, 0x0A30), ValueRange(0x0A32, 0x0A33),
ValueRange(0x0A35, 0x0A36), ValueRange(0x0A38, 0x0A39),
ValueRange(0x0A3E, 0x0A42), ValueRange(0x0A47, 0x0A48),
ValueRange(0x0A4B, 0x0A4D), ValueRange(0x0A59, 0x0A5C),
ValueRange(0x0A5E, 0x0A5E), ValueRange(0x0A66, 0x0A6F),
ValueRange(0x0A74, 0x0A74), ValueRange(0x0A81, 0x0A83),
ValueRange(0x0A85, 0x0A8B), ValueRange(0x0A8D, 0x0A8D),
ValueRange(0x0A8F, 0x0A91), ValueRange(0x0A93, 0x0AA8),
ValueRange(0x0AAA, 0x0AB0), ValueRange(0x0AB2, 0x0AB3),
ValueRange(0x0AB5, 0x0AB9), ValueRange(0x0ABD, 0x0AC5),
ValueRange(0x0AC7, 0x0AC9), ValueRange(0x0ACB, 0x0ACD),
ValueRange(0x0AD0, 0x0AD0), ValueRange(0x0AE0, 0x0AE0),
ValueRange(0x0AE6, 0x0AEF), ValueRange(0x0B01, 0x0B03),
ValueRange(0x0B05, 0x0B0C), ValueRange(0x0B0F, 0x0B10),
ValueRange(0x0B13, 0x0B28), ValueRange(0x0B2A, 0x0B30),
ValueRange(0x0B32, 0x0B33), ValueRange(0x0B36, 0x0B39),
ValueRange(0x0B3D, 0x0B43), ValueRange(0x0B47, 0x0B48),
ValueRange(0x0B4B, 0x0B4D), ValueRange(0x0B5C, 0x0B5D),
ValueRange(0x0B5F, 0x0B61), ValueRange(0x0B66, 0x0B6F),
ValueRange(0x0B82, 0x0B83), ValueRange(0x0B85, 0x0B8A),
ValueRange(0x0B8E, 0x0B90), ValueRange(0x0B92, 0x0B95),
ValueRange(0x0B99, 0x0B9A), ValueRange(0x0B9C, 0x0B9C),
ValueRange(0x0B9E, 0x0B9F), ValueRange(0x0BA3, 0x0BA4),
ValueRange(0x0BA8, 0x0BAA), ValueRange(0x0BAE, 0x0BB5),
ValueRange(0x0BB7, 0x0BB9), ValueRange(0x0BBE, 0x0BC2),
ValueRange(0x0BC6, 0x0BC8), ValueRange(0x0BCA, 0x0BCD),
ValueRange(0x0BE7, 0x0BEF), ValueRange(0x0C01, 0x0C03),
ValueRange(0x0C05, 0x0C0C), ValueRange(0x0C0E, 0x0C10),
ValueRange(0x0C12, 0x0C28), ValueRange(0x0C2A, 0x0C33),
ValueRange(0x0C35, 0x0C39), ValueRange(0x0C3E, 0x0C44),
ValueRange(0x0C46, 0x0C48), ValueRange(0x0C4A, 0x0C4D),
ValueRange(0x0C60, 0x0C61), ValueRange(0x0C66, 0x0C6F),
ValueRange(0x0C82, 0x0C83), ValueRange(0x0C85, 0x0C8C),
ValueRange(0x0C8E, 0x0C90), ValueRange(0x0C92, 0x0CA8),
ValueRange(0x0CAA, 0x0CB3), ValueRange(0x0CB5, 0x0CB9),
ValueRange(0x0CBE, 0x0CC4), ValueRange(0x0CC6, 0x0CC8),
ValueRange(0x0CCA, 0x0CCD), ValueRange(0x0CDE, 0x0CDE),
ValueRange(0x0CE0, 0x0CE1), ValueRange(0x0CE6, 0x0CEF),
ValueRange(0x0D02, 0x0D03), ValueRange(0x0D05, 0x0D0C),
ValueRange(0x0D0E, 0x0D10), ValueRange(0x0D12, 0x0D28),
ValueRange(0x0D2A, 0x0D39), ValueRange(0x0D3E, 0x0D43),
ValueRange(0x0D46, 0x0D48), ValueRange(0x0D4A, 0x0D4D),
ValueRange(0x0D60, 0x0D61), ValueRange(0x0D66, 0x0D6F),
ValueRange(0x0E01, 0x0E3A), ValueRange(0x0E40, 0x0E5B),
ValueRange(0x0E81, 0x0E82), ValueRange(0x0E84, 0x0E84),
ValueRange(0x0E87, 0x0E88), ValueRange(0x0E8A, 0x0E8A),
ValueRange(0x0E8D, 0x0E8D), ValueRange(0x0E94, 0x0E97),
ValueRange(0x0E99, 0x0E9F), ValueRange(0x0EA1, 0x0EA3),
ValueRange(0x0EA5, 0x0EA5), ValueRange(0x0EA7, 0x0EA7),
ValueRange(0x0EAA, 0x0EAB), ValueRange(0x0EAD, 0x0EAE),
ValueRange(0x0EB0, 0x0EB9), ValueRange(0x0EBB, 0x0EBD),
ValueRange(0x0EC0, 0x0EC4), ValueRange(0x0EC6, 0x0EC6),
ValueRange(0x0EC8, 0x0ECD), ValueRange(0x0ED0, 0x0ED9),
ValueRange(0x0EDC, 0x0EDD), ValueRange(0x0F00, 0x0F00),
ValueRange(0x0F18, 0x0F19), ValueRange(0x0F20, 0x0F33),
ValueRange(0x0F35, 0x0F35), ValueRange(0x0F37, 0x0F37),
ValueRange(0x0F39, 0x0F39), ValueRange(0x0F3E, 0x0F47),
ValueRange(0x0F49, 0x0F69), ValueRange(0x0F71, 0x0F84),
ValueRange(0x0F86, 0x0F8B), ValueRange(0x0F90, 0x0F95),
ValueRange(0x0F97, 0x0F97), ValueRange(0x0F99, 0x0FAD),
ValueRange(0x0FB1, 0x0FB7), ValueRange(0x0FB9, 0x0FB9),
ValueRange(0x10A0, 0x10C5), ValueRange(0x10D0, 0x10F6),
ValueRange(0x1E00, 0x1E9B), ValueRange(0x1EA0, 0x1EF9),
ValueRange(0x1F00, 0x1F15), ValueRange(0x1F18, 0x1F1D),
ValueRange(0x1F20, 0x1F45), ValueRange(0x1F48, 0x1F4D),
ValueRange(0x1F50, 0x1F57), ValueRange(0x1F59, 0x1F59),
ValueRange(0x1F5B, 0x1F5B), ValueRange(0x1F5D, 0x1F5D),
ValueRange(0x1F5F, 0x1F7D), ValueRange(0x1F80, 0x1FB4),
ValueRange(0x1FB6, 0x1FBC), ValueRange(0x1FBE, 0x1FBE),
ValueRange(0x1FC2, 0x1FC4), ValueRange(0x1FC6, 0x1FCC),
ValueRange(0x1FD0, 0x1FD3), ValueRange(0x1FD6, 0x1FDB),
ValueRange(0x1FE0, 0x1FEC), ValueRange(0x1FF2, 0x1FF4),
ValueRange(0x1FF6, 0x1FFC), ValueRange(0x203F, 0x2040),
ValueRange(0x207F, 0x207F), ValueRange(0x2102, 0x2102),
ValueRange(0x2107, 0x2107), ValueRange(0x210A, 0x2113),
ValueRange(0x2115, 0x2115), ValueRange(0x2118, 0x211D),
ValueRange(0x2124, 0x2124), ValueRange(0x2126, 0x2126),
ValueRange(0x2128, 0x2128), ValueRange(0x212A, 0x2131),
ValueRange(0x2133, 0x2138), ValueRange(0x2160, 0x2182),
ValueRange(0x3005, 0x3007), ValueRange(0x3021, 0x3029),
ValueRange(0x3041, 0x3093), ValueRange(0x309B, 0x309C),
ValueRange(0x30A1, 0x30F6), ValueRange(0x30FB, 0x30FC),
ValueRange(0x3105, 0x312C), ValueRange(0x4E00, 0x9FA5),
ValueRange(0xAC00, 0xD7A3)
]);
/// Fixed identifier ranges for the C11 table; each entry is an inclusive
/// (start, end) pair and the entries are listed in ascending order.
/// Some neighbouring entries are adjacent (e.g. 0x00F8-0x00FF and
/// 0x0100-0x167F) and are kept as separate entries here.
/// NOTE(review): presumably transcribed from C11 Annex D - confirm against the standard.
immutable ValueRanges c11_Table = ValueRanges([
ValueRange(0x00A8, 0x00A8), ValueRange(0x00AA, 0x00AA),
ValueRange(0x00AD, 0x00AD), ValueRange(0x00AF,0x00AF),
ValueRange(0x00B2, 0x00B5), ValueRange(0x00B7, 0x00BA),
ValueRange(0x00BC, 0x00BE), ValueRange(0x00C0, 0x00D6),
ValueRange(0x00D8, 0x00F6), ValueRange(0x00F8, 0x00FF),
ValueRange(0x0100, 0x167F), ValueRange(0x1681, 0x180D),
ValueRange(0x180F, 0x1FFF), ValueRange(0x200B, 0x200D),
ValueRange(0x202A, 0x202E), ValueRange(0x203F, 0x2040),
ValueRange(0x2054, 0x2054), ValueRange(0x2060, 0x206F),
ValueRange(0x2070, 0x218F), ValueRange(0x2460, 0x24FF),
ValueRange(0x2776, 0x2793), ValueRange(0x2C00, 0x2DFF),
ValueRange(0x2E80, 0x2FFF), ValueRange(0x3004, 0x3007),
ValueRange(0x3021, 0x302F), ValueRange(0x3031, 0x303F),
ValueRange(0x3040, 0xD7FF), ValueRange(0xF900, 0xFD3D),
ValueRange(0xFD40, 0xFDCF), ValueRange(0xFDF0, 0xFE44),
ValueRange(0xFE47, 0xFFFD), ValueRange(0x10000, 0x1FFFD),
ValueRange(0x20000, 0x2FFFD), ValueRange(0x30000, 0x3FFFD),
ValueRange(0x40000, 0x4FFFD), ValueRange(0x50000, 0x5FFFD),
ValueRange(0x60000, 0x6FFFD), ValueRange(0x70000, 0x7FFFD),
ValueRange(0x80000, 0x8FFFD), ValueRange(0x90000, 0x9FFFD),
ValueRange(0xA0000, 0xAFFFD), ValueRange(0xB0000, 0xBFFFD),
ValueRange(0xC0000, 0xCFFFD), ValueRange(0xD0000, 0xDFFFD),
ValueRange(0xE0000, 0xEFFFD),
]);

View file

@ -0,0 +1,184 @@
/**
This module parses the UCD UnicodeData.txt file.
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module unicode_tables.unicodeData;
import unicode_tables.util;
UDEntry[] udEntries;
/**
Reads a UCD `UnicodeData.txt` style file and appends one `UDEntry` per
accepted line to `udEntries`.

A line is accepted only if it splits into exactly 15 `;`-separated fields.
Field 0 is the hexadecimal code point, field 2 is matched against the member
names of `GeneralCategory`, and field 3 is parsed as the canonical combining
class. A pair of `<..., First>` / `<..., Last>` lines yields a single entry
whose range spans both code points.

Params:
    dataFile = path of the file to read
*/
void parseUnicodeData(string dataFile)
{
import std.algorithm : countUntil, endsWith;
import std.file : readText;
import std.string : lineSplitter, strip, split;
import std.conv : parse;
// nextRangeEnd: a "First>" line was seen and its "Last>" partner is pending.
// expectedRangeEnd: the current line is that "Last>" partner.
bool expectedRangeEnd, nextRangeEnd;
foreach (line; readText(dataFile).lineSplitter)
{
{
// handle end of line comment
ptrdiff_t offset = line.countUntil('#');
if (offset >= 0)
line = line[0 .. offset];
line = line.strip;
}
string[] fields = line.split(";");
{
foreach (ref field; fields)
{
field = field.strip;
}
// Only lines with exactly 15 fields are processed; everything else is skipped.
if (fields.length == 0)
{
continue;
}
else if (fields.length != 15)
{
continue;
}
}
{
/+
How first field ranges are specified (the First, Last bit):
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+/
if (fields[1].endsWith(">"))
{
if (fields[1].endsWith("First>"))
{
// Falls through below and creates the entry that "Last>" will extend.
nextRangeEnd = true;
}
else if (fields[1].endsWith("Last>"))
{
assert(nextRangeEnd);
nextRangeEnd = false;
expectedRangeEnd = true;
}
else if (fields[1] == "<control>")
{
// NOTE(review): expectedRangeEnd is only set in the "Last>" branch and is
// cleared again later in that same iteration, so it looks like this
// condition is never true and <control> lines always become entries - confirm.
if (expectedRangeEnd)
{
nextRangeEnd = false;
expectedRangeEnd = false;
continue;
}
}
else
{
// Any other "<...>" name is skipped.
continue;
}
}
else if (expectedRangeEnd)
{
continue;
}
}
uint character = parse!uint(fields[0], 16);
if (expectedRangeEnd)
{
// "Last>" line: extend the entry created for the matching "First>" line.
udEntries[$ - 1].range.end = character;
expectedRangeEnd = false;
continue;
}
{
UDEntry entry;
entry.range = ValueRange(character);
// Compare field 2 with every GeneralCategory member name; if none
// matches, generalCategory keeps its default value.
static foreach (GC; __traits(allMembers, GeneralCategory))
{
if (fields[2] == GC)
entry.generalCategory = __traits(getMember, GeneralCategory, GC);
}
entry.canonicalCombiningClass = parse!int(fields[3]);
udEntries ~= entry;
}
}
}
/// One record collected by `parseUnicodeData`: a code point range together
/// with its general category and canonical combining class.
struct UDEntry
{
    ValueRange range;
    GeneralCategory generalCategory;
    int canonicalCombiningClass;

@safe:

    /// Returns: `true` when the canonical combining class is zero.
    bool isStarter()
    {
        return canonicalCombiningClass == 0;
    }

    /// Returns: `true` when the general category is one of
    /// `Lu`, `Ll`, `Lt`, `Lm` or `Lo`.
    bool isAlpha()
    {
        const category = generalCategory;
        return category == GeneralCategory.Lu
            || category == GeneralCategory.Ll
            || category == GeneralCategory.Lt
            || category == GeneralCategory.Lm
            || category == GeneralCategory.Lo;
    }
}
/// Category tags for a `UDEntry`. `parseUnicodeData` compares the member
/// names textually with field 2 of each input line, so the spelling of every
/// member must match the abbreviation used in the data file. `None` is the
/// first member and therefore the default value.
/// NOTE(review): the names look like the UCD General_Category abbreviations
/// (including the grouping aliases LC, L, M, N, P, S, Z, C) - confirm against UAX #44.
enum GeneralCategory
{
None, ///
Lu, ///
Ll, ///
Lt, ///
LC, ///
Lm, ///
Lo, ///
L, ///
Mn, ///
Mc, ///
Me, ///
M, ///
Nd, ///
Nl, ///
No, ///
N, ///
Pc, ///
Pd, ///
Ps, ///
Pe, ///
Pi, ///
Pf, ///
Po, ///
P, ///
Sm, ///
Sc, ///
Sk, ///
So, ///
S, ///
Zs, ///
Zl, ///
Zp, ///
Z, ///
Cc, ///
Cf, ///
Cs, ///
Co, ///
Cn, ///
C, ///
}

View file

@ -0,0 +1,145 @@
/**
Utilities for working with Unicode ranges.
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module unicode_tables.util;
/// An inclusive range of code points, `start .. end` with both ends included.
struct ValueRange
{
    dchar start, end;

@safe:

    /// Creates a range holding the single code point `index`.
    this(dchar index)
    {
        start = index;
        end = index;
    }

    /// Creates the range `start .. end` (inclusive); `end` must not be below `start`.
    this(dchar start, dchar end)
    {
        assert(end >= start);

        this.end = end;
        this.start = start;
    }

    /// Returns: `true` when the range holds exactly one code point.
    bool isSingle() const
    {
        return end == start;
    }

    /// Returns: `true` when `index` lies inside the range, ends included.
    bool within(dchar index) const
    {
        return !(index < start || index > end);
    }

    /// Returns: the number of code points in the range.
    uint count() const
    {
        return end + 1 - start;
    }

    /// Orders ranges by their `start` only; `end` is not compared.
    int opCmp(const ValueRange other) const
    {
        if (this.start < other.start)
            return -1;
        if (this.start > other.start)
            return 1;
        return 0;
    }

    /// Calls `del` for each code point from `start` to `end` in ascending
    /// order and stops early as soon as `del` returns a non-zero value.
    int opApply(scope int delegate(dchar) @safe del) const
    {
        const limit = end + 1;

        for (dchar index = start; index < limit; ++index)
        {
            const int stop = del(index);
            if (stop)
                return stop;
        }

        return 0;
    }
}
/// A list of `ValueRange`s, built by appending ranges in ascending order of
/// their `start`.
struct ValueRanges
{
    ValueRange[] ranges;

@safe:

    /**
    Appends `toAdd`, merging it into the last stored range when it overlaps
    or directly follows that range.

    Ranges must be added in ascending order of `start`; only the last stored
    range is considered for merging.

    A merge only ever extends the last range. A `toAdd` that ends before the
    last range's current end (i.e. is already covered by it) leaves the list
    unchanged instead of truncating the stored range.
    */
    void add(ValueRange toAdd)
    {
        if (ranges.length > 0 && (ranges[$ - 1].end >= toAdd.start || ranges[$ - 1].end + 1 == toAdd.start))
        {
            // Grow, never shrink: when sorted input contains a small range
            // nested in a larger one (as happens in merge()), the larger end
            // must survive.
            if (toAdd.end > ranges[$ - 1].end)
                ranges[$ - 1].end = toAdd.end;
        }
        else
        {
            ranges ~= toAdd;
        }
    }

    /// Returns: every code point of this set that is not in `butNotThis`,
    /// re-packed into ranges.
    ValueRanges not(const ref ValueRanges butNotThis) const
    {
        ValueRanges ret;

        // Walks code point by code point (via opApply), so add() always
        // receives single-value ranges in ascending order.
        foreach (toAdd; this)
        {
            if (butNotThis.within(toAdd))
                continue;
            ret.add(ValueRange(toAdd));
        }

        return ret;
    }

    /// Returns: the union of this set and `andThis`, with overlapping and
    /// adjacent ranges coalesced.
    ValueRanges merge(const ref ValueRanges andThis) const
    {
        import std.algorithm : sort;

        ValueRanges ret;

        // Sort by start (ValueRange.opCmp) so add() sees ranges in order.
        auto sorted = sort((this.ranges ~ andThis.ranges).dup);

        foreach (range; sorted)
        {
            ret.add(range);
        }

        return ret;
    }

    /// Returns: `true` when any stored range contains `index`.
    bool within(dchar index) const
    {
        foreach (range; ranges)
        {
            if (range.within(index))
                return true;
        }

        return false;
    }

    /// Returns: the sum of the code point counts of all stored ranges.
    uint count() const
    {
        uint ret;

        foreach (range; ranges)
        {
            ret += range.count;
        }

        return ret;
    }

    /// Calls `del` for every code point of every stored range, in storage
    /// order, stopping early when `del` returns a non-zero value.
    int opApply(scope int delegate(dchar) @safe del) const
    {
        int result;

        foreach (range; ranges)
        {
            result = range.opApply(del);
            if (result)
                return result;
        }

        return result;
    }
}

View file

@ -0,0 +1,206 @@
/**
Generates the Unicode tables and associated Identifier tables for dmd-fe.
These tables are stored in ``dmd.common.identifiertables``.
They are C99, C11, UAX31 and a least restrictive set (All).
You can run this via ``rdmd unicodetables.d``.
You will likely only need to run this program whenever the Unicode standard updates.
It does not need to be run automatically as part of CI, as long as it is kept in working condition when committed; it only needs non-fancy features, so it is unlikely to break long term.
Place the updated files from the $(LINK2 https://www.unicode.org/Public/, Unicode database) into a directory ``UCD-<version>/``, and update the ``UCDDirectory`` variable.
Make sure to commit the updated ``UCDDirectory`` variable into the repository so we can keep track of what the latest version it has been updated to.
The update procedure is similar to Phobos's Unicode table generator for ``std.uni``.
If you know one, you can do the other fairly easily.
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
*/
module unicodetables;
import unicode_tables.util;
import unicode_tables.fixedtables;
import std.stdio : File, writeln;
// Input and output locations used by main(); all paths are relative,
// presumably to the directory this tool is run from - confirm.
enum {
// don't forget to update me when you commit new tables!
UCDDirectory = "UCD-15.1.0/",
// UCD inputs read by the parsers
UnicodeDataFile = UCDDirectory ~ "UnicodeData.txt",
DerivedCorePropertiesFile = UCDDirectory ~ "DerivedCoreProperties.txt",
// generated D module, overwritten on every run
UnicodeTableFile = "../src/dmd/common/identifiertables.d",
}
// Will disable the ASCII ranges in the generated tables.
// Disable if you are not handling elsewhere.
version = IgnoreASCIIRanges;
File tableFile;
/**
Generates `UnicodeTableFile` from the UCD inputs.

Both input files are parsed before the output file is opened, so a missing or
malformed input cannot leave a truncated table module behind.

Params:
    args = command line arguments (unused)

Returns:
    0 on success, 1 if `UnicodeData.txt` is missing,
    2 if `DerivedCoreProperties.txt` is missing.
*/
int main(string[] args)
{
    import std.file : exists;

    if (!exists(UnicodeDataFile))
    {
        writeln("Missing UCD table UnicodeData.txt");
        return 1;
    }
    else if (!exists(DerivedCorePropertiesFile))
    {
        writeln("Missing UCD table DerivedCoreProperties.txt");
        return 2;
    }

    // Parse first: opening the output with "w+" truncates the existing
    // table, which must only happen once the inputs are known to be good.
    {
        import unicode_tables.unicodeData;
        import unicode_tables.derivedCoreProperties;

        parseUnicodeData(UnicodeDataFile);
        parseProperties(DerivedCorePropertiesFile);
    }

    {
        tableFile = File(UnicodeTableFile, "w+");
        // Name the generator by its real file name (module `unicodetables`).
        tableFile.writeln("// Generated by compiler/tools/unicodetables.d DO NOT MODIFY!!!");
        tableFile.writeln("module dmd.common.identifiertables;");
        tableFile.writeln();
    }

    write_XID_Start;
    tableFile.writeln;
    write_XID_Continue;

    tableFile.writeln;
    write_other_tables;

    tableFile.writeln;
    write_least_restrictive_table;

    return 0;
}
/**
Writes `vr` to `tableFile` as a D array literal named `name`, one
`[start, end]` pair per line in upper-case hexadecimal.

Params:
    name = identifier of the emitted `static immutable dchar[2][]` variable
    vr   = ranges to emit, in storage order
*/
void writeTable(string name, const ValueRanges vr)
{
    tableFile.writeln("static immutable dchar[2][] ", name, " = [");

    for (size_t i = 0; i < vr.ranges.length; i++)
    {
        const first = vr.ranges[i].start;
        const last = vr.ranges[i].end;
        tableFile.writefln!" [0x%X, 0x%X],"(first, last);
    }

    tableFile.writeln("];");
}
/**
Writes the `UAX31_Start` table, built from the parsed `XID_Start` ranges,
to `tableFile`.

With `version = IgnoreASCIIRanges` the ASCII code points are removed from the
table; otherwise `_` (0x5F) is inserted in addition to the parsed ranges.
*/
void write_XID_Start()
{
    import unicode_tables.derivedCoreProperties;
    import std.algorithm : sort;

    ValueRanges start = ValueRanges(propertyXID_StartRanges.ranges.dup);

    version(IgnoreASCIIRanges)
    {
        // Remove the ASCII ranges: they are handled elsewhere, so keeping
        // them in the table would only waste lookups.
        start = start.not(ASCII_Table);
    }
    else
    {
        // This may be not needed, as we'll handle ASCII elsewhere in lexer,
        // but if we don't in some place we'll want this instead.
        //
        // Append to the array directly and sort afterwards. ValueRanges.add
        // only looks at the LAST stored range and expects ascending input,
        // so it cannot insert a low code point such as `_` into an already
        // populated list.
        start.ranges ~= ValueRange(0x5F); // add _
        start.ranges.sort!((a, b) => a.start < b.start);
    }

    tableFile.writeln("/**");
    tableFile.writeln("UAX31 profile Start");
    tableFile.writeln("Entries: ", start.count);
    tableFile.writeln("*/");

    writeTable("UAX31_Start", start);
}
/**
Emits the `UAX31_Continue` table, built from a copy of `propertyXID_ContinueRanges`.

With `IgnoreASCIIRanges` set the ASCII ranges are removed before writing.
*/
void write_XID_Continue()
{
    import unicode_tables.derivedCoreProperties;

    auto continueRanges = ValueRanges(propertyXID_ContinueRanges.ranges.dup);

    version (IgnoreASCIIRanges)
    {
        // ASCII is handled elsewhere, so keeping it in the table would only waste time.
        continueRanges = continueRanges.not(ASCII_Table);
    }

    // Doc comment of the generated table, including its entry count.
    tableFile.writeln("/**\nUAX31 profile Continue\nEntries: ", continueRanges.count, "\n*/");
    writeTable("UAX31_Continue", continueRanges);
}
/**
Emits the fixed C99 and C11 tables.

For each standard the Continue table is written out in full and the Start
table is emitted as an alias of it; every declaration is preceded by a doc
comment carrying the table's entry count.
*/
void write_other_tables()
{
    // C99: the Start alias first, then the table it refers to.
    tableFile.writeln("/**\nC99 Start\nEntries: ", c99_Table.count, "\n*/");
    tableFile.writeln("alias FixedTable_C99_Start = FixedTable_C99_Continue;");
    tableFile.writeln();
    tableFile.writeln("/**\nC99 Continue\nEntries: ", c99_Table.count, "\n*/");
    writeTable("FixedTable_C99_Continue", c99_Table);
    tableFile.writeln();

    // C11: same layout as C99.
    tableFile.writeln("/**\nC11 Start\nEntries: ", c11_Table.count, "\n*/");
    tableFile.writeln("alias FixedTable_C11_Start = FixedTable_C11_Continue;");
    tableFile.writeln();
    tableFile.writeln("/**\nC11 Continue\nEntries: ", c11_Table.count, "\n*/");
    writeTable("FixedTable_C11_Continue", c11_Table);
}
/**
Emits the three "least restrictive" tables.

The C99 and C11 tables are merged and then merged into both the XID_Start and
the XID_Continue ranges; the combined table merges those two results.
With `IgnoreASCIIRanges` set the ASCII ranges are removed from all three.
Output order: combined, Start, Continue.
*/
void write_least_restrictive_table()
{
    import unicode_tables.derivedCoreProperties;

    ValueRanges fixedTables = c99_Table.merge(c11_Table);
    ValueRanges startSet = propertyXID_StartRanges.merge(fixedTables);
    ValueRanges continueSet = propertyXID_ContinueRanges.merge(fixedTables);
    ValueRanges combinedSet = startSet.merge(continueSet);

    version (IgnoreASCIIRanges)
    {
        // ASCII is handled elsewhere, so keeping it in the tables would only waste time.
        startSet = startSet.not(ASCII_Table);
        continueSet = continueSet.not(ASCII_Table);
        combinedSet = combinedSet.not(ASCII_Table);
    }

    tableFile.writeln("/**\nLeast restrictive with both Start and Continue\nEntries: ",
        combinedSet.count, "\n*/");
    writeTable("LeastRestrictive_OfAll", combinedSet);
    tableFile.writeln();

    tableFile.writeln("/**\nLeast restrictive Start\nEntries: ", startSet.count, "\n*/");
    writeTable("LeastRestrictive_Start", startSet);
    tableFile.writeln();

    tableFile.writeln("/**\nLeast restrictive Continue\nEntries: ", continueSet.count, "\n*/");
    writeTable("LeastRestrictive_Continue", continueSet);
}