mirror of
https://github.com/dlang/dmd.git
synced 2025-04-28 06:00:13 +03:00
Implement UAX31 character ranges (#15307)
This commit is contained in:
parent
e74da19bcd
commit
dffd899508
32 changed files with 5902 additions and 458 deletions
12
changelog/dmd.identifier-tables.dd
Normal file
12
changelog/dmd.identifier-tables.dd
Normal file
|
@ -0,0 +1,12 @@
|
|||
Identifier tables have been expanded to allow new characters matching C23, and are now configurable from the CLI
|
||||
|
||||
You can currently choose between ``c99``, ``c11``, ``UAX31`` (C23's) and ``all`` (the least restrictive set) for both D and ImportC.
|
||||
|
||||
This can be done with ``-identifiers=<table>`` and for ImportC ``-identifiers-importc=<table>``.
|
||||
|
||||
The default table for D is currently set to ``all``, while ImportC is set to ``c11``.
|
||||
Previously both D and ImportC used the ``c99`` tables.
|
||||
|
||||
D's table will be swapped over to [UAX31](https://unicode.org/reports/tr31/) at a later date; this should be done in 2.117.
|
||||
If at that time you find yourself using ``c99`` specific characters and are not willing to change them, you may switch back to ``all``.
|
||||
Although it should be unlikely that you will need to.
|
6
changelog/dmd.importc-unicode.dd
Normal file
6
changelog/dmd.importc-unicode.dd
Normal file
|
@ -0,0 +1,6 @@
|
|||
ImportC has improved Unicode support
|
||||
|
||||
Universal Character Names are now supported, allowing you to use the ``\uXXXX`` and ``\UXXXXXXXX`` syntax where ``X`` is a hex digit as part of an identifier.
|
||||
|
||||
DigitalMars sppn does not support anything newer than C99.
|
||||
It is known to be limited and using any Unicode character not in those ranges will result in an error.
|
|
@ -1584,7 +1584,7 @@ auto sourceFiles()
|
|||
stringtable.d utf.d
|
||||
"),
|
||||
common: fileArray(env["COMMON"], "
|
||||
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d
|
||||
bitfields.d file.d int128.d blake3.d outbuffer.d smallbuffer.d charactertables.d identifiertables.d
|
||||
"),
|
||||
commonHeaders: fileArray(env["COMMON"], "
|
||||
outbuffer.h
|
||||
|
|
|
@ -466,6 +466,26 @@ dmd -cov -unittest myprog.d
|
|||
|
||||
$(P Note that multiple `-i=...` options are allowed, each one adds a pattern.)}"
|
||||
),
|
||||
// CLI help entry for `-identifiers=<table>`; the raw string below is Ddoc help text.
Option("identifiers=<table>",
    "Specify the non-ASCII tables for D identifiers",
    `Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11)
$(LI $(I all): All, the least restrictive set, which combines all others (default))
)`
),
|
||||
// CLI help entry for `-identifiers-importc=<table>`; the raw string below is Ddoc help text.
Option("identifiers-importc=<table>",
    "Specify the non-ASCII tables for ImportC identifiers",
    `Set the identifier table to use for the non-ASCII values.
$(UL
$(LI $(I UAX31): UAX31)
$(LI $(I c99): C99)
$(LI $(I c11): C11 (default))
$(LI $(I all): All, the least restrictive set, which combines all others)
)`
),
|
||||
Option("ignore",
|
||||
"deprecated flag, unsupported pragmas are always ignored now"
|
||||
),
|
||||
|
|
267
compiler/src/dmd/common/charactertables.d
Normal file
267
compiler/src/dmd/common/charactertables.d
Normal file
|
@ -0,0 +1,267 @@
|
|||
/**
|
||||
* Character tables related to identifiers.
|
||||
*
|
||||
* Supports UAX31, C99, C11 and least restrictive (All).
|
||||
*
|
||||
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
|
||||
* Documentation: https://dlang.org/phobos/dmd_common_charactertables.html
|
||||
* Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/common/charactertables.d
|
||||
*/
|
||||
module dmd.common.charactertables;
|
||||
|
||||
@safe nothrow @nogc pure:
|
||||
|
||||
extern(C++):
|
||||
|
||||
/// Selects which set of identifier character ranges `IdentifierCharLookup.forTable` uses.
|
||||
enum IdentifierTable {
|
||||
UAX31, /// Ranges from Unicode Standard Annex 31 (`UAX31_Start` / `UAX31_Continue`)
|
||||
C99, /// Fixed ranges from C99 (`FixedTable_C99_Start` / `FixedTable_C99_Continue`)
|
||||
C11, /// Fixed ranges from C11 (`FixedTable_C11_Start` / `FixedTable_C11_Continue`)
|
||||
LR, /// Least Restrictive aka All
|
||||
}
|
||||
|
||||
/// Pair of predicates that test a character against one identifier table.
struct IdentifierCharLookup
{
    @safe nothrow @nogc pure:

    /// Tests a character against the selected table's `_Start` ranges.
    extern(C++) bool function(dchar) isStart;
    /// Tests a character against the selected table's `_Continue` ranges.
    extern(C++) bool function(dchar) isContinue;

    /**
    Lookup the predicates for a given table.

    Params:
        table = the identifier table whose ranges should be consulted
    Returns: a lookup whose `isStart` / `isContinue` check the `_Start` / `_Continue`
             ranges belonging to `table`
    */
    static IdentifierCharLookup forTable(IdentifierTable table)
    {
        import dmd.common.identifiertables;

        // Wrapping each isInRange instantiation in a lambda is unpleasant but required:
        // without the lambdas extern(C++) ABI issues crop up for isInRange,
        // and then it can't access the tables.
        final switch (table)
        {
            case IdentifierTable.C99:
                return IdentifierCharLookup(
                    (ch) => isInRange!FixedTable_C99_Start(ch),
                    (ch) => isInRange!FixedTable_C99_Continue(ch));

            case IdentifierTable.C11:
                return IdentifierCharLookup(
                    (ch) => isInRange!FixedTable_C11_Start(ch),
                    (ch) => isInRange!FixedTable_C11_Continue(ch));

            case IdentifierTable.UAX31:
                return IdentifierCharLookup(
                    (ch) => isInRange!UAX31_Start(ch),
                    (ch) => isInRange!UAX31_Continue(ch));

            case IdentifierTable.LR:
                return IdentifierCharLookup(
                    (ch) => isInRange!LeastRestrictive_Start(ch),
                    (ch) => isInRange!LeastRestrictive_Continue(ch));
        }
    }
}
|
||||
|
||||
/**
Convenience function for use in places where we just don't care
what the identifier ranges are, or whether the character is a start or a continue character.

Params:
    c = the character to test
Returns: `true` if `c` lies within the `LeastRestrictive_OfAll` ranges
*/
bool isAnyIdentifierCharacter(dchar c)
{
    import dmd.common.identifiertables;
    return isInRange!LeastRestrictive_OfAll(c);
}

///
unittest
{
    // Exercise the function this example documents (previously it called isAnyContinue).
    assert(isAnyIdentifierCharacter('ğ'));
}
|
||||
|
||||
/**
Convenience function for callers that do not care which identifier table is in effect.

Params:
    c = the character to test
Returns: `true` if `c` is a member of the least restrictive Start ranges
*/
bool isAnyStart(dchar c)
{
    import dmd.common.identifiertables : LeastRestrictive_Start;

    const found = isInRange!LeastRestrictive_Start(c);
    return found;
}

///
unittest
{
    assert(isAnyStart('ğ'));
}
|
||||
|
||||
/**
Convenience function for callers that do not care which identifier table is in effect.

Params:
    c = the character to test
Returns: `true` if `c` is a member of the least restrictive Continue ranges
*/
bool isAnyContinue(dchar c)
{
    import dmd.common.identifiertables : LeastRestrictive_Continue;

    const found = isInRange!LeastRestrictive_Continue(c);
    return found;
}

///
unittest
{
    assert(isAnyContinue('ğ'));
}
|
||||
|
||||
/// UTF line separator
|
||||
enum LS = 0x2028;
|
||||
/// UTF paragraph separator
|
||||
enum PS = 0x2029;
|
||||
|
||||
// Bit flags stored per byte value in `cmtable` and tested by the is*() predicates below.
private
|
||||
{
|
||||
enum CMoctal = 0x1; // set for '0' .. '7'
|
||||
enum CMhex = 0x2; // set when c_isxdigit() holds
|
||||
enum CMidchar = 0x4; // set when c_isalnum() holds or the byte is '_'
|
||||
enum CMzerosecond = 0x8; // set for x X b B and for every byte that has CMdigitsecond (presumably: valid second char after a leading '0')
|
||||
enum CMdigitsecond = 0x10; // set for 0-9 e E f F l L p P u U i . _ (presumably: valid second char after a leading non-zero digit)
|
||||
enum CMsinglechar = 0x20; // set for bytes below 0x80 except '\\', '\n', '\r', 0, 0x1A and '\''
|
||||
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMoctal` flag in `cmtable`
bool isoctal(const char c)
{
    const flags = cmtable[c];
    return (flags & CMoctal) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMhex` flag in `cmtable`
bool ishex(const char c)
{
    const flags = cmtable[c];
    return (flags & CMhex) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMidchar` flag in `cmtable`
bool isidchar(const char c)
{
    const flags = cmtable[c];
    return (flags & CMidchar) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMzerosecond` flag in `cmtable`
bool isZeroSecond(const char c)
{
    const flags = cmtable[c];
    return (flags & CMzerosecond) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMdigitsecond` flag in `cmtable`
bool isDigitSecond(const char c)
{
    const flags = cmtable[c];
    return (flags & CMdigitsecond) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` carries the `CMsinglechar` flag in `cmtable`
bool issinglechar(const char c)
{
    const flags = cmtable[c];
    return (flags & CMsinglechar) != 0;
}
|
||||
|
||||
/// Returns: `true` if `c` is an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`)
bool c_isxdigit(const int c)
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'f':
        case 'A': .. case 'F':
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
/// Returns: `true` if `c` is an ASCII letter or decimal digit
bool c_isalnum(const int c)
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
extern(D) private:
|
||||
|
||||
// originally from dmd.root.utf
/**
Binary search of `c` in a table of inclusive `[first, last]` ranges.

Params:
    Ranges = table of two-element ranges, indexed as `Ranges[i][0]` (first) and `Ranges[i][1]` (last);
             the search assumes it is sorted in ascending order and non-empty
    c = the character to look up
Returns: `true` if some range in `Ranges` contains `c`
*/
bool isInRange(alias Ranges)(dchar c)
{
    size_t upper = Ranges.length - 1;

    // Nothing to search when c falls outside the span covered by the whole table
    if (c < Ranges[0][0] || Ranges[upper][1] < c)
        return false;

    size_t lower = 0;
    while (lower <= upper)
    {
        const size_t pivot = lower + ((upper - lower) >> 1);

        if (Ranges[pivot][1] < c)
        {
            lower = pivot + 1;
        }
        else if (c < Ranges[pivot][0])
        {
            upper = pivot - 1;
        }
        else
        {
            assert(Ranges[pivot][0] <= c && c <= Ranges[pivot][1]);
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/********************************************
 * Our own character map: one byte of CM* flags for each of the 256 byte values,
 * built at compile time.
 */
// originally from dmd.lexer (was private)
static immutable cmtable = ()
{
    ubyte[256] flags;
    foreach (const c; 0 .. flags.length)
    {
        if (c >= '0' && c <= '7')
            flags[c] |= CMoctal;
        if (c_isxdigit(c))
            flags[c] |= CMhex;
        if (c == '_' || c_isalnum(c))
            flags[c] |= CMidchar;

        // second-character classification
        switch (c)
        {
            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                flags[c] |= CMzerosecond | CMdigitsecond;
                break;

            case 'b': case 'B':
            case 'x': case 'X':
                flags[c] |= CMzerosecond;
                break;

            default:
                break;
        }

        // every ASCII byte is a "single char" unless it is one of these
        const bool excluded =
            c == '\\' || c == '\n' || c == '\r' ||
            c == 0 || c == 0x1A || c == '\'';
        if (!excluded && !(c & 0x80))
            flags[c] |= CMsinglechar;
    }
    return flags;
}();
|
20
compiler/src/dmd/common/charactertables.h
Normal file
20
compiler/src/dmd/common/charactertables.h
Normal file
|
@ -0,0 +1,20 @@
|
|||
/**
|
||||
* Character tables related to identifiers.
|
||||
*
|
||||
* Supports UAX31, C99, C11 and least restrictive (All).
|
||||
*
|
||||
* Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
* Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
* License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
* Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/common/charactertables.d, common/charactertables.d)
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
// C++ view of the identifier character lookup: two function pointers taking a code point.
struct IdentifierCharLookup final
|
||||
{
|
||||
bool(*isStart)(char32_t); // predicate for the Start ranges of the chosen table
|
||||
bool(*isContinue)(char32_t); // predicate for the Continue ranges of the chosen table
|
||||

||||
// constructor not provided here.
|
||||
};
|
4241
compiler/src/dmd/common/identifiertables.d
Normal file
4241
compiler/src/dmd/common/identifiertables.d
Normal file
File diff suppressed because it is too large
Load diff
|
@ -72,12 +72,14 @@ void mangleToBuffer(TemplateInstance ti, ref OutBuffer buf)
|
|||
/// Returns: `true` if the given character is a valid mangled character
|
||||
package bool isValidMangling(dchar c) nothrow
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
return
|
||||
c >= 'A' && c <= 'Z' ||
|
||||
c >= 'a' && c <= 'z' ||
|
||||
c >= '0' && c <= '9' ||
|
||||
c != 0 && strchr("$%().:?@[]_", c) ||
|
||||
isUniAlpha(c);
|
||||
isAnyIdentifierCharacter(c);
|
||||
}
|
||||
|
||||
// valid mangled characters
|
||||
|
|
|
@ -2106,43 +2106,13 @@ int getMarkdownIndent(ref OutBuffer buf, size_t from, size_t to) @safe
|
|||
return indent;
|
||||
}
|
||||
|
||||
/************************************************
|
||||
* Scan forward to one of:
|
||||
* start of identifier
|
||||
* beginning of next line
|
||||
* end of buf
|
||||
*/
|
||||
size_t skiptoident(ref OutBuffer buf, size_t i) @safe
|
||||
{
|
||||
const slice = buf[];
|
||||
while (i < slice.length)
|
||||
{
|
||||
dchar c;
|
||||
size_t oi = i;
|
||||
if (utf_decodeChar(slice, i, c))
|
||||
{
|
||||
/* Ignore UTF errors, but still consume input
|
||||
*/
|
||||
break;
|
||||
}
|
||||
if (c >= 0x80)
|
||||
{
|
||||
if (!isUniAlpha(c))
|
||||
continue;
|
||||
}
|
||||
else if (!(isalpha(c) || c == '_' || c == '\n'))
|
||||
continue;
|
||||
i = oi;
|
||||
break;
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
/************************************************
|
||||
* Scan forward past end of identifier.
|
||||
*/
|
||||
size_t skippastident(ref OutBuffer buf, size_t i) @safe
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
const slice = buf[];
|
||||
while (i < slice.length)
|
||||
{
|
||||
|
@ -2156,7 +2126,8 @@ size_t skippastident(ref OutBuffer buf, size_t i) @safe
|
|||
}
|
||||
if (c >= 0x80)
|
||||
{
|
||||
if (isUniAlpha(c))
|
||||
// we don't care if it is start/continue here
|
||||
if (isAnyIdentifierCharacter(c))
|
||||
continue;
|
||||
}
|
||||
else if (isalnum(c) || c == '_')
|
||||
|
@ -2173,6 +2144,8 @@ size_t skippastident(ref OutBuffer buf, size_t i) @safe
|
|||
*/
|
||||
size_t skipPastIdentWithDots(ref OutBuffer buf, size_t i) @safe
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
const slice = buf[];
|
||||
bool lastCharWasDot;
|
||||
while (i < slice.length)
|
||||
|
@ -2203,7 +2176,8 @@ size_t skipPastIdentWithDots(ref OutBuffer buf, size_t i) @safe
|
|||
{
|
||||
if (c >= 0x80)
|
||||
{
|
||||
if (isUniAlpha(c))
|
||||
// we don't care if it is start/continue here
|
||||
if (isAnyIdentifierCharacter(c))
|
||||
{
|
||||
lastCharWasDot = false;
|
||||
continue;
|
||||
|
@ -5249,6 +5223,8 @@ bool isCVariadicArg(const(char)[] p) @nogc nothrow pure @safe
|
|||
@trusted
|
||||
bool isIdStart(const(char)* p) @nogc nothrow pure
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
dchar c = *p;
|
||||
if (isalpha(c) || c == '_')
|
||||
return true;
|
||||
|
@ -5257,7 +5233,7 @@ bool isIdStart(const(char)* p) @nogc nothrow pure
|
|||
size_t i = 0;
|
||||
if (utf_decodeChar(p[0 .. 4], i, c))
|
||||
return false; // ignore errors
|
||||
if (isUniAlpha(c))
|
||||
if (isAnyStart(c))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -5269,6 +5245,8 @@ bool isIdStart(const(char)* p) @nogc nothrow pure
|
|||
@trusted
|
||||
bool isIdTail(const(char)* p) @nogc nothrow pure
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
dchar c = *p;
|
||||
if (isalnum(c) || c == '_')
|
||||
return true;
|
||||
|
@ -5277,7 +5255,7 @@ bool isIdTail(const(char)* p) @nogc nothrow pure
|
|||
size_t i = 0;
|
||||
if (utf_decodeChar(p[0 .. 4], i, c))
|
||||
return false; // ignore errors
|
||||
if (isUniAlpha(c))
|
||||
if (isAnyContinue(c))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
|
|
@ -6118,6 +6118,15 @@ enum class CHECKACTION : uint8_t
|
|||
context = 3u,
|
||||
};
|
||||
|
||||
enum class CLIIdentifierTable : uint8_t
|
||||
{
|
||||
default_ = 0u,
|
||||
C99 = 1u,
|
||||
C11 = 2u,
|
||||
UAX31 = 3u,
|
||||
All = 4u,
|
||||
};
|
||||
|
||||
enum class JsonFieldFlags : uint32_t
|
||||
{
|
||||
none = 0u,
|
||||
|
@ -6137,6 +6146,8 @@ struct CompileEnv final
|
|||
bool previewIn;
|
||||
bool ddocOutput;
|
||||
bool masm;
|
||||
IdentifierCharLookup cCharLookupTable;
|
||||
IdentifierCharLookup dCharLookupTable;
|
||||
CompileEnv() :
|
||||
versionNumber(),
|
||||
date(),
|
||||
|
@ -6145,10 +6156,12 @@ struct CompileEnv final
|
|||
timestamp(),
|
||||
previewIn(),
|
||||
ddocOutput(),
|
||||
masm()
|
||||
masm(),
|
||||
cCharLookupTable(),
|
||||
dCharLookupTable()
|
||||
{
|
||||
}
|
||||
CompileEnv(uint32_t versionNumber, _d_dynamicArray< const char > date = {}, _d_dynamicArray< const char > time = {}, _d_dynamicArray< const char > vendor = {}, _d_dynamicArray< const char > timestamp = {}, bool previewIn = false, bool ddocOutput = false, bool masm = false) :
|
||||
CompileEnv(uint32_t versionNumber, _d_dynamicArray< const char > date = {}, _d_dynamicArray< const char > time = {}, _d_dynamicArray< const char > vendor = {}, _d_dynamicArray< const char > timestamp = {}, bool previewIn = false, bool ddocOutput = false, bool masm = false, IdentifierCharLookup cCharLookupTable = IdentifierCharLookup(), IdentifierCharLookup dCharLookupTable = IdentifierCharLookup()) :
|
||||
versionNumber(versionNumber),
|
||||
date(date),
|
||||
time(time),
|
||||
|
@ -6156,7 +6169,9 @@ struct CompileEnv final
|
|||
timestamp(timestamp),
|
||||
previewIn(previewIn),
|
||||
ddocOutput(ddocOutput),
|
||||
masm(masm)
|
||||
masm(masm),
|
||||
cCharLookupTable(cCharLookupTable),
|
||||
dCharLookupTable(dCharLookupTable)
|
||||
{}
|
||||
};
|
||||
|
||||
|
@ -7804,6 +7819,56 @@ extern _d_real cimagl(complex_t x);
|
|||
|
||||
extern void browse(const char* url);
|
||||
|
||||
enum class IdentifierTable
|
||||
{
|
||||
UAX31 = 0,
|
||||
C99 = 1,
|
||||
C11 = 2,
|
||||
LR = 3,
|
||||
};
|
||||
|
||||
struct IdentifierCharLookup final
|
||||
{
|
||||
bool(*isStart)(char32_t );
|
||||
bool(*isContinue)(char32_t );
|
||||
static IdentifierCharLookup forTable(IdentifierTable table);
|
||||
IdentifierCharLookup() :
|
||||
isStart(),
|
||||
isContinue()
|
||||
{
|
||||
}
|
||||
IdentifierCharLookup(bool(*isStart)(char32_t ), bool(*isContinue)(char32_t ) = nullptr) :
|
||||
isStart(isStart),
|
||||
isContinue(isContinue)
|
||||
{}
|
||||
};
|
||||
|
||||
extern bool isAnyIdentifierCharacter(char32_t c);
|
||||
|
||||
extern bool isAnyStart(char32_t c);
|
||||
|
||||
extern bool isAnyContinue(char32_t c);
|
||||
|
||||
enum : int32_t { LS = 8232 };
|
||||
|
||||
enum : int32_t { PS = 8233 };
|
||||
|
||||
extern bool isoctal(const char c);
|
||||
|
||||
extern bool ishex(const char c);
|
||||
|
||||
extern bool isidchar(const char c);
|
||||
|
||||
extern bool isZeroSecond(const char c);
|
||||
|
||||
extern bool isDigitSecond(const char c);
|
||||
|
||||
extern bool issinglechar(const char c);
|
||||
|
||||
extern bool c_isxdigit(const int32_t c);
|
||||
|
||||
extern bool c_isalnum(const int32_t c);
|
||||
|
||||
extern void error(const Loc& loc, const char* format, ...);
|
||||
|
||||
extern void error(const char* filename, uint32_t linnum, uint32_t charnum, const char* format, ...);
|
||||
|
@ -8013,6 +8078,8 @@ struct Param final
|
|||
CHECKENABLE useSwitchError;
|
||||
CHECKENABLE boundscheck;
|
||||
CHECKACTION checkAction;
|
||||
CLIIdentifierTable dIdentifierTable;
|
||||
CLIIdentifierTable cIdentifierTable;
|
||||
_d_dynamicArray< const char > argv0;
|
||||
Array<const char* > modFileAliasStrings;
|
||||
Array<const char* > imppath;
|
||||
|
@ -8088,6 +8155,8 @@ struct Param final
|
|||
useSwitchError((CHECKENABLE)0u),
|
||||
boundscheck((CHECKENABLE)0u),
|
||||
checkAction((CHECKACTION)0u),
|
||||
dIdentifierTable((CLIIdentifierTable)0u),
|
||||
cIdentifierTable((CLIIdentifierTable)0u),
|
||||
argv0(),
|
||||
modFileAliasStrings(),
|
||||
imppath(),
|
||||
|
@ -8119,7 +8188,7 @@ struct Param final
|
|||
mapfile()
|
||||
{
|
||||
}
|
||||
Param(bool obj, bool multiobj = false, bool trace = false, bool tracegc = false, bool vcg_ast = false, DiagnosticReporting useDeprecated = (DiagnosticReporting)1u, bool useUnitTests = false, bool useInline = false, bool release = false, bool preservePaths = false, DiagnosticReporting warnings = (DiagnosticReporting)2u, bool cov = false, uint8_t covPercent = 0u, bool ctfe_cov = false, bool ignoreUnsupportedPragmas = true, bool useModuleInfo = true, bool useTypeInfo = true, bool useExceptions = true, bool useGC = true, bool betterC = false, bool addMain = false, bool allInst = false, bool bitfields = false, CppStdRevision cplusplus = (CppStdRevision)201103u, Help help = Help(), Verbose v = Verbose(), FeatureState useDIP25 = (FeatureState)2u, FeatureState useDIP1000 = (FeatureState)0u, bool ehnogc = false, bool useDIP1021 = false, FeatureState fieldwise = (FeatureState)0u, bool fixAliasThis = false, FeatureState rvalueRefParam = (FeatureState)0u, FeatureState noSharedAccess = (FeatureState)0u, bool previewIn = false, bool inclusiveInContracts = false, bool shortenedMethods = true, bool fixImmutableConv = false, bool fix16997 = true, FeatureState dtorFields = (FeatureState)0u, FeatureState systemVariables = (FeatureState)0u, CHECKENABLE useInvariants = (CHECKENABLE)0u, CHECKENABLE useIn = (CHECKENABLE)0u, CHECKENABLE useOut = (CHECKENABLE)0u, CHECKENABLE useArrayBounds = (CHECKENABLE)0u, CHECKENABLE useAssert = (CHECKENABLE)0u, CHECKENABLE useSwitchError = (CHECKENABLE)0u, CHECKENABLE boundscheck = (CHECKENABLE)0u, CHECKACTION checkAction = (CHECKACTION)0u, _d_dynamicArray< const char > argv0 = {}, Array<const char* > modFileAliasStrings = Array<const char* >(), Array<const char* > imppath = Array<const char* >(), Array<const char* > fileImppath = Array<const char* >(), _d_dynamicArray< const char > objdir = {}, _d_dynamicArray< const char > objname = {}, _d_dynamicArray< const char > libname = {}, Output ddoc = Output(), Output dihdr = Output(), Output cxxhdr = 
Output(), Output json = Output(), JsonFieldFlags jsonFieldFlags = (JsonFieldFlags)0u, Output makeDeps = Output(), Output mixinOut = Output(), Output moduleDeps = Output(), uint32_t debuglevel = 0u, uint32_t versionlevel = 0u, bool run = false, Array<const char* > runargs = Array<const char* >(), Array<const char* > cppswitches = Array<const char* >(), const char* cpp = nullptr, Array<const char* > objfiles = Array<const char* >(), Array<const char* > linkswitches = Array<const char* >(), Array<bool > linkswitchIsForCC = Array<bool >(), Array<const char* > libfiles = Array<const char* >(), Array<const char* > dllfiles = Array<const char* >(), _d_dynamicArray< const char > deffile = {}, _d_dynamicArray< const char > resfile = {}, _d_dynamicArray< const char > exefile = {}, _d_dynamicArray< const char > mapfile = {}) :
|
||||
Param(bool obj, bool multiobj = false, bool trace = false, bool tracegc = false, bool vcg_ast = false, DiagnosticReporting useDeprecated = (DiagnosticReporting)1u, bool useUnitTests = false, bool useInline = false, bool release = false, bool preservePaths = false, DiagnosticReporting warnings = (DiagnosticReporting)2u, bool cov = false, uint8_t covPercent = 0u, bool ctfe_cov = false, bool ignoreUnsupportedPragmas = true, bool useModuleInfo = true, bool useTypeInfo = true, bool useExceptions = true, bool useGC = true, bool betterC = false, bool addMain = false, bool allInst = false, bool bitfields = false, CppStdRevision cplusplus = (CppStdRevision)201103u, Help help = Help(), Verbose v = Verbose(), FeatureState useDIP25 = (FeatureState)2u, FeatureState useDIP1000 = (FeatureState)0u, bool ehnogc = false, bool useDIP1021 = false, FeatureState fieldwise = (FeatureState)0u, bool fixAliasThis = false, FeatureState rvalueRefParam = (FeatureState)0u, FeatureState noSharedAccess = (FeatureState)0u, bool previewIn = false, bool inclusiveInContracts = false, bool shortenedMethods = true, bool fixImmutableConv = false, bool fix16997 = true, FeatureState dtorFields = (FeatureState)0u, FeatureState systemVariables = (FeatureState)0u, CHECKENABLE useInvariants = (CHECKENABLE)0u, CHECKENABLE useIn = (CHECKENABLE)0u, CHECKENABLE useOut = (CHECKENABLE)0u, CHECKENABLE useArrayBounds = (CHECKENABLE)0u, CHECKENABLE useAssert = (CHECKENABLE)0u, CHECKENABLE useSwitchError = (CHECKENABLE)0u, CHECKENABLE boundscheck = (CHECKENABLE)0u, CHECKACTION checkAction = (CHECKACTION)0u, CLIIdentifierTable dIdentifierTable = (CLIIdentifierTable)0u, CLIIdentifierTable cIdentifierTable = (CLIIdentifierTable)0u, _d_dynamicArray< const char > argv0 = {}, Array<const char* > modFileAliasStrings = Array<const char* >(), Array<const char* > imppath = Array<const char* >(), Array<const char* > fileImppath = Array<const char* >(), _d_dynamicArray< const char > objdir = {}, _d_dynamicArray< const char > 
objname = {}, _d_dynamicArray< const char > libname = {}, Output ddoc = Output(), Output dihdr = Output(), Output cxxhdr = Output(), Output json = Output(), JsonFieldFlags jsonFieldFlags = (JsonFieldFlags)0u, Output makeDeps = Output(), Output mixinOut = Output(), Output moduleDeps = Output(), uint32_t debuglevel = 0u, uint32_t versionlevel = 0u, bool run = false, Array<const char* > runargs = Array<const char* >(), Array<const char* > cppswitches = Array<const char* >(), const char* cpp = nullptr, Array<const char* > objfiles = Array<const char* >(), Array<const char* > linkswitches = Array<const char* >(), Array<bool > linkswitchIsForCC = Array<bool >(), Array<const char* > libfiles = Array<const char* >(), Array<const char* > dllfiles = Array<const char* >(), _d_dynamicArray< const char > deffile = {}, _d_dynamicArray< const char > resfile = {}, _d_dynamicArray< const char > exefile = {}, _d_dynamicArray< const char > mapfile = {}) :
|
||||
obj(obj),
|
||||
multiobj(multiobj),
|
||||
trace(trace),
|
||||
|
@ -8169,6 +8238,8 @@ struct Param final
|
|||
useSwitchError(useSwitchError),
|
||||
boundscheck(boundscheck),
|
||||
checkAction(checkAction),
|
||||
dIdentifierTable(dIdentifierTable),
|
||||
cIdentifierTable(cIdentifierTable),
|
||||
argv0(argv0),
|
||||
modFileAliasStrings(modFileAliasStrings),
|
||||
imppath(imppath),
|
||||
|
|
|
@ -72,6 +72,16 @@ enum FeatureState : ubyte
|
|||
enabled = 2, /// Specified as `-preview=`
|
||||
}
|
||||
|
||||
/// Different identifier tables specifiable by CLI
|
||||
enum CLIIdentifierTable : ubyte
|
||||
{
|
||||
default_ = 0, /// Not specified by user
|
||||
C99 = 1, /// Tables from C99 standard
|
||||
C11 = 2, /// Tables from C11 standard
|
||||
UAX31 = 3, /// Tables from the Unicode Standard Annex 31: UNICODE IDENTIFIERS AND SYNTAX
|
||||
All = 4, /// The least restrictive set of all other tables
|
||||
}
|
||||
|
||||
extern(C++) struct Output
|
||||
{
|
||||
bool doOutput; // Output is enabled
|
||||
|
@ -199,6 +209,9 @@ extern (C++) struct Param
|
|||
|
||||
CHECKACTION checkAction = CHECKACTION.D; // action to take when bounds, asserts or switch defaults are violated
|
||||
|
||||
CLIIdentifierTable dIdentifierTable = CLIIdentifierTable.default_;
|
||||
CLIIdentifierTable cIdentifierTable = CLIIdentifierTable.default_;
|
||||
|
||||
const(char)[] argv0; // program name
|
||||
Array!(const(char)*) modFileAliasStrings; // array of char*'s of -I module filename alias strings
|
||||
Array!(const(char)*) imppath; // array of char*'s of where to look for import modules
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#include "root/dcompat.h"
|
||||
#include "root/ctfloat.h"
|
||||
#include "common/outbuffer.h"
|
||||
#include "common/charactertables.h"
|
||||
#include "root/filename.h"
|
||||
#include "compiler.h"
|
||||
|
||||
|
@ -82,6 +83,16 @@ enum class FeatureState : unsigned char
|
|||
enabled = 2, /// Specified as `-preview=`
|
||||
};
|
||||
|
||||
/// Different identifier tables specifiable by CLI
|
||||
enum class CLIIdentifierTable : unsigned char
|
||||
{
|
||||
default_ = 0, /// Not specified by user
|
||||
C99 = 1, /// Tables from C99 standard
|
||||
C11 = 2, /// Tables from C11 standard
|
||||
UAX31 = 3, /// Tables from the Unicode Standard Annex 31: UNICODE IDENTIFIERS AND SYNTAX
|
||||
All = 4, /// The least restrictive set of all other tables
|
||||
};
|
||||
|
||||
struct Output
|
||||
{
|
||||
/// Configuration for the compiler generator
|
||||
|
@ -200,6 +211,9 @@ struct Param
|
|||
|
||||
CHECKACTION checkAction; // action to take when bounds, asserts or switch defaults are violated
|
||||
|
||||
CLIIdentifierTable dIdentifierTable;
|
||||
CLIIdentifierTable cIdentifierTable;
|
||||
|
||||
DString argv0; // program name
|
||||
Array<const char *> modFileAliasStrings; // array of char*'s of -I module filename alias strings
|
||||
Array<const char *> imppath; // array of char*'s of where to look for import modules
|
||||
|
@ -274,6 +288,9 @@ struct CompileEnv
|
|||
DString timestamp;
|
||||
d_bool previewIn;
|
||||
d_bool ddocOutput;
|
||||
d_bool masm;
|
||||
IdentifierCharLookup cCharLookupTable;
|
||||
IdentifierCharLookup dCharLookupTable;
|
||||
};
|
||||
|
||||
struct Global
|
||||
|
|
|
@ -315,28 +315,83 @@ nothrow:
|
|||
/**********************************
|
||||
* ditto
|
||||
*/
|
||||
extern (D) static bool isValidIdentifier(const(char)[] str) @trusted
{
    import dmd.common.charactertables;

    // Params:
    //   str = candidate identifier text (UTF-8)
    // Returns: true if str is non-empty, does not begin with a decimal digit, and every
    //   character is either an ASCII identifier character or a correctly encoded
    //   non-ASCII character accepted by isAnyStart (first) / isAnyContinue (rest).

    if (str.length == 0 ||
        (str[0] >= '0' && str[0] <= '9')) // beware of isdigit() on signed chars
    {
        return false;
    }

    // In a previous implementation this was implemented quite naively,
    // by utilizing the libc.
    // However we can do better, by copying the lexer approach to identifier validation.

    const(char)* p = &str[0], pEnd = str.ptr + str.length;

    // handle start characters
    {
        const c = *p;

        if (isidchar(c))
            p++;
        else if (c & 0x80)
        {
            size_t countDecoded;
            dchar decoded;

            // utf_decodeChar yields null on success, so the character is only
            // acceptable when decoding succeeded AND it is a start character.
            if (utf_decodeChar(p[0 .. pEnd - p], countDecoded, decoded) is null &&
                isAnyStart(decoded))
                p += countDecoded;
            else
                return false;
        }
        else
            return false;
    }

    // handle continue characters
    while(p !is pEnd)
    {
        const c = *p;

        if (isidchar(c)) // handles ASCII subset
        {
            p++;
            continue;
        }
        else if (c & 0x80)
        {
            size_t countDecoded;
            dchar decoded;

            // same rule as above: valid encoding AND a continue character
            if (utf_decodeChar(p[0 .. pEnd - p], countDecoded, decoded) is null &&
                isAnyContinue(decoded))
            {
                p += countDecoded;
                continue;
            }
            else
                return false;
        }
        else
            return false;
    }

    return true;
}
|
||||
|
||||
///
|
||||
unittest
|
||||
{
|
||||
assert(Identifier.isValidIdentifier("tes123_t".ptr));
|
||||
assert(!Identifier.isValidIdentifier("tes123_^t".ptr));
|
||||
assert(Identifier.isValidIdentifier("te123s_ğt".ptr));
|
||||
assert(!Identifier.isValidIdentifier("t^e123s_ğt".ptr));
|
||||
}
|
||||
|
||||
extern (D) static Identifier lookup(const(char)* s, size_t len)
|
||||
{
|
||||
return lookup(s[0 .. len]);
|
||||
|
|
|
@ -22,9 +22,11 @@ import dmd.errorsink;
|
|||
import dmd.id;
|
||||
import dmd.identifier;
|
||||
import dmd.location;
|
||||
import dmd.common.smallbuffer;
|
||||
import dmd.common.outbuffer;
|
||||
import dmd.common.charactertables;
|
||||
import dmd.root.array;
|
||||
import dmd.root.ctfloat;
|
||||
import dmd.common.outbuffer;
|
||||
import dmd.root.port;
|
||||
import dmd.root.rmem;
|
||||
import dmd.root.utf;
|
||||
|
@ -42,6 +44,8 @@ version (DMDLIB)
|
|||
*/
|
||||
struct CompileEnv
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
uint versionNumber; /// __VERSION__
|
||||
const(char)[] date; /// __DATE__
|
||||
const(char)[] time; /// __TIME__
|
||||
|
@ -51,6 +55,10 @@ struct CompileEnv
|
|||
bool previewIn; /// `in` means `[ref] scope const`, accepts rvalues
|
||||
bool ddocOutput; /// collect embedded documentation comments
|
||||
bool masm; /// use MASM inline asm syntax
|
||||
|
||||
// these need a default otherwise tests won't work.
|
||||
IdentifierCharLookup cCharLookupTable;
|
||||
IdentifierCharLookup dCharLookupTable;
|
||||
}
|
||||
|
||||
/***********************************************************
|
||||
|
@ -66,6 +74,8 @@ class Lexer
|
|||
|
||||
Token token;
|
||||
|
||||
IdentifierCharLookup charLookup;
|
||||
|
||||
// For ImportC
|
||||
bool Ccompile; /// true if compiling ImportC
|
||||
|
||||
|
@ -142,6 +152,8 @@ class Lexer
|
|||
{
|
||||
this.compileEnv.versionNumber = 1;
|
||||
this.compileEnv.vendor = "DLF";
|
||||
this.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
|
||||
this.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
|
||||
}
|
||||
//initKeywords();
|
||||
/* If first line starts with '#!', ignore the line
|
||||
|
@ -175,6 +187,16 @@ class Lexer
|
|||
}
|
||||
endOfLine();
|
||||
}
|
||||
|
||||
// setup the identifier table lookup functions
|
||||
if (this.Ccompile)
|
||||
{
|
||||
charLookup = this.compileEnv.cCharLookupTable;
|
||||
}
|
||||
else
|
||||
{
|
||||
charLookup = this.compileEnv.dCharLookupTable;
|
||||
}
|
||||
}
|
||||
|
||||
/***********************
|
||||
|
@ -306,6 +328,8 @@ class Lexer
|
|||
t.blockComment = null;
|
||||
t.lineComment = null;
|
||||
|
||||
size_t universalCharacterName4, universalCharacterName8;
|
||||
|
||||
while (1)
|
||||
{
|
||||
t.ptr = p;
|
||||
|
@ -395,11 +419,36 @@ class Lexer
|
|||
continue; // skip white space
|
||||
|
||||
case '\\':
|
||||
if (Ccompile && (p[1] == '\r' || p[1] == '\n'))
|
||||
if (Ccompile)
|
||||
{
|
||||
if (p[1] == '\r' || p[1] == '\n')
|
||||
{
|
||||
++p; // ignore \ followed by new line, like VC does
|
||||
continue;
|
||||
}
|
||||
else if (p[1] == 'u')
|
||||
{
|
||||
// Universal Character Name (C) 2 byte
|
||||
// \uXXXX
|
||||
// let the main case handling for identifiers process this
|
||||
|
||||
// case_ident will always increment, so subtract to prevent branching on the fast path
|
||||
p--;
|
||||
|
||||
goto case_ident;
|
||||
}
|
||||
else if (p[1] == 'U')
|
||||
{
|
||||
// Universal Character Name (C) 4 byte
|
||||
// \UXXXXXXXX
|
||||
// let the main case handling for identifiers process this
|
||||
|
||||
// case_ident will always increment, so subtract to prevent branching on the fast path
|
||||
p--;
|
||||
|
||||
goto case_ident;
|
||||
}
|
||||
}
|
||||
goto default;
|
||||
|
||||
case '0':
|
||||
|
@ -586,23 +635,161 @@ class Lexer
|
|||
case '_':
|
||||
case_ident:
|
||||
{
|
||||
while (1)
|
||||
IdentLoop: while (1)
|
||||
{
|
||||
// If this is changed, change the decrement in C's universal character name code above
|
||||
// For syntax \uXXXX and \UXXXXXXXX
|
||||
const c = *++p;
|
||||
|
||||
// Is this the first character of the identifier
|
||||
// For the universal character name this will line up,
|
||||
// for the main switch it won't since it wasn't the first,
|
||||
// for the default it won't either because a decode increments.
|
||||
const isStartCharacter = t.ptr is p;
|
||||
|
||||
if (isidchar(c))
|
||||
continue;
|
||||
else if (c & 0x80)
|
||||
{
|
||||
const s = p;
|
||||
const u = decodeUTF();
|
||||
if (isUniAlpha(u))
|
||||
|
||||
if (isStartCharacter)
|
||||
{
|
||||
if (charLookup.isStart(u))
|
||||
continue;
|
||||
error(t.loc, "char 0x%04x not allowed in identifier", u);
|
||||
error(t.loc, "character 0x%04x is not allowed as a start character in an identifier", u);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (charLookup.isContinue(u))
|
||||
continue;
|
||||
error(t.loc, "character 0x%04x is not allowed as a continue character in an identifier", u);
|
||||
}
|
||||
|
||||
p = s;
|
||||
}
|
||||
else if (Ccompile && c == '\\')
|
||||
{
|
||||
uint times;
|
||||
const s = p;
|
||||
p++;
|
||||
|
||||
if (*p == 'u')
|
||||
{
|
||||
// Universal Character Name (C) 2 byte
|
||||
// \uXXXX
|
||||
p++;
|
||||
times = 4;
|
||||
}
|
||||
else if (*p == 'U')
|
||||
{
|
||||
// Universal Character Name (C) 4 byte
|
||||
// \UXXXXXXXX
|
||||
p++;
|
||||
times = 8;
|
||||
}
|
||||
else
|
||||
{
|
||||
error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid u/U", *p);
|
||||
p = s;
|
||||
break;
|
||||
}
|
||||
|
||||
foreach(_; 0 .. times)
|
||||
{
|
||||
const hc = *p;
|
||||
p++;
|
||||
|
||||
if ((hc >= '0' && hc <= '9') || (hc >= 'a' && hc <= 'f') || (hc >= 'A' && hc <= 'F'))
|
||||
continue;
|
||||
|
||||
error(t.loc, "char 0x%x is not allowed to follow '\\' expecting a C universal character name in format \\uXXXX or \\UXXXXXXXX with hex digits instead of X with invalid hex digit", hc);
|
||||
p = s;
|
||||
break IdentLoop;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
Identifier id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
|
||||
|
||||
Identifier id;
|
||||
|
||||
if (universalCharacterName4 > 0 || universalCharacterName8 > 0)
|
||||
{
|
||||
auto priorValidation = t.ptr[0 .. p - t.ptr];
|
||||
const(char)* priorVPtr = priorValidation.ptr;
|
||||
const possibleLength = (
|
||||
priorValidation.length - (
|
||||
(universalCharacterName4 * 6) +
|
||||
(universalCharacterName8 * 10)
|
||||
)) + (
|
||||
(universalCharacterName4 * 3) +
|
||||
(universalCharacterName8 * 4)
|
||||
);
|
||||
|
||||
char[64] buffer = void;
|
||||
SmallBuffer!char sb = SmallBuffer!char(possibleLength, buffer[]);
|
||||
|
||||
char[] storage = sb.extent;
|
||||
size_t offset;
|
||||
|
||||
while(priorVPtr < &priorValidation[$-1] + 1)
|
||||
{
|
||||
if (*priorVPtr == '\\')
|
||||
{
|
||||
dchar tempDchar = 0;
|
||||
uint times;
|
||||
|
||||
// universal character name (C)
|
||||
if (priorVPtr[1] == 'u')
|
||||
times = 4;
|
||||
else if (priorVPtr[1] == 'U')
|
||||
times = 8;
|
||||
else
|
||||
assert(0, "ICE: Universal character name is 2 or 4 bytes only");
|
||||
priorVPtr += 2;
|
||||
|
||||
foreach(_; 0 .. times)
|
||||
{
|
||||
char c = *++priorVPtr;
|
||||
if (c >= '0' && c <= '9')
|
||||
c -= '0';
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
c -= 'a' - 10;
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
c -= 'A' - 10;
|
||||
|
||||
tempDchar <<= 4;
|
||||
tempDchar |= c;
|
||||
}
|
||||
|
||||
utf_encodeChar(&storage[offset], tempDchar);
|
||||
offset += utf_codeLengthChar(tempDchar);
|
||||
|
||||
// Could be an error instead of a warning,
|
||||
// but hey it was written specifically so why worry?
|
||||
if (priorVPtr is priorValidation.ptr)
|
||||
{
|
||||
if (!charLookup.isStart(tempDchar))
|
||||
warning(t.loc, "char 0x%x is not allowed start character for an identifier", tempDchar);
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!charLookup.isContinue(tempDchar))
|
||||
warning(t.loc, "char 0x%x is not allowed continue character for an identifier", tempDchar);
|
||||
}
|
||||
}
|
||||
else
|
||||
storage[offset++] = *++priorVPtr;
|
||||
}
|
||||
|
||||
id = Identifier.idPool(storage[0 .. offset], false);
|
||||
}
|
||||
else
|
||||
id = Identifier.idPool((cast(char*)t.ptr)[0 .. p - t.ptr], false);
|
||||
|
||||
t.ident = id;
|
||||
t.value = cast(TOK)id.getValue();
|
||||
|
||||
|
@ -1174,9 +1361,11 @@ class Lexer
|
|||
if (c & 0x80)
|
||||
{
|
||||
c = decodeUTF();
|
||||
// Check for start of unicode identifier
|
||||
if (isUniAlpha(c))
|
||||
|
||||
// Check for start of an identifier
|
||||
if (charLookup.isStart(c))
|
||||
goto case_ident;
|
||||
|
||||
if (c == PS || c == LS)
|
||||
{
|
||||
endOfLine();
|
||||
|
@ -1688,7 +1877,7 @@ class Lexer
|
|||
delimright = ']';
|
||||
else if (c == '<')
|
||||
delimright = '>';
|
||||
else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
|
||||
else if (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c)))
|
||||
{
|
||||
// Start of identifier; must be a heredoc
|
||||
Token tok;
|
||||
|
@ -1736,7 +1925,9 @@ class Lexer
|
|||
}
|
||||
else if (c == delimright)
|
||||
goto Ldone;
|
||||
if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
|
||||
|
||||
// we're looking for a new identifier token
|
||||
if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && charLookup.isStart(c))) && hereid)
|
||||
{
|
||||
Token tok;
|
||||
auto psave = p;
|
||||
|
@ -2988,6 +3179,11 @@ class Lexer
|
|||
eSink.deprecation(loc, format, args);
|
||||
}
|
||||
|
||||
/// Reports a warning at `loc` by forwarding `format` and `args` unchanged to `eSink.warning`.
void warning(T...)(const ref Loc loc, const(char)* format, T args)
|
||||
{
|
||||
eSink.warning(loc, format, args);
|
||||
}
|
||||
|
||||
void deprecation(T...)(const(char)* format, T args)
|
||||
{
|
||||
eSink.deprecation(token.loc, format, args);
|
||||
|
@ -3416,124 +3612,6 @@ class Lexer
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/******************************* Private *****************************************/
|
||||
|
||||
private:
|
||||
|
||||
private enum LS = 0x2028; // UTF line separator
|
||||
private enum PS = 0x2029; // UTF paragraph separator
|
||||
|
||||
/********************************************
 * Do our own char maps.
 *
 * Entry `c` of the table is the OR of every `CM*` flag that applies to the
 * byte value `c`; the table is produced by the initializer lambda below.
 */
private static immutable cmtable = ()
{
    ubyte[256] flagsOf;
    foreach (const c; 0 .. flagsOf.length)
    {
        ubyte flags;

        if (c >= '0' && c <= '7')
            flags |= CMoctal;
        if (c_isxdigit(c))
            flags |= CMhex;
        if (c == '_' || c_isalnum(c))
            flags |= CMidchar;

        // 'x', 'X', 'b', 'B' get CMzerosecond only; the second group gets both flags.
        const zeroOnly = c == 'x' || c == 'X' || c == 'b' || c == 'B';
        const zeroAndDigit =
            (c >= '0' && c <= '9') ||
            c == 'e' || c == 'E' ||
            c == 'f' || c == 'F' ||
            c == 'l' || c == 'L' ||
            c == 'p' || c == 'P' ||
            c == 'u' || c == 'U' ||
            c == 'i' || c == '.' || c == '_';

        if (zeroOnly)
            flags |= CMzerosecond;
        else if (zeroAndDigit)
            flags |= CMzerosecond | CMdigitsecond;

        // Everything below 0x80 is a "single char" except these six bytes.
        const excluded =
            c == '\\' || c == '\n' || c == '\r' ||
            c == 0 || c == 0x1A || c == '\'';
        if (!excluded && !(c & 0x80))
            flags |= CMsinglechar;

        flagsOf[c] = flags;
    }
    return flagsOf;
}();
|
||||
|
||||
/// Bit flags combined per byte in `cmtable`; one bit per character class.
private enum
{
    CMoctal       = 1 << 0,
    CMhex         = 1 << 1,
    CMidchar      = 1 << 2,
    CMzerosecond  = 1 << 3,
    CMdigitsecond = 1 << 4,
    CMsinglechar  = 1 << 5,
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMoctal` flag for byte `c`.
private bool isoctal(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMoctal;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMhex` flag for byte `c`.
private bool ishex(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMhex;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMidchar` flag for byte `c`.
private bool isidchar(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMidchar;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMzerosecond` flag for byte `c`.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMzerosecond;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMdigitsecond` flag for byte `c`.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMdigitsecond;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `cmtable` carries the `CMsinglechar` flag for byte `c`.
private bool issinglechar(const char c) pure @nogc @safe
{
    const masked = cmtable[c] & CMsinglechar;
    return masked != 0;
}
|
||||
|
||||
/// Returns: true when `c` is an ASCII hexadecimal digit ('0'-'9', 'a'-'f' or 'A'-'F').
private bool c_isxdigit(const int c) pure @nogc @safe
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}
|
||||
|
||||
/// Returns: true when `c` is an ASCII digit or an ASCII letter of either case.
private bool c_isalnum(const int c) pure @nogc @safe
{
    switch (c)
    {
        case '0': .. case '9':
        case 'a': .. case 'z':
        case 'A': .. case 'Z':
            return true;

        default:
            return false;
    }
}
|
||||
|
||||
/******************************* Unittest *****************************************/
|
||||
|
||||
unittest
|
||||
|
|
|
@ -157,6 +157,8 @@ private:
|
|||
*/
|
||||
private int tryMain(size_t argc, const(char)** argv, ref Param params)
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
Strings files;
|
||||
Strings libmodules;
|
||||
global._init();
|
||||
|
@ -168,6 +170,52 @@ private int tryMain(size_t argc, const(char)** argv, ref Param params)
|
|||
global.compileEnv.previewIn = global.params.previewIn;
|
||||
global.compileEnv.ddocOutput = global.params.ddoc.doOutput;
|
||||
|
||||
final switch(global.params.cIdentifierTable)
|
||||
{
|
||||
case CLIIdentifierTable.C99:
|
||||
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C99);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.C11:
|
||||
case CLIIdentifierTable.default_:
|
||||
// ImportC is defined against C11, not C23.
|
||||
// If it was C23 this needs to be changed to UAX31 instead.
|
||||
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C11);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.UAX31:
|
||||
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.UAX31);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.All:
|
||||
global.compileEnv.cCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
|
||||
break;
|
||||
}
|
||||
|
||||
final switch(global.params.dIdentifierTable)
|
||||
{
|
||||
case CLIIdentifierTable.C99:
|
||||
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C99);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.C11:
|
||||
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.C11);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.UAX31:
|
||||
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.UAX31);
|
||||
break;
|
||||
|
||||
case CLIIdentifierTable.All:
|
||||
case CLIIdentifierTable.default_:
|
||||
// @@@DEPRECATED_2.119@@@
|
||||
// Change the default to UAX31,
|
||||
// this is a breaking change as C99 (what D used for ~23 years),
|
||||
// has characters that are not in UAX31.
|
||||
global.compileEnv.dCharLookupTable = IdentifierCharLookup.forTable(IdentifierTable.LR);
|
||||
break;
|
||||
}
|
||||
|
||||
if (params.help.usage)
|
||||
{
|
||||
usage();
|
||||
|
|
|
@ -1383,6 +1383,58 @@ bool parseCommandLine(const ref Strings arguments, const size_t argc, ref Param
|
|||
params.useInline = true;
|
||||
params.dihdr.fullOutput = true;
|
||||
}
|
||||
else if (startsWith(p + 1, "identifiers-importc"))
|
||||
{
|
||||
enum len = "-identifiers-importc=".length;
|
||||
// Parse:
|
||||
// -identifiers-importc=table
|
||||
immutable string msg = "Only `UAX31`, `c99`, `c11`, `all`, allowed for `-identifiers-importc`";
|
||||
if (Identifier.isValidIdentifier(p + len))
|
||||
{
|
||||
const ident = p + len;
|
||||
switch (ident.toDString())
|
||||
{
|
||||
case "c99": params.cIdentifierTable = CLIIdentifierTable.C99; break;
|
||||
case "c11": params.cIdentifierTable = CLIIdentifierTable.C11; break;
|
||||
case "UAX31": params.cIdentifierTable = CLIIdentifierTable.UAX31; break;
|
||||
case "all": params.cIdentifierTable = CLIIdentifierTable.All; break;
|
||||
default:
|
||||
errorInvalidSwitch(p, msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
errorInvalidSwitch(p, msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (startsWith(p + 1, "identifiers"))
|
||||
{
|
||||
enum len = "-identifiers=".length;
|
||||
// Parse:
|
||||
// -identifiers=table
|
||||
immutable string msg = "Only `UAX31`, `c99`, `c11`, `all`, allowed for `-identifiers`";
|
||||
if (Identifier.isValidIdentifier(p + len))
|
||||
{
|
||||
const ident = p + len;
|
||||
switch (ident.toDString())
|
||||
{
|
||||
case "c99": params.dIdentifierTable = CLIIdentifierTable.C99; break;
|
||||
case "c11": params.dIdentifierTable = CLIIdentifierTable.C11; break;
|
||||
case "UAX31": params.dIdentifierTable = CLIIdentifierTable.UAX31; break;
|
||||
case "all": params.dIdentifierTable = CLIIdentifierTable.All; break;
|
||||
default:
|
||||
errorInvalidSwitch(p, msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
errorInvalidSwitch(p, msg);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (arg == "-i")
|
||||
includeImports = true;
|
||||
else if (startsWith(p + 1, "i="))
|
||||
|
|
|
@ -67,6 +67,8 @@ void pragmaDeclSemantic(PragmaDeclaration pd, Scope* sc)
|
|||
}
|
||||
version (all)
|
||||
{
|
||||
import dmd.common.charactertables;
|
||||
|
||||
/* Note: D language specification should not have any assumption about backend
|
||||
* implementation. Ideally pragma(mangle) can accept a string of any content.
|
||||
*
|
||||
|
@ -94,7 +96,7 @@ void pragmaDeclSemantic(PragmaDeclaration pd, Scope* sc)
|
|||
.error(pd.loc, "%s `%s` %.*s", pd.kind, pd.toPrettyChars, cast(int)msg.length, msg.ptr);
|
||||
break;
|
||||
}
|
||||
if (!isUniAlpha(c))
|
||||
if (!isAnyIdentifierCharacter(c))
|
||||
{
|
||||
.error(pd.loc, "%s `%s` char `0x%04x` not allowed in mangled name", pd.kind, pd.toPrettyChars, c);
|
||||
break;
|
||||
|
|
|
@ -27,281 +27,6 @@ bool utf_isValidDchar(dchar c)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*******************************
 * Return !=0 if unicode alpha.
 * Use table from C99 Appendix D.
 *
 * Params:
 *      c = code point to classify
 * Returns:
 *      true when `c` lies inside one of the inclusive ranges of the table below
 */
bool isUniAlpha(dchar c)
{
    // Sorted, non-overlapping [first, last] pairs; both ends are inclusive.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA], [0x00B5, 0x00B5], [0x00B7, 0x00B7], [0x00BA, 0x00BA],
        [0x00C0, 0x00D6], [0x00D8, 0x00F6], [0x00F8, 0x01F5], [0x01FA, 0x0217],
        [0x0250, 0x02A8], [0x02B0, 0x02B8], [0x02BB, 0x02BB], [0x02BD, 0x02C1],
        [0x02D0, 0x02D1], [0x02E0, 0x02E4], [0x037A, 0x037A], [0x0386, 0x0386],
        [0x0388, 0x038A], [0x038C, 0x038C], [0x038E, 0x03A1], [0x03A3, 0x03CE],
        [0x03D0, 0x03D6], [0x03DA, 0x03DA], [0x03DC, 0x03DC], [0x03DE, 0x03DE],
        [0x03E0, 0x03E0], [0x03E2, 0x03F3], [0x0401, 0x040C], [0x040E, 0x044F],
        [0x0451, 0x045C], [0x045E, 0x0481], [0x0490, 0x04C4], [0x04C7, 0x04C8],
        [0x04CB, 0x04CC], [0x04D0, 0x04EB], [0x04EE, 0x04F5], [0x04F8, 0x04F9],
        [0x0531, 0x0556], [0x0559, 0x0559], [0x0561, 0x0587], [0x05B0, 0x05B9],
        [0x05BB, 0x05BD], [0x05BF, 0x05BF], [0x05C1, 0x05C2], [0x05D0, 0x05EA],
        [0x05F0, 0x05F2], [0x0621, 0x063A], [0x0640, 0x0652], [0x0660, 0x0669],
        [0x0670, 0x06B7], [0x06BA, 0x06BE], [0x06C0, 0x06CE], [0x06D0, 0x06DC],
        [0x06E5, 0x06E8], [0x06EA, 0x06ED], [0x06F0, 0x06F9], [0x0901, 0x0903],
        [0x0905, 0x0939], [0x093D, 0x094D], [0x0950, 0x0952], [0x0958, 0x0963],
        [0x0966, 0x096F], [0x0981, 0x0983], [0x0985, 0x098C], [0x098F, 0x0990],
        [0x0993, 0x09A8], [0x09AA, 0x09B0], [0x09B2, 0x09B2], [0x09B6, 0x09B9],
        [0x09BE, 0x09C4], [0x09C7, 0x09C8], [0x09CB, 0x09CD], [0x09DC, 0x09DD],
        [0x09DF, 0x09E3], [0x09E6, 0x09F1], [0x0A02, 0x0A02], [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10], [0x0A13, 0x0A28], [0x0A2A, 0x0A30], [0x0A32, 0x0A33],
        [0x0A35, 0x0A36], [0x0A38, 0x0A39], [0x0A3E, 0x0A42], [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D], [0x0A59, 0x0A5C], [0x0A5E, 0x0A5E], [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74], [0x0A81, 0x0A83], [0x0A85, 0x0A8B], [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91], [0x0A93, 0x0AA8], [0x0AAA, 0x0AB0], [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9], [0x0ABD, 0x0AC5], [0x0AC7, 0x0AC9], [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0], [0x0AE0, 0x0AE0], [0x0AE6, 0x0AEF], [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C], [0x0B0F, 0x0B10], [0x0B13, 0x0B28], [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33], [0x0B36, 0x0B39], [0x0B3D, 0x0B43], [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D], [0x0B5C, 0x0B5D], [0x0B5F, 0x0B61], [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83], [0x0B85, 0x0B8A], [0x0B8E, 0x0B90], [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A], [0x0B9C, 0x0B9C], [0x0B9E, 0x0B9F], [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA], [0x0BAE, 0x0BB5], [0x0BB7, 0x0BB9], [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8], [0x0BCA, 0x0BCD], [0x0BE7, 0x0BEF], [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C], [0x0C0E, 0x0C10], [0x0C12, 0x0C28], [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39], [0x0C3E, 0x0C44], [0x0C46, 0x0C48], [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61], [0x0C66, 0x0C6F], [0x0C82, 0x0C83], [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90], [0x0C92, 0x0CA8], [0x0CAA, 0x0CB3], [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4], [0x0CC6, 0x0CC8], [0x0CCA, 0x0CCD], [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1], [0x0CE6, 0x0CEF], [0x0D02, 0x0D03], [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10], [0x0D12, 0x0D28], [0x0D2A, 0x0D39], [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48], [0x0D4A, 0x0D4D], [0x0D60, 0x0D61], [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A], [0x0E40, 0x0E5B], [0x0E81, 0x0E82], [0x0E84, 0x0E84],
        [0x0E87, 0x0E88], [0x0E8A, 0x0E8A], [0x0E8D, 0x0E8D], [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F], [0x0EA1, 0x0EA3], [0x0EA5, 0x0EA5], [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB], [0x0EAD, 0x0EAE], [0x0EB0, 0x0EB9], [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4], [0x0EC6, 0x0EC6], [0x0EC8, 0x0ECD], [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD], [0x0F00, 0x0F00], [0x0F18, 0x0F19], [0x0F20, 0x0F33],
        [0x0F35, 0x0F35], [0x0F37, 0x0F37], [0x0F39, 0x0F39], [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69], [0x0F71, 0x0F84], [0x0F86, 0x0F8B], [0x0F90, 0x0F95],
        [0x0F97, 0x0F97], [0x0F99, 0x0FAD], [0x0FB1, 0x0FB7], [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5], [0x10D0, 0x10F6], [0x1E00, 0x1E9B], [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15], [0x1F18, 0x1F1D], [0x1F20, 0x1F45], [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57], [0x1F59, 0x1F59], [0x1F5B, 0x1F5B], [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D], [0x1F80, 0x1FB4], [0x1FB6, 0x1FBC], [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4], [0x1FC6, 0x1FCC], [0x1FD0, 0x1FD3], [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC], [0x1FF2, 0x1FF4], [0x1FF6, 0x1FFC], [0x203F, 0x2040],
        [0x207F, 0x207F], [0x2102, 0x2102], [0x2107, 0x2107], [0x210A, 0x2113],
        [0x2115, 0x2115], [0x2118, 0x211D], [0x2124, 0x2124], [0x2126, 0x2126],
        [0x2128, 0x2128], [0x212A, 0x2131], [0x2133, 0x2138], [0x2160, 0x2182],
        [0x3005, 0x3007], [0x3021, 0x3029], [0x3041, 0x3093], [0x309B, 0x309C],
        [0x30A1, 0x30F6], [0x30FB, 0x30FC], [0x3105, 0x312C], [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Anything outside the table's overall span cannot match.
    if (c < ALPHA_TABLE[0][0] || c > ALPHA_TABLE[$ - 1][1])
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t probe = lo + (hi - lo) / 2;
        const first = ALPHA_TABLE[probe][0];
        const last = ALPHA_TABLE[probe][1];

        if (c < first)
            hi = probe;
        else if (c > last)
            lo = probe + 1;
        else
            return true;
    }
    return false;
}
|
||||
|
||||
/**
|
||||
* Returns the code length of c in code units.
|
||||
*/
|
||||
|
|
9
compiler/test/compilable/ident_UAX31.c
Normal file
9
compiler/test/compilable/ident_UAX31.c
Normal file
|
@ -0,0 +1,9 @@
|
|||
// REQUIRED_ARGS: -identifiers-importc=UAX31
|
||||
|
||||
// sppn doesn't support anything newer than c99
|
||||
// DISABLED: win32omf
|
||||
|
||||
// verify that the UAX31 identifier set is applied.
|
||||
|
||||
int \u00F8ide\u00F9nt;
|
||||
int øideùnt2;
|
5
compiler/test/compilable/ident_UAX31.d
Normal file
5
compiler/test/compilable/ident_UAX31.d
Normal file
|
@ -0,0 +1,5 @@
|
|||
// REQUIRED_ARGS: -identifiers=UAX31
|
||||
|
||||
// verify that the UAX31 identifier set is applied.
|
||||
|
||||
int øideùnt;
|
11
compiler/test/compilable/ident_all.c
Normal file
11
compiler/test/compilable/ident_all.c
Normal file
|
@ -0,0 +1,11 @@
|
|||
// REQUIRED_ARGS: -identifiers-importc=all
|
||||
|
||||
// sppn doesn't support anything newer than c99
|
||||
// DISABLED: win32omf
|
||||
|
||||
// verify that the All identifier set is applied.
|
||||
|
||||
int \u00F8ide\u00F9nt;
|
||||
int \u00AAide\u00B5nt;
|
||||
int \u00A8ide\u00AFnt;
|
||||
int \u00F8ide\u00F9nt;
|
10
compiler/test/compilable/ident_all.d
Normal file
10
compiler/test/compilable/ident_all.d
Normal file
|
@ -0,0 +1,10 @@
|
|||
// REQUIRED_ARGS: -identifiers=all
|
||||
|
||||
// verify that the All identifier set is applied.
|
||||
|
||||
int øideùnt;
|
||||
int ªideµnt;
|
||||
int ¨ide¯nt;
|
||||
|
||||
// just to play it safe, do we support one unicode then another at start?
|
||||
int øùident;
|
9
compiler/test/compilable/ident_c11.c
Normal file
9
compiler/test/compilable/ident_c11.c
Normal file
|
@ -0,0 +1,9 @@
|
|||
// REQUIRED_ARGS: -identifiers-importc=c11
|
||||
|
||||
// sppn doesn't support anything newer than c99
|
||||
// DISABLED: win32omf
|
||||
|
||||
// verify that the C11 identifier set is applied.
|
||||
|
||||
int \u00A8ide\u00AFnt;
|
||||
int ¨ide¯nt;
|
5
compiler/test/compilable/ident_c11.d
Normal file
5
compiler/test/compilable/ident_c11.d
Normal file
|
@ -0,0 +1,5 @@
|
|||
// REQUIRED_ARGS: -identifiers=c11
|
||||
|
||||
// verify that the C11 identifier set is applied.
|
||||
|
||||
int ¨ide¯nt;
|
6
compiler/test/compilable/ident_c99.c
Normal file
6
compiler/test/compilable/ident_c99.c
Normal file
|
@ -0,0 +1,6 @@
|
|||
// REQUIRED_ARGS: -identifiers-importc=c99
|
||||
|
||||
// verify that the C99 identifier set is applied.
|
||||
|
||||
int \u00AAide\u00B5nt;
|
||||
int ªideµnt2;
|
5
compiler/test/compilable/ident_c99.d
Normal file
5
compiler/test/compilable/ident_c99.d
Normal file
|
@ -0,0 +1,5 @@
|
|||
// REQUIRED_ARGS: -identifiers=c99
|
||||
|
||||
// verify that the C99 identifier set is applied.
|
||||
|
||||
int ªideµnt;
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
TEST_OUTPUT:
|
||||
---
|
||||
fail_compilation/lexer23465.d(19): Error: char 0x1f37a not allowed in identifier
|
||||
fail_compilation/lexer23465.d(19): Error: character 0x1f37a is not allowed as a continue character in an identifier
|
||||
fail_compilation/lexer23465.d(19): Error: character 0x1f37a is not a valid token
|
||||
fail_compilation/lexer23465.d(20): Error: character '\' is not a valid token
|
||||
fail_compilation/lexer23465.d(21): Error: unterminated /+ +/ comment
|
||||
|
|
77
compiler/tools/unicode_tables/derivedCoreProperties.d
Normal file
77
compiler/tools/unicode_tables/derivedCoreProperties.d
Normal file
|
@ -0,0 +1,77 @@
|
|||
/**
|
||||
This module parses the UCD DerivedCoreProperties.txt file.
|
||||
|
||||
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
*/
|
||||
module unicode_tables.derivedCoreProperties;
|
||||
import unicode_tables.util;
|
||||
|
||||
ValueRanges propertyXID_StartRanges, propertyXID_ContinueRanges;
|
||||
|
||||
/**
Reads `dataFile` line by line and records every `XID_Start` and `XID_Continue`
entry into `propertyXID_StartRanges` and `propertyXID_ContinueRanges`.

Each line has its `#` comment removed and is then split on `;`. Only lines
with exactly two fields are considered; all other lines and all other
property names are ignored.

Params:
    dataFile = path of the text file to read
*/
void parseProperties(string dataFile)
{
    import std.algorithm : countUntil, startsWith;
    import std.file : readText;
    import std.string : lineSplitter, strip, split;
    import std.conv : parse;

    foreach (rawLine; readText(dataFile).lineSplitter)
    {
        // Cut off an end-of-line comment, then trim what is left.
        const ptrdiff_t commentAt = rawLine.countUntil('#');
        string content = (commentAt >= 0 ? rawLine[0 .. commentAt] : rawLine).strip;

        string[] columns = content.split(";");
        foreach (ref column; columns)
            column = column.strip;

        // Blank lines and entries with any other field count are skipped.
        if (columns.length != 2)
            continue;

        // parse() consumes the digits it reads, so a range leaves "..XXXX" behind.
        string codePoints = columns[0];
        ValueRange entry;
        entry.start = parse!uint(codePoints, 16);

        if (codePoints.startsWith(".."))
        {
            codePoints = codePoints[2 .. $];
            entry.end = parse!uint(codePoints, 16);
        }
        else
        {
            // Single code point: the range is just that one value.
            entry.end = entry.start;
        }

        const property = columns[1];
        if (property == "XID_Start")
            propertyXID_StartRanges.add(entry);
        else if (property == "XID_Continue")
            propertyXID_ContinueRanges.add(entry);
    }
}
|
165
compiler/tools/unicode_tables/fixedtables.d
Normal file
165
compiler/tools/unicode_tables/fixedtables.d
Normal file
|
@ -0,0 +1,165 @@
|
|||
/**
|
||||
Known fixed tables.
|
||||
|
||||
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
*/
|
||||
module unicode_tables.fixedtables;
|
||||
import unicode_tables.util;
|
||||
|
||||
/// The ASCII code points 0 through 127 as a single range.
immutable ValueRanges ASCII_Table = ValueRanges([
|
||||
ValueRange(0, 127)
|
||||
]);
|
||||
|
||||
/// Fixed identifier character ranges for the C99 table, listed in ascending order.
immutable ValueRanges c99_Table = ValueRanges([
|
||||
ValueRange(0x00AA, 0x00AA), ValueRange(0x00B5, 0x00B5),
|
||||
ValueRange(0x00B7, 0x00B7), ValueRange(0x00BA, 0x00BA),
|
||||
ValueRange(0x00C0, 0x00D6), ValueRange(0x00D8, 0x00F6),
|
||||
ValueRange(0x00F8, 0x01F5), ValueRange(0x01FA, 0x0217),
|
||||
ValueRange(0x0250, 0x02A8), ValueRange(0x02B0, 0x02B8),
|
||||
ValueRange(0x02BB, 0x02BB), ValueRange(0x02BD, 0x02C1),
|
||||
ValueRange(0x02D0, 0x02D1), ValueRange(0x02E0, 0x02E4),
|
||||
ValueRange(0x037A, 0x037A), ValueRange(0x0386, 0x0386),
|
||||
ValueRange(0x0388, 0x038A), ValueRange(0x038C, 0x038C),
|
||||
ValueRange(0x038E, 0x03A1), ValueRange(0x03A3, 0x03CE),
|
||||
ValueRange(0x03D0, 0x03D6), ValueRange(0x03DA, 0x03DA),
|
||||
ValueRange(0x03DC, 0x03DC), ValueRange(0x03DE, 0x03DE),
|
||||
ValueRange(0x03E0, 0x03E0), ValueRange(0x03E2, 0x03F3),
|
||||
ValueRange(0x0401, 0x040C), ValueRange(0x040E, 0x044F),
|
||||
ValueRange(0x0451, 0x045C), ValueRange(0x045E, 0x0481),
|
||||
ValueRange(0x0490, 0x04C4), ValueRange(0x04C7, 0x04C8),
|
||||
ValueRange(0x04CB, 0x04CC), ValueRange(0x04D0, 0x04EB),
|
||||
ValueRange(0x04EE, 0x04F5), ValueRange(0x04F8, 0x04F9),
|
||||
ValueRange(0x0531, 0x0556), ValueRange(0x0559, 0x0559),
|
||||
ValueRange(0x0561, 0x0587), ValueRange(0x05B0, 0x05B9),
|
||||
ValueRange(0x05BB, 0x05BD), ValueRange(0x05BF, 0x05BF),
|
||||
ValueRange(0x05C1, 0x05C2), ValueRange(0x05D0, 0x05EA),
|
||||
ValueRange(0x05F0, 0x05F2), ValueRange(0x0621, 0x063A),
|
||||
ValueRange(0x0640, 0x0652), ValueRange(0x0660, 0x0669),
|
||||
ValueRange(0x0670, 0x06B7), ValueRange(0x06BA, 0x06BE),
|
||||
ValueRange(0x06C0, 0x06CE), ValueRange(0x06D0, 0x06DC),
|
||||
ValueRange(0x06E5, 0x06E8), ValueRange(0x06EA, 0x06ED),
|
||||
ValueRange(0x06F0, 0x06F9), ValueRange(0x0901, 0x0903),
|
||||
ValueRange(0x0905, 0x0939), ValueRange(0x093D, 0x094D),
|
||||
ValueRange(0x0950, 0x0952), ValueRange(0x0958, 0x0963),
|
||||
ValueRange(0x0966, 0x096F), ValueRange(0x0981, 0x0983),
|
||||
ValueRange(0x0985, 0x098C), ValueRange(0x098F, 0x0990),
|
||||
ValueRange(0x0993, 0x09A8), ValueRange(0x09AA, 0x09B0),
|
||||
ValueRange(0x09B2, 0x09B2), ValueRange(0x09B6, 0x09B9),
|
||||
ValueRange(0x09BE, 0x09C4), ValueRange(0x09C7, 0x09C8),
|
||||
ValueRange(0x09CB, 0x09CD), ValueRange(0x09DC, 0x09DD),
|
||||
ValueRange(0x09DF, 0x09E3), ValueRange(0x09E6, 0x09F1),
|
||||
ValueRange(0x0A02, 0x0A02), ValueRange(0x0A05, 0x0A0A),
|
||||
ValueRange(0x0A0F, 0x0A10), ValueRange(0x0A13, 0x0A28),
|
||||
ValueRange(0x0A2A, 0x0A30), ValueRange(0x0A32, 0x0A33),
|
||||
ValueRange(0x0A35, 0x0A36), ValueRange(0x0A38, 0x0A39),
|
||||
ValueRange(0x0A3E, 0x0A42), ValueRange(0x0A47, 0x0A48),
|
||||
ValueRange(0x0A4B, 0x0A4D), ValueRange(0x0A59, 0x0A5C),
|
||||
ValueRange(0x0A5E, 0x0A5E), ValueRange(0x0A66, 0x0A6F),
|
||||
ValueRange(0x0A74, 0x0A74), ValueRange(0x0A81, 0x0A83),
|
||||
ValueRange(0x0A85, 0x0A8B), ValueRange(0x0A8D, 0x0A8D),
|
||||
ValueRange(0x0A8F, 0x0A91), ValueRange(0x0A93, 0x0AA8),
|
||||
ValueRange(0x0AAA, 0x0AB0), ValueRange(0x0AB2, 0x0AB3),
|
||||
ValueRange(0x0AB5, 0x0AB9), ValueRange(0x0ABD, 0x0AC5),
|
||||
ValueRange(0x0AC7, 0x0AC9), ValueRange(0x0ACB, 0x0ACD),
|
||||
ValueRange(0x0AD0, 0x0AD0), ValueRange(0x0AE0, 0x0AE0),
|
||||
ValueRange(0x0AE6, 0x0AEF), ValueRange(0x0B01, 0x0B03),
|
||||
ValueRange(0x0B05, 0x0B0C), ValueRange(0x0B0F, 0x0B10),
|
||||
ValueRange(0x0B13, 0x0B28), ValueRange(0x0B2A, 0x0B30),
|
||||
ValueRange(0x0B32, 0x0B33), ValueRange(0x0B36, 0x0B39),
|
||||
ValueRange(0x0B3D, 0x0B43), ValueRange(0x0B47, 0x0B48),
|
||||
ValueRange(0x0B4B, 0x0B4D), ValueRange(0x0B5C, 0x0B5D),
|
||||
ValueRange(0x0B5F, 0x0B61), ValueRange(0x0B66, 0x0B6F),
|
||||
ValueRange(0x0B82, 0x0B83), ValueRange(0x0B85, 0x0B8A),
|
||||
ValueRange(0x0B8E, 0x0B90), ValueRange(0x0B92, 0x0B95),
|
||||
ValueRange(0x0B99, 0x0B9A), ValueRange(0x0B9C, 0x0B9C),
|
||||
ValueRange(0x0B9E, 0x0B9F), ValueRange(0x0BA3, 0x0BA4),
|
||||
ValueRange(0x0BA8, 0x0BAA), ValueRange(0x0BAE, 0x0BB5),
|
||||
ValueRange(0x0BB7, 0x0BB9), ValueRange(0x0BBE, 0x0BC2),
|
||||
ValueRange(0x0BC6, 0x0BC8), ValueRange(0x0BCA, 0x0BCD),
|
||||
ValueRange(0x0BE7, 0x0BEF), ValueRange(0x0C01, 0x0C03),
|
||||
ValueRange(0x0C05, 0x0C0C), ValueRange(0x0C0E, 0x0C10),
|
||||
ValueRange(0x0C12, 0x0C28), ValueRange(0x0C2A, 0x0C33),
|
||||
ValueRange(0x0C35, 0x0C39), ValueRange(0x0C3E, 0x0C44),
|
||||
ValueRange(0x0C46, 0x0C48), ValueRange(0x0C4A, 0x0C4D),
|
||||
ValueRange(0x0C60, 0x0C61), ValueRange(0x0C66, 0x0C6F),
|
||||
ValueRange(0x0C82, 0x0C83), ValueRange(0x0C85, 0x0C8C),
|
||||
ValueRange(0x0C8E, 0x0C90), ValueRange(0x0C92, 0x0CA8),
|
||||
ValueRange(0x0CAA, 0x0CB3), ValueRange(0x0CB5, 0x0CB9),
|
||||
ValueRange(0x0CBE, 0x0CC4), ValueRange(0x0CC6, 0x0CC8),
|
||||
ValueRange(0x0CCA, 0x0CCD), ValueRange(0x0CDE, 0x0CDE),
|
||||
ValueRange(0x0CE0, 0x0CE1), ValueRange(0x0CE6, 0x0CEF),
|
||||
ValueRange(0x0D02, 0x0D03), ValueRange(0x0D05, 0x0D0C),
|
||||
ValueRange(0x0D0E, 0x0D10), ValueRange(0x0D12, 0x0D28),
|
||||
ValueRange(0x0D2A, 0x0D39), ValueRange(0x0D3E, 0x0D43),
|
||||
ValueRange(0x0D46, 0x0D48), ValueRange(0x0D4A, 0x0D4D),
|
||||
ValueRange(0x0D60, 0x0D61), ValueRange(0x0D66, 0x0D6F),
|
||||
ValueRange(0x0E01, 0x0E3A), ValueRange(0x0E40, 0x0E5B),
|
||||
ValueRange(0x0E81, 0x0E82), ValueRange(0x0E84, 0x0E84),
|
||||
ValueRange(0x0E87, 0x0E88), ValueRange(0x0E8A, 0x0E8A),
|
||||
ValueRange(0x0E8D, 0x0E8D), ValueRange(0x0E94, 0x0E97),
|
||||
ValueRange(0x0E99, 0x0E9F), ValueRange(0x0EA1, 0x0EA3),
|
||||
ValueRange(0x0EA5, 0x0EA5), ValueRange(0x0EA7, 0x0EA7),
|
||||
ValueRange(0x0EAA, 0x0EAB), ValueRange(0x0EAD, 0x0EAE),
|
||||
ValueRange(0x0EB0, 0x0EB9), ValueRange(0x0EBB, 0x0EBD),
|
||||
ValueRange(0x0EC0, 0x0EC4), ValueRange(0x0EC6, 0x0EC6),
|
||||
ValueRange(0x0EC8, 0x0ECD), ValueRange(0x0ED0, 0x0ED9),
|
||||
ValueRange(0x0EDC, 0x0EDD), ValueRange(0x0F00, 0x0F00),
|
||||
ValueRange(0x0F18, 0x0F19), ValueRange(0x0F20, 0x0F33),
|
||||
ValueRange(0x0F35, 0x0F35), ValueRange(0x0F37, 0x0F37),
|
||||
ValueRange(0x0F39, 0x0F39), ValueRange(0x0F3E, 0x0F47),
|
||||
ValueRange(0x0F49, 0x0F69), ValueRange(0x0F71, 0x0F84),
|
||||
ValueRange(0x0F86, 0x0F8B), ValueRange(0x0F90, 0x0F95),
|
||||
ValueRange(0x0F97, 0x0F97), ValueRange(0x0F99, 0x0FAD),
|
||||
ValueRange(0x0FB1, 0x0FB7), ValueRange(0x0FB9, 0x0FB9),
|
||||
ValueRange(0x10A0, 0x10C5), ValueRange(0x10D0, 0x10F6),
|
||||
ValueRange(0x1E00, 0x1E9B), ValueRange(0x1EA0, 0x1EF9),
|
||||
ValueRange(0x1F00, 0x1F15), ValueRange(0x1F18, 0x1F1D),
|
||||
ValueRange(0x1F20, 0x1F45), ValueRange(0x1F48, 0x1F4D),
|
||||
ValueRange(0x1F50, 0x1F57), ValueRange(0x1F59, 0x1F59),
|
||||
ValueRange(0x1F5B, 0x1F5B), ValueRange(0x1F5D, 0x1F5D),
|
||||
ValueRange(0x1F5F, 0x1F7D), ValueRange(0x1F80, 0x1FB4),
|
||||
ValueRange(0x1FB6, 0x1FBC), ValueRange(0x1FBE, 0x1FBE),
|
||||
ValueRange(0x1FC2, 0x1FC4), ValueRange(0x1FC6, 0x1FCC),
|
||||
ValueRange(0x1FD0, 0x1FD3), ValueRange(0x1FD6, 0x1FDB),
|
||||
ValueRange(0x1FE0, 0x1FEC), ValueRange(0x1FF2, 0x1FF4),
|
||||
ValueRange(0x1FF6, 0x1FFC), ValueRange(0x203F, 0x2040),
|
||||
ValueRange(0x207F, 0x207F), ValueRange(0x2102, 0x2102),
|
||||
ValueRange(0x2107, 0x2107), ValueRange(0x210A, 0x2113),
|
||||
ValueRange(0x2115, 0x2115), ValueRange(0x2118, 0x211D),
|
||||
ValueRange(0x2124, 0x2124), ValueRange(0x2126, 0x2126),
|
||||
ValueRange(0x2128, 0x2128), ValueRange(0x212A, 0x2131),
|
||||
ValueRange(0x2133, 0x2138), ValueRange(0x2160, 0x2182),
|
||||
ValueRange(0x3005, 0x3007), ValueRange(0x3021, 0x3029),
|
||||
ValueRange(0x3041, 0x3093), ValueRange(0x309B, 0x309C),
|
||||
ValueRange(0x30A1, 0x30F6), ValueRange(0x30FB, 0x30FC),
|
||||
ValueRange(0x3105, 0x312C), ValueRange(0x4E00, 0x9FA5),
|
||||
ValueRange(0xAC00, 0xD7A3)
|
||||
]);
|
||||
|
||||
/// Fixed identifier character ranges for the C11 table, listed in ascending order.
immutable ValueRanges c11_Table = ValueRanges([
|
||||
ValueRange(0x00A8, 0x00A8), ValueRange(0x00AA, 0x00AA),
|
||||
ValueRange(0x00AD, 0x00AD), ValueRange(0x00AF,0x00AF),
|
||||
ValueRange(0x00B2, 0x00B5), ValueRange(0x00B7, 0x00BA),
|
||||
ValueRange(0x00BC, 0x00BE), ValueRange(0x00C0, 0x00D6),
|
||||
ValueRange(0x00D8, 0x00F6), ValueRange(0x00F8, 0x00FF),
|
||||
ValueRange(0x0100, 0x167F), ValueRange(0x1681, 0x180D),
|
||||
ValueRange(0x180F, 0x1FFF), ValueRange(0x200B, 0x200D),
|
||||
ValueRange(0x202A, 0x202E), ValueRange(0x203F, 0x2040),
|
||||
ValueRange(0x2054, 0x2054), ValueRange(0x2060, 0x206F),
|
||||
ValueRange(0x2070, 0x218F), ValueRange(0x2460, 0x24FF),
|
||||
ValueRange(0x2776, 0x2793), ValueRange(0x2C00, 0x2DFF),
|
||||
ValueRange(0x2E80, 0x2FFF), ValueRange(0x3004, 0x3007),
|
||||
ValueRange(0x3021, 0x302F), ValueRange(0x3031, 0x303F),
|
||||
ValueRange(0x3040, 0xD7FF), ValueRange(0xF900, 0xFD3D),
|
||||
ValueRange(0xFD40, 0xFDCF), ValueRange(0xFDF0, 0xFE44),
|
||||
ValueRange(0xFE47, 0xFFFD), ValueRange(0x10000, 0x1FFFD),
|
||||
ValueRange(0x20000, 0x2FFFD), ValueRange(0x30000, 0x3FFFD),
|
||||
ValueRange(0x40000, 0x4FFFD), ValueRange(0x50000, 0x5FFFD),
|
||||
ValueRange(0x60000, 0x6FFFD), ValueRange(0x70000, 0x7FFFD),
|
||||
ValueRange(0x80000, 0x8FFFD), ValueRange(0x90000, 0x9FFFD),
|
||||
ValueRange(0xA0000, 0xAFFFD), ValueRange(0xB0000, 0xBFFFD),
|
||||
ValueRange(0xC0000, 0xCFFFD), ValueRange(0xD0000, 0xDFFFD),
|
||||
ValueRange(0xE0000, 0xEFFFD),
|
||||
]);
|
184
compiler/tools/unicode_tables/unicodeData.d
Normal file
184
compiler/tools/unicode_tables/unicodeData.d
Normal file
|
@ -0,0 +1,184 @@
|
|||
/**
|
||||
This module parses the UCD UnicodeData.txt file.
|
||||
|
||||
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
*/
|
||||
module unicode_tables.unicodeData;
|
||||
import unicode_tables.util;
|
||||
|
||||
/// Entries appended by parseUnicodeData, in the order they appear in the data file.
UDEntry[] udEntries;
|
||||
|
||||
/**
Parses a UnicodeData.txt style file and appends one `UDEntry` per accepted
line to `udEntries`.

Only lines with exactly 15 `;`-separated fields are considered. Field 0 is
the hexadecimal code point, field 2 the general category and field 3 the
canonical combining class.

Ranges are spelled as a pair of lines whose names end in `First>` and `Last>`:
---
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
---
The `First>` line creates the entry and the `Last>` line only extends the
end of the most recently added entry. `<control>` lines are kept as ordinary
entries; every other `<...>` name is skipped.

Params:
    dataFile = path of the file to read
*/
void parseUnicodeData(string dataFile)
{
    import std.algorithm : countUntil, endsWith;
    import std.file : readText;
    import std.string : lineSplitter, strip, split;
    import std.conv : parse;

    // Number of fields on every line that is accepted.
    enum expectedFieldCount = 15;

    // True between a "First>" line and its matching "Last>" line.
    bool rangeOpen;

    foreach (line; readText(dataFile).lineSplitter)
    {
        {
            // handle end of line comment
            ptrdiff_t offset = line.countUntil('#');
            if (offset >= 0)
                line = line[0 .. offset];
            line = line.strip;
        }

        string[] fields = line.split(";");
        foreach (ref field; fields)
            field = field.strip;

        // Covers blank lines (no fields) as well as malformed ones.
        if (fields.length != expectedFieldCount)
            continue;

        // Set when this very line is the "Last>" half of a range.
        bool closesRange;

        if (fields[1].endsWith(">"))
        {
            if (fields[1].endsWith("First>"))
            {
                rangeOpen = true;
            }
            else if (fields[1].endsWith("Last>"))
            {
                // A "Last>" without a preceding "First>" has no entry to extend.
                assert(rangeOpen);
                rangeOpen = false;
                closesRange = true;
            }
            else if (fields[1] != "<control>")
            {
                // Some other placeholder name, not a real character entry.
                continue;
            }
        }

        uint character = parse!uint(fields[0], 16);

        if (closesRange)
        {
            udEntries[$ - 1].range.end = character;
            continue;
        }

        UDEntry entry;
        entry.range = ValueRange(character);

        // Map the category abbreviation onto the enum member of the same name;
        // an unknown abbreviation leaves the default value in place.
        static foreach (GC; __traits(allMembers, GeneralCategory))
        {
            if (fields[2] == GC)
                entry.generalCategory = __traits(getMember, GeneralCategory, GC);
        }

        entry.canonicalCombiningClass = parse!int(fields[3]);

        udEntries ~= entry;
    }
}
|
||||
|
||||
/// One parsed record: a code point (or range) with its general category and combining class.
struct UDEntry
{
    /// Code points covered by this entry.
    ValueRange range;
    /// General category read from the data file.
    GeneralCategory generalCategory;
    /// Canonical combining class read from the data file.
    int canonicalCombiningClass;

@safe:

    /// True when the canonical combining class is zero.
    bool isStarter()
    {
        return !(canonicalCombiningClass != 0);
    }

    /// True for the letter categories Lu, Ll, Lt, Lm and Lo.
    bool isAlpha()
    {
        const category = generalCategory;

        return category == GeneralCategory.Lu
            || category == GeneralCategory.Ll
            || category == GeneralCategory.Lt
            || category == GeneralCategory.Lm
            || category == GeneralCategory.Lo;
    }
}
|
||||
|
||||
/// Unicode General_Category abbreviations; member names are matched against field 2 of the data file.
enum GeneralCategory
|
||||
{
|
||||
None, /// No category assigned (first member, so the default value)
|
||||
Lu, /// Letter, uppercase
|
||||
Ll, /// Letter, lowercase
|
||||
Lt, /// Letter, titlecase
|
||||
LC, /// Cased letter (group: Lu, Ll, Lt)
|
||||
Lm, /// Letter, modifier
|
||||
Lo, /// Letter, other
|
||||
L, /// Letter (group)
|
||||
Mn, /// Mark, nonspacing
|
||||
Mc, /// Mark, spacing combining
|
||||
Me, /// Mark, enclosing
|
||||
M, /// Mark (group)
|
||||
Nd, /// Number, decimal digit
|
||||
Nl, /// Number, letter
|
||||
No, /// Number, other
|
||||
N, /// Number (group)
|
||||
Pc, /// Punctuation, connector
|
||||
Pd, /// Punctuation, dash
|
||||
Ps, /// Punctuation, open
|
||||
Pe, /// Punctuation, close
|
||||
Pi, /// Punctuation, initial quote
|
||||
Pf, /// Punctuation, final quote
|
||||
Po, /// Punctuation, other
|
||||
P, /// Punctuation (group)
|
||||
Sm, /// Symbol, math
|
||||
Sc, /// Symbol, currency
|
||||
Sk, /// Symbol, modifier
|
||||
So, /// Symbol, other
|
||||
S, /// Symbol (group)
|
||||
Zs, /// Separator, space
|
||||
Zl, /// Separator, line
|
||||
Zp, /// Separator, paragraph
|
||||
Z, /// Separator (group)
|
||||
Cc, /// Other, control
|
||||
Cf, /// Other, format
|
||||
Cs, /// Other, surrogate
|
||||
Co, /// Other, private use
|
||||
Cn, /// Other, not assigned
|
||||
C, /// Other (group)
|
||||
}
|
145
compiler/tools/unicode_tables/util.d
Normal file
145
compiler/tools/unicode_tables/util.d
Normal file
|
@ -0,0 +1,145 @@
|
|||
/**
|
||||
Utilities for working with Unicode ranges.
|
||||
|
||||
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
*/
|
||||
module unicode_tables.util;
|
||||
|
||||
/// An inclusive range of code points, `start` through `end`.
struct ValueRange
{
    dchar start, end;
@safe:

    /// A range holding exactly one code point.
    this(dchar index)
    {
        start = end = index;
    }

    /// A range from `start` to `end` inclusive; `end` must not be below `start`.
    this(dchar start, dchar end)
    {
        assert(end >= start);

        this.end = end;
        this.start = start;
    }

    /// True when the range covers a single code point.
    bool isSingle() const
    {
        return end == start;
    }

    /// True when `index` lies inside the range, bounds included.
    bool within(dchar index) const
    {
        return index >= start && index <= end;
    }

    /// Number of code points covered.
    uint count() const
    {
        const uint span = end - start;
        return span + 1;
    }

    /// Orders ranges by `start` only; `end` is not considered.
    int opCmp(const ValueRange other) const
    {
        if (this.start < other.start)
            return -1;
        if (this.start > other.start)
            return 1;
        return 0;
    }

    /// Calls `del` for each code point in ascending order, stopping at the first non-zero result.
    int opApply(scope int delegate(dchar) @safe del) const
    {
        foreach (dchar codePoint; start .. end + 1)
        {
            if (int stop = del(codePoint))
                return stop;
        }

        return 0;
    }
}
|
||||
|
||||
/**
A list of inclusive code point ranges.

`add` only ever looks at the last stored range, so ranges must be added in
ascending order of `start`; `merge` sorts its input before adding.
*/
struct ValueRanges
{
    ValueRange[] ranges;

@safe:

    /**
    Appends `toAdd`, folding it into the last stored range when the two
    overlap or are directly adjacent.

    Params:
        toAdd = range to append; its start must not be below the start of the last stored range
    */
    void add(ValueRange toAdd)
    {
        if (ranges.length > 0 && (ranges[$ - 1].end >= toAdd.start || ranges[$ - 1].end + 1 == toAdd.start))
        {
            // Only ever grow the last range: toAdd may lie entirely inside it,
            // in which case taking toAdd.end unconditionally would drop code points.
            if (toAdd.end > ranges[$ - 1].end)
                ranges[$ - 1].end = toAdd.end;
        }
        else
        {
            ranges ~= toAdd;
        }
    }

    /// Returns the code points of this set that are not in `butNotThis`.
    ValueRanges not(const ref ValueRanges butNotThis) const
    {
        ValueRanges ret;

        // Walks every code point individually; add re-joins consecutive ones.
        foreach (toAdd; this)
        {
            if (butNotThis.within(toAdd))
                continue;
            ret.add(ValueRange(toAdd));
        }

        return ret;
    }

    /// Returns the union of this set and `andThis`.
    ValueRanges merge(const ref ValueRanges andThis) const
    {
        import std.algorithm : sort;
        ValueRanges ret;

        // The dup yields a mutable array; sorting by start is what add relies on.
        auto sorted = sort((this.ranges ~ andThis.ranges).dup);

        foreach (range; sorted)
        {
            ret.add(range);
        }

        return ret;
    }

    /// True when any stored range contains `index`.
    bool within(dchar index) const
    {
        foreach (range; ranges)
        {
            if (range.within(index))
                return true;
        }

        return false;
    }

    /// Sum of the code point counts of all stored ranges.
    uint count() const
    {
        uint ret;

        foreach (range; ranges)
        {
            ret += range.count;
        }

        return ret;
    }

    /// Calls `del` for every code point of every range, stopping at the first non-zero result.
    int opApply(scope int delegate(dchar) @safe del) const
    {
        int result;

        foreach (range; ranges)
        {
            result = range.opApply(del);
            if (result)
                return result;
        }

        return result;
    }
}
|
206
compiler/tools/unicodetables.d
Normal file
206
compiler/tools/unicodetables.d
Normal file
|
@ -0,0 +1,206 @@
|
|||
/**
|
||||
Generates the Unicode tables and associated Identifier tables for dmd-fe.
|
||||
|
||||
These tables are stored in ``dmd.common.identifiertables``.
|
||||
They are C99, C11, UAX31 and a least restrictive set (All).
|
||||
|
||||
You can run this via ``rdmd unicodetables.d``.
|
||||
|
||||
You will likely only need to run this program whenever the Unicode standard updates.
|
||||
It does not need to be run automatically as part of CI. As long as it is kept in working condition when committed, it is unlikely to break in the long term, since it only needs non-fancy features.
|
||||
|
||||
Place the updated files from the $(LINK2 https://www.unicode.org/Public/, Unicode database) into a directory ``UCD-<version>/`` and update the ``UCDDirectory`` variable.
|
||||
Make sure to commit the updated ``UCDDirectory`` variable into the repository so we can keep track of what the latest version it has been updated to.
|
||||
|
||||
The update procedure is similar to Phobos's Unicode table generator for ``std.uni``.
|
||||
If you know one, you can do the other fairly easily.
|
||||
|
||||
Copyright: Copyright (C) 1999-2024 by The D Language Foundation, All Rights Reserved
|
||||
Authors: $(LINK2 https://cattermole.co.nz, Richard (Rikki) Andrew Cattermole)
|
||||
License: $(LINK2 https://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
|
||||
*/
|
||||
module unicodetables;
|
||||
import unicode_tables.util;
|
||||
import unicode_tables.fixedtables;
|
||||
import std.stdio : File, writeln;
|
||||
|
||||
// Input and output locations used by the generator.
enum {
|
||||
// don't forget to update me when you commit new tables!
|
||||
UCDDirectory = "UCD-15.1.0/",
|
||||
// Input files; main refuses to run when either is missing.
UnicodeDataFile = UCDDirectory ~ "UnicodeData.txt",
|
||||
DerivedCorePropertiesFile = UCDDirectory ~ "DerivedCoreProperties.txt",
|
||||
|
||||
// Generated module; relative path, presumably resolved from compiler/tools - TODO confirm
UnicodeTableFile = "../src/dmd/common/identifiertables.d",
|
||||
}
|
||||
|
||||
// When set, the ASCII ranges are removed from the generated tables.
|
||||
// Remove this version if ASCII is not being handled elsewhere.
|
||||
version = IgnoreASCIIRanges;
|
||||
|
||||
/// Output file for the generated tables; opened in main and written to by writeTable and the write_* functions.
File tableFile;
|
||||
|
||||
/**
Generates the identifier tables module.

Checks that both input files exist, opens the output file, writes the module
header, parses the inputs and then emits every table.

Returns: 0 on success, 1 when UnicodeData.txt is missing, 2 when
DerivedCoreProperties.txt is missing.
*/
int main(string[] args)
{
    import std.file : exists;

    if (!exists(UnicodeDataFile))
    {
        writeln("Missing UCD table UnicodeData.txt");
        return 1;
    }
    else if (!exists(DerivedCorePropertiesFile))
    {
        writeln("Missing UCD table DerivedCoreProperties.txt");
        return 2;
    }

    {
        tableFile = File(UnicodeTableFile, "w+");
        // Name the generator by its real path (this module is unicodetables.d;
        // unicode_tables is only the directory holding its helper modules).
        tableFile.writeln("// Generated by compiler/tools/unicodetables.d DO NOT MODIFY!!!");
        tableFile.writeln("module dmd.common.identifiertables;");
        tableFile.writeln();
    }

    {
        import unicode_tables.unicodeData;
        import unicode_tables.derivedCoreProperties;

        parseUnicodeData(UnicodeDataFile);
        parseProperties(DerivedCorePropertiesFile);
    }

    // Tables are separated by one blank line; none follows the last table.
    write_XID_Start;
    tableFile.writeln;

    write_XID_Continue;
    tableFile.writeln;

    write_other_tables;
    tableFile.writeln;

    write_least_restrictive_table;

    return 0;
}
|
||||
|
||||
/**
Writes `vr` to the output file as a `static immutable dchar[2][]` array
literal called `name`, one `[start, end]` pair per line in hexadecimal.

Params:
    name = identifier of the generated array
    vr = ranges to emit, in stored order
*/
void writeTable(string name, const ValueRanges vr)
{
    tableFile.writeln("static immutable dchar[2][] ", name, " = [");

    foreach (const ref pair; vr.ranges)
        tableFile.writefln!" [0x%X, 0x%X],"(pair.start, pair.end);

    tableFile.writeln("];");
}
|
||||
|
||||
/**
Emits the `UAX31_Start` table from the parsed XID_Start ranges, preceded by
a doc comment giving the number of code points it covers.

With `IgnoreASCIIRanges` set the ASCII code points are removed; otherwise
`_` is added to the table.
*/
void write_XID_Start()
{
    import unicode_tables.derivedCoreProperties;

    // Work on a copy so the parsed ranges stay untouched for later tables.
    ValueRanges start = ValueRanges(propertyXID_StartRanges.ranges.dup);

    version(IgnoreASCIIRanges)
    {
        // Remove the ASCII ranges, as they are handled elsewhere.
        start = start.not(ASCII_Table);
    }
    else
    {
        import std.algorithm : sort;

        // This may be not needed, as we'll handle ASCII elsewhere in lexer,
        // but if we don't in some place we'll want this instead.
        // Append to the array directly and then sort: ValueRanges.add only
        // inspects the last stored range and would fold '_' into it.
        start.ranges ~= ValueRange(0x5F); // add _
        start.ranges.sort!((a, b) => a.start < b.start);
    }

    tableFile.writeln("/**");
    tableFile.writeln("UAX31 profile Start");
    tableFile.writeln("Entries: ", start.count);
    tableFile.writeln("*/");
    writeTable("UAX31_Start", start);
}
|
||||
|
||||
/**
Emits the `UAX31_Continue` table from the parsed XID_Continue ranges,
preceded by a doc comment giving the number of code points it covers.
With `IgnoreASCIIRanges` set the ASCII code points are removed first.
*/
void write_XID_Continue()
{
    import unicode_tables.derivedCoreProperties;

    // Copy, so the parsed ranges remain available unchanged.
    auto continueRanges = ValueRanges(propertyXID_ContinueRanges.ranges.dup);

    version(IgnoreASCIIRanges)
    {
        // ASCII is handled elsewhere, so keep it out of the table.
        continueRanges = continueRanges.not(ASCII_Table);
    }

    foreach (headerLine; ["/**", "UAX31 profile Continue"])
        tableFile.writeln(headerLine);
    tableFile.writeln("Entries: ", continueRanges.count);
    tableFile.writeln("*/");

    writeTable("UAX31_Continue", continueRanges);
}
|
||||
|
||||
/**
Emits the fixed C99 and C11 tables.

For each standard the Start table is written as an alias of the Continue
table, followed by the Continue table itself. Every item is preceded by a
doc comment giving its title and the number of code points covered.
*/
void write_other_tables()
{
    // Writes the doc comment that precedes each generated declaration.
    void docHeader(string title, uint entries)
    {
        tableFile.writeln("/**");
        tableFile.writeln(title);
        tableFile.writeln("Entries: ", entries);
        tableFile.writeln("*/");
    }

    const uint c99Entries = c99_Table.count;
    const uint c11Entries = c11_Table.count;

    docHeader("C99 Start", c99Entries);
    tableFile.writeln("alias FixedTable_C99_Start = FixedTable_C99_Continue;");
    tableFile.writeln;

    docHeader("C99 Continue", c99Entries);
    writeTable("FixedTable_C99_Continue", c99_Table);
    tableFile.writeln;

    docHeader("C11 Start", c11Entries);
    tableFile.writeln("alias FixedTable_C11_Start = FixedTable_C11_Continue;");
    tableFile.writeln;

    docHeader("C11 Continue", c11Entries);
    writeTable("FixedTable_C11_Continue", c11_Table);
}
|
||||
|
||||
/**
Emits the three least restrictive tables: the union of the C99, C11 and
XID tables for Start, for Continue, and for both combined.
With `IgnoreASCIIRanges` set the ASCII code points are removed from all three.
*/
void write_least_restrictive_table()
{
    import unicode_tables.derivedCoreProperties;

    // Writes the doc comment in front of a table and then the table itself.
    void emit(string title, string name, const ValueRanges table)
    {
        tableFile.writeln("/**");
        tableFile.writeln(title);
        tableFile.writeln("Entries: ", table.count);
        tableFile.writeln("*/");
        writeTable(name, table);
    }

    ValueRanges fixedTables = c99_Table.merge(c11_Table);
    ValueRanges startUnion = propertyXID_StartRanges.merge(fixedTables);
    ValueRanges continueUnion = propertyXID_ContinueRanges.merge(fixedTables);
    ValueRanges everything = startUnion.merge(continueUnion);

    version(IgnoreASCIIRanges)
    {
        // ASCII is handled elsewhere, so keep it out of the tables.
        startUnion = startUnion.not(ASCII_Table);
        continueUnion = continueUnion.not(ASCII_Table);
        everything = everything.not(ASCII_Table);
    }

    emit("Least restrictive with both Start and Continue", "LeastRestrictive_OfAll", everything);
    tableFile.writeln;

    emit("Least restrictive Start", "LeastRestrictive_Start", startUnion);
    tableFile.writeln;

    emit("Least restrictive Continue", "LeastRestrictive_Continue", continueUnion);
}
|
Loading…
Add table
Add a link
Reference in a new issue