mirror of
https://github.com/ldc-developers/ldc.git
synced 2025-05-08 20:06:03 +03:00
193 lines
4.1 KiB
C
193 lines
4.1 KiB
C
// utf.c
|
|
// Copyright (c) 2003 by Digital Mars
|
|
// All Rights Reserved
|
|
// written by Walter Bright
|
|
// http://www.digitalmars.com
|
|
// License for redistribution is by either the Artistic License
|
|
// in artistic.txt, or the GNU General Public License in gnu.txt.
|
|
// See the included readme.txt for details.
|
|
|
|
// Description of UTF-8 at:
|
|
// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
|
|
|
|
#include <stdio.h>
|
|
#include <assert.h>
|
|
|
|
#include "utf.h"
|
|
|
|
int utf_isValidDchar(dchar_t c)
|
|
{
|
|
return c < 0xD800 ||
|
|
(c > 0xDFFF && c <= 0x10FFFF && c != 0xFFFE && c != 0xFFFF);
|
|
}
|
|
|
|
/********************************************
|
|
* Decode a single UTF-8 character sequence.
|
|
* Returns:
|
|
* NULL success
|
|
* !=NULL error message string
|
|
*/
|
|
|
|
const char *utf_decodeChar(unsigned char *s, size_t len, size_t *pidx, dchar_t *presult)
|
|
{
|
|
dchar_t V;
|
|
size_t i = *pidx;
|
|
unsigned char u = s[i];
|
|
|
|
assert(i >= 0 && i < len);
|
|
|
|
if (u & 0x80)
|
|
{ unsigned n;
|
|
unsigned char u2;
|
|
|
|
/* The following encodings are valid, except for the 5 and 6 byte
|
|
* combinations:
|
|
* 0xxxxxxx
|
|
* 110xxxxx 10xxxxxx
|
|
* 1110xxxx 10xxxxxx 10xxxxxx
|
|
* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
*/
|
|
for (n = 1; ; n++)
|
|
{
|
|
if (n > 4)
|
|
goto Lerr; // only do the first 4 of 6 encodings
|
|
if (((u << n) & 0x80) == 0)
|
|
{
|
|
if (n == 1)
|
|
goto Lerr;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Pick off (7 - n) significant bits of B from first byte of octet
|
|
V = (dchar_t)(u & ((1 << (7 - n)) - 1));
|
|
|
|
if (i + (n - 1) >= len)
|
|
goto Lerr; // off end of string
|
|
|
|
/* The following combinations are overlong, and illegal:
|
|
* 1100000x (10xxxxxx)
|
|
* 11100000 100xxxxx (10xxxxxx)
|
|
* 11110000 1000xxxx (10xxxxxx 10xxxxxx)
|
|
* 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
|
|
* 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
|
|
*/
|
|
u2 = s[i + 1];
|
|
if ((u & 0xFE) == 0xC0 ||
|
|
(u == 0xE0 && (u2 & 0xE0) == 0x80) ||
|
|
(u == 0xF0 && (u2 & 0xF0) == 0x80) ||
|
|
(u == 0xF8 && (u2 & 0xF8) == 0x80) ||
|
|
(u == 0xFC && (u2 & 0xFC) == 0x80))
|
|
goto Lerr; // overlong combination
|
|
|
|
for (unsigned j = 1; j != n; j++)
|
|
{
|
|
u = s[i + j];
|
|
if ((u & 0xC0) != 0x80)
|
|
goto Lerr; // trailing bytes are 10xxxxxx
|
|
V = (V << 6) | (u & 0x3F);
|
|
}
|
|
if (!utf_isValidDchar(V))
|
|
goto Lerr;
|
|
i += n;
|
|
}
|
|
else
|
|
{
|
|
V = (dchar_t) u;
|
|
i++;
|
|
}
|
|
|
|
assert(utf_isValidDchar(V));
|
|
*pidx = i;
|
|
*presult = V;
|
|
return NULL;
|
|
|
|
Lerr:
|
|
*presult = (dchar_t) s[i];
|
|
*pidx = i + 1;
|
|
return "invalid UTF-8 sequence";
|
|
}
|
|
|
|
/***************************************************
|
|
* Validate a UTF-8 string.
|
|
* Returns:
|
|
* NULL success
|
|
* !=NULL error message string
|
|
*/
|
|
|
|
const char *utf_validateString(unsigned char *s, size_t len)
|
|
{
|
|
size_t idx;
|
|
const char *err = NULL;
|
|
dchar_t dc;
|
|
|
|
for (idx = 0; idx < len; )
|
|
{
|
|
err = utf_decodeChar(s, len, &idx, &dc);
|
|
if (err)
|
|
break;
|
|
}
|
|
return err;
|
|
}
|
|
|
|
|
|
/********************************************
|
|
* Decode a single UTF-16 character sequence.
|
|
* Returns:
|
|
* NULL success
|
|
* !=NULL error message string
|
|
*/
|
|
|
|
|
|
const char *utf_decodeWchar(unsigned short *s, size_t len, size_t *pidx, dchar_t *presult)
|
|
{
|
|
const char *msg;
|
|
size_t i = *pidx;
|
|
unsigned u = s[i];
|
|
|
|
assert(i >= 0 && i < len);
|
|
if (u & ~0x7F)
|
|
{ if (u >= 0xD800 && u <= 0xDBFF)
|
|
{ unsigned u2;
|
|
|
|
if (i + 1 == len)
|
|
{ msg = "surrogate UTF-16 high value past end of string";
|
|
goto Lerr;
|
|
}
|
|
u2 = s[i + 1];
|
|
if (u2 < 0xDC00 || u2 > 0xDFFF)
|
|
{ msg = "surrogate UTF-16 low value out of range";
|
|
goto Lerr;
|
|
}
|
|
u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
|
|
i += 2;
|
|
}
|
|
else if (u >= 0xDC00 && u <= 0xDFFF)
|
|
{ msg = "unpaired surrogate UTF-16 value";
|
|
goto Lerr;
|
|
}
|
|
else if (u == 0xFFFE || u == 0xFFFF)
|
|
{ msg = "illegal UTF-16 value";
|
|
goto Lerr;
|
|
}
|
|
else
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
i++;
|
|
}
|
|
|
|
assert(utf_isValidDchar(u));
|
|
*pidx = i;
|
|
*presult = (dchar_t)u;
|
|
return NULL;
|
|
|
|
Lerr:
|
|
*presult = (dchar_t)s[i];
|
|
*pidx = i + 1;
|
|
return msg;
|
|
}
|
|
|