mirror of
https://github.com/dlang/phobos.git
synced 2025-04-27 13:40:20 +03:00
Byte Order Mark (BOM) handling functions rewrite
* move to std.encoding * less overengineering https://github.com/D-Programming-Language/phobos/pull/3870 rework Don't use top-level selective import in std.math because of DMD issue 314. some quickfur comments whitespace remove used import steven suggestion utfBom andrei nitpicks andrei null
This commit is contained in:
parent
aa8cf8646f
commit
3d37aee77d
1 changed files with 172 additions and 0 deletions
172
std/encoding.d
172
std/encoding.d
|
@ -3361,3 +3361,175 @@ version(unittest)
|
|||
return "0123456789ABCDEF"[n & 0xF];
|
||||
}
|
||||
}
|
||||
|
||||
import std.typecons;
|
||||
|
||||
/** Definitions of common Byte Order Marks.

The numeric value of each member can be used as an index into
$(D bomTable) to obtain a matching $(D BOMSeq). $(D utf7) has five
byte sequences, which take up table slots 3 through 7; that is why the
next member, $(D utf1), continues at 8.
*/
enum BOM
{
    /// no BOM was found
    none = 0,

    /// [0x00, 0x00, 0xFE, 0xFF]
    utf32be = 1,

    /// [0xFF, 0xFE, 0x00, 0x00]
    utf32le = 2,

    /**
     * [0x2B, 0x2F, 0x76, 0x38],
     * [0x2B, 0x2F, 0x76, 0x39],
     * [0x2B, 0x2F, 0x76, 0x2B],
     * [0x2B, 0x2F, 0x76, 0x2F],
     * [0x2B, 0x2F, 0x76, 0x38, 0x2D]
     */
    utf7 = 3,

    /// [0xF7, 0x64, 0x4C]
    utf1 = 8,

    /// [0xDD, 0x73, 0x66, 0x73]
    utfebcdic = 9,

    /// [0x0E, 0xFE, 0xFF]
    scsu = 10,

    /// [0xFB, 0xEE, 0x28]
    bocu1 = 11,

    /// [0x84, 0x31, 0x95, 0x33]
    gb18030 = 12,

    /// [0xEF, 0xBB, 0xBF]
    utf8 = 13,

    /// [0xFE, 0xFF]
    utf16be = 14,

    /// [0xFF, 0xFE]
    utf16le = 15
}
|
||||
|
||||
/** The element type of $(D bomTable): a $(D BOM) identifier named
$(D schema) paired with the bytes of the mark, named $(D sequence).
*/
alias BOMSeq = Tuple!(
    BOM, "schema",
    ubyte[], "sequence");
|
||||
|
||||
/** Table associating each known byte sequence with its
$(B Byte Order Mark (BOM)) identifier.

Slot 0 is the $(D BOM.none) placeholder with a $(D null) sequence.
*/
immutable bomTable = [
    BOMSeq(BOM.none,      null),
    BOMSeq(BOM.utf32be,   cast(ubyte[]) [0x00, 0x00, 0xFE, 0xFF]),
    BOMSeq(BOM.utf32le,   cast(ubyte[]) [0xFF, 0xFE, 0x00, 0x00]),
    // utf7 has several marks. getBOM returns the first entry that matches,
    // so the five byte form is listed ahead of its four byte prefix.
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x39]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2B]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2F]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38, 0x2D]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38]),
    BOMSeq(BOM.utf1,      cast(ubyte[]) [0xF7, 0x64, 0x4C]),
    BOMSeq(BOM.utfebcdic, cast(ubyte[]) [0xDD, 0x73, 0x66, 0x73]),
    BOMSeq(BOM.scsu,      cast(ubyte[]) [0x0E, 0xFE, 0xFF]),
    BOMSeq(BOM.bocu1,     cast(ubyte[]) [0xFB, 0xEE, 0x28]),
    BOMSeq(BOM.gb18030,   cast(ubyte[]) [0x84, 0x31, 0x95, 0x33]),
    BOMSeq(BOM.utf8,      cast(ubyte[]) [0xEF, 0xBB, 0xBF]),
    BOMSeq(BOM.utf16be,   cast(ubyte[]) [0xFE, 0xFF]),
    BOMSeq(BOM.utf16le,   cast(ubyte[]) [0xFF, 0xFE])
];
|
||||
|
||||
/** Looks up the $(D BOMSeq) whose byte sequence starts the given $(D input).

The entries of $(D bomTable) are tried in table order and the first one
that $(D input) starts with is returned. When none matches, the
$(D BOMSeq) for $(D BOM.none) is returned.

The $(D BOM) sequence at the beginning of the range will not be consumed
from the passed range, because every comparison runs on $(D input.save).
If you pass a reference type range make sure that $(D save) creates a
deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the found $(D BOMSeq) corresponding to the passed $(D input).
*/
immutable(BOMSeq) getBOM(Range)(Range input)
    if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 is the BOM.none placeholder, so the scan starts at slot 1.
    foreach (candidate; bomTable[1 .. $])
    {
        if (input.save.startsWith(candidate.sequence))
            return candidate;
    }

    return bomTable[0];
}
|
||||
|
||||
///
unittest
{
    import std.format : format;

    // A UTF-32 string that begins with the decoded BOM code point.
    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;

    auto entry = getBOM(cast(ubyte[])ts);

    // Reinterpreting the dchars as bytes exposes the platform byte order,
    // so the expected schema depends on the target's endianness.
    version(BigEndian)
    {
        enum expected = BOM.utf32be;
    }
    else
    {
        enum expected = BOM.utf32le;
    }

    assert(entry.schema == expected, format("%s", entry.schema));
}
|
||||
|
||||
// Every table entry, prefixed to some payload, must be detected as its own schema.
unittest
{
    import std.format : format;

    foreach (idx, entry; bomTable)
    {
        auto payload = entry.sequence ~ cast(ubyte[])"hello world";
        auto detected = getBOM(payload);
        assert(detected.schema == bomTable[idx].schema);

        // Slots 4 through 7 are the additional utf7 sequences; their table
        // index does not equal the enum value, so they are excluded here.
        immutable bool extraUtf7Slot = idx >= 4 && idx <= 7;
        if (!extraUtf7Slot)
        {
            assert(detected.schema == BOM.init + idx);
            assert(detected.sequence == entry.sequence);
        }
    }
}
|
||||
|
||||
// Exercises getBOM with a range type that is not an array.
unittest
{
    // Minimal byte range wrapping an array. It provides `save`, which
    // getBOM's template constraint (isForwardRange) requires.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    static assert( isForwardRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (it; bomTable[1 .. $])
    {
        // The exact BOM bytes must be recognised and the range left intact.
        {
            auto ir = BOMInputRange(it.sequence.dup);

            auto b = getBOM(ir);
            assert(b.schema == it.schema);
            assert(ir.arr == it.sequence);
        }

        // Only the first BOM byte followed by zeros must not be recognised.
        {
            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
            size_t oldLen = noBom.length;
            assert(oldLen - 4 < it.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            assert(noBom.length == oldLen);
            // getBOM received `ir`, not `noBom`, so the check that nothing
            // was consumed has to look at the range that was passed in.
            assert(ir.arr == noBom);
        }
    }
}
|
||||
|
||||
/** The BOM as a fully decoded code point, U+FEFF. */
enum dchar utfBOM = '\uFEFF';
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue