Byte Order Mark (BOM) handling functions rewrite

* move to std.encoding
* less overengineering

https://github.com/D-Programming-Language/phobos/pull/3870 rework

Don't use top-level selective import in std.math because of DMD issue 314.

some quickfur comments

whitespace

remove unused import

steven suggestion

utfBom

andrei nitpicks

andrei null
This commit is contained in:
Robert burner Schadek 2016-01-17 21:47:56 +01:00 committed by Robert burner Schadek
parent aa8cf8646f
commit 3d37aee77d

View file

@ -3361,3 +3361,175 @@ version(unittest)
return "0123456789ABCDEF"[n & 0xF];
}
}
import std.typecons;
/** Definitions of common Byte Order Marks.
The numeric value of each member is the index of its entry in $(D bomTable),
so a member can be used directly as an index to get the matching $(D BOMSeq).
$(D utf7) has five alternative byte sequences that occupy table slots 3 to 7,
which is why the values jump from 3 to 8; $(D BOM.utf7) itself indexes the
first of those slots.
*/
enum BOM
{
none = 0, /// no BOM was found
utf32be = 1, /// [0x00, 0x00, 0xFE, 0xFF]
utf32le = 2, /// [0xFF, 0xFE, 0x00, 0x00]
utf7 = 3, /** [0x2B, 0x2F, 0x76, 0x38],
[0x2B, 0x2F, 0x76, 0x39],
[0x2B, 0x2F, 0x76, 0x2B],
[0x2B, 0x2F, 0x76, 0x2F],
[0x2B, 0x2F, 0x76, 0x38, 0x2D]
*/
utf1 = 8, /// [0xF7, 0x64, 0x4C]
utfebcdic = 9, /// [0xDD, 0x73, 0x66, 0x73]
scsu = 10, /// [0x0E, 0xFE, 0xFF]
bocu1 = 11, /// [0xFB, 0xEE, 0x28]
gb18030 = 12, /// [0x84, 0x31, 0x95, 0x33]
utf8 = 13, /// [0xEF, 0xBB, 0xBF]
utf16be = 14, /// [0xFE, 0xFF]
utf16le = 15 /// [0xFF, 0xFE]
}
/** The type stored inside $(D bomTable).
A named tuple pairing a $(D BOM) member, accessible as $(D schema), with the
bytes of that mark, accessible as $(D sequence).
*/
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
/** Mapping of a byte sequence to $(B Byte Order Mark (BOM)).
Entry 0 is the $(D BOM.none) placeholder with a $(D null) sequence. Every
other entry sits at the index given by the numeric value of its $(D BOM)
member; the five $(D BOM.utf7) variants fill slots 3 to 7.
$(D getBOM) scans this table front to back and returns the first entry whose
sequence is a prefix of the input, so an entry whose sequence extends that of
another entry has to be listed before it.
*/
immutable bomTable = [
// Slot 0: placeholder returned when no mark matches.
BOMSeq(BOM.none, null),
BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
// Begins with the utf16le bytes [0xFF, 0xFE]; listed ahead of utf16le so the
// longer mark is matched first.
BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])),
// Slots 3 to 7: the utf7 variants, all tagged with the same schema.
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
// The five byte variant precedes its four byte prefix below.
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])),
BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
];
/** Looks up the Byte Order Mark that $(D input) begins with.

Every entry of $(D bomTable) other than the $(D BOM.none) placeholder is
tried in table order, and the first one whose byte sequence is a prefix of
$(D input) is returned. Matching is done on copies obtained through
$(D save), so the $(D BOM) sequence at the beginning of the range is not
consumed from the passed range. If you pass a reference type range make sure
that $(D save) creates a deep copy.

Params:
    input = The sequence to check for the $(D BOM)

Returns:
    the $(D BOMSeq) of the $(D BOM) found at the start of $(D input), or the
    $(D BOMSeq) for $(D BOM.none) if $(D input) starts with none of the
    known marks.
*/
immutable(BOMSeq) getBOM(Range)(Range input)
    if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 holds the BOM.none placeholder; the real marks start at slot 1.
    for (size_t slot = 1; slot < bomTable.length; ++slot)
    {
        // Probe a saved copy so the caller's range keeps its position.
        if (startsWith(input.save, bomTable[slot].sequence))
            return bomTable[slot];
    }

    return bomTable[0];
}
///
unittest
{
    import std.format : format;

    // A UTF-32 string that starts with U+FEFF, reinterpreted as raw bytes in
    // the byte order of the machine running the test.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;
    auto found = getBOM(cast(ubyte[])text);

    // The mark that is detected depends on the native endianness.
    version(BigEndian)
        enum expected = BOM.utf32be;
    else
        enum expected = BOM.utf32le;

    assert(found.schema == expected, format("%s", found.schema));
}
// Every table entry followed by payload bytes must be detected again, and the
// numeric value of the detected schema must equal the entry's table index.
unittest
{
    foreach (idx, entry; bomTable)
    {
        auto data = entry.sequence ~ cast(ubyte[])"hello world";
        auto found = getBOM(data);

        assert(found.schema == entry.schema);

        // Slots 4 to 7 are additional utf7 variants: they share BOM.utf7
        // (value 3), so the index equality only holds outside that range.
        if (idx < 4 || idx > 7)
        {
            assert(found.schema == BOM.init + idx);
            assert(found.sequence == entry.sequence);
        }
    }
}
// getBOM on user-defined forward ranges: the mark is detected, a sequence
// that merely shares the first byte with a mark is rejected, and the range
// held by the caller is not advanced.
unittest
{
    // Value-type forward range over a byte slice. It is copied when passed
    // to getBOM, so the caller's instance cannot be advanced by the callee.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    // Reference-type forward range. Caller and callee share the object, so
    // any consumption inside getBOM would be visible afterwards. save returns
    // a separate object, as the getBOM documentation requires.
    static class BOMRefRange
    {
        ubyte[] arr;

        this(ubyte[] arr)
        {
            this.arr = arr;
        }

        @property ubyte front()
        {
            return this.arr[0];
        }

        @property bool empty()
        {
            return this.arr.length == 0;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property BOMRefRange save()
        {
            return new BOMRefRange(this.arr);
        }
    }

    static assert( isInputRange!BOMInputRange);
    // getBOM is constrained on isForwardRange, so check that trait as well.
    static assert( isForwardRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);
    static assert( isForwardRange!BOMRefRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (it; bomTable[1 .. $])
    {
        // The exact bytes of the mark, through the value-type range.
        {
            auto ir = BOMInputRange(it.sequence.dup);
            auto b = getBOM(ir);
            assert(b.schema == it.schema);
            assert(ir.arr == it.sequence);
        }

        // The exact bytes of the mark, through the reference-type range;
        // here the check that nothing was consumed is not vacuous.
        {
            auto rr = new BOMRefRange(it.sequence.dup);
            auto b = getBOM(rr);
            assert(b.schema == it.schema);
            assert(rr.arr == it.sequence);
        }

        // Only the first byte of the mark followed by zeros: no BOM.
        {
            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
            // The retained prefix must be shorter than the full mark.
            assert(noBom.length - dummyEnd.length < it.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            // Compare the range the caller holds with the bytes it was built from.
            assert(ir.arr == noBom);

            auto rr = new BOMRefRange(noBom.dup);
            auto c = getBOM(rr);
            assert(c.schema == BOM.none);
            assert(rr.arr == noBom);
        }
    }
}
/** The Byte Order Mark as a fully decoded code point, U+FEFF. */
enum dchar utfBOM = '\uFEFF';