diff --git a/std/encoding.d b/std/encoding.d
index b2acf2f71..c4b475888 100644
--- a/std/encoding.d
+++ b/std/encoding.d
@@ -3361,3 +3361,175 @@ version(unittest)
         return "0123456789ABCDEF"[n & 0xF];
     }
 }
+
+import std.typecons; // provides Tuple, used by the BOMSeq alias
+
+/** Definitions of common Byte Order Marks.
+The elements of the $(D enum) can be used as indices into $(D bomTable) to get
+the matching $(D BOMSeq).
+*/
+enum BOM
+{
+    none      = 0,  /// no BOM was found
+    utf32be   = 1,  /// [0x00, 0x00, 0xFE, 0xFF]
+    utf32le   = 2,  /// [0xFF, 0xFE, 0x00, 0x00]
+    utf7      = 3,  /** [0x2B, 0x2F, 0x76, 0x38],
+                        [0x2B, 0x2F, 0x76, 0x39],
+                        [0x2B, 0x2F, 0x76, 0x2B],
+                        [0x2B, 0x2F, 0x76, 0x2F],
+                        [0x2B, 0x2F, 0x76, 0x38, 0x2D];
+                        these fill $(D bomTable) slots 3 to 7 */
+    utf1      = 8,  /// [0xF7, 0x64, 0x4C]
+    utfebcdic = 9,  /// [0xDD, 0x73, 0x66, 0x73]
+    scsu      = 10, /// [0x0E, 0xFE, 0xFF]
+    bocu1     = 11, /// [0xFB, 0xEE, 0x28]
+    gb18030   = 12, /// [0x84, 0x31, 0x95, 0x33]
+    utf8      = 13, /// [0xEF, 0xBB, 0xBF]
+    utf16be   = 14, /// [0xFE, 0xFF]
+    utf16le   = 15  /// [0xFF, 0xFE]
+}
+
+/// The type stored inside $(D bomTable).
+alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");
+
+/** Mapping of a byte sequence to $(B Byte Order Mark (BOM)).
+Order matters: $(D getBOM) returns the first entry whose sequence matches. */
+immutable bomTable = [
+    BOMSeq(BOM.none, null), // index 0: what getBOM returns when nothing matches
+    BOMSeq(BOM.utf32be, cast(ubyte[])([0x00, 0x00, 0xFE, 0xFF])),
+    BOMSeq(BOM.utf32le, cast(ubyte[])([0xFF, 0xFE, 0x00, 0x00])), // listed ahead of utf16le, whose bytes are a prefix of these
+    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x39])),
+    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2B])),
+    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x2F])),
+    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38, 0x2D])), // listed ahead of the 4-byte 0x38 form, which is a prefix of it
+    BOMSeq(BOM.utf7, cast(ubyte[])([0x2B, 0x2F, 0x76, 0x38])),
+    BOMSeq(BOM.utf1, cast(ubyte[])([0xF7, 0x64, 0x4C])),
+    BOMSeq(BOM.utfebcdic, cast(ubyte[])([0xDD, 0x73, 0x66, 0x73])),
+    BOMSeq(BOM.scsu, cast(ubyte[])([0x0E, 0xFE, 0xFF])),
+    BOMSeq(BOM.bocu1, cast(ubyte[])([0xFB, 0xEE, 0x28])),
+    BOMSeq(BOM.gb18030, cast(ubyte[])([0x84, 0x31, 0x95, 0x33])),
+    BOMSeq(BOM.utf8, cast(ubyte[])([0xEF, 0xBB, 0xBF])),
+    BOMSeq(BOM.utf16be, cast(ubyte[])([0xFE, 0xFF])),
+    BOMSeq(BOM.utf16le, cast(ubyte[])([0xFF, 0xFE]))
+];
+
+/** Returns a $(D BOMSeq) for a given $(D input).
+If no $(D BOM) is present the $(D BOMSeq) for $(D BOM.none) is
+returned. The $(D BOM) sequence at the beginning of the range will
+not be consumed from the passed range. If you pass a reference type
+range make sure that $(D save) creates a deep copy.
+
+Params:
+    input = The sequence to check for the $(D BOM)
+
+Returns:
+    the found $(D BOMSeq) corresponding to the passed $(D input).
+*/
+immutable(BOMSeq) getBOM(Range)(Range input)
+    if (isForwardRange!Range && is(Unqual!(ElementType!Range) == ubyte))
+{
+    import std.algorithm.searching : startsWith;
+    foreach (it; bomTable[1 .. $]) // skip the BOM.none placeholder at index 0
+    {
+        if (startsWith(input.save, it.sequence)) // probe a saved copy, not input itself
+        {
+            return it;
+        }
+    }
+
+    return bomTable[0]; // the BOM.none entry
+}
+
+///
+unittest
+{
+    import std.format : format;
+
+    auto ts = dchar(0x0000FEFF) ~ "Hello World"d;
+
+    auto entry = getBOM(cast(ubyte[])ts); // reinterpret the dchars as raw bytes
+    version(BigEndian)
+    {
+        assert(entry.schema == BOM.utf32be, format("%s", entry.schema));
+    }
+    else
+    {
+        assert(entry.schema == BOM.utf32le, format("%s", entry.schema));
+    }
+}
+
+unittest
+{
+    import std.format : format; // NOTE(review): format is not used in this test
+
+    foreach (idx, it; bomTable)
+    {
+        auto s = it[1] ~ cast(ubyte[])"hello world"; // BOM bytes followed by payload
+        auto i = getBOM(s);
+        assert(i[0] == bomTable[idx][0]);
+
+        if (idx < 4 || idx > 7) // slots 4 .. 7 are extra utf7 BOMs with no enum value of their own
+        {
+            assert(i[0] == BOM.init + idx);
+            assert(i[1] == it[1]);
+        }
+    }
+}
+
+unittest
+{
+    struct BOMInputRange
+    {
+        ubyte[] arr;
+
+        @property ubyte front()
+        {
+            return this.arr.front;
+        }
+
+        @property bool empty()
+        {
+            return this.arr.empty;
+        }
+
+        void popFront()
+        {
+            this.arr = this.arr[1 .. $];
+        }
+
+        @property typeof(this) save()
+        {
+            return this;
+        }
+    }
+
+    static assert( isInputRange!BOMInputRange); // NOTE(review): getBOM's constraint is isForwardRange; consider asserting that
+    static assert(!isArray!BOMInputRange);
+
+    ubyte[] dummyEnd = [0,0,0,0];
+
+    foreach (idx, it; bomTable[1 .. $])
+    {
+        { // exact BOM bytes: the schema is detected and ir still holds every byte
+            auto ir = BOMInputRange(it.sequence.dup);
+
+            auto b = getBOM(ir);
+            assert(b.schema == it.schema);
+            assert(ir.arr == it.sequence);
+        }
+
+        { // first BOM byte followed by four zero bytes: expected to yield BOM.none
+            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
+            size_t oldLen = noBom.length;
+            assert(oldLen - 4 < it.sequence.length);
+
+            auto ir = BOMInputRange(noBom.dup);
+            auto b = getBOM(ir);
+            assert(b.schema == BOM.none);
+            assert(noBom.length == oldLen);
+        }
+    }
+}
+
+/** Constant defining a fully decoded BOM (code point U+FEFF) */
+enum dchar utfBOM = 0xfeff;