arsd/characterencodings.d

482 lines
18 KiB
D
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// helper program is in ~me/encodings.d to make more tables from wikipedia
/**
This is meant to help get data from the wild into utf8 strings
so you can work with them easily inside D.
The main function is convertToUtf8(), which takes a byte array
of your raw data (a byte array because it isn't really a D string
yet until it is utf8), and a runtime string telling it's current
encoding.
The current encoding argument is meant to come from the data's
metadata, and is flexible on exact format - it is case insensitive
and takes several variations on the names.
This way, you should be able to send it the encoding string directly
from an XML document, a HTTP header, or whatever you have, and it
ought to just work.
Example:
---
auto data = cast(immutable(ubyte)[])
std.file.read("my-windows-file.txt");
string utf8String = convertToUtf8(data, "windows-1252");
// utf8String can now be used
---
The encodings currently implemented for decoding are:
$(LIST
* UTF-8 (a no-op; it simply casts the array to string)
* UTF-16,
* UTF-32,
* Windows-1252,
* ISO 8859 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, and 16.
* KOI8-R
)
It treats ISO 8859-1, Latin-1, and Windows-1252 the same way, since
those labels are pretty much de-facto the same thing in wild documents (people mislabel them a lot and I found it more useful to just deal with it than to be pedantic).
This module currently makes no attempt to look at control characters.
*/
module arsd.characterencodings;
import std.string;
import std.array;
import std.conv;
// FIXME: use replacement char here instead
/// Like convertToUtf8, but if the encoding is unknown, it just strips all chars > 127 and calls it done instead of throwing
string convertToUtf8Lossy(immutable(ubyte)[] data, string dataCharacterEncoding) {
try {
auto ret = convertToUtf8(data, dataCharacterEncoding);
import std.utf;
validate(ret);
return ret;
} catch(Exception e) {
string ret;
foreach(b; data)
if(b < 128)
ret ~= b;
else
ret ~= '\uFFFD';
return ret;
}
}
/// Takes data from a given character encoding and returns it as UTF-8
string convertToUtf8(immutable(ubyte)[] data, string dataCharacterEncoding) {
// just to normalize the passed string...
auto encoding = dataCharacterEncoding.toLower();
encoding = encoding.replace(" ", "");
encoding = encoding.replace("-", "");
encoding = encoding.replace("_", "");
// should be good enough.
switch(encoding) {
default:
throw new Exception("I don't know how to convert " ~ dataCharacterEncoding ~ " to UTF-8");
// since the input is immutable, these are ok too.
// just want to cover all the bases with one runtime function.
case "utf16":
case "utf16le":
return to!string(cast(wstring) data);
case "utf32":
case "utf32le":
return to!string(cast(dstring) data);
// FIXME: does the big endian to little endian conversion work?
case "ascii":
case "usascii": // utf-8 is a superset of ascii
case "utf8":
return cast(string) data;
// and now the various 8 bit encodings we support.
case "windows1252":
return decodeImpl(data, ISO_8859_1, Windows_1252);
case "windows1251":
return decodeImpl(data, Windows_1251, Windows_1251_Lower);
case "koi8r":
return decodeImpl(data, KOI8_R, KOI8_R_Lower);
case "latin1":
case "iso88591":
// Why am I putting Windows_1252 here? A lot of
// stuff in the wild is mislabeled, so this will
// do some good in the Just Works department.
// Regardless, I don't handle the
// control char set in that zone anyway right now.
return decodeImpl(data, ISO_8859_1, Windows_1252);
case "iso88592":
return decodeImpl(data, ISO_8859_2);
case "iso88593":
return decodeImpl(data, ISO_8859_3);
case "iso88594":
return decodeImpl(data, ISO_8859_4);
case "iso88595":
return decodeImpl(data, ISO_8859_5);
case "iso88596":
return decodeImpl(data, ISO_8859_6);
case "iso88597":
return decodeImpl(data, ISO_8859_7);
case "iso88598":
return decodeImpl(data, ISO_8859_8);
case "iso88599":
return decodeImpl(data, ISO_8859_9);
case "iso885910":
return decodeImpl(data, ISO_8859_10);
case "iso885911":
return decodeImpl(data, ISO_8859_11);
case "iso885913":
return decodeImpl(data, ISO_8859_13);
case "iso885914":
return decodeImpl(data, ISO_8859_14);
case "iso885915":
return decodeImpl(data, ISO_8859_15);
case "iso885916":
return decodeImpl(data, ISO_8859_16);
}
assert(0);
}
/// Tries to determine the current encoding based on the content.
/// Only really helps with the UTF variants.
/// Returns null if it can't be reasonably sure.
string tryToDetermineEncoding(in ubyte[] rawdata) {
import std.utf;
try {
validate!string(cast(string) rawdata);
// the odds of non stuff validating as utf-8 are pretty low
return "UTF-8";
} catch(UTFException t) {
// it's definitely not UTF-8!
// we'll look at the first few characters. If there's a
// BOM, it's probably UTF-16 or UTF-32
if(rawdata.length > 4) {
// not checking for utf8 bom; if it was that, we
// wouldn't be here.
if(rawdata[0] == 0xff && rawdata[1] == 0xfe)
return "UTF-16 LE";
else if(rawdata[0] == 0xfe && rawdata[1] == 0xff)
return "UTF-16 BE";
else if(rawdata[0] == 0x00 && rawdata[1] == 0x00
&& rawdata[2] == 0xfe && rawdata[3] == 0xff)
return "UTF-32 BE";
else if(rawdata[0] == 0xff && rawdata[1] == 0xfe
&& rawdata[2] == 0x00 && rawdata[3] == 0x00)
return "UTF-32 LE";
else {
// this space is intentionally left blank
}
}
}
// we don't know with enough confidence. The app will have to find another way.
return null;
}
// this function actually does the work, using the translation tables
// below.
string decodeImpl(in ubyte[] data, in dchar[] chars160to255, in dchar[] chars128to159 = null, in dchar[] chars0to127 = null)
in {
assert(chars160to255.length == 256 - 160);
assert(chars128to159 is null || chars128to159.length == 160 - 128);
assert(chars0to127 is null || chars0to127.length == 128 - 0);
}
out(ret) {
import std.utf;
validate(ret);
}
do {
string utf8;
/// I'm sure this could be a lot more efficient, but whatever, it
/// works.
foreach(octet; data) {
if(octet < 128) {
if(chars0to127 !is null)
utf8 ~= chars0to127[octet];
else
utf8 ~= cast(char) octet; // ascii is the same
} else if(octet < 160) {
if(chars128to159 !is null)
utf8 ~= chars128to159[octet - 128];
else
utf8 ~= " ";
} else {
utf8 ~= chars160to255[octet - 160];
}
}
return utf8;
}
// Here come the translation tables.
// this table gives characters for decimal 128 through 159.
// the < 128 characters are the same as ascii, and > 159 the same as
// iso 8859 1, seen below.
immutable dchar[] Windows_1252 = [
'€', ' ', '', 'ƒ', '„', '…', '†', '‡',
'ˆ', '‰', 'Š', '', 'Œ', ' ', 'Ž', ' ',
' ', '', '', '“', '”', '•', '', '—',
'˜', '™', 'š', '', 'œ', ' ', 'ž', 'Ÿ'];
// the following tables give the characters from decimal 160 up to 255
// in the given encodings.
immutable dchar[] ISO_8859_1 = [
' ', '¡', '¢', '£', '¤', '¥', '¦', '§',
'¨', '©', 'ª', '«', '¬', '­', '®', '¯',
'°', '±', '²', '³', '´', 'µ', '¶', '·',
'¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×',
'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷',
'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
immutable dchar[] ISO_8859_2 = [
' ', 'Ą', '˘', 'Ł', '¤', 'Ľ', 'Ś', '§',
'¨', 'Š', 'Ş', 'Ť', 'Ź', '­', 'Ž', 'Ż',
'°', 'ą', '˛', 'ł', '´', 'ľ', 'ś', 'ˇ',
'¸', 'š', 'ş', 'ť', 'ź', '˝', 'ž', 'ż',
'Ŕ', 'Á', 'Â', 'Ă', 'Ä', 'Ĺ', 'Ć', 'Ç',
'Č', 'É', 'Ę', 'Ë', 'Ě', 'Í', 'Î', 'Ď',
'Đ', 'Ń', 'Ň', 'Ó', 'Ô', 'Ő', 'Ö', '×',
'Ř', 'Ů', 'Ú', 'Ű', 'Ü', 'Ý', 'Ţ', 'ß',
'ŕ', 'á', 'â', 'ă', 'ä', 'ĺ', 'ć', 'ç',
'č', 'é', 'ę', 'ë', 'ě', 'í', 'î', 'ď',
'đ', 'ń', 'ň', 'ó', 'ô', 'ő', 'ö', '÷',
'ř', 'ů', 'ú', 'ű', 'ü', 'ý', 'ţ', '˙'];
immutable dchar[] ISO_8859_3 = [
' ', 'Ħ', '˘', '£', '¤', ' ', 'Ĥ', '§',
'¨', 'İ', 'Ş', 'Ğ', 'Ĵ', '­', ' ', 'Ż',
'°', 'ħ', '²', '³', '´', 'µ', 'ĥ', '·',
'¸', 'ı', 'ş', 'ğ', 'ĵ', '½', ' ', 'ż',
'À', 'Á', 'Â', ' ', 'Ä', 'Ċ', 'Ĉ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
' ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ġ', 'Ö', '×',
'Ĝ', 'Ù', 'Ú', 'Û', 'Ü', 'Ŭ', 'Ŝ', 'ß',
'à', 'á', 'â', ' ', 'ä', 'ċ', 'ĉ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
' ', 'ñ', 'ò', 'ó', 'ô', 'ġ', 'ö', '÷',
'ĝ', 'ù', 'ú', 'û', 'ü', 'ŭ', 'ŝ', '˙'];
immutable dchar[] ISO_8859_4 = [
' ', 'Ą', 'ĸ', 'Ŗ', '¤', 'Ĩ', 'Ļ', '§',
'¨', 'Š', 'Ē', 'Ģ', 'Ŧ', '­', 'Ž', '¯',
'°', 'ą', '˛', 'ŗ', '´', 'ĩ', 'ļ', 'ˇ',
'¸', 'š', 'ē', 'ģ', 'ŧ', 'Ŋ', 'ž', 'ŋ',
'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į',
'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ī',
'Đ', 'Ņ', 'Ō', 'Ķ', 'Ô', 'Õ', 'Ö', '×',
'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ũ', 'Ū', 'ß',
'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į',
'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ī',
'đ', 'ņ', 'ō', 'ķ', 'ô', 'õ', 'ö', '÷',
'ø', 'ų', 'ú', 'û', 'ü', 'ũ', 'ū', '˙'];
immutable dchar[] ISO_8859_5 = [
' ', 'Ё', 'Ђ', 'Ѓ', 'Є', 'Ѕ', 'І', 'Ї',
'Ј', 'Љ', 'Њ', 'Ћ', 'Ќ', '­', 'Ў', 'Џ',
'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З',
'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П',
'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я',
'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з',
'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч',
'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я',
'№', 'ё', 'ђ', 'ѓ', 'є', 'ѕ', 'і', 'ї',
'ј', 'љ', 'њ', 'ћ', 'ќ', '§', 'ў', 'џ'];
immutable dchar[] ISO_8859_6 = [
' ', ' ', ' ', ' ', '¤', ' ', ' ', ' ',
' ', ' ', ' ', ' ', '،', '­', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', '؛', ' ', ' ', ' ', '؟',
' ', 'ء', 'آ', 'أ', 'ؤ', 'إ', 'ئ', 'ا',
'ب', 'ة', 'ت', 'ث', 'ج', 'ح', 'خ', 'د',
'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط',
'ظ', 'ع', 'غ', ' ', ' ', ' ', ' ', ' ',
'ـ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه',
'و', 'ى', 'ي', 'ً', 'ٌ', 'ٍ', 'َ', 'ُ',
'ِ', 'ّ', 'ْ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' '];
immutable dchar[] ISO_8859_7 = [
' ', '', '', '£', '€', '₯', '¦', '§',
'¨', '©', 'ͺ', '«', '¬', '­', ' ', '―',
'°', '±', '²', '³', '΄', '΅', 'Ά', '·',
'Έ', 'Ή', 'Ί', '»', 'Ό', '½', 'Ύ', 'Ώ',
'ΐ', 'Α', 'Β', 'Γ', 'Δ', 'Ε', 'Ζ', 'Η',
'Θ', 'Ι', 'Κ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο',
'Π', 'Ρ', ' ', 'Σ', 'Τ', 'Υ', 'Φ', 'Χ',
'Ψ', 'Ω', 'Ϊ', 'Ϋ', 'ά', 'έ', 'ή', 'ί',
'ΰ', 'α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η',
'θ', 'ι', 'κ', 'λ', 'μ', 'ν', 'ξ', 'ο',
'π', 'ρ', 'ς', 'σ', 'τ', 'υ', 'φ', 'χ',
'ψ', 'ω', 'ϊ', 'ϋ', 'ό', 'ύ', 'ώ', ' '];
immutable dchar[] ISO_8859_8 = [
' ', ' ', '¢', '£', '¤', '¥', '¦', '§',
'¨', '©', '×', '«', '¬', '­', '®', '¯',
'°', '±', '²', '³', '´', 'µ', '¶', '·',
'¸', '¹', '÷', '»', '¼', '½', '¾', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',
' ', ' ', ' ', ' ', ' ', ' ', ' ', '‗',
'א', 'ב', 'ג', 'ד', 'ה', 'ו', 'ז', 'ח',
'ט', 'י', 'ך', 'כ', 'ל', 'ם', 'מ', 'ן',
'נ', 'ס', 'ע', 'ף', 'פ', 'ץ', 'צ', 'ק',
// v v those are wrong
'ר', 'ש', 'ת', ' ', ' ', ' ', ' ', ' ']; // FIXME: those ones marked wrong are supposed to be left to right and right to left markers, not spaces. lol maybe it isn't wrong
immutable dchar[] ISO_8859_9 = [
' ', '¡', '¢', '£', '¤', '¥', '¦', '§',
'¨', '©', 'ª', '«', '¬', '­', '®', '¯',
'°', '±', '²', '³', '´', 'µ', '¶', '·',
'¸', '¹', 'º', '»', '¼', '½', '¾', '¿',
'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
'Ğ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×',
'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'İ', 'Ş', 'ß',
'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
'ğ', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷',
'ø', 'ù', 'ú', 'û', 'ü', 'ı', 'ş', 'ÿ'];
immutable dchar[] ISO_8859_10 = [
' ', 'Ą', 'Ē', 'Ģ', 'Ī', 'Ĩ', 'Ķ', '§',
'Ļ', 'Đ', 'Š', 'Ŧ', 'Ž', '­', 'Ū', 'Ŋ',
'°', 'ą', 'ē', 'ģ', 'ī', 'ĩ', 'ķ', '·',
'ļ', 'đ', 'š', 'ŧ', 'ž', '―', 'ū', 'ŋ',
'Ā', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Į',
'Č', 'É', 'Ę', 'Ë', 'Ė', 'Í', 'Î', 'Ï',
'Ð', 'Ņ', 'Ō', 'Ó', 'Ô', 'Õ', 'Ö', 'Ũ',
'Ø', 'Ų', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
'ā', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'į',
'č', 'é', 'ę', 'ë', 'ė', 'í', 'î', 'ï',
'ð', 'ņ', 'ō', 'ó', 'ô', 'õ', 'ö', 'ũ',
'ø', 'ų', 'ú', 'û', 'ü', 'ý', 'þ', 'ĸ'];
immutable dchar[] ISO_8859_11 = [
' ', 'ก', 'ข', 'ฃ', 'ค', 'ฅ', 'ฆ', 'ง',
'จ', 'ฉ', 'ช', 'ซ', 'ฌ', 'ญ', 'ฎ', 'ฏ',
'ฐ', 'ฑ', 'ฒ', 'ณ', 'ด', 'ต', 'ถ', 'ท',
'ธ', 'น', 'บ', 'ป', 'ผ', 'ฝ', 'พ', 'ฟ',
'ภ', 'ม', 'ย', 'ร', 'ฤ', 'ล', 'ฦ', 'ว',
'ศ', 'ษ', 'ส', 'ห', 'ฬ', 'อ', 'ฮ', 'ฯ',
'ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื',
'ุ', 'ู', 'ฺ', ' ', ' ', ' ', ' ', '฿',
'เ', 'แ', 'โ', 'ใ', 'ไ', 'ๅ', 'ๆ', '็',
'่', '้', '๊', '๋', '์', 'ํ', '๎', '๏',
'', '๑', '๒', '๓', '๔', '๕', '๖', '๗',
'๘', '๙', '๚', '๛', ' ', ' ', ' ', ' '];
immutable dchar[] ISO_8859_13 = [
' ', '”', '¢', '£', '¤', '„', '¦', '§',
'Ø', '©', 'Ŗ', '«', '¬', '­', '®', 'Æ',
'°', '±', '²', '³', '“', 'µ', '¶', '·',
'ø', '¹', 'ŗ', '»', '¼', '½', '¾', 'æ',
'Ą', 'Į', 'Ā', 'Ć', 'Ä', 'Å', 'Ę', 'Ē',
'Č', 'É', 'Ź', 'Ė', 'Ģ', 'Ķ', 'Ī', 'Ļ',
'Š', 'Ń', 'Ņ', 'Ó', 'Ō', 'Ő', 'Ö', '×',
'Ų', 'Ł', 'Ś', 'Ū', 'Ü', 'Ż', 'Ž', 'ß',
'ą', 'į', 'ā', 'ć', 'ä', 'å', 'ę', 'ē',
'č', 'é', 'ź', 'ė', 'ģ', 'ķ', 'ī', 'ļ',
'š', 'ń', 'ņ', 'ó', 'ō', 'ő', 'ö', '÷',
'ų', 'ł', 'ś', 'ū', 'ü', 'ż', 'ž', ''];
immutable dchar[] ISO_8859_14 = [
' ', 'Ḃ', 'ḃ', '£', 'Ċ', 'ċ', 'Ḋ', '§',
'Ẁ', '©', 'Ẃ', 'ḋ', 'Ỳ', '­', '®', 'Ÿ',
'Ḟ', 'ḟ', 'Ġ', 'ġ', 'Ṁ', 'ṁ', '¶', 'Ṗ',
'ẁ', 'ṗ', 'ẃ', 'Ṡ', 'ỳ', 'Ẅ', 'ẅ', 'ṡ',
'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
'Ŵ', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', 'Ṫ',
'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Ŷ', 'ß',
'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
'ŵ', 'ñ', 'ò', 'ó', 'ô', 'ő', 'ö', 'ṫ',
'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'ŷ', 'ÿ'];
immutable dchar[] ISO_8859_15 = [
' ', '¡', '¢', '£', '€', '¥', 'Š', '§',
'š', '©', 'ª', '«', '¬', '­', '®', '¯',
'°', '±', '²', '³', 'Ž', 'µ', '¶', '·',
'ž', '¹', 'º', '»', 'Œ', 'œ', 'Ÿ', '¿',
'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', '×',
'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß',
'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
'ð', 'ñ', 'ò', 'ó', 'ô', 'ő', 'ö', '÷',
'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'þ', 'ÿ'];
immutable dchar[] ISO_8859_16 = [
' ', 'Ą', 'ą', 'Ł', '€', '„', 'Š', '§',
'š', '©', 'Ș', '«', 'Ź', '­', 'ź', 'Ż',
'°', '±', 'Č', 'ł', 'Ž', '”', '¶', '·',
'ž', 'č', 'ș', '»', 'Œ', 'œ', 'Ÿ', 'ż',
'À', 'Á', 'Â', 'Ă', 'Ä', 'Ć', 'Æ', 'Ç',
'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ï',
'Ð', 'Ń', 'Ò', 'Ó', 'Ô', 'Ő', 'Ö', 'Ś',
'Ű', 'Ù', 'Ú', 'Û', 'Ü', 'Ę', 'Ț', 'ß',
'à', 'á', 'â', 'ă', 'ä', 'ć', 'æ', 'ç',
'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï',
'đ', 'ń', 'ò', 'ó', 'ô', 'ő', 'ö', 'ś',
'ű', 'ù', 'ú', 'û', 'ü', 'ę', 'ț', 'ÿ'];
immutable dchar[] KOI8_R_Lower = [
'─', '│', '┌', '┐', '└', '┘', '├', '┤',
'┬', '┴', '┼', '▀', '▄', '█', '▌', '▐',
'░', '▒', '▓', '⌠', '■', '∙', '√', '≈',
'≤', '≥', '\u00a0', '⌡', '°', '²', '·', '÷'];
immutable dchar[] KOI8_R = [
'═', '║', '╒', 'ё', '╓', '╔', '╕', '╖',
'╗', '╘', '╙', '╚', '╛', '╜', '╝', '╞',
'╟', '╠', '╡', 'ё', '╢', '╣', '╤', '╥',
'╦', '╧', '╨', '╩', '╪', '╫', '╬', '©',
'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г',
'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о',
'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в',
'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ',
'ю', 'а', 'б', 'ц', 'д', 'е', 'ф', 'г',
'х', 'и', 'й', 'к', 'л', 'м', 'н', 'о',
'п', 'я', 'р', 'с', 'т', 'у', 'ж', 'в',
'ь', 'ы', 'з', 'ш', 'э', 'щ', 'ч', 'ъ'];
immutable dchar[] Windows_1251_Lower = [
'Ђ', 'Ѓ', '', 'ѓ', '„', '…', '†', '‡',
'€', '‰', 'Љ', '', 'Њ', 'Ќ', 'Ћ', 'Џ',
'ђ', '', '', '“', '”', '•', '', '—',
' ', '™', 'љ', '', 'њ', 'ќ', 'ћ', 'џ'];
immutable dchar[] Windows_1251 = [
' ', 'Ў', 'ў', 'Ј', '¤', 'Ґ', '¦', '§',
'Ё', '©', 'Є', '«', '¬', '­', '®', 'Ї',
'°', '±', 'І', 'і', 'ґ', 'µ', '¶', '·',
'ё', '№', 'є', '»', 'ј', 'Ѕ', 'ѕ', 'ї',
'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З',
'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П',
'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч',
'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я',
'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з',
'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п',
'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч',
'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я'];