/*
 *  Copyright (C) 2000-2004 by Digital Mars, www.digitalmars.com
 *  Written by Walter Bright
 *
 *  This software is provided 'as-is', without any express or implied
 *  warranty. In no event will the authors be held liable for any damages
 *  arising from the use of this software.
 *
 *  Permission is granted to anyone to use this software for any purpose,
 *  including commercial applications, and to alter it and redistribute it
 *  freely, subject to the following restrictions:
 *
 *  o  The origin of this software must not be misrepresented; you must not
 *     claim that you wrote the original software. If you use this software
 *     in a product, an acknowledgment in the product documentation would be
 *     appreciated but is not required.
 *  o  Altered source versions must be plainly marked as such, and must not
 *     be misrepresented as being the original software.
 *  o  This notice may not be removed or altered from any source
 *     distribution.
 */

module std.uri;

//debug=uri;            // uncomment to turn on debugging printf's

/* ====================== URI Functions ================ */

private import std.ctype;
private import std.c.stdlib;
private import std.utf;

/**
 * Thrown by the encode/decode functions in this module when the input
 * cannot be encoded or is not a well-formed escaped URI.
 */
class URIerror : Error
{
    this()
    {
        super("URI error");
    }
}

/* Character-class bits stored in uri_flags[]. */
enum
{
    URI_Alpha    = 1,
    URI_Reserved = 2,
    URI_Mark     = 4,
    URI_Digit    = 8,
    URI_Hash     = 0x10,        // '#'
}

/* Upper-case hexadecimal digits, indexed by nibble value. */
char[16] hex2ascii = "0123456789ABCDEF";

/* Character-class bit mask for each 7-bit ASCII character. */
ubyte[128] uri_flags;           // indexed by character

static this()
{
    // Initialize uri_flags[]

    // OR `flags` into the table entry of every character in `p`.
    static void helper(char[] p, uint flags)
    {
        int i;

        for (i = 0; i < p.length; i++)
            uri_flags[p[i]] |= flags;
    }

    uri_flags['#'] |= URI_Hash;

    for (int i = 'A'; i <= 'Z'; i++)
    {
        uri_flags[i] |= URI_Alpha;
        uri_flags[i + 0x20] |= URI_Alpha;       // lowercase letters
    }
    helper("0123456789", URI_Digit);
    helper(";/?:@&=+$,", URI_Reserved);
    helper("-_.!~*'()",  URI_Mark);
}

/**
 * Percent-encode a UTF-32 string.
 *
 * Characters whose uri_flags[] entry shares a bit with unescapedSet are
 * copied through unchanged; every other character is converted to UTF-8
 * and each resulting octet is written as "%XX" (upper-case hex).
 *
 * Params:
 *      string       = the code points to encode
 *      unescapedSet = mask of URI_* classes that must not be escaped
 * Returns: a newly allocated encoded string
 * Throws: URIerror if a code point lies above 0x10FFFF
 */
private char[] URI_Encode(dchar[] string, uint unescapedSet)
{
    uint len;
    uint j;
    uint k;
    dchar V;
    dchar C;

    // Result buffer. It starts out in a small stack array; when that is
    // exhausted it moves to the GC heap. (Growing with alloca() inside the
    // loop would never release the superseded buffers before returning and
    // could exhaust the stack on long inputs.)
    char[50] buffer;
    char[] R;
    uint Rlen;

    len = string.length;

    R = buffer[];
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        C = string[k];
        // if (C in unescapedSet)
        if (C < uri_flags.length && (uri_flags[C] & unescapedSet))
        {
            if (Rlen == R.length)
            {
                char[] R2 = new char[R.length * 2];

                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen] = cast(char)C;
            Rlen++;
        }
        else
        {
            char[4] Octet;
            uint L;

            V = C;

            // Transform V into octets
            if (V <= 0x7F)
            {
                Octet[0] = cast(char) V;
                L = 1;
            }
            else if (V <= 0x7FF)
            {
                Octet[0] = cast(char)(0xC0 | (V >> 6));
                Octet[1] = cast(char)(0x80 | (V & 0x3F));
                L = 2;
            }
            else if (V <= 0xFFFF)
            {
                Octet[0] = cast(char)(0xE0 | (V >> 12));
                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[2] = cast(char)(0x80 | (V & 0x3F));
                L = 3;
            }
            else if (V <= 0x10FFFF)
            {
                // Same upper bound that URI_Decode enforces
                Octet[0] = cast(char)(0xF0 | (V >> 18));
                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[3] = cast(char)(0x80 | (V & 0x3F));
                L = 4;
            }
            else
            {
                throw new URIerror();           // undefined UTF-32 code
            }

            // Each octet expands to three characters: '%' and two hex digits
            if (Rlen + L * 3 > R.length)
            {
                char[] R2 = new char[2 * (Rlen + L * 3)];

                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }

            for (j = 0; j < L; j++)
            {
                R[Rlen] = '%';
                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
                R[Rlen + 2] = hex2ascii[Octet[j] & 15];

                Rlen += 3;
            }
        }
    }

    // R may still refer to the stack buffer, so always hand back a copy
    return R[0 .. Rlen].dup;
}

/**
 * Convert one hexadecimal digit character to its numeric value.
 *
 * The argument is not validated; callers are expected to have checked it
 * with isxdigit() first.
 */
uint ascii2hex(dchar c)
{
    return (c <= '9') ? c - '0' :
           (c <= 'F') ? c - 'A' + 10 :
                        c - 'a' + 10;
}

/**
 * Decode a percent-encoded string into UTF-32.
 *
 * Every "%XX" escape (or run of escapes forming one UTF-8 sequence) is
 * replaced by the code point it denotes, except that an escape whose
 * decoded character belongs to reservedSet is copied through verbatim.
 *
 * Params:
 *      string      = the escaped text
 *      reservedSet = mask of URI_* classes whose escapes must be preserved
 * Returns: a newly allocated array of decoded code points
 * Throws: URIerror on a truncated or non-hexadecimal escape, or on an
 *         escape sequence that is not valid UTF-8 (bad lead or
 *         continuation octet, overlong form, surrogate, or a value above
 *         0x10FFFF)
 */
private dchar[] URI_Decode(char[] string, uint reservedSet)
{
    uint len;
    uint j;
    uint k;
    uint V;
    dchar C;

    //printf("URI_Decode('%.*s')\n", string);

    len = string.length;

    // Nothing to decode. Returning here also means the result never
    // depends on what alloca() yields for a zero-byte request.
    if (len == 0)
        return null;

    // Preallocate result buffer R guaranteed to be large enough for the
    // result: every input character produces at most one output element.
    // Only short inputs use the stack; long ones go to the GC heap so the
    // input length cannot overflow the stack.
    dchar[] R;
    uint Rlen;

    if (len > 1024 / dchar.sizeof)
    {
        R = new dchar[len];
    }
    else
    {
        dchar* p = cast(dchar *)alloca(len * dchar.sizeof);

        if (!p)
            throw new URIerror();
        R = p[0 .. len];
    }
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        char B;
        uint start;

        C = string[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }
        start = k;
        if (k + 2 >= len)
            throw new URIerror();
        if (!isxdigit(string[k + 1]) || !isxdigit(string[k + 2]))
            throw new URIerror();
        B = cast(char)((ascii2hex(string[k + 1]) << 4) + ascii2hex(string[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            uint n;

            // n = number of leading 1 bits in B = length of the UTF-8 sequence
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIerror();
                if (((B << n) & 0x80) == 0)
                {
                    // A lone continuation octet cannot start a sequence
                    if (n == 1)
                        throw new URIerror();
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = B & ((1 << (7 - n)) - 1);       // (!!!)

            // The remaining (n - 1) escapes must all fit in the input
            if (k + (3 * (n - 1)) >= len)
                throw new URIerror();
            for (j = 1; j != n; j++)
            {
                k++;
                if (string[k] != '%')
                    throw new URIerror();
                if (!isxdigit(string[k + 1]) || !isxdigit(string[k + 2]))
                    throw new URIerror();
                B = cast(char)((ascii2hex(string[k + 1]) << 4) + ascii2hex(string[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIerror();
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIerror();

            // Reject overlong forms (a value that fits in a shorter
            // sequence) and UTF-16 surrogates; neither is valid UTF-8.
            if ((n == 2 && V < 0x80) ||
                (n == 3 && V < 0x800) ||
                (n == 4 && V < 0x10000) ||
                (V >= 0xD800 && V <= 0xDFFF))
                throw new URIerror();
            C = V;
        }
        if (C < uri_flags.length && (uri_flags[C] & reservedSet))
        {
            // R ~= string[start .. k + 1];
            uint width = (k + 1) - start;

            for (uint ii = 0; ii < width; ii++)
                R[Rlen + ii] = string[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= len);        // enforce our preallocation size guarantee

    // R may live on the stack, so always hand back a copy
    return R[0 .. Rlen].dup;
}

/**
 * Decode the escapes in a complete URI. Escapes that denote reserved
 * characters or '#' are left in their escaped form.
 *
 * Throws: URIerror if encodedURI is malformed
 */
char[] decode(char[] encodedURI)
{
    dchar[] s;

    s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    return std.utf.toUTF8(s);
}

/**
 * Decode the escapes in a single URI component. Every escape is decoded,
 * including those that denote reserved characters.
 *
 * Throws: URIerror if encodedURIComponent is malformed
 */
char[] decodeComponent(char[] encodedURIComponent)
{
    dchar[] s;

    s = URI_Decode(encodedURIComponent, 0);
    return std.utf.toUTF8(s);
}

/**
 * Escape a complete URI. Letters, digits, marks, reserved characters and
 * '#' are passed through; everything else is percent-encoded as UTF-8.
 */
char[] encode(char[] uri)
{
    dchar[] s;

    s = std.utf.toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

/**
 * Escape a single URI component. Only letters, digits and marks are
 * passed through; reserved characters and '#' are percent-encoded too.
 */
char[] encodeComponent(char[] uriComponent)
{
    dchar[] s;

    s = std.utf.toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

unittest
{
    debug(uri) printf("uri.encodeURI.unittest\n");

    char[] s = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    char[] t = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";
    char[] r;

    r = encode(s);
    //printf("r = '%.*s'\n", r);
    assert(r == t);
    r = decode(t);
    //printf("r = '%.*s'\n", r);
    assert(r == s);

    r = encode( decode("%E3%81%82%E3%81%82") );
    assert(r == "%E3%81%82%E3%81%82");

    // Output longer than the encoder's initial stack buffer, and input
    // long enough to take the decoder's heap path
    char[] spaces = new char[100];
    spaces[] = ' ';
    r = encode(spaces);
    assert(r.length == 300);
    assert(decode(r) == spaces);

    // Reserved characters stay escaped in decode() but not in decodeComponent()
    assert(decode("a%2Fb") == "a%2Fb");
    assert(decodeComponent("a%2Fb") == "a/b");

    // Overlong UTF-8 forms must be rejected
    bool caught = false;
    try
    {
        decodeComponent("%C0%AF");
    }
    catch (URIerror e)
    {
        caught = true;
    }
    assert(caught);
}