// Written in the D programming language.

/**
 * Encode and decode Uniform Resource Identifiers (URIs).
 * URIs are used in internet transfer protocols.
 * Valid URI characters consist of letters, digits,
 * and the characters $(B ;/?:@&=+$,-_.!~*'())
 * Reserved URI characters are $(B ;/?:@&=+$,)
 * Escape sequences consist of $(B %) followed by two hex digits.
 *
 * See_Also:
 *  $(LINK2 http://www.ietf.org/rfc/rfc3986.txt, RFC 3986)
 *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
 * Macros:
 *  WIKI = Phobos/StdUri
 *
 * Copyright: Copyright Digital Mars 2000 - 2009.
 * License:   Boost License 1.0.
 * Authors:   $(WEB digitalmars.com, Walter Bright)
 * Source:    $(PHOBOSSRC std/_uri.d)
 */
/*          Copyright Digital Mars 2000 - 2009.
 * Distributed under the Boost Software License, Version 1.0.
 *    (See accompanying file LICENSE_1_0.txt or copy at
 *          http://www.boost.org/LICENSE_1_0.txt)
 */
module std.uri;

//debug=uri;            // uncomment to turn on debugging writefln's
debug(uri) private import std.stdio;

/* ====================== URI Functions ================ */

private import std.ascii;
private import std.c.stdlib;
private import std.utf;
import std.exception;

/**
 * Thrown by the encode/decode functions of this module when the input
 * cannot be converted (malformed escape sequence, invalid code point,
 * or failed stack allocation).
 */
class URIexception : Exception
{
    this()
    {
        super("URI error");
    }
}

/// Character-class bits stored in uri_flags[]; they may be OR'ed together.
enum
{
    URI_Alpha = 1,
    URI_Reserved = 2,
    URI_Mark = 4,
    URI_Digit = 8,
    URI_Hash = 0x10,        // '#'
}

/// Maps a nibble value (0 .. 15) to its upper-case hexadecimal digit.
immutable char[16] hex2ascii = "0123456789ABCDEF";

/// Character-class flags (URI_* bits) for each 7-bit ASCII character.
__gshared ubyte[128] uri_flags;       // indexed by character

shared static this()
{
    // Initialize uri_flags[]
    static void helper(immutable char[] p, uint flags)
    {
        foreach (c; p)
            uri_flags[c] |= flags;
    }

    uri_flags['#'] |= URI_Hash;

    for (int i = 'A'; i <= 'Z'; i++)
    {
        uri_flags[i] |= URI_Alpha;
        uri_flags[i + 0x20] |= URI_Alpha;   // lowercase letters
    }
    helper("0123456789", URI_Digit);
    helper(";/?:@&=+$,", URI_Reserved);
    helper("-_.!~*'()",  URI_Mark);
}

/*
 * Percent-encode a UTF-32 string.
 *
 * Params:
 *  str          = the text to encode
 *  unescapedSet = OR'ed URI_* bits; an ASCII character whose uri_flags[]
 *                 entry has any of these bits set is copied unchanged
 *
 * Every other character is converted to its UTF-8 octets and each octet
 * is written as '%' followed by two upper-case hex digits.
 *
 * Returns: the encoded string (freshly allocated).
 * Throws: URIexception if a code point is above 0x1FFFFF or if a stack
 *         allocation fails.
 */
private string URI_Encode(dstring str, uint unescapedSet)
{
    uint j;
    uint k;
    dchar V;
    dchar C;

    // result buffer; starts on the stack and is grown on demand
    char[50] buffer = void;
    char* R;
    uint Rlen;
    uint Rsize; // alloc'd size

    auto len = str.length;

    R = buffer.ptr;
    Rsize = buffer.length;
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        C = str[k];
        // if (C in unescapedSet)
        if (C < uri_flags.length && uri_flags[C] & unescapedSet)
        {
            if (Rlen == Rsize)
            {
                // Buffer full: double it. Small buffers stay on the stack,
                // anything larger than 1024 bytes goes to the GC heap.
                char* R2;

                Rsize *= 2;
                if (Rsize > 1024)
                {
                    R2 = (new char[Rsize]).ptr;
                }
                else
                {
                    R2 = cast(char *)alloca(Rsize * char.sizeof);
                    if (!R2)
                        throw new URIexception();
                }
                R2[0..Rlen] = R[0..Rlen];
                R = R2;
            }
            R[Rlen] = cast(char)C;
            Rlen++;
        }
        else
        {
            char[6] Octet;
            uint L;

            V = C;

            // Transform V into octets
            if (V <= 0x7F)
            {
                Octet[0] = cast(char) V;
                L = 1;
            }
            else if (V <= 0x7FF)
            {
                Octet[0] = cast(char)(0xC0 | (V >> 6));
                Octet[1] = cast(char)(0x80 | (V & 0x3F));
                L = 2;
            }
            else if (V <= 0xFFFF)
            {
                Octet[0] = cast(char)(0xE0 | (V >> 12));
                Octet[1] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[2] = cast(char)(0x80 | (V & 0x3F));
                L = 3;
            }
            else if (V <= 0x1FFFFF)
            {
                Octet[0] = cast(char)(0xF0 | (V >> 18));
                Octet[1] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[3] = cast(char)(0x80 | (V & 0x3F));
                L = 4;
            }
            /+
            else if (V <= 0x3FFFFFF)
            {
                Octet[0] = cast(char)(0xF8 | (V >> 24));
                Octet[1] = cast(char)(0x80 | ((V >> 18) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[3] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[4] = cast(char)(0x80 | (V & 0x3F));
                L = 5;
            }
            else if (V <= 0x7FFFFFFF)
            {
                Octet[0] = cast(char)(0xFC | (V >> 30));
                Octet[1] = cast(char)(0x80 | ((V >> 24) & 0x3F));
                Octet[2] = cast(char)(0x80 | ((V >> 18) & 0x3F));
                Octet[3] = cast(char)(0x80 | ((V >> 12) & 0x3F));
                Octet[4] = cast(char)(0x80 | ((V >> 6) & 0x3F));
                Octet[5] = cast(char)(0x80 | (V & 0x3F));
                L = 6;
            }
            +/
            else
            {
                throw new URIexception();   // undefined UTF-32 code
            }

            // Each octet expands to three output characters ("%XX").
            if (Rlen + L * 3 > Rsize)
            {
                char *R2;

                Rsize = 2 * (Rlen + L * 3);
                if (Rsize > 1024)
                {
                    R2 = (new char[Rsize]).ptr;
                }
                else
                {
                    R2 = cast(char *)alloca(Rsize * char.sizeof);
                    if (!R2)
                        throw new URIexception();
                }
                R2[0..Rlen] = R[0..Rlen];
                R = R2;
            }

            for (j = 0; j < L; j++)
            {
                R[Rlen] = '%';
                R[Rlen + 1] = hex2ascii[Octet[j] >> 4];
                R[Rlen + 2] = hex2ascii[Octet[j] & 15];

                Rlen += 3;
            }
        }
    }

    // Copy out of the (possibly stack-based) work buffer.
    return R[0..Rlen].idup;
}

/**
 * Convert a hexadecimal digit character to its numeric value.
 *
 * The result is only meaningful when c is one of 0-9, A-F or a-f;
 * callers in this module check that with isHexDigit first.
 */
uint ascii2hex(dchar c)
{
    return (c <= '9') ? c - '0' :
           (c <= 'F') ? c - 'A' + 10 :
                        c - 'a' + 10;
}

/*
 * Decode the percent-escapes in a URI string.
 *
 * Params:
 *  str         = the encoded text
 *  reservedSet = OR'ed URI_* bits; an escape sequence that decodes to an
 *                ASCII character whose uri_flags[] entry has any of these
 *                bits set is left in its escaped form
 *
 * Returns: the decoded text as UTF-32 (freshly allocated).
 * Throws: URIexception on a truncated or non-hex escape, an invalid UTF-8
 *         lead or continuation octet, a value above 0x10FFFF, or a failed
 *         stack allocation.
 */
private dstring URI_Decode(string str, uint reservedSet)
{
    uint j;
    uint k;
    uint V;
    dchar C;

    // Result array, allocated on stack
    dchar* R;
    uint Rlen;

    auto len = str.length;
    auto s = str.ptr;

    // Preallocate result buffer R guaranteed to be large enough for result
    // (decoding never produces more elements than the input has characters)
    auto Rsize = len;
    if (Rsize > 1024 / dchar.sizeof)
    {
        R = (new dchar[Rsize]).ptr;
    }
    else
    {
        R = cast(dchar *)alloca(Rsize * dchar.sizeof);
        if (!R)
            throw new URIexception();
    }
    Rlen = 0;

    for (k = 0; k != len; k++)
    {
        char B;
        uint start;

        C = s[k];
        if (C != '%')
        {
            R[Rlen] = C;
            Rlen++;
            continue;
        }

        // Remember where the escape starts in case it must be copied verbatim.
        start = k;
        if (k + 2 >= len)
            throw new URIexception();
        if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
            throw new URIexception();
        B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
        k += 2;
        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            // Count the leading 1 bits of B: n is the number of octets in
            // the UTF-8 sequence (2 .. 4). n == 1 would be a stray
            // continuation octet, which is an error.
            uint n;

            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIexception();
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIexception();
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            V = B & ((1 << (7 - n)) - 1);   // (!!!)

            // The remaining n - 1 escapes ("%XX" each) must fit in the input.
            if (k + (3 * (n - 1)) >= len)
                throw new URIexception();
            for (j = 1; j != n; j++)
            {
                k++;
                if (s[k] != '%')
                    throw new URIexception();
                if (!isHexDigit(s[k + 1]) || !isHexDigit(s[k + 2]))
                    throw new URIexception();
                B = cast(char)((ascii2hex(s[k + 1]) << 4) + ascii2hex(s[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIexception();
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIexception();
            C = cast(dchar) V;
        }

        if (C < uri_flags.length && uri_flags[C] & reservedSet)
        {
            // R ~= s[start .. k + 1];
            int width = (k + 1) - start;
            for (int ii = 0; ii < width; ii++)
                R[Rlen + ii] = s[start + ii];
            Rlen += width;
        }
        else
        {
            R[Rlen] = C;
            Rlen++;
        }
    }
    assert(Rlen <= Rsize);  // enforce our preallocation size guarantee

    // Copy array on stack to array in memory
    return R[0..Rlen].idup;
}

/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 * Escape sequences that resolve to reserved URI characters are not replaced.
 * Escape sequences that resolve to the '#' character are not replaced.
 */
string decode(string encodedURI)
{
    auto s = URI_Decode(encodedURI, URI_Reserved | URI_Hash);
    return std.utf.toUTF8(s);
}

/*******************************
 * Decodes the URI string encodedURIComponent into a UTF-8 string and
 * returns it. All escape sequences are decoded.
 */
string decodeComponent(string encodedURIComponent)
{
    auto s = URI_Decode(encodedURIComponent, 0);
    return std.utf.toUTF8(s);
}

/*****************************
 * Encodes the UTF-8 string uri into a URI and returns that URI. Any character
 * not a valid URI character is escaped. The '#' character is not escaped.
 */
string encode(string uri)
{
    auto s = std.utf.toUTF32(uri);
    return URI_Encode(s, URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark);
}

/********************************
 * Encodes the UTF-8 string uriComponent into a URI and returns that URI.
 * Any character not a letter, digit, or one of -_.!~*'() is escaped.
 */
string encodeComponent(string uriComponent)
{
    auto s = std.utf.toUTF32(uriComponent);
    return URI_Encode(s, URI_Alpha | URI_Digit | URI_Mark);
}

/***************************
 * Does string s[] start with a URL?
 * Returns:
 *  -1 (that is, size_t.max)  it does not
 *  len  it does, and s[0..len] is the slice of s[] that is that URL
 */
size_t uriLength(string s)
{
    /* Must start with one of:
     *  http://
     *  https://
     * (matching a leading "www." is currently disabled, see below)
     */
    import std.string : icmp;

    enum size_t notFound = size_t.max;  // same value as cast(size_t) -1

    size_t i;

    if (s.length <= 4)
        return notFound;

    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
    {
        i = 7;
    }
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
    {
        i = 8;
    }
    else
    {
        return notFound;
    }
    //    if (icmp(s[0 .. 4], "www.") == 0)
    //        i = 4;

    // Consume characters that may appear in the URL, remembering the
    // position of the last '.' seen.
    size_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '?' ||
            c == '=' || c == '%' || c == '&' ||
            c == '/' || c == '+' || c == '#' ||
            c == '~' || c == '$')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }
    //if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
    if (!lastdot)
        return notFound;    // a URL must contain at least one '.'

    return i;
}

unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert (uriLength(s1) == 49);
    string s2 = "no uri here";
    assert (uriLength(s2) == -1);
    assert (uriLength("https://a.b") == 11);
    assert (uriLength("") == -1);
}

/***************************
 * Does string s[] start with an email address?
 * Returns:
 *  -1 (that is, size_t.max)  it does not
 *  len   it does, and s[0..len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
size_t emailLength(string s)
{
    enum size_t notFound = size_t.max;  // same value as cast(size_t) -1

    size_t i;

    // An empty string cannot be an address (and s[0] would be out of range).
    if (s.length == 0 || !isAlpha(s[0]))
        return notFound;

    // Local part: runs up to and including the '@'.
    for (i = 1; 1; i++)
    {
        if (i == s.length)
            return notFound;
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_' || c == '.')
            continue;
        if (c != '@')
            return notFound;
        i++;
        break;
    }

    /* Now do the part past the '@'
     */
    size_t lastdot;
    for (; i < s.length; i++)
    {
        auto c = s[i];
        if (isAlphaNum(c))
            continue;
        if (c == '-' || c == '_')
            continue;
        if (c == '.')
        {
            lastdot = i;
            continue;
        }
        break;
    }

    // Require a final '.' followed by exactly two or three characters.
    if (!lastdot || (i - lastdot != 3 && i - lastdot != 4))
        return notFound;

    return i;
}

unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert (emailLength(s1) == 32);
    string s2 = "no email address here";
    assert (emailLength(s2) == -1);
    assert (emailLength("") == -1);
}

unittest
{
    debug(uri) writeln("uri.encodeURI.unittest");

    string s = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string t = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto r = encode(s);
    debug(uri) writefln("r = '%s'", r);
    assert(r == t);
    r = decode(t);
    debug(uri) writefln("r = '%s'", r);
    assert(r == s);

    r = encode( decode("%E3%81%82%E3%81%82") );
    assert(r == "%E3%81%82%E3%81%82");

    r = encodeComponent("c++");
    assert(r == "c%2B%2B");

    auto str = new char[10_000_000];
    str[] = 'A';
    r = encodeComponent(assumeUnique(str));
    foreach (char c; r)
        assert(c == 'A');

    r = decode("%41%42%43");
    debug(uri) writeln(r);
}