/*
 * Copyright (c) 2010 Nick Galbreath
 * http://code.google.com/p/stringencoders/source/browse/#svn/trunk/javascript
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
 * url processing in the spirit of python's urlparse module
 * see `pydoc urlparse` or
 * http://docs.python.org/library/urlparse.html
 *
 *  urlsplit: break apart a URL into components
 *  urlunsplit: reconstitute a URL from components
 *  urljoin: join an absolute and another URL
 *  urldefrag: remove the fragment from a URL
 *
 * Take a look at the tests in urlparse-test.html
 *
 * On URL Normalization:
 *
 * urlsplit only does minor normalization of the components. Only scheme
 * and hostname are lowercased. urljoin does a bit more, normalizing
 * paths with "." and "..".
 *
 * urlnormalize adds additional normalization
 *
 *   * removes default port numbers
 *     http://abc.com:80/ -> http://abc.com/, etc
 *   * normalizes path
 *     http://abc.com -> http://abc.com/
 *     and other "." and ".." cleanups
 *   * if file, remove query and fragment
 *
 * It does not do:
 *   * normalizes escaped hex values
 *     http://abc.com/%7efoo -> http://abc.com/%7Efoo
 *   * normalize '+' <--> '%20'
 *
 * Differences with Python
 *
 * The javascript urlsplit returns a normal object with the following
 * properties: scheme, netloc, hostname, port, path, query, fragment.
 * All properties are read-write.
 *
 * In python, the resulting object is not a dict, but a specialized,
 * read-only, and has alternative tuple interface (e.g. obj[0] ==
 * obj.scheme).  It's not clear why such a simple function requires
 * a unique datastructure.
 *
 * urlunsplit in javascript takes a duck-typed object,
 *  { scheme: 'http', netloc: 'abc.com', ...}
 * while in python it takes a list-like object.
 *  ['http', 'abc.com'... ]
 *
 * For all functions, the javascript version uses
 * hostname+port if netloc is missing.  In python
 * hostname+port were always ignored.
 *
 * Similar functionality in different languages:
 *
 *   http://php.net/manual/en/function.parse-url.php
 *   returns associative array but cannot handle relative URL
 *
 * TODO: test allowfragments more
 * TODO: test netloc missing, but hostname present
 */

var urlparse = {};

/**
 * Normalize a path containing "../" and "./" segments.
 * Unlikely to be useful standalone.
 *
 * http://en.wikipedia.org/wiki/URL_normalization
 * http://tools.ietf.org/html/rfc3986#section-5.2.3
 *
 * Behavior:
 *   - an empty/missing path or "/" yields "/"
 *   - the result always starts with "/"
 *   - "." segments are dropped
 *   - ".." removes the previously kept segment; when there is nothing
 *     left to remove (only the leading root), the ".." is kept as-is
 *
 * @param {string} path - path component of a URL
 * @returns {string} the normalized path
 */
urlparse.normalizepath = function(path) {
    if (!path || path === '/') {
        return '/';
    }

    var parts = path.split('/');
    var newparts = [];

    // make sure path always starts with '/': a relative path has a
    // non-empty first segment, so seed the empty "root" segment
    if (parts[0]) {
        newparts.push('');
    }

    for (var i = 0; i < parts.length; ++i) {
        if (parts[i] === '..') {
            if (newparts.length > 1) {
                newparts.pop();
            } else {
                // nothing above the root to remove: keep the ".."
                newparts.push(parts[i]);
            }
        } else if (parts[i] !== '.') {
            newparts.push(parts[i]);
        }
    }

    path = newparts.join('/');
    if (!path) {
        // everything collapsed away (e.g. "/."), fall back to root
        path = '/';
    }
    return path;
};

/**
 * Does many of the normalizations that the stock
 * python urlsplit/urlunsplit/urljoin neglects.
 *
 *   - file:  query and fragment are removed, path is normalized
 *   - http/https: host is lowercased, default port (80 / 443) is
 *     removed, path is normalized
 *   - any other scheme: the url is returned untouched
 *
 * Doesn't do hex-escape normalization on path or query
 *    %7e -> %7E
 * Nor, '+' <--> %20 translation
 *
 * @param {string} url - URL to normalize
 * @returns {string} the normalized URL
 */
urlparse.urlnormalize = function(url) {
    var parts = urlparse.urlsplit(url);
    switch (parts.scheme) {
    case 'file':
        // files can't have query strings
        // and we don't bother with fragments
        parts.query = '';
        parts.fragment = '';
        break;
    case 'http':
    case 'https':
        // urlsplit lowercases hostname but leaves netloc as typed.
        // Rebuild netloc from the lowercased hostname (same length as
        // the original host text) plus whatever port suffix followed,
        // so the host is lowercased whether or not a port is removed.
        parts.netloc = parts.hostname +
            parts.netloc.slice(parts.hostname.length);

        // remove default port
        if ((parts.scheme === 'http' && parts.port === 80) ||
            (parts.scheme === 'https' && parts.port === 443)) {
            parts.port = '';
            parts.netloc = parts.hostname;
        }
        break;
    default:
        // if we don't have specific normalizations for this
        // scheme, return the original url unmolested
        return url;
    }

    // for [file|http|https].  Not sure about other schemes
    parts.path = urlparse.normalizepath(parts.path);

    return urlparse.urlunsplit(parts);
};

/**
 * Split a URL at its first '#'.
 *
 * @param {string} url - URL, with or without a fragment
 * @returns {Array} two-element array [url_without_fragment, fragment];
 *     fragment is '' when the url contains no '#'
 */
urlparse.urldefrag = function(url) {
    var idx = url.indexOf('#');
    if (idx === -1) {
        return [ url, '' ];
    }
    return [ url.slice(0, idx), url.slice(idx + 1) ];
};

/**
 * Break a URL into its components.
 *
 * Only URLs containing "//" right after an optional alphabetic scheme
 * (e.g. "http://host", "//host") are treated as having an authority;
 * anything else is parsed as path/query/fragment only.
 *
 * @param {string} url - URL to split
 * @param {string} [default_scheme] - scheme to use when url has none
 * @param {boolean} [allow_fragments=true] - when false the returned
 *     fragment is always '' (the fragment text is discarded)
 * @returns {Object} object with read-write properties
 *     scheme (lowercased), netloc (as typed), hostname (lowercased),
 *     port (number, or '' when absent), path, query, fragment.
 *     Missing components are ''.
 */
urlparse.urlsplit = function(url, default_scheme, allow_fragments) {
    var leftover;

    if (typeof allow_fragments === 'undefined') {
        allow_fragments = true;
    }

    // scheme (optional), optional ':' then '//', host, port
    var fullurl = /^([A-Za-z]+)?(:?\/\/)([0-9.\-A-Za-z]*)(?::(\d+))?(.*)$/;
    // path, query, fragment
    var parse_leftovers = /([^?#]*)?(?:\?([^#]*))?(?:#(.*))?$/;

    var o = {};

    var parts = url.match(fullurl);
    if (parts) {
        o.scheme = parts[1] || default_scheme || '';
        o.hostname = parts[3].toLowerCase() || '';
        o.port = parseInt(parts[4], 10) || '';
        // Probably should grab the netloc from regexp
        //  and then parse again for hostname/port
        o.netloc = parts[3];
        if (parts[4]) {
            o.netloc += ':' + parts[4];
        }
        leftover = parts[5];
    } else {
        o.scheme = default_scheme || '';
        o.netloc = '';
        o.hostname = '';
        // keep the result shape identical to the authority branch
        o.port = '';
        leftover = url;
    }
    o.scheme = o.scheme.toLowerCase();

    parts = leftover.match(parse_leftovers);

    o.path = parts[1] || '';
    o.query = parts[2] || '';

    if (allow_fragments) {
        o.fragment = parts[3] || '';
    } else {
        o.fragment = '';
    }

    return o;
};

/**
 * Reassemble a URL from its components.
 *
 * @param {Object} o - duck-typed object with optional string
 *     properties scheme, netloc, hostname, port, path, query, fragment.
 *     netloc wins when present; otherwise hostname (+ ':' + port) is
 *     used (extension: Python only uses netloc).
 * @returns {string} the assembled URL; empty/missing parts are omitted
 */
urlparse.urlunsplit = function(o) {
    var s = '';
    if (o.scheme) {
        s += o.scheme + '://';
    }

    if (o.netloc) {
        if (s === '') {
            s += '//';
        }
        s += o.netloc;
    } else if (o.hostname) {
        // extension.  Python only uses netloc
        if (s === '') {
            s += '//';
        }
        s += o.hostname;
        if (o.port) {
            s += ':' + o.port;
        }
    }

    if (o.path) {
        s += o.path;
    }

    if (o.query) {
        s += '?' + o.query;
    }
    if (o.fragment) {
        s += '#' + o.fragment;
    }
    return s;
};

/**
 * Resolve a (possibly relative) URL against a base URL.
 *
 *   - url with a scheme: returned as-is (fragment removed only when
 *     allow_fragments is false)
 *   - url with an authority ("//host/path"): base scheme + url's
 *     authority, path and query
 *   - url with an absolute path: replaces the base path
 *   - url with a relative path: replaces the last segment of the
 *     base path
 *   - url with no path and no query (e.g. "#frag"): keeps the base
 *     path and query
 *
 * The resulting path is passed through urlparse.normalizepath.
 *
 * @param {string} base - base URL
 * @param {string} url - URL to resolve against base
 * @param {boolean} [allow_fragments=true] - when false the result
 *     carries no fragment
 * @returns {string} the joined URL
 */
urlparse.urljoin = function(base, url, allow_fragments) {
    if (typeof allow_fragments === 'undefined') {
        allow_fragments = true;
    }

    var url_parts = urlparse.urlsplit(url);

    // if url parts has a scheme (i.e. absolute)
    // then nothing to do, other than honoring allow_fragments
    if (url_parts.scheme) {
        if (allow_fragments) {
            return url;
        }
        return urlparse.urldefrag(url)[0];
    }
    var base_parts = urlparse.urlsplit(base);

    // authority: the reference's own authority wins; otherwise keep
    // the base's, falling back to the (empty) reference values when
    // the base has none either
    if (url_parts.netloc || !base_parts.netloc || !base_parts.hostname) {
        base_parts.netloc = url_parts.netloc;
        base_parts.hostname = url_parts.hostname;
        base_parts.port = url_parts.port;
    }

    // paths
    if (url_parts.netloc) {
        // a reference with its own authority never inherits base path
        base_parts.path = url_parts.path;
    } else if (url_parts.path.length > 0) {
        if (url_parts.path.charAt(0) === '/') {
            base_parts.path = url_parts.path;
        } else {
            // relative path.. get rid of "current filename" and
            //   replace it with the reference path
            var idx = base_parts.path.lastIndexOf('/');
            if (idx === -1) {
                base_parts.path = url_parts.path;
            } else {
                base_parts.path = base_parts.path.slice(0, idx) + '/' +
                    url_parts.path;
            }
        }
    }

    // clean up path
    base_parts.path = urlparse.normalizepath(base_parts.path);

    // copy query string; a reference with no authority, no path and
    // no query (e.g. "" or "#frag") inherits the base query
    if (url_parts.netloc || url_parts.path.length > 0 || url_parts.query) {
        base_parts.query = url_parts.query;
    }

    // copy fragments
    if (allow_fragments) {
        base_parts.fragment = url_parts.fragment;
    } else {
        base_parts.fragment = '';
    }

    return urlparse.urlunsplit(base_parts);
};