You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
522 lines
17 KiB
522 lines
17 KiB
/*******************************************************************************
|
|
|
|
uMatrix - a Chromium browser extension to black/white list requests.
|
|
Copyright (C) 2014-2016 Raymond Hill
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see {http://www.gnu.org/licenses/}.
|
|
|
|
Home: https://github.com/gorhill/uMatrix
|
|
*/
|
|
|
|
/* global µMatrix, publicSuffixList */
|
|
|
|
'use strict';
|
|
|
|
/*******************************************************************************
|
|
|
|
RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
|
|
|
|
Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
|
|
|
|
*/
|
|
|
|
/******************************************************************************/
|
|
|
|
µMatrix.URI = (function() {
|
|
|
|
/******************************************************************************/
|
|
|
|
// Favorite regex tool: http://regex101.com/
|
|
|
|
// Ref: <http://tools.ietf.org/html/rfc3986#page-50>
|
|
// I removed redundant capture groups: capture less = perform faster. See
|
|
// <http://jsperf.com/old-uritools-vs-new-uritools>
|
|
// Performance improvements welcomed.
|
|
// jsperf: <http://jsperf.com/old-uritools-vs-new-uritools>
|
|
// Generic URI splitter; every group is optional, so exec() matches any string.
// Groups: 1 = scheme with its trailing ':', 2 = '//' + authority, 3 = path,
// 4 = '?' + query, 5 = '#' + fragment.
var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;
|
|
|
|
// Derived
|
|
// Leading scheme, including its trailing ':' (no capture group; the whole
// match is the result).
var reSchemeFromURI = /^[^:\/?#]+:/;
|
|
// Optional scheme, then group 1 = '//' followed by a non-empty authority.
var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
|
|
// Fast path: lowercase "http(s)://hostname/" with no userinfo and no port.
// Group 1 = hostname (at least two characters, ending with a letter or digit).
var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;
|
|
// Skips the optional scheme and authority; group 1 = path, i.e. everything up
// to the first '?' or '#'.
var rePathFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]*)?([^?#]*)/;
|
|
|
|
// These are to parse authority field, not parsed by above official regex
|
|
// IPv6 is seen as an exception: a non-compatible IPv6 is first tried, and
|
|
// if it fails, the IPv6 compatible regex is used. This helps
|
|
// performance by avoiding the use of an overly complicated regex first.
|
|
|
|
// https://github.com/gorhill/httpswitchboard/issues/211
|
|
// "While a hostname may not contain other characters, such as the
|
|
// "underscore character (_), other DNS names may contain the underscore"
|
|
// Optional userinfo ("...@") is skipped; group 1 = hostname (possibly empty),
// group 2 = ':' followed by the port digits (possibly none).
var reHostPortFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]*)(:\d*)?$/i;
|
|
// Optional userinfo is skipped; group 1 = bracketed literal made of hex digits
// and ':', group 2 = ':' followed by the port digits.
var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
|
|
|
|
// Authority consisting of nothing but a hostname (no userinfo, no port). It
// needs at least two characters and must end with a letter or digit.
var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
|
|
// Optional userinfo is skipped; group 1 = non-empty hostname; a trailing
// ':port' is accepted but not captured.
var reHostFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]+)(?::\d*)?$/i;
|
|
// Optional userinfo is skipped; group 1 = non-empty bracketed literal made of
// hex digits and ':'; a trailing ':port' is accepted but not captured.
var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;
|
|
|
|
// Coarse (but fast) tests
|
|
// Coarse hostname syntax check: dot-separated labels made of [a-z0-9] and '-',
// each label starting and ending with a letter or digit (lowercase only).
// Every hyphen run between two alphanumeric runs is mandatory (`-+`), which
// keeps the match unambiguous: with an optional hyphen run (`-*`) nested under
// `*`, one alphanumeric run can be split in exponentially many ways and a
// near-miss input such as "aaaa...a!" makes the engine backtrack for ages.
var reValidHostname = /^[a-z\d]+(?:-+[a-z\d]+)*(?:\.[a-z\d]+(?:-+[a-z\d]+)*)*$/;
|
|
// Either four dot-separated runs of digits (octet ranges are not checked), or
// a bracketed run of letters, digits and ':'.
var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
|
|
|
|
// Accurate tests
|
|
// Source: http://stackoverflow.com/questions/5284147/validating-ipv4-addresses-with-regexp/5284410#5284410
|
|
// Four octets in the 0-255 range, each followed by '.' or end of input.
// NOTE(review): there is no end anchor, so text after the fourth octet is not
// rejected (e.g. "1.2.3.4.5" passes) -- confirm this is acceptable to callers.
var reIPv4 = /^((25[0-5]|2[0-4]\d|[01]?\d\d?)(\.|$)){4}/;
|
|
|
|
// Source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
|
|
// Accepts full and '::'-compressed forms, an optional dotted-quad tail and an
// optional '%...' suffix; leading/trailing whitespace is tolerated. Brackets
// are not part of the pattern.
var reIPv6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
|
|
|
|
/******************************************************************************/
|
|
|
|
// Clear every parsed URI component stored on `o` and return `o`.
//
// `authority` is cleared along with the fields derived from it; leaving it
// untouched would let the authority of a previously parsed URI survive a
// reset while hostname/port are already blank.
var reset = function(o) {
    o.scheme = '';
    o.authority = '';
    o.hostname = '';
    o._ipv4 = undefined;
    o._ipv6 = undefined;
    o.port = '';
    o.path = '';
    o.query = '';
    o.fragment = '';
    return o;
};
|
|
|
|
// Blank out the fields derived from the authority (hostname, cached IP flags
// and port) on `o`, leaving every other field as is. Returns `o`.
var resetAuthority = function(o) {
    // String-valued fields fall back to '', the IP caches to "unknown".
    o.hostname = '';
    o._ipv4 = undefined;
    o._ipv6 = undefined;
    o.port = '';
    return o;
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// This will be exported
|
|
|
|
var URI = {
    // Components of the most recently parsed URI
    scheme: '',
    authority: '',
    hostname: '',
    _ipv4: undefined,
    _ipv6: undefined,
    port: '',
    domain: undefined,
    path: '',
    query: '',
    fragment: '',

    // One flag per component; OR them together to select what to assemble
    schemeBit: 0x01,
    userBit: 0x02,
    passwordBit: 0x04,
    hostnameBit: 0x08,
    portBit: 0x10,
    pathBit: 0x20,
    queryBit: 0x40,
    fragmentBit: 0x80,
    allBits: 0xFFFF
};

// Composite masks built from the single-component flags
URI.authorityBit = URI.userBit | URI.passwordBit | URI.hostnameBit | URI.portBit;
URI.normalizeBits = URI.schemeBit | URI.hostnameBit | URI.pathBit | URI.queryBit;
|
|
|
|
/******************************************************************************/
|
|
|
|
// See: https://en.wikipedia.org/wiki/URI_scheme#Examples
|
|
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
|
|
//
|
|
// foo://example.com:8042/over/there?name=ferret#nose
|
|
// \_/ \______________/\_________/ \_________/ \__/
|
|
// | | | | |
|
|
// scheme authority path query fragment
|
|
// | _____________________|__
|
|
// / \ / \
|
|
// urn:example:animal:ferret:nose
|
|
|
|
// Parse `uri` and store its components on this object.
//
// Returns the object itself. With `uri` undefined everything is reset; when
// the authority cannot be parsed, only the authority-derived fields are
// blanked while scheme/path/query/fragment keep the parsed values.
URI.set = function(uri) {
    if ( uri === undefined ) {
        return reset(URI);
    }
    var parts = reRFC3986.exec(uri);
    if ( !parts ) {
        return reset(URI);
    }

    // Strip the delimiters captured along with each component
    this.scheme = parts[1] === undefined ? '' : parts[1].slice(0, -1);
    this.authority = parts[2] === undefined ? '' : parts[2].slice(2).toLowerCase();
    this.path = parts[3] === undefined ? '' : parts[3];

    // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
    // "In general, a URI that uses the generic syntax for authority
    // "with an empty path should be normalized to a path of '/'."
    if ( this.path === '' && this.authority !== '' ) {
        this.path = '/';
    }
    this.query = parts[4] === undefined ? '' : parts[4].slice(1);
    this.fragment = parts[5] === undefined ? '' : parts[5].slice(1);

    // Cheapest case first: the authority is a bare hostname
    var authority = this.authority;
    if ( reHostFromNakedAuthority.test(authority) ) {
        this.hostname = authority;
        this.port = '';
        return this;
    }

    // Userinfo and/or port present: try the plain form, then bracketed IPv6
    var hostPort = reHostPortFromAuthority.exec(authority) ||
                   reIPv6PortFromAuthority.exec(authority);
    if ( !hostPort ) {
        return resetAuthority(URI);
    }

    // http://en.wikipedia.org/wiki/FQDN -- drop the root label's dot
    var hostname = hostPort[1] === undefined ? '' : hostPort[1];
    if ( hostname.slice(-1) === '.' ) {
        hostname = hostname.slice(0, -1);
    }
    this.hostname = hostname;
    this.port = hostPort[2] === undefined ? '' : hostPort[2].slice(1);
    return this;
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
|
|
//
|
|
// foo://example.com:8042/over/there?name=ferret#nose
|
|
// \_/ \______________/\_________/ \_________/ \__/
|
|
// | | | | |
|
|
// scheme authority path query fragment
|
|
// | _____________________|__
|
|
// / \ / \
|
|
// urn:example:animal:ferret:nose
|
|
|
|
// Rebuild a URI string from the stored components.
//
// `bits` is an OR-ed set of the `*Bit` flags selecting which components to
// emit; when omitted, `allBits` is used. A component is emitted only when it
// is both selected and non-empty.
URI.assemble = function(bits) {
    var mask = bits === undefined ? this.allBits : bits;
    var out = [];
    if ( this.scheme && (mask & this.schemeBit) ) {
        out.push(this.scheme, ':');
    }
    if ( this.hostname && (mask & this.hostnameBit) ) {
        out.push('//', this.hostname);
    }
    if ( this.port && (mask & this.portBit) ) {
        out.push(':', this.port);
    }
    if ( this.path && (mask & this.pathBit) ) {
        out.push(this.path);
    }
    if ( this.query && (mask & this.queryBit) ) {
        out.push('?', this.query);
    }
    if ( this.fragment && (mask & this.fragmentBit) ) {
        out.push('#', this.fragment);
    }
    return out.join('');
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the lowercased scheme of `uri` without the trailing ':', or '' when
// `uri` does not start with a scheme.
URI.schemeFromURI = function(uri) {
    var found = reSchemeFromURI.exec(uri);
    return found === null ? '' : found[0].slice(0, -1).toLowerCase();
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Tell whether `scheme` is exactly one of 'https', 'wss' or 'ftps'
// (case-sensitive, no trailing ':').
URI.isSecureScheme = function(scheme) {
    switch ( scheme ) {
    case 'https':
    case 'wss':
    case 'ftps':
        return true;
    default:
        return false;
    }
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the lowercased authority of `uri` (what follows '//', up to the next
// '/', '?' or '#'), or '' when `uri` has no non-empty authority.
URI.authorityFromURI = function(uri) {
    var found = reAuthorityFromURI.exec(uri);
    return found ? found[1].slice(2).toLowerCase() : '';
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// The most used function, so it better be fast.
|
|
|
|
// Extract the hostname from `uri`, or return '' when none can be found.
// Checks are ordered from cheapest/most common to most general.
URI.hostnameFromURI = function(uri) {
    // Plain lowercase http(s) URL: the hostname is captured directly
    var found = reCommonHostnameFromURL.exec(uri);
    if ( found ) {
        return found[1];
    }

    found = reAuthorityFromURI.exec(uri);
    if ( !found ) {
        return '';
    }
    var authority = found[1].slice(2);

    // Authority is just a hostname (most common case for µMatrix)
    if ( reHostFromNakedAuthority.test(authority) ) {
        return authority.toLowerCase();
    }

    // Userinfo and/or port present: plain host first, bracketed IPv6 next
    found = reHostFromAuthority.exec(authority) ||
            reIPv6FromAuthority.exec(authority);
    if ( !found ) {
        return '';
    }

    // http://en.wikipedia.org/wiki/FQDN -- drop the root label's dot
    var hostname = found[1];
    if ( hostname.slice(-1) === '.' ) {
        hostname = hostname.slice(0, -1);
    }
    return hostname.toLowerCase();
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the domain for `hostname`, going through the domain cache.
//
// A cache hit refreshes the entry's timestamp. On a miss, anything that looks
// like an IP address is its own domain; everything else is resolved with the
// public suffix list. The result is cached either way.
URI.domainFromHostname = function(hostname) {
    var cached = domainCache[hostname];
    if ( cached !== undefined ) {
        cached.tstamp = Date.now();
        return cached.domain;
    }
    var domain = reIPAddressNaive.test(hostname) ?
        hostname :
        psl.getDomain(hostname);
    return domainCacheAdd(hostname, domain);
};
|
|
|
|
// Domain of the hostname currently stored on this object.
URI.domain = function() {
    var currentHostname = this.hostname;
    return this.domainFromHostname(currentHostname);
};
|
|
|
|
// It is expected that there is a higher-scoped `publicSuffixList` lingering
|
|
// somewhere. Cache it. See <https://github.com/gorhill/publicsuffixlist.js>.
|
|
// Local alias; this module uses its getDomain() and onChanged members.
var psl = publicSuffixList;
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the path component of `uri`: whatever follows the optional scheme
// and authority, up to the first '?' or '#'.
URI.pathFromURI = function(uri) {
    var found = rePathFromURI.exec(uri);
    if ( found === null ) {
        return '';
    }
    return found[1];
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Trying to alleviate the worries of looking up too often the domain name from
|
|
// a hostname. With a cache, µMatrix benefits given that it deals with a
|
|
// specific set of hostnames within a narrow time span -- in other words, I
|
|
// believe the probability of a cache hit is high in µMatrix.
|
|
|
|
// hostname => DomainCacheEntry; prototype-less so any hostname is a safe key
var domainCache = Object.create(null);
// Number of entries currently held in `domainCache`
var domainCacheCount = 0;

// Pruning kicks in at the high-water mark and keeps the low-water mark's worth
// of most recently used entries.
var domainCacheCountLowWaterMark = 75;
var domainCacheCountHighWaterMark = 100;

// Disposed entries kept for reuse; capped at the number of entries one prune
// pass evicts.
var domainCacheEntryJunkyard = [];
var domainCacheEntryJunkyardMax =
    domainCacheCountHighWaterMark - domainCacheCountLowWaterMark;
|
|
|
|
// One cache record: the resolved domain plus the time it was last touched.
var DomainCacheEntry = function(domain) {
    this.init(domain);
};

// (Re)initialize the record for `domain` and stamp it with the current time.
// Returns the record so that recycled instances can be handed out directly.
DomainCacheEntry.prototype.init = function(domain) {
    this.domain = domain;
    this.tstamp = Date.now();
    return this;
};

// Release the record: forget the domain and, while the junkyard has room,
// park the instance there for later reuse.
DomainCacheEntry.prototype.dispose = function() {
    this.domain = '';
    if ( domainCacheEntryJunkyard.length >= domainCacheEntryJunkyardMax ) {
        return;
    }
    domainCacheEntryJunkyard.push(this);
};
|
|
|
|
// Hand out a cache entry for `domain`, recycling one from the junkyard when
// available and allocating a new one otherwise.
var domainCacheEntryFactory = function(domain) {
    var recycled = domainCacheEntryJunkyard.pop();
    return recycled ? recycled.init(domain) : new DomainCacheEntry(domain);
};
|
|
|
|
// Record `hostname` => `domain` in the cache and return `domain`.
//
// An existing entry only has its timestamp refreshed (its stored domain is
// left alone). A new entry bumps the count and triggers a prune when the
// count reaches the high-water mark.
var domainCacheAdd = function(hostname, domain) {
    var existing = domainCache[hostname];
    if ( existing !== undefined ) {
        existing.tstamp = Date.now();
        return domain;
    }
    domainCache[hostname] = domainCacheEntryFactory(domain);
    domainCacheCount += 1;
    if ( domainCacheCount === domainCacheCountHighWaterMark ) {
        domainCachePrune();
    }
    return domain;
};
|
|
|
|
// Comparator over cache keys: most recently touched entries sort first.
var domainCacheEntrySort = function(a, b) {
    var touchedA = domainCache[a].tstamp;
    var touchedB = domainCache[b].tstamp;
    return touchedB - touchedA;
};
|
|
|
|
// Shrink the cache down to the low-water mark by evicting the least recently
// touched entries. Evicted entries are disposed, oldest first.
var domainCachePrune = function() {
    var byRecency = Object.keys(domainCache).sort(domainCacheEntrySort);
    var evicted = byRecency.slice(domainCacheCountLowWaterMark);
    domainCacheCount -= evicted.length;
    for ( var i = evicted.length - 1; i >= 0; i-- ) {
        var hostname = evicted[i];
        domainCache[hostname].dispose();
        delete domainCache[hostname];
    }
};
|
|
|
|
// Drop every cached hostname => domain mapping by starting over with an empty
// map and a zero count.
var domainCacheReset = function() {
    domainCacheCount = 0;
    domainCache = Object.create(null);
};
|
|
|
|
// Empty the domain cache whenever the public suffix list reports a change
// (presumably its data was reloaded, which would make cached domains stale).
psl.onChanged.addListener(domainCacheReset);
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the domain of the hostname found in `uri`; a falsy `uri` yields ''.
URI.domainFromURI = function(uri) {
    return uri ? this.domainFromHostname(this.hostnameFromURI(uri)) : '';
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Normalize the way µMatrix expects it
|
|
|
|
// Assemble the stored URI keeping only scheme, hostname, path and query;
// port, user id/password and fragment are left out.
URI.normalizedURI = function() {
    var wanted = this.normalizeBits;
    return this.assemble(wanted);
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return "scheme://hostname" for the stored URI, or '' when it has no
// hostname.
URI.rootURL = function() {
    return this.hostname ?
        this.assemble(this.schemeBit | this.hostnameBit) :
        '';
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Coarse syntax check of `hostname` against `reValidHostname`. Any exception
// raised while testing is reported as "not valid" instead of propagating.
URI.isValidHostname = function(hostname) {
    try {
        return reValidHostname.test(hostname);
    } catch (e) {
        return false;
    }
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return the parent domain. For IP address, there is no parent domain.
|
|
|
|
// Return `hostname` minus its first label, or undefined when `hostname` has
// no domain or already is its own domain.
URI.parentHostnameFromHostname = function(hostname) {
    // Domain lookup:
    //   `localhost`              => ``
    //   `example.org`            => `example.org`
    //   `www.example.org`        => `example.org`
    //   `tomato.www.example.org` => `example.org`
    var domain = this.domainFromHostname(hostname);

    // Only the last two cases above have something to climb to
    var canClimb = domain !== '' && domain !== hostname;
    if ( !canClimb ) {
        return undefined;
    }

    // Parent is hostname minus first label
    var firstDot = hostname.indexOf('.');
    return hostname.slice(firstDot + 1);
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return all possible parent hostnames which can be derived from `hostname`,
|
|
// ordered from direct parent up to domain inclusively.
|
|
|
|
// Return the chain of parent hostnames of `hostname`, nearest parent first and
// the domain last. The result is empty when `hostname` has no domain or
// already is its own domain.
URI.parentHostnamesFromHostname = function(hostname) {
    // TODO: I should create an object which is optimized to receive
    // the list of hostnames by making it reusable (junkyard etc.) and which
    // has its own element counter property in order to avoid memory
    // alloc/dealloc.
    var domain = this.domainFromHostname(hostname);
    if ( domain === '' || domain === hostname ) {
        return [];
    }
    var nodes = [];
    var current = hostname;
    var dot = current.indexOf('.');
    // Peel off one label at a time until the domain is reached or there is
    // nothing left to peel.
    while ( dot >= 0 ) {
        current = current.slice(dot + 1);
        nodes.push(current);
        if ( current === domain ) {
            break;
        }
        dot = current.indexOf('.');
    }
    return nodes;
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Return all possible hostnames which can be derived from `hostname`,
|
|
// ordered from self up to domain inclusively.
|
|
|
|
// Return `hostname` followed by all of its parent hostnames, i.e. the list
// from parentHostnamesFromHostname() with `hostname` placed in front.
URI.allHostnamesFromHostname = function(hostname) {
    var chain = this.parentHostnamesFromHostname(hostname);
    chain.unshift(hostname);
    return chain;
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// String form of the stored URI: every non-empty component, assembled.
URI.toString = function() {
    var assembled = this.assemble();
    return assembled;
};
|
|
|
|
/******************************************************************************/
|
|
|
|
// Export
|
|
|
|
return URI;
|
|
|
|
/******************************************************************************/
|
|
|
|
})();
|
|
|
|
/******************************************************************************/
|
|
|