You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

522 lines
17 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. /*******************************************************************************
  2. uMatrix - a Chromium browser extension to black/white list requests.
  3. Copyright (C) 2014-2016 Raymond Hill
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see {http://www.gnu.org/licenses/}.
  14. Home: https://github.com/gorhill/uMatrix
  15. */
  16. /* global µMatrix, publicSuffixList */
  17. 'use strict';
  18. /*******************************************************************************
  19. RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
  20. Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
  21. */
  22. /******************************************************************************/
  23. µMatrix.URI = (function() {
  24. /******************************************************************************/
  25. // Favorite regex tool: http://regex101.com/
  26. // Ref: <http://tools.ietf.org/html/rfc3986#page-50>
  27. // I removed redundant capture groups: capture less = perform faster. See
  28. // <http://jsperf.com/old-uritools-vs-new-uritools>
  29. // Performance improvements welcomed.
  30. // jsperf: <http://jsperf.com/old-uritools-vs-new-uritools>
  31. var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;
  32. // Derived
  33. var reSchemeFromURI = /^[^:\/?#]+:/;
  34. var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
  35. var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;
  36. var rePathFromURI = /^(?:[^:\/?#]+:)?(?:\/\/[^\/?#]*)?([^?#]*)/;
  37. // These are to parse authority field, not parsed by above official regex
  38. // IPv6 is seen as an exception: a non-compatible IPv6 is first tried, and
  39. // if it fails, the IPv6 compatible regex is used. This helps
  40. // performance by avoiding the use of a too complicated regex first.
  41. // https://github.com/gorhill/httpswitchboard/issues/211
  42. // "While a hostname may not contain other characters, such as the
  43. // "underscore character (_), other DNS names may contain the underscore"
  44. var reHostPortFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]*)(:\d*)?$/i;
  45. var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
  46. var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
  47. var reHostFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]+)(?::\d*)?$/i;
  48. var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;
  49. // Coarse (but fast) tests
  50. var reValidHostname = /^([a-z\d]+(-*[a-z\d]+)*)(\.[a-z\d]+(-*[a-z\d])*)*$/;
  51. var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
  52. // Accurate tests
  53. // Source.: http://stackoverflow.com/questions/5284147/validating-ipv4-addresses-with-regexp/5284410#5284410
  54. var reIPv4 = /^((25[0-5]|2[0-4]\d|[01]?\d\d?)(\.|$)){4}/;
  55. // Source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
  56. var reIPv6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
  57. /******************************************************************************/
  // Blank every parsed URI component stored on `o`, then return `o`.
  //
  // `authority` is cleared together with the other components so that a
  // failed or skipped parse does not leave the authority of a previously
  // parsed URI behind while scheme, hostname, path, etc. are all empty.
  var reset = function(o) {
      o.scheme = '';
      o.authority = '';
      o.hostname = '';
      o._ipv4 = undefined;
      o._ipv6 = undefined;
      o.port = '';
      o.path = '';
      o.query = '';
      o.fragment = '';
      return o;
  };
  // Blank only the components derived from the authority (hostname, cached
  // IP flags and port) on `o`, leaving every other property as is.
  // Returns `o`.
  var resetAuthority = function(o) {
      o.hostname = '';
      o._ipv6 = o._ipv4 = undefined;
      o.port = '';
      return o;
  };
  76. /******************************************************************************/
  77. // This will be exported
  // The object exported by this module. It holds the components of the most
  // recently parsed URI, plus the bit flags accepted by `assemble()` to select
  // which components to output.
  var URI = {
      // Parsed components
      scheme: '',
      authority: '',
      hostname: '',
      _ipv4: undefined,
      _ipv6: undefined,
      port: '',
      domain: undefined,
      path: '',
      query: '',
      fragment: '',
      // One bit per component
      schemeBit: 0x01,
      userBit: 0x02,
      passwordBit: 0x04,
      hostnameBit: 0x08,
      portBit: 0x10,
      pathBit: 0x20,
      queryBit: 0x40,
      fragmentBit: 0x80,
      allBits: 0xFFFF
  };
  // Every component which is part of the authority
  URI.authorityBit =
      URI.userBit |
      URI.passwordBit |
      URI.hostnameBit |
      URI.portBit;
  // Components kept in a normalized URI
  URI.normalizeBits =
      URI.schemeBit |
      URI.hostnameBit |
      URI.pathBit |
      URI.queryBit;
  101. /******************************************************************************/
  102. // See: https://en.wikipedia.org/wiki/URI_scheme#Examples
  103. // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  104. //
  105. //   foo://example.com:8042/over/there?name=ferret#nose
  106. //   \_/   \______________/\_________/ \_________/ \__/
  107. //    |           |             |           |        |
  108. // scheme     authority        path       query  fragment
  109. //    |   ______________________|__
  110. //   / \ /                         \
  111. //   urn:example:animal:ferret:nose
  // Parse `uri` and store its components on `this`.
  //
  // Returns `this` on success. When `uri` is undefined or cannot be split,
  // the shared `URI` object is fully reset and returned. When the authority
  // cannot be split into host and port, only the authority-derived fields of
  // the shared `URI` object are reset and it is returned.
  URI.set = function(uri) {
      var parts = uri !== undefined ? reRFC3986.exec(uri) : null;
      if ( parts === null ) {
          return reset(URI);
      }
      var rawScheme = parts[1];
      var rawAuthority = parts[2];
      var rawPath = parts[3];
      var rawQuery = parts[4];
      var rawFragment = parts[5];

      // Strip the delimiters which were captured along with each component
      this.scheme = rawScheme === undefined ? '' : rawScheme.slice(0, -1);
      this.authority = rawAuthority === undefined ?
          '' :
          rawAuthority.slice(2).toLowerCase();
      this.path = rawPath === undefined ? '' : rawPath;
      // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
      // "In general, a URI that uses the generic syntax for authority
      // "with an empty path should be normalized to a path of '/'."
      if ( this.path === '' && this.authority !== '' ) {
          this.path = '/';
      }
      this.query = rawQuery === undefined ? '' : rawQuery.slice(1);
      this.fragment = rawFragment === undefined ? '' : rawFragment.slice(1);

      // Fast path: the authority is nothing more than a hostname
      if ( reHostFromNakedAuthority.test(this.authority) ) {
          this.hostname = this.authority;
          this.port = '';
          return this;
      }

      // Slow path: possibly user info and/or port; the plain host form is
      // tried first, the bracketed IPv6 form second
      var hostPort =
          reHostPortFromAuthority.exec(this.authority) ||
          reIPv6PortFromAuthority.exec(this.authority);
      if ( hostPort === null ) {
          return resetAuthority(URI);
      }
      var hostname = hostPort[1] === undefined ? '' : hostPort[1];
      // http://en.wikipedia.org/wiki/FQDN
      // Drop the trailing dot of a fully qualified domain name
      if ( hostname.charAt(hostname.length - 1) === '.' ) {
          hostname = hostname.slice(0, -1);
      }
      this.hostname = hostname;
      this.port = hostPort[2] === undefined ? '' : hostPort[2].slice(1);
      return this;
  };
  154. /******************************************************************************/
  155. // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  156. //
  157. //   foo://example.com:8042/over/there?name=ferret#nose
  158. //   \_/   \______________/\_________/ \_________/ \__/
  159. //    |           |             |           |        |
  160. // scheme     authority        path       query  fragment
  161. //    |   ______________________|__
  162. //   / \ /                         \
  163. //   urn:example:animal:ferret:nose
  // Build a URI string out of the components currently stored on `this`.
  //
  // `bits` is a mask made of the `*Bit` flags and selects which components
  // are output; when omitted, `allBits` is used. A component is output only
  // if it is selected by the mask and is not empty.
  URI.assemble = function(bits) {
      var mask = bits === undefined ? this.allBits : bits;
      var out = '';
      if ( (mask & this.schemeBit) && this.scheme ) {
          out += this.scheme + ':';
      }
      if ( (mask & this.hostnameBit) && this.hostname ) {
          out += '//' + this.hostname;
      }
      if ( (mask & this.portBit) && this.port ) {
          out += ':' + this.port;
      }
      if ( (mask & this.pathBit) && this.path ) {
          out += this.path;
      }
      if ( (mask & this.queryBit) && this.query ) {
          out += '?' + this.query;
      }
      if ( (mask & this.fragmentBit) && this.fragment ) {
          out += '#' + this.fragment;
      }
      return out;
  };
  189. /******************************************************************************/
  // Return the lowercased scheme of `uri`, without the trailing colon, or an
  // empty string when `uri` does not start with a scheme.
  URI.schemeFromURI = function(uri) {
      var found = reSchemeFromURI.exec(uri);
      return found !== null ? found[0].slice(0, -1).toLowerCase() : '';
  };
  197. /******************************************************************************/
  // Return true when `scheme` is exactly one of `https`, `wss` or `ftps`.
  // The comparison is strict and case-sensitive.
  URI.isSecureScheme = function(scheme) {
      switch ( scheme ) {
      case 'https':
      case 'wss':
      case 'ftps':
          return true;
      default:
          return false;
      }
  };
  203. /******************************************************************************/
  // Return the lowercased authority of `uri` (what follows `//`, up to the
  // next `/`, `?` or `#`), or an empty string when there is none.
  URI.authorityFromURI = function(uri) {
      var found = reAuthorityFromURI.exec(uri);
      if ( found === null ) {
          return '';
      }
      // Drop the leading `//` captured along with the authority
      var authority = found[1].slice(2);
      return authority.toLowerCase();
  };
  211. /******************************************************************************/
  212. // The most used function, so it better be fast.
  // Return the lowercased hostname found in `uri`, or an empty string when
  // none can be extracted. User info, port and the trailing dot of a fully
  // qualified domain name are left out. Bracketed IPv6 hosts keep their
  // brackets.
  URI.hostnameFromURI = function(uri) {
      // Cheapest test first: plain lowercase http(s) URL
      var found = reCommonHostnameFromURL.exec(uri);
      if ( found !== null ) {
          return found[1];
      }
      found = reAuthorityFromURI.exec(uri);
      if ( found === null ) {
          return '';
      }
      var authority = found[1].slice(2);
      // Authority is nothing more than a hostname
      if ( reHostFromNakedAuthority.test(authority) ) {
          return authority.toLowerCase();
      }
      // Authority with user info and/or port: plain host form first,
      // bracketed IPv6 form second
      found =
          reHostFromAuthority.exec(authority) ||
          reIPv6FromAuthority.exec(authority);
      if ( found === null ) {
          return '';
      }
      // http://en.wikipedia.org/wiki/FQDN
      var hostname = found[1];
      var isFullyQualified = hostname.charAt(hostname.length - 1) === '.';
      return (isFullyQualified ? hostname.slice(0, -1) : hostname).toLowerCase();
  };
  241. /******************************************************************************/
  // Return the domain of `hostname`. The domain cache is consulted first and,
  // on a miss, the result is stored in it. A hostname which looks like an IP
  // address is its own domain; anything else is resolved through the public
  // suffix list.
  URI.domainFromHostname = function(hostname) {
      var cached = domainCache[hostname];
      if ( cached !== undefined ) {
          // Cache hit: refresh the entry's timestamp, no PSL lookup needed
          cached.tstamp = Date.now();
          return cached.domain;
      }
      var domain = reIPAddressNaive.test(hostname) ?
          hostname :
          psl.getDomain(hostname);
      return domainCacheAdd(hostname, domain);
  };

  // Return the domain of the hostname currently stored on `this`.
  URI.domain = function() {
      var hostname = this.hostname;
      return this.domainFromHostname(hostname);
  };

  // A higher-scoped `publicSuffixList` is expected to exist: keep a local
  // reference to it. See <https://github.com/gorhill/publicsuffixlist.js>.
  var psl = publicSuffixList;
  261. /******************************************************************************/
  // Return the path component of `uri`: what remains once the scheme and
  // authority are skipped, up to the first `?` or `#`. Returns an empty
  // string when nothing matches.
  URI.pathFromURI = function(uri) {
      var found = rePathFromURI.exec(uri);
      if ( found === null ) {
          return '';
      }
      return found[1];
  };
  266. /******************************************************************************/
  267. // Trying to alleviate the worries of looking up the domain name from a
  268. // hostname too often. With a cache, µMatrix benefits given that it deals
  269. // with a specific set of hostnames within a narrow time span -- in other
  270. // words, I believe the probability of a cache hit is high in µMatrix.
  // hostname => DomainCacheEntry map, along with its bookkeeping. Once the
  // number of entries reaches the high water mark, the cache is pruned down
  // to the low water mark, keeping the most recently used entries.
  var domainCache = Object.create(null);
  var domainCacheCount = 0;
  var domainCacheCountLowWaterMark = 75;
  var domainCacheCountHighWaterMark = 100;
  // Disposed entries are kept here to be reused rather than reallocated
  var domainCacheEntryJunkyard = [];
  var domainCacheEntryJunkyardMax = domainCacheCountHighWaterMark - domainCacheCountLowWaterMark;

  // A cache entry: the domain of a hostname, and when it was last used.
  var DomainCacheEntry = function(domain) {
      this.init(domain);
  };

  // (Re)initialize the entry with `domain` and a fresh timestamp.
  DomainCacheEntry.prototype.init = function(domain) {
      this.domain = domain;
      this.tstamp = Date.now();
      return this;
  };

  // Blank the entry, and keep it for reuse if the junkyard is not full.
  DomainCacheEntry.prototype.dispose = function() {
      this.domain = '';
      if ( domainCacheEntryJunkyard.length >= domainCacheEntryJunkyardMax ) {
          return;
      }
      domainCacheEntryJunkyard.push(this);
  };

  // Return an entry for `domain`, recycled from the junkyard when possible.
  var domainCacheEntryFactory = function(domain) {
      var recycled = domainCacheEntryJunkyard.pop();
      return recycled ? recycled.init(domain) : new DomainCacheEntry(domain);
  };

  // Remember `domain` as the domain of `hostname`, and return `domain`. For
  // a hostname already in the cache, only the timestamp is refreshed.
  var domainCacheAdd = function(hostname, domain) {
      var existing = domainCache[hostname];
      if ( existing !== undefined ) {
          existing.tstamp = Date.now();
          return domain;
      }
      domainCache[hostname] = domainCacheEntryFactory(domain);
      domainCacheCount += 1;
      if ( domainCacheCount === domainCacheCountHighWaterMark ) {
          domainCachePrune();
      }
      return domain;
  };

  // Order hostnames from most recently to least recently used.
  var domainCacheEntrySort = function(a, b) {
      var entryA = domainCache[a];
      var entryB = domainCache[b];
      return entryB.tstamp - entryA.tstamp;
  };

  // Evict every entry ranked beyond the low water mark.
  var domainCachePrune = function() {
      var ranked = Object.keys(domainCache).sort(domainCacheEntrySort);
      var evicted = ranked.slice(domainCacheCountLowWaterMark);
      domainCacheCount -= evicted.length;
      for ( var i = evicted.length - 1; i >= 0; i-- ) {
          var hostname = evicted[i];
          domainCache[hostname].dispose();
          delete domainCache[hostname];
      }
  };

  // Forget everything.
  var domainCacheReset = function() {
      domainCache = Object.create(null);
      domainCacheCount = 0;
  };
  331. psl.onChanged.addListener(domainCacheReset);
  332. /******************************************************************************/
  // Return the domain of the hostname found in `uri`, or an empty string
  // when `uri` is empty or missing.
  URI.domainFromURI = function(uri) {
      return uri ?
          this.domainFromHostname(this.hostnameFromURI(uri)) :
          '';
  };
  339. /******************************************************************************/
  340. // Normalize the way µMatrix expects it
  // Assemble the current URI using only the components selected by
  // `normalizeBits`: scheme, hostname, path and query. Consequently user
  // id/password, port and fragment are left out.
  URI.normalizedURI = function() {
      var keptBits = this.normalizeBits;
      return this.assemble(keptBits);
  };
  348. /******************************************************************************/
  // Return the scheme and hostname of the current URI assembled together,
  // or an empty string when there is no hostname.
  URI.rootURL = function() {
      return this.hostname ?
          this.assemble(this.schemeBit | this.hostnameBit) :
          '';
  };
  355. /******************************************************************************/
  // Return true when `hostname` is a string matching the coarse
  // `reValidHostname` test, false otherwise.
  //
  // Anything which is not a string is rejected up front: a regex test
  // converts its argument to a string, hence `undefined` or `null` would
  // otherwise be tested as "undefined" or "null" and reported as valid.
  URI.isValidHostname = function(hostname) {
      if ( typeof hostname !== 'string' ) {
          return false;
      }
      return reValidHostname.test(hostname);
  };
  366. /******************************************************************************/
  367. // Return the parent domain. For IP address, there is no parent domain.
  // Return the direct parent of `hostname`, i.e. `hostname` minus its first
  // label, or undefined when there is no parent to return.
  //
  //   hostname                   domain          result
  //   `localhost`                ``              undefined
  //   `example.org`              `example.org`   undefined
  //   `www.example.org`          `example.org`   `example.org`
  //   `tomato.www.example.org`   `example.org`   `www.example.org`
  URI.parentHostnameFromHostname = function(hostname) {
      var domain = this.domainFromHostname(hostname);
      // No parent when there is no domain, or when the hostname is already
      // the domain itself
      var hasParent = domain !== '' && domain !== hostname;
      if ( hasParent === false ) {
          return undefined;
      }
      var firstDot = hostname.indexOf('.');
      return hostname.slice(firstDot + 1);
  };
  384. /******************************************************************************/
  385. // Return all possible parent hostnames which can be derived from `hostname`,
  386. // ordered from direct parent up to domain inclusively.
  // Return an array of every parent of `hostname`, from the direct parent up
  // to and including the domain. The array is empty when there is no domain
  // or when `hostname` is the domain itself.
  //
  // TODO: a reusable container (junkyard etc.) with its own element counter
  // could be used to receive the hostnames, in order to avoid memory
  // alloc/dealloc.
  URI.parentHostnamesFromHostname = function(hostname) {
      var domain = this.domainFromHostname(hostname);
      var parents = [];
      if ( domain === '' || domain === hostname ) {
          return parents;
      }
      // Strip one label at a time until the domain is reached, or until
      // there is no label left to strip
      var current = hostname;
      var dot = current.indexOf('.');
      while ( dot >= 0 ) {
          current = current.slice(dot + 1);
          parents.push(current);
          if ( current === domain ) {
              break;
          }
          dot = current.indexOf('.');
      }
      return parents;
  };
  411. /******************************************************************************/
  412. // Return all possible hostnames which can be derived from `hostname`,
  413. // ordered from self up to domain inclusively.
  // Return `hostname` followed by all its parent hostnames, as returned by
  // `parentHostnamesFromHostname`. The array of parents is extended in place
  // and returned.
  URI.allHostnamesFromHostname = function(hostname) {
      var hostnames = this.parentHostnamesFromHostname(hostname);
      hostnames.splice(0, 0, hostname);
      return hostnames;
  };
  419. /******************************************************************************/
  // String form of the current URI: every non-empty component, assembled.
  URI.toString = function() {
      var assembled = this.assemble();
      return assembled;
  };
  423. /******************************************************************************/
  424. // Export
  425. return URI;
  426. /******************************************************************************/
  427. })();
  428. /******************************************************************************/