You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

498 lines
16 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. /*******************************************************************************
  2. µMatrix - a Chromium browser extension to black/white list requests.
  3. Copyright (C) 2014 Raymond Hill
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see {http://www.gnu.org/licenses/}.
  14. Home: https://github.com/gorhill/uMatrix
  15. */
  16. /* global µMatrix, publicSuffixList */
  17. /*******************************************************************************
  18. RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
  19. Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
  20. */
  21. /******************************************************************************/
  22. µMatrix.URI = (function() {
  23. 'use strict';
  24. /******************************************************************************/
  // Generic URI splitter. Ref: <http://tools.ietf.org/html/rfc3986#page-50>
  // Redundant capture groups were removed: capture less = perform faster. See
  // <http://jsperf.com/old-uritools-vs-new-uritools>
  // Performance improvements welcomed.
  // Favorite regex tool: http://regex101.com/
  //
  // Capture groups, delimiters included:
  //   1: "scheme:"   2: "//authority"   3: path   4: "?query"   5: "#fragment"
  var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;

  // Derived from the generic splitter: each one extracts a single component
  // without paying for the full parse.
  var reSchemeFromURI = /^[^:\/?#]+:/;
  var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
  // Fast path: lowercase http(s) URL whose authority is a bare hostname
  // immediately followed by a slash.
  var reCommonHostnameFromURL = /^https?:\/\/([0-9a-z_][0-9a-z._-]*[0-9a-z])\//;

  // The following parse the authority field, which the official regex above
  // leaves untouched.
  // IPv6 is seen as an exception: the regex which is not IPv6-compatible is
  // tried first, and only if it fails is the IPv6-compatible regex used.
  // This helps performance by avoiding the use of a too complicated regex
  // first.
  // https://github.com/gorhill/httpswitchboard/issues/211
  // "While a hostname may not contain other characters, such as the
  // "underscore character (_), other DNS names may contain the underscore"
  var reHostPortFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]*)(:\d*)?$/i;
  var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
  var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
  var reHostFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]+)(?::\d*)?$/i;
  var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;
  // Coarse (but fast) tests

  // A hostname is one or more dot-separated labels; each label is made of
  // lowercase letters/digits, may contain inner hyphens, and must start and
  // end with a letter/digit.
  // Hyphen runs are matched with a mandatory `-+` separator so that there is
  // only one way to split a label: an optional separator between repeated
  // alphanumeric runs makes the regex backtrack exponentially on long
  // non-matching input.
  var reValidHostname = /^[a-z\d]+(?:-+[a-z\d]+)*(?:\.[a-z\d]+(?:-+[a-z\d]+)*)*$/;

  // Anything shaped like a dotted quad, or like a bracketed IPv6 literal. No
  // range or syntax validation is performed.
  var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
  // Accurate tests

  // Source.: http://stackoverflow.com/questions/5284147/validating-ipv4-addresses-with-regexp/5284410#5284410
  // Exactly four octets in the range 0-255 separated by dots. The pattern is
  // anchored at both ends so that trailing garbage such as `1.2.3.4.` or
  // `1.2.3.4.5` is rejected.
  var reIPv4 = /^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$/;

  // Source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
  var reIPv6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
  56. /******************************************************************************/
  // Blank out every parsed URI component of `o` and return `o`.
  // `authority` is cleared along with the components derived from it, so
  // that a reset record does not keep reporting the authority of whatever
  // URI was parsed previously.
  var reset = function(o) {
      o.scheme = '';
      o.authority = '';
      o.hostname = '';
      o._ipv4 = undefined;
      o._ipv6 = undefined;
      o.port = '';
      o.path = '';
      o.query = '';
      o.fragment = '';
      return o;
  };
  // Blank out only the components derived from the authority (hostname, IP
  // placeholders and port) and return `o`. The raw `authority` string and all
  // other components are left as they are.
  var resetAuthority = function(o) {
      var blanks = {
          hostname: '',
          _ipv4: undefined,
          _ipv6: undefined,
          port: ''
      };
      Object.keys(blanks).forEach(function(field) {
          o[field] = blanks[field];
      });
      return o;
  };
  75. /******************************************************************************/
  // This will be exported.
  // A single record holding the components of the most recently parsed URI,
  // plus the bit flags used to select which components get assembled.
  var URI = {
      // Parsed components
      scheme: '',
      authority: '',
      hostname: '',
      _ipv4: undefined,
      _ipv6: undefined,
      port: '',
      domain: undefined,
      path: '',
      query: '',
      fragment: '',

      // One bit per component
      schemeBit: 0x01,
      userBit: 0x02,
      passwordBit: 0x04,
      hostnameBit: 0x08,
      portBit: 0x10,
      pathBit: 0x20,
      queryBit: 0x40,
      fragmentBit: 0x80,
      allBits: 0xFFFF
  };

  // Composite masks
  URI.authorityBit = URI.userBit |
                     URI.passwordBit |
                     URI.hostnameBit |
                     URI.portBit;
  URI.normalizeBits = URI.schemeBit |
                      URI.hostnameBit |
                      URI.pathBit |
                      URI.queryBit;
  100. /******************************************************************************/
  // See: https://en.wikipedia.org/wiki/URI_scheme#Examples
  // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  //
  //       foo://example.com:8042/over/there?name=ferret#nose
  //       \_/   \______________/\_________/ \_________/ \__/
  //        |           |            |            |        |
  //     scheme     authority       path        query   fragment
  //        |   _____________________|__
  //       / \ /                        \
  //       urn:example:animal:ferret:nose
  //
  // Parse `uri` and store its components on the receiver. Delimiters are
  // stripped from the stored values and the authority is lowercased.
  // Returns the receiver on success. When `uri` is undefined, or when the
  // authority cannot be parsed, the shared `URI` record is (partially) reset
  // and returned instead.
  URI.set = function(uri) {
      if ( uri === undefined ) {
          return reset(URI);
      }
      var parts = reRFC3986.exec(uri);
      if ( !parts ) {
          return reset(URI);
      }
      var rawScheme = parts[1];
      var rawAuthority = parts[2];
      var rawPath = parts[3];
      var rawQuery = parts[4];
      var rawFragment = parts[5];

      var authority = rawAuthority === undefined ?
          '' :
          rawAuthority.slice(2).toLowerCase();
      var path = rawPath === undefined ? '' : rawPath;
      // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
      // "In general, a URI that uses the generic syntax for authority
      // "with an empty path should be normalized to a path of '/'."
      if ( authority !== '' && path === '' ) {
          path = '/';
      }

      this.scheme = rawScheme === undefined ? '' : rawScheme.slice(0, -1);
      this.authority = authority;
      this.path = path;
      this.query = rawQuery === undefined ? '' : rawQuery.slice(1);
      this.fragment = rawFragment === undefined ? '' : rawFragment.slice(1);

      // Assume very simple authority, i.e. just a hostname (highest likelihood
      // case for µMatrix)
      if ( reHostFromNakedAuthority.test(authority) ) {
          this.hostname = authority;
          this.port = '';
          return this;
      }

      // Authority contains more than just a hostname: try the cheaper
      // host[:port] form first, then the bracketed IPv6 form.
      var hostPort = reHostPortFromAuthority.exec(authority) ||
                     reIPv6PortFromAuthority.exec(authority);
      if ( !hostPort ) {
          return resetAuthority(URI);
      }

      var hostname = hostPort[1] === undefined ? '' : hostPort[1];
      // http://en.wikipedia.org/wiki/FQDN
      // Drop the trailing dot of a fully qualified name.
      if ( hostname.charAt(hostname.length - 1) === '.' ) {
          hostname = hostname.slice(0, -1);
      }
      this.hostname = hostname;
      this.port = hostPort[2] === undefined ? '' : hostPort[2].slice(1);
      return this;
  };
  153. /******************************************************************************/
  // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  //
  //       foo://example.com:8042/over/there?name=ferret#nose
  //       \_/   \______________/\_________/ \_________/ \__/
  //        |           |            |            |        |
  //     scheme     authority       path        query   fragment
  //        |   _____________________|__
  //       / \ /                        \
  //       urn:example:animal:ferret:nose
  //
  // Rebuild a URI string from the components stored on the receiver.
  // `bits` is a mask of `*Bit` flags selecting which components to emit; all
  // components are selected when it is omitted. A component is emitted only
  // if it is selected and non-empty.
  URI.assemble = function(bits) {
      var mask = bits === undefined ? this.allBits : bits;
      var pieces = [];
      var emit = function(bit, before, value, after) {
          if ( value && (mask & bit) ) {
              pieces.push(before, value, after);
          }
      };
      emit(this.schemeBit, '', this.scheme, ':');
      emit(this.hostnameBit, '//', this.hostname, '');
      emit(this.portBit, ':', this.port, '');
      emit(this.pathBit, '', this.path, '');
      emit(this.queryBit, '?', this.query, '');
      emit(this.fragmentBit, '#', this.fragment, '');
      return pieces.join('');
  };
  188. /******************************************************************************/
  // Return the lowercased scheme of `uri`, without the trailing colon, or an
  // empty string if `uri` has no scheme.
  URI.schemeFromURI = function(uri) {
      var found = reSchemeFromURI.exec(uri);
      return found === null ? '' : found[0].slice(0, -1).toLowerCase();
  };
  196. /******************************************************************************/
  // Return the lowercased authority of `uri`, without the leading `//`, or an
  // empty string if `uri` has no (non-empty) authority.
  URI.authorityFromURI = function(uri) {
      var found = reAuthorityFromURI.exec(uri);
      return found ? found[1].slice(2).toLowerCase() : '';
  };
  204. /******************************************************************************/
  // The most used function, so it better be fast.
  // Return the lowercased hostname of `uri` (user info, port and trailing
  // dot removed), or an empty string if none can be extracted.
  URI.hostnameFromURI = function(uri) {
      // Fast path: plain lowercase http(s) URL with a bare hostname.
      var found = reCommonHostnameFromURL.exec(uri);
      if ( found ) {
          return found[1];
      }
      found = reAuthorityFromURI.exec(uri);
      if ( !found ) {
          return '';
      }
      var authority = found[1].slice(2);
      // Assume very simple authority (most common case for µMatrix)
      if ( reHostFromNakedAuthority.test(authority) ) {
          return authority.toLowerCase();
      }
      // Regular host first, bracketed IPv6 literal as a fallback.
      found = reHostFromAuthority.exec(authority) ||
              reIPv6FromAuthority.exec(authority);
      if ( !found ) {
          return '';
      }
      var hostname = found[1];
      // http://en.wikipedia.org/wiki/FQDN
      // Drop the trailing dot of a fully qualified name.
      if ( hostname.charAt(hostname.length - 1) === '.' ) {
          hostname = hostname.slice(0, -1);
      }
      return hostname.toLowerCase();
  };
  234. /******************************************************************************/
  // Return the domain for `hostname`. Results are memoized in the domain
  // cache; a cache hit refreshes the timestamp of the entry.
  URI.domainFromHostname = function(hostname) {
      // Try to skip looking up the PSL database
      if ( domainCache.hasOwnProperty(hostname) ) {
          var cached = domainCache[hostname];
          cached.tstamp = Date.now();
          return cached.domain;
      }
      // Meh.. will have to search it. Anything which looks like an IP address
      // is used as its own domain rather than being looked up.
      var domain = reIPAddressNaive.test(hostname) ?
          hostname :
          psl.getDomain(hostname);
      return domainCacheAdd(hostname, domain);
  };
  // Return the domain of the hostname currently stored on the receiver.
  URI.domain = function() {
      var hostname = this.hostname;
      return this.domainFromHostname(hostname);
  };

  // It is expected that there is higher-scoped `publicSuffixList` lingering
  // somewhere. Cache it. See <https://github.com/gorhill/publicsuffixlist.js>.
  var psl = publicSuffixList;
  254. /******************************************************************************/
  // Trying to alleviate the worries of looking up too often the domain name
  // from a hostname. The cache is expected to pay off because the same small
  // set of hostnames tends to be looked up repeatedly within a narrow time
  // span.
  //
  // The cache maps hostname => DomainCacheEntry. Once the number of entries
  // reaches the high water mark, the least recently used entries are evicted
  // until only the low water mark number of entries remains.
  var domainCache = {};
  var domainCacheCount = 0;
  var domainCacheCountLowWaterMark = 75;
  var domainCacheCountHighWaterMark = 100;

  // Disposed entries are parked here (up to 25 of them) for reuse, to avoid
  // allocating a new object for each new cache entry.
  var domainCacheEntryJunkyard = [];

  // A cache entry: the domain, and the time at which it was last used.
  var DomainCacheEntry = function(domain) {
      this.init(domain);
  };

  // (Re)initialize the entry for `domain` and stamp it with the current time.
  DomainCacheEntry.prototype.init = function(domain) {
      this.domain = domain;
      this.tstamp = Date.now();
      return this;
  };

  // Release the entry; it is kept for reuse if the junkyard has room.
  DomainCacheEntry.prototype.dispose = function() {
      this.domain = '';
      if ( domainCacheEntryJunkyard.length < 25 ) {
          domainCacheEntryJunkyard.push(this);
      }
  };

  // Return an initialized entry, recycled from the junkyard when possible.
  var domainCacheEntryFactory = function(domain) {
      var entry = domainCacheEntryJunkyard.pop();
      if ( entry ) {
          return entry.init(domain);
      }
      return new DomainCacheEntry(domain);
  };

  // Remember `domain` for `hostname` and return `domain`. If `hostname` is
  // already cached, only the timestamp of its entry is refreshed.
  var domainCacheAdd = function(hostname, domain) {
      if ( domainCache.hasOwnProperty(hostname) ) {
          domainCache[hostname].tstamp = Date.now();
      } else {
          domainCache[hostname] = domainCacheEntryFactory(domain);
          domainCacheCount += 1;
          if ( domainCacheCount === domainCacheCountHighWaterMark ) {
              domainCachePrune();
          }
      }
      return domain;
  };

  // Comparator for cache entries: most recently used first.
  var domainCacheEntrySort = function(a, b) {
      return b.tstamp - a.tstamp;
  };

  // Evict the least recently used entries, keeping only the low water mark
  // number of most recently used ones.
  var domainCachePrune = function() {
      // The keys being sorted are hostnames: the comparator must be handed
      // the entries these hostnames map to, not the hostname strings (which
      // have no `tstamp`, making every comparison yield NaN).
      var hostnames = Object.keys(domainCache)
          .sort(function(a, b) {
              return domainCacheEntrySort(domainCache[a], domainCache[b]);
          })
          .slice(domainCacheCountLowWaterMark);
      var i = hostnames.length;
      domainCacheCount -= i;
      var hostname;
      while ( i-- ) {
          hostname = hostnames[i];
          domainCache[hostname].dispose();
          delete domainCache[hostname];
      }
  };
  313. /******************************************************************************/
  // Return the domain of the hostname found in `uri`, or an empty string if
  // `uri` is falsy.
  URI.domainFromURI = function(uri) {
      return uri ?
          this.domainFromHostname(this.hostnameFromURI(uri)) :
          '';
  };
  320. /******************************************************************************/
  // Normalize the way µMatrix expects it: only the scheme, hostname, path
  // and query are assembled. Consequently these are left out:
  // - port
  // - user id/password
  // - fragment
  URI.normalizedURI = function() {
      var keptBits = this.normalizeBits;
      return this.assemble(keptBits);
  };
  329. /******************************************************************************/
  // Return `scheme://hostname` for the URI stored on the receiver, or an
  // empty string if it has no hostname.
  URI.rootURL = function() {
      return this.hostname ?
          this.assemble(this.schemeBit | this.hostnameBit) :
          '';
  };
  336. /******************************************************************************/
  // Return true if `hostname` passes the coarse hostname syntax test.
  // Should the test itself throw (presumably only possible when `hostname`
  // cannot be converted to a string), `hostname` is reported as invalid.
  URI.isValidHostname = function(hostname) {
      try {
          return reValidHostname.test(hostname);
      } catch (e) {
          return false;
      }
  };
  347. /******************************************************************************/
  // Return the parent hostname, i.e. `hostname` minus its first label, or
  // `undefined` if there is none. For IP address, there is no parent domain.
  //
  //   hostname                   domain          result
  //   `localhost`                ``              undefined
  //   `example.org`              `example.org`   undefined
  //   `www.example.org`          `example.org`   `example.org`
  //   `tomato.www.example.org`   `example.org`   `www.example.org`
  URI.parentHostnameFromHostname = function(hostname) {
      var domain = this.domainFromHostname(hostname);
      // There is a parent only when a domain is known and `hostname` sits
      // below it.
      var hasParent = domain !== '' && domain !== hostname;
      if ( !hasParent ) {
          return undefined;
      }
      // Parent is hostname minus first label
      var firstDot = hostname.indexOf('.');
      return hostname.slice(firstDot + 1);
  };
  365. /******************************************************************************/
  // Return all possible parent hostnames which can be derived from `hostname`,
  // ordered from direct parent up to domain inclusively. The result is empty
  // when `hostname` has no known domain or is itself the domain.
  URI.parentHostnamesFromHostname = function(hostname) {
      // TODO: I should create an object which is optimized to receive
      // the list of hostnames by making it reusable (junkyard etc.) and which
      // has its own element counter property in order to avoid memory
      // alloc/dealloc.
      var domain = this.domainFromHostname(hostname);
      if ( domain === '' || domain === hostname ) {
          return [];
      }
      var parents = [];
      var remainder = hostname;
      var dot = remainder.indexOf('.');
      // Peel off one leading label per iteration, stopping once the domain
      // has been collected or there are no more labels to remove.
      while ( dot >= 0 ) {
          remainder = remainder.slice(dot + 1);
          parents.push(remainder);
          if ( remainder === domain ) {
              break;
          }
          dot = remainder.indexOf('.');
      }
      return parents;
  };
  392. /******************************************************************************/
  // Return all possible hostnames which can be derived from `hostname`,
  // ordered from self up to domain inclusively.
  URI.allHostnamesFromHostname = function(hostname) {
      var parents = this.parentHostnamesFromHostname(hostname);
      return [hostname].concat(parents);
  };
  400. /******************************************************************************/
  // String form of the stored URI: every non-empty component, assembled.
  URI.toString = function() {
      var assembled = this.assemble();
      return assembled;
  };
  404. /******************************************************************************/
  405. // Export
  406. return URI;
  407. /******************************************************************************/
  408. })();
  409. /******************************************************************************/