You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

416 lines
14 KiB

10 years ago
10 years ago
  1. /*******************************************************************************
  2. µMatrix - a Chromium browser extension to black/white list requests.
  3. Copyright (C) 2014 Raymond Hill
  4. This program is free software: you can redistribute it and/or modify
  5. it under the terms of the GNU General Public License as published by
  6. the Free Software Foundation, either version 3 of the License, or
  7. (at your option) any later version.
  8. This program is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  11. GNU General Public License for more details.
  12. You should have received a copy of the GNU General Public License
  13. along with this program. If not, see {http://www.gnu.org/licenses/}.
  14. Home: https://github.com/gorhill/uMatrix
  15. */
  16. /* global µMatrix, publicSuffixList */
  17. /*******************************************************************************
  18. RFC 3986 as reference: http://tools.ietf.org/html/rfc3986#appendix-A
  19. Naming convention from https://en.wikipedia.org/wiki/URI_scheme#Examples
  20. */
  21. /******************************************************************************/
  22. // This will be inserted as a module in the µMatrix object.
  23. µMatrix.URI = (function() {
  24. /******************************************************************************/
  // Top-level URI splitter, adapted from the reference regex given in
  // RFC 3986: <http://tools.ietf.org/html/rfc3986#page-50>.
  // Delimiters are kept inside the captures:
  //   [1] scheme, including the trailing ':'
  //   [2] authority, including the leading '//'
  //   [3] path
  //   [4] query, including the leading '?'
  //   [5] fragment, including the leading '#'
  // Redundant capture groups were removed: capture less = perform faster.
  // Benchmark: <http://jsperf.com/old-uritools-vs-new-uritools>.
  // Performance improvements welcomed. Handy regex tool: http://regex101.com/
  var reRFC3986 = /^([^:\/?#]+:)?(\/\/[^\/?#]*)?([^?#]*)(\?[^#]*)?(#.*)?/;

  // Derived from the regex above, for when only one component is needed.
  // The whole match of `reSchemeFromURI` is the scheme plus its trailing ':'.
  var reSchemeFromURI = /^[^:\/?#]+:/;
  // Capture [1] of `reAuthorityFromURI` is the non-empty authority plus its
  // leading '//'.
  var reAuthorityFromURI = /^(?:[^:\/?#]+:)?(\/\/[^\/?#]+)/;
  // The regexes below parse the authority field, which the RFC 3986 splitter
  // leaves untouched. An optional `userinfo@` prefix is skipped, never
  // captured.
  //
  // IPv6 is treated as the exception: the cheaper pattern which cannot match
  // a bracketed IPv6 literal is tried first, and only if it fails is the
  // IPv6-aware pattern used. This helps performance by not running an overly
  // complicated regex in the common case.
  //
  // Underscores are accepted in hostnames, see
  // https://github.com/gorhill/httpswitchboard/issues/211
  // "While a hostname may not contain other characters, such as the
  // "underscore character (_), other DNS names may contain the underscore"

  // Capture [1] = hostname (possibly empty), [2] = ':' + port (optional).
  var reHostPortFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]*)(:\d*)?$/i;
  // Same captures, for a bracketed IPv6 literal.
  var reIPv6PortFromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]*\])(:\d*)?$/i;
  // Authority which is nothing but a hostname: no userinfo, no port, and
  // ending with a letter or digit (so no trailing dot).
  var reHostFromNakedAuthority = /^[0-9a-z._-]+[0-9a-z]$/i;
  // Capture [1] = non-empty hostname; the port, if any, is matched but not
  // captured.
  var reHostFromAuthority = /^(?:[^@]*@)?([0-9a-z._-]+)(?::\d*)?$/i;
  // Same, for a bracketed IPv6 literal.
  var reIPv6FromAuthority = /^(?:[^@]*@)?(\[[0-9a-f:]+\])(?::\d*)?$/i;
  // Coarse (but fast) tests.
  //
  // `reValidHostname` accepts one or more dot-separated labels, each made of
  // lowercase letters and digits, with dashes allowed only strictly inside a
  // label (a label can neither start nor end with a dash). There is no `i`
  // flag: uppercase input is rejected.
  //
  // The pattern is deliberately written so that every character can be
  // consumed in only one way (`-+` instead of `-*` between alphanumeric
  // runs). A nested-quantifier form such as `[a-z\d]+(-*[a-z\d]+)*` accepts
  // the same strings but backtracks exponentially on a long alphanumeric run
  // followed by an invalid character, freezing the caller.
  var reValidHostname = /^[a-z\d]+(?:-+[a-z\d]+)*(?:\.[a-z\d]+(?:-+[a-z\d]+)*)*$/;

  // `reIPAddressNaive` only checks the general shape: four dot-separated runs
  // of digits, or anything made of letters, digits and colons between square
  // brackets. It does not validate the address itself.
  var reIPAddressNaive = /^\d+\.\d+\.\d+\.\d+$|^\[[\da-zA-Z:]+\]$/;
  // Accurate tests.
  //
  // IPv4 dotted quad: exactly four octets in the range 0-255, separated by
  // single dots, and nothing else before or after. Octet pattern taken from
  // http://stackoverflow.com/questions/5284147/validating-ipv4-addresses-with-regexp/5284410#5284410
  // The pattern is anchored at both ends so that a trailing dot or trailing
  // garbage (`1.2.3.4.`, `1.2.3.4.5`) is rejected.
  var reIPv4 = /^(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$/;

  // IPv6 address without enclosing brackets; surrounding whitespace and a
  // trailing `%...` part are tolerated by the pattern.
  // Source: http://forums.intermapper.com/viewtopic.php?p=1096#1096
  var reIPv6 = /^\s*((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(\.(25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3}))|:)))(%.+)?\s*$/;
  55. /******************************************************************************/
  // Clear every parsed URI component stored on `o` and return `o`.
  //
  // String components are set to the empty string, and the `_ipv4`/`_ipv6`
  // slots to `undefined`. `authority` is cleared along with the fields
  // derived from it, so that a reset object never exposes the authority of a
  // previously parsed URI. Properties other than those listed here are left
  // untouched.
  //
  // @param {Object} o - Object holding URI components (mutated in place).
  // @returns {Object} The same object `o`.
  var reset = function(o) {
      o.scheme = '';
      o.authority = '';
      o.hostname = '';
      o._ipv4 = undefined;
      o._ipv6 = undefined;
      o.port = '';
      o.path = '';
      o.query = '';
      o.fragment = '';
      return o;
  };
  // Clear only the components derived from the authority field (`hostname`,
  // `port` and the `_ipv4`/`_ipv6` slots) on `o`. Every other property,
  // including the raw `authority` string, is left as is.
  //
  // @param {Object} o - Object holding URI components (mutated in place).
  // @returns {Object} The same object `o`.
  var resetAuthority = function(o) {
      o.hostname = '';
      o.port = '';
      o._ipv4 = undefined;
      o._ipv6 = undefined;
      return o;
  };
  74. /******************************************************************************/
  // The object exported by this module. It doubles as the holder of the
  // components of the most recently parsed URI and as the namespace for the
  // helper functions attached to it further down.
  var URI = {
      // Components of the current URI
      scheme: '',
      authority: '',
      hostname: '',
      _ipv4: undefined,
      _ipv6: undefined,
      port: '',
      // Placeholder: a `domain` function is assigned to this property later
      // in this module.
      domain: undefined,
      path: '',
      query: '',
      fragment: '',

      // One bit per URI component, to select what `assemble` outputs
      schemeBit: 0x01,
      userBit: 0x02,
      passwordBit: 0x04,
      hostnameBit: 0x08,
      portBit: 0x10,
      pathBit: 0x20,
      queryBit: 0x40,
      fragmentBit: 0x80,
      allBits: 0xFFFF
  };

  // Composite masks: everything which belongs to the authority field...
  URI.authorityBit = URI.userBit | URI.passwordBit | URI.hostnameBit | URI.portBit;
  // ... and the components retained by `normalizedURI`.
  URI.normalizeBits = URI.schemeBit | URI.hostnameBit | URI.pathBit | URI.queryBit;
  99. /******************************************************************************/
  // Parse `uri` and store its components on the receiver.
  //
  // See: https://en.wikipedia.org/wiki/URI_scheme#Examples
  //
  //   URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  //
  //     foo://example.com:8042/over/there?name=ferret#nose
  //     \_/   \______________/\_________/ \_________/ \__/
  //      |           |            |            |        |
  //   scheme     authority       path        query   fragment
  //      |   _____________________|__
  //     / \ /                        \
  //     urn:example:animal:ferret:nose
  //
  // - `scheme`, `query` and `fragment` are stored without their delimiters.
  // - `authority` is stored lowercased, without the leading '//'.
  // - `hostname` loses one trailing dot, if any; `port` is stored without ':'.
  //
  // @param {string} [uri] - The URI to parse. When `undefined`, the module
  //     object is reset instead.
  // @returns {Object} The receiver on success; otherwise whatever `reset` or
  //     `resetAuthority` returns for the module-level `URI` object.
  URI.set = function(uri) {
      if ( uri === undefined ) {
          return reset(URI);
      }
      var parts = reRFC3986.exec(uri);
      if ( !parts ) {
          return reset(URI);
      }

      var rawScheme = parts[1];
      var rawAuthority = parts[2];
      var rawPath = parts[3];
      var rawQuery = parts[4];
      var rawFragment = parts[5];

      this.scheme = rawScheme === undefined ? '' : rawScheme.slice(0, -1);
      this.authority = rawAuthority === undefined ? '' : rawAuthority.slice(2).toLowerCase();
      this.path = rawPath === undefined ? '' : rawPath;
      // <http://tools.ietf.org/html/rfc3986#section-6.2.3>
      // "In general, a URI that uses the generic syntax for authority
      // "with an empty path should be normalized to a path of '/'."
      if ( this.path === '' && this.authority !== '' ) {
          this.path = '/';
      }
      this.query = rawQuery === undefined ? '' : rawQuery.slice(1);
      this.fragment = rawFragment === undefined ? '' : rawFragment.slice(1);

      // Fast path: the authority is just a hostname, which is the most
      // likely case for µMatrix.
      if ( reHostFromNakedAuthority.test(this.authority) ) {
          this.hostname = this.authority;
          this.port = '';
          return this;
      }

      // Slow path: the authority holds more than a hostname. The plain
      // pattern is tried first, the bracketed IPv6 one only as a fallback.
      var hostPort = reHostPortFromAuthority.exec(this.authority) ||
                     reIPv6PortFromAuthority.exec(this.authority);
      if ( !hostPort ) {
          return resetAuthority(URI);
      }

      var hostname = hostPort[1] === undefined ? '' : hostPort[1];
      // Drop the trailing dot of a fully qualified name, see
      // http://en.wikipedia.org/wiki/FQDN
      if ( hostname.slice(-1) === '.' ) {
          hostname = hostname.slice(0, -1);
      }
      this.hostname = hostname;
      this.port = hostPort[2] === undefined ? '' : hostPort[2].slice(1);
      return this;
  };
  152. /******************************************************************************/
  // Build a URI string out of the components currently stored on the
  // receiver.
  //
  //   URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
  //
  //     foo://example.com:8042/over/there?name=ferret#nose
  //     \_/   \______________/\_________/ \_________/ \__/
  //      |           |            |            |        |
  //   scheme     authority       path        query   fragment
  //      |   _____________________|__
  //     / \ /                        \
  //     urn:example:animal:ferret:nose
  //
  // A component is output, with its delimiter, only when it is non-empty and
  // its bit is set in `bits`. Only scheme, hostname, port, path, query and
  // fragment are ever output.
  //
  // @param {number} [bits] - OR-ed `*Bit` flags selecting the components to
  //     output. Defaults to `allBits`.
  // @returns {string} The assembled URI.
  URI.assemble = function(bits) {
      var mask = bits === undefined ? this.allBits : bits;
      var pieces = [];
      if ( (mask & this.schemeBit) && this.scheme ) {
          pieces.push(this.scheme, ':');
      }
      if ( (mask & this.hostnameBit) && this.hostname ) {
          pieces.push('//', this.hostname);
      }
      if ( (mask & this.portBit) && this.port ) {
          pieces.push(':', this.port);
      }
      if ( (mask & this.pathBit) && this.path ) {
          pieces.push(this.path);
      }
      if ( (mask & this.queryBit) && this.query ) {
          pieces.push('?', this.query);
      }
      if ( (mask & this.fragmentBit) && this.fragment ) {
          pieces.push('#', this.fragment);
      }
      return pieces.join('');
  };
  187. /******************************************************************************/
  // Extract the scheme of `uri` without parsing the whole URI.
  //
  // @param {string} uri
  // @returns {string} The lowercased scheme without its trailing ':', or
  //     the empty string when `uri` has no scheme.
  URI.schemeFromURI = function(uri) {
      var found = reSchemeFromURI.exec(uri);
      return found === null ? '' : found[0].slice(0, -1).toLowerCase();
  };
  195. /******************************************************************************/
  // Extract the authority of `uri` without parsing the whole URI.
  //
  // @param {string} uri
  // @returns {string} The lowercased authority without its leading '//',
  //     or the empty string when `uri` has no (or an empty) authority.
  URI.authorityFromURI = function(uri) {
      var found = reAuthorityFromURI.exec(uri);
      return found ? found[1].slice(2).toLowerCase() : '';
  };
  203. /******************************************************************************/
  // Extract the hostname of `uri`. This is the most used function, so it
  // better be fast: nothing is stored on the module object, and the cheapest
  // checks are performed first.
  //
  // @param {string} uri
  // @returns {string} The lowercased hostname, stripped of one trailing dot
  //     if any, or the empty string when no hostname can be extracted.
  URI.hostnameFromURI = function(uri) {
      var found = reAuthorityFromURI.exec(uri);
      if ( !found ) {
          return '';
      }
      var authority = found[1].slice(2);

      // Fast path: the authority is just a hostname (most common case for
      // µMatrix).
      if ( reHostFromNakedAuthority.test(authority) ) {
          return authority.toLowerCase();
      }

      // Slow path: skip userinfo/port; try the plain pattern first, and the
      // bracketed IPv6 one only as a fallback.
      found = reHostFromAuthority.exec(authority) ||
              reIPv6FromAuthority.exec(authority);
      if ( !found ) {
          return '';
      }

      // Drop the trailing dot of a fully qualified name, see
      // http://en.wikipedia.org/wiki/FQDN
      var hostname = found[1];
      var bare = hostname.slice(-1) === '.' ? hostname.slice(0, -1) : hostname;
      return bare.toLowerCase();
  };
  229. /******************************************************************************/
  // A `publicSuffixList` object is expected to exist in a higher scope at the
  // time this module is evaluated; keep a local reference to it.
  // See <https://github.com/gorhill/publicsuffixlist.js>.
  var psl = publicSuffixList;

  // Return the domain of `hostname`.
  //
  // Anything which merely looks like an IP address (see `reIPAddressNaive`)
  // is returned unchanged; everything else is delegated to
  // `psl.getDomain()`.
  //
  // @param {string} hostname
  // @returns {string} `hostname` itself for IP-like input, otherwise the
  //     result of `psl.getDomain(hostname)`.
  URI.domainFromHostname = function(hostname) {
      if ( reIPAddressNaive.test(hostname) ) {
          return hostname;
      }
      return psl.getDomain(hostname);
  };
  // Return the domain of the hostname currently stored on the receiver.
  //
  // @returns {string} Result of `domainFromHostname` for `this.hostname`.
  URI.domain = function() {
      var currentHostname = this.hostname;
      return this.domainFromHostname(currentHostname);
  };
  242. /******************************************************************************/
  // Return the domain of the hostname found in `uri`.
  //
  // @param {string} uri
  // @returns {string} The empty string when `uri` is falsy, otherwise the
  //     result of `domainFromHostname` applied to `hostnameFromURI(uri)`.
  URI.domainFromURI = function(uri) {
      return uri ? this.domainFromHostname(this.hostnameFromURI(uri)) : '';
  };
  249. /******************************************************************************/
  // Assemble the current URI the way µMatrix expects it: only the components
  // selected by `normalizeBits` are kept. Consequently, these are left out:
  // - port
  // - user id/password
  // - fragment
  //
  // @returns {string} The normalized URI.
  URI.normalizedURI = function() {
      var keptBits = this.normalizeBits;
      return this.assemble(keptBits);
  };
  258. /******************************************************************************/
  // Assemble the scheme and hostname parts of the current URI.
  //
  // @returns {string} The empty string when no hostname is currently stored,
  //     otherwise the result of `assemble` restricted to scheme and hostname.
  URI.rootURL = function() {
      return this.hostname ? this.assemble(this.schemeBit | this.hostnameBit) : '';
  };
  265. /******************************************************************************/
  // Coarse validity check of `hostname` against `reValidHostname`.
  //
  // @param {*} hostname - Value to test.
  // @returns {boolean} Whether `hostname` matches; `false` as well if
  //     running the test throws.
  URI.isValidHostname = function(hostname) {
      try {
          return reValidHostname.test(hostname);
      } catch (ex) {
          return false;
      }
  };
  276. /******************************************************************************/
  // Return the direct parent of `hostname`, i.e. `hostname` minus its first
  // label. There is no parent when `hostname` is its own domain (which
  // includes IP addresses) or when no domain can be derived from it.
  //
  // Domain derived from the hostname, and resulting decision (the `` result
  // for `localhost` is what the original author documented for
  // `psl.getDomain`):
  //   `localhost`              => ``            => no parent
  //   `example.org`            => `example.org` => no parent
  //   `www.example.org`        => `example.org` => parent exists
  //   `tomato.www.example.org` => `example.org` => parent exists
  //
  // @param {string} hostname
  // @returns {string|undefined} The parent hostname, or `undefined` when
  //     there is none.
  URI.parentHostnameFromHostname = function(hostname) {
      var domain = this.domainFromHostname(hostname);
      if ( domain !== '' && domain !== hostname ) {
          // Parent is hostname minus first label
          var firstDot = hostname.indexOf('.');
          return hostname.slice(firstDot + 1);
      }
      return undefined;
  };
  294. /******************************************************************************/
  // Return all the parent hostnames which can be derived from `hostname`,
  // ordered from the direct parent up to the domain, inclusively. The list is
  // empty when `hostname` is its own domain or when no domain can be derived
  // from it.
  //
  // TODO: create an object optimized to receive the list of hostnames, by
  // making it reusable (junkyard etc.) and giving it its own element
  // counter property, in order to avoid memory alloc/dealloc.
  //
  // @param {string} hostname
  // @returns {string[]} A new array of parent hostnames.
  URI.parentHostnamesFromHostname = function(hostname) {
      var parents = [];
      var domain = this.domainFromHostname(hostname);
      if ( domain === '' || domain === hostname ) {
          return parents;
      }
      // Strip one leading label at a time until the domain is reached or
      // there is no label left to strip.
      var remainder = hostname;
      var dot = remainder.indexOf('.');
      while ( dot >= 0 ) {
          remainder = remainder.slice(dot + 1);
          parents.push(remainder);
          if ( remainder === domain ) {
              break;
          }
          dot = remainder.indexOf('.');
      }
      return parents;
  };
  321. /******************************************************************************/
  // Return all the hostnames which can be derived from `hostname`, ordered
  // from `hostname` itself up to the domain, inclusively.
  //
  // @param {string} hostname
  // @returns {string[]} `hostname` followed by its parent hostnames.
  URI.allHostnamesFromHostname = function(hostname) {
      var parents = this.parentHostnamesFromHostname(hostname);
      return [hostname].concat(parents);
  };
  329. /******************************************************************************/
  // String form of the current URI: `assemble` called with its default bits.
  //
  // @returns {string}
  URI.toString = function() {
      var assembled = this.assemble();
      return assembled;
  };
  333. /******************************************************************************/
  334. // Export
  335. return URI;
  336. /******************************************************************************/
  337. })();
  338. /******************************************************************************/