You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

831 lines
17 KiB

// Package cascadia is an implementation of CSS selectors.
  2. package cascadia
  3. import (
  4. "errors"
  5. "fmt"
  6. "regexp"
  7. "strconv"
  8. "strings"
  9. "golang.org/x/net/html"
  10. )
// parser holds the state needed to parse a CSS selector string.
// The parse methods read bytes of s starting at index i and advance i
// past whatever they consume.
type parser struct {
	s string // the source text
	i int    // the current position (byte index into s)
}
  16. // parseEscape parses a backslash escape.
  17. func (p *parser) parseEscape() (result string, err error) {
  18. if len(p.s) < p.i+2 || p.s[p.i] != '\\' {
  19. return "", errors.New("invalid escape sequence")
  20. }
  21. start := p.i + 1
  22. c := p.s[start]
  23. switch {
  24. case c == '\r' || c == '\n' || c == '\f':
  25. return "", errors.New("escaped line ending outside string")
  26. case hexDigit(c):
  27. // unicode escape (hex)
  28. var i int
  29. for i = start; i < p.i+6 && i < len(p.s) && hexDigit(p.s[i]); i++ {
  30. // empty
  31. }
  32. v, _ := strconv.ParseUint(p.s[start:i], 16, 21)
  33. if len(p.s) > i {
  34. switch p.s[i] {
  35. case '\r':
  36. i++
  37. if len(p.s) > i && p.s[i] == '\n' {
  38. i++
  39. }
  40. case ' ', '\t', '\n', '\f':
  41. i++
  42. }
  43. }
  44. p.i = i
  45. return string(rune(v)), nil
  46. }
  47. // Return the literal character after the backslash.
  48. result = p.s[start : start+1]
  49. p.i += 2
  50. return result, nil
  51. }
  52. func hexDigit(c byte) bool {
  53. return '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F'
  54. }
  55. // nameStart returns whether c can be the first character of an identifier
  56. // (not counting an initial hyphen, or an escape sequence).
  57. func nameStart(c byte) bool {
  58. return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' || c > 127
  59. }
  60. // nameChar returns whether c can be a character within an identifier
  61. // (not counting an escape sequence).
  62. func nameChar(c byte) bool {
  63. return 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '_' || c > 127 ||
  64. c == '-' || '0' <= c && c <= '9'
  65. }
  66. // parseIdentifier parses an identifier.
  67. func (p *parser) parseIdentifier() (result string, err error) {
  68. startingDash := false
  69. if len(p.s) > p.i && p.s[p.i] == '-' {
  70. startingDash = true
  71. p.i++
  72. }
  73. if len(p.s) <= p.i {
  74. return "", errors.New("expected identifier, found EOF instead")
  75. }
  76. if c := p.s[p.i]; !(nameStart(c) || c == '\\') {
  77. return "", fmt.Errorf("expected identifier, found %c instead", c)
  78. }
  79. result, err = p.parseName()
  80. if startingDash && err == nil {
  81. result = "-" + result
  82. }
  83. return
  84. }
  85. // parseName parses a name (which is like an identifier, but doesn't have
  86. // extra restrictions on the first character).
  87. func (p *parser) parseName() (result string, err error) {
  88. i := p.i
  89. loop:
  90. for i < len(p.s) {
  91. c := p.s[i]
  92. switch {
  93. case nameChar(c):
  94. start := i
  95. for i < len(p.s) && nameChar(p.s[i]) {
  96. i++
  97. }
  98. result += p.s[start:i]
  99. case c == '\\':
  100. p.i = i
  101. val, err := p.parseEscape()
  102. if err != nil {
  103. return "", err
  104. }
  105. i = p.i
  106. result += val
  107. default:
  108. break loop
  109. }
  110. }
  111. if result == "" {
  112. return "", errors.New("expected name, found EOF instead")
  113. }
  114. p.i = i
  115. return result, nil
  116. }
  117. // parseString parses a single- or double-quoted string.
  118. func (p *parser) parseString() (result string, err error) {
  119. i := p.i
  120. if len(p.s) < i+2 {
  121. return "", errors.New("expected string, found EOF instead")
  122. }
  123. quote := p.s[i]
  124. i++
  125. loop:
  126. for i < len(p.s) {
  127. switch p.s[i] {
  128. case '\\':
  129. if len(p.s) > i+1 {
  130. switch c := p.s[i+1]; c {
  131. case '\r':
  132. if len(p.s) > i+2 && p.s[i+2] == '\n' {
  133. i += 3
  134. continue loop
  135. }
  136. fallthrough
  137. case '\n', '\f':
  138. i += 2
  139. continue loop
  140. }
  141. }
  142. p.i = i
  143. val, err := p.parseEscape()
  144. if err != nil {
  145. return "", err
  146. }
  147. i = p.i
  148. result += val
  149. case quote:
  150. break loop
  151. case '\r', '\n', '\f':
  152. return "", errors.New("unexpected end of line in string")
  153. default:
  154. start := i
  155. for i < len(p.s) {
  156. if c := p.s[i]; c == quote || c == '\\' || c == '\r' || c == '\n' || c == '\f' {
  157. break
  158. }
  159. i++
  160. }
  161. result += p.s[start:i]
  162. }
  163. }
  164. if i >= len(p.s) {
  165. return "", errors.New("EOF in string")
  166. }
  167. // Consume the final quote.
  168. i++
  169. p.i = i
  170. return result, nil
  171. }
  172. // parseRegex parses a regular expression; the end is defined by encountering an
  173. // unmatched closing ')' or ']' which is not consumed
  174. func (p *parser) parseRegex() (rx *regexp.Regexp, err error) {
  175. i := p.i
  176. if len(p.s) < i+2 {
  177. return nil, errors.New("expected regular expression, found EOF instead")
  178. }
  179. // number of open parens or brackets;
  180. // when it becomes negative, finished parsing regex
  181. open := 0
  182. loop:
  183. for i < len(p.s) {
  184. switch p.s[i] {
  185. case '(', '[':
  186. open++
  187. case ')', ']':
  188. open--
  189. if open < 0 {
  190. break loop
  191. }
  192. }
  193. i++
  194. }
  195. if i >= len(p.s) {
  196. return nil, errors.New("EOF in regular expression")
  197. }
  198. rx, err = regexp.Compile(p.s[p.i:i])
  199. p.i = i
  200. return rx, err
  201. }
  202. // skipWhitespace consumes whitespace characters and comments.
  203. // It returns true if there was actually anything to skip.
  204. func (p *parser) skipWhitespace() bool {
  205. i := p.i
  206. for i < len(p.s) {
  207. switch p.s[i] {
  208. case ' ', '\t', '\r', '\n', '\f':
  209. i++
  210. continue
  211. case '/':
  212. if strings.HasPrefix(p.s[i:], "/*") {
  213. end := strings.Index(p.s[i+len("/*"):], "*/")
  214. if end != -1 {
  215. i += end + len("/**/")
  216. continue
  217. }
  218. }
  219. }
  220. break
  221. }
  222. if i > p.i {
  223. p.i = i
  224. return true
  225. }
  226. return false
  227. }
  228. // consumeParenthesis consumes an opening parenthesis and any following
  229. // whitespace. It returns true if there was actually a parenthesis to skip.
  230. func (p *parser) consumeParenthesis() bool {
  231. if p.i < len(p.s) && p.s[p.i] == '(' {
  232. p.i++
  233. p.skipWhitespace()
  234. return true
  235. }
  236. return false
  237. }
  238. // consumeClosingParenthesis consumes a closing parenthesis and any preceding
  239. // whitespace. It returns true if there was actually a parenthesis to skip.
  240. func (p *parser) consumeClosingParenthesis() bool {
  241. i := p.i
  242. p.skipWhitespace()
  243. if p.i < len(p.s) && p.s[p.i] == ')' {
  244. p.i++
  245. return true
  246. }
  247. p.i = i
  248. return false
  249. }
  250. // parseTypeSelector parses a type selector (one that matches by tag name).
  251. func (p *parser) parseTypeSelector() (result Selector, err error) {
  252. tag, err := p.parseIdentifier()
  253. if err != nil {
  254. return nil, err
  255. }
  256. return typeSelector(tag), nil
  257. }
  258. // parseIDSelector parses a selector that matches by id attribute.
  259. func (p *parser) parseIDSelector() (Selector, error) {
  260. if p.i >= len(p.s) {
  261. return nil, fmt.Errorf("expected id selector (#id), found EOF instead")
  262. }
  263. if p.s[p.i] != '#' {
  264. return nil, fmt.Errorf("expected id selector (#id), found '%c' instead", p.s[p.i])
  265. }
  266. p.i++
  267. id, err := p.parseName()
  268. if err != nil {
  269. return nil, err
  270. }
  271. return attributeEqualsSelector("id", id), nil
  272. }
  273. // parseClassSelector parses a selector that matches by class attribute.
  274. func (p *parser) parseClassSelector() (Selector, error) {
  275. if p.i >= len(p.s) {
  276. return nil, fmt.Errorf("expected class selector (.class), found EOF instead")
  277. }
  278. if p.s[p.i] != '.' {
  279. return nil, fmt.Errorf("expected class selector (.class), found '%c' instead", p.s[p.i])
  280. }
  281. p.i++
  282. class, err := p.parseIdentifier()
  283. if err != nil {
  284. return nil, err
  285. }
  286. return attributeIncludesSelector("class", class), nil
  287. }
  288. // parseAttributeSelector parses a selector that matches by attribute value.
  289. func (p *parser) parseAttributeSelector() (Selector, error) {
  290. if p.i >= len(p.s) {
  291. return nil, fmt.Errorf("expected attribute selector ([attribute]), found EOF instead")
  292. }
  293. if p.s[p.i] != '[' {
  294. return nil, fmt.Errorf("expected attribute selector ([attribute]), found '%c' instead", p.s[p.i])
  295. }
  296. p.i++
  297. p.skipWhitespace()
  298. key, err := p.parseIdentifier()
  299. if err != nil {
  300. return nil, err
  301. }
  302. p.skipWhitespace()
  303. if p.i >= len(p.s) {
  304. return nil, errors.New("unexpected EOF in attribute selector")
  305. }
  306. if p.s[p.i] == ']' {
  307. p.i++
  308. return attributeExistsSelector(key), nil
  309. }
  310. if p.i+2 >= len(p.s) {
  311. return nil, errors.New("unexpected EOF in attribute selector")
  312. }
  313. op := p.s[p.i : p.i+2]
  314. if op[0] == '=' {
  315. op = "="
  316. } else if op[1] != '=' {
  317. return nil, fmt.Errorf(`expected equality operator, found "%s" instead`, op)
  318. }
  319. p.i += len(op)
  320. p.skipWhitespace()
  321. if p.i >= len(p.s) {
  322. return nil, errors.New("unexpected EOF in attribute selector")
  323. }
  324. var val string
  325. var rx *regexp.Regexp
  326. if op == "#=" {
  327. rx, err = p.parseRegex()
  328. } else {
  329. switch p.s[p.i] {
  330. case '\'', '"':
  331. val, err = p.parseString()
  332. default:
  333. val, err = p.parseIdentifier()
  334. }
  335. }
  336. if err != nil {
  337. return nil, err
  338. }
  339. p.skipWhitespace()
  340. if p.i >= len(p.s) {
  341. return nil, errors.New("unexpected EOF in attribute selector")
  342. }
  343. if p.s[p.i] != ']' {
  344. return nil, fmt.Errorf("expected ']', found '%c' instead", p.s[p.i])
  345. }
  346. p.i++
  347. switch op {
  348. case "=":
  349. return attributeEqualsSelector(key, val), nil
  350. case "~=":
  351. return attributeIncludesSelector(key, val), nil
  352. case "|=":
  353. return attributeDashmatchSelector(key, val), nil
  354. case "^=":
  355. return attributePrefixSelector(key, val), nil
  356. case "$=":
  357. return attributeSuffixSelector(key, val), nil
  358. case "*=":
  359. return attributeSubstringSelector(key, val), nil
  360. case "#=":
  361. return attributeRegexSelector(key, rx), nil
  362. }
  363. return nil, fmt.Errorf("attribute operator %q is not supported", op)
  364. }
  365. var expectedParenthesis = errors.New("expected '(' but didn't find it")
  366. var expectedClosingParenthesis = errors.New("expected ')' but didn't find it")
  367. var unmatchedParenthesis = errors.New("unmatched '('")
  368. // parsePseudoclassSelector parses a pseudoclass selector like :not(p).
  369. func (p *parser) parsePseudoclassSelector() (Selector, error) {
  370. if p.i >= len(p.s) {
  371. return nil, fmt.Errorf("expected pseudoclass selector (:pseudoclass), found EOF instead")
  372. }
  373. if p.s[p.i] != ':' {
  374. return nil, fmt.Errorf("expected attribute selector (:pseudoclass), found '%c' instead", p.s[p.i])
  375. }
  376. p.i++
  377. name, err := p.parseIdentifier()
  378. if err != nil {
  379. return nil, err
  380. }
  381. name = toLowerASCII(name)
  382. switch name {
  383. case "not", "has", "haschild":
  384. if !p.consumeParenthesis() {
  385. return nil, expectedParenthesis
  386. }
  387. sel, err := p.parseSelectorGroup()
  388. if err != nil {
  389. return nil, err
  390. }
  391. if !p.consumeClosingParenthesis() {
  392. return nil, expectedClosingParenthesis
  393. }
  394. switch name {
  395. case "not":
  396. return negatedSelector(sel), nil
  397. case "has":
  398. return hasDescendantSelector(sel), nil
  399. case "haschild":
  400. return hasChildSelector(sel), nil
  401. }
  402. case "contains", "containsown":
  403. if !p.consumeParenthesis() {
  404. return nil, expectedParenthesis
  405. }
  406. if p.i == len(p.s) {
  407. return nil, unmatchedParenthesis
  408. }
  409. var val string
  410. switch p.s[p.i] {
  411. case '\'', '"':
  412. val, err = p.parseString()
  413. default:
  414. val, err = p.parseIdentifier()
  415. }
  416. if err != nil {
  417. return nil, err
  418. }
  419. val = strings.ToLower(val)
  420. p.skipWhitespace()
  421. if p.i >= len(p.s) {
  422. return nil, errors.New("unexpected EOF in pseudo selector")
  423. }
  424. if !p.consumeClosingParenthesis() {
  425. return nil, expectedClosingParenthesis
  426. }
  427. switch name {
  428. case "contains":
  429. return textSubstrSelector(val), nil
  430. case "containsown":
  431. return ownTextSubstrSelector(val), nil
  432. }
  433. case "matches", "matchesown":
  434. if !p.consumeParenthesis() {
  435. return nil, expectedParenthesis
  436. }
  437. rx, err := p.parseRegex()
  438. if err != nil {
  439. return nil, err
  440. }
  441. if p.i >= len(p.s) {
  442. return nil, errors.New("unexpected EOF in pseudo selector")
  443. }
  444. if !p.consumeClosingParenthesis() {
  445. return nil, expectedClosingParenthesis
  446. }
  447. switch name {
  448. case "matches":
  449. return textRegexSelector(rx), nil
  450. case "matchesown":
  451. return ownTextRegexSelector(rx), nil
  452. }
  453. case "nth-child", "nth-last-child", "nth-of-type", "nth-last-of-type":
  454. if !p.consumeParenthesis() {
  455. return nil, expectedParenthesis
  456. }
  457. a, b, err := p.parseNth()
  458. if err != nil {
  459. return nil, err
  460. }
  461. if !p.consumeClosingParenthesis() {
  462. return nil, expectedClosingParenthesis
  463. }
  464. if a == 0 {
  465. switch name {
  466. case "nth-child":
  467. return simpleNthChildSelector(b, false), nil
  468. case "nth-of-type":
  469. return simpleNthChildSelector(b, true), nil
  470. case "nth-last-child":
  471. return simpleNthLastChildSelector(b, false), nil
  472. case "nth-last-of-type":
  473. return simpleNthLastChildSelector(b, true), nil
  474. }
  475. }
  476. return nthChildSelector(a, b,
  477. name == "nth-last-child" || name == "nth-last-of-type",
  478. name == "nth-of-type" || name == "nth-last-of-type"),
  479. nil
  480. case "first-child":
  481. return simpleNthChildSelector(1, false), nil
  482. case "last-child":
  483. return simpleNthLastChildSelector(1, false), nil
  484. case "first-of-type":
  485. return simpleNthChildSelector(1, true), nil
  486. case "last-of-type":
  487. return simpleNthLastChildSelector(1, true), nil
  488. case "only-child":
  489. return onlyChildSelector(false), nil
  490. case "only-of-type":
  491. return onlyChildSelector(true), nil
  492. case "input":
  493. return inputSelector, nil
  494. case "empty":
  495. return emptyElementSelector, nil
  496. }
  497. return nil, fmt.Errorf("unknown pseudoclass :%s", name)
  498. }
  499. // parseInteger parses a decimal integer.
  500. func (p *parser) parseInteger() (int, error) {
  501. i := p.i
  502. start := i
  503. for i < len(p.s) && '0' <= p.s[i] && p.s[i] <= '9' {
  504. i++
  505. }
  506. if i == start {
  507. return 0, errors.New("expected integer, but didn't find it.")
  508. }
  509. p.i = i
  510. val, err := strconv.Atoi(p.s[start:i])
  511. if err != nil {
  512. return 0, err
  513. }
  514. return val, nil
  515. }
  516. // parseNth parses the argument for :nth-child (normally of the form an+b).
  517. func (p *parser) parseNth() (a, b int, err error) {
  518. // initial state
  519. if p.i >= len(p.s) {
  520. goto eof
  521. }
  522. switch p.s[p.i] {
  523. case '-':
  524. p.i++
  525. goto negativeA
  526. case '+':
  527. p.i++
  528. goto positiveA
  529. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  530. goto positiveA
  531. case 'n', 'N':
  532. a = 1
  533. p.i++
  534. goto readN
  535. case 'o', 'O', 'e', 'E':
  536. id, err := p.parseName()
  537. if err != nil {
  538. return 0, 0, err
  539. }
  540. id = toLowerASCII(id)
  541. if id == "odd" {
  542. return 2, 1, nil
  543. }
  544. if id == "even" {
  545. return 2, 0, nil
  546. }
  547. return 0, 0, fmt.Errorf("expected 'odd' or 'even', but found '%s' instead", id)
  548. default:
  549. goto invalid
  550. }
  551. positiveA:
  552. if p.i >= len(p.s) {
  553. goto eof
  554. }
  555. switch p.s[p.i] {
  556. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  557. a, err = p.parseInteger()
  558. if err != nil {
  559. return 0, 0, err
  560. }
  561. goto readA
  562. case 'n', 'N':
  563. a = 1
  564. p.i++
  565. goto readN
  566. default:
  567. goto invalid
  568. }
  569. negativeA:
  570. if p.i >= len(p.s) {
  571. goto eof
  572. }
  573. switch p.s[p.i] {
  574. case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
  575. a, err = p.parseInteger()
  576. if err != nil {
  577. return 0, 0, err
  578. }
  579. a = -a
  580. goto readA
  581. case 'n', 'N':
  582. a = -1
  583. p.i++
  584. goto readN
  585. default:
  586. goto invalid
  587. }
  588. readA:
  589. if p.i >= len(p.s) {
  590. goto eof
  591. }
  592. switch p.s[p.i] {
  593. case 'n', 'N':
  594. p.i++
  595. goto readN
  596. default:
  597. // The number we read as a is actually b.
  598. return 0, a, nil
  599. }
  600. readN:
  601. p.skipWhitespace()
  602. if p.i >= len(p.s) {
  603. goto eof
  604. }
  605. switch p.s[p.i] {
  606. case '+':
  607. p.i++
  608. p.skipWhitespace()
  609. b, err = p.parseInteger()
  610. if err != nil {
  611. return 0, 0, err
  612. }
  613. return a, b, nil
  614. case '-':
  615. p.i++
  616. p.skipWhitespace()
  617. b, err = p.parseInteger()
  618. if err != nil {
  619. return 0, 0, err
  620. }
  621. return a, -b, nil
  622. default:
  623. return a, 0, nil
  624. }
  625. eof:
  626. return 0, 0, errors.New("unexpected EOF while attempting to parse expression of form an+b")
  627. invalid:
  628. return 0, 0, errors.New("unexpected character while attempting to parse expression of form an+b")
  629. }
  630. // parseSimpleSelectorSequence parses a selector sequence that applies to
  631. // a single element.
  632. func (p *parser) parseSimpleSelectorSequence() (Selector, error) {
  633. var result Selector
  634. if p.i >= len(p.s) {
  635. return nil, errors.New("expected selector, found EOF instead")
  636. }
  637. switch p.s[p.i] {
  638. case '*':
  639. // It's the universal selector. Just skip over it, since it doesn't affect the meaning.
  640. p.i++
  641. case '#', '.', '[', ':':
  642. // There's no type selector. Wait to process the other till the main loop.
  643. default:
  644. r, err := p.parseTypeSelector()
  645. if err != nil {
  646. return nil, err
  647. }
  648. result = r
  649. }
  650. loop:
  651. for p.i < len(p.s) {
  652. var ns Selector
  653. var err error
  654. switch p.s[p.i] {
  655. case '#':
  656. ns, err = p.parseIDSelector()
  657. case '.':
  658. ns, err = p.parseClassSelector()
  659. case '[':
  660. ns, err = p.parseAttributeSelector()
  661. case ':':
  662. ns, err = p.parsePseudoclassSelector()
  663. default:
  664. break loop
  665. }
  666. if err != nil {
  667. return nil, err
  668. }
  669. if result == nil {
  670. result = ns
  671. } else {
  672. result = intersectionSelector(result, ns)
  673. }
  674. }
  675. if result == nil {
  676. result = func(n *html.Node) bool {
  677. return true
  678. }
  679. }
  680. return result, nil
  681. }
  682. // parseSelector parses a selector that may include combinators.
  683. func (p *parser) parseSelector() (result Selector, err error) {
  684. p.skipWhitespace()
  685. result, err = p.parseSimpleSelectorSequence()
  686. if err != nil {
  687. return
  688. }
  689. for {
  690. var combinator byte
  691. if p.skipWhitespace() {
  692. combinator = ' '
  693. }
  694. if p.i >= len(p.s) {
  695. return
  696. }
  697. switch p.s[p.i] {
  698. case '+', '>', '~':
  699. combinator = p.s[p.i]
  700. p.i++
  701. p.skipWhitespace()
  702. case ',', ')':
  703. // These characters can't begin a selector, but they can legally occur after one.
  704. return
  705. }
  706. if combinator == 0 {
  707. return
  708. }
  709. c, err := p.parseSimpleSelectorSequence()
  710. if err != nil {
  711. return nil, err
  712. }
  713. switch combinator {
  714. case ' ':
  715. result = descendantSelector(result, c)
  716. case '>':
  717. result = childSelector(result, c)
  718. case '+':
  719. result = siblingSelector(result, c, true)
  720. case '~':
  721. result = siblingSelector(result, c, false)
  722. }
  723. }
  724. panic("unreachable")
  725. }
  726. // parseSelectorGroup parses a group of selectors, separated by commas.
  727. func (p *parser) parseSelectorGroup() (result Selector, err error) {
  728. result, err = p.parseSelector()
  729. if err != nil {
  730. return
  731. }
  732. for p.i < len(p.s) {
  733. if p.s[p.i] != ',' {
  734. return result, nil
  735. }
  736. p.i++
  737. c, err := p.parseSelector()
  738. if err != nil {
  739. return nil, err
  740. }
  741. result = unionSelector(result, c)
  742. }
  743. return
  744. }