You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

569 lines
13 KiB

  1. package cascadia
  2. import (
  3. "bytes"
  4. "fmt"
  5. "regexp"
  6. "strings"
  7. "golang.org/x/net/html"
  8. )
  9. // the Selector type, and functions for creating them
  10. // A Selector is a function which tells whether a node matches or not.
  11. type Selector func(*html.Node) bool
  12. // hasChildMatch returns whether n has any child that matches a.
  13. func hasChildMatch(n *html.Node, a Selector) bool {
  14. for c := n.FirstChild; c != nil; c = c.NextSibling {
  15. if a(c) {
  16. return true
  17. }
  18. }
  19. return false
  20. }
  21. // hasDescendantMatch performs a depth-first search of n's descendants,
  22. // testing whether any of them match a. It returns true as soon as a match is
  23. // found, or false if no match is found.
  24. func hasDescendantMatch(n *html.Node, a Selector) bool {
  25. for c := n.FirstChild; c != nil; c = c.NextSibling {
  26. if a(c) || (c.Type == html.ElementNode && hasDescendantMatch(c, a)) {
  27. return true
  28. }
  29. }
  30. return false
  31. }
  32. // Compile parses a selector and returns, if successful, a Selector object
  33. // that can be used to match against html.Node objects.
  34. func Compile(sel string) (Selector, error) {
  35. p := &parser{s: sel}
  36. compiled, err := p.parseSelectorGroup()
  37. if err != nil {
  38. return nil, err
  39. }
  40. if p.i < len(sel) {
  41. return nil, fmt.Errorf("parsing %q: %d bytes left over", sel, len(sel)-p.i)
  42. }
  43. return compiled, nil
  44. }
  45. // MustCompile is like Compile, but panics instead of returning an error.
  46. func MustCompile(sel string) Selector {
  47. compiled, err := Compile(sel)
  48. if err != nil {
  49. panic(err)
  50. }
  51. return compiled
  52. }
  53. // MatchAll returns a slice of the nodes that match the selector,
  54. // from n and its children.
  55. func (s Selector) MatchAll(n *html.Node) []*html.Node {
  56. return s.matchAllInto(n, nil)
  57. }
  58. func (s Selector) matchAllInto(n *html.Node, storage []*html.Node) []*html.Node {
  59. if s(n) {
  60. storage = append(storage, n)
  61. }
  62. for child := n.FirstChild; child != nil; child = child.NextSibling {
  63. storage = s.matchAllInto(child, storage)
  64. }
  65. return storage
  66. }
  67. // Match returns true if the node matches the selector.
  68. func (s Selector) Match(n *html.Node) bool {
  69. return s(n)
  70. }
  71. // MatchFirst returns the first node that matches s, from n and its children.
  72. func (s Selector) MatchFirst(n *html.Node) *html.Node {
  73. if s.Match(n) {
  74. return n
  75. }
  76. for c := n.FirstChild; c != nil; c = c.NextSibling {
  77. m := s.MatchFirst(c)
  78. if m != nil {
  79. return m
  80. }
  81. }
  82. return nil
  83. }
  84. // Filter returns the nodes in nodes that match the selector.
  85. func (s Selector) Filter(nodes []*html.Node) (result []*html.Node) {
  86. for _, n := range nodes {
  87. if s(n) {
  88. result = append(result, n)
  89. }
  90. }
  91. return result
  92. }
  93. // typeSelector returns a Selector that matches elements with a given tag name.
  94. func typeSelector(tag string) Selector {
  95. tag = toLowerASCII(tag)
  96. return func(n *html.Node) bool {
  97. return n.Type == html.ElementNode && n.Data == tag
  98. }
  99. }
  // toLowerASCII returns s with the ASCII letters 'A'-'Z' replaced by their
  // lowercase forms. All other bytes, including non-ASCII ones, are left
  // untouched. When s contains no ASCII capitals it is returned as is,
  // without allocating.
  func toLowerASCII(s string) string {
  	// Locate the first capital; without one there is nothing to do.
  	first := -1
  	for i := 0; i < len(s); i++ {
  		if s[i] >= 'A' && s[i] <= 'Z' {
  			first = i
  			break
  		}
  	}
  	if first < 0 {
  		return s
  	}
  	out := []byte(s)
  	for i := first; i < len(out); i++ {
  		if ch := out[i]; ch >= 'A' && ch <= 'Z' {
  			out[i] = ch + ('a' - 'A')
  		}
  	}
  	return string(out)
  }
  117. // attributeSelector returns a Selector that matches elements
  118. // where the attribute named key satisifes the function f.
  119. func attributeSelector(key string, f func(string) bool) Selector {
  120. key = toLowerASCII(key)
  121. return func(n *html.Node) bool {
  122. if n.Type != html.ElementNode {
  123. return false
  124. }
  125. for _, a := range n.Attr {
  126. if a.Key == key && f(a.Val) {
  127. return true
  128. }
  129. }
  130. return false
  131. }
  132. }
  133. // attributeExistsSelector returns a Selector that matches elements that have
  134. // an attribute named key.
  135. func attributeExistsSelector(key string) Selector {
  136. return attributeSelector(key, func(string) bool { return true })
  137. }
  138. // attributeEqualsSelector returns a Selector that matches elements where
  139. // the attribute named key has the value val.
  140. func attributeEqualsSelector(key, val string) Selector {
  141. return attributeSelector(key,
  142. func(s string) bool {
  143. return s == val
  144. })
  145. }
  146. // attributeIncludesSelector returns a Selector that matches elements where
  147. // the attribute named key is a whitespace-separated list that includes val.
  148. func attributeIncludesSelector(key, val string) Selector {
  149. return attributeSelector(key,
  150. func(s string) bool {
  151. for s != "" {
  152. i := strings.IndexAny(s, " \t\r\n\f")
  153. if i == -1 {
  154. return s == val
  155. }
  156. if s[:i] == val {
  157. return true
  158. }
  159. s = s[i+1:]
  160. }
  161. return false
  162. })
  163. }
  164. // attributeDashmatchSelector returns a Selector that matches elements where
  165. // the attribute named key equals val or starts with val plus a hyphen.
  166. func attributeDashmatchSelector(key, val string) Selector {
  167. return attributeSelector(key,
  168. func(s string) bool {
  169. if s == val {
  170. return true
  171. }
  172. if len(s) <= len(val) {
  173. return false
  174. }
  175. if s[:len(val)] == val && s[len(val)] == '-' {
  176. return true
  177. }
  178. return false
  179. })
  180. }
  181. // attributePrefixSelector returns a Selector that matches elements where
  182. // the attribute named key starts with val.
  183. func attributePrefixSelector(key, val string) Selector {
  184. return attributeSelector(key,
  185. func(s string) bool {
  186. return strings.HasPrefix(s, val)
  187. })
  188. }
  189. // attributeSuffixSelector returns a Selector that matches elements where
  190. // the attribute named key ends with val.
  191. func attributeSuffixSelector(key, val string) Selector {
  192. return attributeSelector(key,
  193. func(s string) bool {
  194. return strings.HasSuffix(s, val)
  195. })
  196. }
  197. // attributeSubstringSelector returns a Selector that matches nodes where
  198. // the attribute named key contains val.
  199. func attributeSubstringSelector(key, val string) Selector {
  200. return attributeSelector(key,
  201. func(s string) bool {
  202. return strings.Contains(s, val)
  203. })
  204. }
  205. // attributeRegexSelector returns a Selector that matches nodes where
  206. // the attribute named key matches the regular expression rx
  207. func attributeRegexSelector(key string, rx *regexp.Regexp) Selector {
  208. return attributeSelector(key,
  209. func(s string) bool {
  210. return rx.MatchString(s)
  211. })
  212. }
  213. // intersectionSelector returns a selector that matches nodes that match
  214. // both a and b.
  215. func intersectionSelector(a, b Selector) Selector {
  216. return func(n *html.Node) bool {
  217. return a(n) && b(n)
  218. }
  219. }
  220. // unionSelector returns a selector that matches elements that match
  221. // either a or b.
  222. func unionSelector(a, b Selector) Selector {
  223. return func(n *html.Node) bool {
  224. return a(n) || b(n)
  225. }
  226. }
  227. // negatedSelector returns a selector that matches elements that do not match a.
  228. func negatedSelector(a Selector) Selector {
  229. return func(n *html.Node) bool {
  230. if n.Type != html.ElementNode {
  231. return false
  232. }
  233. return !a(n)
  234. }
  235. }
  236. // writeNodeText writes the text contained in n and its descendants to b.
  237. func writeNodeText(n *html.Node, b *bytes.Buffer) {
  238. switch n.Type {
  239. case html.TextNode:
  240. b.WriteString(n.Data)
  241. case html.ElementNode:
  242. for c := n.FirstChild; c != nil; c = c.NextSibling {
  243. writeNodeText(c, b)
  244. }
  245. }
  246. }
  247. // nodeText returns the text contained in n and its descendants.
  248. func nodeText(n *html.Node) string {
  249. var b bytes.Buffer
  250. writeNodeText(n, &b)
  251. return b.String()
  252. }
  253. // nodeOwnText returns the contents of the text nodes that are direct
  254. // children of n.
  255. func nodeOwnText(n *html.Node) string {
  256. var b bytes.Buffer
  257. for c := n.FirstChild; c != nil; c = c.NextSibling {
  258. if c.Type == html.TextNode {
  259. b.WriteString(c.Data)
  260. }
  261. }
  262. return b.String()
  263. }
  264. // textSubstrSelector returns a selector that matches nodes that
  265. // contain the given text.
  266. func textSubstrSelector(val string) Selector {
  267. return func(n *html.Node) bool {
  268. text := strings.ToLower(nodeText(n))
  269. return strings.Contains(text, val)
  270. }
  271. }
  272. // ownTextSubstrSelector returns a selector that matches nodes that
  273. // directly contain the given text
  274. func ownTextSubstrSelector(val string) Selector {
  275. return func(n *html.Node) bool {
  276. text := strings.ToLower(nodeOwnText(n))
  277. return strings.Contains(text, val)
  278. }
  279. }
  280. // textRegexSelector returns a selector that matches nodes whose text matches
  281. // the specified regular expression
  282. func textRegexSelector(rx *regexp.Regexp) Selector {
  283. return func(n *html.Node) bool {
  284. return rx.MatchString(nodeText(n))
  285. }
  286. }
  287. // ownTextRegexSelector returns a selector that matches nodes whose text
  288. // directly matches the specified regular expression
  289. func ownTextRegexSelector(rx *regexp.Regexp) Selector {
  290. return func(n *html.Node) bool {
  291. return rx.MatchString(nodeOwnText(n))
  292. }
  293. }
  294. // hasChildSelector returns a selector that matches elements
  295. // with a child that matches a.
  296. func hasChildSelector(a Selector) Selector {
  297. return func(n *html.Node) bool {
  298. if n.Type != html.ElementNode {
  299. return false
  300. }
  301. return hasChildMatch(n, a)
  302. }
  303. }
  304. // hasDescendantSelector returns a selector that matches elements
  305. // with any descendant that matches a.
  306. func hasDescendantSelector(a Selector) Selector {
  307. return func(n *html.Node) bool {
  308. if n.Type != html.ElementNode {
  309. return false
  310. }
  311. return hasDescendantMatch(n, a)
  312. }
  313. }
  314. // nthChildSelector returns a selector that implements :nth-child(an+b).
  315. // If last is true, implements :nth-last-child instead.
  316. // If ofType is true, implements :nth-of-type instead.
  317. func nthChildSelector(a, b int, last, ofType bool) Selector {
  318. return func(n *html.Node) bool {
  319. if n.Type != html.ElementNode {
  320. return false
  321. }
  322. parent := n.Parent
  323. if parent == nil {
  324. return false
  325. }
  326. i := -1
  327. count := 0
  328. for c := parent.FirstChild; c != nil; c = c.NextSibling {
  329. if (c.Type != html.ElementNode) || (ofType && c.Data != n.Data) {
  330. continue
  331. }
  332. count++
  333. if c == n {
  334. i = count
  335. if !last {
  336. break
  337. }
  338. }
  339. }
  340. if i == -1 {
  341. // This shouldn't happen, since n should always be one of its parent's children.
  342. return false
  343. }
  344. if last {
  345. i = count - i + 1
  346. }
  347. i -= b
  348. if a == 0 {
  349. return i == 0
  350. }
  351. return i%a == 0 && i/a >= 0
  352. }
  353. }
  354. // simpleNthChildSelector returns a selector that implements :nth-child(b).
  355. // If ofType is true, implements :nth-of-type instead.
  356. func simpleNthChildSelector(b int, ofType bool) Selector {
  357. return func(n *html.Node) bool {
  358. if n.Type != html.ElementNode {
  359. return false
  360. }
  361. parent := n.Parent
  362. if parent == nil {
  363. return false
  364. }
  365. count := 0
  366. for c := parent.FirstChild; c != nil; c = c.NextSibling {
  367. if c.Type != html.ElementNode || (ofType && c.Data != n.Data) {
  368. continue
  369. }
  370. count++
  371. if c == n {
  372. return count == b
  373. }
  374. if count >= b {
  375. return false
  376. }
  377. }
  378. return false
  379. }
  380. }
  381. // simpleNthLastChildSelector returns a selector that implements
  382. // :nth-last-child(b). If ofType is true, implements :nth-last-of-type
  383. // instead.
  384. func simpleNthLastChildSelector(b int, ofType bool) Selector {
  385. return func(n *html.Node) bool {
  386. if n.Type != html.ElementNode {
  387. return false
  388. }
  389. parent := n.Parent
  390. if parent == nil {
  391. return false
  392. }
  393. count := 0
  394. for c := parent.LastChild; c != nil; c = c.PrevSibling {
  395. if c.Type != html.ElementNode || (ofType && c.Data != n.Data) {
  396. continue
  397. }
  398. count++
  399. if c == n {
  400. return count == b
  401. }
  402. if count >= b {
  403. return false
  404. }
  405. }
  406. return false
  407. }
  408. }
  409. // onlyChildSelector returns a selector that implements :only-child.
  410. // If ofType is true, it implements :only-of-type instead.
  411. func onlyChildSelector(ofType bool) Selector {
  412. return func(n *html.Node) bool {
  413. if n.Type != html.ElementNode {
  414. return false
  415. }
  416. parent := n.Parent
  417. if parent == nil {
  418. return false
  419. }
  420. count := 0
  421. for c := parent.FirstChild; c != nil; c = c.NextSibling {
  422. if (c.Type != html.ElementNode) || (ofType && c.Data != n.Data) {
  423. continue
  424. }
  425. count++
  426. if count > 1 {
  427. return false
  428. }
  429. }
  430. return count == 1
  431. }
  432. }
  433. // inputSelector is a Selector that matches input, select, textarea and button elements.
  434. func inputSelector(n *html.Node) bool {
  435. return n.Type == html.ElementNode && (n.Data == "input" || n.Data == "select" || n.Data == "textarea" || n.Data == "button")
  436. }
  437. // emptyElementSelector is a Selector that matches empty elements.
  438. func emptyElementSelector(n *html.Node) bool {
  439. if n.Type != html.ElementNode {
  440. return false
  441. }
  442. for c := n.FirstChild; c != nil; c = c.NextSibling {
  443. switch c.Type {
  444. case html.ElementNode, html.TextNode:
  445. return false
  446. }
  447. }
  448. return true
  449. }
  450. // descendantSelector returns a Selector that matches an element if
  451. // it matches d and has an ancestor that matches a.
  452. func descendantSelector(a, d Selector) Selector {
  453. return func(n *html.Node) bool {
  454. if !d(n) {
  455. return false
  456. }
  457. for p := n.Parent; p != nil; p = p.Parent {
  458. if a(p) {
  459. return true
  460. }
  461. }
  462. return false
  463. }
  464. }
  465. // childSelector returns a Selector that matches an element if
  466. // it matches d and its parent matches a.
  467. func childSelector(a, d Selector) Selector {
  468. return func(n *html.Node) bool {
  469. return d(n) && n.Parent != nil && a(n.Parent)
  470. }
  471. }
  472. // siblingSelector returns a Selector that matches an element
  473. // if it matches s2 and in is preceded by an element that matches s1.
  474. // If adjacent is true, the sibling must be immediately before the element.
  475. func siblingSelector(s1, s2 Selector, adjacent bool) Selector {
  476. return func(n *html.Node) bool {
  477. if !s2(n) {
  478. return false
  479. }
  480. if adjacent {
  481. for n = n.PrevSibling; n != nil; n = n.PrevSibling {
  482. if n.Type == html.TextNode || n.Type == html.CommentNode {
  483. continue
  484. }
  485. return s1(n)
  486. }
  487. return false
  488. }
  489. // Walk backwards looking for element that matches s1
  490. for c := n.PrevSibling; c != nil; c = c.PrevSibling {
  491. if s1(c) {
  492. return true
  493. }
  494. }
  495. return false
  496. }
  497. }