784 lines
28 KiB
EmacsLisp
784 lines
28 KiB
EmacsLisp
;;; esxml-query.el --- select esxml nodes jQuery-style
|
||
|
||
;; Copyright (C) 2017 Vasilij Schneidermann <mail@vasilij.de>
|
||
|
||
;; Author: Vasilij Schneidermann <mail@vasilij.de>
|
||
;; Maintainer: Vasilij Schneidermann
|
||
;; Version: 0.1.1
|
||
;; Keywords: data, lisp
|
||
;; Package-Requires: ((cl-lib "0.1"))
|
||
;;
|
||
;; This program is free software; you can redistribute it and/or
|
||
;; modify it under the terms of the GNU General Public License as
|
||
;; published by the Free Software Foundation, either version 3 of the
|
||
;; License, or (at your option) any later version.
|
||
|
||
;; This program is distributed in the hope that it will be useful,
|
||
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
;; GNU General Public License for more details.
|
||
|
||
;; You should have received a copy of the GNU General Public License
|
||
;; along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||
|
||
;;; Commentary:
|
||
|
||
;; Traditionally people pick one of the following options when faced
|
||
;; with the task of extracting data from XML in Emacs Lisp:
|
||
;;
|
||
;; - Using regular expressions on the unparsed document
|
||
;; - Manual tree traversal with `assoc', `car' and `cdr'
|
||
;;
|
||
;; Browsers faced a similar problem until jQuery happened, shortly
|
||
;; afterwards they started providing the `node.querySelector' and
|
||
;; `node.querySelectorAll' API for retrieving one or all nodes
|
||
;; matching a given CSS selector. This code implements the same API
|
||
;; with the `esxml-query' and `esxml-query-all' functions. The
|
||
;; following table summarizes the currently supported modifiers and
|
||
;; combinators:
|
||
;;
|
||
;; | Name | Supported? | Syntax |
|
||
;; |------------------------------------+------------+-------------|
|
||
;; | Namespaces | No | foo|bar |
|
||
;; | Commas | Yes | foo, bar |
|
||
;; | Descendant combinator | Yes | foo bar |
|
||
;; | Child combinator | Yes | foo>bar |
|
||
;; | Adjacent sibling combinator | No | foo+bar |
|
||
;; | General sibling combinator | No | foo~bar |
|
||
;; | Universal selector | Yes | * |
|
||
;; | Type selector | Yes | tag |
|
||
;; | ID selector | Yes | #foo |
|
||
;; | Class selector | Yes | .foo |
|
||
;; | Attribute selector | Yes | [foo] |
|
||
;; | Exact match attribute selector | Yes | [foo=bar] |
|
||
;; | Prefix match attribute selector | Yes | [foo^=bar] |
|
||
;; | Suffix match attribute selector | Yes | [foo$=bar] |
|
||
;; | Substring match attribute selector | Yes | [foo*=bar] |
|
||
;; | Include match attribute selector | Yes | [foo~=bar] |
|
||
;; | Dash match attribute selector | Yes | [foo|=bar] |
|
||
;; | Attribute selector modifiers | No | [foo=bar i] |
|
||
;; | Pseudo elements | No | ::foo |
|
||
;; | Pseudo classes | No | :foo |
|
||
|
||
;;; Code:
|
||
|
||
(require 'cl-lib)
|
||
|
||
|
||
;;; CSS selector parsing
|
||
|
||
;; https://www.w3.org/TR/selectors/#w3cselgrammar
|
||
;; https://www.w3.org/TR/selectors4/#grammar
|
||
;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#detailed-grammar
|
||
;; https://www.w3.org/TR/2003/WD-css3-syntax-20030813/#tokenization
|
||
|
||
;; you might be wondering why I'm using both level 3 and 4 standards,
|
||
;; well, the level 3 one has a buggy lexer section whereas level 4
|
||
;; omits crucial parser definitions, so both have to be used...
|
||
|
||
;; TODO: support :not
|
||
(defvar esxml--css-selector-token-matchers
|
||
(let* ((h "[0-9a-f]")
|
||
(nl "\n\\|\r\n\\|\r\\|\f")
|
||
(nonascii "[\200-\U0010ffff]")
|
||
(unicode (format "\\\\%s\\{1,6\\}[ \t\r\n\f]?" h))
|
||
(escape (format "\\(?:%s\\)\\|\\\\[ -~\200-\U0010ffff]" unicode))
|
||
(nmstart (format "[a-z_]\\|%s\\|\\(?:%s\\)" nonascii escape))
|
||
(nmchar (format "[a-z0-9_-]\\|%s\\|\\(?:%s\\)" nonascii escape))
|
||
(num "[0-9]+\\|[0-9]*\\.[0-9]+")
|
||
(string1 (format "\"\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|'\\|%s\\|\\(?:%s\\)\\)*\"" nl nonascii escape))
|
||
(string2 (format "'\\(?:[\t !#$%%&(-~]\\|\\\\\\(?:%s\\)\\|\"\\|%s\\|\\(?:%s\\)\\)*'" nl nonascii escape))
|
||
(ident (format "[-]?\\(?:%s\\)\\(?:%s\\)*" nmstart nmchar))
|
||
(unit (format "[-]?\\(?:%s\\)\\(?:%s\\)+" nmstart nmchar))
|
||
(name (format "\\(?:%s\\)+" nmchar)))
|
||
|
||
`((whitespace . "[ \t\r\n\f]+")
|
||
(string . ,(format "\\(?:%s\\|%s\\)" string1 string2))
|
||
(ident . ,ident)
|
||
(hash . ,(format "#%s" name))
|
||
(function . ,(format "%s(" ident))
|
||
(number . ,num)
|
||
(dimension . ,(format "\\(?:%s\\)%s" num unit))
|
||
(prefix-match . "\\^=")
|
||
(suffix-match . "\\$=")
|
||
(substring-match . "\\*=")
|
||
(include-match . "~=")
|
||
(dash-match . "|=")
|
||
(comma . ",")
|
||
(gt . ">")
|
||
(plus . "\\+")
|
||
(minus . "-")
|
||
(tilde . "~")
|
||
(asterisk . "\\*")
|
||
(period . "\\.")
|
||
(equals . "=")
|
||
(colon . ":")
|
||
(lbracket . "\\[")
|
||
(rbracket . "\\]")
|
||
(rparen . ")"))))
|
||
|
||
(defun esxml--tokenize-css-selector (string)
|
||
(let (result)
|
||
(with-temp-buffer
|
||
(insert string)
|
||
(goto-char (point-min))
|
||
(while (not (eobp))
|
||
(let ((max-length 0)
|
||
longest)
|
||
(dolist (matcher esxml--css-selector-token-matchers)
|
||
(let ((id (car matcher))
|
||
(re (cdr matcher)))
|
||
(when (looking-at re)
|
||
(let* ((token (match-string 0))
|
||
(length (length token)))
|
||
(when (> length max-length)
|
||
(setq max-length length)
|
||
(setq longest (cons id token)))))))
|
||
(when (not longest)
|
||
(error "Invalid token detected: %s"
|
||
(buffer-substring (point) (point-max))))
|
||
(push longest result)
|
||
(goto-char (+ (point) max-length)))))
|
||
(nreverse result)))
|
||
|
||
;; the alternative is creating a mutable object with peek/next methods
|
||
;; and passing it around, so I chose the one requiring less typing, a
|
||
;; dynamically bound variable :<
|
||
|
||
(defvar esxml--token-stream)
|
||
|
||
;; TODO: support :not
|
||
;; css-selector:
|
||
;; css-selector-list;
|
||
;; css-selector-list:
|
||
;; complex-css-selector [ comma whitespace* complex-css-selector ]*;
|
||
;; complex-css-selector:
|
||
;; compound-css-selector [ css-combinator compound-css-selector ]* whitespace*;
|
||
;; css-combinator:
|
||
;; whitespace+ | whitespace* [ '>' | '+' | '~' ] whitespace*;
|
||
;; compound-css-selector:
|
||
;; css-type-selector css-modifier* | css-modifier+;
|
||
;; css-type-selector:
|
||
;; IDENT | *;
|
||
;; css-modifier:
|
||
;; css-id | css-class | css-attrib | css-pseudo;
|
||
;; css-id:
|
||
;; HASH;
|
||
;; css-class:
|
||
;; '.' IDENT;
|
||
;; css-attrib:
|
||
;; '[' whitespace* css-attrib-name ']'
|
||
;; | '[' whitespace* css-attrib-name css-attrib-match css-attrib-value whitespace* ']';
|
||
;; css-attrib-name:
|
||
;; IDENT whitespace*;
|
||
;; css-attrib-match:
|
||
;; [ '=' | PREFIX-MATCH | SUFFIX-MATCH | SUBSTRING-MATCH | INCLUDE-MATCH | DASH-MATCH ] whitespace*;
|
||
;; css-attrib-value:
|
||
;; IDENT | STRING;
|
||
;; css-pseudo:
|
||
;; ':' ':'? [ IDENT | css-functional-pseudo ];
|
||
;; css-functional-pseudo:
|
||
;; FUNCTION whitespace* [ css-expression whitespace* ]+ ')';
|
||
;; css-expression:
|
||
;; '+' | '-' | DIMENSION | NUMBER | STRING | IDENT
|
||
|
||
(defun esxml-query-css-escape (string)
|
||
"Returns escaped version of STRING for use in selectors.
|
||
The logic used here corresponds to the CSS.escape API as
|
||
specified in https://drafts.csswg.org/cssom/#the-css.escape()-method."
|
||
(let (chars)
|
||
(dotimes (i (length string))
|
||
(let* ((char (aref string i))
|
||
(unprintablep (or (and (>= char ?\u0001) (<= char ?\u001f))
|
||
(= char ?\u007f)))
|
||
(nonasciip (>= char ?\u0080))
|
||
(digitp (and (>= char ?\u0030) (<= char ?\u0039)))
|
||
(upperp (and (>= char ?\u0041) (<= char ?\u005a)))
|
||
(lowerp (and (>= char ?\u0061) (<= char ?\u007a))))
|
||
(cond
|
||
((= char ?\u0000)
|
||
(push ?\ufffd chars))
|
||
(unprintablep
|
||
(dolist (char (string-to-list (format "\\%x " char)))
|
||
(push char chars)))
|
||
((and (= i 0) digitp)
|
||
(dolist (char (string-to-list (format "\\%x " char)))
|
||
(push char chars)))
|
||
((and (= i 1) digitp (= (aref string 0) ?-))
|
||
(dolist (char (string-to-list (format "\\%x " char)))
|
||
(push char chars)))
|
||
((and (= i 0) (= char ?-) (= (length string) 1))
|
||
(push ?\\ chars)
|
||
(push char chars))
|
||
((or nonasciip (= char ?-) (= char ?_) digitp upperp lowerp)
|
||
(push char chars))
|
||
(t
|
||
(push ?\\ chars)
|
||
(push char chars)))))
|
||
(concat (nreverse chars))))
|
||
|
||
(defun esxml--parse-css-identifier (string)
|
||
;; https://www.w3.org/TR/css-syntax-3/#consume-string-token
|
||
(let* ((code-points (string-to-list string))
|
||
chars
|
||
token)
|
||
(while code-points
|
||
(let ((char (pop code-points)))
|
||
(if (= char ?\\)
|
||
(let ((char (pop code-points)))
|
||
(cond
|
||
((not char))
|
||
((= char ?\n))
|
||
((or (and (>= char ?0) (<= char ?9))
|
||
(and (>= char ?a) (<= char ?f))
|
||
(and (>= char ?A) (<= char ?F)))
|
||
(let ((i 0)
|
||
(hex-chars (list char)))
|
||
(while (and (< i 5) code-points)
|
||
(let ((char (car code-points)))
|
||
(if (or (and (>= char ?0) (<= char ?9))
|
||
(and (>= char ?a) (<= char ?f))
|
||
(and (>= char ?A) (<= char ?F)))
|
||
(push (pop code-points) hex-chars)
|
||
(setq i 5)))
|
||
(setq i (1+ i)))
|
||
(let ((char (car code-points)))
|
||
(when (and char (= char ?\s))
|
||
(pop code-points)))
|
||
(let* ((hex-token (concat (nreverse hex-chars)))
|
||
(code-point (string-to-number hex-token 16)))
|
||
(if (or (zerop code-point)
|
||
(and (>= code-point ?\ud800) (<= code-point ?\udfff))
|
||
(> code-point ?\U0010ffff))
|
||
(push ?\ufffd chars)
|
||
(push code-point chars)))))
|
||
(t ; unspecified: non-hex digit
|
||
(push char chars))))
|
||
(push char chars))))
|
||
(concat (nreverse chars))))
|
||
|
||
(defun esxml--parse-css-string-literal (string)
|
||
(esxml--parse-css-identifier (substring string 1 -1)))
|
||
|
||
(defmacro esxml--with-parse-shorthands (&rest body)
|
||
`(cl-macrolet ((peek () '(car esxml--token-stream))
|
||
(next () '(pop esxml--token-stream))
|
||
(accept (type) `(and (peek) (eq (car (peek)) ,type)
|
||
(cdr (next))))
|
||
(eat-whitespace () '(while (accept 'whitespace))))
|
||
,@body))
|
||
(def-edebug-spec esxml--with-parse-shorthands (body))
|
||
|
||
(defun esxml-parse-css-selector (string)
|
||
"Parse CSS selector STRING into a list of alists.
|
||
Each alist represents a complex CSS selector. The result can be
|
||
passed to `esxml-query' and `esxml-query-all' as the selector
|
||
argument."
|
||
(let* ((esxml--token-stream (esxml--tokenize-css-selector string))
|
||
(result (esxml--parse-css-selector-list)))
|
||
(when esxml--token-stream
|
||
(error "Trailing garbage: %s"
|
||
(mapconcat 'cdr esxml--token-stream "")))
|
||
result))
|
||
|
||
(defun esxml--parse-css-selector-list ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((first (esxml--parse-complex-css-selector))
|
||
result)
|
||
(when (not first)
|
||
(error "Expected at least one selector"))
|
||
(push first result)
|
||
|
||
(while (accept 'comma)
|
||
(eat-whitespace)
|
||
(let ((selector (esxml--parse-complex-css-selector)))
|
||
(when (not selector)
|
||
(error "Expected selector after comma"))
|
||
(push selector result)))
|
||
(nreverse result))))
|
||
|
||
(defun esxml--parse-complex-css-selector ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((first (esxml--parse-compound-css-selector))
|
||
result done)
|
||
(when first
|
||
(push first result)
|
||
|
||
(while (not done)
|
||
(let ((combinator (esxml--parse-css-combinator)))
|
||
(if combinator
|
||
(let ((compound (esxml--parse-compound-css-selector)))
|
||
(cond
|
||
(compound
|
||
(setq result (append (list compound combinator) result)))
|
||
;; allow whitespace before comma
|
||
((not (eq (car (peek)) 'comma))
|
||
(error "Trailing combinator"))))
|
||
(setq done t))))
|
||
(nreverse result)))))
|
||
|
||
(defun esxml--parse-css-combinator ()
|
||
(esxml--with-parse-shorthands
|
||
;; NOTE: whitespace-surrounded combinators are distinguished from
|
||
;; whitespace-only ones by checking whether there has been
|
||
;; whitespace followed by a non-blank combinator
|
||
(let ((leading-whitespace-p (eq (car (peek)) 'whitespace))
|
||
result)
|
||
(eat-whitespace)
|
||
(let ((type (car (peek))))
|
||
(cond
|
||
((member type '(gt plus tilde))
|
||
(next)
|
||
(cond
|
||
((eq type 'gt)
|
||
(setq result '((combinator . child))))
|
||
((eq type 'plus)
|
||
(setq result '((combinator . direct-sibling))))
|
||
((eq type 'tilde)
|
||
(setq result '((combinator . indirect-sibling)))))
|
||
(eat-whitespace))
|
||
(leading-whitespace-p
|
||
(setq result '((combinator . descendant))))
|
||
(t nil)))
|
||
result)))
|
||
|
||
(defun esxml--parse-compound-css-selector ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((type-selector (esxml--parse-css-type-selector))
|
||
done
|
||
result)
|
||
;; NOTE: css-type-selector css-modifier* | css-modifier+; is
|
||
;; equivalent to: [ css-type-selector | css-modifier ] css-modifier*;
|
||
(if type-selector
|
||
(push type-selector result)
|
||
(let ((modifier (esxml--parse-css-modifier)))
|
||
(if modifier
|
||
(push modifier result)
|
||
;; NOTE: this allows the trailing combinator error to be thrown
|
||
(setq done t))))
|
||
|
||
(while (not done)
|
||
(let ((modifier (esxml--parse-css-modifier)))
|
||
(if modifier
|
||
(push modifier result)
|
||
(setq done t))))
|
||
(when (> (cl-count 'id result :key 'car) 1)
|
||
(error "Only one id selector allowed per compound"))
|
||
(nreverse result))))
|
||
|
||
(defun esxml--parse-css-type-selector ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((token (peek)))
|
||
(cond
|
||
((eq (car token) 'ident)
|
||
(next)
|
||
(cons 'tag (intern (esxml--parse-css-identifier (cdr token)))))
|
||
((eq (car token) 'asterisk)
|
||
(next)
|
||
'(wildcard))
|
||
(t nil)))))
|
||
|
||
(defun esxml--parse-css-modifier ()
|
||
(or (esxml--parse-css-id)
|
||
(esxml--parse-css-class)
|
||
(esxml--parse-css-attrib)
|
||
(esxml--parse-css-pseudo)))
|
||
|
||
(defun esxml--parse-css-id ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((value (accept 'hash)))
|
||
(when value
|
||
(cons 'id (substring value 1))))))
|
||
|
||
(defun esxml--parse-css-class ()
|
||
(esxml--with-parse-shorthands
|
||
(when (accept 'period)
|
||
(let ((value (accept 'ident)))
|
||
(if value
|
||
(cons 'class value)
|
||
(error "Expected identifier after period"))))))
|
||
|
||
(defun esxml--parse-css-attrib ()
|
||
(esxml--with-parse-shorthands
|
||
(let (result)
|
||
(when (accept 'lbracket)
|
||
(eat-whitespace)
|
||
(let ((name (esxml--parse-css-attrib-name)))
|
||
(when (not name)
|
||
(error "Expected attribute name"))
|
||
(push (cons 'name (esxml--parse-css-identifier name)) result)
|
||
(when (not (accept 'rbracket))
|
||
(let ((match (esxml--parse-css-attrib-match)))
|
||
(when (not match)
|
||
(error "Expected attribute matcher"))
|
||
(let ((value (esxml--parse-css-attrib-value)))
|
||
(when (not value)
|
||
(error "Expected attribute value"))
|
||
(eat-whitespace)
|
||
(when (not (accept 'rbracket))
|
||
(error "Unterminated attribute"))
|
||
(push (cons match value) result)))))
|
||
(cons 'attribute (nreverse result))))))
|
||
|
||
(defun esxml--parse-css-attrib-name ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((name (accept 'ident)))
|
||
(when name
|
||
(eat-whitespace)
|
||
name))))
|
||
|
||
(defun esxml--parse-css-attrib-match ()
|
||
(esxml--with-parse-shorthands
|
||
(let (result)
|
||
(cond
|
||
((accept 'equals)
|
||
(setq result 'exact-match))
|
||
((accept 'prefix-match)
|
||
(setq result 'prefix-match))
|
||
((accept 'suffix-match)
|
||
(setq result 'suffix-match))
|
||
((accept 'substring-match)
|
||
(setq result 'substring-match))
|
||
((accept 'include-match)
|
||
(setq result 'include-match))
|
||
((accept 'dash-match)
|
||
(setq result 'dash-match)))
|
||
(eat-whitespace)
|
||
result)))
|
||
|
||
(defun esxml--parse-css-attrib-value ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((token (peek)))
|
||
(cond
|
||
((eq (car token) 'ident)
|
||
(next)
|
||
(esxml--parse-css-identifier (cdr token)))
|
||
((eq (car token) 'string)
|
||
(next)
|
||
(esxml--parse-css-string-literal (cdr token)))
|
||
(t nil)))))
|
||
|
||
(defun esxml--parse-css-pseudo ()
|
||
(esxml--with-parse-shorthands
|
||
(let (result type)
|
||
(when (accept 'colon)
|
||
(if (accept 'colon)
|
||
(setq type 'pseudo-element)
|
||
(setq type 'pseudo-class))
|
||
(let ((functional (esxml--parse-css-functional-pseudo)))
|
||
(if functional
|
||
(if (eq type 'pseudo-class)
|
||
(let ((value (car functional))
|
||
(args (cdr functional)))
|
||
(push (cons 'name (esxml--parse-css-identifier value)) result)
|
||
(push (cons 'args args) result))
|
||
(error "Pseudo-elements may not have arguments"))
|
||
(let ((value (accept 'ident)))
|
||
(if value
|
||
(push (cons 'name (esxml--parse-css-identifier value)) result)
|
||
(error "Expected function or identifier")))))
|
||
(cons type (nreverse result))))))
|
||
|
||
(defun esxml--parse-css-functional-pseudo ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((function (accept 'function))
|
||
result)
|
||
(when function
|
||
(push (substring function 0 -1) result)
|
||
(eat-whitespace)
|
||
(let ((expression (esxml--parse-css-expression))
|
||
done)
|
||
(eat-whitespace)
|
||
(when (not expression)
|
||
(error "Expected at least one expression for function"))
|
||
(push expression result)
|
||
(while (not done)
|
||
(setq expression (esxml--parse-css-expression))
|
||
(if expression
|
||
(progn
|
||
(push expression result)
|
||
(eat-whitespace))
|
||
(setq done t))))
|
||
(when (not (accept 'rparen))
|
||
(error "Unterminated function argument list"))
|
||
(nreverse result)))))
|
||
|
||
(defun esxml--parse-css-expression ()
|
||
(esxml--with-parse-shorthands
|
||
(let ((token (peek)))
|
||
(cond
|
||
((accept 'plus)
|
||
'(operator . +))
|
||
((accept 'minus)
|
||
'(operator . -))
|
||
((eq (car token) 'dimension)
|
||
(next)
|
||
(cons 'dimension (esxml--parse-css-identifier (cdr token))))
|
||
((eq (car token) 'number)
|
||
(next)
|
||
(cons 'number (string-to-number (cdr token))))
|
||
((eq (car token) 'string)
|
||
(next)
|
||
(cons 'string (esxml--parse-css-string-literal (cdr token))))
|
||
((eq (car token) 'ident)
|
||
(next)
|
||
(cons 'ident (esxml--parse-css-identifier (cdr token))))
|
||
(t nil)))))
|
||
|
||
|
||
;;; tree traversal
|
||
|
||
;; TODO: these helpers should be part of esxml.el
|
||
(defun esxml-branch-p (node)
|
||
"Non-nil if NODE refers to an esxml branch."
|
||
(and (listp node)
|
||
(>= (length node) 2)
|
||
(symbolp (car node))
|
||
(listp (cadr node))))
|
||
|
||
(defun esxml-node-tag (node)
|
||
"Returns the tag of NODE if available."
|
||
(and (esxml-branch-p node)
|
||
(car node)))
|
||
|
||
(defun esxml-node-attributes (node)
|
||
"Returns the attributes of NODE if available."
|
||
(and (esxml-branch-p node)
|
||
(cadr node)))
|
||
|
||
(defun esxml-node-attribute (attribute node)
|
||
"Returns the attribute ATTRIBUTE of NODE if available."
|
||
(and (esxml-branch-p node)
|
||
(cdr (assq attribute (cadr node)))))
|
||
|
||
(defun esxml-node-children (node)
|
||
"Returns the children of NODE if available."
|
||
(and (esxml-branch-p node)
|
||
(nthcdr 2 node)))
|
||
|
||
(defun esxml-find-node (pred root)
|
||
"Locates a node satisfying PRED starting from ROOT.
|
||
Returns the node or nil if none found."
|
||
(if (funcall pred root)
|
||
root
|
||
(cl-some (lambda (node) (esxml-find-node pred node))
|
||
(esxml-node-children root))))
|
||
|
||
(defun esxml-visit-nodes (function root)
|
||
"Visit nodes by calling FUNCTION on each starting from ROOT."
|
||
(funcall function root)
|
||
(mapc (lambda (node) (esxml-visit-nodes function node))
|
||
(esxml-node-children root)))
|
||
|
||
(defun esxml-find-nodes (pred root)
|
||
"Locates all nodes satisfying PRED starting from ROOT.
|
||
Returns a list of the nodes or nil if none found."
|
||
(let ((acc '()))
|
||
(esxml-visit-nodes
|
||
(lambda (node)
|
||
(when (funcall pred node)
|
||
(push node acc)))
|
||
root)
|
||
(nreverse acc)))
|
||
|
||
(defun esxml-find-descendant (pred root)
|
||
"Locates a node satisfying PRED starting from ROOT's children.
|
||
Returns the node or nil if none found."
|
||
(cl-some (lambda (node) (esxml-find-node pred node))
|
||
(esxml-node-children root)))
|
||
|
||
(defun esxml-find-descendants (pred root)
|
||
"Locates all nodes satisfying PRED starting from ROOT's children.
|
||
Returns a list of the nodes or nil if none found."
|
||
(cl-mapcan (lambda (node) (esxml-find-nodes pred node))
|
||
(esxml-node-children root)))
|
||
|
||
(defun esxml-find-child (pred root)
|
||
"Locates a node satisfying PRED among ROOT's children.
|
||
Returns the node or nil if none found."
|
||
(cl-some (lambda (node) (when (funcall pred node) node))
|
||
(esxml-node-children root)))
|
||
|
||
(defun esxml-find-children (pred root)
|
||
"Locates all nodes satisfying PRED among ROOT's children.
|
||
Returns a list of the nodes or nil if none found."
|
||
(mapcar (lambda (node) (when (funcall pred node) node))
|
||
(esxml-node-children root)))
|
||
|
||
(defun esxml--node-with-children (node children)
|
||
(let ((tag (esxml-node-tag node))
|
||
(attributes (esxml-node-attributes node)))
|
||
(append (list tag attributes) children)))
|
||
|
||
(defun esxml--node-with-attributes (node attributes)
|
||
(let ((tag (esxml-node-tag node))
|
||
(children (esxml-node-children node)))
|
||
(append (list tag attributes) children)))
|
||
|
||
(defun esxml-tree-map (function root)
|
||
"Returns a copy of ROOT with FUNCTION applied to each node."
|
||
(if (esxml-branch-p root)
|
||
(esxml--node-with-children
|
||
(funcall function root)
|
||
(mapcar (lambda (node) (esxml-tree-map function node))
|
||
(esxml-node-children root)))
|
||
(funcall function root)))
|
||
|
||
(defvar esxml--symbol (make-symbol "id"))
|
||
|
||
(defun esxml--decorate-tree (root)
|
||
(let ((i 0))
|
||
(esxml-tree-map
|
||
(lambda (node)
|
||
(let ((attribute (cons esxml--symbol i))
|
||
(attributes (esxml-node-attributes node)))
|
||
(setq attributes (append (list attribute) attributes))
|
||
(setq i (1+ i))
|
||
(if (esxml-branch-p node)
|
||
(esxml--node-with-attributes node attributes)
|
||
node)))
|
||
root)))
|
||
|
||
(defun esxml--undecorate-node (node)
|
||
(if (esxml-branch-p node)
|
||
(let ((attributes (esxml-node-attributes node)))
|
||
(esxml--node-with-attributes node (assq-delete-all esxml--symbol
|
||
attributes)))
|
||
node))
|
||
|
||
(defun esxml--retrieve-decoration (node)
|
||
(esxml-node-attribute esxml--symbol node))
|
||
|
||
|
||
;;; querying
|
||
|
||
;; NOTE: supporting structural pseudo functions, direct siblings and
|
||
;; indirect siblings requires breadth instead of depth traversal,
|
||
;; something that could be emulated without zippers if you had the
|
||
;; parent of the node (and the position of the child)...
|
||
|
||
(defun esxml--node-matches-attribute-p (node modifier)
|
||
(let ((attributes (esxml-node-attributes node))
|
||
haystack)
|
||
(cl-every
|
||
(lambda (item)
|
||
(let ((type (car item))
|
||
(value (cdr item)))
|
||
(cond
|
||
((eq type 'name)
|
||
(let ((match (assq (intern value) attributes)))
|
||
(setq haystack (cdr match))
|
||
match))
|
||
((eq type 'exact-match)
|
||
(equal haystack value))
|
||
((eq type 'prefix-match)
|
||
(string-prefix-p value haystack))
|
||
((eq type 'suffix-match)
|
||
(string-suffix-p value haystack))
|
||
((eq type 'substring-match)
|
||
(string-match-p (regexp-quote value) haystack))
|
||
((eq type 'include-match)
|
||
(member value (split-string haystack " ")))
|
||
((eq type 'dash-match)
|
||
(or (equal value haystack)
|
||
(string-match-p (format "^%s-" (regexp-quote value)) haystack)))
|
||
(t (error "Unknown attribute modifier")))))
|
||
modifier)))
|
||
|
||
(defun esxml--node-matches-modifier-p (node type value)
|
||
(cond
|
||
((eq type 'wildcard)
|
||
t)
|
||
((eq type 'tag)
|
||
(equal (esxml-node-tag node) value))
|
||
((eq type 'id)
|
||
(equal (esxml-node-attribute 'id node) value))
|
||
((eq type 'class)
|
||
(let ((class (esxml-node-attribute 'class node)))
|
||
(and class (member value (split-string class " ")))))
|
||
((eq type 'attribute)
|
||
(esxml--node-matches-attribute-p node value))
|
||
;; TODO: support structural pseudo functions
|
||
;; TODO: error out on invalid pseudo-class arguments
|
||
(t (error "Unimplemented attribute type: %s" type))))
|
||
|
||
(defun esxml--find-node-for (attributes)
|
||
(lambda (node)
|
||
(cl-every
|
||
(lambda (attribute)
|
||
(let ((type (car attribute))
|
||
(value (cdr attribute)))
|
||
(esxml--node-matches-modifier-p node type value)))
|
||
attributes)))
|
||
|
||
(defun esxml--find-nodes (root combinator attributes)
|
||
(let* ((type (cdr (assq 'combinator combinator)))
|
||
(walker (cond
|
||
((not type)
|
||
'esxml-find-nodes)
|
||
((eq type 'descendant)
|
||
'esxml-find-descendants)
|
||
((eq type 'child)
|
||
'esxml-find-children)
|
||
;; TODO: support direct sibling
|
||
;; TODO: support indirect sibling
|
||
(t (error "Unimplemented combinator %s" combinator)))))
|
||
(funcall walker (esxml--find-node-for attributes) root)))
|
||
|
||
(defun esxml--query (selector root)
|
||
(let* ((attributes (pop selector))
|
||
combinator
|
||
(result (esxml--find-nodes root nil attributes)))
|
||
(while (and result selector)
|
||
(setq combinator (pop selector))
|
||
(setq attributes (pop selector))
|
||
(setq result (cl-mapcan
|
||
(lambda (node)
|
||
(esxml--find-nodes node combinator attributes))
|
||
result))
|
||
(setq result (delq nil result)))
|
||
result))
|
||
|
||
(defun esxml--delete-dups (items test)
|
||
(let ((seen (make-hash-table :test test))
|
||
result)
|
||
(while items
|
||
(let ((item (pop items)))
|
||
(when (not (gethash item seen))
|
||
(push item result)
|
||
(puthash item t seen))))
|
||
(nreverse result)))
|
||
|
||
(defun esxml-query-all (selector root)
|
||
"Locates all nodes satisfying SELECTOR starting from ROOT.
|
||
SELECTOR must be a string containing a CSS selector or a parsed
|
||
CSS selector returned by `esxml-parse-css-selector'. Returns a
|
||
list of the nodes or nil if none found."
|
||
(when (stringp selector)
|
||
(setq selector (esxml-parse-css-selector selector)))
|
||
(if (= (length selector) 1)
|
||
;; no commas, we can only get the same nodes repeatedly
|
||
(esxml--delete-dups (esxml--query (car selector) root) 'eq)
|
||
;; commas, nodes might be the same *and* in the wrong order
|
||
(setq root (esxml--decorate-tree root))
|
||
(let (result)
|
||
(while selector
|
||
(setq result (nconc result (esxml--query (pop selector) root))))
|
||
(setq result (cl-sort result '< :key 'esxml--retrieve-decoration))
|
||
(setq result (cl-delete-duplicates result :test '=
|
||
:key 'esxml--retrieve-decoration))
|
||
(mapcar (lambda (node) (esxml--undecorate-node node)) result))))
|
||
|
||
(defun esxml-query (selector root)
|
||
"Locates a node satisfying SELECTOR starting from ROOT.
|
||
SELECTOR must be a string containing a CSS selector or a parsed
|
||
CSS selector returned by `esxml-parse-css-selector'. Returns the
|
||
node or nil if none found."
|
||
;; NOTE: you can do a bit less work (the savings decrease the more
|
||
;; branches the query discards), but it's simpler and safer to just
|
||
;; have the same algorithm for both entry points
|
||
(car (esxml-query-all selector root)))
|
||
|
||
(provide 'esxml-query)
|
||
;;; esxml-query.el ends here
|